diff --git "a/700mOpen-R1-Distill/trainer_state.json" "b/700mOpen-R1-Distill/trainer_state.json" new file mode 100644--- /dev/null +++ "b/700mOpen-R1-Distill/trainer_state.json" @@ -0,0 +1,46915 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5859, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017067759003242875, + "grad_norm": 4.927246022675701, + "learning_rate": 4e-05, + "loss": 1.0624, + "num_tokens": 106946.0, + "step": 1 + }, + { + "epoch": 0.0003413551800648575, + "grad_norm": 3.048352235606045, + "learning_rate": 3.9993172896398704e-05, + "loss": 1.0364, + "num_tokens": 195642.0, + "step": 2 + }, + { + "epoch": 0.0005120327700972862, + "grad_norm": 2.234228813740128, + "learning_rate": 3.998634579279741e-05, + "loss": 0.9699, + "num_tokens": 296243.0, + "step": 3 + }, + { + "epoch": 0.000682710360129715, + "grad_norm": 1.3037893496300843, + "learning_rate": 3.997951868919611e-05, + "loss": 0.9454, + "num_tokens": 405415.0, + "step": 4 + }, + { + "epoch": 0.0008533879501621437, + "grad_norm": 1.1651162875802463, + "learning_rate": 3.997269158559482e-05, + "loss": 0.8487, + "num_tokens": 495347.0, + "step": 5 + }, + { + "epoch": 0.0010240655401945725, + "grad_norm": 1.1147829929966337, + "learning_rate": 3.996586448199352e-05, + "loss": 0.9132, + "num_tokens": 588334.0, + "step": 6 + }, + { + "epoch": 0.0011947431302270011, + "grad_norm": 1.0673615638167613, + "learning_rate": 3.9959037378392226e-05, + "loss": 0.8355, + "num_tokens": 689719.0, + "step": 7 + }, + { + "epoch": 0.00136542072025943, + "grad_norm": 1.060713406138517, + "learning_rate": 3.995221027479092e-05, + "loss": 0.8826, + "num_tokens": 776675.0, + "step": 8 + }, + { + "epoch": 0.0015360983102918587, + "grad_norm": 1.0379411159025862, + "learning_rate": 3.9945383171189627e-05, + "loss": 0.8632, + "num_tokens": 852690.0, + "step": 9 + }, + { + "epoch": 0.0017067759003242873, + "grad_norm": 1.06322243417711, + "learning_rate": 3.993855606758833e-05, + "loss": 0.9177, + "num_tokens": 941448.0, + "step": 10 + }, + { + "epoch": 0.0018774534903567162, + "grad_norm": 0.9303315782513246, + "learning_rate": 3.9931728963987034e-05, + "loss": 0.8984, + "num_tokens": 1041055.0, + "step": 11 + }, + { + "epoch": 0.002048131080389145, + "grad_norm": 0.9982996896210765, + "learning_rate": 3.9924901860385734e-05, + "loss": 0.9014, + "num_tokens": 1157013.0, + "step": 12 + }, + { + "epoch": 0.002218808670421574, + "grad_norm": 0.797165502082853, + "learning_rate": 3.9918074756784435e-05, + "loss": 0.8393, + "num_tokens": 1273374.0, + "step": 13 + }, + { + "epoch": 0.0023894862604540022, + "grad_norm": 0.826217184562919, + "learning_rate": 3.991124765318314e-05, + "loss": 0.9264, + "num_tokens": 1394287.0, + "step": 14 + }, + { + "epoch": 0.002560163850486431, + "grad_norm": 0.9091360844201513, + "learning_rate": 3.990442054958184e-05, + "loss": 0.8342, + "num_tokens": 1469440.0, + "step": 15 + }, + { + "epoch": 0.00273084144051886, + "grad_norm": 0.8403549260768445, + "learning_rate": 3.989759344598055e-05, + "loss": 0.8535, + "num_tokens": 1566286.0, + "step": 16 + }, + { + "epoch": 0.0029015190305512885, + "grad_norm": 0.7596256216386689, + "learning_rate": 3.989076634237925e-05, + "loss": 0.8317, + "num_tokens": 1659927.0, + "step": 17 + }, + { + "epoch": 0.0030721966205837174, + "grad_norm": 0.7813454419126862, + "learning_rate": 3.988393923877795e-05, + "loss": 0.7804, + "num_tokens": 1745869.0, + "step": 18 + }, + { + "epoch": 0.0032428742106161462, + "grad_norm": 0.8137842002351539, + "learning_rate": 3.987711213517666e-05, + "loss": 0.775, + "num_tokens": 1835846.0, + "step": 19 + }, + { + "epoch": 0.0034135518006485747, + "grad_norm": 0.8056565641218325, + "learning_rate": 3.987028503157536e-05, + "loss": 0.7516, + "num_tokens": 1912401.0, + "step": 20 + }, + { + "epoch": 0.0035842293906810036, + "grad_norm": 0.9069662951755969, + "learning_rate": 3.986345792797406e-05, + "loss": 0.785, + "num_tokens": 2003446.0, + "step": 21 + }, + { + "epoch": 0.0037549069807134325, + "grad_norm": 0.7653799056401251, + "learning_rate": 3.985663082437276e-05, + "loss": 0.8327, + "num_tokens": 2110659.0, + "step": 22 + }, + { + "epoch": 0.003925584570745861, + "grad_norm": 0.756816022288689, + "learning_rate": 3.9849803720771466e-05, + "loss": 0.7609, + "num_tokens": 2184697.0, + "step": 23 + }, + { + "epoch": 0.00409626216077829, + "grad_norm": 0.745738850707005, + "learning_rate": 3.9842976617170166e-05, + "loss": 0.8217, + "num_tokens": 2286020.0, + "step": 24 + }, + { + "epoch": 0.004266939750810718, + "grad_norm": 0.7727404391184417, + "learning_rate": 3.983614951356887e-05, + "loss": 0.7643, + "num_tokens": 2372368.0, + "step": 25 + }, + { + "epoch": 0.004437617340843148, + "grad_norm": 0.9247724452839717, + "learning_rate": 3.9829322409967574e-05, + "loss": 0.8605, + "num_tokens": 2438364.0, + "step": 26 + }, + { + "epoch": 0.004608294930875576, + "grad_norm": 0.7257810538456507, + "learning_rate": 3.982249530636628e-05, + "loss": 0.7053, + "num_tokens": 2511279.0, + "step": 27 + }, + { + "epoch": 0.0047789725209080045, + "grad_norm": 0.8044674783173333, + "learning_rate": 3.981566820276498e-05, + "loss": 0.7294, + "num_tokens": 2601295.0, + "step": 28 + }, + { + "epoch": 0.004949650110940434, + "grad_norm": 0.6872460019225002, + "learning_rate": 3.980884109916368e-05, + "loss": 0.7687, + "num_tokens": 2692058.0, + "step": 29 + }, + { + "epoch": 0.005120327700972862, + "grad_norm": 0.7555484216601004, + "learning_rate": 3.980201399556239e-05, + "loss": 0.7828, + "num_tokens": 2771157.0, + "step": 30 + }, + { + "epoch": 0.005291005291005291, + "grad_norm": 0.7640368483439702, + "learning_rate": 3.979518689196109e-05, + "loss": 0.6756, + "num_tokens": 2844059.0, + "step": 31 + }, + { + "epoch": 0.00546168288103772, + "grad_norm": 0.7715086618293171, + "learning_rate": 3.978835978835979e-05, + "loss": 0.8123, + "num_tokens": 2923493.0, + "step": 32 + }, + { + "epoch": 0.0056323604710701485, + "grad_norm": 0.761091174042492, + "learning_rate": 3.978153268475849e-05, + "loss": 0.7255, + "num_tokens": 2995263.0, + "step": 33 + }, + { + "epoch": 0.005803038061102577, + "grad_norm": 0.7837044890766441, + "learning_rate": 3.97747055811572e-05, + "loss": 0.8633, + "num_tokens": 3098116.0, + "step": 34 + }, + { + "epoch": 0.005973715651135006, + "grad_norm": 0.7374929404358518, + "learning_rate": 3.97678784775559e-05, + "loss": 0.7765, + "num_tokens": 3200843.0, + "step": 35 + }, + { + "epoch": 0.006144393241167435, + "grad_norm": 0.7118141241703863, + "learning_rate": 3.9761051373954604e-05, + "loss": 0.8119, + "num_tokens": 3300176.0, + "step": 36 + }, + { + "epoch": 0.006315070831199863, + "grad_norm": 0.8088010123025907, + "learning_rate": 3.9754224270353305e-05, + "loss": 0.8129, + "num_tokens": 3400458.0, + "step": 37 + }, + { + "epoch": 0.0064857484212322925, + "grad_norm": 0.7055686129637639, + "learning_rate": 3.974739716675201e-05, + "loss": 0.7729, + "num_tokens": 3518889.0, + "step": 38 + }, + { + "epoch": 0.006656426011264721, + "grad_norm": 0.6931394597489553, + "learning_rate": 3.974057006315071e-05, + "loss": 0.693, + "num_tokens": 3598339.0, + "step": 39 + }, + { + "epoch": 0.006827103601297149, + "grad_norm": 0.6779583132071767, + "learning_rate": 3.973374295954941e-05, + "loss": 0.7554, + "num_tokens": 3702099.0, + "step": 40 + }, + { + "epoch": 0.006997781191329579, + "grad_norm": 0.7203237143857879, + "learning_rate": 3.972691585594812e-05, + "loss": 0.6692, + "num_tokens": 3792354.0, + "step": 41 + }, + { + "epoch": 0.007168458781362007, + "grad_norm": 0.8007313963972836, + "learning_rate": 3.972008875234682e-05, + "loss": 0.7137, + "num_tokens": 3853390.0, + "step": 42 + }, + { + "epoch": 0.007339136371394436, + "grad_norm": 0.7375648687071122, + "learning_rate": 3.971326164874553e-05, + "loss": 0.7329, + "num_tokens": 3949208.0, + "step": 43 + }, + { + "epoch": 0.007509813961426865, + "grad_norm": 0.7396283530838207, + "learning_rate": 3.970643454514423e-05, + "loss": 0.7885, + "num_tokens": 4058020.0, + "step": 44 + }, + { + "epoch": 0.007680491551459293, + "grad_norm": 0.8321220234691021, + "learning_rate": 3.969960744154293e-05, + "loss": 0.8469, + "num_tokens": 4141321.0, + "step": 45 + }, + { + "epoch": 0.007851169141491723, + "grad_norm": 0.7350589135804083, + "learning_rate": 3.969278033794163e-05, + "loss": 0.8921, + "num_tokens": 4250607.0, + "step": 46 + }, + { + "epoch": 0.008021846731524151, + "grad_norm": 0.7201791502605341, + "learning_rate": 3.9685953234340336e-05, + "loss": 0.7519, + "num_tokens": 4358693.0, + "step": 47 + }, + { + "epoch": 0.00819252432155658, + "grad_norm": 0.6822472222413762, + "learning_rate": 3.9679126130739036e-05, + "loss": 0.8139, + "num_tokens": 4472404.0, + "step": 48 + }, + { + "epoch": 0.008363201911589008, + "grad_norm": 0.7205918708852296, + "learning_rate": 3.9672299027137736e-05, + "loss": 0.6795, + "num_tokens": 4540941.0, + "step": 49 + }, + { + "epoch": 0.008533879501621437, + "grad_norm": 0.689769600573838, + "learning_rate": 3.9665471923536444e-05, + "loss": 0.7055, + "num_tokens": 4654813.0, + "step": 50 + }, + { + "epoch": 0.008704557091653867, + "grad_norm": 0.8397726567442072, + "learning_rate": 3.9658644819935144e-05, + "loss": 0.9061, + "num_tokens": 4748387.0, + "step": 51 + }, + { + "epoch": 0.008875234681686295, + "grad_norm": 0.7299234289093371, + "learning_rate": 3.965181771633385e-05, + "loss": 0.7132, + "num_tokens": 4828488.0, + "step": 52 + }, + { + "epoch": 0.009045912271718724, + "grad_norm": 0.7971945861085944, + "learning_rate": 3.964499061273255e-05, + "loss": 0.812, + "num_tokens": 4908943.0, + "step": 53 + }, + { + "epoch": 0.009216589861751152, + "grad_norm": 0.7470664732538292, + "learning_rate": 3.963816350913126e-05, + "loss": 0.7451, + "num_tokens": 4991700.0, + "step": 54 + }, + { + "epoch": 0.00938726745178358, + "grad_norm": 0.7267148612906924, + "learning_rate": 3.963133640552996e-05, + "loss": 0.8426, + "num_tokens": 5090238.0, + "step": 55 + }, + { + "epoch": 0.009557945041816009, + "grad_norm": 0.7060878197200826, + "learning_rate": 3.962450930192866e-05, + "loss": 0.8536, + "num_tokens": 5184502.0, + "step": 56 + }, + { + "epoch": 0.00972862263184844, + "grad_norm": 0.7164828805072689, + "learning_rate": 3.961768219832736e-05, + "loss": 0.7879, + "num_tokens": 5305502.0, + "step": 57 + }, + { + "epoch": 0.009899300221880868, + "grad_norm": 0.7336781645910154, + "learning_rate": 3.961085509472607e-05, + "loss": 0.7847, + "num_tokens": 5380570.0, + "step": 58 + }, + { + "epoch": 0.010069977811913296, + "grad_norm": 0.6848179383612278, + "learning_rate": 3.960402799112477e-05, + "loss": 0.7052, + "num_tokens": 5462973.0, + "step": 59 + }, + { + "epoch": 0.010240655401945725, + "grad_norm": 0.7378105624521154, + "learning_rate": 3.959720088752347e-05, + "loss": 0.8381, + "num_tokens": 5567876.0, + "step": 60 + }, + { + "epoch": 0.010411332991978153, + "grad_norm": 0.7275572362419059, + "learning_rate": 3.9590373783922175e-05, + "loss": 0.7242, + "num_tokens": 5663667.0, + "step": 61 + }, + { + "epoch": 0.010582010582010581, + "grad_norm": 0.6738102894577289, + "learning_rate": 3.9583546680320875e-05, + "loss": 0.8847, + "num_tokens": 5789458.0, + "step": 62 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 0.7007636635184095, + "learning_rate": 3.957671957671958e-05, + "loss": 0.7467, + "num_tokens": 5872500.0, + "step": 63 + }, + { + "epoch": 0.01092336576207544, + "grad_norm": 0.6551726547438986, + "learning_rate": 3.956989247311828e-05, + "loss": 0.7711, + "num_tokens": 5982089.0, + "step": 64 + }, + { + "epoch": 0.011094043352107869, + "grad_norm": 0.6885511285462077, + "learning_rate": 3.956306536951698e-05, + "loss": 0.688, + "num_tokens": 6072778.0, + "step": 65 + }, + { + "epoch": 0.011264720942140297, + "grad_norm": 0.7097694668409246, + "learning_rate": 3.955623826591569e-05, + "loss": 0.8276, + "num_tokens": 6210764.0, + "step": 66 + }, + { + "epoch": 0.011435398532172725, + "grad_norm": 0.7143955035794974, + "learning_rate": 3.954941116231439e-05, + "loss": 0.7693, + "num_tokens": 6306727.0, + "step": 67 + }, + { + "epoch": 0.011606076122205154, + "grad_norm": 0.6695535596327769, + "learning_rate": 3.95425840587131e-05, + "loss": 0.7821, + "num_tokens": 6398381.0, + "step": 68 + }, + { + "epoch": 0.011776753712237584, + "grad_norm": 0.7686511455049567, + "learning_rate": 3.95357569551118e-05, + "loss": 0.6947, + "num_tokens": 6464689.0, + "step": 69 + }, + { + "epoch": 0.011947431302270013, + "grad_norm": 0.6799563772555682, + "learning_rate": 3.95289298515105e-05, + "loss": 0.816, + "num_tokens": 6592934.0, + "step": 70 + }, + { + "epoch": 0.012118108892302441, + "grad_norm": 0.6534270630652051, + "learning_rate": 3.95221027479092e-05, + "loss": 0.7495, + "num_tokens": 6705546.0, + "step": 71 + }, + { + "epoch": 0.01228878648233487, + "grad_norm": 0.7663017891810427, + "learning_rate": 3.9515275644307906e-05, + "loss": 0.8666, + "num_tokens": 6809773.0, + "step": 72 + }, + { + "epoch": 0.012459464072367298, + "grad_norm": 0.6589745695367404, + "learning_rate": 3.9508448540706606e-05, + "loss": 0.7142, + "num_tokens": 6916368.0, + "step": 73 + }, + { + "epoch": 0.012630141662399726, + "grad_norm": 0.6518812664326954, + "learning_rate": 3.9501621437105314e-05, + "loss": 0.7112, + "num_tokens": 7009718.0, + "step": 74 + }, + { + "epoch": 0.012800819252432157, + "grad_norm": 0.7168443925954251, + "learning_rate": 3.9494794333504014e-05, + "loss": 0.7763, + "num_tokens": 7093304.0, + "step": 75 + }, + { + "epoch": 0.012971496842464585, + "grad_norm": 0.8518409513538764, + "learning_rate": 3.9487967229902714e-05, + "loss": 0.7652, + "num_tokens": 7172731.0, + "step": 76 + }, + { + "epoch": 0.013142174432497013, + "grad_norm": 0.6764496851467736, + "learning_rate": 3.948114012630142e-05, + "loss": 0.7308, + "num_tokens": 7269718.0, + "step": 77 + }, + { + "epoch": 0.013312852022529442, + "grad_norm": 0.6741632570449712, + "learning_rate": 3.947431302270012e-05, + "loss": 0.8394, + "num_tokens": 7384158.0, + "step": 78 + }, + { + "epoch": 0.01348352961256187, + "grad_norm": 0.6543456516854816, + "learning_rate": 3.946748591909883e-05, + "loss": 0.6941, + "num_tokens": 7474574.0, + "step": 79 + }, + { + "epoch": 0.013654207202594299, + "grad_norm": 0.6303454367812126, + "learning_rate": 3.946065881549753e-05, + "loss": 0.7465, + "num_tokens": 7577425.0, + "step": 80 + }, + { + "epoch": 0.013824884792626729, + "grad_norm": 0.7107294639214967, + "learning_rate": 3.9453831711896237e-05, + "loss": 0.743, + "num_tokens": 7649837.0, + "step": 81 + }, + { + "epoch": 0.013995562382659157, + "grad_norm": 0.7020757619747062, + "learning_rate": 3.944700460829493e-05, + "loss": 0.7375, + "num_tokens": 7730824.0, + "step": 82 + }, + { + "epoch": 0.014166239972691586, + "grad_norm": 0.7308074943683739, + "learning_rate": 3.944017750469364e-05, + "loss": 0.8706, + "num_tokens": 7834039.0, + "step": 83 + }, + { + "epoch": 0.014336917562724014, + "grad_norm": 0.7018958240322349, + "learning_rate": 3.943335040109234e-05, + "loss": 0.6853, + "num_tokens": 7934944.0, + "step": 84 + }, + { + "epoch": 0.014507595152756443, + "grad_norm": 0.697643161098838, + "learning_rate": 3.9426523297491045e-05, + "loss": 0.7309, + "num_tokens": 8035166.0, + "step": 85 + }, + { + "epoch": 0.014678272742788871, + "grad_norm": 0.6468565003915663, + "learning_rate": 3.9419696193889745e-05, + "loss": 0.734, + "num_tokens": 8139451.0, + "step": 86 + }, + { + "epoch": 0.014848950332821301, + "grad_norm": 0.7296842711049382, + "learning_rate": 3.9412869090288446e-05, + "loss": 0.6937, + "num_tokens": 8214176.0, + "step": 87 + }, + { + "epoch": 0.01501962792285373, + "grad_norm": 0.7757639961466596, + "learning_rate": 3.940604198668715e-05, + "loss": 0.7869, + "num_tokens": 8283789.0, + "step": 88 + }, + { + "epoch": 0.015190305512886158, + "grad_norm": 0.7054345909222848, + "learning_rate": 3.939921488308585e-05, + "loss": 0.7599, + "num_tokens": 8374370.0, + "step": 89 + }, + { + "epoch": 0.015360983102918587, + "grad_norm": 0.712552810457742, + "learning_rate": 3.939238777948456e-05, + "loss": 0.6566, + "num_tokens": 8457534.0, + "step": 90 + }, + { + "epoch": 0.015531660692951015, + "grad_norm": 0.7226337413539574, + "learning_rate": 3.938556067588326e-05, + "loss": 0.8167, + "num_tokens": 8560410.0, + "step": 91 + }, + { + "epoch": 0.015702338282983445, + "grad_norm": 0.6961355791629177, + "learning_rate": 3.937873357228196e-05, + "loss": 0.7731, + "num_tokens": 8657358.0, + "step": 92 + }, + { + "epoch": 0.015873015873015872, + "grad_norm": 0.7188650431735463, + "learning_rate": 3.937190646868067e-05, + "loss": 0.7436, + "num_tokens": 8758804.0, + "step": 93 + }, + { + "epoch": 0.016043693463048302, + "grad_norm": 0.7198155622289866, + "learning_rate": 3.936507936507937e-05, + "loss": 0.7015, + "num_tokens": 8830660.0, + "step": 94 + }, + { + "epoch": 0.01621437105308073, + "grad_norm": 0.7283468599367112, + "learning_rate": 3.935825226147807e-05, + "loss": 0.8131, + "num_tokens": 8926170.0, + "step": 95 + }, + { + "epoch": 0.01638504864311316, + "grad_norm": 1.7094760002588176, + "learning_rate": 3.935142515787677e-05, + "loss": 0.7444, + "num_tokens": 9025990.0, + "step": 96 + }, + { + "epoch": 0.01655572623314559, + "grad_norm": 0.710438774268186, + "learning_rate": 3.9344598054275476e-05, + "loss": 0.7075, + "num_tokens": 9124019.0, + "step": 97 + }, + { + "epoch": 0.016726403823178016, + "grad_norm": 0.6950972504166626, + "learning_rate": 3.933777095067418e-05, + "loss": 0.7482, + "num_tokens": 9211588.0, + "step": 98 + }, + { + "epoch": 0.016897081413210446, + "grad_norm": 0.7197346812206425, + "learning_rate": 3.9330943847072884e-05, + "loss": 0.7874, + "num_tokens": 9300925.0, + "step": 99 + }, + { + "epoch": 0.017067759003242873, + "grad_norm": 0.789788312202588, + "learning_rate": 3.9324116743471584e-05, + "loss": 0.7629, + "num_tokens": 9399015.0, + "step": 100 + }, + { + "epoch": 0.017238436593275303, + "grad_norm": 0.7342924974604631, + "learning_rate": 3.931728963987029e-05, + "loss": 0.7348, + "num_tokens": 9485097.0, + "step": 101 + }, + { + "epoch": 0.017409114183307733, + "grad_norm": 0.7166646732289269, + "learning_rate": 3.931046253626899e-05, + "loss": 0.8149, + "num_tokens": 9575701.0, + "step": 102 + }, + { + "epoch": 0.01757979177334016, + "grad_norm": 0.7437774544414028, + "learning_rate": 3.930363543266769e-05, + "loss": 0.7368, + "num_tokens": 9662488.0, + "step": 103 + }, + { + "epoch": 0.01775046936337259, + "grad_norm": 0.6808010328404168, + "learning_rate": 3.92968083290664e-05, + "loss": 0.7039, + "num_tokens": 9761350.0, + "step": 104 + }, + { + "epoch": 0.017921146953405017, + "grad_norm": 0.7169724509315818, + "learning_rate": 3.92899812254651e-05, + "loss": 0.7202, + "num_tokens": 9838430.0, + "step": 105 + }, + { + "epoch": 0.018091824543437447, + "grad_norm": 0.7126977172379187, + "learning_rate": 3.928315412186381e-05, + "loss": 0.7707, + "num_tokens": 9921053.0, + "step": 106 + }, + { + "epoch": 0.018262502133469874, + "grad_norm": 0.6432027470949758, + "learning_rate": 3.92763270182625e-05, + "loss": 0.7867, + "num_tokens": 10030160.0, + "step": 107 + }, + { + "epoch": 0.018433179723502304, + "grad_norm": 0.726049307681308, + "learning_rate": 3.926949991466121e-05, + "loss": 0.712, + "num_tokens": 10115182.0, + "step": 108 + }, + { + "epoch": 0.018603857313534734, + "grad_norm": 0.6402177577648095, + "learning_rate": 3.926267281105991e-05, + "loss": 0.6827, + "num_tokens": 10211924.0, + "step": 109 + }, + { + "epoch": 0.01877453490356716, + "grad_norm": 0.7152309853940433, + "learning_rate": 3.9255845707458615e-05, + "loss": 0.6939, + "num_tokens": 10290113.0, + "step": 110 + }, + { + "epoch": 0.01894521249359959, + "grad_norm": 0.7822974515684203, + "learning_rate": 3.9249018603857316e-05, + "loss": 0.7529, + "num_tokens": 10352671.0, + "step": 111 + }, + { + "epoch": 0.019115890083632018, + "grad_norm": 0.6626356348746878, + "learning_rate": 3.924219150025602e-05, + "loss": 0.7616, + "num_tokens": 10440263.0, + "step": 112 + }, + { + "epoch": 0.019286567673664448, + "grad_norm": 0.6888370834474341, + "learning_rate": 3.923536439665472e-05, + "loss": 0.7527, + "num_tokens": 10533923.0, + "step": 113 + }, + { + "epoch": 0.01945724526369688, + "grad_norm": 0.6833325132325303, + "learning_rate": 3.9228537293053423e-05, + "loss": 0.7188, + "num_tokens": 10639389.0, + "step": 114 + }, + { + "epoch": 0.019627922853729305, + "grad_norm": 0.6415478788112731, + "learning_rate": 3.922171018945213e-05, + "loss": 0.7446, + "num_tokens": 10738893.0, + "step": 115 + }, + { + "epoch": 0.019798600443761735, + "grad_norm": 0.6272733771819627, + "learning_rate": 3.921488308585083e-05, + "loss": 0.6737, + "num_tokens": 10839793.0, + "step": 116 + }, + { + "epoch": 0.019969278033794162, + "grad_norm": 0.6729755805869422, + "learning_rate": 3.920805598224954e-05, + "loss": 0.6776, + "num_tokens": 10965549.0, + "step": 117 + }, + { + "epoch": 0.020139955623826592, + "grad_norm": 0.6715717110894642, + "learning_rate": 3.920122887864824e-05, + "loss": 0.6907, + "num_tokens": 11057779.0, + "step": 118 + }, + { + "epoch": 0.02031063321385902, + "grad_norm": 0.6743663355343439, + "learning_rate": 3.919440177504694e-05, + "loss": 0.677, + "num_tokens": 11142241.0, + "step": 119 + }, + { + "epoch": 0.02048131080389145, + "grad_norm": 0.7636216756077997, + "learning_rate": 3.918757467144564e-05, + "loss": 0.7891, + "num_tokens": 11227376.0, + "step": 120 + }, + { + "epoch": 0.02065198839392388, + "grad_norm": 0.7994741365750552, + "learning_rate": 3.9180747567844346e-05, + "loss": 0.7041, + "num_tokens": 11290240.0, + "step": 121 + }, + { + "epoch": 0.020822665983956306, + "grad_norm": 0.6883108576026933, + "learning_rate": 3.917392046424305e-05, + "loss": 0.6549, + "num_tokens": 11380542.0, + "step": 122 + }, + { + "epoch": 0.020993343573988736, + "grad_norm": 0.6697062213556211, + "learning_rate": 3.916709336064175e-05, + "loss": 0.8257, + "num_tokens": 11476991.0, + "step": 123 + }, + { + "epoch": 0.021164021164021163, + "grad_norm": 0.6889690539813721, + "learning_rate": 3.9160266257040454e-05, + "loss": 0.7905, + "num_tokens": 11554220.0, + "step": 124 + }, + { + "epoch": 0.021334698754053593, + "grad_norm": 0.6998892555161134, + "learning_rate": 3.9153439153439155e-05, + "loss": 0.8183, + "num_tokens": 11671609.0, + "step": 125 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 0.7057675693723754, + "learning_rate": 3.914661204983786e-05, + "loss": 0.6897, + "num_tokens": 11736584.0, + "step": 126 + }, + { + "epoch": 0.02167605393411845, + "grad_norm": 0.7983389150672903, + "learning_rate": 3.913978494623656e-05, + "loss": 0.7056, + "num_tokens": 11794801.0, + "step": 127 + }, + { + "epoch": 0.02184673152415088, + "grad_norm": 0.7015293342096629, + "learning_rate": 3.913295784263527e-05, + "loss": 0.6693, + "num_tokens": 11860247.0, + "step": 128 + }, + { + "epoch": 0.022017409114183307, + "grad_norm": 0.659835217469634, + "learning_rate": 3.912613073903397e-05, + "loss": 0.7694, + "num_tokens": 11948286.0, + "step": 129 + }, + { + "epoch": 0.022188086704215737, + "grad_norm": 0.7021900827469132, + "learning_rate": 3.911930363543267e-05, + "loss": 0.7816, + "num_tokens": 12045966.0, + "step": 130 + }, + { + "epoch": 0.022358764294248164, + "grad_norm": 0.597681021149141, + "learning_rate": 3.911247653183138e-05, + "loss": 0.7412, + "num_tokens": 12154680.0, + "step": 131 + }, + { + "epoch": 0.022529441884280594, + "grad_norm": 0.6689515069328568, + "learning_rate": 3.910564942823008e-05, + "loss": 0.6829, + "num_tokens": 12240611.0, + "step": 132 + }, + { + "epoch": 0.022700119474313024, + "grad_norm": 0.6427117507301527, + "learning_rate": 3.909882232462878e-05, + "loss": 0.736, + "num_tokens": 12341226.0, + "step": 133 + }, + { + "epoch": 0.02287079706434545, + "grad_norm": 0.641298842454867, + "learning_rate": 3.909199522102748e-05, + "loss": 0.7794, + "num_tokens": 12470035.0, + "step": 134 + }, + { + "epoch": 0.02304147465437788, + "grad_norm": 0.6559941578850332, + "learning_rate": 3.9085168117426186e-05, + "loss": 0.7824, + "num_tokens": 12569182.0, + "step": 135 + }, + { + "epoch": 0.023212152244410308, + "grad_norm": 0.5797879584252981, + "learning_rate": 3.9078341013824886e-05, + "loss": 0.7486, + "num_tokens": 12689260.0, + "step": 136 + }, + { + "epoch": 0.023382829834442738, + "grad_norm": 0.5785520978942137, + "learning_rate": 3.907151391022359e-05, + "loss": 0.7169, + "num_tokens": 12799814.0, + "step": 137 + }, + { + "epoch": 0.023553507424475168, + "grad_norm": 0.63145340884011, + "learning_rate": 3.9064686806622293e-05, + "loss": 0.7159, + "num_tokens": 12895531.0, + "step": 138 + }, + { + "epoch": 0.023724185014507595, + "grad_norm": 0.6718641945416584, + "learning_rate": 3.9057859703020994e-05, + "loss": 0.7595, + "num_tokens": 12970095.0, + "step": 139 + }, + { + "epoch": 0.023894862604540025, + "grad_norm": 0.6688154953483435, + "learning_rate": 3.90510325994197e-05, + "loss": 0.6489, + "num_tokens": 13047947.0, + "step": 140 + }, + { + "epoch": 0.024065540194572452, + "grad_norm": 0.6119465046315931, + "learning_rate": 3.90442054958184e-05, + "loss": 0.6582, + "num_tokens": 13131231.0, + "step": 141 + }, + { + "epoch": 0.024236217784604882, + "grad_norm": 0.6071349966689983, + "learning_rate": 3.903737839221711e-05, + "loss": 0.6955, + "num_tokens": 13226746.0, + "step": 142 + }, + { + "epoch": 0.02440689537463731, + "grad_norm": 0.707819543546065, + "learning_rate": 3.903055128861581e-05, + "loss": 0.785, + "num_tokens": 13326100.0, + "step": 143 + }, + { + "epoch": 0.02457757296466974, + "grad_norm": 0.7101140252787286, + "learning_rate": 3.902372418501451e-05, + "loss": 0.7377, + "num_tokens": 13408131.0, + "step": 144 + }, + { + "epoch": 0.02474825055470217, + "grad_norm": 0.7137763420622705, + "learning_rate": 3.901689708141321e-05, + "loss": 0.7411, + "num_tokens": 13492002.0, + "step": 145 + }, + { + "epoch": 0.024918928144734596, + "grad_norm": 0.6302117820569665, + "learning_rate": 3.901006997781192e-05, + "loss": 0.749, + "num_tokens": 13608989.0, + "step": 146 + }, + { + "epoch": 0.025089605734767026, + "grad_norm": 0.6520119814156172, + "learning_rate": 3.900324287421062e-05, + "loss": 0.7218, + "num_tokens": 13693079.0, + "step": 147 + }, + { + "epoch": 0.025260283324799453, + "grad_norm": 0.721973686477052, + "learning_rate": 3.8996415770609324e-05, + "loss": 0.7097, + "num_tokens": 13759793.0, + "step": 148 + }, + { + "epoch": 0.025430960914831883, + "grad_norm": 0.6444815617055734, + "learning_rate": 3.8989588667008025e-05, + "loss": 0.6814, + "num_tokens": 13853566.0, + "step": 149 + }, + { + "epoch": 0.025601638504864313, + "grad_norm": 0.6470144981350449, + "learning_rate": 3.8982761563406725e-05, + "loss": 0.7195, + "num_tokens": 13940952.0, + "step": 150 + }, + { + "epoch": 0.02577231609489674, + "grad_norm": 0.6482708654305512, + "learning_rate": 3.897593445980543e-05, + "loss": 0.7509, + "num_tokens": 14046620.0, + "step": 151 + }, + { + "epoch": 0.02594299368492917, + "grad_norm": 0.6709976747086389, + "learning_rate": 3.896910735620413e-05, + "loss": 0.63, + "num_tokens": 14138480.0, + "step": 152 + }, + { + "epoch": 0.026113671274961597, + "grad_norm": 0.6223641618582216, + "learning_rate": 3.896228025260284e-05, + "loss": 0.702, + "num_tokens": 14245728.0, + "step": 153 + }, + { + "epoch": 0.026284348864994027, + "grad_norm": 0.6702374688661782, + "learning_rate": 3.895545314900154e-05, + "loss": 0.6915, + "num_tokens": 14368951.0, + "step": 154 + }, + { + "epoch": 0.026455026455026454, + "grad_norm": 0.7064029455559451, + "learning_rate": 3.894862604540025e-05, + "loss": 0.8063, + "num_tokens": 14447989.0, + "step": 155 + }, + { + "epoch": 0.026625704045058884, + "grad_norm": 0.6958089994646517, + "learning_rate": 3.894179894179894e-05, + "loss": 0.7468, + "num_tokens": 14531991.0, + "step": 156 + }, + { + "epoch": 0.026796381635091314, + "grad_norm": 0.660823616166228, + "learning_rate": 3.893497183819765e-05, + "loss": 0.6994, + "num_tokens": 14633378.0, + "step": 157 + }, + { + "epoch": 0.02696705922512374, + "grad_norm": 0.6632760553080884, + "learning_rate": 3.892814473459635e-05, + "loss": 0.8085, + "num_tokens": 14742735.0, + "step": 158 + }, + { + "epoch": 0.02713773681515617, + "grad_norm": 0.6707076778792253, + "learning_rate": 3.8921317630995056e-05, + "loss": 0.704, + "num_tokens": 14824038.0, + "step": 159 + }, + { + "epoch": 0.027308414405188598, + "grad_norm": 0.6780371496938489, + "learning_rate": 3.8914490527393756e-05, + "loss": 0.7246, + "num_tokens": 14912709.0, + "step": 160 + }, + { + "epoch": 0.027479091995221028, + "grad_norm": 0.6502275180697508, + "learning_rate": 3.8907663423792456e-05, + "loss": 0.7429, + "num_tokens": 15007000.0, + "step": 161 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 0.730209499791411, + "learning_rate": 3.8900836320191163e-05, + "loss": 0.6582, + "num_tokens": 15063963.0, + "step": 162 + }, + { + "epoch": 0.027820447175285885, + "grad_norm": 0.7114305063785422, + "learning_rate": 3.8894009216589864e-05, + "loss": 0.7225, + "num_tokens": 15136962.0, + "step": 163 + }, + { + "epoch": 0.027991124765318315, + "grad_norm": 0.6053172228852393, + "learning_rate": 3.888718211298857e-05, + "loss": 0.7763, + "num_tokens": 15253506.0, + "step": 164 + }, + { + "epoch": 0.02816180235535074, + "grad_norm": 0.6351742686250045, + "learning_rate": 3.888035500938727e-05, + "loss": 0.8324, + "num_tokens": 15370189.0, + "step": 165 + }, + { + "epoch": 0.028332479945383172, + "grad_norm": 0.648084921078821, + "learning_rate": 3.887352790578597e-05, + "loss": 0.7172, + "num_tokens": 15462760.0, + "step": 166 + }, + { + "epoch": 0.0285031575354156, + "grad_norm": 0.6197071544972387, + "learning_rate": 3.886670080218468e-05, + "loss": 0.772, + "num_tokens": 15566974.0, + "step": 167 + }, + { + "epoch": 0.02867383512544803, + "grad_norm": 0.637730639108508, + "learning_rate": 3.885987369858338e-05, + "loss": 0.7895, + "num_tokens": 15683081.0, + "step": 168 + }, + { + "epoch": 0.02884451271548046, + "grad_norm": 0.6378121371213399, + "learning_rate": 3.885304659498208e-05, + "loss": 0.7316, + "num_tokens": 15786689.0, + "step": 169 + }, + { + "epoch": 0.029015190305512886, + "grad_norm": 0.8736094176914838, + "learning_rate": 3.884621949138078e-05, + "loss": 0.7443, + "num_tokens": 15890450.0, + "step": 170 + }, + { + "epoch": 0.029185867895545316, + "grad_norm": 0.6316372649344915, + "learning_rate": 3.883939238777949e-05, + "loss": 0.6948, + "num_tokens": 15983760.0, + "step": 171 + }, + { + "epoch": 0.029356545485577742, + "grad_norm": 0.6362300091662586, + "learning_rate": 3.883256528417819e-05, + "loss": 0.7181, + "num_tokens": 16079744.0, + "step": 172 + }, + { + "epoch": 0.029527223075610173, + "grad_norm": 0.6445033196461297, + "learning_rate": 3.8825738180576895e-05, + "loss": 0.7482, + "num_tokens": 16180459.0, + "step": 173 + }, + { + "epoch": 0.029697900665642603, + "grad_norm": 0.6744404233553876, + "learning_rate": 3.8818911076975595e-05, + "loss": 0.8001, + "num_tokens": 16279669.0, + "step": 174 + }, + { + "epoch": 0.02986857825567503, + "grad_norm": 0.6763245820930079, + "learning_rate": 3.88120839733743e-05, + "loss": 0.802, + "num_tokens": 16388120.0, + "step": 175 + }, + { + "epoch": 0.03003925584570746, + "grad_norm": 0.6266783074188829, + "learning_rate": 3.8805256869773e-05, + "loss": 0.6685, + "num_tokens": 16472724.0, + "step": 176 + }, + { + "epoch": 0.030209933435739886, + "grad_norm": 0.6634402306452151, + "learning_rate": 3.87984297661717e-05, + "loss": 0.6102, + "num_tokens": 16544904.0, + "step": 177 + }, + { + "epoch": 0.030380611025772317, + "grad_norm": 0.6605043556586717, + "learning_rate": 3.879160266257041e-05, + "loss": 0.7208, + "num_tokens": 16626077.0, + "step": 178 + }, + { + "epoch": 0.030551288615804743, + "grad_norm": 0.6315022793756768, + "learning_rate": 3.878477555896911e-05, + "loss": 0.6938, + "num_tokens": 16703787.0, + "step": 179 + }, + { + "epoch": 0.030721966205837174, + "grad_norm": 0.6207006201392815, + "learning_rate": 3.877794845536782e-05, + "loss": 0.7035, + "num_tokens": 16799190.0, + "step": 180 + }, + { + "epoch": 0.030892643795869604, + "grad_norm": 0.6875204836935707, + "learning_rate": 3.877112135176651e-05, + "loss": 0.7634, + "num_tokens": 16876553.0, + "step": 181 + }, + { + "epoch": 0.03106332138590203, + "grad_norm": 0.7156268844019292, + "learning_rate": 3.876429424816522e-05, + "loss": 0.7638, + "num_tokens": 16944538.0, + "step": 182 + }, + { + "epoch": 0.03123399897593446, + "grad_norm": 0.5877463398085545, + "learning_rate": 3.875746714456392e-05, + "loss": 0.7017, + "num_tokens": 17037853.0, + "step": 183 + }, + { + "epoch": 0.03140467656596689, + "grad_norm": 0.6238952839009698, + "learning_rate": 3.8750640040962626e-05, + "loss": 0.7264, + "num_tokens": 17152355.0, + "step": 184 + }, + { + "epoch": 0.03157535415599932, + "grad_norm": 0.6858243373374306, + "learning_rate": 3.8743812937361326e-05, + "loss": 0.7088, + "num_tokens": 17234595.0, + "step": 185 + }, + { + "epoch": 0.031746031746031744, + "grad_norm": 0.6572960239702703, + "learning_rate": 3.873698583376003e-05, + "loss": 0.7093, + "num_tokens": 17317415.0, + "step": 186 + }, + { + "epoch": 0.03191670933606418, + "grad_norm": 0.6328308842872361, + "learning_rate": 3.8730158730158734e-05, + "loss": 0.7003, + "num_tokens": 17413091.0, + "step": 187 + }, + { + "epoch": 0.032087386926096605, + "grad_norm": 0.5921868197175639, + "learning_rate": 3.8723331626557434e-05, + "loss": 0.7636, + "num_tokens": 17520220.0, + "step": 188 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 0.6042507252815604, + "learning_rate": 3.871650452295614e-05, + "loss": 0.6863, + "num_tokens": 17613686.0, + "step": 189 + }, + { + "epoch": 0.03242874210616146, + "grad_norm": 0.6533037514095091, + "learning_rate": 3.870967741935484e-05, + "loss": 0.744, + "num_tokens": 17717456.0, + "step": 190 + }, + { + "epoch": 0.03259941969619389, + "grad_norm": 0.726955578017353, + "learning_rate": 3.870285031575355e-05, + "loss": 0.6877, + "num_tokens": 17783454.0, + "step": 191 + }, + { + "epoch": 0.03277009728622632, + "grad_norm": 0.7614930718095211, + "learning_rate": 3.869602321215225e-05, + "loss": 0.8384, + "num_tokens": 17913168.0, + "step": 192 + }, + { + "epoch": 0.032940774876258745, + "grad_norm": 0.6862381946151612, + "learning_rate": 3.868919610855095e-05, + "loss": 0.695, + "num_tokens": 17983070.0, + "step": 193 + }, + { + "epoch": 0.03311145246629118, + "grad_norm": 0.6842782212356128, + "learning_rate": 3.868236900494965e-05, + "loss": 0.779, + "num_tokens": 18081234.0, + "step": 194 + }, + { + "epoch": 0.033282130056323606, + "grad_norm": 0.689032641168344, + "learning_rate": 3.867554190134836e-05, + "loss": 0.6845, + "num_tokens": 18185061.0, + "step": 195 + }, + { + "epoch": 0.03345280764635603, + "grad_norm": 0.578829188316049, + "learning_rate": 3.866871479774706e-05, + "loss": 0.6813, + "num_tokens": 18302410.0, + "step": 196 + }, + { + "epoch": 0.03362348523638846, + "grad_norm": 0.7154253172582636, + "learning_rate": 3.866188769414576e-05, + "loss": 0.7297, + "num_tokens": 18406166.0, + "step": 197 + }, + { + "epoch": 0.03379416282642089, + "grad_norm": 0.6878518398805505, + "learning_rate": 3.8655060590544465e-05, + "loss": 0.7107, + "num_tokens": 18479968.0, + "step": 198 + }, + { + "epoch": 0.03396484041645332, + "grad_norm": 0.6513611133794195, + "learning_rate": 3.8648233486943165e-05, + "loss": 0.6678, + "num_tokens": 18566343.0, + "step": 199 + }, + { + "epoch": 0.034135518006485746, + "grad_norm": 0.6795852143155388, + "learning_rate": 3.864140638334187e-05, + "loss": 0.607, + "num_tokens": 18633535.0, + "step": 200 + }, + { + "epoch": 0.03430619559651818, + "grad_norm": 0.6588749909750174, + "learning_rate": 3.863457927974057e-05, + "loss": 0.7402, + "num_tokens": 18721880.0, + "step": 201 + }, + { + "epoch": 0.034476873186550606, + "grad_norm": 0.6577964593126544, + "learning_rate": 3.862775217613928e-05, + "loss": 0.7538, + "num_tokens": 18813666.0, + "step": 202 + }, + { + "epoch": 0.03464755077658303, + "grad_norm": 0.6450724558535014, + "learning_rate": 3.862092507253798e-05, + "loss": 0.6576, + "num_tokens": 18919744.0, + "step": 203 + }, + { + "epoch": 0.03481822836661547, + "grad_norm": 0.5807406133798961, + "learning_rate": 3.861409796893668e-05, + "loss": 0.6417, + "num_tokens": 19038949.0, + "step": 204 + }, + { + "epoch": 0.034988905956647894, + "grad_norm": 0.6984421285960188, + "learning_rate": 3.860727086533539e-05, + "loss": 0.8612, + "num_tokens": 19136460.0, + "step": 205 + }, + { + "epoch": 0.03515958354668032, + "grad_norm": 0.7131514515999404, + "learning_rate": 3.860044376173409e-05, + "loss": 0.7844, + "num_tokens": 19226334.0, + "step": 206 + }, + { + "epoch": 0.03533026113671275, + "grad_norm": 0.6570968467641974, + "learning_rate": 3.859361665813279e-05, + "loss": 0.658, + "num_tokens": 19307207.0, + "step": 207 + }, + { + "epoch": 0.03550093872674518, + "grad_norm": 0.7083973990928504, + "learning_rate": 3.858678955453149e-05, + "loss": 0.7186, + "num_tokens": 19377238.0, + "step": 208 + }, + { + "epoch": 0.03567161631677761, + "grad_norm": 0.6121794210125173, + "learning_rate": 3.8579962450930196e-05, + "loss": 0.7109, + "num_tokens": 19486090.0, + "step": 209 + }, + { + "epoch": 0.035842293906810034, + "grad_norm": 0.5895451855219302, + "learning_rate": 3.8573135347328897e-05, + "loss": 0.6954, + "num_tokens": 19606784.0, + "step": 210 + }, + { + "epoch": 0.03601297149684247, + "grad_norm": 0.622315316048051, + "learning_rate": 3.8566308243727604e-05, + "loss": 0.7422, + "num_tokens": 19703675.0, + "step": 211 + }, + { + "epoch": 0.036183649086874894, + "grad_norm": 0.714850231197574, + "learning_rate": 3.8559481140126304e-05, + "loss": 0.7167, + "num_tokens": 19792382.0, + "step": 212 + }, + { + "epoch": 0.03635432667690732, + "grad_norm": 0.6372197127312111, + "learning_rate": 3.855265403652501e-05, + "loss": 0.7109, + "num_tokens": 19887823.0, + "step": 213 + }, + { + "epoch": 0.03652500426693975, + "grad_norm": 0.6648650930455394, + "learning_rate": 3.854582693292371e-05, + "loss": 0.6563, + "num_tokens": 19960808.0, + "step": 214 + }, + { + "epoch": 0.03669568185697218, + "grad_norm": 0.5822412068870859, + "learning_rate": 3.853899982932241e-05, + "loss": 0.6669, + "num_tokens": 20074750.0, + "step": 215 + }, + { + "epoch": 0.03686635944700461, + "grad_norm": 0.6449844810308072, + "learning_rate": 3.853217272572112e-05, + "loss": 0.6678, + "num_tokens": 20158166.0, + "step": 216 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.7442869270444779, + "learning_rate": 3.852534562211982e-05, + "loss": 0.8017, + "num_tokens": 20247917.0, + "step": 217 + }, + { + "epoch": 0.03720771462706947, + "grad_norm": 0.6215022534031864, + "learning_rate": 3.851851851851852e-05, + "loss": 0.6049, + "num_tokens": 20332387.0, + "step": 218 + }, + { + "epoch": 0.037378392217101895, + "grad_norm": 0.6734722286985496, + "learning_rate": 3.851169141491722e-05, + "loss": 0.6136, + "num_tokens": 20401060.0, + "step": 219 + }, + { + "epoch": 0.03754906980713432, + "grad_norm": 0.6240104562725489, + "learning_rate": 3.850486431131593e-05, + "loss": 0.6873, + "num_tokens": 20493176.0, + "step": 220 + }, + { + "epoch": 0.03771974739716675, + "grad_norm": 0.6222008441982153, + "learning_rate": 3.849803720771463e-05, + "loss": 0.8038, + "num_tokens": 20600517.0, + "step": 221 + }, + { + "epoch": 0.03789042498719918, + "grad_norm": 0.8500778189172165, + "learning_rate": 3.8491210104113335e-05, + "loss": 0.7052, + "num_tokens": 20664185.0, + "step": 222 + }, + { + "epoch": 0.03806110257723161, + "grad_norm": 0.590603893198743, + "learning_rate": 3.8484383000512035e-05, + "loss": 0.6399, + "num_tokens": 20762512.0, + "step": 223 + }, + { + "epoch": 0.038231780167264036, + "grad_norm": 0.5892461323309548, + "learning_rate": 3.8477555896910736e-05, + "loss": 0.7103, + "num_tokens": 20858697.0, + "step": 224 + }, + { + "epoch": 0.03840245775729647, + "grad_norm": 0.6487089789063719, + "learning_rate": 3.847072879330944e-05, + "loss": 0.7493, + "num_tokens": 20946163.0, + "step": 225 + }, + { + "epoch": 0.038573135347328896, + "grad_norm": 0.6969749327857867, + "learning_rate": 3.846390168970814e-05, + "loss": 0.7399, + "num_tokens": 21022050.0, + "step": 226 + }, + { + "epoch": 0.03874381293736132, + "grad_norm": 0.6755166218497488, + "learning_rate": 3.845707458610685e-05, + "loss": 0.7194, + "num_tokens": 21088689.0, + "step": 227 + }, + { + "epoch": 0.03891449052739376, + "grad_norm": 0.6609915257851098, + "learning_rate": 3.845024748250555e-05, + "loss": 0.7361, + "num_tokens": 21179593.0, + "step": 228 + }, + { + "epoch": 0.03908516811742618, + "grad_norm": 0.6387367390293444, + "learning_rate": 3.844342037890426e-05, + "loss": 0.7075, + "num_tokens": 21283981.0, + "step": 229 + }, + { + "epoch": 0.03925584570745861, + "grad_norm": 0.6331325617527113, + "learning_rate": 3.843659327530296e-05, + "loss": 0.7143, + "num_tokens": 21384169.0, + "step": 230 + }, + { + "epoch": 0.03942652329749104, + "grad_norm": 0.6210236461894455, + "learning_rate": 3.842976617170166e-05, + "loss": 0.6492, + "num_tokens": 21476891.0, + "step": 231 + }, + { + "epoch": 0.03959720088752347, + "grad_norm": 0.6114145772564389, + "learning_rate": 3.842293906810036e-05, + "loss": 0.7085, + "num_tokens": 21576805.0, + "step": 232 + }, + { + "epoch": 0.0397678784775559, + "grad_norm": 0.7269263405346674, + "learning_rate": 3.8416111964499066e-05, + "loss": 0.761, + "num_tokens": 21657085.0, + "step": 233 + }, + { + "epoch": 0.039938556067588324, + "grad_norm": 0.6820360944752177, + "learning_rate": 3.8409284860897767e-05, + "loss": 0.8129, + "num_tokens": 21759879.0, + "step": 234 + }, + { + "epoch": 0.04010923365762076, + "grad_norm": 0.6088167202958221, + "learning_rate": 3.840245775729647e-05, + "loss": 0.7167, + "num_tokens": 21854206.0, + "step": 235 + }, + { + "epoch": 0.040279911247653184, + "grad_norm": 0.715618163345676, + "learning_rate": 3.8395630653695174e-05, + "loss": 0.8192, + "num_tokens": 21943842.0, + "step": 236 + }, + { + "epoch": 0.04045058883768561, + "grad_norm": 0.5716060122844864, + "learning_rate": 3.8388803550093874e-05, + "loss": 0.6185, + "num_tokens": 22031117.0, + "step": 237 + }, + { + "epoch": 0.04062126642771804, + "grad_norm": 0.5886556672114311, + "learning_rate": 3.838197644649258e-05, + "loss": 0.6897, + "num_tokens": 22136506.0, + "step": 238 + }, + { + "epoch": 0.04079194401775047, + "grad_norm": 0.620802577880916, + "learning_rate": 3.837514934289128e-05, + "loss": 0.6853, + "num_tokens": 22223545.0, + "step": 239 + }, + { + "epoch": 0.0409626216077829, + "grad_norm": 0.6952570223287441, + "learning_rate": 3.836832223928998e-05, + "loss": 0.7166, + "num_tokens": 22302727.0, + "step": 240 + }, + { + "epoch": 0.041133299197815325, + "grad_norm": 0.5964784433761416, + "learning_rate": 3.836149513568869e-05, + "loss": 0.756, + "num_tokens": 22410338.0, + "step": 241 + }, + { + "epoch": 0.04130397678784776, + "grad_norm": 0.5745001434209539, + "learning_rate": 3.835466803208739e-05, + "loss": 0.6975, + "num_tokens": 22520736.0, + "step": 242 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 0.684094979118212, + "learning_rate": 3.834784092848609e-05, + "loss": 0.7132, + "num_tokens": 22623372.0, + "step": 243 + }, + { + "epoch": 0.04164533196791261, + "grad_norm": 0.6525461038791344, + "learning_rate": 3.83410138248848e-05, + "loss": 0.6438, + "num_tokens": 22717206.0, + "step": 244 + }, + { + "epoch": 0.04181600955794504, + "grad_norm": 0.6349344841255985, + "learning_rate": 3.83341867212835e-05, + "loss": 0.7239, + "num_tokens": 22795800.0, + "step": 245 + }, + { + "epoch": 0.04198668714797747, + "grad_norm": 0.6760001681040225, + "learning_rate": 3.83273596176822e-05, + "loss": 0.7082, + "num_tokens": 22889168.0, + "step": 246 + }, + { + "epoch": 0.0421573647380099, + "grad_norm": 0.6525881334988204, + "learning_rate": 3.8320532514080905e-05, + "loss": 0.6191, + "num_tokens": 22958499.0, + "step": 247 + }, + { + "epoch": 0.042328042328042326, + "grad_norm": 0.6995991558863518, + "learning_rate": 3.8313705410479606e-05, + "loss": 0.7662, + "num_tokens": 23032072.0, + "step": 248 + }, + { + "epoch": 0.04249871991807476, + "grad_norm": 0.6222321578658456, + "learning_rate": 3.830687830687831e-05, + "loss": 0.7402, + "num_tokens": 23150064.0, + "step": 249 + }, + { + "epoch": 0.042669397508107186, + "grad_norm": 0.7883703436400517, + "learning_rate": 3.830005120327701e-05, + "loss": 0.7649, + "num_tokens": 23238540.0, + "step": 250 + }, + { + "epoch": 0.04284007509813961, + "grad_norm": 0.6108075570979011, + "learning_rate": 3.8293224099675714e-05, + "loss": 0.7755, + "num_tokens": 23349352.0, + "step": 251 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 0.5629583837106155, + "learning_rate": 3.828639699607442e-05, + "loss": 0.7611, + "num_tokens": 23487435.0, + "step": 252 + }, + { + "epoch": 0.04318143027820447, + "grad_norm": 0.7766316251312405, + "learning_rate": 3.827956989247312e-05, + "loss": 0.7377, + "num_tokens": 23555548.0, + "step": 253 + }, + { + "epoch": 0.0433521078682369, + "grad_norm": 0.688981917011655, + "learning_rate": 3.827274278887183e-05, + "loss": 0.8426, + "num_tokens": 23651358.0, + "step": 254 + }, + { + "epoch": 0.04352278545826933, + "grad_norm": 0.6003815064076454, + "learning_rate": 3.826591568527052e-05, + "loss": 0.6793, + "num_tokens": 23751596.0, + "step": 255 + }, + { + "epoch": 0.04369346304830176, + "grad_norm": 0.6082933288781598, + "learning_rate": 3.825908858166923e-05, + "loss": 0.7241, + "num_tokens": 23864315.0, + "step": 256 + }, + { + "epoch": 0.04386414063833419, + "grad_norm": 0.6035536001130845, + "learning_rate": 3.825226147806793e-05, + "loss": 0.7836, + "num_tokens": 23983063.0, + "step": 257 + }, + { + "epoch": 0.044034818228366614, + "grad_norm": 0.6964374777788611, + "learning_rate": 3.8245434374466637e-05, + "loss": 0.7099, + "num_tokens": 24074922.0, + "step": 258 + }, + { + "epoch": 0.04420549581839905, + "grad_norm": 0.6194280084542265, + "learning_rate": 3.823860727086534e-05, + "loss": 0.7005, + "num_tokens": 24178724.0, + "step": 259 + }, + { + "epoch": 0.044376173408431474, + "grad_norm": 0.6941184117633363, + "learning_rate": 3.8231780167264044e-05, + "loss": 0.6211, + "num_tokens": 24230818.0, + "step": 260 + }, + { + "epoch": 0.0445468509984639, + "grad_norm": 0.636335070564167, + "learning_rate": 3.8224953063662744e-05, + "loss": 0.7341, + "num_tokens": 24335908.0, + "step": 261 + }, + { + "epoch": 0.04471752858849633, + "grad_norm": 0.5880543296634169, + "learning_rate": 3.8218125960061445e-05, + "loss": 0.674, + "num_tokens": 24437899.0, + "step": 262 + }, + { + "epoch": 0.04488820617852876, + "grad_norm": 0.6455051656522064, + "learning_rate": 3.821129885646015e-05, + "loss": 0.7503, + "num_tokens": 24524874.0, + "step": 263 + }, + { + "epoch": 0.04505888376856119, + "grad_norm": 0.6307888036065953, + "learning_rate": 3.820447175285885e-05, + "loss": 0.6867, + "num_tokens": 24624753.0, + "step": 264 + }, + { + "epoch": 0.045229561358593615, + "grad_norm": 0.5626945875143103, + "learning_rate": 3.819764464925756e-05, + "loss": 0.7063, + "num_tokens": 24744237.0, + "step": 265 + }, + { + "epoch": 0.04540023894862605, + "grad_norm": 0.6297842303859197, + "learning_rate": 3.819081754565626e-05, + "loss": 0.6838, + "num_tokens": 24827242.0, + "step": 266 + }, + { + "epoch": 0.045570916538658475, + "grad_norm": 0.57803160300511, + "learning_rate": 3.818399044205496e-05, + "loss": 0.6907, + "num_tokens": 24923264.0, + "step": 267 + }, + { + "epoch": 0.0457415941286909, + "grad_norm": 0.57561982584915, + "learning_rate": 3.817716333845366e-05, + "loss": 0.6745, + "num_tokens": 25016783.0, + "step": 268 + }, + { + "epoch": 0.04591227171872333, + "grad_norm": 0.5898647618667103, + "learning_rate": 3.817033623485237e-05, + "loss": 0.7535, + "num_tokens": 25125599.0, + "step": 269 + }, + { + "epoch": 0.04608294930875576, + "grad_norm": 0.5617337295246779, + "learning_rate": 3.816350913125107e-05, + "loss": 0.722, + "num_tokens": 25233826.0, + "step": 270 + }, + { + "epoch": 0.04625362689878819, + "grad_norm": 0.6567101200269726, + "learning_rate": 3.815668202764977e-05, + "loss": 0.6623, + "num_tokens": 25311285.0, + "step": 271 + }, + { + "epoch": 0.046424304488820615, + "grad_norm": 0.5736134523614869, + "learning_rate": 3.8149854924048476e-05, + "loss": 0.7753, + "num_tokens": 25439181.0, + "step": 272 + }, + { + "epoch": 0.04659498207885305, + "grad_norm": 0.5735245682460768, + "learning_rate": 3.8143027820447176e-05, + "loss": 0.7254, + "num_tokens": 25555197.0, + "step": 273 + }, + { + "epoch": 0.046765659668885476, + "grad_norm": 0.5937705901912523, + "learning_rate": 3.813620071684588e-05, + "loss": 0.7542, + "num_tokens": 25684504.0, + "step": 274 + }, + { + "epoch": 0.0469363372589179, + "grad_norm": 0.610793352589693, + "learning_rate": 3.8129373613244584e-05, + "loss": 0.6858, + "num_tokens": 25773466.0, + "step": 275 + }, + { + "epoch": 0.047107014848950336, + "grad_norm": 0.6209349900879957, + "learning_rate": 3.812254650964329e-05, + "loss": 0.6717, + "num_tokens": 25859770.0, + "step": 276 + }, + { + "epoch": 0.04727769243898276, + "grad_norm": 0.618560937061168, + "learning_rate": 3.811571940604199e-05, + "loss": 0.6747, + "num_tokens": 25945581.0, + "step": 277 + }, + { + "epoch": 0.04744837002901519, + "grad_norm": 0.5893492194310449, + "learning_rate": 3.810889230244069e-05, + "loss": 0.7323, + "num_tokens": 26052467.0, + "step": 278 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.6103763777874639, + "learning_rate": 3.81020651988394e-05, + "loss": 0.6923, + "num_tokens": 26159719.0, + "step": 279 + }, + { + "epoch": 0.04778972520908005, + "grad_norm": 0.7591735604873373, + "learning_rate": 3.80952380952381e-05, + "loss": 0.7395, + "num_tokens": 26237527.0, + "step": 280 + }, + { + "epoch": 0.04796040279911248, + "grad_norm": 0.6214375798546801, + "learning_rate": 3.80884109916368e-05, + "loss": 0.6459, + "num_tokens": 26318069.0, + "step": 281 + }, + { + "epoch": 0.048131080389144903, + "grad_norm": 0.6347159899200633, + "learning_rate": 3.80815838880355e-05, + "loss": 0.6436, + "num_tokens": 26385961.0, + "step": 282 + }, + { + "epoch": 0.04830175797917734, + "grad_norm": 0.6059597111647619, + "learning_rate": 3.807475678443421e-05, + "loss": 0.676, + "num_tokens": 26480913.0, + "step": 283 + }, + { + "epoch": 0.048472435569209764, + "grad_norm": 0.5379958560062462, + "learning_rate": 3.806792968083291e-05, + "loss": 0.655, + "num_tokens": 26590390.0, + "step": 284 + }, + { + "epoch": 0.04864311315924219, + "grad_norm": 0.6063138258324356, + "learning_rate": 3.8061102577231614e-05, + "loss": 0.6646, + "num_tokens": 26684363.0, + "step": 285 + }, + { + "epoch": 0.04881379074927462, + "grad_norm": 0.6113104295984738, + "learning_rate": 3.8054275473630315e-05, + "loss": 0.6162, + "num_tokens": 26762027.0, + "step": 286 + }, + { + "epoch": 0.04898446833930705, + "grad_norm": 0.590332168720213, + "learning_rate": 3.804744837002902e-05, + "loss": 0.6839, + "num_tokens": 26848967.0, + "step": 287 + }, + { + "epoch": 0.04915514592933948, + "grad_norm": 0.6497749731867216, + "learning_rate": 3.804062126642772e-05, + "loss": 0.7286, + "num_tokens": 26932063.0, + "step": 288 + }, + { + "epoch": 0.049325823519371904, + "grad_norm": 0.6389317629167284, + "learning_rate": 3.803379416282642e-05, + "loss": 0.6919, + "num_tokens": 27020798.0, + "step": 289 + }, + { + "epoch": 0.04949650110940434, + "grad_norm": 0.6849937248453838, + "learning_rate": 3.802696705922513e-05, + "loss": 0.7606, + "num_tokens": 27098494.0, + "step": 290 + }, + { + "epoch": 0.049667178699436765, + "grad_norm": 0.5898908982039505, + "learning_rate": 3.802013995562383e-05, + "loss": 0.6489, + "num_tokens": 27208445.0, + "step": 291 + }, + { + "epoch": 0.04983785628946919, + "grad_norm": 0.6912780578303257, + "learning_rate": 3.801331285202254e-05, + "loss": 0.6651, + "num_tokens": 27284892.0, + "step": 292 + }, + { + "epoch": 0.05000853387950162, + "grad_norm": 0.6263191348893974, + "learning_rate": 3.800648574842123e-05, + "loss": 0.6532, + "num_tokens": 27380275.0, + "step": 293 + }, + { + "epoch": 0.05017921146953405, + "grad_norm": 0.6175922576658898, + "learning_rate": 3.799965864481994e-05, + "loss": 0.6085, + "num_tokens": 27467130.0, + "step": 294 + }, + { + "epoch": 0.05034988905956648, + "grad_norm": 0.628784904960062, + "learning_rate": 3.799283154121864e-05, + "loss": 0.6951, + "num_tokens": 27564913.0, + "step": 295 + }, + { + "epoch": 0.050520566649598905, + "grad_norm": 0.7112834864156032, + "learning_rate": 3.7986004437617346e-05, + "loss": 0.708, + "num_tokens": 27641317.0, + "step": 296 + }, + { + "epoch": 0.05069124423963134, + "grad_norm": 0.6080603299975411, + "learning_rate": 3.7979177334016046e-05, + "loss": 0.7182, + "num_tokens": 27749631.0, + "step": 297 + }, + { + "epoch": 0.050861921829663766, + "grad_norm": 0.6359672161568899, + "learning_rate": 3.7972350230414746e-05, + "loss": 0.7225, + "num_tokens": 27851888.0, + "step": 298 + }, + { + "epoch": 0.05103259941969619, + "grad_norm": 0.6243897079554416, + "learning_rate": 3.7965523126813454e-05, + "loss": 0.6366, + "num_tokens": 27939513.0, + "step": 299 + }, + { + "epoch": 0.051203277009728626, + "grad_norm": 0.6336824410583523, + "learning_rate": 3.7958696023212154e-05, + "loss": 0.6492, + "num_tokens": 28014087.0, + "step": 300 + }, + { + "epoch": 0.05137395459976105, + "grad_norm": 0.6836687269582991, + "learning_rate": 3.795186891961086e-05, + "loss": 0.7505, + "num_tokens": 28106455.0, + "step": 301 + }, + { + "epoch": 0.05154463218979348, + "grad_norm": 0.58897601103485, + "learning_rate": 3.794504181600956e-05, + "loss": 0.6842, + "num_tokens": 28228968.0, + "step": 302 + }, + { + "epoch": 0.051715309779825906, + "grad_norm": 0.5686816880536397, + "learning_rate": 3.793821471240827e-05, + "loss": 0.6842, + "num_tokens": 28336086.0, + "step": 303 + }, + { + "epoch": 0.05188598736985834, + "grad_norm": 0.6351971025260795, + "learning_rate": 3.793138760880697e-05, + "loss": 0.6816, + "num_tokens": 28418632.0, + "step": 304 + }, + { + "epoch": 0.05205666495989077, + "grad_norm": 0.6692197003414279, + "learning_rate": 3.792456050520567e-05, + "loss": 0.7205, + "num_tokens": 28525830.0, + "step": 305 + }, + { + "epoch": 0.05222734254992319, + "grad_norm": 0.5915651352988014, + "learning_rate": 3.791773340160437e-05, + "loss": 0.6356, + "num_tokens": 28632684.0, + "step": 306 + }, + { + "epoch": 0.05239802013995563, + "grad_norm": 0.6430012226562288, + "learning_rate": 3.791090629800308e-05, + "loss": 0.6208, + "num_tokens": 28722531.0, + "step": 307 + }, + { + "epoch": 0.052568697729988054, + "grad_norm": 0.6148858177377022, + "learning_rate": 3.790407919440178e-05, + "loss": 0.7157, + "num_tokens": 28816925.0, + "step": 308 + }, + { + "epoch": 0.05273937532002048, + "grad_norm": 0.5532109935484513, + "learning_rate": 3.789725209080048e-05, + "loss": 0.6494, + "num_tokens": 28939697.0, + "step": 309 + }, + { + "epoch": 0.05291005291005291, + "grad_norm": 0.6252304951482661, + "learning_rate": 3.7890424987199185e-05, + "loss": 0.8028, + "num_tokens": 29043697.0, + "step": 310 + }, + { + "epoch": 0.05308073050008534, + "grad_norm": 0.6396855593769076, + "learning_rate": 3.7883597883597885e-05, + "loss": 0.6816, + "num_tokens": 29133084.0, + "step": 311 + }, + { + "epoch": 0.05325140809011777, + "grad_norm": 0.5486214048846672, + "learning_rate": 3.787677077999659e-05, + "loss": 0.6813, + "num_tokens": 29241495.0, + "step": 312 + }, + { + "epoch": 0.053422085680150194, + "grad_norm": 0.6346625740127403, + "learning_rate": 3.786994367639529e-05, + "loss": 0.6512, + "num_tokens": 29327619.0, + "step": 313 + }, + { + "epoch": 0.05359276327018263, + "grad_norm": 0.6518780783411734, + "learning_rate": 3.786311657279399e-05, + "loss": 0.8498, + "num_tokens": 29443552.0, + "step": 314 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 0.6771060215630701, + "learning_rate": 3.78562894691927e-05, + "loss": 0.6308, + "num_tokens": 29517036.0, + "step": 315 + }, + { + "epoch": 0.05393411845024748, + "grad_norm": 0.655692053255562, + "learning_rate": 3.78494623655914e-05, + "loss": 0.7358, + "num_tokens": 29591582.0, + "step": 316 + }, + { + "epoch": 0.05410479604027991, + "grad_norm": 0.6529142599954677, + "learning_rate": 3.78426352619901e-05, + "loss": 0.7736, + "num_tokens": 29702915.0, + "step": 317 + }, + { + "epoch": 0.05427547363031234, + "grad_norm": 0.5849937177045179, + "learning_rate": 3.783580815838881e-05, + "loss": 0.6948, + "num_tokens": 29803767.0, + "step": 318 + }, + { + "epoch": 0.05444615122034477, + "grad_norm": 0.6470684234957662, + "learning_rate": 3.782898105478751e-05, + "loss": 0.6585, + "num_tokens": 29880489.0, + "step": 319 + }, + { + "epoch": 0.054616828810377195, + "grad_norm": 0.5871947799779641, + "learning_rate": 3.782215395118621e-05, + "loss": 0.6613, + "num_tokens": 29966807.0, + "step": 320 + }, + { + "epoch": 0.05478750640040963, + "grad_norm": 0.6748519426257876, + "learning_rate": 3.7815326847584916e-05, + "loss": 0.6879, + "num_tokens": 30034718.0, + "step": 321 + }, + { + "epoch": 0.054958183990442055, + "grad_norm": 0.5726725282393017, + "learning_rate": 3.7808499743983616e-05, + "loss": 0.7544, + "num_tokens": 30160195.0, + "step": 322 + }, + { + "epoch": 0.05512886158047448, + "grad_norm": 0.6397946190359531, + "learning_rate": 3.7801672640382324e-05, + "loss": 0.7291, + "num_tokens": 30262680.0, + "step": 323 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 0.6130145787681108, + "learning_rate": 3.7794845536781024e-05, + "loss": 0.6148, + "num_tokens": 30354916.0, + "step": 324 + }, + { + "epoch": 0.05547021676053934, + "grad_norm": 0.5974031396073681, + "learning_rate": 3.7788018433179724e-05, + "loss": 0.6979, + "num_tokens": 30448414.0, + "step": 325 + }, + { + "epoch": 0.05564089435057177, + "grad_norm": 0.5976981697259844, + "learning_rate": 3.778119132957843e-05, + "loss": 0.6524, + "num_tokens": 30526628.0, + "step": 326 + }, + { + "epoch": 0.055811571940604196, + "grad_norm": 0.6304826988991508, + "learning_rate": 3.777436422597713e-05, + "loss": 0.6256, + "num_tokens": 30624545.0, + "step": 327 + }, + { + "epoch": 0.05598224953063663, + "grad_norm": 0.6813711287723402, + "learning_rate": 3.776753712237584e-05, + "loss": 0.7305, + "num_tokens": 30716230.0, + "step": 328 + }, + { + "epoch": 0.056152927120669056, + "grad_norm": 0.563907626488399, + "learning_rate": 3.776071001877454e-05, + "loss": 0.642, + "num_tokens": 30816378.0, + "step": 329 + }, + { + "epoch": 0.05632360471070148, + "grad_norm": 0.6031698672329959, + "learning_rate": 3.775388291517324e-05, + "loss": 0.6443, + "num_tokens": 30906312.0, + "step": 330 + }, + { + "epoch": 0.05649428230073392, + "grad_norm": 0.5755762702383291, + "learning_rate": 3.774705581157194e-05, + "loss": 0.6443, + "num_tokens": 31014384.0, + "step": 331 + }, + { + "epoch": 0.056664959890766343, + "grad_norm": 0.6595818766113125, + "learning_rate": 3.774022870797065e-05, + "loss": 0.5974, + "num_tokens": 31081340.0, + "step": 332 + }, + { + "epoch": 0.05683563748079877, + "grad_norm": 0.5897464323445675, + "learning_rate": 3.773340160436935e-05, + "loss": 0.7197, + "num_tokens": 31194102.0, + "step": 333 + }, + { + "epoch": 0.0570063150708312, + "grad_norm": 0.7092730698637518, + "learning_rate": 3.7726574500768055e-05, + "loss": 0.7674, + "num_tokens": 31273069.0, + "step": 334 + }, + { + "epoch": 0.05717699266086363, + "grad_norm": 0.5633646092664663, + "learning_rate": 3.7719747397166755e-05, + "loss": 0.7509, + "num_tokens": 31395089.0, + "step": 335 + }, + { + "epoch": 0.05734767025089606, + "grad_norm": 0.6596642203984346, + "learning_rate": 3.7712920293565456e-05, + "loss": 0.7441, + "num_tokens": 31471097.0, + "step": 336 + }, + { + "epoch": 0.057518347840928484, + "grad_norm": 0.618376222573911, + "learning_rate": 3.770609318996416e-05, + "loss": 0.7575, + "num_tokens": 31578871.0, + "step": 337 + }, + { + "epoch": 0.05768902543096092, + "grad_norm": 0.6655296451467967, + "learning_rate": 3.769926608636286e-05, + "loss": 0.7395, + "num_tokens": 31663154.0, + "step": 338 + }, + { + "epoch": 0.057859703020993344, + "grad_norm": 0.5654197679659851, + "learning_rate": 3.769243898276157e-05, + "loss": 0.6801, + "num_tokens": 31752713.0, + "step": 339 + }, + { + "epoch": 0.05803038061102577, + "grad_norm": 0.557726729457756, + "learning_rate": 3.768561187916027e-05, + "loss": 0.7357, + "num_tokens": 31870373.0, + "step": 340 + }, + { + "epoch": 0.0582010582010582, + "grad_norm": 0.6511869748115202, + "learning_rate": 3.767878477555897e-05, + "loss": 0.6266, + "num_tokens": 31942321.0, + "step": 341 + }, + { + "epoch": 0.05837173579109063, + "grad_norm": 0.6126139390912213, + "learning_rate": 3.767195767195767e-05, + "loss": 0.6416, + "num_tokens": 32020304.0, + "step": 342 + }, + { + "epoch": 0.05854241338112306, + "grad_norm": 0.6207906938465051, + "learning_rate": 3.766513056835638e-05, + "loss": 0.8902, + "num_tokens": 32147859.0, + "step": 343 + }, + { + "epoch": 0.058713090971155485, + "grad_norm": 0.5839115586986341, + "learning_rate": 3.765830346475508e-05, + "loss": 0.6709, + "num_tokens": 32255454.0, + "step": 344 + }, + { + "epoch": 0.05888376856118792, + "grad_norm": 0.6117749194430162, + "learning_rate": 3.765147636115378e-05, + "loss": 0.6951, + "num_tokens": 32347959.0, + "step": 345 + }, + { + "epoch": 0.059054446151220345, + "grad_norm": 0.7899599700946952, + "learning_rate": 3.7644649257552486e-05, + "loss": 0.7121, + "num_tokens": 32434993.0, + "step": 346 + }, + { + "epoch": 0.05922512374125277, + "grad_norm": 0.696110797268787, + "learning_rate": 3.763782215395119e-05, + "loss": 0.7693, + "num_tokens": 32515984.0, + "step": 347 + }, + { + "epoch": 0.059395801331285206, + "grad_norm": 0.6020602800551419, + "learning_rate": 3.7630995050349894e-05, + "loss": 0.6211, + "num_tokens": 32601492.0, + "step": 348 + }, + { + "epoch": 0.05956647892131763, + "grad_norm": 0.6354827029858855, + "learning_rate": 3.7624167946748594e-05, + "loss": 0.6755, + "num_tokens": 32679240.0, + "step": 349 + }, + { + "epoch": 0.05973715651135006, + "grad_norm": 0.630934904500074, + "learning_rate": 3.76173408431473e-05, + "loss": 0.646, + "num_tokens": 32761575.0, + "step": 350 + }, + { + "epoch": 0.059907834101382486, + "grad_norm": 0.6980156017972984, + "learning_rate": 3.7610513739546e-05, + "loss": 0.8192, + "num_tokens": 32869168.0, + "step": 351 + }, + { + "epoch": 0.06007851169141492, + "grad_norm": 0.5432163540470442, + "learning_rate": 3.76036866359447e-05, + "loss": 0.6837, + "num_tokens": 32995115.0, + "step": 352 + }, + { + "epoch": 0.060249189281447346, + "grad_norm": 0.6253340539995649, + "learning_rate": 3.759685953234341e-05, + "loss": 0.6686, + "num_tokens": 33071548.0, + "step": 353 + }, + { + "epoch": 0.06041986687147977, + "grad_norm": 0.5661238104760647, + "learning_rate": 3.759003242874211e-05, + "loss": 0.6937, + "num_tokens": 33171193.0, + "step": 354 + }, + { + "epoch": 0.06059054446151221, + "grad_norm": 0.7015252786346565, + "learning_rate": 3.758320532514081e-05, + "loss": 0.7226, + "num_tokens": 33262750.0, + "step": 355 + }, + { + "epoch": 0.06076122205154463, + "grad_norm": 0.6352343880154788, + "learning_rate": 3.757637822153951e-05, + "loss": 0.7279, + "num_tokens": 33364584.0, + "step": 356 + }, + { + "epoch": 0.06093189964157706, + "grad_norm": 0.5681604111780297, + "learning_rate": 3.756955111793822e-05, + "loss": 0.648, + "num_tokens": 33458233.0, + "step": 357 + }, + { + "epoch": 0.06110257723160949, + "grad_norm": 0.6941505043963863, + "learning_rate": 3.756272401433692e-05, + "loss": 0.6391, + "num_tokens": 33517862.0, + "step": 358 + }, + { + "epoch": 0.06127325482164192, + "grad_norm": 0.593842649567541, + "learning_rate": 3.7555896910735625e-05, + "loss": 0.6306, + "num_tokens": 33601436.0, + "step": 359 + }, + { + "epoch": 0.06144393241167435, + "grad_norm": 0.6513768917133473, + "learning_rate": 3.7549069807134325e-05, + "loss": 0.7172, + "num_tokens": 33721288.0, + "step": 360 + }, + { + "epoch": 0.061614610001706774, + "grad_norm": 0.6448241351864141, + "learning_rate": 3.754224270353303e-05, + "loss": 0.6901, + "num_tokens": 33808335.0, + "step": 361 + }, + { + "epoch": 0.06178528759173921, + "grad_norm": 0.6314749265241472, + "learning_rate": 3.753541559993173e-05, + "loss": 0.729, + "num_tokens": 33896170.0, + "step": 362 + }, + { + "epoch": 0.061955965181771634, + "grad_norm": 0.6249680159870818, + "learning_rate": 3.7528588496330433e-05, + "loss": 0.6657, + "num_tokens": 33990712.0, + "step": 363 + }, + { + "epoch": 0.06212664277180406, + "grad_norm": 0.7819666474285782, + "learning_rate": 3.752176139272914e-05, + "loss": 0.7161, + "num_tokens": 34084129.0, + "step": 364 + }, + { + "epoch": 0.06229732036183649, + "grad_norm": 0.623136573424545, + "learning_rate": 3.751493428912784e-05, + "loss": 0.7713, + "num_tokens": 34182650.0, + "step": 365 + }, + { + "epoch": 0.06246799795186892, + "grad_norm": 0.6212610483902984, + "learning_rate": 3.750810718552655e-05, + "loss": 0.7185, + "num_tokens": 34273681.0, + "step": 366 + }, + { + "epoch": 0.06263867554190135, + "grad_norm": 0.6213927455182943, + "learning_rate": 3.750128008192524e-05, + "loss": 0.6249, + "num_tokens": 34372247.0, + "step": 367 + }, + { + "epoch": 0.06280935313193378, + "grad_norm": 0.6930180718157347, + "learning_rate": 3.749445297832395e-05, + "loss": 0.6806, + "num_tokens": 34436953.0, + "step": 368 + }, + { + "epoch": 0.0629800307219662, + "grad_norm": 0.596709588123384, + "learning_rate": 3.748762587472265e-05, + "loss": 0.7123, + "num_tokens": 34534992.0, + "step": 369 + }, + { + "epoch": 0.06315070831199864, + "grad_norm": 0.5996450257006977, + "learning_rate": 3.7480798771121356e-05, + "loss": 0.6974, + "num_tokens": 34636515.0, + "step": 370 + }, + { + "epoch": 0.06332138590203107, + "grad_norm": 0.6658216095479699, + "learning_rate": 3.747397166752006e-05, + "loss": 0.6464, + "num_tokens": 34724459.0, + "step": 371 + }, + { + "epoch": 0.06349206349206349, + "grad_norm": 0.5763381602786025, + "learning_rate": 3.746714456391876e-05, + "loss": 0.6996, + "num_tokens": 34819163.0, + "step": 372 + }, + { + "epoch": 0.06366274108209592, + "grad_norm": 0.6051052836228817, + "learning_rate": 3.7460317460317464e-05, + "loss": 0.7139, + "num_tokens": 34908285.0, + "step": 373 + }, + { + "epoch": 0.06383341867212836, + "grad_norm": 0.6835901793665402, + "learning_rate": 3.7453490356716165e-05, + "loss": 0.6645, + "num_tokens": 34969884.0, + "step": 374 + }, + { + "epoch": 0.06400409626216078, + "grad_norm": 0.6229655940402707, + "learning_rate": 3.744666325311487e-05, + "loss": 0.709, + "num_tokens": 35060653.0, + "step": 375 + }, + { + "epoch": 0.06417477385219321, + "grad_norm": 0.6953826673855853, + "learning_rate": 3.743983614951357e-05, + "loss": 0.7187, + "num_tokens": 35158562.0, + "step": 376 + }, + { + "epoch": 0.06434545144222563, + "grad_norm": 0.6494139911621649, + "learning_rate": 3.743300904591228e-05, + "loss": 0.6973, + "num_tokens": 35262656.0, + "step": 377 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 0.8114792870367081, + "learning_rate": 3.742618194231098e-05, + "loss": 0.7079, + "num_tokens": 35347985.0, + "step": 378 + }, + { + "epoch": 0.0646868066222905, + "grad_norm": 0.6485297541376812, + "learning_rate": 3.741935483870968e-05, + "loss": 0.6747, + "num_tokens": 35456474.0, + "step": 379 + }, + { + "epoch": 0.06485748421232292, + "grad_norm": 0.8323778201312184, + "learning_rate": 3.741252773510838e-05, + "loss": 0.6492, + "num_tokens": 35525737.0, + "step": 380 + }, + { + "epoch": 0.06502816180235535, + "grad_norm": 0.6501435260781027, + "learning_rate": 3.740570063150709e-05, + "loss": 0.7383, + "num_tokens": 35614137.0, + "step": 381 + }, + { + "epoch": 0.06519883939238778, + "grad_norm": 0.6287688700486896, + "learning_rate": 3.739887352790579e-05, + "loss": 0.6872, + "num_tokens": 35704315.0, + "step": 382 + }, + { + "epoch": 0.0653695169824202, + "grad_norm": 0.6275446535269974, + "learning_rate": 3.739204642430449e-05, + "loss": 0.7269, + "num_tokens": 35805797.0, + "step": 383 + }, + { + "epoch": 0.06554019457245264, + "grad_norm": 0.6502394930113807, + "learning_rate": 3.7385219320703195e-05, + "loss": 0.7353, + "num_tokens": 35881059.0, + "step": 384 + }, + { + "epoch": 0.06571087216248507, + "grad_norm": 0.6145683864620181, + "learning_rate": 3.7378392217101896e-05, + "loss": 0.6867, + "num_tokens": 35978961.0, + "step": 385 + }, + { + "epoch": 0.06588154975251749, + "grad_norm": 0.6445877564143657, + "learning_rate": 3.73715651135006e-05, + "loss": 0.7501, + "num_tokens": 36089043.0, + "step": 386 + }, + { + "epoch": 0.06605222734254992, + "grad_norm": 0.6063159022792604, + "learning_rate": 3.73647380098993e-05, + "loss": 0.7085, + "num_tokens": 36176176.0, + "step": 387 + }, + { + "epoch": 0.06622290493258236, + "grad_norm": 0.6727770573549958, + "learning_rate": 3.735791090629801e-05, + "loss": 0.8711, + "num_tokens": 36277250.0, + "step": 388 + }, + { + "epoch": 0.06639358252261478, + "grad_norm": 0.623310534898411, + "learning_rate": 3.735108380269671e-05, + "loss": 0.7457, + "num_tokens": 36373957.0, + "step": 389 + }, + { + "epoch": 0.06656426011264721, + "grad_norm": 0.5806434325474403, + "learning_rate": 3.734425669909541e-05, + "loss": 0.6898, + "num_tokens": 36490637.0, + "step": 390 + }, + { + "epoch": 0.06673493770267964, + "grad_norm": 0.6405041237951864, + "learning_rate": 3.733742959549412e-05, + "loss": 0.7375, + "num_tokens": 36580948.0, + "step": 391 + }, + { + "epoch": 0.06690561529271206, + "grad_norm": 0.5741425983530408, + "learning_rate": 3.733060249189282e-05, + "loss": 0.6373, + "num_tokens": 36675286.0, + "step": 392 + }, + { + "epoch": 0.0670762928827445, + "grad_norm": 0.594371912247555, + "learning_rate": 3.732377538829152e-05, + "loss": 0.6641, + "num_tokens": 36761001.0, + "step": 393 + }, + { + "epoch": 0.06724697047277692, + "grad_norm": 0.5391937711030764, + "learning_rate": 3.731694828469022e-05, + "loss": 0.6588, + "num_tokens": 36866705.0, + "step": 394 + }, + { + "epoch": 0.06741764806280935, + "grad_norm": 0.5911466604120026, + "learning_rate": 3.731012118108893e-05, + "loss": 0.6659, + "num_tokens": 36955975.0, + "step": 395 + }, + { + "epoch": 0.06758832565284179, + "grad_norm": 0.6671953460804453, + "learning_rate": 3.730329407748763e-05, + "loss": 0.6408, + "num_tokens": 37018213.0, + "step": 396 + }, + { + "epoch": 0.0677590032428742, + "grad_norm": 0.6097411852759803, + "learning_rate": 3.7296466973886334e-05, + "loss": 0.677, + "num_tokens": 37102646.0, + "step": 397 + }, + { + "epoch": 0.06792968083290664, + "grad_norm": 0.6489379649230457, + "learning_rate": 3.7289639870285035e-05, + "loss": 0.7303, + "num_tokens": 37188453.0, + "step": 398 + }, + { + "epoch": 0.06810035842293907, + "grad_norm": 0.6608131972519038, + "learning_rate": 3.7282812766683735e-05, + "loss": 0.733, + "num_tokens": 37270546.0, + "step": 399 + }, + { + "epoch": 0.06827103601297149, + "grad_norm": 0.529758043822333, + "learning_rate": 3.727598566308244e-05, + "loss": 0.6306, + "num_tokens": 37384124.0, + "step": 400 + }, + { + "epoch": 0.06844171360300393, + "grad_norm": 0.6109197802484728, + "learning_rate": 3.726915855948114e-05, + "loss": 0.6559, + "num_tokens": 37458924.0, + "step": 401 + }, + { + "epoch": 0.06861239119303636, + "grad_norm": 0.5896082109101846, + "learning_rate": 3.726233145587985e-05, + "loss": 0.7135, + "num_tokens": 37550494.0, + "step": 402 + }, + { + "epoch": 0.06878306878306878, + "grad_norm": 0.5910216988125868, + "learning_rate": 3.725550435227855e-05, + "loss": 0.6942, + "num_tokens": 37654010.0, + "step": 403 + }, + { + "epoch": 0.06895374637310121, + "grad_norm": 0.567421516260058, + "learning_rate": 3.724867724867725e-05, + "loss": 0.7834, + "num_tokens": 37772866.0, + "step": 404 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 0.5818669821307247, + "learning_rate": 3.724185014507595e-05, + "loss": 0.716, + "num_tokens": 37862071.0, + "step": 405 + }, + { + "epoch": 0.06929510155316607, + "grad_norm": 0.7090259986050121, + "learning_rate": 3.723502304147466e-05, + "loss": 0.7839, + "num_tokens": 37955278.0, + "step": 406 + }, + { + "epoch": 0.0694657791431985, + "grad_norm": 0.6272499259135247, + "learning_rate": 3.722819593787336e-05, + "loss": 0.7571, + "num_tokens": 38041561.0, + "step": 407 + }, + { + "epoch": 0.06963645673323093, + "grad_norm": 0.5797182356151931, + "learning_rate": 3.7221368834272065e-05, + "loss": 0.6863, + "num_tokens": 38144188.0, + "step": 408 + }, + { + "epoch": 0.06980713432326335, + "grad_norm": 0.5637511954820482, + "learning_rate": 3.7214541730670766e-05, + "loss": 0.6038, + "num_tokens": 38227874.0, + "step": 409 + }, + { + "epoch": 0.06997781191329579, + "grad_norm": 0.577565541492488, + "learning_rate": 3.7207714627069466e-05, + "loss": 0.6494, + "num_tokens": 38324140.0, + "step": 410 + }, + { + "epoch": 0.0701484895033282, + "grad_norm": 0.6986802549714193, + "learning_rate": 3.720088752346817e-05, + "loss": 0.7747, + "num_tokens": 38387616.0, + "step": 411 + }, + { + "epoch": 0.07031916709336064, + "grad_norm": 0.6095864465584332, + "learning_rate": 3.7194060419866874e-05, + "loss": 0.6442, + "num_tokens": 38481855.0, + "step": 412 + }, + { + "epoch": 0.07048984468339307, + "grad_norm": 0.5755819955879264, + "learning_rate": 3.718723331626558e-05, + "loss": 0.7263, + "num_tokens": 38582235.0, + "step": 413 + }, + { + "epoch": 0.0706605222734255, + "grad_norm": 0.6223497068927212, + "learning_rate": 3.718040621266428e-05, + "loss": 0.6992, + "num_tokens": 38670560.0, + "step": 414 + }, + { + "epoch": 0.07083119986345793, + "grad_norm": 0.6233453697396772, + "learning_rate": 3.717357910906298e-05, + "loss": 0.6405, + "num_tokens": 38754610.0, + "step": 415 + }, + { + "epoch": 0.07100187745349036, + "grad_norm": 0.6934015078700733, + "learning_rate": 3.716675200546168e-05, + "loss": 0.6963, + "num_tokens": 38821997.0, + "step": 416 + }, + { + "epoch": 0.07117255504352278, + "grad_norm": 0.6209536790566113, + "learning_rate": 3.715992490186039e-05, + "loss": 0.7221, + "num_tokens": 38908817.0, + "step": 417 + }, + { + "epoch": 0.07134323263355521, + "grad_norm": 0.5524276254441424, + "learning_rate": 3.715309779825909e-05, + "loss": 0.7409, + "num_tokens": 39008987.0, + "step": 418 + }, + { + "epoch": 0.07151391022358765, + "grad_norm": 0.5617908344440492, + "learning_rate": 3.71462706946578e-05, + "loss": 0.7365, + "num_tokens": 39139396.0, + "step": 419 + }, + { + "epoch": 0.07168458781362007, + "grad_norm": 0.6392847987935839, + "learning_rate": 3.71394435910565e-05, + "loss": 0.6621, + "num_tokens": 39235560.0, + "step": 420 + }, + { + "epoch": 0.0718552654036525, + "grad_norm": 0.5398960958143693, + "learning_rate": 3.71326164874552e-05, + "loss": 0.6687, + "num_tokens": 39337106.0, + "step": 421 + }, + { + "epoch": 0.07202594299368494, + "grad_norm": 0.6077077542844802, + "learning_rate": 3.7125789383853905e-05, + "loss": 0.7235, + "num_tokens": 39429462.0, + "step": 422 + }, + { + "epoch": 0.07219662058371736, + "grad_norm": 0.642831709156473, + "learning_rate": 3.7118962280252605e-05, + "loss": 0.7353, + "num_tokens": 39510402.0, + "step": 423 + }, + { + "epoch": 0.07236729817374979, + "grad_norm": 0.6132545983772113, + "learning_rate": 3.711213517665131e-05, + "loss": 0.6588, + "num_tokens": 39612590.0, + "step": 424 + }, + { + "epoch": 0.07253797576378221, + "grad_norm": 0.5916884491996948, + "learning_rate": 3.710530807305001e-05, + "loss": 0.6029, + "num_tokens": 39696752.0, + "step": 425 + }, + { + "epoch": 0.07270865335381464, + "grad_norm": 0.5669085638432203, + "learning_rate": 3.709848096944871e-05, + "loss": 0.6751, + "num_tokens": 39809029.0, + "step": 426 + }, + { + "epoch": 0.07287933094384708, + "grad_norm": 0.6276322795085529, + "learning_rate": 3.709165386584742e-05, + "loss": 0.7275, + "num_tokens": 39902041.0, + "step": 427 + }, + { + "epoch": 0.0730500085338795, + "grad_norm": 0.7684142821181216, + "learning_rate": 3.708482676224612e-05, + "loss": 0.6753, + "num_tokens": 39986122.0, + "step": 428 + }, + { + "epoch": 0.07322068612391193, + "grad_norm": 0.6416831587743044, + "learning_rate": 3.707799965864482e-05, + "loss": 0.677, + "num_tokens": 40082636.0, + "step": 429 + }, + { + "epoch": 0.07339136371394436, + "grad_norm": 0.6019502529843771, + "learning_rate": 3.707117255504352e-05, + "loss": 0.7416, + "num_tokens": 40181734.0, + "step": 430 + }, + { + "epoch": 0.07356204130397678, + "grad_norm": 0.6534992919623296, + "learning_rate": 3.706434545144223e-05, + "loss": 0.6258, + "num_tokens": 40254613.0, + "step": 431 + }, + { + "epoch": 0.07373271889400922, + "grad_norm": 0.6128214905076879, + "learning_rate": 3.705751834784093e-05, + "loss": 0.7237, + "num_tokens": 40345002.0, + "step": 432 + }, + { + "epoch": 0.07390339648404165, + "grad_norm": 0.654110835969018, + "learning_rate": 3.7050691244239636e-05, + "loss": 0.6948, + "num_tokens": 40419207.0, + "step": 433 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.587687684508063, + "learning_rate": 3.7043864140638336e-05, + "loss": 0.7702, + "num_tokens": 40527451.0, + "step": 434 + }, + { + "epoch": 0.0742447516641065, + "grad_norm": 0.5962117540564699, + "learning_rate": 3.703703703703704e-05, + "loss": 0.66, + "num_tokens": 40614978.0, + "step": 435 + }, + { + "epoch": 0.07441542925413894, + "grad_norm": 0.5857044251877943, + "learning_rate": 3.7030209933435744e-05, + "loss": 0.7406, + "num_tokens": 40723224.0, + "step": 436 + }, + { + "epoch": 0.07458610684417136, + "grad_norm": 0.5813161379638386, + "learning_rate": 3.7023382829834444e-05, + "loss": 0.7878, + "num_tokens": 40843058.0, + "step": 437 + }, + { + "epoch": 0.07475678443420379, + "grad_norm": 0.6404650823857034, + "learning_rate": 3.701655572623315e-05, + "loss": 0.6928, + "num_tokens": 40942724.0, + "step": 438 + }, + { + "epoch": 0.07492746202423622, + "grad_norm": 0.5302747980206566, + "learning_rate": 3.700972862263185e-05, + "loss": 0.671, + "num_tokens": 41065701.0, + "step": 439 + }, + { + "epoch": 0.07509813961426864, + "grad_norm": 0.5717265825404524, + "learning_rate": 3.700290151903056e-05, + "loss": 0.7167, + "num_tokens": 41188117.0, + "step": 440 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 0.6321329034107778, + "learning_rate": 3.699607441542925e-05, + "loss": 0.6929, + "num_tokens": 41292957.0, + "step": 441 + }, + { + "epoch": 0.0754394947943335, + "grad_norm": 0.6559715204135413, + "learning_rate": 3.698924731182796e-05, + "loss": 0.6586, + "num_tokens": 41354391.0, + "step": 442 + }, + { + "epoch": 0.07561017238436593, + "grad_norm": 0.5906856574426936, + "learning_rate": 3.698242020822666e-05, + "loss": 0.7089, + "num_tokens": 41464498.0, + "step": 443 + }, + { + "epoch": 0.07578084997439836, + "grad_norm": 0.5557546275010231, + "learning_rate": 3.697559310462537e-05, + "loss": 0.6359, + "num_tokens": 41557700.0, + "step": 444 + }, + { + "epoch": 0.07595152756443078, + "grad_norm": 0.6222419947426912, + "learning_rate": 3.696876600102407e-05, + "loss": 0.6926, + "num_tokens": 41631938.0, + "step": 445 + }, + { + "epoch": 0.07612220515446322, + "grad_norm": 0.6253324108919642, + "learning_rate": 3.696193889742277e-05, + "loss": 0.7157, + "num_tokens": 41714289.0, + "step": 446 + }, + { + "epoch": 0.07629288274449565, + "grad_norm": 0.5581348306417503, + "learning_rate": 3.6955111793821475e-05, + "loss": 0.5908, + "num_tokens": 41796100.0, + "step": 447 + }, + { + "epoch": 0.07646356033452807, + "grad_norm": 0.6262323408886254, + "learning_rate": 3.6948284690220175e-05, + "loss": 0.711, + "num_tokens": 41892218.0, + "step": 448 + }, + { + "epoch": 0.0766342379245605, + "grad_norm": 0.5608123760180548, + "learning_rate": 3.694145758661888e-05, + "loss": 0.6624, + "num_tokens": 41994455.0, + "step": 449 + }, + { + "epoch": 0.07680491551459294, + "grad_norm": 0.5716962388896752, + "learning_rate": 3.693463048301758e-05, + "loss": 0.6826, + "num_tokens": 42083429.0, + "step": 450 + }, + { + "epoch": 0.07697559310462536, + "grad_norm": 0.700045660427373, + "learning_rate": 3.692780337941629e-05, + "loss": 0.5999, + "num_tokens": 42141344.0, + "step": 451 + }, + { + "epoch": 0.07714627069465779, + "grad_norm": 0.6418561966355081, + "learning_rate": 3.692097627581499e-05, + "loss": 0.807, + "num_tokens": 42244546.0, + "step": 452 + }, + { + "epoch": 0.07731694828469023, + "grad_norm": 0.5726563609384213, + "learning_rate": 3.691414917221369e-05, + "loss": 0.718, + "num_tokens": 42353015.0, + "step": 453 + }, + { + "epoch": 0.07748762587472265, + "grad_norm": 0.6131113670666067, + "learning_rate": 3.690732206861239e-05, + "loss": 0.6432, + "num_tokens": 42434951.0, + "step": 454 + }, + { + "epoch": 0.07765830346475508, + "grad_norm": 0.5978744216928057, + "learning_rate": 3.69004949650111e-05, + "loss": 0.6366, + "num_tokens": 42542422.0, + "step": 455 + }, + { + "epoch": 0.07782898105478751, + "grad_norm": 0.5935002375210909, + "learning_rate": 3.68936678614098e-05, + "loss": 0.6419, + "num_tokens": 42625018.0, + "step": 456 + }, + { + "epoch": 0.07799965864481993, + "grad_norm": 0.6694130666769859, + "learning_rate": 3.68868407578085e-05, + "loss": 0.7177, + "num_tokens": 42724024.0, + "step": 457 + }, + { + "epoch": 0.07817033623485237, + "grad_norm": 0.8939360181519956, + "learning_rate": 3.6880013654207206e-05, + "loss": 0.7087, + "num_tokens": 42816602.0, + "step": 458 + }, + { + "epoch": 0.07834101382488479, + "grad_norm": 0.603140795800803, + "learning_rate": 3.6873186550605907e-05, + "loss": 0.7126, + "num_tokens": 42908318.0, + "step": 459 + }, + { + "epoch": 0.07851169141491722, + "grad_norm": 0.6343385109275218, + "learning_rate": 3.6866359447004614e-05, + "loss": 0.8598, + "num_tokens": 43010891.0, + "step": 460 + }, + { + "epoch": 0.07868236900494965, + "grad_norm": 0.5514810574769844, + "learning_rate": 3.6859532343403314e-05, + "loss": 0.6591, + "num_tokens": 43101520.0, + "step": 461 + }, + { + "epoch": 0.07885304659498207, + "grad_norm": 0.6466220317601231, + "learning_rate": 3.685270523980202e-05, + "loss": 0.7448, + "num_tokens": 43184538.0, + "step": 462 + }, + { + "epoch": 0.07902372418501451, + "grad_norm": 0.6171968336720612, + "learning_rate": 3.684587813620072e-05, + "loss": 0.6687, + "num_tokens": 43255678.0, + "step": 463 + }, + { + "epoch": 0.07919440177504694, + "grad_norm": 0.9893091065104817, + "learning_rate": 3.683905103259942e-05, + "loss": 0.7077, + "num_tokens": 43341549.0, + "step": 464 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 0.6240041305699835, + "learning_rate": 3.683222392899813e-05, + "loss": 0.7945, + "num_tokens": 43449877.0, + "step": 465 + }, + { + "epoch": 0.0795357569551118, + "grad_norm": 0.5888619822231417, + "learning_rate": 3.682539682539683e-05, + "loss": 0.6639, + "num_tokens": 43557378.0, + "step": 466 + }, + { + "epoch": 0.07970643454514423, + "grad_norm": 0.5817277037957258, + "learning_rate": 3.681856972179553e-05, + "loss": 0.6526, + "num_tokens": 43651808.0, + "step": 467 + }, + { + "epoch": 0.07987711213517665, + "grad_norm": 0.5692245257127788, + "learning_rate": 3.681174261819423e-05, + "loss": 0.6627, + "num_tokens": 43757716.0, + "step": 468 + }, + { + "epoch": 0.08004778972520908, + "grad_norm": 0.6186624934267938, + "learning_rate": 3.680491551459294e-05, + "loss": 0.6637, + "num_tokens": 43835657.0, + "step": 469 + }, + { + "epoch": 0.08021846731524152, + "grad_norm": 0.5560640781277002, + "learning_rate": 3.679808841099164e-05, + "loss": 0.6141, + "num_tokens": 43925242.0, + "step": 470 + }, + { + "epoch": 0.08038914490527393, + "grad_norm": 0.6124977850905327, + "learning_rate": 3.6791261307390345e-05, + "loss": 0.6069, + "num_tokens": 43993838.0, + "step": 471 + }, + { + "epoch": 0.08055982249530637, + "grad_norm": 0.5696982475095156, + "learning_rate": 3.6784434203789045e-05, + "loss": 0.7307, + "num_tokens": 44088914.0, + "step": 472 + }, + { + "epoch": 0.08073050008533879, + "grad_norm": 0.5465347641317355, + "learning_rate": 3.6777607100187746e-05, + "loss": 0.6209, + "num_tokens": 44201381.0, + "step": 473 + }, + { + "epoch": 0.08090117767537122, + "grad_norm": 0.5663520542617837, + "learning_rate": 3.677077999658645e-05, + "loss": 0.5714, + "num_tokens": 44283784.0, + "step": 474 + }, + { + "epoch": 0.08107185526540366, + "grad_norm": 0.5651685933116898, + "learning_rate": 3.676395289298515e-05, + "loss": 0.6867, + "num_tokens": 44375593.0, + "step": 475 + }, + { + "epoch": 0.08124253285543608, + "grad_norm": 0.5421169567693844, + "learning_rate": 3.675712578938386e-05, + "loss": 0.5945, + "num_tokens": 44473760.0, + "step": 476 + }, + { + "epoch": 0.08141321044546851, + "grad_norm": 0.6171344694912092, + "learning_rate": 3.675029868578256e-05, + "loss": 0.7259, + "num_tokens": 44571663.0, + "step": 477 + }, + { + "epoch": 0.08158388803550094, + "grad_norm": 0.6193533591680713, + "learning_rate": 3.674347158218126e-05, + "loss": 0.7452, + "num_tokens": 44667768.0, + "step": 478 + }, + { + "epoch": 0.08175456562553336, + "grad_norm": 0.6117427566981729, + "learning_rate": 3.673664447857996e-05, + "loss": 0.7464, + "num_tokens": 44750774.0, + "step": 479 + }, + { + "epoch": 0.0819252432155658, + "grad_norm": 0.5799960616420552, + "learning_rate": 3.672981737497867e-05, + "loss": 0.7032, + "num_tokens": 44861644.0, + "step": 480 + }, + { + "epoch": 0.08209592080559823, + "grad_norm": 0.6361081087572035, + "learning_rate": 3.672299027137737e-05, + "loss": 0.6766, + "num_tokens": 44933113.0, + "step": 481 + }, + { + "epoch": 0.08226659839563065, + "grad_norm": 0.588205544901898, + "learning_rate": 3.6716163167776076e-05, + "loss": 0.7455, + "num_tokens": 45046569.0, + "step": 482 + }, + { + "epoch": 0.08243727598566308, + "grad_norm": 0.7752783986307354, + "learning_rate": 3.6709336064174777e-05, + "loss": 0.8238, + "num_tokens": 45107247.0, + "step": 483 + }, + { + "epoch": 0.08260795357569552, + "grad_norm": 0.6074951436464685, + "learning_rate": 3.670250896057348e-05, + "loss": 0.654, + "num_tokens": 45191412.0, + "step": 484 + }, + { + "epoch": 0.08277863116572794, + "grad_norm": 0.5849451194018884, + "learning_rate": 3.6695681856972184e-05, + "loss": 0.7134, + "num_tokens": 45290166.0, + "step": 485 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 0.5572397811694471, + "learning_rate": 3.6688854753370884e-05, + "loss": 0.633, + "num_tokens": 45379898.0, + "step": 486 + }, + { + "epoch": 0.0831199863457928, + "grad_norm": 0.6310970194616543, + "learning_rate": 3.668202764976959e-05, + "loss": 0.6827, + "num_tokens": 45452606.0, + "step": 487 + }, + { + "epoch": 0.08329066393582522, + "grad_norm": 0.5456901113610884, + "learning_rate": 3.667520054616829e-05, + "loss": 0.6189, + "num_tokens": 45555304.0, + "step": 488 + }, + { + "epoch": 0.08346134152585766, + "grad_norm": 0.6094337320529141, + "learning_rate": 3.6668373442567e-05, + "loss": 0.7914, + "num_tokens": 45651032.0, + "step": 489 + }, + { + "epoch": 0.08363201911589008, + "grad_norm": 0.5640544607770678, + "learning_rate": 3.66615463389657e-05, + "loss": 0.6418, + "num_tokens": 45735812.0, + "step": 490 + }, + { + "epoch": 0.08380269670592251, + "grad_norm": 0.5418123506293233, + "learning_rate": 3.66547192353644e-05, + "loss": 0.6468, + "num_tokens": 45834101.0, + "step": 491 + }, + { + "epoch": 0.08397337429595494, + "grad_norm": 0.5548156202834212, + "learning_rate": 3.66478921317631e-05, + "loss": 0.6736, + "num_tokens": 45934369.0, + "step": 492 + }, + { + "epoch": 0.08414405188598736, + "grad_norm": 0.58122308325762, + "learning_rate": 3.664106502816181e-05, + "loss": 0.7176, + "num_tokens": 46022911.0, + "step": 493 + }, + { + "epoch": 0.0843147294760198, + "grad_norm": 0.5832625645337377, + "learning_rate": 3.663423792456051e-05, + "loss": 0.7067, + "num_tokens": 46121503.0, + "step": 494 + }, + { + "epoch": 0.08448540706605223, + "grad_norm": 0.5136226933255437, + "learning_rate": 3.662741082095921e-05, + "loss": 0.6491, + "num_tokens": 46244053.0, + "step": 495 + }, + { + "epoch": 0.08465608465608465, + "grad_norm": 0.5366138938521392, + "learning_rate": 3.6620583717357915e-05, + "loss": 0.6466, + "num_tokens": 46353856.0, + "step": 496 + }, + { + "epoch": 0.08482676224611709, + "grad_norm": 0.5655264199164406, + "learning_rate": 3.6613756613756616e-05, + "loss": 0.7721, + "num_tokens": 46461446.0, + "step": 497 + }, + { + "epoch": 0.08499743983614952, + "grad_norm": 0.52875408467019, + "learning_rate": 3.660692951015532e-05, + "loss": 0.6596, + "num_tokens": 46577296.0, + "step": 498 + }, + { + "epoch": 0.08516811742618194, + "grad_norm": 0.5779672335867836, + "learning_rate": 3.660010240655402e-05, + "loss": 0.7262, + "num_tokens": 46666140.0, + "step": 499 + }, + { + "epoch": 0.08533879501621437, + "grad_norm": 0.55134507768759, + "learning_rate": 3.6593275302952724e-05, + "loss": 0.6877, + "num_tokens": 46761310.0, + "step": 500 + }, + { + "epoch": 0.0855094726062468, + "grad_norm": 0.5466686932508596, + "learning_rate": 3.658644819935143e-05, + "loss": 0.6517, + "num_tokens": 46861703.0, + "step": 501 + }, + { + "epoch": 0.08568015019627923, + "grad_norm": 0.5517273584736176, + "learning_rate": 3.657962109575013e-05, + "loss": 0.7141, + "num_tokens": 46966366.0, + "step": 502 + }, + { + "epoch": 0.08585082778631166, + "grad_norm": 0.5800037362559861, + "learning_rate": 3.657279399214883e-05, + "loss": 0.6958, + "num_tokens": 47046507.0, + "step": 503 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 0.5617836602155099, + "learning_rate": 3.656596688854753e-05, + "loss": 0.7618, + "num_tokens": 47149623.0, + "step": 504 + }, + { + "epoch": 0.08619218296637651, + "grad_norm": 0.5502705173909379, + "learning_rate": 3.655913978494624e-05, + "loss": 0.5706, + "num_tokens": 47231903.0, + "step": 505 + }, + { + "epoch": 0.08636286055640895, + "grad_norm": 0.666081316515715, + "learning_rate": 3.655231268134494e-05, + "loss": 0.739, + "num_tokens": 47331758.0, + "step": 506 + }, + { + "epoch": 0.08653353814644137, + "grad_norm": 0.5797301542556041, + "learning_rate": 3.6545485577743646e-05, + "loss": 0.5743, + "num_tokens": 47403309.0, + "step": 507 + }, + { + "epoch": 0.0867042157364738, + "grad_norm": 0.5904541896089521, + "learning_rate": 3.653865847414235e-05, + "loss": 0.7626, + "num_tokens": 47509820.0, + "step": 508 + }, + { + "epoch": 0.08687489332650623, + "grad_norm": 0.5981561246130371, + "learning_rate": 3.6531831370541054e-05, + "loss": 0.72, + "num_tokens": 47607368.0, + "step": 509 + }, + { + "epoch": 0.08704557091653865, + "grad_norm": 0.5965878657853652, + "learning_rate": 3.6525004266939754e-05, + "loss": 0.6892, + "num_tokens": 47702654.0, + "step": 510 + }, + { + "epoch": 0.08721624850657109, + "grad_norm": 0.6111497189704105, + "learning_rate": 3.6518177163338455e-05, + "loss": 0.6577, + "num_tokens": 47766700.0, + "step": 511 + }, + { + "epoch": 0.08738692609660352, + "grad_norm": 0.5791148451603378, + "learning_rate": 3.651135005973716e-05, + "loss": 0.7108, + "num_tokens": 47885720.0, + "step": 512 + }, + { + "epoch": 0.08755760368663594, + "grad_norm": 0.5725552727080536, + "learning_rate": 3.650452295613586e-05, + "loss": 0.7149, + "num_tokens": 47980372.0, + "step": 513 + }, + { + "epoch": 0.08772828127666837, + "grad_norm": 0.5651828738015333, + "learning_rate": 3.649769585253457e-05, + "loss": 0.6782, + "num_tokens": 48067383.0, + "step": 514 + }, + { + "epoch": 0.08789895886670081, + "grad_norm": 0.5444762935137033, + "learning_rate": 3.649086874893327e-05, + "loss": 0.6467, + "num_tokens": 48161663.0, + "step": 515 + }, + { + "epoch": 0.08806963645673323, + "grad_norm": 0.5900012700843957, + "learning_rate": 3.648404164533197e-05, + "loss": 0.5933, + "num_tokens": 48243234.0, + "step": 516 + }, + { + "epoch": 0.08824031404676566, + "grad_norm": 0.5773627576470945, + "learning_rate": 3.647721454173067e-05, + "loss": 0.6966, + "num_tokens": 48345296.0, + "step": 517 + }, + { + "epoch": 0.0884109916367981, + "grad_norm": 0.5508070391292066, + "learning_rate": 3.647038743812938e-05, + "loss": 0.6688, + "num_tokens": 48441283.0, + "step": 518 + }, + { + "epoch": 0.08858166922683051, + "grad_norm": 0.5935608672750368, + "learning_rate": 3.646356033452808e-05, + "loss": 0.719, + "num_tokens": 48534521.0, + "step": 519 + }, + { + "epoch": 0.08875234681686295, + "grad_norm": 0.5645208811010013, + "learning_rate": 3.645673323092678e-05, + "loss": 0.7341, + "num_tokens": 48645176.0, + "step": 520 + }, + { + "epoch": 0.08892302440689537, + "grad_norm": 0.6608966069365919, + "learning_rate": 3.6449906127325486e-05, + "loss": 0.8173, + "num_tokens": 48749390.0, + "step": 521 + }, + { + "epoch": 0.0890937019969278, + "grad_norm": 0.5271268839277463, + "learning_rate": 3.6443079023724186e-05, + "loss": 0.6352, + "num_tokens": 48857015.0, + "step": 522 + }, + { + "epoch": 0.08926437958696024, + "grad_norm": 0.5925351919095255, + "learning_rate": 3.643625192012289e-05, + "loss": 0.6904, + "num_tokens": 48940361.0, + "step": 523 + }, + { + "epoch": 0.08943505717699266, + "grad_norm": 0.5991121224215802, + "learning_rate": 3.6429424816521594e-05, + "loss": 0.7506, + "num_tokens": 49028322.0, + "step": 524 + }, + { + "epoch": 0.08960573476702509, + "grad_norm": 0.5989576559906101, + "learning_rate": 3.64225977129203e-05, + "loss": 0.6579, + "num_tokens": 49096177.0, + "step": 525 + }, + { + "epoch": 0.08977641235705752, + "grad_norm": 0.5426754113879474, + "learning_rate": 3.6415770609319e-05, + "loss": 0.6627, + "num_tokens": 49187665.0, + "step": 526 + }, + { + "epoch": 0.08994708994708994, + "grad_norm": 0.5602305100542257, + "learning_rate": 3.64089435057177e-05, + "loss": 0.5757, + "num_tokens": 49276433.0, + "step": 527 + }, + { + "epoch": 0.09011776753712238, + "grad_norm": 0.6304599194999569, + "learning_rate": 3.64021164021164e-05, + "loss": 0.6369, + "num_tokens": 49371701.0, + "step": 528 + }, + { + "epoch": 0.09028844512715481, + "grad_norm": 0.5719837438744131, + "learning_rate": 3.639528929851511e-05, + "loss": 0.6689, + "num_tokens": 49470912.0, + "step": 529 + }, + { + "epoch": 0.09045912271718723, + "grad_norm": 0.5545437279124883, + "learning_rate": 3.638846219491381e-05, + "loss": 0.6597, + "num_tokens": 49562256.0, + "step": 530 + }, + { + "epoch": 0.09062980030721966, + "grad_norm": 0.5171433323911107, + "learning_rate": 3.638163509131251e-05, + "loss": 0.5871, + "num_tokens": 49663638.0, + "step": 531 + }, + { + "epoch": 0.0908004778972521, + "grad_norm": 0.6665112915660268, + "learning_rate": 3.637480798771122e-05, + "loss": 0.7018, + "num_tokens": 49736857.0, + "step": 532 + }, + { + "epoch": 0.09097115548728452, + "grad_norm": 0.6021405988117964, + "learning_rate": 3.636798088410992e-05, + "loss": 0.6464, + "num_tokens": 49816289.0, + "step": 533 + }, + { + "epoch": 0.09114183307731695, + "grad_norm": 0.6222466420135279, + "learning_rate": 3.6361153780508624e-05, + "loss": 0.6123, + "num_tokens": 49897108.0, + "step": 534 + }, + { + "epoch": 0.09131251066734938, + "grad_norm": 0.5842448933340897, + "learning_rate": 3.6354326676907325e-05, + "loss": 0.6646, + "num_tokens": 49989367.0, + "step": 535 + }, + { + "epoch": 0.0914831882573818, + "grad_norm": 0.6357341901564604, + "learning_rate": 3.634749957330603e-05, + "loss": 0.7505, + "num_tokens": 50076209.0, + "step": 536 + }, + { + "epoch": 0.09165386584741424, + "grad_norm": 0.6056788322800208, + "learning_rate": 3.634067246970473e-05, + "loss": 0.7382, + "num_tokens": 50183685.0, + "step": 537 + }, + { + "epoch": 0.09182454343744666, + "grad_norm": 0.5614211588277851, + "learning_rate": 3.633384536610343e-05, + "loss": 0.6925, + "num_tokens": 50283125.0, + "step": 538 + }, + { + "epoch": 0.09199522102747909, + "grad_norm": 0.5334627053830949, + "learning_rate": 3.632701826250214e-05, + "loss": 0.5919, + "num_tokens": 50378292.0, + "step": 539 + }, + { + "epoch": 0.09216589861751152, + "grad_norm": 0.6869647579312297, + "learning_rate": 3.632019115890084e-05, + "loss": 0.7839, + "num_tokens": 50452949.0, + "step": 540 + }, + { + "epoch": 0.09233657620754394, + "grad_norm": 0.5558184357195769, + "learning_rate": 3.631336405529954e-05, + "loss": 0.6433, + "num_tokens": 50547915.0, + "step": 541 + }, + { + "epoch": 0.09250725379757638, + "grad_norm": 0.5842349893222868, + "learning_rate": 3.630653695169824e-05, + "loss": 0.6653, + "num_tokens": 50653440.0, + "step": 542 + }, + { + "epoch": 0.09267793138760881, + "grad_norm": 0.6685350636151807, + "learning_rate": 3.629970984809695e-05, + "loss": 0.6845, + "num_tokens": 50713541.0, + "step": 543 + }, + { + "epoch": 0.09284860897764123, + "grad_norm": 0.5512215679137163, + "learning_rate": 3.629288274449565e-05, + "loss": 0.7367, + "num_tokens": 50848756.0, + "step": 544 + }, + { + "epoch": 0.09301928656767366, + "grad_norm": 0.598199161021648, + "learning_rate": 3.6286055640894356e-05, + "loss": 0.6751, + "num_tokens": 50931087.0, + "step": 545 + }, + { + "epoch": 0.0931899641577061, + "grad_norm": 0.5468294611130413, + "learning_rate": 3.6279228537293056e-05, + "loss": 0.6635, + "num_tokens": 51042095.0, + "step": 546 + }, + { + "epoch": 0.09336064174773852, + "grad_norm": 0.5849218216045272, + "learning_rate": 3.6272401433691756e-05, + "loss": 0.6247, + "num_tokens": 51131588.0, + "step": 547 + }, + { + "epoch": 0.09353131933777095, + "grad_norm": 0.5798367401027197, + "learning_rate": 3.6265574330090464e-05, + "loss": 0.6618, + "num_tokens": 51216211.0, + "step": 548 + }, + { + "epoch": 0.09370199692780339, + "grad_norm": 0.6132243264188201, + "learning_rate": 3.6258747226489164e-05, + "loss": 0.7182, + "num_tokens": 51294920.0, + "step": 549 + }, + { + "epoch": 0.0938726745178358, + "grad_norm": 0.5942224486250892, + "learning_rate": 3.625192012288787e-05, + "loss": 0.6176, + "num_tokens": 51374099.0, + "step": 550 + }, + { + "epoch": 0.09404335210786824, + "grad_norm": 0.6196569300399327, + "learning_rate": 3.624509301928657e-05, + "loss": 0.6019, + "num_tokens": 51447836.0, + "step": 551 + }, + { + "epoch": 0.09421402969790067, + "grad_norm": 0.6465822489696604, + "learning_rate": 3.623826591568528e-05, + "loss": 0.7073, + "num_tokens": 51526598.0, + "step": 552 + }, + { + "epoch": 0.09438470728793309, + "grad_norm": 0.6522141897092714, + "learning_rate": 3.623143881208397e-05, + "loss": 0.7982, + "num_tokens": 51615753.0, + "step": 553 + }, + { + "epoch": 0.09455538487796553, + "grad_norm": 0.5776636817634605, + "learning_rate": 3.622461170848268e-05, + "loss": 0.6442, + "num_tokens": 51703857.0, + "step": 554 + }, + { + "epoch": 0.09472606246799795, + "grad_norm": 0.5896416832310668, + "learning_rate": 3.621778460488138e-05, + "loss": 0.7461, + "num_tokens": 51805181.0, + "step": 555 + }, + { + "epoch": 0.09489674005803038, + "grad_norm": 0.5998681509325083, + "learning_rate": 3.621095750128009e-05, + "loss": 0.6354, + "num_tokens": 51879924.0, + "step": 556 + }, + { + "epoch": 0.09506741764806281, + "grad_norm": 0.5941086950073793, + "learning_rate": 3.620413039767879e-05, + "loss": 0.7689, + "num_tokens": 51990472.0, + "step": 557 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.7768918607888808, + "learning_rate": 3.619730329407749e-05, + "loss": 0.7049, + "num_tokens": 52085459.0, + "step": 558 + }, + { + "epoch": 0.09540877282812767, + "grad_norm": 0.6195553119074377, + "learning_rate": 3.6190476190476195e-05, + "loss": 0.6888, + "num_tokens": 52158776.0, + "step": 559 + }, + { + "epoch": 0.0955794504181601, + "grad_norm": 0.570006828896315, + "learning_rate": 3.6183649086874895e-05, + "loss": 0.6311, + "num_tokens": 52242465.0, + "step": 560 + }, + { + "epoch": 0.09575012800819252, + "grad_norm": 0.5323105926739331, + "learning_rate": 3.61768219832736e-05, + "loss": 0.6609, + "num_tokens": 52363273.0, + "step": 561 + }, + { + "epoch": 0.09592080559822495, + "grad_norm": 0.6027235861329099, + "learning_rate": 3.61699948796723e-05, + "loss": 0.6049, + "num_tokens": 52443139.0, + "step": 562 + }, + { + "epoch": 0.09609148318825739, + "grad_norm": 0.5246191328850529, + "learning_rate": 3.616316777607101e-05, + "loss": 0.6259, + "num_tokens": 52540951.0, + "step": 563 + }, + { + "epoch": 0.09626216077828981, + "grad_norm": 0.5920691370350928, + "learning_rate": 3.615634067246971e-05, + "loss": 0.666, + "num_tokens": 52619939.0, + "step": 564 + }, + { + "epoch": 0.09643283836832224, + "grad_norm": 0.5176477082552898, + "learning_rate": 3.614951356886841e-05, + "loss": 0.656, + "num_tokens": 52730835.0, + "step": 565 + }, + { + "epoch": 0.09660351595835467, + "grad_norm": 0.6985077638203099, + "learning_rate": 3.614268646526711e-05, + "loss": 0.7695, + "num_tokens": 52839984.0, + "step": 566 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.642586488676836, + "learning_rate": 3.613585936166582e-05, + "loss": 0.6719, + "num_tokens": 52913484.0, + "step": 567 + }, + { + "epoch": 0.09694487113841953, + "grad_norm": 0.5612241710274725, + "learning_rate": 3.612903225806452e-05, + "loss": 0.668, + "num_tokens": 52999344.0, + "step": 568 + }, + { + "epoch": 0.09711554872845195, + "grad_norm": 0.5472889069565046, + "learning_rate": 3.612220515446322e-05, + "loss": 0.7276, + "num_tokens": 53115112.0, + "step": 569 + }, + { + "epoch": 0.09728622631848438, + "grad_norm": 0.5233292586874325, + "learning_rate": 3.6115378050861926e-05, + "loss": 0.7095, + "num_tokens": 53234593.0, + "step": 570 + }, + { + "epoch": 0.09745690390851681, + "grad_norm": 0.6349032007720222, + "learning_rate": 3.6108550947260626e-05, + "loss": 0.6504, + "num_tokens": 53327280.0, + "step": 571 + }, + { + "epoch": 0.09762758149854923, + "grad_norm": 0.7107056808749788, + "learning_rate": 3.6101723843659333e-05, + "loss": 0.6554, + "num_tokens": 53413514.0, + "step": 572 + }, + { + "epoch": 0.09779825908858167, + "grad_norm": 0.5668050992936222, + "learning_rate": 3.6094896740058034e-05, + "loss": 0.6485, + "num_tokens": 53497875.0, + "step": 573 + }, + { + "epoch": 0.0979689366786141, + "grad_norm": 0.5728710367155241, + "learning_rate": 3.6088069636456734e-05, + "loss": 0.7061, + "num_tokens": 53593628.0, + "step": 574 + }, + { + "epoch": 0.09813961426864652, + "grad_norm": 0.6054900627449904, + "learning_rate": 3.608124253285544e-05, + "loss": 0.6771, + "num_tokens": 53663897.0, + "step": 575 + }, + { + "epoch": 0.09831029185867896, + "grad_norm": 0.5765057772033648, + "learning_rate": 3.607441542925414e-05, + "loss": 0.6809, + "num_tokens": 53753017.0, + "step": 576 + }, + { + "epoch": 0.09848096944871139, + "grad_norm": 0.5410896133828244, + "learning_rate": 3.606758832565284e-05, + "loss": 0.6848, + "num_tokens": 53858131.0, + "step": 577 + }, + { + "epoch": 0.09865164703874381, + "grad_norm": 0.5168766457987327, + "learning_rate": 3.606076122205154e-05, + "loss": 0.6287, + "num_tokens": 53970378.0, + "step": 578 + }, + { + "epoch": 0.09882232462877624, + "grad_norm": 0.5645138970719202, + "learning_rate": 3.605393411845025e-05, + "loss": 0.6984, + "num_tokens": 54061336.0, + "step": 579 + }, + { + "epoch": 0.09899300221880868, + "grad_norm": 0.6940957963644416, + "learning_rate": 3.604710701484895e-05, + "loss": 0.734, + "num_tokens": 54125510.0, + "step": 580 + }, + { + "epoch": 0.0991636798088411, + "grad_norm": 0.5512734524299959, + "learning_rate": 3.604027991124766e-05, + "loss": 0.5677, + "num_tokens": 54223533.0, + "step": 581 + }, + { + "epoch": 0.09933435739887353, + "grad_norm": 0.5859317266862567, + "learning_rate": 3.603345280764636e-05, + "loss": 0.6877, + "num_tokens": 54312651.0, + "step": 582 + }, + { + "epoch": 0.09950503498890596, + "grad_norm": 0.5179646033318259, + "learning_rate": 3.6026625704045065e-05, + "loss": 0.6673, + "num_tokens": 54415435.0, + "step": 583 + }, + { + "epoch": 0.09967571257893838, + "grad_norm": 0.5393411074453566, + "learning_rate": 3.6019798600443765e-05, + "loss": 0.6343, + "num_tokens": 54507459.0, + "step": 584 + }, + { + "epoch": 0.09984639016897082, + "grad_norm": 0.6429763528593041, + "learning_rate": 3.6012971496842465e-05, + "loss": 0.7146, + "num_tokens": 54575172.0, + "step": 585 + }, + { + "epoch": 0.10001706775900324, + "grad_norm": 0.5471737327436172, + "learning_rate": 3.600614439324117e-05, + "loss": 0.7036, + "num_tokens": 54679441.0, + "step": 586 + }, + { + "epoch": 0.10018774534903567, + "grad_norm": 0.5602914550960879, + "learning_rate": 3.599931728963987e-05, + "loss": 0.7858, + "num_tokens": 54774827.0, + "step": 587 + }, + { + "epoch": 0.1003584229390681, + "grad_norm": 0.5784132946075377, + "learning_rate": 3.599249018603858e-05, + "loss": 0.669, + "num_tokens": 54865214.0, + "step": 588 + }, + { + "epoch": 0.10052910052910052, + "grad_norm": 0.5567092662138868, + "learning_rate": 3.598566308243728e-05, + "loss": 0.708, + "num_tokens": 54982153.0, + "step": 589 + }, + { + "epoch": 0.10069977811913296, + "grad_norm": 0.541798675123736, + "learning_rate": 3.597883597883598e-05, + "loss": 0.6358, + "num_tokens": 55080347.0, + "step": 590 + }, + { + "epoch": 0.10087045570916539, + "grad_norm": 0.5987736762667357, + "learning_rate": 3.597200887523468e-05, + "loss": 0.7506, + "num_tokens": 55174425.0, + "step": 591 + }, + { + "epoch": 0.10104113329919781, + "grad_norm": 0.6188916045231516, + "learning_rate": 3.596518177163339e-05, + "loss": 0.7655, + "num_tokens": 55263893.0, + "step": 592 + }, + { + "epoch": 0.10121181088923024, + "grad_norm": 0.5457704484251621, + "learning_rate": 3.595835466803209e-05, + "loss": 0.693, + "num_tokens": 55377530.0, + "step": 593 + }, + { + "epoch": 0.10138248847926268, + "grad_norm": 0.5187006763402402, + "learning_rate": 3.5951527564430796e-05, + "loss": 0.7028, + "num_tokens": 55503291.0, + "step": 594 + }, + { + "epoch": 0.1015531660692951, + "grad_norm": 0.6567193334889375, + "learning_rate": 3.5944700460829496e-05, + "loss": 0.6959, + "num_tokens": 55588297.0, + "step": 595 + }, + { + "epoch": 0.10172384365932753, + "grad_norm": 0.5043553870412054, + "learning_rate": 3.59378733572282e-05, + "loss": 0.6399, + "num_tokens": 55690232.0, + "step": 596 + }, + { + "epoch": 0.10189452124935997, + "grad_norm": 0.5124965307494519, + "learning_rate": 3.5931046253626904e-05, + "loss": 0.6659, + "num_tokens": 55800734.0, + "step": 597 + }, + { + "epoch": 0.10206519883939238, + "grad_norm": 0.6025276071585761, + "learning_rate": 3.5924219150025604e-05, + "loss": 0.7468, + "num_tokens": 55893347.0, + "step": 598 + }, + { + "epoch": 0.10223587642942482, + "grad_norm": 0.5565139206444926, + "learning_rate": 3.591739204642431e-05, + "loss": 0.6324, + "num_tokens": 55972711.0, + "step": 599 + }, + { + "epoch": 0.10240655401945725, + "grad_norm": 0.5548616025790604, + "learning_rate": 3.591056494282301e-05, + "loss": 0.7894, + "num_tokens": 56084147.0, + "step": 600 + }, + { + "epoch": 0.10257723160948967, + "grad_norm": 0.5674907666339948, + "learning_rate": 3.590373783922171e-05, + "loss": 0.6687, + "num_tokens": 56193905.0, + "step": 601 + }, + { + "epoch": 0.1027479091995221, + "grad_norm": 0.6635213859265576, + "learning_rate": 3.589691073562041e-05, + "loss": 0.5992, + "num_tokens": 56268847.0, + "step": 602 + }, + { + "epoch": 0.10291858678955453, + "grad_norm": 0.5340676972917092, + "learning_rate": 3.589008363201912e-05, + "loss": 0.5837, + "num_tokens": 56360504.0, + "step": 603 + }, + { + "epoch": 0.10308926437958696, + "grad_norm": 0.5784960380177508, + "learning_rate": 3.588325652841782e-05, + "loss": 0.7168, + "num_tokens": 56450728.0, + "step": 604 + }, + { + "epoch": 0.10325994196961939, + "grad_norm": 0.5709450831352832, + "learning_rate": 3.587642942481652e-05, + "loss": 0.6832, + "num_tokens": 56534826.0, + "step": 605 + }, + { + "epoch": 0.10343061955965181, + "grad_norm": 0.5176013164041113, + "learning_rate": 3.586960232121523e-05, + "loss": 0.5745, + "num_tokens": 56639326.0, + "step": 606 + }, + { + "epoch": 0.10360129714968425, + "grad_norm": 0.5303786403010953, + "learning_rate": 3.586277521761393e-05, + "loss": 0.598, + "num_tokens": 56736623.0, + "step": 607 + }, + { + "epoch": 0.10377197473971668, + "grad_norm": 0.6308980351562006, + "learning_rate": 3.5855948114012635e-05, + "loss": 0.7189, + "num_tokens": 56815939.0, + "step": 608 + }, + { + "epoch": 0.1039426523297491, + "grad_norm": 0.524600649587053, + "learning_rate": 3.5849121010411335e-05, + "loss": 0.6463, + "num_tokens": 56925354.0, + "step": 609 + }, + { + "epoch": 0.10411332991978153, + "grad_norm": 0.5671445271456128, + "learning_rate": 3.584229390681004e-05, + "loss": 0.6202, + "num_tokens": 57008908.0, + "step": 610 + }, + { + "epoch": 0.10428400750981397, + "grad_norm": 0.5523335187034276, + "learning_rate": 3.583546680320874e-05, + "loss": 0.7338, + "num_tokens": 57117058.0, + "step": 611 + }, + { + "epoch": 0.10445468509984639, + "grad_norm": 0.57395364668904, + "learning_rate": 3.582863969960744e-05, + "loss": 0.6703, + "num_tokens": 57212519.0, + "step": 612 + }, + { + "epoch": 0.10462536268987882, + "grad_norm": 0.5791998327318868, + "learning_rate": 3.582181259600615e-05, + "loss": 0.5932, + "num_tokens": 57281204.0, + "step": 613 + }, + { + "epoch": 0.10479604027991125, + "grad_norm": 0.5440792842892852, + "learning_rate": 3.581498549240485e-05, + "loss": 0.6911, + "num_tokens": 57394089.0, + "step": 614 + }, + { + "epoch": 0.10496671786994367, + "grad_norm": 0.5740976238414511, + "learning_rate": 3.580815838880355e-05, + "loss": 0.6748, + "num_tokens": 57487956.0, + "step": 615 + }, + { + "epoch": 0.10513739545997611, + "grad_norm": 0.5919186305228085, + "learning_rate": 3.580133128520225e-05, + "loss": 0.6746, + "num_tokens": 57587245.0, + "step": 616 + }, + { + "epoch": 0.10530807305000853, + "grad_norm": 0.6284892141454418, + "learning_rate": 3.579450418160096e-05, + "loss": 0.6109, + "num_tokens": 57651510.0, + "step": 617 + }, + { + "epoch": 0.10547875064004096, + "grad_norm": 0.5929288494812709, + "learning_rate": 3.578767707799966e-05, + "loss": 0.6615, + "num_tokens": 57731553.0, + "step": 618 + }, + { + "epoch": 0.1056494282300734, + "grad_norm": 0.6204020244407263, + "learning_rate": 3.5780849974398366e-05, + "loss": 0.7288, + "num_tokens": 57816256.0, + "step": 619 + }, + { + "epoch": 0.10582010582010581, + "grad_norm": 0.6179173674902416, + "learning_rate": 3.577402287079707e-05, + "loss": 0.7287, + "num_tokens": 57903548.0, + "step": 620 + }, + { + "epoch": 0.10599078341013825, + "grad_norm": 0.6024255415203956, + "learning_rate": 3.576719576719577e-05, + "loss": 0.6828, + "num_tokens": 57992322.0, + "step": 621 + }, + { + "epoch": 0.10616146100017068, + "grad_norm": 0.6043488816920723, + "learning_rate": 3.5760368663594474e-05, + "loss": 0.7417, + "num_tokens": 58086476.0, + "step": 622 + }, + { + "epoch": 0.1063321385902031, + "grad_norm": 0.5677628466049353, + "learning_rate": 3.5753541559993175e-05, + "loss": 0.6786, + "num_tokens": 58186341.0, + "step": 623 + }, + { + "epoch": 0.10650281618023553, + "grad_norm": 0.5391761473766171, + "learning_rate": 3.574671445639188e-05, + "loss": 0.6106, + "num_tokens": 58280540.0, + "step": 624 + }, + { + "epoch": 0.10667349377026797, + "grad_norm": 0.5605435390881831, + "learning_rate": 3.573988735279058e-05, + "loss": 0.6509, + "num_tokens": 58379218.0, + "step": 625 + }, + { + "epoch": 0.10684417136030039, + "grad_norm": 0.584790193623198, + "learning_rate": 3.573306024918929e-05, + "loss": 0.6214, + "num_tokens": 58449303.0, + "step": 626 + }, + { + "epoch": 0.10701484895033282, + "grad_norm": 0.5484038897455566, + "learning_rate": 3.572623314558798e-05, + "loss": 0.6293, + "num_tokens": 58545522.0, + "step": 627 + }, + { + "epoch": 0.10718552654036526, + "grad_norm": 0.6224808704701998, + "learning_rate": 3.571940604198669e-05, + "loss": 0.6457, + "num_tokens": 58625187.0, + "step": 628 + }, + { + "epoch": 0.10735620413039768, + "grad_norm": 0.5882332570612269, + "learning_rate": 3.571257893838539e-05, + "loss": 0.6809, + "num_tokens": 58704207.0, + "step": 629 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 0.5305310610331889, + "learning_rate": 3.57057518347841e-05, + "loss": 0.646, + "num_tokens": 58800823.0, + "step": 630 + }, + { + "epoch": 0.10769755931046254, + "grad_norm": 0.5463129173035042, + "learning_rate": 3.56989247311828e-05, + "loss": 0.6448, + "num_tokens": 58900776.0, + "step": 631 + }, + { + "epoch": 0.10786823690049496, + "grad_norm": 0.5325082747052348, + "learning_rate": 3.56920976275815e-05, + "loss": 0.6471, + "num_tokens": 59013978.0, + "step": 632 + }, + { + "epoch": 0.1080389144905274, + "grad_norm": 0.570206467126212, + "learning_rate": 3.5685270523980205e-05, + "loss": 0.7536, + "num_tokens": 59118360.0, + "step": 633 + }, + { + "epoch": 0.10820959208055982, + "grad_norm": 0.5281426220588592, + "learning_rate": 3.5678443420378906e-05, + "loss": 0.6556, + "num_tokens": 59211982.0, + "step": 634 + }, + { + "epoch": 0.10838026967059225, + "grad_norm": 0.5424375843540711, + "learning_rate": 3.567161631677761e-05, + "loss": 0.7324, + "num_tokens": 59323498.0, + "step": 635 + }, + { + "epoch": 0.10855094726062468, + "grad_norm": 0.5750254991434355, + "learning_rate": 3.566478921317631e-05, + "loss": 0.6894, + "num_tokens": 59407340.0, + "step": 636 + }, + { + "epoch": 0.1087216248506571, + "grad_norm": 0.5696354034917398, + "learning_rate": 3.565796210957502e-05, + "loss": 0.6585, + "num_tokens": 59486116.0, + "step": 637 + }, + { + "epoch": 0.10889230244068954, + "grad_norm": 0.6049668250651179, + "learning_rate": 3.565113500597372e-05, + "loss": 0.6957, + "num_tokens": 59568759.0, + "step": 638 + }, + { + "epoch": 0.10906298003072197, + "grad_norm": 0.5367882923292222, + "learning_rate": 3.564430790237242e-05, + "loss": 0.6147, + "num_tokens": 59671512.0, + "step": 639 + }, + { + "epoch": 0.10923365762075439, + "grad_norm": 0.5550579418220778, + "learning_rate": 3.563748079877112e-05, + "loss": 0.6829, + "num_tokens": 59766116.0, + "step": 640 + }, + { + "epoch": 0.10940433521078682, + "grad_norm": 0.5789495866225279, + "learning_rate": 3.563065369516983e-05, + "loss": 0.717, + "num_tokens": 59860915.0, + "step": 641 + }, + { + "epoch": 0.10957501280081926, + "grad_norm": 0.5728288884353353, + "learning_rate": 3.562382659156853e-05, + "loss": 0.6002, + "num_tokens": 59938770.0, + "step": 642 + }, + { + "epoch": 0.10974569039085168, + "grad_norm": 0.5531591242819155, + "learning_rate": 3.561699948796723e-05, + "loss": 0.5673, + "num_tokens": 60030471.0, + "step": 643 + }, + { + "epoch": 0.10991636798088411, + "grad_norm": 0.6408390111413331, + "learning_rate": 3.561017238436594e-05, + "loss": 0.6907, + "num_tokens": 60108832.0, + "step": 644 + }, + { + "epoch": 0.11008704557091654, + "grad_norm": 0.5664650239959266, + "learning_rate": 3.560334528076464e-05, + "loss": 0.6183, + "num_tokens": 60200166.0, + "step": 645 + }, + { + "epoch": 0.11025772316094896, + "grad_norm": 0.5832864925019746, + "learning_rate": 3.5596518177163344e-05, + "loss": 0.6968, + "num_tokens": 60308933.0, + "step": 646 + }, + { + "epoch": 0.1104284007509814, + "grad_norm": 0.5494724765265943, + "learning_rate": 3.5589691073562045e-05, + "loss": 0.7025, + "num_tokens": 60426437.0, + "step": 647 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 0.5427701104136853, + "learning_rate": 3.5582863969960745e-05, + "loss": 0.6467, + "num_tokens": 60515961.0, + "step": 648 + }, + { + "epoch": 0.11076975593104625, + "grad_norm": 0.5435782818124965, + "learning_rate": 3.557603686635945e-05, + "loss": 0.6633, + "num_tokens": 60616322.0, + "step": 649 + }, + { + "epoch": 0.11094043352107869, + "grad_norm": 0.547105039704954, + "learning_rate": 3.556920976275815e-05, + "loss": 0.6909, + "num_tokens": 60725345.0, + "step": 650 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.5356518086169897, + "learning_rate": 3.556238265915686e-05, + "loss": 0.5577, + "num_tokens": 60808324.0, + "step": 651 + }, + { + "epoch": 0.11128178870114354, + "grad_norm": 0.5252960388426867, + "learning_rate": 3.555555555555555e-05, + "loss": 0.6546, + "num_tokens": 60919355.0, + "step": 652 + }, + { + "epoch": 0.11145246629117597, + "grad_norm": 0.5508017842633086, + "learning_rate": 3.554872845195426e-05, + "loss": 0.6561, + "num_tokens": 61016724.0, + "step": 653 + }, + { + "epoch": 0.11162314388120839, + "grad_norm": 0.569740907002769, + "learning_rate": 3.554190134835296e-05, + "loss": 0.6289, + "num_tokens": 61104022.0, + "step": 654 + }, + { + "epoch": 0.11179382147124083, + "grad_norm": 0.556758332365874, + "learning_rate": 3.553507424475167e-05, + "loss": 0.6981, + "num_tokens": 61228132.0, + "step": 655 + }, + { + "epoch": 0.11196449906127326, + "grad_norm": 0.5109855853599103, + "learning_rate": 3.552824714115037e-05, + "loss": 0.6899, + "num_tokens": 61355190.0, + "step": 656 + }, + { + "epoch": 0.11213517665130568, + "grad_norm": 0.6196811965495391, + "learning_rate": 3.5521420037549075e-05, + "loss": 0.6481, + "num_tokens": 61426473.0, + "step": 657 + }, + { + "epoch": 0.11230585424133811, + "grad_norm": 0.5442808006458479, + "learning_rate": 3.5514592933947776e-05, + "loss": 0.6244, + "num_tokens": 61514462.0, + "step": 658 + }, + { + "epoch": 0.11247653183137055, + "grad_norm": 0.5684489205744652, + "learning_rate": 3.5507765830346476e-05, + "loss": 0.6995, + "num_tokens": 61611963.0, + "step": 659 + }, + { + "epoch": 0.11264720942140297, + "grad_norm": 0.512020249952361, + "learning_rate": 3.550093872674518e-05, + "loss": 0.614, + "num_tokens": 61734085.0, + "step": 660 + }, + { + "epoch": 0.1128178870114354, + "grad_norm": 0.565846878298372, + "learning_rate": 3.5494111623143884e-05, + "loss": 0.645, + "num_tokens": 61824640.0, + "step": 661 + }, + { + "epoch": 0.11298856460146783, + "grad_norm": 0.5557365717593293, + "learning_rate": 3.548728451954259e-05, + "loss": 0.7263, + "num_tokens": 61948367.0, + "step": 662 + }, + { + "epoch": 0.11315924219150025, + "grad_norm": 0.5596420160579111, + "learning_rate": 3.548045741594129e-05, + "loss": 0.6531, + "num_tokens": 62043396.0, + "step": 663 + }, + { + "epoch": 0.11332991978153269, + "grad_norm": 0.5434475155983988, + "learning_rate": 3.547363031233999e-05, + "loss": 0.6726, + "num_tokens": 62154658.0, + "step": 664 + }, + { + "epoch": 0.1135005973715651, + "grad_norm": 0.5914132570803373, + "learning_rate": 3.546680320873869e-05, + "loss": 0.5936, + "num_tokens": 62224176.0, + "step": 665 + }, + { + "epoch": 0.11367127496159754, + "grad_norm": 0.5616039117313804, + "learning_rate": 3.54599761051374e-05, + "loss": 0.7573, + "num_tokens": 62339263.0, + "step": 666 + }, + { + "epoch": 0.11384195255162997, + "grad_norm": 0.6084718007476989, + "learning_rate": 3.54531490015361e-05, + "loss": 0.655, + "num_tokens": 62430588.0, + "step": 667 + }, + { + "epoch": 0.1140126301416624, + "grad_norm": 0.5851313234960157, + "learning_rate": 3.544632189793481e-05, + "loss": 0.6904, + "num_tokens": 62523970.0, + "step": 668 + }, + { + "epoch": 0.11418330773169483, + "grad_norm": 0.5566229959844345, + "learning_rate": 3.543949479433351e-05, + "loss": 0.6185, + "num_tokens": 62609796.0, + "step": 669 + }, + { + "epoch": 0.11435398532172726, + "grad_norm": 0.637078533330038, + "learning_rate": 3.543266769073221e-05, + "loss": 0.7284, + "num_tokens": 62700680.0, + "step": 670 + }, + { + "epoch": 0.11452466291175968, + "grad_norm": 0.5962917615407175, + "learning_rate": 3.5425840587130915e-05, + "loss": 0.54, + "num_tokens": 62768845.0, + "step": 671 + }, + { + "epoch": 0.11469534050179211, + "grad_norm": 0.5153351563085283, + "learning_rate": 3.5419013483529615e-05, + "loss": 0.6321, + "num_tokens": 62874436.0, + "step": 672 + }, + { + "epoch": 0.11486601809182455, + "grad_norm": 0.5981474491872827, + "learning_rate": 3.541218637992832e-05, + "loss": 0.7183, + "num_tokens": 62968898.0, + "step": 673 + }, + { + "epoch": 0.11503669568185697, + "grad_norm": 0.6000383143515502, + "learning_rate": 3.540535927632702e-05, + "loss": 0.685, + "num_tokens": 63066824.0, + "step": 674 + }, + { + "epoch": 0.1152073732718894, + "grad_norm": 0.5485883936716764, + "learning_rate": 3.539853217272572e-05, + "loss": 0.5885, + "num_tokens": 63158648.0, + "step": 675 + }, + { + "epoch": 0.11537805086192184, + "grad_norm": 0.5848365966341512, + "learning_rate": 3.539170506912443e-05, + "loss": 0.6895, + "num_tokens": 63250445.0, + "step": 676 + }, + { + "epoch": 0.11554872845195426, + "grad_norm": 0.5340453223733347, + "learning_rate": 3.538487796552313e-05, + "loss": 0.6515, + "num_tokens": 63350508.0, + "step": 677 + }, + { + "epoch": 0.11571940604198669, + "grad_norm": 0.6548762006997945, + "learning_rate": 3.537805086192183e-05, + "loss": 0.6536, + "num_tokens": 63424406.0, + "step": 678 + }, + { + "epoch": 0.11589008363201912, + "grad_norm": 0.5963683280322891, + "learning_rate": 3.537122375832053e-05, + "loss": 0.6713, + "num_tokens": 63518661.0, + "step": 679 + }, + { + "epoch": 0.11606076122205154, + "grad_norm": 0.6097308802062946, + "learning_rate": 3.536439665471924e-05, + "loss": 0.756, + "num_tokens": 63605994.0, + "step": 680 + }, + { + "epoch": 0.11623143881208398, + "grad_norm": 0.5461210887813226, + "learning_rate": 3.535756955111794e-05, + "loss": 0.6066, + "num_tokens": 63695613.0, + "step": 681 + }, + { + "epoch": 0.1164021164021164, + "grad_norm": 0.562178535912725, + "learning_rate": 3.5350742447516646e-05, + "loss": 0.607, + "num_tokens": 63794856.0, + "step": 682 + }, + { + "epoch": 0.11657279399214883, + "grad_norm": 0.5564281434986919, + "learning_rate": 3.5343915343915346e-05, + "loss": 0.6629, + "num_tokens": 63880747.0, + "step": 683 + }, + { + "epoch": 0.11674347158218126, + "grad_norm": 0.5828767144844905, + "learning_rate": 3.533708824031405e-05, + "loss": 0.7848, + "num_tokens": 63989013.0, + "step": 684 + }, + { + "epoch": 0.11691414917221368, + "grad_norm": 0.5445410125586091, + "learning_rate": 3.5330261136712754e-05, + "loss": 0.7009, + "num_tokens": 64091836.0, + "step": 685 + }, + { + "epoch": 0.11708482676224612, + "grad_norm": 0.5544837034260781, + "learning_rate": 3.5323434033111454e-05, + "loss": 0.6395, + "num_tokens": 64176998.0, + "step": 686 + }, + { + "epoch": 0.11725550435227855, + "grad_norm": 0.589313043913957, + "learning_rate": 3.531660692951016e-05, + "loss": 0.6402, + "num_tokens": 64257964.0, + "step": 687 + }, + { + "epoch": 0.11742618194231097, + "grad_norm": 0.5211055606935268, + "learning_rate": 3.530977982590886e-05, + "loss": 0.6722, + "num_tokens": 64380839.0, + "step": 688 + }, + { + "epoch": 0.1175968595323434, + "grad_norm": 0.6140494933032988, + "learning_rate": 3.530295272230756e-05, + "loss": 0.751, + "num_tokens": 64471371.0, + "step": 689 + }, + { + "epoch": 0.11776753712237584, + "grad_norm": 0.5361264491268078, + "learning_rate": 3.529612561870626e-05, + "loss": 0.6478, + "num_tokens": 64586076.0, + "step": 690 + }, + { + "epoch": 0.11793821471240826, + "grad_norm": 0.6055415250454533, + "learning_rate": 3.528929851510497e-05, + "loss": 0.7217, + "num_tokens": 64673598.0, + "step": 691 + }, + { + "epoch": 0.11810889230244069, + "grad_norm": 0.5871158410748388, + "learning_rate": 3.528247141150367e-05, + "loss": 0.594, + "num_tokens": 64746648.0, + "step": 692 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 0.7097608469064496, + "learning_rate": 3.527564430790238e-05, + "loss": 0.7103, + "num_tokens": 64848272.0, + "step": 693 + }, + { + "epoch": 0.11845024748250554, + "grad_norm": 0.617877239023509, + "learning_rate": 3.526881720430108e-05, + "loss": 0.6739, + "num_tokens": 64914062.0, + "step": 694 + }, + { + "epoch": 0.11862092507253798, + "grad_norm": 0.5486207440829552, + "learning_rate": 3.5261990100699785e-05, + "loss": 0.6708, + "num_tokens": 65008863.0, + "step": 695 + }, + { + "epoch": 0.11879160266257041, + "grad_norm": 0.5343670034139644, + "learning_rate": 3.5255162997098485e-05, + "loss": 0.6499, + "num_tokens": 65108628.0, + "step": 696 + }, + { + "epoch": 0.11896228025260283, + "grad_norm": 0.6036679497099515, + "learning_rate": 3.5248335893497185e-05, + "loss": 0.6476, + "num_tokens": 65182974.0, + "step": 697 + }, + { + "epoch": 0.11913295784263526, + "grad_norm": 0.5652892928121769, + "learning_rate": 3.524150878989589e-05, + "loss": 0.6871, + "num_tokens": 65275897.0, + "step": 698 + }, + { + "epoch": 0.11930363543266768, + "grad_norm": 0.5533258245574573, + "learning_rate": 3.523468168629459e-05, + "loss": 0.6732, + "num_tokens": 65383087.0, + "step": 699 + }, + { + "epoch": 0.11947431302270012, + "grad_norm": 0.5127905131870493, + "learning_rate": 3.52278545826933e-05, + "loss": 0.6221, + "num_tokens": 65495591.0, + "step": 700 + }, + { + "epoch": 0.11964499061273255, + "grad_norm": 0.6579466175029122, + "learning_rate": 3.5221027479091994e-05, + "loss": 0.7038, + "num_tokens": 65561850.0, + "step": 701 + }, + { + "epoch": 0.11981566820276497, + "grad_norm": 0.5355251394301593, + "learning_rate": 3.52142003754907e-05, + "loss": 0.6298, + "num_tokens": 65656789.0, + "step": 702 + }, + { + "epoch": 0.1199863457927974, + "grad_norm": 0.5865426242938332, + "learning_rate": 3.52073732718894e-05, + "loss": 0.6783, + "num_tokens": 65734510.0, + "step": 703 + }, + { + "epoch": 0.12015702338282984, + "grad_norm": 0.5530618058332301, + "learning_rate": 3.520054616828811e-05, + "loss": 0.7228, + "num_tokens": 65847469.0, + "step": 704 + }, + { + "epoch": 0.12032770097286226, + "grad_norm": 0.5796062311041674, + "learning_rate": 3.519371906468681e-05, + "loss": 0.7232, + "num_tokens": 65960147.0, + "step": 705 + }, + { + "epoch": 0.12049837856289469, + "grad_norm": 0.6494905916710584, + "learning_rate": 3.518689196108551e-05, + "loss": 0.7221, + "num_tokens": 66037222.0, + "step": 706 + }, + { + "epoch": 0.12066905615292713, + "grad_norm": 0.5554472609733303, + "learning_rate": 3.5180064857484216e-05, + "loss": 0.6513, + "num_tokens": 66152988.0, + "step": 707 + }, + { + "epoch": 0.12083973374295955, + "grad_norm": 0.6163880793106599, + "learning_rate": 3.5173237753882916e-05, + "loss": 0.7339, + "num_tokens": 66273209.0, + "step": 708 + }, + { + "epoch": 0.12101041133299198, + "grad_norm": 0.5402528152200645, + "learning_rate": 3.5166410650281624e-05, + "loss": 0.7121, + "num_tokens": 66381159.0, + "step": 709 + }, + { + "epoch": 0.12118108892302441, + "grad_norm": 0.5780526857918897, + "learning_rate": 3.5159583546680324e-05, + "loss": 0.7175, + "num_tokens": 66467766.0, + "step": 710 + }, + { + "epoch": 0.12135176651305683, + "grad_norm": 0.5698608862680794, + "learning_rate": 3.515275644307903e-05, + "loss": 0.6803, + "num_tokens": 66570668.0, + "step": 711 + }, + { + "epoch": 0.12152244410308927, + "grad_norm": 0.5775972964319694, + "learning_rate": 3.514592933947773e-05, + "loss": 0.64, + "num_tokens": 66657917.0, + "step": 712 + }, + { + "epoch": 0.12169312169312169, + "grad_norm": 0.5669815284631368, + "learning_rate": 3.513910223587643e-05, + "loss": 0.7067, + "num_tokens": 66772634.0, + "step": 713 + }, + { + "epoch": 0.12186379928315412, + "grad_norm": 0.5952144004827717, + "learning_rate": 3.513227513227513e-05, + "loss": 0.6136, + "num_tokens": 66847211.0, + "step": 714 + }, + { + "epoch": 0.12203447687318655, + "grad_norm": 0.5344734300408679, + "learning_rate": 3.512544802867384e-05, + "loss": 0.6589, + "num_tokens": 66949210.0, + "step": 715 + }, + { + "epoch": 0.12220515446321897, + "grad_norm": 0.5763565265778556, + "learning_rate": 3.511862092507254e-05, + "loss": 0.7708, + "num_tokens": 67049501.0, + "step": 716 + }, + { + "epoch": 0.12237583205325141, + "grad_norm": 0.5728648055688432, + "learning_rate": 3.511179382147124e-05, + "loss": 0.6328, + "num_tokens": 67136396.0, + "step": 717 + }, + { + "epoch": 0.12254650964328384, + "grad_norm": 0.5875223769966985, + "learning_rate": 3.510496671786995e-05, + "loss": 0.6405, + "num_tokens": 67231166.0, + "step": 718 + }, + { + "epoch": 0.12271718723331626, + "grad_norm": 0.56243640761712, + "learning_rate": 3.509813961426865e-05, + "loss": 0.613, + "num_tokens": 67306346.0, + "step": 719 + }, + { + "epoch": 0.1228878648233487, + "grad_norm": 0.5749550090728258, + "learning_rate": 3.5091312510667355e-05, + "loss": 0.7, + "num_tokens": 67414673.0, + "step": 720 + }, + { + "epoch": 0.12305854241338113, + "grad_norm": 0.5838955395160954, + "learning_rate": 3.5084485407066055e-05, + "loss": 0.7189, + "num_tokens": 67514267.0, + "step": 721 + }, + { + "epoch": 0.12322922000341355, + "grad_norm": 0.6146460634155827, + "learning_rate": 3.5077658303464756e-05, + "loss": 0.7205, + "num_tokens": 67620580.0, + "step": 722 + }, + { + "epoch": 0.12339989759344598, + "grad_norm": 0.5366200106455599, + "learning_rate": 3.507083119986346e-05, + "loss": 0.6447, + "num_tokens": 67714421.0, + "step": 723 + }, + { + "epoch": 0.12357057518347841, + "grad_norm": 0.672742690916885, + "learning_rate": 3.506400409626216e-05, + "loss": 0.7684, + "num_tokens": 67820977.0, + "step": 724 + }, + { + "epoch": 0.12374125277351083, + "grad_norm": 0.5389895675043395, + "learning_rate": 3.505717699266087e-05, + "loss": 0.6622, + "num_tokens": 67913285.0, + "step": 725 + }, + { + "epoch": 0.12391193036354327, + "grad_norm": 0.5403535634584391, + "learning_rate": 3.5050349889059564e-05, + "loss": 0.6472, + "num_tokens": 68012589.0, + "step": 726 + }, + { + "epoch": 0.1240826079535757, + "grad_norm": 0.5314325130864882, + "learning_rate": 3.504352278545827e-05, + "loss": 0.5942, + "num_tokens": 68106011.0, + "step": 727 + }, + { + "epoch": 0.12425328554360812, + "grad_norm": 0.5061729440467431, + "learning_rate": 3.503669568185697e-05, + "loss": 0.5794, + "num_tokens": 68203229.0, + "step": 728 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 0.542021096599923, + "learning_rate": 3.502986857825568e-05, + "loss": 0.7294, + "num_tokens": 68336419.0, + "step": 729 + }, + { + "epoch": 0.12459464072367298, + "grad_norm": 0.5421642292628917, + "learning_rate": 3.502304147465438e-05, + "loss": 0.5978, + "num_tokens": 68419797.0, + "step": 730 + }, + { + "epoch": 0.12476531831370541, + "grad_norm": 0.8101015240994291, + "learning_rate": 3.5016214371053086e-05, + "loss": 0.7748, + "num_tokens": 68509292.0, + "step": 731 + }, + { + "epoch": 0.12493599590373784, + "grad_norm": 0.563472349438742, + "learning_rate": 3.5009387267451786e-05, + "loss": 0.624, + "num_tokens": 68596728.0, + "step": 732 + }, + { + "epoch": 0.12510667349377028, + "grad_norm": 0.6377601048791521, + "learning_rate": 3.500256016385049e-05, + "loss": 0.7113, + "num_tokens": 68665695.0, + "step": 733 + }, + { + "epoch": 0.1252773510838027, + "grad_norm": 0.507629656843143, + "learning_rate": 3.4995733060249194e-05, + "loss": 0.6667, + "num_tokens": 68793323.0, + "step": 734 + }, + { + "epoch": 0.12544802867383512, + "grad_norm": 0.586540038158281, + "learning_rate": 3.4988905956647894e-05, + "loss": 0.6848, + "num_tokens": 68882375.0, + "step": 735 + }, + { + "epoch": 0.12561870626386756, + "grad_norm": 0.5668186299499408, + "learning_rate": 3.49820788530466e-05, + "loss": 0.6058, + "num_tokens": 68960664.0, + "step": 736 + }, + { + "epoch": 0.12578938385389998, + "grad_norm": 0.6118298570756933, + "learning_rate": 3.49752517494453e-05, + "loss": 0.7433, + "num_tokens": 69049374.0, + "step": 737 + }, + { + "epoch": 0.1259600614439324, + "grad_norm": 0.6237656062498739, + "learning_rate": 3.496842464584401e-05, + "loss": 0.7339, + "num_tokens": 69125588.0, + "step": 738 + }, + { + "epoch": 0.12613073903396485, + "grad_norm": 0.625323638151093, + "learning_rate": 3.49615975422427e-05, + "loss": 0.6889, + "num_tokens": 69199534.0, + "step": 739 + }, + { + "epoch": 0.12630141662399727, + "grad_norm": 0.5637020598943938, + "learning_rate": 3.495477043864141e-05, + "loss": 0.613, + "num_tokens": 69289122.0, + "step": 740 + }, + { + "epoch": 0.1264720942140297, + "grad_norm": 0.6252921034857423, + "learning_rate": 3.494794333504011e-05, + "loss": 0.6571, + "num_tokens": 69364679.0, + "step": 741 + }, + { + "epoch": 0.12664277180406214, + "grad_norm": 0.5857076398829067, + "learning_rate": 3.494111623143882e-05, + "loss": 0.686, + "num_tokens": 69470747.0, + "step": 742 + }, + { + "epoch": 0.12681344939409456, + "grad_norm": 0.5618516312358169, + "learning_rate": 3.493428912783752e-05, + "loss": 0.7222, + "num_tokens": 69585105.0, + "step": 743 + }, + { + "epoch": 0.12698412698412698, + "grad_norm": 0.5494599159570872, + "learning_rate": 3.492746202423622e-05, + "loss": 0.6635, + "num_tokens": 69680042.0, + "step": 744 + }, + { + "epoch": 0.12715480457415942, + "grad_norm": 0.6521494860329216, + "learning_rate": 3.4920634920634925e-05, + "loss": 0.7177, + "num_tokens": 69757519.0, + "step": 745 + }, + { + "epoch": 0.12732548216419184, + "grad_norm": 0.5543884711444046, + "learning_rate": 3.4913807817033626e-05, + "loss": 0.6137, + "num_tokens": 69849480.0, + "step": 746 + }, + { + "epoch": 0.12749615975422426, + "grad_norm": 0.5841184316184238, + "learning_rate": 3.490698071343233e-05, + "loss": 0.7003, + "num_tokens": 69947716.0, + "step": 747 + }, + { + "epoch": 0.1276668373442567, + "grad_norm": 0.589610969252541, + "learning_rate": 3.490015360983103e-05, + "loss": 0.7601, + "num_tokens": 70047569.0, + "step": 748 + }, + { + "epoch": 0.12783751493428913, + "grad_norm": 0.5756092704200104, + "learning_rate": 3.4893326506229733e-05, + "loss": 0.6678, + "num_tokens": 70138720.0, + "step": 749 + }, + { + "epoch": 0.12800819252432155, + "grad_norm": 0.5936404714254403, + "learning_rate": 3.488649940262844e-05, + "loss": 0.7536, + "num_tokens": 70235724.0, + "step": 750 + }, + { + "epoch": 0.128178870114354, + "grad_norm": 0.5859397442508912, + "learning_rate": 3.487967229902714e-05, + "loss": 0.6052, + "num_tokens": 70304151.0, + "step": 751 + }, + { + "epoch": 0.12834954770438642, + "grad_norm": 0.5578055958186685, + "learning_rate": 3.487284519542584e-05, + "loss": 0.75, + "num_tokens": 70408079.0, + "step": 752 + }, + { + "epoch": 0.12852022529441884, + "grad_norm": 0.6116526629020942, + "learning_rate": 3.486601809182454e-05, + "loss": 0.6866, + "num_tokens": 70491101.0, + "step": 753 + }, + { + "epoch": 0.12869090288445126, + "grad_norm": 0.5786351625569407, + "learning_rate": 3.485919098822325e-05, + "loss": 0.6195, + "num_tokens": 70566259.0, + "step": 754 + }, + { + "epoch": 0.1288615804744837, + "grad_norm": 0.5682226533594387, + "learning_rate": 3.485236388462195e-05, + "loss": 0.7104, + "num_tokens": 70684528.0, + "step": 755 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.5151472636579965, + "learning_rate": 3.4845536781020656e-05, + "loss": 0.6093, + "num_tokens": 70792835.0, + "step": 756 + }, + { + "epoch": 0.12920293565454855, + "grad_norm": 0.5206369316928973, + "learning_rate": 3.483870967741936e-05, + "loss": 0.6388, + "num_tokens": 70890168.0, + "step": 757 + }, + { + "epoch": 0.129373613244581, + "grad_norm": 0.5288438452975153, + "learning_rate": 3.4831882573818064e-05, + "loss": 0.6096, + "num_tokens": 70991953.0, + "step": 758 + }, + { + "epoch": 0.1295442908346134, + "grad_norm": 0.5481352587915014, + "learning_rate": 3.4825055470216764e-05, + "loss": 0.6252, + "num_tokens": 71080611.0, + "step": 759 + }, + { + "epoch": 0.12971496842464583, + "grad_norm": 0.5168432193753926, + "learning_rate": 3.4818228366615465e-05, + "loss": 0.6383, + "num_tokens": 71175942.0, + "step": 760 + }, + { + "epoch": 0.12988564601467828, + "grad_norm": 0.5454879782623707, + "learning_rate": 3.481140126301417e-05, + "loss": 0.65, + "num_tokens": 71266429.0, + "step": 761 + }, + { + "epoch": 0.1300563236047107, + "grad_norm": 0.5115899853285042, + "learning_rate": 3.480457415941287e-05, + "loss": 0.6653, + "num_tokens": 71385461.0, + "step": 762 + }, + { + "epoch": 0.13022700119474312, + "grad_norm": 0.4881318719071649, + "learning_rate": 3.479774705581157e-05, + "loss": 0.6248, + "num_tokens": 71495454.0, + "step": 763 + }, + { + "epoch": 0.13039767878477557, + "grad_norm": 0.5972276050395449, + "learning_rate": 3.479091995221027e-05, + "loss": 0.5875, + "num_tokens": 71576057.0, + "step": 764 + }, + { + "epoch": 0.130568356374808, + "grad_norm": 0.6002882317192966, + "learning_rate": 3.478409284860898e-05, + "loss": 0.6583, + "num_tokens": 71682346.0, + "step": 765 + }, + { + "epoch": 0.1307390339648404, + "grad_norm": 0.6008191414622202, + "learning_rate": 3.477726574500768e-05, + "loss": 0.6007, + "num_tokens": 71746013.0, + "step": 766 + }, + { + "epoch": 0.13090971155487285, + "grad_norm": 0.5577644206648226, + "learning_rate": 3.477043864140639e-05, + "loss": 0.6554, + "num_tokens": 71842149.0, + "step": 767 + }, + { + "epoch": 0.13108038914490527, + "grad_norm": 0.5412804598382006, + "learning_rate": 3.476361153780509e-05, + "loss": 0.6106, + "num_tokens": 71936603.0, + "step": 768 + }, + { + "epoch": 0.1312510667349377, + "grad_norm": 0.5859571792002924, + "learning_rate": 3.4756784434203795e-05, + "loss": 0.6571, + "num_tokens": 72031016.0, + "step": 769 + }, + { + "epoch": 0.13142174432497014, + "grad_norm": 0.5291821636709566, + "learning_rate": 3.4749957330602496e-05, + "loss": 0.6756, + "num_tokens": 72137968.0, + "step": 770 + }, + { + "epoch": 0.13159242191500256, + "grad_norm": 0.6394837166161085, + "learning_rate": 3.4743130227001196e-05, + "loss": 0.7097, + "num_tokens": 72210425.0, + "step": 771 + }, + { + "epoch": 0.13176309950503498, + "grad_norm": 0.5113193186003069, + "learning_rate": 3.47363031233999e-05, + "loss": 0.7028, + "num_tokens": 72337173.0, + "step": 772 + }, + { + "epoch": 0.13193377709506743, + "grad_norm": 0.5007580188937161, + "learning_rate": 3.4729476019798603e-05, + "loss": 0.6865, + "num_tokens": 72468609.0, + "step": 773 + }, + { + "epoch": 0.13210445468509985, + "grad_norm": 0.551752277013151, + "learning_rate": 3.472264891619731e-05, + "loss": 0.7417, + "num_tokens": 72583376.0, + "step": 774 + }, + { + "epoch": 0.13227513227513227, + "grad_norm": 0.665338391117444, + "learning_rate": 3.471582181259601e-05, + "loss": 0.6669, + "num_tokens": 72670603.0, + "step": 775 + }, + { + "epoch": 0.13244580986516472, + "grad_norm": 0.5235473644075423, + "learning_rate": 3.470899470899471e-05, + "loss": 0.5748, + "num_tokens": 72766288.0, + "step": 776 + }, + { + "epoch": 0.13261648745519714, + "grad_norm": 0.5751657854044766, + "learning_rate": 3.470216760539341e-05, + "loss": 0.7027, + "num_tokens": 72857506.0, + "step": 777 + }, + { + "epoch": 0.13278716504522955, + "grad_norm": 0.5711890665392755, + "learning_rate": 3.469534050179212e-05, + "loss": 0.6223, + "num_tokens": 72930470.0, + "step": 778 + }, + { + "epoch": 0.132957842635262, + "grad_norm": 0.5912051950645791, + "learning_rate": 3.468851339819082e-05, + "loss": 0.6185, + "num_tokens": 73015730.0, + "step": 779 + }, + { + "epoch": 0.13312852022529442, + "grad_norm": 0.554912499365756, + "learning_rate": 3.468168629458952e-05, + "loss": 0.5899, + "num_tokens": 73094330.0, + "step": 780 + }, + { + "epoch": 0.13329919781532684, + "grad_norm": 0.6044784058877728, + "learning_rate": 3.467485919098823e-05, + "loss": 0.7074, + "num_tokens": 73183653.0, + "step": 781 + }, + { + "epoch": 0.1334698754053593, + "grad_norm": 0.5527665736175474, + "learning_rate": 3.466803208738693e-05, + "loss": 0.634, + "num_tokens": 73296893.0, + "step": 782 + }, + { + "epoch": 0.1336405529953917, + "grad_norm": 0.5453087465186685, + "learning_rate": 3.4661204983785634e-05, + "loss": 0.655, + "num_tokens": 73394330.0, + "step": 783 + }, + { + "epoch": 0.13381123058542413, + "grad_norm": 0.521488374016347, + "learning_rate": 3.4654377880184335e-05, + "loss": 0.6173, + "num_tokens": 73498208.0, + "step": 784 + }, + { + "epoch": 0.13398190817545655, + "grad_norm": 0.536150553918683, + "learning_rate": 3.464755077658304e-05, + "loss": 0.6168, + "num_tokens": 73615196.0, + "step": 785 + }, + { + "epoch": 0.134152585765489, + "grad_norm": 0.5626934903443068, + "learning_rate": 3.464072367298174e-05, + "loss": 0.6755, + "num_tokens": 73698091.0, + "step": 786 + }, + { + "epoch": 0.13432326335552142, + "grad_norm": 0.5619197317110837, + "learning_rate": 3.463389656938044e-05, + "loss": 0.6618, + "num_tokens": 73786974.0, + "step": 787 + }, + { + "epoch": 0.13449394094555384, + "grad_norm": 0.5832545919952337, + "learning_rate": 3.462706946577914e-05, + "loss": 0.696, + "num_tokens": 73890501.0, + "step": 788 + }, + { + "epoch": 0.13466461853558628, + "grad_norm": 0.5605809110775734, + "learning_rate": 3.462024236217785e-05, + "loss": 0.6822, + "num_tokens": 73985951.0, + "step": 789 + }, + { + "epoch": 0.1348352961256187, + "grad_norm": 0.5742836037689109, + "learning_rate": 3.461341525857655e-05, + "loss": 0.685, + "num_tokens": 74071218.0, + "step": 790 + }, + { + "epoch": 0.13500597371565112, + "grad_norm": 0.5535374871589267, + "learning_rate": 3.460658815497525e-05, + "loss": 0.6409, + "num_tokens": 74165058.0, + "step": 791 + }, + { + "epoch": 0.13517665130568357, + "grad_norm": 0.786375103297193, + "learning_rate": 3.459976105137396e-05, + "loss": 0.7335, + "num_tokens": 74256953.0, + "step": 792 + }, + { + "epoch": 0.135347328895716, + "grad_norm": 0.6497376955322123, + "learning_rate": 3.459293394777266e-05, + "loss": 0.7739, + "num_tokens": 74362732.0, + "step": 793 + }, + { + "epoch": 0.1355180064857484, + "grad_norm": 0.5903316681542662, + "learning_rate": 3.4586106844171366e-05, + "loss": 0.6672, + "num_tokens": 74442391.0, + "step": 794 + }, + { + "epoch": 0.13568868407578086, + "grad_norm": 0.5518166236051448, + "learning_rate": 3.4579279740570066e-05, + "loss": 0.6349, + "num_tokens": 74544319.0, + "step": 795 + }, + { + "epoch": 0.13585936166581328, + "grad_norm": 0.5286737623324989, + "learning_rate": 3.4572452636968766e-05, + "loss": 0.6443, + "num_tokens": 74640019.0, + "step": 796 + }, + { + "epoch": 0.1360300392558457, + "grad_norm": 0.5439447586337827, + "learning_rate": 3.4565625533367473e-05, + "loss": 0.5757, + "num_tokens": 74715454.0, + "step": 797 + }, + { + "epoch": 0.13620071684587814, + "grad_norm": 0.5229571661421332, + "learning_rate": 3.4558798429766174e-05, + "loss": 0.6223, + "num_tokens": 74833448.0, + "step": 798 + }, + { + "epoch": 0.13637139443591056, + "grad_norm": 0.6118548998313607, + "learning_rate": 3.455197132616488e-05, + "loss": 0.7364, + "num_tokens": 74925608.0, + "step": 799 + }, + { + "epoch": 0.13654207202594298, + "grad_norm": 0.5511322691072621, + "learning_rate": 3.454514422256358e-05, + "loss": 0.6369, + "num_tokens": 75016990.0, + "step": 800 + }, + { + "epoch": 0.13671274961597543, + "grad_norm": 0.6395241835029489, + "learning_rate": 3.453831711896228e-05, + "loss": 0.6844, + "num_tokens": 75135001.0, + "step": 801 + }, + { + "epoch": 0.13688342720600785, + "grad_norm": 0.5970974833662913, + "learning_rate": 3.453149001536098e-05, + "loss": 0.6814, + "num_tokens": 75232395.0, + "step": 802 + }, + { + "epoch": 0.13705410479604027, + "grad_norm": 0.5655053991514898, + "learning_rate": 3.452466291175969e-05, + "loss": 0.6301, + "num_tokens": 75335921.0, + "step": 803 + }, + { + "epoch": 0.13722478238607272, + "grad_norm": 0.5429561610612147, + "learning_rate": 3.451783580815839e-05, + "loss": 0.5757, + "num_tokens": 75417992.0, + "step": 804 + }, + { + "epoch": 0.13739545997610514, + "grad_norm": 0.5435944660585238, + "learning_rate": 3.45110087045571e-05, + "loss": 0.6158, + "num_tokens": 75513624.0, + "step": 805 + }, + { + "epoch": 0.13756613756613756, + "grad_norm": 0.6006257552284013, + "learning_rate": 3.45041816009558e-05, + "loss": 0.7562, + "num_tokens": 75601479.0, + "step": 806 + }, + { + "epoch": 0.13773681515617, + "grad_norm": 0.508677824412768, + "learning_rate": 3.44973544973545e-05, + "loss": 0.6309, + "num_tokens": 75710409.0, + "step": 807 + }, + { + "epoch": 0.13790749274620243, + "grad_norm": 0.6324385676279917, + "learning_rate": 3.4490527393753205e-05, + "loss": 0.6818, + "num_tokens": 75789107.0, + "step": 808 + }, + { + "epoch": 0.13807817033623485, + "grad_norm": 0.5307546403786317, + "learning_rate": 3.4483700290151905e-05, + "loss": 0.6585, + "num_tokens": 75886992.0, + "step": 809 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 0.5816164019257226, + "learning_rate": 3.447687318655061e-05, + "loss": 0.6589, + "num_tokens": 75968412.0, + "step": 810 + }, + { + "epoch": 0.1384195255162997, + "grad_norm": 0.5261459375267986, + "learning_rate": 3.447004608294931e-05, + "loss": 0.6534, + "num_tokens": 76081735.0, + "step": 811 + }, + { + "epoch": 0.13859020310633213, + "grad_norm": 0.6270712091047304, + "learning_rate": 3.446321897934802e-05, + "loss": 0.6313, + "num_tokens": 76157256.0, + "step": 812 + }, + { + "epoch": 0.13876088069636458, + "grad_norm": 0.6688330334406163, + "learning_rate": 3.445639187574671e-05, + "loss": 0.643, + "num_tokens": 76255793.0, + "step": 813 + }, + { + "epoch": 0.138931558286397, + "grad_norm": 0.6032989792870185, + "learning_rate": 3.444956477214542e-05, + "loss": 0.7044, + "num_tokens": 76364978.0, + "step": 814 + }, + { + "epoch": 0.13910223587642942, + "grad_norm": 0.5539320088220586, + "learning_rate": 3.444273766854412e-05, + "loss": 0.6889, + "num_tokens": 76456007.0, + "step": 815 + }, + { + "epoch": 0.13927291346646187, + "grad_norm": 0.5659243053274413, + "learning_rate": 3.443591056494283e-05, + "loss": 0.7047, + "num_tokens": 76557543.0, + "step": 816 + }, + { + "epoch": 0.1394435910564943, + "grad_norm": 0.5053253232661202, + "learning_rate": 3.442908346134153e-05, + "loss": 0.6555, + "num_tokens": 76668233.0, + "step": 817 + }, + { + "epoch": 0.1396142686465267, + "grad_norm": 0.4863387968322268, + "learning_rate": 3.442225635774023e-05, + "loss": 0.6214, + "num_tokens": 76792295.0, + "step": 818 + }, + { + "epoch": 0.13978494623655913, + "grad_norm": 0.5379595070099465, + "learning_rate": 3.4415429254138936e-05, + "loss": 0.609, + "num_tokens": 76895764.0, + "step": 819 + }, + { + "epoch": 0.13995562382659157, + "grad_norm": 0.575718187139048, + "learning_rate": 3.4408602150537636e-05, + "loss": 0.7021, + "num_tokens": 77005295.0, + "step": 820 + }, + { + "epoch": 0.140126301416624, + "grad_norm": 0.5489611891045184, + "learning_rate": 3.4401775046936343e-05, + "loss": 0.6776, + "num_tokens": 77102339.0, + "step": 821 + }, + { + "epoch": 0.1402969790066564, + "grad_norm": 0.5877043861200004, + "learning_rate": 3.4394947943335044e-05, + "loss": 0.7584, + "num_tokens": 77205921.0, + "step": 822 + }, + { + "epoch": 0.14046765659668886, + "grad_norm": 0.5832019635546548, + "learning_rate": 3.4388120839733744e-05, + "loss": 0.7181, + "num_tokens": 77310363.0, + "step": 823 + }, + { + "epoch": 0.14063833418672128, + "grad_norm": 0.6068096383875914, + "learning_rate": 3.438129373613245e-05, + "loss": 0.6248, + "num_tokens": 77395536.0, + "step": 824 + }, + { + "epoch": 0.1408090117767537, + "grad_norm": 0.57974612144161, + "learning_rate": 3.437446663253115e-05, + "loss": 0.6875, + "num_tokens": 77492077.0, + "step": 825 + }, + { + "epoch": 0.14097968936678615, + "grad_norm": 0.5245884440317214, + "learning_rate": 3.436763952892985e-05, + "loss": 0.6929, + "num_tokens": 77602561.0, + "step": 826 + }, + { + "epoch": 0.14115036695681857, + "grad_norm": 0.5358480100176388, + "learning_rate": 3.436081242532855e-05, + "loss": 0.6355, + "num_tokens": 77690875.0, + "step": 827 + }, + { + "epoch": 0.141321044546851, + "grad_norm": 0.6403265344529828, + "learning_rate": 3.435398532172726e-05, + "loss": 0.6713, + "num_tokens": 77773084.0, + "step": 828 + }, + { + "epoch": 0.14149172213688344, + "grad_norm": 0.567706182590931, + "learning_rate": 3.434715821812596e-05, + "loss": 0.6534, + "num_tokens": 77860651.0, + "step": 829 + }, + { + "epoch": 0.14166239972691586, + "grad_norm": 0.5641943417796886, + "learning_rate": 3.434033111452467e-05, + "loss": 0.6828, + "num_tokens": 77970597.0, + "step": 830 + }, + { + "epoch": 0.14183307731694828, + "grad_norm": 0.5222471859748632, + "learning_rate": 3.433350401092337e-05, + "loss": 0.6294, + "num_tokens": 78065720.0, + "step": 831 + }, + { + "epoch": 0.14200375490698072, + "grad_norm": 0.6909340103703439, + "learning_rate": 3.4326676907322075e-05, + "loss": 0.6444, + "num_tokens": 78175834.0, + "step": 832 + }, + { + "epoch": 0.14217443249701314, + "grad_norm": 0.5585964256520968, + "learning_rate": 3.4319849803720775e-05, + "loss": 0.6043, + "num_tokens": 78270197.0, + "step": 833 + }, + { + "epoch": 0.14234511008704556, + "grad_norm": 0.5638396544966598, + "learning_rate": 3.4313022700119475e-05, + "loss": 0.63, + "num_tokens": 78351477.0, + "step": 834 + }, + { + "epoch": 0.142515787677078, + "grad_norm": 0.5391729190316967, + "learning_rate": 3.430619559651818e-05, + "loss": 0.6291, + "num_tokens": 78450853.0, + "step": 835 + }, + { + "epoch": 0.14268646526711043, + "grad_norm": 0.5556800449408063, + "learning_rate": 3.429936849291688e-05, + "loss": 0.7382, + "num_tokens": 78565133.0, + "step": 836 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.6596159826600803, + "learning_rate": 3.429254138931559e-05, + "loss": 0.5799, + "num_tokens": 78616681.0, + "step": 837 + }, + { + "epoch": 0.1430278204471753, + "grad_norm": 0.675667039133319, + "learning_rate": 3.4285714285714284e-05, + "loss": 0.6883, + "num_tokens": 78704537.0, + "step": 838 + }, + { + "epoch": 0.14319849803720772, + "grad_norm": 0.5080138701337171, + "learning_rate": 3.427888718211299e-05, + "loss": 0.6402, + "num_tokens": 78818860.0, + "step": 839 + }, + { + "epoch": 0.14336917562724014, + "grad_norm": 0.5353719038325939, + "learning_rate": 3.427206007851169e-05, + "loss": 0.7461, + "num_tokens": 78958394.0, + "step": 840 + }, + { + "epoch": 0.14353985321727258, + "grad_norm": 0.5293009890712081, + "learning_rate": 3.42652329749104e-05, + "loss": 0.565, + "num_tokens": 79045209.0, + "step": 841 + }, + { + "epoch": 0.143710530807305, + "grad_norm": 0.6021835330874902, + "learning_rate": 3.42584058713091e-05, + "loss": 0.6585, + "num_tokens": 79124440.0, + "step": 842 + }, + { + "epoch": 0.14388120839733742, + "grad_norm": 0.5669519889442475, + "learning_rate": 3.4251578767707806e-05, + "loss": 0.6451, + "num_tokens": 79216839.0, + "step": 843 + }, + { + "epoch": 0.14405188598736987, + "grad_norm": 0.5492342432203641, + "learning_rate": 3.4244751664106506e-05, + "loss": 0.6523, + "num_tokens": 79309475.0, + "step": 844 + }, + { + "epoch": 0.1442225635774023, + "grad_norm": 0.548250402472044, + "learning_rate": 3.423792456050521e-05, + "loss": 0.6598, + "num_tokens": 79401919.0, + "step": 845 + }, + { + "epoch": 0.1443932411674347, + "grad_norm": 0.5714021943175022, + "learning_rate": 3.4231097456903914e-05, + "loss": 0.6676, + "num_tokens": 79501645.0, + "step": 846 + }, + { + "epoch": 0.14456391875746716, + "grad_norm": 0.5132700535269695, + "learning_rate": 3.4224270353302614e-05, + "loss": 0.6232, + "num_tokens": 79597228.0, + "step": 847 + }, + { + "epoch": 0.14473459634749958, + "grad_norm": 0.5333870837297789, + "learning_rate": 3.421744324970132e-05, + "loss": 0.6073, + "num_tokens": 79687981.0, + "step": 848 + }, + { + "epoch": 0.144905273937532, + "grad_norm": 0.533021694273642, + "learning_rate": 3.421061614610002e-05, + "loss": 0.6459, + "num_tokens": 79797310.0, + "step": 849 + }, + { + "epoch": 0.14507595152756442, + "grad_norm": 0.5391687470578578, + "learning_rate": 3.420378904249872e-05, + "loss": 0.7281, + "num_tokens": 79905408.0, + "step": 850 + }, + { + "epoch": 0.14524662911759686, + "grad_norm": 0.5678254899186858, + "learning_rate": 3.419696193889742e-05, + "loss": 0.6234, + "num_tokens": 79981050.0, + "step": 851 + }, + { + "epoch": 0.14541730670762928, + "grad_norm": 0.5390661289818485, + "learning_rate": 3.419013483529613e-05, + "loss": 0.7044, + "num_tokens": 80074947.0, + "step": 852 + }, + { + "epoch": 0.1455879842976617, + "grad_norm": 0.6133116211006021, + "learning_rate": 3.418330773169483e-05, + "loss": 0.6831, + "num_tokens": 80162928.0, + "step": 853 + }, + { + "epoch": 0.14575866188769415, + "grad_norm": 0.5965568366288224, + "learning_rate": 3.417648062809353e-05, + "loss": 0.6656, + "num_tokens": 80249837.0, + "step": 854 + }, + { + "epoch": 0.14592933947772657, + "grad_norm": 0.568753341765387, + "learning_rate": 3.416965352449224e-05, + "loss": 0.6465, + "num_tokens": 80325714.0, + "step": 855 + }, + { + "epoch": 0.146100017067759, + "grad_norm": 0.5928803176909075, + "learning_rate": 3.416282642089094e-05, + "loss": 0.6234, + "num_tokens": 80406323.0, + "step": 856 + }, + { + "epoch": 0.14627069465779144, + "grad_norm": 0.6824749566651956, + "learning_rate": 3.4155999317289645e-05, + "loss": 0.6935, + "num_tokens": 80488217.0, + "step": 857 + }, + { + "epoch": 0.14644137224782386, + "grad_norm": 0.529640773078604, + "learning_rate": 3.4149172213688345e-05, + "loss": 0.6257, + "num_tokens": 80581710.0, + "step": 858 + }, + { + "epoch": 0.14661204983785628, + "grad_norm": 0.5518700798906355, + "learning_rate": 3.414234511008705e-05, + "loss": 0.7169, + "num_tokens": 80679433.0, + "step": 859 + }, + { + "epoch": 0.14678272742788873, + "grad_norm": 0.632374717551825, + "learning_rate": 3.413551800648575e-05, + "loss": 0.7082, + "num_tokens": 80754948.0, + "step": 860 + }, + { + "epoch": 0.14695340501792115, + "grad_norm": 0.5295219006000464, + "learning_rate": 3.412869090288445e-05, + "loss": 0.6333, + "num_tokens": 80863384.0, + "step": 861 + }, + { + "epoch": 0.14712408260795357, + "grad_norm": 0.6249389252472077, + "learning_rate": 3.4121863799283154e-05, + "loss": 0.733, + "num_tokens": 80941178.0, + "step": 862 + }, + { + "epoch": 0.147294760197986, + "grad_norm": 0.5075852102986409, + "learning_rate": 3.411503669568186e-05, + "loss": 0.6142, + "num_tokens": 81044253.0, + "step": 863 + }, + { + "epoch": 0.14746543778801843, + "grad_norm": 0.5699480628911868, + "learning_rate": 3.410820959208056e-05, + "loss": 0.7635, + "num_tokens": 81137905.0, + "step": 864 + }, + { + "epoch": 0.14763611537805085, + "grad_norm": 0.5332469757878634, + "learning_rate": 3.410138248847926e-05, + "loss": 0.6804, + "num_tokens": 81247257.0, + "step": 865 + }, + { + "epoch": 0.1478067929680833, + "grad_norm": 0.571505320531762, + "learning_rate": 3.409455538487797e-05, + "loss": 0.6447, + "num_tokens": 81324173.0, + "step": 866 + }, + { + "epoch": 0.14797747055811572, + "grad_norm": 0.5365533755167058, + "learning_rate": 3.408772828127667e-05, + "loss": 0.711, + "num_tokens": 81439400.0, + "step": 867 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.5504323635661605, + "learning_rate": 3.4080901177675376e-05, + "loss": 0.7185, + "num_tokens": 81551660.0, + "step": 868 + }, + { + "epoch": 0.1483188257381806, + "grad_norm": 0.5684316127600232, + "learning_rate": 3.4074074074074077e-05, + "loss": 0.6817, + "num_tokens": 81636139.0, + "step": 869 + }, + { + "epoch": 0.148489503328213, + "grad_norm": 0.4839870733512009, + "learning_rate": 3.4067246970472784e-05, + "loss": 0.5708, + "num_tokens": 81749314.0, + "step": 870 + }, + { + "epoch": 0.14866018091824543, + "grad_norm": 0.5912521547957299, + "learning_rate": 3.4060419866871484e-05, + "loss": 0.6969, + "num_tokens": 81831996.0, + "step": 871 + }, + { + "epoch": 0.14883085850827787, + "grad_norm": 0.5610188521504091, + "learning_rate": 3.4053592763270185e-05, + "loss": 0.6546, + "num_tokens": 81919024.0, + "step": 872 + }, + { + "epoch": 0.1490015360983103, + "grad_norm": 0.6110290208548839, + "learning_rate": 3.404676565966889e-05, + "loss": 0.6678, + "num_tokens": 81997921.0, + "step": 873 + }, + { + "epoch": 0.14917221368834271, + "grad_norm": 0.5560626084124021, + "learning_rate": 3.403993855606759e-05, + "loss": 0.7044, + "num_tokens": 82101933.0, + "step": 874 + }, + { + "epoch": 0.14934289127837516, + "grad_norm": 0.5376945524080493, + "learning_rate": 3.403311145246629e-05, + "loss": 0.6731, + "num_tokens": 82208881.0, + "step": 875 + }, + { + "epoch": 0.14951356886840758, + "grad_norm": 0.578938916421303, + "learning_rate": 3.402628434886499e-05, + "loss": 0.6998, + "num_tokens": 82302841.0, + "step": 876 + }, + { + "epoch": 0.14968424645844, + "grad_norm": 0.5375517284684109, + "learning_rate": 3.40194572452637e-05, + "loss": 0.6529, + "num_tokens": 82410544.0, + "step": 877 + }, + { + "epoch": 0.14985492404847245, + "grad_norm": 0.5287233760971404, + "learning_rate": 3.40126301416624e-05, + "loss": 0.6033, + "num_tokens": 82502850.0, + "step": 878 + }, + { + "epoch": 0.15002560163850487, + "grad_norm": 0.5879944050158549, + "learning_rate": 3.400580303806111e-05, + "loss": 0.612, + "num_tokens": 82571366.0, + "step": 879 + }, + { + "epoch": 0.1501962792285373, + "grad_norm": 0.5903316935120404, + "learning_rate": 3.399897593445981e-05, + "loss": 0.6653, + "num_tokens": 82654603.0, + "step": 880 + }, + { + "epoch": 0.1503669568185697, + "grad_norm": 0.7520579633509681, + "learning_rate": 3.399214883085851e-05, + "loss": 0.6168, + "num_tokens": 82743789.0, + "step": 881 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 0.5577305120035513, + "learning_rate": 3.3985321727257215e-05, + "loss": 0.693, + "num_tokens": 82828601.0, + "step": 882 + }, + { + "epoch": 0.15070831199863458, + "grad_norm": 0.5253273734816027, + "learning_rate": 3.3978494623655916e-05, + "loss": 0.5735, + "num_tokens": 82913110.0, + "step": 883 + }, + { + "epoch": 0.150878989588667, + "grad_norm": 0.6063959061247644, + "learning_rate": 3.397166752005462e-05, + "loss": 0.6041, + "num_tokens": 82988483.0, + "step": 884 + }, + { + "epoch": 0.15104966717869944, + "grad_norm": 0.5813380468100351, + "learning_rate": 3.396484041645332e-05, + "loss": 0.633, + "num_tokens": 83066310.0, + "step": 885 + }, + { + "epoch": 0.15122034476873186, + "grad_norm": 0.7052725031928047, + "learning_rate": 3.395801331285203e-05, + "loss": 0.7641, + "num_tokens": 83149633.0, + "step": 886 + }, + { + "epoch": 0.15139102235876428, + "grad_norm": 0.5936942731748192, + "learning_rate": 3.3951186209250724e-05, + "loss": 0.6059, + "num_tokens": 83218628.0, + "step": 887 + }, + { + "epoch": 0.15156169994879673, + "grad_norm": 0.6308273661680083, + "learning_rate": 3.394435910564943e-05, + "loss": 0.6729, + "num_tokens": 83299432.0, + "step": 888 + }, + { + "epoch": 0.15173237753882915, + "grad_norm": 0.6326456256089366, + "learning_rate": 3.393753200204813e-05, + "loss": 0.7423, + "num_tokens": 83378244.0, + "step": 889 + }, + { + "epoch": 0.15190305512886157, + "grad_norm": 0.5144625408900313, + "learning_rate": 3.393070489844684e-05, + "loss": 0.5509, + "num_tokens": 83477260.0, + "step": 890 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 0.5452452963018138, + "learning_rate": 3.392387779484554e-05, + "loss": 0.6092, + "num_tokens": 83574957.0, + "step": 891 + }, + { + "epoch": 0.15224441030892644, + "grad_norm": 0.5276207114347228, + "learning_rate": 3.391705069124424e-05, + "loss": 0.604, + "num_tokens": 83669714.0, + "step": 892 + }, + { + "epoch": 0.15241508789895886, + "grad_norm": 0.5620109076926356, + "learning_rate": 3.3910223587642947e-05, + "loss": 0.7082, + "num_tokens": 83778720.0, + "step": 893 + }, + { + "epoch": 0.1525857654889913, + "grad_norm": 0.5386623429541259, + "learning_rate": 3.390339648404165e-05, + "loss": 0.5727, + "num_tokens": 83868735.0, + "step": 894 + }, + { + "epoch": 0.15275644307902372, + "grad_norm": 0.5579105060993788, + "learning_rate": 3.3896569380440354e-05, + "loss": 0.6937, + "num_tokens": 83959529.0, + "step": 895 + }, + { + "epoch": 0.15292712066905614, + "grad_norm": 0.5781161004041685, + "learning_rate": 3.3889742276839054e-05, + "loss": 0.6259, + "num_tokens": 84061115.0, + "step": 896 + }, + { + "epoch": 0.1530977982590886, + "grad_norm": 0.6995594579127282, + "learning_rate": 3.3882915173237755e-05, + "loss": 0.7324, + "num_tokens": 84149028.0, + "step": 897 + }, + { + "epoch": 0.153268475849121, + "grad_norm": 0.566523654505438, + "learning_rate": 3.387608806963646e-05, + "loss": 0.5986, + "num_tokens": 84232104.0, + "step": 898 + }, + { + "epoch": 0.15343915343915343, + "grad_norm": 0.5469873349230053, + "learning_rate": 3.386926096603516e-05, + "loss": 0.6071, + "num_tokens": 84310852.0, + "step": 899 + }, + { + "epoch": 0.15360983102918588, + "grad_norm": 0.5115799836946827, + "learning_rate": 3.386243386243386e-05, + "loss": 0.5759, + "num_tokens": 84406035.0, + "step": 900 + }, + { + "epoch": 0.1537805086192183, + "grad_norm": 0.5895049281351106, + "learning_rate": 3.385560675883257e-05, + "loss": 0.6369, + "num_tokens": 84481023.0, + "step": 901 + }, + { + "epoch": 0.15395118620925072, + "grad_norm": 0.5386692301014158, + "learning_rate": 3.384877965523127e-05, + "loss": 0.6636, + "num_tokens": 84585993.0, + "step": 902 + }, + { + "epoch": 0.15412186379928317, + "grad_norm": 0.5576610714590958, + "learning_rate": 3.384195255162997e-05, + "loss": 0.7089, + "num_tokens": 84698808.0, + "step": 903 + }, + { + "epoch": 0.15429254138931559, + "grad_norm": 0.5367721066434097, + "learning_rate": 3.383512544802868e-05, + "loss": 0.7303, + "num_tokens": 84809730.0, + "step": 904 + }, + { + "epoch": 0.154463218979348, + "grad_norm": 0.6094351136428775, + "learning_rate": 3.382829834442738e-05, + "loss": 0.7967, + "num_tokens": 84912253.0, + "step": 905 + }, + { + "epoch": 0.15463389656938045, + "grad_norm": 0.5150450037609449, + "learning_rate": 3.3821471240826085e-05, + "loss": 0.6675, + "num_tokens": 85025209.0, + "step": 906 + }, + { + "epoch": 0.15480457415941287, + "grad_norm": 0.5804230665517117, + "learning_rate": 3.3814644137224786e-05, + "loss": 0.6983, + "num_tokens": 85128510.0, + "step": 907 + }, + { + "epoch": 0.1549752517494453, + "grad_norm": 0.5894012750820004, + "learning_rate": 3.3807817033623486e-05, + "loss": 0.6956, + "num_tokens": 85208648.0, + "step": 908 + }, + { + "epoch": 0.15514592933947774, + "grad_norm": 0.5140369261794331, + "learning_rate": 3.380098993002219e-05, + "loss": 0.6213, + "num_tokens": 85322037.0, + "step": 909 + }, + { + "epoch": 0.15531660692951016, + "grad_norm": 0.5244381664227199, + "learning_rate": 3.3794162826420894e-05, + "loss": 0.7328, + "num_tokens": 85440709.0, + "step": 910 + }, + { + "epoch": 0.15548728451954258, + "grad_norm": 0.5412290336834022, + "learning_rate": 3.37873357228196e-05, + "loss": 0.6704, + "num_tokens": 85535325.0, + "step": 911 + }, + { + "epoch": 0.15565796210957503, + "grad_norm": 0.6453882029157079, + "learning_rate": 3.3780508619218294e-05, + "loss": 0.7361, + "num_tokens": 85610257.0, + "step": 912 + }, + { + "epoch": 0.15582863969960745, + "grad_norm": 0.6136194163551394, + "learning_rate": 3.3773681515617e-05, + "loss": 0.7455, + "num_tokens": 85697387.0, + "step": 913 + }, + { + "epoch": 0.15599931728963987, + "grad_norm": 0.5655453757271677, + "learning_rate": 3.37668544120157e-05, + "loss": 0.6505, + "num_tokens": 85773056.0, + "step": 914 + }, + { + "epoch": 0.15616999487967229, + "grad_norm": 0.522637440104471, + "learning_rate": 3.376002730841441e-05, + "loss": 0.6776, + "num_tokens": 85883745.0, + "step": 915 + }, + { + "epoch": 0.15634067246970473, + "grad_norm": 0.6053805701485918, + "learning_rate": 3.375320020481311e-05, + "loss": 0.6248, + "num_tokens": 85954207.0, + "step": 916 + }, + { + "epoch": 0.15651135005973715, + "grad_norm": 0.5119620412505483, + "learning_rate": 3.3746373101211817e-05, + "loss": 0.6101, + "num_tokens": 86065915.0, + "step": 917 + }, + { + "epoch": 0.15668202764976957, + "grad_norm": 0.5233423587345852, + "learning_rate": 3.373954599761052e-05, + "loss": 0.6848, + "num_tokens": 86181134.0, + "step": 918 + }, + { + "epoch": 0.15685270523980202, + "grad_norm": 0.5324111482438306, + "learning_rate": 3.373271889400922e-05, + "loss": 0.6852, + "num_tokens": 86291990.0, + "step": 919 + }, + { + "epoch": 0.15702338282983444, + "grad_norm": 0.5392927331046415, + "learning_rate": 3.3725891790407924e-05, + "loss": 0.6738, + "num_tokens": 86397077.0, + "step": 920 + }, + { + "epoch": 0.15719406041986686, + "grad_norm": 0.50717024982269, + "learning_rate": 3.3719064686806625e-05, + "loss": 0.6185, + "num_tokens": 86490892.0, + "step": 921 + }, + { + "epoch": 0.1573647380098993, + "grad_norm": 0.5014889517801839, + "learning_rate": 3.371223758320533e-05, + "loss": 0.6326, + "num_tokens": 86595437.0, + "step": 922 + }, + { + "epoch": 0.15753541559993173, + "grad_norm": 0.6118482390908685, + "learning_rate": 3.370541047960403e-05, + "loss": 0.6787, + "num_tokens": 86679623.0, + "step": 923 + }, + { + "epoch": 0.15770609318996415, + "grad_norm": 0.6101416634989698, + "learning_rate": 3.369858337600273e-05, + "loss": 0.6299, + "num_tokens": 86745158.0, + "step": 924 + }, + { + "epoch": 0.1578767707799966, + "grad_norm": 0.5401460441470145, + "learning_rate": 3.369175627240143e-05, + "loss": 0.6318, + "num_tokens": 86841547.0, + "step": 925 + }, + { + "epoch": 0.15804744837002901, + "grad_norm": 0.5725386950595799, + "learning_rate": 3.368492916880014e-05, + "loss": 0.6515, + "num_tokens": 86923256.0, + "step": 926 + }, + { + "epoch": 0.15821812596006143, + "grad_norm": 0.5385837980711494, + "learning_rate": 3.367810206519884e-05, + "loss": 0.6305, + "num_tokens": 87015641.0, + "step": 927 + }, + { + "epoch": 0.15838880355009388, + "grad_norm": 0.5662574601274443, + "learning_rate": 3.367127496159754e-05, + "loss": 0.711, + "num_tokens": 87119060.0, + "step": 928 + }, + { + "epoch": 0.1585594811401263, + "grad_norm": 0.5509696204996667, + "learning_rate": 3.366444785799625e-05, + "loss": 0.6108, + "num_tokens": 87196661.0, + "step": 929 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 0.4888470744093665, + "learning_rate": 3.365762075439495e-05, + "loss": 0.6653, + "num_tokens": 87327572.0, + "step": 930 + }, + { + "epoch": 0.15890083632019117, + "grad_norm": 0.5696327711183752, + "learning_rate": 3.3650793650793656e-05, + "loss": 0.6556, + "num_tokens": 87403614.0, + "step": 931 + }, + { + "epoch": 0.1590715139102236, + "grad_norm": 0.5018097701442006, + "learning_rate": 3.3643966547192356e-05, + "loss": 0.6776, + "num_tokens": 87540156.0, + "step": 932 + }, + { + "epoch": 0.159242191500256, + "grad_norm": 0.596709295696763, + "learning_rate": 3.363713944359106e-05, + "loss": 0.6506, + "num_tokens": 87616234.0, + "step": 933 + }, + { + "epoch": 0.15941286909028846, + "grad_norm": 0.537564012056407, + "learning_rate": 3.3630312339989764e-05, + "loss": 0.614, + "num_tokens": 87710679.0, + "step": 934 + }, + { + "epoch": 0.15958354668032088, + "grad_norm": 0.5247536754955109, + "learning_rate": 3.3623485236388464e-05, + "loss": 0.5977, + "num_tokens": 87813388.0, + "step": 935 + }, + { + "epoch": 0.1597542242703533, + "grad_norm": 0.5157772608378306, + "learning_rate": 3.361665813278717e-05, + "loss": 0.5988, + "num_tokens": 87905931.0, + "step": 936 + }, + { + "epoch": 0.15992490186038574, + "grad_norm": 0.6212252384968989, + "learning_rate": 3.360983102918587e-05, + "loss": 0.683, + "num_tokens": 87980996.0, + "step": 937 + }, + { + "epoch": 0.16009557945041816, + "grad_norm": 0.5684460242884022, + "learning_rate": 3.360300392558457e-05, + "loss": 0.6387, + "num_tokens": 88068179.0, + "step": 938 + }, + { + "epoch": 0.16026625704045058, + "grad_norm": 0.5906123563953137, + "learning_rate": 3.359617682198327e-05, + "loss": 0.7555, + "num_tokens": 88156022.0, + "step": 939 + }, + { + "epoch": 0.16043693463048303, + "grad_norm": 0.5711459515553676, + "learning_rate": 3.358934971838198e-05, + "loss": 0.6565, + "num_tokens": 88251435.0, + "step": 940 + }, + { + "epoch": 0.16060761222051545, + "grad_norm": 0.5087385774415003, + "learning_rate": 3.358252261478068e-05, + "loss": 0.6698, + "num_tokens": 88368677.0, + "step": 941 + }, + { + "epoch": 0.16077828981054787, + "grad_norm": 0.6005839238852286, + "learning_rate": 3.357569551117939e-05, + "loss": 0.6596, + "num_tokens": 88460673.0, + "step": 942 + }, + { + "epoch": 0.16094896740058032, + "grad_norm": 0.6132494820395977, + "learning_rate": 3.356886840757809e-05, + "loss": 0.7569, + "num_tokens": 88560554.0, + "step": 943 + }, + { + "epoch": 0.16111964499061274, + "grad_norm": 0.5855934545784317, + "learning_rate": 3.3562041303976794e-05, + "loss": 0.5969, + "num_tokens": 88637620.0, + "step": 944 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.5544399279290957, + "learning_rate": 3.3555214200375495e-05, + "loss": 0.6316, + "num_tokens": 88723861.0, + "step": 945 + }, + { + "epoch": 0.16146100017067758, + "grad_norm": 0.5618989884003969, + "learning_rate": 3.3548387096774195e-05, + "loss": 0.6688, + "num_tokens": 88827754.0, + "step": 946 + }, + { + "epoch": 0.16163167776071002, + "grad_norm": 0.5307284892972446, + "learning_rate": 3.35415599931729e-05, + "loss": 0.591, + "num_tokens": 88925247.0, + "step": 947 + }, + { + "epoch": 0.16180235535074244, + "grad_norm": 0.5438388180778976, + "learning_rate": 3.35347328895716e-05, + "loss": 0.6895, + "num_tokens": 89037888.0, + "step": 948 + }, + { + "epoch": 0.16197303294077486, + "grad_norm": 0.5042504159395739, + "learning_rate": 3.35279057859703e-05, + "loss": 0.5396, + "num_tokens": 89136267.0, + "step": 949 + }, + { + "epoch": 0.1621437105308073, + "grad_norm": 0.5560336613164998, + "learning_rate": 3.3521078682369003e-05, + "loss": 0.6597, + "num_tokens": 89231922.0, + "step": 950 + }, + { + "epoch": 0.16231438812083973, + "grad_norm": 0.6000062700459601, + "learning_rate": 3.351425157876771e-05, + "loss": 0.5726, + "num_tokens": 89304872.0, + "step": 951 + }, + { + "epoch": 0.16248506571087215, + "grad_norm": 0.5617838527534526, + "learning_rate": 3.350742447516641e-05, + "loss": 0.6477, + "num_tokens": 89389995.0, + "step": 952 + }, + { + "epoch": 0.1626557433009046, + "grad_norm": 0.5544101163738012, + "learning_rate": 3.350059737156512e-05, + "loss": 0.6598, + "num_tokens": 89490503.0, + "step": 953 + }, + { + "epoch": 0.16282642089093702, + "grad_norm": 0.5771115770679669, + "learning_rate": 3.349377026796382e-05, + "loss": 0.7467, + "num_tokens": 89587181.0, + "step": 954 + }, + { + "epoch": 0.16299709848096944, + "grad_norm": 0.5381769375950448, + "learning_rate": 3.348694316436252e-05, + "loss": 0.6717, + "num_tokens": 89686524.0, + "step": 955 + }, + { + "epoch": 0.16316777607100189, + "grad_norm": 0.5268824851396524, + "learning_rate": 3.3480116060761226e-05, + "loss": 0.6475, + "num_tokens": 89796447.0, + "step": 956 + }, + { + "epoch": 0.1633384536610343, + "grad_norm": 0.564657557146007, + "learning_rate": 3.3473288957159926e-05, + "loss": 0.6913, + "num_tokens": 89908407.0, + "step": 957 + }, + { + "epoch": 0.16350913125106672, + "grad_norm": 0.525067481978308, + "learning_rate": 3.3466461853558634e-05, + "loss": 0.6545, + "num_tokens": 90007604.0, + "step": 958 + }, + { + "epoch": 0.16367980884109917, + "grad_norm": 0.5520817391694264, + "learning_rate": 3.3459634749957334e-05, + "loss": 0.6492, + "num_tokens": 90102323.0, + "step": 959 + }, + { + "epoch": 0.1638504864311316, + "grad_norm": 0.5230336953839675, + "learning_rate": 3.345280764635604e-05, + "loss": 0.5668, + "num_tokens": 90180654.0, + "step": 960 + }, + { + "epoch": 0.164021164021164, + "grad_norm": 0.5679019776850219, + "learning_rate": 3.3445980542754735e-05, + "loss": 0.6084, + "num_tokens": 90259316.0, + "step": 961 + }, + { + "epoch": 0.16419184161119646, + "grad_norm": 0.6600682913525433, + "learning_rate": 3.343915343915344e-05, + "loss": 0.6441, + "num_tokens": 90371191.0, + "step": 962 + }, + { + "epoch": 0.16436251920122888, + "grad_norm": 0.5692665177939955, + "learning_rate": 3.343232633555214e-05, + "loss": 0.6638, + "num_tokens": 90477961.0, + "step": 963 + }, + { + "epoch": 0.1645331967912613, + "grad_norm": 0.5664405221890251, + "learning_rate": 3.342549923195085e-05, + "loss": 0.6386, + "num_tokens": 90574624.0, + "step": 964 + }, + { + "epoch": 0.16470387438129375, + "grad_norm": 0.5929796956519587, + "learning_rate": 3.341867212834955e-05, + "loss": 0.601, + "num_tokens": 90643245.0, + "step": 965 + }, + { + "epoch": 0.16487455197132617, + "grad_norm": 0.6080988494772495, + "learning_rate": 3.341184502474825e-05, + "loss": 0.7273, + "num_tokens": 90729054.0, + "step": 966 + }, + { + "epoch": 0.1650452295613586, + "grad_norm": 0.5602185318921884, + "learning_rate": 3.340501792114696e-05, + "loss": 0.6423, + "num_tokens": 90813937.0, + "step": 967 + }, + { + "epoch": 0.16521590715139103, + "grad_norm": 0.5080185115236072, + "learning_rate": 3.339819081754566e-05, + "loss": 0.6018, + "num_tokens": 90917385.0, + "step": 968 + }, + { + "epoch": 0.16538658474142345, + "grad_norm": 0.572149736504798, + "learning_rate": 3.3391363713944365e-05, + "loss": 0.6441, + "num_tokens": 91016641.0, + "step": 969 + }, + { + "epoch": 0.16555726233145587, + "grad_norm": 0.5271122635046286, + "learning_rate": 3.3384536610343065e-05, + "loss": 0.7383, + "num_tokens": 91141154.0, + "step": 970 + }, + { + "epoch": 0.16572793992148832, + "grad_norm": 0.5256873891421264, + "learning_rate": 3.3377709506741766e-05, + "loss": 0.7044, + "num_tokens": 91254287.0, + "step": 971 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 0.5087632236592299, + "learning_rate": 3.337088240314047e-05, + "loss": 0.686, + "num_tokens": 91370648.0, + "step": 972 + }, + { + "epoch": 0.16606929510155316, + "grad_norm": 0.5707176086482143, + "learning_rate": 3.336405529953917e-05, + "loss": 0.7016, + "num_tokens": 91459373.0, + "step": 973 + }, + { + "epoch": 0.1662399726915856, + "grad_norm": 0.5078473761790777, + "learning_rate": 3.3357228195937873e-05, + "loss": 0.5858, + "num_tokens": 91555569.0, + "step": 974 + }, + { + "epoch": 0.16641065028161803, + "grad_norm": 0.5115535191677693, + "learning_rate": 3.335040109233658e-05, + "loss": 0.5867, + "num_tokens": 91660545.0, + "step": 975 + }, + { + "epoch": 0.16658132787165045, + "grad_norm": 0.5398910125710993, + "learning_rate": 3.334357398873528e-05, + "loss": 0.7007, + "num_tokens": 91757161.0, + "step": 976 + }, + { + "epoch": 0.16675200546168287, + "grad_norm": 0.5628928232737316, + "learning_rate": 3.333674688513398e-05, + "loss": 0.6087, + "num_tokens": 91834322.0, + "step": 977 + }, + { + "epoch": 0.16692268305171531, + "grad_norm": 0.5640094547448221, + "learning_rate": 3.332991978153269e-05, + "loss": 0.638, + "num_tokens": 91918212.0, + "step": 978 + }, + { + "epoch": 0.16709336064174773, + "grad_norm": 0.5521705909359437, + "learning_rate": 3.332309267793139e-05, + "loss": 0.6765, + "num_tokens": 92020926.0, + "step": 979 + }, + { + "epoch": 0.16726403823178015, + "grad_norm": 0.5212643391856508, + "learning_rate": 3.3316265574330096e-05, + "loss": 0.581, + "num_tokens": 92114569.0, + "step": 980 + }, + { + "epoch": 0.1674347158218126, + "grad_norm": 0.5363505715396621, + "learning_rate": 3.3309438470728796e-05, + "loss": 0.6397, + "num_tokens": 92204567.0, + "step": 981 + }, + { + "epoch": 0.16760539341184502, + "grad_norm": 0.5109377152235322, + "learning_rate": 3.33026113671275e-05, + "loss": 0.6629, + "num_tokens": 92319421.0, + "step": 982 + }, + { + "epoch": 0.16777607100187744, + "grad_norm": 0.5689780830519742, + "learning_rate": 3.3295784263526204e-05, + "loss": 0.6073, + "num_tokens": 92395771.0, + "step": 983 + }, + { + "epoch": 0.1679467485919099, + "grad_norm": 0.48892928067357494, + "learning_rate": 3.3288957159924904e-05, + "loss": 0.6402, + "num_tokens": 92502786.0, + "step": 984 + }, + { + "epoch": 0.1681174261819423, + "grad_norm": 0.5144695534470568, + "learning_rate": 3.328213005632361e-05, + "loss": 0.6171, + "num_tokens": 92593123.0, + "step": 985 + }, + { + "epoch": 0.16828810377197473, + "grad_norm": 0.5335187557280862, + "learning_rate": 3.3275302952722305e-05, + "loss": 0.6093, + "num_tokens": 92689308.0, + "step": 986 + }, + { + "epoch": 0.16845878136200718, + "grad_norm": 0.49972809867810575, + "learning_rate": 3.326847584912101e-05, + "loss": 0.7224, + "num_tokens": 92807877.0, + "step": 987 + }, + { + "epoch": 0.1686294589520396, + "grad_norm": 0.5675878316167894, + "learning_rate": 3.326164874551971e-05, + "loss": 0.7169, + "num_tokens": 92886274.0, + "step": 988 + }, + { + "epoch": 0.16880013654207202, + "grad_norm": 0.6081032682890283, + "learning_rate": 3.325482164191842e-05, + "loss": 0.7695, + "num_tokens": 92972551.0, + "step": 989 + }, + { + "epoch": 0.16897081413210446, + "grad_norm": 0.6048355601539958, + "learning_rate": 3.324799453831712e-05, + "loss": 0.7242, + "num_tokens": 93085625.0, + "step": 990 + }, + { + "epoch": 0.16914149172213688, + "grad_norm": 0.5096053077435836, + "learning_rate": 3.324116743471583e-05, + "loss": 0.6634, + "num_tokens": 93189829.0, + "step": 991 + }, + { + "epoch": 0.1693121693121693, + "grad_norm": 0.5677482548327386, + "learning_rate": 3.323434033111453e-05, + "loss": 0.5687, + "num_tokens": 93254334.0, + "step": 992 + }, + { + "epoch": 0.16948284690220175, + "grad_norm": 0.5017507355452938, + "learning_rate": 3.322751322751323e-05, + "loss": 0.578, + "num_tokens": 93352547.0, + "step": 993 + }, + { + "epoch": 0.16965352449223417, + "grad_norm": 0.5330566889094631, + "learning_rate": 3.3220686123911935e-05, + "loss": 0.6496, + "num_tokens": 93458295.0, + "step": 994 + }, + { + "epoch": 0.1698242020822666, + "grad_norm": 0.5700787268990278, + "learning_rate": 3.3213859020310636e-05, + "loss": 0.775, + "num_tokens": 93561313.0, + "step": 995 + }, + { + "epoch": 0.16999487967229904, + "grad_norm": 0.4916298915541908, + "learning_rate": 3.320703191670934e-05, + "loss": 0.568, + "num_tokens": 93661031.0, + "step": 996 + }, + { + "epoch": 0.17016555726233146, + "grad_norm": 0.5705816218462098, + "learning_rate": 3.320020481310804e-05, + "loss": 0.661, + "num_tokens": 93750979.0, + "step": 997 + }, + { + "epoch": 0.17033623485236388, + "grad_norm": 0.5732547380092387, + "learning_rate": 3.3193377709506743e-05, + "loss": 0.614, + "num_tokens": 93826929.0, + "step": 998 + }, + { + "epoch": 0.17050691244239632, + "grad_norm": 0.5893235762576946, + "learning_rate": 3.3186550605905444e-05, + "loss": 0.6944, + "num_tokens": 93920199.0, + "step": 999 + }, + { + "epoch": 0.17067759003242874, + "grad_norm": 0.5076384084987617, + "learning_rate": 3.317972350230415e-05, + "loss": 0.7353, + "num_tokens": 94041831.0, + "step": 1000 + }, + { + "epoch": 0.17084826762246116, + "grad_norm": 0.5372763661025041, + "learning_rate": 3.317289639870285e-05, + "loss": 0.6186, + "num_tokens": 94129732.0, + "step": 1001 + }, + { + "epoch": 0.1710189452124936, + "grad_norm": 0.5884394712052557, + "learning_rate": 3.316606929510155e-05, + "loss": 0.6221, + "num_tokens": 94201750.0, + "step": 1002 + }, + { + "epoch": 0.17118962280252603, + "grad_norm": 0.5678552599723303, + "learning_rate": 3.315924219150026e-05, + "loss": 0.5957, + "num_tokens": 94277552.0, + "step": 1003 + }, + { + "epoch": 0.17136030039255845, + "grad_norm": 0.5315261466164247, + "learning_rate": 3.315241508789896e-05, + "loss": 0.6889, + "num_tokens": 94376764.0, + "step": 1004 + }, + { + "epoch": 0.1715309779825909, + "grad_norm": 0.5905329852165664, + "learning_rate": 3.3145587984297666e-05, + "loss": 0.6871, + "num_tokens": 94455167.0, + "step": 1005 + }, + { + "epoch": 0.17170165557262332, + "grad_norm": 0.5570139304350328, + "learning_rate": 3.313876088069637e-05, + "loss": 0.586, + "num_tokens": 94542107.0, + "step": 1006 + }, + { + "epoch": 0.17187233316265574, + "grad_norm": 0.6019041874096699, + "learning_rate": 3.3131933777095074e-05, + "loss": 0.6133, + "num_tokens": 94617257.0, + "step": 1007 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 0.5370897078121935, + "learning_rate": 3.3125106673493774e-05, + "loss": 0.6372, + "num_tokens": 94719139.0, + "step": 1008 + }, + { + "epoch": 0.1722136883427206, + "grad_norm": 0.5448237519917914, + "learning_rate": 3.3118279569892475e-05, + "loss": 0.6826, + "num_tokens": 94814676.0, + "step": 1009 + }, + { + "epoch": 0.17238436593275303, + "grad_norm": 0.5653097312568957, + "learning_rate": 3.311145246629118e-05, + "loss": 0.764, + "num_tokens": 94911780.0, + "step": 1010 + }, + { + "epoch": 0.17255504352278545, + "grad_norm": 0.5918963709788372, + "learning_rate": 3.310462536268988e-05, + "loss": 0.6819, + "num_tokens": 95016750.0, + "step": 1011 + }, + { + "epoch": 0.1727257211128179, + "grad_norm": 0.5512379188149057, + "learning_rate": 3.309779825908858e-05, + "loss": 0.6109, + "num_tokens": 95095351.0, + "step": 1012 + }, + { + "epoch": 0.1728963987028503, + "grad_norm": 0.5494245451851791, + "learning_rate": 3.309097115548728e-05, + "loss": 0.647, + "num_tokens": 95178310.0, + "step": 1013 + }, + { + "epoch": 0.17306707629288273, + "grad_norm": 0.5553821764129004, + "learning_rate": 3.308414405188599e-05, + "loss": 0.7341, + "num_tokens": 95270267.0, + "step": 1014 + }, + { + "epoch": 0.17323775388291518, + "grad_norm": 0.5423383397745963, + "learning_rate": 3.307731694828469e-05, + "loss": 0.6005, + "num_tokens": 95355531.0, + "step": 1015 + }, + { + "epoch": 0.1734084314729476, + "grad_norm": 0.5804827520020734, + "learning_rate": 3.30704898446834e-05, + "loss": 0.766, + "num_tokens": 95470834.0, + "step": 1016 + }, + { + "epoch": 0.17357910906298002, + "grad_norm": 0.5604912890454723, + "learning_rate": 3.30636627410821e-05, + "loss": 0.694, + "num_tokens": 95594240.0, + "step": 1017 + }, + { + "epoch": 0.17374978665301247, + "grad_norm": 0.5386607336879292, + "learning_rate": 3.3056835637480805e-05, + "loss": 0.6089, + "num_tokens": 95689900.0, + "step": 1018 + }, + { + "epoch": 0.1739204642430449, + "grad_norm": 0.5327327196205804, + "learning_rate": 3.3050008533879506e-05, + "loss": 0.5876, + "num_tokens": 95783833.0, + "step": 1019 + }, + { + "epoch": 0.1740911418330773, + "grad_norm": 0.5585758229537341, + "learning_rate": 3.3043181430278206e-05, + "loss": 0.6383, + "num_tokens": 95884355.0, + "step": 1020 + }, + { + "epoch": 0.17426181942310975, + "grad_norm": 0.5100032411701333, + "learning_rate": 3.303635432667691e-05, + "loss": 0.6435, + "num_tokens": 96012522.0, + "step": 1021 + }, + { + "epoch": 0.17443249701314217, + "grad_norm": 0.5079707830850448, + "learning_rate": 3.3029527223075613e-05, + "loss": 0.6638, + "num_tokens": 96124094.0, + "step": 1022 + }, + { + "epoch": 0.1746031746031746, + "grad_norm": 0.5014631304180031, + "learning_rate": 3.3022700119474314e-05, + "loss": 0.5505, + "num_tokens": 96217511.0, + "step": 1023 + }, + { + "epoch": 0.17477385219320704, + "grad_norm": 0.573539139444039, + "learning_rate": 3.3015873015873014e-05, + "loss": 0.7568, + "num_tokens": 96333654.0, + "step": 1024 + }, + { + "epoch": 0.17494452978323946, + "grad_norm": 0.6098169419382015, + "learning_rate": 3.300904591227172e-05, + "loss": 0.7474, + "num_tokens": 96414352.0, + "step": 1025 + }, + { + "epoch": 0.17511520737327188, + "grad_norm": 0.5385515482086622, + "learning_rate": 3.300221880867042e-05, + "loss": 0.7492, + "num_tokens": 96528981.0, + "step": 1026 + }, + { + "epoch": 0.17528588496330433, + "grad_norm": 0.48718731497330936, + "learning_rate": 3.299539170506913e-05, + "loss": 0.6593, + "num_tokens": 96653422.0, + "step": 1027 + }, + { + "epoch": 0.17545656255333675, + "grad_norm": 0.5494473496232791, + "learning_rate": 3.298856460146783e-05, + "loss": 0.6081, + "num_tokens": 96736563.0, + "step": 1028 + }, + { + "epoch": 0.17562724014336917, + "grad_norm": 0.5690493942851514, + "learning_rate": 3.298173749786653e-05, + "loss": 0.6718, + "num_tokens": 96818360.0, + "step": 1029 + }, + { + "epoch": 0.17579791773340162, + "grad_norm": 0.5023918120088063, + "learning_rate": 3.297491039426524e-05, + "loss": 0.6211, + "num_tokens": 96926495.0, + "step": 1030 + }, + { + "epoch": 0.17596859532343403, + "grad_norm": 0.5311648937282897, + "learning_rate": 3.296808329066394e-05, + "loss": 0.6109, + "num_tokens": 97007574.0, + "step": 1031 + }, + { + "epoch": 0.17613927291346645, + "grad_norm": 0.5564817837314057, + "learning_rate": 3.2961256187062644e-05, + "loss": 0.6542, + "num_tokens": 97097027.0, + "step": 1032 + }, + { + "epoch": 0.1763099505034989, + "grad_norm": 0.504286089368628, + "learning_rate": 3.2954429083461345e-05, + "loss": 0.5959, + "num_tokens": 97195554.0, + "step": 1033 + }, + { + "epoch": 0.17648062809353132, + "grad_norm": 0.5113749281139018, + "learning_rate": 3.294760197986005e-05, + "loss": 0.5622, + "num_tokens": 97299934.0, + "step": 1034 + }, + { + "epoch": 0.17665130568356374, + "grad_norm": 0.6556929834039832, + "learning_rate": 3.294077487625875e-05, + "loss": 0.7035, + "num_tokens": 97390188.0, + "step": 1035 + }, + { + "epoch": 0.1768219832735962, + "grad_norm": 0.4709716661530288, + "learning_rate": 3.293394777265745e-05, + "loss": 0.5602, + "num_tokens": 97507186.0, + "step": 1036 + }, + { + "epoch": 0.1769926608636286, + "grad_norm": 0.4732807271980688, + "learning_rate": 3.292712066905615e-05, + "loss": 0.6962, + "num_tokens": 97647976.0, + "step": 1037 + }, + { + "epoch": 0.17716333845366103, + "grad_norm": 0.4959150405341842, + "learning_rate": 3.292029356545486e-05, + "loss": 0.6196, + "num_tokens": 97749518.0, + "step": 1038 + }, + { + "epoch": 0.17733401604369348, + "grad_norm": 0.5184711549397046, + "learning_rate": 3.291346646185356e-05, + "loss": 0.6441, + "num_tokens": 97849984.0, + "step": 1039 + }, + { + "epoch": 0.1775046936337259, + "grad_norm": 0.5545388565474023, + "learning_rate": 3.290663935825226e-05, + "loss": 0.6686, + "num_tokens": 97939752.0, + "step": 1040 + }, + { + "epoch": 0.17767537122375832, + "grad_norm": 0.5404986268524806, + "learning_rate": 3.289981225465097e-05, + "loss": 0.7049, + "num_tokens": 98039597.0, + "step": 1041 + }, + { + "epoch": 0.17784604881379074, + "grad_norm": 0.5942106861764868, + "learning_rate": 3.289298515104967e-05, + "loss": 0.7959, + "num_tokens": 98143825.0, + "step": 1042 + }, + { + "epoch": 0.17801672640382318, + "grad_norm": 0.5218674721373961, + "learning_rate": 3.2886158047448375e-05, + "loss": 0.6272, + "num_tokens": 98229972.0, + "step": 1043 + }, + { + "epoch": 0.1781874039938556, + "grad_norm": 0.5083313347998115, + "learning_rate": 3.2879330943847076e-05, + "loss": 0.6193, + "num_tokens": 98325135.0, + "step": 1044 + }, + { + "epoch": 0.17835808158388802, + "grad_norm": 0.5322947242586562, + "learning_rate": 3.287250384024578e-05, + "loss": 0.6541, + "num_tokens": 98419693.0, + "step": 1045 + }, + { + "epoch": 0.17852875917392047, + "grad_norm": 0.5114651870932921, + "learning_rate": 3.2865676736644483e-05, + "loss": 0.6395, + "num_tokens": 98530164.0, + "step": 1046 + }, + { + "epoch": 0.1786994367639529, + "grad_norm": 0.5250944059131747, + "learning_rate": 3.2858849633043184e-05, + "loss": 0.6687, + "num_tokens": 98623677.0, + "step": 1047 + }, + { + "epoch": 0.1788701143539853, + "grad_norm": 0.5549065838463765, + "learning_rate": 3.2852022529441884e-05, + "loss": 0.6395, + "num_tokens": 98712074.0, + "step": 1048 + }, + { + "epoch": 0.17904079194401776, + "grad_norm": 0.49051174338270936, + "learning_rate": 3.284519542584059e-05, + "loss": 0.6365, + "num_tokens": 98828421.0, + "step": 1049 + }, + { + "epoch": 0.17921146953405018, + "grad_norm": 0.5544320064390491, + "learning_rate": 3.283836832223929e-05, + "loss": 0.7162, + "num_tokens": 98939646.0, + "step": 1050 + }, + { + "epoch": 0.1793821471240826, + "grad_norm": 0.5112416115462997, + "learning_rate": 3.283154121863799e-05, + "loss": 0.6498, + "num_tokens": 99047578.0, + "step": 1051 + }, + { + "epoch": 0.17955282471411504, + "grad_norm": 0.5138905369972275, + "learning_rate": 3.28247141150367e-05, + "loss": 0.6544, + "num_tokens": 99153311.0, + "step": 1052 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 0.4971419593723526, + "learning_rate": 3.28178870114354e-05, + "loss": 0.5947, + "num_tokens": 99251817.0, + "step": 1053 + }, + { + "epoch": 0.17989417989417988, + "grad_norm": 0.8454443649508292, + "learning_rate": 3.281105990783411e-05, + "loss": 0.6998, + "num_tokens": 99354051.0, + "step": 1054 + }, + { + "epoch": 0.18006485748421233, + "grad_norm": 0.5485911571468621, + "learning_rate": 3.280423280423281e-05, + "loss": 0.5816, + "num_tokens": 99424473.0, + "step": 1055 + }, + { + "epoch": 0.18023553507424475, + "grad_norm": 0.5607732993723046, + "learning_rate": 3.279740570063151e-05, + "loss": 0.5992, + "num_tokens": 99502483.0, + "step": 1056 + }, + { + "epoch": 0.18040621266427717, + "grad_norm": 0.542588942197577, + "learning_rate": 3.2790578597030215e-05, + "loss": 0.6663, + "num_tokens": 99592999.0, + "step": 1057 + }, + { + "epoch": 0.18057689025430962, + "grad_norm": 0.5326090429984599, + "learning_rate": 3.2783751493428915e-05, + "loss": 0.6086, + "num_tokens": 99684311.0, + "step": 1058 + }, + { + "epoch": 0.18074756784434204, + "grad_norm": 0.5466597851546151, + "learning_rate": 3.277692438982762e-05, + "loss": 0.6688, + "num_tokens": 99780203.0, + "step": 1059 + }, + { + "epoch": 0.18091824543437446, + "grad_norm": 0.5263014501764363, + "learning_rate": 3.277009728622632e-05, + "loss": 0.6587, + "num_tokens": 99881625.0, + "step": 1060 + }, + { + "epoch": 0.1810889230244069, + "grad_norm": 0.5100129755144327, + "learning_rate": 3.276327018262502e-05, + "loss": 0.7083, + "num_tokens": 99994639.0, + "step": 1061 + }, + { + "epoch": 0.18125960061443933, + "grad_norm": 0.559290417245474, + "learning_rate": 3.275644307902372e-05, + "loss": 0.6071, + "num_tokens": 100074500.0, + "step": 1062 + }, + { + "epoch": 0.18143027820447175, + "grad_norm": 0.5568330258691305, + "learning_rate": 3.274961597542243e-05, + "loss": 0.6638, + "num_tokens": 100166121.0, + "step": 1063 + }, + { + "epoch": 0.1816009557945042, + "grad_norm": 0.4916206257729899, + "learning_rate": 3.274278887182113e-05, + "loss": 0.5968, + "num_tokens": 100269068.0, + "step": 1064 + }, + { + "epoch": 0.1817716333845366, + "grad_norm": 0.5640768881512479, + "learning_rate": 3.273596176821984e-05, + "loss": 0.6893, + "num_tokens": 100368625.0, + "step": 1065 + }, + { + "epoch": 0.18194231097456903, + "grad_norm": 0.5054984045064248, + "learning_rate": 3.272913466461854e-05, + "loss": 0.5573, + "num_tokens": 100457850.0, + "step": 1066 + }, + { + "epoch": 0.18211298856460148, + "grad_norm": 0.5339726695306779, + "learning_rate": 3.272230756101724e-05, + "loss": 0.6282, + "num_tokens": 100543187.0, + "step": 1067 + }, + { + "epoch": 0.1822836661546339, + "grad_norm": 0.5526661016089206, + "learning_rate": 3.2715480457415946e-05, + "loss": 0.7, + "num_tokens": 100638844.0, + "step": 1068 + }, + { + "epoch": 0.18245434374466632, + "grad_norm": 0.5817235797564136, + "learning_rate": 3.2708653353814646e-05, + "loss": 0.759, + "num_tokens": 100728471.0, + "step": 1069 + }, + { + "epoch": 0.18262502133469877, + "grad_norm": 0.5082540313932651, + "learning_rate": 3.270182625021335e-05, + "loss": 0.5713, + "num_tokens": 100825921.0, + "step": 1070 + }, + { + "epoch": 0.1827956989247312, + "grad_norm": 0.5351922159364358, + "learning_rate": 3.2694999146612054e-05, + "loss": 0.5924, + "num_tokens": 100910293.0, + "step": 1071 + }, + { + "epoch": 0.1829663765147636, + "grad_norm": 0.5630524388980729, + "learning_rate": 3.2688172043010754e-05, + "loss": 0.6728, + "num_tokens": 101010646.0, + "step": 1072 + }, + { + "epoch": 0.18313705410479603, + "grad_norm": 0.5481652878447458, + "learning_rate": 3.2681344939409454e-05, + "loss": 0.6167, + "num_tokens": 101110030.0, + "step": 1073 + }, + { + "epoch": 0.18330773169482847, + "grad_norm": 0.5595702418640467, + "learning_rate": 3.267451783580816e-05, + "loss": 0.6012, + "num_tokens": 101193384.0, + "step": 1074 + }, + { + "epoch": 0.1834784092848609, + "grad_norm": 0.48923386837257415, + "learning_rate": 3.266769073220686e-05, + "loss": 0.6201, + "num_tokens": 101297583.0, + "step": 1075 + }, + { + "epoch": 0.1836490868748933, + "grad_norm": 0.5643403237782005, + "learning_rate": 3.266086362860557e-05, + "loss": 0.6301, + "num_tokens": 101385692.0, + "step": 1076 + }, + { + "epoch": 0.18381976446492576, + "grad_norm": 0.5713528989538723, + "learning_rate": 3.265403652500427e-05, + "loss": 0.583, + "num_tokens": 101465736.0, + "step": 1077 + }, + { + "epoch": 0.18399044205495818, + "grad_norm": 0.582717857443111, + "learning_rate": 3.264720942140297e-05, + "loss": 0.6838, + "num_tokens": 101562087.0, + "step": 1078 + }, + { + "epoch": 0.1841611196449906, + "grad_norm": 0.6111038840317133, + "learning_rate": 3.264038231780168e-05, + "loss": 0.763, + "num_tokens": 101640927.0, + "step": 1079 + }, + { + "epoch": 0.18433179723502305, + "grad_norm": 0.6366982716146388, + "learning_rate": 3.263355521420038e-05, + "loss": 0.6473, + "num_tokens": 101716415.0, + "step": 1080 + }, + { + "epoch": 0.18450247482505547, + "grad_norm": 0.5608760171765451, + "learning_rate": 3.2626728110599085e-05, + "loss": 0.7037, + "num_tokens": 101812887.0, + "step": 1081 + }, + { + "epoch": 0.1846731524150879, + "grad_norm": 0.5198839245260992, + "learning_rate": 3.2619901006997785e-05, + "loss": 0.5937, + "num_tokens": 101908704.0, + "step": 1082 + }, + { + "epoch": 0.18484383000512034, + "grad_norm": 0.5601209048308777, + "learning_rate": 3.2613073903396485e-05, + "loss": 0.6666, + "num_tokens": 101989305.0, + "step": 1083 + }, + { + "epoch": 0.18501450759515276, + "grad_norm": 0.6055982527404735, + "learning_rate": 3.260624679979519e-05, + "loss": 0.803, + "num_tokens": 102069943.0, + "step": 1084 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.5107881762218199, + "learning_rate": 3.259941969619389e-05, + "loss": 0.6232, + "num_tokens": 102177168.0, + "step": 1085 + }, + { + "epoch": 0.18535586277521762, + "grad_norm": 0.4915864234454951, + "learning_rate": 3.259259259259259e-05, + "loss": 0.6661, + "num_tokens": 102291163.0, + "step": 1086 + }, + { + "epoch": 0.18552654036525004, + "grad_norm": 0.5122698837114523, + "learning_rate": 3.2585765488991294e-05, + "loss": 0.6154, + "num_tokens": 102391844.0, + "step": 1087 + }, + { + "epoch": 0.18569721795528246, + "grad_norm": 0.6201310717087297, + "learning_rate": 3.257893838539e-05, + "loss": 0.6881, + "num_tokens": 102458025.0, + "step": 1088 + }, + { + "epoch": 0.1858678955453149, + "grad_norm": 0.5870012509393363, + "learning_rate": 3.25721112817887e-05, + "loss": 0.592, + "num_tokens": 102521212.0, + "step": 1089 + }, + { + "epoch": 0.18603857313534733, + "grad_norm": 0.5706288198451396, + "learning_rate": 3.256528417818741e-05, + "loss": 0.7398, + "num_tokens": 102627007.0, + "step": 1090 + }, + { + "epoch": 0.18620925072537975, + "grad_norm": 0.6452831352495688, + "learning_rate": 3.255845707458611e-05, + "loss": 0.664, + "num_tokens": 102683494.0, + "step": 1091 + }, + { + "epoch": 0.1863799283154122, + "grad_norm": 0.5851078464776941, + "learning_rate": 3.2551629970984816e-05, + "loss": 0.665, + "num_tokens": 102796964.0, + "step": 1092 + }, + { + "epoch": 0.18655060590544462, + "grad_norm": 0.5185275827759959, + "learning_rate": 3.2544802867383516e-05, + "loss": 0.6472, + "num_tokens": 102894422.0, + "step": 1093 + }, + { + "epoch": 0.18672128349547704, + "grad_norm": 0.5545725065968523, + "learning_rate": 3.2537975763782217e-05, + "loss": 0.6604, + "num_tokens": 102980586.0, + "step": 1094 + }, + { + "epoch": 0.18689196108550948, + "grad_norm": 0.5554932364177928, + "learning_rate": 3.2531148660180924e-05, + "loss": 0.6521, + "num_tokens": 103066341.0, + "step": 1095 + }, + { + "epoch": 0.1870626386755419, + "grad_norm": 0.6262049191929879, + "learning_rate": 3.2524321556579624e-05, + "loss": 0.719, + "num_tokens": 103151568.0, + "step": 1096 + }, + { + "epoch": 0.18723331626557432, + "grad_norm": 0.5370813994031527, + "learning_rate": 3.251749445297833e-05, + "loss": 0.6343, + "num_tokens": 103234610.0, + "step": 1097 + }, + { + "epoch": 0.18740399385560677, + "grad_norm": 0.5026264533014893, + "learning_rate": 3.2510667349377025e-05, + "loss": 0.5625, + "num_tokens": 103330483.0, + "step": 1098 + }, + { + "epoch": 0.1875746714456392, + "grad_norm": 0.5370105848611278, + "learning_rate": 3.250384024577573e-05, + "loss": 0.6659, + "num_tokens": 103427811.0, + "step": 1099 + }, + { + "epoch": 0.1877453490356716, + "grad_norm": 0.5474789394609674, + "learning_rate": 3.249701314217443e-05, + "loss": 0.6211, + "num_tokens": 103513579.0, + "step": 1100 + }, + { + "epoch": 0.18791602662570406, + "grad_norm": 0.5729257506000883, + "learning_rate": 3.249018603857314e-05, + "loss": 0.711, + "num_tokens": 103596537.0, + "step": 1101 + }, + { + "epoch": 0.18808670421573648, + "grad_norm": 0.5576936814737763, + "learning_rate": 3.248335893497184e-05, + "loss": 0.7033, + "num_tokens": 103692831.0, + "step": 1102 + }, + { + "epoch": 0.1882573818057689, + "grad_norm": 0.5911318339809046, + "learning_rate": 3.247653183137054e-05, + "loss": 0.7309, + "num_tokens": 103776767.0, + "step": 1103 + }, + { + "epoch": 0.18842805939580134, + "grad_norm": 0.5278330098262008, + "learning_rate": 3.246970472776925e-05, + "loss": 0.6419, + "num_tokens": 103865563.0, + "step": 1104 + }, + { + "epoch": 0.18859873698583376, + "grad_norm": 0.5656291534696276, + "learning_rate": 3.246287762416795e-05, + "loss": 0.68, + "num_tokens": 103947893.0, + "step": 1105 + }, + { + "epoch": 0.18876941457586618, + "grad_norm": 0.571301167092783, + "learning_rate": 3.2456050520566655e-05, + "loss": 0.6352, + "num_tokens": 104031836.0, + "step": 1106 + }, + { + "epoch": 0.1889400921658986, + "grad_norm": 0.5869834634857306, + "learning_rate": 3.2449223416965355e-05, + "loss": 0.7461, + "num_tokens": 104131353.0, + "step": 1107 + }, + { + "epoch": 0.18911076975593105, + "grad_norm": 0.5568604812570788, + "learning_rate": 3.244239631336406e-05, + "loss": 0.5101, + "num_tokens": 104194395.0, + "step": 1108 + }, + { + "epoch": 0.18928144734596347, + "grad_norm": 0.5488974276307095, + "learning_rate": 3.243556920976276e-05, + "loss": 0.5461, + "num_tokens": 104266218.0, + "step": 1109 + }, + { + "epoch": 0.1894521249359959, + "grad_norm": 0.5911375918844386, + "learning_rate": 3.242874210616146e-05, + "loss": 0.7039, + "num_tokens": 104359938.0, + "step": 1110 + }, + { + "epoch": 0.18962280252602834, + "grad_norm": 0.586628923600477, + "learning_rate": 3.2421915002560164e-05, + "loss": 0.7038, + "num_tokens": 104447477.0, + "step": 1111 + }, + { + "epoch": 0.18979348011606076, + "grad_norm": 0.49682435622433574, + "learning_rate": 3.241508789895887e-05, + "loss": 0.6111, + "num_tokens": 104547191.0, + "step": 1112 + }, + { + "epoch": 0.18996415770609318, + "grad_norm": 0.5631442395035774, + "learning_rate": 3.240826079535757e-05, + "loss": 0.6986, + "num_tokens": 104655299.0, + "step": 1113 + }, + { + "epoch": 0.19013483529612563, + "grad_norm": 0.5498842309099883, + "learning_rate": 3.240143369175627e-05, + "loss": 0.6319, + "num_tokens": 104737789.0, + "step": 1114 + }, + { + "epoch": 0.19030551288615805, + "grad_norm": 0.5479643191141993, + "learning_rate": 3.239460658815498e-05, + "loss": 0.6228, + "num_tokens": 104828410.0, + "step": 1115 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.48133777335755307, + "learning_rate": 3.238777948455368e-05, + "loss": 0.6286, + "num_tokens": 104944506.0, + "step": 1116 + }, + { + "epoch": 0.1906468680662229, + "grad_norm": 0.5597883222557116, + "learning_rate": 3.2380952380952386e-05, + "loss": 0.7684, + "num_tokens": 105058534.0, + "step": 1117 + }, + { + "epoch": 0.19081754565625533, + "grad_norm": 0.5345610141127837, + "learning_rate": 3.2374125277351087e-05, + "loss": 0.7473, + "num_tokens": 105165763.0, + "step": 1118 + }, + { + "epoch": 0.19098822324628775, + "grad_norm": 0.6101094637750812, + "learning_rate": 3.2367298173749794e-05, + "loss": 0.6369, + "num_tokens": 105239622.0, + "step": 1119 + }, + { + "epoch": 0.1911589008363202, + "grad_norm": 0.541343466939214, + "learning_rate": 3.2360471070148494e-05, + "loss": 0.6896, + "num_tokens": 105344262.0, + "step": 1120 + }, + { + "epoch": 0.19132957842635262, + "grad_norm": 0.5039821701295021, + "learning_rate": 3.2353643966547194e-05, + "loss": 0.6215, + "num_tokens": 105456847.0, + "step": 1121 + }, + { + "epoch": 0.19150025601638504, + "grad_norm": 0.5021140546384327, + "learning_rate": 3.23468168629459e-05, + "loss": 0.5711, + "num_tokens": 105552030.0, + "step": 1122 + }, + { + "epoch": 0.1916709336064175, + "grad_norm": 0.5164271283580785, + "learning_rate": 3.23399897593446e-05, + "loss": 0.5541, + "num_tokens": 105638556.0, + "step": 1123 + }, + { + "epoch": 0.1918416111964499, + "grad_norm": 0.548664168761183, + "learning_rate": 3.23331626557433e-05, + "loss": 0.6119, + "num_tokens": 105714405.0, + "step": 1124 + }, + { + "epoch": 0.19201228878648233, + "grad_norm": 0.513142264188867, + "learning_rate": 3.2326335552142e-05, + "loss": 0.6251, + "num_tokens": 105815274.0, + "step": 1125 + }, + { + "epoch": 0.19218296637651477, + "grad_norm": 0.5681297877892892, + "learning_rate": 3.231950844854071e-05, + "loss": 0.6082, + "num_tokens": 105887672.0, + "step": 1126 + }, + { + "epoch": 0.1923536439665472, + "grad_norm": 0.5053382579900002, + "learning_rate": 3.231268134493941e-05, + "loss": 0.6346, + "num_tokens": 105986033.0, + "step": 1127 + }, + { + "epoch": 0.19252432155657961, + "grad_norm": 0.5772468086037074, + "learning_rate": 3.230585424133812e-05, + "loss": 0.5979, + "num_tokens": 106051766.0, + "step": 1128 + }, + { + "epoch": 0.19269499914661206, + "grad_norm": 0.5122724235561483, + "learning_rate": 3.229902713773682e-05, + "loss": 0.6438, + "num_tokens": 106158971.0, + "step": 1129 + }, + { + "epoch": 0.19286567673664448, + "grad_norm": 0.5085038865401955, + "learning_rate": 3.229220003413552e-05, + "loss": 0.5763, + "num_tokens": 106256629.0, + "step": 1130 + }, + { + "epoch": 0.1930363543266769, + "grad_norm": 0.54281220928071, + "learning_rate": 3.2285372930534225e-05, + "loss": 0.5855, + "num_tokens": 106339077.0, + "step": 1131 + }, + { + "epoch": 0.19320703191670935, + "grad_norm": 0.5261581795764751, + "learning_rate": 3.2278545826932926e-05, + "loss": 0.6462, + "num_tokens": 106439785.0, + "step": 1132 + }, + { + "epoch": 0.19337770950674177, + "grad_norm": 0.5148012673126799, + "learning_rate": 3.227171872333163e-05, + "loss": 0.6345, + "num_tokens": 106530994.0, + "step": 1133 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.5402074780243247, + "learning_rate": 3.226489161973033e-05, + "loss": 0.6763, + "num_tokens": 106636742.0, + "step": 1134 + }, + { + "epoch": 0.19371906468680664, + "grad_norm": 0.5603725284096069, + "learning_rate": 3.2258064516129034e-05, + "loss": 0.7461, + "num_tokens": 106743423.0, + "step": 1135 + }, + { + "epoch": 0.19388974227683906, + "grad_norm": 0.5508794878723339, + "learning_rate": 3.2251237412527734e-05, + "loss": 0.5919, + "num_tokens": 106819245.0, + "step": 1136 + }, + { + "epoch": 0.19406041986687148, + "grad_norm": 0.5356098131533387, + "learning_rate": 3.224441030892644e-05, + "loss": 0.6631, + "num_tokens": 106916175.0, + "step": 1137 + }, + { + "epoch": 0.1942310974569039, + "grad_norm": 0.48016732780647425, + "learning_rate": 3.223758320532514e-05, + "loss": 0.5783, + "num_tokens": 107019118.0, + "step": 1138 + }, + { + "epoch": 0.19440177504693634, + "grad_norm": 0.5031484153937038, + "learning_rate": 3.223075610172385e-05, + "loss": 0.5668, + "num_tokens": 107107651.0, + "step": 1139 + }, + { + "epoch": 0.19457245263696876, + "grad_norm": 0.5256326313692414, + "learning_rate": 3.222392899812255e-05, + "loss": 0.6508, + "num_tokens": 107207676.0, + "step": 1140 + }, + { + "epoch": 0.19474313022700118, + "grad_norm": 0.4911639325029448, + "learning_rate": 3.221710189452125e-05, + "loss": 0.6563, + "num_tokens": 107336884.0, + "step": 1141 + }, + { + "epoch": 0.19491380781703363, + "grad_norm": 0.5302807238634104, + "learning_rate": 3.2210274790919957e-05, + "loss": 0.7657, + "num_tokens": 107446751.0, + "step": 1142 + }, + { + "epoch": 0.19508448540706605, + "grad_norm": 0.49781215019971115, + "learning_rate": 3.220344768731866e-05, + "loss": 0.6265, + "num_tokens": 107550760.0, + "step": 1143 + }, + { + "epoch": 0.19525516299709847, + "grad_norm": 0.5053788189858137, + "learning_rate": 3.2196620583717364e-05, + "loss": 0.5882, + "num_tokens": 107640716.0, + "step": 1144 + }, + { + "epoch": 0.19542584058713092, + "grad_norm": 0.49393683153341494, + "learning_rate": 3.2189793480116064e-05, + "loss": 0.6, + "num_tokens": 107743473.0, + "step": 1145 + }, + { + "epoch": 0.19559651817716334, + "grad_norm": 0.5208763932077316, + "learning_rate": 3.2182966376514765e-05, + "loss": 0.5937, + "num_tokens": 107832800.0, + "step": 1146 + }, + { + "epoch": 0.19576719576719576, + "grad_norm": 0.5292656593605197, + "learning_rate": 3.2176139272913465e-05, + "loss": 0.6884, + "num_tokens": 107944386.0, + "step": 1147 + }, + { + "epoch": 0.1959378733572282, + "grad_norm": 0.5446768984319162, + "learning_rate": 3.216931216931217e-05, + "loss": 0.6519, + "num_tokens": 108038297.0, + "step": 1148 + }, + { + "epoch": 0.19610855094726062, + "grad_norm": 0.4887527013954778, + "learning_rate": 3.216248506571087e-05, + "loss": 0.5768, + "num_tokens": 108133689.0, + "step": 1149 + }, + { + "epoch": 0.19627922853729304, + "grad_norm": 0.5432612700896221, + "learning_rate": 3.215565796210958e-05, + "loss": 0.6427, + "num_tokens": 108222717.0, + "step": 1150 + }, + { + "epoch": 0.1964499061273255, + "grad_norm": 0.5202153373150797, + "learning_rate": 3.214883085850828e-05, + "loss": 0.6335, + "num_tokens": 108322894.0, + "step": 1151 + }, + { + "epoch": 0.1966205837173579, + "grad_norm": 0.557108573379649, + "learning_rate": 3.214200375490698e-05, + "loss": 0.6718, + "num_tokens": 108412341.0, + "step": 1152 + }, + { + "epoch": 0.19679126130739033, + "grad_norm": 0.4922600483102208, + "learning_rate": 3.213517665130569e-05, + "loss": 0.6421, + "num_tokens": 108514089.0, + "step": 1153 + }, + { + "epoch": 0.19696193889742278, + "grad_norm": 0.5073566943525165, + "learning_rate": 3.212834954770439e-05, + "loss": 0.6546, + "num_tokens": 108612899.0, + "step": 1154 + }, + { + "epoch": 0.1971326164874552, + "grad_norm": 0.6009183821473304, + "learning_rate": 3.2121522444103095e-05, + "loss": 0.7235, + "num_tokens": 108707226.0, + "step": 1155 + }, + { + "epoch": 0.19730329407748762, + "grad_norm": 0.5548843708551608, + "learning_rate": 3.2114695340501796e-05, + "loss": 0.6479, + "num_tokens": 108789048.0, + "step": 1156 + }, + { + "epoch": 0.19747397166752007, + "grad_norm": 0.5425894762001847, + "learning_rate": 3.2107868236900496e-05, + "loss": 0.6941, + "num_tokens": 108910065.0, + "step": 1157 + }, + { + "epoch": 0.19764464925755248, + "grad_norm": 0.5005868754631077, + "learning_rate": 3.21010411332992e-05, + "loss": 0.607, + "num_tokens": 109013359.0, + "step": 1158 + }, + { + "epoch": 0.1978153268475849, + "grad_norm": 0.5041495072246857, + "learning_rate": 3.2094214029697904e-05, + "loss": 0.6131, + "num_tokens": 109119889.0, + "step": 1159 + }, + { + "epoch": 0.19798600443761735, + "grad_norm": 0.5435513972300262, + "learning_rate": 3.2087386926096604e-05, + "loss": 0.6658, + "num_tokens": 109216118.0, + "step": 1160 + }, + { + "epoch": 0.19815668202764977, + "grad_norm": 0.5649783472557393, + "learning_rate": 3.2080559822495304e-05, + "loss": 0.7216, + "num_tokens": 109302462.0, + "step": 1161 + }, + { + "epoch": 0.1983273596176822, + "grad_norm": 0.5346780329708679, + "learning_rate": 3.207373271889401e-05, + "loss": 0.5438, + "num_tokens": 109378520.0, + "step": 1162 + }, + { + "epoch": 0.19849803720771464, + "grad_norm": 0.5580479633017513, + "learning_rate": 3.206690561529271e-05, + "loss": 0.5833, + "num_tokens": 109456659.0, + "step": 1163 + }, + { + "epoch": 0.19866871479774706, + "grad_norm": 0.5074333066417308, + "learning_rate": 3.206007851169142e-05, + "loss": 0.5869, + "num_tokens": 109543962.0, + "step": 1164 + }, + { + "epoch": 0.19883939238777948, + "grad_norm": 0.534418029734114, + "learning_rate": 3.205325140809012e-05, + "loss": 0.6113, + "num_tokens": 109622518.0, + "step": 1165 + }, + { + "epoch": 0.19901006997781193, + "grad_norm": 0.5678302954002895, + "learning_rate": 3.2046424304488827e-05, + "loss": 0.7108, + "num_tokens": 109721877.0, + "step": 1166 + }, + { + "epoch": 0.19918074756784435, + "grad_norm": 0.5574099816344207, + "learning_rate": 3.203959720088753e-05, + "loss": 0.6791, + "num_tokens": 109818902.0, + "step": 1167 + }, + { + "epoch": 0.19935142515787677, + "grad_norm": 0.4853571734349343, + "learning_rate": 3.203277009728623e-05, + "loss": 0.6847, + "num_tokens": 109930834.0, + "step": 1168 + }, + { + "epoch": 0.19952210274790919, + "grad_norm": 0.5806211728395615, + "learning_rate": 3.2025942993684934e-05, + "loss": 0.7187, + "num_tokens": 110018093.0, + "step": 1169 + }, + { + "epoch": 0.19969278033794163, + "grad_norm": 0.5215439719902569, + "learning_rate": 3.2019115890083635e-05, + "loss": 0.6969, + "num_tokens": 110130828.0, + "step": 1170 + }, + { + "epoch": 0.19986345792797405, + "grad_norm": 0.5000324385282668, + "learning_rate": 3.201228878648234e-05, + "loss": 0.6326, + "num_tokens": 110243388.0, + "step": 1171 + }, + { + "epoch": 0.20003413551800647, + "grad_norm": 0.5194375568293453, + "learning_rate": 3.2005461682881036e-05, + "loss": 0.5373, + "num_tokens": 110328721.0, + "step": 1172 + }, + { + "epoch": 0.20020481310803892, + "grad_norm": 0.5654553749664818, + "learning_rate": 3.199863457927974e-05, + "loss": 0.5911, + "num_tokens": 110393966.0, + "step": 1173 + }, + { + "epoch": 0.20037549069807134, + "grad_norm": 0.5945101509351507, + "learning_rate": 3.199180747567844e-05, + "loss": 0.6653, + "num_tokens": 110481065.0, + "step": 1174 + }, + { + "epoch": 0.20054616828810376, + "grad_norm": 0.5214837651290356, + "learning_rate": 3.198498037207715e-05, + "loss": 0.658, + "num_tokens": 110575153.0, + "step": 1175 + }, + { + "epoch": 0.2007168458781362, + "grad_norm": 0.5543542031231169, + "learning_rate": 3.197815326847585e-05, + "loss": 0.6772, + "num_tokens": 110660771.0, + "step": 1176 + }, + { + "epoch": 0.20088752346816863, + "grad_norm": 0.49039191995376574, + "learning_rate": 3.197132616487455e-05, + "loss": 0.6007, + "num_tokens": 110758654.0, + "step": 1177 + }, + { + "epoch": 0.20105820105820105, + "grad_norm": 0.5294079119762495, + "learning_rate": 3.196449906127326e-05, + "loss": 0.6643, + "num_tokens": 110855119.0, + "step": 1178 + }, + { + "epoch": 0.2012288786482335, + "grad_norm": 0.5662756836137125, + "learning_rate": 3.195767195767196e-05, + "loss": 0.7346, + "num_tokens": 110957561.0, + "step": 1179 + }, + { + "epoch": 0.20139955623826591, + "grad_norm": 0.49087761417807313, + "learning_rate": 3.1950844854070666e-05, + "loss": 0.6387, + "num_tokens": 111059159.0, + "step": 1180 + }, + { + "epoch": 0.20157023382829833, + "grad_norm": 0.5762812285546797, + "learning_rate": 3.1944017750469366e-05, + "loss": 0.7367, + "num_tokens": 111157079.0, + "step": 1181 + }, + { + "epoch": 0.20174091141833078, + "grad_norm": 0.4970947828644773, + "learning_rate": 3.193719064686807e-05, + "loss": 0.6478, + "num_tokens": 111264090.0, + "step": 1182 + }, + { + "epoch": 0.2019115890083632, + "grad_norm": 0.5945906816054343, + "learning_rate": 3.1930363543266774e-05, + "loss": 0.6572, + "num_tokens": 111333536.0, + "step": 1183 + }, + { + "epoch": 0.20208226659839562, + "grad_norm": 0.5629439022611719, + "learning_rate": 3.1923536439665474e-05, + "loss": 0.6658, + "num_tokens": 111428728.0, + "step": 1184 + }, + { + "epoch": 0.20225294418842807, + "grad_norm": 0.5415091052882329, + "learning_rate": 3.1916709336064174e-05, + "loss": 0.605, + "num_tokens": 111511506.0, + "step": 1185 + }, + { + "epoch": 0.2024236217784605, + "grad_norm": 0.5133376921347124, + "learning_rate": 3.190988223246288e-05, + "loss": 0.7027, + "num_tokens": 111625389.0, + "step": 1186 + }, + { + "epoch": 0.2025942993684929, + "grad_norm": 0.5413191664490349, + "learning_rate": 3.190305512886158e-05, + "loss": 0.5584, + "num_tokens": 111705376.0, + "step": 1187 + }, + { + "epoch": 0.20276497695852536, + "grad_norm": 0.5594397807955257, + "learning_rate": 3.189622802526028e-05, + "loss": 0.6565, + "num_tokens": 111784552.0, + "step": 1188 + }, + { + "epoch": 0.20293565454855778, + "grad_norm": 0.5168194206491402, + "learning_rate": 3.188940092165899e-05, + "loss": 0.5915, + "num_tokens": 111877352.0, + "step": 1189 + }, + { + "epoch": 0.2031063321385902, + "grad_norm": 0.5403510971648063, + "learning_rate": 3.188257381805769e-05, + "loss": 0.6026, + "num_tokens": 111964240.0, + "step": 1190 + }, + { + "epoch": 0.20327700972862264, + "grad_norm": 0.4957823174778685, + "learning_rate": 3.18757467144564e-05, + "loss": 0.6956, + "num_tokens": 112078514.0, + "step": 1191 + }, + { + "epoch": 0.20344768731865506, + "grad_norm": 0.5641963180064267, + "learning_rate": 3.18689196108551e-05, + "loss": 0.654, + "num_tokens": 112163096.0, + "step": 1192 + }, + { + "epoch": 0.20361836490868748, + "grad_norm": 0.5753269806900153, + "learning_rate": 3.1862092507253804e-05, + "loss": 0.593, + "num_tokens": 112241135.0, + "step": 1193 + }, + { + "epoch": 0.20378904249871993, + "grad_norm": 0.5345104319266557, + "learning_rate": 3.1855265403652505e-05, + "loss": 0.6506, + "num_tokens": 112342334.0, + "step": 1194 + }, + { + "epoch": 0.20395972008875235, + "grad_norm": 0.6258082947474475, + "learning_rate": 3.1848438300051205e-05, + "loss": 0.6784, + "num_tokens": 112417200.0, + "step": 1195 + }, + { + "epoch": 0.20413039767878477, + "grad_norm": 0.4790928286809921, + "learning_rate": 3.184161119644991e-05, + "loss": 0.5896, + "num_tokens": 112527627.0, + "step": 1196 + }, + { + "epoch": 0.20430107526881722, + "grad_norm": 0.6126519126918921, + "learning_rate": 3.183478409284861e-05, + "loss": 0.6608, + "num_tokens": 112594455.0, + "step": 1197 + }, + { + "epoch": 0.20447175285884964, + "grad_norm": 0.6237329224514461, + "learning_rate": 3.182795698924731e-05, + "loss": 0.7256, + "num_tokens": 112679144.0, + "step": 1198 + }, + { + "epoch": 0.20464243044888206, + "grad_norm": 0.5764426538288566, + "learning_rate": 3.1821129885646013e-05, + "loss": 0.6013, + "num_tokens": 112754030.0, + "step": 1199 + }, + { + "epoch": 0.2048131080389145, + "grad_norm": 0.5703048594151886, + "learning_rate": 3.181430278204472e-05, + "loss": 0.678, + "num_tokens": 112839477.0, + "step": 1200 + }, + { + "epoch": 0.20498378562894692, + "grad_norm": 0.5347004271125881, + "learning_rate": 3.180747567844342e-05, + "loss": 0.6478, + "num_tokens": 112944672.0, + "step": 1201 + }, + { + "epoch": 0.20515446321897934, + "grad_norm": 0.5619218222314204, + "learning_rate": 3.180064857484213e-05, + "loss": 0.6854, + "num_tokens": 113036654.0, + "step": 1202 + }, + { + "epoch": 0.20532514080901176, + "grad_norm": 0.5291634222269491, + "learning_rate": 3.179382147124083e-05, + "loss": 0.7076, + "num_tokens": 113148056.0, + "step": 1203 + }, + { + "epoch": 0.2054958183990442, + "grad_norm": 0.5609713633971667, + "learning_rate": 3.178699436763953e-05, + "loss": 0.6273, + "num_tokens": 113245588.0, + "step": 1204 + }, + { + "epoch": 0.20566649598907663, + "grad_norm": 0.5529606138094956, + "learning_rate": 3.1780167264038236e-05, + "loss": 0.6053, + "num_tokens": 113340484.0, + "step": 1205 + }, + { + "epoch": 0.20583717357910905, + "grad_norm": 0.6097823513189452, + "learning_rate": 3.1773340160436936e-05, + "loss": 0.651, + "num_tokens": 113417891.0, + "step": 1206 + }, + { + "epoch": 0.2060078511691415, + "grad_norm": 0.5522006009775958, + "learning_rate": 3.1766513056835644e-05, + "loss": 0.6357, + "num_tokens": 113500397.0, + "step": 1207 + }, + { + "epoch": 0.20617852875917392, + "grad_norm": 0.5536082952019296, + "learning_rate": 3.1759685953234344e-05, + "loss": 0.6665, + "num_tokens": 113589448.0, + "step": 1208 + }, + { + "epoch": 0.20634920634920634, + "grad_norm": 0.5046931773773035, + "learning_rate": 3.1752858849633044e-05, + "loss": 0.611, + "num_tokens": 113687162.0, + "step": 1209 + }, + { + "epoch": 0.20651988393923879, + "grad_norm": 0.5985336891710991, + "learning_rate": 3.1746031746031745e-05, + "loss": 0.6572, + "num_tokens": 113761596.0, + "step": 1210 + }, + { + "epoch": 0.2066905615292712, + "grad_norm": 0.5322990012012707, + "learning_rate": 3.173920464243045e-05, + "loss": 0.5766, + "num_tokens": 113846153.0, + "step": 1211 + }, + { + "epoch": 0.20686123911930362, + "grad_norm": 0.5368472584282229, + "learning_rate": 3.173237753882915e-05, + "loss": 0.6438, + "num_tokens": 113935361.0, + "step": 1212 + }, + { + "epoch": 0.20703191670933607, + "grad_norm": 0.5186546839971135, + "learning_rate": 3.172555043522786e-05, + "loss": 0.5708, + "num_tokens": 114021032.0, + "step": 1213 + }, + { + "epoch": 0.2072025942993685, + "grad_norm": 0.5477781359792933, + "learning_rate": 3.171872333162656e-05, + "loss": 0.6184, + "num_tokens": 114137486.0, + "step": 1214 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 0.542155697476986, + "learning_rate": 3.171189622802526e-05, + "loss": 0.5812, + "num_tokens": 114230341.0, + "step": 1215 + }, + { + "epoch": 0.20754394947943336, + "grad_norm": 0.5349297611024131, + "learning_rate": 3.170506912442397e-05, + "loss": 0.7017, + "num_tokens": 114339722.0, + "step": 1216 + }, + { + "epoch": 0.20771462706946578, + "grad_norm": 0.5236244210234993, + "learning_rate": 3.169824202082267e-05, + "loss": 0.6941, + "num_tokens": 114445866.0, + "step": 1217 + }, + { + "epoch": 0.2078853046594982, + "grad_norm": 0.5173004880775959, + "learning_rate": 3.1691414917221375e-05, + "loss": 0.5812, + "num_tokens": 114532325.0, + "step": 1218 + }, + { + "epoch": 0.20805598224953065, + "grad_norm": 0.5128247496683166, + "learning_rate": 3.1684587813620075e-05, + "loss": 0.5755, + "num_tokens": 114621593.0, + "step": 1219 + }, + { + "epoch": 0.20822665983956307, + "grad_norm": 0.4991163150905879, + "learning_rate": 3.167776071001878e-05, + "loss": 0.5972, + "num_tokens": 114734962.0, + "step": 1220 + }, + { + "epoch": 0.2083973374295955, + "grad_norm": 0.5498123984433351, + "learning_rate": 3.167093360641748e-05, + "loss": 0.6906, + "num_tokens": 114831124.0, + "step": 1221 + }, + { + "epoch": 0.20856801501962793, + "grad_norm": 0.5441310937127648, + "learning_rate": 3.166410650281618e-05, + "loss": 0.6631, + "num_tokens": 114925801.0, + "step": 1222 + }, + { + "epoch": 0.20873869260966035, + "grad_norm": 0.5498247424056939, + "learning_rate": 3.1657279399214883e-05, + "loss": 0.6332, + "num_tokens": 115003451.0, + "step": 1223 + }, + { + "epoch": 0.20890937019969277, + "grad_norm": 0.6331994291604266, + "learning_rate": 3.165045229561359e-05, + "loss": 0.573, + "num_tokens": 115107865.0, + "step": 1224 + }, + { + "epoch": 0.20908004778972522, + "grad_norm": 0.5453299785647201, + "learning_rate": 3.164362519201229e-05, + "loss": 0.6311, + "num_tokens": 115200366.0, + "step": 1225 + }, + { + "epoch": 0.20925072537975764, + "grad_norm": 0.6223780866393828, + "learning_rate": 3.163679808841099e-05, + "loss": 0.6405, + "num_tokens": 115260802.0, + "step": 1226 + }, + { + "epoch": 0.20942140296979006, + "grad_norm": 0.5740723518410151, + "learning_rate": 3.16299709848097e-05, + "loss": 0.7275, + "num_tokens": 115355658.0, + "step": 1227 + }, + { + "epoch": 0.2095920805598225, + "grad_norm": 0.5245484977786767, + "learning_rate": 3.16231438812084e-05, + "loss": 0.5905, + "num_tokens": 115452460.0, + "step": 1228 + }, + { + "epoch": 0.20976275814985493, + "grad_norm": 0.5683687881764138, + "learning_rate": 3.1616316777607106e-05, + "loss": 0.6309, + "num_tokens": 115571698.0, + "step": 1229 + }, + { + "epoch": 0.20993343573988735, + "grad_norm": 0.5307401254636978, + "learning_rate": 3.1609489674005806e-05, + "loss": 0.5892, + "num_tokens": 115666418.0, + "step": 1230 + }, + { + "epoch": 0.2101041133299198, + "grad_norm": 0.565734627882508, + "learning_rate": 3.160266257040451e-05, + "loss": 0.6696, + "num_tokens": 115768408.0, + "step": 1231 + }, + { + "epoch": 0.21027479091995221, + "grad_norm": 0.5729960832818445, + "learning_rate": 3.1595835466803214e-05, + "loss": 0.7195, + "num_tokens": 115862117.0, + "step": 1232 + }, + { + "epoch": 0.21044546850998463, + "grad_norm": 0.4843978401101547, + "learning_rate": 3.1589008363201914e-05, + "loss": 0.5948, + "num_tokens": 115973402.0, + "step": 1233 + }, + { + "epoch": 0.21061614610001705, + "grad_norm": 0.4948623988961894, + "learning_rate": 3.1582181259600615e-05, + "loss": 0.5926, + "num_tokens": 116075702.0, + "step": 1234 + }, + { + "epoch": 0.2107868236900495, + "grad_norm": 0.4769482260274982, + "learning_rate": 3.1575354155999315e-05, + "loss": 0.6842, + "num_tokens": 116221521.0, + "step": 1235 + }, + { + "epoch": 0.21095750128008192, + "grad_norm": 0.5233544410930263, + "learning_rate": 3.156852705239802e-05, + "loss": 0.6473, + "num_tokens": 116325459.0, + "step": 1236 + }, + { + "epoch": 0.21112817887011434, + "grad_norm": 0.47978276562972266, + "learning_rate": 3.156169994879672e-05, + "loss": 0.5443, + "num_tokens": 116415110.0, + "step": 1237 + }, + { + "epoch": 0.2112988564601468, + "grad_norm": 0.5003696070289188, + "learning_rate": 3.155487284519543e-05, + "loss": 0.5821, + "num_tokens": 116503684.0, + "step": 1238 + }, + { + "epoch": 0.2114695340501792, + "grad_norm": 0.49538542629099286, + "learning_rate": 3.154804574159413e-05, + "loss": 0.6478, + "num_tokens": 116617822.0, + "step": 1239 + }, + { + "epoch": 0.21164021164021163, + "grad_norm": 0.5672177984293683, + "learning_rate": 3.154121863799284e-05, + "loss": 0.6593, + "num_tokens": 116695938.0, + "step": 1240 + }, + { + "epoch": 0.21181088923024408, + "grad_norm": 0.5130558609397715, + "learning_rate": 3.153439153439154e-05, + "loss": 0.6222, + "num_tokens": 116795457.0, + "step": 1241 + }, + { + "epoch": 0.2119815668202765, + "grad_norm": 0.5119341795162801, + "learning_rate": 3.152756443079024e-05, + "loss": 0.5919, + "num_tokens": 116889602.0, + "step": 1242 + }, + { + "epoch": 0.21215224441030892, + "grad_norm": 0.5173500262513236, + "learning_rate": 3.1520737327188945e-05, + "loss": 0.6173, + "num_tokens": 116980366.0, + "step": 1243 + }, + { + "epoch": 0.21232292200034136, + "grad_norm": 0.5613048185328698, + "learning_rate": 3.1513910223587645e-05, + "loss": 0.7647, + "num_tokens": 117102312.0, + "step": 1244 + }, + { + "epoch": 0.21249359959037378, + "grad_norm": 0.5273003954455242, + "learning_rate": 3.150708311998635e-05, + "loss": 0.7458, + "num_tokens": 117208847.0, + "step": 1245 + }, + { + "epoch": 0.2126642771804062, + "grad_norm": 0.5770437430742941, + "learning_rate": 3.1500256016385046e-05, + "loss": 0.6892, + "num_tokens": 117284530.0, + "step": 1246 + }, + { + "epoch": 0.21283495477043865, + "grad_norm": 0.5839081360173941, + "learning_rate": 3.149342891278375e-05, + "loss": 0.6039, + "num_tokens": 117363528.0, + "step": 1247 + }, + { + "epoch": 0.21300563236047107, + "grad_norm": 0.5669818974123074, + "learning_rate": 3.1486601809182454e-05, + "loss": 0.5832, + "num_tokens": 117436570.0, + "step": 1248 + }, + { + "epoch": 0.2131763099505035, + "grad_norm": 0.5379855647756979, + "learning_rate": 3.147977470558116e-05, + "loss": 0.621, + "num_tokens": 117513214.0, + "step": 1249 + }, + { + "epoch": 0.21334698754053594, + "grad_norm": 0.5513990518624673, + "learning_rate": 3.147294760197986e-05, + "loss": 0.6013, + "num_tokens": 117591079.0, + "step": 1250 + }, + { + "epoch": 0.21351766513056836, + "grad_norm": 0.5551520326386693, + "learning_rate": 3.146612049837857e-05, + "loss": 0.6568, + "num_tokens": 117671224.0, + "step": 1251 + }, + { + "epoch": 0.21368834272060078, + "grad_norm": 0.5445912747580625, + "learning_rate": 3.145929339477727e-05, + "loss": 0.5659, + "num_tokens": 117746220.0, + "step": 1252 + }, + { + "epoch": 0.21385902031063322, + "grad_norm": 0.5234794583809146, + "learning_rate": 3.145246629117597e-05, + "loss": 0.6164, + "num_tokens": 117840980.0, + "step": 1253 + }, + { + "epoch": 0.21402969790066564, + "grad_norm": 0.49693692336591405, + "learning_rate": 3.1445639187574676e-05, + "loss": 0.6505, + "num_tokens": 117942460.0, + "step": 1254 + }, + { + "epoch": 0.21420037549069806, + "grad_norm": 0.5134943161102437, + "learning_rate": 3.143881208397338e-05, + "loss": 0.5543, + "num_tokens": 118044928.0, + "step": 1255 + }, + { + "epoch": 0.2143710530807305, + "grad_norm": 0.5183263056183482, + "learning_rate": 3.1431984980372084e-05, + "loss": 0.5879, + "num_tokens": 118128078.0, + "step": 1256 + }, + { + "epoch": 0.21454173067076293, + "grad_norm": 0.5154068562538553, + "learning_rate": 3.1425157876770784e-05, + "loss": 0.6568, + "num_tokens": 118222261.0, + "step": 1257 + }, + { + "epoch": 0.21471240826079535, + "grad_norm": 0.5260147692226151, + "learning_rate": 3.1418330773169485e-05, + "loss": 0.7319, + "num_tokens": 118335657.0, + "step": 1258 + }, + { + "epoch": 0.2148830858508278, + "grad_norm": 0.5152113677500072, + "learning_rate": 3.1411503669568185e-05, + "loss": 0.625, + "num_tokens": 118432427.0, + "step": 1259 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 0.5575494752441983, + "learning_rate": 3.140467656596689e-05, + "loss": 0.7142, + "num_tokens": 118527223.0, + "step": 1260 + }, + { + "epoch": 0.21522444103089264, + "grad_norm": 0.6291522595811296, + "learning_rate": 3.139784946236559e-05, + "loss": 0.6841, + "num_tokens": 118606716.0, + "step": 1261 + }, + { + "epoch": 0.21539511862092509, + "grad_norm": 0.4824992417597077, + "learning_rate": 3.139102235876429e-05, + "loss": 0.7177, + "num_tokens": 118755823.0, + "step": 1262 + }, + { + "epoch": 0.2155657962109575, + "grad_norm": 0.5518755713786561, + "learning_rate": 3.1384195255163e-05, + "loss": 0.6114, + "num_tokens": 118836442.0, + "step": 1263 + }, + { + "epoch": 0.21573647380098993, + "grad_norm": 0.5741911423299354, + "learning_rate": 3.13773681515617e-05, + "loss": 0.6782, + "num_tokens": 118941398.0, + "step": 1264 + }, + { + "epoch": 0.21590715139102234, + "grad_norm": 0.5633030350424183, + "learning_rate": 3.137054104796041e-05, + "loss": 0.7179, + "num_tokens": 119041387.0, + "step": 1265 + }, + { + "epoch": 0.2160778289810548, + "grad_norm": 0.5200680387264794, + "learning_rate": 3.136371394435911e-05, + "loss": 0.6356, + "num_tokens": 119137004.0, + "step": 1266 + }, + { + "epoch": 0.2162485065710872, + "grad_norm": 0.5660484564296533, + "learning_rate": 3.1356886840757815e-05, + "loss": 0.664, + "num_tokens": 119217999.0, + "step": 1267 + }, + { + "epoch": 0.21641918416111963, + "grad_norm": 0.5546447747144146, + "learning_rate": 3.1350059737156515e-05, + "loss": 0.6585, + "num_tokens": 119303808.0, + "step": 1268 + }, + { + "epoch": 0.21658986175115208, + "grad_norm": 0.4761753572864742, + "learning_rate": 3.1343232633555216e-05, + "loss": 0.6305, + "num_tokens": 119416053.0, + "step": 1269 + }, + { + "epoch": 0.2167605393411845, + "grad_norm": 0.4777789784823375, + "learning_rate": 3.133640552995392e-05, + "loss": 0.6262, + "num_tokens": 119527410.0, + "step": 1270 + }, + { + "epoch": 0.21693121693121692, + "grad_norm": 0.5179031741404582, + "learning_rate": 3.132957842635262e-05, + "loss": 0.6936, + "num_tokens": 119635084.0, + "step": 1271 + }, + { + "epoch": 0.21710189452124937, + "grad_norm": 0.45758807736388535, + "learning_rate": 3.1322751322751324e-05, + "loss": 0.5837, + "num_tokens": 119767281.0, + "step": 1272 + }, + { + "epoch": 0.2172725721112818, + "grad_norm": 0.5146491826768651, + "learning_rate": 3.1315924219150024e-05, + "loss": 0.6197, + "num_tokens": 119869259.0, + "step": 1273 + }, + { + "epoch": 0.2174432497013142, + "grad_norm": 0.48202232164069053, + "learning_rate": 3.130909711554873e-05, + "loss": 0.5868, + "num_tokens": 119964352.0, + "step": 1274 + }, + { + "epoch": 0.21761392729134665, + "grad_norm": 0.5544959261478811, + "learning_rate": 3.130227001194743e-05, + "loss": 0.6359, + "num_tokens": 120052485.0, + "step": 1275 + }, + { + "epoch": 0.21778460488137907, + "grad_norm": 0.594636136150917, + "learning_rate": 3.129544290834614e-05, + "loss": 0.6015, + "num_tokens": 120127301.0, + "step": 1276 + }, + { + "epoch": 0.2179552824714115, + "grad_norm": 0.5235087633329406, + "learning_rate": 3.128861580474484e-05, + "loss": 0.6517, + "num_tokens": 120231508.0, + "step": 1277 + }, + { + "epoch": 0.21812596006144394, + "grad_norm": 0.5410400563979285, + "learning_rate": 3.128178870114354e-05, + "loss": 0.6908, + "num_tokens": 120324441.0, + "step": 1278 + }, + { + "epoch": 0.21829663765147636, + "grad_norm": 0.5022069869264864, + "learning_rate": 3.127496159754225e-05, + "loss": 0.5828, + "num_tokens": 120437090.0, + "step": 1279 + }, + { + "epoch": 0.21846731524150878, + "grad_norm": 0.5721654539778839, + "learning_rate": 3.126813449394095e-05, + "loss": 0.628, + "num_tokens": 120510980.0, + "step": 1280 + }, + { + "epoch": 0.21863799283154123, + "grad_norm": 0.5061966069305591, + "learning_rate": 3.1261307390339654e-05, + "loss": 0.6091, + "num_tokens": 120602417.0, + "step": 1281 + }, + { + "epoch": 0.21880867042157365, + "grad_norm": 0.5853981615013306, + "learning_rate": 3.1254480286738355e-05, + "loss": 0.6632, + "num_tokens": 120690982.0, + "step": 1282 + }, + { + "epoch": 0.21897934801160607, + "grad_norm": 0.49649426661222085, + "learning_rate": 3.124765318313706e-05, + "loss": 0.6801, + "num_tokens": 120807785.0, + "step": 1283 + }, + { + "epoch": 0.21915002560163852, + "grad_norm": 0.5554993984773283, + "learning_rate": 3.1240826079535755e-05, + "loss": 0.7424, + "num_tokens": 120894739.0, + "step": 1284 + }, + { + "epoch": 0.21932070319167093, + "grad_norm": 0.5008720564652658, + "learning_rate": 3.123399897593446e-05, + "loss": 0.524, + "num_tokens": 120974440.0, + "step": 1285 + }, + { + "epoch": 0.21949138078170335, + "grad_norm": 0.5122543402641064, + "learning_rate": 3.122717187233316e-05, + "loss": 0.6278, + "num_tokens": 121066247.0, + "step": 1286 + }, + { + "epoch": 0.2196620583717358, + "grad_norm": 0.4840263928462038, + "learning_rate": 3.122034476873187e-05, + "loss": 0.6172, + "num_tokens": 121171603.0, + "step": 1287 + }, + { + "epoch": 0.21983273596176822, + "grad_norm": 0.5924820957646824, + "learning_rate": 3.121351766513057e-05, + "loss": 0.6743, + "num_tokens": 121243932.0, + "step": 1288 + }, + { + "epoch": 0.22000341355180064, + "grad_norm": 0.5204749135425585, + "learning_rate": 3.120669056152927e-05, + "loss": 0.6694, + "num_tokens": 121348566.0, + "step": 1289 + }, + { + "epoch": 0.2201740911418331, + "grad_norm": 0.5292652775647423, + "learning_rate": 3.119986345792798e-05, + "loss": 0.5834, + "num_tokens": 121438787.0, + "step": 1290 + }, + { + "epoch": 0.2203447687318655, + "grad_norm": 0.5061649487699258, + "learning_rate": 3.119303635432668e-05, + "loss": 0.6399, + "num_tokens": 121550565.0, + "step": 1291 + }, + { + "epoch": 0.22051544632189793, + "grad_norm": 0.5637121935073286, + "learning_rate": 3.1186209250725385e-05, + "loss": 0.6501, + "num_tokens": 121633554.0, + "step": 1292 + }, + { + "epoch": 0.22068612391193038, + "grad_norm": 0.5338703407322376, + "learning_rate": 3.1179382147124086e-05, + "loss": 0.6767, + "num_tokens": 121719543.0, + "step": 1293 + }, + { + "epoch": 0.2208568015019628, + "grad_norm": 0.529766493317216, + "learning_rate": 3.117255504352279e-05, + "loss": 0.6482, + "num_tokens": 121811864.0, + "step": 1294 + }, + { + "epoch": 0.22102747909199522, + "grad_norm": 0.527641882341383, + "learning_rate": 3.116572793992149e-05, + "loss": 0.6458, + "num_tokens": 121897716.0, + "step": 1295 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 0.5590862162114275, + "learning_rate": 3.1158900836320194e-05, + "loss": 0.6397, + "num_tokens": 121985184.0, + "step": 1296 + }, + { + "epoch": 0.22136883427206008, + "grad_norm": 0.5416493845992049, + "learning_rate": 3.1152073732718894e-05, + "loss": 0.6447, + "num_tokens": 122073494.0, + "step": 1297 + }, + { + "epoch": 0.2215395118620925, + "grad_norm": 0.5400498090162702, + "learning_rate": 3.11452466291176e-05, + "loss": 0.5836, + "num_tokens": 122155152.0, + "step": 1298 + }, + { + "epoch": 0.22171018945212492, + "grad_norm": 0.5118083289354799, + "learning_rate": 3.11384195255163e-05, + "loss": 0.6868, + "num_tokens": 122284729.0, + "step": 1299 + }, + { + "epoch": 0.22188086704215737, + "grad_norm": 0.5062459409330697, + "learning_rate": 3.1131592421915e-05, + "loss": 0.7157, + "num_tokens": 122410649.0, + "step": 1300 + }, + { + "epoch": 0.2220515446321898, + "grad_norm": 0.5252333494656604, + "learning_rate": 3.112476531831371e-05, + "loss": 0.596, + "num_tokens": 122489628.0, + "step": 1301 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.550696395801145, + "learning_rate": 3.111793821471241e-05, + "loss": 0.6555, + "num_tokens": 122568600.0, + "step": 1302 + }, + { + "epoch": 0.22239289981225466, + "grad_norm": 0.5209231106681205, + "learning_rate": 3.111111111111112e-05, + "loss": 0.6526, + "num_tokens": 122656549.0, + "step": 1303 + }, + { + "epoch": 0.22256357740228708, + "grad_norm": 0.7455578243253266, + "learning_rate": 3.110428400750982e-05, + "loss": 0.7653, + "num_tokens": 122738906.0, + "step": 1304 + }, + { + "epoch": 0.2227342549923195, + "grad_norm": 0.5236233959774621, + "learning_rate": 3.109745690390852e-05, + "loss": 0.6987, + "num_tokens": 122837428.0, + "step": 1305 + }, + { + "epoch": 0.22290493258235194, + "grad_norm": 0.5301093616548719, + "learning_rate": 3.1090629800307225e-05, + "loss": 0.6624, + "num_tokens": 122933751.0, + "step": 1306 + }, + { + "epoch": 0.22307561017238436, + "grad_norm": 0.5314253145963397, + "learning_rate": 3.1083802696705925e-05, + "loss": 0.621, + "num_tokens": 123032798.0, + "step": 1307 + }, + { + "epoch": 0.22324628776241678, + "grad_norm": 0.49901828025909467, + "learning_rate": 3.1076975593104625e-05, + "loss": 0.5859, + "num_tokens": 123134693.0, + "step": 1308 + }, + { + "epoch": 0.22341696535244923, + "grad_norm": 0.5058207035646075, + "learning_rate": 3.1070148489503326e-05, + "loss": 0.6623, + "num_tokens": 123243226.0, + "step": 1309 + }, + { + "epoch": 0.22358764294248165, + "grad_norm": 0.5788853745198406, + "learning_rate": 3.106332138590203e-05, + "loss": 0.628, + "num_tokens": 123313885.0, + "step": 1310 + }, + { + "epoch": 0.22375832053251407, + "grad_norm": 0.5570194632970338, + "learning_rate": 3.105649428230073e-05, + "loss": 0.6805, + "num_tokens": 123398174.0, + "step": 1311 + }, + { + "epoch": 0.22392899812254652, + "grad_norm": 0.5404540967594337, + "learning_rate": 3.104966717869944e-05, + "loss": 0.6568, + "num_tokens": 123493383.0, + "step": 1312 + }, + { + "epoch": 0.22409967571257894, + "grad_norm": 0.5226154995881893, + "learning_rate": 3.104284007509814e-05, + "loss": 0.6394, + "num_tokens": 123580513.0, + "step": 1313 + }, + { + "epoch": 0.22427035330261136, + "grad_norm": 0.538532128324198, + "learning_rate": 3.103601297149685e-05, + "loss": 0.5581, + "num_tokens": 123662885.0, + "step": 1314 + }, + { + "epoch": 0.2244410308926438, + "grad_norm": 0.5118192073086769, + "learning_rate": 3.102918586789555e-05, + "loss": 0.6074, + "num_tokens": 123754655.0, + "step": 1315 + }, + { + "epoch": 0.22461170848267623, + "grad_norm": 0.5275613232079875, + "learning_rate": 3.102235876429425e-05, + "loss": 0.7001, + "num_tokens": 123847024.0, + "step": 1316 + }, + { + "epoch": 0.22478238607270865, + "grad_norm": 0.5138498486398182, + "learning_rate": 3.1015531660692956e-05, + "loss": 0.6063, + "num_tokens": 123943318.0, + "step": 1317 + }, + { + "epoch": 0.2249530636627411, + "grad_norm": 0.5813332012820843, + "learning_rate": 3.1008704557091656e-05, + "loss": 0.7098, + "num_tokens": 124036114.0, + "step": 1318 + }, + { + "epoch": 0.2251237412527735, + "grad_norm": 0.48992493418432864, + "learning_rate": 3.100187745349036e-05, + "loss": 0.5663, + "num_tokens": 124144373.0, + "step": 1319 + }, + { + "epoch": 0.22529441884280593, + "grad_norm": 0.5277472344845527, + "learning_rate": 3.0995050349889064e-05, + "loss": 0.6055, + "num_tokens": 124230269.0, + "step": 1320 + }, + { + "epoch": 0.22546509643283838, + "grad_norm": 0.48760476000896436, + "learning_rate": 3.0988223246287764e-05, + "loss": 0.6243, + "num_tokens": 124341038.0, + "step": 1321 + }, + { + "epoch": 0.2256357740228708, + "grad_norm": 0.5320189614642145, + "learning_rate": 3.0981396142686464e-05, + "loss": 0.6857, + "num_tokens": 124432730.0, + "step": 1322 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 0.5422975348313949, + "learning_rate": 3.097456903908517e-05, + "loss": 0.5948, + "num_tokens": 124505320.0, + "step": 1323 + }, + { + "epoch": 0.22597712920293567, + "grad_norm": 0.5487965795405434, + "learning_rate": 3.096774193548387e-05, + "loss": 0.6711, + "num_tokens": 124598252.0, + "step": 1324 + }, + { + "epoch": 0.2261478067929681, + "grad_norm": 0.46048152737975895, + "learning_rate": 3.096091483188258e-05, + "loss": 0.5676, + "num_tokens": 124712349.0, + "step": 1325 + }, + { + "epoch": 0.2263184843830005, + "grad_norm": 0.5365308787572153, + "learning_rate": 3.095408772828128e-05, + "loss": 0.6876, + "num_tokens": 124823495.0, + "step": 1326 + }, + { + "epoch": 0.22648916197303295, + "grad_norm": 0.4572302619795821, + "learning_rate": 3.094726062467998e-05, + "loss": 0.5318, + "num_tokens": 124923331.0, + "step": 1327 + }, + { + "epoch": 0.22665983956306537, + "grad_norm": 0.5084389269930601, + "learning_rate": 3.094043352107869e-05, + "loss": 0.5132, + "num_tokens": 125020189.0, + "step": 1328 + }, + { + "epoch": 0.2268305171530978, + "grad_norm": 0.5231638894140928, + "learning_rate": 3.093360641747739e-05, + "loss": 0.6739, + "num_tokens": 125116920.0, + "step": 1329 + }, + { + "epoch": 0.2270011947431302, + "grad_norm": 0.4889906609430922, + "learning_rate": 3.0926779313876095e-05, + "loss": 0.6272, + "num_tokens": 125233557.0, + "step": 1330 + }, + { + "epoch": 0.22717187233316266, + "grad_norm": 0.5288945597525497, + "learning_rate": 3.0919952210274795e-05, + "loss": 0.6712, + "num_tokens": 125341875.0, + "step": 1331 + }, + { + "epoch": 0.22734254992319508, + "grad_norm": 0.5233123467930906, + "learning_rate": 3.0913125106673495e-05, + "loss": 0.5799, + "num_tokens": 125426633.0, + "step": 1332 + }, + { + "epoch": 0.2275132275132275, + "grad_norm": 0.5026944226909204, + "learning_rate": 3.0906298003072196e-05, + "loss": 0.6092, + "num_tokens": 125536749.0, + "step": 1333 + }, + { + "epoch": 0.22768390510325995, + "grad_norm": 0.5020655667074998, + "learning_rate": 3.08994708994709e-05, + "loss": 0.6413, + "num_tokens": 125627021.0, + "step": 1334 + }, + { + "epoch": 0.22785458269329237, + "grad_norm": 0.5369408916890366, + "learning_rate": 3.08926437958696e-05, + "loss": 0.6198, + "num_tokens": 125709899.0, + "step": 1335 + }, + { + "epoch": 0.2280252602833248, + "grad_norm": 0.49936836298814274, + "learning_rate": 3.0885816692268304e-05, + "loss": 0.621, + "num_tokens": 125819503.0, + "step": 1336 + }, + { + "epoch": 0.22819593787335724, + "grad_norm": 0.5830525163249084, + "learning_rate": 3.087898958866701e-05, + "loss": 0.6714, + "num_tokens": 125902692.0, + "step": 1337 + }, + { + "epoch": 0.22836661546338966, + "grad_norm": 0.5085374201600399, + "learning_rate": 3.087216248506571e-05, + "loss": 0.6382, + "num_tokens": 125998340.0, + "step": 1338 + }, + { + "epoch": 0.22853729305342207, + "grad_norm": 0.5158694398817232, + "learning_rate": 3.086533538146442e-05, + "loss": 0.6488, + "num_tokens": 126088456.0, + "step": 1339 + }, + { + "epoch": 0.22870797064345452, + "grad_norm": 0.6796306680345512, + "learning_rate": 3.085850827786312e-05, + "loss": 0.5784, + "num_tokens": 126180867.0, + "step": 1340 + }, + { + "epoch": 0.22887864823348694, + "grad_norm": 0.5354239726248454, + "learning_rate": 3.0851681174261826e-05, + "loss": 0.6374, + "num_tokens": 126272983.0, + "step": 1341 + }, + { + "epoch": 0.22904932582351936, + "grad_norm": 0.635998331687358, + "learning_rate": 3.0844854070660526e-05, + "loss": 0.6281, + "num_tokens": 126339449.0, + "step": 1342 + }, + { + "epoch": 0.2292200034135518, + "grad_norm": 0.49852917261294133, + "learning_rate": 3.0838026967059227e-05, + "loss": 0.6205, + "num_tokens": 126440401.0, + "step": 1343 + }, + { + "epoch": 0.22939068100358423, + "grad_norm": 0.5354176342893149, + "learning_rate": 3.0831199863457934e-05, + "loss": 0.7033, + "num_tokens": 126538743.0, + "step": 1344 + }, + { + "epoch": 0.22956135859361665, + "grad_norm": 0.5293416212419898, + "learning_rate": 3.0824372759856634e-05, + "loss": 0.6014, + "num_tokens": 126631433.0, + "step": 1345 + }, + { + "epoch": 0.2297320361836491, + "grad_norm": 0.5400589990098501, + "learning_rate": 3.0817545656255334e-05, + "loss": 0.599, + "num_tokens": 126725605.0, + "step": 1346 + }, + { + "epoch": 0.22990271377368152, + "grad_norm": 0.5784148263448391, + "learning_rate": 3.0810718552654035e-05, + "loss": 0.6421, + "num_tokens": 126793635.0, + "step": 1347 + }, + { + "epoch": 0.23007339136371394, + "grad_norm": 0.5232414247774434, + "learning_rate": 3.080389144905274e-05, + "loss": 0.6219, + "num_tokens": 126892672.0, + "step": 1348 + }, + { + "epoch": 0.23024406895374638, + "grad_norm": 0.4591713707736781, + "learning_rate": 3.079706434545144e-05, + "loss": 0.6109, + "num_tokens": 127006470.0, + "step": 1349 + }, + { + "epoch": 0.2304147465437788, + "grad_norm": 0.4474923368059129, + "learning_rate": 3.079023724185015e-05, + "loss": 0.6806, + "num_tokens": 127148405.0, + "step": 1350 + }, + { + "epoch": 0.23058542413381122, + "grad_norm": 0.47335854074438316, + "learning_rate": 3.078341013824885e-05, + "loss": 0.5751, + "num_tokens": 127258311.0, + "step": 1351 + }, + { + "epoch": 0.23075610172384367, + "grad_norm": 0.4914682074521922, + "learning_rate": 3.077658303464755e-05, + "loss": 0.7056, + "num_tokens": 127375688.0, + "step": 1352 + }, + { + "epoch": 0.2309267793138761, + "grad_norm": 0.5100803364563288, + "learning_rate": 3.076975593104626e-05, + "loss": 0.6796, + "num_tokens": 127472719.0, + "step": 1353 + }, + { + "epoch": 0.2310974569039085, + "grad_norm": 0.7190577578227213, + "learning_rate": 3.076292882744496e-05, + "loss": 0.6917, + "num_tokens": 127551021.0, + "step": 1354 + }, + { + "epoch": 0.23126813449394096, + "grad_norm": 0.589448165272954, + "learning_rate": 3.0756101723843665e-05, + "loss": 0.6715, + "num_tokens": 127635942.0, + "step": 1355 + }, + { + "epoch": 0.23143881208397338, + "grad_norm": 0.586582032772375, + "learning_rate": 3.0749274620242365e-05, + "loss": 0.7546, + "num_tokens": 127717832.0, + "step": 1356 + }, + { + "epoch": 0.2316094896740058, + "grad_norm": 0.5231884912491942, + "learning_rate": 3.074244751664107e-05, + "loss": 0.6418, + "num_tokens": 127822525.0, + "step": 1357 + }, + { + "epoch": 0.23178016726403824, + "grad_norm": 0.6200089333667927, + "learning_rate": 3.0735620413039766e-05, + "loss": 0.7015, + "num_tokens": 127913385.0, + "step": 1358 + }, + { + "epoch": 0.23195084485407066, + "grad_norm": 0.5364204964116007, + "learning_rate": 3.072879330943847e-05, + "loss": 0.5791, + "num_tokens": 127989122.0, + "step": 1359 + }, + { + "epoch": 0.23212152244410308, + "grad_norm": 0.5618887639884832, + "learning_rate": 3.0721966205837174e-05, + "loss": 0.6074, + "num_tokens": 128061353.0, + "step": 1360 + }, + { + "epoch": 0.2322922000341355, + "grad_norm": 0.5349320134033911, + "learning_rate": 3.071513910223588e-05, + "loss": 0.675, + "num_tokens": 128165253.0, + "step": 1361 + }, + { + "epoch": 0.23246287762416795, + "grad_norm": 0.5150730024192413, + "learning_rate": 3.070831199863458e-05, + "loss": 0.6151, + "num_tokens": 128250503.0, + "step": 1362 + }, + { + "epoch": 0.23263355521420037, + "grad_norm": 0.5543520272100232, + "learning_rate": 3.070148489503328e-05, + "loss": 0.6396, + "num_tokens": 128352845.0, + "step": 1363 + }, + { + "epoch": 0.2328042328042328, + "grad_norm": 0.5207715634138996, + "learning_rate": 3.069465779143199e-05, + "loss": 0.6198, + "num_tokens": 128450421.0, + "step": 1364 + }, + { + "epoch": 0.23297491039426524, + "grad_norm": 0.515682394566172, + "learning_rate": 3.068783068783069e-05, + "loss": 0.645, + "num_tokens": 128541865.0, + "step": 1365 + }, + { + "epoch": 0.23314558798429766, + "grad_norm": 0.5667396795942035, + "learning_rate": 3.0681003584229396e-05, + "loss": 0.686, + "num_tokens": 128636273.0, + "step": 1366 + }, + { + "epoch": 0.23331626557433008, + "grad_norm": 0.49709660908414344, + "learning_rate": 3.0674176480628097e-05, + "loss": 0.6001, + "num_tokens": 128722836.0, + "step": 1367 + }, + { + "epoch": 0.23348694316436253, + "grad_norm": 0.6256207419702865, + "learning_rate": 3.0667349377026804e-05, + "loss": 0.561, + "num_tokens": 128784457.0, + "step": 1368 + }, + { + "epoch": 0.23365762075439495, + "grad_norm": 0.5055766287518001, + "learning_rate": 3.0660522273425504e-05, + "loss": 0.6251, + "num_tokens": 128880993.0, + "step": 1369 + }, + { + "epoch": 0.23382829834442737, + "grad_norm": 0.5526813224569053, + "learning_rate": 3.0653695169824204e-05, + "loss": 0.6662, + "num_tokens": 128972307.0, + "step": 1370 + }, + { + "epoch": 0.2339989759344598, + "grad_norm": 0.5456343117132848, + "learning_rate": 3.0646868066222905e-05, + "loss": 0.7043, + "num_tokens": 129058065.0, + "step": 1371 + }, + { + "epoch": 0.23416965352449223, + "grad_norm": 0.5283988508782279, + "learning_rate": 3.064004096262161e-05, + "loss": 0.6409, + "num_tokens": 129149780.0, + "step": 1372 + }, + { + "epoch": 0.23434033111452465, + "grad_norm": 0.5125272199029696, + "learning_rate": 3.063321385902031e-05, + "loss": 0.6915, + "num_tokens": 129247527.0, + "step": 1373 + }, + { + "epoch": 0.2345110087045571, + "grad_norm": 0.5954929590919974, + "learning_rate": 3.062638675541901e-05, + "loss": 0.5977, + "num_tokens": 129315593.0, + "step": 1374 + }, + { + "epoch": 0.23468168629458952, + "grad_norm": 0.4850966661934879, + "learning_rate": 3.061955965181772e-05, + "loss": 0.6736, + "num_tokens": 129424816.0, + "step": 1375 + }, + { + "epoch": 0.23485236388462194, + "grad_norm": 0.5279503184160949, + "learning_rate": 3.061273254821642e-05, + "loss": 0.5783, + "num_tokens": 129508707.0, + "step": 1376 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 0.5344811732567887, + "learning_rate": 3.060590544461513e-05, + "loss": 0.6034, + "num_tokens": 129592049.0, + "step": 1377 + }, + { + "epoch": 0.2351937190646868, + "grad_norm": 0.556847563815085, + "learning_rate": 3.059907834101383e-05, + "loss": 0.7988, + "num_tokens": 129703006.0, + "step": 1378 + }, + { + "epoch": 0.23536439665471923, + "grad_norm": 0.5074626747522675, + "learning_rate": 3.059225123741253e-05, + "loss": 0.6395, + "num_tokens": 129822172.0, + "step": 1379 + }, + { + "epoch": 0.23553507424475167, + "grad_norm": 0.5730720136870313, + "learning_rate": 3.0585424133811235e-05, + "loss": 0.7556, + "num_tokens": 129918265.0, + "step": 1380 + }, + { + "epoch": 0.2357057518347841, + "grad_norm": 0.5190070165286906, + "learning_rate": 3.0578597030209936e-05, + "loss": 0.6719, + "num_tokens": 130016003.0, + "step": 1381 + }, + { + "epoch": 0.2358764294248165, + "grad_norm": 0.5812502794074597, + "learning_rate": 3.057176992660864e-05, + "loss": 0.664, + "num_tokens": 130106851.0, + "step": 1382 + }, + { + "epoch": 0.23604710701484896, + "grad_norm": 0.6765207666970194, + "learning_rate": 3.0564942823007336e-05, + "loss": 0.7287, + "num_tokens": 130203068.0, + "step": 1383 + }, + { + "epoch": 0.23621778460488138, + "grad_norm": 0.49947603410063446, + "learning_rate": 3.0558115719406044e-05, + "loss": 0.6347, + "num_tokens": 130297302.0, + "step": 1384 + }, + { + "epoch": 0.2363884621949138, + "grad_norm": 0.6336592616392603, + "learning_rate": 3.0551288615804744e-05, + "loss": 0.6892, + "num_tokens": 130435661.0, + "step": 1385 + }, + { + "epoch": 0.23655913978494625, + "grad_norm": 0.5067524771399632, + "learning_rate": 3.054446151220345e-05, + "loss": 0.6696, + "num_tokens": 130558865.0, + "step": 1386 + }, + { + "epoch": 0.23672981737497867, + "grad_norm": 0.5465546560326978, + "learning_rate": 3.053763440860215e-05, + "loss": 0.5636, + "num_tokens": 130670069.0, + "step": 1387 + }, + { + "epoch": 0.2369004949650111, + "grad_norm": 0.5430544087985929, + "learning_rate": 3.053080730500086e-05, + "loss": 0.5042, + "num_tokens": 130739100.0, + "step": 1388 + }, + { + "epoch": 0.23707117255504354, + "grad_norm": 0.5810494941696542, + "learning_rate": 3.052398020139956e-05, + "loss": 0.7156, + "num_tokens": 130814249.0, + "step": 1389 + }, + { + "epoch": 0.23724185014507596, + "grad_norm": 0.5506033751706608, + "learning_rate": 3.0517153097798263e-05, + "loss": 0.6876, + "num_tokens": 130911956.0, + "step": 1390 + }, + { + "epoch": 0.23741252773510838, + "grad_norm": 0.524734952522944, + "learning_rate": 3.0510325994196966e-05, + "loss": 0.679, + "num_tokens": 131022083.0, + "step": 1391 + }, + { + "epoch": 0.23758320532514082, + "grad_norm": 0.506059831101849, + "learning_rate": 3.050349889059567e-05, + "loss": 0.588, + "num_tokens": 131108627.0, + "step": 1392 + }, + { + "epoch": 0.23775388291517324, + "grad_norm": 0.56985570051146, + "learning_rate": 3.049667178699437e-05, + "loss": 0.6345, + "num_tokens": 131191129.0, + "step": 1393 + }, + { + "epoch": 0.23792456050520566, + "grad_norm": 0.5228954656831781, + "learning_rate": 3.0489844683393074e-05, + "loss": 0.6084, + "num_tokens": 131290128.0, + "step": 1394 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.5385817121588715, + "learning_rate": 3.0483017579791775e-05, + "loss": 0.6579, + "num_tokens": 131374383.0, + "step": 1395 + }, + { + "epoch": 0.23826591568527053, + "grad_norm": 0.5211749217575233, + "learning_rate": 3.047619047619048e-05, + "loss": 0.6712, + "num_tokens": 131478474.0, + "step": 1396 + }, + { + "epoch": 0.23843659327530295, + "grad_norm": 0.5748494848870591, + "learning_rate": 3.046936337258918e-05, + "loss": 0.7061, + "num_tokens": 131567451.0, + "step": 1397 + }, + { + "epoch": 0.23860727086533537, + "grad_norm": 0.5484056857840369, + "learning_rate": 3.0462536268987883e-05, + "loss": 0.6912, + "num_tokens": 131650422.0, + "step": 1398 + }, + { + "epoch": 0.23877794845536782, + "grad_norm": 0.5519249519350571, + "learning_rate": 3.0455709165386586e-05, + "loss": 0.624, + "num_tokens": 131725740.0, + "step": 1399 + }, + { + "epoch": 0.23894862604540024, + "grad_norm": 0.53809424433346, + "learning_rate": 3.044888206178529e-05, + "loss": 0.6654, + "num_tokens": 131822127.0, + "step": 1400 + }, + { + "epoch": 0.23911930363543266, + "grad_norm": 0.6233333456978932, + "learning_rate": 3.0442054958183994e-05, + "loss": 0.5506, + "num_tokens": 131880372.0, + "step": 1401 + }, + { + "epoch": 0.2392899812254651, + "grad_norm": 0.5283909674199262, + "learning_rate": 3.0435227854582698e-05, + "loss": 0.5969, + "num_tokens": 131976476.0, + "step": 1402 + }, + { + "epoch": 0.23946065881549752, + "grad_norm": 0.5921157552724325, + "learning_rate": 3.0428400750981398e-05, + "loss": 0.6781, + "num_tokens": 132052977.0, + "step": 1403 + }, + { + "epoch": 0.23963133640552994, + "grad_norm": 0.579250178609413, + "learning_rate": 3.0421573647380102e-05, + "loss": 0.6212, + "num_tokens": 132137316.0, + "step": 1404 + }, + { + "epoch": 0.2398020139955624, + "grad_norm": 0.49869281371283847, + "learning_rate": 3.0414746543778806e-05, + "loss": 0.631, + "num_tokens": 132234257.0, + "step": 1405 + }, + { + "epoch": 0.2399726915855948, + "grad_norm": 0.5272317540658565, + "learning_rate": 3.040791944017751e-05, + "loss": 0.6064, + "num_tokens": 132330861.0, + "step": 1406 + }, + { + "epoch": 0.24014336917562723, + "grad_norm": 0.5504209384489408, + "learning_rate": 3.0401092336576206e-05, + "loss": 0.6195, + "num_tokens": 132417050.0, + "step": 1407 + }, + { + "epoch": 0.24031404676565968, + "grad_norm": 0.5671488204833297, + "learning_rate": 3.039426523297491e-05, + "loss": 0.6905, + "num_tokens": 132506316.0, + "step": 1408 + }, + { + "epoch": 0.2404847243556921, + "grad_norm": 0.5101348836256138, + "learning_rate": 3.0387438129373614e-05, + "loss": 0.6053, + "num_tokens": 132602324.0, + "step": 1409 + }, + { + "epoch": 0.24065540194572452, + "grad_norm": 0.5661575277074523, + "learning_rate": 3.0380611025772318e-05, + "loss": 0.6529, + "num_tokens": 132681552.0, + "step": 1410 + }, + { + "epoch": 0.24082607953575697, + "grad_norm": 0.5637159314128432, + "learning_rate": 3.037378392217102e-05, + "loss": 0.6133, + "num_tokens": 132773747.0, + "step": 1411 + }, + { + "epoch": 0.24099675712578938, + "grad_norm": 0.5947030680547191, + "learning_rate": 3.0366956818569725e-05, + "loss": 0.626, + "num_tokens": 132847563.0, + "step": 1412 + }, + { + "epoch": 0.2411674347158218, + "grad_norm": 0.55575080933755, + "learning_rate": 3.0360129714968426e-05, + "loss": 0.6466, + "num_tokens": 132924515.0, + "step": 1413 + }, + { + "epoch": 0.24133811230585425, + "grad_norm": 0.4896266312261726, + "learning_rate": 3.035330261136713e-05, + "loss": 0.5947, + "num_tokens": 133040539.0, + "step": 1414 + }, + { + "epoch": 0.24150878989588667, + "grad_norm": 0.5372747104229078, + "learning_rate": 3.0346475507765833e-05, + "loss": 0.6771, + "num_tokens": 133132068.0, + "step": 1415 + }, + { + "epoch": 0.2416794674859191, + "grad_norm": 0.6568590129003236, + "learning_rate": 3.0339648404164537e-05, + "loss": 0.6828, + "num_tokens": 133213198.0, + "step": 1416 + }, + { + "epoch": 0.24185014507595154, + "grad_norm": 0.5496227499025967, + "learning_rate": 3.033282130056324e-05, + "loss": 0.6239, + "num_tokens": 133296925.0, + "step": 1417 + }, + { + "epoch": 0.24202082266598396, + "grad_norm": 0.5194967748960357, + "learning_rate": 3.0325994196961944e-05, + "loss": 0.6366, + "num_tokens": 133401006.0, + "step": 1418 + }, + { + "epoch": 0.24219150025601638, + "grad_norm": 0.5566946253149542, + "learning_rate": 3.0319167093360645e-05, + "loss": 0.6503, + "num_tokens": 133481188.0, + "step": 1419 + }, + { + "epoch": 0.24236217784604883, + "grad_norm": 0.5338227644578518, + "learning_rate": 3.0312339989759345e-05, + "loss": 0.623, + "num_tokens": 133561189.0, + "step": 1420 + }, + { + "epoch": 0.24253285543608125, + "grad_norm": 0.5180882479544722, + "learning_rate": 3.030551288615805e-05, + "loss": 0.52, + "num_tokens": 133651897.0, + "step": 1421 + }, + { + "epoch": 0.24270353302611367, + "grad_norm": 0.4931744842356081, + "learning_rate": 3.0298685782556753e-05, + "loss": 0.6046, + "num_tokens": 133749741.0, + "step": 1422 + }, + { + "epoch": 0.2428742106161461, + "grad_norm": 0.540365450613624, + "learning_rate": 3.0291858678955456e-05, + "loss": 0.6039, + "num_tokens": 133835537.0, + "step": 1423 + }, + { + "epoch": 0.24304488820617853, + "grad_norm": 0.6057705776430942, + "learning_rate": 3.0285031575354157e-05, + "loss": 0.752, + "num_tokens": 133916104.0, + "step": 1424 + }, + { + "epoch": 0.24321556579621095, + "grad_norm": 0.5414105943084075, + "learning_rate": 3.027820447175286e-05, + "loss": 0.5898, + "num_tokens": 133998897.0, + "step": 1425 + }, + { + "epoch": 0.24338624338624337, + "grad_norm": 0.5724644699432798, + "learning_rate": 3.0271377368151564e-05, + "loss": 0.6612, + "num_tokens": 134075965.0, + "step": 1426 + }, + { + "epoch": 0.24355692097627582, + "grad_norm": 0.5092635806793304, + "learning_rate": 3.0264550264550268e-05, + "loss": 0.6758, + "num_tokens": 134181854.0, + "step": 1427 + }, + { + "epoch": 0.24372759856630824, + "grad_norm": 0.47224962245264396, + "learning_rate": 3.0257723160948972e-05, + "loss": 0.6521, + "num_tokens": 134315075.0, + "step": 1428 + }, + { + "epoch": 0.24389827615634066, + "grad_norm": 0.49708555840002566, + "learning_rate": 3.0250896057347676e-05, + "loss": 0.5792, + "num_tokens": 134413861.0, + "step": 1429 + }, + { + "epoch": 0.2440689537463731, + "grad_norm": 0.5198995465483546, + "learning_rate": 3.0244068953746376e-05, + "loss": 0.6367, + "num_tokens": 134518347.0, + "step": 1430 + }, + { + "epoch": 0.24423963133640553, + "grad_norm": 0.4500627773403896, + "learning_rate": 3.023724185014508e-05, + "loss": 0.5546, + "num_tokens": 134637515.0, + "step": 1431 + }, + { + "epoch": 0.24441030892643795, + "grad_norm": 0.4857823323461901, + "learning_rate": 3.023041474654378e-05, + "loss": 0.5564, + "num_tokens": 134743996.0, + "step": 1432 + }, + { + "epoch": 0.2445809865164704, + "grad_norm": 0.5150518953201861, + "learning_rate": 3.0223587642942484e-05, + "loss": 0.6997, + "num_tokens": 134852154.0, + "step": 1433 + }, + { + "epoch": 0.24475166410650281, + "grad_norm": 0.5777695595037731, + "learning_rate": 3.0216760539341184e-05, + "loss": 0.6683, + "num_tokens": 134956239.0, + "step": 1434 + }, + { + "epoch": 0.24492234169653523, + "grad_norm": 0.519644823221684, + "learning_rate": 3.0209933435739888e-05, + "loss": 0.6095, + "num_tokens": 135043357.0, + "step": 1435 + }, + { + "epoch": 0.24509301928656768, + "grad_norm": 0.5479927110839381, + "learning_rate": 3.0203106332138592e-05, + "loss": 0.6592, + "num_tokens": 135124476.0, + "step": 1436 + }, + { + "epoch": 0.2452636968766001, + "grad_norm": 0.5890631093974773, + "learning_rate": 3.0196279228537296e-05, + "loss": 0.7234, + "num_tokens": 135235166.0, + "step": 1437 + }, + { + "epoch": 0.24543437446663252, + "grad_norm": 0.5003995989136789, + "learning_rate": 3.0189452124936e-05, + "loss": 0.7069, + "num_tokens": 135364067.0, + "step": 1438 + }, + { + "epoch": 0.24560505205666497, + "grad_norm": 0.5719995897117487, + "learning_rate": 3.0182625021334703e-05, + "loss": 0.5319, + "num_tokens": 135431557.0, + "step": 1439 + }, + { + "epoch": 0.2457757296466974, + "grad_norm": 0.5431888169755601, + "learning_rate": 3.0175797917733403e-05, + "loss": 0.6824, + "num_tokens": 135526866.0, + "step": 1440 + }, + { + "epoch": 0.2459464072367298, + "grad_norm": 0.5038567551986254, + "learning_rate": 3.0168970814132107e-05, + "loss": 0.6059, + "num_tokens": 135620295.0, + "step": 1441 + }, + { + "epoch": 0.24611708482676226, + "grad_norm": 0.5708391021082982, + "learning_rate": 3.016214371053081e-05, + "loss": 0.6251, + "num_tokens": 135689636.0, + "step": 1442 + }, + { + "epoch": 0.24628776241679468, + "grad_norm": 0.4981041167882797, + "learning_rate": 3.0155316606929515e-05, + "loss": 0.6608, + "num_tokens": 135804899.0, + "step": 1443 + }, + { + "epoch": 0.2464584400068271, + "grad_norm": 0.5394244853646124, + "learning_rate": 3.014848950332822e-05, + "loss": 0.6428, + "num_tokens": 135887679.0, + "step": 1444 + }, + { + "epoch": 0.24662911759685954, + "grad_norm": 0.44377890515202334, + "learning_rate": 3.0141662399726915e-05, + "loss": 0.6267, + "num_tokens": 136020787.0, + "step": 1445 + }, + { + "epoch": 0.24679979518689196, + "grad_norm": 0.5492992299824374, + "learning_rate": 3.013483529612562e-05, + "loss": 0.6447, + "num_tokens": 136103484.0, + "step": 1446 + }, + { + "epoch": 0.24697047277692438, + "grad_norm": 0.5101190344579398, + "learning_rate": 3.0128008192524323e-05, + "loss": 0.5725, + "num_tokens": 136184075.0, + "step": 1447 + }, + { + "epoch": 0.24714115036695683, + "grad_norm": 0.5810213296390501, + "learning_rate": 3.0121181088923027e-05, + "loss": 0.6121, + "num_tokens": 136257902.0, + "step": 1448 + }, + { + "epoch": 0.24731182795698925, + "grad_norm": 0.6841692914658968, + "learning_rate": 3.011435398532173e-05, + "loss": 0.7045, + "num_tokens": 136352675.0, + "step": 1449 + }, + { + "epoch": 0.24748250554702167, + "grad_norm": 0.5339948812279803, + "learning_rate": 3.010752688172043e-05, + "loss": 0.6558, + "num_tokens": 136439978.0, + "step": 1450 + }, + { + "epoch": 0.24765318313705412, + "grad_norm": 0.5379426458410761, + "learning_rate": 3.0100699778119135e-05, + "loss": 0.6052, + "num_tokens": 136511577.0, + "step": 1451 + }, + { + "epoch": 0.24782386072708654, + "grad_norm": 0.46775428335752056, + "learning_rate": 3.009387267451784e-05, + "loss": 0.5971, + "num_tokens": 136611136.0, + "step": 1452 + }, + { + "epoch": 0.24799453831711896, + "grad_norm": 0.5078766030838417, + "learning_rate": 3.0087045570916542e-05, + "loss": 0.6128, + "num_tokens": 136709378.0, + "step": 1453 + }, + { + "epoch": 0.2481652159071514, + "grad_norm": 0.5062203453170451, + "learning_rate": 3.0080218467315246e-05, + "loss": 0.5787, + "num_tokens": 136800260.0, + "step": 1454 + }, + { + "epoch": 0.24833589349718382, + "grad_norm": 0.4770989452241791, + "learning_rate": 3.007339136371395e-05, + "loss": 0.585, + "num_tokens": 136905678.0, + "step": 1455 + }, + { + "epoch": 0.24850657108721624, + "grad_norm": 0.531780153379372, + "learning_rate": 3.006656426011265e-05, + "loss": 0.589, + "num_tokens": 136987891.0, + "step": 1456 + }, + { + "epoch": 0.24867724867724866, + "grad_norm": 0.48479787849425293, + "learning_rate": 3.005973715651135e-05, + "loss": 0.6014, + "num_tokens": 137092227.0, + "step": 1457 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 0.4800305975876206, + "learning_rate": 3.0052910052910054e-05, + "loss": 0.5759, + "num_tokens": 137191163.0, + "step": 1458 + }, + { + "epoch": 0.24901860385731353, + "grad_norm": 0.5049393254941668, + "learning_rate": 3.0046082949308758e-05, + "loss": 0.63, + "num_tokens": 137299283.0, + "step": 1459 + }, + { + "epoch": 0.24918928144734595, + "grad_norm": 0.5289090285507199, + "learning_rate": 3.0039255845707462e-05, + "loss": 0.6543, + "num_tokens": 137391055.0, + "step": 1460 + }, + { + "epoch": 0.2493599590373784, + "grad_norm": 0.5356699481681063, + "learning_rate": 3.0032428742106162e-05, + "loss": 0.6295, + "num_tokens": 137473618.0, + "step": 1461 + }, + { + "epoch": 0.24953063662741082, + "grad_norm": 0.5175860090191465, + "learning_rate": 3.0025601638504866e-05, + "loss": 0.5944, + "num_tokens": 137557934.0, + "step": 1462 + }, + { + "epoch": 0.24970131421744324, + "grad_norm": 0.5049279104519903, + "learning_rate": 3.001877453490357e-05, + "loss": 0.6355, + "num_tokens": 137664052.0, + "step": 1463 + }, + { + "epoch": 0.24987199180747569, + "grad_norm": 0.4932894487570249, + "learning_rate": 3.0011947431302273e-05, + "loss": 0.6831, + "num_tokens": 137771633.0, + "step": 1464 + }, + { + "epoch": 0.25004266939750813, + "grad_norm": 0.5848552107663836, + "learning_rate": 3.0005120327700977e-05, + "loss": 0.6816, + "num_tokens": 137853045.0, + "step": 1465 + }, + { + "epoch": 0.25021334698754055, + "grad_norm": 0.518811311739012, + "learning_rate": 2.999829322409968e-05, + "loss": 0.5387, + "num_tokens": 137933230.0, + "step": 1466 + }, + { + "epoch": 0.250384024577573, + "grad_norm": 0.5132400973436496, + "learning_rate": 2.999146612049838e-05, + "loss": 0.6825, + "num_tokens": 138026819.0, + "step": 1467 + }, + { + "epoch": 0.2505547021676054, + "grad_norm": 0.5173734325022021, + "learning_rate": 2.9984639016897085e-05, + "loss": 0.5782, + "num_tokens": 138102793.0, + "step": 1468 + }, + { + "epoch": 0.2507253797576378, + "grad_norm": 0.5559103218411179, + "learning_rate": 2.9977811913295785e-05, + "loss": 0.6151, + "num_tokens": 138175069.0, + "step": 1469 + }, + { + "epoch": 0.25089605734767023, + "grad_norm": 0.5727172644222017, + "learning_rate": 2.997098480969449e-05, + "loss": 0.673, + "num_tokens": 138249766.0, + "step": 1470 + }, + { + "epoch": 0.2510667349377027, + "grad_norm": 0.503680922931298, + "learning_rate": 2.996415770609319e-05, + "loss": 0.5711, + "num_tokens": 138337209.0, + "step": 1471 + }, + { + "epoch": 0.2512374125277351, + "grad_norm": 0.534112527296179, + "learning_rate": 2.9957330602491893e-05, + "loss": 0.5777, + "num_tokens": 138412909.0, + "step": 1472 + }, + { + "epoch": 0.25140809011776755, + "grad_norm": 0.4748094170455023, + "learning_rate": 2.9950503498890597e-05, + "loss": 0.6563, + "num_tokens": 138531265.0, + "step": 1473 + }, + { + "epoch": 0.25157876770779997, + "grad_norm": 0.564087841189348, + "learning_rate": 2.99436763952893e-05, + "loss": 0.6354, + "num_tokens": 138614697.0, + "step": 1474 + }, + { + "epoch": 0.2517494452978324, + "grad_norm": 0.5029388860974616, + "learning_rate": 2.9936849291688005e-05, + "loss": 0.5813, + "num_tokens": 138706475.0, + "step": 1475 + }, + { + "epoch": 0.2519201228878648, + "grad_norm": 0.47714325409009445, + "learning_rate": 2.993002218808671e-05, + "loss": 0.6239, + "num_tokens": 138820127.0, + "step": 1476 + }, + { + "epoch": 0.2520908004778972, + "grad_norm": 0.6320909145211067, + "learning_rate": 2.992319508448541e-05, + "loss": 0.6576, + "num_tokens": 138884189.0, + "step": 1477 + }, + { + "epoch": 0.2522614780679297, + "grad_norm": 0.5890892367067465, + "learning_rate": 2.9916367980884113e-05, + "loss": 0.5982, + "num_tokens": 138948780.0, + "step": 1478 + }, + { + "epoch": 0.2524321556579621, + "grad_norm": 0.5315193409361323, + "learning_rate": 2.9909540877282816e-05, + "loss": 0.6815, + "num_tokens": 139063119.0, + "step": 1479 + }, + { + "epoch": 0.25260283324799454, + "grad_norm": 0.5617235238053899, + "learning_rate": 2.990271377368152e-05, + "loss": 0.6347, + "num_tokens": 139139361.0, + "step": 1480 + }, + { + "epoch": 0.25277351083802696, + "grad_norm": 0.5401930075759667, + "learning_rate": 2.9895886670080224e-05, + "loss": 0.4976, + "num_tokens": 139211150.0, + "step": 1481 + }, + { + "epoch": 0.2529441884280594, + "grad_norm": 0.46868075695582584, + "learning_rate": 2.988905956647892e-05, + "loss": 0.7158, + "num_tokens": 139361496.0, + "step": 1482 + }, + { + "epoch": 0.2531148660180918, + "grad_norm": 0.43582117870858883, + "learning_rate": 2.9882232462877625e-05, + "loss": 0.6029, + "num_tokens": 139500411.0, + "step": 1483 + }, + { + "epoch": 0.2532855436081243, + "grad_norm": 0.4930277717448138, + "learning_rate": 2.987540535927633e-05, + "loss": 0.6046, + "num_tokens": 139606564.0, + "step": 1484 + }, + { + "epoch": 0.2534562211981567, + "grad_norm": 0.5084470901240925, + "learning_rate": 2.9868578255675032e-05, + "loss": 0.5726, + "num_tokens": 139719725.0, + "step": 1485 + }, + { + "epoch": 0.2536268987881891, + "grad_norm": 0.48011145164763613, + "learning_rate": 2.9861751152073736e-05, + "loss": 0.6344, + "num_tokens": 139824776.0, + "step": 1486 + }, + { + "epoch": 0.25379757637822153, + "grad_norm": 0.4815992656785346, + "learning_rate": 2.9854924048472436e-05, + "loss": 0.5855, + "num_tokens": 139928172.0, + "step": 1487 + }, + { + "epoch": 0.25396825396825395, + "grad_norm": 0.5069969158999137, + "learning_rate": 2.984809694487114e-05, + "loss": 0.5625, + "num_tokens": 140011821.0, + "step": 1488 + }, + { + "epoch": 0.2541389315582864, + "grad_norm": 0.5152385983240086, + "learning_rate": 2.9841269841269844e-05, + "loss": 0.7513, + "num_tokens": 140133252.0, + "step": 1489 + }, + { + "epoch": 0.25430960914831885, + "grad_norm": 0.5766370174929791, + "learning_rate": 2.9834442737668548e-05, + "loss": 0.6693, + "num_tokens": 140225355.0, + "step": 1490 + }, + { + "epoch": 0.25448028673835127, + "grad_norm": 0.5463644226462849, + "learning_rate": 2.982761563406725e-05, + "loss": 0.7806, + "num_tokens": 140317996.0, + "step": 1491 + }, + { + "epoch": 0.2546509643283837, + "grad_norm": 0.5105852462959829, + "learning_rate": 2.9820788530465955e-05, + "loss": 0.6763, + "num_tokens": 140431939.0, + "step": 1492 + }, + { + "epoch": 0.2548216419184161, + "grad_norm": 0.5372631823244021, + "learning_rate": 2.9813961426864655e-05, + "loss": 0.6438, + "num_tokens": 140518173.0, + "step": 1493 + }, + { + "epoch": 0.25499231950844853, + "grad_norm": 0.5534768553444708, + "learning_rate": 2.9807134323263356e-05, + "loss": 0.7666, + "num_tokens": 140617973.0, + "step": 1494 + }, + { + "epoch": 0.25516299709848095, + "grad_norm": 0.4690692711003919, + "learning_rate": 2.980030721966206e-05, + "loss": 0.5878, + "num_tokens": 140729558.0, + "step": 1495 + }, + { + "epoch": 0.2553336746885134, + "grad_norm": 0.5519344538013113, + "learning_rate": 2.9793480116060763e-05, + "loss": 0.6622, + "num_tokens": 140815301.0, + "step": 1496 + }, + { + "epoch": 0.25550435227854584, + "grad_norm": 0.7214930033997207, + "learning_rate": 2.9786653012459467e-05, + "loss": 0.6531, + "num_tokens": 140919638.0, + "step": 1497 + }, + { + "epoch": 0.25567502986857826, + "grad_norm": 0.5853761900600749, + "learning_rate": 2.9779825908858167e-05, + "loss": 0.6237, + "num_tokens": 141014876.0, + "step": 1498 + }, + { + "epoch": 0.2558457074586107, + "grad_norm": 0.6263849408393748, + "learning_rate": 2.977299880525687e-05, + "loss": 0.7401, + "num_tokens": 141106796.0, + "step": 1499 + }, + { + "epoch": 0.2560163850486431, + "grad_norm": 0.5365435575890392, + "learning_rate": 2.9766171701655575e-05, + "loss": 0.6765, + "num_tokens": 141199769.0, + "step": 1500 + }, + { + "epoch": 0.2561870626386755, + "grad_norm": 0.5924826250981401, + "learning_rate": 2.975934459805428e-05, + "loss": 0.6891, + "num_tokens": 141287523.0, + "step": 1501 + }, + { + "epoch": 0.256357740228708, + "grad_norm": 0.5231091512943465, + "learning_rate": 2.9752517494452983e-05, + "loss": 0.6446, + "num_tokens": 141381797.0, + "step": 1502 + }, + { + "epoch": 0.2565284178187404, + "grad_norm": 0.8789586278375571, + "learning_rate": 2.9745690390851686e-05, + "loss": 0.6554, + "num_tokens": 141485024.0, + "step": 1503 + }, + { + "epoch": 0.25669909540877284, + "grad_norm": 0.524529035543788, + "learning_rate": 2.9738863287250387e-05, + "loss": 0.6575, + "num_tokens": 141575204.0, + "step": 1504 + }, + { + "epoch": 0.25686977299880526, + "grad_norm": 0.5617828009333108, + "learning_rate": 2.973203618364909e-05, + "loss": 0.6628, + "num_tokens": 141675952.0, + "step": 1505 + }, + { + "epoch": 0.2570404505888377, + "grad_norm": 0.5321845639672593, + "learning_rate": 2.9725209080047794e-05, + "loss": 0.6111, + "num_tokens": 141768017.0, + "step": 1506 + }, + { + "epoch": 0.2572111281788701, + "grad_norm": 0.4816956613082202, + "learning_rate": 2.9718381976446495e-05, + "loss": 0.5241, + "num_tokens": 141868134.0, + "step": 1507 + }, + { + "epoch": 0.2573818057689025, + "grad_norm": 0.5402964820566525, + "learning_rate": 2.9711554872845195e-05, + "loss": 0.5369, + "num_tokens": 141943108.0, + "step": 1508 + }, + { + "epoch": 0.257552483358935, + "grad_norm": 0.5619439883493221, + "learning_rate": 2.97047277692439e-05, + "loss": 0.6031, + "num_tokens": 142017650.0, + "step": 1509 + }, + { + "epoch": 0.2577231609489674, + "grad_norm": 0.5433155794270991, + "learning_rate": 2.9697900665642602e-05, + "loss": 0.6377, + "num_tokens": 142116197.0, + "step": 1510 + }, + { + "epoch": 0.25789383853899983, + "grad_norm": 0.5223656497393874, + "learning_rate": 2.9691073562041306e-05, + "loss": 0.642, + "num_tokens": 142207277.0, + "step": 1511 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.5176784238002053, + "learning_rate": 2.968424645844001e-05, + "loss": 0.737, + "num_tokens": 142328430.0, + "step": 1512 + }, + { + "epoch": 0.25823519371906467, + "grad_norm": 0.6071290315898796, + "learning_rate": 2.9677419354838714e-05, + "loss": 0.6037, + "num_tokens": 142391489.0, + "step": 1513 + }, + { + "epoch": 0.2584058713090971, + "grad_norm": 0.5655736272189225, + "learning_rate": 2.9670592251237414e-05, + "loss": 0.5664, + "num_tokens": 142455328.0, + "step": 1514 + }, + { + "epoch": 0.25857654889912957, + "grad_norm": 0.5182064486394659, + "learning_rate": 2.9663765147636118e-05, + "loss": 0.572, + "num_tokens": 142540202.0, + "step": 1515 + }, + { + "epoch": 0.258747226489162, + "grad_norm": 0.5504339113645859, + "learning_rate": 2.965693804403482e-05, + "loss": 0.5649, + "num_tokens": 142617592.0, + "step": 1516 + }, + { + "epoch": 0.2589179040791944, + "grad_norm": 0.5439145067927225, + "learning_rate": 2.9650110940433525e-05, + "loss": 0.7104, + "num_tokens": 142725462.0, + "step": 1517 + }, + { + "epoch": 0.2590885816692268, + "grad_norm": 0.5272139752010805, + "learning_rate": 2.964328383683223e-05, + "loss": 0.6386, + "num_tokens": 142816608.0, + "step": 1518 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.5024787201354364, + "learning_rate": 2.9636456733230926e-05, + "loss": 0.6259, + "num_tokens": 142928382.0, + "step": 1519 + }, + { + "epoch": 0.25942993684929166, + "grad_norm": 0.5299193403971175, + "learning_rate": 2.962962962962963e-05, + "loss": 0.6721, + "num_tokens": 143029084.0, + "step": 1520 + }, + { + "epoch": 0.25960061443932414, + "grad_norm": 0.5499690134059552, + "learning_rate": 2.9622802526028334e-05, + "loss": 0.6364, + "num_tokens": 143106807.0, + "step": 1521 + }, + { + "epoch": 0.25977129202935656, + "grad_norm": 0.49391904043146334, + "learning_rate": 2.9615975422427037e-05, + "loss": 0.5594, + "num_tokens": 143204105.0, + "step": 1522 + }, + { + "epoch": 0.259941969619389, + "grad_norm": 0.4955415042346165, + "learning_rate": 2.960914831882574e-05, + "loss": 0.5762, + "num_tokens": 143318252.0, + "step": 1523 + }, + { + "epoch": 0.2601126472094214, + "grad_norm": 0.5784310482788532, + "learning_rate": 2.960232121522444e-05, + "loss": 0.6551, + "num_tokens": 143396350.0, + "step": 1524 + }, + { + "epoch": 0.2602833247994538, + "grad_norm": 0.5357940290505442, + "learning_rate": 2.9595494111623145e-05, + "loss": 0.6191, + "num_tokens": 143488749.0, + "step": 1525 + }, + { + "epoch": 0.26045400238948624, + "grad_norm": 0.47657025406515896, + "learning_rate": 2.958866700802185e-05, + "loss": 0.643, + "num_tokens": 143609040.0, + "step": 1526 + }, + { + "epoch": 0.2606246799795187, + "grad_norm": 0.534966755733081, + "learning_rate": 2.9581839904420553e-05, + "loss": 0.5886, + "num_tokens": 143686369.0, + "step": 1527 + }, + { + "epoch": 0.26079535756955113, + "grad_norm": 0.5482248006816225, + "learning_rate": 2.9575012800819257e-05, + "loss": 0.6829, + "num_tokens": 143779401.0, + "step": 1528 + }, + { + "epoch": 0.26096603515958355, + "grad_norm": 0.462882752711681, + "learning_rate": 2.956818569721796e-05, + "loss": 0.6091, + "num_tokens": 143886778.0, + "step": 1529 + }, + { + "epoch": 0.261136712749616, + "grad_norm": 0.519357143908201, + "learning_rate": 2.9561358593616664e-05, + "loss": 0.6604, + "num_tokens": 143971964.0, + "step": 1530 + }, + { + "epoch": 0.2613073903396484, + "grad_norm": 0.6191592034850876, + "learning_rate": 2.955453149001536e-05, + "loss": 0.7069, + "num_tokens": 144091301.0, + "step": 1531 + }, + { + "epoch": 0.2614780679296808, + "grad_norm": 0.5112825178494692, + "learning_rate": 2.9547704386414065e-05, + "loss": 0.6098, + "num_tokens": 144194027.0, + "step": 1532 + }, + { + "epoch": 0.2616487455197133, + "grad_norm": 0.5045064120002258, + "learning_rate": 2.954087728281277e-05, + "loss": 0.6679, + "num_tokens": 144304814.0, + "step": 1533 + }, + { + "epoch": 0.2618194231097457, + "grad_norm": 0.5283253619728645, + "learning_rate": 2.9534050179211472e-05, + "loss": 0.5436, + "num_tokens": 144382924.0, + "step": 1534 + }, + { + "epoch": 0.26199010069977813, + "grad_norm": 0.5280618535734183, + "learning_rate": 2.9527223075610173e-05, + "loss": 0.5668, + "num_tokens": 144464507.0, + "step": 1535 + }, + { + "epoch": 0.26216077828981055, + "grad_norm": 0.5645763596542448, + "learning_rate": 2.9520395972008877e-05, + "loss": 0.6084, + "num_tokens": 144542962.0, + "step": 1536 + }, + { + "epoch": 0.26233145587984297, + "grad_norm": 0.5597915680962701, + "learning_rate": 2.951356886840758e-05, + "loss": 0.5585, + "num_tokens": 144608128.0, + "step": 1537 + }, + { + "epoch": 0.2625021334698754, + "grad_norm": 0.535778393663071, + "learning_rate": 2.9506741764806284e-05, + "loss": 0.6028, + "num_tokens": 144688473.0, + "step": 1538 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 0.5184764074921624, + "learning_rate": 2.9499914661204988e-05, + "loss": 0.6178, + "num_tokens": 144778701.0, + "step": 1539 + }, + { + "epoch": 0.2628434886499403, + "grad_norm": 0.5536001085036046, + "learning_rate": 2.949308755760369e-05, + "loss": 0.6388, + "num_tokens": 144857983.0, + "step": 1540 + }, + { + "epoch": 0.2630141662399727, + "grad_norm": 0.4995363070553543, + "learning_rate": 2.9486260454002392e-05, + "loss": 0.6089, + "num_tokens": 144949649.0, + "step": 1541 + }, + { + "epoch": 0.2631848438300051, + "grad_norm": 0.5726964066510651, + "learning_rate": 2.9479433350401096e-05, + "loss": 0.6093, + "num_tokens": 145037056.0, + "step": 1542 + }, + { + "epoch": 0.26335552142003754, + "grad_norm": 0.5349989121027221, + "learning_rate": 2.94726062467998e-05, + "loss": 0.5906, + "num_tokens": 145115312.0, + "step": 1543 + }, + { + "epoch": 0.26352619901006996, + "grad_norm": 0.5133903824577276, + "learning_rate": 2.94657791431985e-05, + "loss": 0.6846, + "num_tokens": 145219903.0, + "step": 1544 + }, + { + "epoch": 0.2636968766001024, + "grad_norm": 0.48934548669521827, + "learning_rate": 2.94589520395972e-05, + "loss": 0.6251, + "num_tokens": 145318911.0, + "step": 1545 + }, + { + "epoch": 0.26386755419013486, + "grad_norm": 0.5157008691365432, + "learning_rate": 2.9452124935995904e-05, + "loss": 0.6322, + "num_tokens": 145400457.0, + "step": 1546 + }, + { + "epoch": 0.2640382317801673, + "grad_norm": 0.4351696695983458, + "learning_rate": 2.9445297832394608e-05, + "loss": 0.5656, + "num_tokens": 145516565.0, + "step": 1547 + }, + { + "epoch": 0.2642089093701997, + "grad_norm": 0.52396412338884, + "learning_rate": 2.943847072879331e-05, + "loss": 0.6264, + "num_tokens": 145600986.0, + "step": 1548 + }, + { + "epoch": 0.2643795869602321, + "grad_norm": 0.5245396073408771, + "learning_rate": 2.9431643625192015e-05, + "loss": 0.6371, + "num_tokens": 145705379.0, + "step": 1549 + }, + { + "epoch": 0.26455026455026454, + "grad_norm": 0.5052271763599925, + "learning_rate": 2.942481652159072e-05, + "loss": 0.5799, + "num_tokens": 145797393.0, + "step": 1550 + }, + { + "epoch": 0.26472094214029696, + "grad_norm": 0.678321577737671, + "learning_rate": 2.941798941798942e-05, + "loss": 0.683, + "num_tokens": 145884000.0, + "step": 1551 + }, + { + "epoch": 0.26489161973032943, + "grad_norm": 0.5385645518113303, + "learning_rate": 2.9411162314388123e-05, + "loss": 0.6482, + "num_tokens": 145968971.0, + "step": 1552 + }, + { + "epoch": 0.26506229732036185, + "grad_norm": 0.4814562000917823, + "learning_rate": 2.9404335210786827e-05, + "loss": 0.6478, + "num_tokens": 146088393.0, + "step": 1553 + }, + { + "epoch": 0.26523297491039427, + "grad_norm": 0.5176513302065541, + "learning_rate": 2.939750810718553e-05, + "loss": 0.6146, + "num_tokens": 146185422.0, + "step": 1554 + }, + { + "epoch": 0.2654036525004267, + "grad_norm": 0.48039549607992427, + "learning_rate": 2.9390681003584235e-05, + "loss": 0.6648, + "num_tokens": 146311067.0, + "step": 1555 + }, + { + "epoch": 0.2655743300904591, + "grad_norm": 0.5086001135682076, + "learning_rate": 2.938385389998293e-05, + "loss": 0.589, + "num_tokens": 146407137.0, + "step": 1556 + }, + { + "epoch": 0.26574500768049153, + "grad_norm": 0.5342821299350323, + "learning_rate": 2.9377026796381635e-05, + "loss": 0.6506, + "num_tokens": 146490715.0, + "step": 1557 + }, + { + "epoch": 0.265915685270524, + "grad_norm": 0.5524565657044019, + "learning_rate": 2.937019969278034e-05, + "loss": 0.5685, + "num_tokens": 146563083.0, + "step": 1558 + }, + { + "epoch": 0.2660863628605564, + "grad_norm": 0.5823713005217535, + "learning_rate": 2.9363372589179043e-05, + "loss": 0.7462, + "num_tokens": 146650872.0, + "step": 1559 + }, + { + "epoch": 0.26625704045058884, + "grad_norm": 0.4928357675712604, + "learning_rate": 2.9356545485577747e-05, + "loss": 0.6557, + "num_tokens": 146759839.0, + "step": 1560 + }, + { + "epoch": 0.26642771804062126, + "grad_norm": 0.5307293776661819, + "learning_rate": 2.934971838197645e-05, + "loss": 0.6778, + "num_tokens": 146858971.0, + "step": 1561 + }, + { + "epoch": 0.2665983956306537, + "grad_norm": 0.5071972904843993, + "learning_rate": 2.934289127837515e-05, + "loss": 0.639, + "num_tokens": 146958459.0, + "step": 1562 + }, + { + "epoch": 0.2667690732206861, + "grad_norm": 0.5023480526581776, + "learning_rate": 2.9336064174773854e-05, + "loss": 0.687, + "num_tokens": 147073982.0, + "step": 1563 + }, + { + "epoch": 0.2669397508107186, + "grad_norm": 0.5520075992719014, + "learning_rate": 2.9329237071172558e-05, + "loss": 0.6476, + "num_tokens": 147168638.0, + "step": 1564 + }, + { + "epoch": 0.267110428400751, + "grad_norm": 0.500779511060749, + "learning_rate": 2.9322409967571262e-05, + "loss": 0.6647, + "num_tokens": 147270442.0, + "step": 1565 + }, + { + "epoch": 0.2672811059907834, + "grad_norm": 0.5093050911645667, + "learning_rate": 2.9315582863969966e-05, + "loss": 0.5698, + "num_tokens": 147358911.0, + "step": 1566 + }, + { + "epoch": 0.26745178358081584, + "grad_norm": 0.5120519755410098, + "learning_rate": 2.930875576036867e-05, + "loss": 0.647, + "num_tokens": 147457901.0, + "step": 1567 + }, + { + "epoch": 0.26762246117084826, + "grad_norm": 0.4848193489376691, + "learning_rate": 2.9301928656767366e-05, + "loss": 0.6118, + "num_tokens": 147565286.0, + "step": 1568 + }, + { + "epoch": 0.2677931387608807, + "grad_norm": 0.4984239405036281, + "learning_rate": 2.929510155316607e-05, + "loss": 0.631, + "num_tokens": 147683777.0, + "step": 1569 + }, + { + "epoch": 0.2679638163509131, + "grad_norm": 0.46443558506729576, + "learning_rate": 2.9288274449564774e-05, + "loss": 0.5916, + "num_tokens": 147794162.0, + "step": 1570 + }, + { + "epoch": 0.2681344939409456, + "grad_norm": 0.46908629472825814, + "learning_rate": 2.9281447345963478e-05, + "loss": 0.5933, + "num_tokens": 147909760.0, + "step": 1571 + }, + { + "epoch": 0.268305171530978, + "grad_norm": 0.5330693395031496, + "learning_rate": 2.9274620242362178e-05, + "loss": 0.6386, + "num_tokens": 147999056.0, + "step": 1572 + }, + { + "epoch": 0.2684758491210104, + "grad_norm": 0.5626345771938205, + "learning_rate": 2.9267793138760882e-05, + "loss": 0.6123, + "num_tokens": 148084130.0, + "step": 1573 + }, + { + "epoch": 0.26864652671104283, + "grad_norm": 0.46367283243322116, + "learning_rate": 2.9260966035159586e-05, + "loss": 0.6665, + "num_tokens": 148204228.0, + "step": 1574 + }, + { + "epoch": 0.26881720430107525, + "grad_norm": 0.5326705177851854, + "learning_rate": 2.925413893155829e-05, + "loss": 0.6379, + "num_tokens": 148281145.0, + "step": 1575 + }, + { + "epoch": 0.26898788189110767, + "grad_norm": 0.554595946493135, + "learning_rate": 2.9247311827956993e-05, + "loss": 0.6012, + "num_tokens": 148357719.0, + "step": 1576 + }, + { + "epoch": 0.26915855948114015, + "grad_norm": 0.5372268066791315, + "learning_rate": 2.9240484724355697e-05, + "loss": 0.6081, + "num_tokens": 148441023.0, + "step": 1577 + }, + { + "epoch": 0.26932923707117257, + "grad_norm": 0.6260486504727868, + "learning_rate": 2.9233657620754397e-05, + "loss": 0.606, + "num_tokens": 148503699.0, + "step": 1578 + }, + { + "epoch": 0.269499914661205, + "grad_norm": 0.5002745319355029, + "learning_rate": 2.92268305171531e-05, + "loss": 0.6704, + "num_tokens": 148610884.0, + "step": 1579 + }, + { + "epoch": 0.2696705922512374, + "grad_norm": 0.5170220802860002, + "learning_rate": 2.9220003413551805e-05, + "loss": 0.6601, + "num_tokens": 148720787.0, + "step": 1580 + }, + { + "epoch": 0.2698412698412698, + "grad_norm": 0.5384104724262995, + "learning_rate": 2.9213176309950505e-05, + "loss": 0.5771, + "num_tokens": 148796197.0, + "step": 1581 + }, + { + "epoch": 0.27001194743130225, + "grad_norm": 0.5058126860428649, + "learning_rate": 2.9206349206349206e-05, + "loss": 0.6212, + "num_tokens": 148894658.0, + "step": 1582 + }, + { + "epoch": 0.2701826250213347, + "grad_norm": 0.5568169329044079, + "learning_rate": 2.919952210274791e-05, + "loss": 0.639, + "num_tokens": 148972172.0, + "step": 1583 + }, + { + "epoch": 0.27035330261136714, + "grad_norm": 0.5700097961123177, + "learning_rate": 2.9192694999146613e-05, + "loss": 0.5667, + "num_tokens": 149059528.0, + "step": 1584 + }, + { + "epoch": 0.27052398020139956, + "grad_norm": 0.5274858102015078, + "learning_rate": 2.9185867895545317e-05, + "loss": 0.6488, + "num_tokens": 149152427.0, + "step": 1585 + }, + { + "epoch": 0.270694657791432, + "grad_norm": 0.5038832631003527, + "learning_rate": 2.917904079194402e-05, + "loss": 0.6656, + "num_tokens": 149261747.0, + "step": 1586 + }, + { + "epoch": 0.2708653353814644, + "grad_norm": 0.5826917571856942, + "learning_rate": 2.9172213688342724e-05, + "loss": 0.6818, + "num_tokens": 149366533.0, + "step": 1587 + }, + { + "epoch": 0.2710360129714968, + "grad_norm": 0.555409232448178, + "learning_rate": 2.9165386584741425e-05, + "loss": 0.7236, + "num_tokens": 149457793.0, + "step": 1588 + }, + { + "epoch": 0.2712066905615293, + "grad_norm": 0.4961785589691689, + "learning_rate": 2.915855948114013e-05, + "loss": 0.5858, + "num_tokens": 149552905.0, + "step": 1589 + }, + { + "epoch": 0.2713773681515617, + "grad_norm": 0.5706697997553639, + "learning_rate": 2.9151732377538832e-05, + "loss": 0.6508, + "num_tokens": 149623652.0, + "step": 1590 + }, + { + "epoch": 0.27154804574159414, + "grad_norm": 0.5556306327323185, + "learning_rate": 2.9144905273937536e-05, + "loss": 0.6392, + "num_tokens": 149713979.0, + "step": 1591 + }, + { + "epoch": 0.27171872333162655, + "grad_norm": 0.49730750102407684, + "learning_rate": 2.913807817033624e-05, + "loss": 0.6115, + "num_tokens": 149828848.0, + "step": 1592 + }, + { + "epoch": 0.271889400921659, + "grad_norm": 0.524169788304647, + "learning_rate": 2.9131251066734937e-05, + "loss": 0.6742, + "num_tokens": 149943653.0, + "step": 1593 + }, + { + "epoch": 0.2720600785116914, + "grad_norm": 0.501759367567607, + "learning_rate": 2.912442396313364e-05, + "loss": 0.556, + "num_tokens": 150034896.0, + "step": 1594 + }, + { + "epoch": 0.27223075610172387, + "grad_norm": 0.5119541631604985, + "learning_rate": 2.9117596859532344e-05, + "loss": 0.6759, + "num_tokens": 150141508.0, + "step": 1595 + }, + { + "epoch": 0.2724014336917563, + "grad_norm": 0.5340123503829572, + "learning_rate": 2.9110769755931048e-05, + "loss": 0.5938, + "num_tokens": 150238664.0, + "step": 1596 + }, + { + "epoch": 0.2725721112817887, + "grad_norm": 0.5188588504786178, + "learning_rate": 2.9103942652329752e-05, + "loss": 0.6742, + "num_tokens": 150343343.0, + "step": 1597 + }, + { + "epoch": 0.27274278887182113, + "grad_norm": 0.5354140245683103, + "learning_rate": 2.9097115548728456e-05, + "loss": 0.5912, + "num_tokens": 150434772.0, + "step": 1598 + }, + { + "epoch": 0.27291346646185355, + "grad_norm": 0.5672294241462507, + "learning_rate": 2.9090288445127156e-05, + "loss": 0.6247, + "num_tokens": 150513939.0, + "step": 1599 + }, + { + "epoch": 0.27308414405188597, + "grad_norm": 0.554496137292418, + "learning_rate": 2.908346134152586e-05, + "loss": 0.595, + "num_tokens": 150582432.0, + "step": 1600 + }, + { + "epoch": 0.2732548216419184, + "grad_norm": 0.4824454080424507, + "learning_rate": 2.9076634237924564e-05, + "loss": 0.6615, + "num_tokens": 150711343.0, + "step": 1601 + }, + { + "epoch": 0.27342549923195086, + "grad_norm": 0.4955043522938973, + "learning_rate": 2.9069807134323267e-05, + "loss": 0.607, + "num_tokens": 150812637.0, + "step": 1602 + }, + { + "epoch": 0.2735961768219833, + "grad_norm": 0.5123726943091212, + "learning_rate": 2.906298003072197e-05, + "loss": 0.5589, + "num_tokens": 150897417.0, + "step": 1603 + }, + { + "epoch": 0.2737668544120157, + "grad_norm": 0.527004506857155, + "learning_rate": 2.9056152927120675e-05, + "loss": 0.5154, + "num_tokens": 150965289.0, + "step": 1604 + }, + { + "epoch": 0.2739375320020481, + "grad_norm": 0.5413316770921324, + "learning_rate": 2.9049325823519375e-05, + "loss": 0.6539, + "num_tokens": 151046520.0, + "step": 1605 + }, + { + "epoch": 0.27410820959208054, + "grad_norm": 0.5109708538472376, + "learning_rate": 2.9042498719918076e-05, + "loss": 0.5681, + "num_tokens": 151133312.0, + "step": 1606 + }, + { + "epoch": 0.27427888718211296, + "grad_norm": 0.48138993842613137, + "learning_rate": 2.903567161631678e-05, + "loss": 0.5813, + "num_tokens": 151243634.0, + "step": 1607 + }, + { + "epoch": 0.27444956477214544, + "grad_norm": 0.5571396512911816, + "learning_rate": 2.9028844512715483e-05, + "loss": 0.6247, + "num_tokens": 151322382.0, + "step": 1608 + }, + { + "epoch": 0.27462024236217786, + "grad_norm": 0.6267887557000068, + "learning_rate": 2.9022017409114183e-05, + "loss": 0.6875, + "num_tokens": 151401989.0, + "step": 1609 + }, + { + "epoch": 0.2747909199522103, + "grad_norm": 0.5086875696311032, + "learning_rate": 2.9015190305512887e-05, + "loss": 0.5903, + "num_tokens": 151489934.0, + "step": 1610 + }, + { + "epoch": 0.2749615975422427, + "grad_norm": 0.4915241710507783, + "learning_rate": 2.900836320191159e-05, + "loss": 0.6196, + "num_tokens": 151590212.0, + "step": 1611 + }, + { + "epoch": 0.2751322751322751, + "grad_norm": 0.5243876527333179, + "learning_rate": 2.9001536098310295e-05, + "loss": 0.6387, + "num_tokens": 151695998.0, + "step": 1612 + }, + { + "epoch": 0.27530295272230754, + "grad_norm": 0.5187938875728016, + "learning_rate": 2.8994708994709e-05, + "loss": 0.6415, + "num_tokens": 151791260.0, + "step": 1613 + }, + { + "epoch": 0.27547363031234, + "grad_norm": 0.48631853824570576, + "learning_rate": 2.8987881891107702e-05, + "loss": 0.5941, + "num_tokens": 151898212.0, + "step": 1614 + }, + { + "epoch": 0.27564430790237243, + "grad_norm": 0.5641643980677884, + "learning_rate": 2.8981054787506403e-05, + "loss": 0.5975, + "num_tokens": 151971952.0, + "step": 1615 + }, + { + "epoch": 0.27581498549240485, + "grad_norm": 0.5841254208484151, + "learning_rate": 2.8974227683905106e-05, + "loss": 0.6195, + "num_tokens": 152045126.0, + "step": 1616 + }, + { + "epoch": 0.27598566308243727, + "grad_norm": 0.5330335610342765, + "learning_rate": 2.896740058030381e-05, + "loss": 0.6466, + "num_tokens": 152140892.0, + "step": 1617 + }, + { + "epoch": 0.2761563406724697, + "grad_norm": 0.501418615177315, + "learning_rate": 2.896057347670251e-05, + "loss": 0.6262, + "num_tokens": 152244288.0, + "step": 1618 + }, + { + "epoch": 0.2763270182625021, + "grad_norm": 0.4930359068405374, + "learning_rate": 2.895374637310121e-05, + "loss": 0.601, + "num_tokens": 152355518.0, + "step": 1619 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 0.7749953913051381, + "learning_rate": 2.8946919269499915e-05, + "loss": 0.6357, + "num_tokens": 152449688.0, + "step": 1620 + }, + { + "epoch": 0.276668373442567, + "grad_norm": 0.4354898483238495, + "learning_rate": 2.894009216589862e-05, + "loss": 0.6168, + "num_tokens": 152590298.0, + "step": 1621 + }, + { + "epoch": 0.2768390510325994, + "grad_norm": 0.5512742082300046, + "learning_rate": 2.8933265062297322e-05, + "loss": 0.6614, + "num_tokens": 152670584.0, + "step": 1622 + }, + { + "epoch": 0.27700972862263185, + "grad_norm": 0.5833277577715714, + "learning_rate": 2.8926437958696026e-05, + "loss": 0.7341, + "num_tokens": 152764278.0, + "step": 1623 + }, + { + "epoch": 0.27718040621266427, + "grad_norm": 0.5175480257914119, + "learning_rate": 2.891961085509473e-05, + "loss": 0.775, + "num_tokens": 152880824.0, + "step": 1624 + }, + { + "epoch": 0.2773510838026967, + "grad_norm": 0.5277410009278345, + "learning_rate": 2.891278375149343e-05, + "loss": 0.591, + "num_tokens": 152957985.0, + "step": 1625 + }, + { + "epoch": 0.27752176139272916, + "grad_norm": 0.5029482198829658, + "learning_rate": 2.8905956647892134e-05, + "loss": 0.596, + "num_tokens": 153060255.0, + "step": 1626 + }, + { + "epoch": 0.2776924389827616, + "grad_norm": 0.5363513224590286, + "learning_rate": 2.8899129544290838e-05, + "loss": 0.5822, + "num_tokens": 153138923.0, + "step": 1627 + }, + { + "epoch": 0.277863116572794, + "grad_norm": 0.6803750015006881, + "learning_rate": 2.889230244068954e-05, + "loss": 0.7348, + "num_tokens": 153218403.0, + "step": 1628 + }, + { + "epoch": 0.2780337941628264, + "grad_norm": 0.6129248117956091, + "learning_rate": 2.8885475337088245e-05, + "loss": 0.7012, + "num_tokens": 153311802.0, + "step": 1629 + }, + { + "epoch": 0.27820447175285884, + "grad_norm": 0.508415673243472, + "learning_rate": 2.8878648233486942e-05, + "loss": 0.5995, + "num_tokens": 153415132.0, + "step": 1630 + }, + { + "epoch": 0.27837514934289126, + "grad_norm": 0.5345503216174748, + "learning_rate": 2.8871821129885646e-05, + "loss": 0.631, + "num_tokens": 153508670.0, + "step": 1631 + }, + { + "epoch": 0.27854582693292373, + "grad_norm": 0.5429709552443895, + "learning_rate": 2.886499402628435e-05, + "loss": 0.5938, + "num_tokens": 153589728.0, + "step": 1632 + }, + { + "epoch": 0.27871650452295615, + "grad_norm": 0.5302578873525465, + "learning_rate": 2.8858166922683053e-05, + "loss": 0.6868, + "num_tokens": 153683480.0, + "step": 1633 + }, + { + "epoch": 0.2788871821129886, + "grad_norm": 0.5460780570492969, + "learning_rate": 2.8851339819081757e-05, + "loss": 0.6942, + "num_tokens": 153780343.0, + "step": 1634 + }, + { + "epoch": 0.279057859703021, + "grad_norm": 0.46991157853437066, + "learning_rate": 2.884451271548046e-05, + "loss": 0.6832, + "num_tokens": 153907818.0, + "step": 1635 + }, + { + "epoch": 0.2792285372930534, + "grad_norm": 0.5443205526752788, + "learning_rate": 2.883768561187916e-05, + "loss": 0.603, + "num_tokens": 153995538.0, + "step": 1636 + }, + { + "epoch": 0.27939921488308583, + "grad_norm": 0.517683163054388, + "learning_rate": 2.8830858508277865e-05, + "loss": 0.6431, + "num_tokens": 154092004.0, + "step": 1637 + }, + { + "epoch": 0.27956989247311825, + "grad_norm": 0.48843970974637657, + "learning_rate": 2.882403140467657e-05, + "loss": 0.6436, + "num_tokens": 154201559.0, + "step": 1638 + }, + { + "epoch": 0.27974057006315073, + "grad_norm": 0.5030594027997682, + "learning_rate": 2.8817204301075273e-05, + "loss": 0.6193, + "num_tokens": 154303723.0, + "step": 1639 + }, + { + "epoch": 0.27991124765318315, + "grad_norm": 0.5580081897152643, + "learning_rate": 2.8810377197473976e-05, + "loss": 0.6045, + "num_tokens": 154381365.0, + "step": 1640 + }, + { + "epoch": 0.28008192524321557, + "grad_norm": 0.5709744607317606, + "learning_rate": 2.880355009387268e-05, + "loss": 0.587, + "num_tokens": 154453583.0, + "step": 1641 + }, + { + "epoch": 0.280252602833248, + "grad_norm": 0.5362278412019073, + "learning_rate": 2.879672299027138e-05, + "loss": 0.5842, + "num_tokens": 154536007.0, + "step": 1642 + }, + { + "epoch": 0.2804232804232804, + "grad_norm": 0.497498615319572, + "learning_rate": 2.878989588667008e-05, + "loss": 0.5602, + "num_tokens": 154627405.0, + "step": 1643 + }, + { + "epoch": 0.2805939580133128, + "grad_norm": 0.5369642013038153, + "learning_rate": 2.8783068783068785e-05, + "loss": 0.7162, + "num_tokens": 154726118.0, + "step": 1644 + }, + { + "epoch": 0.2807646356033453, + "grad_norm": 0.5207523805730631, + "learning_rate": 2.877624167946749e-05, + "loss": 0.6706, + "num_tokens": 154822883.0, + "step": 1645 + }, + { + "epoch": 0.2809353131933777, + "grad_norm": 0.5631939522324584, + "learning_rate": 2.876941457586619e-05, + "loss": 0.6165, + "num_tokens": 154906962.0, + "step": 1646 + }, + { + "epoch": 0.28110599078341014, + "grad_norm": 0.8080780598389131, + "learning_rate": 2.8762587472264893e-05, + "loss": 0.731, + "num_tokens": 154986916.0, + "step": 1647 + }, + { + "epoch": 0.28127666837344256, + "grad_norm": 0.5160576424631989, + "learning_rate": 2.8755760368663596e-05, + "loss": 0.6873, + "num_tokens": 155087532.0, + "step": 1648 + }, + { + "epoch": 0.281447345963475, + "grad_norm": 0.5181131229758502, + "learning_rate": 2.87489332650623e-05, + "loss": 0.5582, + "num_tokens": 155181714.0, + "step": 1649 + }, + { + "epoch": 0.2816180235535074, + "grad_norm": 0.5240511077881997, + "learning_rate": 2.8742106161461004e-05, + "loss": 0.5725, + "num_tokens": 155261159.0, + "step": 1650 + }, + { + "epoch": 0.2817887011435399, + "grad_norm": 0.544777848444035, + "learning_rate": 2.8735279057859708e-05, + "loss": 0.6623, + "num_tokens": 155361701.0, + "step": 1651 + }, + { + "epoch": 0.2819593787335723, + "grad_norm": 0.5107417479074698, + "learning_rate": 2.8728451954258408e-05, + "loss": 0.598, + "num_tokens": 155445862.0, + "step": 1652 + }, + { + "epoch": 0.2821300563236047, + "grad_norm": 0.5033343261024319, + "learning_rate": 2.8721624850657112e-05, + "loss": 0.6627, + "num_tokens": 155541762.0, + "step": 1653 + }, + { + "epoch": 0.28230073391363714, + "grad_norm": 0.5756645373110948, + "learning_rate": 2.8714797747055816e-05, + "loss": 0.6597, + "num_tokens": 155613784.0, + "step": 1654 + }, + { + "epoch": 0.28247141150366956, + "grad_norm": 0.48027645387004453, + "learning_rate": 2.8707970643454516e-05, + "loss": 0.5671, + "num_tokens": 155722435.0, + "step": 1655 + }, + { + "epoch": 0.282642089093702, + "grad_norm": 0.5596435407868025, + "learning_rate": 2.8701143539853216e-05, + "loss": 0.6154, + "num_tokens": 155799709.0, + "step": 1656 + }, + { + "epoch": 0.28281276668373445, + "grad_norm": 0.49442078409359674, + "learning_rate": 2.869431643625192e-05, + "loss": 0.6084, + "num_tokens": 155900850.0, + "step": 1657 + }, + { + "epoch": 0.28298344427376687, + "grad_norm": 0.5557784292396412, + "learning_rate": 2.8687489332650624e-05, + "loss": 0.6328, + "num_tokens": 155990864.0, + "step": 1658 + }, + { + "epoch": 0.2831541218637993, + "grad_norm": 0.5240447377676549, + "learning_rate": 2.8680662229049328e-05, + "loss": 0.6037, + "num_tokens": 156075786.0, + "step": 1659 + }, + { + "epoch": 0.2833247994538317, + "grad_norm": 0.4661683862697689, + "learning_rate": 2.867383512544803e-05, + "loss": 0.5345, + "num_tokens": 156179294.0, + "step": 1660 + }, + { + "epoch": 0.28349547704386413, + "grad_norm": 0.5150501721970029, + "learning_rate": 2.8667008021846735e-05, + "loss": 0.5764, + "num_tokens": 156260846.0, + "step": 1661 + }, + { + "epoch": 0.28366615463389655, + "grad_norm": 0.48104035304427045, + "learning_rate": 2.8660180918245435e-05, + "loss": 0.6085, + "num_tokens": 156366999.0, + "step": 1662 + }, + { + "epoch": 0.283836832223929, + "grad_norm": 0.5068251463953185, + "learning_rate": 2.865335381464414e-05, + "loss": 0.6772, + "num_tokens": 156465835.0, + "step": 1663 + }, + { + "epoch": 0.28400750981396145, + "grad_norm": 0.5614672302892944, + "learning_rate": 2.8646526711042843e-05, + "loss": 0.6859, + "num_tokens": 156559445.0, + "step": 1664 + }, + { + "epoch": 0.28417818740399386, + "grad_norm": 0.5367991551876914, + "learning_rate": 2.8639699607441547e-05, + "loss": 0.6599, + "num_tokens": 156655568.0, + "step": 1665 + }, + { + "epoch": 0.2843488649940263, + "grad_norm": 0.5254668232814469, + "learning_rate": 2.863287250384025e-05, + "loss": 0.6808, + "num_tokens": 156753875.0, + "step": 1666 + }, + { + "epoch": 0.2845195425840587, + "grad_norm": 0.5191920314414764, + "learning_rate": 2.8626045400238954e-05, + "loss": 0.6822, + "num_tokens": 156858428.0, + "step": 1667 + }, + { + "epoch": 0.2846902201740911, + "grad_norm": 0.5259457940245835, + "learning_rate": 2.861921829663765e-05, + "loss": 0.6656, + "num_tokens": 156960292.0, + "step": 1668 + }, + { + "epoch": 0.28486089776412354, + "grad_norm": 0.510430535102296, + "learning_rate": 2.8612391193036355e-05, + "loss": 0.5978, + "num_tokens": 157062927.0, + "step": 1669 + }, + { + "epoch": 0.285031575354156, + "grad_norm": 0.513103908589769, + "learning_rate": 2.860556408943506e-05, + "loss": 0.5789, + "num_tokens": 157151695.0, + "step": 1670 + }, + { + "epoch": 0.28520225294418844, + "grad_norm": 0.5703710136604314, + "learning_rate": 2.8598736985833763e-05, + "loss": 0.603, + "num_tokens": 157216206.0, + "step": 1671 + }, + { + "epoch": 0.28537293053422086, + "grad_norm": 0.5179182189358502, + "learning_rate": 2.8591909882232466e-05, + "loss": 0.6139, + "num_tokens": 157303245.0, + "step": 1672 + }, + { + "epoch": 0.2855436081242533, + "grad_norm": 0.5195474858227033, + "learning_rate": 2.8585082778631167e-05, + "loss": 0.619, + "num_tokens": 157396679.0, + "step": 1673 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.5021110766087584, + "learning_rate": 2.857825567502987e-05, + "loss": 0.6013, + "num_tokens": 157495665.0, + "step": 1674 + }, + { + "epoch": 0.2858849633043181, + "grad_norm": 0.5322598361496493, + "learning_rate": 2.8571428571428574e-05, + "loss": 0.7452, + "num_tokens": 157603297.0, + "step": 1675 + }, + { + "epoch": 0.2860556408943506, + "grad_norm": 0.5238178421148074, + "learning_rate": 2.8564601467827278e-05, + "loss": 0.6383, + "num_tokens": 157692785.0, + "step": 1676 + }, + { + "epoch": 0.286226318484383, + "grad_norm": 0.5932445949990176, + "learning_rate": 2.8557774364225982e-05, + "loss": 0.6085, + "num_tokens": 157761519.0, + "step": 1677 + }, + { + "epoch": 0.28639699607441543, + "grad_norm": 0.4871736548089066, + "learning_rate": 2.8550947260624686e-05, + "loss": 0.5721, + "num_tokens": 157856041.0, + "step": 1678 + }, + { + "epoch": 0.28656767366444785, + "grad_norm": 0.49812014478642047, + "learning_rate": 2.8544120157023386e-05, + "loss": 0.7011, + "num_tokens": 157962439.0, + "step": 1679 + }, + { + "epoch": 0.2867383512544803, + "grad_norm": 0.49239755298640253, + "learning_rate": 2.8537293053422086e-05, + "loss": 0.6361, + "num_tokens": 158068439.0, + "step": 1680 + }, + { + "epoch": 0.2869090288445127, + "grad_norm": 0.5172606236576057, + "learning_rate": 2.853046594982079e-05, + "loss": 0.6616, + "num_tokens": 158161668.0, + "step": 1681 + }, + { + "epoch": 0.28707970643454517, + "grad_norm": 0.578086344953046, + "learning_rate": 2.8523638846219494e-05, + "loss": 0.5594, + "num_tokens": 158224030.0, + "step": 1682 + }, + { + "epoch": 0.2872503840245776, + "grad_norm": 0.5476552532679981, + "learning_rate": 2.8516811742618194e-05, + "loss": 0.605, + "num_tokens": 158305611.0, + "step": 1683 + }, + { + "epoch": 0.28742106161461, + "grad_norm": 0.5877391859469915, + "learning_rate": 2.8509984639016898e-05, + "loss": 0.6556, + "num_tokens": 158384269.0, + "step": 1684 + }, + { + "epoch": 0.2875917392046424, + "grad_norm": 0.4964676365156057, + "learning_rate": 2.8503157535415602e-05, + "loss": 0.6009, + "num_tokens": 158498931.0, + "step": 1685 + }, + { + "epoch": 0.28776241679467485, + "grad_norm": 0.6283381844791843, + "learning_rate": 2.8496330431814305e-05, + "loss": 0.731, + "num_tokens": 158587368.0, + "step": 1686 + }, + { + "epoch": 0.28793309438470727, + "grad_norm": 0.5137682436639205, + "learning_rate": 2.848950332821301e-05, + "loss": 0.5909, + "num_tokens": 158676182.0, + "step": 1687 + }, + { + "epoch": 0.28810377197473974, + "grad_norm": 0.4933016485529538, + "learning_rate": 2.8482676224611713e-05, + "loss": 0.6119, + "num_tokens": 158787762.0, + "step": 1688 + }, + { + "epoch": 0.28827444956477216, + "grad_norm": 0.507531991771271, + "learning_rate": 2.8475849121010413e-05, + "loss": 0.5726, + "num_tokens": 158876663.0, + "step": 1689 + }, + { + "epoch": 0.2884451271548046, + "grad_norm": 0.5510543538142747, + "learning_rate": 2.8469022017409117e-05, + "loss": 0.639, + "num_tokens": 158966122.0, + "step": 1690 + }, + { + "epoch": 0.288615804744837, + "grad_norm": 0.5158136775843158, + "learning_rate": 2.846219491380782e-05, + "loss": 0.6815, + "num_tokens": 159066120.0, + "step": 1691 + }, + { + "epoch": 0.2887864823348694, + "grad_norm": 0.4730053850427305, + "learning_rate": 2.845536781020652e-05, + "loss": 0.5958, + "num_tokens": 159177435.0, + "step": 1692 + }, + { + "epoch": 0.28895715992490184, + "grad_norm": 0.5321961516186255, + "learning_rate": 2.844854070660522e-05, + "loss": 0.6029, + "num_tokens": 159269408.0, + "step": 1693 + }, + { + "epoch": 0.2891278375149343, + "grad_norm": 0.4954207915356764, + "learning_rate": 2.8441713603003925e-05, + "loss": 0.6317, + "num_tokens": 159380272.0, + "step": 1694 + }, + { + "epoch": 0.28929851510496674, + "grad_norm": 0.5066299431551413, + "learning_rate": 2.843488649940263e-05, + "loss": 0.6033, + "num_tokens": 159478240.0, + "step": 1695 + }, + { + "epoch": 0.28946919269499916, + "grad_norm": 0.5071473342845899, + "learning_rate": 2.8428059395801333e-05, + "loss": 0.6482, + "num_tokens": 159571957.0, + "step": 1696 + }, + { + "epoch": 0.2896398702850316, + "grad_norm": 0.5243815368664668, + "learning_rate": 2.8421232292200037e-05, + "loss": 0.6234, + "num_tokens": 159663756.0, + "step": 1697 + }, + { + "epoch": 0.289810547875064, + "grad_norm": 0.557916779988723, + "learning_rate": 2.841440518859874e-05, + "loss": 0.6261, + "num_tokens": 159748831.0, + "step": 1698 + }, + { + "epoch": 0.2899812254650964, + "grad_norm": 0.5932982145420623, + "learning_rate": 2.840757808499744e-05, + "loss": 0.6271, + "num_tokens": 159832680.0, + "step": 1699 + }, + { + "epoch": 0.29015190305512883, + "grad_norm": 0.5310129591815193, + "learning_rate": 2.8400750981396145e-05, + "loss": 0.6758, + "num_tokens": 159921877.0, + "step": 1700 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.506480867611394, + "learning_rate": 2.839392387779485e-05, + "loss": 0.5958, + "num_tokens": 160024737.0, + "step": 1701 + }, + { + "epoch": 0.29049325823519373, + "grad_norm": 0.5780944752025957, + "learning_rate": 2.8387096774193552e-05, + "loss": 0.678, + "num_tokens": 160099760.0, + "step": 1702 + }, + { + "epoch": 0.29066393582522615, + "grad_norm": 0.5683723309708199, + "learning_rate": 2.8380269670592256e-05, + "loss": 0.7047, + "num_tokens": 160195603.0, + "step": 1703 + }, + { + "epoch": 0.29083461341525857, + "grad_norm": 0.4827392821323225, + "learning_rate": 2.837344256699096e-05, + "loss": 0.6839, + "num_tokens": 160310590.0, + "step": 1704 + }, + { + "epoch": 0.291005291005291, + "grad_norm": 0.48061717873652443, + "learning_rate": 2.8366615463389657e-05, + "loss": 0.5884, + "num_tokens": 160421649.0, + "step": 1705 + }, + { + "epoch": 0.2911759685953234, + "grad_norm": 0.5681248083317467, + "learning_rate": 2.835978835978836e-05, + "loss": 0.652, + "num_tokens": 160502410.0, + "step": 1706 + }, + { + "epoch": 0.2913466461853559, + "grad_norm": 0.4579952560179596, + "learning_rate": 2.8352961256187064e-05, + "loss": 0.6339, + "num_tokens": 160612592.0, + "step": 1707 + }, + { + "epoch": 0.2915173237753883, + "grad_norm": 0.5971478164209244, + "learning_rate": 2.8346134152585768e-05, + "loss": 0.5949, + "num_tokens": 160672167.0, + "step": 1708 + }, + { + "epoch": 0.2916880013654207, + "grad_norm": 0.49110699403490465, + "learning_rate": 2.833930704898447e-05, + "loss": 0.7011, + "num_tokens": 160788663.0, + "step": 1709 + }, + { + "epoch": 0.29185867895545314, + "grad_norm": 0.5124273592442077, + "learning_rate": 2.8332479945383172e-05, + "loss": 0.6277, + "num_tokens": 160883645.0, + "step": 1710 + }, + { + "epoch": 0.29202935654548556, + "grad_norm": 0.5421135968999488, + "learning_rate": 2.8325652841781876e-05, + "loss": 0.7712, + "num_tokens": 160995946.0, + "step": 1711 + }, + { + "epoch": 0.292200034135518, + "grad_norm": 0.5142972450794154, + "learning_rate": 2.831882573818058e-05, + "loss": 0.5976, + "num_tokens": 161091724.0, + "step": 1712 + }, + { + "epoch": 0.29237071172555046, + "grad_norm": 0.49630007121143865, + "learning_rate": 2.8311998634579283e-05, + "loss": 0.626, + "num_tokens": 161202128.0, + "step": 1713 + }, + { + "epoch": 0.2925413893155829, + "grad_norm": 0.4705735442102008, + "learning_rate": 2.8305171530977987e-05, + "loss": 0.5835, + "num_tokens": 161311836.0, + "step": 1714 + }, + { + "epoch": 0.2927120669056153, + "grad_norm": 0.511967959096828, + "learning_rate": 2.829834442737669e-05, + "loss": 0.6065, + "num_tokens": 161405879.0, + "step": 1715 + }, + { + "epoch": 0.2928827444956477, + "grad_norm": 0.4803112931376538, + "learning_rate": 2.829151732377539e-05, + "loss": 0.5966, + "num_tokens": 161531599.0, + "step": 1716 + }, + { + "epoch": 0.29305342208568014, + "grad_norm": 0.551101296608276, + "learning_rate": 2.828469022017409e-05, + "loss": 0.6409, + "num_tokens": 161616218.0, + "step": 1717 + }, + { + "epoch": 0.29322409967571256, + "grad_norm": 0.46614633528149774, + "learning_rate": 2.8277863116572795e-05, + "loss": 0.6368, + "num_tokens": 161734591.0, + "step": 1718 + }, + { + "epoch": 0.29339477726574503, + "grad_norm": 0.5118990426896141, + "learning_rate": 2.82710360129715e-05, + "loss": 0.5819, + "num_tokens": 161812859.0, + "step": 1719 + }, + { + "epoch": 0.29356545485577745, + "grad_norm": 0.4843980942700516, + "learning_rate": 2.82642089093702e-05, + "loss": 0.694, + "num_tokens": 161930940.0, + "step": 1720 + }, + { + "epoch": 0.29373613244580987, + "grad_norm": 0.5395507708966872, + "learning_rate": 2.8257381805768903e-05, + "loss": 0.5907, + "num_tokens": 162018219.0, + "step": 1721 + }, + { + "epoch": 0.2939068100358423, + "grad_norm": 0.5206632544226754, + "learning_rate": 2.8250554702167607e-05, + "loss": 0.6226, + "num_tokens": 162109290.0, + "step": 1722 + }, + { + "epoch": 0.2940774876258747, + "grad_norm": 0.4630002062000658, + "learning_rate": 2.824372759856631e-05, + "loss": 0.5932, + "num_tokens": 162228620.0, + "step": 1723 + }, + { + "epoch": 0.29424816521590713, + "grad_norm": 0.5268639520758155, + "learning_rate": 2.8236900494965015e-05, + "loss": 0.599, + "num_tokens": 162311846.0, + "step": 1724 + }, + { + "epoch": 0.2944188428059396, + "grad_norm": 0.5867923177223241, + "learning_rate": 2.823007339136372e-05, + "loss": 0.6144, + "num_tokens": 162390419.0, + "step": 1725 + }, + { + "epoch": 0.294589520395972, + "grad_norm": 0.5484132225371406, + "learning_rate": 2.822324628776242e-05, + "loss": 0.5794, + "num_tokens": 162459637.0, + "step": 1726 + }, + { + "epoch": 0.29476019798600445, + "grad_norm": 0.5195203589692147, + "learning_rate": 2.8216419184161122e-05, + "loss": 0.532, + "num_tokens": 162533610.0, + "step": 1727 + }, + { + "epoch": 0.29493087557603687, + "grad_norm": 0.5259597587426807, + "learning_rate": 2.8209592080559826e-05, + "loss": 0.6817, + "num_tokens": 162638222.0, + "step": 1728 + }, + { + "epoch": 0.2951015531660693, + "grad_norm": 0.5455422681391826, + "learning_rate": 2.8202764976958527e-05, + "loss": 0.5537, + "num_tokens": 162736856.0, + "step": 1729 + }, + { + "epoch": 0.2952722307561017, + "grad_norm": 0.531586259501683, + "learning_rate": 2.8195937873357227e-05, + "loss": 0.6022, + "num_tokens": 162831312.0, + "step": 1730 + }, + { + "epoch": 0.2954429083461341, + "grad_norm": 0.5425699455499043, + "learning_rate": 2.818911076975593e-05, + "loss": 0.5998, + "num_tokens": 162919865.0, + "step": 1731 + }, + { + "epoch": 0.2956135859361666, + "grad_norm": 0.4851087284761314, + "learning_rate": 2.8182283666154635e-05, + "loss": 0.5698, + "num_tokens": 163028135.0, + "step": 1732 + }, + { + "epoch": 0.295784263526199, + "grad_norm": 0.5273040486282454, + "learning_rate": 2.8175456562553338e-05, + "loss": 0.6194, + "num_tokens": 163122336.0, + "step": 1733 + }, + { + "epoch": 0.29595494111623144, + "grad_norm": 0.5436608886422389, + "learning_rate": 2.8168629458952042e-05, + "loss": 0.6251, + "num_tokens": 163211168.0, + "step": 1734 + }, + { + "epoch": 0.29612561870626386, + "grad_norm": 0.539230046357182, + "learning_rate": 2.8161802355350746e-05, + "loss": 0.6676, + "num_tokens": 163310473.0, + "step": 1735 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.4942881181501553, + "learning_rate": 2.815497525174945e-05, + "loss": 0.6275, + "num_tokens": 163415592.0, + "step": 1736 + }, + { + "epoch": 0.2964669738863287, + "grad_norm": 0.515105433581793, + "learning_rate": 2.814814814814815e-05, + "loss": 0.6048, + "num_tokens": 163511572.0, + "step": 1737 + }, + { + "epoch": 0.2966376514763612, + "grad_norm": 0.49012078976524454, + "learning_rate": 2.8141321044546854e-05, + "loss": 0.5594, + "num_tokens": 163598711.0, + "step": 1738 + }, + { + "epoch": 0.2968083290663936, + "grad_norm": 0.5841856298991178, + "learning_rate": 2.8134493940945557e-05, + "loss": 0.588, + "num_tokens": 163680671.0, + "step": 1739 + }, + { + "epoch": 0.296979006656426, + "grad_norm": 0.5567756181115766, + "learning_rate": 2.812766683734426e-05, + "loss": 0.6711, + "num_tokens": 163779349.0, + "step": 1740 + }, + { + "epoch": 0.29714968424645843, + "grad_norm": 0.49025007706922386, + "learning_rate": 2.8120839733742965e-05, + "loss": 0.5774, + "num_tokens": 163887779.0, + "step": 1741 + }, + { + "epoch": 0.29732036183649085, + "grad_norm": 0.4672875897886219, + "learning_rate": 2.8114012630141662e-05, + "loss": 0.6276, + "num_tokens": 164002007.0, + "step": 1742 + }, + { + "epoch": 0.2974910394265233, + "grad_norm": 0.47535410824303564, + "learning_rate": 2.8107185526540366e-05, + "loss": 0.6427, + "num_tokens": 164130073.0, + "step": 1743 + }, + { + "epoch": 0.29766171701655575, + "grad_norm": 0.5357808265527796, + "learning_rate": 2.810035842293907e-05, + "loss": 0.6159, + "num_tokens": 164209745.0, + "step": 1744 + }, + { + "epoch": 0.29783239460658817, + "grad_norm": 0.563139883766903, + "learning_rate": 2.8093531319337773e-05, + "loss": 0.6377, + "num_tokens": 164285529.0, + "step": 1745 + }, + { + "epoch": 0.2980030721966206, + "grad_norm": 0.5644131465937604, + "learning_rate": 2.8086704215736477e-05, + "loss": 0.619, + "num_tokens": 164355854.0, + "step": 1746 + }, + { + "epoch": 0.298173749786653, + "grad_norm": 0.522785443611465, + "learning_rate": 2.8079877112135177e-05, + "loss": 0.6134, + "num_tokens": 164442107.0, + "step": 1747 + }, + { + "epoch": 0.29834442737668543, + "grad_norm": 0.48210802971826366, + "learning_rate": 2.807305000853388e-05, + "loss": 0.5715, + "num_tokens": 164544873.0, + "step": 1748 + }, + { + "epoch": 0.29851510496671785, + "grad_norm": 0.4995027462352355, + "learning_rate": 2.8066222904932585e-05, + "loss": 0.5512, + "num_tokens": 164636742.0, + "step": 1749 + }, + { + "epoch": 0.2986857825567503, + "grad_norm": 0.4429018765768066, + "learning_rate": 2.805939580133129e-05, + "loss": 0.6401, + "num_tokens": 164775013.0, + "step": 1750 + }, + { + "epoch": 0.29885646014678274, + "grad_norm": 0.505423607327285, + "learning_rate": 2.8052568697729992e-05, + "loss": 0.5987, + "num_tokens": 164868240.0, + "step": 1751 + }, + { + "epoch": 0.29902713773681516, + "grad_norm": 0.49811876932705607, + "learning_rate": 2.8045741594128696e-05, + "loss": 0.5721, + "num_tokens": 164955406.0, + "step": 1752 + }, + { + "epoch": 0.2991978153268476, + "grad_norm": 0.511432564671813, + "learning_rate": 2.8038914490527397e-05, + "loss": 0.7195, + "num_tokens": 165066770.0, + "step": 1753 + }, + { + "epoch": 0.29936849291688, + "grad_norm": 0.6252294370394783, + "learning_rate": 2.8032087386926097e-05, + "loss": 0.6213, + "num_tokens": 165128255.0, + "step": 1754 + }, + { + "epoch": 0.2995391705069124, + "grad_norm": 0.5416844981134004, + "learning_rate": 2.80252602833248e-05, + "loss": 0.5556, + "num_tokens": 165213571.0, + "step": 1755 + }, + { + "epoch": 0.2997098480969449, + "grad_norm": 0.49429383458702514, + "learning_rate": 2.8018433179723505e-05, + "loss": 0.5651, + "num_tokens": 165304153.0, + "step": 1756 + }, + { + "epoch": 0.2998805256869773, + "grad_norm": 0.5334611689704962, + "learning_rate": 2.8011606076122205e-05, + "loss": 0.6676, + "num_tokens": 165405184.0, + "step": 1757 + }, + { + "epoch": 0.30005120327700974, + "grad_norm": 0.5090027833395734, + "learning_rate": 2.800477897252091e-05, + "loss": 0.6107, + "num_tokens": 165508278.0, + "step": 1758 + }, + { + "epoch": 0.30022188086704216, + "grad_norm": 0.47394020459295544, + "learning_rate": 2.7997951868919612e-05, + "loss": 0.6585, + "num_tokens": 165639377.0, + "step": 1759 + }, + { + "epoch": 0.3003925584570746, + "grad_norm": 0.6075424944060056, + "learning_rate": 2.7991124765318316e-05, + "loss": 0.701, + "num_tokens": 165710280.0, + "step": 1760 + }, + { + "epoch": 0.300563236047107, + "grad_norm": 0.49848372773680205, + "learning_rate": 2.798429766171702e-05, + "loss": 0.7296, + "num_tokens": 165833182.0, + "step": 1761 + }, + { + "epoch": 0.3007339136371394, + "grad_norm": 0.5334898970042283, + "learning_rate": 2.7977470558115724e-05, + "loss": 0.623, + "num_tokens": 165923095.0, + "step": 1762 + }, + { + "epoch": 0.3009045912271719, + "grad_norm": 0.4657300758711079, + "learning_rate": 2.7970643454514424e-05, + "loss": 0.6255, + "num_tokens": 166032595.0, + "step": 1763 + }, + { + "epoch": 0.3010752688172043, + "grad_norm": 0.5022585808641511, + "learning_rate": 2.7963816350913128e-05, + "loss": 0.6178, + "num_tokens": 166122687.0, + "step": 1764 + }, + { + "epoch": 0.30124594640723673, + "grad_norm": 0.48887779425893924, + "learning_rate": 2.795698924731183e-05, + "loss": 0.555, + "num_tokens": 166211277.0, + "step": 1765 + }, + { + "epoch": 0.30141662399726915, + "grad_norm": 0.48995370806412936, + "learning_rate": 2.7950162143710535e-05, + "loss": 0.582, + "num_tokens": 166303431.0, + "step": 1766 + }, + { + "epoch": 0.30158730158730157, + "grad_norm": 0.5055571019418379, + "learning_rate": 2.7943335040109232e-05, + "loss": 0.6334, + "num_tokens": 166414696.0, + "step": 1767 + }, + { + "epoch": 0.301757979177334, + "grad_norm": 0.508522098259203, + "learning_rate": 2.7936507936507936e-05, + "loss": 0.6011, + "num_tokens": 166504754.0, + "step": 1768 + }, + { + "epoch": 0.30192865676736647, + "grad_norm": 0.5366746232896207, + "learning_rate": 2.792968083290664e-05, + "loss": 0.6211, + "num_tokens": 166586524.0, + "step": 1769 + }, + { + "epoch": 0.3020993343573989, + "grad_norm": 0.5243274878034984, + "learning_rate": 2.7922853729305344e-05, + "loss": 0.6564, + "num_tokens": 166690884.0, + "step": 1770 + }, + { + "epoch": 0.3022700119474313, + "grad_norm": 0.5424351598875291, + "learning_rate": 2.7916026625704047e-05, + "loss": 0.7154, + "num_tokens": 166797631.0, + "step": 1771 + }, + { + "epoch": 0.3024406895374637, + "grad_norm": 0.4696572777072118, + "learning_rate": 2.790919952210275e-05, + "loss": 0.5983, + "num_tokens": 166910516.0, + "step": 1772 + }, + { + "epoch": 0.30261136712749614, + "grad_norm": 0.4964059848554195, + "learning_rate": 2.7902372418501455e-05, + "loss": 0.6756, + "num_tokens": 167012673.0, + "step": 1773 + }, + { + "epoch": 0.30278204471752856, + "grad_norm": 0.5060949723003197, + "learning_rate": 2.7895545314900155e-05, + "loss": 0.5822, + "num_tokens": 167090869.0, + "step": 1774 + }, + { + "epoch": 0.30295272230756104, + "grad_norm": 0.47902710469794624, + "learning_rate": 2.788871821129886e-05, + "loss": 0.6682, + "num_tokens": 167205260.0, + "step": 1775 + }, + { + "epoch": 0.30312339989759346, + "grad_norm": 0.5196875042659808, + "learning_rate": 2.7881891107697563e-05, + "loss": 0.6703, + "num_tokens": 167304997.0, + "step": 1776 + }, + { + "epoch": 0.3032940774876259, + "grad_norm": 0.5074622398663027, + "learning_rate": 2.7875064004096267e-05, + "loss": 0.6281, + "num_tokens": 167404134.0, + "step": 1777 + }, + { + "epoch": 0.3034647550776583, + "grad_norm": 0.5524582976347004, + "learning_rate": 2.786823690049497e-05, + "loss": 0.6749, + "num_tokens": 167478610.0, + "step": 1778 + }, + { + "epoch": 0.3036354326676907, + "grad_norm": 0.527367523083142, + "learning_rate": 2.7861409796893667e-05, + "loss": 0.6039, + "num_tokens": 167557600.0, + "step": 1779 + }, + { + "epoch": 0.30380611025772314, + "grad_norm": 0.49043165250995246, + "learning_rate": 2.785458269329237e-05, + "loss": 0.5935, + "num_tokens": 167650759.0, + "step": 1780 + }, + { + "epoch": 0.3039767878477556, + "grad_norm": 0.47423846137432085, + "learning_rate": 2.7847755589691075e-05, + "loss": 0.571, + "num_tokens": 167757325.0, + "step": 1781 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 0.49128981470440275, + "learning_rate": 2.784092848608978e-05, + "loss": 0.7346, + "num_tokens": 167873590.0, + "step": 1782 + }, + { + "epoch": 0.30431814302782045, + "grad_norm": 0.5108225845792723, + "learning_rate": 2.7834101382488482e-05, + "loss": 0.584, + "num_tokens": 167961122.0, + "step": 1783 + }, + { + "epoch": 0.3044888206178529, + "grad_norm": 0.5006221370161872, + "learning_rate": 2.7827274278887183e-05, + "loss": 0.5814, + "num_tokens": 168042945.0, + "step": 1784 + }, + { + "epoch": 0.3046594982078853, + "grad_norm": 0.49877527306263025, + "learning_rate": 2.7820447175285887e-05, + "loss": 0.6286, + "num_tokens": 168131153.0, + "step": 1785 + }, + { + "epoch": 0.3048301757979177, + "grad_norm": 0.5504116788321519, + "learning_rate": 2.781362007168459e-05, + "loss": 0.6304, + "num_tokens": 168203131.0, + "step": 1786 + }, + { + "epoch": 0.3050008533879502, + "grad_norm": 0.5086561188527078, + "learning_rate": 2.7806792968083294e-05, + "loss": 0.577, + "num_tokens": 168290116.0, + "step": 1787 + }, + { + "epoch": 0.3051715309779826, + "grad_norm": 0.46167745140054206, + "learning_rate": 2.7799965864481998e-05, + "loss": 0.5725, + "num_tokens": 168396860.0, + "step": 1788 + }, + { + "epoch": 0.305342208568015, + "grad_norm": 0.4852151896015887, + "learning_rate": 2.77931387608807e-05, + "loss": 0.544, + "num_tokens": 168488195.0, + "step": 1789 + }, + { + "epoch": 0.30551288615804745, + "grad_norm": 0.5100145069417077, + "learning_rate": 2.7786311657279402e-05, + "loss": 0.5671, + "num_tokens": 168568888.0, + "step": 1790 + }, + { + "epoch": 0.30568356374807987, + "grad_norm": 0.5437459247208325, + "learning_rate": 2.7779484553678102e-05, + "loss": 0.6318, + "num_tokens": 168655732.0, + "step": 1791 + }, + { + "epoch": 0.3058542413381123, + "grad_norm": 0.47594616249011495, + "learning_rate": 2.7772657450076806e-05, + "loss": 0.6756, + "num_tokens": 168768986.0, + "step": 1792 + }, + { + "epoch": 0.3060249189281447, + "grad_norm": 0.4838223137868776, + "learning_rate": 2.776583034647551e-05, + "loss": 0.5809, + "num_tokens": 168867051.0, + "step": 1793 + }, + { + "epoch": 0.3061955965181772, + "grad_norm": 0.43940993800862016, + "learning_rate": 2.775900324287421e-05, + "loss": 0.5336, + "num_tokens": 168987007.0, + "step": 1794 + }, + { + "epoch": 0.3063662741082096, + "grad_norm": 0.4941808717868668, + "learning_rate": 2.7752176139272914e-05, + "loss": 0.5371, + "num_tokens": 169069278.0, + "step": 1795 + }, + { + "epoch": 0.306536951698242, + "grad_norm": 0.4735302766967219, + "learning_rate": 2.7745349035671618e-05, + "loss": 0.5522, + "num_tokens": 169170145.0, + "step": 1796 + }, + { + "epoch": 0.30670762928827444, + "grad_norm": 0.5475719184359104, + "learning_rate": 2.773852193207032e-05, + "loss": 0.6598, + "num_tokens": 169264733.0, + "step": 1797 + }, + { + "epoch": 0.30687830687830686, + "grad_norm": 0.5457466617490664, + "learning_rate": 2.7731694828469025e-05, + "loss": 0.5895, + "num_tokens": 169349954.0, + "step": 1798 + }, + { + "epoch": 0.3070489844683393, + "grad_norm": 0.5180067423575152, + "learning_rate": 2.772486772486773e-05, + "loss": 0.5963, + "num_tokens": 169448899.0, + "step": 1799 + }, + { + "epoch": 0.30721966205837176, + "grad_norm": 0.5286605105348603, + "learning_rate": 2.771804062126643e-05, + "loss": 0.7339, + "num_tokens": 169573490.0, + "step": 1800 + }, + { + "epoch": 0.3073903396484042, + "grad_norm": 0.4709941108900789, + "learning_rate": 2.7711213517665133e-05, + "loss": 0.6428, + "num_tokens": 169707545.0, + "step": 1801 + }, + { + "epoch": 0.3075610172384366, + "grad_norm": 0.5077667289457547, + "learning_rate": 2.7704386414063837e-05, + "loss": 0.546, + "num_tokens": 169801017.0, + "step": 1802 + }, + { + "epoch": 0.307731694828469, + "grad_norm": 0.5481105061776905, + "learning_rate": 2.769755931046254e-05, + "loss": 0.6649, + "num_tokens": 169883887.0, + "step": 1803 + }, + { + "epoch": 0.30790237241850144, + "grad_norm": 0.5056467747842829, + "learning_rate": 2.769073220686124e-05, + "loss": 0.6004, + "num_tokens": 169971722.0, + "step": 1804 + }, + { + "epoch": 0.30807305000853386, + "grad_norm": 0.47282580956943737, + "learning_rate": 2.768390510325994e-05, + "loss": 0.621, + "num_tokens": 170082074.0, + "step": 1805 + }, + { + "epoch": 0.30824372759856633, + "grad_norm": 0.49349678664063634, + "learning_rate": 2.7677077999658645e-05, + "loss": 0.6435, + "num_tokens": 170193314.0, + "step": 1806 + }, + { + "epoch": 0.30841440518859875, + "grad_norm": 0.45846294702442064, + "learning_rate": 2.767025089605735e-05, + "loss": 0.5564, + "num_tokens": 170293087.0, + "step": 1807 + }, + { + "epoch": 0.30858508277863117, + "grad_norm": 0.5064441654400016, + "learning_rate": 2.7663423792456053e-05, + "loss": 0.6201, + "num_tokens": 170380367.0, + "step": 1808 + }, + { + "epoch": 0.3087557603686636, + "grad_norm": 0.48828113454236327, + "learning_rate": 2.7656596688854757e-05, + "loss": 0.6235, + "num_tokens": 170478458.0, + "step": 1809 + }, + { + "epoch": 0.308926437958696, + "grad_norm": 0.47972064749710336, + "learning_rate": 2.764976958525346e-05, + "loss": 0.6483, + "num_tokens": 170585398.0, + "step": 1810 + }, + { + "epoch": 0.30909711554872843, + "grad_norm": 0.5682547243955263, + "learning_rate": 2.764294248165216e-05, + "loss": 0.6849, + "num_tokens": 170675909.0, + "step": 1811 + }, + { + "epoch": 0.3092677931387609, + "grad_norm": 0.5415239075211329, + "learning_rate": 2.7636115378050864e-05, + "loss": 0.5571, + "num_tokens": 170745708.0, + "step": 1812 + }, + { + "epoch": 0.3094384707287933, + "grad_norm": 0.5254068525871738, + "learning_rate": 2.7629288274449568e-05, + "loss": 0.5868, + "num_tokens": 170826832.0, + "step": 1813 + }, + { + "epoch": 0.30960914831882574, + "grad_norm": 0.48323020000217637, + "learning_rate": 2.7622461170848272e-05, + "loss": 0.6276, + "num_tokens": 170931437.0, + "step": 1814 + }, + { + "epoch": 0.30977982590885816, + "grad_norm": 0.4853153131676094, + "learning_rate": 2.7615634067246976e-05, + "loss": 0.5623, + "num_tokens": 171030193.0, + "step": 1815 + }, + { + "epoch": 0.3099505034988906, + "grad_norm": 0.6064859646426419, + "learning_rate": 2.7608806963645673e-05, + "loss": 0.7596, + "num_tokens": 171124878.0, + "step": 1816 + }, + { + "epoch": 0.310121181088923, + "grad_norm": 0.5826341418542528, + "learning_rate": 2.7601979860044376e-05, + "loss": 0.552, + "num_tokens": 171190867.0, + "step": 1817 + }, + { + "epoch": 0.3102918586789555, + "grad_norm": 0.50782550815946, + "learning_rate": 2.759515275644308e-05, + "loss": 0.6346, + "num_tokens": 171293218.0, + "step": 1818 + }, + { + "epoch": 0.3104625362689879, + "grad_norm": 0.5398105581302006, + "learning_rate": 2.7588325652841784e-05, + "loss": 0.6782, + "num_tokens": 171395575.0, + "step": 1819 + }, + { + "epoch": 0.3106332138590203, + "grad_norm": 0.5317969320531754, + "learning_rate": 2.7581498549240488e-05, + "loss": 0.696, + "num_tokens": 171484776.0, + "step": 1820 + }, + { + "epoch": 0.31080389144905274, + "grad_norm": 0.5435586510883168, + "learning_rate": 2.7574671445639188e-05, + "loss": 0.5437, + "num_tokens": 171595611.0, + "step": 1821 + }, + { + "epoch": 0.31097456903908516, + "grad_norm": 0.5027587193175057, + "learning_rate": 2.7567844342037892e-05, + "loss": 0.6022, + "num_tokens": 171700507.0, + "step": 1822 + }, + { + "epoch": 0.3111452466291176, + "grad_norm": 0.5312511682260234, + "learning_rate": 2.7561017238436596e-05, + "loss": 0.6686, + "num_tokens": 171789174.0, + "step": 1823 + }, + { + "epoch": 0.31131592421915005, + "grad_norm": 0.516660746391616, + "learning_rate": 2.75541901348353e-05, + "loss": 0.6868, + "num_tokens": 171884448.0, + "step": 1824 + }, + { + "epoch": 0.3114866018091825, + "grad_norm": 0.5437733512185642, + "learning_rate": 2.7547363031234003e-05, + "loss": 0.6057, + "num_tokens": 171973042.0, + "step": 1825 + }, + { + "epoch": 0.3116572793992149, + "grad_norm": 0.48728277841021794, + "learning_rate": 2.7540535927632707e-05, + "loss": 0.5458, + "num_tokens": 172073884.0, + "step": 1826 + }, + { + "epoch": 0.3118279569892473, + "grad_norm": 0.4768720661962923, + "learning_rate": 2.7533708824031407e-05, + "loss": 0.6729, + "num_tokens": 172188655.0, + "step": 1827 + }, + { + "epoch": 0.31199863457927973, + "grad_norm": 0.4956906002567987, + "learning_rate": 2.752688172043011e-05, + "loss": 0.6768, + "num_tokens": 172295485.0, + "step": 1828 + }, + { + "epoch": 0.31216931216931215, + "grad_norm": 0.521469643222855, + "learning_rate": 2.752005461682881e-05, + "loss": 0.638, + "num_tokens": 172385206.0, + "step": 1829 + }, + { + "epoch": 0.31233998975934457, + "grad_norm": 0.48041061741556823, + "learning_rate": 2.7513227513227515e-05, + "loss": 0.6653, + "num_tokens": 172502539.0, + "step": 1830 + }, + { + "epoch": 0.31251066734937705, + "grad_norm": 0.45562281426101814, + "learning_rate": 2.7506400409626216e-05, + "loss": 0.5787, + "num_tokens": 172628034.0, + "step": 1831 + }, + { + "epoch": 0.31268134493940947, + "grad_norm": 0.4949885225637786, + "learning_rate": 2.749957330602492e-05, + "loss": 0.6161, + "num_tokens": 172733349.0, + "step": 1832 + }, + { + "epoch": 0.3128520225294419, + "grad_norm": 0.4626040482976906, + "learning_rate": 2.7492746202423623e-05, + "loss": 0.6161, + "num_tokens": 172842026.0, + "step": 1833 + }, + { + "epoch": 0.3130227001194743, + "grad_norm": 0.5175431064341234, + "learning_rate": 2.7485919098822327e-05, + "loss": 0.5952, + "num_tokens": 172926839.0, + "step": 1834 + }, + { + "epoch": 0.3131933777095067, + "grad_norm": 0.5695573932902815, + "learning_rate": 2.747909199522103e-05, + "loss": 0.6194, + "num_tokens": 173002371.0, + "step": 1835 + }, + { + "epoch": 0.31336405529953915, + "grad_norm": 0.5275380901542777, + "learning_rate": 2.7472264891619734e-05, + "loss": 0.5108, + "num_tokens": 173069427.0, + "step": 1836 + }, + { + "epoch": 0.3135347328895716, + "grad_norm": 0.4973606253723709, + "learning_rate": 2.7465437788018435e-05, + "loss": 0.5247, + "num_tokens": 173153429.0, + "step": 1837 + }, + { + "epoch": 0.31370541047960404, + "grad_norm": 0.4836843289548591, + "learning_rate": 2.745861068441714e-05, + "loss": 0.6438, + "num_tokens": 173265181.0, + "step": 1838 + }, + { + "epoch": 0.31387608806963646, + "grad_norm": 0.5453619188117896, + "learning_rate": 2.7451783580815842e-05, + "loss": 0.6606, + "num_tokens": 173355839.0, + "step": 1839 + }, + { + "epoch": 0.3140467656596689, + "grad_norm": 0.5178357122492119, + "learning_rate": 2.7444956477214546e-05, + "loss": 0.5805, + "num_tokens": 173454060.0, + "step": 1840 + }, + { + "epoch": 0.3142174432497013, + "grad_norm": 0.4716434720455672, + "learning_rate": 2.7438129373613246e-05, + "loss": 0.6406, + "num_tokens": 173583000.0, + "step": 1841 + }, + { + "epoch": 0.3143881208397337, + "grad_norm": 0.5019995561322425, + "learning_rate": 2.7431302270011947e-05, + "loss": 0.6606, + "num_tokens": 173680801.0, + "step": 1842 + }, + { + "epoch": 0.3145587984297662, + "grad_norm": 0.522465640323239, + "learning_rate": 2.742447516641065e-05, + "loss": 0.5697, + "num_tokens": 173758679.0, + "step": 1843 + }, + { + "epoch": 0.3147294760197986, + "grad_norm": 0.48450359517289643, + "learning_rate": 2.7417648062809354e-05, + "loss": 0.5556, + "num_tokens": 173846173.0, + "step": 1844 + }, + { + "epoch": 0.31490015360983103, + "grad_norm": 0.4903293105816486, + "learning_rate": 2.7410820959208058e-05, + "loss": 0.6359, + "num_tokens": 173956407.0, + "step": 1845 + }, + { + "epoch": 0.31507083119986345, + "grad_norm": 0.5427494133762597, + "learning_rate": 2.7403993855606762e-05, + "loss": 0.633, + "num_tokens": 174054474.0, + "step": 1846 + }, + { + "epoch": 0.3152415087898959, + "grad_norm": 0.5820584229016548, + "learning_rate": 2.7397166752005466e-05, + "loss": 0.5891, + "num_tokens": 174164062.0, + "step": 1847 + }, + { + "epoch": 0.3154121863799283, + "grad_norm": 0.5199327843146129, + "learning_rate": 2.7390339648404166e-05, + "loss": 0.5825, + "num_tokens": 174255757.0, + "step": 1848 + }, + { + "epoch": 0.31558286396996077, + "grad_norm": 0.5399310739661023, + "learning_rate": 2.738351254480287e-05, + "loss": 0.5958, + "num_tokens": 174333638.0, + "step": 1849 + }, + { + "epoch": 0.3157535415599932, + "grad_norm": 0.5189435799381489, + "learning_rate": 2.7376685441201574e-05, + "loss": 0.5622, + "num_tokens": 174410523.0, + "step": 1850 + }, + { + "epoch": 0.3159242191500256, + "grad_norm": 0.4987178819471887, + "learning_rate": 2.7369858337600277e-05, + "loss": 0.577, + "num_tokens": 174512096.0, + "step": 1851 + }, + { + "epoch": 0.31609489674005803, + "grad_norm": 0.4626250290048233, + "learning_rate": 2.736303123399898e-05, + "loss": 0.5802, + "num_tokens": 174616780.0, + "step": 1852 + }, + { + "epoch": 0.31626557433009045, + "grad_norm": 0.5193400373930481, + "learning_rate": 2.7356204130397678e-05, + "loss": 0.5313, + "num_tokens": 174693378.0, + "step": 1853 + }, + { + "epoch": 0.31643625192012287, + "grad_norm": 0.4820741022663225, + "learning_rate": 2.7349377026796382e-05, + "loss": 0.6021, + "num_tokens": 174802364.0, + "step": 1854 + }, + { + "epoch": 0.31660692951015534, + "grad_norm": 0.4947533438272115, + "learning_rate": 2.7342549923195086e-05, + "loss": 0.607, + "num_tokens": 174906049.0, + "step": 1855 + }, + { + "epoch": 0.31677760710018776, + "grad_norm": 0.5094831188743407, + "learning_rate": 2.733572281959379e-05, + "loss": 0.5771, + "num_tokens": 175001533.0, + "step": 1856 + }, + { + "epoch": 0.3169482846902202, + "grad_norm": 0.5468922572893605, + "learning_rate": 2.7328895715992493e-05, + "loss": 0.6257, + "num_tokens": 175071471.0, + "step": 1857 + }, + { + "epoch": 0.3171189622802526, + "grad_norm": 0.4736833243127142, + "learning_rate": 2.7322068612391193e-05, + "loss": 0.5623, + "num_tokens": 175167340.0, + "step": 1858 + }, + { + "epoch": 0.317289639870285, + "grad_norm": 0.4738319206684177, + "learning_rate": 2.7315241508789897e-05, + "loss": 0.5771, + "num_tokens": 175270562.0, + "step": 1859 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 0.47302938208009443, + "learning_rate": 2.73084144051886e-05, + "loss": 0.5542, + "num_tokens": 175365029.0, + "step": 1860 + }, + { + "epoch": 0.31763099505034986, + "grad_norm": 0.5233966923337117, + "learning_rate": 2.7301587301587305e-05, + "loss": 0.6239, + "num_tokens": 175457558.0, + "step": 1861 + }, + { + "epoch": 0.31780167264038234, + "grad_norm": 0.4691322810397915, + "learning_rate": 2.729476019798601e-05, + "loss": 0.6111, + "num_tokens": 175550617.0, + "step": 1862 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 0.5566789628767105, + "learning_rate": 2.7287933094384712e-05, + "loss": 0.6345, + "num_tokens": 175630607.0, + "step": 1863 + }, + { + "epoch": 0.3181430278204472, + "grad_norm": 0.46760884554418203, + "learning_rate": 2.7281105990783413e-05, + "loss": 0.611, + "num_tokens": 175733948.0, + "step": 1864 + }, + { + "epoch": 0.3183137054104796, + "grad_norm": 0.524677801942664, + "learning_rate": 2.7274278887182116e-05, + "loss": 0.6564, + "num_tokens": 175835431.0, + "step": 1865 + }, + { + "epoch": 0.318484383000512, + "grad_norm": 0.5321610463982587, + "learning_rate": 2.7267451783580817e-05, + "loss": 0.6587, + "num_tokens": 175923529.0, + "step": 1866 + }, + { + "epoch": 0.31865506059054444, + "grad_norm": 0.5081755308902504, + "learning_rate": 2.726062467997952e-05, + "loss": 0.5818, + "num_tokens": 176009724.0, + "step": 1867 + }, + { + "epoch": 0.3188257381805769, + "grad_norm": 0.48858769134510405, + "learning_rate": 2.725379757637822e-05, + "loss": 0.5474, + "num_tokens": 176107059.0, + "step": 1868 + }, + { + "epoch": 0.31899641577060933, + "grad_norm": 0.5381093309747611, + "learning_rate": 2.7246970472776925e-05, + "loss": 0.6329, + "num_tokens": 176191333.0, + "step": 1869 + }, + { + "epoch": 0.31916709336064175, + "grad_norm": 0.5392953689705774, + "learning_rate": 2.724014336917563e-05, + "loss": 0.6996, + "num_tokens": 176291374.0, + "step": 1870 + }, + { + "epoch": 0.31933777095067417, + "grad_norm": 0.5562671188218282, + "learning_rate": 2.7233316265574332e-05, + "loss": 0.5999, + "num_tokens": 176365126.0, + "step": 1871 + }, + { + "epoch": 0.3195084485407066, + "grad_norm": 0.5576451085408359, + "learning_rate": 2.7226489161973036e-05, + "loss": 0.5997, + "num_tokens": 176438803.0, + "step": 1872 + }, + { + "epoch": 0.319679126130739, + "grad_norm": 0.5700224174575984, + "learning_rate": 2.721966205837174e-05, + "loss": 0.7517, + "num_tokens": 176524425.0, + "step": 1873 + }, + { + "epoch": 0.3198498037207715, + "grad_norm": 0.4725734901847189, + "learning_rate": 2.721283495477044e-05, + "loss": 0.5826, + "num_tokens": 176641852.0, + "step": 1874 + }, + { + "epoch": 0.3200204813108039, + "grad_norm": 0.48938216803461215, + "learning_rate": 2.7206007851169144e-05, + "loss": 0.656, + "num_tokens": 176759562.0, + "step": 1875 + }, + { + "epoch": 0.3201911589008363, + "grad_norm": 0.5576330861832379, + "learning_rate": 2.7199180747567848e-05, + "loss": 0.5591, + "num_tokens": 176827604.0, + "step": 1876 + }, + { + "epoch": 0.32036183649086875, + "grad_norm": 0.48741672565527133, + "learning_rate": 2.719235364396655e-05, + "loss": 0.6414, + "num_tokens": 176931017.0, + "step": 1877 + }, + { + "epoch": 0.32053251408090117, + "grad_norm": 0.494794306828343, + "learning_rate": 2.7185526540365252e-05, + "loss": 0.6285, + "num_tokens": 177030531.0, + "step": 1878 + }, + { + "epoch": 0.3207031916709336, + "grad_norm": 0.5023381941105307, + "learning_rate": 2.7178699436763952e-05, + "loss": 0.5827, + "num_tokens": 177133933.0, + "step": 1879 + }, + { + "epoch": 0.32087386926096606, + "grad_norm": 0.5342844412480456, + "learning_rate": 2.7171872333162656e-05, + "loss": 0.6184, + "num_tokens": 177228283.0, + "step": 1880 + }, + { + "epoch": 0.3210445468509985, + "grad_norm": 0.4858691835206792, + "learning_rate": 2.716504522956136e-05, + "loss": 0.6008, + "num_tokens": 177322386.0, + "step": 1881 + }, + { + "epoch": 0.3212152244410309, + "grad_norm": 0.5045833000192924, + "learning_rate": 2.7158218125960063e-05, + "loss": 0.6819, + "num_tokens": 177436116.0, + "step": 1882 + }, + { + "epoch": 0.3213859020310633, + "grad_norm": 0.5069112210976342, + "learning_rate": 2.7151391022358767e-05, + "loss": 0.647, + "num_tokens": 177535594.0, + "step": 1883 + }, + { + "epoch": 0.32155657962109574, + "grad_norm": 0.46091934005088586, + "learning_rate": 2.714456391875747e-05, + "loss": 0.6061, + "num_tokens": 177644471.0, + "step": 1884 + }, + { + "epoch": 0.32172725721112816, + "grad_norm": 0.5192742772452592, + "learning_rate": 2.713773681515617e-05, + "loss": 0.6708, + "num_tokens": 177741133.0, + "step": 1885 + }, + { + "epoch": 0.32189793480116063, + "grad_norm": 0.5579497951834359, + "learning_rate": 2.7130909711554875e-05, + "loss": 0.6196, + "num_tokens": 177814966.0, + "step": 1886 + }, + { + "epoch": 0.32206861239119305, + "grad_norm": 0.5477769810074912, + "learning_rate": 2.712408260795358e-05, + "loss": 0.622, + "num_tokens": 177908325.0, + "step": 1887 + }, + { + "epoch": 0.3222392899812255, + "grad_norm": 0.5173005305963398, + "learning_rate": 2.7117255504352283e-05, + "loss": 0.6135, + "num_tokens": 177992322.0, + "step": 1888 + }, + { + "epoch": 0.3224099675712579, + "grad_norm": 0.45320846150921124, + "learning_rate": 2.7110428400750986e-05, + "loss": 0.5721, + "num_tokens": 178100090.0, + "step": 1889 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.5794935960018746, + "learning_rate": 2.710360129714969e-05, + "loss": 0.6199, + "num_tokens": 178168170.0, + "step": 1890 + }, + { + "epoch": 0.32275132275132273, + "grad_norm": 0.543870346153487, + "learning_rate": 2.7096774193548387e-05, + "loss": 0.6006, + "num_tokens": 178242688.0, + "step": 1891 + }, + { + "epoch": 0.32292200034135515, + "grad_norm": 0.5257829599894944, + "learning_rate": 2.708994708994709e-05, + "loss": 0.7227, + "num_tokens": 178342918.0, + "step": 1892 + }, + { + "epoch": 0.32309267793138763, + "grad_norm": 0.5215613176433986, + "learning_rate": 2.7083119986345795e-05, + "loss": 0.6973, + "num_tokens": 178453073.0, + "step": 1893 + }, + { + "epoch": 0.32326335552142005, + "grad_norm": 0.4684466259687379, + "learning_rate": 2.70762928827445e-05, + "loss": 0.6208, + "num_tokens": 178580050.0, + "step": 1894 + }, + { + "epoch": 0.32343403311145247, + "grad_norm": 0.4861496287039147, + "learning_rate": 2.70694657791432e-05, + "loss": 0.6621, + "num_tokens": 178686014.0, + "step": 1895 + }, + { + "epoch": 0.3236047107014849, + "grad_norm": 0.5597236775858695, + "learning_rate": 2.7062638675541903e-05, + "loss": 0.7195, + "num_tokens": 178786632.0, + "step": 1896 + }, + { + "epoch": 0.3237753882915173, + "grad_norm": 0.4629258004409705, + "learning_rate": 2.7055811571940606e-05, + "loss": 0.5354, + "num_tokens": 178886318.0, + "step": 1897 + }, + { + "epoch": 0.3239460658815497, + "grad_norm": 0.5046581629086911, + "learning_rate": 2.704898446833931e-05, + "loss": 0.6167, + "num_tokens": 178985327.0, + "step": 1898 + }, + { + "epoch": 0.3241167434715822, + "grad_norm": 0.5242531182441059, + "learning_rate": 2.7042157364738014e-05, + "loss": 0.5366, + "num_tokens": 179061975.0, + "step": 1899 + }, + { + "epoch": 0.3242874210616146, + "grad_norm": 0.5275215742279357, + "learning_rate": 2.7035330261136718e-05, + "loss": 0.5734, + "num_tokens": 179139224.0, + "step": 1900 + }, + { + "epoch": 0.32445809865164704, + "grad_norm": 0.5267199728284313, + "learning_rate": 2.7028503157535418e-05, + "loss": 0.5512, + "num_tokens": 179215454.0, + "step": 1901 + }, + { + "epoch": 0.32462877624167946, + "grad_norm": 0.5924769425670209, + "learning_rate": 2.7021676053934122e-05, + "loss": 0.5809, + "num_tokens": 179310359.0, + "step": 1902 + }, + { + "epoch": 0.3247994538317119, + "grad_norm": 0.5659247096423088, + "learning_rate": 2.7014848950332822e-05, + "loss": 0.6915, + "num_tokens": 179406546.0, + "step": 1903 + }, + { + "epoch": 0.3249701314217443, + "grad_norm": 0.47045762969952154, + "learning_rate": 2.7008021846731526e-05, + "loss": 0.6205, + "num_tokens": 179515154.0, + "step": 1904 + }, + { + "epoch": 0.3251408090117768, + "grad_norm": 0.5124701268666754, + "learning_rate": 2.7001194743130226e-05, + "loss": 0.5599, + "num_tokens": 179593136.0, + "step": 1905 + }, + { + "epoch": 0.3253114866018092, + "grad_norm": 0.50646867263811, + "learning_rate": 2.699436763952893e-05, + "loss": 0.6619, + "num_tokens": 179682868.0, + "step": 1906 + }, + { + "epoch": 0.3254821641918416, + "grad_norm": 0.49672186686685693, + "learning_rate": 2.6987540535927634e-05, + "loss": 0.6202, + "num_tokens": 179784221.0, + "step": 1907 + }, + { + "epoch": 0.32565284178187404, + "grad_norm": 0.5085628876249997, + "learning_rate": 2.6980713432326338e-05, + "loss": 0.6003, + "num_tokens": 179874686.0, + "step": 1908 + }, + { + "epoch": 0.32582351937190646, + "grad_norm": 0.5160845616846425, + "learning_rate": 2.697388632872504e-05, + "loss": 0.6629, + "num_tokens": 179980906.0, + "step": 1909 + }, + { + "epoch": 0.3259941969619389, + "grad_norm": 0.5552344749719358, + "learning_rate": 2.6967059225123745e-05, + "loss": 0.7109, + "num_tokens": 180060860.0, + "step": 1910 + }, + { + "epoch": 0.32616487455197135, + "grad_norm": 0.5244682226944963, + "learning_rate": 2.696023212152245e-05, + "loss": 0.6016, + "num_tokens": 180141540.0, + "step": 1911 + }, + { + "epoch": 0.32633555214200377, + "grad_norm": 0.5189860297345052, + "learning_rate": 2.695340501792115e-05, + "loss": 0.6044, + "num_tokens": 180227219.0, + "step": 1912 + }, + { + "epoch": 0.3265062297320362, + "grad_norm": 0.5259039160645965, + "learning_rate": 2.6946577914319853e-05, + "loss": 0.5874, + "num_tokens": 180315633.0, + "step": 1913 + }, + { + "epoch": 0.3266769073220686, + "grad_norm": 0.5195657921447284, + "learning_rate": 2.6939750810718557e-05, + "loss": 0.5559, + "num_tokens": 180398929.0, + "step": 1914 + }, + { + "epoch": 0.32684758491210103, + "grad_norm": 0.53289174753913, + "learning_rate": 2.6932923707117257e-05, + "loss": 0.7015, + "num_tokens": 180499400.0, + "step": 1915 + }, + { + "epoch": 0.32701826250213345, + "grad_norm": 0.5139681003118031, + "learning_rate": 2.6926096603515957e-05, + "loss": 0.6339, + "num_tokens": 180594638.0, + "step": 1916 + }, + { + "epoch": 0.3271889400921659, + "grad_norm": 0.5204326590827141, + "learning_rate": 2.691926949991466e-05, + "loss": 0.5818, + "num_tokens": 180675512.0, + "step": 1917 + }, + { + "epoch": 0.32735961768219835, + "grad_norm": 0.5211240059407786, + "learning_rate": 2.6912442396313365e-05, + "loss": 0.5462, + "num_tokens": 180754118.0, + "step": 1918 + }, + { + "epoch": 0.32753029527223076, + "grad_norm": 0.5182764154274138, + "learning_rate": 2.690561529271207e-05, + "loss": 0.6542, + "num_tokens": 180849545.0, + "step": 1919 + }, + { + "epoch": 0.3277009728622632, + "grad_norm": 0.4664177390275575, + "learning_rate": 2.6898788189110773e-05, + "loss": 0.6758, + "num_tokens": 180989346.0, + "step": 1920 + }, + { + "epoch": 0.3278716504522956, + "grad_norm": 0.5489103438718367, + "learning_rate": 2.6891961085509476e-05, + "loss": 0.63, + "num_tokens": 181065958.0, + "step": 1921 + }, + { + "epoch": 0.328042328042328, + "grad_norm": 0.5101175656176758, + "learning_rate": 2.6885133981908177e-05, + "loss": 0.6348, + "num_tokens": 181160327.0, + "step": 1922 + }, + { + "epoch": 0.32821300563236044, + "grad_norm": 0.5062551602033847, + "learning_rate": 2.687830687830688e-05, + "loss": 0.5502, + "num_tokens": 181239796.0, + "step": 1923 + }, + { + "epoch": 0.3283836832223929, + "grad_norm": 0.5332846453865447, + "learning_rate": 2.6871479774705584e-05, + "loss": 0.6408, + "num_tokens": 181324739.0, + "step": 1924 + }, + { + "epoch": 0.32855436081242534, + "grad_norm": 0.5315342797279429, + "learning_rate": 2.6864652671104288e-05, + "loss": 0.603, + "num_tokens": 181415968.0, + "step": 1925 + }, + { + "epoch": 0.32872503840245776, + "grad_norm": 0.4902117444361275, + "learning_rate": 2.6857825567502992e-05, + "loss": 0.6272, + "num_tokens": 181524889.0, + "step": 1926 + }, + { + "epoch": 0.3288957159924902, + "grad_norm": 0.47464486033562714, + "learning_rate": 2.6850998463901695e-05, + "loss": 0.6142, + "num_tokens": 181628459.0, + "step": 1927 + }, + { + "epoch": 0.3290663935825226, + "grad_norm": 0.5194344933837864, + "learning_rate": 2.6844171360300392e-05, + "loss": 0.6236, + "num_tokens": 181711001.0, + "step": 1928 + }, + { + "epoch": 0.329237071172555, + "grad_norm": 0.44988794519528585, + "learning_rate": 2.6837344256699096e-05, + "loss": 0.6125, + "num_tokens": 181837907.0, + "step": 1929 + }, + { + "epoch": 0.3294077487625875, + "grad_norm": 0.4903821401356589, + "learning_rate": 2.68305171530978e-05, + "loss": 0.5452, + "num_tokens": 181935225.0, + "step": 1930 + }, + { + "epoch": 0.3295784263526199, + "grad_norm": 0.4974956255249516, + "learning_rate": 2.6823690049496504e-05, + "loss": 0.685, + "num_tokens": 182042209.0, + "step": 1931 + }, + { + "epoch": 0.32974910394265233, + "grad_norm": 0.5178728612715789, + "learning_rate": 2.6816862945895204e-05, + "loss": 0.6737, + "num_tokens": 182138142.0, + "step": 1932 + }, + { + "epoch": 0.32991978153268475, + "grad_norm": 0.5535101268768201, + "learning_rate": 2.6810035842293908e-05, + "loss": 0.7063, + "num_tokens": 182245085.0, + "step": 1933 + }, + { + "epoch": 0.3300904591227172, + "grad_norm": 0.5228180010797745, + "learning_rate": 2.680320873869261e-05, + "loss": 0.5898, + "num_tokens": 182341902.0, + "step": 1934 + }, + { + "epoch": 0.3302611367127496, + "grad_norm": 0.5395768424917943, + "learning_rate": 2.6796381635091315e-05, + "loss": 0.6432, + "num_tokens": 182433906.0, + "step": 1935 + }, + { + "epoch": 0.33043181430278207, + "grad_norm": 0.532247254114516, + "learning_rate": 2.678955453149002e-05, + "loss": 0.5869, + "num_tokens": 182505842.0, + "step": 1936 + }, + { + "epoch": 0.3306024918928145, + "grad_norm": 0.508342782279506, + "learning_rate": 2.6782727427888723e-05, + "loss": 0.648, + "num_tokens": 182609549.0, + "step": 1937 + }, + { + "epoch": 0.3307731694828469, + "grad_norm": 0.49591208372725304, + "learning_rate": 2.6775900324287423e-05, + "loss": 0.5948, + "num_tokens": 182721902.0, + "step": 1938 + }, + { + "epoch": 0.3309438470728793, + "grad_norm": 0.5813788084963324, + "learning_rate": 2.6769073220686127e-05, + "loss": 0.7468, + "num_tokens": 182803328.0, + "step": 1939 + }, + { + "epoch": 0.33111452466291175, + "grad_norm": 0.5466027270035207, + "learning_rate": 2.6762246117084827e-05, + "loss": 0.7371, + "num_tokens": 182904076.0, + "step": 1940 + }, + { + "epoch": 0.33128520225294417, + "grad_norm": 0.5356861681061328, + "learning_rate": 2.675541901348353e-05, + "loss": 0.6216, + "num_tokens": 183006865.0, + "step": 1941 + }, + { + "epoch": 0.33145587984297664, + "grad_norm": 0.48244871047267024, + "learning_rate": 2.6748591909882235e-05, + "loss": 0.6706, + "num_tokens": 183130404.0, + "step": 1942 + }, + { + "epoch": 0.33162655743300906, + "grad_norm": 0.45873992818938203, + "learning_rate": 2.6741764806280935e-05, + "loss": 0.6017, + "num_tokens": 183244754.0, + "step": 1943 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 0.49143487468288766, + "learning_rate": 2.673493770267964e-05, + "loss": 0.5742, + "num_tokens": 183336947.0, + "step": 1944 + }, + { + "epoch": 0.3319679126130739, + "grad_norm": 0.5049277735375246, + "learning_rate": 2.6728110599078343e-05, + "loss": 0.64, + "num_tokens": 183440277.0, + "step": 1945 + }, + { + "epoch": 0.3321385902031063, + "grad_norm": 0.507140575427548, + "learning_rate": 2.6721283495477047e-05, + "loss": 0.5883, + "num_tokens": 183531265.0, + "step": 1946 + }, + { + "epoch": 0.33230926779313874, + "grad_norm": 0.47359848001669835, + "learning_rate": 2.671445639187575e-05, + "loss": 0.6307, + "num_tokens": 183638488.0, + "step": 1947 + }, + { + "epoch": 0.3324799453831712, + "grad_norm": 0.46068884496776163, + "learning_rate": 2.6707629288274454e-05, + "loss": 0.6216, + "num_tokens": 183752727.0, + "step": 1948 + }, + { + "epoch": 0.33265062297320364, + "grad_norm": 0.5724728146515059, + "learning_rate": 2.6700802184673155e-05, + "loss": 0.6718, + "num_tokens": 183822739.0, + "step": 1949 + }, + { + "epoch": 0.33282130056323606, + "grad_norm": 0.4857913367727272, + "learning_rate": 2.669397508107186e-05, + "loss": 0.5831, + "num_tokens": 183915487.0, + "step": 1950 + }, + { + "epoch": 0.3329919781532685, + "grad_norm": 0.5081941323526794, + "learning_rate": 2.6687147977470562e-05, + "loss": 0.6157, + "num_tokens": 184023509.0, + "step": 1951 + }, + { + "epoch": 0.3331626557433009, + "grad_norm": 0.48581796607884437, + "learning_rate": 2.6680320873869262e-05, + "loss": 0.5588, + "num_tokens": 184128920.0, + "step": 1952 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5169853295813558, + "learning_rate": 2.6673493770267963e-05, + "loss": 0.5707, + "num_tokens": 184218441.0, + "step": 1953 + }, + { + "epoch": 0.33350401092336573, + "grad_norm": 0.46853249578058764, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.5703, + "num_tokens": 184328911.0, + "step": 1954 + }, + { + "epoch": 0.3336746885133982, + "grad_norm": 0.5659009107860529, + "learning_rate": 2.665983956306537e-05, + "loss": 0.6438, + "num_tokens": 184410923.0, + "step": 1955 + }, + { + "epoch": 0.33384536610343063, + "grad_norm": 0.6941265517804126, + "learning_rate": 2.6653012459464074e-05, + "loss": 0.7971, + "num_tokens": 184521203.0, + "step": 1956 + }, + { + "epoch": 0.33401604369346305, + "grad_norm": 0.5032712047496162, + "learning_rate": 2.6646185355862778e-05, + "loss": 0.5654, + "num_tokens": 184608090.0, + "step": 1957 + }, + { + "epoch": 0.33418672128349547, + "grad_norm": 0.4913815613940602, + "learning_rate": 2.663935825226148e-05, + "loss": 0.6463, + "num_tokens": 184719296.0, + "step": 1958 + }, + { + "epoch": 0.3343573988735279, + "grad_norm": 0.9051111618803426, + "learning_rate": 2.6632531148660182e-05, + "loss": 0.7064, + "num_tokens": 184819483.0, + "step": 1959 + }, + { + "epoch": 0.3345280764635603, + "grad_norm": 0.6781336605397107, + "learning_rate": 2.6625704045058886e-05, + "loss": 0.6341, + "num_tokens": 184896315.0, + "step": 1960 + }, + { + "epoch": 0.3346987540535928, + "grad_norm": 0.4842960064807459, + "learning_rate": 2.661887694145759e-05, + "loss": 0.5971, + "num_tokens": 184992573.0, + "step": 1961 + }, + { + "epoch": 0.3348694316436252, + "grad_norm": 0.48721044708167066, + "learning_rate": 2.6612049837856293e-05, + "loss": 0.644, + "num_tokens": 185093912.0, + "step": 1962 + }, + { + "epoch": 0.3350401092336576, + "grad_norm": 0.6835529682178801, + "learning_rate": 2.6605222734254997e-05, + "loss": 0.7048, + "num_tokens": 185181848.0, + "step": 1963 + }, + { + "epoch": 0.33521078682369004, + "grad_norm": 0.4964705801061626, + "learning_rate": 2.65983956306537e-05, + "loss": 0.572, + "num_tokens": 185276775.0, + "step": 1964 + }, + { + "epoch": 0.33538146441372246, + "grad_norm": 0.5425145020573877, + "learning_rate": 2.6591568527052398e-05, + "loss": 0.5417, + "num_tokens": 185351602.0, + "step": 1965 + }, + { + "epoch": 0.3355521420037549, + "grad_norm": 0.46657957114185955, + "learning_rate": 2.65847414234511e-05, + "loss": 0.5982, + "num_tokens": 185461091.0, + "step": 1966 + }, + { + "epoch": 0.33572281959378736, + "grad_norm": 0.5500341453366612, + "learning_rate": 2.6577914319849805e-05, + "loss": 0.5635, + "num_tokens": 185541465.0, + "step": 1967 + }, + { + "epoch": 0.3358934971838198, + "grad_norm": 0.5598551318744879, + "learning_rate": 2.657108721624851e-05, + "loss": 0.6274, + "num_tokens": 185623328.0, + "step": 1968 + }, + { + "epoch": 0.3360641747738522, + "grad_norm": 0.49466582101723605, + "learning_rate": 2.656426011264721e-05, + "loss": 0.5753, + "num_tokens": 185719822.0, + "step": 1969 + }, + { + "epoch": 0.3362348523638846, + "grad_norm": 0.5481429406337164, + "learning_rate": 2.6557433009045913e-05, + "loss": 0.661, + "num_tokens": 185812552.0, + "step": 1970 + }, + { + "epoch": 0.33640552995391704, + "grad_norm": 0.4769052211906058, + "learning_rate": 2.6550605905444617e-05, + "loss": 0.608, + "num_tokens": 185926554.0, + "step": 1971 + }, + { + "epoch": 0.33657620754394946, + "grad_norm": 0.47723554284159864, + "learning_rate": 2.654377880184332e-05, + "loss": 0.6156, + "num_tokens": 186053031.0, + "step": 1972 + }, + { + "epoch": 0.33674688513398193, + "grad_norm": 0.504363311881063, + "learning_rate": 2.6536951698242025e-05, + "loss": 0.6452, + "num_tokens": 186145343.0, + "step": 1973 + }, + { + "epoch": 0.33691756272401435, + "grad_norm": 0.5381750117533676, + "learning_rate": 2.6530124594640728e-05, + "loss": 0.5821, + "num_tokens": 186230771.0, + "step": 1974 + }, + { + "epoch": 0.33708824031404677, + "grad_norm": 0.49059624245517675, + "learning_rate": 2.652329749103943e-05, + "loss": 0.5765, + "num_tokens": 186327375.0, + "step": 1975 + }, + { + "epoch": 0.3372589179040792, + "grad_norm": 0.5903381688615645, + "learning_rate": 2.6516470387438132e-05, + "loss": 0.6363, + "num_tokens": 186410889.0, + "step": 1976 + }, + { + "epoch": 0.3374295954941116, + "grad_norm": 0.4757470343349251, + "learning_rate": 2.6509643283836833e-05, + "loss": 0.5926, + "num_tokens": 186512937.0, + "step": 1977 + }, + { + "epoch": 0.33760027308414403, + "grad_norm": 0.5333065890697491, + "learning_rate": 2.6502816180235537e-05, + "loss": 0.5757, + "num_tokens": 186591291.0, + "step": 1978 + }, + { + "epoch": 0.3377709506741765, + "grad_norm": 0.4974748695315714, + "learning_rate": 2.649598907663424e-05, + "loss": 0.5865, + "num_tokens": 186683029.0, + "step": 1979 + }, + { + "epoch": 0.3379416282642089, + "grad_norm": 0.5397101787999651, + "learning_rate": 2.648916197303294e-05, + "loss": 0.6161, + "num_tokens": 186770545.0, + "step": 1980 + }, + { + "epoch": 0.33811230585424135, + "grad_norm": 0.5164728364028098, + "learning_rate": 2.6482334869431644e-05, + "loss": 0.5855, + "num_tokens": 186852884.0, + "step": 1981 + }, + { + "epoch": 0.33828298344427377, + "grad_norm": 0.5454482483601047, + "learning_rate": 2.6475507765830348e-05, + "loss": 0.6178, + "num_tokens": 186929956.0, + "step": 1982 + }, + { + "epoch": 0.3384536610343062, + "grad_norm": 0.4863253156492437, + "learning_rate": 2.6468680662229052e-05, + "loss": 0.5763, + "num_tokens": 187033834.0, + "step": 1983 + }, + { + "epoch": 0.3386243386243386, + "grad_norm": 0.5373455111350864, + "learning_rate": 2.6461853558627756e-05, + "loss": 0.559, + "num_tokens": 187116471.0, + "step": 1984 + }, + { + "epoch": 0.3387950162143711, + "grad_norm": 0.5073241815216564, + "learning_rate": 2.645502645502646e-05, + "loss": 0.5781, + "num_tokens": 187208305.0, + "step": 1985 + }, + { + "epoch": 0.3389656938044035, + "grad_norm": 0.595156934476474, + "learning_rate": 2.644819935142516e-05, + "loss": 0.6902, + "num_tokens": 187304436.0, + "step": 1986 + }, + { + "epoch": 0.3391363713944359, + "grad_norm": 0.6044839081059398, + "learning_rate": 2.6441372247823864e-05, + "loss": 0.6487, + "num_tokens": 187416207.0, + "step": 1987 + }, + { + "epoch": 0.33930704898446834, + "grad_norm": 0.5669580555126262, + "learning_rate": 2.6434545144222567e-05, + "loss": 0.6244, + "num_tokens": 187488973.0, + "step": 1988 + }, + { + "epoch": 0.33947772657450076, + "grad_norm": 0.4647826123746471, + "learning_rate": 2.642771804062127e-05, + "loss": 0.6799, + "num_tokens": 187617198.0, + "step": 1989 + }, + { + "epoch": 0.3396484041645332, + "grad_norm": 0.4558545948620897, + "learning_rate": 2.6420890937019968e-05, + "loss": 0.5887, + "num_tokens": 187742403.0, + "step": 1990 + }, + { + "epoch": 0.3398190817545656, + "grad_norm": 0.4971554965151551, + "learning_rate": 2.6414063833418672e-05, + "loss": 0.6157, + "num_tokens": 187840661.0, + "step": 1991 + }, + { + "epoch": 0.3399897593445981, + "grad_norm": 0.432505322727223, + "learning_rate": 2.6407236729817376e-05, + "loss": 0.5594, + "num_tokens": 187953088.0, + "step": 1992 + }, + { + "epoch": 0.3401604369346305, + "grad_norm": 0.5054505405491881, + "learning_rate": 2.640040962621608e-05, + "loss": 0.6566, + "num_tokens": 188040878.0, + "step": 1993 + }, + { + "epoch": 0.3403311145246629, + "grad_norm": 0.5786789748459928, + "learning_rate": 2.6393582522614783e-05, + "loss": 0.6009, + "num_tokens": 188132563.0, + "step": 1994 + }, + { + "epoch": 0.34050179211469533, + "grad_norm": 0.5025080580638932, + "learning_rate": 2.6386755419013487e-05, + "loss": 0.5753, + "num_tokens": 188219974.0, + "step": 1995 + }, + { + "epoch": 0.34067246970472775, + "grad_norm": 0.5277336572109107, + "learning_rate": 2.6379928315412187e-05, + "loss": 0.6328, + "num_tokens": 188317076.0, + "step": 1996 + }, + { + "epoch": 0.3408431472947602, + "grad_norm": 0.5035552898121727, + "learning_rate": 2.637310121181089e-05, + "loss": 0.5713, + "num_tokens": 188399604.0, + "step": 1997 + }, + { + "epoch": 0.34101382488479265, + "grad_norm": 0.4973047450474556, + "learning_rate": 2.6366274108209595e-05, + "loss": 0.6861, + "num_tokens": 188505926.0, + "step": 1998 + }, + { + "epoch": 0.34118450247482507, + "grad_norm": 0.5747142698065452, + "learning_rate": 2.63594470046083e-05, + "loss": 0.6086, + "num_tokens": 188569751.0, + "step": 1999 + }, + { + "epoch": 0.3413551800648575, + "grad_norm": 0.48729321236758205, + "learning_rate": 2.6352619901007002e-05, + "loss": 0.5877, + "num_tokens": 188676771.0, + "step": 2000 + }, + { + "epoch": 0.3415258576548899, + "grad_norm": 0.5332638658246801, + "learning_rate": 2.6345792797405706e-05, + "loss": 0.5766, + "num_tokens": 188751606.0, + "step": 2001 + }, + { + "epoch": 0.34169653524492233, + "grad_norm": 0.5116934329538998, + "learning_rate": 2.6338965693804403e-05, + "loss": 0.5408, + "num_tokens": 188827257.0, + "step": 2002 + }, + { + "epoch": 0.34186721283495475, + "grad_norm": 0.4355897628340976, + "learning_rate": 2.6332138590203107e-05, + "loss": 0.5831, + "num_tokens": 188952119.0, + "step": 2003 + }, + { + "epoch": 0.3420378904249872, + "grad_norm": 0.5087832964828028, + "learning_rate": 2.632531148660181e-05, + "loss": 0.6656, + "num_tokens": 189059696.0, + "step": 2004 + }, + { + "epoch": 0.34220856801501964, + "grad_norm": 0.4335782651188882, + "learning_rate": 2.6318484383000514e-05, + "loss": 0.5575, + "num_tokens": 189187113.0, + "step": 2005 + }, + { + "epoch": 0.34237924560505206, + "grad_norm": 0.4963739778559609, + "learning_rate": 2.6311657279399215e-05, + "loss": 0.6098, + "num_tokens": 189278978.0, + "step": 2006 + }, + { + "epoch": 0.3425499231950845, + "grad_norm": 0.5295843520995083, + "learning_rate": 2.630483017579792e-05, + "loss": 0.6712, + "num_tokens": 189375270.0, + "step": 2007 + }, + { + "epoch": 0.3427206007851169, + "grad_norm": 0.5173220114287612, + "learning_rate": 2.6298003072196622e-05, + "loss": 0.5692, + "num_tokens": 189461403.0, + "step": 2008 + }, + { + "epoch": 0.3428912783751493, + "grad_norm": 0.4890394967817501, + "learning_rate": 2.6291175968595326e-05, + "loss": 0.6561, + "num_tokens": 189577391.0, + "step": 2009 + }, + { + "epoch": 0.3430619559651818, + "grad_norm": 0.5190595554278892, + "learning_rate": 2.628434886499403e-05, + "loss": 0.533, + "num_tokens": 189658723.0, + "step": 2010 + }, + { + "epoch": 0.3432326335552142, + "grad_norm": 0.4888314885031783, + "learning_rate": 2.6277521761392734e-05, + "loss": 0.616, + "num_tokens": 189773328.0, + "step": 2011 + }, + { + "epoch": 0.34340331114524664, + "grad_norm": 0.5382879969037266, + "learning_rate": 2.6270694657791434e-05, + "loss": 0.5813, + "num_tokens": 189845493.0, + "step": 2012 + }, + { + "epoch": 0.34357398873527906, + "grad_norm": 0.5351753726932221, + "learning_rate": 2.6263867554190138e-05, + "loss": 0.6187, + "num_tokens": 189926806.0, + "step": 2013 + }, + { + "epoch": 0.3437446663253115, + "grad_norm": 0.48363917806378565, + "learning_rate": 2.6257040450588838e-05, + "loss": 0.5656, + "num_tokens": 190017708.0, + "step": 2014 + }, + { + "epoch": 0.3439153439153439, + "grad_norm": 0.49219373843971426, + "learning_rate": 2.6250213346987542e-05, + "loss": 0.6524, + "num_tokens": 190115902.0, + "step": 2015 + }, + { + "epoch": 0.34408602150537637, + "grad_norm": 0.5677379293902859, + "learning_rate": 2.6243386243386246e-05, + "loss": 0.6583, + "num_tokens": 190188603.0, + "step": 2016 + }, + { + "epoch": 0.3442566990954088, + "grad_norm": 0.5184138977011736, + "learning_rate": 2.6236559139784946e-05, + "loss": 0.6402, + "num_tokens": 190271784.0, + "step": 2017 + }, + { + "epoch": 0.3444273766854412, + "grad_norm": 0.4679885175028407, + "learning_rate": 2.622973203618365e-05, + "loss": 0.5913, + "num_tokens": 190382700.0, + "step": 2018 + }, + { + "epoch": 0.34459805427547363, + "grad_norm": 0.4974155333572019, + "learning_rate": 2.6222904932582354e-05, + "loss": 0.6058, + "num_tokens": 190481831.0, + "step": 2019 + }, + { + "epoch": 0.34476873186550605, + "grad_norm": 0.6032273792025741, + "learning_rate": 2.6216077828981057e-05, + "loss": 0.6234, + "num_tokens": 190557662.0, + "step": 2020 + }, + { + "epoch": 0.34493940945553847, + "grad_norm": 0.5034114171328479, + "learning_rate": 2.620925072537976e-05, + "loss": 0.6559, + "num_tokens": 190660957.0, + "step": 2021 + }, + { + "epoch": 0.3451100870455709, + "grad_norm": 0.5267131826893524, + "learning_rate": 2.6202423621778465e-05, + "loss": 0.5699, + "num_tokens": 190744050.0, + "step": 2022 + }, + { + "epoch": 0.34528076463560337, + "grad_norm": 0.508518025631889, + "learning_rate": 2.6195596518177165e-05, + "loss": 0.6131, + "num_tokens": 190844221.0, + "step": 2023 + }, + { + "epoch": 0.3454514422256358, + "grad_norm": 0.5182029929612848, + "learning_rate": 2.618876941457587e-05, + "loss": 0.6297, + "num_tokens": 190944847.0, + "step": 2024 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 0.5044400864456456, + "learning_rate": 2.6181942310974573e-05, + "loss": 0.6661, + "num_tokens": 191040544.0, + "step": 2025 + }, + { + "epoch": 0.3457927974057006, + "grad_norm": 0.4756422384982896, + "learning_rate": 2.6175115207373277e-05, + "loss": 0.5745, + "num_tokens": 191136159.0, + "step": 2026 + }, + { + "epoch": 0.34596347499573304, + "grad_norm": 0.5204590320327938, + "learning_rate": 2.6168288103771974e-05, + "loss": 0.6364, + "num_tokens": 191223743.0, + "step": 2027 + }, + { + "epoch": 0.34613415258576546, + "grad_norm": 0.48702673236325966, + "learning_rate": 2.6161461000170677e-05, + "loss": 0.614, + "num_tokens": 191323076.0, + "step": 2028 + }, + { + "epoch": 0.34630483017579794, + "grad_norm": 0.5248346225118843, + "learning_rate": 2.615463389656938e-05, + "loss": 0.5959, + "num_tokens": 191402459.0, + "step": 2029 + }, + { + "epoch": 0.34647550776583036, + "grad_norm": 0.5164412837996571, + "learning_rate": 2.6147806792968085e-05, + "loss": 0.6432, + "num_tokens": 191488264.0, + "step": 2030 + }, + { + "epoch": 0.3466461853558628, + "grad_norm": 0.4472357824730649, + "learning_rate": 2.614097968936679e-05, + "loss": 0.5858, + "num_tokens": 191605224.0, + "step": 2031 + }, + { + "epoch": 0.3468168629458952, + "grad_norm": 0.5096566965769875, + "learning_rate": 2.6134152585765492e-05, + "loss": 0.5795, + "num_tokens": 191695844.0, + "step": 2032 + }, + { + "epoch": 0.3469875405359276, + "grad_norm": 0.5700655464383156, + "learning_rate": 2.6127325482164193e-05, + "loss": 0.603, + "num_tokens": 191776523.0, + "step": 2033 + }, + { + "epoch": 0.34715821812596004, + "grad_norm": 0.5171411283024171, + "learning_rate": 2.6120498378562896e-05, + "loss": 0.7249, + "num_tokens": 191879820.0, + "step": 2034 + }, + { + "epoch": 0.3473288957159925, + "grad_norm": 0.5446799236720858, + "learning_rate": 2.61136712749616e-05, + "loss": 0.6684, + "num_tokens": 191975473.0, + "step": 2035 + }, + { + "epoch": 0.34749957330602493, + "grad_norm": 0.47105729277958236, + "learning_rate": 2.6106844171360304e-05, + "loss": 0.6223, + "num_tokens": 192093475.0, + "step": 2036 + }, + { + "epoch": 0.34767025089605735, + "grad_norm": 0.5202582850195848, + "learning_rate": 2.6100017067759008e-05, + "loss": 0.571, + "num_tokens": 192173854.0, + "step": 2037 + }, + { + "epoch": 0.3478409284860898, + "grad_norm": 0.44187254657236363, + "learning_rate": 2.609318996415771e-05, + "loss": 0.5984, + "num_tokens": 192287150.0, + "step": 2038 + }, + { + "epoch": 0.3480116060761222, + "grad_norm": 0.4513066216354664, + "learning_rate": 2.608636286055641e-05, + "loss": 0.5683, + "num_tokens": 192402683.0, + "step": 2039 + }, + { + "epoch": 0.3481822836661546, + "grad_norm": 0.5010398892757366, + "learning_rate": 2.6079535756955112e-05, + "loss": 0.565, + "num_tokens": 192483800.0, + "step": 2040 + }, + { + "epoch": 0.3483529612561871, + "grad_norm": 0.4995997282725358, + "learning_rate": 2.6072708653353816e-05, + "loss": 0.5754, + "num_tokens": 192583662.0, + "step": 2041 + }, + { + "epoch": 0.3485236388462195, + "grad_norm": 0.4251171402473513, + "learning_rate": 2.606588154975252e-05, + "loss": 0.5711, + "num_tokens": 192711252.0, + "step": 2042 + }, + { + "epoch": 0.3486943164362519, + "grad_norm": 0.5334860698264265, + "learning_rate": 2.605905444615122e-05, + "loss": 0.6172, + "num_tokens": 192797507.0, + "step": 2043 + }, + { + "epoch": 0.34886499402628435, + "grad_norm": 0.49395679500962253, + "learning_rate": 2.6052227342549924e-05, + "loss": 0.619, + "num_tokens": 192900312.0, + "step": 2044 + }, + { + "epoch": 0.34903567161631677, + "grad_norm": 0.5324051302287356, + "learning_rate": 2.6045400238948628e-05, + "loss": 0.5707, + "num_tokens": 192978251.0, + "step": 2045 + }, + { + "epoch": 0.3492063492063492, + "grad_norm": 0.5279983314437814, + "learning_rate": 2.603857313534733e-05, + "loss": 0.623, + "num_tokens": 193073498.0, + "step": 2046 + }, + { + "epoch": 0.34937702679638166, + "grad_norm": 0.5386378388811883, + "learning_rate": 2.6031746031746035e-05, + "loss": 0.6603, + "num_tokens": 193151284.0, + "step": 2047 + }, + { + "epoch": 0.3495477043864141, + "grad_norm": 0.5094145518042561, + "learning_rate": 2.602491892814474e-05, + "loss": 0.554, + "num_tokens": 193237850.0, + "step": 2048 + }, + { + "epoch": 0.3497183819764465, + "grad_norm": 0.4968825027641457, + "learning_rate": 2.6018091824543443e-05, + "loss": 0.5731, + "num_tokens": 193328809.0, + "step": 2049 + }, + { + "epoch": 0.3498890595664789, + "grad_norm": 0.49366842702430985, + "learning_rate": 2.6011264720942143e-05, + "loss": 0.6308, + "num_tokens": 193432351.0, + "step": 2050 + }, + { + "epoch": 0.35005973715651134, + "grad_norm": 0.47598785086562806, + "learning_rate": 2.6004437617340847e-05, + "loss": 0.5812, + "num_tokens": 193528122.0, + "step": 2051 + }, + { + "epoch": 0.35023041474654376, + "grad_norm": 0.4673227993043067, + "learning_rate": 2.5997610513739547e-05, + "loss": 0.5027, + "num_tokens": 193616135.0, + "step": 2052 + }, + { + "epoch": 0.3504010923365762, + "grad_norm": 0.4602869017848875, + "learning_rate": 2.599078341013825e-05, + "loss": 0.5329, + "num_tokens": 193710552.0, + "step": 2053 + }, + { + "epoch": 0.35057176992660866, + "grad_norm": 0.49594503488076164, + "learning_rate": 2.598395630653695e-05, + "loss": 0.5571, + "num_tokens": 193802749.0, + "step": 2054 + }, + { + "epoch": 0.3507424475166411, + "grad_norm": 0.49780797990806175, + "learning_rate": 2.5977129202935655e-05, + "loss": 0.6484, + "num_tokens": 193895930.0, + "step": 2055 + }, + { + "epoch": 0.3509131251066735, + "grad_norm": 0.5382288839671874, + "learning_rate": 2.597030209933436e-05, + "loss": 0.6764, + "num_tokens": 193991948.0, + "step": 2056 + }, + { + "epoch": 0.3510838026967059, + "grad_norm": 0.5776372989010286, + "learning_rate": 2.5963474995733063e-05, + "loss": 0.612, + "num_tokens": 194072739.0, + "step": 2057 + }, + { + "epoch": 0.35125448028673834, + "grad_norm": 0.4821459507523979, + "learning_rate": 2.5956647892131766e-05, + "loss": 0.5446, + "num_tokens": 194169678.0, + "step": 2058 + }, + { + "epoch": 0.35142515787677076, + "grad_norm": 0.48391099312412156, + "learning_rate": 2.594982078853047e-05, + "loss": 0.5613, + "num_tokens": 194259510.0, + "step": 2059 + }, + { + "epoch": 0.35159583546680323, + "grad_norm": 0.5275001849852385, + "learning_rate": 2.594299368492917e-05, + "loss": 0.6312, + "num_tokens": 194345431.0, + "step": 2060 + }, + { + "epoch": 0.35176651305683565, + "grad_norm": 0.5272837743169848, + "learning_rate": 2.5936166581327874e-05, + "loss": 0.6422, + "num_tokens": 194439896.0, + "step": 2061 + }, + { + "epoch": 0.35193719064686807, + "grad_norm": 0.5458418385615955, + "learning_rate": 2.5929339477726578e-05, + "loss": 0.6655, + "num_tokens": 194521361.0, + "step": 2062 + }, + { + "epoch": 0.3521078682369005, + "grad_norm": 0.46306305722170177, + "learning_rate": 2.5922512374125282e-05, + "loss": 0.6302, + "num_tokens": 194649642.0, + "step": 2063 + }, + { + "epoch": 0.3522785458269329, + "grad_norm": 0.5579964095063006, + "learning_rate": 2.591568527052398e-05, + "loss": 0.6907, + "num_tokens": 194747210.0, + "step": 2064 + }, + { + "epoch": 0.35244922341696533, + "grad_norm": 0.5248085732463491, + "learning_rate": 2.5908858166922683e-05, + "loss": 0.6308, + "num_tokens": 194843903.0, + "step": 2065 + }, + { + "epoch": 0.3526199010069978, + "grad_norm": 0.6171067580891004, + "learning_rate": 2.5902031063321386e-05, + "loss": 0.619, + "num_tokens": 194929997.0, + "step": 2066 + }, + { + "epoch": 0.3527905785970302, + "grad_norm": 0.4945771982424894, + "learning_rate": 2.589520395972009e-05, + "loss": 0.6516, + "num_tokens": 195043040.0, + "step": 2067 + }, + { + "epoch": 0.35296125618706264, + "grad_norm": 0.5203649599189657, + "learning_rate": 2.5888376856118794e-05, + "loss": 0.5886, + "num_tokens": 195130144.0, + "step": 2068 + }, + { + "epoch": 0.35313193377709506, + "grad_norm": 0.5045223590169916, + "learning_rate": 2.5881549752517498e-05, + "loss": 0.5192, + "num_tokens": 195202205.0, + "step": 2069 + }, + { + "epoch": 0.3533026113671275, + "grad_norm": 0.4664162961346976, + "learning_rate": 2.5874722648916198e-05, + "loss": 0.6197, + "num_tokens": 195310219.0, + "step": 2070 + }, + { + "epoch": 0.3534732889571599, + "grad_norm": 0.5243747444099038, + "learning_rate": 2.5867895545314902e-05, + "loss": 0.6374, + "num_tokens": 195385019.0, + "step": 2071 + }, + { + "epoch": 0.3536439665471924, + "grad_norm": 0.4513033937985607, + "learning_rate": 2.5861068441713606e-05, + "loss": 0.5908, + "num_tokens": 195497655.0, + "step": 2072 + }, + { + "epoch": 0.3538146441372248, + "grad_norm": 0.4920970970460804, + "learning_rate": 2.585424133811231e-05, + "loss": 0.5416, + "num_tokens": 195581927.0, + "step": 2073 + }, + { + "epoch": 0.3539853217272572, + "grad_norm": 0.5124056770023176, + "learning_rate": 2.5847414234511013e-05, + "loss": 0.5807, + "num_tokens": 195672223.0, + "step": 2074 + }, + { + "epoch": 0.35415599931728964, + "grad_norm": 0.49130454237484433, + "learning_rate": 2.5840587130909717e-05, + "loss": 0.5455, + "num_tokens": 195758725.0, + "step": 2075 + }, + { + "epoch": 0.35432667690732206, + "grad_norm": 0.5059496397177149, + "learning_rate": 2.5833760027308414e-05, + "loss": 0.53, + "num_tokens": 195854492.0, + "step": 2076 + }, + { + "epoch": 0.3544973544973545, + "grad_norm": 0.47653442343138347, + "learning_rate": 2.5826932923707118e-05, + "loss": 0.5724, + "num_tokens": 195956226.0, + "step": 2077 + }, + { + "epoch": 0.35466803208738695, + "grad_norm": 0.4873633398929542, + "learning_rate": 2.582010582010582e-05, + "loss": 0.6137, + "num_tokens": 196059271.0, + "step": 2078 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 0.5384631816509435, + "learning_rate": 2.5813278716504525e-05, + "loss": 0.6209, + "num_tokens": 196142986.0, + "step": 2079 + }, + { + "epoch": 0.3550093872674518, + "grad_norm": 0.4355623256025275, + "learning_rate": 2.5806451612903226e-05, + "loss": 0.6258, + "num_tokens": 196270508.0, + "step": 2080 + }, + { + "epoch": 0.3551800648574842, + "grad_norm": 0.4939639682709106, + "learning_rate": 2.579962450930193e-05, + "loss": 0.6453, + "num_tokens": 196376758.0, + "step": 2081 + }, + { + "epoch": 0.35535074244751663, + "grad_norm": 0.4778362025783641, + "learning_rate": 2.5792797405700633e-05, + "loss": 0.6417, + "num_tokens": 196486037.0, + "step": 2082 + }, + { + "epoch": 0.35552142003754905, + "grad_norm": 0.5770920926268133, + "learning_rate": 2.5785970302099337e-05, + "loss": 0.5922, + "num_tokens": 196546820.0, + "step": 2083 + }, + { + "epoch": 0.35569209762758147, + "grad_norm": 0.5248187977229507, + "learning_rate": 2.577914319849804e-05, + "loss": 0.6494, + "num_tokens": 196636635.0, + "step": 2084 + }, + { + "epoch": 0.35586277521761395, + "grad_norm": 0.4459819609627772, + "learning_rate": 2.5772316094896744e-05, + "loss": 0.586, + "num_tokens": 196747571.0, + "step": 2085 + }, + { + "epoch": 0.35603345280764637, + "grad_norm": 0.43820885082076577, + "learning_rate": 2.5765488991295448e-05, + "loss": 0.5882, + "num_tokens": 196864596.0, + "step": 2086 + }, + { + "epoch": 0.3562041303976788, + "grad_norm": 0.4713623818923499, + "learning_rate": 2.575866188769415e-05, + "loss": 0.5515, + "num_tokens": 196954773.0, + "step": 2087 + }, + { + "epoch": 0.3563748079877112, + "grad_norm": 0.538690610357371, + "learning_rate": 2.5751834784092852e-05, + "loss": 0.5935, + "num_tokens": 197036484.0, + "step": 2088 + }, + { + "epoch": 0.3565454855777436, + "grad_norm": 0.550332713740218, + "learning_rate": 2.5745007680491553e-05, + "loss": 0.6264, + "num_tokens": 197115132.0, + "step": 2089 + }, + { + "epoch": 0.35671616316777605, + "grad_norm": 0.5842399987064149, + "learning_rate": 2.5738180576890256e-05, + "loss": 0.683, + "num_tokens": 197190318.0, + "step": 2090 + }, + { + "epoch": 0.3568868407578085, + "grad_norm": 0.5471545067928836, + "learning_rate": 2.5731353473288957e-05, + "loss": 0.6173, + "num_tokens": 197262144.0, + "step": 2091 + }, + { + "epoch": 0.35705751834784094, + "grad_norm": 0.4887568777611135, + "learning_rate": 2.572452636968766e-05, + "loss": 0.6212, + "num_tokens": 197375976.0, + "step": 2092 + }, + { + "epoch": 0.35722819593787336, + "grad_norm": 0.47721648484484985, + "learning_rate": 2.5717699266086364e-05, + "loss": 0.5817, + "num_tokens": 197473953.0, + "step": 2093 + }, + { + "epoch": 0.3573988735279058, + "grad_norm": 0.5143757023725052, + "learning_rate": 2.5710872162485068e-05, + "loss": 0.7116, + "num_tokens": 197575240.0, + "step": 2094 + }, + { + "epoch": 0.3575695511179382, + "grad_norm": 0.5810588212322416, + "learning_rate": 2.5704045058883772e-05, + "loss": 0.7454, + "num_tokens": 197648166.0, + "step": 2095 + }, + { + "epoch": 0.3577402287079706, + "grad_norm": 0.6548140475894928, + "learning_rate": 2.5697217955282476e-05, + "loss": 0.7489, + "num_tokens": 197764641.0, + "step": 2096 + }, + { + "epoch": 0.3579109062980031, + "grad_norm": 0.5340769171192323, + "learning_rate": 2.5690390851681176e-05, + "loss": 0.5844, + "num_tokens": 197845332.0, + "step": 2097 + }, + { + "epoch": 0.3580815838880355, + "grad_norm": 0.5211830296062894, + "learning_rate": 2.568356374807988e-05, + "loss": 0.7011, + "num_tokens": 197952352.0, + "step": 2098 + }, + { + "epoch": 0.35825226147806793, + "grad_norm": 0.520787771963465, + "learning_rate": 2.5676736644478583e-05, + "loss": 0.6507, + "num_tokens": 198045180.0, + "step": 2099 + }, + { + "epoch": 0.35842293906810035, + "grad_norm": 0.535441568885237, + "learning_rate": 2.5669909540877287e-05, + "loss": 0.6275, + "num_tokens": 198123133.0, + "step": 2100 + }, + { + "epoch": 0.3585936166581328, + "grad_norm": 0.5046026939595865, + "learning_rate": 2.5663082437275984e-05, + "loss": 0.5878, + "num_tokens": 198203964.0, + "step": 2101 + }, + { + "epoch": 0.3587642942481652, + "grad_norm": 0.5217410625455451, + "learning_rate": 2.5656255333674688e-05, + "loss": 0.605, + "num_tokens": 198286084.0, + "step": 2102 + }, + { + "epoch": 0.35893497183819767, + "grad_norm": 0.5615094820129063, + "learning_rate": 2.5649428230073392e-05, + "loss": 0.5722, + "num_tokens": 198352796.0, + "step": 2103 + }, + { + "epoch": 0.3591056494282301, + "grad_norm": 0.4813820650903874, + "learning_rate": 2.5642601126472095e-05, + "loss": 0.5646, + "num_tokens": 198453224.0, + "step": 2104 + }, + { + "epoch": 0.3592763270182625, + "grad_norm": 0.5156569036408863, + "learning_rate": 2.56357740228708e-05, + "loss": 0.7282, + "num_tokens": 198563193.0, + "step": 2105 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 0.4967118068191588, + "learning_rate": 2.5628946919269503e-05, + "loss": 0.596, + "num_tokens": 198658527.0, + "step": 2106 + }, + { + "epoch": 0.35961768219832735, + "grad_norm": 0.5277302671010591, + "learning_rate": 2.5622119815668203e-05, + "loss": 0.6656, + "num_tokens": 198751358.0, + "step": 2107 + }, + { + "epoch": 0.35978835978835977, + "grad_norm": 0.5047592306800944, + "learning_rate": 2.5615292712066907e-05, + "loss": 0.5223, + "num_tokens": 198834077.0, + "step": 2108 + }, + { + "epoch": 0.35995903737839224, + "grad_norm": 0.5703568222144827, + "learning_rate": 2.560846560846561e-05, + "loss": 0.6013, + "num_tokens": 198927003.0, + "step": 2109 + }, + { + "epoch": 0.36012971496842466, + "grad_norm": 0.5674804410611268, + "learning_rate": 2.5601638504864315e-05, + "loss": 0.6325, + "num_tokens": 199006968.0, + "step": 2110 + }, + { + "epoch": 0.3603003925584571, + "grad_norm": 0.5308148368292055, + "learning_rate": 2.559481140126302e-05, + "loss": 0.683, + "num_tokens": 199097952.0, + "step": 2111 + }, + { + "epoch": 0.3604710701484895, + "grad_norm": 0.49863884709423634, + "learning_rate": 2.5587984297661722e-05, + "loss": 0.6413, + "num_tokens": 199196260.0, + "step": 2112 + }, + { + "epoch": 0.3606417477385219, + "grad_norm": 0.509344150857173, + "learning_rate": 2.558115719406042e-05, + "loss": 0.6807, + "num_tokens": 199298819.0, + "step": 2113 + }, + { + "epoch": 0.36081242532855434, + "grad_norm": 0.5417904852688515, + "learning_rate": 2.5574330090459123e-05, + "loss": 0.6023, + "num_tokens": 199376311.0, + "step": 2114 + }, + { + "epoch": 0.36098310291858676, + "grad_norm": 0.4926011859782852, + "learning_rate": 2.5567502986857827e-05, + "loss": 0.624, + "num_tokens": 199474057.0, + "step": 2115 + }, + { + "epoch": 0.36115378050861924, + "grad_norm": 0.4854151162057932, + "learning_rate": 2.556067588325653e-05, + "loss": 0.5768, + "num_tokens": 199571715.0, + "step": 2116 + }, + { + "epoch": 0.36132445809865166, + "grad_norm": 0.5637887058069204, + "learning_rate": 2.5553848779655234e-05, + "loss": 0.5641, + "num_tokens": 199633074.0, + "step": 2117 + }, + { + "epoch": 0.3614951356886841, + "grad_norm": 0.5247202179256627, + "learning_rate": 2.5547021676053935e-05, + "loss": 0.5356, + "num_tokens": 199711531.0, + "step": 2118 + }, + { + "epoch": 0.3616658132787165, + "grad_norm": 0.5267530410525614, + "learning_rate": 2.554019457245264e-05, + "loss": 0.5653, + "num_tokens": 199794600.0, + "step": 2119 + }, + { + "epoch": 0.3618364908687489, + "grad_norm": 0.5133251887939788, + "learning_rate": 2.5533367468851342e-05, + "loss": 0.6371, + "num_tokens": 199890508.0, + "step": 2120 + }, + { + "epoch": 0.36200716845878134, + "grad_norm": 0.5577368866644095, + "learning_rate": 2.5526540365250046e-05, + "loss": 0.5782, + "num_tokens": 199963544.0, + "step": 2121 + }, + { + "epoch": 0.3621778460488138, + "grad_norm": 0.498379243202252, + "learning_rate": 2.551971326164875e-05, + "loss": 0.6345, + "num_tokens": 200061825.0, + "step": 2122 + }, + { + "epoch": 0.36234852363884623, + "grad_norm": 0.5440032855011572, + "learning_rate": 2.5512886158047453e-05, + "loss": 0.5919, + "num_tokens": 200128521.0, + "step": 2123 + }, + { + "epoch": 0.36251920122887865, + "grad_norm": 0.4804495745074138, + "learning_rate": 2.5506059054446154e-05, + "loss": 0.5532, + "num_tokens": 200211553.0, + "step": 2124 + }, + { + "epoch": 0.36268987881891107, + "grad_norm": 0.5639072892477638, + "learning_rate": 2.5499231950844858e-05, + "loss": 0.6079, + "num_tokens": 200281097.0, + "step": 2125 + }, + { + "epoch": 0.3628605564089435, + "grad_norm": 0.5562027100584316, + "learning_rate": 2.5492404847243558e-05, + "loss": 0.5927, + "num_tokens": 200348442.0, + "step": 2126 + }, + { + "epoch": 0.3630312339989759, + "grad_norm": 0.5621631889625764, + "learning_rate": 2.5485577743642262e-05, + "loss": 0.6259, + "num_tokens": 200425632.0, + "step": 2127 + }, + { + "epoch": 0.3632019115890084, + "grad_norm": 0.47195319488016396, + "learning_rate": 2.5478750640040962e-05, + "loss": 0.5939, + "num_tokens": 200549813.0, + "step": 2128 + }, + { + "epoch": 0.3633725891790408, + "grad_norm": 0.4626633014973855, + "learning_rate": 2.5471923536439666e-05, + "loss": 0.586, + "num_tokens": 200658496.0, + "step": 2129 + }, + { + "epoch": 0.3635432667690732, + "grad_norm": 0.5028402082258967, + "learning_rate": 2.546509643283837e-05, + "loss": 0.6047, + "num_tokens": 200748832.0, + "step": 2130 + }, + { + "epoch": 0.36371394435910565, + "grad_norm": 0.51211947452596, + "learning_rate": 2.5458269329237073e-05, + "loss": 0.5522, + "num_tokens": 200845344.0, + "step": 2131 + }, + { + "epoch": 0.36388462194913807, + "grad_norm": 0.6261374584016216, + "learning_rate": 2.5451442225635777e-05, + "loss": 0.6914, + "num_tokens": 200917644.0, + "step": 2132 + }, + { + "epoch": 0.3640552995391705, + "grad_norm": 0.5144567432218283, + "learning_rate": 2.544461512203448e-05, + "loss": 0.6165, + "num_tokens": 201002976.0, + "step": 2133 + }, + { + "epoch": 0.36422597712920296, + "grad_norm": 0.5072144500371885, + "learning_rate": 2.543778801843318e-05, + "loss": 0.6516, + "num_tokens": 201116059.0, + "step": 2134 + }, + { + "epoch": 0.3643966547192354, + "grad_norm": 0.4755749250491814, + "learning_rate": 2.5430960914831885e-05, + "loss": 0.679, + "num_tokens": 201222233.0, + "step": 2135 + }, + { + "epoch": 0.3645673323092678, + "grad_norm": 0.48442210178620376, + "learning_rate": 2.542413381123059e-05, + "loss": 0.5746, + "num_tokens": 201311995.0, + "step": 2136 + }, + { + "epoch": 0.3647380098993002, + "grad_norm": 0.4473412228864404, + "learning_rate": 2.5417306707629293e-05, + "loss": 0.603, + "num_tokens": 201432133.0, + "step": 2137 + }, + { + "epoch": 0.36490868748933264, + "grad_norm": 0.4349525985678347, + "learning_rate": 2.541047960402799e-05, + "loss": 0.5181, + "num_tokens": 201545524.0, + "step": 2138 + }, + { + "epoch": 0.36507936507936506, + "grad_norm": 0.5640896031563569, + "learning_rate": 2.5403652500426693e-05, + "loss": 0.6612, + "num_tokens": 201621075.0, + "step": 2139 + }, + { + "epoch": 0.36525004266939753, + "grad_norm": 0.4901290522958736, + "learning_rate": 2.5396825396825397e-05, + "loss": 0.6862, + "num_tokens": 201721817.0, + "step": 2140 + }, + { + "epoch": 0.36542072025942995, + "grad_norm": 0.4555425457008379, + "learning_rate": 2.53899982932241e-05, + "loss": 0.5476, + "num_tokens": 201823744.0, + "step": 2141 + }, + { + "epoch": 0.3655913978494624, + "grad_norm": 0.5838687959267246, + "learning_rate": 2.5383171189622805e-05, + "loss": 0.7256, + "num_tokens": 201939243.0, + "step": 2142 + }, + { + "epoch": 0.3657620754394948, + "grad_norm": 0.5520156105447594, + "learning_rate": 2.537634408602151e-05, + "loss": 0.5483, + "num_tokens": 202024618.0, + "step": 2143 + }, + { + "epoch": 0.3659327530295272, + "grad_norm": 0.5241538928902035, + "learning_rate": 2.536951698242021e-05, + "loss": 0.6433, + "num_tokens": 202120080.0, + "step": 2144 + }, + { + "epoch": 0.36610343061955963, + "grad_norm": 0.4900672915906874, + "learning_rate": 2.5362689878818913e-05, + "loss": 0.636, + "num_tokens": 202215073.0, + "step": 2145 + }, + { + "epoch": 0.36627410820959205, + "grad_norm": 0.49630036537384176, + "learning_rate": 2.5355862775217616e-05, + "loss": 0.578, + "num_tokens": 202303559.0, + "step": 2146 + }, + { + "epoch": 0.36644478579962453, + "grad_norm": 0.4815116668922985, + "learning_rate": 2.534903567161632e-05, + "loss": 0.5827, + "num_tokens": 202402554.0, + "step": 2147 + }, + { + "epoch": 0.36661546338965695, + "grad_norm": 0.5134878785797574, + "learning_rate": 2.5342208568015024e-05, + "loss": 0.542, + "num_tokens": 202479114.0, + "step": 2148 + }, + { + "epoch": 0.36678614097968937, + "grad_norm": 0.48594195377218236, + "learning_rate": 2.5335381464413728e-05, + "loss": 0.6302, + "num_tokens": 202569077.0, + "step": 2149 + }, + { + "epoch": 0.3669568185697218, + "grad_norm": 0.5270678498789436, + "learning_rate": 2.5328554360812428e-05, + "loss": 0.6992, + "num_tokens": 202669222.0, + "step": 2150 + }, + { + "epoch": 0.3671274961597542, + "grad_norm": 0.48931732152455165, + "learning_rate": 2.5321727257211128e-05, + "loss": 0.5894, + "num_tokens": 202777957.0, + "step": 2151 + }, + { + "epoch": 0.3672981737497866, + "grad_norm": 0.5036365725366047, + "learning_rate": 2.5314900153609832e-05, + "loss": 0.552, + "num_tokens": 202894023.0, + "step": 2152 + }, + { + "epoch": 0.3674688513398191, + "grad_norm": 0.4538590119089527, + "learning_rate": 2.5308073050008536e-05, + "loss": 0.6023, + "num_tokens": 203011018.0, + "step": 2153 + }, + { + "epoch": 0.3676395289298515, + "grad_norm": 0.5275772239323716, + "learning_rate": 2.530124594640724e-05, + "loss": 0.6383, + "num_tokens": 203100479.0, + "step": 2154 + }, + { + "epoch": 0.36781020651988394, + "grad_norm": 0.4906530155085971, + "learning_rate": 2.529441884280594e-05, + "loss": 0.613, + "num_tokens": 203216841.0, + "step": 2155 + }, + { + "epoch": 0.36798088410991636, + "grad_norm": 0.5181664998580274, + "learning_rate": 2.5287591739204644e-05, + "loss": 0.6936, + "num_tokens": 203323262.0, + "step": 2156 + }, + { + "epoch": 0.3681515616999488, + "grad_norm": 0.5450301404654573, + "learning_rate": 2.5280764635603347e-05, + "loss": 0.5571, + "num_tokens": 203392008.0, + "step": 2157 + }, + { + "epoch": 0.3683222392899812, + "grad_norm": 0.4737497496090665, + "learning_rate": 2.527393753200205e-05, + "loss": 0.5648, + "num_tokens": 203486614.0, + "step": 2158 + }, + { + "epoch": 0.3684929168800137, + "grad_norm": 0.5416770755560676, + "learning_rate": 2.5267110428400755e-05, + "loss": 0.5843, + "num_tokens": 203564700.0, + "step": 2159 + }, + { + "epoch": 0.3686635944700461, + "grad_norm": 0.4798038407811237, + "learning_rate": 2.526028332479946e-05, + "loss": 0.6191, + "num_tokens": 203686103.0, + "step": 2160 + }, + { + "epoch": 0.3688342720600785, + "grad_norm": 0.5142864794975147, + "learning_rate": 2.525345622119816e-05, + "loss": 0.7211, + "num_tokens": 203795249.0, + "step": 2161 + }, + { + "epoch": 0.36900494965011094, + "grad_norm": 0.4682774859413058, + "learning_rate": 2.5246629117596863e-05, + "loss": 0.5379, + "num_tokens": 203882239.0, + "step": 2162 + }, + { + "epoch": 0.36917562724014336, + "grad_norm": 0.49730603038880045, + "learning_rate": 2.5239802013995563e-05, + "loss": 0.5716, + "num_tokens": 203969363.0, + "step": 2163 + }, + { + "epoch": 0.3693463048301758, + "grad_norm": 0.49435968513601425, + "learning_rate": 2.5232974910394267e-05, + "loss": 0.6496, + "num_tokens": 204078603.0, + "step": 2164 + }, + { + "epoch": 0.36951698242020825, + "grad_norm": 0.48563356419989684, + "learning_rate": 2.5226147806792967e-05, + "loss": 0.6061, + "num_tokens": 204183512.0, + "step": 2165 + }, + { + "epoch": 0.36968766001024067, + "grad_norm": 0.6743265303722185, + "learning_rate": 2.521932070319167e-05, + "loss": 0.458, + "num_tokens": 204275958.0, + "step": 2166 + }, + { + "epoch": 0.3698583376002731, + "grad_norm": 0.5015067215785812, + "learning_rate": 2.5212493599590375e-05, + "loss": 0.5902, + "num_tokens": 204374479.0, + "step": 2167 + }, + { + "epoch": 0.3700290151903055, + "grad_norm": 0.5647506131966961, + "learning_rate": 2.520566649598908e-05, + "loss": 0.676, + "num_tokens": 204459968.0, + "step": 2168 + }, + { + "epoch": 0.37019969278033793, + "grad_norm": 0.5106620763942856, + "learning_rate": 2.5198839392387782e-05, + "loss": 0.5921, + "num_tokens": 204552216.0, + "step": 2169 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.5087584064217722, + "learning_rate": 2.5192012288786486e-05, + "loss": 0.6792, + "num_tokens": 204658938.0, + "step": 2170 + }, + { + "epoch": 0.3705410479604028, + "grad_norm": 0.47053392089079127, + "learning_rate": 2.5185185185185187e-05, + "loss": 0.6521, + "num_tokens": 204764511.0, + "step": 2171 + }, + { + "epoch": 0.37071172555043524, + "grad_norm": 0.5000622509857107, + "learning_rate": 2.517835808158389e-05, + "loss": 0.6102, + "num_tokens": 204860229.0, + "step": 2172 + }, + { + "epoch": 0.37088240314046766, + "grad_norm": 0.5201307915686233, + "learning_rate": 2.5171530977982594e-05, + "loss": 0.6536, + "num_tokens": 204952076.0, + "step": 2173 + }, + { + "epoch": 0.3710530807305001, + "grad_norm": 0.5468825083893679, + "learning_rate": 2.5164703874381298e-05, + "loss": 0.6051, + "num_tokens": 205039522.0, + "step": 2174 + }, + { + "epoch": 0.3712237583205325, + "grad_norm": 0.47471000894454335, + "learning_rate": 2.5157876770779995e-05, + "loss": 0.54, + "num_tokens": 205148389.0, + "step": 2175 + }, + { + "epoch": 0.3713944359105649, + "grad_norm": 0.4802903838470032, + "learning_rate": 2.51510496671787e-05, + "loss": 0.5837, + "num_tokens": 205262818.0, + "step": 2176 + }, + { + "epoch": 0.3715651135005974, + "grad_norm": 0.46943871728196596, + "learning_rate": 2.5144222563577402e-05, + "loss": 0.6479, + "num_tokens": 205396548.0, + "step": 2177 + }, + { + "epoch": 0.3717357910906298, + "grad_norm": 0.546395910667123, + "learning_rate": 2.5137395459976106e-05, + "loss": 0.5842, + "num_tokens": 205489075.0, + "step": 2178 + }, + { + "epoch": 0.37190646868066224, + "grad_norm": 0.49919913775057834, + "learning_rate": 2.513056835637481e-05, + "loss": 0.5835, + "num_tokens": 205585046.0, + "step": 2179 + }, + { + "epoch": 0.37207714627069466, + "grad_norm": 0.45673300474574074, + "learning_rate": 2.5123741252773514e-05, + "loss": 0.5511, + "num_tokens": 205679125.0, + "step": 2180 + }, + { + "epoch": 0.3722478238607271, + "grad_norm": 0.4895892487875307, + "learning_rate": 2.5116914149172214e-05, + "loss": 0.58, + "num_tokens": 205776196.0, + "step": 2181 + }, + { + "epoch": 0.3724185014507595, + "grad_norm": 0.49616560675527227, + "learning_rate": 2.5110087045570918e-05, + "loss": 0.5584, + "num_tokens": 205865203.0, + "step": 2182 + }, + { + "epoch": 0.3725891790407919, + "grad_norm": 0.5315210182724598, + "learning_rate": 2.510325994196962e-05, + "loss": 0.6228, + "num_tokens": 205951589.0, + "step": 2183 + }, + { + "epoch": 0.3727598566308244, + "grad_norm": 0.5992902133081407, + "learning_rate": 2.5096432838368325e-05, + "loss": 0.6979, + "num_tokens": 206038580.0, + "step": 2184 + }, + { + "epoch": 0.3729305342208568, + "grad_norm": 0.5200186981777325, + "learning_rate": 2.508960573476703e-05, + "loss": 0.5951, + "num_tokens": 206122574.0, + "step": 2185 + }, + { + "epoch": 0.37310121181088923, + "grad_norm": 0.5511099710262809, + "learning_rate": 2.5082778631165733e-05, + "loss": 0.6956, + "num_tokens": 206228357.0, + "step": 2186 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 0.5786201393299616, + "learning_rate": 2.5075951527564433e-05, + "loss": 0.555, + "num_tokens": 206295160.0, + "step": 2187 + }, + { + "epoch": 0.3734425669909541, + "grad_norm": 0.5216168565313357, + "learning_rate": 2.5069124423963134e-05, + "loss": 0.5765, + "num_tokens": 206374644.0, + "step": 2188 + }, + { + "epoch": 0.3736132445809865, + "grad_norm": 0.4846179891763666, + "learning_rate": 2.5062297320361837e-05, + "loss": 0.5408, + "num_tokens": 206461586.0, + "step": 2189 + }, + { + "epoch": 0.37378392217101897, + "grad_norm": 0.4858060867496365, + "learning_rate": 2.505547021676054e-05, + "loss": 0.6087, + "num_tokens": 206560226.0, + "step": 2190 + }, + { + "epoch": 0.3739545997610514, + "grad_norm": 0.5108078811471278, + "learning_rate": 2.5048643113159245e-05, + "loss": 0.6249, + "num_tokens": 206650838.0, + "step": 2191 + }, + { + "epoch": 0.3741252773510838, + "grad_norm": 0.5083680259248301, + "learning_rate": 2.5041816009557945e-05, + "loss": 0.6415, + "num_tokens": 206744997.0, + "step": 2192 + }, + { + "epoch": 0.3742959549411162, + "grad_norm": 0.5485162771274682, + "learning_rate": 2.503498890595665e-05, + "loss": 0.5552, + "num_tokens": 206813930.0, + "step": 2193 + }, + { + "epoch": 0.37446663253114865, + "grad_norm": 0.5553339866446881, + "learning_rate": 2.5028161802355353e-05, + "loss": 0.5641, + "num_tokens": 206882845.0, + "step": 2194 + }, + { + "epoch": 0.37463731012118107, + "grad_norm": 0.5629611265467421, + "learning_rate": 2.5021334698754057e-05, + "loss": 0.6326, + "num_tokens": 206974554.0, + "step": 2195 + }, + { + "epoch": 0.37480798771121354, + "grad_norm": 0.44596276720850586, + "learning_rate": 2.501450759515276e-05, + "loss": 0.5492, + "num_tokens": 207096066.0, + "step": 2196 + }, + { + "epoch": 0.37497866530124596, + "grad_norm": 0.5026456149375772, + "learning_rate": 2.5007680491551464e-05, + "loss": 0.5995, + "num_tokens": 207184362.0, + "step": 2197 + }, + { + "epoch": 0.3751493428912784, + "grad_norm": 0.51234910337486, + "learning_rate": 2.5000853387950164e-05, + "loss": 0.6794, + "num_tokens": 207273794.0, + "step": 2198 + }, + { + "epoch": 0.3753200204813108, + "grad_norm": 0.47111656426449977, + "learning_rate": 2.4994026284348868e-05, + "loss": 0.5755, + "num_tokens": 207363287.0, + "step": 2199 + }, + { + "epoch": 0.3754906980713432, + "grad_norm": 0.48774236927144493, + "learning_rate": 2.498719918074757e-05, + "loss": 0.6086, + "num_tokens": 207462334.0, + "step": 2200 + }, + { + "epoch": 0.37566137566137564, + "grad_norm": 0.5627676232434694, + "learning_rate": 2.4980372077146272e-05, + "loss": 0.6992, + "num_tokens": 207539530.0, + "step": 2201 + }, + { + "epoch": 0.3758320532514081, + "grad_norm": 0.5076910461948151, + "learning_rate": 2.4973544973544973e-05, + "loss": 0.5052, + "num_tokens": 207614586.0, + "step": 2202 + }, + { + "epoch": 0.37600273084144054, + "grad_norm": 0.5003767494757384, + "learning_rate": 2.4966717869943677e-05, + "loss": 0.6437, + "num_tokens": 207712239.0, + "step": 2203 + }, + { + "epoch": 0.37617340843147296, + "grad_norm": 0.47796320845955104, + "learning_rate": 2.495989076634238e-05, + "loss": 0.556, + "num_tokens": 207816586.0, + "step": 2204 + }, + { + "epoch": 0.3763440860215054, + "grad_norm": 0.5277457582205102, + "learning_rate": 2.4953063662741084e-05, + "loss": 0.6163, + "num_tokens": 207901991.0, + "step": 2205 + }, + { + "epoch": 0.3765147636115378, + "grad_norm": 0.4981931351067329, + "learning_rate": 2.4946236559139788e-05, + "loss": 0.7048, + "num_tokens": 208012287.0, + "step": 2206 + }, + { + "epoch": 0.3766854412015702, + "grad_norm": 0.5606426192263063, + "learning_rate": 2.493940945553849e-05, + "loss": 0.6036, + "num_tokens": 208081218.0, + "step": 2207 + }, + { + "epoch": 0.3768561187916027, + "grad_norm": 0.5776305688872656, + "learning_rate": 2.4932582351937192e-05, + "loss": 0.5901, + "num_tokens": 208150998.0, + "step": 2208 + }, + { + "epoch": 0.3770267963816351, + "grad_norm": 0.5267997442772258, + "learning_rate": 2.4925755248335896e-05, + "loss": 0.7056, + "num_tokens": 208248761.0, + "step": 2209 + }, + { + "epoch": 0.37719747397166753, + "grad_norm": 0.4711932376018343, + "learning_rate": 2.49189281447346e-05, + "loss": 0.5487, + "num_tokens": 208348163.0, + "step": 2210 + }, + { + "epoch": 0.37736815156169995, + "grad_norm": 0.5462963440251482, + "learning_rate": 2.4912101041133303e-05, + "loss": 0.6103, + "num_tokens": 208435734.0, + "step": 2211 + }, + { + "epoch": 0.37753882915173237, + "grad_norm": 0.49231741481870267, + "learning_rate": 2.4905273937532007e-05, + "loss": 0.604, + "num_tokens": 208533232.0, + "step": 2212 + }, + { + "epoch": 0.3777095067417648, + "grad_norm": 0.4773116208153891, + "learning_rate": 2.4898446833930704e-05, + "loss": 0.6063, + "num_tokens": 208623347.0, + "step": 2213 + }, + { + "epoch": 0.3778801843317972, + "grad_norm": 0.4867860894506644, + "learning_rate": 2.4891619730329408e-05, + "loss": 0.6378, + "num_tokens": 208732855.0, + "step": 2214 + }, + { + "epoch": 0.3780508619218297, + "grad_norm": 0.49511846679846727, + "learning_rate": 2.488479262672811e-05, + "loss": 0.5793, + "num_tokens": 208826739.0, + "step": 2215 + }, + { + "epoch": 0.3782215395118621, + "grad_norm": 0.5680713110332243, + "learning_rate": 2.4877965523126815e-05, + "loss": 0.6219, + "num_tokens": 208928078.0, + "step": 2216 + }, + { + "epoch": 0.3783922171018945, + "grad_norm": 0.48627865462517955, + "learning_rate": 2.487113841952552e-05, + "loss": 0.6072, + "num_tokens": 209032999.0, + "step": 2217 + }, + { + "epoch": 0.37856289469192694, + "grad_norm": 0.5547474902712144, + "learning_rate": 2.486431131592422e-05, + "loss": 0.6802, + "num_tokens": 209111615.0, + "step": 2218 + }, + { + "epoch": 0.37873357228195936, + "grad_norm": 0.5172616377117005, + "learning_rate": 2.4857484212322923e-05, + "loss": 0.5023, + "num_tokens": 209178427.0, + "step": 2219 + }, + { + "epoch": 0.3789042498719918, + "grad_norm": 0.46504158875318513, + "learning_rate": 2.4850657108721627e-05, + "loss": 0.6332, + "num_tokens": 209300643.0, + "step": 2220 + }, + { + "epoch": 0.37907492746202426, + "grad_norm": 0.4477236535634431, + "learning_rate": 2.484383000512033e-05, + "loss": 0.4957, + "num_tokens": 209402228.0, + "step": 2221 + }, + { + "epoch": 0.3792456050520567, + "grad_norm": 0.4804598856307217, + "learning_rate": 2.4837002901519034e-05, + "loss": 0.6186, + "num_tokens": 209509608.0, + "step": 2222 + }, + { + "epoch": 0.3794162826420891, + "grad_norm": 0.5511649955259911, + "learning_rate": 2.4830175797917738e-05, + "loss": 0.6755, + "num_tokens": 209598188.0, + "step": 2223 + }, + { + "epoch": 0.3795869602321215, + "grad_norm": 0.5381740739728642, + "learning_rate": 2.4823348694316442e-05, + "loss": 0.614, + "num_tokens": 209677530.0, + "step": 2224 + }, + { + "epoch": 0.37975763782215394, + "grad_norm": 0.5318171123349746, + "learning_rate": 2.481652159071514e-05, + "loss": 0.6261, + "num_tokens": 209758366.0, + "step": 2225 + }, + { + "epoch": 0.37992831541218636, + "grad_norm": 0.5009440889062267, + "learning_rate": 2.4809694487113843e-05, + "loss": 0.5887, + "num_tokens": 209844484.0, + "step": 2226 + }, + { + "epoch": 0.38009899300221883, + "grad_norm": 0.5146779918174653, + "learning_rate": 2.4802867383512547e-05, + "loss": 0.5316, + "num_tokens": 209932276.0, + "step": 2227 + }, + { + "epoch": 0.38026967059225125, + "grad_norm": 0.4899092762395234, + "learning_rate": 2.479604027991125e-05, + "loss": 0.5465, + "num_tokens": 210016381.0, + "step": 2228 + }, + { + "epoch": 0.38044034818228367, + "grad_norm": 0.5146560833671472, + "learning_rate": 2.478921317630995e-05, + "loss": 0.6461, + "num_tokens": 210104991.0, + "step": 2229 + }, + { + "epoch": 0.3806110257723161, + "grad_norm": 0.4725983314144366, + "learning_rate": 2.4782386072708654e-05, + "loss": 0.5925, + "num_tokens": 210204967.0, + "step": 2230 + }, + { + "epoch": 0.3807817033623485, + "grad_norm": 0.5192639139389731, + "learning_rate": 2.4775558969107358e-05, + "loss": 0.5655, + "num_tokens": 210276589.0, + "step": 2231 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.5227151914833778, + "learning_rate": 2.4768731865506062e-05, + "loss": 0.5982, + "num_tokens": 210354878.0, + "step": 2232 + }, + { + "epoch": 0.3811230585424134, + "grad_norm": 0.5162548977100955, + "learning_rate": 2.4761904761904766e-05, + "loss": 0.5867, + "num_tokens": 210435533.0, + "step": 2233 + }, + { + "epoch": 0.3812937361324458, + "grad_norm": 0.4886404514584747, + "learning_rate": 2.475507765830347e-05, + "loss": 0.6311, + "num_tokens": 210539613.0, + "step": 2234 + }, + { + "epoch": 0.38146441372247825, + "grad_norm": 0.5746338244993011, + "learning_rate": 2.474825055470217e-05, + "loss": 0.6473, + "num_tokens": 210642102.0, + "step": 2235 + }, + { + "epoch": 0.38163509131251067, + "grad_norm": 0.5082037049380634, + "learning_rate": 2.4741423451100874e-05, + "loss": 0.6297, + "num_tokens": 210738390.0, + "step": 2236 + }, + { + "epoch": 0.3818057689025431, + "grad_norm": 0.5143933765220869, + "learning_rate": 2.4734596347499574e-05, + "loss": 0.6198, + "num_tokens": 210842903.0, + "step": 2237 + }, + { + "epoch": 0.3819764464925755, + "grad_norm": 0.4704258673813792, + "learning_rate": 2.4727769243898278e-05, + "loss": 0.6636, + "num_tokens": 210954390.0, + "step": 2238 + }, + { + "epoch": 0.382147124082608, + "grad_norm": 0.5604098404842648, + "learning_rate": 2.4720942140296978e-05, + "loss": 0.6487, + "num_tokens": 211028395.0, + "step": 2239 + }, + { + "epoch": 0.3823178016726404, + "grad_norm": 0.5811881839111597, + "learning_rate": 2.4714115036695682e-05, + "loss": 0.5619, + "num_tokens": 211090758.0, + "step": 2240 + }, + { + "epoch": 0.3824884792626728, + "grad_norm": 0.4602173810196816, + "learning_rate": 2.4707287933094386e-05, + "loss": 0.6254, + "num_tokens": 211212080.0, + "step": 2241 + }, + { + "epoch": 0.38265915685270524, + "grad_norm": 0.454945876096637, + "learning_rate": 2.470046082949309e-05, + "loss": 0.5127, + "num_tokens": 211309346.0, + "step": 2242 + }, + { + "epoch": 0.38282983444273766, + "grad_norm": 0.4860453344239281, + "learning_rate": 2.4693633725891793e-05, + "loss": 0.6221, + "num_tokens": 211416539.0, + "step": 2243 + }, + { + "epoch": 0.3830005120327701, + "grad_norm": 0.5025641045607832, + "learning_rate": 2.4686806622290497e-05, + "loss": 0.5453, + "num_tokens": 211496177.0, + "step": 2244 + }, + { + "epoch": 0.3831711896228025, + "grad_norm": 0.4308234889223253, + "learning_rate": 2.4679979518689197e-05, + "loss": 0.5855, + "num_tokens": 211618522.0, + "step": 2245 + }, + { + "epoch": 0.383341867212835, + "grad_norm": 0.4837403627892437, + "learning_rate": 2.46731524150879e-05, + "loss": 0.597, + "num_tokens": 211719405.0, + "step": 2246 + }, + { + "epoch": 0.3835125448028674, + "grad_norm": 0.4966736987089872, + "learning_rate": 2.4666325311486605e-05, + "loss": 0.6358, + "num_tokens": 211815579.0, + "step": 2247 + }, + { + "epoch": 0.3836832223928998, + "grad_norm": 0.544419689268734, + "learning_rate": 2.465949820788531e-05, + "loss": 0.667, + "num_tokens": 211898107.0, + "step": 2248 + }, + { + "epoch": 0.38385389998293223, + "grad_norm": 0.5183664714860735, + "learning_rate": 2.4652671104284012e-05, + "loss": 0.6837, + "num_tokens": 211999260.0, + "step": 2249 + }, + { + "epoch": 0.38402457757296465, + "grad_norm": 0.5386636278331157, + "learning_rate": 2.464584400068271e-05, + "loss": 0.6722, + "num_tokens": 212089774.0, + "step": 2250 + }, + { + "epoch": 0.3841952551629971, + "grad_norm": 0.5032535945796976, + "learning_rate": 2.4639016897081413e-05, + "loss": 0.692, + "num_tokens": 212192162.0, + "step": 2251 + }, + { + "epoch": 0.38436593275302955, + "grad_norm": 0.5400079637273572, + "learning_rate": 2.4632189793480117e-05, + "loss": 0.631, + "num_tokens": 212267825.0, + "step": 2252 + }, + { + "epoch": 0.38453661034306197, + "grad_norm": 0.4732898905362462, + "learning_rate": 2.462536268987882e-05, + "loss": 0.5764, + "num_tokens": 212362700.0, + "step": 2253 + }, + { + "epoch": 0.3847072879330944, + "grad_norm": 0.5486535296198755, + "learning_rate": 2.4618535586277524e-05, + "loss": 0.6939, + "num_tokens": 212445950.0, + "step": 2254 + }, + { + "epoch": 0.3848779655231268, + "grad_norm": 0.4756247125567084, + "learning_rate": 2.4611708482676228e-05, + "loss": 0.5818, + "num_tokens": 212548780.0, + "step": 2255 + }, + { + "epoch": 0.38504864311315923, + "grad_norm": 0.5390679380344604, + "learning_rate": 2.460488137907493e-05, + "loss": 0.6721, + "num_tokens": 212629558.0, + "step": 2256 + }, + { + "epoch": 0.38521932070319165, + "grad_norm": 0.47647293176486455, + "learning_rate": 2.4598054275473632e-05, + "loss": 0.5485, + "num_tokens": 212719202.0, + "step": 2257 + }, + { + "epoch": 0.3853899982932241, + "grad_norm": 0.47099850734202336, + "learning_rate": 2.4591227171872336e-05, + "loss": 0.6231, + "num_tokens": 212833253.0, + "step": 2258 + }, + { + "epoch": 0.38556067588325654, + "grad_norm": 0.46399822192899687, + "learning_rate": 2.458440006827104e-05, + "loss": 0.6393, + "num_tokens": 212953324.0, + "step": 2259 + }, + { + "epoch": 0.38573135347328896, + "grad_norm": 0.5510509372540577, + "learning_rate": 2.4577572964669744e-05, + "loss": 0.5974, + "num_tokens": 213035779.0, + "step": 2260 + }, + { + "epoch": 0.3859020310633214, + "grad_norm": 0.5269803875470527, + "learning_rate": 2.4570745861068447e-05, + "loss": 0.5769, + "num_tokens": 213115252.0, + "step": 2261 + }, + { + "epoch": 0.3860727086533538, + "grad_norm": 0.5174093565386114, + "learning_rate": 2.4563918757467144e-05, + "loss": 0.6668, + "num_tokens": 213213249.0, + "step": 2262 + }, + { + "epoch": 0.3862433862433862, + "grad_norm": 0.5167648911080228, + "learning_rate": 2.4557091653865848e-05, + "loss": 0.6932, + "num_tokens": 213321298.0, + "step": 2263 + }, + { + "epoch": 0.3864140638334187, + "grad_norm": 0.6641506584861525, + "learning_rate": 2.4550264550264552e-05, + "loss": 0.6193, + "num_tokens": 213408868.0, + "step": 2264 + }, + { + "epoch": 0.3865847414234511, + "grad_norm": 0.4634711328227321, + "learning_rate": 2.4543437446663256e-05, + "loss": 0.5869, + "num_tokens": 213516865.0, + "step": 2265 + }, + { + "epoch": 0.38675541901348354, + "grad_norm": 0.5024339285320417, + "learning_rate": 2.4536610343061956e-05, + "loss": 0.6476, + "num_tokens": 213613533.0, + "step": 2266 + }, + { + "epoch": 0.38692609660351596, + "grad_norm": 0.8292755299554095, + "learning_rate": 2.452978323946066e-05, + "loss": 0.6714, + "num_tokens": 213694452.0, + "step": 2267 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.4816080661977116, + "learning_rate": 2.4522956135859364e-05, + "loss": 0.6258, + "num_tokens": 213800420.0, + "step": 2268 + }, + { + "epoch": 0.3872674517835808, + "grad_norm": 0.49019000391049716, + "learning_rate": 2.4516129032258067e-05, + "loss": 0.5882, + "num_tokens": 213887646.0, + "step": 2269 + }, + { + "epoch": 0.38743812937361327, + "grad_norm": 0.481881527908397, + "learning_rate": 2.450930192865677e-05, + "loss": 0.6426, + "num_tokens": 213990047.0, + "step": 2270 + }, + { + "epoch": 0.3876088069636457, + "grad_norm": 0.49932581364452533, + "learning_rate": 2.4502474825055475e-05, + "loss": 0.5185, + "num_tokens": 214073420.0, + "step": 2271 + }, + { + "epoch": 0.3877794845536781, + "grad_norm": 0.5622899721188643, + "learning_rate": 2.4495647721454175e-05, + "loss": 0.6481, + "num_tokens": 214155199.0, + "step": 2272 + }, + { + "epoch": 0.38795016214371053, + "grad_norm": 0.5554369807013596, + "learning_rate": 2.448882061785288e-05, + "loss": 0.6796, + "num_tokens": 214255521.0, + "step": 2273 + }, + { + "epoch": 0.38812083973374295, + "grad_norm": 0.5136688780752265, + "learning_rate": 2.448199351425158e-05, + "loss": 0.5777, + "num_tokens": 214333594.0, + "step": 2274 + }, + { + "epoch": 0.38829151732377537, + "grad_norm": 0.46503265627801016, + "learning_rate": 2.4475166410650283e-05, + "loss": 0.5159, + "num_tokens": 214423849.0, + "step": 2275 + }, + { + "epoch": 0.3884621949138078, + "grad_norm": 0.5482201321633803, + "learning_rate": 2.4468339307048983e-05, + "loss": 0.5612, + "num_tokens": 214490397.0, + "step": 2276 + }, + { + "epoch": 0.38863287250384027, + "grad_norm": 0.5171428493142186, + "learning_rate": 2.4461512203447687e-05, + "loss": 0.6113, + "num_tokens": 214580235.0, + "step": 2277 + }, + { + "epoch": 0.3888035500938727, + "grad_norm": 0.4588844618726672, + "learning_rate": 2.445468509984639e-05, + "loss": 0.5962, + "num_tokens": 214701281.0, + "step": 2278 + }, + { + "epoch": 0.3889742276839051, + "grad_norm": 0.5130364264557178, + "learning_rate": 2.4447857996245095e-05, + "loss": 0.6144, + "num_tokens": 214782083.0, + "step": 2279 + }, + { + "epoch": 0.3891449052739375, + "grad_norm": 0.48789758963631613, + "learning_rate": 2.44410308926438e-05, + "loss": 0.7143, + "num_tokens": 214906548.0, + "step": 2280 + }, + { + "epoch": 0.38931558286396994, + "grad_norm": 0.5037139082356947, + "learning_rate": 2.4434203789042502e-05, + "loss": 0.5453, + "num_tokens": 215002869.0, + "step": 2281 + }, + { + "epoch": 0.38948626045400236, + "grad_norm": 0.5141485109322536, + "learning_rate": 2.4427376685441203e-05, + "loss": 0.7401, + "num_tokens": 215096685.0, + "step": 2282 + }, + { + "epoch": 0.38965693804403484, + "grad_norm": 0.4870337420148902, + "learning_rate": 2.4420549581839906e-05, + "loss": 0.5737, + "num_tokens": 215186777.0, + "step": 2283 + }, + { + "epoch": 0.38982761563406726, + "grad_norm": 0.4730650815651334, + "learning_rate": 2.441372247823861e-05, + "loss": 0.5876, + "num_tokens": 215284136.0, + "step": 2284 + }, + { + "epoch": 0.3899982932240997, + "grad_norm": 0.6651115384010468, + "learning_rate": 2.4406895374637314e-05, + "loss": 0.7528, + "num_tokens": 215387777.0, + "step": 2285 + }, + { + "epoch": 0.3901689708141321, + "grad_norm": 0.48102904555202314, + "learning_rate": 2.4400068271036018e-05, + "loss": 0.5774, + "num_tokens": 215478783.0, + "step": 2286 + }, + { + "epoch": 0.3903396484041645, + "grad_norm": 0.5114576406349645, + "learning_rate": 2.4393241167434715e-05, + "loss": 0.5534, + "num_tokens": 215587313.0, + "step": 2287 + }, + { + "epoch": 0.39051032599419694, + "grad_norm": 0.4573035386680115, + "learning_rate": 2.438641406383342e-05, + "loss": 0.6151, + "num_tokens": 215700525.0, + "step": 2288 + }, + { + "epoch": 0.3906810035842294, + "grad_norm": 0.43218440945781206, + "learning_rate": 2.4379586960232122e-05, + "loss": 0.5844, + "num_tokens": 215820084.0, + "step": 2289 + }, + { + "epoch": 0.39085168117426183, + "grad_norm": 0.4596394028665521, + "learning_rate": 2.4372759856630826e-05, + "loss": 0.5876, + "num_tokens": 215937993.0, + "step": 2290 + }, + { + "epoch": 0.39102235876429425, + "grad_norm": 0.5159870834352941, + "learning_rate": 2.436593275302953e-05, + "loss": 0.5776, + "num_tokens": 216014792.0, + "step": 2291 + }, + { + "epoch": 0.3911930363543267, + "grad_norm": 0.5209175275230461, + "learning_rate": 2.4359105649428234e-05, + "loss": 0.5665, + "num_tokens": 216090978.0, + "step": 2292 + }, + { + "epoch": 0.3913637139443591, + "grad_norm": 0.467625553479465, + "learning_rate": 2.4352278545826934e-05, + "loss": 0.6238, + "num_tokens": 216202052.0, + "step": 2293 + }, + { + "epoch": 0.3915343915343915, + "grad_norm": 0.5205690348617306, + "learning_rate": 2.4345451442225638e-05, + "loss": 0.585, + "num_tokens": 216283423.0, + "step": 2294 + }, + { + "epoch": 0.391705069124424, + "grad_norm": 0.5324064491616337, + "learning_rate": 2.433862433862434e-05, + "loss": 0.703, + "num_tokens": 216384113.0, + "step": 2295 + }, + { + "epoch": 0.3918757467144564, + "grad_norm": 0.4143767054971347, + "learning_rate": 2.4331797235023045e-05, + "loss": 0.5389, + "num_tokens": 216502345.0, + "step": 2296 + }, + { + "epoch": 0.3920464243044888, + "grad_norm": 0.4543370106769294, + "learning_rate": 2.432497013142175e-05, + "loss": 0.5754, + "num_tokens": 216604111.0, + "step": 2297 + }, + { + "epoch": 0.39221710189452125, + "grad_norm": 0.5252777270265523, + "learning_rate": 2.4318143027820453e-05, + "loss": 0.6452, + "num_tokens": 216687688.0, + "step": 2298 + }, + { + "epoch": 0.39238777948455367, + "grad_norm": 0.6387850887843657, + "learning_rate": 2.431131592421915e-05, + "loss": 0.6915, + "num_tokens": 216749142.0, + "step": 2299 + }, + { + "epoch": 0.3925584570745861, + "grad_norm": 0.5115380550218598, + "learning_rate": 2.4304488820617853e-05, + "loss": 0.6005, + "num_tokens": 216839291.0, + "step": 2300 + }, + { + "epoch": 0.39272913466461856, + "grad_norm": 0.4745447822057751, + "learning_rate": 2.4297661717016557e-05, + "loss": 0.5944, + "num_tokens": 216930324.0, + "step": 2301 + }, + { + "epoch": 0.392899812254651, + "grad_norm": 0.4972397891076115, + "learning_rate": 2.429083461341526e-05, + "loss": 0.6244, + "num_tokens": 217033691.0, + "step": 2302 + }, + { + "epoch": 0.3930704898446834, + "grad_norm": 0.4842003981105669, + "learning_rate": 2.428400750981396e-05, + "loss": 0.5336, + "num_tokens": 217131729.0, + "step": 2303 + }, + { + "epoch": 0.3932411674347158, + "grad_norm": 0.49144941846692886, + "learning_rate": 2.4277180406212665e-05, + "loss": 0.6132, + "num_tokens": 217224553.0, + "step": 2304 + }, + { + "epoch": 0.39341184502474824, + "grad_norm": 0.4397083396401116, + "learning_rate": 2.427035330261137e-05, + "loss": 0.5359, + "num_tokens": 217323036.0, + "step": 2305 + }, + { + "epoch": 0.39358252261478066, + "grad_norm": 0.5585310814597336, + "learning_rate": 2.4263526199010073e-05, + "loss": 0.6494, + "num_tokens": 217415709.0, + "step": 2306 + }, + { + "epoch": 0.3937532002048131, + "grad_norm": 0.5579155676702687, + "learning_rate": 2.4256699095408776e-05, + "loss": 0.5672, + "num_tokens": 217479464.0, + "step": 2307 + }, + { + "epoch": 0.39392387779484556, + "grad_norm": 0.4974477847418061, + "learning_rate": 2.424987199180748e-05, + "loss": 0.6113, + "num_tokens": 217591733.0, + "step": 2308 + }, + { + "epoch": 0.394094555384878, + "grad_norm": 0.4558657497302916, + "learning_rate": 2.424304488820618e-05, + "loss": 0.5437, + "num_tokens": 217688759.0, + "step": 2309 + }, + { + "epoch": 0.3942652329749104, + "grad_norm": 0.4694586579607243, + "learning_rate": 2.4236217784604884e-05, + "loss": 0.7115, + "num_tokens": 217814534.0, + "step": 2310 + }, + { + "epoch": 0.3944359105649428, + "grad_norm": 0.46086899942149856, + "learning_rate": 2.4229390681003588e-05, + "loss": 0.5685, + "num_tokens": 217923522.0, + "step": 2311 + }, + { + "epoch": 0.39460658815497524, + "grad_norm": 0.46478969650002044, + "learning_rate": 2.422256357740229e-05, + "loss": 0.6192, + "num_tokens": 218031935.0, + "step": 2312 + }, + { + "epoch": 0.39477726574500765, + "grad_norm": 0.5319914130393121, + "learning_rate": 2.421573647380099e-05, + "loss": 0.6238, + "num_tokens": 218114026.0, + "step": 2313 + }, + { + "epoch": 0.39494794333504013, + "grad_norm": 0.5779723480440926, + "learning_rate": 2.4208909370199693e-05, + "loss": 0.7279, + "num_tokens": 218212284.0, + "step": 2314 + }, + { + "epoch": 0.39511862092507255, + "grad_norm": 0.5762672420114009, + "learning_rate": 2.4202082266598396e-05, + "loss": 0.6496, + "num_tokens": 218278560.0, + "step": 2315 + }, + { + "epoch": 0.39528929851510497, + "grad_norm": 0.48883042399349996, + "learning_rate": 2.41952551629971e-05, + "loss": 0.7073, + "num_tokens": 218413619.0, + "step": 2316 + }, + { + "epoch": 0.3954599761051374, + "grad_norm": 0.5586017176607562, + "learning_rate": 2.4188428059395804e-05, + "loss": 0.6013, + "num_tokens": 218482865.0, + "step": 2317 + }, + { + "epoch": 0.3956306536951698, + "grad_norm": 0.5512211886992747, + "learning_rate": 2.4181600955794508e-05, + "loss": 0.6403, + "num_tokens": 218557773.0, + "step": 2318 + }, + { + "epoch": 0.39580133128520223, + "grad_norm": 0.4852230122561447, + "learning_rate": 2.4174773852193208e-05, + "loss": 0.5583, + "num_tokens": 218658094.0, + "step": 2319 + }, + { + "epoch": 0.3959720088752347, + "grad_norm": 0.4754562661645349, + "learning_rate": 2.4167946748591912e-05, + "loss": 0.5772, + "num_tokens": 218754065.0, + "step": 2320 + }, + { + "epoch": 0.3961426864652671, + "grad_norm": 0.5080498344507179, + "learning_rate": 2.4161119644990616e-05, + "loss": 0.6103, + "num_tokens": 218850353.0, + "step": 2321 + }, + { + "epoch": 0.39631336405529954, + "grad_norm": 0.4908565468542026, + "learning_rate": 2.415429254138932e-05, + "loss": 0.6196, + "num_tokens": 218958224.0, + "step": 2322 + }, + { + "epoch": 0.39648404164533196, + "grad_norm": 0.6296764515878164, + "learning_rate": 2.4147465437788023e-05, + "loss": 0.511, + "num_tokens": 219025568.0, + "step": 2323 + }, + { + "epoch": 0.3966547192353644, + "grad_norm": 0.49499959905163043, + "learning_rate": 2.414063833418672e-05, + "loss": 0.6086, + "num_tokens": 219122449.0, + "step": 2324 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 0.48240540296678136, + "learning_rate": 2.4133811230585424e-05, + "loss": 0.7033, + "num_tokens": 219242171.0, + "step": 2325 + }, + { + "epoch": 0.3969960744154293, + "grad_norm": 0.5098217900130855, + "learning_rate": 2.4126984126984128e-05, + "loss": 0.5922, + "num_tokens": 219326718.0, + "step": 2326 + }, + { + "epoch": 0.3971667520054617, + "grad_norm": 0.5108669573717571, + "learning_rate": 2.412015702338283e-05, + "loss": 0.6173, + "num_tokens": 219424305.0, + "step": 2327 + }, + { + "epoch": 0.3973374295954941, + "grad_norm": 0.4517055864832452, + "learning_rate": 2.4113329919781535e-05, + "loss": 0.5856, + "num_tokens": 219530723.0, + "step": 2328 + }, + { + "epoch": 0.39750810718552654, + "grad_norm": 0.5061113587740391, + "learning_rate": 2.410650281618024e-05, + "loss": 0.6616, + "num_tokens": 219629474.0, + "step": 2329 + }, + { + "epoch": 0.39767878477555896, + "grad_norm": 0.552531277615828, + "learning_rate": 2.409967571257894e-05, + "loss": 0.6024, + "num_tokens": 219703628.0, + "step": 2330 + }, + { + "epoch": 0.3978494623655914, + "grad_norm": 0.5264662392873181, + "learning_rate": 2.4092848608977643e-05, + "loss": 0.6131, + "num_tokens": 219776280.0, + "step": 2331 + }, + { + "epoch": 0.39802013995562385, + "grad_norm": 0.4996383012740999, + "learning_rate": 2.4086021505376347e-05, + "loss": 0.5061, + "num_tokens": 219850184.0, + "step": 2332 + }, + { + "epoch": 0.3981908175456563, + "grad_norm": 0.5304735562715119, + "learning_rate": 2.407919440177505e-05, + "loss": 0.611, + "num_tokens": 219954568.0, + "step": 2333 + }, + { + "epoch": 0.3983614951356887, + "grad_norm": 0.5741372854651873, + "learning_rate": 2.4072367298173754e-05, + "loss": 0.6273, + "num_tokens": 220031153.0, + "step": 2334 + }, + { + "epoch": 0.3985321727257211, + "grad_norm": 0.5139152640884838, + "learning_rate": 2.4065540194572458e-05, + "loss": 0.5774, + "num_tokens": 220122702.0, + "step": 2335 + }, + { + "epoch": 0.39870285031575353, + "grad_norm": 0.5070263410316677, + "learning_rate": 2.4058713090971155e-05, + "loss": 0.5575, + "num_tokens": 220208307.0, + "step": 2336 + }, + { + "epoch": 0.39887352790578595, + "grad_norm": 0.4766658312740394, + "learning_rate": 2.405188598736986e-05, + "loss": 0.5889, + "num_tokens": 220309717.0, + "step": 2337 + }, + { + "epoch": 0.39904420549581837, + "grad_norm": 0.5243311512465705, + "learning_rate": 2.4045058883768563e-05, + "loss": 0.5755, + "num_tokens": 220385919.0, + "step": 2338 + }, + { + "epoch": 0.39921488308585085, + "grad_norm": 0.4815921624764374, + "learning_rate": 2.4038231780167266e-05, + "loss": 0.5515, + "num_tokens": 220485031.0, + "step": 2339 + }, + { + "epoch": 0.39938556067588327, + "grad_norm": 0.5035206049681762, + "learning_rate": 2.4031404676565967e-05, + "loss": 0.523, + "num_tokens": 220563323.0, + "step": 2340 + }, + { + "epoch": 0.3995562382659157, + "grad_norm": 0.5869658867071449, + "learning_rate": 2.402457757296467e-05, + "loss": 0.6709, + "num_tokens": 220645640.0, + "step": 2341 + }, + { + "epoch": 0.3997269158559481, + "grad_norm": 0.5171815049901933, + "learning_rate": 2.4017750469363374e-05, + "loss": 0.6008, + "num_tokens": 220738449.0, + "step": 2342 + }, + { + "epoch": 0.3998975934459805, + "grad_norm": 0.4827910357112666, + "learning_rate": 2.4010923365762078e-05, + "loss": 0.7094, + "num_tokens": 220855292.0, + "step": 2343 + }, + { + "epoch": 0.40006827103601295, + "grad_norm": 0.5103880938337038, + "learning_rate": 2.4004096262160782e-05, + "loss": 0.6057, + "num_tokens": 220950820.0, + "step": 2344 + }, + { + "epoch": 0.4002389486260454, + "grad_norm": 0.4683107310434939, + "learning_rate": 2.3997269158559486e-05, + "loss": 0.6392, + "num_tokens": 221065939.0, + "step": 2345 + }, + { + "epoch": 0.40040962621607784, + "grad_norm": 0.5139949475253363, + "learning_rate": 2.3990442054958186e-05, + "loss": 0.5611, + "num_tokens": 221164794.0, + "step": 2346 + }, + { + "epoch": 0.40058030380611026, + "grad_norm": 0.48697691987566954, + "learning_rate": 2.398361495135689e-05, + "loss": 0.5973, + "num_tokens": 221254043.0, + "step": 2347 + }, + { + "epoch": 0.4007509813961427, + "grad_norm": 0.527218526229706, + "learning_rate": 2.3976787847755593e-05, + "loss": 0.5691, + "num_tokens": 221330027.0, + "step": 2348 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 0.5466100689281735, + "learning_rate": 2.3969960744154294e-05, + "loss": 0.5681, + "num_tokens": 221401400.0, + "step": 2349 + }, + { + "epoch": 0.4010923365762075, + "grad_norm": 0.4855558782707177, + "learning_rate": 2.3963133640552994e-05, + "loss": 0.5431, + "num_tokens": 221498615.0, + "step": 2350 + }, + { + "epoch": 0.40126301416624, + "grad_norm": 0.46768484952001627, + "learning_rate": 2.3956306536951698e-05, + "loss": 0.5398, + "num_tokens": 221595473.0, + "step": 2351 + }, + { + "epoch": 0.4014336917562724, + "grad_norm": 0.5048374618658158, + "learning_rate": 2.39494794333504e-05, + "loss": 0.6133, + "num_tokens": 221692338.0, + "step": 2352 + }, + { + "epoch": 0.40160436934630483, + "grad_norm": 0.5461956022975948, + "learning_rate": 2.3942652329749105e-05, + "loss": 0.6327, + "num_tokens": 221785415.0, + "step": 2353 + }, + { + "epoch": 0.40177504693633725, + "grad_norm": 0.5101241871893103, + "learning_rate": 2.393582522614781e-05, + "loss": 0.5569, + "num_tokens": 221869726.0, + "step": 2354 + }, + { + "epoch": 0.4019457245263697, + "grad_norm": 0.527911395728902, + "learning_rate": 2.3928998122546513e-05, + "loss": 0.644, + "num_tokens": 221971363.0, + "step": 2355 + }, + { + "epoch": 0.4021164021164021, + "grad_norm": 0.4447118524192706, + "learning_rate": 2.3922171018945213e-05, + "loss": 0.6409, + "num_tokens": 222093086.0, + "step": 2356 + }, + { + "epoch": 0.40228707970643457, + "grad_norm": 0.4647292105272507, + "learning_rate": 2.3915343915343917e-05, + "loss": 0.5174, + "num_tokens": 222188448.0, + "step": 2357 + }, + { + "epoch": 0.402457757296467, + "grad_norm": 0.551055848953137, + "learning_rate": 2.390851681174262e-05, + "loss": 0.5717, + "num_tokens": 222274483.0, + "step": 2358 + }, + { + "epoch": 0.4026284348864994, + "grad_norm": 0.4826773806394218, + "learning_rate": 2.3901689708141325e-05, + "loss": 0.6481, + "num_tokens": 222382240.0, + "step": 2359 + }, + { + "epoch": 0.40279911247653183, + "grad_norm": 0.5594223577606974, + "learning_rate": 2.389486260454003e-05, + "loss": 0.6243, + "num_tokens": 222480795.0, + "step": 2360 + }, + { + "epoch": 0.40296979006656425, + "grad_norm": 0.47913285272583367, + "learning_rate": 2.3888035500938725e-05, + "loss": 0.5492, + "num_tokens": 222563886.0, + "step": 2361 + }, + { + "epoch": 0.40314046765659667, + "grad_norm": 0.4780122212461984, + "learning_rate": 2.388120839733743e-05, + "loss": 0.5641, + "num_tokens": 222657442.0, + "step": 2362 + }, + { + "epoch": 0.40331114524662914, + "grad_norm": 0.5011742054445733, + "learning_rate": 2.3874381293736133e-05, + "loss": 0.5736, + "num_tokens": 222736945.0, + "step": 2363 + }, + { + "epoch": 0.40348182283666156, + "grad_norm": 0.5973194347165425, + "learning_rate": 2.3867554190134837e-05, + "loss": 0.5867, + "num_tokens": 222795727.0, + "step": 2364 + }, + { + "epoch": 0.403652500426694, + "grad_norm": 0.5093639915969538, + "learning_rate": 2.386072708653354e-05, + "loss": 0.5714, + "num_tokens": 222878082.0, + "step": 2365 + }, + { + "epoch": 0.4038231780167264, + "grad_norm": 0.45997697147042527, + "learning_rate": 2.3853899982932244e-05, + "loss": 0.5406, + "num_tokens": 222978612.0, + "step": 2366 + }, + { + "epoch": 0.4039938556067588, + "grad_norm": 0.49460956930756567, + "learning_rate": 2.3847072879330945e-05, + "loss": 0.6495, + "num_tokens": 223075037.0, + "step": 2367 + }, + { + "epoch": 0.40416453319679124, + "grad_norm": 0.4866215290968026, + "learning_rate": 2.384024577572965e-05, + "loss": 0.5108, + "num_tokens": 223160217.0, + "step": 2368 + }, + { + "epoch": 0.4043352107868237, + "grad_norm": 0.5300784892204703, + "learning_rate": 2.3833418672128352e-05, + "loss": 0.5694, + "num_tokens": 223247134.0, + "step": 2369 + }, + { + "epoch": 0.40450588837685614, + "grad_norm": 0.5479867847596525, + "learning_rate": 2.3826591568527056e-05, + "loss": 0.644, + "num_tokens": 223353843.0, + "step": 2370 + }, + { + "epoch": 0.40467656596688856, + "grad_norm": 0.5578073075743806, + "learning_rate": 2.381976446492576e-05, + "loss": 0.6847, + "num_tokens": 223458141.0, + "step": 2371 + }, + { + "epoch": 0.404847243556921, + "grad_norm": 0.47679874306291337, + "learning_rate": 2.3812937361324463e-05, + "loss": 0.6988, + "num_tokens": 223573339.0, + "step": 2372 + }, + { + "epoch": 0.4050179211469534, + "grad_norm": 0.5002483964172377, + "learning_rate": 2.3806110257723164e-05, + "loss": 0.58, + "num_tokens": 223674483.0, + "step": 2373 + }, + { + "epoch": 0.4051885987369858, + "grad_norm": 0.4697762806125055, + "learning_rate": 2.3799283154121864e-05, + "loss": 0.6308, + "num_tokens": 223779389.0, + "step": 2374 + }, + { + "epoch": 0.40535927632701824, + "grad_norm": 0.4761483736158134, + "learning_rate": 2.3792456050520568e-05, + "loss": 0.6346, + "num_tokens": 223883440.0, + "step": 2375 + }, + { + "epoch": 0.4055299539170507, + "grad_norm": 0.5493140696656084, + "learning_rate": 2.378562894691927e-05, + "loss": 0.565, + "num_tokens": 223947934.0, + "step": 2376 + }, + { + "epoch": 0.40570063150708313, + "grad_norm": 0.46575341319162544, + "learning_rate": 2.3778801843317972e-05, + "loss": 0.5905, + "num_tokens": 224055726.0, + "step": 2377 + }, + { + "epoch": 0.40587130909711555, + "grad_norm": 0.4596584934796199, + "learning_rate": 2.3771974739716676e-05, + "loss": 0.6416, + "num_tokens": 224173270.0, + "step": 2378 + }, + { + "epoch": 0.40604198668714797, + "grad_norm": 0.4967935085637729, + "learning_rate": 2.376514763611538e-05, + "loss": 0.5524, + "num_tokens": 224257734.0, + "step": 2379 + }, + { + "epoch": 0.4062126642771804, + "grad_norm": 0.4532189791400893, + "learning_rate": 2.3758320532514083e-05, + "loss": 0.5851, + "num_tokens": 224375030.0, + "step": 2380 + }, + { + "epoch": 0.4063833418672128, + "grad_norm": 0.4852826430586424, + "learning_rate": 2.3751493428912787e-05, + "loss": 0.5075, + "num_tokens": 224458659.0, + "step": 2381 + }, + { + "epoch": 0.4065540194572453, + "grad_norm": 0.477406283139159, + "learning_rate": 2.374466632531149e-05, + "loss": 0.5827, + "num_tokens": 224552861.0, + "step": 2382 + }, + { + "epoch": 0.4067246970472777, + "grad_norm": 0.512610816453536, + "learning_rate": 2.373783922171019e-05, + "loss": 0.6092, + "num_tokens": 224638266.0, + "step": 2383 + }, + { + "epoch": 0.4068953746373101, + "grad_norm": 0.5087510852432886, + "learning_rate": 2.3731012118108895e-05, + "loss": 0.5901, + "num_tokens": 224714313.0, + "step": 2384 + }, + { + "epoch": 0.40706605222734255, + "grad_norm": 0.4497426540487177, + "learning_rate": 2.37241850145076e-05, + "loss": 0.5658, + "num_tokens": 224837848.0, + "step": 2385 + }, + { + "epoch": 0.40723672981737497, + "grad_norm": 0.4882065882062325, + "learning_rate": 2.37173579109063e-05, + "loss": 0.5611, + "num_tokens": 224942198.0, + "step": 2386 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.48647759132024576, + "learning_rate": 2.3710530807305e-05, + "loss": 0.5719, + "num_tokens": 225035689.0, + "step": 2387 + }, + { + "epoch": 0.40757808499743986, + "grad_norm": 0.5479717528493968, + "learning_rate": 2.3703703703703703e-05, + "loss": 0.6187, + "num_tokens": 225116549.0, + "step": 2388 + }, + { + "epoch": 0.4077487625874723, + "grad_norm": 0.4744205595420556, + "learning_rate": 2.3696876600102407e-05, + "loss": 0.6759, + "num_tokens": 225243669.0, + "step": 2389 + }, + { + "epoch": 0.4079194401775047, + "grad_norm": 0.4988533550625587, + "learning_rate": 2.369004949650111e-05, + "loss": 0.5915, + "num_tokens": 225348734.0, + "step": 2390 + }, + { + "epoch": 0.4080901177675371, + "grad_norm": 0.4616925068734663, + "learning_rate": 2.3683222392899815e-05, + "loss": 0.5725, + "num_tokens": 225455621.0, + "step": 2391 + }, + { + "epoch": 0.40826079535756954, + "grad_norm": 0.46070930620842343, + "learning_rate": 2.367639528929852e-05, + "loss": 0.6043, + "num_tokens": 225561049.0, + "step": 2392 + }, + { + "epoch": 0.40843147294760196, + "grad_norm": 0.4561966977048284, + "learning_rate": 2.366956818569722e-05, + "loss": 0.6125, + "num_tokens": 225676137.0, + "step": 2393 + }, + { + "epoch": 0.40860215053763443, + "grad_norm": 0.4596587784533439, + "learning_rate": 2.3662741082095922e-05, + "loss": 0.6295, + "num_tokens": 225795570.0, + "step": 2394 + }, + { + "epoch": 0.40877282812766685, + "grad_norm": 0.5532546603083288, + "learning_rate": 2.3655913978494626e-05, + "loss": 0.5336, + "num_tokens": 225856180.0, + "step": 2395 + }, + { + "epoch": 0.4089435057176993, + "grad_norm": 0.48748599234187384, + "learning_rate": 2.364908687489333e-05, + "loss": 0.6001, + "num_tokens": 225945184.0, + "step": 2396 + }, + { + "epoch": 0.4091141833077317, + "grad_norm": 0.6030997464875462, + "learning_rate": 2.3642259771292034e-05, + "loss": 0.5947, + "num_tokens": 226003358.0, + "step": 2397 + }, + { + "epoch": 0.4092848608977641, + "grad_norm": 0.5200076901066409, + "learning_rate": 2.363543266769073e-05, + "loss": 0.6329, + "num_tokens": 226108602.0, + "step": 2398 + }, + { + "epoch": 0.40945553848779653, + "grad_norm": 0.49315943696077624, + "learning_rate": 2.3628605564089434e-05, + "loss": 0.5787, + "num_tokens": 226209665.0, + "step": 2399 + }, + { + "epoch": 0.409626216077829, + "grad_norm": 0.4940524731943455, + "learning_rate": 2.3621778460488138e-05, + "loss": 0.6455, + "num_tokens": 226318743.0, + "step": 2400 + }, + { + "epoch": 0.40979689366786143, + "grad_norm": 0.5164475516520829, + "learning_rate": 2.3614951356886842e-05, + "loss": 0.6021, + "num_tokens": 226399645.0, + "step": 2401 + }, + { + "epoch": 0.40996757125789385, + "grad_norm": 0.48030433159358443, + "learning_rate": 2.3608124253285546e-05, + "loss": 0.5736, + "num_tokens": 226495384.0, + "step": 2402 + }, + { + "epoch": 0.41013824884792627, + "grad_norm": 0.4607578997878919, + "learning_rate": 2.360129714968425e-05, + "loss": 0.5804, + "num_tokens": 226597902.0, + "step": 2403 + }, + { + "epoch": 0.4103089264379587, + "grad_norm": 0.4979041692378095, + "learning_rate": 2.359447004608295e-05, + "loss": 0.5749, + "num_tokens": 226691994.0, + "step": 2404 + }, + { + "epoch": 0.4104796040279911, + "grad_norm": 0.5392265036230711, + "learning_rate": 2.3587642942481654e-05, + "loss": 0.6306, + "num_tokens": 226789312.0, + "step": 2405 + }, + { + "epoch": 0.4106502816180235, + "grad_norm": 0.5316926644587151, + "learning_rate": 2.3580815838880357e-05, + "loss": 0.611, + "num_tokens": 226898554.0, + "step": 2406 + }, + { + "epoch": 0.410820959208056, + "grad_norm": 0.4516054681639296, + "learning_rate": 2.357398873527906e-05, + "loss": 0.5827, + "num_tokens": 227018560.0, + "step": 2407 + }, + { + "epoch": 0.4109916367980884, + "grad_norm": 0.5192620291469952, + "learning_rate": 2.3567161631677765e-05, + "loss": 0.5778, + "num_tokens": 227103628.0, + "step": 2408 + }, + { + "epoch": 0.41116231438812084, + "grad_norm": 0.5081503526354306, + "learning_rate": 2.356033452807647e-05, + "loss": 0.6432, + "num_tokens": 227201185.0, + "step": 2409 + }, + { + "epoch": 0.41133299197815326, + "grad_norm": 0.5030838762359839, + "learning_rate": 2.355350742447517e-05, + "loss": 0.6387, + "num_tokens": 227289777.0, + "step": 2410 + }, + { + "epoch": 0.4115036695681857, + "grad_norm": 0.5060705334257647, + "learning_rate": 2.354668032087387e-05, + "loss": 0.6217, + "num_tokens": 227385309.0, + "step": 2411 + }, + { + "epoch": 0.4116743471582181, + "grad_norm": 0.5148647024976987, + "learning_rate": 2.3539853217272573e-05, + "loss": 0.5266, + "num_tokens": 227460742.0, + "step": 2412 + }, + { + "epoch": 0.4118450247482506, + "grad_norm": 0.545406157889297, + "learning_rate": 2.3533026113671277e-05, + "loss": 0.5739, + "num_tokens": 227541305.0, + "step": 2413 + }, + { + "epoch": 0.412015702338283, + "grad_norm": 0.5090444406615731, + "learning_rate": 2.3526199010069977e-05, + "loss": 0.7265, + "num_tokens": 227659283.0, + "step": 2414 + }, + { + "epoch": 0.4121863799283154, + "grad_norm": 0.521322156575517, + "learning_rate": 2.351937190646868e-05, + "loss": 0.6251, + "num_tokens": 227745044.0, + "step": 2415 + }, + { + "epoch": 0.41235705751834784, + "grad_norm": 0.5783647950196269, + "learning_rate": 2.3512544802867385e-05, + "loss": 0.5987, + "num_tokens": 227824318.0, + "step": 2416 + }, + { + "epoch": 0.41252773510838026, + "grad_norm": 0.5772601369451659, + "learning_rate": 2.350571769926609e-05, + "loss": 0.6937, + "num_tokens": 227910868.0, + "step": 2417 + }, + { + "epoch": 0.4126984126984127, + "grad_norm": 0.4516412607910109, + "learning_rate": 2.3498890595664792e-05, + "loss": 0.6127, + "num_tokens": 228034685.0, + "step": 2418 + }, + { + "epoch": 0.41286909028844515, + "grad_norm": 0.46061570607257624, + "learning_rate": 2.3492063492063496e-05, + "loss": 0.5247, + "num_tokens": 228140044.0, + "step": 2419 + }, + { + "epoch": 0.41303976787847757, + "grad_norm": 0.46889340598196716, + "learning_rate": 2.3485236388462197e-05, + "loss": 0.6185, + "num_tokens": 228252518.0, + "step": 2420 + }, + { + "epoch": 0.41321044546851, + "grad_norm": 0.5636269323798917, + "learning_rate": 2.34784092848609e-05, + "loss": 0.6032, + "num_tokens": 228317673.0, + "step": 2421 + }, + { + "epoch": 0.4133811230585424, + "grad_norm": 0.5056278638901767, + "learning_rate": 2.3471582181259604e-05, + "loss": 0.6943, + "num_tokens": 228426921.0, + "step": 2422 + }, + { + "epoch": 0.41355180064857483, + "grad_norm": 0.4848544027612261, + "learning_rate": 2.3464755077658304e-05, + "loss": 0.5607, + "num_tokens": 228517763.0, + "step": 2423 + }, + { + "epoch": 0.41372247823860725, + "grad_norm": 0.5394153485261212, + "learning_rate": 2.3457927974057005e-05, + "loss": 0.601, + "num_tokens": 228602347.0, + "step": 2424 + }, + { + "epoch": 0.4138931558286397, + "grad_norm": 0.4597146703871082, + "learning_rate": 2.345110087045571e-05, + "loss": 0.6142, + "num_tokens": 228716552.0, + "step": 2425 + }, + { + "epoch": 0.41406383341867214, + "grad_norm": 0.4778422274346972, + "learning_rate": 2.3444273766854412e-05, + "loss": 0.6795, + "num_tokens": 228838043.0, + "step": 2426 + }, + { + "epoch": 0.41423451100870456, + "grad_norm": 0.46777090321775516, + "learning_rate": 2.3437446663253116e-05, + "loss": 0.6089, + "num_tokens": 228940764.0, + "step": 2427 + }, + { + "epoch": 0.414405188598737, + "grad_norm": 0.468592919993177, + "learning_rate": 2.343061955965182e-05, + "loss": 0.5406, + "num_tokens": 229041832.0, + "step": 2428 + }, + { + "epoch": 0.4145758661887694, + "grad_norm": 0.5210872537256965, + "learning_rate": 2.3423792456050524e-05, + "loss": 0.6356, + "num_tokens": 229127531.0, + "step": 2429 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 0.48649362788437384, + "learning_rate": 2.3416965352449227e-05, + "loss": 0.5794, + "num_tokens": 229228282.0, + "step": 2430 + }, + { + "epoch": 0.4149172213688343, + "grad_norm": 0.5476611757647712, + "learning_rate": 2.3410138248847928e-05, + "loss": 0.6485, + "num_tokens": 229307799.0, + "step": 2431 + }, + { + "epoch": 0.4150878989588667, + "grad_norm": 0.5485819060755103, + "learning_rate": 2.340331114524663e-05, + "loss": 0.6721, + "num_tokens": 229388122.0, + "step": 2432 + }, + { + "epoch": 0.41525857654889914, + "grad_norm": 0.5316969641454928, + "learning_rate": 2.3396484041645335e-05, + "loss": 0.6486, + "num_tokens": 229482232.0, + "step": 2433 + }, + { + "epoch": 0.41542925413893156, + "grad_norm": 0.49366758642462444, + "learning_rate": 2.338965693804404e-05, + "loss": 0.6202, + "num_tokens": 229587515.0, + "step": 2434 + }, + { + "epoch": 0.415599931728964, + "grad_norm": 0.5129202706098873, + "learning_rate": 2.3382829834442743e-05, + "loss": 0.6907, + "num_tokens": 229678038.0, + "step": 2435 + }, + { + "epoch": 0.4157706093189964, + "grad_norm": 0.520720213228936, + "learning_rate": 2.337600273084144e-05, + "loss": 0.6808, + "num_tokens": 229781869.0, + "step": 2436 + }, + { + "epoch": 0.4159412869090288, + "grad_norm": 0.46760161887191715, + "learning_rate": 2.3369175627240144e-05, + "loss": 0.6047, + "num_tokens": 229889667.0, + "step": 2437 + }, + { + "epoch": 0.4161119644990613, + "grad_norm": 0.5956009578607092, + "learning_rate": 2.3362348523638847e-05, + "loss": 0.6337, + "num_tokens": 229959265.0, + "step": 2438 + }, + { + "epoch": 0.4162826420890937, + "grad_norm": 0.4718005198326276, + "learning_rate": 2.335552142003755e-05, + "loss": 0.6128, + "num_tokens": 230067762.0, + "step": 2439 + }, + { + "epoch": 0.41645331967912613, + "grad_norm": 0.48236929535202605, + "learning_rate": 2.3348694316436255e-05, + "loss": 0.5846, + "num_tokens": 230161868.0, + "step": 2440 + }, + { + "epoch": 0.41662399726915855, + "grad_norm": 0.5400069729479058, + "learning_rate": 2.3341867212834955e-05, + "loss": 0.6538, + "num_tokens": 230249885.0, + "step": 2441 + }, + { + "epoch": 0.416794674859191, + "grad_norm": 0.4549652815730992, + "learning_rate": 2.333504010923366e-05, + "loss": 0.5093, + "num_tokens": 230348472.0, + "step": 2442 + }, + { + "epoch": 0.4169653524492234, + "grad_norm": 0.4795533703251996, + "learning_rate": 2.3328213005632363e-05, + "loss": 0.6035, + "num_tokens": 230457574.0, + "step": 2443 + }, + { + "epoch": 0.41713603003925587, + "grad_norm": 0.526187457336726, + "learning_rate": 2.3321385902031067e-05, + "loss": 0.5956, + "num_tokens": 230539249.0, + "step": 2444 + }, + { + "epoch": 0.4173067076292883, + "grad_norm": 0.4658642902949089, + "learning_rate": 2.331455879842977e-05, + "loss": 0.64, + "num_tokens": 230645202.0, + "step": 2445 + }, + { + "epoch": 0.4174773852193207, + "grad_norm": 0.5381344051767857, + "learning_rate": 2.3307731694828474e-05, + "loss": 0.6082, + "num_tokens": 230716906.0, + "step": 2446 + }, + { + "epoch": 0.4176480628093531, + "grad_norm": 0.5041649765101441, + "learning_rate": 2.3300904591227174e-05, + "loss": 0.6082, + "num_tokens": 230810481.0, + "step": 2447 + }, + { + "epoch": 0.41781874039938555, + "grad_norm": 0.4800391425260171, + "learning_rate": 2.3294077487625875e-05, + "loss": 0.5557, + "num_tokens": 230905938.0, + "step": 2448 + }, + { + "epoch": 0.41798941798941797, + "grad_norm": 0.6810146112254023, + "learning_rate": 2.328725038402458e-05, + "loss": 0.6736, + "num_tokens": 231004769.0, + "step": 2449 + }, + { + "epoch": 0.41816009557945044, + "grad_norm": 0.5585616555317745, + "learning_rate": 2.3280423280423282e-05, + "loss": 0.6563, + "num_tokens": 231083298.0, + "step": 2450 + }, + { + "epoch": 0.41833077316948286, + "grad_norm": 0.4946678806979562, + "learning_rate": 2.3273596176821983e-05, + "loss": 0.6006, + "num_tokens": 231178480.0, + "step": 2451 + }, + { + "epoch": 0.4185014507595153, + "grad_norm": 0.5231648478404225, + "learning_rate": 2.3266769073220686e-05, + "loss": 0.6382, + "num_tokens": 231274576.0, + "step": 2452 + }, + { + "epoch": 0.4186721283495477, + "grad_norm": 0.5336800825751757, + "learning_rate": 2.325994196961939e-05, + "loss": 0.6712, + "num_tokens": 231362586.0, + "step": 2453 + }, + { + "epoch": 0.4188428059395801, + "grad_norm": 0.48778946265858764, + "learning_rate": 2.3253114866018094e-05, + "loss": 0.5932, + "num_tokens": 231455955.0, + "step": 2454 + }, + { + "epoch": 0.41901348352961254, + "grad_norm": 0.5206615824413576, + "learning_rate": 2.3246287762416798e-05, + "loss": 0.671, + "num_tokens": 231552790.0, + "step": 2455 + }, + { + "epoch": 0.419184161119645, + "grad_norm": 0.5420527326218986, + "learning_rate": 2.32394606588155e-05, + "loss": 0.5903, + "num_tokens": 231637537.0, + "step": 2456 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 0.4942111123907891, + "learning_rate": 2.3232633555214202e-05, + "loss": 0.5574, + "num_tokens": 231723131.0, + "step": 2457 + }, + { + "epoch": 0.41952551629970986, + "grad_norm": 0.49141636576124403, + "learning_rate": 2.3225806451612906e-05, + "loss": 0.602, + "num_tokens": 231821859.0, + "step": 2458 + }, + { + "epoch": 0.4196961938897423, + "grad_norm": 0.5694458384248355, + "learning_rate": 2.321897934801161e-05, + "loss": 0.6384, + "num_tokens": 231889356.0, + "step": 2459 + }, + { + "epoch": 0.4198668714797747, + "grad_norm": 0.48563768685635605, + "learning_rate": 2.321215224441031e-05, + "loss": 0.5495, + "num_tokens": 231979313.0, + "step": 2460 + }, + { + "epoch": 0.4200375490698071, + "grad_norm": 0.5888112077119407, + "learning_rate": 2.3205325140809014e-05, + "loss": 0.6057, + "num_tokens": 232068070.0, + "step": 2461 + }, + { + "epoch": 0.4202082266598396, + "grad_norm": 0.5287289873795853, + "learning_rate": 2.3198498037207714e-05, + "loss": 0.6132, + "num_tokens": 232153413.0, + "step": 2462 + }, + { + "epoch": 0.420378904249872, + "grad_norm": 0.5273263731937167, + "learning_rate": 2.3191670933606418e-05, + "loss": 0.5684, + "num_tokens": 232225925.0, + "step": 2463 + }, + { + "epoch": 0.42054958183990443, + "grad_norm": 0.48314870230664186, + "learning_rate": 2.318484383000512e-05, + "loss": 0.6199, + "num_tokens": 232339882.0, + "step": 2464 + }, + { + "epoch": 0.42072025942993685, + "grad_norm": 0.5051384802987633, + "learning_rate": 2.3178016726403825e-05, + "loss": 0.6596, + "num_tokens": 232442942.0, + "step": 2465 + }, + { + "epoch": 0.42089093701996927, + "grad_norm": 0.4960526251571781, + "learning_rate": 2.317118962280253e-05, + "loss": 0.6749, + "num_tokens": 232549835.0, + "step": 2466 + }, + { + "epoch": 0.4210616146100017, + "grad_norm": 0.48984079711799244, + "learning_rate": 2.3164362519201233e-05, + "loss": 0.5682, + "num_tokens": 232640063.0, + "step": 2467 + }, + { + "epoch": 0.4212322922000341, + "grad_norm": 0.45729403752942915, + "learning_rate": 2.3157535415599933e-05, + "loss": 0.6049, + "num_tokens": 232750426.0, + "step": 2468 + }, + { + "epoch": 0.4214029697900666, + "grad_norm": 0.5517104808036845, + "learning_rate": 2.3150708311998637e-05, + "loss": 0.6752, + "num_tokens": 232834365.0, + "step": 2469 + }, + { + "epoch": 0.421573647380099, + "grad_norm": 0.5189941321409404, + "learning_rate": 2.314388120839734e-05, + "loss": 0.6505, + "num_tokens": 232930010.0, + "step": 2470 + }, + { + "epoch": 0.4217443249701314, + "grad_norm": 0.4522457187025338, + "learning_rate": 2.3137054104796044e-05, + "loss": 0.5736, + "num_tokens": 233050542.0, + "step": 2471 + }, + { + "epoch": 0.42191500256016384, + "grad_norm": 0.4744985319533887, + "learning_rate": 2.3130227001194748e-05, + "loss": 0.5721, + "num_tokens": 233151744.0, + "step": 2472 + }, + { + "epoch": 0.42208568015019626, + "grad_norm": 0.5143808051188117, + "learning_rate": 2.3123399897593445e-05, + "loss": 0.5339, + "num_tokens": 233223876.0, + "step": 2473 + }, + { + "epoch": 0.4222563577402287, + "grad_norm": 0.45284402369904325, + "learning_rate": 2.311657279399215e-05, + "loss": 0.6714, + "num_tokens": 233352950.0, + "step": 2474 + }, + { + "epoch": 0.42242703533026116, + "grad_norm": 0.4620876462403817, + "learning_rate": 2.3109745690390853e-05, + "loss": 0.6313, + "num_tokens": 233464419.0, + "step": 2475 + }, + { + "epoch": 0.4225977129202936, + "grad_norm": 0.5709251833202815, + "learning_rate": 2.3102918586789556e-05, + "loss": 0.596, + "num_tokens": 233525369.0, + "step": 2476 + }, + { + "epoch": 0.422768390510326, + "grad_norm": 0.46181121378830037, + "learning_rate": 2.309609148318826e-05, + "loss": 0.5706, + "num_tokens": 233620549.0, + "step": 2477 + }, + { + "epoch": 0.4229390681003584, + "grad_norm": 0.5219546298010829, + "learning_rate": 2.308926437958696e-05, + "loss": 0.6404, + "num_tokens": 233701882.0, + "step": 2478 + }, + { + "epoch": 0.42310974569039084, + "grad_norm": 0.6040177866618024, + "learning_rate": 2.3082437275985664e-05, + "loss": 0.7101, + "num_tokens": 233780016.0, + "step": 2479 + }, + { + "epoch": 0.42328042328042326, + "grad_norm": 0.48536466536905054, + "learning_rate": 2.3075610172384368e-05, + "loss": 0.6166, + "num_tokens": 233879642.0, + "step": 2480 + }, + { + "epoch": 0.42345110087045573, + "grad_norm": 0.5340276693024203, + "learning_rate": 2.3068783068783072e-05, + "loss": 0.5618, + "num_tokens": 233953978.0, + "step": 2481 + }, + { + "epoch": 0.42362177846048815, + "grad_norm": 0.4848365626571597, + "learning_rate": 2.3061955965181776e-05, + "loss": 0.5971, + "num_tokens": 234051855.0, + "step": 2482 + }, + { + "epoch": 0.42379245605052057, + "grad_norm": 0.521589634841622, + "learning_rate": 2.305512886158048e-05, + "loss": 0.6128, + "num_tokens": 234140974.0, + "step": 2483 + }, + { + "epoch": 0.423963133640553, + "grad_norm": 0.48215216891259505, + "learning_rate": 2.304830175797918e-05, + "loss": 0.5427, + "num_tokens": 234234149.0, + "step": 2484 + }, + { + "epoch": 0.4241338112305854, + "grad_norm": 0.5165289585602113, + "learning_rate": 2.304147465437788e-05, + "loss": 0.6776, + "num_tokens": 234348281.0, + "step": 2485 + }, + { + "epoch": 0.42430448882061783, + "grad_norm": 0.45347305931865717, + "learning_rate": 2.3034647550776584e-05, + "loss": 0.6131, + "num_tokens": 234473929.0, + "step": 2486 + }, + { + "epoch": 0.4244751664106503, + "grad_norm": 0.4824675015184745, + "learning_rate": 2.3027820447175288e-05, + "loss": 0.5758, + "num_tokens": 234566419.0, + "step": 2487 + }, + { + "epoch": 0.4246458440006827, + "grad_norm": 0.4869940415932061, + "learning_rate": 2.3020993343573988e-05, + "loss": 0.5198, + "num_tokens": 234647900.0, + "step": 2488 + }, + { + "epoch": 0.42481652159071515, + "grad_norm": 0.49459886685091925, + "learning_rate": 2.3014166239972692e-05, + "loss": 0.6043, + "num_tokens": 234735283.0, + "step": 2489 + }, + { + "epoch": 0.42498719918074757, + "grad_norm": 0.5226338468829149, + "learning_rate": 2.3007339136371396e-05, + "loss": 0.5799, + "num_tokens": 234811504.0, + "step": 2490 + }, + { + "epoch": 0.42515787677078, + "grad_norm": 0.5070939356920735, + "learning_rate": 2.30005120327701e-05, + "loss": 0.6901, + "num_tokens": 234903747.0, + "step": 2491 + }, + { + "epoch": 0.4253285543608124, + "grad_norm": 0.48947200297657145, + "learning_rate": 2.2993684929168803e-05, + "loss": 0.6145, + "num_tokens": 235011656.0, + "step": 2492 + }, + { + "epoch": 0.4254992319508449, + "grad_norm": 0.5239127944219475, + "learning_rate": 2.2986857825567507e-05, + "loss": 0.5847, + "num_tokens": 235088624.0, + "step": 2493 + }, + { + "epoch": 0.4256699095408773, + "grad_norm": 0.4792615995571878, + "learning_rate": 2.2980030721966207e-05, + "loss": 0.6434, + "num_tokens": 235194608.0, + "step": 2494 + }, + { + "epoch": 0.4258405871309097, + "grad_norm": 0.49173237234621275, + "learning_rate": 2.297320361836491e-05, + "loss": 0.6473, + "num_tokens": 235294274.0, + "step": 2495 + }, + { + "epoch": 0.42601126472094214, + "grad_norm": 0.5279406622433027, + "learning_rate": 2.2966376514763615e-05, + "loss": 0.651, + "num_tokens": 235378974.0, + "step": 2496 + }, + { + "epoch": 0.42618194231097456, + "grad_norm": 0.4826879677900294, + "learning_rate": 2.2959549411162315e-05, + "loss": 0.6371, + "num_tokens": 235484888.0, + "step": 2497 + }, + { + "epoch": 0.426352619901007, + "grad_norm": 0.479038634180759, + "learning_rate": 2.295272230756102e-05, + "loss": 0.5998, + "num_tokens": 235581422.0, + "step": 2498 + }, + { + "epoch": 0.4265232974910394, + "grad_norm": 0.5582766412730716, + "learning_rate": 2.294589520395972e-05, + "loss": 0.5587, + "num_tokens": 235654021.0, + "step": 2499 + }, + { + "epoch": 0.4266939750810719, + "grad_norm": 0.5496135746353961, + "learning_rate": 2.2939068100358423e-05, + "loss": 0.6148, + "num_tokens": 235722472.0, + "step": 2500 + }, + { + "epoch": 0.4268646526711043, + "grad_norm": 0.4498748705837028, + "learning_rate": 2.2932240996757127e-05, + "loss": 0.5618, + "num_tokens": 235818256.0, + "step": 2501 + }, + { + "epoch": 0.4270353302611367, + "grad_norm": 0.48511428247190175, + "learning_rate": 2.292541389315583e-05, + "loss": 0.6027, + "num_tokens": 235916641.0, + "step": 2502 + }, + { + "epoch": 0.42720600785116913, + "grad_norm": 0.4709496942005982, + "learning_rate": 2.2918586789554534e-05, + "loss": 0.5514, + "num_tokens": 236014457.0, + "step": 2503 + }, + { + "epoch": 0.42737668544120155, + "grad_norm": 0.5766047185812241, + "learning_rate": 2.2911759685953238e-05, + "loss": 0.6906, + "num_tokens": 236108106.0, + "step": 2504 + }, + { + "epoch": 0.427547363031234, + "grad_norm": 0.46978157140525084, + "learning_rate": 2.290493258235194e-05, + "loss": 0.5879, + "num_tokens": 236215864.0, + "step": 2505 + }, + { + "epoch": 0.42771804062126645, + "grad_norm": 0.5228237232266124, + "learning_rate": 2.2898105478750642e-05, + "loss": 0.6844, + "num_tokens": 236316254.0, + "step": 2506 + }, + { + "epoch": 0.42788871821129887, + "grad_norm": 0.4870277931848846, + "learning_rate": 2.2891278375149346e-05, + "loss": 0.6723, + "num_tokens": 236417712.0, + "step": 2507 + }, + { + "epoch": 0.4280593958013313, + "grad_norm": 0.5331310700142968, + "learning_rate": 2.288445127154805e-05, + "loss": 0.5773, + "num_tokens": 236490547.0, + "step": 2508 + }, + { + "epoch": 0.4282300733913637, + "grad_norm": 0.4846161002170908, + "learning_rate": 2.2877624167946754e-05, + "loss": 0.5691, + "num_tokens": 236577425.0, + "step": 2509 + }, + { + "epoch": 0.42840075098139613, + "grad_norm": 0.4934705576897008, + "learning_rate": 2.287079706434545e-05, + "loss": 0.5546, + "num_tokens": 236669099.0, + "step": 2510 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.4730443927304277, + "learning_rate": 2.2863969960744154e-05, + "loss": 0.5586, + "num_tokens": 236758688.0, + "step": 2511 + }, + { + "epoch": 0.428742106161461, + "grad_norm": 0.523490264049964, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.6435, + "num_tokens": 236850280.0, + "step": 2512 + }, + { + "epoch": 0.42891278375149344, + "grad_norm": 0.5629196344335939, + "learning_rate": 2.2850315753541562e-05, + "loss": 0.6088, + "num_tokens": 236937253.0, + "step": 2513 + }, + { + "epoch": 0.42908346134152586, + "grad_norm": 0.5224168737289571, + "learning_rate": 2.2843488649940266e-05, + "loss": 0.6908, + "num_tokens": 237027715.0, + "step": 2514 + }, + { + "epoch": 0.4292541389315583, + "grad_norm": 0.49040087288691053, + "learning_rate": 2.2836661546338966e-05, + "loss": 0.6032, + "num_tokens": 237120576.0, + "step": 2515 + }, + { + "epoch": 0.4294248165215907, + "grad_norm": 0.5202106349458903, + "learning_rate": 2.282983444273767e-05, + "loss": 0.5611, + "num_tokens": 237196200.0, + "step": 2516 + }, + { + "epoch": 0.4295954941116231, + "grad_norm": 0.49781323849789383, + "learning_rate": 2.2823007339136373e-05, + "loss": 0.572, + "num_tokens": 237283968.0, + "step": 2517 + }, + { + "epoch": 0.4297661717016556, + "grad_norm": 0.5112627399640833, + "learning_rate": 2.2816180235535077e-05, + "loss": 0.6533, + "num_tokens": 237376024.0, + "step": 2518 + }, + { + "epoch": 0.429936849291688, + "grad_norm": 0.5408163641323811, + "learning_rate": 2.280935313193378e-05, + "loss": 0.6143, + "num_tokens": 237450672.0, + "step": 2519 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 0.5196509086797625, + "learning_rate": 2.2802526028332485e-05, + "loss": 0.6255, + "num_tokens": 237538598.0, + "step": 2520 + }, + { + "epoch": 0.43027820447175286, + "grad_norm": 0.5341305528755987, + "learning_rate": 2.2795698924731185e-05, + "loss": 0.6018, + "num_tokens": 237636195.0, + "step": 2521 + }, + { + "epoch": 0.4304488820617853, + "grad_norm": 0.5460944241858391, + "learning_rate": 2.2788871821129886e-05, + "loss": 0.711, + "num_tokens": 237726539.0, + "step": 2522 + }, + { + "epoch": 0.4306195596518177, + "grad_norm": 0.5128657575994936, + "learning_rate": 2.278204471752859e-05, + "loss": 0.5686, + "num_tokens": 237809062.0, + "step": 2523 + }, + { + "epoch": 0.43079023724185017, + "grad_norm": 0.5186122599417522, + "learning_rate": 2.2775217613927293e-05, + "loss": 0.5845, + "num_tokens": 237891952.0, + "step": 2524 + }, + { + "epoch": 0.4309609148318826, + "grad_norm": 0.5029195570056039, + "learning_rate": 2.2768390510325993e-05, + "loss": 0.6409, + "num_tokens": 238000342.0, + "step": 2525 + }, + { + "epoch": 0.431131592421915, + "grad_norm": 0.4918946078050662, + "learning_rate": 2.2761563406724697e-05, + "loss": 0.6017, + "num_tokens": 238093585.0, + "step": 2526 + }, + { + "epoch": 0.43130227001194743, + "grad_norm": 0.48213839953670873, + "learning_rate": 2.27547363031234e-05, + "loss": 0.6904, + "num_tokens": 238208459.0, + "step": 2527 + }, + { + "epoch": 0.43147294760197985, + "grad_norm": 0.582103789167394, + "learning_rate": 2.2747909199522105e-05, + "loss": 0.6578, + "num_tokens": 238277853.0, + "step": 2528 + }, + { + "epoch": 0.43164362519201227, + "grad_norm": 0.5198258482005479, + "learning_rate": 2.274108209592081e-05, + "loss": 0.593, + "num_tokens": 238355832.0, + "step": 2529 + }, + { + "epoch": 0.4318143027820447, + "grad_norm": 0.5615283760894826, + "learning_rate": 2.2734254992319512e-05, + "loss": 0.6266, + "num_tokens": 238442937.0, + "step": 2530 + }, + { + "epoch": 0.43198498037207717, + "grad_norm": 0.5201473238667882, + "learning_rate": 2.2727427888718213e-05, + "loss": 0.6471, + "num_tokens": 238532721.0, + "step": 2531 + }, + { + "epoch": 0.4321556579621096, + "grad_norm": 0.44750227083814326, + "learning_rate": 2.2720600785116916e-05, + "loss": 0.5914, + "num_tokens": 238649020.0, + "step": 2532 + }, + { + "epoch": 0.432326335552142, + "grad_norm": 0.5471151883861114, + "learning_rate": 2.271377368151562e-05, + "loss": 0.5935, + "num_tokens": 238730208.0, + "step": 2533 + }, + { + "epoch": 0.4324970131421744, + "grad_norm": 0.496678497446435, + "learning_rate": 2.2706946577914324e-05, + "loss": 0.688, + "num_tokens": 238842108.0, + "step": 2534 + }, + { + "epoch": 0.43266769073220684, + "grad_norm": 0.7760185601836864, + "learning_rate": 2.2700119474313024e-05, + "loss": 0.629, + "num_tokens": 238938784.0, + "step": 2535 + }, + { + "epoch": 0.43283836832223926, + "grad_norm": 0.541915315942646, + "learning_rate": 2.2693292370711725e-05, + "loss": 0.707, + "num_tokens": 239024439.0, + "step": 2536 + }, + { + "epoch": 0.43300904591227174, + "grad_norm": 0.4916292661919091, + "learning_rate": 2.268646526711043e-05, + "loss": 0.5862, + "num_tokens": 239135100.0, + "step": 2537 + }, + { + "epoch": 0.43317972350230416, + "grad_norm": 0.5141653496301187, + "learning_rate": 2.2679638163509132e-05, + "loss": 0.548, + "num_tokens": 239224207.0, + "step": 2538 + }, + { + "epoch": 0.4333504010923366, + "grad_norm": 0.4820296772588601, + "learning_rate": 2.2672811059907836e-05, + "loss": 0.5072, + "num_tokens": 239304370.0, + "step": 2539 + }, + { + "epoch": 0.433521078682369, + "grad_norm": 0.5766977634631069, + "learning_rate": 2.266598395630654e-05, + "loss": 0.6845, + "num_tokens": 239390188.0, + "step": 2540 + }, + { + "epoch": 0.4336917562724014, + "grad_norm": 0.5284240515795154, + "learning_rate": 2.2659156852705243e-05, + "loss": 0.6475, + "num_tokens": 239476011.0, + "step": 2541 + }, + { + "epoch": 0.43386243386243384, + "grad_norm": 0.49447827270605055, + "learning_rate": 2.2652329749103944e-05, + "loss": 0.6855, + "num_tokens": 239608156.0, + "step": 2542 + }, + { + "epoch": 0.4340331114524663, + "grad_norm": 0.4589708700826861, + "learning_rate": 2.2645502645502648e-05, + "loss": 0.6203, + "num_tokens": 239727351.0, + "step": 2543 + }, + { + "epoch": 0.43420378904249873, + "grad_norm": 0.52053326723325, + "learning_rate": 2.263867554190135e-05, + "loss": 0.6016, + "num_tokens": 239813462.0, + "step": 2544 + }, + { + "epoch": 0.43437446663253115, + "grad_norm": 0.5113305967564903, + "learning_rate": 2.2631848438300055e-05, + "loss": 0.6591, + "num_tokens": 239903036.0, + "step": 2545 + }, + { + "epoch": 0.4345451442225636, + "grad_norm": 0.5251654475285746, + "learning_rate": 2.262502133469876e-05, + "loss": 0.6186, + "num_tokens": 239992282.0, + "step": 2546 + }, + { + "epoch": 0.434715821812596, + "grad_norm": 0.4902941961995629, + "learning_rate": 2.2618194231097456e-05, + "loss": 0.6007, + "num_tokens": 240098339.0, + "step": 2547 + }, + { + "epoch": 0.4348864994026284, + "grad_norm": 0.5442764419217503, + "learning_rate": 2.261136712749616e-05, + "loss": 0.5821, + "num_tokens": 240176305.0, + "step": 2548 + }, + { + "epoch": 0.4350571769926609, + "grad_norm": 0.4762878067200765, + "learning_rate": 2.2604540023894863e-05, + "loss": 0.6384, + "num_tokens": 240302197.0, + "step": 2549 + }, + { + "epoch": 0.4352278545826933, + "grad_norm": 0.7472243901535014, + "learning_rate": 2.2597712920293567e-05, + "loss": 0.5898, + "num_tokens": 240391107.0, + "step": 2550 + }, + { + "epoch": 0.4353985321727257, + "grad_norm": 0.5241849598962637, + "learning_rate": 2.259088581669227e-05, + "loss": 0.6005, + "num_tokens": 240468115.0, + "step": 2551 + }, + { + "epoch": 0.43556920976275815, + "grad_norm": 0.48292660462830933, + "learning_rate": 2.258405871309097e-05, + "loss": 0.5806, + "num_tokens": 240565732.0, + "step": 2552 + }, + { + "epoch": 0.43573988735279057, + "grad_norm": 0.516881008260058, + "learning_rate": 2.2577231609489675e-05, + "loss": 0.5778, + "num_tokens": 240648147.0, + "step": 2553 + }, + { + "epoch": 0.435910564942823, + "grad_norm": 0.4895792453705771, + "learning_rate": 2.257040450588838e-05, + "loss": 0.5437, + "num_tokens": 240728460.0, + "step": 2554 + }, + { + "epoch": 0.43608124253285546, + "grad_norm": 0.4758815736941176, + "learning_rate": 2.2563577402287083e-05, + "loss": 0.5684, + "num_tokens": 240824100.0, + "step": 2555 + }, + { + "epoch": 0.4362519201228879, + "grad_norm": 0.46943526819896236, + "learning_rate": 2.2556750298685786e-05, + "loss": 0.5979, + "num_tokens": 240929737.0, + "step": 2556 + }, + { + "epoch": 0.4364225977129203, + "grad_norm": 0.4946878756345388, + "learning_rate": 2.254992319508449e-05, + "loss": 0.6106, + "num_tokens": 241021393.0, + "step": 2557 + }, + { + "epoch": 0.4365932753029527, + "grad_norm": 0.4537656216802752, + "learning_rate": 2.254309609148319e-05, + "loss": 0.5839, + "num_tokens": 241128045.0, + "step": 2558 + }, + { + "epoch": 0.43676395289298514, + "grad_norm": 0.4387663055529923, + "learning_rate": 2.253626898788189e-05, + "loss": 0.5455, + "num_tokens": 241243825.0, + "step": 2559 + }, + { + "epoch": 0.43693463048301756, + "grad_norm": 0.5113537246380078, + "learning_rate": 2.2529441884280595e-05, + "loss": 0.5152, + "num_tokens": 241327034.0, + "step": 2560 + }, + { + "epoch": 0.43710530807305004, + "grad_norm": 0.4494886032275182, + "learning_rate": 2.25226147806793e-05, + "loss": 0.6752, + "num_tokens": 241462151.0, + "step": 2561 + }, + { + "epoch": 0.43727598566308246, + "grad_norm": 0.46342746330674767, + "learning_rate": 2.2515787677078e-05, + "loss": 0.5888, + "num_tokens": 241577934.0, + "step": 2562 + }, + { + "epoch": 0.4374466632531149, + "grad_norm": 0.5397426112038425, + "learning_rate": 2.2508960573476703e-05, + "loss": 0.6329, + "num_tokens": 241657174.0, + "step": 2563 + }, + { + "epoch": 0.4376173408431473, + "grad_norm": 0.49675335161778333, + "learning_rate": 2.2502133469875406e-05, + "loss": 0.6338, + "num_tokens": 241754481.0, + "step": 2564 + }, + { + "epoch": 0.4377880184331797, + "grad_norm": 0.46642326950866575, + "learning_rate": 2.249530636627411e-05, + "loss": 0.5577, + "num_tokens": 241882118.0, + "step": 2565 + }, + { + "epoch": 0.43795869602321214, + "grad_norm": 0.4646964692773092, + "learning_rate": 2.2488479262672814e-05, + "loss": 0.5837, + "num_tokens": 241990885.0, + "step": 2566 + }, + { + "epoch": 0.43812937361324455, + "grad_norm": 0.5281412133226361, + "learning_rate": 2.2481652159071518e-05, + "loss": 0.6268, + "num_tokens": 242079258.0, + "step": 2567 + }, + { + "epoch": 0.43830005120327703, + "grad_norm": 0.5111772076144409, + "learning_rate": 2.247482505547022e-05, + "loss": 0.6914, + "num_tokens": 242177265.0, + "step": 2568 + }, + { + "epoch": 0.43847072879330945, + "grad_norm": 0.4912385753566334, + "learning_rate": 2.2467997951868922e-05, + "loss": 0.6028, + "num_tokens": 242278310.0, + "step": 2569 + }, + { + "epoch": 0.43864140638334187, + "grad_norm": 0.4317212900158851, + "learning_rate": 2.2461170848267625e-05, + "loss": 0.5222, + "num_tokens": 242389825.0, + "step": 2570 + }, + { + "epoch": 0.4388120839733743, + "grad_norm": 0.5243356039464475, + "learning_rate": 2.245434374466633e-05, + "loss": 0.6291, + "num_tokens": 242482664.0, + "step": 2571 + }, + { + "epoch": 0.4389827615634067, + "grad_norm": 0.552420354029794, + "learning_rate": 2.244751664106503e-05, + "loss": 0.6183, + "num_tokens": 242551180.0, + "step": 2572 + }, + { + "epoch": 0.43915343915343913, + "grad_norm": 0.6364496903674042, + "learning_rate": 2.244068953746373e-05, + "loss": 0.7041, + "num_tokens": 242629729.0, + "step": 2573 + }, + { + "epoch": 0.4393241167434716, + "grad_norm": 0.45926330539581645, + "learning_rate": 2.2433862433862434e-05, + "loss": 0.6444, + "num_tokens": 242745446.0, + "step": 2574 + }, + { + "epoch": 0.439494794333504, + "grad_norm": 0.4732218506688462, + "learning_rate": 2.2427035330261138e-05, + "loss": 0.7685, + "num_tokens": 242890885.0, + "step": 2575 + }, + { + "epoch": 0.43966547192353644, + "grad_norm": 0.5692676424228504, + "learning_rate": 2.242020822665984e-05, + "loss": 0.6476, + "num_tokens": 242963528.0, + "step": 2576 + }, + { + "epoch": 0.43983614951356886, + "grad_norm": 0.5206638962334924, + "learning_rate": 2.2413381123058545e-05, + "loss": 0.5993, + "num_tokens": 243046368.0, + "step": 2577 + }, + { + "epoch": 0.4400068271036013, + "grad_norm": 0.552263682973172, + "learning_rate": 2.240655401945725e-05, + "loss": 0.6275, + "num_tokens": 243125148.0, + "step": 2578 + }, + { + "epoch": 0.4401775046936337, + "grad_norm": 0.45422858037733566, + "learning_rate": 2.239972691585595e-05, + "loss": 0.6028, + "num_tokens": 243232313.0, + "step": 2579 + }, + { + "epoch": 0.4403481822836662, + "grad_norm": 0.5349189504649133, + "learning_rate": 2.2392899812254653e-05, + "loss": 0.6405, + "num_tokens": 243319617.0, + "step": 2580 + }, + { + "epoch": 0.4405188598736986, + "grad_norm": 0.5266341657557501, + "learning_rate": 2.2386072708653357e-05, + "loss": 0.61, + "num_tokens": 243398057.0, + "step": 2581 + }, + { + "epoch": 0.440689537463731, + "grad_norm": 0.44233297941610783, + "learning_rate": 2.237924560505206e-05, + "loss": 0.5949, + "num_tokens": 243509749.0, + "step": 2582 + }, + { + "epoch": 0.44086021505376344, + "grad_norm": 0.4968970326965827, + "learning_rate": 2.2372418501450764e-05, + "loss": 0.5884, + "num_tokens": 243594011.0, + "step": 2583 + }, + { + "epoch": 0.44103089264379586, + "grad_norm": 0.48844097107824186, + "learning_rate": 2.236559139784946e-05, + "loss": 0.6413, + "num_tokens": 243695029.0, + "step": 2584 + }, + { + "epoch": 0.4412015702338283, + "grad_norm": 0.4888926129475806, + "learning_rate": 2.2358764294248165e-05, + "loss": 0.5904, + "num_tokens": 243799129.0, + "step": 2585 + }, + { + "epoch": 0.44137224782386075, + "grad_norm": 0.52000822457509, + "learning_rate": 2.235193719064687e-05, + "loss": 0.5118, + "num_tokens": 243867683.0, + "step": 2586 + }, + { + "epoch": 0.4415429254138932, + "grad_norm": 0.4528842311944032, + "learning_rate": 2.2345110087045572e-05, + "loss": 0.5863, + "num_tokens": 243973334.0, + "step": 2587 + }, + { + "epoch": 0.4417136030039256, + "grad_norm": 0.49125821302736017, + "learning_rate": 2.2338282983444276e-05, + "loss": 0.597, + "num_tokens": 244065410.0, + "step": 2588 + }, + { + "epoch": 0.441884280593958, + "grad_norm": 0.6186733275304095, + "learning_rate": 2.2331455879842977e-05, + "loss": 0.5989, + "num_tokens": 244131195.0, + "step": 2589 + }, + { + "epoch": 0.44205495818399043, + "grad_norm": 0.5729987207979114, + "learning_rate": 2.232462877624168e-05, + "loss": 0.5443, + "num_tokens": 244196766.0, + "step": 2590 + }, + { + "epoch": 0.44222563577402285, + "grad_norm": 0.5142090022939979, + "learning_rate": 2.2317801672640384e-05, + "loss": 0.6175, + "num_tokens": 244276240.0, + "step": 2591 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 0.4987865264073442, + "learning_rate": 2.2310974569039088e-05, + "loss": 0.5613, + "num_tokens": 244355625.0, + "step": 2592 + }, + { + "epoch": 0.44256699095408775, + "grad_norm": 0.5009183737494584, + "learning_rate": 2.230414746543779e-05, + "loss": 0.6436, + "num_tokens": 244452544.0, + "step": 2593 + }, + { + "epoch": 0.44273766854412017, + "grad_norm": 0.530079104388083, + "learning_rate": 2.2297320361836495e-05, + "loss": 0.6562, + "num_tokens": 244545761.0, + "step": 2594 + }, + { + "epoch": 0.4429083461341526, + "grad_norm": 0.44332239451849786, + "learning_rate": 2.2290493258235196e-05, + "loss": 0.5028, + "num_tokens": 244653411.0, + "step": 2595 + }, + { + "epoch": 0.443079023724185, + "grad_norm": 0.5366336966522953, + "learning_rate": 2.22836661546339e-05, + "loss": 0.5255, + "num_tokens": 244724951.0, + "step": 2596 + }, + { + "epoch": 0.4432497013142174, + "grad_norm": 0.5166689418663938, + "learning_rate": 2.22768390510326e-05, + "loss": 0.5879, + "num_tokens": 244808471.0, + "step": 2597 + }, + { + "epoch": 0.44342037890424985, + "grad_norm": 0.5717143998873989, + "learning_rate": 2.2270011947431304e-05, + "loss": 0.6585, + "num_tokens": 244882043.0, + "step": 2598 + }, + { + "epoch": 0.4435910564942823, + "grad_norm": 0.5366321214555893, + "learning_rate": 2.2263184843830004e-05, + "loss": 0.6412, + "num_tokens": 244967121.0, + "step": 2599 + }, + { + "epoch": 0.44376173408431474, + "grad_norm": 0.48118848368492245, + "learning_rate": 2.2256357740228708e-05, + "loss": 0.6207, + "num_tokens": 245064507.0, + "step": 2600 + }, + { + "epoch": 0.44393241167434716, + "grad_norm": 0.4489875101767695, + "learning_rate": 2.224953063662741e-05, + "loss": 0.6079, + "num_tokens": 245182602.0, + "step": 2601 + }, + { + "epoch": 0.4441030892643796, + "grad_norm": 0.5047008683575259, + "learning_rate": 2.2242703533026115e-05, + "loss": 0.5571, + "num_tokens": 245264495.0, + "step": 2602 + }, + { + "epoch": 0.444273766854412, + "grad_norm": 0.5761905172076729, + "learning_rate": 2.223587642942482e-05, + "loss": 0.61, + "num_tokens": 245340290.0, + "step": 2603 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.4473611792552422, + "learning_rate": 2.2229049325823523e-05, + "loss": 0.5845, + "num_tokens": 245455185.0, + "step": 2604 + }, + { + "epoch": 0.4446151220344769, + "grad_norm": 0.43914434042460493, + "learning_rate": 2.2222222222222227e-05, + "loss": 0.5744, + "num_tokens": 245564029.0, + "step": 2605 + }, + { + "epoch": 0.4447857996245093, + "grad_norm": 0.4870096486350764, + "learning_rate": 2.2215395118620927e-05, + "loss": 0.6041, + "num_tokens": 245662664.0, + "step": 2606 + }, + { + "epoch": 0.44495647721454173, + "grad_norm": 0.5109084552308337, + "learning_rate": 2.220856801501963e-05, + "loss": 0.6697, + "num_tokens": 245763917.0, + "step": 2607 + }, + { + "epoch": 0.44512715480457415, + "grad_norm": 0.5426651367902541, + "learning_rate": 2.2201740911418335e-05, + "loss": 0.5781, + "num_tokens": 245844796.0, + "step": 2608 + }, + { + "epoch": 0.4452978323946066, + "grad_norm": 0.546278051504539, + "learning_rate": 2.2194913807817035e-05, + "loss": 0.6757, + "num_tokens": 245931569.0, + "step": 2609 + }, + { + "epoch": 0.445468509984639, + "grad_norm": 0.5620117455950127, + "learning_rate": 2.2188086704215735e-05, + "loss": 0.5552, + "num_tokens": 245999461.0, + "step": 2610 + }, + { + "epoch": 0.44563918757467147, + "grad_norm": 0.5028485883604119, + "learning_rate": 2.218125960061444e-05, + "loss": 0.6174, + "num_tokens": 246094720.0, + "step": 2611 + }, + { + "epoch": 0.4458098651647039, + "grad_norm": 0.5023630423979022, + "learning_rate": 2.2174432497013143e-05, + "loss": 0.6232, + "num_tokens": 246184114.0, + "step": 2612 + }, + { + "epoch": 0.4459805427547363, + "grad_norm": 0.5359197112078263, + "learning_rate": 2.2167605393411847e-05, + "loss": 0.6819, + "num_tokens": 246278010.0, + "step": 2613 + }, + { + "epoch": 0.44615122034476873, + "grad_norm": 0.46660335018241733, + "learning_rate": 2.216077828981055e-05, + "loss": 0.5823, + "num_tokens": 246378332.0, + "step": 2614 + }, + { + "epoch": 0.44632189793480115, + "grad_norm": 0.4502390614138913, + "learning_rate": 2.2153951186209254e-05, + "loss": 0.5737, + "num_tokens": 246491698.0, + "step": 2615 + }, + { + "epoch": 0.44649257552483357, + "grad_norm": 0.45962104716658414, + "learning_rate": 2.2147124082607955e-05, + "loss": 0.5303, + "num_tokens": 246592932.0, + "step": 2616 + }, + { + "epoch": 0.44666325311486604, + "grad_norm": 0.49897038605380406, + "learning_rate": 2.2140296979006658e-05, + "loss": 0.5645, + "num_tokens": 246689124.0, + "step": 2617 + }, + { + "epoch": 0.44683393070489846, + "grad_norm": 0.5572183525256971, + "learning_rate": 2.2133469875405362e-05, + "loss": 0.6093, + "num_tokens": 246775559.0, + "step": 2618 + }, + { + "epoch": 0.4470046082949309, + "grad_norm": 0.5320918682040001, + "learning_rate": 2.2126642771804066e-05, + "loss": 0.6907, + "num_tokens": 246875798.0, + "step": 2619 + }, + { + "epoch": 0.4471752858849633, + "grad_norm": 0.49902347269483965, + "learning_rate": 2.211981566820277e-05, + "loss": 0.5803, + "num_tokens": 246967965.0, + "step": 2620 + }, + { + "epoch": 0.4473459634749957, + "grad_norm": 0.4697614620946911, + "learning_rate": 2.2112988564601467e-05, + "loss": 0.5624, + "num_tokens": 247065684.0, + "step": 2621 + }, + { + "epoch": 0.44751664106502814, + "grad_norm": 0.5098309958901, + "learning_rate": 2.210616146100017e-05, + "loss": 0.5588, + "num_tokens": 247143650.0, + "step": 2622 + }, + { + "epoch": 0.4476873186550606, + "grad_norm": 0.45301152421528595, + "learning_rate": 2.2099334357398874e-05, + "loss": 0.5317, + "num_tokens": 247241394.0, + "step": 2623 + }, + { + "epoch": 0.44785799624509304, + "grad_norm": 0.5791304347845814, + "learning_rate": 2.2092507253797578e-05, + "loss": 0.7056, + "num_tokens": 247317827.0, + "step": 2624 + }, + { + "epoch": 0.44802867383512546, + "grad_norm": 0.510602467356276, + "learning_rate": 2.208568015019628e-05, + "loss": 0.621, + "num_tokens": 247398968.0, + "step": 2625 + }, + { + "epoch": 0.4481993514251579, + "grad_norm": 0.5520524240197466, + "learning_rate": 2.2078853046594982e-05, + "loss": 0.6341, + "num_tokens": 247467696.0, + "step": 2626 + }, + { + "epoch": 0.4483700290151903, + "grad_norm": 0.440156804912293, + "learning_rate": 2.2072025942993686e-05, + "loss": 0.6334, + "num_tokens": 247589560.0, + "step": 2627 + }, + { + "epoch": 0.4485407066052227, + "grad_norm": 0.4814301688892049, + "learning_rate": 2.206519883939239e-05, + "loss": 0.6645, + "num_tokens": 247700874.0, + "step": 2628 + }, + { + "epoch": 0.44871138419525514, + "grad_norm": 0.5120669456164516, + "learning_rate": 2.2058371735791093e-05, + "loss": 0.577, + "num_tokens": 247789275.0, + "step": 2629 + }, + { + "epoch": 0.4488820617852876, + "grad_norm": 0.4851087413702751, + "learning_rate": 2.2051544632189797e-05, + "loss": 0.5764, + "num_tokens": 247873523.0, + "step": 2630 + }, + { + "epoch": 0.44905273937532003, + "grad_norm": 0.4813969553208375, + "learning_rate": 2.20447175285885e-05, + "loss": 0.6201, + "num_tokens": 247981152.0, + "step": 2631 + }, + { + "epoch": 0.44922341696535245, + "grad_norm": 0.4671701413468477, + "learning_rate": 2.20378904249872e-05, + "loss": 0.5984, + "num_tokens": 248091723.0, + "step": 2632 + }, + { + "epoch": 0.44939409455538487, + "grad_norm": 0.479262607556036, + "learning_rate": 2.2031063321385905e-05, + "loss": 0.5625, + "num_tokens": 248185121.0, + "step": 2633 + }, + { + "epoch": 0.4495647721454173, + "grad_norm": 0.5029360008865684, + "learning_rate": 2.2024236217784605e-05, + "loss": 0.5833, + "num_tokens": 248270213.0, + "step": 2634 + }, + { + "epoch": 0.4497354497354497, + "grad_norm": 0.47440513991610844, + "learning_rate": 2.201740911418331e-05, + "loss": 0.5571, + "num_tokens": 248364828.0, + "step": 2635 + }, + { + "epoch": 0.4499061273254822, + "grad_norm": 0.45780678804342867, + "learning_rate": 2.2010582010582013e-05, + "loss": 0.5074, + "num_tokens": 248462625.0, + "step": 2636 + }, + { + "epoch": 0.4500768049155146, + "grad_norm": 0.43583090453738677, + "learning_rate": 2.2003754906980713e-05, + "loss": 0.5985, + "num_tokens": 248587365.0, + "step": 2637 + }, + { + "epoch": 0.450247482505547, + "grad_norm": 0.4751326898654281, + "learning_rate": 2.1996927803379417e-05, + "loss": 0.549, + "num_tokens": 248675408.0, + "step": 2638 + }, + { + "epoch": 0.45041816009557945, + "grad_norm": 0.4897931279866619, + "learning_rate": 2.199010069977812e-05, + "loss": 0.5767, + "num_tokens": 248772781.0, + "step": 2639 + }, + { + "epoch": 0.45058883768561186, + "grad_norm": 0.464968438204167, + "learning_rate": 2.1983273596176824e-05, + "loss": 0.5985, + "num_tokens": 248875389.0, + "step": 2640 + }, + { + "epoch": 0.4507595152756443, + "grad_norm": 0.47934141604342356, + "learning_rate": 2.1976446492575528e-05, + "loss": 0.6473, + "num_tokens": 248978847.0, + "step": 2641 + }, + { + "epoch": 0.45093019286567676, + "grad_norm": 0.46202929448705005, + "learning_rate": 2.1969619388974232e-05, + "loss": 0.5557, + "num_tokens": 249077387.0, + "step": 2642 + }, + { + "epoch": 0.4511008704557092, + "grad_norm": 0.4669031593904029, + "learning_rate": 2.1962792285372932e-05, + "loss": 0.6377, + "num_tokens": 249186186.0, + "step": 2643 + }, + { + "epoch": 0.4512715480457416, + "grad_norm": 0.4818446992141536, + "learning_rate": 2.1955965181771636e-05, + "loss": 0.6142, + "num_tokens": 249276322.0, + "step": 2644 + }, + { + "epoch": 0.451442225635774, + "grad_norm": 0.5206873682325155, + "learning_rate": 2.194913807817034e-05, + "loss": 0.5931, + "num_tokens": 249356032.0, + "step": 2645 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 0.4904311948887937, + "learning_rate": 2.194231097456904e-05, + "loss": 0.6456, + "num_tokens": 249447783.0, + "step": 2646 + }, + { + "epoch": 0.45178358081583886, + "grad_norm": 0.5051070625167915, + "learning_rate": 2.193548387096774e-05, + "loss": 0.653, + "num_tokens": 249544918.0, + "step": 2647 + }, + { + "epoch": 0.45195425840587133, + "grad_norm": 0.48394695161020934, + "learning_rate": 2.1928656767366444e-05, + "loss": 0.6731, + "num_tokens": 249666134.0, + "step": 2648 + }, + { + "epoch": 0.45212493599590375, + "grad_norm": 0.6236400763954874, + "learning_rate": 2.1921829663765148e-05, + "loss": 0.6376, + "num_tokens": 249730722.0, + "step": 2649 + }, + { + "epoch": 0.4522956135859362, + "grad_norm": 0.5296528795195605, + "learning_rate": 2.1915002560163852e-05, + "loss": 0.6439, + "num_tokens": 249824393.0, + "step": 2650 + }, + { + "epoch": 0.4524662911759686, + "grad_norm": 0.5402670088569885, + "learning_rate": 2.1908175456562556e-05, + "loss": 0.6271, + "num_tokens": 249899777.0, + "step": 2651 + }, + { + "epoch": 0.452636968766001, + "grad_norm": 0.5214666083029563, + "learning_rate": 2.190134835296126e-05, + "loss": 0.6329, + "num_tokens": 249983658.0, + "step": 2652 + }, + { + "epoch": 0.45280764635603343, + "grad_norm": 0.5155060497767013, + "learning_rate": 2.189452124935996e-05, + "loss": 0.5468, + "num_tokens": 250056972.0, + "step": 2653 + }, + { + "epoch": 0.4529783239460659, + "grad_norm": 0.5123576252338988, + "learning_rate": 2.1887694145758664e-05, + "loss": 0.6084, + "num_tokens": 250146056.0, + "step": 2654 + }, + { + "epoch": 0.45314900153609833, + "grad_norm": 0.4573813913309695, + "learning_rate": 2.1880867042157367e-05, + "loss": 0.5401, + "num_tokens": 250239996.0, + "step": 2655 + }, + { + "epoch": 0.45331967912613075, + "grad_norm": 0.4900878244985309, + "learning_rate": 2.187403993855607e-05, + "loss": 0.5688, + "num_tokens": 250323790.0, + "step": 2656 + }, + { + "epoch": 0.45349035671616317, + "grad_norm": 0.4149883874484175, + "learning_rate": 2.1867212834954775e-05, + "loss": 0.5082, + "num_tokens": 250441476.0, + "step": 2657 + }, + { + "epoch": 0.4536610343061956, + "grad_norm": 0.47656844710860563, + "learning_rate": 2.1860385731353472e-05, + "loss": 0.6266, + "num_tokens": 250544290.0, + "step": 2658 + }, + { + "epoch": 0.453831711896228, + "grad_norm": 0.4845560280103682, + "learning_rate": 2.1853558627752176e-05, + "loss": 0.605, + "num_tokens": 250635137.0, + "step": 2659 + }, + { + "epoch": 0.4540023894862604, + "grad_norm": 0.5501062747650491, + "learning_rate": 2.184673152415088e-05, + "loss": 0.6716, + "num_tokens": 250723603.0, + "step": 2660 + }, + { + "epoch": 0.4541730670762929, + "grad_norm": 0.5060656645449053, + "learning_rate": 2.1839904420549583e-05, + "loss": 0.6075, + "num_tokens": 250807032.0, + "step": 2661 + }, + { + "epoch": 0.4543437446663253, + "grad_norm": 0.5490914490169315, + "learning_rate": 2.1833077316948287e-05, + "loss": 0.6039, + "num_tokens": 250881349.0, + "step": 2662 + }, + { + "epoch": 0.45451442225635774, + "grad_norm": 0.5518084638535478, + "learning_rate": 2.1826250213346987e-05, + "loss": 0.6173, + "num_tokens": 250954601.0, + "step": 2663 + }, + { + "epoch": 0.45468509984639016, + "grad_norm": 0.5188757259891337, + "learning_rate": 2.181942310974569e-05, + "loss": 0.5742, + "num_tokens": 251037888.0, + "step": 2664 + }, + { + "epoch": 0.4548557774364226, + "grad_norm": 0.42891735579509616, + "learning_rate": 2.1812596006144395e-05, + "loss": 0.6409, + "num_tokens": 251189573.0, + "step": 2665 + }, + { + "epoch": 0.455026455026455, + "grad_norm": 0.4983411705245901, + "learning_rate": 2.18057689025431e-05, + "loss": 0.687, + "num_tokens": 251287495.0, + "step": 2666 + }, + { + "epoch": 0.4551971326164875, + "grad_norm": 0.4277310209837677, + "learning_rate": 2.1798941798941802e-05, + "loss": 0.5603, + "num_tokens": 251399551.0, + "step": 2667 + }, + { + "epoch": 0.4553678102065199, + "grad_norm": 0.5379763310298462, + "learning_rate": 2.1792114695340506e-05, + "loss": 0.6491, + "num_tokens": 251495460.0, + "step": 2668 + }, + { + "epoch": 0.4555384877965523, + "grad_norm": 0.5793320463852714, + "learning_rate": 2.1785287591739207e-05, + "loss": 0.6586, + "num_tokens": 251569850.0, + "step": 2669 + }, + { + "epoch": 0.45570916538658474, + "grad_norm": 0.5138862774460591, + "learning_rate": 2.177846048813791e-05, + "loss": 0.4993, + "num_tokens": 251635219.0, + "step": 2670 + }, + { + "epoch": 0.45587984297661716, + "grad_norm": 0.46451610028617224, + "learning_rate": 2.177163338453661e-05, + "loss": 0.6478, + "num_tokens": 251734664.0, + "step": 2671 + }, + { + "epoch": 0.4560505205666496, + "grad_norm": 0.46347905200253525, + "learning_rate": 2.1764806280935314e-05, + "loss": 0.5132, + "num_tokens": 251825135.0, + "step": 2672 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 0.5343093870676354, + "learning_rate": 2.1757979177334018e-05, + "loss": 0.6766, + "num_tokens": 251913129.0, + "step": 2673 + }, + { + "epoch": 0.45639187574671447, + "grad_norm": 0.4807991156526324, + "learning_rate": 2.175115207373272e-05, + "loss": 0.6574, + "num_tokens": 252040267.0, + "step": 2674 + }, + { + "epoch": 0.4565625533367469, + "grad_norm": 0.449970939357211, + "learning_rate": 2.1744324970131422e-05, + "loss": 0.6522, + "num_tokens": 252162680.0, + "step": 2675 + }, + { + "epoch": 0.4567332309267793, + "grad_norm": 0.4899504671693258, + "learning_rate": 2.1737497866530126e-05, + "loss": 0.539, + "num_tokens": 252242028.0, + "step": 2676 + }, + { + "epoch": 0.45690390851681173, + "grad_norm": 0.534506894966146, + "learning_rate": 2.173067076292883e-05, + "loss": 0.6374, + "num_tokens": 252320352.0, + "step": 2677 + }, + { + "epoch": 0.45707458610684415, + "grad_norm": 0.5095412357643972, + "learning_rate": 2.1723843659327534e-05, + "loss": 0.5281, + "num_tokens": 252399500.0, + "step": 2678 + }, + { + "epoch": 0.4572452636968766, + "grad_norm": 0.49495185182561446, + "learning_rate": 2.1717016555726237e-05, + "loss": 0.6448, + "num_tokens": 252503539.0, + "step": 2679 + }, + { + "epoch": 0.45741594128690904, + "grad_norm": 0.5401902817117739, + "learning_rate": 2.1710189452124938e-05, + "loss": 0.6448, + "num_tokens": 252601035.0, + "step": 2680 + }, + { + "epoch": 0.45758661887694146, + "grad_norm": 0.5126373176164805, + "learning_rate": 2.170336234852364e-05, + "loss": 0.5221, + "num_tokens": 252675476.0, + "step": 2681 + }, + { + "epoch": 0.4577572964669739, + "grad_norm": 0.45887015454751556, + "learning_rate": 2.1696535244922345e-05, + "loss": 0.6513, + "num_tokens": 252793682.0, + "step": 2682 + }, + { + "epoch": 0.4579279740570063, + "grad_norm": 0.4851983085177145, + "learning_rate": 2.1689708141321046e-05, + "loss": 0.5626, + "num_tokens": 252883635.0, + "step": 2683 + }, + { + "epoch": 0.4580986516470387, + "grad_norm": 0.4475961757021382, + "learning_rate": 2.1682881037719746e-05, + "loss": 0.584, + "num_tokens": 252988266.0, + "step": 2684 + }, + { + "epoch": 0.4582693292370712, + "grad_norm": 0.533397037662657, + "learning_rate": 2.167605393411845e-05, + "loss": 0.5458, + "num_tokens": 253060434.0, + "step": 2685 + }, + { + "epoch": 0.4584400068271036, + "grad_norm": 0.468909483484282, + "learning_rate": 2.1669226830517154e-05, + "loss": 0.567, + "num_tokens": 253163661.0, + "step": 2686 + }, + { + "epoch": 0.45861068441713604, + "grad_norm": 0.49013215528410803, + "learning_rate": 2.1662399726915857e-05, + "loss": 0.5447, + "num_tokens": 253252443.0, + "step": 2687 + }, + { + "epoch": 0.45878136200716846, + "grad_norm": 0.5101037997252743, + "learning_rate": 2.165557262331456e-05, + "loss": 0.6503, + "num_tokens": 253343889.0, + "step": 2688 + }, + { + "epoch": 0.4589520395972009, + "grad_norm": 0.5416609106886574, + "learning_rate": 2.1648745519713265e-05, + "loss": 0.5865, + "num_tokens": 253410157.0, + "step": 2689 + }, + { + "epoch": 0.4591227171872333, + "grad_norm": 0.4763870751627381, + "learning_rate": 2.1641918416111965e-05, + "loss": 0.6105, + "num_tokens": 253510128.0, + "step": 2690 + }, + { + "epoch": 0.4592933947772657, + "grad_norm": 0.5192453133651226, + "learning_rate": 2.163509131251067e-05, + "loss": 0.586, + "num_tokens": 253613755.0, + "step": 2691 + }, + { + "epoch": 0.4594640723672982, + "grad_norm": 0.41801178293284635, + "learning_rate": 2.1628264208909373e-05, + "loss": 0.5344, + "num_tokens": 253729015.0, + "step": 2692 + }, + { + "epoch": 0.4596347499573306, + "grad_norm": 0.5175024187026647, + "learning_rate": 2.1621437105308076e-05, + "loss": 0.6966, + "num_tokens": 253835320.0, + "step": 2693 + }, + { + "epoch": 0.45980542754736303, + "grad_norm": 0.46692279736761694, + "learning_rate": 2.161461000170678e-05, + "loss": 0.6089, + "num_tokens": 253946172.0, + "step": 2694 + }, + { + "epoch": 0.45997610513739545, + "grad_norm": 0.4556377595573705, + "learning_rate": 2.1607782898105484e-05, + "loss": 0.6357, + "num_tokens": 254069081.0, + "step": 2695 + }, + { + "epoch": 0.46014678272742787, + "grad_norm": 0.5096058063937354, + "learning_rate": 2.160095579450418e-05, + "loss": 0.5666, + "num_tokens": 254143707.0, + "step": 2696 + }, + { + "epoch": 0.4603174603174603, + "grad_norm": 0.4697062558434564, + "learning_rate": 2.1594128690902885e-05, + "loss": 0.6854, + "num_tokens": 254257172.0, + "step": 2697 + }, + { + "epoch": 0.46048813790749277, + "grad_norm": 0.43452372290769165, + "learning_rate": 2.158730158730159e-05, + "loss": 0.4872, + "num_tokens": 254359008.0, + "step": 2698 + }, + { + "epoch": 0.4606588154975252, + "grad_norm": 0.47506255175268614, + "learning_rate": 2.1580474483700292e-05, + "loss": 0.5584, + "num_tokens": 254457732.0, + "step": 2699 + }, + { + "epoch": 0.4608294930875576, + "grad_norm": 0.4825256722489792, + "learning_rate": 2.1573647380098993e-05, + "loss": 0.5267, + "num_tokens": 254548013.0, + "step": 2700 + }, + { + "epoch": 0.46100017067759, + "grad_norm": 0.4475224760665684, + "learning_rate": 2.1566820276497696e-05, + "loss": 0.5225, + "num_tokens": 254643516.0, + "step": 2701 + }, + { + "epoch": 0.46117084826762245, + "grad_norm": 0.4970843091402074, + "learning_rate": 2.15599931728964e-05, + "loss": 0.5754, + "num_tokens": 254726894.0, + "step": 2702 + }, + { + "epoch": 0.46134152585765487, + "grad_norm": 0.532043048192978, + "learning_rate": 2.1553166069295104e-05, + "loss": 0.6824, + "num_tokens": 254813121.0, + "step": 2703 + }, + { + "epoch": 0.46151220344768734, + "grad_norm": 0.5809695355203921, + "learning_rate": 2.1546338965693808e-05, + "loss": 0.6376, + "num_tokens": 254884956.0, + "step": 2704 + }, + { + "epoch": 0.46168288103771976, + "grad_norm": 0.4798013170735045, + "learning_rate": 2.153951186209251e-05, + "loss": 0.6295, + "num_tokens": 254994943.0, + "step": 2705 + }, + { + "epoch": 0.4618535586277522, + "grad_norm": 0.4504287928624493, + "learning_rate": 2.1532684758491212e-05, + "loss": 0.5506, + "num_tokens": 255103187.0, + "step": 2706 + }, + { + "epoch": 0.4620242362177846, + "grad_norm": 0.44893247123842717, + "learning_rate": 2.1525857654889916e-05, + "loss": 0.6333, + "num_tokens": 255230709.0, + "step": 2707 + }, + { + "epoch": 0.462194913807817, + "grad_norm": 0.5165339470923785, + "learning_rate": 2.1519030551288616e-05, + "loss": 0.6054, + "num_tokens": 255324439.0, + "step": 2708 + }, + { + "epoch": 0.46236559139784944, + "grad_norm": 0.5165071690117639, + "learning_rate": 2.151220344768732e-05, + "loss": 0.6107, + "num_tokens": 255417881.0, + "step": 2709 + }, + { + "epoch": 0.4625362689878819, + "grad_norm": 0.5118061736669948, + "learning_rate": 2.1505376344086024e-05, + "loss": 0.6294, + "num_tokens": 255503647.0, + "step": 2710 + }, + { + "epoch": 0.46270694657791434, + "grad_norm": 0.4826331281296802, + "learning_rate": 2.1498549240484724e-05, + "loss": 0.6237, + "num_tokens": 255630711.0, + "step": 2711 + }, + { + "epoch": 0.46287762416794676, + "grad_norm": 0.495917927205697, + "learning_rate": 2.1491722136883428e-05, + "loss": 0.6224, + "num_tokens": 255723881.0, + "step": 2712 + }, + { + "epoch": 0.4630483017579792, + "grad_norm": 0.5281611216289528, + "learning_rate": 2.148489503328213e-05, + "loss": 0.569, + "num_tokens": 255802399.0, + "step": 2713 + }, + { + "epoch": 0.4632189793480116, + "grad_norm": 0.49943734826201736, + "learning_rate": 2.1478067929680835e-05, + "loss": 0.6321, + "num_tokens": 255901555.0, + "step": 2714 + }, + { + "epoch": 0.463389656938044, + "grad_norm": 0.4338083277999674, + "learning_rate": 2.147124082607954e-05, + "loss": 0.5535, + "num_tokens": 256020739.0, + "step": 2715 + }, + { + "epoch": 0.4635603345280765, + "grad_norm": 0.5052399142250522, + "learning_rate": 2.1464413722478243e-05, + "loss": 0.6914, + "num_tokens": 256124207.0, + "step": 2716 + }, + { + "epoch": 0.4637310121181089, + "grad_norm": 0.48899857994621937, + "learning_rate": 2.1457586618876943e-05, + "loss": 0.7021, + "num_tokens": 256238330.0, + "step": 2717 + }, + { + "epoch": 0.46390168970814133, + "grad_norm": 0.4792818614164604, + "learning_rate": 2.1450759515275647e-05, + "loss": 0.6451, + "num_tokens": 256356293.0, + "step": 2718 + }, + { + "epoch": 0.46407236729817375, + "grad_norm": 0.4711209198146715, + "learning_rate": 2.144393241167435e-05, + "loss": 0.6062, + "num_tokens": 256465675.0, + "step": 2719 + }, + { + "epoch": 0.46424304488820617, + "grad_norm": 0.4794865769132404, + "learning_rate": 2.143710530807305e-05, + "loss": 0.5781, + "num_tokens": 256567406.0, + "step": 2720 + }, + { + "epoch": 0.4644137224782386, + "grad_norm": 0.5822744476091425, + "learning_rate": 2.143027820447175e-05, + "loss": 0.5562, + "num_tokens": 256638073.0, + "step": 2721 + }, + { + "epoch": 0.464584400068271, + "grad_norm": 0.5174570146541233, + "learning_rate": 2.1423451100870455e-05, + "loss": 0.6498, + "num_tokens": 256728903.0, + "step": 2722 + }, + { + "epoch": 0.4647550776583035, + "grad_norm": 0.522380222445697, + "learning_rate": 2.141662399726916e-05, + "loss": 0.5305, + "num_tokens": 256803410.0, + "step": 2723 + }, + { + "epoch": 0.4649257552483359, + "grad_norm": 0.4705149360734729, + "learning_rate": 2.1409796893667863e-05, + "loss": 0.5351, + "num_tokens": 256894565.0, + "step": 2724 + }, + { + "epoch": 0.4650964328383683, + "grad_norm": 0.5125265450739548, + "learning_rate": 2.1402969790066566e-05, + "loss": 0.5662, + "num_tokens": 256968342.0, + "step": 2725 + }, + { + "epoch": 0.46526711042840074, + "grad_norm": 0.48535455295541086, + "learning_rate": 2.139614268646527e-05, + "loss": 0.6385, + "num_tokens": 257088775.0, + "step": 2726 + }, + { + "epoch": 0.46543778801843316, + "grad_norm": 0.5046206349465021, + "learning_rate": 2.138931558286397e-05, + "loss": 0.6356, + "num_tokens": 257180623.0, + "step": 2727 + }, + { + "epoch": 0.4656084656084656, + "grad_norm": 0.4566979731256016, + "learning_rate": 2.1382488479262674e-05, + "loss": 0.5548, + "num_tokens": 257283260.0, + "step": 2728 + }, + { + "epoch": 0.46577914319849806, + "grad_norm": 0.48635655470546224, + "learning_rate": 2.1375661375661378e-05, + "loss": 0.6265, + "num_tokens": 257383139.0, + "step": 2729 + }, + { + "epoch": 0.4659498207885305, + "grad_norm": 0.43836545371515856, + "learning_rate": 2.1368834272060082e-05, + "loss": 0.53, + "num_tokens": 257495692.0, + "step": 2730 + }, + { + "epoch": 0.4661204983785629, + "grad_norm": 0.5017524088201423, + "learning_rate": 2.1362007168458786e-05, + "loss": 0.5858, + "num_tokens": 257580900.0, + "step": 2731 + }, + { + "epoch": 0.4662911759685953, + "grad_norm": 0.5175788731086552, + "learning_rate": 2.135518006485749e-05, + "loss": 0.6439, + "num_tokens": 257666792.0, + "step": 2732 + }, + { + "epoch": 0.46646185355862774, + "grad_norm": 0.5013665939126793, + "learning_rate": 2.1348352961256186e-05, + "loss": 0.5731, + "num_tokens": 257746823.0, + "step": 2733 + }, + { + "epoch": 0.46663253114866016, + "grad_norm": 0.5213046230528113, + "learning_rate": 2.134152585765489e-05, + "loss": 0.5961, + "num_tokens": 257832161.0, + "step": 2734 + }, + { + "epoch": 0.46680320873869263, + "grad_norm": 0.4698592175420274, + "learning_rate": 2.1334698754053594e-05, + "loss": 0.5557, + "num_tokens": 257925650.0, + "step": 2735 + }, + { + "epoch": 0.46697388632872505, + "grad_norm": 0.5141208416163267, + "learning_rate": 2.1327871650452298e-05, + "loss": 0.5753, + "num_tokens": 258005482.0, + "step": 2736 + }, + { + "epoch": 0.46714456391875747, + "grad_norm": 0.47903234581461634, + "learning_rate": 2.1321044546850998e-05, + "loss": 0.6635, + "num_tokens": 258122242.0, + "step": 2737 + }, + { + "epoch": 0.4673152415087899, + "grad_norm": 0.48032257526866257, + "learning_rate": 2.1314217443249702e-05, + "loss": 0.5934, + "num_tokens": 258231666.0, + "step": 2738 + }, + { + "epoch": 0.4674859190988223, + "grad_norm": 0.5062134724087523, + "learning_rate": 2.1307390339648406e-05, + "loss": 0.557, + "num_tokens": 258315621.0, + "step": 2739 + }, + { + "epoch": 0.46765659668885473, + "grad_norm": 0.42315926867101145, + "learning_rate": 2.130056323604711e-05, + "loss": 0.5864, + "num_tokens": 258462027.0, + "step": 2740 + }, + { + "epoch": 0.4678272742788872, + "grad_norm": 0.5076751437387765, + "learning_rate": 2.1293736132445813e-05, + "loss": 0.574, + "num_tokens": 258540097.0, + "step": 2741 + }, + { + "epoch": 0.4679979518689196, + "grad_norm": 0.5828506260265786, + "learning_rate": 2.1286909028844517e-05, + "loss": 0.5912, + "num_tokens": 258603939.0, + "step": 2742 + }, + { + "epoch": 0.46816862945895205, + "grad_norm": 0.4918951916378864, + "learning_rate": 2.128008192524322e-05, + "loss": 0.5281, + "num_tokens": 258686714.0, + "step": 2743 + }, + { + "epoch": 0.46833930704898447, + "grad_norm": 0.599051915332191, + "learning_rate": 2.127325482164192e-05, + "loss": 0.5225, + "num_tokens": 258742602.0, + "step": 2744 + }, + { + "epoch": 0.4685099846390169, + "grad_norm": 0.48175779235830407, + "learning_rate": 2.126642771804062e-05, + "loss": 0.6622, + "num_tokens": 258860547.0, + "step": 2745 + }, + { + "epoch": 0.4686806622290493, + "grad_norm": 0.5039805038326765, + "learning_rate": 2.1259600614439325e-05, + "loss": 0.5565, + "num_tokens": 258946288.0, + "step": 2746 + }, + { + "epoch": 0.4688513398190818, + "grad_norm": 0.4456602429145936, + "learning_rate": 2.125277351083803e-05, + "loss": 0.5703, + "num_tokens": 259048471.0, + "step": 2747 + }, + { + "epoch": 0.4690220174091142, + "grad_norm": 0.5213329659127194, + "learning_rate": 2.124594640723673e-05, + "loss": 0.6473, + "num_tokens": 259136222.0, + "step": 2748 + }, + { + "epoch": 0.4691926949991466, + "grad_norm": 0.4810675999084171, + "learning_rate": 2.1239119303635433e-05, + "loss": 0.5731, + "num_tokens": 259228976.0, + "step": 2749 + }, + { + "epoch": 0.46936337258917904, + "grad_norm": 0.4616284869674558, + "learning_rate": 2.1232292200034137e-05, + "loss": 0.5457, + "num_tokens": 259327772.0, + "step": 2750 + }, + { + "epoch": 0.46953405017921146, + "grad_norm": 0.5143090665069365, + "learning_rate": 2.122546509643284e-05, + "loss": 0.5636, + "num_tokens": 259439027.0, + "step": 2751 + }, + { + "epoch": 0.4697047277692439, + "grad_norm": 0.5099116798577986, + "learning_rate": 2.1218637992831544e-05, + "loss": 0.5871, + "num_tokens": 259515938.0, + "step": 2752 + }, + { + "epoch": 0.46987540535927635, + "grad_norm": 0.5043400886320073, + "learning_rate": 2.1211810889230248e-05, + "loss": 0.5714, + "num_tokens": 259602114.0, + "step": 2753 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 0.5356962274851458, + "learning_rate": 2.120498378562895e-05, + "loss": 0.5962, + "num_tokens": 259681020.0, + "step": 2754 + }, + { + "epoch": 0.4702167605393412, + "grad_norm": 0.4933197566261178, + "learning_rate": 2.1198156682027652e-05, + "loss": 0.6703, + "num_tokens": 259788336.0, + "step": 2755 + }, + { + "epoch": 0.4703874381293736, + "grad_norm": 0.45607870193496064, + "learning_rate": 2.1191329578426356e-05, + "loss": 0.6339, + "num_tokens": 259899175.0, + "step": 2756 + }, + { + "epoch": 0.47055811571940603, + "grad_norm": 0.572336262821779, + "learning_rate": 2.118450247482506e-05, + "loss": 0.7077, + "num_tokens": 260013792.0, + "step": 2757 + }, + { + "epoch": 0.47072879330943845, + "grad_norm": 0.4747638104208514, + "learning_rate": 2.1177675371223757e-05, + "loss": 0.5952, + "num_tokens": 260107161.0, + "step": 2758 + }, + { + "epoch": 0.4708994708994709, + "grad_norm": 0.4880430412156721, + "learning_rate": 2.117084826762246e-05, + "loss": 0.6175, + "num_tokens": 260211867.0, + "step": 2759 + }, + { + "epoch": 0.47107014848950335, + "grad_norm": 0.48080589764162546, + "learning_rate": 2.1164021164021164e-05, + "loss": 0.6284, + "num_tokens": 260329563.0, + "step": 2760 + }, + { + "epoch": 0.47124082607953577, + "grad_norm": 0.5640412135714783, + "learning_rate": 2.1157194060419868e-05, + "loss": 0.5451, + "num_tokens": 260408467.0, + "step": 2761 + }, + { + "epoch": 0.4714115036695682, + "grad_norm": 0.5308643548808377, + "learning_rate": 2.1150366956818572e-05, + "loss": 0.6434, + "num_tokens": 260499690.0, + "step": 2762 + }, + { + "epoch": 0.4715821812596006, + "grad_norm": 0.5479100710111386, + "learning_rate": 2.1143539853217276e-05, + "loss": 0.572, + "num_tokens": 260568393.0, + "step": 2763 + }, + { + "epoch": 0.471752858849633, + "grad_norm": 0.4904881270152355, + "learning_rate": 2.1136712749615976e-05, + "loss": 0.6574, + "num_tokens": 260671939.0, + "step": 2764 + }, + { + "epoch": 0.47192353643966545, + "grad_norm": 0.48745440236315607, + "learning_rate": 2.112988564601468e-05, + "loss": 0.5118, + "num_tokens": 260750790.0, + "step": 2765 + }, + { + "epoch": 0.4720942140296979, + "grad_norm": 0.5120088791789342, + "learning_rate": 2.1123058542413383e-05, + "loss": 0.5665, + "num_tokens": 260825742.0, + "step": 2766 + }, + { + "epoch": 0.47226489161973034, + "grad_norm": 0.5025574965794489, + "learning_rate": 2.1116231438812087e-05, + "loss": 0.5627, + "num_tokens": 260906910.0, + "step": 2767 + }, + { + "epoch": 0.47243556920976276, + "grad_norm": 0.46105268417018186, + "learning_rate": 2.110940433521079e-05, + "loss": 0.6551, + "num_tokens": 261040205.0, + "step": 2768 + }, + { + "epoch": 0.4726062467997952, + "grad_norm": 0.5029042239548835, + "learning_rate": 2.1102577231609495e-05, + "loss": 0.6565, + "num_tokens": 261128147.0, + "step": 2769 + }, + { + "epoch": 0.4727769243898276, + "grad_norm": 0.44595621382141026, + "learning_rate": 2.109575012800819e-05, + "loss": 0.6284, + "num_tokens": 261253380.0, + "step": 2770 + }, + { + "epoch": 0.47294760197986, + "grad_norm": 0.4770356213613586, + "learning_rate": 2.1088923024406895e-05, + "loss": 0.5172, + "num_tokens": 261337748.0, + "step": 2771 + }, + { + "epoch": 0.4731182795698925, + "grad_norm": 0.48806674291261415, + "learning_rate": 2.10820959208056e-05, + "loss": 0.5656, + "num_tokens": 261424238.0, + "step": 2772 + }, + { + "epoch": 0.4732889571599249, + "grad_norm": 0.4732493503639952, + "learning_rate": 2.1075268817204303e-05, + "loss": 0.5372, + "num_tokens": 261506820.0, + "step": 2773 + }, + { + "epoch": 0.47345963474995734, + "grad_norm": 0.5368010190397625, + "learning_rate": 2.1068441713603007e-05, + "loss": 0.6426, + "num_tokens": 261614388.0, + "step": 2774 + }, + { + "epoch": 0.47363031233998976, + "grad_norm": 0.5238842412880865, + "learning_rate": 2.1061614610001707e-05, + "loss": 0.6612, + "num_tokens": 261707423.0, + "step": 2775 + }, + { + "epoch": 0.4738009899300222, + "grad_norm": 0.5583084495044456, + "learning_rate": 2.105478750640041e-05, + "loss": 0.5968, + "num_tokens": 261796233.0, + "step": 2776 + }, + { + "epoch": 0.4739716675200546, + "grad_norm": 0.4635386559002841, + "learning_rate": 2.1047960402799115e-05, + "loss": 0.5857, + "num_tokens": 261903272.0, + "step": 2777 + }, + { + "epoch": 0.47414234511008707, + "grad_norm": 0.5196872290181236, + "learning_rate": 2.104113329919782e-05, + "loss": 0.6859, + "num_tokens": 262001311.0, + "step": 2778 + }, + { + "epoch": 0.4743130227001195, + "grad_norm": 0.42604246411813196, + "learning_rate": 2.1034306195596522e-05, + "loss": 0.5824, + "num_tokens": 262128348.0, + "step": 2779 + }, + { + "epoch": 0.4744837002901519, + "grad_norm": 0.4932638343346999, + "learning_rate": 2.1027479091995226e-05, + "loss": 0.5973, + "num_tokens": 262219890.0, + "step": 2780 + }, + { + "epoch": 0.47465437788018433, + "grad_norm": 0.529869511773184, + "learning_rate": 2.1020651988393926e-05, + "loss": 0.6461, + "num_tokens": 262312992.0, + "step": 2781 + }, + { + "epoch": 0.47482505547021675, + "grad_norm": 0.48225732417768413, + "learning_rate": 2.1013824884792627e-05, + "loss": 0.6713, + "num_tokens": 262411274.0, + "step": 2782 + }, + { + "epoch": 0.47499573306024917, + "grad_norm": 0.46819328848847286, + "learning_rate": 2.100699778119133e-05, + "loss": 0.5983, + "num_tokens": 262506439.0, + "step": 2783 + }, + { + "epoch": 0.47516641065028165, + "grad_norm": 0.5659150884490732, + "learning_rate": 2.1000170677590034e-05, + "loss": 0.6237, + "num_tokens": 262574804.0, + "step": 2784 + }, + { + "epoch": 0.47533708824031407, + "grad_norm": 0.4763195479481333, + "learning_rate": 2.0993343573988735e-05, + "loss": 0.606, + "num_tokens": 262680329.0, + "step": 2785 + }, + { + "epoch": 0.4755077658303465, + "grad_norm": 0.4505571499406244, + "learning_rate": 2.098651647038744e-05, + "loss": 0.5603, + "num_tokens": 262784724.0, + "step": 2786 + }, + { + "epoch": 0.4756784434203789, + "grad_norm": 0.4973408721146918, + "learning_rate": 2.0979689366786142e-05, + "loss": 0.5913, + "num_tokens": 262875852.0, + "step": 2787 + }, + { + "epoch": 0.4758491210104113, + "grad_norm": 0.500000423358977, + "learning_rate": 2.0972862263184846e-05, + "loss": 0.5063, + "num_tokens": 262947849.0, + "step": 2788 + }, + { + "epoch": 0.47601979860044374, + "grad_norm": 0.44921344675656755, + "learning_rate": 2.096603515958355e-05, + "loss": 0.5514, + "num_tokens": 263047158.0, + "step": 2789 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.4444766323327889, + "learning_rate": 2.0959208055982253e-05, + "loss": 0.6219, + "num_tokens": 263160022.0, + "step": 2790 + }, + { + "epoch": 0.47636115378050864, + "grad_norm": 0.45276882487364545, + "learning_rate": 2.0952380952380954e-05, + "loss": 0.5747, + "num_tokens": 263260563.0, + "step": 2791 + }, + { + "epoch": 0.47653183137054106, + "grad_norm": 0.47520414357712126, + "learning_rate": 2.0945553848779658e-05, + "loss": 0.5485, + "num_tokens": 263348330.0, + "step": 2792 + }, + { + "epoch": 0.4767025089605735, + "grad_norm": 0.48415549480452974, + "learning_rate": 2.093872674517836e-05, + "loss": 0.539, + "num_tokens": 263443498.0, + "step": 2793 + }, + { + "epoch": 0.4768731865506059, + "grad_norm": 0.5112120567390389, + "learning_rate": 2.0931899641577065e-05, + "loss": 0.6542, + "num_tokens": 263545307.0, + "step": 2794 + }, + { + "epoch": 0.4770438641406383, + "grad_norm": 0.4843555789044252, + "learning_rate": 2.0925072537975762e-05, + "loss": 0.5634, + "num_tokens": 263629848.0, + "step": 2795 + }, + { + "epoch": 0.47721454173067074, + "grad_norm": 0.5222218856297645, + "learning_rate": 2.0918245434374466e-05, + "loss": 0.6744, + "num_tokens": 263717207.0, + "step": 2796 + }, + { + "epoch": 0.4773852193207032, + "grad_norm": 0.470252192997519, + "learning_rate": 2.091141833077317e-05, + "loss": 0.5468, + "num_tokens": 263805494.0, + "step": 2797 + }, + { + "epoch": 0.47755589691073563, + "grad_norm": 0.48854999449656045, + "learning_rate": 2.0904591227171873e-05, + "loss": 0.683, + "num_tokens": 263912486.0, + "step": 2798 + }, + { + "epoch": 0.47772657450076805, + "grad_norm": 0.4278552040817007, + "learning_rate": 2.0897764123570577e-05, + "loss": 0.5628, + "num_tokens": 264026976.0, + "step": 2799 + }, + { + "epoch": 0.4778972520908005, + "grad_norm": 0.49895084174843796, + "learning_rate": 2.089093701996928e-05, + "loss": 0.5633, + "num_tokens": 264105730.0, + "step": 2800 + }, + { + "epoch": 0.4780679296808329, + "grad_norm": 0.4679635040376754, + "learning_rate": 2.088410991636798e-05, + "loss": 0.6781, + "num_tokens": 264219219.0, + "step": 2801 + }, + { + "epoch": 0.4782386072708653, + "grad_norm": 0.5602714348621045, + "learning_rate": 2.0877282812766685e-05, + "loss": 0.582, + "num_tokens": 264289623.0, + "step": 2802 + }, + { + "epoch": 0.4784092848608978, + "grad_norm": 0.48337998899673135, + "learning_rate": 2.087045570916539e-05, + "loss": 0.6106, + "num_tokens": 264386781.0, + "step": 2803 + }, + { + "epoch": 0.4785799624509302, + "grad_norm": 0.4759665388205996, + "learning_rate": 2.0863628605564093e-05, + "loss": 0.5962, + "num_tokens": 264487751.0, + "step": 2804 + }, + { + "epoch": 0.4787506400409626, + "grad_norm": 0.46836843412874873, + "learning_rate": 2.0856801501962796e-05, + "loss": 0.5863, + "num_tokens": 264594699.0, + "step": 2805 + }, + { + "epoch": 0.47892131763099505, + "grad_norm": 0.500968308707824, + "learning_rate": 2.08499743983615e-05, + "loss": 0.556, + "num_tokens": 264671718.0, + "step": 2806 + }, + { + "epoch": 0.47909199522102747, + "grad_norm": 0.5893157032577367, + "learning_rate": 2.0843147294760197e-05, + "loss": 0.6597, + "num_tokens": 264763398.0, + "step": 2807 + }, + { + "epoch": 0.4792626728110599, + "grad_norm": 0.48047844707117543, + "learning_rate": 2.08363201911589e-05, + "loss": 0.6292, + "num_tokens": 264866139.0, + "step": 2808 + }, + { + "epoch": 0.47943335040109236, + "grad_norm": 0.4814903472753632, + "learning_rate": 2.0829493087557605e-05, + "loss": 0.6403, + "num_tokens": 264964569.0, + "step": 2809 + }, + { + "epoch": 0.4796040279911248, + "grad_norm": 0.4735962744889165, + "learning_rate": 2.082266598395631e-05, + "loss": 0.6227, + "num_tokens": 265070831.0, + "step": 2810 + }, + { + "epoch": 0.4797747055811572, + "grad_norm": 0.5462762968017906, + "learning_rate": 2.0815838880355012e-05, + "loss": 0.682, + "num_tokens": 265161962.0, + "step": 2811 + }, + { + "epoch": 0.4799453831711896, + "grad_norm": 0.47681667500741154, + "learning_rate": 2.0809011776753712e-05, + "loss": 0.6144, + "num_tokens": 265267361.0, + "step": 2812 + }, + { + "epoch": 0.48011606076122204, + "grad_norm": 0.47302407008804076, + "learning_rate": 2.0802184673152416e-05, + "loss": 0.7001, + "num_tokens": 265396435.0, + "step": 2813 + }, + { + "epoch": 0.48028673835125446, + "grad_norm": 0.45148044789709957, + "learning_rate": 2.079535756955112e-05, + "loss": 0.5564, + "num_tokens": 265504015.0, + "step": 2814 + }, + { + "epoch": 0.48045741594128694, + "grad_norm": 0.5184527820534015, + "learning_rate": 2.0788530465949824e-05, + "loss": 0.5347, + "num_tokens": 265575304.0, + "step": 2815 + }, + { + "epoch": 0.48062809353131936, + "grad_norm": 0.5296202969839803, + "learning_rate": 2.0781703362348528e-05, + "loss": 0.5559, + "num_tokens": 265646428.0, + "step": 2816 + }, + { + "epoch": 0.4807987711213518, + "grad_norm": 0.48816019063909155, + "learning_rate": 2.077487625874723e-05, + "loss": 0.5582, + "num_tokens": 265724991.0, + "step": 2817 + }, + { + "epoch": 0.4809694487113842, + "grad_norm": 0.48781723245203773, + "learning_rate": 2.076804915514593e-05, + "loss": 0.6339, + "num_tokens": 265816340.0, + "step": 2818 + }, + { + "epoch": 0.4811401263014166, + "grad_norm": 0.4575290190832969, + "learning_rate": 2.0761222051544635e-05, + "loss": 0.6281, + "num_tokens": 265928853.0, + "step": 2819 + }, + { + "epoch": 0.48131080389144903, + "grad_norm": 0.4796099583872098, + "learning_rate": 2.0754394947943336e-05, + "loss": 0.5578, + "num_tokens": 266022688.0, + "step": 2820 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.4719757609846022, + "learning_rate": 2.074756784434204e-05, + "loss": 0.5461, + "num_tokens": 266120945.0, + "step": 2821 + }, + { + "epoch": 0.48165215907151393, + "grad_norm": 0.4703801534661618, + "learning_rate": 2.074074074074074e-05, + "loss": 0.538, + "num_tokens": 266214668.0, + "step": 2822 + }, + { + "epoch": 0.48182283666154635, + "grad_norm": 0.5186243701215413, + "learning_rate": 2.0733913637139444e-05, + "loss": 0.7006, + "num_tokens": 266323538.0, + "step": 2823 + }, + { + "epoch": 0.48199351425157877, + "grad_norm": 0.5423821529180219, + "learning_rate": 2.0727086533538147e-05, + "loss": 0.6059, + "num_tokens": 266424684.0, + "step": 2824 + }, + { + "epoch": 0.4821641918416112, + "grad_norm": 0.5351325254007536, + "learning_rate": 2.072025942993685e-05, + "loss": 0.5537, + "num_tokens": 266494881.0, + "step": 2825 + }, + { + "epoch": 0.4823348694316436, + "grad_norm": 0.4850658743946264, + "learning_rate": 2.0713432326335555e-05, + "loss": 0.7092, + "num_tokens": 266597616.0, + "step": 2826 + }, + { + "epoch": 0.48250554702167603, + "grad_norm": 0.43852264950976283, + "learning_rate": 2.070660522273426e-05, + "loss": 0.5255, + "num_tokens": 266710415.0, + "step": 2827 + }, + { + "epoch": 0.4826762246117085, + "grad_norm": 0.5100733019147135, + "learning_rate": 2.069977811913296e-05, + "loss": 0.5831, + "num_tokens": 266802276.0, + "step": 2828 + }, + { + "epoch": 0.4828469022017409, + "grad_norm": 0.4853108524274792, + "learning_rate": 2.0692951015531663e-05, + "loss": 0.6497, + "num_tokens": 266901085.0, + "step": 2829 + }, + { + "epoch": 0.48301757979177334, + "grad_norm": 0.4886110931391704, + "learning_rate": 2.0686123911930367e-05, + "loss": 0.6721, + "num_tokens": 266996334.0, + "step": 2830 + }, + { + "epoch": 0.48318825738180576, + "grad_norm": 0.5475197851006306, + "learning_rate": 2.067929680832907e-05, + "loss": 0.6338, + "num_tokens": 267073096.0, + "step": 2831 + }, + { + "epoch": 0.4833589349718382, + "grad_norm": 0.5609227356508061, + "learning_rate": 2.0672469704727767e-05, + "loss": 0.6343, + "num_tokens": 267171610.0, + "step": 2832 + }, + { + "epoch": 0.4835296125618706, + "grad_norm": 0.4975415381615914, + "learning_rate": 2.066564260112647e-05, + "loss": 0.6528, + "num_tokens": 267281372.0, + "step": 2833 + }, + { + "epoch": 0.4837002901519031, + "grad_norm": 0.4761638301635794, + "learning_rate": 2.0658815497525175e-05, + "loss": 0.6004, + "num_tokens": 267379502.0, + "step": 2834 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.5140227357409637, + "learning_rate": 2.065198839392388e-05, + "loss": 0.6529, + "num_tokens": 267480547.0, + "step": 2835 + }, + { + "epoch": 0.4840416453319679, + "grad_norm": 0.503050097407809, + "learning_rate": 2.0645161290322582e-05, + "loss": 0.5572, + "num_tokens": 267561848.0, + "step": 2836 + }, + { + "epoch": 0.48421232292200034, + "grad_norm": 0.5389466118288465, + "learning_rate": 2.0638334186721286e-05, + "loss": 0.6482, + "num_tokens": 267650090.0, + "step": 2837 + }, + { + "epoch": 0.48438300051203276, + "grad_norm": 0.5085763390579215, + "learning_rate": 2.0631507083119987e-05, + "loss": 0.5152, + "num_tokens": 267727785.0, + "step": 2838 + }, + { + "epoch": 0.4845536781020652, + "grad_norm": 0.4899520289556035, + "learning_rate": 2.062467997951869e-05, + "loss": 0.6089, + "num_tokens": 267824441.0, + "step": 2839 + }, + { + "epoch": 0.48472435569209765, + "grad_norm": 0.4913559210543756, + "learning_rate": 2.0617852875917394e-05, + "loss": 0.6319, + "num_tokens": 267932216.0, + "step": 2840 + }, + { + "epoch": 0.4848950332821301, + "grad_norm": 0.4980148330177103, + "learning_rate": 2.0611025772316098e-05, + "loss": 0.6726, + "num_tokens": 268031290.0, + "step": 2841 + }, + { + "epoch": 0.4850657108721625, + "grad_norm": 0.5430757340372253, + "learning_rate": 2.06041986687148e-05, + "loss": 0.5945, + "num_tokens": 268095642.0, + "step": 2842 + }, + { + "epoch": 0.4852363884621949, + "grad_norm": 0.47229457625039367, + "learning_rate": 2.0597371565113505e-05, + "loss": 0.6917, + "num_tokens": 268233579.0, + "step": 2843 + }, + { + "epoch": 0.48540706605222733, + "grad_norm": 0.43624329222044694, + "learning_rate": 2.0590544461512202e-05, + "loss": 0.5778, + "num_tokens": 268348629.0, + "step": 2844 + }, + { + "epoch": 0.48557774364225975, + "grad_norm": 0.44498761098212736, + "learning_rate": 2.0583717357910906e-05, + "loss": 0.6101, + "num_tokens": 268462140.0, + "step": 2845 + }, + { + "epoch": 0.4857484212322922, + "grad_norm": 0.5003639627514537, + "learning_rate": 2.057689025430961e-05, + "loss": 0.645, + "num_tokens": 268572442.0, + "step": 2846 + }, + { + "epoch": 0.48591909882232465, + "grad_norm": 0.4856748373294264, + "learning_rate": 2.0570063150708314e-05, + "loss": 0.553, + "num_tokens": 268660026.0, + "step": 2847 + }, + { + "epoch": 0.48608977641235707, + "grad_norm": 0.4700848228046942, + "learning_rate": 2.0563236047107017e-05, + "loss": 0.5311, + "num_tokens": 268751811.0, + "step": 2848 + }, + { + "epoch": 0.4862604540023895, + "grad_norm": 0.4505358090745403, + "learning_rate": 2.0556408943505718e-05, + "loss": 0.5525, + "num_tokens": 268860115.0, + "step": 2849 + }, + { + "epoch": 0.4864311315924219, + "grad_norm": 0.49432532150934183, + "learning_rate": 2.054958183990442e-05, + "loss": 0.6154, + "num_tokens": 268960297.0, + "step": 2850 + }, + { + "epoch": 0.4866018091824543, + "grad_norm": 0.5674496830874917, + "learning_rate": 2.0542754736303125e-05, + "loss": 0.6641, + "num_tokens": 269031095.0, + "step": 2851 + }, + { + "epoch": 0.48677248677248675, + "grad_norm": 0.5063399703064866, + "learning_rate": 2.053592763270183e-05, + "loss": 0.5867, + "num_tokens": 269110924.0, + "step": 2852 + }, + { + "epoch": 0.4869431643625192, + "grad_norm": 0.5221460778498629, + "learning_rate": 2.0529100529100533e-05, + "loss": 0.706, + "num_tokens": 269213733.0, + "step": 2853 + }, + { + "epoch": 0.48711384195255164, + "grad_norm": 0.5396749092105997, + "learning_rate": 2.0522273425499237e-05, + "loss": 0.6733, + "num_tokens": 269310298.0, + "step": 2854 + }, + { + "epoch": 0.48728451954258406, + "grad_norm": 0.441556058435517, + "learning_rate": 2.0515446321897937e-05, + "loss": 0.5524, + "num_tokens": 269420576.0, + "step": 2855 + }, + { + "epoch": 0.4874551971326165, + "grad_norm": 0.4615154380288719, + "learning_rate": 2.050861921829664e-05, + "loss": 0.5911, + "num_tokens": 269540185.0, + "step": 2856 + }, + { + "epoch": 0.4876258747226489, + "grad_norm": 0.4295552421179953, + "learning_rate": 2.050179211469534e-05, + "loss": 0.5412, + "num_tokens": 269654267.0, + "step": 2857 + }, + { + "epoch": 0.4877965523126813, + "grad_norm": 0.4963979554435014, + "learning_rate": 2.0494965011094045e-05, + "loss": 0.6481, + "num_tokens": 269750827.0, + "step": 2858 + }, + { + "epoch": 0.4879672299027138, + "grad_norm": 0.5039026699213918, + "learning_rate": 2.0488137907492745e-05, + "loss": 0.5964, + "num_tokens": 269854249.0, + "step": 2859 + }, + { + "epoch": 0.4881379074927462, + "grad_norm": 0.5706880894505407, + "learning_rate": 2.048131080389145e-05, + "loss": 0.6873, + "num_tokens": 269926085.0, + "step": 2860 + }, + { + "epoch": 0.48830858508277863, + "grad_norm": 0.46280494591719906, + "learning_rate": 2.0474483700290153e-05, + "loss": 0.5435, + "num_tokens": 270019949.0, + "step": 2861 + }, + { + "epoch": 0.48847926267281105, + "grad_norm": 0.4557477938540628, + "learning_rate": 2.0467656596688857e-05, + "loss": 0.6436, + "num_tokens": 270133154.0, + "step": 2862 + }, + { + "epoch": 0.4886499402628435, + "grad_norm": 0.4675394053210359, + "learning_rate": 2.046082949308756e-05, + "loss": 0.629, + "num_tokens": 270240858.0, + "step": 2863 + }, + { + "epoch": 0.4888206178528759, + "grad_norm": 0.4652502111645639, + "learning_rate": 2.0454002389486264e-05, + "loss": 0.582, + "num_tokens": 270340225.0, + "step": 2864 + }, + { + "epoch": 0.48899129544290837, + "grad_norm": 0.5829988283039427, + "learning_rate": 2.0447175285884964e-05, + "loss": 0.6863, + "num_tokens": 270413709.0, + "step": 2865 + }, + { + "epoch": 0.4891619730329408, + "grad_norm": 0.48266785152693065, + "learning_rate": 2.0440348182283668e-05, + "loss": 0.511, + "num_tokens": 270497651.0, + "step": 2866 + }, + { + "epoch": 0.4893326506229732, + "grad_norm": 0.4968484465603754, + "learning_rate": 2.0433521078682372e-05, + "loss": 0.508, + "num_tokens": 270577026.0, + "step": 2867 + }, + { + "epoch": 0.48950332821300563, + "grad_norm": 0.47595925265430217, + "learning_rate": 2.0426693975081076e-05, + "loss": 0.5843, + "num_tokens": 270674391.0, + "step": 2868 + }, + { + "epoch": 0.48967400580303805, + "grad_norm": 0.46628377517710473, + "learning_rate": 2.0419866871479773e-05, + "loss": 0.6643, + "num_tokens": 270787274.0, + "step": 2869 + }, + { + "epoch": 0.48984468339307047, + "grad_norm": 0.5385215825334826, + "learning_rate": 2.0413039767878476e-05, + "loss": 0.5962, + "num_tokens": 270858889.0, + "step": 2870 + }, + { + "epoch": 0.49001536098310294, + "grad_norm": 0.5805884963067375, + "learning_rate": 2.040621266427718e-05, + "loss": 0.6293, + "num_tokens": 270957643.0, + "step": 2871 + }, + { + "epoch": 0.49018603857313536, + "grad_norm": 0.498084441145313, + "learning_rate": 2.0399385560675884e-05, + "loss": 0.6639, + "num_tokens": 271047676.0, + "step": 2872 + }, + { + "epoch": 0.4903567161631678, + "grad_norm": 0.487564019574099, + "learning_rate": 2.0392558457074588e-05, + "loss": 0.5933, + "num_tokens": 271129962.0, + "step": 2873 + }, + { + "epoch": 0.4905273937532002, + "grad_norm": 0.6472447182346212, + "learning_rate": 2.038573135347329e-05, + "loss": 0.714, + "num_tokens": 271231719.0, + "step": 2874 + }, + { + "epoch": 0.4906980713432326, + "grad_norm": 0.4971211809010418, + "learning_rate": 2.0378904249871992e-05, + "loss": 0.5887, + "num_tokens": 271328947.0, + "step": 2875 + }, + { + "epoch": 0.49086874893326504, + "grad_norm": 0.4863071185912599, + "learning_rate": 2.0372077146270696e-05, + "loss": 0.5036, + "num_tokens": 271422273.0, + "step": 2876 + }, + { + "epoch": 0.4910394265232975, + "grad_norm": 0.5413413168451584, + "learning_rate": 2.03652500426694e-05, + "loss": 0.632, + "num_tokens": 271502160.0, + "step": 2877 + }, + { + "epoch": 0.49121010411332994, + "grad_norm": 0.5144456312536052, + "learning_rate": 2.0358422939068103e-05, + "loss": 0.6403, + "num_tokens": 271585123.0, + "step": 2878 + }, + { + "epoch": 0.49138078170336236, + "grad_norm": 0.4842619958313146, + "learning_rate": 2.0351595835466807e-05, + "loss": 0.6668, + "num_tokens": 271684652.0, + "step": 2879 + }, + { + "epoch": 0.4915514592933948, + "grad_norm": 0.5284020508258055, + "learning_rate": 2.034476873186551e-05, + "loss": 0.6673, + "num_tokens": 271763620.0, + "step": 2880 + }, + { + "epoch": 0.4917221368834272, + "grad_norm": 0.4548950335050338, + "learning_rate": 2.0337941628264208e-05, + "loss": 0.6492, + "num_tokens": 271880062.0, + "step": 2881 + }, + { + "epoch": 0.4918928144734596, + "grad_norm": 0.496889245805252, + "learning_rate": 2.033111452466291e-05, + "loss": 0.5442, + "num_tokens": 271963728.0, + "step": 2882 + }, + { + "epoch": 0.49206349206349204, + "grad_norm": 0.6059432208166287, + "learning_rate": 2.0324287421061615e-05, + "loss": 0.6488, + "num_tokens": 272040540.0, + "step": 2883 + }, + { + "epoch": 0.4922341696535245, + "grad_norm": 0.5110023970730857, + "learning_rate": 2.031746031746032e-05, + "loss": 0.5592, + "num_tokens": 272114524.0, + "step": 2884 + }, + { + "epoch": 0.49240484724355693, + "grad_norm": 0.4839276755721502, + "learning_rate": 2.0310633213859023e-05, + "loss": 0.5364, + "num_tokens": 272200906.0, + "step": 2885 + }, + { + "epoch": 0.49257552483358935, + "grad_norm": 0.4801959231712062, + "learning_rate": 2.0303806110257723e-05, + "loss": 0.5353, + "num_tokens": 272293099.0, + "step": 2886 + }, + { + "epoch": 0.49274620242362177, + "grad_norm": 0.5388421563548168, + "learning_rate": 2.0296979006656427e-05, + "loss": 0.5697, + "num_tokens": 272368710.0, + "step": 2887 + }, + { + "epoch": 0.4929168800136542, + "grad_norm": 0.5320129170945297, + "learning_rate": 2.029015190305513e-05, + "loss": 0.5955, + "num_tokens": 272449086.0, + "step": 2888 + }, + { + "epoch": 0.4930875576036866, + "grad_norm": 0.48623700351007676, + "learning_rate": 2.0283324799453834e-05, + "loss": 0.5246, + "num_tokens": 272534506.0, + "step": 2889 + }, + { + "epoch": 0.4932582351937191, + "grad_norm": 0.5505810187608702, + "learning_rate": 2.0276497695852538e-05, + "loss": 0.6309, + "num_tokens": 272608333.0, + "step": 2890 + }, + { + "epoch": 0.4934289127837515, + "grad_norm": 0.491520130776598, + "learning_rate": 2.0269670592251242e-05, + "loss": 0.601, + "num_tokens": 272696781.0, + "step": 2891 + }, + { + "epoch": 0.4935995903737839, + "grad_norm": 0.5171691089471971, + "learning_rate": 2.0262843488649942e-05, + "loss": 0.5634, + "num_tokens": 272768975.0, + "step": 2892 + }, + { + "epoch": 0.49377026796381634, + "grad_norm": 0.4898250317116262, + "learning_rate": 2.0256016385048646e-05, + "loss": 0.5827, + "num_tokens": 272855964.0, + "step": 2893 + }, + { + "epoch": 0.49394094555384876, + "grad_norm": 0.465241279888239, + "learning_rate": 2.0249189281447346e-05, + "loss": 0.5259, + "num_tokens": 272948600.0, + "step": 2894 + }, + { + "epoch": 0.4941116231438812, + "grad_norm": 0.4819560221806663, + "learning_rate": 2.024236217784605e-05, + "loss": 0.594, + "num_tokens": 273044634.0, + "step": 2895 + }, + { + "epoch": 0.49428230073391366, + "grad_norm": 0.46378968328026815, + "learning_rate": 2.023553507424475e-05, + "loss": 0.5531, + "num_tokens": 273150236.0, + "step": 2896 + }, + { + "epoch": 0.4944529783239461, + "grad_norm": 0.4671772497805685, + "learning_rate": 2.0228707970643454e-05, + "loss": 0.5557, + "num_tokens": 273240551.0, + "step": 2897 + }, + { + "epoch": 0.4946236559139785, + "grad_norm": 0.4254915440115111, + "learning_rate": 2.0221880867042158e-05, + "loss": 0.5726, + "num_tokens": 273358847.0, + "step": 2898 + }, + { + "epoch": 0.4947943335040109, + "grad_norm": 0.45365023233787427, + "learning_rate": 2.0215053763440862e-05, + "loss": 0.6623, + "num_tokens": 273474304.0, + "step": 2899 + }, + { + "epoch": 0.49496501109404334, + "grad_norm": 0.4953276860605578, + "learning_rate": 2.0208226659839566e-05, + "loss": 0.5599, + "num_tokens": 273563073.0, + "step": 2900 + }, + { + "epoch": 0.49513568868407576, + "grad_norm": 0.535719945733631, + "learning_rate": 2.020139955623827e-05, + "loss": 0.6257, + "num_tokens": 273643942.0, + "step": 2901 + }, + { + "epoch": 0.49530636627410823, + "grad_norm": 0.49624489344201517, + "learning_rate": 2.019457245263697e-05, + "loss": 0.6399, + "num_tokens": 273746877.0, + "step": 2902 + }, + { + "epoch": 0.49547704386414065, + "grad_norm": 0.5094719768446331, + "learning_rate": 2.0187745349035674e-05, + "loss": 0.6273, + "num_tokens": 273838856.0, + "step": 2903 + }, + { + "epoch": 0.4956477214541731, + "grad_norm": 0.49548672356664875, + "learning_rate": 2.0180918245434377e-05, + "loss": 0.5558, + "num_tokens": 273918060.0, + "step": 2904 + }, + { + "epoch": 0.4958183990442055, + "grad_norm": 0.5398754637770871, + "learning_rate": 2.017409114183308e-05, + "loss": 0.6597, + "num_tokens": 273992093.0, + "step": 2905 + }, + { + "epoch": 0.4959890766342379, + "grad_norm": 0.45619085238186824, + "learning_rate": 2.0167264038231778e-05, + "loss": 0.5587, + "num_tokens": 274093531.0, + "step": 2906 + }, + { + "epoch": 0.49615975422427033, + "grad_norm": 0.5489587968582248, + "learning_rate": 2.0160436934630482e-05, + "loss": 0.6701, + "num_tokens": 274184099.0, + "step": 2907 + }, + { + "epoch": 0.4963304318143028, + "grad_norm": 0.43894239267208035, + "learning_rate": 2.0153609831029186e-05, + "loss": 0.561, + "num_tokens": 274293553.0, + "step": 2908 + }, + { + "epoch": 0.49650110940433523, + "grad_norm": 0.46366286245970384, + "learning_rate": 2.014678272742789e-05, + "loss": 0.5245, + "num_tokens": 274382139.0, + "step": 2909 + }, + { + "epoch": 0.49667178699436765, + "grad_norm": 0.491312583007306, + "learning_rate": 2.0139955623826593e-05, + "loss": 0.6272, + "num_tokens": 274474948.0, + "step": 2910 + }, + { + "epoch": 0.49684246458440007, + "grad_norm": 0.45909200225024077, + "learning_rate": 2.0133128520225297e-05, + "loss": 0.5362, + "num_tokens": 274566879.0, + "step": 2911 + }, + { + "epoch": 0.4970131421744325, + "grad_norm": 0.5163405294702257, + "learning_rate": 2.0126301416623997e-05, + "loss": 0.5565, + "num_tokens": 274639087.0, + "step": 2912 + }, + { + "epoch": 0.4971838197644649, + "grad_norm": 0.5249171033729432, + "learning_rate": 2.01194743130227e-05, + "loss": 0.6861, + "num_tokens": 274729536.0, + "step": 2913 + }, + { + "epoch": 0.4973544973544973, + "grad_norm": 0.5188809095733038, + "learning_rate": 2.0112647209421405e-05, + "loss": 0.595, + "num_tokens": 274822197.0, + "step": 2914 + }, + { + "epoch": 0.4975251749445298, + "grad_norm": 0.5029710825396685, + "learning_rate": 2.010582010582011e-05, + "loss": 0.6168, + "num_tokens": 274918420.0, + "step": 2915 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 0.476806919717425, + "learning_rate": 2.0098993002218812e-05, + "loss": 0.6054, + "num_tokens": 275024506.0, + "step": 2916 + }, + { + "epoch": 0.49786653012459464, + "grad_norm": 0.4906048200663985, + "learning_rate": 2.0092165898617516e-05, + "loss": 0.5873, + "num_tokens": 275113309.0, + "step": 2917 + }, + { + "epoch": 0.49803720771462706, + "grad_norm": 0.4890390341998896, + "learning_rate": 2.008533879501622e-05, + "loss": 0.6282, + "num_tokens": 275207914.0, + "step": 2918 + }, + { + "epoch": 0.4982078853046595, + "grad_norm": 0.5378272846379831, + "learning_rate": 2.0078511691414917e-05, + "loss": 0.5884, + "num_tokens": 275285327.0, + "step": 2919 + }, + { + "epoch": 0.4983785628946919, + "grad_norm": 0.5405475677384355, + "learning_rate": 2.007168458781362e-05, + "loss": 0.5701, + "num_tokens": 275357235.0, + "step": 2920 + }, + { + "epoch": 0.4985492404847244, + "grad_norm": 0.4560160124200505, + "learning_rate": 2.0064857484212324e-05, + "loss": 0.5768, + "num_tokens": 275466507.0, + "step": 2921 + }, + { + "epoch": 0.4987199180747568, + "grad_norm": 0.4747863422821953, + "learning_rate": 2.0058030380611028e-05, + "loss": 0.6002, + "num_tokens": 275557993.0, + "step": 2922 + }, + { + "epoch": 0.4988905956647892, + "grad_norm": 0.5033637466490302, + "learning_rate": 2.005120327700973e-05, + "loss": 0.5972, + "num_tokens": 275659357.0, + "step": 2923 + }, + { + "epoch": 0.49906127325482164, + "grad_norm": 0.4929275467909048, + "learning_rate": 2.0044376173408432e-05, + "loss": 0.5888, + "num_tokens": 275754575.0, + "step": 2924 + }, + { + "epoch": 0.49923195084485406, + "grad_norm": 0.48010205872811457, + "learning_rate": 2.0037549069807136e-05, + "loss": 0.5642, + "num_tokens": 275845434.0, + "step": 2925 + }, + { + "epoch": 0.4994026284348865, + "grad_norm": 0.4614143190938438, + "learning_rate": 2.003072196620584e-05, + "loss": 0.5571, + "num_tokens": 275943269.0, + "step": 2926 + }, + { + "epoch": 0.49957330602491895, + "grad_norm": 0.4250591814469685, + "learning_rate": 2.0023894862604544e-05, + "loss": 0.6295, + "num_tokens": 276079374.0, + "step": 2927 + }, + { + "epoch": 0.49974398361495137, + "grad_norm": 0.46939168719919083, + "learning_rate": 2.0017067759003247e-05, + "loss": 0.5895, + "num_tokens": 276187227.0, + "step": 2928 + }, + { + "epoch": 0.4999146612049838, + "grad_norm": 0.4719718062116493, + "learning_rate": 2.0010240655401948e-05, + "loss": 0.615, + "num_tokens": 276289234.0, + "step": 2929 + }, + { + "epoch": 0.5000853387950163, + "grad_norm": 0.44102594712594595, + "learning_rate": 2.000341355180065e-05, + "loss": 0.5523, + "num_tokens": 276394942.0, + "step": 2930 + }, + { + "epoch": 0.5002560163850487, + "grad_norm": 0.4975705580641857, + "learning_rate": 1.9996586448199352e-05, + "loss": 0.5007, + "num_tokens": 276464686.0, + "step": 2931 + }, + { + "epoch": 0.5004266939750811, + "grad_norm": 0.4771783426710486, + "learning_rate": 1.9989759344598056e-05, + "loss": 0.5579, + "num_tokens": 276562565.0, + "step": 2932 + }, + { + "epoch": 0.5005973715651135, + "grad_norm": 0.7741405083195809, + "learning_rate": 1.998293224099676e-05, + "loss": 0.6752, + "num_tokens": 276637420.0, + "step": 2933 + }, + { + "epoch": 0.500768049155146, + "grad_norm": 0.5251762038860214, + "learning_rate": 1.997610513739546e-05, + "loss": 0.5764, + "num_tokens": 276707390.0, + "step": 2934 + }, + { + "epoch": 0.5009387267451784, + "grad_norm": 0.5072670055509272, + "learning_rate": 1.9969278033794163e-05, + "loss": 0.5965, + "num_tokens": 276801890.0, + "step": 2935 + }, + { + "epoch": 0.5011094043352108, + "grad_norm": 0.4940913063499955, + "learning_rate": 1.9962450930192867e-05, + "loss": 0.6061, + "num_tokens": 276901189.0, + "step": 2936 + }, + { + "epoch": 0.5012800819252432, + "grad_norm": 0.41887722565286695, + "learning_rate": 1.995562382659157e-05, + "loss": 0.5439, + "num_tokens": 277009913.0, + "step": 2937 + }, + { + "epoch": 0.5014507595152756, + "grad_norm": 0.5261208203728652, + "learning_rate": 1.9948796722990275e-05, + "loss": 0.5819, + "num_tokens": 277089893.0, + "step": 2938 + }, + { + "epoch": 0.501621437105308, + "grad_norm": 0.4742556700588168, + "learning_rate": 1.9941969619388975e-05, + "loss": 0.5418, + "num_tokens": 277181275.0, + "step": 2939 + }, + { + "epoch": 0.5017921146953405, + "grad_norm": 0.4773173712133262, + "learning_rate": 1.993514251578768e-05, + "loss": 0.5873, + "num_tokens": 277291082.0, + "step": 2940 + }, + { + "epoch": 0.5019627922853729, + "grad_norm": 0.4810682830758116, + "learning_rate": 1.992831541218638e-05, + "loss": 0.6019, + "num_tokens": 277384470.0, + "step": 2941 + }, + { + "epoch": 0.5021334698754054, + "grad_norm": 0.5394096755076262, + "learning_rate": 1.9921488308585083e-05, + "loss": 0.6943, + "num_tokens": 277475574.0, + "step": 2942 + }, + { + "epoch": 0.5023041474654378, + "grad_norm": 0.4623212370764887, + "learning_rate": 1.9914661204983787e-05, + "loss": 0.4497, + "num_tokens": 277575476.0, + "step": 2943 + }, + { + "epoch": 0.5024748250554703, + "grad_norm": 0.4721381274963078, + "learning_rate": 1.990783410138249e-05, + "loss": 0.6449, + "num_tokens": 277691964.0, + "step": 2944 + }, + { + "epoch": 0.5026455026455027, + "grad_norm": 0.4740335866573741, + "learning_rate": 1.9901006997781194e-05, + "loss": 0.609, + "num_tokens": 277793285.0, + "step": 2945 + }, + { + "epoch": 0.5028161802355351, + "grad_norm": 0.49388965943517776, + "learning_rate": 1.9894179894179895e-05, + "loss": 0.6406, + "num_tokens": 277882339.0, + "step": 2946 + }, + { + "epoch": 0.5029868578255675, + "grad_norm": 0.436812256713808, + "learning_rate": 1.98873527905786e-05, + "loss": 0.5308, + "num_tokens": 277991391.0, + "step": 2947 + }, + { + "epoch": 0.5031575354155999, + "grad_norm": 0.46667214092929044, + "learning_rate": 1.9880525686977302e-05, + "loss": 0.665, + "num_tokens": 278110783.0, + "step": 2948 + }, + { + "epoch": 0.5033282130056324, + "grad_norm": 0.4460446183248579, + "learning_rate": 1.9873698583376006e-05, + "loss": 0.6056, + "num_tokens": 278231174.0, + "step": 2949 + }, + { + "epoch": 0.5034988905956648, + "grad_norm": 0.4610080488916346, + "learning_rate": 1.9866871479774706e-05, + "loss": 0.5698, + "num_tokens": 278336426.0, + "step": 2950 + }, + { + "epoch": 0.5036695681856972, + "grad_norm": 0.49105454013287486, + "learning_rate": 1.986004437617341e-05, + "loss": 0.6565, + "num_tokens": 278445529.0, + "step": 2951 + }, + { + "epoch": 0.5038402457757296, + "grad_norm": 0.48664315308457706, + "learning_rate": 1.9853217272572114e-05, + "loss": 0.5775, + "num_tokens": 278539674.0, + "step": 2952 + }, + { + "epoch": 0.504010923365762, + "grad_norm": 0.491773574244215, + "learning_rate": 1.9846390168970814e-05, + "loss": 0.6109, + "num_tokens": 278623893.0, + "step": 2953 + }, + { + "epoch": 0.5041816009557945, + "grad_norm": 0.4846123642846501, + "learning_rate": 1.9839563065369518e-05, + "loss": 0.5848, + "num_tokens": 278711398.0, + "step": 2954 + }, + { + "epoch": 0.504352278545827, + "grad_norm": 0.5400095007019781, + "learning_rate": 1.9832735961768222e-05, + "loss": 0.6801, + "num_tokens": 278795679.0, + "step": 2955 + }, + { + "epoch": 0.5045229561358594, + "grad_norm": 0.48445297245385405, + "learning_rate": 1.9825908858166926e-05, + "loss": 0.4877, + "num_tokens": 278873437.0, + "step": 2956 + }, + { + "epoch": 0.5046936337258918, + "grad_norm": 0.4879103365170065, + "learning_rate": 1.981908175456563e-05, + "loss": 0.5218, + "num_tokens": 278954245.0, + "step": 2957 + }, + { + "epoch": 0.5048643113159242, + "grad_norm": 0.46903809359514, + "learning_rate": 1.981225465096433e-05, + "loss": 0.5629, + "num_tokens": 279051252.0, + "step": 2958 + }, + { + "epoch": 0.5050349889059567, + "grad_norm": 0.43800524681905495, + "learning_rate": 1.9805427547363033e-05, + "loss": 0.5614, + "num_tokens": 279171527.0, + "step": 2959 + }, + { + "epoch": 0.5052056664959891, + "grad_norm": 0.48470298564778774, + "learning_rate": 1.9798600443761734e-05, + "loss": 0.6161, + "num_tokens": 279275253.0, + "step": 2960 + }, + { + "epoch": 0.5053763440860215, + "grad_norm": 0.4422180281282132, + "learning_rate": 1.9791773340160438e-05, + "loss": 0.5607, + "num_tokens": 279394381.0, + "step": 2961 + }, + { + "epoch": 0.5055470216760539, + "grad_norm": 0.5173804612580816, + "learning_rate": 1.978494623655914e-05, + "loss": 0.5145, + "num_tokens": 279463675.0, + "step": 2962 + }, + { + "epoch": 0.5057176992660863, + "grad_norm": 0.4861594430832971, + "learning_rate": 1.9778119132957845e-05, + "loss": 0.6544, + "num_tokens": 279563914.0, + "step": 2963 + }, + { + "epoch": 0.5058883768561188, + "grad_norm": 0.5370598110552905, + "learning_rate": 1.977129202935655e-05, + "loss": 0.5751, + "num_tokens": 279635373.0, + "step": 2964 + }, + { + "epoch": 0.5060590544461512, + "grad_norm": 0.40512112123146315, + "learning_rate": 1.976446492575525e-05, + "loss": 0.4642, + "num_tokens": 279738066.0, + "step": 2965 + }, + { + "epoch": 0.5062297320361836, + "grad_norm": 0.48516635962975396, + "learning_rate": 1.9757637822153953e-05, + "loss": 0.5324, + "num_tokens": 279825405.0, + "step": 2966 + }, + { + "epoch": 0.5064004096262161, + "grad_norm": 0.47761889825735676, + "learning_rate": 1.9750810718552657e-05, + "loss": 0.5675, + "num_tokens": 279916976.0, + "step": 2967 + }, + { + "epoch": 0.5065710872162486, + "grad_norm": 0.48467705809184164, + "learning_rate": 1.9743983614951357e-05, + "loss": 0.6388, + "num_tokens": 280014819.0, + "step": 2968 + }, + { + "epoch": 0.506741764806281, + "grad_norm": 0.49112037568722255, + "learning_rate": 1.973715651135006e-05, + "loss": 0.6284, + "num_tokens": 280101724.0, + "step": 2969 + }, + { + "epoch": 0.5069124423963134, + "grad_norm": 0.5196017705035091, + "learning_rate": 1.9730329407748765e-05, + "loss": 0.569, + "num_tokens": 280182468.0, + "step": 2970 + }, + { + "epoch": 0.5070831199863458, + "grad_norm": 0.48651490627590277, + "learning_rate": 1.9723502304147465e-05, + "loss": 0.6284, + "num_tokens": 280283600.0, + "step": 2971 + }, + { + "epoch": 0.5072537975763782, + "grad_norm": 0.4900399486625487, + "learning_rate": 1.971667520054617e-05, + "loss": 0.6125, + "num_tokens": 280395572.0, + "step": 2972 + }, + { + "epoch": 0.5074244751664106, + "grad_norm": 0.462590242455195, + "learning_rate": 1.9709848096944873e-05, + "loss": 0.5844, + "num_tokens": 280499670.0, + "step": 2973 + }, + { + "epoch": 0.5075951527564431, + "grad_norm": 0.46188075324290406, + "learning_rate": 1.9703020993343576e-05, + "loss": 0.6253, + "num_tokens": 280610315.0, + "step": 2974 + }, + { + "epoch": 0.5077658303464755, + "grad_norm": 0.4736056374948957, + "learning_rate": 1.969619388974228e-05, + "loss": 0.5839, + "num_tokens": 280706345.0, + "step": 2975 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 0.4848915550295775, + "learning_rate": 1.968936678614098e-05, + "loss": 0.5708, + "num_tokens": 280806614.0, + "step": 2976 + }, + { + "epoch": 0.5081071855265403, + "grad_norm": 0.48142681847192265, + "learning_rate": 1.9682539682539684e-05, + "loss": 0.6171, + "num_tokens": 280900276.0, + "step": 2977 + }, + { + "epoch": 0.5082778631165727, + "grad_norm": 0.48184087955603466, + "learning_rate": 1.9675712578938385e-05, + "loss": 0.7332, + "num_tokens": 281018160.0, + "step": 2978 + }, + { + "epoch": 0.5084485407066052, + "grad_norm": 0.5189285895755861, + "learning_rate": 1.966888547533709e-05, + "loss": 0.6189, + "num_tokens": 281112355.0, + "step": 2979 + }, + { + "epoch": 0.5086192182966377, + "grad_norm": 0.45114369378314184, + "learning_rate": 1.9662058371735792e-05, + "loss": 0.5742, + "num_tokens": 281216604.0, + "step": 2980 + }, + { + "epoch": 0.5087898958866701, + "grad_norm": 0.4778301970506635, + "learning_rate": 1.9655231268134496e-05, + "loss": 0.5944, + "num_tokens": 281305562.0, + "step": 2981 + }, + { + "epoch": 0.5089605734767025, + "grad_norm": 0.481077307398366, + "learning_rate": 1.96484041645332e-05, + "loss": 0.5897, + "num_tokens": 281399791.0, + "step": 2982 + }, + { + "epoch": 0.509131251066735, + "grad_norm": 0.48070077200139133, + "learning_rate": 1.9641577060931903e-05, + "loss": 0.5517, + "num_tokens": 281489601.0, + "step": 2983 + }, + { + "epoch": 0.5093019286567674, + "grad_norm": 0.4782524800222262, + "learning_rate": 1.9634749957330604e-05, + "loss": 0.6754, + "num_tokens": 281596912.0, + "step": 2984 + }, + { + "epoch": 0.5094726062467998, + "grad_norm": 0.4502046084842937, + "learning_rate": 1.9627922853729308e-05, + "loss": 0.6508, + "num_tokens": 281714977.0, + "step": 2985 + }, + { + "epoch": 0.5096432838368322, + "grad_norm": 0.47907545473856067, + "learning_rate": 1.962109575012801e-05, + "loss": 0.5671, + "num_tokens": 281802206.0, + "step": 2986 + }, + { + "epoch": 0.5098139614268646, + "grad_norm": 0.530366170950225, + "learning_rate": 1.9614268646526712e-05, + "loss": 0.5955, + "num_tokens": 281889443.0, + "step": 2987 + }, + { + "epoch": 0.5099846390168971, + "grad_norm": 0.4623370003924888, + "learning_rate": 1.9607441542925415e-05, + "loss": 0.5207, + "num_tokens": 281984193.0, + "step": 2988 + }, + { + "epoch": 0.5101553166069295, + "grad_norm": 0.49909173616891644, + "learning_rate": 1.960061443932412e-05, + "loss": 0.5945, + "num_tokens": 282084342.0, + "step": 2989 + }, + { + "epoch": 0.5103259941969619, + "grad_norm": 0.5179203294604595, + "learning_rate": 1.959378733572282e-05, + "loss": 0.5445, + "num_tokens": 282151303.0, + "step": 2990 + }, + { + "epoch": 0.5104966717869943, + "grad_norm": 0.49245418357466686, + "learning_rate": 1.9586960232121523e-05, + "loss": 0.6116, + "num_tokens": 282248160.0, + "step": 2991 + }, + { + "epoch": 0.5106673493770268, + "grad_norm": 0.4622715852388729, + "learning_rate": 1.9580133128520227e-05, + "loss": 0.5322, + "num_tokens": 282341664.0, + "step": 2992 + }, + { + "epoch": 0.5108380269670593, + "grad_norm": 0.5243470610776291, + "learning_rate": 1.957330602491893e-05, + "loss": 0.5974, + "num_tokens": 282423621.0, + "step": 2993 + }, + { + "epoch": 0.5110087045570917, + "grad_norm": 0.5556243636411453, + "learning_rate": 1.9566478921317635e-05, + "loss": 0.717, + "num_tokens": 282505588.0, + "step": 2994 + }, + { + "epoch": 0.5111793821471241, + "grad_norm": 0.5072434851303513, + "learning_rate": 1.9559651817716335e-05, + "loss": 0.573, + "num_tokens": 282598864.0, + "step": 2995 + }, + { + "epoch": 0.5113500597371565, + "grad_norm": 0.5225446207886094, + "learning_rate": 1.955282471411504e-05, + "loss": 0.6382, + "num_tokens": 282689028.0, + "step": 2996 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 0.476109844287748, + "learning_rate": 1.954599761051374e-05, + "loss": 0.5917, + "num_tokens": 282793863.0, + "step": 2997 + }, + { + "epoch": 0.5116914149172214, + "grad_norm": 0.4360343762801639, + "learning_rate": 1.9539170506912443e-05, + "loss": 0.5952, + "num_tokens": 282917345.0, + "step": 2998 + }, + { + "epoch": 0.5118620925072538, + "grad_norm": 0.5629899474850296, + "learning_rate": 1.9532343403311147e-05, + "loss": 0.5938, + "num_tokens": 283025443.0, + "step": 2999 + }, + { + "epoch": 0.5120327700972862, + "grad_norm": 0.4827770436003431, + "learning_rate": 1.952551629970985e-05, + "loss": 0.6941, + "num_tokens": 283140841.0, + "step": 3000 + }, + { + "epoch": 0.5122034476873186, + "grad_norm": 0.4950193129650829, + "learning_rate": 1.9518689196108554e-05, + "loss": 0.604, + "num_tokens": 283242724.0, + "step": 3001 + }, + { + "epoch": 0.512374125277351, + "grad_norm": 0.538507416274736, + "learning_rate": 1.9511862092507255e-05, + "loss": 0.5611, + "num_tokens": 283314086.0, + "step": 3002 + }, + { + "epoch": 0.5125448028673835, + "grad_norm": 0.4902763877975039, + "learning_rate": 1.950503498890596e-05, + "loss": 0.5236, + "num_tokens": 283394447.0, + "step": 3003 + }, + { + "epoch": 0.512715480457416, + "grad_norm": 0.4808692674379869, + "learning_rate": 1.9498207885304662e-05, + "loss": 0.6275, + "num_tokens": 283495255.0, + "step": 3004 + }, + { + "epoch": 0.5128861580474484, + "grad_norm": 0.4524486935752449, + "learning_rate": 1.9491380781703363e-05, + "loss": 0.6077, + "num_tokens": 283607115.0, + "step": 3005 + }, + { + "epoch": 0.5130568356374808, + "grad_norm": 0.5013746394545435, + "learning_rate": 1.9484553678102066e-05, + "loss": 0.5317, + "num_tokens": 283685570.0, + "step": 3006 + }, + { + "epoch": 0.5132275132275133, + "grad_norm": 0.4696492811067798, + "learning_rate": 1.947772657450077e-05, + "loss": 0.537, + "num_tokens": 283790425.0, + "step": 3007 + }, + { + "epoch": 0.5133981908175457, + "grad_norm": 0.5181201771383211, + "learning_rate": 1.947089947089947e-05, + "loss": 0.5493, + "num_tokens": 283864698.0, + "step": 3008 + }, + { + "epoch": 0.5135688684075781, + "grad_norm": 0.4877515465542224, + "learning_rate": 1.9464072367298174e-05, + "loss": 0.6028, + "num_tokens": 283960900.0, + "step": 3009 + }, + { + "epoch": 0.5137395459976105, + "grad_norm": 0.5027771829819515, + "learning_rate": 1.9457245263696878e-05, + "loss": 0.5916, + "num_tokens": 284055415.0, + "step": 3010 + }, + { + "epoch": 0.5139102235876429, + "grad_norm": 0.48986552585127024, + "learning_rate": 1.9450418160095582e-05, + "loss": 0.5521, + "num_tokens": 284137168.0, + "step": 3011 + }, + { + "epoch": 0.5140809011776754, + "grad_norm": 0.5119016201134485, + "learning_rate": 1.9443591056494285e-05, + "loss": 0.5387, + "num_tokens": 284212400.0, + "step": 3012 + }, + { + "epoch": 0.5142515787677078, + "grad_norm": 0.4899372920087474, + "learning_rate": 1.9436763952892986e-05, + "loss": 0.5631, + "num_tokens": 284299926.0, + "step": 3013 + }, + { + "epoch": 0.5144222563577402, + "grad_norm": 0.4627070474435737, + "learning_rate": 1.942993684929169e-05, + "loss": 0.5282, + "num_tokens": 284390237.0, + "step": 3014 + }, + { + "epoch": 0.5145929339477726, + "grad_norm": 0.45876659524433944, + "learning_rate": 1.942310974569039e-05, + "loss": 0.5867, + "num_tokens": 284495904.0, + "step": 3015 + }, + { + "epoch": 0.514763611537805, + "grad_norm": 0.4531856732068326, + "learning_rate": 1.9416282642089094e-05, + "loss": 0.5674, + "num_tokens": 284599765.0, + "step": 3016 + }, + { + "epoch": 0.5149342891278376, + "grad_norm": 0.47073773617801373, + "learning_rate": 1.9409455538487798e-05, + "loss": 0.6272, + "num_tokens": 284704393.0, + "step": 3017 + }, + { + "epoch": 0.51510496671787, + "grad_norm": 0.5556281196176842, + "learning_rate": 1.94026284348865e-05, + "loss": 0.6858, + "num_tokens": 284831806.0, + "step": 3018 + }, + { + "epoch": 0.5152756443079024, + "grad_norm": 0.4636423093459759, + "learning_rate": 1.9395801331285205e-05, + "loss": 0.5445, + "num_tokens": 284925750.0, + "step": 3019 + }, + { + "epoch": 0.5154463218979348, + "grad_norm": 0.5172454843211848, + "learning_rate": 1.938897422768391e-05, + "loss": 0.5603, + "num_tokens": 285002267.0, + "step": 3020 + }, + { + "epoch": 0.5156169994879672, + "grad_norm": 0.5217880374596379, + "learning_rate": 1.938214712408261e-05, + "loss": 0.6191, + "num_tokens": 285094178.0, + "step": 3021 + }, + { + "epoch": 0.5157876770779997, + "grad_norm": 0.48150729983089013, + "learning_rate": 1.9375320020481313e-05, + "loss": 0.5423, + "num_tokens": 285194975.0, + "step": 3022 + }, + { + "epoch": 0.5159583546680321, + "grad_norm": 0.5104719614188584, + "learning_rate": 1.9368492916880017e-05, + "loss": 0.6351, + "num_tokens": 285279276.0, + "step": 3023 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.4876665873192858, + "learning_rate": 1.9361665813278717e-05, + "loss": 0.5916, + "num_tokens": 285369027.0, + "step": 3024 + }, + { + "epoch": 0.5162997098480969, + "grad_norm": 0.49809699138444763, + "learning_rate": 1.935483870967742e-05, + "loss": 0.6036, + "num_tokens": 285465401.0, + "step": 3025 + }, + { + "epoch": 0.5164703874381293, + "grad_norm": 0.4816305692056345, + "learning_rate": 1.9348011606076125e-05, + "loss": 0.6263, + "num_tokens": 285567129.0, + "step": 3026 + }, + { + "epoch": 0.5166410650281618, + "grad_norm": 0.5046099856709171, + "learning_rate": 1.9341184502474825e-05, + "loss": 0.6389, + "num_tokens": 285659715.0, + "step": 3027 + }, + { + "epoch": 0.5168117426181942, + "grad_norm": 0.5020514058115976, + "learning_rate": 1.933435739887353e-05, + "loss": 0.501, + "num_tokens": 285732251.0, + "step": 3028 + }, + { + "epoch": 0.5169824202082267, + "grad_norm": 0.511957456816222, + "learning_rate": 1.9327530295272232e-05, + "loss": 0.5947, + "num_tokens": 285813118.0, + "step": 3029 + }, + { + "epoch": 0.5171530977982591, + "grad_norm": 0.48154322659100113, + "learning_rate": 1.9320703191670936e-05, + "loss": 0.5768, + "num_tokens": 285911107.0, + "step": 3030 + }, + { + "epoch": 0.5173237753882916, + "grad_norm": 0.5167740776492398, + "learning_rate": 1.931387608806964e-05, + "loss": 0.6153, + "num_tokens": 286000637.0, + "step": 3031 + }, + { + "epoch": 0.517494452978324, + "grad_norm": 0.5226636203175864, + "learning_rate": 1.930704898446834e-05, + "loss": 0.565, + "num_tokens": 286078744.0, + "step": 3032 + }, + { + "epoch": 0.5176651305683564, + "grad_norm": 0.5882399039665996, + "learning_rate": 1.9300221880867044e-05, + "loss": 0.5891, + "num_tokens": 286141377.0, + "step": 3033 + }, + { + "epoch": 0.5178358081583888, + "grad_norm": 0.4848101422904708, + "learning_rate": 1.9293394777265745e-05, + "loss": 0.6931, + "num_tokens": 286251333.0, + "step": 3034 + }, + { + "epoch": 0.5180064857484212, + "grad_norm": 0.5328465917443551, + "learning_rate": 1.9286567673664448e-05, + "loss": 0.5384, + "num_tokens": 286321604.0, + "step": 3035 + }, + { + "epoch": 0.5181771633384537, + "grad_norm": 0.5044144193585542, + "learning_rate": 1.9279740570063152e-05, + "loss": 0.6092, + "num_tokens": 286410800.0, + "step": 3036 + }, + { + "epoch": 0.5183478409284861, + "grad_norm": 0.5054690928006791, + "learning_rate": 1.9272913466461856e-05, + "loss": 0.5397, + "num_tokens": 286487176.0, + "step": 3037 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.4813520903445108, + "learning_rate": 1.926608636286056e-05, + "loss": 0.5792, + "num_tokens": 286575659.0, + "step": 3038 + }, + { + "epoch": 0.5186891961085509, + "grad_norm": 0.5077837053874465, + "learning_rate": 1.925925925925926e-05, + "loss": 0.6257, + "num_tokens": 286672233.0, + "step": 3039 + }, + { + "epoch": 0.5188598736985833, + "grad_norm": 0.48236979674324726, + "learning_rate": 1.9252432155657964e-05, + "loss": 0.6064, + "num_tokens": 286768634.0, + "step": 3040 + }, + { + "epoch": 0.5190305512886159, + "grad_norm": 0.5495224752518717, + "learning_rate": 1.9245605052056667e-05, + "loss": 0.6363, + "num_tokens": 286844411.0, + "step": 3041 + }, + { + "epoch": 0.5192012288786483, + "grad_norm": 0.48521363641370185, + "learning_rate": 1.9238777948455368e-05, + "loss": 0.6568, + "num_tokens": 286941807.0, + "step": 3042 + }, + { + "epoch": 0.5193719064686807, + "grad_norm": 1.5228632554393573, + "learning_rate": 1.923195084485407e-05, + "loss": 0.6466, + "num_tokens": 287036506.0, + "step": 3043 + }, + { + "epoch": 0.5195425840587131, + "grad_norm": 0.5101526111811319, + "learning_rate": 1.9225123741252775e-05, + "loss": 0.6649, + "num_tokens": 287131012.0, + "step": 3044 + }, + { + "epoch": 0.5197132616487455, + "grad_norm": 0.4986510648102983, + "learning_rate": 1.921829663765148e-05, + "loss": 0.6099, + "num_tokens": 287221035.0, + "step": 3045 + }, + { + "epoch": 0.519883939238778, + "grad_norm": 0.5548382687774132, + "learning_rate": 1.921146953405018e-05, + "loss": 0.6217, + "num_tokens": 287292326.0, + "step": 3046 + }, + { + "epoch": 0.5200546168288104, + "grad_norm": 0.4554967398245753, + "learning_rate": 1.9204642430448883e-05, + "loss": 0.6382, + "num_tokens": 287413783.0, + "step": 3047 + }, + { + "epoch": 0.5202252944188428, + "grad_norm": 0.4322202331786147, + "learning_rate": 1.9197815326847587e-05, + "loss": 0.5768, + "num_tokens": 287531619.0, + "step": 3048 + }, + { + "epoch": 0.5203959720088752, + "grad_norm": 0.44901753249154897, + "learning_rate": 1.919098822324629e-05, + "loss": 0.5493, + "num_tokens": 287637645.0, + "step": 3049 + }, + { + "epoch": 0.5205666495989076, + "grad_norm": 0.5211045382420716, + "learning_rate": 1.918416111964499e-05, + "loss": 0.6247, + "num_tokens": 287734817.0, + "step": 3050 + }, + { + "epoch": 0.5207373271889401, + "grad_norm": 0.4886424431346369, + "learning_rate": 1.9177334016043695e-05, + "loss": 0.5002, + "num_tokens": 287818165.0, + "step": 3051 + }, + { + "epoch": 0.5209080047789725, + "grad_norm": 0.48465132078631906, + "learning_rate": 1.91705069124424e-05, + "loss": 0.6078, + "num_tokens": 287920763.0, + "step": 3052 + }, + { + "epoch": 0.5210786823690049, + "grad_norm": 0.5124813734123707, + "learning_rate": 1.91636798088411e-05, + "loss": 0.5601, + "num_tokens": 287994739.0, + "step": 3053 + }, + { + "epoch": 0.5212493599590374, + "grad_norm": 0.454315701155547, + "learning_rate": 1.9156852705239803e-05, + "loss": 0.6836, + "num_tokens": 288129005.0, + "step": 3054 + }, + { + "epoch": 0.5214200375490698, + "grad_norm": 0.4932409259681437, + "learning_rate": 1.9150025601638507e-05, + "loss": 0.5367, + "num_tokens": 288207646.0, + "step": 3055 + }, + { + "epoch": 0.5215907151391023, + "grad_norm": 0.45861993920417854, + "learning_rate": 1.914319849803721e-05, + "loss": 0.5717, + "num_tokens": 288310646.0, + "step": 3056 + }, + { + "epoch": 0.5217613927291347, + "grad_norm": 0.5059191566207744, + "learning_rate": 1.9136371394435914e-05, + "loss": 0.5841, + "num_tokens": 288386143.0, + "step": 3057 + }, + { + "epoch": 0.5219320703191671, + "grad_norm": 0.48893187170993, + "learning_rate": 1.9129544290834615e-05, + "loss": 0.6688, + "num_tokens": 288507418.0, + "step": 3058 + }, + { + "epoch": 0.5221027479091995, + "grad_norm": 0.48248134982254554, + "learning_rate": 1.9122717187233318e-05, + "loss": 0.5296, + "num_tokens": 288593725.0, + "step": 3059 + }, + { + "epoch": 0.522273425499232, + "grad_norm": 0.4290723430703719, + "learning_rate": 1.9115890083632022e-05, + "loss": 0.572, + "num_tokens": 288711237.0, + "step": 3060 + }, + { + "epoch": 0.5224441030892644, + "grad_norm": 0.5100046363756213, + "learning_rate": 1.9109062980030722e-05, + "loss": 0.6185, + "num_tokens": 288799765.0, + "step": 3061 + }, + { + "epoch": 0.5226147806792968, + "grad_norm": 2.1172531965142025, + "learning_rate": 1.9102235876429426e-05, + "loss": 0.6748, + "num_tokens": 288881655.0, + "step": 3062 + }, + { + "epoch": 0.5227854582693292, + "grad_norm": 0.4391269532085232, + "learning_rate": 1.909540877282813e-05, + "loss": 0.544, + "num_tokens": 288989117.0, + "step": 3063 + }, + { + "epoch": 0.5229561358593616, + "grad_norm": 0.4888247967699203, + "learning_rate": 1.908858166922683e-05, + "loss": 0.6287, + "num_tokens": 289080247.0, + "step": 3064 + }, + { + "epoch": 0.523126813449394, + "grad_norm": 0.4724934015259245, + "learning_rate": 1.9081754565625534e-05, + "loss": 0.5505, + "num_tokens": 289167228.0, + "step": 3065 + }, + { + "epoch": 0.5232974910394266, + "grad_norm": 0.52857593254418, + "learning_rate": 1.9074927462024238e-05, + "loss": 0.5131, + "num_tokens": 289231423.0, + "step": 3066 + }, + { + "epoch": 0.523468168629459, + "grad_norm": 0.5204026469098294, + "learning_rate": 1.906810035842294e-05, + "loss": 0.5975, + "num_tokens": 289309694.0, + "step": 3067 + }, + { + "epoch": 0.5236388462194914, + "grad_norm": 0.5199542356497531, + "learning_rate": 1.9061273254821645e-05, + "loss": 0.6471, + "num_tokens": 289397751.0, + "step": 3068 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 1.524366310335856, + "learning_rate": 1.9054446151220346e-05, + "loss": 0.5984, + "num_tokens": 289481609.0, + "step": 3069 + }, + { + "epoch": 0.5239802013995563, + "grad_norm": 0.508134895841481, + "learning_rate": 1.904761904761905e-05, + "loss": 0.5914, + "num_tokens": 289564347.0, + "step": 3070 + }, + { + "epoch": 0.5241508789895887, + "grad_norm": 0.454723155153608, + "learning_rate": 1.904079194401775e-05, + "loss": 0.5585, + "num_tokens": 289664958.0, + "step": 3071 + }, + { + "epoch": 0.5243215565796211, + "grad_norm": 0.4951123133062021, + "learning_rate": 1.9033964840416454e-05, + "loss": 0.6587, + "num_tokens": 289772482.0, + "step": 3072 + }, + { + "epoch": 0.5244922341696535, + "grad_norm": 0.5306634005039207, + "learning_rate": 1.9027137736815157e-05, + "loss": 0.5695, + "num_tokens": 289872338.0, + "step": 3073 + }, + { + "epoch": 0.5246629117596859, + "grad_norm": 0.44941872366850255, + "learning_rate": 1.902031063321386e-05, + "loss": 0.5187, + "num_tokens": 289971375.0, + "step": 3074 + }, + { + "epoch": 0.5248335893497184, + "grad_norm": 0.5033283957777007, + "learning_rate": 1.9013483529612565e-05, + "loss": 0.653, + "num_tokens": 290058980.0, + "step": 3075 + }, + { + "epoch": 0.5250042669397508, + "grad_norm": 0.4634991004334279, + "learning_rate": 1.900665642601127e-05, + "loss": 0.6152, + "num_tokens": 290172457.0, + "step": 3076 + }, + { + "epoch": 0.5251749445297832, + "grad_norm": 0.7397813267490337, + "learning_rate": 1.899982932240997e-05, + "loss": 0.5889, + "num_tokens": 290248559.0, + "step": 3077 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 0.4545408132966553, + "learning_rate": 1.8993002218808673e-05, + "loss": 0.5189, + "num_tokens": 290353084.0, + "step": 3078 + }, + { + "epoch": 0.5255162997098481, + "grad_norm": 0.5337862846065333, + "learning_rate": 1.8986175115207373e-05, + "loss": 0.5693, + "num_tokens": 290426274.0, + "step": 3079 + }, + { + "epoch": 0.5256869772998806, + "grad_norm": 0.5072801236477027, + "learning_rate": 1.8979348011606077e-05, + "loss": 0.626, + "num_tokens": 290520308.0, + "step": 3080 + }, + { + "epoch": 0.525857654889913, + "grad_norm": 0.6846596353320044, + "learning_rate": 1.897252090800478e-05, + "loss": 0.681, + "num_tokens": 290631274.0, + "step": 3081 + }, + { + "epoch": 0.5260283324799454, + "grad_norm": 0.46940235925860574, + "learning_rate": 1.8965693804403484e-05, + "loss": 0.5944, + "num_tokens": 290731753.0, + "step": 3082 + }, + { + "epoch": 0.5261990100699778, + "grad_norm": 0.4581819830385618, + "learning_rate": 1.8958866700802185e-05, + "loss": 0.6281, + "num_tokens": 290840533.0, + "step": 3083 + }, + { + "epoch": 0.5263696876600102, + "grad_norm": 0.5234861313459777, + "learning_rate": 1.895203959720089e-05, + "loss": 0.6172, + "num_tokens": 290924941.0, + "step": 3084 + }, + { + "epoch": 0.5265403652500427, + "grad_norm": 0.5443188455210963, + "learning_rate": 1.8945212493599592e-05, + "loss": 0.5897, + "num_tokens": 291012116.0, + "step": 3085 + }, + { + "epoch": 0.5267110428400751, + "grad_norm": 0.49991854968020083, + "learning_rate": 1.8938385389998296e-05, + "loss": 0.6558, + "num_tokens": 291104819.0, + "step": 3086 + }, + { + "epoch": 0.5268817204301075, + "grad_norm": 0.49915325354140133, + "learning_rate": 1.8931558286396997e-05, + "loss": 0.5598, + "num_tokens": 291187979.0, + "step": 3087 + }, + { + "epoch": 0.5270523980201399, + "grad_norm": 0.5816091477979147, + "learning_rate": 1.89247311827957e-05, + "loss": 0.6216, + "num_tokens": 291286621.0, + "step": 3088 + }, + { + "epoch": 0.5272230756101723, + "grad_norm": 0.47255607851583553, + "learning_rate": 1.8917904079194404e-05, + "loss": 0.5635, + "num_tokens": 291379406.0, + "step": 3089 + }, + { + "epoch": 0.5273937532002048, + "grad_norm": 0.4684916338653572, + "learning_rate": 1.8911076975593104e-05, + "loss": 0.5881, + "num_tokens": 291474745.0, + "step": 3090 + }, + { + "epoch": 0.5275644307902373, + "grad_norm": 0.5866435318981604, + "learning_rate": 1.8904249871991808e-05, + "loss": 0.5629, + "num_tokens": 291537098.0, + "step": 3091 + }, + { + "epoch": 0.5277351083802697, + "grad_norm": 0.48887748506146694, + "learning_rate": 1.8897422768390512e-05, + "loss": 0.539, + "num_tokens": 291624421.0, + "step": 3092 + }, + { + "epoch": 0.5279057859703021, + "grad_norm": 0.4769655046784254, + "learning_rate": 1.8890595664789216e-05, + "loss": 0.5615, + "num_tokens": 291714728.0, + "step": 3093 + }, + { + "epoch": 0.5280764635603346, + "grad_norm": 0.45306263899742144, + "learning_rate": 1.888376856118792e-05, + "loss": 0.5344, + "num_tokens": 291811401.0, + "step": 3094 + }, + { + "epoch": 0.528247141150367, + "grad_norm": 0.4718968143085379, + "learning_rate": 1.887694145758662e-05, + "loss": 0.6405, + "num_tokens": 291922253.0, + "step": 3095 + }, + { + "epoch": 0.5284178187403994, + "grad_norm": 0.49293646928296436, + "learning_rate": 1.8870114353985324e-05, + "loss": 0.6171, + "num_tokens": 292015737.0, + "step": 3096 + }, + { + "epoch": 0.5285884963304318, + "grad_norm": 0.4759670854501478, + "learning_rate": 1.8863287250384027e-05, + "loss": 0.5566, + "num_tokens": 292103536.0, + "step": 3097 + }, + { + "epoch": 0.5287591739204642, + "grad_norm": 0.49465844830277833, + "learning_rate": 1.8856460146782728e-05, + "loss": 0.6606, + "num_tokens": 292214865.0, + "step": 3098 + }, + { + "epoch": 0.5289298515104967, + "grad_norm": 0.5784176835781271, + "learning_rate": 1.884963304318143e-05, + "loss": 0.6092, + "num_tokens": 292295494.0, + "step": 3099 + }, + { + "epoch": 0.5291005291005291, + "grad_norm": 0.46785602039302854, + "learning_rate": 1.8842805939580135e-05, + "loss": 0.6332, + "num_tokens": 292406006.0, + "step": 3100 + }, + { + "epoch": 0.5292712066905615, + "grad_norm": 0.5076115154766822, + "learning_rate": 1.8835978835978836e-05, + "loss": 0.6072, + "num_tokens": 292500133.0, + "step": 3101 + }, + { + "epoch": 0.5294418842805939, + "grad_norm": 0.48243160609350183, + "learning_rate": 1.882915173237754e-05, + "loss": 0.5725, + "num_tokens": 292589253.0, + "step": 3102 + }, + { + "epoch": 0.5296125618706264, + "grad_norm": 0.4937433957404967, + "learning_rate": 1.8822324628776243e-05, + "loss": 0.6808, + "num_tokens": 292703622.0, + "step": 3103 + }, + { + "epoch": 0.5297832394606589, + "grad_norm": 0.5034398626535053, + "learning_rate": 1.8815497525174947e-05, + "loss": 0.5689, + "num_tokens": 292798827.0, + "step": 3104 + }, + { + "epoch": 0.5299539170506913, + "grad_norm": 0.4444816337100842, + "learning_rate": 1.880867042157365e-05, + "loss": 0.6595, + "num_tokens": 292935380.0, + "step": 3105 + }, + { + "epoch": 0.5301245946407237, + "grad_norm": 0.46148661851400324, + "learning_rate": 1.880184331797235e-05, + "loss": 0.6197, + "num_tokens": 293043197.0, + "step": 3106 + }, + { + "epoch": 0.5302952722307561, + "grad_norm": 0.4946595869764554, + "learning_rate": 1.8795016214371055e-05, + "loss": 0.5729, + "num_tokens": 293132361.0, + "step": 3107 + }, + { + "epoch": 0.5304659498207885, + "grad_norm": 0.4645059581582265, + "learning_rate": 1.8788189110769755e-05, + "loss": 0.5804, + "num_tokens": 293230618.0, + "step": 3108 + }, + { + "epoch": 0.530636627410821, + "grad_norm": 0.5138792024133612, + "learning_rate": 1.878136200716846e-05, + "loss": 0.6102, + "num_tokens": 293322605.0, + "step": 3109 + }, + { + "epoch": 0.5308073050008534, + "grad_norm": 0.4717327697724768, + "learning_rate": 1.8774534903567163e-05, + "loss": 0.5821, + "num_tokens": 293424794.0, + "step": 3110 + }, + { + "epoch": 0.5309779825908858, + "grad_norm": 0.5472640720404892, + "learning_rate": 1.8767707799965867e-05, + "loss": 0.5302, + "num_tokens": 293492098.0, + "step": 3111 + }, + { + "epoch": 0.5311486601809182, + "grad_norm": 0.502409823649258, + "learning_rate": 1.876088069636457e-05, + "loss": 0.5663, + "num_tokens": 293573506.0, + "step": 3112 + }, + { + "epoch": 0.5313193377709506, + "grad_norm": 0.5583028272867726, + "learning_rate": 1.8754053592763274e-05, + "loss": 0.6321, + "num_tokens": 293649862.0, + "step": 3113 + }, + { + "epoch": 0.5314900153609831, + "grad_norm": 0.4919084966524508, + "learning_rate": 1.8747226489161974e-05, + "loss": 0.6158, + "num_tokens": 293738238.0, + "step": 3114 + }, + { + "epoch": 0.5316606929510155, + "grad_norm": 0.44877547497738846, + "learning_rate": 1.8740399385560678e-05, + "loss": 0.5219, + "num_tokens": 293838737.0, + "step": 3115 + }, + { + "epoch": 0.531831370541048, + "grad_norm": 0.5091816078784778, + "learning_rate": 1.873357228195938e-05, + "loss": 0.6747, + "num_tokens": 293931223.0, + "step": 3116 + }, + { + "epoch": 0.5320020481310804, + "grad_norm": 0.513770049372497, + "learning_rate": 1.8726745178358082e-05, + "loss": 0.518, + "num_tokens": 294002444.0, + "step": 3117 + }, + { + "epoch": 0.5321727257211128, + "grad_norm": 0.5144762648658163, + "learning_rate": 1.8719918074756786e-05, + "loss": 0.6404, + "num_tokens": 294083709.0, + "step": 3118 + }, + { + "epoch": 0.5323434033111453, + "grad_norm": 0.47123935885642715, + "learning_rate": 1.871309097115549e-05, + "loss": 0.606, + "num_tokens": 294177381.0, + "step": 3119 + }, + { + "epoch": 0.5325140809011777, + "grad_norm": 0.5205291858748078, + "learning_rate": 1.870626386755419e-05, + "loss": 0.6072, + "num_tokens": 294257440.0, + "step": 3120 + }, + { + "epoch": 0.5326847584912101, + "grad_norm": 0.46569845528236886, + "learning_rate": 1.8699436763952894e-05, + "loss": 0.5659, + "num_tokens": 294356959.0, + "step": 3121 + }, + { + "epoch": 0.5328554360812425, + "grad_norm": 0.4477377960003955, + "learning_rate": 1.8692609660351598e-05, + "loss": 0.5357, + "num_tokens": 294460069.0, + "step": 3122 + }, + { + "epoch": 0.533026113671275, + "grad_norm": 0.46100425163723446, + "learning_rate": 1.86857825567503e-05, + "loss": 0.6119, + "num_tokens": 294567585.0, + "step": 3123 + }, + { + "epoch": 0.5331967912613074, + "grad_norm": 0.5375254138913531, + "learning_rate": 1.8678955453149005e-05, + "loss": 0.6738, + "num_tokens": 294652175.0, + "step": 3124 + }, + { + "epoch": 0.5333674688513398, + "grad_norm": 0.4696762081092029, + "learning_rate": 1.8672128349547706e-05, + "loss": 0.5812, + "num_tokens": 294753375.0, + "step": 3125 + }, + { + "epoch": 0.5335381464413722, + "grad_norm": 0.5569104416803978, + "learning_rate": 1.866530124594641e-05, + "loss": 0.6261, + "num_tokens": 294829117.0, + "step": 3126 + }, + { + "epoch": 0.5337088240314046, + "grad_norm": 0.47907533028386085, + "learning_rate": 1.865847414234511e-05, + "loss": 0.5961, + "num_tokens": 294928559.0, + "step": 3127 + }, + { + "epoch": 0.5338795016214372, + "grad_norm": 0.4871309903816328, + "learning_rate": 1.8651647038743814e-05, + "loss": 0.63, + "num_tokens": 295031030.0, + "step": 3128 + }, + { + "epoch": 0.5340501792114696, + "grad_norm": 0.498340397998727, + "learning_rate": 1.8644819935142517e-05, + "loss": 0.5985, + "num_tokens": 295124358.0, + "step": 3129 + }, + { + "epoch": 0.534220856801502, + "grad_norm": 0.4677177820403074, + "learning_rate": 1.863799283154122e-05, + "loss": 0.6105, + "num_tokens": 295235538.0, + "step": 3130 + }, + { + "epoch": 0.5343915343915344, + "grad_norm": 0.44278885921496336, + "learning_rate": 1.8631165727939925e-05, + "loss": 0.6469, + "num_tokens": 295352064.0, + "step": 3131 + }, + { + "epoch": 0.5345622119815668, + "grad_norm": 0.4997947680496291, + "learning_rate": 1.8624338624338625e-05, + "loss": 0.6643, + "num_tokens": 295458037.0, + "step": 3132 + }, + { + "epoch": 0.5347328895715993, + "grad_norm": 0.47983147106104573, + "learning_rate": 1.861751152073733e-05, + "loss": 0.5743, + "num_tokens": 295558378.0, + "step": 3133 + }, + { + "epoch": 0.5349035671616317, + "grad_norm": 0.5742657413633274, + "learning_rate": 1.8610684417136033e-05, + "loss": 0.5877, + "num_tokens": 295642216.0, + "step": 3134 + }, + { + "epoch": 0.5350742447516641, + "grad_norm": 0.5512940575987569, + "learning_rate": 1.8603857313534733e-05, + "loss": 0.546, + "num_tokens": 295703948.0, + "step": 3135 + }, + { + "epoch": 0.5352449223416965, + "grad_norm": 0.4946730163318817, + "learning_rate": 1.8597030209933437e-05, + "loss": 0.5631, + "num_tokens": 295790389.0, + "step": 3136 + }, + { + "epoch": 0.5354155999317289, + "grad_norm": 0.4887477887396777, + "learning_rate": 1.859020310633214e-05, + "loss": 0.5084, + "num_tokens": 295868277.0, + "step": 3137 + }, + { + "epoch": 0.5355862775217614, + "grad_norm": 0.4939331405656355, + "learning_rate": 1.858337600273084e-05, + "loss": 0.6121, + "num_tokens": 295960032.0, + "step": 3138 + }, + { + "epoch": 0.5357569551117938, + "grad_norm": 0.44332207046075983, + "learning_rate": 1.8576548899129545e-05, + "loss": 0.5278, + "num_tokens": 296064609.0, + "step": 3139 + }, + { + "epoch": 0.5359276327018262, + "grad_norm": 0.4716402345665519, + "learning_rate": 1.856972179552825e-05, + "loss": 0.5443, + "num_tokens": 296153273.0, + "step": 3140 + }, + { + "epoch": 0.5360983102918587, + "grad_norm": 0.7331001234109966, + "learning_rate": 1.8562894691926952e-05, + "loss": 0.6505, + "num_tokens": 296239939.0, + "step": 3141 + }, + { + "epoch": 0.5362689878818911, + "grad_norm": 0.6020887696965216, + "learning_rate": 1.8556067588325656e-05, + "loss": 0.5832, + "num_tokens": 296297131.0, + "step": 3142 + }, + { + "epoch": 0.5364396654719236, + "grad_norm": 0.5014675787987937, + "learning_rate": 1.8549240484724356e-05, + "loss": 0.677, + "num_tokens": 296398145.0, + "step": 3143 + }, + { + "epoch": 0.536610343061956, + "grad_norm": 0.4963523187021656, + "learning_rate": 1.854241338112306e-05, + "loss": 0.5831, + "num_tokens": 296486703.0, + "step": 3144 + }, + { + "epoch": 0.5367810206519884, + "grad_norm": 0.5281739579824689, + "learning_rate": 1.853558627752176e-05, + "loss": 0.4828, + "num_tokens": 296552110.0, + "step": 3145 + }, + { + "epoch": 0.5369516982420208, + "grad_norm": 0.4760106425177943, + "learning_rate": 1.8528759173920464e-05, + "loss": 0.5683, + "num_tokens": 296647754.0, + "step": 3146 + }, + { + "epoch": 0.5371223758320532, + "grad_norm": 0.5295359706150942, + "learning_rate": 1.8521932070319168e-05, + "loss": 0.5745, + "num_tokens": 296714173.0, + "step": 3147 + }, + { + "epoch": 0.5372930534220857, + "grad_norm": 0.4925007048696446, + "learning_rate": 1.8515104966717872e-05, + "loss": 0.6572, + "num_tokens": 296811929.0, + "step": 3148 + }, + { + "epoch": 0.5374637310121181, + "grad_norm": 0.5014920664614139, + "learning_rate": 1.8508277863116576e-05, + "loss": 0.5448, + "num_tokens": 296889693.0, + "step": 3149 + }, + { + "epoch": 0.5376344086021505, + "grad_norm": 0.509856093477438, + "learning_rate": 1.850145075951528e-05, + "loss": 0.5221, + "num_tokens": 296970419.0, + "step": 3150 + }, + { + "epoch": 0.5378050861921829, + "grad_norm": 0.4859194339045163, + "learning_rate": 1.849462365591398e-05, + "loss": 0.5701, + "num_tokens": 297058182.0, + "step": 3151 + }, + { + "epoch": 0.5379757637822153, + "grad_norm": 0.4260823101325428, + "learning_rate": 1.8487796552312684e-05, + "loss": 0.5962, + "num_tokens": 297183460.0, + "step": 3152 + }, + { + "epoch": 0.5381464413722479, + "grad_norm": 0.5117070554062627, + "learning_rate": 1.8480969448711384e-05, + "loss": 0.5743, + "num_tokens": 297271913.0, + "step": 3153 + }, + { + "epoch": 0.5383171189622803, + "grad_norm": 0.5398525168197303, + "learning_rate": 1.8474142345110088e-05, + "loss": 0.5196, + "num_tokens": 297335633.0, + "step": 3154 + }, + { + "epoch": 0.5384877965523127, + "grad_norm": 0.4495771082418198, + "learning_rate": 1.846731524150879e-05, + "loss": 0.5624, + "num_tokens": 297458953.0, + "step": 3155 + }, + { + "epoch": 0.5386584741423451, + "grad_norm": 0.5095522098206506, + "learning_rate": 1.8460488137907495e-05, + "loss": 0.5312, + "num_tokens": 297532331.0, + "step": 3156 + }, + { + "epoch": 0.5388291517323776, + "grad_norm": 0.5091703545242908, + "learning_rate": 1.8453661034306196e-05, + "loss": 0.5776, + "num_tokens": 297608284.0, + "step": 3157 + }, + { + "epoch": 0.53899982932241, + "grad_norm": 0.48677995448589406, + "learning_rate": 1.84468339307049e-05, + "loss": 0.6223, + "num_tokens": 297701487.0, + "step": 3158 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 0.5128182744358407, + "learning_rate": 1.8440006827103603e-05, + "loss": 0.5801, + "num_tokens": 297791443.0, + "step": 3159 + }, + { + "epoch": 0.5393411845024748, + "grad_norm": 0.4769646135195715, + "learning_rate": 1.8433179723502307e-05, + "loss": 0.5782, + "num_tokens": 297888605.0, + "step": 3160 + }, + { + "epoch": 0.5395118620925072, + "grad_norm": 0.4679065628152036, + "learning_rate": 1.842635261990101e-05, + "loss": 0.5389, + "num_tokens": 297984545.0, + "step": 3161 + }, + { + "epoch": 0.5396825396825397, + "grad_norm": 0.47102394877596043, + "learning_rate": 1.841952551629971e-05, + "loss": 0.6432, + "num_tokens": 298096207.0, + "step": 3162 + }, + { + "epoch": 0.5398532172725721, + "grad_norm": 0.4811281947811005, + "learning_rate": 1.8412698412698415e-05, + "loss": 0.5351, + "num_tokens": 298191476.0, + "step": 3163 + }, + { + "epoch": 0.5400238948626045, + "grad_norm": 0.4801625711916469, + "learning_rate": 1.8405871309097115e-05, + "loss": 0.5945, + "num_tokens": 298286416.0, + "step": 3164 + }, + { + "epoch": 0.540194572452637, + "grad_norm": 0.49265939682546595, + "learning_rate": 1.839904420549582e-05, + "loss": 0.6251, + "num_tokens": 298386764.0, + "step": 3165 + }, + { + "epoch": 0.5403652500426694, + "grad_norm": 0.5806949875090246, + "learning_rate": 1.8392217101894523e-05, + "loss": 0.5684, + "num_tokens": 298446944.0, + "step": 3166 + }, + { + "epoch": 0.5405359276327019, + "grad_norm": 0.5194972616554627, + "learning_rate": 1.8385389998293226e-05, + "loss": 0.5994, + "num_tokens": 298551130.0, + "step": 3167 + }, + { + "epoch": 0.5407066052227343, + "grad_norm": 0.462951292469605, + "learning_rate": 1.837856289469193e-05, + "loss": 0.6287, + "num_tokens": 298667621.0, + "step": 3168 + }, + { + "epoch": 0.5408772828127667, + "grad_norm": 0.5481396978317484, + "learning_rate": 1.837173579109063e-05, + "loss": 0.7019, + "num_tokens": 298787302.0, + "step": 3169 + }, + { + "epoch": 0.5410479604027991, + "grad_norm": 0.4514213527132757, + "learning_rate": 1.8364908687489334e-05, + "loss": 0.6121, + "num_tokens": 298900323.0, + "step": 3170 + }, + { + "epoch": 0.5412186379928315, + "grad_norm": 0.4739895635184822, + "learning_rate": 1.8358081583888038e-05, + "loss": 0.598, + "num_tokens": 299000916.0, + "step": 3171 + }, + { + "epoch": 0.541389315582864, + "grad_norm": 0.5373048455451247, + "learning_rate": 1.835125448028674e-05, + "loss": 0.6256, + "num_tokens": 299091810.0, + "step": 3172 + }, + { + "epoch": 0.5415599931728964, + "grad_norm": 0.49565904091367974, + "learning_rate": 1.8344427376685442e-05, + "loss": 0.6069, + "num_tokens": 299184629.0, + "step": 3173 + }, + { + "epoch": 0.5417306707629288, + "grad_norm": 0.5678050071708751, + "learning_rate": 1.8337600273084146e-05, + "loss": 0.6119, + "num_tokens": 299256201.0, + "step": 3174 + }, + { + "epoch": 0.5419013483529612, + "grad_norm": 0.5493665813661777, + "learning_rate": 1.833077316948285e-05, + "loss": 0.6038, + "num_tokens": 299342267.0, + "step": 3175 + }, + { + "epoch": 0.5420720259429936, + "grad_norm": 0.6061777124639972, + "learning_rate": 1.832394606588155e-05, + "loss": 0.6665, + "num_tokens": 299402288.0, + "step": 3176 + }, + { + "epoch": 0.5422427035330261, + "grad_norm": 0.4575587146131139, + "learning_rate": 1.8317118962280254e-05, + "loss": 0.511, + "num_tokens": 299495967.0, + "step": 3177 + }, + { + "epoch": 0.5424133811230586, + "grad_norm": 0.47335441681094187, + "learning_rate": 1.8310291858678958e-05, + "loss": 0.5521, + "num_tokens": 299589313.0, + "step": 3178 + }, + { + "epoch": 0.542584058713091, + "grad_norm": 0.5080892885533668, + "learning_rate": 1.830346475507766e-05, + "loss": 0.5423, + "num_tokens": 299674199.0, + "step": 3179 + }, + { + "epoch": 0.5427547363031234, + "grad_norm": 0.47616754706253833, + "learning_rate": 1.8296637651476362e-05, + "loss": 0.5814, + "num_tokens": 299768568.0, + "step": 3180 + }, + { + "epoch": 0.5429254138931559, + "grad_norm": 0.5268873533757173, + "learning_rate": 1.8289810547875066e-05, + "loss": 0.623, + "num_tokens": 299883966.0, + "step": 3181 + }, + { + "epoch": 0.5430960914831883, + "grad_norm": 0.5347463351585031, + "learning_rate": 1.8282983444273766e-05, + "loss": 0.7061, + "num_tokens": 299973541.0, + "step": 3182 + }, + { + "epoch": 0.5432667690732207, + "grad_norm": 0.4666365118649967, + "learning_rate": 1.827615634067247e-05, + "loss": 0.5217, + "num_tokens": 300059735.0, + "step": 3183 + }, + { + "epoch": 0.5434374466632531, + "grad_norm": 0.6678722340013037, + "learning_rate": 1.8269329237071173e-05, + "loss": 0.6449, + "num_tokens": 300128767.0, + "step": 3184 + }, + { + "epoch": 0.5436081242532855, + "grad_norm": 0.4778818539275673, + "learning_rate": 1.8262502133469877e-05, + "loss": 0.5767, + "num_tokens": 300233167.0, + "step": 3185 + }, + { + "epoch": 0.543778801843318, + "grad_norm": 0.47773791275368616, + "learning_rate": 1.825567502986858e-05, + "loss": 0.6249, + "num_tokens": 300340634.0, + "step": 3186 + }, + { + "epoch": 0.5439494794333504, + "grad_norm": 0.48543969441675605, + "learning_rate": 1.8248847926267285e-05, + "loss": 0.6437, + "num_tokens": 300441825.0, + "step": 3187 + }, + { + "epoch": 0.5441201570233828, + "grad_norm": 0.518559513130123, + "learning_rate": 1.8242020822665985e-05, + "loss": 0.6081, + "num_tokens": 300519321.0, + "step": 3188 + }, + { + "epoch": 0.5442908346134152, + "grad_norm": 0.4984595262484238, + "learning_rate": 1.823519371906469e-05, + "loss": 0.5742, + "num_tokens": 300603745.0, + "step": 3189 + }, + { + "epoch": 0.5444615122034477, + "grad_norm": 0.5137485556208886, + "learning_rate": 1.822836661546339e-05, + "loss": 0.5754, + "num_tokens": 300681396.0, + "step": 3190 + }, + { + "epoch": 0.5446321897934802, + "grad_norm": 0.5139848126854727, + "learning_rate": 1.8221539511862093e-05, + "loss": 0.6888, + "num_tokens": 300786118.0, + "step": 3191 + }, + { + "epoch": 0.5448028673835126, + "grad_norm": 0.5341011746051796, + "learning_rate": 1.8214712408260797e-05, + "loss": 0.5602, + "num_tokens": 300852332.0, + "step": 3192 + }, + { + "epoch": 0.544973544973545, + "grad_norm": 0.47136690118520136, + "learning_rate": 1.82078853046595e-05, + "loss": 0.6087, + "num_tokens": 300953361.0, + "step": 3193 + }, + { + "epoch": 0.5451442225635774, + "grad_norm": 0.4920469822534496, + "learning_rate": 1.82010582010582e-05, + "loss": 0.5494, + "num_tokens": 301039297.0, + "step": 3194 + }, + { + "epoch": 0.5453149001536098, + "grad_norm": 0.4045765986627514, + "learning_rate": 1.8194231097456905e-05, + "loss": 0.6028, + "num_tokens": 301189305.0, + "step": 3195 + }, + { + "epoch": 0.5454855777436423, + "grad_norm": 0.5290667768829725, + "learning_rate": 1.818740399385561e-05, + "loss": 0.653, + "num_tokens": 301271414.0, + "step": 3196 + }, + { + "epoch": 0.5456562553336747, + "grad_norm": 0.5606600705473238, + "learning_rate": 1.8180576890254312e-05, + "loss": 0.5793, + "num_tokens": 301342761.0, + "step": 3197 + }, + { + "epoch": 0.5458269329237071, + "grad_norm": 0.47757107445298597, + "learning_rate": 1.8173749786653016e-05, + "loss": 0.5759, + "num_tokens": 301426913.0, + "step": 3198 + }, + { + "epoch": 0.5459976105137395, + "grad_norm": 0.4916283032487723, + "learning_rate": 1.8166922683051716e-05, + "loss": 0.523, + "num_tokens": 301502425.0, + "step": 3199 + }, + { + "epoch": 0.5461682881037719, + "grad_norm": 0.4974874523625466, + "learning_rate": 1.816009557945042e-05, + "loss": 0.539, + "num_tokens": 301580795.0, + "step": 3200 + }, + { + "epoch": 0.5463389656938044, + "grad_norm": 0.5167836872333724, + "learning_rate": 1.815326847584912e-05, + "loss": 0.656, + "num_tokens": 301668144.0, + "step": 3201 + }, + { + "epoch": 0.5465096432838368, + "grad_norm": 0.4421607605024438, + "learning_rate": 1.8146441372247824e-05, + "loss": 0.5614, + "num_tokens": 301781480.0, + "step": 3202 + }, + { + "epoch": 0.5466803208738693, + "grad_norm": 0.4327382609998891, + "learning_rate": 1.8139614268646528e-05, + "loss": 0.5393, + "num_tokens": 301885740.0, + "step": 3203 + }, + { + "epoch": 0.5468509984639017, + "grad_norm": 0.5242168256508906, + "learning_rate": 1.8132787165045232e-05, + "loss": 0.6308, + "num_tokens": 301977618.0, + "step": 3204 + }, + { + "epoch": 0.5470216760539341, + "grad_norm": 0.5027019470236086, + "learning_rate": 1.8125960061443936e-05, + "loss": 0.5576, + "num_tokens": 302066331.0, + "step": 3205 + }, + { + "epoch": 0.5471923536439666, + "grad_norm": 0.4574739279951613, + "learning_rate": 1.811913295784264e-05, + "loss": 0.6986, + "num_tokens": 302193526.0, + "step": 3206 + }, + { + "epoch": 0.547363031233999, + "grad_norm": 0.49300264302155467, + "learning_rate": 1.811230585424134e-05, + "loss": 0.6016, + "num_tokens": 302279945.0, + "step": 3207 + }, + { + "epoch": 0.5475337088240314, + "grad_norm": 0.510589083852796, + "learning_rate": 1.8105478750640043e-05, + "loss": 0.6277, + "num_tokens": 302369656.0, + "step": 3208 + }, + { + "epoch": 0.5477043864140638, + "grad_norm": 0.5298805507825468, + "learning_rate": 1.8098651647038744e-05, + "loss": 0.7264, + "num_tokens": 302458956.0, + "step": 3209 + }, + { + "epoch": 0.5478750640040962, + "grad_norm": 0.5200215035228213, + "learning_rate": 1.8091824543437448e-05, + "loss": 0.6363, + "num_tokens": 302542248.0, + "step": 3210 + }, + { + "epoch": 0.5480457415941287, + "grad_norm": 0.5062231492177758, + "learning_rate": 1.808499743983615e-05, + "loss": 0.6594, + "num_tokens": 302639754.0, + "step": 3211 + }, + { + "epoch": 0.5482164191841611, + "grad_norm": 0.5107437967606423, + "learning_rate": 1.8078170336234855e-05, + "loss": 0.6301, + "num_tokens": 302729142.0, + "step": 3212 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 0.516081881588452, + "learning_rate": 1.8071343232633555e-05, + "loss": 0.6305, + "num_tokens": 302814021.0, + "step": 3213 + }, + { + "epoch": 0.5485577743642259, + "grad_norm": 0.5234151056720422, + "learning_rate": 1.806451612903226e-05, + "loss": 0.5049, + "num_tokens": 302881807.0, + "step": 3214 + }, + { + "epoch": 0.5487284519542585, + "grad_norm": 0.4823381538407324, + "learning_rate": 1.8057689025430963e-05, + "loss": 0.6385, + "num_tokens": 302989371.0, + "step": 3215 + }, + { + "epoch": 0.5488991295442909, + "grad_norm": 0.5150285631001777, + "learning_rate": 1.8050861921829667e-05, + "loss": 0.5675, + "num_tokens": 303069599.0, + "step": 3216 + }, + { + "epoch": 0.5490698071343233, + "grad_norm": 0.4723402181102485, + "learning_rate": 1.8044034818228367e-05, + "loss": 0.6057, + "num_tokens": 303174219.0, + "step": 3217 + }, + { + "epoch": 0.5492404847243557, + "grad_norm": 0.49337664265067743, + "learning_rate": 1.803720771462707e-05, + "loss": 0.5265, + "num_tokens": 303262956.0, + "step": 3218 + }, + { + "epoch": 0.5494111623143881, + "grad_norm": 0.5077334143231674, + "learning_rate": 1.803038061102577e-05, + "loss": 0.5436, + "num_tokens": 303348342.0, + "step": 3219 + }, + { + "epoch": 0.5495818399044206, + "grad_norm": 0.47025477353862094, + "learning_rate": 1.8023553507424475e-05, + "loss": 0.6163, + "num_tokens": 303450794.0, + "step": 3220 + }, + { + "epoch": 0.549752517494453, + "grad_norm": 0.4520548381352575, + "learning_rate": 1.801672640382318e-05, + "loss": 0.5988, + "num_tokens": 303561550.0, + "step": 3221 + }, + { + "epoch": 0.5499231950844854, + "grad_norm": 0.4772411906887436, + "learning_rate": 1.8009899300221883e-05, + "loss": 0.5595, + "num_tokens": 303655516.0, + "step": 3222 + }, + { + "epoch": 0.5500938726745178, + "grad_norm": 0.6399986101806774, + "learning_rate": 1.8003072196620586e-05, + "loss": 0.7234, + "num_tokens": 303749288.0, + "step": 3223 + }, + { + "epoch": 0.5502645502645502, + "grad_norm": 0.5830972218278365, + "learning_rate": 1.799624509301929e-05, + "loss": 0.6602, + "num_tokens": 303826917.0, + "step": 3224 + }, + { + "epoch": 0.5504352278545827, + "grad_norm": 0.4513218194573533, + "learning_rate": 1.798941798941799e-05, + "loss": 0.5475, + "num_tokens": 303927059.0, + "step": 3225 + }, + { + "epoch": 0.5506059054446151, + "grad_norm": 0.5090470490543042, + "learning_rate": 1.7982590885816694e-05, + "loss": 0.6211, + "num_tokens": 304017605.0, + "step": 3226 + }, + { + "epoch": 0.5507765830346476, + "grad_norm": 0.4817139696161923, + "learning_rate": 1.7975763782215398e-05, + "loss": 0.568, + "num_tokens": 304111586.0, + "step": 3227 + }, + { + "epoch": 0.55094726062468, + "grad_norm": 0.4876906531398363, + "learning_rate": 1.79689366786141e-05, + "loss": 0.5659, + "num_tokens": 304202554.0, + "step": 3228 + }, + { + "epoch": 0.5511179382147124, + "grad_norm": 0.5135803400123429, + "learning_rate": 1.7962109575012802e-05, + "loss": 0.5686, + "num_tokens": 304286081.0, + "step": 3229 + }, + { + "epoch": 0.5512886158047449, + "grad_norm": 0.5542935167033908, + "learning_rate": 1.7955282471411506e-05, + "loss": 0.6218, + "num_tokens": 304393147.0, + "step": 3230 + }, + { + "epoch": 0.5514592933947773, + "grad_norm": 0.4631522362795691, + "learning_rate": 1.7948455367810206e-05, + "loss": 0.5925, + "num_tokens": 304488182.0, + "step": 3231 + }, + { + "epoch": 0.5516299709848097, + "grad_norm": 0.46177547248283696, + "learning_rate": 1.794162826420891e-05, + "loss": 0.5887, + "num_tokens": 304595680.0, + "step": 3232 + }, + { + "epoch": 0.5518006485748421, + "grad_norm": 0.4249764851239371, + "learning_rate": 1.7934801160607614e-05, + "loss": 0.5523, + "num_tokens": 304711338.0, + "step": 3233 + }, + { + "epoch": 0.5519713261648745, + "grad_norm": 0.472141374427442, + "learning_rate": 1.7927974057006318e-05, + "loss": 0.567, + "num_tokens": 304802397.0, + "step": 3234 + }, + { + "epoch": 0.552142003754907, + "grad_norm": 0.499008726962044, + "learning_rate": 1.792114695340502e-05, + "loss": 0.5972, + "num_tokens": 304896005.0, + "step": 3235 + }, + { + "epoch": 0.5523126813449394, + "grad_norm": 0.4891375649919364, + "learning_rate": 1.791431984980372e-05, + "loss": 0.6879, + "num_tokens": 305022644.0, + "step": 3236 + }, + { + "epoch": 0.5524833589349718, + "grad_norm": 0.47382732594438787, + "learning_rate": 1.7907492746202425e-05, + "loss": 0.5186, + "num_tokens": 305104199.0, + "step": 3237 + }, + { + "epoch": 0.5526540365250042, + "grad_norm": 0.5170436842163904, + "learning_rate": 1.7900665642601126e-05, + "loss": 0.6089, + "num_tokens": 305189019.0, + "step": 3238 + }, + { + "epoch": 0.5528247141150366, + "grad_norm": 0.5197673760716421, + "learning_rate": 1.789383853899983e-05, + "loss": 0.5526, + "num_tokens": 305263058.0, + "step": 3239 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 0.545814739763571, + "learning_rate": 1.7887011435398533e-05, + "loss": 0.5579, + "num_tokens": 305355625.0, + "step": 3240 + }, + { + "epoch": 0.5531660692951016, + "grad_norm": 0.46661647062513395, + "learning_rate": 1.7880184331797237e-05, + "loss": 0.607, + "num_tokens": 305465993.0, + "step": 3241 + }, + { + "epoch": 0.553336746885134, + "grad_norm": 0.4664173992416769, + "learning_rate": 1.787335722819594e-05, + "loss": 0.5303, + "num_tokens": 305560054.0, + "step": 3242 + }, + { + "epoch": 0.5535074244751664, + "grad_norm": 0.568720812769375, + "learning_rate": 1.7866530124594645e-05, + "loss": 0.517, + "num_tokens": 305626284.0, + "step": 3243 + }, + { + "epoch": 0.5536781020651989, + "grad_norm": 0.43603321235188447, + "learning_rate": 1.7859703020993345e-05, + "loss": 0.5519, + "num_tokens": 305742173.0, + "step": 3244 + }, + { + "epoch": 0.5538487796552313, + "grad_norm": 0.46996134541818224, + "learning_rate": 1.785287591739205e-05, + "loss": 0.6059, + "num_tokens": 305840660.0, + "step": 3245 + }, + { + "epoch": 0.5540194572452637, + "grad_norm": 0.4531314697794946, + "learning_rate": 1.784604881379075e-05, + "loss": 0.6001, + "num_tokens": 305945919.0, + "step": 3246 + }, + { + "epoch": 0.5541901348352961, + "grad_norm": 0.45883437444837166, + "learning_rate": 1.7839221710189453e-05, + "loss": 0.518, + "num_tokens": 306037434.0, + "step": 3247 + }, + { + "epoch": 0.5543608124253285, + "grad_norm": 0.5067524085668664, + "learning_rate": 1.7832394606588157e-05, + "loss": 0.5823, + "num_tokens": 306123727.0, + "step": 3248 + }, + { + "epoch": 0.554531490015361, + "grad_norm": 0.49109835670158475, + "learning_rate": 1.782556750298686e-05, + "loss": 0.6271, + "num_tokens": 306221052.0, + "step": 3249 + }, + { + "epoch": 0.5547021676053934, + "grad_norm": 0.5248816548787603, + "learning_rate": 1.781874039938556e-05, + "loss": 0.5471, + "num_tokens": 306290238.0, + "step": 3250 + }, + { + "epoch": 0.5548728451954258, + "grad_norm": 0.5389995989238137, + "learning_rate": 1.7811913295784265e-05, + "loss": 0.599, + "num_tokens": 306365015.0, + "step": 3251 + }, + { + "epoch": 0.5550435227854583, + "grad_norm": 0.45246749456858393, + "learning_rate": 1.780508619218297e-05, + "loss": 0.5739, + "num_tokens": 306474320.0, + "step": 3252 + }, + { + "epoch": 0.5552142003754907, + "grad_norm": 0.47046632397711896, + "learning_rate": 1.7798259088581672e-05, + "loss": 0.5607, + "num_tokens": 306574641.0, + "step": 3253 + }, + { + "epoch": 0.5553848779655232, + "grad_norm": 0.5526187193711913, + "learning_rate": 1.7791431984980372e-05, + "loss": 0.6452, + "num_tokens": 306662624.0, + "step": 3254 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.5282506170555303, + "learning_rate": 1.7784604881379076e-05, + "loss": 0.6892, + "num_tokens": 306764459.0, + "step": 3255 + }, + { + "epoch": 0.555726233145588, + "grad_norm": 0.42626734291698226, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.6354, + "num_tokens": 306898202.0, + "step": 3256 + }, + { + "epoch": 0.5558969107356204, + "grad_norm": 0.45583998515904645, + "learning_rate": 1.777095067417648e-05, + "loss": 0.5292, + "num_tokens": 306989372.0, + "step": 3257 + }, + { + "epoch": 0.5560675883256528, + "grad_norm": 0.4729546708431539, + "learning_rate": 1.7764123570575184e-05, + "loss": 0.5586, + "num_tokens": 307079368.0, + "step": 3258 + }, + { + "epoch": 0.5562382659156853, + "grad_norm": 0.5412158141009671, + "learning_rate": 1.7757296466973888e-05, + "loss": 0.6058, + "num_tokens": 307154916.0, + "step": 3259 + }, + { + "epoch": 0.5564089435057177, + "grad_norm": 0.4985072417762757, + "learning_rate": 1.775046936337259e-05, + "loss": 0.5186, + "num_tokens": 307229811.0, + "step": 3260 + }, + { + "epoch": 0.5565796210957501, + "grad_norm": 0.49914609763029083, + "learning_rate": 1.7743642259771295e-05, + "loss": 0.5331, + "num_tokens": 307311504.0, + "step": 3261 + }, + { + "epoch": 0.5567502986857825, + "grad_norm": 0.45308885795214476, + "learning_rate": 1.7736815156169996e-05, + "loss": 0.5457, + "num_tokens": 307406460.0, + "step": 3262 + }, + { + "epoch": 0.5569209762758149, + "grad_norm": 0.42820504559889633, + "learning_rate": 1.77299880525687e-05, + "loss": 0.6331, + "num_tokens": 307542428.0, + "step": 3263 + }, + { + "epoch": 0.5570916538658475, + "grad_norm": 0.5014936856172574, + "learning_rate": 1.7723160948967403e-05, + "loss": 0.5959, + "num_tokens": 307634502.0, + "step": 3264 + }, + { + "epoch": 0.5572623314558799, + "grad_norm": 0.5181959545458348, + "learning_rate": 1.7716333845366104e-05, + "loss": 0.6621, + "num_tokens": 307729705.0, + "step": 3265 + }, + { + "epoch": 0.5574330090459123, + "grad_norm": 0.5127166513836698, + "learning_rate": 1.7709506741764807e-05, + "loss": 0.554, + "num_tokens": 307804167.0, + "step": 3266 + }, + { + "epoch": 0.5576036866359447, + "grad_norm": 0.509682024435058, + "learning_rate": 1.770267963816351e-05, + "loss": 0.6554, + "num_tokens": 307895708.0, + "step": 3267 + }, + { + "epoch": 0.5577743642259771, + "grad_norm": 0.6067119577624526, + "learning_rate": 1.7695852534562215e-05, + "loss": 0.6178, + "num_tokens": 307960130.0, + "step": 3268 + }, + { + "epoch": 0.5579450418160096, + "grad_norm": 0.5456001007185598, + "learning_rate": 1.7689025430960915e-05, + "loss": 0.5635, + "num_tokens": 308027455.0, + "step": 3269 + }, + { + "epoch": 0.558115719406042, + "grad_norm": 0.577521520516446, + "learning_rate": 1.768219832735962e-05, + "loss": 0.6347, + "num_tokens": 308112416.0, + "step": 3270 + }, + { + "epoch": 0.5582863969960744, + "grad_norm": 0.4788793616345669, + "learning_rate": 1.7675371223758323e-05, + "loss": 0.599, + "num_tokens": 308209810.0, + "step": 3271 + }, + { + "epoch": 0.5584570745861068, + "grad_norm": 0.5111544818376718, + "learning_rate": 1.7668544120157027e-05, + "loss": 0.5075, + "num_tokens": 308288632.0, + "step": 3272 + }, + { + "epoch": 0.5586277521761392, + "grad_norm": 0.48367623993346553, + "learning_rate": 1.7661717016555727e-05, + "loss": 0.5581, + "num_tokens": 308372956.0, + "step": 3273 + }, + { + "epoch": 0.5587984297661717, + "grad_norm": 0.47066521941545914, + "learning_rate": 1.765488991295443e-05, + "loss": 0.5028, + "num_tokens": 308453721.0, + "step": 3274 + }, + { + "epoch": 0.5589691073562041, + "grad_norm": 0.5346035154466612, + "learning_rate": 1.764806280935313e-05, + "loss": 0.6597, + "num_tokens": 308541153.0, + "step": 3275 + }, + { + "epoch": 0.5591397849462365, + "grad_norm": 0.44466765941216807, + "learning_rate": 1.7641235705751835e-05, + "loss": 0.6089, + "num_tokens": 308667945.0, + "step": 3276 + }, + { + "epoch": 0.559310462536269, + "grad_norm": 0.48343368120798147, + "learning_rate": 1.763440860215054e-05, + "loss": 0.5874, + "num_tokens": 308763287.0, + "step": 3277 + }, + { + "epoch": 0.5594811401263015, + "grad_norm": 0.5252910576541392, + "learning_rate": 1.7627581498549242e-05, + "loss": 0.6137, + "num_tokens": 308847429.0, + "step": 3278 + }, + { + "epoch": 0.5596518177163339, + "grad_norm": 0.48154936182873703, + "learning_rate": 1.7620754394947946e-05, + "loss": 0.6169, + "num_tokens": 308958300.0, + "step": 3279 + }, + { + "epoch": 0.5598224953063663, + "grad_norm": 0.4645424219602783, + "learning_rate": 1.761392729134665e-05, + "loss": 0.5806, + "num_tokens": 309059530.0, + "step": 3280 + }, + { + "epoch": 0.5599931728963987, + "grad_norm": 0.491364920286677, + "learning_rate": 1.760710018774535e-05, + "loss": 0.6145, + "num_tokens": 309153971.0, + "step": 3281 + }, + { + "epoch": 0.5601638504864311, + "grad_norm": 0.48665003552311026, + "learning_rate": 1.7600273084144054e-05, + "loss": 0.6084, + "num_tokens": 309250903.0, + "step": 3282 + }, + { + "epoch": 0.5603345280764636, + "grad_norm": 0.44883052975272497, + "learning_rate": 1.7593445980542754e-05, + "loss": 0.5212, + "num_tokens": 309365865.0, + "step": 3283 + }, + { + "epoch": 0.560505205666496, + "grad_norm": 0.4651044514767892, + "learning_rate": 1.7586618876941458e-05, + "loss": 0.6204, + "num_tokens": 309476572.0, + "step": 3284 + }, + { + "epoch": 0.5606758832565284, + "grad_norm": 0.5058720650060079, + "learning_rate": 1.7579791773340162e-05, + "loss": 0.6121, + "num_tokens": 309559113.0, + "step": 3285 + }, + { + "epoch": 0.5608465608465608, + "grad_norm": 0.4926489813992244, + "learning_rate": 1.7572964669738866e-05, + "loss": 0.5965, + "num_tokens": 309644246.0, + "step": 3286 + }, + { + "epoch": 0.5610172384365932, + "grad_norm": 0.5375769351508352, + "learning_rate": 1.7566137566137566e-05, + "loss": 0.6239, + "num_tokens": 309725665.0, + "step": 3287 + }, + { + "epoch": 0.5611879160266257, + "grad_norm": 0.45926084705654124, + "learning_rate": 1.755931046253627e-05, + "loss": 0.639, + "num_tokens": 309846550.0, + "step": 3288 + }, + { + "epoch": 0.5613585936166582, + "grad_norm": 0.5549364284843736, + "learning_rate": 1.7552483358934974e-05, + "loss": 0.5462, + "num_tokens": 309910070.0, + "step": 3289 + }, + { + "epoch": 0.5615292712066906, + "grad_norm": 0.5068280246466197, + "learning_rate": 1.7545656255333677e-05, + "loss": 0.6269, + "num_tokens": 310002689.0, + "step": 3290 + }, + { + "epoch": 0.561699948796723, + "grad_norm": 0.4409216472899892, + "learning_rate": 1.7538829151732378e-05, + "loss": 0.6414, + "num_tokens": 310122038.0, + "step": 3291 + }, + { + "epoch": 0.5618706263867554, + "grad_norm": 0.47063104687880447, + "learning_rate": 1.753200204813108e-05, + "loss": 0.5554, + "num_tokens": 310215537.0, + "step": 3292 + }, + { + "epoch": 0.5620413039767879, + "grad_norm": 0.414441529654539, + "learning_rate": 1.7525174944529782e-05, + "loss": 0.5173, + "num_tokens": 310336156.0, + "step": 3293 + }, + { + "epoch": 0.5622119815668203, + "grad_norm": 0.5081478153481397, + "learning_rate": 1.7518347840928486e-05, + "loss": 0.5493, + "num_tokens": 310421019.0, + "step": 3294 + }, + { + "epoch": 0.5623826591568527, + "grad_norm": 0.5151175075417055, + "learning_rate": 1.751152073732719e-05, + "loss": 0.5262, + "num_tokens": 310496166.0, + "step": 3295 + }, + { + "epoch": 0.5625533367468851, + "grad_norm": 0.5036068405546553, + "learning_rate": 1.7504693633725893e-05, + "loss": 0.618, + "num_tokens": 310598057.0, + "step": 3296 + }, + { + "epoch": 0.5627240143369175, + "grad_norm": 0.49653064779619194, + "learning_rate": 1.7497866530124597e-05, + "loss": 0.6215, + "num_tokens": 310697502.0, + "step": 3297 + }, + { + "epoch": 0.56289469192695, + "grad_norm": 0.4721057118463774, + "learning_rate": 1.74910394265233e-05, + "loss": 0.5991, + "num_tokens": 310803866.0, + "step": 3298 + }, + { + "epoch": 0.5630653695169824, + "grad_norm": 0.48490020173843534, + "learning_rate": 1.7484212322922005e-05, + "loss": 0.5705, + "num_tokens": 310891989.0, + "step": 3299 + }, + { + "epoch": 0.5632360471070148, + "grad_norm": 0.45234984273568635, + "learning_rate": 1.7477385219320705e-05, + "loss": 0.5557, + "num_tokens": 310996686.0, + "step": 3300 + }, + { + "epoch": 0.5634067246970472, + "grad_norm": 0.47512806095817806, + "learning_rate": 1.747055811571941e-05, + "loss": 0.7044, + "num_tokens": 311120945.0, + "step": 3301 + }, + { + "epoch": 0.5635774022870798, + "grad_norm": 0.5461026318725828, + "learning_rate": 1.746373101211811e-05, + "loss": 0.5392, + "num_tokens": 311187782.0, + "step": 3302 + }, + { + "epoch": 0.5637480798771122, + "grad_norm": 0.47704580210513703, + "learning_rate": 1.7456903908516813e-05, + "loss": 0.6069, + "num_tokens": 311277892.0, + "step": 3303 + }, + { + "epoch": 0.5639187574671446, + "grad_norm": 0.5048100465013765, + "learning_rate": 1.7450076804915517e-05, + "loss": 0.6023, + "num_tokens": 311361219.0, + "step": 3304 + }, + { + "epoch": 0.564089435057177, + "grad_norm": 0.5189239340111449, + "learning_rate": 1.744324970131422e-05, + "loss": 0.57, + "num_tokens": 311437595.0, + "step": 3305 + }, + { + "epoch": 0.5642601126472094, + "grad_norm": 0.4769703009731969, + "learning_rate": 1.743642259771292e-05, + "loss": 0.6548, + "num_tokens": 311561149.0, + "step": 3306 + }, + { + "epoch": 0.5644307902372419, + "grad_norm": 0.45474484344930943, + "learning_rate": 1.7429595494111624e-05, + "loss": 0.5948, + "num_tokens": 311676778.0, + "step": 3307 + }, + { + "epoch": 0.5646014678272743, + "grad_norm": 0.5000657917008792, + "learning_rate": 1.7422768390510328e-05, + "loss": 0.6407, + "num_tokens": 311770304.0, + "step": 3308 + }, + { + "epoch": 0.5647721454173067, + "grad_norm": 0.4600489107831235, + "learning_rate": 1.7415941286909032e-05, + "loss": 0.6723, + "num_tokens": 311889592.0, + "step": 3309 + }, + { + "epoch": 0.5649428230073391, + "grad_norm": 0.46654437464728876, + "learning_rate": 1.7409114183307732e-05, + "loss": 0.493, + "num_tokens": 311976365.0, + "step": 3310 + }, + { + "epoch": 0.5651135005973715, + "grad_norm": 0.5246576992050234, + "learning_rate": 1.7402287079706436e-05, + "loss": 0.595, + "num_tokens": 312062174.0, + "step": 3311 + }, + { + "epoch": 0.565284178187404, + "grad_norm": 0.5128550382420654, + "learning_rate": 1.7395459976105136e-05, + "loss": 0.5754, + "num_tokens": 312155023.0, + "step": 3312 + }, + { + "epoch": 0.5654548557774364, + "grad_norm": 0.48222214995864376, + "learning_rate": 1.738863287250384e-05, + "loss": 0.6161, + "num_tokens": 312252598.0, + "step": 3313 + }, + { + "epoch": 0.5656255333674689, + "grad_norm": 0.5103327983383292, + "learning_rate": 1.7381805768902544e-05, + "loss": 0.5536, + "num_tokens": 312335423.0, + "step": 3314 + }, + { + "epoch": 0.5657962109575013, + "grad_norm": 0.5437608050596344, + "learning_rate": 1.7374978665301248e-05, + "loss": 0.5488, + "num_tokens": 312403802.0, + "step": 3315 + }, + { + "epoch": 0.5659668885475337, + "grad_norm": 0.533864717457484, + "learning_rate": 1.736815156169995e-05, + "loss": 0.585, + "num_tokens": 312478822.0, + "step": 3316 + }, + { + "epoch": 0.5661375661375662, + "grad_norm": 0.44791649322539234, + "learning_rate": 1.7361324458098655e-05, + "loss": 0.5513, + "num_tokens": 312575105.0, + "step": 3317 + }, + { + "epoch": 0.5663082437275986, + "grad_norm": 0.5246317889557266, + "learning_rate": 1.7354497354497356e-05, + "loss": 0.6685, + "num_tokens": 312661242.0, + "step": 3318 + }, + { + "epoch": 0.566478921317631, + "grad_norm": 0.5177839841028439, + "learning_rate": 1.734767025089606e-05, + "loss": 0.5531, + "num_tokens": 312741927.0, + "step": 3319 + }, + { + "epoch": 0.5666495989076634, + "grad_norm": 0.5585216646486987, + "learning_rate": 1.734084314729476e-05, + "loss": 0.5543, + "num_tokens": 312806053.0, + "step": 3320 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.47028421604634163, + "learning_rate": 1.7334016043693464e-05, + "loss": 0.731, + "num_tokens": 312942352.0, + "step": 3321 + }, + { + "epoch": 0.5669909540877283, + "grad_norm": 0.6252852715412945, + "learning_rate": 1.7327188940092167e-05, + "loss": 0.6486, + "num_tokens": 313021684.0, + "step": 3322 + }, + { + "epoch": 0.5671616316777607, + "grad_norm": 0.4923556243855555, + "learning_rate": 1.732036183649087e-05, + "loss": 0.6038, + "num_tokens": 313116241.0, + "step": 3323 + }, + { + "epoch": 0.5673323092677931, + "grad_norm": 0.5399428176979846, + "learning_rate": 1.731353473288957e-05, + "loss": 0.671, + "num_tokens": 313196528.0, + "step": 3324 + }, + { + "epoch": 0.5675029868578255, + "grad_norm": 0.524253850592435, + "learning_rate": 1.7306707629288275e-05, + "loss": 0.5374, + "num_tokens": 313268453.0, + "step": 3325 + }, + { + "epoch": 0.567673664447858, + "grad_norm": 0.4353046469033207, + "learning_rate": 1.729988052568698e-05, + "loss": 0.5132, + "num_tokens": 313375265.0, + "step": 3326 + }, + { + "epoch": 0.5678443420378905, + "grad_norm": 0.4754664112088372, + "learning_rate": 1.7293053422085683e-05, + "loss": 0.5675, + "num_tokens": 313476516.0, + "step": 3327 + }, + { + "epoch": 0.5680150196279229, + "grad_norm": 0.47551302888807684, + "learning_rate": 1.7286226318484383e-05, + "loss": 0.5539, + "num_tokens": 313571078.0, + "step": 3328 + }, + { + "epoch": 0.5681856972179553, + "grad_norm": 0.4730134134112909, + "learning_rate": 1.7279399214883087e-05, + "loss": 0.5652, + "num_tokens": 313660954.0, + "step": 3329 + }, + { + "epoch": 0.5683563748079877, + "grad_norm": 0.42879111081414684, + "learning_rate": 1.727257211128179e-05, + "loss": 0.6032, + "num_tokens": 313788191.0, + "step": 3330 + }, + { + "epoch": 0.5685270523980201, + "grad_norm": 0.43649882470101564, + "learning_rate": 1.726574500768049e-05, + "loss": 0.5289, + "num_tokens": 313892184.0, + "step": 3331 + }, + { + "epoch": 0.5686977299880526, + "grad_norm": 0.4490969604872597, + "learning_rate": 1.7258917904079195e-05, + "loss": 0.5928, + "num_tokens": 314015912.0, + "step": 3332 + }, + { + "epoch": 0.568868407578085, + "grad_norm": 0.5081398929910466, + "learning_rate": 1.72520908004779e-05, + "loss": 0.5968, + "num_tokens": 314104624.0, + "step": 3333 + }, + { + "epoch": 0.5690390851681174, + "grad_norm": 0.5471754397727308, + "learning_rate": 1.7245263696876602e-05, + "loss": 0.6159, + "num_tokens": 314179968.0, + "step": 3334 + }, + { + "epoch": 0.5692097627581498, + "grad_norm": 0.4362566038774037, + "learning_rate": 1.7238436593275306e-05, + "loss": 0.5948, + "num_tokens": 314307149.0, + "step": 3335 + }, + { + "epoch": 0.5693804403481822, + "grad_norm": 0.509347342332704, + "learning_rate": 1.723160948967401e-05, + "loss": 0.6716, + "num_tokens": 314410779.0, + "step": 3336 + }, + { + "epoch": 0.5695511179382147, + "grad_norm": 0.4535973605151748, + "learning_rate": 1.722478238607271e-05, + "loss": 0.5139, + "num_tokens": 314511886.0, + "step": 3337 + }, + { + "epoch": 0.5697217955282471, + "grad_norm": 0.5260413596276682, + "learning_rate": 1.7217955282471414e-05, + "loss": 0.5629, + "num_tokens": 314595008.0, + "step": 3338 + }, + { + "epoch": 0.5698924731182796, + "grad_norm": 0.47918130690110916, + "learning_rate": 1.7211128178870114e-05, + "loss": 0.5383, + "num_tokens": 314678706.0, + "step": 3339 + }, + { + "epoch": 0.570063150708312, + "grad_norm": 0.5640412365333978, + "learning_rate": 1.7204301075268818e-05, + "loss": 0.5428, + "num_tokens": 314764439.0, + "step": 3340 + }, + { + "epoch": 0.5702338282983445, + "grad_norm": 0.5529933749582262, + "learning_rate": 1.7197473971667522e-05, + "loss": 0.6412, + "num_tokens": 314840691.0, + "step": 3341 + }, + { + "epoch": 0.5704045058883769, + "grad_norm": 0.5186054895910153, + "learning_rate": 1.7190646868066226e-05, + "loss": 0.5008, + "num_tokens": 314905972.0, + "step": 3342 + }, + { + "epoch": 0.5705751834784093, + "grad_norm": 0.5069785134096528, + "learning_rate": 1.7183819764464926e-05, + "loss": 0.5794, + "num_tokens": 314986142.0, + "step": 3343 + }, + { + "epoch": 0.5707458610684417, + "grad_norm": 0.51221383199397, + "learning_rate": 1.717699266086363e-05, + "loss": 0.5767, + "num_tokens": 315063894.0, + "step": 3344 + }, + { + "epoch": 0.5709165386584741, + "grad_norm": 0.4888638220785249, + "learning_rate": 1.7170165557262334e-05, + "loss": 0.6811, + "num_tokens": 315170636.0, + "step": 3345 + }, + { + "epoch": 0.5710872162485066, + "grad_norm": 0.4584002387486498, + "learning_rate": 1.7163338453661037e-05, + "loss": 0.6003, + "num_tokens": 315293863.0, + "step": 3346 + }, + { + "epoch": 0.571257893838539, + "grad_norm": 0.4642472111313573, + "learning_rate": 1.7156511350059738e-05, + "loss": 0.5873, + "num_tokens": 315389645.0, + "step": 3347 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.42611479044066325, + "learning_rate": 1.714968424645844e-05, + "loss": 0.5549, + "num_tokens": 315505170.0, + "step": 3348 + }, + { + "epoch": 0.5715992490186038, + "grad_norm": 0.48042256383374965, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.5723, + "num_tokens": 315596433.0, + "step": 3349 + }, + { + "epoch": 0.5717699266086362, + "grad_norm": 0.4705121959148618, + "learning_rate": 1.7136030039255846e-05, + "loss": 0.6016, + "num_tokens": 315699274.0, + "step": 3350 + }, + { + "epoch": 0.5719406041986688, + "grad_norm": 0.5088060286836834, + "learning_rate": 1.712920293565455e-05, + "loss": 0.6943, + "num_tokens": 315801714.0, + "step": 3351 + }, + { + "epoch": 0.5721112817887012, + "grad_norm": 0.5230649800713418, + "learning_rate": 1.7122375832053253e-05, + "loss": 0.676, + "num_tokens": 315889626.0, + "step": 3352 + }, + { + "epoch": 0.5722819593787336, + "grad_norm": 0.4635005858964634, + "learning_rate": 1.7115548728451957e-05, + "loss": 0.561, + "num_tokens": 315985559.0, + "step": 3353 + }, + { + "epoch": 0.572452636968766, + "grad_norm": 0.4841387927554741, + "learning_rate": 1.710872162485066e-05, + "loss": 0.5897, + "num_tokens": 316081438.0, + "step": 3354 + }, + { + "epoch": 0.5726233145587984, + "grad_norm": 0.6165731313023756, + "learning_rate": 1.710189452124936e-05, + "loss": 0.5351, + "num_tokens": 316149352.0, + "step": 3355 + }, + { + "epoch": 0.5727939921488309, + "grad_norm": 0.4431700143413621, + "learning_rate": 1.7095067417648065e-05, + "loss": 0.617, + "num_tokens": 316277260.0, + "step": 3356 + }, + { + "epoch": 0.5729646697388633, + "grad_norm": 0.4712993326962704, + "learning_rate": 1.7088240314046765e-05, + "loss": 0.5621, + "num_tokens": 316374626.0, + "step": 3357 + }, + { + "epoch": 0.5731353473288957, + "grad_norm": 0.4739631949659726, + "learning_rate": 1.708141321044547e-05, + "loss": 0.5851, + "num_tokens": 316474566.0, + "step": 3358 + }, + { + "epoch": 0.5733060249189281, + "grad_norm": 0.49056661930781337, + "learning_rate": 1.7074586106844173e-05, + "loss": 0.58, + "num_tokens": 316561447.0, + "step": 3359 + }, + { + "epoch": 0.5734767025089605, + "grad_norm": 0.547252617210736, + "learning_rate": 1.7067759003242876e-05, + "loss": 0.5989, + "num_tokens": 316632345.0, + "step": 3360 + }, + { + "epoch": 0.573647380098993, + "grad_norm": 0.4719195382780712, + "learning_rate": 1.7060931899641577e-05, + "loss": 0.5484, + "num_tokens": 316729205.0, + "step": 3361 + }, + { + "epoch": 0.5738180576890254, + "grad_norm": 0.5192758321686136, + "learning_rate": 1.705410479604028e-05, + "loss": 0.6434, + "num_tokens": 316811179.0, + "step": 3362 + }, + { + "epoch": 0.5739887352790578, + "grad_norm": 0.4364843825816534, + "learning_rate": 1.7047277692438984e-05, + "loss": 0.5282, + "num_tokens": 316916459.0, + "step": 3363 + }, + { + "epoch": 0.5741594128690903, + "grad_norm": 0.4781421180716582, + "learning_rate": 1.7040450588837688e-05, + "loss": 0.5172, + "num_tokens": 317000261.0, + "step": 3364 + }, + { + "epoch": 0.5743300904591228, + "grad_norm": 0.45568561940221425, + "learning_rate": 1.7033623485236392e-05, + "loss": 0.5677, + "num_tokens": 317101738.0, + "step": 3365 + }, + { + "epoch": 0.5745007680491552, + "grad_norm": 0.480393493024414, + "learning_rate": 1.7026796381635092e-05, + "loss": 0.5498, + "num_tokens": 317194884.0, + "step": 3366 + }, + { + "epoch": 0.5746714456391876, + "grad_norm": 0.5169768241879293, + "learning_rate": 1.7019969278033796e-05, + "loss": 0.5571, + "num_tokens": 317274231.0, + "step": 3367 + }, + { + "epoch": 0.57484212322922, + "grad_norm": 0.49364872326471004, + "learning_rate": 1.7013142174432496e-05, + "loss": 0.562, + "num_tokens": 317353009.0, + "step": 3368 + }, + { + "epoch": 0.5750128008192524, + "grad_norm": 0.5020528857820099, + "learning_rate": 1.70063150708312e-05, + "loss": 0.6303, + "num_tokens": 317447358.0, + "step": 3369 + }, + { + "epoch": 0.5751834784092849, + "grad_norm": 0.48347783466502814, + "learning_rate": 1.6999487967229904e-05, + "loss": 0.5552, + "num_tokens": 317538812.0, + "step": 3370 + }, + { + "epoch": 0.5753541559993173, + "grad_norm": 0.5024648160596316, + "learning_rate": 1.6992660863628608e-05, + "loss": 0.5793, + "num_tokens": 317626500.0, + "step": 3371 + }, + { + "epoch": 0.5755248335893497, + "grad_norm": 0.5362371262400161, + "learning_rate": 1.698583376002731e-05, + "loss": 0.6038, + "num_tokens": 317706960.0, + "step": 3372 + }, + { + "epoch": 0.5756955111793821, + "grad_norm": 0.43391250784740065, + "learning_rate": 1.6979006656426015e-05, + "loss": 0.5421, + "num_tokens": 317810467.0, + "step": 3373 + }, + { + "epoch": 0.5758661887694145, + "grad_norm": 0.47675995754529843, + "learning_rate": 1.6972179552824716e-05, + "loss": 0.5922, + "num_tokens": 317907758.0, + "step": 3374 + }, + { + "epoch": 0.576036866359447, + "grad_norm": 0.5022474620928103, + "learning_rate": 1.696535244922342e-05, + "loss": 0.6584, + "num_tokens": 318004398.0, + "step": 3375 + }, + { + "epoch": 0.5762075439494795, + "grad_norm": 0.4477429113790345, + "learning_rate": 1.695852534562212e-05, + "loss": 0.5855, + "num_tokens": 318114251.0, + "step": 3376 + }, + { + "epoch": 0.5763782215395119, + "grad_norm": 0.48127642391089953, + "learning_rate": 1.6951698242020823e-05, + "loss": 0.6467, + "num_tokens": 318211029.0, + "step": 3377 + }, + { + "epoch": 0.5765488991295443, + "grad_norm": 0.4821702938382031, + "learning_rate": 1.6944871138419527e-05, + "loss": 0.6376, + "num_tokens": 318318309.0, + "step": 3378 + }, + { + "epoch": 0.5767195767195767, + "grad_norm": 0.44235193879225015, + "learning_rate": 1.693804403481823e-05, + "loss": 0.5271, + "num_tokens": 318417759.0, + "step": 3379 + }, + { + "epoch": 0.5768902543096092, + "grad_norm": 0.5169100775006121, + "learning_rate": 1.693121693121693e-05, + "loss": 0.5878, + "num_tokens": 318498972.0, + "step": 3380 + }, + { + "epoch": 0.5770609318996416, + "grad_norm": 0.5174659272685124, + "learning_rate": 1.6924389827615635e-05, + "loss": 0.6089, + "num_tokens": 318579422.0, + "step": 3381 + }, + { + "epoch": 0.577231609489674, + "grad_norm": 0.5248974556096488, + "learning_rate": 1.691756272401434e-05, + "loss": 0.5333, + "num_tokens": 318666205.0, + "step": 3382 + }, + { + "epoch": 0.5774022870797064, + "grad_norm": 0.446110804800162, + "learning_rate": 1.6910735620413043e-05, + "loss": 0.5003, + "num_tokens": 318760235.0, + "step": 3383 + }, + { + "epoch": 0.5775729646697388, + "grad_norm": 0.5245651352758642, + "learning_rate": 1.6903908516811743e-05, + "loss": 0.6187, + "num_tokens": 318842738.0, + "step": 3384 + }, + { + "epoch": 0.5777436422597713, + "grad_norm": 0.4673888904571171, + "learning_rate": 1.6897081413210447e-05, + "loss": 0.5896, + "num_tokens": 318945050.0, + "step": 3385 + }, + { + "epoch": 0.5779143198498037, + "grad_norm": 0.5369846244791284, + "learning_rate": 1.6890254309609147e-05, + "loss": 0.661, + "num_tokens": 319033597.0, + "step": 3386 + }, + { + "epoch": 0.5780849974398361, + "grad_norm": 0.49766684146399287, + "learning_rate": 1.688342720600785e-05, + "loss": 0.6465, + "num_tokens": 319129380.0, + "step": 3387 + }, + { + "epoch": 0.5782556750298686, + "grad_norm": 0.4975961151790149, + "learning_rate": 1.6876600102406555e-05, + "loss": 0.5302, + "num_tokens": 319203591.0, + "step": 3388 + }, + { + "epoch": 0.578426352619901, + "grad_norm": 2.0160519694258014, + "learning_rate": 1.686977299880526e-05, + "loss": 0.6782, + "num_tokens": 319312177.0, + "step": 3389 + }, + { + "epoch": 0.5785970302099335, + "grad_norm": 0.4938330690464966, + "learning_rate": 1.6862945895203962e-05, + "loss": 0.6279, + "num_tokens": 319401740.0, + "step": 3390 + }, + { + "epoch": 0.5787677077999659, + "grad_norm": 0.5621636643947034, + "learning_rate": 1.6856118791602666e-05, + "loss": 0.6063, + "num_tokens": 319492618.0, + "step": 3391 + }, + { + "epoch": 0.5789383853899983, + "grad_norm": 0.501252848975728, + "learning_rate": 1.6849291688001366e-05, + "loss": 0.6815, + "num_tokens": 319592787.0, + "step": 3392 + }, + { + "epoch": 0.5791090629800307, + "grad_norm": 0.4714149380259448, + "learning_rate": 1.684246458440007e-05, + "loss": 0.5801, + "num_tokens": 319682183.0, + "step": 3393 + }, + { + "epoch": 0.5792797405700632, + "grad_norm": 0.48810039371887937, + "learning_rate": 1.683563748079877e-05, + "loss": 0.6165, + "num_tokens": 319773046.0, + "step": 3394 + }, + { + "epoch": 0.5794504181600956, + "grad_norm": 0.5123057491727071, + "learning_rate": 1.6828810377197474e-05, + "loss": 0.637, + "num_tokens": 319861148.0, + "step": 3395 + }, + { + "epoch": 0.579621095750128, + "grad_norm": 0.5065731024667902, + "learning_rate": 1.6821983273596178e-05, + "loss": 0.5179, + "num_tokens": 319928854.0, + "step": 3396 + }, + { + "epoch": 0.5797917733401604, + "grad_norm": 0.4784781144423495, + "learning_rate": 1.6815156169994882e-05, + "loss": 0.6186, + "num_tokens": 320028031.0, + "step": 3397 + }, + { + "epoch": 0.5799624509301928, + "grad_norm": 0.5845857359924372, + "learning_rate": 1.6808329066393586e-05, + "loss": 0.4784, + "num_tokens": 320088039.0, + "step": 3398 + }, + { + "epoch": 0.5801331285202252, + "grad_norm": 0.4613729136728334, + "learning_rate": 1.6801501962792286e-05, + "loss": 0.5477, + "num_tokens": 320181891.0, + "step": 3399 + }, + { + "epoch": 0.5803038061102577, + "grad_norm": 0.5207832946692702, + "learning_rate": 1.679467485919099e-05, + "loss": 0.5746, + "num_tokens": 320254737.0, + "step": 3400 + }, + { + "epoch": 0.5804744837002902, + "grad_norm": 0.5186217913825224, + "learning_rate": 1.6787847755589693e-05, + "loss": 0.4771, + "num_tokens": 320327074.0, + "step": 3401 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.4984518346444063, + "learning_rate": 1.6781020651988397e-05, + "loss": 0.6004, + "num_tokens": 320407614.0, + "step": 3402 + }, + { + "epoch": 0.580815838880355, + "grad_norm": 0.45282959761243674, + "learning_rate": 1.6774193548387098e-05, + "loss": 0.5178, + "num_tokens": 320498586.0, + "step": 3403 + }, + { + "epoch": 0.5809865164703875, + "grad_norm": 0.4514384082547995, + "learning_rate": 1.67673664447858e-05, + "loss": 0.5511, + "num_tokens": 320608409.0, + "step": 3404 + }, + { + "epoch": 0.5811571940604199, + "grad_norm": 0.5100196565298788, + "learning_rate": 1.6760539341184502e-05, + "loss": 0.5652, + "num_tokens": 320690206.0, + "step": 3405 + }, + { + "epoch": 0.5813278716504523, + "grad_norm": 0.5063598064805167, + "learning_rate": 1.6753712237583206e-05, + "loss": 0.5257, + "num_tokens": 320770437.0, + "step": 3406 + }, + { + "epoch": 0.5814985492404847, + "grad_norm": 0.5251953606112958, + "learning_rate": 1.674688513398191e-05, + "loss": 0.5117, + "num_tokens": 320846835.0, + "step": 3407 + }, + { + "epoch": 0.5816692268305171, + "grad_norm": 0.539291200314105, + "learning_rate": 1.6740058030380613e-05, + "loss": 0.6663, + "num_tokens": 320929595.0, + "step": 3408 + }, + { + "epoch": 0.5818399044205496, + "grad_norm": 0.5151923309641129, + "learning_rate": 1.6733230926779317e-05, + "loss": 0.5059, + "num_tokens": 320998454.0, + "step": 3409 + }, + { + "epoch": 0.582010582010582, + "grad_norm": 0.4926914025348591, + "learning_rate": 1.672640382317802e-05, + "loss": 0.6207, + "num_tokens": 321085049.0, + "step": 3410 + }, + { + "epoch": 0.5821812596006144, + "grad_norm": 0.44443432165146446, + "learning_rate": 1.671957671957672e-05, + "loss": 0.6027, + "num_tokens": 321202767.0, + "step": 3411 + }, + { + "epoch": 0.5823519371906468, + "grad_norm": 0.5062688247618641, + "learning_rate": 1.6712749615975425e-05, + "loss": 0.6641, + "num_tokens": 321302548.0, + "step": 3412 + }, + { + "epoch": 0.5825226147806793, + "grad_norm": 0.4563485148005084, + "learning_rate": 1.6705922512374125e-05, + "loss": 0.5794, + "num_tokens": 321408481.0, + "step": 3413 + }, + { + "epoch": 0.5826932923707118, + "grad_norm": 0.5225899332240858, + "learning_rate": 1.669909540877283e-05, + "loss": 0.6255, + "num_tokens": 321493497.0, + "step": 3414 + }, + { + "epoch": 0.5828639699607442, + "grad_norm": 0.5070986731535245, + "learning_rate": 1.6692268305171533e-05, + "loss": 0.5849, + "num_tokens": 321583292.0, + "step": 3415 + }, + { + "epoch": 0.5830346475507766, + "grad_norm": 0.5769005656588899, + "learning_rate": 1.6685441201570236e-05, + "loss": 0.7271, + "num_tokens": 321658333.0, + "step": 3416 + }, + { + "epoch": 0.583205325140809, + "grad_norm": 0.4650899537162327, + "learning_rate": 1.6678614097968937e-05, + "loss": 0.5953, + "num_tokens": 321763643.0, + "step": 3417 + }, + { + "epoch": 0.5833760027308414, + "grad_norm": 0.5003784372431751, + "learning_rate": 1.667178699436764e-05, + "loss": 0.6099, + "num_tokens": 321856084.0, + "step": 3418 + }, + { + "epoch": 0.5835466803208739, + "grad_norm": 0.6941578168595592, + "learning_rate": 1.6664959890766344e-05, + "loss": 0.6836, + "num_tokens": 321935023.0, + "step": 3419 + }, + { + "epoch": 0.5837173579109063, + "grad_norm": 0.462340659045164, + "learning_rate": 1.6658132787165048e-05, + "loss": 0.5839, + "num_tokens": 322034661.0, + "step": 3420 + }, + { + "epoch": 0.5838880355009387, + "grad_norm": 0.4770013136665141, + "learning_rate": 1.665130568356375e-05, + "loss": 0.5751, + "num_tokens": 322140680.0, + "step": 3421 + }, + { + "epoch": 0.5840587130909711, + "grad_norm": 0.45840028420734075, + "learning_rate": 1.6644478579962452e-05, + "loss": 0.6443, + "num_tokens": 322249435.0, + "step": 3422 + }, + { + "epoch": 0.5842293906810035, + "grad_norm": 0.44390763152656665, + "learning_rate": 1.6637651476361153e-05, + "loss": 0.5211, + "num_tokens": 322355746.0, + "step": 3423 + }, + { + "epoch": 0.584400068271036, + "grad_norm": 0.4783874905055336, + "learning_rate": 1.6630824372759856e-05, + "loss": 0.6059, + "num_tokens": 322452987.0, + "step": 3424 + }, + { + "epoch": 0.5845707458610685, + "grad_norm": 0.7490223457084462, + "learning_rate": 1.662399726915856e-05, + "loss": 0.5941, + "num_tokens": 322552204.0, + "step": 3425 + }, + { + "epoch": 0.5847414234511009, + "grad_norm": 0.44919708444501066, + "learning_rate": 1.6617170165557264e-05, + "loss": 0.7078, + "num_tokens": 322690801.0, + "step": 3426 + }, + { + "epoch": 0.5849121010411333, + "grad_norm": 0.470743677896631, + "learning_rate": 1.6610343061955968e-05, + "loss": 0.6059, + "num_tokens": 322791294.0, + "step": 3427 + }, + { + "epoch": 0.5850827786311658, + "grad_norm": 0.5172693049275585, + "learning_rate": 1.660351595835467e-05, + "loss": 0.6096, + "num_tokens": 322871595.0, + "step": 3428 + }, + { + "epoch": 0.5852534562211982, + "grad_norm": 0.46113037665088324, + "learning_rate": 1.6596688854753372e-05, + "loss": 0.5223, + "num_tokens": 322960633.0, + "step": 3429 + }, + { + "epoch": 0.5854241338112306, + "grad_norm": 0.588500438022913, + "learning_rate": 1.6589861751152075e-05, + "loss": 0.6459, + "num_tokens": 323045837.0, + "step": 3430 + }, + { + "epoch": 0.585594811401263, + "grad_norm": 0.4769898101821736, + "learning_rate": 1.6583034647550776e-05, + "loss": 0.5695, + "num_tokens": 323140385.0, + "step": 3431 + }, + { + "epoch": 0.5857654889912954, + "grad_norm": 0.42093291299997915, + "learning_rate": 1.657620754394948e-05, + "loss": 0.5549, + "num_tokens": 323248644.0, + "step": 3432 + }, + { + "epoch": 0.5859361665813279, + "grad_norm": 0.5123637575469956, + "learning_rate": 1.6569380440348183e-05, + "loss": 0.5827, + "num_tokens": 323328879.0, + "step": 3433 + }, + { + "epoch": 0.5861068441713603, + "grad_norm": 0.47854014614612067, + "learning_rate": 1.6562553336746887e-05, + "loss": 0.6199, + "num_tokens": 323432197.0, + "step": 3434 + }, + { + "epoch": 0.5862775217613927, + "grad_norm": 0.515272611651013, + "learning_rate": 1.655572623314559e-05, + "loss": 0.5453, + "num_tokens": 323514814.0, + "step": 3435 + }, + { + "epoch": 0.5864481993514251, + "grad_norm": 0.49263240467192193, + "learning_rate": 1.654889912954429e-05, + "loss": 0.5177, + "num_tokens": 323593158.0, + "step": 3436 + }, + { + "epoch": 0.5866188769414575, + "grad_norm": 0.5181548944993186, + "learning_rate": 1.6542072025942995e-05, + "loss": 0.5929, + "num_tokens": 323665981.0, + "step": 3437 + }, + { + "epoch": 0.5867895545314901, + "grad_norm": 0.5107243707506156, + "learning_rate": 1.65352449223417e-05, + "loss": 0.4775, + "num_tokens": 323739396.0, + "step": 3438 + }, + { + "epoch": 0.5869602321215225, + "grad_norm": 0.5215371731113385, + "learning_rate": 1.6528417818740403e-05, + "loss": 0.6111, + "num_tokens": 323825209.0, + "step": 3439 + }, + { + "epoch": 0.5871309097115549, + "grad_norm": 0.4875260370869305, + "learning_rate": 1.6521590715139103e-05, + "loss": 0.6784, + "num_tokens": 323930517.0, + "step": 3440 + }, + { + "epoch": 0.5873015873015873, + "grad_norm": 0.46072543772918223, + "learning_rate": 1.6514763611537807e-05, + "loss": 0.6258, + "num_tokens": 324047082.0, + "step": 3441 + }, + { + "epoch": 0.5874722648916197, + "grad_norm": 0.47503874849886124, + "learning_rate": 1.6507936507936507e-05, + "loss": 0.5885, + "num_tokens": 324143441.0, + "step": 3442 + }, + { + "epoch": 0.5876429424816522, + "grad_norm": 0.4946061522468672, + "learning_rate": 1.650110940433521e-05, + "loss": 0.6077, + "num_tokens": 324235127.0, + "step": 3443 + }, + { + "epoch": 0.5878136200716846, + "grad_norm": 0.5204089628531647, + "learning_rate": 1.6494282300733915e-05, + "loss": 0.5703, + "num_tokens": 324317138.0, + "step": 3444 + }, + { + "epoch": 0.587984297661717, + "grad_norm": 0.4749032719934435, + "learning_rate": 1.648745519713262e-05, + "loss": 0.6395, + "num_tokens": 324420156.0, + "step": 3445 + }, + { + "epoch": 0.5881549752517494, + "grad_norm": 0.4627421487943803, + "learning_rate": 1.6480628093531322e-05, + "loss": 0.6156, + "num_tokens": 324541241.0, + "step": 3446 + }, + { + "epoch": 0.5883256528417818, + "grad_norm": 0.5009049822801436, + "learning_rate": 1.6473800989930026e-05, + "loss": 0.5754, + "num_tokens": 324640168.0, + "step": 3447 + }, + { + "epoch": 0.5884963304318143, + "grad_norm": 0.478099548768233, + "learning_rate": 1.6466973886328726e-05, + "loss": 0.6544, + "num_tokens": 324752657.0, + "step": 3448 + }, + { + "epoch": 0.5886670080218467, + "grad_norm": 0.4634614994278779, + "learning_rate": 1.646014678272743e-05, + "loss": 0.5427, + "num_tokens": 324847347.0, + "step": 3449 + }, + { + "epoch": 0.5888376856118792, + "grad_norm": 0.46070321839494754, + "learning_rate": 1.645331967912613e-05, + "loss": 0.57, + "num_tokens": 324946610.0, + "step": 3450 + }, + { + "epoch": 0.5890083632019116, + "grad_norm": 0.5231379243175022, + "learning_rate": 1.6446492575524834e-05, + "loss": 0.5411, + "num_tokens": 325023241.0, + "step": 3451 + }, + { + "epoch": 0.589179040791944, + "grad_norm": 0.43480062830402616, + "learning_rate": 1.6439665471923538e-05, + "loss": 0.61, + "num_tokens": 325138078.0, + "step": 3452 + }, + { + "epoch": 0.5893497183819765, + "grad_norm": 0.49422081951620705, + "learning_rate": 1.6432838368322242e-05, + "loss": 0.5664, + "num_tokens": 325220044.0, + "step": 3453 + }, + { + "epoch": 0.5895203959720089, + "grad_norm": 0.48258124512385187, + "learning_rate": 1.6426011264720942e-05, + "loss": 0.5962, + "num_tokens": 325314906.0, + "step": 3454 + }, + { + "epoch": 0.5896910735620413, + "grad_norm": 0.5145229911048186, + "learning_rate": 1.6419184161119646e-05, + "loss": 0.53, + "num_tokens": 325386693.0, + "step": 3455 + }, + { + "epoch": 0.5898617511520737, + "grad_norm": 0.48729252116828337, + "learning_rate": 1.641235705751835e-05, + "loss": 0.5791, + "num_tokens": 325484313.0, + "step": 3456 + }, + { + "epoch": 0.5900324287421062, + "grad_norm": 0.5037730755123196, + "learning_rate": 1.6405529953917053e-05, + "loss": 0.5767, + "num_tokens": 325558561.0, + "step": 3457 + }, + { + "epoch": 0.5902031063321386, + "grad_norm": 0.422431551370684, + "learning_rate": 1.6398702850315754e-05, + "loss": 0.6485, + "num_tokens": 325699214.0, + "step": 3458 + }, + { + "epoch": 0.590373783922171, + "grad_norm": 0.4679279275414914, + "learning_rate": 1.6391875746714457e-05, + "loss": 0.5909, + "num_tokens": 325795069.0, + "step": 3459 + }, + { + "epoch": 0.5905444615122034, + "grad_norm": 0.4691872704594372, + "learning_rate": 1.638504864311316e-05, + "loss": 0.5186, + "num_tokens": 325886181.0, + "step": 3460 + }, + { + "epoch": 0.5907151391022358, + "grad_norm": 0.46263030185825793, + "learning_rate": 1.637822153951186e-05, + "loss": 0.5903, + "num_tokens": 325988852.0, + "step": 3461 + }, + { + "epoch": 0.5908858166922683, + "grad_norm": 0.4161144559742367, + "learning_rate": 1.6371394435910565e-05, + "loss": 0.5229, + "num_tokens": 326097376.0, + "step": 3462 + }, + { + "epoch": 0.5910564942823008, + "grad_norm": 0.4440427479851766, + "learning_rate": 1.636456733230927e-05, + "loss": 0.5386, + "num_tokens": 326199750.0, + "step": 3463 + }, + { + "epoch": 0.5912271718723332, + "grad_norm": 0.48290536824263913, + "learning_rate": 1.6357740228707973e-05, + "loss": 0.5759, + "num_tokens": 326305771.0, + "step": 3464 + }, + { + "epoch": 0.5913978494623656, + "grad_norm": 0.4596765439069304, + "learning_rate": 1.6350913125106677e-05, + "loss": 0.4881, + "num_tokens": 326393093.0, + "step": 3465 + }, + { + "epoch": 0.591568527052398, + "grad_norm": 0.5348192412486799, + "learning_rate": 1.6344086021505377e-05, + "loss": 0.508, + "num_tokens": 326453418.0, + "step": 3466 + }, + { + "epoch": 0.5917392046424305, + "grad_norm": 0.4517653221110361, + "learning_rate": 1.633725891790408e-05, + "loss": 0.5361, + "num_tokens": 326548824.0, + "step": 3467 + }, + { + "epoch": 0.5919098822324629, + "grad_norm": 0.5209496808804358, + "learning_rate": 1.6330431814302785e-05, + "loss": 0.5144, + "num_tokens": 326625250.0, + "step": 3468 + }, + { + "epoch": 0.5920805598224953, + "grad_norm": 0.47025465322751436, + "learning_rate": 1.6323604710701485e-05, + "loss": 0.5512, + "num_tokens": 326726340.0, + "step": 3469 + }, + { + "epoch": 0.5922512374125277, + "grad_norm": 0.47061330754100467, + "learning_rate": 1.631677760710019e-05, + "loss": 0.6403, + "num_tokens": 326849618.0, + "step": 3470 + }, + { + "epoch": 0.5924219150025601, + "grad_norm": 0.4511645431403223, + "learning_rate": 1.6309950503498892e-05, + "loss": 0.521, + "num_tokens": 326951923.0, + "step": 3471 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.44792677288294896, + "learning_rate": 1.6303123399897596e-05, + "loss": 0.5256, + "num_tokens": 327053352.0, + "step": 3472 + }, + { + "epoch": 0.592763270182625, + "grad_norm": 0.4481250766516246, + "learning_rate": 1.6296296296296297e-05, + "loss": 0.5542, + "num_tokens": 327159320.0, + "step": 3473 + }, + { + "epoch": 0.5929339477726574, + "grad_norm": 0.4982954391133741, + "learning_rate": 1.6289469192695e-05, + "loss": 0.5682, + "num_tokens": 327242255.0, + "step": 3474 + }, + { + "epoch": 0.5931046253626899, + "grad_norm": 0.5619331447282091, + "learning_rate": 1.6282642089093704e-05, + "loss": 0.5164, + "num_tokens": 327300763.0, + "step": 3475 + }, + { + "epoch": 0.5932753029527223, + "grad_norm": 0.5505898344465563, + "learning_rate": 1.6275814985492408e-05, + "loss": 0.622, + "num_tokens": 327374608.0, + "step": 3476 + }, + { + "epoch": 0.5934459805427548, + "grad_norm": 0.4465007355509528, + "learning_rate": 1.6268987881891108e-05, + "loss": 0.5923, + "num_tokens": 327500652.0, + "step": 3477 + }, + { + "epoch": 0.5936166581327872, + "grad_norm": 0.4483297398820762, + "learning_rate": 1.6262160778289812e-05, + "loss": 0.5692, + "num_tokens": 327603369.0, + "step": 3478 + }, + { + "epoch": 0.5937873357228196, + "grad_norm": 0.5097390202052126, + "learning_rate": 1.6255333674688512e-05, + "loss": 0.6141, + "num_tokens": 327709011.0, + "step": 3479 + }, + { + "epoch": 0.593958013312852, + "grad_norm": 0.5230288550716223, + "learning_rate": 1.6248506571087216e-05, + "loss": 0.725, + "num_tokens": 327811581.0, + "step": 3480 + }, + { + "epoch": 0.5941286909028844, + "grad_norm": 0.4398686395277153, + "learning_rate": 1.624167946748592e-05, + "loss": 0.5388, + "num_tokens": 327917696.0, + "step": 3481 + }, + { + "epoch": 0.5942993684929169, + "grad_norm": 0.50371967442367, + "learning_rate": 1.6234852363884624e-05, + "loss": 0.5583, + "num_tokens": 327998586.0, + "step": 3482 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.5321201117379377, + "learning_rate": 1.6228025260283327e-05, + "loss": 0.6396, + "num_tokens": 328082008.0, + "step": 3483 + }, + { + "epoch": 0.5946407236729817, + "grad_norm": 0.4498250081348172, + "learning_rate": 1.622119815668203e-05, + "loss": 0.582, + "num_tokens": 328193830.0, + "step": 3484 + }, + { + "epoch": 0.5948114012630141, + "grad_norm": 0.513290958466778, + "learning_rate": 1.621437105308073e-05, + "loss": 0.5557, + "num_tokens": 328272092.0, + "step": 3485 + }, + { + "epoch": 0.5949820788530465, + "grad_norm": 0.5433823877590496, + "learning_rate": 1.6207543949479435e-05, + "loss": 0.6033, + "num_tokens": 328344585.0, + "step": 3486 + }, + { + "epoch": 0.5951527564430791, + "grad_norm": 0.4818543042340627, + "learning_rate": 1.6200716845878136e-05, + "loss": 0.6621, + "num_tokens": 328450016.0, + "step": 3487 + }, + { + "epoch": 0.5953234340331115, + "grad_norm": 0.41308126649874743, + "learning_rate": 1.619388974227684e-05, + "loss": 0.6051, + "num_tokens": 328592559.0, + "step": 3488 + }, + { + "epoch": 0.5954941116231439, + "grad_norm": 0.5005934198017281, + "learning_rate": 1.6187062638675543e-05, + "loss": 0.5956, + "num_tokens": 328673783.0, + "step": 3489 + }, + { + "epoch": 0.5956647892131763, + "grad_norm": 0.5248849979332141, + "learning_rate": 1.6180235535074247e-05, + "loss": 0.6591, + "num_tokens": 328768225.0, + "step": 3490 + }, + { + "epoch": 0.5958354668032088, + "grad_norm": 0.45621046431560197, + "learning_rate": 1.617340843147295e-05, + "loss": 0.5551, + "num_tokens": 328873183.0, + "step": 3491 + }, + { + "epoch": 0.5960061443932412, + "grad_norm": 0.45422241597130736, + "learning_rate": 1.616658132787165e-05, + "loss": 0.4965, + "num_tokens": 328966564.0, + "step": 3492 + }, + { + "epoch": 0.5961768219832736, + "grad_norm": 0.5068246496678622, + "learning_rate": 1.6159754224270355e-05, + "loss": 0.5908, + "num_tokens": 329046390.0, + "step": 3493 + }, + { + "epoch": 0.596347499573306, + "grad_norm": 0.43414529565329013, + "learning_rate": 1.615292712066906e-05, + "loss": 0.5891, + "num_tokens": 329160218.0, + "step": 3494 + }, + { + "epoch": 0.5965181771633384, + "grad_norm": 0.5405466094334678, + "learning_rate": 1.614610001706776e-05, + "loss": 0.5697, + "num_tokens": 329244435.0, + "step": 3495 + }, + { + "epoch": 0.5966888547533709, + "grad_norm": 0.4282377330947544, + "learning_rate": 1.6139272913466463e-05, + "loss": 0.5577, + "num_tokens": 329358832.0, + "step": 3496 + }, + { + "epoch": 0.5968595323434033, + "grad_norm": 0.45888735950087634, + "learning_rate": 1.6132445809865167e-05, + "loss": 0.6194, + "num_tokens": 329481881.0, + "step": 3497 + }, + { + "epoch": 0.5970302099334357, + "grad_norm": 0.5424363732866481, + "learning_rate": 1.6125618706263867e-05, + "loss": 0.6625, + "num_tokens": 329565152.0, + "step": 3498 + }, + { + "epoch": 0.5972008875234681, + "grad_norm": 0.4370678219583965, + "learning_rate": 1.611879160266257e-05, + "loss": 0.5212, + "num_tokens": 329663206.0, + "step": 3499 + }, + { + "epoch": 0.5973715651135006, + "grad_norm": 0.5050054667231645, + "learning_rate": 1.6111964499061275e-05, + "loss": 0.6248, + "num_tokens": 329762091.0, + "step": 3500 + }, + { + "epoch": 0.5975422427035331, + "grad_norm": 0.5165606190454154, + "learning_rate": 1.6105137395459978e-05, + "loss": 0.5777, + "num_tokens": 329841719.0, + "step": 3501 + }, + { + "epoch": 0.5977129202935655, + "grad_norm": 0.4917888329517648, + "learning_rate": 1.6098310291858682e-05, + "loss": 0.5343, + "num_tokens": 329922561.0, + "step": 3502 + }, + { + "epoch": 0.5978835978835979, + "grad_norm": 0.5153612126517976, + "learning_rate": 1.6091483188257382e-05, + "loss": 0.5774, + "num_tokens": 330005363.0, + "step": 3503 + }, + { + "epoch": 0.5980542754736303, + "grad_norm": 0.5060206634497117, + "learning_rate": 1.6084656084656086e-05, + "loss": 0.5555, + "num_tokens": 330091185.0, + "step": 3504 + }, + { + "epoch": 0.5982249530636627, + "grad_norm": 0.47266395070301465, + "learning_rate": 1.607782898105479e-05, + "loss": 0.6076, + "num_tokens": 330197046.0, + "step": 3505 + }, + { + "epoch": 0.5983956306536952, + "grad_norm": 0.514218987118427, + "learning_rate": 1.607100187745349e-05, + "loss": 0.613, + "num_tokens": 330283439.0, + "step": 3506 + }, + { + "epoch": 0.5985663082437276, + "grad_norm": 0.4776126173692995, + "learning_rate": 1.6064174773852194e-05, + "loss": 0.5759, + "num_tokens": 330379182.0, + "step": 3507 + }, + { + "epoch": 0.59873698583376, + "grad_norm": 0.5092469133654185, + "learning_rate": 1.6057347670250898e-05, + "loss": 0.5583, + "num_tokens": 330452996.0, + "step": 3508 + }, + { + "epoch": 0.5989076634237924, + "grad_norm": 0.501264829006857, + "learning_rate": 1.60505205666496e-05, + "loss": 0.5288, + "num_tokens": 330538109.0, + "step": 3509 + }, + { + "epoch": 0.5990783410138248, + "grad_norm": 0.4663941297479572, + "learning_rate": 1.6043693463048302e-05, + "loss": 0.5535, + "num_tokens": 330635178.0, + "step": 3510 + }, + { + "epoch": 0.5992490186038573, + "grad_norm": 0.4596591299349046, + "learning_rate": 1.6036866359447006e-05, + "loss": 0.5657, + "num_tokens": 330738394.0, + "step": 3511 + }, + { + "epoch": 0.5994196961938898, + "grad_norm": 0.41470433142727503, + "learning_rate": 1.603003925584571e-05, + "loss": 0.546, + "num_tokens": 330861565.0, + "step": 3512 + }, + { + "epoch": 0.5995903737839222, + "grad_norm": 0.47116253014389337, + "learning_rate": 1.6023212152244413e-05, + "loss": 0.5132, + "num_tokens": 330945830.0, + "step": 3513 + }, + { + "epoch": 0.5997610513739546, + "grad_norm": 0.48306542865657215, + "learning_rate": 1.6016385048643114e-05, + "loss": 0.5767, + "num_tokens": 331039640.0, + "step": 3514 + }, + { + "epoch": 0.599931728963987, + "grad_norm": 0.4832665039034823, + "learning_rate": 1.6009557945041817e-05, + "loss": 0.5714, + "num_tokens": 331133593.0, + "step": 3515 + }, + { + "epoch": 0.6001024065540195, + "grad_norm": 0.43546121304830837, + "learning_rate": 1.6002730841440518e-05, + "loss": 0.5046, + "num_tokens": 331241107.0, + "step": 3516 + }, + { + "epoch": 0.6002730841440519, + "grad_norm": 0.508546166804123, + "learning_rate": 1.599590373783922e-05, + "loss": 0.5704, + "num_tokens": 331315245.0, + "step": 3517 + }, + { + "epoch": 0.6004437617340843, + "grad_norm": 0.48173405077260556, + "learning_rate": 1.5989076634237925e-05, + "loss": 0.5209, + "num_tokens": 331395653.0, + "step": 3518 + }, + { + "epoch": 0.6006144393241167, + "grad_norm": 0.44117044114018444, + "learning_rate": 1.598224953063663e-05, + "loss": 0.5706, + "num_tokens": 331502992.0, + "step": 3519 + }, + { + "epoch": 0.6007851169141492, + "grad_norm": 0.48419777722006313, + "learning_rate": 1.5975422427035333e-05, + "loss": 0.5231, + "num_tokens": 331583352.0, + "step": 3520 + }, + { + "epoch": 0.6009557945041816, + "grad_norm": 0.4319149087876771, + "learning_rate": 1.5968595323434037e-05, + "loss": 0.4707, + "num_tokens": 331681125.0, + "step": 3521 + }, + { + "epoch": 0.601126472094214, + "grad_norm": 0.5336806178316883, + "learning_rate": 1.5961768219832737e-05, + "loss": 0.683, + "num_tokens": 331766761.0, + "step": 3522 + }, + { + "epoch": 0.6012971496842464, + "grad_norm": 0.50913828563718, + "learning_rate": 1.595494111623144e-05, + "loss": 0.4925, + "num_tokens": 331832986.0, + "step": 3523 + }, + { + "epoch": 0.6014678272742788, + "grad_norm": 0.4999149141786681, + "learning_rate": 1.594811401263014e-05, + "loss": 0.6036, + "num_tokens": 331920176.0, + "step": 3524 + }, + { + "epoch": 0.6016385048643114, + "grad_norm": 0.45643424507846037, + "learning_rate": 1.5941286909028845e-05, + "loss": 0.5373, + "num_tokens": 332011380.0, + "step": 3525 + }, + { + "epoch": 0.6018091824543438, + "grad_norm": 0.4895805807897716, + "learning_rate": 1.593445980542755e-05, + "loss": 0.6227, + "num_tokens": 332101430.0, + "step": 3526 + }, + { + "epoch": 0.6019798600443762, + "grad_norm": 0.5248339298189859, + "learning_rate": 1.5927632701826252e-05, + "loss": 0.4964, + "num_tokens": 332169401.0, + "step": 3527 + }, + { + "epoch": 0.6021505376344086, + "grad_norm": 0.45333932853880793, + "learning_rate": 1.5920805598224956e-05, + "loss": 0.5414, + "num_tokens": 332263475.0, + "step": 3528 + }, + { + "epoch": 0.602321215224441, + "grad_norm": 0.5081325201182729, + "learning_rate": 1.5913978494623657e-05, + "loss": 0.558, + "num_tokens": 332335930.0, + "step": 3529 + }, + { + "epoch": 0.6024918928144735, + "grad_norm": 0.4673834374942599, + "learning_rate": 1.590715139102236e-05, + "loss": 0.5624, + "num_tokens": 332442448.0, + "step": 3530 + }, + { + "epoch": 0.6026625704045059, + "grad_norm": 0.4553827742008108, + "learning_rate": 1.5900324287421064e-05, + "loss": 0.5981, + "num_tokens": 332547154.0, + "step": 3531 + }, + { + "epoch": 0.6028332479945383, + "grad_norm": 0.4721699582839534, + "learning_rate": 1.5893497183819764e-05, + "loss": 0.5869, + "num_tokens": 332644612.0, + "step": 3532 + }, + { + "epoch": 0.6030039255845707, + "grad_norm": 0.5061967467746666, + "learning_rate": 1.5886670080218468e-05, + "loss": 0.6863, + "num_tokens": 332749829.0, + "step": 3533 + }, + { + "epoch": 0.6031746031746031, + "grad_norm": 0.47637961348332303, + "learning_rate": 1.5879842976617172e-05, + "loss": 0.5655, + "num_tokens": 332835923.0, + "step": 3534 + }, + { + "epoch": 0.6033452807646356, + "grad_norm": 0.5154119209436892, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.6265, + "num_tokens": 332923170.0, + "step": 3535 + }, + { + "epoch": 0.603515958354668, + "grad_norm": 0.5081112935766111, + "learning_rate": 1.5866188769414576e-05, + "loss": 0.6191, + "num_tokens": 333022093.0, + "step": 3536 + }, + { + "epoch": 0.6036866359447005, + "grad_norm": 0.5748614251507108, + "learning_rate": 1.585936166581328e-05, + "loss": 0.6006, + "num_tokens": 333081119.0, + "step": 3537 + }, + { + "epoch": 0.6038573135347329, + "grad_norm": 0.49975371316108547, + "learning_rate": 1.5852534562211984e-05, + "loss": 0.5518, + "num_tokens": 333185782.0, + "step": 3538 + }, + { + "epoch": 0.6040279911247654, + "grad_norm": 0.5011537556388476, + "learning_rate": 1.5845707458610687e-05, + "loss": 0.5993, + "num_tokens": 333267473.0, + "step": 3539 + }, + { + "epoch": 0.6041986687147978, + "grad_norm": 0.5065451376089161, + "learning_rate": 1.583888035500939e-05, + "loss": 0.6006, + "num_tokens": 333351484.0, + "step": 3540 + }, + { + "epoch": 0.6043693463048302, + "grad_norm": 0.42931378241159973, + "learning_rate": 1.583205325140809e-05, + "loss": 0.578, + "num_tokens": 333464833.0, + "step": 3541 + }, + { + "epoch": 0.6045400238948626, + "grad_norm": 0.5265061258277138, + "learning_rate": 1.5825226147806795e-05, + "loss": 0.6603, + "num_tokens": 333559982.0, + "step": 3542 + }, + { + "epoch": 0.604710701484895, + "grad_norm": 0.4666433059466304, + "learning_rate": 1.5818399044205496e-05, + "loss": 0.6654, + "num_tokens": 333678733.0, + "step": 3543 + }, + { + "epoch": 0.6048813790749274, + "grad_norm": 0.46007571990397467, + "learning_rate": 1.58115719406042e-05, + "loss": 0.6184, + "num_tokens": 333786644.0, + "step": 3544 + }, + { + "epoch": 0.6050520566649599, + "grad_norm": 0.4836505037708075, + "learning_rate": 1.5804744837002903e-05, + "loss": 0.5411, + "num_tokens": 333864465.0, + "step": 3545 + }, + { + "epoch": 0.6052227342549923, + "grad_norm": 0.44643590287133966, + "learning_rate": 1.5797917733401607e-05, + "loss": 0.6334, + "num_tokens": 333985826.0, + "step": 3546 + }, + { + "epoch": 0.6053934118450247, + "grad_norm": 0.4563441077611402, + "learning_rate": 1.5791090629800307e-05, + "loss": 0.5683, + "num_tokens": 334082731.0, + "step": 3547 + }, + { + "epoch": 0.6055640894350571, + "grad_norm": 0.43824989500312383, + "learning_rate": 1.578426352619901e-05, + "loss": 0.509, + "num_tokens": 334183951.0, + "step": 3548 + }, + { + "epoch": 0.6057347670250897, + "grad_norm": 0.4809467067314409, + "learning_rate": 1.5777436422597715e-05, + "loss": 0.6405, + "num_tokens": 334279305.0, + "step": 3549 + }, + { + "epoch": 0.6059054446151221, + "grad_norm": 0.40416997720200354, + "learning_rate": 1.577060931899642e-05, + "loss": 0.5865, + "num_tokens": 334412376.0, + "step": 3550 + }, + { + "epoch": 0.6060761222051545, + "grad_norm": 0.5443869338741327, + "learning_rate": 1.576378221539512e-05, + "loss": 0.5922, + "num_tokens": 334481050.0, + "step": 3551 + }, + { + "epoch": 0.6062467997951869, + "grad_norm": 0.4958878213686479, + "learning_rate": 1.5756955111793823e-05, + "loss": 0.5748, + "num_tokens": 334565789.0, + "step": 3552 + }, + { + "epoch": 0.6064174773852193, + "grad_norm": 0.5333063975251238, + "learning_rate": 1.5750128008192523e-05, + "loss": 0.5862, + "num_tokens": 334641650.0, + "step": 3553 + }, + { + "epoch": 0.6065881549752518, + "grad_norm": 0.5328468918164831, + "learning_rate": 1.5743300904591227e-05, + "loss": 0.4957, + "num_tokens": 334708991.0, + "step": 3554 + }, + { + "epoch": 0.6067588325652842, + "grad_norm": 0.5315588286706288, + "learning_rate": 1.573647380098993e-05, + "loss": 0.5913, + "num_tokens": 334775757.0, + "step": 3555 + }, + { + "epoch": 0.6069295101553166, + "grad_norm": 0.4502736466342284, + "learning_rate": 1.5729646697388634e-05, + "loss": 0.613, + "num_tokens": 334893450.0, + "step": 3556 + }, + { + "epoch": 0.607100187745349, + "grad_norm": 0.4847829872648731, + "learning_rate": 1.5722819593787338e-05, + "loss": 0.6052, + "num_tokens": 334999495.0, + "step": 3557 + }, + { + "epoch": 0.6072708653353814, + "grad_norm": 0.47564502300934663, + "learning_rate": 1.5715992490186042e-05, + "loss": 0.5599, + "num_tokens": 335097311.0, + "step": 3558 + }, + { + "epoch": 0.6074415429254139, + "grad_norm": 0.48197325739122276, + "learning_rate": 1.5709165386584742e-05, + "loss": 0.5643, + "num_tokens": 335192835.0, + "step": 3559 + }, + { + "epoch": 0.6076122205154463, + "grad_norm": 0.4732091802320942, + "learning_rate": 1.5702338282983446e-05, + "loss": 0.6446, + "num_tokens": 335302038.0, + "step": 3560 + }, + { + "epoch": 0.6077828981054787, + "grad_norm": 0.478078090808869, + "learning_rate": 1.5695511179382146e-05, + "loss": 0.6546, + "num_tokens": 335415942.0, + "step": 3561 + }, + { + "epoch": 0.6079535756955112, + "grad_norm": 0.510997394901381, + "learning_rate": 1.568868407578085e-05, + "loss": 0.5994, + "num_tokens": 335529698.0, + "step": 3562 + }, + { + "epoch": 0.6081242532855436, + "grad_norm": 0.4677029569142117, + "learning_rate": 1.5681856972179554e-05, + "loss": 0.5525, + "num_tokens": 335627305.0, + "step": 3563 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.6072226921708521, + "learning_rate": 1.5675029868578258e-05, + "loss": 0.6404, + "num_tokens": 335687412.0, + "step": 3564 + }, + { + "epoch": 0.6084656084656085, + "grad_norm": 0.5280063348059985, + "learning_rate": 1.566820276497696e-05, + "loss": 0.6567, + "num_tokens": 335776330.0, + "step": 3565 + }, + { + "epoch": 0.6086362860556409, + "grad_norm": 0.5638495669546878, + "learning_rate": 1.5661375661375662e-05, + "loss": 0.532, + "num_tokens": 335834418.0, + "step": 3566 + }, + { + "epoch": 0.6088069636456733, + "grad_norm": 0.494067904647009, + "learning_rate": 1.5654548557774366e-05, + "loss": 0.6539, + "num_tokens": 335952102.0, + "step": 3567 + }, + { + "epoch": 0.6089776412357057, + "grad_norm": 0.5039296280612839, + "learning_rate": 1.564772145417307e-05, + "loss": 0.6218, + "num_tokens": 336042594.0, + "step": 3568 + }, + { + "epoch": 0.6091483188257382, + "grad_norm": 0.45085058129902134, + "learning_rate": 1.564089435057177e-05, + "loss": 0.6364, + "num_tokens": 336153601.0, + "step": 3569 + }, + { + "epoch": 0.6093189964157706, + "grad_norm": 0.5652739850821955, + "learning_rate": 1.5634067246970474e-05, + "loss": 0.6205, + "num_tokens": 336218455.0, + "step": 3570 + }, + { + "epoch": 0.609489674005803, + "grad_norm": 0.4998241610889815, + "learning_rate": 1.5627240143369177e-05, + "loss": 0.6067, + "num_tokens": 336303128.0, + "step": 3571 + }, + { + "epoch": 0.6096603515958354, + "grad_norm": 0.5042686542986183, + "learning_rate": 1.5620413039767878e-05, + "loss": 0.5986, + "num_tokens": 336381366.0, + "step": 3572 + }, + { + "epoch": 0.6098310291858678, + "grad_norm": 0.5123094512281448, + "learning_rate": 1.561358593616658e-05, + "loss": 0.56, + "num_tokens": 336469528.0, + "step": 3573 + }, + { + "epoch": 0.6100017067759004, + "grad_norm": 0.44067480916934315, + "learning_rate": 1.5606758832565285e-05, + "loss": 0.5235, + "num_tokens": 336573003.0, + "step": 3574 + }, + { + "epoch": 0.6101723843659328, + "grad_norm": 0.4963773347075571, + "learning_rate": 1.559993172896399e-05, + "loss": 0.686, + "num_tokens": 336683559.0, + "step": 3575 + }, + { + "epoch": 0.6103430619559652, + "grad_norm": 0.5238960286407632, + "learning_rate": 1.5593104625362693e-05, + "loss": 0.5669, + "num_tokens": 336763733.0, + "step": 3576 + }, + { + "epoch": 0.6105137395459976, + "grad_norm": 0.46028308280279734, + "learning_rate": 1.5586277521761396e-05, + "loss": 0.5363, + "num_tokens": 336859202.0, + "step": 3577 + }, + { + "epoch": 0.61068441713603, + "grad_norm": 0.5610403841862394, + "learning_rate": 1.5579450418160097e-05, + "loss": 0.5747, + "num_tokens": 336920085.0, + "step": 3578 + }, + { + "epoch": 0.6108550947260625, + "grad_norm": 0.5263054310428272, + "learning_rate": 1.55726233145588e-05, + "loss": 0.556, + "num_tokens": 336997800.0, + "step": 3579 + }, + { + "epoch": 0.6110257723160949, + "grad_norm": 0.4721633104674769, + "learning_rate": 1.55657962109575e-05, + "loss": 0.5685, + "num_tokens": 337096537.0, + "step": 3580 + }, + { + "epoch": 0.6111964499061273, + "grad_norm": 0.49339995410004833, + "learning_rate": 1.5558969107356205e-05, + "loss": 0.5393, + "num_tokens": 337181166.0, + "step": 3581 + }, + { + "epoch": 0.6113671274961597, + "grad_norm": 0.4569740964315961, + "learning_rate": 1.555214200375491e-05, + "loss": 0.5712, + "num_tokens": 337282377.0, + "step": 3582 + }, + { + "epoch": 0.6115378050861922, + "grad_norm": 0.5057941783869859, + "learning_rate": 1.5545314900153612e-05, + "loss": 0.6078, + "num_tokens": 337367178.0, + "step": 3583 + }, + { + "epoch": 0.6117084826762246, + "grad_norm": 0.4529236860369162, + "learning_rate": 1.5538487796552313e-05, + "loss": 0.587, + "num_tokens": 337474853.0, + "step": 3584 + }, + { + "epoch": 0.611879160266257, + "grad_norm": 0.4743824641380396, + "learning_rate": 1.5531660692951016e-05, + "loss": 0.5184, + "num_tokens": 337563244.0, + "step": 3585 + }, + { + "epoch": 0.6120498378562894, + "grad_norm": 0.49151963027893303, + "learning_rate": 1.552483358934972e-05, + "loss": 0.5794, + "num_tokens": 337646559.0, + "step": 3586 + }, + { + "epoch": 0.612220515446322, + "grad_norm": 0.4517849612027927, + "learning_rate": 1.5518006485748424e-05, + "loss": 0.5162, + "num_tokens": 337759048.0, + "step": 3587 + }, + { + "epoch": 0.6123911930363544, + "grad_norm": 0.5222405296769319, + "learning_rate": 1.5511179382147124e-05, + "loss": 0.642, + "num_tokens": 337853256.0, + "step": 3588 + }, + { + "epoch": 0.6125618706263868, + "grad_norm": 0.4171613405524083, + "learning_rate": 1.5504352278545828e-05, + "loss": 0.5469, + "num_tokens": 337979580.0, + "step": 3589 + }, + { + "epoch": 0.6127325482164192, + "grad_norm": 0.46140265164928784, + "learning_rate": 1.5497525174944532e-05, + "loss": 0.6138, + "num_tokens": 338078482.0, + "step": 3590 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 0.4487878058578686, + "learning_rate": 1.5490698071343232e-05, + "loss": 0.555, + "num_tokens": 338178684.0, + "step": 3591 + }, + { + "epoch": 0.613073903396484, + "grad_norm": 0.4549476267048519, + "learning_rate": 1.5483870967741936e-05, + "loss": 0.5798, + "num_tokens": 338276604.0, + "step": 3592 + }, + { + "epoch": 0.6132445809865165, + "grad_norm": 0.4722610788044621, + "learning_rate": 1.547704386414064e-05, + "loss": 0.5803, + "num_tokens": 338377702.0, + "step": 3593 + }, + { + "epoch": 0.6134152585765489, + "grad_norm": 0.44884286717765065, + "learning_rate": 1.5470216760539344e-05, + "loss": 0.6061, + "num_tokens": 338494643.0, + "step": 3594 + }, + { + "epoch": 0.6135859361665813, + "grad_norm": 0.4608032421507256, + "learning_rate": 1.5463389656938047e-05, + "loss": 0.581, + "num_tokens": 338591452.0, + "step": 3595 + }, + { + "epoch": 0.6137566137566137, + "grad_norm": 0.4742398469308564, + "learning_rate": 1.5456562553336748e-05, + "loss": 0.5448, + "num_tokens": 338680769.0, + "step": 3596 + }, + { + "epoch": 0.6139272913466461, + "grad_norm": 0.4542343168789292, + "learning_rate": 1.544973544973545e-05, + "loss": 0.5889, + "num_tokens": 338783746.0, + "step": 3597 + }, + { + "epoch": 0.6140979689366786, + "grad_norm": 0.6392609180940966, + "learning_rate": 1.5442908346134152e-05, + "loss": 0.6332, + "num_tokens": 338894749.0, + "step": 3598 + }, + { + "epoch": 0.6142686465267111, + "grad_norm": 0.4979693800373054, + "learning_rate": 1.5436081242532856e-05, + "loss": 0.6316, + "num_tokens": 338990261.0, + "step": 3599 + }, + { + "epoch": 0.6144393241167435, + "grad_norm": 0.4466541577265138, + "learning_rate": 1.542925413893156e-05, + "loss": 0.6486, + "num_tokens": 339113866.0, + "step": 3600 + }, + { + "epoch": 0.6146100017067759, + "grad_norm": 0.45129280163638863, + "learning_rate": 1.5422427035330263e-05, + "loss": 0.5356, + "num_tokens": 339210252.0, + "step": 3601 + }, + { + "epoch": 0.6147806792968084, + "grad_norm": 0.47195079061526096, + "learning_rate": 1.5415599931728967e-05, + "loss": 0.6129, + "num_tokens": 339309689.0, + "step": 3602 + }, + { + "epoch": 0.6149513568868408, + "grad_norm": 0.5212364805207232, + "learning_rate": 1.5408772828127667e-05, + "loss": 0.5477, + "num_tokens": 339378051.0, + "step": 3603 + }, + { + "epoch": 0.6151220344768732, + "grad_norm": 0.5396847459763521, + "learning_rate": 1.540194572452637e-05, + "loss": 0.585, + "num_tokens": 339451859.0, + "step": 3604 + }, + { + "epoch": 0.6152927120669056, + "grad_norm": 0.4341881765379073, + "learning_rate": 1.5395118620925075e-05, + "loss": 0.5842, + "num_tokens": 339561517.0, + "step": 3605 + }, + { + "epoch": 0.615463389656938, + "grad_norm": 0.48833094372894487, + "learning_rate": 1.5388291517323775e-05, + "loss": 0.541, + "num_tokens": 339641740.0, + "step": 3606 + }, + { + "epoch": 0.6156340672469705, + "grad_norm": 0.48360585686163576, + "learning_rate": 1.538146441372248e-05, + "loss": 0.5787, + "num_tokens": 339729608.0, + "step": 3607 + }, + { + "epoch": 0.6158047448370029, + "grad_norm": 0.4518395899250133, + "learning_rate": 1.5374637310121183e-05, + "loss": 0.6031, + "num_tokens": 339869426.0, + "step": 3608 + }, + { + "epoch": 0.6159754224270353, + "grad_norm": 0.47905926666327536, + "learning_rate": 1.5367810206519883e-05, + "loss": 0.6564, + "num_tokens": 339975709.0, + "step": 3609 + }, + { + "epoch": 0.6161461000170677, + "grad_norm": 0.4096668339608625, + "learning_rate": 1.5360983102918587e-05, + "loss": 0.6037, + "num_tokens": 340121865.0, + "step": 3610 + }, + { + "epoch": 0.6163167776071002, + "grad_norm": 0.47677538889693727, + "learning_rate": 1.535415599931729e-05, + "loss": 0.648, + "num_tokens": 340228887.0, + "step": 3611 + }, + { + "epoch": 0.6164874551971327, + "grad_norm": 0.6006899294497313, + "learning_rate": 1.5347328895715994e-05, + "loss": 0.6627, + "num_tokens": 340292466.0, + "step": 3612 + }, + { + "epoch": 0.6166581327871651, + "grad_norm": 0.459436354100249, + "learning_rate": 1.5340501792114698e-05, + "loss": 0.6557, + "num_tokens": 340397727.0, + "step": 3613 + }, + { + "epoch": 0.6168288103771975, + "grad_norm": 0.4516739355932989, + "learning_rate": 1.5333674688513402e-05, + "loss": 0.6281, + "num_tokens": 340504407.0, + "step": 3614 + }, + { + "epoch": 0.6169994879672299, + "grad_norm": 0.47308332035647443, + "learning_rate": 1.5326847584912102e-05, + "loss": 0.553, + "num_tokens": 340602015.0, + "step": 3615 + }, + { + "epoch": 0.6171701655572623, + "grad_norm": 0.5103576392384896, + "learning_rate": 1.5320020481310806e-05, + "loss": 0.6255, + "num_tokens": 340688774.0, + "step": 3616 + }, + { + "epoch": 0.6173408431472948, + "grad_norm": 0.4833232896366674, + "learning_rate": 1.5313193377709506e-05, + "loss": 0.5409, + "num_tokens": 340774762.0, + "step": 3617 + }, + { + "epoch": 0.6175115207373272, + "grad_norm": 0.499264150849373, + "learning_rate": 1.530636627410821e-05, + "loss": 0.5944, + "num_tokens": 340870740.0, + "step": 3618 + }, + { + "epoch": 0.6176821983273596, + "grad_norm": 0.43180361352943164, + "learning_rate": 1.5299539170506914e-05, + "loss": 0.5842, + "num_tokens": 340991550.0, + "step": 3619 + }, + { + "epoch": 0.617852875917392, + "grad_norm": 0.5572215292936863, + "learning_rate": 1.5292712066905618e-05, + "loss": 0.5474, + "num_tokens": 341052416.0, + "step": 3620 + }, + { + "epoch": 0.6180235535074244, + "grad_norm": 0.5269219426126724, + "learning_rate": 1.528588496330432e-05, + "loss": 0.6321, + "num_tokens": 341144747.0, + "step": 3621 + }, + { + "epoch": 0.6181942310974569, + "grad_norm": 0.5033562469962622, + "learning_rate": 1.5279057859703022e-05, + "loss": 0.5558, + "num_tokens": 341223769.0, + "step": 3622 + }, + { + "epoch": 0.6183649086874893, + "grad_norm": 0.49387216400216066, + "learning_rate": 1.5272230756101726e-05, + "loss": 0.5865, + "num_tokens": 341312144.0, + "step": 3623 + }, + { + "epoch": 0.6185355862775218, + "grad_norm": 0.46471698786936644, + "learning_rate": 1.526540365250043e-05, + "loss": 0.6981, + "num_tokens": 341427029.0, + "step": 3624 + }, + { + "epoch": 0.6187062638675542, + "grad_norm": 0.45268702190855903, + "learning_rate": 1.5258576548899131e-05, + "loss": 0.6694, + "num_tokens": 341547117.0, + "step": 3625 + }, + { + "epoch": 0.6188769414575866, + "grad_norm": 0.4939130836944239, + "learning_rate": 1.5251749445297835e-05, + "loss": 0.5347, + "num_tokens": 341623379.0, + "step": 3626 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.4561296898615653, + "learning_rate": 1.5244922341696537e-05, + "loss": 0.5654, + "num_tokens": 341726766.0, + "step": 3627 + }, + { + "epoch": 0.6192182966376515, + "grad_norm": 0.49732555709580856, + "learning_rate": 1.523809523809524e-05, + "loss": 0.5812, + "num_tokens": 341804954.0, + "step": 3628 + }, + { + "epoch": 0.6193889742276839, + "grad_norm": 0.44490121308273284, + "learning_rate": 1.5231268134493941e-05, + "loss": 0.5199, + "num_tokens": 341908951.0, + "step": 3629 + }, + { + "epoch": 0.6195596518177163, + "grad_norm": 0.4987171664391396, + "learning_rate": 1.5224441030892645e-05, + "loss": 0.5479, + "num_tokens": 342000341.0, + "step": 3630 + }, + { + "epoch": 0.6197303294077487, + "grad_norm": 0.5496376064303287, + "learning_rate": 1.5217613927291349e-05, + "loss": 0.605, + "num_tokens": 342085336.0, + "step": 3631 + }, + { + "epoch": 0.6199010069977812, + "grad_norm": 0.4591253207101078, + "learning_rate": 1.5210786823690051e-05, + "loss": 0.5267, + "num_tokens": 342176899.0, + "step": 3632 + }, + { + "epoch": 0.6200716845878136, + "grad_norm": 0.5301172577030788, + "learning_rate": 1.5203959720088755e-05, + "loss": 0.7093, + "num_tokens": 342264592.0, + "step": 3633 + }, + { + "epoch": 0.620242362177846, + "grad_norm": 0.4866651815236698, + "learning_rate": 1.5197132616487455e-05, + "loss": 0.5601, + "num_tokens": 342346530.0, + "step": 3634 + }, + { + "epoch": 0.6204130397678784, + "grad_norm": 0.4931079672594089, + "learning_rate": 1.5190305512886159e-05, + "loss": 0.5935, + "num_tokens": 342437306.0, + "step": 3635 + }, + { + "epoch": 0.620583717357911, + "grad_norm": 0.46196211170807955, + "learning_rate": 1.5183478409284863e-05, + "loss": 0.595, + "num_tokens": 342540328.0, + "step": 3636 + }, + { + "epoch": 0.6207543949479434, + "grad_norm": 0.4772542000080506, + "learning_rate": 1.5176651305683565e-05, + "loss": 0.598, + "num_tokens": 342634785.0, + "step": 3637 + }, + { + "epoch": 0.6209250725379758, + "grad_norm": 0.4655459115376325, + "learning_rate": 1.5169824202082268e-05, + "loss": 0.5042, + "num_tokens": 342718914.0, + "step": 3638 + }, + { + "epoch": 0.6210957501280082, + "grad_norm": 0.4808679672955145, + "learning_rate": 1.5162997098480972e-05, + "loss": 0.5475, + "num_tokens": 342806165.0, + "step": 3639 + }, + { + "epoch": 0.6212664277180406, + "grad_norm": 0.48392231549808584, + "learning_rate": 1.5156169994879673e-05, + "loss": 0.6064, + "num_tokens": 342906883.0, + "step": 3640 + }, + { + "epoch": 0.6214371053080731, + "grad_norm": 0.4529870164825527, + "learning_rate": 1.5149342891278376e-05, + "loss": 0.672, + "num_tokens": 343026004.0, + "step": 3641 + }, + { + "epoch": 0.6216077828981055, + "grad_norm": 0.5280642991632045, + "learning_rate": 1.5142515787677078e-05, + "loss": 0.6072, + "num_tokens": 343101526.0, + "step": 3642 + }, + { + "epoch": 0.6217784604881379, + "grad_norm": 0.5079651106200068, + "learning_rate": 1.5135688684075782e-05, + "loss": 0.6263, + "num_tokens": 343193465.0, + "step": 3643 + }, + { + "epoch": 0.6219491380781703, + "grad_norm": 0.5036191103268727, + "learning_rate": 1.5128861580474486e-05, + "loss": 0.6105, + "num_tokens": 343281564.0, + "step": 3644 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.5003318758885824, + "learning_rate": 1.5122034476873188e-05, + "loss": 0.5803, + "num_tokens": 343361820.0, + "step": 3645 + }, + { + "epoch": 0.6222904932582352, + "grad_norm": 0.45579182914669025, + "learning_rate": 1.511520737327189e-05, + "loss": 0.6344, + "num_tokens": 343486923.0, + "step": 3646 + }, + { + "epoch": 0.6224611708482676, + "grad_norm": 0.5129605331748879, + "learning_rate": 1.5108380269670592e-05, + "loss": 0.5799, + "num_tokens": 343568737.0, + "step": 3647 + }, + { + "epoch": 0.6226318484383001, + "grad_norm": 0.46515014431960366, + "learning_rate": 1.5101553166069296e-05, + "loss": 0.601, + "num_tokens": 343672498.0, + "step": 3648 + }, + { + "epoch": 0.6228025260283325, + "grad_norm": 0.5467994798525068, + "learning_rate": 1.5094726062468e-05, + "loss": 0.6429, + "num_tokens": 343744753.0, + "step": 3649 + }, + { + "epoch": 0.622973203618365, + "grad_norm": 0.5322713736920139, + "learning_rate": 1.5087898958866702e-05, + "loss": 0.536, + "num_tokens": 343812170.0, + "step": 3650 + }, + { + "epoch": 0.6231438812083974, + "grad_norm": 0.518424488638914, + "learning_rate": 1.5081071855265405e-05, + "loss": 0.5642, + "num_tokens": 343886817.0, + "step": 3651 + }, + { + "epoch": 0.6233145587984298, + "grad_norm": 0.5423349107231067, + "learning_rate": 1.507424475166411e-05, + "loss": 0.5732, + "num_tokens": 343999190.0, + "step": 3652 + }, + { + "epoch": 0.6234852363884622, + "grad_norm": 0.5440556783414959, + "learning_rate": 1.506741764806281e-05, + "loss": 0.6365, + "num_tokens": 344078927.0, + "step": 3653 + }, + { + "epoch": 0.6236559139784946, + "grad_norm": 0.46203045289940403, + "learning_rate": 1.5060590544461513e-05, + "loss": 0.6083, + "num_tokens": 344182562.0, + "step": 3654 + }, + { + "epoch": 0.623826591568527, + "grad_norm": 0.46694938751930487, + "learning_rate": 1.5053763440860215e-05, + "loss": 0.5157, + "num_tokens": 344266315.0, + "step": 3655 + }, + { + "epoch": 0.6239972691585595, + "grad_norm": 0.49518303254528184, + "learning_rate": 1.504693633725892e-05, + "loss": 0.5158, + "num_tokens": 344345526.0, + "step": 3656 + }, + { + "epoch": 0.6241679467485919, + "grad_norm": 0.48470456800599965, + "learning_rate": 1.5040109233657623e-05, + "loss": 0.5933, + "num_tokens": 344447954.0, + "step": 3657 + }, + { + "epoch": 0.6243386243386243, + "grad_norm": 0.49802194703712266, + "learning_rate": 1.5033282130056325e-05, + "loss": 0.5587, + "num_tokens": 344532540.0, + "step": 3658 + }, + { + "epoch": 0.6245093019286567, + "grad_norm": 0.5275025191007251, + "learning_rate": 1.5026455026455027e-05, + "loss": 0.6993, + "num_tokens": 344622861.0, + "step": 3659 + }, + { + "epoch": 0.6246799795186891, + "grad_norm": 0.484272541688487, + "learning_rate": 1.5019627922853731e-05, + "loss": 0.6051, + "num_tokens": 344721117.0, + "step": 3660 + }, + { + "epoch": 0.6248506571087217, + "grad_norm": 0.5426041229609327, + "learning_rate": 1.5012800819252433e-05, + "loss": 0.5952, + "num_tokens": 344792058.0, + "step": 3661 + }, + { + "epoch": 0.6250213346987541, + "grad_norm": 0.5375199277688153, + "learning_rate": 1.5005973715651137e-05, + "loss": 0.5403, + "num_tokens": 344864527.0, + "step": 3662 + }, + { + "epoch": 0.6251920122887865, + "grad_norm": 0.48323595311673584, + "learning_rate": 1.499914661204984e-05, + "loss": 0.6009, + "num_tokens": 344959125.0, + "step": 3663 + }, + { + "epoch": 0.6253626898788189, + "grad_norm": 0.5188249775848618, + "learning_rate": 1.4992319508448543e-05, + "loss": 0.5248, + "num_tokens": 345058892.0, + "step": 3664 + }, + { + "epoch": 0.6255333674688514, + "grad_norm": 0.45328848902892505, + "learning_rate": 1.4985492404847245e-05, + "loss": 0.6351, + "num_tokens": 345182736.0, + "step": 3665 + }, + { + "epoch": 0.6257040450588838, + "grad_norm": 0.5154085227923573, + "learning_rate": 1.4978665301245947e-05, + "loss": 0.5787, + "num_tokens": 345267006.0, + "step": 3666 + }, + { + "epoch": 0.6258747226489162, + "grad_norm": 0.4700823181980873, + "learning_rate": 1.497183819764465e-05, + "loss": 0.5573, + "num_tokens": 345365916.0, + "step": 3667 + }, + { + "epoch": 0.6260454002389486, + "grad_norm": 0.44215985499443855, + "learning_rate": 1.4965011094043354e-05, + "loss": 0.6259, + "num_tokens": 345492549.0, + "step": 3668 + }, + { + "epoch": 0.626216077828981, + "grad_norm": 0.6734393352781106, + "learning_rate": 1.4958183990442056e-05, + "loss": 0.6583, + "num_tokens": 345576894.0, + "step": 3669 + }, + { + "epoch": 0.6263867554190135, + "grad_norm": 0.44737622704841884, + "learning_rate": 1.495135688684076e-05, + "loss": 0.5609, + "num_tokens": 345677144.0, + "step": 3670 + }, + { + "epoch": 0.6265574330090459, + "grad_norm": 0.4904059630611964, + "learning_rate": 1.494452978323946e-05, + "loss": 0.5663, + "num_tokens": 345760087.0, + "step": 3671 + }, + { + "epoch": 0.6267281105990783, + "grad_norm": 0.4846798933422681, + "learning_rate": 1.4937702679638164e-05, + "loss": 0.6625, + "num_tokens": 345857558.0, + "step": 3672 + }, + { + "epoch": 0.6268987881891108, + "grad_norm": 0.5140338397321844, + "learning_rate": 1.4930875576036868e-05, + "loss": 0.5745, + "num_tokens": 345940449.0, + "step": 3673 + }, + { + "epoch": 0.6270694657791432, + "grad_norm": 0.4579086435169565, + "learning_rate": 1.492404847243557e-05, + "loss": 0.5259, + "num_tokens": 346034060.0, + "step": 3674 + }, + { + "epoch": 0.6272401433691757, + "grad_norm": 0.46718407084506575, + "learning_rate": 1.4917221368834274e-05, + "loss": 0.5531, + "num_tokens": 346118852.0, + "step": 3675 + }, + { + "epoch": 0.6274108209592081, + "grad_norm": 0.466267934052504, + "learning_rate": 1.4910394265232978e-05, + "loss": 0.5438, + "num_tokens": 346215185.0, + "step": 3676 + }, + { + "epoch": 0.6275814985492405, + "grad_norm": 0.46413998348802865, + "learning_rate": 1.4903567161631678e-05, + "loss": 0.6132, + "num_tokens": 346318696.0, + "step": 3677 + }, + { + "epoch": 0.6277521761392729, + "grad_norm": 0.4504372498577167, + "learning_rate": 1.4896740058030382e-05, + "loss": 0.5718, + "num_tokens": 346424870.0, + "step": 3678 + }, + { + "epoch": 0.6279228537293053, + "grad_norm": 0.5087467728312661, + "learning_rate": 1.4889912954429084e-05, + "loss": 0.4863, + "num_tokens": 346499973.0, + "step": 3679 + }, + { + "epoch": 0.6280935313193378, + "grad_norm": 0.5121803175158001, + "learning_rate": 1.4883085850827787e-05, + "loss": 0.5054, + "num_tokens": 346570215.0, + "step": 3680 + }, + { + "epoch": 0.6282642089093702, + "grad_norm": 0.5058349750572209, + "learning_rate": 1.4876258747226491e-05, + "loss": 0.5895, + "num_tokens": 346647290.0, + "step": 3681 + }, + { + "epoch": 0.6284348864994026, + "grad_norm": 0.49298229833583507, + "learning_rate": 1.4869431643625193e-05, + "loss": 0.5666, + "num_tokens": 346736277.0, + "step": 3682 + }, + { + "epoch": 0.628605564089435, + "grad_norm": 0.46180474787414244, + "learning_rate": 1.4862604540023897e-05, + "loss": 0.6018, + "num_tokens": 346846314.0, + "step": 3683 + }, + { + "epoch": 0.6287762416794674, + "grad_norm": 0.4722385134379064, + "learning_rate": 1.4855777436422597e-05, + "loss": 0.5161, + "num_tokens": 346937773.0, + "step": 3684 + }, + { + "epoch": 0.6289469192694999, + "grad_norm": 0.47672708593527663, + "learning_rate": 1.4848950332821301e-05, + "loss": 0.5062, + "num_tokens": 347024863.0, + "step": 3685 + }, + { + "epoch": 0.6291175968595324, + "grad_norm": 0.45980006040918736, + "learning_rate": 1.4842123229220005e-05, + "loss": 0.5513, + "num_tokens": 347124855.0, + "step": 3686 + }, + { + "epoch": 0.6292882744495648, + "grad_norm": 0.5087906334866202, + "learning_rate": 1.4835296125618707e-05, + "loss": 0.5893, + "num_tokens": 347210676.0, + "step": 3687 + }, + { + "epoch": 0.6294589520395972, + "grad_norm": 0.4345086644347096, + "learning_rate": 1.482846902201741e-05, + "loss": 0.6718, + "num_tokens": 347352099.0, + "step": 3688 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.49246945368243084, + "learning_rate": 1.4821641918416115e-05, + "loss": 0.5786, + "num_tokens": 347443628.0, + "step": 3689 + }, + { + "epoch": 0.6298003072196621, + "grad_norm": 0.5326390358560981, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.5436, + "num_tokens": 347514180.0, + "step": 3690 + }, + { + "epoch": 0.6299709848096945, + "grad_norm": 0.9621833306767478, + "learning_rate": 1.4807987711213519e-05, + "loss": 0.631, + "num_tokens": 347597539.0, + "step": 3691 + }, + { + "epoch": 0.6301416623997269, + "grad_norm": 0.42885374568712187, + "learning_rate": 1.480116060761222e-05, + "loss": 0.4931, + "num_tokens": 347700173.0, + "step": 3692 + }, + { + "epoch": 0.6303123399897593, + "grad_norm": 0.46785250904922543, + "learning_rate": 1.4794333504010925e-05, + "loss": 0.5889, + "num_tokens": 347805623.0, + "step": 3693 + }, + { + "epoch": 0.6304830175797917, + "grad_norm": 0.4751398418236468, + "learning_rate": 1.4787506400409628e-05, + "loss": 0.5489, + "num_tokens": 347893698.0, + "step": 3694 + }, + { + "epoch": 0.6306536951698242, + "grad_norm": 0.49032594241916083, + "learning_rate": 1.4780679296808332e-05, + "loss": 0.6338, + "num_tokens": 347987041.0, + "step": 3695 + }, + { + "epoch": 0.6308243727598566, + "grad_norm": 0.4386323849733401, + "learning_rate": 1.4773852193207032e-05, + "loss": 0.5962, + "num_tokens": 348097986.0, + "step": 3696 + }, + { + "epoch": 0.630995050349889, + "grad_norm": 0.5450382832701353, + "learning_rate": 1.4767025089605736e-05, + "loss": 0.633, + "num_tokens": 348177045.0, + "step": 3697 + }, + { + "epoch": 0.6311657279399215, + "grad_norm": 0.49356701007848697, + "learning_rate": 1.4760197986004438e-05, + "loss": 0.6194, + "num_tokens": 348272461.0, + "step": 3698 + }, + { + "epoch": 0.631336405529954, + "grad_norm": 0.426814678418761, + "learning_rate": 1.4753370882403142e-05, + "loss": 0.4986, + "num_tokens": 348386429.0, + "step": 3699 + }, + { + "epoch": 0.6315070831199864, + "grad_norm": 0.4873136220503191, + "learning_rate": 1.4746543778801846e-05, + "loss": 0.5275, + "num_tokens": 348468522.0, + "step": 3700 + }, + { + "epoch": 0.6316777607100188, + "grad_norm": 0.4958706273963355, + "learning_rate": 1.4739716675200548e-05, + "loss": 0.5487, + "num_tokens": 348554210.0, + "step": 3701 + }, + { + "epoch": 0.6318484383000512, + "grad_norm": 0.5304705339998864, + "learning_rate": 1.473288957159925e-05, + "loss": 0.7096, + "num_tokens": 348640467.0, + "step": 3702 + }, + { + "epoch": 0.6320191158900836, + "grad_norm": 0.5283853808155656, + "learning_rate": 1.4726062467997952e-05, + "loss": 0.6158, + "num_tokens": 348734098.0, + "step": 3703 + }, + { + "epoch": 0.6321897934801161, + "grad_norm": 0.473656765582445, + "learning_rate": 1.4719235364396656e-05, + "loss": 0.5666, + "num_tokens": 348825712.0, + "step": 3704 + }, + { + "epoch": 0.6323604710701485, + "grad_norm": 0.4475526464790234, + "learning_rate": 1.471240826079536e-05, + "loss": 0.5841, + "num_tokens": 348938880.0, + "step": 3705 + }, + { + "epoch": 0.6325311486601809, + "grad_norm": 0.4652775060530408, + "learning_rate": 1.4705581157194062e-05, + "loss": 0.512, + "num_tokens": 349026217.0, + "step": 3706 + }, + { + "epoch": 0.6327018262502133, + "grad_norm": 0.44225836407471136, + "learning_rate": 1.4698754053592765e-05, + "loss": 0.597, + "num_tokens": 349137986.0, + "step": 3707 + }, + { + "epoch": 0.6328725038402457, + "grad_norm": 0.5023214308626308, + "learning_rate": 1.4691926949991466e-05, + "loss": 0.5555, + "num_tokens": 349216389.0, + "step": 3708 + }, + { + "epoch": 0.6330431814302782, + "grad_norm": 0.45603836233473594, + "learning_rate": 1.468509984639017e-05, + "loss": 0.5528, + "num_tokens": 349307720.0, + "step": 3709 + }, + { + "epoch": 0.6332138590203107, + "grad_norm": 0.454297782264187, + "learning_rate": 1.4678272742788873e-05, + "loss": 0.5349, + "num_tokens": 349402403.0, + "step": 3710 + }, + { + "epoch": 0.6333845366103431, + "grad_norm": 0.45377734597370345, + "learning_rate": 1.4671445639187575e-05, + "loss": 0.6106, + "num_tokens": 349507600.0, + "step": 3711 + }, + { + "epoch": 0.6335552142003755, + "grad_norm": 0.5062774700847578, + "learning_rate": 1.4664618535586279e-05, + "loss": 0.5726, + "num_tokens": 349582864.0, + "step": 3712 + }, + { + "epoch": 0.633725891790408, + "grad_norm": 0.4583254916136759, + "learning_rate": 1.4657791431984983e-05, + "loss": 0.5538, + "num_tokens": 349679529.0, + "step": 3713 + }, + { + "epoch": 0.6338965693804404, + "grad_norm": 0.558291336086708, + "learning_rate": 1.4650964328383683e-05, + "loss": 0.6403, + "num_tokens": 349753937.0, + "step": 3714 + }, + { + "epoch": 0.6340672469704728, + "grad_norm": 0.46138867312532283, + "learning_rate": 1.4644137224782387e-05, + "loss": 0.5873, + "num_tokens": 349859941.0, + "step": 3715 + }, + { + "epoch": 0.6342379245605052, + "grad_norm": 0.5290761272650552, + "learning_rate": 1.4637310121181089e-05, + "loss": 0.5552, + "num_tokens": 349926983.0, + "step": 3716 + }, + { + "epoch": 0.6344086021505376, + "grad_norm": 0.48811342586908746, + "learning_rate": 1.4630483017579793e-05, + "loss": 0.6403, + "num_tokens": 350042706.0, + "step": 3717 + }, + { + "epoch": 0.63457927974057, + "grad_norm": 0.45835958355892853, + "learning_rate": 1.4623655913978497e-05, + "loss": 0.6075, + "num_tokens": 350156388.0, + "step": 3718 + }, + { + "epoch": 0.6347499573306025, + "grad_norm": 0.5193060728839891, + "learning_rate": 1.4616828810377199e-05, + "loss": 0.543, + "num_tokens": 350232403.0, + "step": 3719 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.4499430473617779, + "learning_rate": 1.4610001706775902e-05, + "loss": 0.5548, + "num_tokens": 350333080.0, + "step": 3720 + }, + { + "epoch": 0.6350913125106673, + "grad_norm": 0.44897183442136046, + "learning_rate": 1.4603174603174603e-05, + "loss": 0.5513, + "num_tokens": 350452093.0, + "step": 3721 + }, + { + "epoch": 0.6352619901006997, + "grad_norm": 0.5116532833015546, + "learning_rate": 1.4596347499573307e-05, + "loss": 0.6153, + "num_tokens": 350531851.0, + "step": 3722 + }, + { + "epoch": 0.6354326676907323, + "grad_norm": 0.4788419997066486, + "learning_rate": 1.458952039597201e-05, + "loss": 0.5584, + "num_tokens": 350625250.0, + "step": 3723 + }, + { + "epoch": 0.6356033452807647, + "grad_norm": 0.4525914919743234, + "learning_rate": 1.4582693292370712e-05, + "loss": 0.5445, + "num_tokens": 350745541.0, + "step": 3724 + }, + { + "epoch": 0.6357740228707971, + "grad_norm": 0.4601355358581514, + "learning_rate": 1.4575866188769416e-05, + "loss": 0.6311, + "num_tokens": 350853486.0, + "step": 3725 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 0.5329886492811607, + "learning_rate": 1.456903908516812e-05, + "loss": 0.6641, + "num_tokens": 350935298.0, + "step": 3726 + }, + { + "epoch": 0.6361153780508619, + "grad_norm": 0.4455291735330309, + "learning_rate": 1.456221198156682e-05, + "loss": 0.542, + "num_tokens": 351039912.0, + "step": 3727 + }, + { + "epoch": 0.6362860556408944, + "grad_norm": 0.4876350485841393, + "learning_rate": 1.4555384877965524e-05, + "loss": 0.5164, + "num_tokens": 351117100.0, + "step": 3728 + }, + { + "epoch": 0.6364567332309268, + "grad_norm": 0.49137010819118804, + "learning_rate": 1.4548557774364228e-05, + "loss": 0.6718, + "num_tokens": 351220881.0, + "step": 3729 + }, + { + "epoch": 0.6366274108209592, + "grad_norm": 0.44709323161464476, + "learning_rate": 1.454173067076293e-05, + "loss": 0.5254, + "num_tokens": 351323434.0, + "step": 3730 + }, + { + "epoch": 0.6367980884109916, + "grad_norm": 0.5250752865022018, + "learning_rate": 1.4534903567161634e-05, + "loss": 0.6903, + "num_tokens": 351408250.0, + "step": 3731 + }, + { + "epoch": 0.636968766001024, + "grad_norm": 0.5187653491823154, + "learning_rate": 1.4528076463560337e-05, + "loss": 0.5808, + "num_tokens": 351482993.0, + "step": 3732 + }, + { + "epoch": 0.6371394435910565, + "grad_norm": 0.5763755890986934, + "learning_rate": 1.4521249359959038e-05, + "loss": 0.6076, + "num_tokens": 351547977.0, + "step": 3733 + }, + { + "epoch": 0.6373101211810889, + "grad_norm": 0.5630513685715748, + "learning_rate": 1.4514422256357742e-05, + "loss": 0.6017, + "num_tokens": 351610899.0, + "step": 3734 + }, + { + "epoch": 0.6374807987711214, + "grad_norm": 0.5155879540563318, + "learning_rate": 1.4507595152756444e-05, + "loss": 0.6155, + "num_tokens": 351692402.0, + "step": 3735 + }, + { + "epoch": 0.6376514763611538, + "grad_norm": 0.4816079220069336, + "learning_rate": 1.4500768049155147e-05, + "loss": 0.526, + "num_tokens": 351779989.0, + "step": 3736 + }, + { + "epoch": 0.6378221539511862, + "grad_norm": 0.46713109047215534, + "learning_rate": 1.4493940945553851e-05, + "loss": 0.6258, + "num_tokens": 351883951.0, + "step": 3737 + }, + { + "epoch": 0.6379928315412187, + "grad_norm": 0.4936754808222198, + "learning_rate": 1.4487113841952553e-05, + "loss": 0.682, + "num_tokens": 351989465.0, + "step": 3738 + }, + { + "epoch": 0.6381635091312511, + "grad_norm": 0.5183543763455014, + "learning_rate": 1.4480286738351255e-05, + "loss": 0.6539, + "num_tokens": 352086491.0, + "step": 3739 + }, + { + "epoch": 0.6383341867212835, + "grad_norm": 0.46370058373013345, + "learning_rate": 1.4473459634749957e-05, + "loss": 0.7025, + "num_tokens": 352205771.0, + "step": 3740 + }, + { + "epoch": 0.6385048643113159, + "grad_norm": 0.5102338274372333, + "learning_rate": 1.4466632531148661e-05, + "loss": 0.5586, + "num_tokens": 352284606.0, + "step": 3741 + }, + { + "epoch": 0.6386755419013483, + "grad_norm": 0.4738402000806786, + "learning_rate": 1.4459805427547365e-05, + "loss": 0.5812, + "num_tokens": 352386500.0, + "step": 3742 + }, + { + "epoch": 0.6388462194913808, + "grad_norm": 0.4932969001076414, + "learning_rate": 1.4452978323946067e-05, + "loss": 0.5934, + "num_tokens": 352478818.0, + "step": 3743 + }, + { + "epoch": 0.6390168970814132, + "grad_norm": 0.5241791669624905, + "learning_rate": 1.444615122034477e-05, + "loss": 0.6022, + "num_tokens": 352573485.0, + "step": 3744 + }, + { + "epoch": 0.6391875746714456, + "grad_norm": 0.47739991325212305, + "learning_rate": 1.4439324116743471e-05, + "loss": 0.5739, + "num_tokens": 352673466.0, + "step": 3745 + }, + { + "epoch": 0.639358252261478, + "grad_norm": 0.5473925501149179, + "learning_rate": 1.4432497013142175e-05, + "loss": 0.6279, + "num_tokens": 352750706.0, + "step": 3746 + }, + { + "epoch": 0.6395289298515104, + "grad_norm": 0.5235512408597872, + "learning_rate": 1.4425669909540879e-05, + "loss": 0.5958, + "num_tokens": 352826075.0, + "step": 3747 + }, + { + "epoch": 0.639699607441543, + "grad_norm": 0.46722217146907263, + "learning_rate": 1.441884280593958e-05, + "loss": 0.5519, + "num_tokens": 352917742.0, + "step": 3748 + }, + { + "epoch": 0.6398702850315754, + "grad_norm": 0.49945002733656146, + "learning_rate": 1.4412015702338284e-05, + "loss": 0.6494, + "num_tokens": 353019844.0, + "step": 3749 + }, + { + "epoch": 0.6400409626216078, + "grad_norm": 0.47881003653860266, + "learning_rate": 1.4405188598736988e-05, + "loss": 0.624, + "num_tokens": 353136413.0, + "step": 3750 + }, + { + "epoch": 0.6402116402116402, + "grad_norm": 0.443405708421006, + "learning_rate": 1.439836149513569e-05, + "loss": 0.61, + "num_tokens": 353247375.0, + "step": 3751 + }, + { + "epoch": 0.6403823178016727, + "grad_norm": 0.5620455584694852, + "learning_rate": 1.4391534391534392e-05, + "loss": 0.6152, + "num_tokens": 353315152.0, + "step": 3752 + }, + { + "epoch": 0.6405529953917051, + "grad_norm": 0.5512611706778124, + "learning_rate": 1.4384707287933094e-05, + "loss": 0.5554, + "num_tokens": 353378303.0, + "step": 3753 + }, + { + "epoch": 0.6407236729817375, + "grad_norm": 0.4492987975372472, + "learning_rate": 1.4377880184331798e-05, + "loss": 0.5514, + "num_tokens": 353479876.0, + "step": 3754 + }, + { + "epoch": 0.6408943505717699, + "grad_norm": 0.5470720570886061, + "learning_rate": 1.4371053080730502e-05, + "loss": 0.533, + "num_tokens": 353548305.0, + "step": 3755 + }, + { + "epoch": 0.6410650281618023, + "grad_norm": 0.4750913669050522, + "learning_rate": 1.4364225977129204e-05, + "loss": 0.6744, + "num_tokens": 353663938.0, + "step": 3756 + }, + { + "epoch": 0.6412357057518348, + "grad_norm": 0.4704155386337519, + "learning_rate": 1.4357398873527908e-05, + "loss": 0.5246, + "num_tokens": 353752590.0, + "step": 3757 + }, + { + "epoch": 0.6414063833418672, + "grad_norm": 0.5019138475475852, + "learning_rate": 1.4350571769926608e-05, + "loss": 0.5972, + "num_tokens": 353837683.0, + "step": 3758 + }, + { + "epoch": 0.6415770609318996, + "grad_norm": 0.6192455871800803, + "learning_rate": 1.4343744666325312e-05, + "loss": 0.6349, + "num_tokens": 353965500.0, + "step": 3759 + }, + { + "epoch": 0.6417477385219321, + "grad_norm": 0.7692675502128963, + "learning_rate": 1.4336917562724016e-05, + "loss": 0.6936, + "num_tokens": 354065614.0, + "step": 3760 + }, + { + "epoch": 0.6419184161119645, + "grad_norm": 0.4723790604081513, + "learning_rate": 1.4330090459122718e-05, + "loss": 0.5073, + "num_tokens": 354142643.0, + "step": 3761 + }, + { + "epoch": 0.642089093701997, + "grad_norm": 0.45032966659767376, + "learning_rate": 1.4323263355521422e-05, + "loss": 0.5841, + "num_tokens": 354260220.0, + "step": 3762 + }, + { + "epoch": 0.6422597712920294, + "grad_norm": 0.4247892854563498, + "learning_rate": 1.4316436251920125e-05, + "loss": 0.5496, + "num_tokens": 354376728.0, + "step": 3763 + }, + { + "epoch": 0.6424304488820618, + "grad_norm": 0.4787239698405895, + "learning_rate": 1.4309609148318826e-05, + "loss": 0.5465, + "num_tokens": 354465252.0, + "step": 3764 + }, + { + "epoch": 0.6426011264720942, + "grad_norm": 0.44270779526128357, + "learning_rate": 1.430278204471753e-05, + "loss": 0.4938, + "num_tokens": 354558980.0, + "step": 3765 + }, + { + "epoch": 0.6427718040621266, + "grad_norm": 0.483882307699064, + "learning_rate": 1.4295954941116233e-05, + "loss": 0.5903, + "num_tokens": 354649463.0, + "step": 3766 + }, + { + "epoch": 0.6429424816521591, + "grad_norm": 0.4964214518537526, + "learning_rate": 1.4289127837514935e-05, + "loss": 0.5542, + "num_tokens": 354734746.0, + "step": 3767 + }, + { + "epoch": 0.6431131592421915, + "grad_norm": 0.49157338235783127, + "learning_rate": 1.4282300733913639e-05, + "loss": 0.5337, + "num_tokens": 354818852.0, + "step": 3768 + }, + { + "epoch": 0.6432838368322239, + "grad_norm": 0.4487574602954322, + "learning_rate": 1.4275473630312343e-05, + "loss": 0.5847, + "num_tokens": 354925027.0, + "step": 3769 + }, + { + "epoch": 0.6434545144222563, + "grad_norm": 0.500312550545791, + "learning_rate": 1.4268646526711043e-05, + "loss": 0.5067, + "num_tokens": 355006510.0, + "step": 3770 + }, + { + "epoch": 0.6436251920122887, + "grad_norm": 0.4714850495194229, + "learning_rate": 1.4261819423109747e-05, + "loss": 0.5702, + "num_tokens": 355102256.0, + "step": 3771 + }, + { + "epoch": 0.6437958696023213, + "grad_norm": 0.45024301794887495, + "learning_rate": 1.4254992319508449e-05, + "loss": 0.6018, + "num_tokens": 355218351.0, + "step": 3772 + }, + { + "epoch": 0.6439665471923537, + "grad_norm": 0.4376018469857722, + "learning_rate": 1.4248165215907153e-05, + "loss": 0.5019, + "num_tokens": 355321424.0, + "step": 3773 + }, + { + "epoch": 0.6441372247823861, + "grad_norm": 0.49762643059750217, + "learning_rate": 1.4241338112305857e-05, + "loss": 0.5992, + "num_tokens": 355415149.0, + "step": 3774 + }, + { + "epoch": 0.6443079023724185, + "grad_norm": 0.5385516527773379, + "learning_rate": 1.4234511008704559e-05, + "loss": 0.5905, + "num_tokens": 355487991.0, + "step": 3775 + }, + { + "epoch": 0.644478579962451, + "grad_norm": 0.48099440183462305, + "learning_rate": 1.422768390510326e-05, + "loss": 0.6146, + "num_tokens": 355589825.0, + "step": 3776 + }, + { + "epoch": 0.6446492575524834, + "grad_norm": 0.471949567960015, + "learning_rate": 1.4220856801501963e-05, + "loss": 0.5476, + "num_tokens": 355681377.0, + "step": 3777 + }, + { + "epoch": 0.6448199351425158, + "grad_norm": 0.5537635568798891, + "learning_rate": 1.4214029697900666e-05, + "loss": 0.6676, + "num_tokens": 355756965.0, + "step": 3778 + }, + { + "epoch": 0.6449906127325482, + "grad_norm": 0.5043811754287045, + "learning_rate": 1.420720259429937e-05, + "loss": 0.5912, + "num_tokens": 355846644.0, + "step": 3779 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.47095812943265924, + "learning_rate": 1.4200375490698072e-05, + "loss": 0.5618, + "num_tokens": 355934349.0, + "step": 3780 + }, + { + "epoch": 0.645331967912613, + "grad_norm": 0.4736574695521975, + "learning_rate": 1.4193548387096776e-05, + "loss": 0.5598, + "num_tokens": 356018550.0, + "step": 3781 + }, + { + "epoch": 0.6455026455026455, + "grad_norm": 0.46991381477626787, + "learning_rate": 1.418672128349548e-05, + "loss": 0.5593, + "num_tokens": 356118335.0, + "step": 3782 + }, + { + "epoch": 0.6456733230926779, + "grad_norm": 0.49323274881496043, + "learning_rate": 1.417989417989418e-05, + "loss": 0.519, + "num_tokens": 356196914.0, + "step": 3783 + }, + { + "epoch": 0.6458440006827103, + "grad_norm": 0.5201692555192612, + "learning_rate": 1.4173067076292884e-05, + "loss": 0.5645, + "num_tokens": 356269150.0, + "step": 3784 + }, + { + "epoch": 0.6460146782727428, + "grad_norm": 0.4521880936211394, + "learning_rate": 1.4166239972691586e-05, + "loss": 0.6278, + "num_tokens": 356389342.0, + "step": 3785 + }, + { + "epoch": 0.6461853558627753, + "grad_norm": 0.4684579853999267, + "learning_rate": 1.415941286909029e-05, + "loss": 0.4999, + "num_tokens": 356471130.0, + "step": 3786 + }, + { + "epoch": 0.6463560334528077, + "grad_norm": 0.5083423087705352, + "learning_rate": 1.4152585765488994e-05, + "loss": 0.6163, + "num_tokens": 356556260.0, + "step": 3787 + }, + { + "epoch": 0.6465267110428401, + "grad_norm": 0.4621525280929079, + "learning_rate": 1.4145758661887696e-05, + "loss": 0.5773, + "num_tokens": 356650921.0, + "step": 3788 + }, + { + "epoch": 0.6466973886328725, + "grad_norm": 0.42477973824923165, + "learning_rate": 1.4138931558286398e-05, + "loss": 0.53, + "num_tokens": 356761096.0, + "step": 3789 + }, + { + "epoch": 0.6468680662229049, + "grad_norm": 0.49359292319936676, + "learning_rate": 1.41321044546851e-05, + "loss": 0.4914, + "num_tokens": 356834927.0, + "step": 3790 + }, + { + "epoch": 0.6470387438129374, + "grad_norm": 0.4652983849899644, + "learning_rate": 1.4125277351083804e-05, + "loss": 0.6744, + "num_tokens": 356948551.0, + "step": 3791 + }, + { + "epoch": 0.6472094214029698, + "grad_norm": 0.6116197856950916, + "learning_rate": 1.4118450247482507e-05, + "loss": 0.6452, + "num_tokens": 357032365.0, + "step": 3792 + }, + { + "epoch": 0.6473800989930022, + "grad_norm": 0.46856990185529046, + "learning_rate": 1.411162314388121e-05, + "loss": 0.5193, + "num_tokens": 357117204.0, + "step": 3793 + }, + { + "epoch": 0.6475507765830346, + "grad_norm": 0.5335093984872318, + "learning_rate": 1.4104796040279913e-05, + "loss": 0.5757, + "num_tokens": 357190350.0, + "step": 3794 + }, + { + "epoch": 0.647721454173067, + "grad_norm": 0.519910066996408, + "learning_rate": 1.4097968936678613e-05, + "loss": 0.6568, + "num_tokens": 357287409.0, + "step": 3795 + }, + { + "epoch": 0.6478921317630995, + "grad_norm": 0.4794030378778992, + "learning_rate": 1.4091141833077317e-05, + "loss": 0.52, + "num_tokens": 357370802.0, + "step": 3796 + }, + { + "epoch": 0.648062809353132, + "grad_norm": 0.47085393760406064, + "learning_rate": 1.4084314729476021e-05, + "loss": 0.5131, + "num_tokens": 357460751.0, + "step": 3797 + }, + { + "epoch": 0.6482334869431644, + "grad_norm": 0.45758280946260976, + "learning_rate": 1.4077487625874725e-05, + "loss": 0.5732, + "num_tokens": 357563036.0, + "step": 3798 + }, + { + "epoch": 0.6484041645331968, + "grad_norm": 0.4715463199161767, + "learning_rate": 1.4070660522273427e-05, + "loss": 0.5605, + "num_tokens": 357649387.0, + "step": 3799 + }, + { + "epoch": 0.6485748421232292, + "grad_norm": 0.45928005350275714, + "learning_rate": 1.406383341867213e-05, + "loss": 0.6481, + "num_tokens": 357774860.0, + "step": 3800 + }, + { + "epoch": 0.6487455197132617, + "grad_norm": 0.5070566708477471, + "learning_rate": 1.4057006315070831e-05, + "loss": 0.5737, + "num_tokens": 357848372.0, + "step": 3801 + }, + { + "epoch": 0.6489161973032941, + "grad_norm": 0.5247942028080713, + "learning_rate": 1.4050179211469535e-05, + "loss": 0.6493, + "num_tokens": 357925095.0, + "step": 3802 + }, + { + "epoch": 0.6490868748933265, + "grad_norm": 0.5367026230330154, + "learning_rate": 1.4043352107868239e-05, + "loss": 0.6618, + "num_tokens": 358008635.0, + "step": 3803 + }, + { + "epoch": 0.6492575524833589, + "grad_norm": 0.46948072264818336, + "learning_rate": 1.403652500426694e-05, + "loss": 0.5128, + "num_tokens": 358098072.0, + "step": 3804 + }, + { + "epoch": 0.6494282300733913, + "grad_norm": 0.45925761924160596, + "learning_rate": 1.4029697900665644e-05, + "loss": 0.5214, + "num_tokens": 358193639.0, + "step": 3805 + }, + { + "epoch": 0.6495989076634238, + "grad_norm": 0.453543468518027, + "learning_rate": 1.4022870797064348e-05, + "loss": 0.618, + "num_tokens": 358304673.0, + "step": 3806 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 0.47912465555542594, + "learning_rate": 1.4016043693463048e-05, + "loss": 0.5717, + "num_tokens": 358409941.0, + "step": 3807 + }, + { + "epoch": 0.6499402628434886, + "grad_norm": 0.521424638758478, + "learning_rate": 1.4009216589861752e-05, + "loss": 0.663, + "num_tokens": 358499947.0, + "step": 3808 + }, + { + "epoch": 0.6501109404335211, + "grad_norm": 0.46707697621446365, + "learning_rate": 1.4002389486260454e-05, + "loss": 0.5787, + "num_tokens": 358602414.0, + "step": 3809 + }, + { + "epoch": 0.6502816180235536, + "grad_norm": 0.5011905552941018, + "learning_rate": 1.3995562382659158e-05, + "loss": 0.5703, + "num_tokens": 358681273.0, + "step": 3810 + }, + { + "epoch": 0.650452295613586, + "grad_norm": 0.43915677429411487, + "learning_rate": 1.3988735279057862e-05, + "loss": 0.5245, + "num_tokens": 358784752.0, + "step": 3811 + }, + { + "epoch": 0.6506229732036184, + "grad_norm": 0.48464019174607387, + "learning_rate": 1.3981908175456564e-05, + "loss": 0.5379, + "num_tokens": 358866608.0, + "step": 3812 + }, + { + "epoch": 0.6507936507936508, + "grad_norm": 0.4516495812404038, + "learning_rate": 1.3975081071855268e-05, + "loss": 0.6232, + "num_tokens": 358974827.0, + "step": 3813 + }, + { + "epoch": 0.6509643283836832, + "grad_norm": 0.494694300343105, + "learning_rate": 1.3968253968253968e-05, + "loss": 0.641, + "num_tokens": 359065953.0, + "step": 3814 + }, + { + "epoch": 0.6511350059737157, + "grad_norm": 0.5595818227980645, + "learning_rate": 1.3961426864652672e-05, + "loss": 0.6359, + "num_tokens": 359151497.0, + "step": 3815 + }, + { + "epoch": 0.6513056835637481, + "grad_norm": 0.4331794975008109, + "learning_rate": 1.3954599761051376e-05, + "loss": 0.5319, + "num_tokens": 359267988.0, + "step": 3816 + }, + { + "epoch": 0.6514763611537805, + "grad_norm": 0.48754286493112237, + "learning_rate": 1.3947772657450078e-05, + "loss": 0.5589, + "num_tokens": 359357092.0, + "step": 3817 + }, + { + "epoch": 0.6516470387438129, + "grad_norm": 0.5042861628473454, + "learning_rate": 1.3940945553848781e-05, + "loss": 0.4697, + "num_tokens": 359433466.0, + "step": 3818 + }, + { + "epoch": 0.6518177163338453, + "grad_norm": 0.5419075112174828, + "learning_rate": 1.3934118450247485e-05, + "loss": 0.6368, + "num_tokens": 359511416.0, + "step": 3819 + }, + { + "epoch": 0.6519883939238778, + "grad_norm": 0.524463326016875, + "learning_rate": 1.3927291346646186e-05, + "loss": 0.494, + "num_tokens": 359579854.0, + "step": 3820 + }, + { + "epoch": 0.6521590715139102, + "grad_norm": 0.501050749758819, + "learning_rate": 1.392046424304489e-05, + "loss": 0.6182, + "num_tokens": 359683262.0, + "step": 3821 + }, + { + "epoch": 0.6523297491039427, + "grad_norm": 0.4579773648560269, + "learning_rate": 1.3913637139443591e-05, + "loss": 0.5012, + "num_tokens": 359770007.0, + "step": 3822 + }, + { + "epoch": 0.6525004266939751, + "grad_norm": 0.4805502822957503, + "learning_rate": 1.3906810035842295e-05, + "loss": 0.5969, + "num_tokens": 359871491.0, + "step": 3823 + }, + { + "epoch": 0.6526711042840075, + "grad_norm": 0.5574061231943033, + "learning_rate": 1.3899982932240999e-05, + "loss": 0.6261, + "num_tokens": 359959348.0, + "step": 3824 + }, + { + "epoch": 0.65284178187404, + "grad_norm": 0.45864521019314236, + "learning_rate": 1.3893155828639701e-05, + "loss": 0.581, + "num_tokens": 360060554.0, + "step": 3825 + }, + { + "epoch": 0.6530124594640724, + "grad_norm": 0.545446093445203, + "learning_rate": 1.3886328725038403e-05, + "loss": 0.5554, + "num_tokens": 360125603.0, + "step": 3826 + }, + { + "epoch": 0.6531831370541048, + "grad_norm": 0.5837132429395301, + "learning_rate": 1.3879501621437105e-05, + "loss": 0.5555, + "num_tokens": 360195176.0, + "step": 3827 + }, + { + "epoch": 0.6533538146441372, + "grad_norm": 0.5393913593362436, + "learning_rate": 1.3872674517835809e-05, + "loss": 0.6063, + "num_tokens": 360267449.0, + "step": 3828 + }, + { + "epoch": 0.6535244922341696, + "grad_norm": 0.47833750959271826, + "learning_rate": 1.3865847414234513e-05, + "loss": 0.6126, + "num_tokens": 360375699.0, + "step": 3829 + }, + { + "epoch": 0.6536951698242021, + "grad_norm": 0.4876729998359854, + "learning_rate": 1.3859020310633215e-05, + "loss": 0.6065, + "num_tokens": 360473788.0, + "step": 3830 + }, + { + "epoch": 0.6538658474142345, + "grad_norm": 0.46982986606844346, + "learning_rate": 1.3852193207031918e-05, + "loss": 0.5124, + "num_tokens": 360563861.0, + "step": 3831 + }, + { + "epoch": 0.6540365250042669, + "grad_norm": 0.5057571263258188, + "learning_rate": 1.384536610343062e-05, + "loss": 0.5681, + "num_tokens": 360644215.0, + "step": 3832 + }, + { + "epoch": 0.6542072025942993, + "grad_norm": 0.4738716921155277, + "learning_rate": 1.3838538999829323e-05, + "loss": 0.5399, + "num_tokens": 360735974.0, + "step": 3833 + }, + { + "epoch": 0.6543778801843319, + "grad_norm": 0.5184390907452492, + "learning_rate": 1.3831711896228026e-05, + "loss": 0.5918, + "num_tokens": 360821384.0, + "step": 3834 + }, + { + "epoch": 0.6545485577743643, + "grad_norm": 0.5080531304522219, + "learning_rate": 1.382488479262673e-05, + "loss": 0.5368, + "num_tokens": 360893999.0, + "step": 3835 + }, + { + "epoch": 0.6547192353643967, + "grad_norm": 0.4385195608657318, + "learning_rate": 1.3818057689025432e-05, + "loss": 0.531, + "num_tokens": 361004858.0, + "step": 3836 + }, + { + "epoch": 0.6548899129544291, + "grad_norm": 0.5757929451370575, + "learning_rate": 1.3811230585424136e-05, + "loss": 0.5966, + "num_tokens": 361095720.0, + "step": 3837 + }, + { + "epoch": 0.6550605905444615, + "grad_norm": 0.47335493281966323, + "learning_rate": 1.3804403481822836e-05, + "loss": 0.588, + "num_tokens": 361191803.0, + "step": 3838 + }, + { + "epoch": 0.655231268134494, + "grad_norm": 0.4469422686721553, + "learning_rate": 1.379757637822154e-05, + "loss": 0.5872, + "num_tokens": 361303048.0, + "step": 3839 + }, + { + "epoch": 0.6554019457245264, + "grad_norm": 0.49706397009095593, + "learning_rate": 1.3790749274620244e-05, + "loss": 0.5718, + "num_tokens": 361381006.0, + "step": 3840 + }, + { + "epoch": 0.6555726233145588, + "grad_norm": 0.43484543141682247, + "learning_rate": 1.3783922171018946e-05, + "loss": 0.5354, + "num_tokens": 361488160.0, + "step": 3841 + }, + { + "epoch": 0.6557433009045912, + "grad_norm": 0.4861595081507606, + "learning_rate": 1.377709506741765e-05, + "loss": 0.6466, + "num_tokens": 361592977.0, + "step": 3842 + }, + { + "epoch": 0.6559139784946236, + "grad_norm": 0.47344903594427895, + "learning_rate": 1.3770267963816353e-05, + "loss": 0.6212, + "num_tokens": 361710327.0, + "step": 3843 + }, + { + "epoch": 0.656084656084656, + "grad_norm": 0.47487003140995504, + "learning_rate": 1.3763440860215056e-05, + "loss": 0.562, + "num_tokens": 361792686.0, + "step": 3844 + }, + { + "epoch": 0.6562553336746885, + "grad_norm": 0.442425311545529, + "learning_rate": 1.3756613756613758e-05, + "loss": 0.5177, + "num_tokens": 361906590.0, + "step": 3845 + }, + { + "epoch": 0.6564260112647209, + "grad_norm": 0.46280129930172653, + "learning_rate": 1.374978665301246e-05, + "loss": 0.6114, + "num_tokens": 362009758.0, + "step": 3846 + }, + { + "epoch": 0.6565966888547534, + "grad_norm": 0.46018879007957286, + "learning_rate": 1.3742959549411163e-05, + "loss": 0.4948, + "num_tokens": 362096133.0, + "step": 3847 + }, + { + "epoch": 0.6567673664447858, + "grad_norm": 0.5199273694038551, + "learning_rate": 1.3736132445809867e-05, + "loss": 0.6525, + "num_tokens": 362187768.0, + "step": 3848 + }, + { + "epoch": 0.6569380440348183, + "grad_norm": 0.49922608597260465, + "learning_rate": 1.372930534220857e-05, + "loss": 0.5936, + "num_tokens": 362282605.0, + "step": 3849 + }, + { + "epoch": 0.6571087216248507, + "grad_norm": 0.4482358578309397, + "learning_rate": 1.3722478238607273e-05, + "loss": 0.6662, + "num_tokens": 362399966.0, + "step": 3850 + }, + { + "epoch": 0.6572793992148831, + "grad_norm": 0.48430908163091957, + "learning_rate": 1.3715651135005973e-05, + "loss": 0.6159, + "num_tokens": 362495109.0, + "step": 3851 + }, + { + "epoch": 0.6574500768049155, + "grad_norm": 0.4639578483438303, + "learning_rate": 1.3708824031404677e-05, + "loss": 0.5711, + "num_tokens": 362586814.0, + "step": 3852 + }, + { + "epoch": 0.6576207543949479, + "grad_norm": 0.4773415211107689, + "learning_rate": 1.3701996927803381e-05, + "loss": 0.5699, + "num_tokens": 362673763.0, + "step": 3853 + }, + { + "epoch": 0.6577914319849804, + "grad_norm": 0.48879983215784506, + "learning_rate": 1.3695169824202083e-05, + "loss": 0.51, + "num_tokens": 362751817.0, + "step": 3854 + }, + { + "epoch": 0.6579621095750128, + "grad_norm": 0.47587606704829305, + "learning_rate": 1.3688342720600787e-05, + "loss": 0.6143, + "num_tokens": 362854457.0, + "step": 3855 + }, + { + "epoch": 0.6581327871650452, + "grad_norm": 0.49575094076396786, + "learning_rate": 1.368151561699949e-05, + "loss": 0.6411, + "num_tokens": 362943245.0, + "step": 3856 + }, + { + "epoch": 0.6583034647550776, + "grad_norm": 0.48861598435958487, + "learning_rate": 1.3674688513398191e-05, + "loss": 0.6705, + "num_tokens": 363042520.0, + "step": 3857 + }, + { + "epoch": 0.65847414234511, + "grad_norm": 0.48341797585162827, + "learning_rate": 1.3667861409796895e-05, + "loss": 0.5824, + "num_tokens": 363136031.0, + "step": 3858 + }, + { + "epoch": 0.6586448199351426, + "grad_norm": 0.46725213663166304, + "learning_rate": 1.3661034306195597e-05, + "loss": 0.6268, + "num_tokens": 363234006.0, + "step": 3859 + }, + { + "epoch": 0.658815497525175, + "grad_norm": 0.4931048048171122, + "learning_rate": 1.36542072025943e-05, + "loss": 0.6134, + "num_tokens": 363325240.0, + "step": 3860 + }, + { + "epoch": 0.6589861751152074, + "grad_norm": 0.4847845144139469, + "learning_rate": 1.3647380098993004e-05, + "loss": 0.5639, + "num_tokens": 363416207.0, + "step": 3861 + }, + { + "epoch": 0.6591568527052398, + "grad_norm": 0.46882004002283767, + "learning_rate": 1.3640552995391706e-05, + "loss": 0.5423, + "num_tokens": 363500021.0, + "step": 3862 + }, + { + "epoch": 0.6593275302952722, + "grad_norm": 0.4859102114932772, + "learning_rate": 1.3633725891790408e-05, + "loss": 0.5727, + "num_tokens": 363588761.0, + "step": 3863 + }, + { + "epoch": 0.6594982078853047, + "grad_norm": 0.4844895718136432, + "learning_rate": 1.362689878818911e-05, + "loss": 0.5283, + "num_tokens": 363669923.0, + "step": 3864 + }, + { + "epoch": 0.6596688854753371, + "grad_norm": 0.4964913259096266, + "learning_rate": 1.3620071684587814e-05, + "loss": 0.6087, + "num_tokens": 363754801.0, + "step": 3865 + }, + { + "epoch": 0.6598395630653695, + "grad_norm": 0.5240712327122897, + "learning_rate": 1.3613244580986518e-05, + "loss": 0.6576, + "num_tokens": 363844255.0, + "step": 3866 + }, + { + "epoch": 0.6600102406554019, + "grad_norm": 0.45436054425535355, + "learning_rate": 1.360641747738522e-05, + "loss": 0.6251, + "num_tokens": 363960685.0, + "step": 3867 + }, + { + "epoch": 0.6601809182454343, + "grad_norm": 0.5339749741248881, + "learning_rate": 1.3599590373783924e-05, + "loss": 0.5333, + "num_tokens": 364030461.0, + "step": 3868 + }, + { + "epoch": 0.6603515958354668, + "grad_norm": 0.43954772104875783, + "learning_rate": 1.3592763270182626e-05, + "loss": 0.563, + "num_tokens": 364138622.0, + "step": 3869 + }, + { + "epoch": 0.6605222734254992, + "grad_norm": 0.5404528285182921, + "learning_rate": 1.3585936166581328e-05, + "loss": 0.58, + "num_tokens": 364219300.0, + "step": 3870 + }, + { + "epoch": 0.6606929510155317, + "grad_norm": 0.4458182578221126, + "learning_rate": 1.3579109062980032e-05, + "loss": 0.5655, + "num_tokens": 364333521.0, + "step": 3871 + }, + { + "epoch": 0.6608636286055641, + "grad_norm": 0.5590390059422352, + "learning_rate": 1.3572281959378735e-05, + "loss": 0.5754, + "num_tokens": 364415357.0, + "step": 3872 + }, + { + "epoch": 0.6610343061955966, + "grad_norm": 0.46700229721333314, + "learning_rate": 1.3565454855777438e-05, + "loss": 0.5134, + "num_tokens": 364512313.0, + "step": 3873 + }, + { + "epoch": 0.661204983785629, + "grad_norm": 0.44910310152083815, + "learning_rate": 1.3558627752176141e-05, + "loss": 0.5374, + "num_tokens": 364618434.0, + "step": 3874 + }, + { + "epoch": 0.6613756613756614, + "grad_norm": 0.5262434006250625, + "learning_rate": 1.3551800648574845e-05, + "loss": 0.6543, + "num_tokens": 364696947.0, + "step": 3875 + }, + { + "epoch": 0.6615463389656938, + "grad_norm": 0.5201023426647235, + "learning_rate": 1.3544973544973545e-05, + "loss": 0.6271, + "num_tokens": 364790363.0, + "step": 3876 + }, + { + "epoch": 0.6617170165557262, + "grad_norm": 0.5064768933774368, + "learning_rate": 1.353814644137225e-05, + "loss": 0.5623, + "num_tokens": 364867253.0, + "step": 3877 + }, + { + "epoch": 0.6618876941457587, + "grad_norm": 0.4381762921032272, + "learning_rate": 1.3531319337770951e-05, + "loss": 0.6004, + "num_tokens": 364987837.0, + "step": 3878 + }, + { + "epoch": 0.6620583717357911, + "grad_norm": 0.4476766985155958, + "learning_rate": 1.3524492234169655e-05, + "loss": 0.5693, + "num_tokens": 365090954.0, + "step": 3879 + }, + { + "epoch": 0.6622290493258235, + "grad_norm": 0.4901774184494246, + "learning_rate": 1.3517665130568359e-05, + "loss": 0.5989, + "num_tokens": 365179209.0, + "step": 3880 + }, + { + "epoch": 0.6623997269158559, + "grad_norm": 0.4773411585180803, + "learning_rate": 1.3510838026967061e-05, + "loss": 0.5837, + "num_tokens": 365268462.0, + "step": 3881 + }, + { + "epoch": 0.6625704045058883, + "grad_norm": 0.4706484490210508, + "learning_rate": 1.3504010923365763e-05, + "loss": 0.5719, + "num_tokens": 365367679.0, + "step": 3882 + }, + { + "epoch": 0.6627410820959208, + "grad_norm": 0.5451557908769266, + "learning_rate": 1.3497183819764465e-05, + "loss": 0.5761, + "num_tokens": 365440078.0, + "step": 3883 + }, + { + "epoch": 0.6629117596859533, + "grad_norm": 0.5146778705425954, + "learning_rate": 1.3490356716163169e-05, + "loss": 0.5539, + "num_tokens": 365516659.0, + "step": 3884 + }, + { + "epoch": 0.6630824372759857, + "grad_norm": 0.4783250610052764, + "learning_rate": 1.3483529612561873e-05, + "loss": 0.5073, + "num_tokens": 365597268.0, + "step": 3885 + }, + { + "epoch": 0.6632531148660181, + "grad_norm": 0.5284659316820723, + "learning_rate": 1.3476702508960575e-05, + "loss": 0.5947, + "num_tokens": 365674682.0, + "step": 3886 + }, + { + "epoch": 0.6634237924560505, + "grad_norm": 0.48087486450286565, + "learning_rate": 1.3469875405359278e-05, + "loss": 0.5022, + "num_tokens": 365745696.0, + "step": 3887 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.4758439553270209, + "learning_rate": 1.3463048301757979e-05, + "loss": 0.5669, + "num_tokens": 365842111.0, + "step": 3888 + }, + { + "epoch": 0.6637651476361154, + "grad_norm": 0.4917384493645637, + "learning_rate": 1.3456221198156683e-05, + "loss": 0.5259, + "num_tokens": 365926833.0, + "step": 3889 + }, + { + "epoch": 0.6639358252261478, + "grad_norm": 0.46048475148750806, + "learning_rate": 1.3449394094555386e-05, + "loss": 0.5695, + "num_tokens": 366029225.0, + "step": 3890 + }, + { + "epoch": 0.6641065028161802, + "grad_norm": 0.5072643775193107, + "learning_rate": 1.3442566990954088e-05, + "loss": 0.5585, + "num_tokens": 366109952.0, + "step": 3891 + }, + { + "epoch": 0.6642771804062126, + "grad_norm": 0.46807258143500996, + "learning_rate": 1.3435739887352792e-05, + "loss": 0.559, + "num_tokens": 366197881.0, + "step": 3892 + }, + { + "epoch": 0.6644478579962451, + "grad_norm": 0.4468003780211038, + "learning_rate": 1.3428912783751496e-05, + "loss": 0.5977, + "num_tokens": 366311366.0, + "step": 3893 + }, + { + "epoch": 0.6646185355862775, + "grad_norm": 0.4709087979991452, + "learning_rate": 1.3422085680150196e-05, + "loss": 0.62, + "num_tokens": 366414750.0, + "step": 3894 + }, + { + "epoch": 0.6647892131763099, + "grad_norm": 0.483905127038126, + "learning_rate": 1.34152585765489e-05, + "loss": 0.494, + "num_tokens": 366489913.0, + "step": 3895 + }, + { + "epoch": 0.6649598907663424, + "grad_norm": 0.48153906224264775, + "learning_rate": 1.3408431472947602e-05, + "loss": 0.6077, + "num_tokens": 366583627.0, + "step": 3896 + }, + { + "epoch": 0.6651305683563749, + "grad_norm": 0.4523695685062877, + "learning_rate": 1.3401604369346306e-05, + "loss": 0.597, + "num_tokens": 366690692.0, + "step": 3897 + }, + { + "epoch": 0.6653012459464073, + "grad_norm": 0.4640931836133353, + "learning_rate": 1.339477726574501e-05, + "loss": 0.6443, + "num_tokens": 366801858.0, + "step": 3898 + }, + { + "epoch": 0.6654719235364397, + "grad_norm": 0.4930684426229937, + "learning_rate": 1.3387950162143712e-05, + "loss": 0.481, + "num_tokens": 366881346.0, + "step": 3899 + }, + { + "epoch": 0.6656426011264721, + "grad_norm": 0.4354349849340033, + "learning_rate": 1.3381123058542414e-05, + "loss": 0.5328, + "num_tokens": 366989461.0, + "step": 3900 + }, + { + "epoch": 0.6658132787165045, + "grad_norm": 0.5413073345400635, + "learning_rate": 1.3374295954941117e-05, + "loss": 0.5045, + "num_tokens": 367081799.0, + "step": 3901 + }, + { + "epoch": 0.665983956306537, + "grad_norm": 0.549986675776134, + "learning_rate": 1.336746885133982e-05, + "loss": 0.5585, + "num_tokens": 367149466.0, + "step": 3902 + }, + { + "epoch": 0.6661546338965694, + "grad_norm": 0.4718377973415796, + "learning_rate": 1.3360641747738523e-05, + "loss": 0.5956, + "num_tokens": 367252723.0, + "step": 3903 + }, + { + "epoch": 0.6663253114866018, + "grad_norm": 0.5427661063119847, + "learning_rate": 1.3353814644137227e-05, + "loss": 0.6714, + "num_tokens": 367348346.0, + "step": 3904 + }, + { + "epoch": 0.6664959890766342, + "grad_norm": 0.47243663929427127, + "learning_rate": 1.334698754053593e-05, + "loss": 0.6018, + "num_tokens": 367448078.0, + "step": 3905 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.45987168143886736, + "learning_rate": 1.3340160436934631e-05, + "loss": 0.6429, + "num_tokens": 367562428.0, + "step": 3906 + }, + { + "epoch": 0.666837344256699, + "grad_norm": 0.44058195082701784, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.6336, + "num_tokens": 367678712.0, + "step": 3907 + }, + { + "epoch": 0.6670080218467315, + "grad_norm": 0.5079800186377516, + "learning_rate": 1.3326506229732037e-05, + "loss": 0.6066, + "num_tokens": 367771069.0, + "step": 3908 + }, + { + "epoch": 0.667178699436764, + "grad_norm": 0.43435918763356135, + "learning_rate": 1.331967912613074e-05, + "loss": 0.6009, + "num_tokens": 367890124.0, + "step": 3909 + }, + { + "epoch": 0.6673493770267964, + "grad_norm": 0.4738886606817851, + "learning_rate": 1.3312852022529443e-05, + "loss": 0.5716, + "num_tokens": 367987285.0, + "step": 3910 + }, + { + "epoch": 0.6675200546168288, + "grad_norm": 0.5054669784639049, + "learning_rate": 1.3306024918928147e-05, + "loss": 0.5851, + "num_tokens": 368073108.0, + "step": 3911 + }, + { + "epoch": 0.6676907322068613, + "grad_norm": 0.44863718837625305, + "learning_rate": 1.329919781532685e-05, + "loss": 0.5563, + "num_tokens": 368169836.0, + "step": 3912 + }, + { + "epoch": 0.6678614097968937, + "grad_norm": 0.4832844885958517, + "learning_rate": 1.329237071172555e-05, + "loss": 0.5392, + "num_tokens": 368256300.0, + "step": 3913 + }, + { + "epoch": 0.6680320873869261, + "grad_norm": 0.4614198883947568, + "learning_rate": 1.3285543608124255e-05, + "loss": 0.586, + "num_tokens": 368359679.0, + "step": 3914 + }, + { + "epoch": 0.6682027649769585, + "grad_norm": 0.5614347400781755, + "learning_rate": 1.3278716504522957e-05, + "loss": 0.6733, + "num_tokens": 368428215.0, + "step": 3915 + }, + { + "epoch": 0.6683734425669909, + "grad_norm": 0.3989020829987458, + "learning_rate": 1.327188940092166e-05, + "loss": 0.5173, + "num_tokens": 368558774.0, + "step": 3916 + }, + { + "epoch": 0.6685441201570234, + "grad_norm": 0.4995362728923673, + "learning_rate": 1.3265062297320364e-05, + "loss": 0.5046, + "num_tokens": 368636531.0, + "step": 3917 + }, + { + "epoch": 0.6687147977470558, + "grad_norm": 0.5403513042804551, + "learning_rate": 1.3258235193719066e-05, + "loss": 0.6608, + "num_tokens": 368721908.0, + "step": 3918 + }, + { + "epoch": 0.6688854753370882, + "grad_norm": 0.41583835953363724, + "learning_rate": 1.3251408090117768e-05, + "loss": 0.5023, + "num_tokens": 368829971.0, + "step": 3919 + }, + { + "epoch": 0.6690561529271206, + "grad_norm": 0.4675339979005938, + "learning_rate": 1.324458098651647e-05, + "loss": 0.6641, + "num_tokens": 368953536.0, + "step": 3920 + }, + { + "epoch": 0.6692268305171531, + "grad_norm": 0.4692610056469804, + "learning_rate": 1.3237753882915174e-05, + "loss": 0.6242, + "num_tokens": 369052120.0, + "step": 3921 + }, + { + "epoch": 0.6693975081071856, + "grad_norm": 0.4886875527172367, + "learning_rate": 1.3230926779313878e-05, + "loss": 0.5556, + "num_tokens": 369132887.0, + "step": 3922 + }, + { + "epoch": 0.669568185697218, + "grad_norm": 0.44807815673139656, + "learning_rate": 1.322409967571258e-05, + "loss": 0.5242, + "num_tokens": 369229738.0, + "step": 3923 + }, + { + "epoch": 0.6697388632872504, + "grad_norm": 0.4845760094393384, + "learning_rate": 1.3217272572111284e-05, + "loss": 0.5615, + "num_tokens": 369317223.0, + "step": 3924 + }, + { + "epoch": 0.6699095408772828, + "grad_norm": 0.4730877862858023, + "learning_rate": 1.3210445468509984e-05, + "loss": 0.5812, + "num_tokens": 369413333.0, + "step": 3925 + }, + { + "epoch": 0.6700802184673152, + "grad_norm": 0.46345381787556394, + "learning_rate": 1.3203618364908688e-05, + "loss": 0.5272, + "num_tokens": 369504426.0, + "step": 3926 + }, + { + "epoch": 0.6702508960573477, + "grad_norm": 0.43509041327947295, + "learning_rate": 1.3196791261307392e-05, + "loss": 0.5962, + "num_tokens": 369623353.0, + "step": 3927 + }, + { + "epoch": 0.6704215736473801, + "grad_norm": 0.47421324070794063, + "learning_rate": 1.3189964157706094e-05, + "loss": 0.6358, + "num_tokens": 369726183.0, + "step": 3928 + }, + { + "epoch": 0.6705922512374125, + "grad_norm": 0.522656948766794, + "learning_rate": 1.3183137054104797e-05, + "loss": 0.5813, + "num_tokens": 369809992.0, + "step": 3929 + }, + { + "epoch": 0.6707629288274449, + "grad_norm": 0.5629849587636582, + "learning_rate": 1.3176309950503501e-05, + "loss": 0.5955, + "num_tokens": 369884565.0, + "step": 3930 + }, + { + "epoch": 0.6709336064174773, + "grad_norm": 0.4563283812427275, + "learning_rate": 1.3169482846902202e-05, + "loss": 0.5311, + "num_tokens": 369979825.0, + "step": 3931 + }, + { + "epoch": 0.6711042840075098, + "grad_norm": 0.5297552232792441, + "learning_rate": 1.3162655743300905e-05, + "loss": 0.5443, + "num_tokens": 370048712.0, + "step": 3932 + }, + { + "epoch": 0.6712749615975423, + "grad_norm": 0.44006101274232795, + "learning_rate": 1.3155828639699607e-05, + "loss": 0.4604, + "num_tokens": 370137061.0, + "step": 3933 + }, + { + "epoch": 0.6714456391875747, + "grad_norm": 0.4715885369183258, + "learning_rate": 1.3149001536098311e-05, + "loss": 0.6032, + "num_tokens": 370231380.0, + "step": 3934 + }, + { + "epoch": 0.6716163167776071, + "grad_norm": 0.47471174736142724, + "learning_rate": 1.3142174432497015e-05, + "loss": 0.6287, + "num_tokens": 370326872.0, + "step": 3935 + }, + { + "epoch": 0.6717869943676396, + "grad_norm": 0.5096784268127553, + "learning_rate": 1.3135347328895717e-05, + "loss": 0.5666, + "num_tokens": 370407851.0, + "step": 3936 + }, + { + "epoch": 0.671957671957672, + "grad_norm": 0.550460980030132, + "learning_rate": 1.3128520225294419e-05, + "loss": 0.5654, + "num_tokens": 370480855.0, + "step": 3937 + }, + { + "epoch": 0.6721283495477044, + "grad_norm": 0.4482154268881826, + "learning_rate": 1.3121693121693123e-05, + "loss": 0.5897, + "num_tokens": 370592762.0, + "step": 3938 + }, + { + "epoch": 0.6722990271377368, + "grad_norm": 0.49236923350843836, + "learning_rate": 1.3114866018091825e-05, + "loss": 0.5555, + "num_tokens": 370682071.0, + "step": 3939 + }, + { + "epoch": 0.6724697047277692, + "grad_norm": 0.4645556556035112, + "learning_rate": 1.3108038914490529e-05, + "loss": 0.5575, + "num_tokens": 370778782.0, + "step": 3940 + }, + { + "epoch": 0.6726403823178017, + "grad_norm": 0.5442119468998368, + "learning_rate": 1.3101211810889232e-05, + "loss": 0.5865, + "num_tokens": 370877948.0, + "step": 3941 + }, + { + "epoch": 0.6728110599078341, + "grad_norm": 0.41274817736462865, + "learning_rate": 1.3094384707287935e-05, + "loss": 0.4812, + "num_tokens": 370991265.0, + "step": 3942 + }, + { + "epoch": 0.6729817374978665, + "grad_norm": 0.4318675923781927, + "learning_rate": 1.3087557603686638e-05, + "loss": 0.5593, + "num_tokens": 371110208.0, + "step": 3943 + }, + { + "epoch": 0.6731524150878989, + "grad_norm": 0.49398476900863464, + "learning_rate": 1.3080730500085339e-05, + "loss": 0.5022, + "num_tokens": 371185312.0, + "step": 3944 + }, + { + "epoch": 0.6733230926779313, + "grad_norm": 0.4895199848831399, + "learning_rate": 1.3073903396484042e-05, + "loss": 0.5561, + "num_tokens": 371269487.0, + "step": 3945 + }, + { + "epoch": 0.6734937702679639, + "grad_norm": 0.45079761094101095, + "learning_rate": 1.3067076292882746e-05, + "loss": 0.5239, + "num_tokens": 371360083.0, + "step": 3946 + }, + { + "epoch": 0.6736644478579963, + "grad_norm": 0.4172355030909849, + "learning_rate": 1.3060249189281448e-05, + "loss": 0.5465, + "num_tokens": 371477200.0, + "step": 3947 + }, + { + "epoch": 0.6738351254480287, + "grad_norm": 0.47589173388707723, + "learning_rate": 1.3053422085680152e-05, + "loss": 0.5552, + "num_tokens": 371564593.0, + "step": 3948 + }, + { + "epoch": 0.6740058030380611, + "grad_norm": 0.4932993887730987, + "learning_rate": 1.3046594982078856e-05, + "loss": 0.601, + "num_tokens": 371652666.0, + "step": 3949 + }, + { + "epoch": 0.6741764806280935, + "grad_norm": 0.4520444687615404, + "learning_rate": 1.3039767878477556e-05, + "loss": 0.5546, + "num_tokens": 371756174.0, + "step": 3950 + }, + { + "epoch": 0.674347158218126, + "grad_norm": 0.4678625848778644, + "learning_rate": 1.303294077487626e-05, + "loss": 0.5077, + "num_tokens": 371841444.0, + "step": 3951 + }, + { + "epoch": 0.6745178358081584, + "grad_norm": 0.5098638786240405, + "learning_rate": 1.3026113671274962e-05, + "loss": 0.5446, + "num_tokens": 371927800.0, + "step": 3952 + }, + { + "epoch": 0.6746885133981908, + "grad_norm": 0.5342670218190857, + "learning_rate": 1.3019286567673666e-05, + "loss": 0.4723, + "num_tokens": 371985193.0, + "step": 3953 + }, + { + "epoch": 0.6748591909882232, + "grad_norm": 0.47351783144531995, + "learning_rate": 1.301245946407237e-05, + "loss": 0.6899, + "num_tokens": 372095587.0, + "step": 3954 + }, + { + "epoch": 0.6750298685782556, + "grad_norm": 0.4519901576644261, + "learning_rate": 1.3005632360471072e-05, + "loss": 0.6201, + "num_tokens": 372216165.0, + "step": 3955 + }, + { + "epoch": 0.6752005461682881, + "grad_norm": 0.44024301916018543, + "learning_rate": 1.2998805256869774e-05, + "loss": 0.6081, + "num_tokens": 372327487.0, + "step": 3956 + }, + { + "epoch": 0.6753712237583205, + "grad_norm": 0.47490491629724535, + "learning_rate": 1.2991978153268476e-05, + "loss": 0.5726, + "num_tokens": 372420314.0, + "step": 3957 + }, + { + "epoch": 0.675541901348353, + "grad_norm": 0.5160588372149131, + "learning_rate": 1.298515104966718e-05, + "loss": 0.5587, + "num_tokens": 372496063.0, + "step": 3958 + }, + { + "epoch": 0.6757125789383854, + "grad_norm": 0.47087352543192884, + "learning_rate": 1.2978323946065883e-05, + "loss": 0.5034, + "num_tokens": 372576292.0, + "step": 3959 + }, + { + "epoch": 0.6758832565284179, + "grad_norm": 0.5375732964760931, + "learning_rate": 1.2971496842464585e-05, + "loss": 0.6639, + "num_tokens": 372653300.0, + "step": 3960 + }, + { + "epoch": 0.6760539341184503, + "grad_norm": 0.5244784505998659, + "learning_rate": 1.2964669738863289e-05, + "loss": 0.5805, + "num_tokens": 372725783.0, + "step": 3961 + }, + { + "epoch": 0.6762246117084827, + "grad_norm": 0.44371765120544604, + "learning_rate": 1.295784263526199e-05, + "loss": 0.5496, + "num_tokens": 372829501.0, + "step": 3962 + }, + { + "epoch": 0.6763952892985151, + "grad_norm": 0.4318101128642206, + "learning_rate": 1.2951015531660693e-05, + "loss": 0.5306, + "num_tokens": 372956734.0, + "step": 3963 + }, + { + "epoch": 0.6765659668885475, + "grad_norm": 0.45964076486576644, + "learning_rate": 1.2944188428059397e-05, + "loss": 0.5476, + "num_tokens": 373050401.0, + "step": 3964 + }, + { + "epoch": 0.67673664447858, + "grad_norm": 0.45178352505846103, + "learning_rate": 1.2937361324458099e-05, + "loss": 0.6254, + "num_tokens": 373149843.0, + "step": 3965 + }, + { + "epoch": 0.6769073220686124, + "grad_norm": 0.479381452973111, + "learning_rate": 1.2930534220856803e-05, + "loss": 0.622, + "num_tokens": 373254991.0, + "step": 3966 + }, + { + "epoch": 0.6770779996586448, + "grad_norm": 0.46826821232498084, + "learning_rate": 1.2923707117255507e-05, + "loss": 0.6261, + "num_tokens": 373355851.0, + "step": 3967 + }, + { + "epoch": 0.6772486772486772, + "grad_norm": 0.5175973628130143, + "learning_rate": 1.2916880013654207e-05, + "loss": 0.4656, + "num_tokens": 373420009.0, + "step": 3968 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.4703526614054862, + "learning_rate": 1.291005291005291e-05, + "loss": 0.5593, + "num_tokens": 373514686.0, + "step": 3969 + }, + { + "epoch": 0.6775900324287422, + "grad_norm": 0.5486419527964167, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.4631, + "num_tokens": 373578961.0, + "step": 3970 + }, + { + "epoch": 0.6777607100187746, + "grad_norm": 0.5112740497366018, + "learning_rate": 1.2896398702850317e-05, + "loss": 0.5831, + "num_tokens": 373659220.0, + "step": 3971 + }, + { + "epoch": 0.677931387608807, + "grad_norm": 0.4852943365380353, + "learning_rate": 1.288957159924902e-05, + "loss": 0.5637, + "num_tokens": 373752906.0, + "step": 3972 + }, + { + "epoch": 0.6781020651988394, + "grad_norm": 0.42291431733463675, + "learning_rate": 1.2882744495647724e-05, + "loss": 0.5253, + "num_tokens": 373864418.0, + "step": 3973 + }, + { + "epoch": 0.6782727427888718, + "grad_norm": 0.47922288410543473, + "learning_rate": 1.2875917392046426e-05, + "loss": 0.5647, + "num_tokens": 373956732.0, + "step": 3974 + }, + { + "epoch": 0.6784434203789043, + "grad_norm": 0.49586294412876053, + "learning_rate": 1.2869090288445128e-05, + "loss": 0.5582, + "num_tokens": 374055968.0, + "step": 3975 + }, + { + "epoch": 0.6786140979689367, + "grad_norm": 0.482838573079878, + "learning_rate": 1.286226318484383e-05, + "loss": 0.4865, + "num_tokens": 374137412.0, + "step": 3976 + }, + { + "epoch": 0.6787847755589691, + "grad_norm": 0.42937021485697635, + "learning_rate": 1.2855436081242534e-05, + "loss": 0.5384, + "num_tokens": 374250058.0, + "step": 3977 + }, + { + "epoch": 0.6789554531490015, + "grad_norm": 0.4841046605588727, + "learning_rate": 1.2848608977641238e-05, + "loss": 0.5594, + "num_tokens": 374341358.0, + "step": 3978 + }, + { + "epoch": 0.6791261307390339, + "grad_norm": 0.4810300559091305, + "learning_rate": 1.284178187403994e-05, + "loss": 0.6545, + "num_tokens": 374449506.0, + "step": 3979 + }, + { + "epoch": 0.6792968083290664, + "grad_norm": 0.49583356444898374, + "learning_rate": 1.2834954770438644e-05, + "loss": 0.5578, + "num_tokens": 374534068.0, + "step": 3980 + }, + { + "epoch": 0.6794674859190988, + "grad_norm": 0.4832047631064496, + "learning_rate": 1.2828127666837344e-05, + "loss": 0.5995, + "num_tokens": 374630059.0, + "step": 3981 + }, + { + "epoch": 0.6796381635091312, + "grad_norm": 0.5009175749615926, + "learning_rate": 1.2821300563236048e-05, + "loss": 0.5917, + "num_tokens": 374722425.0, + "step": 3982 + }, + { + "epoch": 0.6798088410991637, + "grad_norm": 0.526994307319266, + "learning_rate": 1.2814473459634752e-05, + "loss": 0.4939, + "num_tokens": 374789911.0, + "step": 3983 + }, + { + "epoch": 0.6799795186891961, + "grad_norm": 0.4628120028205401, + "learning_rate": 1.2807646356033454e-05, + "loss": 0.5968, + "num_tokens": 374886445.0, + "step": 3984 + }, + { + "epoch": 0.6801501962792286, + "grad_norm": 0.5094668807061902, + "learning_rate": 1.2800819252432157e-05, + "loss": 0.6628, + "num_tokens": 374990935.0, + "step": 3985 + }, + { + "epoch": 0.680320873869261, + "grad_norm": 0.5078004848459915, + "learning_rate": 1.2793992148830861e-05, + "loss": 0.6272, + "num_tokens": 375086412.0, + "step": 3986 + }, + { + "epoch": 0.6804915514592934, + "grad_norm": 0.5325557818356629, + "learning_rate": 1.2787165045229561e-05, + "loss": 0.6338, + "num_tokens": 375171145.0, + "step": 3987 + }, + { + "epoch": 0.6806622290493258, + "grad_norm": 0.5528878102579359, + "learning_rate": 1.2780337941628265e-05, + "loss": 0.5747, + "num_tokens": 375242894.0, + "step": 3988 + }, + { + "epoch": 0.6808329066393582, + "grad_norm": 0.5009449516449519, + "learning_rate": 1.2773510838026967e-05, + "loss": 0.5134, + "num_tokens": 375312671.0, + "step": 3989 + }, + { + "epoch": 0.6810035842293907, + "grad_norm": 0.41792388126071, + "learning_rate": 1.2766683734425671e-05, + "loss": 0.5155, + "num_tokens": 375430873.0, + "step": 3990 + }, + { + "epoch": 0.6811742618194231, + "grad_norm": 0.4870826623173786, + "learning_rate": 1.2759856630824375e-05, + "loss": 0.7072, + "num_tokens": 375543147.0, + "step": 3991 + }, + { + "epoch": 0.6813449394094555, + "grad_norm": 0.7798893663304252, + "learning_rate": 1.2753029527223077e-05, + "loss": 0.6427, + "num_tokens": 375643525.0, + "step": 3992 + }, + { + "epoch": 0.6815156169994879, + "grad_norm": 0.5170430908717488, + "learning_rate": 1.2746202423621779e-05, + "loss": 0.6013, + "num_tokens": 375726049.0, + "step": 3993 + }, + { + "epoch": 0.6816862945895203, + "grad_norm": 0.518984081683617, + "learning_rate": 1.2739375320020481e-05, + "loss": 0.5648, + "num_tokens": 375800069.0, + "step": 3994 + }, + { + "epoch": 0.6818569721795529, + "grad_norm": 0.4673278891359286, + "learning_rate": 1.2732548216419185e-05, + "loss": 0.4575, + "num_tokens": 375886742.0, + "step": 3995 + }, + { + "epoch": 0.6820276497695853, + "grad_norm": 0.5349828310794014, + "learning_rate": 1.2725721112817889e-05, + "loss": 0.5673, + "num_tokens": 375975622.0, + "step": 3996 + }, + { + "epoch": 0.6821983273596177, + "grad_norm": 0.4736897301530035, + "learning_rate": 1.271889400921659e-05, + "loss": 0.5895, + "num_tokens": 376073021.0, + "step": 3997 + }, + { + "epoch": 0.6823690049496501, + "grad_norm": 0.4396162038929571, + "learning_rate": 1.2712066905615294e-05, + "loss": 0.5747, + "num_tokens": 376179605.0, + "step": 3998 + }, + { + "epoch": 0.6825396825396826, + "grad_norm": 0.41313397260575235, + "learning_rate": 1.2705239802013995e-05, + "loss": 0.5057, + "num_tokens": 376298028.0, + "step": 3999 + }, + { + "epoch": 0.682710360129715, + "grad_norm": 0.5007542436673901, + "learning_rate": 1.2698412698412699e-05, + "loss": 0.5575, + "num_tokens": 376378459.0, + "step": 4000 + }, + { + "epoch": 0.6828810377197474, + "grad_norm": 0.46710920856908594, + "learning_rate": 1.2691585594811402e-05, + "loss": 0.5934, + "num_tokens": 376507883.0, + "step": 4001 + }, + { + "epoch": 0.6830517153097798, + "grad_norm": 0.45788473064042273, + "learning_rate": 1.2684758491210104e-05, + "loss": 0.665, + "num_tokens": 376640253.0, + "step": 4002 + }, + { + "epoch": 0.6832223928998122, + "grad_norm": 0.4928340237578695, + "learning_rate": 1.2677931387608808e-05, + "loss": 0.5535, + "num_tokens": 376719843.0, + "step": 4003 + }, + { + "epoch": 0.6833930704898447, + "grad_norm": 0.5967309056662397, + "learning_rate": 1.2671104284007512e-05, + "loss": 0.6641, + "num_tokens": 376814272.0, + "step": 4004 + }, + { + "epoch": 0.6835637480798771, + "grad_norm": 0.44562501549381434, + "learning_rate": 1.2664277180406214e-05, + "loss": 0.6094, + "num_tokens": 376927617.0, + "step": 4005 + }, + { + "epoch": 0.6837344256699095, + "grad_norm": 0.5276424684287614, + "learning_rate": 1.2657450076804916e-05, + "loss": 0.6026, + "num_tokens": 377006077.0, + "step": 4006 + }, + { + "epoch": 0.6839051032599419, + "grad_norm": 0.5002842600027759, + "learning_rate": 1.265062297320362e-05, + "loss": 0.5393, + "num_tokens": 377084691.0, + "step": 4007 + }, + { + "epoch": 0.6840757808499744, + "grad_norm": 0.4824961189191605, + "learning_rate": 1.2643795869602322e-05, + "loss": 0.567, + "num_tokens": 377179137.0, + "step": 4008 + }, + { + "epoch": 0.6842464584400069, + "grad_norm": 0.4530197900078497, + "learning_rate": 1.2636968766001026e-05, + "loss": 0.5748, + "num_tokens": 377285517.0, + "step": 4009 + }, + { + "epoch": 0.6844171360300393, + "grad_norm": 0.4771373728293529, + "learning_rate": 1.263014166239973e-05, + "loss": 0.6288, + "num_tokens": 377382664.0, + "step": 4010 + }, + { + "epoch": 0.6845878136200717, + "grad_norm": 0.5388506509807646, + "learning_rate": 1.2623314558798431e-05, + "loss": 0.6189, + "num_tokens": 377461086.0, + "step": 4011 + }, + { + "epoch": 0.6847584912101041, + "grad_norm": 0.4633369447734021, + "learning_rate": 1.2616487455197134e-05, + "loss": 0.6021, + "num_tokens": 377563124.0, + "step": 4012 + }, + { + "epoch": 0.6849291688001365, + "grad_norm": 0.46258660344649455, + "learning_rate": 1.2609660351595836e-05, + "loss": 0.5711, + "num_tokens": 377664534.0, + "step": 4013 + }, + { + "epoch": 0.685099846390169, + "grad_norm": 0.49254562154701925, + "learning_rate": 1.260283324799454e-05, + "loss": 0.5808, + "num_tokens": 377756223.0, + "step": 4014 + }, + { + "epoch": 0.6852705239802014, + "grad_norm": 0.46974237827311843, + "learning_rate": 1.2596006144393243e-05, + "loss": 0.5418, + "num_tokens": 377842646.0, + "step": 4015 + }, + { + "epoch": 0.6854412015702338, + "grad_norm": 0.5254622344007362, + "learning_rate": 1.2589179040791945e-05, + "loss": 0.6022, + "num_tokens": 377919052.0, + "step": 4016 + }, + { + "epoch": 0.6856118791602662, + "grad_norm": 0.5350632755480206, + "learning_rate": 1.2582351937190649e-05, + "loss": 0.5153, + "num_tokens": 377990794.0, + "step": 4017 + }, + { + "epoch": 0.6857825567502986, + "grad_norm": 0.5436893898941847, + "learning_rate": 1.257552483358935e-05, + "loss": 0.6305, + "num_tokens": 378083524.0, + "step": 4018 + }, + { + "epoch": 0.6859532343403311, + "grad_norm": 0.4682620879185587, + "learning_rate": 1.2568697729988053e-05, + "loss": 0.6102, + "num_tokens": 378177240.0, + "step": 4019 + }, + { + "epoch": 0.6861239119303636, + "grad_norm": 0.499652155715656, + "learning_rate": 1.2561870626386757e-05, + "loss": 0.5778, + "num_tokens": 378266368.0, + "step": 4020 + }, + { + "epoch": 0.686294589520396, + "grad_norm": 0.5323047922942338, + "learning_rate": 1.2555043522785459e-05, + "loss": 0.5079, + "num_tokens": 378329831.0, + "step": 4021 + }, + { + "epoch": 0.6864652671104284, + "grad_norm": 0.4278035849310555, + "learning_rate": 1.2548216419184163e-05, + "loss": 0.5518, + "num_tokens": 378447572.0, + "step": 4022 + }, + { + "epoch": 0.6866359447004609, + "grad_norm": 0.5470483384286055, + "learning_rate": 1.2541389315582866e-05, + "loss": 0.5659, + "num_tokens": 378514709.0, + "step": 4023 + }, + { + "epoch": 0.6868066222904933, + "grad_norm": 0.43018992337247614, + "learning_rate": 1.2534562211981567e-05, + "loss": 0.4979, + "num_tokens": 378619182.0, + "step": 4024 + }, + { + "epoch": 0.6869772998805257, + "grad_norm": 0.5073740953022507, + "learning_rate": 1.252773510838027e-05, + "loss": 0.6247, + "num_tokens": 378709502.0, + "step": 4025 + }, + { + "epoch": 0.6871479774705581, + "grad_norm": 0.49477186774888265, + "learning_rate": 1.2520908004778973e-05, + "loss": 0.6078, + "num_tokens": 378803480.0, + "step": 4026 + }, + { + "epoch": 0.6873186550605905, + "grad_norm": 0.453094144828214, + "learning_rate": 1.2514080901177676e-05, + "loss": 0.4982, + "num_tokens": 378894070.0, + "step": 4027 + }, + { + "epoch": 0.687489332650623, + "grad_norm": 0.44838988293392007, + "learning_rate": 1.250725379757638e-05, + "loss": 0.5687, + "num_tokens": 379000458.0, + "step": 4028 + }, + { + "epoch": 0.6876600102406554, + "grad_norm": 0.49385735167949923, + "learning_rate": 1.2500426693975082e-05, + "loss": 0.5258, + "num_tokens": 379084061.0, + "step": 4029 + }, + { + "epoch": 0.6878306878306878, + "grad_norm": 0.46204392316872395, + "learning_rate": 1.2493599590373784e-05, + "loss": 0.542, + "num_tokens": 379180181.0, + "step": 4030 + }, + { + "epoch": 0.6880013654207202, + "grad_norm": 0.43223331323832526, + "learning_rate": 1.2486772486772486e-05, + "loss": 0.5681, + "num_tokens": 379297424.0, + "step": 4031 + }, + { + "epoch": 0.6881720430107527, + "grad_norm": 0.4538144503030459, + "learning_rate": 1.247994538317119e-05, + "loss": 0.5885, + "num_tokens": 379406255.0, + "step": 4032 + }, + { + "epoch": 0.6883427206007852, + "grad_norm": 0.46911435586466554, + "learning_rate": 1.2473118279569894e-05, + "loss": 0.5306, + "num_tokens": 379490036.0, + "step": 4033 + }, + { + "epoch": 0.6885133981908176, + "grad_norm": 0.5580320177043012, + "learning_rate": 1.2466291175968596e-05, + "loss": 0.6005, + "num_tokens": 379559450.0, + "step": 4034 + }, + { + "epoch": 0.68868407578085, + "grad_norm": 0.47166784928078126, + "learning_rate": 1.24594640723673e-05, + "loss": 0.5355, + "num_tokens": 379646937.0, + "step": 4035 + }, + { + "epoch": 0.6888547533708824, + "grad_norm": 0.5234184865564476, + "learning_rate": 1.2452636968766004e-05, + "loss": 0.5924, + "num_tokens": 379719631.0, + "step": 4036 + }, + { + "epoch": 0.6890254309609148, + "grad_norm": 0.5137521610700557, + "learning_rate": 1.2445809865164704e-05, + "loss": 0.57, + "num_tokens": 379806896.0, + "step": 4037 + }, + { + "epoch": 0.6891961085509473, + "grad_norm": 0.4795421270342953, + "learning_rate": 1.2438982761563408e-05, + "loss": 0.6234, + "num_tokens": 379914659.0, + "step": 4038 + }, + { + "epoch": 0.6893667861409797, + "grad_norm": 0.5311394655591134, + "learning_rate": 1.243215565796211e-05, + "loss": 0.5719, + "num_tokens": 379987868.0, + "step": 4039 + }, + { + "epoch": 0.6895374637310121, + "grad_norm": 0.511974874374787, + "learning_rate": 1.2425328554360813e-05, + "loss": 0.6034, + "num_tokens": 380067411.0, + "step": 4040 + }, + { + "epoch": 0.6897081413210445, + "grad_norm": 0.4955596832842013, + "learning_rate": 1.2418501450759517e-05, + "loss": 0.5419, + "num_tokens": 380143074.0, + "step": 4041 + }, + { + "epoch": 0.6898788189110769, + "grad_norm": 0.4804937047127304, + "learning_rate": 1.2411674347158221e-05, + "loss": 0.4975, + "num_tokens": 380222104.0, + "step": 4042 + }, + { + "epoch": 0.6900494965011094, + "grad_norm": 0.433240743306625, + "learning_rate": 1.2404847243556921e-05, + "loss": 0.5987, + "num_tokens": 380346009.0, + "step": 4043 + }, + { + "epoch": 0.6902201740911418, + "grad_norm": 0.48420599151813526, + "learning_rate": 1.2398020139955625e-05, + "loss": 0.5788, + "num_tokens": 380436466.0, + "step": 4044 + }, + { + "epoch": 0.6903908516811743, + "grad_norm": 0.4731453396836686, + "learning_rate": 1.2391193036354327e-05, + "loss": 0.6, + "num_tokens": 380534445.0, + "step": 4045 + }, + { + "epoch": 0.6905615292712067, + "grad_norm": 0.47191237618476606, + "learning_rate": 1.2384365932753031e-05, + "loss": 0.6286, + "num_tokens": 380642445.0, + "step": 4046 + }, + { + "epoch": 0.6907322068612392, + "grad_norm": 0.5290557727695562, + "learning_rate": 1.2377538829151735e-05, + "loss": 0.5983, + "num_tokens": 380734530.0, + "step": 4047 + }, + { + "epoch": 0.6909028844512716, + "grad_norm": 0.47287585622700823, + "learning_rate": 1.2370711725550437e-05, + "loss": 0.6172, + "num_tokens": 380834419.0, + "step": 4048 + }, + { + "epoch": 0.691073562041304, + "grad_norm": 0.5279989191247129, + "learning_rate": 1.2363884621949139e-05, + "loss": 0.6222, + "num_tokens": 380919086.0, + "step": 4049 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.47755089851661237, + "learning_rate": 1.2357057518347841e-05, + "loss": 0.6064, + "num_tokens": 381017957.0, + "step": 4050 + }, + { + "epoch": 0.6914149172213688, + "grad_norm": 0.4716655937733939, + "learning_rate": 1.2350230414746545e-05, + "loss": 0.5796, + "num_tokens": 381114080.0, + "step": 4051 + }, + { + "epoch": 0.6915855948114012, + "grad_norm": 0.46183917665298285, + "learning_rate": 1.2343403311145248e-05, + "loss": 0.5309, + "num_tokens": 381204240.0, + "step": 4052 + }, + { + "epoch": 0.6917562724014337, + "grad_norm": 0.4553713791502409, + "learning_rate": 1.233657620754395e-05, + "loss": 0.5623, + "num_tokens": 381303624.0, + "step": 4053 + }, + { + "epoch": 0.6919269499914661, + "grad_norm": 0.45966633833573894, + "learning_rate": 1.2329749103942654e-05, + "loss": 0.5893, + "num_tokens": 381403708.0, + "step": 4054 + }, + { + "epoch": 0.6920976275814985, + "grad_norm": 0.5126112929840628, + "learning_rate": 1.2322922000341355e-05, + "loss": 0.612, + "num_tokens": 381507996.0, + "step": 4055 + }, + { + "epoch": 0.6922683051715309, + "grad_norm": 0.4412261892596809, + "learning_rate": 1.2316094896740058e-05, + "loss": 0.5958, + "num_tokens": 381616231.0, + "step": 4056 + }, + { + "epoch": 0.6924389827615635, + "grad_norm": 0.5199409604884575, + "learning_rate": 1.2309267793138762e-05, + "loss": 0.5946, + "num_tokens": 381700626.0, + "step": 4057 + }, + { + "epoch": 0.6926096603515959, + "grad_norm": 0.5004536355882978, + "learning_rate": 1.2302440689537464e-05, + "loss": 0.5296, + "num_tokens": 381776606.0, + "step": 4058 + }, + { + "epoch": 0.6927803379416283, + "grad_norm": 0.5340610225242278, + "learning_rate": 1.2295613585936168e-05, + "loss": 0.5623, + "num_tokens": 381846468.0, + "step": 4059 + }, + { + "epoch": 0.6929510155316607, + "grad_norm": 0.5290200104684436, + "learning_rate": 1.2288786482334872e-05, + "loss": 0.545, + "num_tokens": 381923632.0, + "step": 4060 + }, + { + "epoch": 0.6931216931216931, + "grad_norm": 0.49215463299086043, + "learning_rate": 1.2281959378733572e-05, + "loss": 0.6017, + "num_tokens": 382021289.0, + "step": 4061 + }, + { + "epoch": 0.6932923707117256, + "grad_norm": 0.48606487259932674, + "learning_rate": 1.2275132275132276e-05, + "loss": 0.5271, + "num_tokens": 382103866.0, + "step": 4062 + }, + { + "epoch": 0.693463048301758, + "grad_norm": 0.4647370571879275, + "learning_rate": 1.2268305171530978e-05, + "loss": 0.5177, + "num_tokens": 382187471.0, + "step": 4063 + }, + { + "epoch": 0.6936337258917904, + "grad_norm": 0.5370922361764342, + "learning_rate": 1.2261478067929682e-05, + "loss": 0.5887, + "num_tokens": 382260235.0, + "step": 4064 + }, + { + "epoch": 0.6938044034818228, + "grad_norm": 0.46547838801904734, + "learning_rate": 1.2254650964328386e-05, + "loss": 0.5894, + "num_tokens": 382361957.0, + "step": 4065 + }, + { + "epoch": 0.6939750810718552, + "grad_norm": 0.44219338404949154, + "learning_rate": 1.2247823860727088e-05, + "loss": 0.6121, + "num_tokens": 382478011.0, + "step": 4066 + }, + { + "epoch": 0.6941457586618877, + "grad_norm": 0.47544560079500564, + "learning_rate": 1.224099675712579e-05, + "loss": 0.5301, + "num_tokens": 382563398.0, + "step": 4067 + }, + { + "epoch": 0.6943164362519201, + "grad_norm": 0.4551338676597597, + "learning_rate": 1.2234169653524492e-05, + "loss": 0.6093, + "num_tokens": 382660760.0, + "step": 4068 + }, + { + "epoch": 0.6944871138419525, + "grad_norm": 0.48884576489496545, + "learning_rate": 1.2227342549923195e-05, + "loss": 0.5847, + "num_tokens": 382761643.0, + "step": 4069 + }, + { + "epoch": 0.694657791431985, + "grad_norm": 0.5311950941231147, + "learning_rate": 1.22205154463219e-05, + "loss": 0.6558, + "num_tokens": 382844064.0, + "step": 4070 + }, + { + "epoch": 0.6948284690220174, + "grad_norm": 0.4562787785903537, + "learning_rate": 1.2213688342720601e-05, + "loss": 0.5867, + "num_tokens": 382946341.0, + "step": 4071 + }, + { + "epoch": 0.6949991466120499, + "grad_norm": 0.4489774176318795, + "learning_rate": 1.2206861239119305e-05, + "loss": 0.5142, + "num_tokens": 383043547.0, + "step": 4072 + }, + { + "epoch": 0.6951698242020823, + "grad_norm": 0.5191170172749052, + "learning_rate": 1.2200034135518009e-05, + "loss": 0.5238, + "num_tokens": 383126662.0, + "step": 4073 + }, + { + "epoch": 0.6953405017921147, + "grad_norm": 0.4662900618847519, + "learning_rate": 1.219320703191671e-05, + "loss": 0.5792, + "num_tokens": 383224838.0, + "step": 4074 + }, + { + "epoch": 0.6955111793821471, + "grad_norm": 0.4994288408397585, + "learning_rate": 1.2186379928315413e-05, + "loss": 0.5954, + "num_tokens": 383312525.0, + "step": 4075 + }, + { + "epoch": 0.6956818569721795, + "grad_norm": 0.5613017039908135, + "learning_rate": 1.2179552824714117e-05, + "loss": 0.5779, + "num_tokens": 383378198.0, + "step": 4076 + }, + { + "epoch": 0.695852534562212, + "grad_norm": 0.4846737178308851, + "learning_rate": 1.2172725721112819e-05, + "loss": 0.5795, + "num_tokens": 383475526.0, + "step": 4077 + }, + { + "epoch": 0.6960232121522444, + "grad_norm": 0.46784073180407176, + "learning_rate": 1.2165898617511523e-05, + "loss": 0.5479, + "num_tokens": 383562515.0, + "step": 4078 + }, + { + "epoch": 0.6961938897422768, + "grad_norm": 0.454240140371569, + "learning_rate": 1.2159071513910226e-05, + "loss": 0.5988, + "num_tokens": 383666269.0, + "step": 4079 + }, + { + "epoch": 0.6963645673323092, + "grad_norm": 0.4528950323368183, + "learning_rate": 1.2152244410308927e-05, + "loss": 0.5212, + "num_tokens": 383762697.0, + "step": 4080 + }, + { + "epoch": 0.6965352449223416, + "grad_norm": 0.497087172696841, + "learning_rate": 1.214541730670763e-05, + "loss": 0.5636, + "num_tokens": 383840281.0, + "step": 4081 + }, + { + "epoch": 0.6967059225123742, + "grad_norm": 0.4465017208586227, + "learning_rate": 1.2138590203106333e-05, + "loss": 0.5035, + "num_tokens": 383944622.0, + "step": 4082 + }, + { + "epoch": 0.6968766001024066, + "grad_norm": 0.477920664090481, + "learning_rate": 1.2131763099505036e-05, + "loss": 0.5452, + "num_tokens": 384036236.0, + "step": 4083 + }, + { + "epoch": 0.697047277692439, + "grad_norm": 0.4573135602714023, + "learning_rate": 1.212493599590374e-05, + "loss": 0.5576, + "num_tokens": 384136963.0, + "step": 4084 + }, + { + "epoch": 0.6972179552824714, + "grad_norm": 0.5138586498152113, + "learning_rate": 1.2118108892302442e-05, + "loss": 0.584, + "num_tokens": 384220301.0, + "step": 4085 + }, + { + "epoch": 0.6973886328725039, + "grad_norm": 0.5073213015452517, + "learning_rate": 1.2111281788701144e-05, + "loss": 0.5483, + "num_tokens": 384290978.0, + "step": 4086 + }, + { + "epoch": 0.6975593104625363, + "grad_norm": 0.49296826031986285, + "learning_rate": 1.2104454685099846e-05, + "loss": 0.6203, + "num_tokens": 384407367.0, + "step": 4087 + }, + { + "epoch": 0.6977299880525687, + "grad_norm": 0.5092827679773567, + "learning_rate": 1.209762758149855e-05, + "loss": 0.5517, + "num_tokens": 384499652.0, + "step": 4088 + }, + { + "epoch": 0.6979006656426011, + "grad_norm": 0.47500974673443863, + "learning_rate": 1.2090800477897254e-05, + "loss": 0.5739, + "num_tokens": 384590374.0, + "step": 4089 + }, + { + "epoch": 0.6980713432326335, + "grad_norm": 0.4792486588484749, + "learning_rate": 1.2083973374295956e-05, + "loss": 0.6713, + "num_tokens": 384707783.0, + "step": 4090 + }, + { + "epoch": 0.698242020822666, + "grad_norm": 0.5074972850223977, + "learning_rate": 1.207714627069466e-05, + "loss": 0.6046, + "num_tokens": 384792223.0, + "step": 4091 + }, + { + "epoch": 0.6984126984126984, + "grad_norm": 0.4513597053395292, + "learning_rate": 1.207031916709336e-05, + "loss": 0.6322, + "num_tokens": 384903810.0, + "step": 4092 + }, + { + "epoch": 0.6985833760027308, + "grad_norm": 0.4606965108238913, + "learning_rate": 1.2063492063492064e-05, + "loss": 0.5773, + "num_tokens": 385009666.0, + "step": 4093 + }, + { + "epoch": 0.6987540535927633, + "grad_norm": 0.4362702584096806, + "learning_rate": 1.2056664959890768e-05, + "loss": 0.5937, + "num_tokens": 385129323.0, + "step": 4094 + }, + { + "epoch": 0.6989247311827957, + "grad_norm": 0.4918086275347048, + "learning_rate": 1.204983785628947e-05, + "loss": 0.6722, + "num_tokens": 385232104.0, + "step": 4095 + }, + { + "epoch": 0.6990954087728282, + "grad_norm": 0.5164245568965276, + "learning_rate": 1.2043010752688173e-05, + "loss": 0.6246, + "num_tokens": 385317431.0, + "step": 4096 + }, + { + "epoch": 0.6992660863628606, + "grad_norm": 0.4429121718117885, + "learning_rate": 1.2036183649086877e-05, + "loss": 0.5538, + "num_tokens": 385421385.0, + "step": 4097 + }, + { + "epoch": 0.699436763952893, + "grad_norm": 0.5064010273685836, + "learning_rate": 1.2029356545485578e-05, + "loss": 0.6038, + "num_tokens": 385512985.0, + "step": 4098 + }, + { + "epoch": 0.6996074415429254, + "grad_norm": 0.4983713392854213, + "learning_rate": 1.2022529441884281e-05, + "loss": 0.5941, + "num_tokens": 385600660.0, + "step": 4099 + }, + { + "epoch": 0.6997781191329578, + "grad_norm": 0.4711965770383893, + "learning_rate": 1.2015702338282983e-05, + "loss": 0.6212, + "num_tokens": 385716851.0, + "step": 4100 + }, + { + "epoch": 0.6999487967229903, + "grad_norm": 0.4592991284258155, + "learning_rate": 1.2008875234681687e-05, + "loss": 0.5983, + "num_tokens": 385815909.0, + "step": 4101 + }, + { + "epoch": 0.7001194743130227, + "grad_norm": 0.48315263148918547, + "learning_rate": 1.2002048131080391e-05, + "loss": 0.5577, + "num_tokens": 385911058.0, + "step": 4102 + }, + { + "epoch": 0.7002901519030551, + "grad_norm": 0.4625280064411512, + "learning_rate": 1.1995221027479093e-05, + "loss": 0.5811, + "num_tokens": 386003613.0, + "step": 4103 + }, + { + "epoch": 0.7004608294930875, + "grad_norm": 0.45073773680556023, + "learning_rate": 1.1988393923877797e-05, + "loss": 0.61, + "num_tokens": 386116346.0, + "step": 4104 + }, + { + "epoch": 0.7006315070831199, + "grad_norm": 0.4797281450030446, + "learning_rate": 1.1981566820276497e-05, + "loss": 0.486, + "num_tokens": 386198496.0, + "step": 4105 + }, + { + "epoch": 0.7008021846731524, + "grad_norm": 0.5221826230793649, + "learning_rate": 1.19747397166752e-05, + "loss": 0.5766, + "num_tokens": 386284154.0, + "step": 4106 + }, + { + "epoch": 0.7009728622631849, + "grad_norm": 0.440929936072235, + "learning_rate": 1.1967912613073905e-05, + "loss": 0.5549, + "num_tokens": 386394358.0, + "step": 4107 + }, + { + "epoch": 0.7011435398532173, + "grad_norm": 0.4867872242468334, + "learning_rate": 1.1961085509472607e-05, + "loss": 0.5291, + "num_tokens": 386514960.0, + "step": 4108 + }, + { + "epoch": 0.7013142174432497, + "grad_norm": 0.4473277090887126, + "learning_rate": 1.195425840587131e-05, + "loss": 0.5738, + "num_tokens": 386616634.0, + "step": 4109 + }, + { + "epoch": 0.7014848950332822, + "grad_norm": 0.5150706882467296, + "learning_rate": 1.1947431302270014e-05, + "loss": 0.6203, + "num_tokens": 386699751.0, + "step": 4110 + }, + { + "epoch": 0.7016555726233146, + "grad_norm": 0.5137330314722427, + "learning_rate": 1.1940604198668715e-05, + "loss": 0.5724, + "num_tokens": 386772855.0, + "step": 4111 + }, + { + "epoch": 0.701826250213347, + "grad_norm": 0.5179589818625412, + "learning_rate": 1.1933777095067418e-05, + "loss": 0.553, + "num_tokens": 386855662.0, + "step": 4112 + }, + { + "epoch": 0.7019969278033794, + "grad_norm": 0.4658119026067818, + "learning_rate": 1.1926949991466122e-05, + "loss": 0.5207, + "num_tokens": 386947518.0, + "step": 4113 + }, + { + "epoch": 0.7021676053934118, + "grad_norm": 0.47551722206467856, + "learning_rate": 1.1920122887864824e-05, + "loss": 0.6397, + "num_tokens": 387048007.0, + "step": 4114 + }, + { + "epoch": 0.7023382829834443, + "grad_norm": 0.5094923225732427, + "learning_rate": 1.1913295784263528e-05, + "loss": 0.5598, + "num_tokens": 387127313.0, + "step": 4115 + }, + { + "epoch": 0.7025089605734767, + "grad_norm": 0.5737273606485829, + "learning_rate": 1.1906468680662232e-05, + "loss": 0.6097, + "num_tokens": 387197702.0, + "step": 4116 + }, + { + "epoch": 0.7026796381635091, + "grad_norm": 0.5455939725478027, + "learning_rate": 1.1899641577060932e-05, + "loss": 0.6246, + "num_tokens": 387279024.0, + "step": 4117 + }, + { + "epoch": 0.7028503157535415, + "grad_norm": 0.47159716738246876, + "learning_rate": 1.1892814473459636e-05, + "loss": 0.5855, + "num_tokens": 387388381.0, + "step": 4118 + }, + { + "epoch": 0.703020993343574, + "grad_norm": 0.49897711903709224, + "learning_rate": 1.1885987369858338e-05, + "loss": 0.5754, + "num_tokens": 387476029.0, + "step": 4119 + }, + { + "epoch": 0.7031916709336065, + "grad_norm": 0.5361073272392255, + "learning_rate": 1.1879160266257042e-05, + "loss": 0.4912, + "num_tokens": 387545297.0, + "step": 4120 + }, + { + "epoch": 0.7033623485236389, + "grad_norm": 0.5153720574080317, + "learning_rate": 1.1872333162655745e-05, + "loss": 0.6424, + "num_tokens": 387634139.0, + "step": 4121 + }, + { + "epoch": 0.7035330261136713, + "grad_norm": 0.48922896284720485, + "learning_rate": 1.1865506059054447e-05, + "loss": 0.5724, + "num_tokens": 387722374.0, + "step": 4122 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.5300479469769814, + "learning_rate": 1.185867895545315e-05, + "loss": 0.5668, + "num_tokens": 387798657.0, + "step": 4123 + }, + { + "epoch": 0.7038743812937361, + "grad_norm": 0.498534968868138, + "learning_rate": 1.1851851851851852e-05, + "loss": 0.5816, + "num_tokens": 387885736.0, + "step": 4124 + }, + { + "epoch": 0.7040450588837686, + "grad_norm": 0.4469669770993118, + "learning_rate": 1.1845024748250555e-05, + "loss": 0.5674, + "num_tokens": 387985495.0, + "step": 4125 + }, + { + "epoch": 0.704215736473801, + "grad_norm": 0.6072948830818997, + "learning_rate": 1.183819764464926e-05, + "loss": 0.6895, + "num_tokens": 388067429.0, + "step": 4126 + }, + { + "epoch": 0.7043864140638334, + "grad_norm": 0.4695087224784077, + "learning_rate": 1.1831370541047961e-05, + "loss": 0.6271, + "num_tokens": 388171507.0, + "step": 4127 + }, + { + "epoch": 0.7045570916538658, + "grad_norm": 0.5616788388089635, + "learning_rate": 1.1824543437446665e-05, + "loss": 0.6122, + "num_tokens": 388245270.0, + "step": 4128 + }, + { + "epoch": 0.7047277692438982, + "grad_norm": 0.5202271142642079, + "learning_rate": 1.1817716333845365e-05, + "loss": 0.5509, + "num_tokens": 388315829.0, + "step": 4129 + }, + { + "epoch": 0.7048984468339307, + "grad_norm": 0.45977910805690847, + "learning_rate": 1.1810889230244069e-05, + "loss": 0.5678, + "num_tokens": 388409929.0, + "step": 4130 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 0.4962189760774949, + "learning_rate": 1.1804062126642773e-05, + "loss": 0.6235, + "num_tokens": 388502353.0, + "step": 4131 + }, + { + "epoch": 0.7052398020139956, + "grad_norm": 0.4430935988467391, + "learning_rate": 1.1797235023041475e-05, + "loss": 0.5954, + "num_tokens": 388611406.0, + "step": 4132 + }, + { + "epoch": 0.705410479604028, + "grad_norm": 0.49181163287906027, + "learning_rate": 1.1790407919440179e-05, + "loss": 0.504, + "num_tokens": 388693290.0, + "step": 4133 + }, + { + "epoch": 0.7055811571940604, + "grad_norm": 0.46996332694526616, + "learning_rate": 1.1783580815838882e-05, + "loss": 0.5939, + "num_tokens": 388789765.0, + "step": 4134 + }, + { + "epoch": 0.7057518347840929, + "grad_norm": 0.4639741478649215, + "learning_rate": 1.1776753712237585e-05, + "loss": 0.5901, + "num_tokens": 388886144.0, + "step": 4135 + }, + { + "epoch": 0.7059225123741253, + "grad_norm": 0.44680707031708133, + "learning_rate": 1.1769926608636287e-05, + "loss": 0.541, + "num_tokens": 388982983.0, + "step": 4136 + }, + { + "epoch": 0.7060931899641577, + "grad_norm": 0.5113767031152918, + "learning_rate": 1.1763099505034989e-05, + "loss": 0.5799, + "num_tokens": 389065716.0, + "step": 4137 + }, + { + "epoch": 0.7062638675541901, + "grad_norm": 0.45003599662460125, + "learning_rate": 1.1756272401433692e-05, + "loss": 0.5395, + "num_tokens": 389167247.0, + "step": 4138 + }, + { + "epoch": 0.7064345451442225, + "grad_norm": 0.5266127172020076, + "learning_rate": 1.1749445297832396e-05, + "loss": 0.5929, + "num_tokens": 389245786.0, + "step": 4139 + }, + { + "epoch": 0.706605222734255, + "grad_norm": 0.4799074028520023, + "learning_rate": 1.1742618194231098e-05, + "loss": 0.6516, + "num_tokens": 389357290.0, + "step": 4140 + }, + { + "epoch": 0.7067759003242874, + "grad_norm": 0.45100202123838, + "learning_rate": 1.1735791090629802e-05, + "loss": 0.5728, + "num_tokens": 389460618.0, + "step": 4141 + }, + { + "epoch": 0.7069465779143198, + "grad_norm": 0.4550728042857259, + "learning_rate": 1.1728963987028502e-05, + "loss": 0.6377, + "num_tokens": 389574928.0, + "step": 4142 + }, + { + "epoch": 0.7071172555043522, + "grad_norm": 0.471279616475047, + "learning_rate": 1.1722136883427206e-05, + "loss": 0.5254, + "num_tokens": 389656875.0, + "step": 4143 + }, + { + "epoch": 0.7072879330943848, + "grad_norm": 0.4293760751552169, + "learning_rate": 1.171530977982591e-05, + "loss": 0.633, + "num_tokens": 389776456.0, + "step": 4144 + }, + { + "epoch": 0.7074586106844172, + "grad_norm": 0.5439217105760531, + "learning_rate": 1.1708482676224614e-05, + "loss": 0.5685, + "num_tokens": 389852003.0, + "step": 4145 + }, + { + "epoch": 0.7076292882744496, + "grad_norm": 0.4641489262208069, + "learning_rate": 1.1701655572623316e-05, + "loss": 0.5186, + "num_tokens": 389936346.0, + "step": 4146 + }, + { + "epoch": 0.707799965864482, + "grad_norm": 0.46690216391868716, + "learning_rate": 1.169482846902202e-05, + "loss": 0.6127, + "num_tokens": 390039130.0, + "step": 4147 + }, + { + "epoch": 0.7079706434545144, + "grad_norm": 0.4865507768783059, + "learning_rate": 1.168800136542072e-05, + "loss": 0.5304, + "num_tokens": 390119440.0, + "step": 4148 + }, + { + "epoch": 0.7081413210445469, + "grad_norm": 0.44386639879508083, + "learning_rate": 1.1681174261819424e-05, + "loss": 0.5818, + "num_tokens": 390229029.0, + "step": 4149 + }, + { + "epoch": 0.7083119986345793, + "grad_norm": 0.566539970379962, + "learning_rate": 1.1674347158218127e-05, + "loss": 0.6115, + "num_tokens": 390296697.0, + "step": 4150 + }, + { + "epoch": 0.7084826762246117, + "grad_norm": 0.4775035536028336, + "learning_rate": 1.166752005461683e-05, + "loss": 0.5572, + "num_tokens": 390384121.0, + "step": 4151 + }, + { + "epoch": 0.7086533538146441, + "grad_norm": 0.5257205587574723, + "learning_rate": 1.1660692951015533e-05, + "loss": 0.6971, + "num_tokens": 390480115.0, + "step": 4152 + }, + { + "epoch": 0.7088240314046765, + "grad_norm": 0.4485352700716774, + "learning_rate": 1.1653865847414237e-05, + "loss": 0.6109, + "num_tokens": 390600360.0, + "step": 4153 + }, + { + "epoch": 0.708994708994709, + "grad_norm": 0.42741389771797933, + "learning_rate": 1.1647038743812937e-05, + "loss": 0.5617, + "num_tokens": 390715247.0, + "step": 4154 + }, + { + "epoch": 0.7091653865847414, + "grad_norm": 0.535568392326428, + "learning_rate": 1.1640211640211641e-05, + "loss": 0.6482, + "num_tokens": 390798135.0, + "step": 4155 + }, + { + "epoch": 0.7093360641747739, + "grad_norm": 0.5619900277389324, + "learning_rate": 1.1633384536610343e-05, + "loss": 0.6313, + "num_tokens": 390862606.0, + "step": 4156 + }, + { + "epoch": 0.7095067417648063, + "grad_norm": 0.4762809618343771, + "learning_rate": 1.1626557433009047e-05, + "loss": 0.6075, + "num_tokens": 390964702.0, + "step": 4157 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.5793691528101894, + "learning_rate": 1.161973032940775e-05, + "loss": 0.6894, + "num_tokens": 391034775.0, + "step": 4158 + }, + { + "epoch": 0.7098480969448712, + "grad_norm": 0.4562038988249158, + "learning_rate": 1.1612903225806453e-05, + "loss": 0.6117, + "num_tokens": 391145964.0, + "step": 4159 + }, + { + "epoch": 0.7100187745349036, + "grad_norm": 0.4834432103175352, + "learning_rate": 1.1606076122205155e-05, + "loss": 0.5086, + "num_tokens": 391230699.0, + "step": 4160 + }, + { + "epoch": 0.710189452124936, + "grad_norm": 0.5002894036568455, + "learning_rate": 1.1599249018603857e-05, + "loss": 0.5662, + "num_tokens": 391316360.0, + "step": 4161 + }, + { + "epoch": 0.7103601297149684, + "grad_norm": 0.46752050551593594, + "learning_rate": 1.159242191500256e-05, + "loss": 0.5611, + "num_tokens": 391401539.0, + "step": 4162 + }, + { + "epoch": 0.7105308073050008, + "grad_norm": 0.4214846935168744, + "learning_rate": 1.1585594811401264e-05, + "loss": 0.6184, + "num_tokens": 391539981.0, + "step": 4163 + }, + { + "epoch": 0.7107014848950333, + "grad_norm": 0.4700432162855328, + "learning_rate": 1.1578767707799967e-05, + "loss": 0.5276, + "num_tokens": 391639570.0, + "step": 4164 + }, + { + "epoch": 0.7108721624850657, + "grad_norm": 0.44950213231519465, + "learning_rate": 1.157194060419867e-05, + "loss": 0.5712, + "num_tokens": 391748337.0, + "step": 4165 + }, + { + "epoch": 0.7110428400750981, + "grad_norm": 0.4510234376025752, + "learning_rate": 1.1565113500597374e-05, + "loss": 0.5241, + "num_tokens": 391845460.0, + "step": 4166 + }, + { + "epoch": 0.7112135176651305, + "grad_norm": 0.5150983933947361, + "learning_rate": 1.1558286396996074e-05, + "loss": 0.5399, + "num_tokens": 391923148.0, + "step": 4167 + }, + { + "epoch": 0.7113841952551629, + "grad_norm": 0.5174045543091362, + "learning_rate": 1.1551459293394778e-05, + "loss": 0.6655, + "num_tokens": 392018433.0, + "step": 4168 + }, + { + "epoch": 0.7115548728451955, + "grad_norm": 0.46114593488945044, + "learning_rate": 1.154463218979348e-05, + "loss": 0.5941, + "num_tokens": 392117750.0, + "step": 4169 + }, + { + "epoch": 0.7117255504352279, + "grad_norm": 0.49318611587337335, + "learning_rate": 1.1537805086192184e-05, + "loss": 0.5574, + "num_tokens": 392203828.0, + "step": 4170 + }, + { + "epoch": 0.7118962280252603, + "grad_norm": 0.4472131132911206, + "learning_rate": 1.1530977982590888e-05, + "loss": 0.5357, + "num_tokens": 392298151.0, + "step": 4171 + }, + { + "epoch": 0.7120669056152927, + "grad_norm": 0.45776158314149157, + "learning_rate": 1.152415087898959e-05, + "loss": 0.5801, + "num_tokens": 392410414.0, + "step": 4172 + }, + { + "epoch": 0.7122375832053252, + "grad_norm": 0.48586553100051166, + "learning_rate": 1.1517323775388292e-05, + "loss": 0.4706, + "num_tokens": 392481467.0, + "step": 4173 + }, + { + "epoch": 0.7124082607953576, + "grad_norm": 0.4554274055597214, + "learning_rate": 1.1510496671786994e-05, + "loss": 0.5963, + "num_tokens": 392580901.0, + "step": 4174 + }, + { + "epoch": 0.71257893838539, + "grad_norm": 0.49527018255153715, + "learning_rate": 1.1503669568185698e-05, + "loss": 0.5785, + "num_tokens": 392665451.0, + "step": 4175 + }, + { + "epoch": 0.7127496159754224, + "grad_norm": 0.43773732566450524, + "learning_rate": 1.1496842464584402e-05, + "loss": 0.4915, + "num_tokens": 392760311.0, + "step": 4176 + }, + { + "epoch": 0.7129202935654548, + "grad_norm": 0.4682743527969606, + "learning_rate": 1.1490015360983104e-05, + "loss": 0.5852, + "num_tokens": 392860046.0, + "step": 4177 + }, + { + "epoch": 0.7130909711554873, + "grad_norm": 0.49649240403115796, + "learning_rate": 1.1483188257381807e-05, + "loss": 0.6317, + "num_tokens": 392955586.0, + "step": 4178 + }, + { + "epoch": 0.7132616487455197, + "grad_norm": 0.48005824183818957, + "learning_rate": 1.147636115378051e-05, + "loss": 0.5842, + "num_tokens": 393045702.0, + "step": 4179 + }, + { + "epoch": 0.7134323263355521, + "grad_norm": 0.506874548233269, + "learning_rate": 1.1469534050179212e-05, + "loss": 0.564, + "num_tokens": 393130559.0, + "step": 4180 + }, + { + "epoch": 0.7136030039255846, + "grad_norm": 0.5587040289719484, + "learning_rate": 1.1462706946577915e-05, + "loss": 0.5812, + "num_tokens": 393204210.0, + "step": 4181 + }, + { + "epoch": 0.713773681515617, + "grad_norm": 0.45735799408219047, + "learning_rate": 1.1455879842976619e-05, + "loss": 0.5438, + "num_tokens": 393299506.0, + "step": 4182 + }, + { + "epoch": 0.7139443591056495, + "grad_norm": 0.49685647805765465, + "learning_rate": 1.1449052739375321e-05, + "loss": 0.5839, + "num_tokens": 393386537.0, + "step": 4183 + }, + { + "epoch": 0.7141150366956819, + "grad_norm": 0.4960685302032027, + "learning_rate": 1.1442225635774025e-05, + "loss": 0.5717, + "num_tokens": 393471218.0, + "step": 4184 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.5010635880712248, + "learning_rate": 1.1435398532172725e-05, + "loss": 0.605, + "num_tokens": 393553272.0, + "step": 4185 + }, + { + "epoch": 0.7144563918757467, + "grad_norm": 0.4437033887987615, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.5193, + "num_tokens": 393650492.0, + "step": 4186 + }, + { + "epoch": 0.7146270694657791, + "grad_norm": 0.525044433467863, + "learning_rate": 1.1421744324970133e-05, + "loss": 0.5922, + "num_tokens": 393742432.0, + "step": 4187 + }, + { + "epoch": 0.7147977470558116, + "grad_norm": 0.4918521216072958, + "learning_rate": 1.1414917221368835e-05, + "loss": 0.6036, + "num_tokens": 393833004.0, + "step": 4188 + }, + { + "epoch": 0.714968424645844, + "grad_norm": 0.46463665306488, + "learning_rate": 1.1408090117767539e-05, + "loss": 0.5525, + "num_tokens": 393927669.0, + "step": 4189 + }, + { + "epoch": 0.7151391022358764, + "grad_norm": 0.510975638009283, + "learning_rate": 1.1401263014166242e-05, + "loss": 0.6436, + "num_tokens": 394032873.0, + "step": 4190 + }, + { + "epoch": 0.7153097798259088, + "grad_norm": 0.4990804113409891, + "learning_rate": 1.1394435910564943e-05, + "loss": 0.5967, + "num_tokens": 394119576.0, + "step": 4191 + }, + { + "epoch": 0.7154804574159412, + "grad_norm": 0.46474717014559763, + "learning_rate": 1.1387608806963647e-05, + "loss": 0.5996, + "num_tokens": 394216625.0, + "step": 4192 + }, + { + "epoch": 0.7156511350059738, + "grad_norm": 0.5012372546895841, + "learning_rate": 1.1380781703362349e-05, + "loss": 0.5861, + "num_tokens": 394300425.0, + "step": 4193 + }, + { + "epoch": 0.7158218125960062, + "grad_norm": 0.5053258544258128, + "learning_rate": 1.1373954599761052e-05, + "loss": 0.5912, + "num_tokens": 394386569.0, + "step": 4194 + }, + { + "epoch": 0.7159924901860386, + "grad_norm": 0.5075604560609313, + "learning_rate": 1.1367127496159756e-05, + "loss": 0.7188, + "num_tokens": 394489063.0, + "step": 4195 + }, + { + "epoch": 0.716163167776071, + "grad_norm": 0.503464258220226, + "learning_rate": 1.1360300392558458e-05, + "loss": 0.6462, + "num_tokens": 394590155.0, + "step": 4196 + }, + { + "epoch": 0.7163338453661034, + "grad_norm": 0.4839578544102546, + "learning_rate": 1.1353473288957162e-05, + "loss": 0.5533, + "num_tokens": 394675328.0, + "step": 4197 + }, + { + "epoch": 0.7165045229561359, + "grad_norm": 0.4762296794221066, + "learning_rate": 1.1346646185355862e-05, + "loss": 0.601, + "num_tokens": 394767226.0, + "step": 4198 + }, + { + "epoch": 0.7166752005461683, + "grad_norm": 0.5019653933263576, + "learning_rate": 1.1339819081754566e-05, + "loss": 0.5119, + "num_tokens": 394841606.0, + "step": 4199 + }, + { + "epoch": 0.7168458781362007, + "grad_norm": 0.5015511839199336, + "learning_rate": 1.133299197815327e-05, + "loss": 0.5008, + "num_tokens": 394935485.0, + "step": 4200 + }, + { + "epoch": 0.7170165557262331, + "grad_norm": 0.5032290976528789, + "learning_rate": 1.1326164874551972e-05, + "loss": 0.5655, + "num_tokens": 395019812.0, + "step": 4201 + }, + { + "epoch": 0.7171872333162655, + "grad_norm": 0.45437127327478266, + "learning_rate": 1.1319337770950676e-05, + "loss": 0.6015, + "num_tokens": 395116839.0, + "step": 4202 + }, + { + "epoch": 0.717357910906298, + "grad_norm": 0.5118814455209338, + "learning_rate": 1.131251066734938e-05, + "loss": 0.5928, + "num_tokens": 395195277.0, + "step": 4203 + }, + { + "epoch": 0.7175285884963304, + "grad_norm": 0.5437099535664465, + "learning_rate": 1.130568356374808e-05, + "loss": 0.5955, + "num_tokens": 395277249.0, + "step": 4204 + }, + { + "epoch": 0.7176992660863628, + "grad_norm": 0.46091704401643957, + "learning_rate": 1.1298856460146784e-05, + "loss": 0.6331, + "num_tokens": 395384063.0, + "step": 4205 + }, + { + "epoch": 0.7178699436763953, + "grad_norm": 0.4582373418303137, + "learning_rate": 1.1292029356545486e-05, + "loss": 0.5757, + "num_tokens": 395486319.0, + "step": 4206 + }, + { + "epoch": 0.7180406212664278, + "grad_norm": 0.48749294555648176, + "learning_rate": 1.128520225294419e-05, + "loss": 0.534, + "num_tokens": 395573708.0, + "step": 4207 + }, + { + "epoch": 0.7182112988564602, + "grad_norm": 0.4603445447588311, + "learning_rate": 1.1278375149342893e-05, + "loss": 0.4987, + "num_tokens": 395664782.0, + "step": 4208 + }, + { + "epoch": 0.7183819764464926, + "grad_norm": 0.5060192014328837, + "learning_rate": 1.1271548045741595e-05, + "loss": 0.516, + "num_tokens": 395732802.0, + "step": 4209 + }, + { + "epoch": 0.718552654036525, + "grad_norm": 0.4434204672622063, + "learning_rate": 1.1264720942140297e-05, + "loss": 0.5536, + "num_tokens": 395841375.0, + "step": 4210 + }, + { + "epoch": 0.7187233316265574, + "grad_norm": 0.47353460631279026, + "learning_rate": 1.1257893838539e-05, + "loss": 0.5885, + "num_tokens": 395940279.0, + "step": 4211 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 0.49640599650948386, + "learning_rate": 1.1251066734937703e-05, + "loss": 0.5896, + "num_tokens": 396028905.0, + "step": 4212 + }, + { + "epoch": 0.7190646868066223, + "grad_norm": 0.5130460925822962, + "learning_rate": 1.1244239631336407e-05, + "loss": 0.5563, + "num_tokens": 396105967.0, + "step": 4213 + }, + { + "epoch": 0.7192353643966547, + "grad_norm": 0.5954253195407005, + "learning_rate": 1.123741252773511e-05, + "loss": 0.5234, + "num_tokens": 396168687.0, + "step": 4214 + }, + { + "epoch": 0.7194060419866871, + "grad_norm": 0.49660376968702424, + "learning_rate": 1.1230585424133813e-05, + "loss": 0.5868, + "num_tokens": 396261438.0, + "step": 4215 + }, + { + "epoch": 0.7195767195767195, + "grad_norm": 0.4248828646703177, + "learning_rate": 1.1223758320532515e-05, + "loss": 0.5497, + "num_tokens": 396387790.0, + "step": 4216 + }, + { + "epoch": 0.719747397166752, + "grad_norm": 0.4718087066644489, + "learning_rate": 1.1216931216931217e-05, + "loss": 0.5466, + "num_tokens": 396481634.0, + "step": 4217 + }, + { + "epoch": 0.7199180747567845, + "grad_norm": 0.4438709788033794, + "learning_rate": 1.121010411332992e-05, + "loss": 0.5318, + "num_tokens": 396578776.0, + "step": 4218 + }, + { + "epoch": 0.7200887523468169, + "grad_norm": 0.5057812104466077, + "learning_rate": 1.1203277009728624e-05, + "loss": 0.5786, + "num_tokens": 396658658.0, + "step": 4219 + }, + { + "epoch": 0.7202594299368493, + "grad_norm": 0.43397773024908165, + "learning_rate": 1.1196449906127326e-05, + "loss": 0.5638, + "num_tokens": 396769724.0, + "step": 4220 + }, + { + "epoch": 0.7204301075268817, + "grad_norm": 0.5076846569046706, + "learning_rate": 1.118962280252603e-05, + "loss": 0.664, + "num_tokens": 396862125.0, + "step": 4221 + }, + { + "epoch": 0.7206007851169142, + "grad_norm": 0.4859766236841738, + "learning_rate": 1.118279569892473e-05, + "loss": 0.6066, + "num_tokens": 396953338.0, + "step": 4222 + }, + { + "epoch": 0.7207714627069466, + "grad_norm": 0.4905874972790196, + "learning_rate": 1.1175968595323434e-05, + "loss": 0.578, + "num_tokens": 397045299.0, + "step": 4223 + }, + { + "epoch": 0.720942140296979, + "grad_norm": 0.5116476724363765, + "learning_rate": 1.1169141491722138e-05, + "loss": 0.6113, + "num_tokens": 397130200.0, + "step": 4224 + }, + { + "epoch": 0.7211128178870114, + "grad_norm": 0.5172394796334555, + "learning_rate": 1.116231438812084e-05, + "loss": 0.5016, + "num_tokens": 397202802.0, + "step": 4225 + }, + { + "epoch": 0.7212834954770438, + "grad_norm": 0.4824177100568654, + "learning_rate": 1.1155487284519544e-05, + "loss": 0.5852, + "num_tokens": 397295235.0, + "step": 4226 + }, + { + "epoch": 0.7214541730670763, + "grad_norm": 0.46816329691972003, + "learning_rate": 1.1148660180918248e-05, + "loss": 0.522, + "num_tokens": 397381501.0, + "step": 4227 + }, + { + "epoch": 0.7216248506571087, + "grad_norm": 0.4421047173746162, + "learning_rate": 1.114183307731695e-05, + "loss": 0.5957, + "num_tokens": 397490072.0, + "step": 4228 + }, + { + "epoch": 0.7217955282471411, + "grad_norm": 0.48211719875379355, + "learning_rate": 1.1135005973715652e-05, + "loss": 0.5466, + "num_tokens": 397573653.0, + "step": 4229 + }, + { + "epoch": 0.7219662058371735, + "grad_norm": 0.516314532863785, + "learning_rate": 1.1128178870114354e-05, + "loss": 0.5713, + "num_tokens": 397658244.0, + "step": 4230 + }, + { + "epoch": 0.722136883427206, + "grad_norm": 0.43421595646834105, + "learning_rate": 1.1121351766513058e-05, + "loss": 0.6123, + "num_tokens": 397789428.0, + "step": 4231 + }, + { + "epoch": 0.7223075610172385, + "grad_norm": 0.4486745693589412, + "learning_rate": 1.1114524662911761e-05, + "loss": 0.5055, + "num_tokens": 397880439.0, + "step": 4232 + }, + { + "epoch": 0.7224782386072709, + "grad_norm": 0.4485558434425161, + "learning_rate": 1.1107697559310464e-05, + "loss": 0.6138, + "num_tokens": 397987451.0, + "step": 4233 + }, + { + "epoch": 0.7226489161973033, + "grad_norm": 0.5150493879545628, + "learning_rate": 1.1100870455709167e-05, + "loss": 0.5778, + "num_tokens": 398069412.0, + "step": 4234 + }, + { + "epoch": 0.7228195937873357, + "grad_norm": 0.4923403310614869, + "learning_rate": 1.1094043352107868e-05, + "loss": 0.5007, + "num_tokens": 398149491.0, + "step": 4235 + }, + { + "epoch": 0.7229902713773682, + "grad_norm": 0.508297830570167, + "learning_rate": 1.1087216248506571e-05, + "loss": 0.5752, + "num_tokens": 398236566.0, + "step": 4236 + }, + { + "epoch": 0.7231609489674006, + "grad_norm": 0.4945358274783822, + "learning_rate": 1.1080389144905275e-05, + "loss": 0.5658, + "num_tokens": 398323481.0, + "step": 4237 + }, + { + "epoch": 0.723331626557433, + "grad_norm": 0.43933018936293533, + "learning_rate": 1.1073562041303977e-05, + "loss": 0.5659, + "num_tokens": 398439506.0, + "step": 4238 + }, + { + "epoch": 0.7235023041474654, + "grad_norm": 0.47957955274082376, + "learning_rate": 1.1066734937702681e-05, + "loss": 0.5545, + "num_tokens": 398530686.0, + "step": 4239 + }, + { + "epoch": 0.7236729817374978, + "grad_norm": 0.4981186171336324, + "learning_rate": 1.1059907834101385e-05, + "loss": 0.6578, + "num_tokens": 398632195.0, + "step": 4240 + }, + { + "epoch": 0.7238436593275303, + "grad_norm": 0.39668857886562986, + "learning_rate": 1.1053080730500085e-05, + "loss": 0.6086, + "num_tokens": 398775602.0, + "step": 4241 + }, + { + "epoch": 0.7240143369175627, + "grad_norm": 0.5017674933457379, + "learning_rate": 1.1046253626898789e-05, + "loss": 0.5822, + "num_tokens": 398858041.0, + "step": 4242 + }, + { + "epoch": 0.7241850145075952, + "grad_norm": 0.475976097431269, + "learning_rate": 1.1039426523297491e-05, + "loss": 0.6044, + "num_tokens": 398958214.0, + "step": 4243 + }, + { + "epoch": 0.7243556920976276, + "grad_norm": 0.5498234789615752, + "learning_rate": 1.1032599419696195e-05, + "loss": 0.5314, + "num_tokens": 399019678.0, + "step": 4244 + }, + { + "epoch": 0.72452636968766, + "grad_norm": 0.4351373478037276, + "learning_rate": 1.1025772316094899e-05, + "loss": 0.4949, + "num_tokens": 399121942.0, + "step": 4245 + }, + { + "epoch": 0.7246970472776925, + "grad_norm": 0.4728345845623212, + "learning_rate": 1.10189452124936e-05, + "loss": 0.5614, + "num_tokens": 399213707.0, + "step": 4246 + }, + { + "epoch": 0.7248677248677249, + "grad_norm": 0.5083460313009013, + "learning_rate": 1.1012118108892303e-05, + "loss": 0.624, + "num_tokens": 399299997.0, + "step": 4247 + }, + { + "epoch": 0.7250384024577573, + "grad_norm": 0.47362224826575244, + "learning_rate": 1.1005291005291006e-05, + "loss": 0.4953, + "num_tokens": 399389935.0, + "step": 4248 + }, + { + "epoch": 0.7252090800477897, + "grad_norm": 0.5614903176423032, + "learning_rate": 1.0998463901689708e-05, + "loss": 0.6632, + "num_tokens": 399463600.0, + "step": 4249 + }, + { + "epoch": 0.7253797576378221, + "grad_norm": 0.46113292434069125, + "learning_rate": 1.0991636798088412e-05, + "loss": 0.5462, + "num_tokens": 399551132.0, + "step": 4250 + }, + { + "epoch": 0.7255504352278546, + "grad_norm": 0.45646447092112674, + "learning_rate": 1.0984809694487116e-05, + "loss": 0.6299, + "num_tokens": 399667527.0, + "step": 4251 + }, + { + "epoch": 0.725721112817887, + "grad_norm": 0.4757833012679637, + "learning_rate": 1.0977982590885818e-05, + "loss": 0.5381, + "num_tokens": 399756210.0, + "step": 4252 + }, + { + "epoch": 0.7258917904079194, + "grad_norm": 0.48451657295623723, + "learning_rate": 1.097115548728452e-05, + "loss": 0.5273, + "num_tokens": 399840785.0, + "step": 4253 + }, + { + "epoch": 0.7260624679979518, + "grad_norm": 0.48873621621534413, + "learning_rate": 1.0964328383683222e-05, + "loss": 0.5454, + "num_tokens": 399922712.0, + "step": 4254 + }, + { + "epoch": 0.7262331455879844, + "grad_norm": 0.5271392920997412, + "learning_rate": 1.0957501280081926e-05, + "loss": 0.5187, + "num_tokens": 399987108.0, + "step": 4255 + }, + { + "epoch": 0.7264038231780168, + "grad_norm": 0.47006711064445234, + "learning_rate": 1.095067417648063e-05, + "loss": 0.5538, + "num_tokens": 400084568.0, + "step": 4256 + }, + { + "epoch": 0.7265745007680492, + "grad_norm": 0.9007229472025261, + "learning_rate": 1.0943847072879332e-05, + "loss": 0.6006, + "num_tokens": 400189460.0, + "step": 4257 + }, + { + "epoch": 0.7267451783580816, + "grad_norm": 0.4360122036906084, + "learning_rate": 1.0937019969278036e-05, + "loss": 0.6515, + "num_tokens": 400324729.0, + "step": 4258 + }, + { + "epoch": 0.726915855948114, + "grad_norm": 0.46022337421372767, + "learning_rate": 1.0930192865676736e-05, + "loss": 0.5312, + "num_tokens": 400413701.0, + "step": 4259 + }, + { + "epoch": 0.7270865335381465, + "grad_norm": 0.4889204313897834, + "learning_rate": 1.092336576207544e-05, + "loss": 0.6946, + "num_tokens": 400529592.0, + "step": 4260 + }, + { + "epoch": 0.7272572111281789, + "grad_norm": 0.5144220169431971, + "learning_rate": 1.0916538658474143e-05, + "loss": 0.6694, + "num_tokens": 400624837.0, + "step": 4261 + }, + { + "epoch": 0.7274278887182113, + "grad_norm": 0.46089609599842474, + "learning_rate": 1.0909711554872846e-05, + "loss": 0.5621, + "num_tokens": 400720834.0, + "step": 4262 + }, + { + "epoch": 0.7275985663082437, + "grad_norm": 0.49010959026949963, + "learning_rate": 1.090288445127155e-05, + "loss": 0.4912, + "num_tokens": 400795348.0, + "step": 4263 + }, + { + "epoch": 0.7277692438982761, + "grad_norm": 0.48340462364432335, + "learning_rate": 1.0896057347670253e-05, + "loss": 0.5876, + "num_tokens": 400893159.0, + "step": 4264 + }, + { + "epoch": 0.7279399214883086, + "grad_norm": 0.4841646189438429, + "learning_rate": 1.0889230244068955e-05, + "loss": 0.559, + "num_tokens": 400985202.0, + "step": 4265 + }, + { + "epoch": 0.728110599078341, + "grad_norm": 0.4240401646462732, + "learning_rate": 1.0882403140467657e-05, + "loss": 0.5767, + "num_tokens": 401103215.0, + "step": 4266 + }, + { + "epoch": 0.7282812766683734, + "grad_norm": 0.4199670040833908, + "learning_rate": 1.087557603686636e-05, + "loss": 0.5959, + "num_tokens": 401243987.0, + "step": 4267 + }, + { + "epoch": 0.7284519542584059, + "grad_norm": 0.456592933416189, + "learning_rate": 1.0868748933265063e-05, + "loss": 0.6036, + "num_tokens": 401349738.0, + "step": 4268 + }, + { + "epoch": 0.7286226318484383, + "grad_norm": 0.5011114662576707, + "learning_rate": 1.0861921829663767e-05, + "loss": 0.6299, + "num_tokens": 401449410.0, + "step": 4269 + }, + { + "epoch": 0.7287933094384708, + "grad_norm": 0.4975992581576432, + "learning_rate": 1.0855094726062469e-05, + "loss": 0.4878, + "num_tokens": 401524669.0, + "step": 4270 + }, + { + "epoch": 0.7289639870285032, + "grad_norm": 0.5073792807564796, + "learning_rate": 1.0848267622461173e-05, + "loss": 0.5776, + "num_tokens": 401605899.0, + "step": 4271 + }, + { + "epoch": 0.7291346646185356, + "grad_norm": 0.46544974304262576, + "learning_rate": 1.0841440518859873e-05, + "loss": 0.5667, + "num_tokens": 401697163.0, + "step": 4272 + }, + { + "epoch": 0.729305342208568, + "grad_norm": 0.45135038211353384, + "learning_rate": 1.0834613415258577e-05, + "loss": 0.5204, + "num_tokens": 401808030.0, + "step": 4273 + }, + { + "epoch": 0.7294760197986004, + "grad_norm": 0.5335618470214282, + "learning_rate": 1.082778631165728e-05, + "loss": 0.5847, + "num_tokens": 401922010.0, + "step": 4274 + }, + { + "epoch": 0.7296466973886329, + "grad_norm": 0.47486480242624474, + "learning_rate": 1.0820959208055983e-05, + "loss": 0.5219, + "num_tokens": 402009618.0, + "step": 4275 + }, + { + "epoch": 0.7298173749786653, + "grad_norm": 0.47957998079741854, + "learning_rate": 1.0814132104454686e-05, + "loss": 0.6057, + "num_tokens": 402097974.0, + "step": 4276 + }, + { + "epoch": 0.7299880525686977, + "grad_norm": 0.5248642266356605, + "learning_rate": 1.080730500085339e-05, + "loss": 0.6323, + "num_tokens": 402186255.0, + "step": 4277 + }, + { + "epoch": 0.7301587301587301, + "grad_norm": 0.44403562449678036, + "learning_rate": 1.080047789725209e-05, + "loss": 0.647, + "num_tokens": 402304298.0, + "step": 4278 + }, + { + "epoch": 0.7303294077487625, + "grad_norm": 0.4409154088588288, + "learning_rate": 1.0793650793650794e-05, + "loss": 0.5569, + "num_tokens": 402408389.0, + "step": 4279 + }, + { + "epoch": 0.7305000853387951, + "grad_norm": 0.5318449933322448, + "learning_rate": 1.0786823690049496e-05, + "loss": 0.5137, + "num_tokens": 402473259.0, + "step": 4280 + }, + { + "epoch": 0.7306707629288275, + "grad_norm": 0.449191021430992, + "learning_rate": 1.07799965864482e-05, + "loss": 0.6212, + "num_tokens": 402585559.0, + "step": 4281 + }, + { + "epoch": 0.7308414405188599, + "grad_norm": 0.43371587149727714, + "learning_rate": 1.0773169482846904e-05, + "loss": 0.5024, + "num_tokens": 402682239.0, + "step": 4282 + }, + { + "epoch": 0.7310121181088923, + "grad_norm": 0.6084963995079462, + "learning_rate": 1.0766342379245606e-05, + "loss": 0.7041, + "num_tokens": 402763370.0, + "step": 4283 + }, + { + "epoch": 0.7311827956989247, + "grad_norm": 0.45702546848225545, + "learning_rate": 1.0759515275644308e-05, + "loss": 0.5266, + "num_tokens": 402852727.0, + "step": 4284 + }, + { + "epoch": 0.7313534732889572, + "grad_norm": 0.5536129462551732, + "learning_rate": 1.0752688172043012e-05, + "loss": 0.5457, + "num_tokens": 402915202.0, + "step": 4285 + }, + { + "epoch": 0.7315241508789896, + "grad_norm": 0.4742987565734776, + "learning_rate": 1.0745861068441714e-05, + "loss": 0.6226, + "num_tokens": 403014906.0, + "step": 4286 + }, + { + "epoch": 0.731694828469022, + "grad_norm": 0.4656047402094198, + "learning_rate": 1.0739033964840418e-05, + "loss": 0.5699, + "num_tokens": 403111390.0, + "step": 4287 + }, + { + "epoch": 0.7318655060590544, + "grad_norm": 0.5255141446701619, + "learning_rate": 1.0732206861239121e-05, + "loss": 0.6443, + "num_tokens": 403188508.0, + "step": 4288 + }, + { + "epoch": 0.7320361836490868, + "grad_norm": 0.46533597170111096, + "learning_rate": 1.0725379757637823e-05, + "loss": 0.5637, + "num_tokens": 403279757.0, + "step": 4289 + }, + { + "epoch": 0.7322068612391193, + "grad_norm": 0.5052571511016747, + "learning_rate": 1.0718552654036525e-05, + "loss": 0.5651, + "num_tokens": 403356256.0, + "step": 4290 + }, + { + "epoch": 0.7323775388291517, + "grad_norm": 0.4694439794584718, + "learning_rate": 1.0711725550435228e-05, + "loss": 0.6153, + "num_tokens": 403454824.0, + "step": 4291 + }, + { + "epoch": 0.7325482164191841, + "grad_norm": 0.4672981802420552, + "learning_rate": 1.0704898446833931e-05, + "loss": 0.5706, + "num_tokens": 403553258.0, + "step": 4292 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 0.4720078360728437, + "learning_rate": 1.0698071343232635e-05, + "loss": 0.5347, + "num_tokens": 403642251.0, + "step": 4293 + }, + { + "epoch": 0.7328895715992491, + "grad_norm": 0.47150758800785336, + "learning_rate": 1.0691244239631337e-05, + "loss": 0.4696, + "num_tokens": 403720537.0, + "step": 4294 + }, + { + "epoch": 0.7330602491892815, + "grad_norm": 0.4393035690120951, + "learning_rate": 1.0684417136030041e-05, + "loss": 0.5859, + "num_tokens": 403836042.0, + "step": 4295 + }, + { + "epoch": 0.7332309267793139, + "grad_norm": 0.46849974286421786, + "learning_rate": 1.0677590032428745e-05, + "loss": 0.6108, + "num_tokens": 403957621.0, + "step": 4296 + }, + { + "epoch": 0.7334016043693463, + "grad_norm": 0.4903523105629108, + "learning_rate": 1.0670762928827445e-05, + "loss": 0.6342, + "num_tokens": 404049128.0, + "step": 4297 + }, + { + "epoch": 0.7335722819593787, + "grad_norm": 0.4931107689643817, + "learning_rate": 1.0663935825226149e-05, + "loss": 0.629, + "num_tokens": 404151150.0, + "step": 4298 + }, + { + "epoch": 0.7337429595494112, + "grad_norm": 0.5187412944131274, + "learning_rate": 1.0657108721624851e-05, + "loss": 0.5803, + "num_tokens": 404228678.0, + "step": 4299 + }, + { + "epoch": 0.7339136371394436, + "grad_norm": 0.5097933104434164, + "learning_rate": 1.0650281618023555e-05, + "loss": 0.5635, + "num_tokens": 404301393.0, + "step": 4300 + }, + { + "epoch": 0.734084314729476, + "grad_norm": 0.47578495689221606, + "learning_rate": 1.0643454514422258e-05, + "loss": 0.6241, + "num_tokens": 404396430.0, + "step": 4301 + }, + { + "epoch": 0.7342549923195084, + "grad_norm": 1.669509249490551, + "learning_rate": 1.063662741082096e-05, + "loss": 0.7102, + "num_tokens": 404524026.0, + "step": 4302 + }, + { + "epoch": 0.7344256699095408, + "grad_norm": 0.45355446441671304, + "learning_rate": 1.0629800307219663e-05, + "loss": 0.4654, + "num_tokens": 404617248.0, + "step": 4303 + }, + { + "epoch": 0.7345963474995733, + "grad_norm": 0.4708137223745126, + "learning_rate": 1.0622973203618365e-05, + "loss": 0.589, + "num_tokens": 404718747.0, + "step": 4304 + }, + { + "epoch": 0.7347670250896058, + "grad_norm": 0.5578500235490801, + "learning_rate": 1.0616146100017068e-05, + "loss": 0.592, + "num_tokens": 404786852.0, + "step": 4305 + }, + { + "epoch": 0.7349377026796382, + "grad_norm": 0.47051368370278285, + "learning_rate": 1.0609318996415772e-05, + "loss": 0.6073, + "num_tokens": 404888953.0, + "step": 4306 + }, + { + "epoch": 0.7351083802696706, + "grad_norm": 0.47038255151727043, + "learning_rate": 1.0602491892814474e-05, + "loss": 0.5978, + "num_tokens": 404987239.0, + "step": 4307 + }, + { + "epoch": 0.735279057859703, + "grad_norm": 0.44107785804690974, + "learning_rate": 1.0595664789213178e-05, + "loss": 0.4563, + "num_tokens": 405074522.0, + "step": 4308 + }, + { + "epoch": 0.7354497354497355, + "grad_norm": 0.4642151946678104, + "learning_rate": 1.0588837685611878e-05, + "loss": 0.6299, + "num_tokens": 405176975.0, + "step": 4309 + }, + { + "epoch": 0.7356204130397679, + "grad_norm": 0.51745894851754, + "learning_rate": 1.0582010582010582e-05, + "loss": 0.6605, + "num_tokens": 405264249.0, + "step": 4310 + }, + { + "epoch": 0.7357910906298003, + "grad_norm": 0.4921636970623996, + "learning_rate": 1.0575183478409286e-05, + "loss": 0.5894, + "num_tokens": 405353843.0, + "step": 4311 + }, + { + "epoch": 0.7359617682198327, + "grad_norm": 0.5307533088110954, + "learning_rate": 1.0568356374807988e-05, + "loss": 0.5434, + "num_tokens": 405423522.0, + "step": 4312 + }, + { + "epoch": 0.7361324458098651, + "grad_norm": 0.5036438764272756, + "learning_rate": 1.0561529271206692e-05, + "loss": 0.5534, + "num_tokens": 405495927.0, + "step": 4313 + }, + { + "epoch": 0.7363031233998976, + "grad_norm": 0.4676877689821344, + "learning_rate": 1.0554702167605395e-05, + "loss": 0.6148, + "num_tokens": 405598446.0, + "step": 4314 + }, + { + "epoch": 0.73647380098993, + "grad_norm": 0.49306354131565944, + "learning_rate": 1.0547875064004096e-05, + "loss": 0.5909, + "num_tokens": 405688639.0, + "step": 4315 + }, + { + "epoch": 0.7366444785799624, + "grad_norm": 0.4533554453512728, + "learning_rate": 1.05410479604028e-05, + "loss": 0.5605, + "num_tokens": 405789197.0, + "step": 4316 + }, + { + "epoch": 0.7368151561699949, + "grad_norm": 0.4674787467885761, + "learning_rate": 1.0534220856801503e-05, + "loss": 0.5479, + "num_tokens": 405882289.0, + "step": 4317 + }, + { + "epoch": 0.7369858337600274, + "grad_norm": 0.42260705282148414, + "learning_rate": 1.0527393753200205e-05, + "loss": 0.5954, + "num_tokens": 406012059.0, + "step": 4318 + }, + { + "epoch": 0.7371565113500598, + "grad_norm": 0.4297292433764594, + "learning_rate": 1.052056664959891e-05, + "loss": 0.5297, + "num_tokens": 406114309.0, + "step": 4319 + }, + { + "epoch": 0.7373271889400922, + "grad_norm": 0.4648437317034659, + "learning_rate": 1.0513739545997613e-05, + "loss": 0.5761, + "num_tokens": 406212512.0, + "step": 4320 + }, + { + "epoch": 0.7374978665301246, + "grad_norm": 0.4659457976469246, + "learning_rate": 1.0506912442396313e-05, + "loss": 0.6255, + "num_tokens": 406316112.0, + "step": 4321 + }, + { + "epoch": 0.737668544120157, + "grad_norm": 0.439761587026063, + "learning_rate": 1.0500085338795017e-05, + "loss": 0.6542, + "num_tokens": 406450896.0, + "step": 4322 + }, + { + "epoch": 0.7378392217101895, + "grad_norm": 0.514637197365571, + "learning_rate": 1.049325823519372e-05, + "loss": 0.6096, + "num_tokens": 406540478.0, + "step": 4323 + }, + { + "epoch": 0.7380098993002219, + "grad_norm": 0.5128764747249741, + "learning_rate": 1.0486431131592423e-05, + "loss": 0.5749, + "num_tokens": 406625568.0, + "step": 4324 + }, + { + "epoch": 0.7381805768902543, + "grad_norm": 0.5014231139847178, + "learning_rate": 1.0479604027991127e-05, + "loss": 0.6242, + "num_tokens": 406713060.0, + "step": 4325 + }, + { + "epoch": 0.7383512544802867, + "grad_norm": 0.4900536540071442, + "learning_rate": 1.0472776924389829e-05, + "loss": 0.6036, + "num_tokens": 406819894.0, + "step": 4326 + }, + { + "epoch": 0.7385219320703191, + "grad_norm": 0.5210721666198292, + "learning_rate": 1.0465949820788533e-05, + "loss": 0.6062, + "num_tokens": 406901039.0, + "step": 4327 + }, + { + "epoch": 0.7386926096603516, + "grad_norm": 0.4221029210598249, + "learning_rate": 1.0459122717187233e-05, + "loss": 0.5243, + "num_tokens": 407011478.0, + "step": 4328 + }, + { + "epoch": 0.738863287250384, + "grad_norm": 0.5065806761484253, + "learning_rate": 1.0452295613585937e-05, + "loss": 0.5591, + "num_tokens": 407092023.0, + "step": 4329 + }, + { + "epoch": 0.7390339648404165, + "grad_norm": 0.5299471235955461, + "learning_rate": 1.044546850998464e-05, + "loss": 0.6163, + "num_tokens": 407166222.0, + "step": 4330 + }, + { + "epoch": 0.7392046424304489, + "grad_norm": 0.4644656118352512, + "learning_rate": 1.0438641406383342e-05, + "loss": 0.5349, + "num_tokens": 407255179.0, + "step": 4331 + }, + { + "epoch": 0.7393753200204813, + "grad_norm": 0.42416355737859723, + "learning_rate": 1.0431814302782046e-05, + "loss": 0.5774, + "num_tokens": 407375686.0, + "step": 4332 + }, + { + "epoch": 0.7395459976105138, + "grad_norm": 0.47666681298339775, + "learning_rate": 1.042498719918075e-05, + "loss": 0.6016, + "num_tokens": 407476498.0, + "step": 4333 + }, + { + "epoch": 0.7397166752005462, + "grad_norm": 0.4689306569306729, + "learning_rate": 1.041816009557945e-05, + "loss": 0.5775, + "num_tokens": 407570896.0, + "step": 4334 + }, + { + "epoch": 0.7398873527905786, + "grad_norm": 0.4178411759780276, + "learning_rate": 1.0411332991978154e-05, + "loss": 0.6259, + "num_tokens": 407700567.0, + "step": 4335 + }, + { + "epoch": 0.740058030380611, + "grad_norm": 0.4890278807145064, + "learning_rate": 1.0404505888376856e-05, + "loss": 0.628, + "num_tokens": 407794123.0, + "step": 4336 + }, + { + "epoch": 0.7402287079706434, + "grad_norm": 0.4611684405861955, + "learning_rate": 1.039767878477556e-05, + "loss": 0.4847, + "num_tokens": 407880561.0, + "step": 4337 + }, + { + "epoch": 0.7403993855606759, + "grad_norm": 0.5376170906370259, + "learning_rate": 1.0390851681174264e-05, + "loss": 0.582, + "num_tokens": 407960276.0, + "step": 4338 + }, + { + "epoch": 0.7405700631507083, + "grad_norm": 0.4709328670410732, + "learning_rate": 1.0384024577572966e-05, + "loss": 0.6639, + "num_tokens": 408070349.0, + "step": 4339 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.46193189972691256, + "learning_rate": 1.0377197473971668e-05, + "loss": 0.5161, + "num_tokens": 408158931.0, + "step": 4340 + }, + { + "epoch": 0.7409114183307731, + "grad_norm": 0.48283336467627475, + "learning_rate": 1.037037037037037e-05, + "loss": 0.5826, + "num_tokens": 408243374.0, + "step": 4341 + }, + { + "epoch": 0.7410820959208057, + "grad_norm": 0.47770340990277727, + "learning_rate": 1.0363543266769074e-05, + "loss": 0.5856, + "num_tokens": 408346353.0, + "step": 4342 + }, + { + "epoch": 0.7412527735108381, + "grad_norm": 0.5006449405412168, + "learning_rate": 1.0356716163167777e-05, + "loss": 0.5606, + "num_tokens": 408430597.0, + "step": 4343 + }, + { + "epoch": 0.7414234511008705, + "grad_norm": 0.5045732695299237, + "learning_rate": 1.034988905956648e-05, + "loss": 0.6064, + "num_tokens": 408515497.0, + "step": 4344 + }, + { + "epoch": 0.7415941286909029, + "grad_norm": 0.49643919641604445, + "learning_rate": 1.0343061955965183e-05, + "loss": 0.6412, + "num_tokens": 408612575.0, + "step": 4345 + }, + { + "epoch": 0.7417648062809353, + "grad_norm": 0.4890017962231783, + "learning_rate": 1.0336234852363884e-05, + "loss": 0.561, + "num_tokens": 408697989.0, + "step": 4346 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.508604066073957, + "learning_rate": 1.0329407748762587e-05, + "loss": 0.5274, + "num_tokens": 408773159.0, + "step": 4347 + }, + { + "epoch": 0.7421061614610002, + "grad_norm": 0.456461593082836, + "learning_rate": 1.0322580645161291e-05, + "loss": 0.496, + "num_tokens": 408865940.0, + "step": 4348 + }, + { + "epoch": 0.7422768390510326, + "grad_norm": 0.4538357239881049, + "learning_rate": 1.0315753541559993e-05, + "loss": 0.5252, + "num_tokens": 408965243.0, + "step": 4349 + }, + { + "epoch": 0.742447516641065, + "grad_norm": 0.46692002715015735, + "learning_rate": 1.0308926437958697e-05, + "loss": 0.5316, + "num_tokens": 409061642.0, + "step": 4350 + }, + { + "epoch": 0.7426181942310974, + "grad_norm": 0.48673691838618155, + "learning_rate": 1.03020993343574e-05, + "loss": 0.5245, + "num_tokens": 409140804.0, + "step": 4351 + }, + { + "epoch": 0.7427888718211298, + "grad_norm": 0.4649158259465345, + "learning_rate": 1.0295272230756101e-05, + "loss": 0.6799, + "num_tokens": 409249076.0, + "step": 4352 + }, + { + "epoch": 0.7429595494111623, + "grad_norm": 0.4503658617151835, + "learning_rate": 1.0288445127154805e-05, + "loss": 0.622, + "num_tokens": 409366014.0, + "step": 4353 + }, + { + "epoch": 0.7431302270011948, + "grad_norm": 0.4255692682504055, + "learning_rate": 1.0281618023553509e-05, + "loss": 0.6617, + "num_tokens": 409522537.0, + "step": 4354 + }, + { + "epoch": 0.7433009045912272, + "grad_norm": 0.4256897379491504, + "learning_rate": 1.027479091995221e-05, + "loss": 0.572, + "num_tokens": 409632993.0, + "step": 4355 + }, + { + "epoch": 0.7434715821812596, + "grad_norm": 0.48937589736438847, + "learning_rate": 1.0267963816350915e-05, + "loss": 0.6388, + "num_tokens": 409728832.0, + "step": 4356 + }, + { + "epoch": 0.7436422597712921, + "grad_norm": 0.4381952498125304, + "learning_rate": 1.0261136712749618e-05, + "loss": 0.5531, + "num_tokens": 409836329.0, + "step": 4357 + }, + { + "epoch": 0.7438129373613245, + "grad_norm": 0.42946332888756505, + "learning_rate": 1.025430960914832e-05, + "loss": 0.4915, + "num_tokens": 409933160.0, + "step": 4358 + }, + { + "epoch": 0.7439836149513569, + "grad_norm": 0.499375841223843, + "learning_rate": 1.0247482505547022e-05, + "loss": 0.5864, + "num_tokens": 410038113.0, + "step": 4359 + }, + { + "epoch": 0.7441542925413893, + "grad_norm": 0.42718790270831775, + "learning_rate": 1.0240655401945725e-05, + "loss": 0.5701, + "num_tokens": 410150970.0, + "step": 4360 + }, + { + "epoch": 0.7443249701314217, + "grad_norm": 0.4582628267905334, + "learning_rate": 1.0233828298344428e-05, + "loss": 0.6103, + "num_tokens": 410251937.0, + "step": 4361 + }, + { + "epoch": 0.7444956477214542, + "grad_norm": 0.5050187945859961, + "learning_rate": 1.0227001194743132e-05, + "loss": 0.5931, + "num_tokens": 410332976.0, + "step": 4362 + }, + { + "epoch": 0.7446663253114866, + "grad_norm": 0.4323297849711002, + "learning_rate": 1.0220174091141834e-05, + "loss": 0.5981, + "num_tokens": 410451985.0, + "step": 4363 + }, + { + "epoch": 0.744837002901519, + "grad_norm": 0.490469374144618, + "learning_rate": 1.0213346987540538e-05, + "loss": 0.6239, + "num_tokens": 410547692.0, + "step": 4364 + }, + { + "epoch": 0.7450076804915514, + "grad_norm": 0.5257572815768866, + "learning_rate": 1.0206519883939238e-05, + "loss": 0.6191, + "num_tokens": 410626494.0, + "step": 4365 + }, + { + "epoch": 0.7451783580815838, + "grad_norm": 0.44716341800888604, + "learning_rate": 1.0199692780337942e-05, + "loss": 0.5248, + "num_tokens": 410719273.0, + "step": 4366 + }, + { + "epoch": 0.7453490356716164, + "grad_norm": 0.47428584305823773, + "learning_rate": 1.0192865676736646e-05, + "loss": 0.5464, + "num_tokens": 410803289.0, + "step": 4367 + }, + { + "epoch": 0.7455197132616488, + "grad_norm": 0.5288738538491394, + "learning_rate": 1.0186038573135348e-05, + "loss": 0.581, + "num_tokens": 410877659.0, + "step": 4368 + }, + { + "epoch": 0.7456903908516812, + "grad_norm": 0.46576432387876987, + "learning_rate": 1.0179211469534052e-05, + "loss": 0.494, + "num_tokens": 410956814.0, + "step": 4369 + }, + { + "epoch": 0.7458610684417136, + "grad_norm": 0.4697540971372057, + "learning_rate": 1.0172384365932755e-05, + "loss": 0.5703, + "num_tokens": 411059046.0, + "step": 4370 + }, + { + "epoch": 0.746031746031746, + "grad_norm": 0.4231476153775304, + "learning_rate": 1.0165557262331456e-05, + "loss": 0.5505, + "num_tokens": 411172596.0, + "step": 4371 + }, + { + "epoch": 0.7462024236217785, + "grad_norm": 0.4606591526802254, + "learning_rate": 1.015873015873016e-05, + "loss": 0.6563, + "num_tokens": 411275754.0, + "step": 4372 + }, + { + "epoch": 0.7463731012118109, + "grad_norm": 0.5289200935191991, + "learning_rate": 1.0151903055128862e-05, + "loss": 0.5371, + "num_tokens": 411350866.0, + "step": 4373 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 0.48747155728287284, + "learning_rate": 1.0145075951527565e-05, + "loss": 0.5067, + "num_tokens": 411434036.0, + "step": 4374 + }, + { + "epoch": 0.7467144563918757, + "grad_norm": 0.4814716628817321, + "learning_rate": 1.0138248847926269e-05, + "loss": 0.6368, + "num_tokens": 411547356.0, + "step": 4375 + }, + { + "epoch": 0.7468851339819081, + "grad_norm": 0.4833844030022916, + "learning_rate": 1.0131421744324971e-05, + "loss": 0.5491, + "num_tokens": 411629877.0, + "step": 4376 + }, + { + "epoch": 0.7470558115719406, + "grad_norm": 0.43095842106338755, + "learning_rate": 1.0124594640723673e-05, + "loss": 0.5631, + "num_tokens": 411746738.0, + "step": 4377 + }, + { + "epoch": 0.747226489161973, + "grad_norm": 0.4166632996317229, + "learning_rate": 1.0117767537122375e-05, + "loss": 0.5269, + "num_tokens": 411861509.0, + "step": 4378 + }, + { + "epoch": 0.7473971667520055, + "grad_norm": 0.45351160281636504, + "learning_rate": 1.0110940433521079e-05, + "loss": 0.5175, + "num_tokens": 411947430.0, + "step": 4379 + }, + { + "epoch": 0.7475678443420379, + "grad_norm": 0.509981827291222, + "learning_rate": 1.0104113329919783e-05, + "loss": 0.6005, + "num_tokens": 412024959.0, + "step": 4380 + }, + { + "epoch": 0.7477385219320704, + "grad_norm": 0.4845683367329745, + "learning_rate": 1.0097286226318485e-05, + "loss": 0.5526, + "num_tokens": 412121735.0, + "step": 4381 + }, + { + "epoch": 0.7479091995221028, + "grad_norm": 0.501363653202676, + "learning_rate": 1.0090459122717189e-05, + "loss": 0.5759, + "num_tokens": 412201515.0, + "step": 4382 + }, + { + "epoch": 0.7480798771121352, + "grad_norm": 0.5599795773150181, + "learning_rate": 1.0083632019115889e-05, + "loss": 0.4714, + "num_tokens": 412257731.0, + "step": 4383 + }, + { + "epoch": 0.7482505547021676, + "grad_norm": 0.4471650661394628, + "learning_rate": 1.0076804915514593e-05, + "loss": 0.5263, + "num_tokens": 412367029.0, + "step": 4384 + }, + { + "epoch": 0.7484212322922, + "grad_norm": 0.5409099620981359, + "learning_rate": 1.0069977811913297e-05, + "loss": 0.5923, + "num_tokens": 412438913.0, + "step": 4385 + }, + { + "epoch": 0.7485919098822325, + "grad_norm": 0.5096706124730942, + "learning_rate": 1.0063150708311999e-05, + "loss": 0.5549, + "num_tokens": 412524198.0, + "step": 4386 + }, + { + "epoch": 0.7487625874722649, + "grad_norm": 0.4461483333737283, + "learning_rate": 1.0056323604710702e-05, + "loss": 0.5377, + "num_tokens": 412632293.0, + "step": 4387 + }, + { + "epoch": 0.7489332650622973, + "grad_norm": 0.5005457174954253, + "learning_rate": 1.0049496501109406e-05, + "loss": 0.5174, + "num_tokens": 412707462.0, + "step": 4388 + }, + { + "epoch": 0.7491039426523297, + "grad_norm": 0.5412747316586721, + "learning_rate": 1.004266939750811e-05, + "loss": 0.553, + "num_tokens": 412773088.0, + "step": 4389 + }, + { + "epoch": 0.7492746202423621, + "grad_norm": 0.4746974294476905, + "learning_rate": 1.003584229390681e-05, + "loss": 0.6174, + "num_tokens": 412870784.0, + "step": 4390 + }, + { + "epoch": 0.7494452978323946, + "grad_norm": 0.5093856787982399, + "learning_rate": 1.0029015190305514e-05, + "loss": 0.5849, + "num_tokens": 412950449.0, + "step": 4391 + }, + { + "epoch": 0.7496159754224271, + "grad_norm": 0.4590577271651555, + "learning_rate": 1.0022188086704216e-05, + "loss": 0.4777, + "num_tokens": 413034471.0, + "step": 4392 + }, + { + "epoch": 0.7497866530124595, + "grad_norm": 0.4453938008489481, + "learning_rate": 1.001536098310292e-05, + "loss": 0.6468, + "num_tokens": 413153889.0, + "step": 4393 + }, + { + "epoch": 0.7499573306024919, + "grad_norm": 0.48649040961583473, + "learning_rate": 1.0008533879501624e-05, + "loss": 0.5992, + "num_tokens": 413245032.0, + "step": 4394 + }, + { + "epoch": 0.7501280081925243, + "grad_norm": 0.5286552103135749, + "learning_rate": 1.0001706775900326e-05, + "loss": 0.5175, + "num_tokens": 413315226.0, + "step": 4395 + }, + { + "epoch": 0.7502986857825568, + "grad_norm": 0.4678935190727275, + "learning_rate": 9.994879672299028e-06, + "loss": 0.5836, + "num_tokens": 413416099.0, + "step": 4396 + }, + { + "epoch": 0.7504693633725892, + "grad_norm": 0.47284588434264996, + "learning_rate": 9.98805256869773e-06, + "loss": 0.593, + "num_tokens": 413515223.0, + "step": 4397 + }, + { + "epoch": 0.7506400409626216, + "grad_norm": 0.5050945082745006, + "learning_rate": 9.981225465096434e-06, + "loss": 0.603, + "num_tokens": 413601484.0, + "step": 4398 + }, + { + "epoch": 0.750810718552654, + "grad_norm": 0.45013357590581676, + "learning_rate": 9.974398361495137e-06, + "loss": 0.4949, + "num_tokens": 413699285.0, + "step": 4399 + }, + { + "epoch": 0.7509813961426864, + "grad_norm": 0.507124031246201, + "learning_rate": 9.96757125789384e-06, + "loss": 0.604, + "num_tokens": 413784039.0, + "step": 4400 + }, + { + "epoch": 0.7511520737327189, + "grad_norm": 0.4231696700203865, + "learning_rate": 9.960744154292542e-06, + "loss": 0.5121, + "num_tokens": 413895326.0, + "step": 4401 + }, + { + "epoch": 0.7513227513227513, + "grad_norm": 0.513428930327859, + "learning_rate": 9.953917050691245e-06, + "loss": 0.6086, + "num_tokens": 413970531.0, + "step": 4402 + }, + { + "epoch": 0.7514934289127837, + "grad_norm": 0.5607228295579075, + "learning_rate": 9.947089947089947e-06, + "loss": 0.5184, + "num_tokens": 414026060.0, + "step": 4403 + }, + { + "epoch": 0.7516641065028162, + "grad_norm": 0.5508821278979664, + "learning_rate": 9.940262843488651e-06, + "loss": 0.5861, + "num_tokens": 414112427.0, + "step": 4404 + }, + { + "epoch": 0.7518347840928487, + "grad_norm": 0.49856643266591516, + "learning_rate": 9.933435739887353e-06, + "loss": 0.6281, + "num_tokens": 414210392.0, + "step": 4405 + }, + { + "epoch": 0.7520054616828811, + "grad_norm": 0.4866378347925833, + "learning_rate": 9.926608636286057e-06, + "loss": 0.628, + "num_tokens": 414304714.0, + "step": 4406 + }, + { + "epoch": 0.7521761392729135, + "grad_norm": 0.4668886979717633, + "learning_rate": 9.919781532684759e-06, + "loss": 0.4907, + "num_tokens": 414385191.0, + "step": 4407 + }, + { + "epoch": 0.7523468168629459, + "grad_norm": 0.44974396176436543, + "learning_rate": 9.912954429083463e-06, + "loss": 0.5215, + "num_tokens": 414500297.0, + "step": 4408 + }, + { + "epoch": 0.7525174944529783, + "grad_norm": 0.4713162240824331, + "learning_rate": 9.906127325482165e-06, + "loss": 0.6623, + "num_tokens": 414611173.0, + "step": 4409 + }, + { + "epoch": 0.7526881720430108, + "grad_norm": 0.47983215938872664, + "learning_rate": 9.899300221880867e-06, + "loss": 0.5749, + "num_tokens": 414707718.0, + "step": 4410 + }, + { + "epoch": 0.7528588496330432, + "grad_norm": 0.45745925445811325, + "learning_rate": 9.89247311827957e-06, + "loss": 0.5405, + "num_tokens": 414806439.0, + "step": 4411 + }, + { + "epoch": 0.7530295272230756, + "grad_norm": 0.5082844804453913, + "learning_rate": 9.885646014678274e-06, + "loss": 0.5469, + "num_tokens": 414881962.0, + "step": 4412 + }, + { + "epoch": 0.753200204813108, + "grad_norm": 0.48107604396951764, + "learning_rate": 9.878818911076977e-06, + "loss": 0.5564, + "num_tokens": 414970493.0, + "step": 4413 + }, + { + "epoch": 0.7533708824031404, + "grad_norm": 0.4700975367055858, + "learning_rate": 9.871991807475679e-06, + "loss": 0.4943, + "num_tokens": 415051537.0, + "step": 4414 + }, + { + "epoch": 0.7535415599931728, + "grad_norm": 0.5021468995839762, + "learning_rate": 9.865164703874382e-06, + "loss": 0.5427, + "num_tokens": 415132908.0, + "step": 4415 + }, + { + "epoch": 0.7537122375832054, + "grad_norm": 0.4788233700929066, + "learning_rate": 9.858337600273084e-06, + "loss": 0.5467, + "num_tokens": 415217356.0, + "step": 4416 + }, + { + "epoch": 0.7538829151732378, + "grad_norm": 0.6673897632293267, + "learning_rate": 9.851510496671788e-06, + "loss": 0.6105, + "num_tokens": 415331087.0, + "step": 4417 + }, + { + "epoch": 0.7540535927632702, + "grad_norm": 0.4793050910515883, + "learning_rate": 9.84468339307049e-06, + "loss": 0.6405, + "num_tokens": 415438355.0, + "step": 4418 + }, + { + "epoch": 0.7542242703533026, + "grad_norm": 0.5428168980180111, + "learning_rate": 9.837856289469192e-06, + "loss": 0.5879, + "num_tokens": 415515992.0, + "step": 4419 + }, + { + "epoch": 0.7543949479433351, + "grad_norm": 0.45374307318641255, + "learning_rate": 9.831029185867896e-06, + "loss": 0.4631, + "num_tokens": 415603585.0, + "step": 4420 + }, + { + "epoch": 0.7545656255333675, + "grad_norm": 0.504396962141867, + "learning_rate": 9.8242020822666e-06, + "loss": 0.5079, + "num_tokens": 415668989.0, + "step": 4421 + }, + { + "epoch": 0.7547363031233999, + "grad_norm": 0.617372369654016, + "learning_rate": 9.817374978665302e-06, + "loss": 0.6583, + "num_tokens": 415728280.0, + "step": 4422 + }, + { + "epoch": 0.7549069807134323, + "grad_norm": 0.5431089783677541, + "learning_rate": 9.810547875064006e-06, + "loss": 0.6162, + "num_tokens": 415802572.0, + "step": 4423 + }, + { + "epoch": 0.7550776583034647, + "grad_norm": 0.5361980479412144, + "learning_rate": 9.803720771462708e-06, + "loss": 0.6506, + "num_tokens": 415899652.0, + "step": 4424 + }, + { + "epoch": 0.7552483358934972, + "grad_norm": 0.4184306874016687, + "learning_rate": 9.79689366786141e-06, + "loss": 0.5141, + "num_tokens": 416010475.0, + "step": 4425 + }, + { + "epoch": 0.7554190134835296, + "grad_norm": 0.5288051462699217, + "learning_rate": 9.790066564260114e-06, + "loss": 0.6031, + "num_tokens": 416081967.0, + "step": 4426 + }, + { + "epoch": 0.755589691073562, + "grad_norm": 0.47237305635257193, + "learning_rate": 9.783239460658817e-06, + "loss": 0.5827, + "num_tokens": 416178952.0, + "step": 4427 + }, + { + "epoch": 0.7557603686635944, + "grad_norm": 0.4619512827946518, + "learning_rate": 9.77641235705752e-06, + "loss": 0.4758, + "num_tokens": 416264123.0, + "step": 4428 + }, + { + "epoch": 0.755931046253627, + "grad_norm": 0.53584633033244, + "learning_rate": 9.769585253456221e-06, + "loss": 0.5908, + "num_tokens": 416346619.0, + "step": 4429 + }, + { + "epoch": 0.7561017238436594, + "grad_norm": 0.47083893072125776, + "learning_rate": 9.762758149854925e-06, + "loss": 0.547, + "num_tokens": 416434061.0, + "step": 4430 + }, + { + "epoch": 0.7562724014336918, + "grad_norm": 0.4766061151674145, + "learning_rate": 9.755931046253627e-06, + "loss": 0.5882, + "num_tokens": 416528103.0, + "step": 4431 + }, + { + "epoch": 0.7564430790237242, + "grad_norm": 0.4785649510670751, + "learning_rate": 9.749103942652331e-06, + "loss": 0.5587, + "num_tokens": 416615126.0, + "step": 4432 + }, + { + "epoch": 0.7566137566137566, + "grad_norm": 0.4414932917681694, + "learning_rate": 9.742276839051033e-06, + "loss": 0.543, + "num_tokens": 416723468.0, + "step": 4433 + }, + { + "epoch": 0.756784434203789, + "grad_norm": 0.5057221550025812, + "learning_rate": 9.735449735449735e-06, + "loss": 0.6204, + "num_tokens": 416808169.0, + "step": 4434 + }, + { + "epoch": 0.7569551117938215, + "grad_norm": 0.46592788187874246, + "learning_rate": 9.728622631848439e-06, + "loss": 0.491, + "num_tokens": 416899205.0, + "step": 4435 + }, + { + "epoch": 0.7571257893838539, + "grad_norm": 0.5677222784127941, + "learning_rate": 9.721795528247143e-06, + "loss": 0.5944, + "num_tokens": 416971393.0, + "step": 4436 + }, + { + "epoch": 0.7572964669738863, + "grad_norm": 0.48398148471118874, + "learning_rate": 9.714968424645845e-06, + "loss": 0.5986, + "num_tokens": 417063000.0, + "step": 4437 + }, + { + "epoch": 0.7574671445639187, + "grad_norm": 0.4302547660666367, + "learning_rate": 9.708141321044547e-06, + "loss": 0.5802, + "num_tokens": 417182394.0, + "step": 4438 + }, + { + "epoch": 0.7576378221539511, + "grad_norm": 0.53584224777, + "learning_rate": 9.70131421744325e-06, + "loss": 0.6026, + "num_tokens": 417262778.0, + "step": 4439 + }, + { + "epoch": 0.7578084997439836, + "grad_norm": 0.45091215848632504, + "learning_rate": 9.694487113841954e-06, + "loss": 0.5593, + "num_tokens": 417357496.0, + "step": 4440 + }, + { + "epoch": 0.7579791773340161, + "grad_norm": 0.4704919769506679, + "learning_rate": 9.687660010240656e-06, + "loss": 0.5861, + "num_tokens": 417452156.0, + "step": 4441 + }, + { + "epoch": 0.7581498549240485, + "grad_norm": 0.47206488440748234, + "learning_rate": 9.680832906639359e-06, + "loss": 0.5619, + "num_tokens": 417544617.0, + "step": 4442 + }, + { + "epoch": 0.7583205325140809, + "grad_norm": 0.44431901705383203, + "learning_rate": 9.674005803038062e-06, + "loss": 0.5785, + "num_tokens": 417653348.0, + "step": 4443 + }, + { + "epoch": 0.7584912101041134, + "grad_norm": 0.5421588661715374, + "learning_rate": 9.667178699436764e-06, + "loss": 0.6784, + "num_tokens": 417751633.0, + "step": 4444 + }, + { + "epoch": 0.7586618876941458, + "grad_norm": 0.45168457666970413, + "learning_rate": 9.660351595835468e-06, + "loss": 0.5026, + "num_tokens": 417844504.0, + "step": 4445 + }, + { + "epoch": 0.7588325652841782, + "grad_norm": 0.4846145736296778, + "learning_rate": 9.65352449223417e-06, + "loss": 0.5622, + "num_tokens": 417927221.0, + "step": 4446 + }, + { + "epoch": 0.7590032428742106, + "grad_norm": 0.45546712008511253, + "learning_rate": 9.646697388632872e-06, + "loss": 0.5406, + "num_tokens": 418021885.0, + "step": 4447 + }, + { + "epoch": 0.759173920464243, + "grad_norm": 0.5074577391146845, + "learning_rate": 9.639870285031576e-06, + "loss": 0.6139, + "num_tokens": 418109827.0, + "step": 4448 + }, + { + "epoch": 0.7593445980542755, + "grad_norm": 0.554466551284269, + "learning_rate": 9.63304318143028e-06, + "loss": 0.6127, + "num_tokens": 418174755.0, + "step": 4449 + }, + { + "epoch": 0.7595152756443079, + "grad_norm": 0.4503537733326652, + "learning_rate": 9.626216077828982e-06, + "loss": 0.5485, + "num_tokens": 418268991.0, + "step": 4450 + }, + { + "epoch": 0.7596859532343403, + "grad_norm": 0.4385008623586919, + "learning_rate": 9.619388974227684e-06, + "loss": 0.543, + "num_tokens": 418367286.0, + "step": 4451 + }, + { + "epoch": 0.7598566308243727, + "grad_norm": 0.47902203834350693, + "learning_rate": 9.612561870626388e-06, + "loss": 0.5738, + "num_tokens": 418456021.0, + "step": 4452 + }, + { + "epoch": 0.7600273084144051, + "grad_norm": 0.48235771620867257, + "learning_rate": 9.60573476702509e-06, + "loss": 0.5056, + "num_tokens": 418547971.0, + "step": 4453 + }, + { + "epoch": 0.7601979860044377, + "grad_norm": 0.42915176541265265, + "learning_rate": 9.598907663423794e-06, + "loss": 0.4778, + "num_tokens": 418649918.0, + "step": 4454 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.44832898906711677, + "learning_rate": 9.592080559822496e-06, + "loss": 0.5117, + "num_tokens": 418741098.0, + "step": 4455 + }, + { + "epoch": 0.7605393411845025, + "grad_norm": 0.4943682236286687, + "learning_rate": 9.5852534562212e-06, + "loss": 0.5655, + "num_tokens": 418820464.0, + "step": 4456 + }, + { + "epoch": 0.7607100187745349, + "grad_norm": 0.5562416507808026, + "learning_rate": 9.578426352619901e-06, + "loss": 0.5627, + "num_tokens": 418893942.0, + "step": 4457 + }, + { + "epoch": 0.7608806963645673, + "grad_norm": 0.5228132458211124, + "learning_rate": 9.571599249018605e-06, + "loss": 0.5658, + "num_tokens": 418965203.0, + "step": 4458 + }, + { + "epoch": 0.7610513739545998, + "grad_norm": 0.5163812048230894, + "learning_rate": 9.564772145417307e-06, + "loss": 0.6287, + "num_tokens": 419051865.0, + "step": 4459 + }, + { + "epoch": 0.7612220515446322, + "grad_norm": 0.47400328061836805, + "learning_rate": 9.557945041816011e-06, + "loss": 0.6036, + "num_tokens": 419147428.0, + "step": 4460 + }, + { + "epoch": 0.7613927291346646, + "grad_norm": 0.5055300628234156, + "learning_rate": 9.551117938214713e-06, + "loss": 0.6109, + "num_tokens": 419237651.0, + "step": 4461 + }, + { + "epoch": 0.761563406724697, + "grad_norm": 0.4971720869429387, + "learning_rate": 9.544290834613415e-06, + "loss": 0.6623, + "num_tokens": 419337762.0, + "step": 4462 + }, + { + "epoch": 0.7617340843147294, + "grad_norm": 0.4725981493813449, + "learning_rate": 9.537463731012119e-06, + "loss": 0.5342, + "num_tokens": 419424210.0, + "step": 4463 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.4756196201415142, + "learning_rate": 9.530636627410823e-06, + "loss": 0.5242, + "num_tokens": 419502684.0, + "step": 4464 + }, + { + "epoch": 0.7620754394947943, + "grad_norm": 0.47948915255643176, + "learning_rate": 9.523809523809525e-06, + "loss": 0.5806, + "num_tokens": 419590646.0, + "step": 4465 + }, + { + "epoch": 0.7622461170848268, + "grad_norm": 0.44918915041102403, + "learning_rate": 9.516982420208227e-06, + "loss": 0.5696, + "num_tokens": 419699397.0, + "step": 4466 + }, + { + "epoch": 0.7624167946748592, + "grad_norm": 0.45082740020220635, + "learning_rate": 9.51015531660693e-06, + "loss": 0.5631, + "num_tokens": 419814838.0, + "step": 4467 + }, + { + "epoch": 0.7625874722648917, + "grad_norm": 0.4930388300987612, + "learning_rate": 9.503328213005634e-06, + "loss": 0.5483, + "num_tokens": 419897601.0, + "step": 4468 + }, + { + "epoch": 0.7627581498549241, + "grad_norm": 0.4664389061401222, + "learning_rate": 9.496501109404336e-06, + "loss": 0.5548, + "num_tokens": 419985533.0, + "step": 4469 + }, + { + "epoch": 0.7629288274449565, + "grad_norm": 0.43422588998117245, + "learning_rate": 9.489674005803038e-06, + "loss": 0.532, + "num_tokens": 420093614.0, + "step": 4470 + }, + { + "epoch": 0.7630995050349889, + "grad_norm": 0.5164742395987232, + "learning_rate": 9.482846902201742e-06, + "loss": 0.5597, + "num_tokens": 420203468.0, + "step": 4471 + }, + { + "epoch": 0.7632701826250213, + "grad_norm": 0.4606169034850613, + "learning_rate": 9.476019798600444e-06, + "loss": 0.5633, + "num_tokens": 420295795.0, + "step": 4472 + }, + { + "epoch": 0.7634408602150538, + "grad_norm": 0.4298137177420179, + "learning_rate": 9.469192694999148e-06, + "loss": 0.5142, + "num_tokens": 420401449.0, + "step": 4473 + }, + { + "epoch": 0.7636115378050862, + "grad_norm": 0.5180350877667554, + "learning_rate": 9.46236559139785e-06, + "loss": 0.6878, + "num_tokens": 420492757.0, + "step": 4474 + }, + { + "epoch": 0.7637822153951186, + "grad_norm": 0.502604477702309, + "learning_rate": 9.455538487796552e-06, + "loss": 0.5908, + "num_tokens": 420582341.0, + "step": 4475 + }, + { + "epoch": 0.763952892985151, + "grad_norm": 0.5197702963335067, + "learning_rate": 9.448711384195256e-06, + "loss": 0.6118, + "num_tokens": 420672640.0, + "step": 4476 + }, + { + "epoch": 0.7641235705751834, + "grad_norm": 0.48318275521017234, + "learning_rate": 9.44188428059396e-06, + "loss": 0.5424, + "num_tokens": 420755614.0, + "step": 4477 + }, + { + "epoch": 0.764294248165216, + "grad_norm": 0.5551561645447183, + "learning_rate": 9.435057176992662e-06, + "loss": 0.5034, + "num_tokens": 420817357.0, + "step": 4478 + }, + { + "epoch": 0.7644649257552484, + "grad_norm": 0.45424288018532055, + "learning_rate": 9.428230073391364e-06, + "loss": 0.4805, + "num_tokens": 420898052.0, + "step": 4479 + }, + { + "epoch": 0.7646356033452808, + "grad_norm": 0.44062240568979877, + "learning_rate": 9.421402969790068e-06, + "loss": 0.5322, + "num_tokens": 421007593.0, + "step": 4480 + }, + { + "epoch": 0.7648062809353132, + "grad_norm": 0.4615501845499368, + "learning_rate": 9.41457586618877e-06, + "loss": 0.5579, + "num_tokens": 421105291.0, + "step": 4481 + }, + { + "epoch": 0.7649769585253456, + "grad_norm": 0.5183331439639892, + "learning_rate": 9.407748762587473e-06, + "loss": 0.5527, + "num_tokens": 421174309.0, + "step": 4482 + }, + { + "epoch": 0.7651476361153781, + "grad_norm": 0.47869916227293274, + "learning_rate": 9.400921658986176e-06, + "loss": 0.6004, + "num_tokens": 421274236.0, + "step": 4483 + }, + { + "epoch": 0.7653183137054105, + "grad_norm": 0.5415135050241976, + "learning_rate": 9.394094555384878e-06, + "loss": 0.6221, + "num_tokens": 421348146.0, + "step": 4484 + }, + { + "epoch": 0.7654889912954429, + "grad_norm": 0.5111591328330107, + "learning_rate": 9.387267451783581e-06, + "loss": 0.5464, + "num_tokens": 421420258.0, + "step": 4485 + }, + { + "epoch": 0.7656596688854753, + "grad_norm": 0.44018455713514754, + "learning_rate": 9.380440348182285e-06, + "loss": 0.5723, + "num_tokens": 421530630.0, + "step": 4486 + }, + { + "epoch": 0.7658303464755077, + "grad_norm": 0.4448382697916638, + "learning_rate": 9.373613244580987e-06, + "loss": 0.5464, + "num_tokens": 421630743.0, + "step": 4487 + }, + { + "epoch": 0.7660010240655402, + "grad_norm": 0.466854008802482, + "learning_rate": 9.36678614097969e-06, + "loss": 0.5247, + "num_tokens": 421723985.0, + "step": 4488 + }, + { + "epoch": 0.7661717016555726, + "grad_norm": 0.5036645525859088, + "learning_rate": 9.359959037378393e-06, + "loss": 0.5692, + "num_tokens": 421803255.0, + "step": 4489 + }, + { + "epoch": 0.766342379245605, + "grad_norm": 0.45553414531159214, + "learning_rate": 9.353131933777095e-06, + "loss": 0.6006, + "num_tokens": 421921109.0, + "step": 4490 + }, + { + "epoch": 0.7665130568356375, + "grad_norm": 0.49256463755469476, + "learning_rate": 9.346304830175799e-06, + "loss": 0.5789, + "num_tokens": 422013849.0, + "step": 4491 + }, + { + "epoch": 0.76668373442567, + "grad_norm": 0.4972082520999557, + "learning_rate": 9.339477726574503e-06, + "loss": 0.528, + "num_tokens": 422087678.0, + "step": 4492 + }, + { + "epoch": 0.7668544120157024, + "grad_norm": 0.6402870365652874, + "learning_rate": 9.332650622973205e-06, + "loss": 0.58, + "num_tokens": 422150396.0, + "step": 4493 + }, + { + "epoch": 0.7670250896057348, + "grad_norm": 0.421366844075591, + "learning_rate": 9.325823519371907e-06, + "loss": 0.5624, + "num_tokens": 422262896.0, + "step": 4494 + }, + { + "epoch": 0.7671957671957672, + "grad_norm": 0.44552251649091873, + "learning_rate": 9.31899641577061e-06, + "loss": 0.5827, + "num_tokens": 422366479.0, + "step": 4495 + }, + { + "epoch": 0.7673664447857996, + "grad_norm": 0.48638136438210045, + "learning_rate": 9.312169312169313e-06, + "loss": 0.5708, + "num_tokens": 422458179.0, + "step": 4496 + }, + { + "epoch": 0.767537122375832, + "grad_norm": 0.46572230937283754, + "learning_rate": 9.305342208568016e-06, + "loss": 0.5697, + "num_tokens": 422554795.0, + "step": 4497 + }, + { + "epoch": 0.7677077999658645, + "grad_norm": 0.4599964000306288, + "learning_rate": 9.298515104966718e-06, + "loss": 0.5934, + "num_tokens": 422660493.0, + "step": 4498 + }, + { + "epoch": 0.7678784775558969, + "grad_norm": 0.4392866483591171, + "learning_rate": 9.29168800136542e-06, + "loss": 0.5411, + "num_tokens": 422785430.0, + "step": 4499 + }, + { + "epoch": 0.7680491551459293, + "grad_norm": 0.4325895804353509, + "learning_rate": 9.284860897764124e-06, + "loss": 0.5835, + "num_tokens": 422899511.0, + "step": 4500 + }, + { + "epoch": 0.7682198327359617, + "grad_norm": 0.4675937459198648, + "learning_rate": 9.278033794162828e-06, + "loss": 0.5535, + "num_tokens": 422994509.0, + "step": 4501 + }, + { + "epoch": 0.7683905103259941, + "grad_norm": 0.4460707969710765, + "learning_rate": 9.27120669056153e-06, + "loss": 0.5222, + "num_tokens": 423100947.0, + "step": 4502 + }, + { + "epoch": 0.7685611879160267, + "grad_norm": 0.47763686181654635, + "learning_rate": 9.264379586960232e-06, + "loss": 0.5686, + "num_tokens": 423192100.0, + "step": 4503 + }, + { + "epoch": 0.7687318655060591, + "grad_norm": 0.4381582146220424, + "learning_rate": 9.257552483358936e-06, + "loss": 0.5936, + "num_tokens": 423308594.0, + "step": 4504 + }, + { + "epoch": 0.7689025430960915, + "grad_norm": 0.4344479110125472, + "learning_rate": 9.25072537975764e-06, + "loss": 0.6032, + "num_tokens": 423428767.0, + "step": 4505 + }, + { + "epoch": 0.7690732206861239, + "grad_norm": 0.4696103464978412, + "learning_rate": 9.243898276156342e-06, + "loss": 0.6242, + "num_tokens": 423524637.0, + "step": 4506 + }, + { + "epoch": 0.7692438982761564, + "grad_norm": 0.544846380279152, + "learning_rate": 9.237071172555044e-06, + "loss": 0.5515, + "num_tokens": 423597044.0, + "step": 4507 + }, + { + "epoch": 0.7694145758661888, + "grad_norm": 0.4966117392754593, + "learning_rate": 9.230244068953748e-06, + "loss": 0.5016, + "num_tokens": 423671447.0, + "step": 4508 + }, + { + "epoch": 0.7695852534562212, + "grad_norm": 0.4796027211022811, + "learning_rate": 9.22341696535245e-06, + "loss": 0.5526, + "num_tokens": 423758401.0, + "step": 4509 + }, + { + "epoch": 0.7697559310462536, + "grad_norm": 0.44402344683286826, + "learning_rate": 9.216589861751153e-06, + "loss": 0.6101, + "num_tokens": 423870571.0, + "step": 4510 + }, + { + "epoch": 0.769926608636286, + "grad_norm": 0.4737783929095333, + "learning_rate": 9.209762758149855e-06, + "loss": 0.5656, + "num_tokens": 423960887.0, + "step": 4511 + }, + { + "epoch": 0.7700972862263185, + "grad_norm": 0.4412598378314977, + "learning_rate": 9.202935654548558e-06, + "loss": 0.4825, + "num_tokens": 424053301.0, + "step": 4512 + }, + { + "epoch": 0.7702679638163509, + "grad_norm": 0.49604181567494016, + "learning_rate": 9.196108550947261e-06, + "loss": 0.6146, + "num_tokens": 424146036.0, + "step": 4513 + }, + { + "epoch": 0.7704386414063833, + "grad_norm": 0.4302833005846738, + "learning_rate": 9.189281447345965e-06, + "loss": 0.5836, + "num_tokens": 424258778.0, + "step": 4514 + }, + { + "epoch": 0.7706093189964157, + "grad_norm": 0.4856301526278776, + "learning_rate": 9.182454343744667e-06, + "loss": 0.5664, + "num_tokens": 424351612.0, + "step": 4515 + }, + { + "epoch": 0.7707799965864482, + "grad_norm": 0.4555318038380037, + "learning_rate": 9.17562724014337e-06, + "loss": 0.5853, + "num_tokens": 424468352.0, + "step": 4516 + }, + { + "epoch": 0.7709506741764807, + "grad_norm": 0.49433097491706823, + "learning_rate": 9.168800136542073e-06, + "loss": 0.4355, + "num_tokens": 424532010.0, + "step": 4517 + }, + { + "epoch": 0.7711213517665131, + "grad_norm": 0.4421706436874724, + "learning_rate": 9.161973032940775e-06, + "loss": 0.5419, + "num_tokens": 424635667.0, + "step": 4518 + }, + { + "epoch": 0.7712920293565455, + "grad_norm": 0.47582799417174243, + "learning_rate": 9.155145929339479e-06, + "loss": 0.6047, + "num_tokens": 424736999.0, + "step": 4519 + }, + { + "epoch": 0.7714627069465779, + "grad_norm": 0.47779959145446543, + "learning_rate": 9.148318825738181e-06, + "loss": 0.5619, + "num_tokens": 424821898.0, + "step": 4520 + }, + { + "epoch": 0.7716333845366103, + "grad_norm": 0.4457925740750154, + "learning_rate": 9.141491722136883e-06, + "loss": 0.519, + "num_tokens": 424913099.0, + "step": 4521 + }, + { + "epoch": 0.7718040621266428, + "grad_norm": 0.4845351346253027, + "learning_rate": 9.134664618535587e-06, + "loss": 0.4775, + "num_tokens": 424995311.0, + "step": 4522 + }, + { + "epoch": 0.7719747397166752, + "grad_norm": 0.4870649214438913, + "learning_rate": 9.12783751493429e-06, + "loss": 0.5311, + "num_tokens": 425076533.0, + "step": 4523 + }, + { + "epoch": 0.7721454173067076, + "grad_norm": 0.5047960364096198, + "learning_rate": 9.121010411332993e-06, + "loss": 0.5629, + "num_tokens": 425160505.0, + "step": 4524 + }, + { + "epoch": 0.77231609489674, + "grad_norm": 0.4774791890071037, + "learning_rate": 9.114183307731695e-06, + "loss": 0.6364, + "num_tokens": 425266390.0, + "step": 4525 + }, + { + "epoch": 0.7724867724867724, + "grad_norm": 0.4982697968608834, + "learning_rate": 9.107356204130398e-06, + "loss": 0.6118, + "num_tokens": 425355016.0, + "step": 4526 + }, + { + "epoch": 0.7726574500768049, + "grad_norm": 0.48494443072350496, + "learning_rate": 9.1005291005291e-06, + "loss": 0.5267, + "num_tokens": 425442743.0, + "step": 4527 + }, + { + "epoch": 0.7728281276668374, + "grad_norm": 0.4426526785579692, + "learning_rate": 9.093701996927804e-06, + "loss": 0.6169, + "num_tokens": 425554291.0, + "step": 4528 + }, + { + "epoch": 0.7729988052568698, + "grad_norm": 0.5278933852219229, + "learning_rate": 9.086874893326508e-06, + "loss": 0.6409, + "num_tokens": 425640934.0, + "step": 4529 + }, + { + "epoch": 0.7731694828469022, + "grad_norm": 0.5418593751130061, + "learning_rate": 9.08004778972521e-06, + "loss": 0.6371, + "num_tokens": 425744271.0, + "step": 4530 + }, + { + "epoch": 0.7733401604369347, + "grad_norm": 0.49844362826000577, + "learning_rate": 9.073220686123912e-06, + "loss": 0.6188, + "num_tokens": 425840523.0, + "step": 4531 + }, + { + "epoch": 0.7735108380269671, + "grad_norm": 0.4552683074353458, + "learning_rate": 9.066393582522616e-06, + "loss": 0.5444, + "num_tokens": 425938272.0, + "step": 4532 + }, + { + "epoch": 0.7736815156169995, + "grad_norm": 0.4272446088397507, + "learning_rate": 9.05956647892132e-06, + "loss": 0.5872, + "num_tokens": 426065559.0, + "step": 4533 + }, + { + "epoch": 0.7738521932070319, + "grad_norm": 0.4746114842469308, + "learning_rate": 9.052739375320022e-06, + "loss": 0.5118, + "num_tokens": 426147897.0, + "step": 4534 + }, + { + "epoch": 0.7740228707970643, + "grad_norm": 0.4570658926495669, + "learning_rate": 9.045912271718724e-06, + "loss": 0.5508, + "num_tokens": 426254551.0, + "step": 4535 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.49061508916507635, + "learning_rate": 9.039085168117428e-06, + "loss": 0.549, + "num_tokens": 426344735.0, + "step": 4536 + }, + { + "epoch": 0.7743642259771292, + "grad_norm": 0.4770853870665972, + "learning_rate": 9.03225806451613e-06, + "loss": 0.5804, + "num_tokens": 426434480.0, + "step": 4537 + }, + { + "epoch": 0.7745349035671616, + "grad_norm": 0.5089379414067082, + "learning_rate": 9.025430960914833e-06, + "loss": 0.6217, + "num_tokens": 426526671.0, + "step": 4538 + }, + { + "epoch": 0.774705581157194, + "grad_norm": 0.4824664243245564, + "learning_rate": 9.018603857313535e-06, + "loss": 0.5867, + "num_tokens": 426617131.0, + "step": 4539 + }, + { + "epoch": 0.7748762587472265, + "grad_norm": 0.46334079041523496, + "learning_rate": 9.011776753712238e-06, + "loss": 0.5214, + "num_tokens": 426717112.0, + "step": 4540 + }, + { + "epoch": 0.775046936337259, + "grad_norm": 0.5088525234917365, + "learning_rate": 9.004949650110941e-06, + "loss": 0.5795, + "num_tokens": 426818080.0, + "step": 4541 + }, + { + "epoch": 0.7752176139272914, + "grad_norm": 0.46875316995243166, + "learning_rate": 8.998122546509645e-06, + "loss": 0.5506, + "num_tokens": 426913226.0, + "step": 4542 + }, + { + "epoch": 0.7753882915173238, + "grad_norm": 0.5309452183948454, + "learning_rate": 8.991295442908347e-06, + "loss": 0.6116, + "num_tokens": 426985218.0, + "step": 4543 + }, + { + "epoch": 0.7755589691073562, + "grad_norm": 0.44376451728445676, + "learning_rate": 8.98446833930705e-06, + "loss": 0.6068, + "num_tokens": 427095894.0, + "step": 4544 + }, + { + "epoch": 0.7757296466973886, + "grad_norm": 0.5213292600816617, + "learning_rate": 8.977641235705753e-06, + "loss": 0.5886, + "num_tokens": 427194182.0, + "step": 4545 + }, + { + "epoch": 0.7759003242874211, + "grad_norm": 0.4573978683130957, + "learning_rate": 8.970814132104455e-06, + "loss": 0.5888, + "num_tokens": 427295496.0, + "step": 4546 + }, + { + "epoch": 0.7760710018774535, + "grad_norm": 0.46991400018446594, + "learning_rate": 8.963987028503159e-06, + "loss": 0.6199, + "num_tokens": 427402652.0, + "step": 4547 + }, + { + "epoch": 0.7762416794674859, + "grad_norm": 0.4575615911416777, + "learning_rate": 8.95715992490186e-06, + "loss": 0.5637, + "num_tokens": 427504517.0, + "step": 4548 + }, + { + "epoch": 0.7764123570575183, + "grad_norm": 0.4214203033876525, + "learning_rate": 8.950332821300563e-06, + "loss": 0.476, + "num_tokens": 427609272.0, + "step": 4549 + }, + { + "epoch": 0.7765830346475507, + "grad_norm": 0.45310210370323484, + "learning_rate": 8.943505717699267e-06, + "loss": 0.5463, + "num_tokens": 427706959.0, + "step": 4550 + }, + { + "epoch": 0.7767537122375832, + "grad_norm": 0.5017968125534078, + "learning_rate": 8.93667861409797e-06, + "loss": 0.5749, + "num_tokens": 427782946.0, + "step": 4551 + }, + { + "epoch": 0.7769243898276156, + "grad_norm": 0.48139278986226147, + "learning_rate": 8.929851510496672e-06, + "loss": 0.5725, + "num_tokens": 427873389.0, + "step": 4552 + }, + { + "epoch": 0.7770950674176481, + "grad_norm": 0.5333666968274469, + "learning_rate": 8.923024406895375e-06, + "loss": 0.4898, + "num_tokens": 427929686.0, + "step": 4553 + }, + { + "epoch": 0.7772657450076805, + "grad_norm": 0.4762745221295784, + "learning_rate": 8.916197303294078e-06, + "loss": 0.57, + "num_tokens": 428023064.0, + "step": 4554 + }, + { + "epoch": 0.777436422597713, + "grad_norm": 0.4497015992064834, + "learning_rate": 8.90937019969278e-06, + "loss": 0.5182, + "num_tokens": 428113801.0, + "step": 4555 + }, + { + "epoch": 0.7776071001877454, + "grad_norm": 0.4796222442567248, + "learning_rate": 8.902543096091484e-06, + "loss": 0.4844, + "num_tokens": 428194877.0, + "step": 4556 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.4559341996495817, + "learning_rate": 8.895715992490186e-06, + "loss": 0.5346, + "num_tokens": 428287715.0, + "step": 4557 + }, + { + "epoch": 0.7779484553678102, + "grad_norm": 0.48965669488191427, + "learning_rate": 8.888888888888888e-06, + "loss": 0.578, + "num_tokens": 428371047.0, + "step": 4558 + }, + { + "epoch": 0.7781191329578426, + "grad_norm": 0.4522941584080557, + "learning_rate": 8.882061785287592e-06, + "loss": 0.5559, + "num_tokens": 428472636.0, + "step": 4559 + }, + { + "epoch": 0.778289810547875, + "grad_norm": 0.48856879027383443, + "learning_rate": 8.875234681686296e-06, + "loss": 0.5835, + "num_tokens": 428557320.0, + "step": 4560 + }, + { + "epoch": 0.7784604881379075, + "grad_norm": 0.48421760626196886, + "learning_rate": 8.868407578084998e-06, + "loss": 0.545, + "num_tokens": 428642280.0, + "step": 4561 + }, + { + "epoch": 0.7786311657279399, + "grad_norm": 0.4012509199182392, + "learning_rate": 8.861580474483702e-06, + "loss": 0.569, + "num_tokens": 428775455.0, + "step": 4562 + }, + { + "epoch": 0.7788018433179723, + "grad_norm": 0.44142786312754967, + "learning_rate": 8.854753370882404e-06, + "loss": 0.5946, + "num_tokens": 428893146.0, + "step": 4563 + }, + { + "epoch": 0.7789725209080047, + "grad_norm": 0.49770792304713324, + "learning_rate": 8.847926267281107e-06, + "loss": 0.4907, + "num_tokens": 428964969.0, + "step": 4564 + }, + { + "epoch": 0.7791431984980373, + "grad_norm": 0.4654778908875265, + "learning_rate": 8.84109916367981e-06, + "loss": 0.6055, + "num_tokens": 429063596.0, + "step": 4565 + }, + { + "epoch": 0.7793138760880697, + "grad_norm": 0.494499183482781, + "learning_rate": 8.834272060078513e-06, + "loss": 0.5692, + "num_tokens": 429150429.0, + "step": 4566 + }, + { + "epoch": 0.7794845536781021, + "grad_norm": 0.46921031785447426, + "learning_rate": 8.827444956477215e-06, + "loss": 0.5601, + "num_tokens": 429241188.0, + "step": 4567 + }, + { + "epoch": 0.7796552312681345, + "grad_norm": 0.4764747188663704, + "learning_rate": 8.820617852875917e-06, + "loss": 0.6309, + "num_tokens": 429351429.0, + "step": 4568 + }, + { + "epoch": 0.7798259088581669, + "grad_norm": 0.5036611806146156, + "learning_rate": 8.813790749274621e-06, + "loss": 0.5691, + "num_tokens": 429432206.0, + "step": 4569 + }, + { + "epoch": 0.7799965864481994, + "grad_norm": 0.4700963740689947, + "learning_rate": 8.806963645673325e-06, + "loss": 0.6333, + "num_tokens": 429550658.0, + "step": 4570 + }, + { + "epoch": 0.7801672640382318, + "grad_norm": 0.4367878318534245, + "learning_rate": 8.800136542072027e-06, + "loss": 0.5275, + "num_tokens": 429653275.0, + "step": 4571 + }, + { + "epoch": 0.7803379416282642, + "grad_norm": 0.44579577522944247, + "learning_rate": 8.793309438470729e-06, + "loss": 0.5135, + "num_tokens": 429754875.0, + "step": 4572 + }, + { + "epoch": 0.7805086192182966, + "grad_norm": 0.49361128642347313, + "learning_rate": 8.786482334869433e-06, + "loss": 0.6328, + "num_tokens": 429848126.0, + "step": 4573 + }, + { + "epoch": 0.780679296808329, + "grad_norm": 0.46770657583914893, + "learning_rate": 8.779655231268135e-06, + "loss": 0.6048, + "num_tokens": 429951925.0, + "step": 4574 + }, + { + "epoch": 0.7808499743983615, + "grad_norm": 0.4290259108386704, + "learning_rate": 8.772828127666839e-06, + "loss": 0.5436, + "num_tokens": 430057091.0, + "step": 4575 + }, + { + "epoch": 0.7810206519883939, + "grad_norm": 0.472695336897098, + "learning_rate": 8.76600102406554e-06, + "loss": 0.5752, + "num_tokens": 430148561.0, + "step": 4576 + }, + { + "epoch": 0.7811913295784264, + "grad_norm": 0.4955730812773947, + "learning_rate": 8.759173920464243e-06, + "loss": 0.6567, + "num_tokens": 430251601.0, + "step": 4577 + }, + { + "epoch": 0.7813620071684588, + "grad_norm": 0.4388469846718159, + "learning_rate": 8.752346816862947e-06, + "loss": 0.4859, + "num_tokens": 430350158.0, + "step": 4578 + }, + { + "epoch": 0.7815326847584912, + "grad_norm": 0.5089645039736462, + "learning_rate": 8.74551971326165e-06, + "loss": 0.5055, + "num_tokens": 430422916.0, + "step": 4579 + }, + { + "epoch": 0.7817033623485237, + "grad_norm": 0.4848574930650972, + "learning_rate": 8.738692609660352e-06, + "loss": 0.5821, + "num_tokens": 430513810.0, + "step": 4580 + }, + { + "epoch": 0.7818740399385561, + "grad_norm": 0.5054786389256934, + "learning_rate": 8.731865506059055e-06, + "loss": 0.5402, + "num_tokens": 430594178.0, + "step": 4581 + }, + { + "epoch": 0.7820447175285885, + "grad_norm": 0.4608060943162128, + "learning_rate": 8.725038402457758e-06, + "loss": 0.5499, + "num_tokens": 430687482.0, + "step": 4582 + }, + { + "epoch": 0.7822153951186209, + "grad_norm": 0.45069466346455134, + "learning_rate": 8.71821129885646e-06, + "loss": 0.5428, + "num_tokens": 430784142.0, + "step": 4583 + }, + { + "epoch": 0.7823860727086533, + "grad_norm": 0.49610871363859954, + "learning_rate": 8.711384195255164e-06, + "loss": 0.5383, + "num_tokens": 430861803.0, + "step": 4584 + }, + { + "epoch": 0.7825567502986858, + "grad_norm": 0.5096610792213119, + "learning_rate": 8.704557091653866e-06, + "loss": 0.638, + "num_tokens": 430953415.0, + "step": 4585 + }, + { + "epoch": 0.7827274278887182, + "grad_norm": 0.4957544243552195, + "learning_rate": 8.697729988052568e-06, + "loss": 0.5235, + "num_tokens": 431038809.0, + "step": 4586 + }, + { + "epoch": 0.7828981054787506, + "grad_norm": 0.4816357608419736, + "learning_rate": 8.690902884451272e-06, + "loss": 0.5918, + "num_tokens": 431133164.0, + "step": 4587 + }, + { + "epoch": 0.783068783068783, + "grad_norm": 0.4675032096139556, + "learning_rate": 8.684075780849976e-06, + "loss": 0.5007, + "num_tokens": 431223902.0, + "step": 4588 + }, + { + "epoch": 0.7832394606588154, + "grad_norm": 0.4463678166796841, + "learning_rate": 8.677248677248678e-06, + "loss": 0.5653, + "num_tokens": 431329238.0, + "step": 4589 + }, + { + "epoch": 0.783410138248848, + "grad_norm": 0.46199259006788307, + "learning_rate": 8.67042157364738e-06, + "loss": 0.5178, + "num_tokens": 431421426.0, + "step": 4590 + }, + { + "epoch": 0.7835808158388804, + "grad_norm": 0.43626465550114807, + "learning_rate": 8.663594470046084e-06, + "loss": 0.5674, + "num_tokens": 431530721.0, + "step": 4591 + }, + { + "epoch": 0.7837514934289128, + "grad_norm": 0.4425831088710978, + "learning_rate": 8.656767366444786e-06, + "loss": 0.6462, + "num_tokens": 431648986.0, + "step": 4592 + }, + { + "epoch": 0.7839221710189452, + "grad_norm": 0.5387976837963264, + "learning_rate": 8.64994026284349e-06, + "loss": 0.5932, + "num_tokens": 431718197.0, + "step": 4593 + }, + { + "epoch": 0.7840928486089777, + "grad_norm": 0.504602722676754, + "learning_rate": 8.643113159242192e-06, + "loss": 0.5541, + "num_tokens": 431802491.0, + "step": 4594 + }, + { + "epoch": 0.7842635261990101, + "grad_norm": 0.5570452574510517, + "learning_rate": 8.636286055640895e-06, + "loss": 0.5717, + "num_tokens": 431902905.0, + "step": 4595 + }, + { + "epoch": 0.7844342037890425, + "grad_norm": 0.49290517963269287, + "learning_rate": 8.629458952039597e-06, + "loss": 0.5333, + "num_tokens": 431986921.0, + "step": 4596 + }, + { + "epoch": 0.7846048813790749, + "grad_norm": 0.4430669303056807, + "learning_rate": 8.622631848438301e-06, + "loss": 0.5853, + "num_tokens": 432103760.0, + "step": 4597 + }, + { + "epoch": 0.7847755589691073, + "grad_norm": 0.43263665470350543, + "learning_rate": 8.615804744837005e-06, + "loss": 0.5665, + "num_tokens": 432217443.0, + "step": 4598 + }, + { + "epoch": 0.7849462365591398, + "grad_norm": 0.4675827908849969, + "learning_rate": 8.608977641235707e-06, + "loss": 0.5636, + "num_tokens": 432315986.0, + "step": 4599 + }, + { + "epoch": 0.7851169141491722, + "grad_norm": 0.45187513898974707, + "learning_rate": 8.602150537634409e-06, + "loss": 0.5939, + "num_tokens": 432431178.0, + "step": 4600 + }, + { + "epoch": 0.7852875917392046, + "grad_norm": 0.41866053729639074, + "learning_rate": 8.595323434033113e-06, + "loss": 0.5919, + "num_tokens": 432566264.0, + "step": 4601 + }, + { + "epoch": 0.7854582693292371, + "grad_norm": 0.4392651647741495, + "learning_rate": 8.588496330431815e-06, + "loss": 0.5975, + "num_tokens": 432676588.0, + "step": 4602 + }, + { + "epoch": 0.7856289469192695, + "grad_norm": 0.46103188378027415, + "learning_rate": 8.581669226830519e-06, + "loss": 0.5788, + "num_tokens": 432774404.0, + "step": 4603 + }, + { + "epoch": 0.785799624509302, + "grad_norm": 0.4633664233935847, + "learning_rate": 8.57484212322922e-06, + "loss": 0.5786, + "num_tokens": 432877683.0, + "step": 4604 + }, + { + "epoch": 0.7859703020993344, + "grad_norm": 0.398142534115965, + "learning_rate": 8.568015019627923e-06, + "loss": 0.5377, + "num_tokens": 433009951.0, + "step": 4605 + }, + { + "epoch": 0.7861409796893668, + "grad_norm": 0.4622303752097327, + "learning_rate": 8.561187916026627e-06, + "loss": 0.5903, + "num_tokens": 433116374.0, + "step": 4606 + }, + { + "epoch": 0.7863116572793992, + "grad_norm": 0.4454738364301083, + "learning_rate": 8.55436081242533e-06, + "loss": 0.5417, + "num_tokens": 433217661.0, + "step": 4607 + }, + { + "epoch": 0.7864823348694316, + "grad_norm": 0.5461894819053416, + "learning_rate": 8.547533708824032e-06, + "loss": 0.7025, + "num_tokens": 433302406.0, + "step": 4608 + }, + { + "epoch": 0.7866530124594641, + "grad_norm": 0.48384519476435206, + "learning_rate": 8.540706605222734e-06, + "loss": 0.5855, + "num_tokens": 433394191.0, + "step": 4609 + }, + { + "epoch": 0.7868236900494965, + "grad_norm": 0.48109797204044713, + "learning_rate": 8.533879501621438e-06, + "loss": 0.6413, + "num_tokens": 433492501.0, + "step": 4610 + }, + { + "epoch": 0.7869943676395289, + "grad_norm": 0.45452814177225837, + "learning_rate": 8.52705239802014e-06, + "loss": 0.5759, + "num_tokens": 433589255.0, + "step": 4611 + }, + { + "epoch": 0.7871650452295613, + "grad_norm": 0.4408157451697141, + "learning_rate": 8.520225294418844e-06, + "loss": 0.5747, + "num_tokens": 433693911.0, + "step": 4612 + }, + { + "epoch": 0.7873357228195937, + "grad_norm": 0.5400864311781873, + "learning_rate": 8.513398190817546e-06, + "loss": 0.6007, + "num_tokens": 433794118.0, + "step": 4613 + }, + { + "epoch": 0.7875064004096262, + "grad_norm": 0.4144863244227111, + "learning_rate": 8.506571087216248e-06, + "loss": 0.5568, + "num_tokens": 433909925.0, + "step": 4614 + }, + { + "epoch": 0.7876770779996587, + "grad_norm": 0.427347547210067, + "learning_rate": 8.499743983614952e-06, + "loss": 0.5611, + "num_tokens": 434024463.0, + "step": 4615 + }, + { + "epoch": 0.7878477555896911, + "grad_norm": 0.6020443125918318, + "learning_rate": 8.492916880013656e-06, + "loss": 0.6249, + "num_tokens": 434108612.0, + "step": 4616 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 0.43223270534319364, + "learning_rate": 8.486089776412358e-06, + "loss": 0.5238, + "num_tokens": 434214804.0, + "step": 4617 + }, + { + "epoch": 0.788189110769756, + "grad_norm": 0.4434831823321924, + "learning_rate": 8.47926267281106e-06, + "loss": 0.5839, + "num_tokens": 434317418.0, + "step": 4618 + }, + { + "epoch": 0.7883597883597884, + "grad_norm": 0.4325608888522041, + "learning_rate": 8.472435569209764e-06, + "loss": 0.5654, + "num_tokens": 434429906.0, + "step": 4619 + }, + { + "epoch": 0.7885304659498208, + "grad_norm": 0.4806149639795102, + "learning_rate": 8.465608465608466e-06, + "loss": 0.5797, + "num_tokens": 434521263.0, + "step": 4620 + }, + { + "epoch": 0.7887011435398532, + "grad_norm": 0.47374016853348877, + "learning_rate": 8.45878136200717e-06, + "loss": 0.5365, + "num_tokens": 434606801.0, + "step": 4621 + }, + { + "epoch": 0.7888718211298856, + "grad_norm": 0.5195008072318411, + "learning_rate": 8.451954258405872e-06, + "loss": 0.5816, + "num_tokens": 434686037.0, + "step": 4622 + }, + { + "epoch": 0.789042498719918, + "grad_norm": 0.46900584101829657, + "learning_rate": 8.445127154804574e-06, + "loss": 0.5799, + "num_tokens": 434797831.0, + "step": 4623 + }, + { + "epoch": 0.7892131763099505, + "grad_norm": 0.41733832761401374, + "learning_rate": 8.438300051203277e-06, + "loss": 0.5922, + "num_tokens": 434921400.0, + "step": 4624 + }, + { + "epoch": 0.7893838538999829, + "grad_norm": 0.44359222910953233, + "learning_rate": 8.431472947601981e-06, + "loss": 0.589, + "num_tokens": 435026579.0, + "step": 4625 + }, + { + "epoch": 0.7895545314900153, + "grad_norm": 0.5298700464635128, + "learning_rate": 8.424645844000683e-06, + "loss": 0.6014, + "num_tokens": 435107128.0, + "step": 4626 + }, + { + "epoch": 0.7897252090800478, + "grad_norm": 0.44638202515282044, + "learning_rate": 8.417818740399385e-06, + "loss": 0.5614, + "num_tokens": 435215881.0, + "step": 4627 + }, + { + "epoch": 0.7898958866700803, + "grad_norm": 0.5016526106542457, + "learning_rate": 8.410991636798089e-06, + "loss": 0.6482, + "num_tokens": 435306018.0, + "step": 4628 + }, + { + "epoch": 0.7900665642601127, + "grad_norm": 0.4780506967021364, + "learning_rate": 8.404164533196793e-06, + "loss": 0.6216, + "num_tokens": 435404001.0, + "step": 4629 + }, + { + "epoch": 0.7902372418501451, + "grad_norm": 0.45978715919892893, + "learning_rate": 8.397337429595495e-06, + "loss": 0.5413, + "num_tokens": 435498817.0, + "step": 4630 + }, + { + "epoch": 0.7904079194401775, + "grad_norm": 0.4357432717585096, + "learning_rate": 8.390510325994199e-06, + "loss": 0.5223, + "num_tokens": 435599883.0, + "step": 4631 + }, + { + "epoch": 0.7905785970302099, + "grad_norm": 0.4442680183572049, + "learning_rate": 8.3836832223929e-06, + "loss": 0.6473, + "num_tokens": 435711063.0, + "step": 4632 + }, + { + "epoch": 0.7907492746202424, + "grad_norm": 0.500758003283685, + "learning_rate": 8.376856118791603e-06, + "loss": 0.5725, + "num_tokens": 435796663.0, + "step": 4633 + }, + { + "epoch": 0.7909199522102748, + "grad_norm": 0.5191335675844869, + "learning_rate": 8.370029015190307e-06, + "loss": 0.5261, + "num_tokens": 435866081.0, + "step": 4634 + }, + { + "epoch": 0.7910906298003072, + "grad_norm": 0.5979566571772553, + "learning_rate": 8.36320191158901e-06, + "loss": 0.6544, + "num_tokens": 435926609.0, + "step": 4635 + }, + { + "epoch": 0.7912613073903396, + "grad_norm": 0.45726854108560566, + "learning_rate": 8.356374807987712e-06, + "loss": 0.5217, + "num_tokens": 436015005.0, + "step": 4636 + }, + { + "epoch": 0.791431984980372, + "grad_norm": 0.7927775343042611, + "learning_rate": 8.349547704386414e-06, + "loss": 0.5485, + "num_tokens": 436108526.0, + "step": 4637 + }, + { + "epoch": 0.7916026625704045, + "grad_norm": 0.44920973042342965, + "learning_rate": 8.342720600785118e-06, + "loss": 0.5747, + "num_tokens": 436211074.0, + "step": 4638 + }, + { + "epoch": 0.791773340160437, + "grad_norm": 0.4822836248328942, + "learning_rate": 8.33589349718382e-06, + "loss": 0.6166, + "num_tokens": 436306243.0, + "step": 4639 + }, + { + "epoch": 0.7919440177504694, + "grad_norm": 0.4563287753253783, + "learning_rate": 8.329066393582524e-06, + "loss": 0.6283, + "num_tokens": 436414296.0, + "step": 4640 + }, + { + "epoch": 0.7921146953405018, + "grad_norm": 0.525407627849846, + "learning_rate": 8.322239289981226e-06, + "loss": 0.5997, + "num_tokens": 436490305.0, + "step": 4641 + }, + { + "epoch": 0.7922853729305342, + "grad_norm": 0.43797166215374833, + "learning_rate": 8.315412186379928e-06, + "loss": 0.6034, + "num_tokens": 436615076.0, + "step": 4642 + }, + { + "epoch": 0.7924560505205667, + "grad_norm": 0.4659227003901962, + "learning_rate": 8.308585082778632e-06, + "loss": 0.5769, + "num_tokens": 436708678.0, + "step": 4643 + }, + { + "epoch": 0.7926267281105991, + "grad_norm": 0.4347511103543041, + "learning_rate": 8.301757979177336e-06, + "loss": 0.532, + "num_tokens": 436810295.0, + "step": 4644 + }, + { + "epoch": 0.7927974057006315, + "grad_norm": 0.4929947822057612, + "learning_rate": 8.294930875576038e-06, + "loss": 0.5755, + "num_tokens": 436897372.0, + "step": 4645 + }, + { + "epoch": 0.7929680832906639, + "grad_norm": 0.44660059342540664, + "learning_rate": 8.28810377197474e-06, + "loss": 0.5349, + "num_tokens": 436994532.0, + "step": 4646 + }, + { + "epoch": 0.7931387608806963, + "grad_norm": 0.5354149399697585, + "learning_rate": 8.281276668373444e-06, + "loss": 0.5593, + "num_tokens": 437065092.0, + "step": 4647 + }, + { + "epoch": 0.7933094384707288, + "grad_norm": 0.47844954690907116, + "learning_rate": 8.274449564772146e-06, + "loss": 0.5148, + "num_tokens": 437148781.0, + "step": 4648 + }, + { + "epoch": 0.7934801160607612, + "grad_norm": 0.5391541644331918, + "learning_rate": 8.26762246117085e-06, + "loss": 0.5593, + "num_tokens": 437217459.0, + "step": 4649 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 0.4749223293470301, + "learning_rate": 8.260795357569551e-06, + "loss": 0.6505, + "num_tokens": 437329320.0, + "step": 4650 + }, + { + "epoch": 0.793821471240826, + "grad_norm": 0.43046038306084167, + "learning_rate": 8.253968253968254e-06, + "loss": 0.5078, + "num_tokens": 437431474.0, + "step": 4651 + }, + { + "epoch": 0.7939921488308586, + "grad_norm": 0.45041606611280705, + "learning_rate": 8.247141150366957e-06, + "loss": 0.6076, + "num_tokens": 437555576.0, + "step": 4652 + }, + { + "epoch": 0.794162826420891, + "grad_norm": 0.5467136829624468, + "learning_rate": 8.240314046765661e-06, + "loss": 0.6847, + "num_tokens": 437639255.0, + "step": 4653 + }, + { + "epoch": 0.7943335040109234, + "grad_norm": 0.5155956774945846, + "learning_rate": 8.233486943164363e-06, + "loss": 0.5401, + "num_tokens": 437709778.0, + "step": 4654 + }, + { + "epoch": 0.7945041816009558, + "grad_norm": 0.45093491332247954, + "learning_rate": 8.226659839563065e-06, + "loss": 0.518, + "num_tokens": 437812022.0, + "step": 4655 + }, + { + "epoch": 0.7946748591909882, + "grad_norm": 0.4870641646383607, + "learning_rate": 8.219832735961769e-06, + "loss": 0.5366, + "num_tokens": 437891770.0, + "step": 4656 + }, + { + "epoch": 0.7948455367810207, + "grad_norm": 0.49134484889626057, + "learning_rate": 8.213005632360471e-06, + "loss": 0.6033, + "num_tokens": 437988305.0, + "step": 4657 + }, + { + "epoch": 0.7950162143710531, + "grad_norm": 0.44440244613634977, + "learning_rate": 8.206178528759175e-06, + "loss": 0.5702, + "num_tokens": 438091870.0, + "step": 4658 + }, + { + "epoch": 0.7951868919610855, + "grad_norm": 0.5268637119108853, + "learning_rate": 8.199351425157877e-06, + "loss": 0.5373, + "num_tokens": 438158229.0, + "step": 4659 + }, + { + "epoch": 0.7953575695511179, + "grad_norm": 0.5775655646019227, + "learning_rate": 8.19252432155658e-06, + "loss": 0.5918, + "num_tokens": 438217052.0, + "step": 4660 + }, + { + "epoch": 0.7955282471411503, + "grad_norm": 0.5024238776609122, + "learning_rate": 8.185697217955283e-06, + "loss": 0.6908, + "num_tokens": 438321610.0, + "step": 4661 + }, + { + "epoch": 0.7956989247311828, + "grad_norm": 0.43373112751671394, + "learning_rate": 8.178870114353986e-06, + "loss": 0.4988, + "num_tokens": 438428646.0, + "step": 4662 + }, + { + "epoch": 0.7958696023212152, + "grad_norm": 0.43670897010782955, + "learning_rate": 8.172043010752689e-06, + "loss": 0.5892, + "num_tokens": 438537130.0, + "step": 4663 + }, + { + "epoch": 0.7960402799112477, + "grad_norm": 0.4863851549844351, + "learning_rate": 8.165215907151392e-06, + "loss": 0.5321, + "num_tokens": 438616370.0, + "step": 4664 + }, + { + "epoch": 0.7962109575012801, + "grad_norm": 0.4279499308461102, + "learning_rate": 8.158388803550094e-06, + "loss": 0.583, + "num_tokens": 438731017.0, + "step": 4665 + }, + { + "epoch": 0.7963816350913125, + "grad_norm": 0.43177232703506974, + "learning_rate": 8.151561699948798e-06, + "loss": 0.6101, + "num_tokens": 438850451.0, + "step": 4666 + }, + { + "epoch": 0.796552312681345, + "grad_norm": 0.47255085694124493, + "learning_rate": 8.1447345963475e-06, + "loss": 0.6493, + "num_tokens": 438954493.0, + "step": 4667 + }, + { + "epoch": 0.7967229902713774, + "grad_norm": 0.5000918217271694, + "learning_rate": 8.137907492746204e-06, + "loss": 0.5678, + "num_tokens": 439033174.0, + "step": 4668 + }, + { + "epoch": 0.7968936678614098, + "grad_norm": 0.43051004983527585, + "learning_rate": 8.131080389144906e-06, + "loss": 0.552, + "num_tokens": 439146241.0, + "step": 4669 + }, + { + "epoch": 0.7970643454514422, + "grad_norm": 0.46971760806463425, + "learning_rate": 8.124253285543608e-06, + "loss": 0.5236, + "num_tokens": 439246240.0, + "step": 4670 + }, + { + "epoch": 0.7972350230414746, + "grad_norm": 0.4660851937776103, + "learning_rate": 8.117426181942312e-06, + "loss": 0.6093, + "num_tokens": 439349486.0, + "step": 4671 + }, + { + "epoch": 0.7974057006315071, + "grad_norm": 0.4979952260345353, + "learning_rate": 8.110599078341016e-06, + "loss": 0.559, + "num_tokens": 439430743.0, + "step": 4672 + }, + { + "epoch": 0.7975763782215395, + "grad_norm": 0.42757577792360335, + "learning_rate": 8.103771974739718e-06, + "loss": 0.5109, + "num_tokens": 439535835.0, + "step": 4673 + }, + { + "epoch": 0.7977470558115719, + "grad_norm": 0.44250245049017345, + "learning_rate": 8.09694487113842e-06, + "loss": 0.5813, + "num_tokens": 439647673.0, + "step": 4674 + }, + { + "epoch": 0.7979177334016043, + "grad_norm": 0.5105691675240714, + "learning_rate": 8.090117767537124e-06, + "loss": 0.5487, + "num_tokens": 439718961.0, + "step": 4675 + }, + { + "epoch": 0.7980884109916367, + "grad_norm": 0.4554244975395649, + "learning_rate": 8.083290663935826e-06, + "loss": 0.5466, + "num_tokens": 439813555.0, + "step": 4676 + }, + { + "epoch": 0.7982590885816693, + "grad_norm": 0.39854777839388333, + "learning_rate": 8.07646356033453e-06, + "loss": 0.5492, + "num_tokens": 439941597.0, + "step": 4677 + }, + { + "epoch": 0.7984297661717017, + "grad_norm": 0.5701114977765167, + "learning_rate": 8.069636456733231e-06, + "loss": 0.646, + "num_tokens": 440015890.0, + "step": 4678 + }, + { + "epoch": 0.7986004437617341, + "grad_norm": 0.5250475357626101, + "learning_rate": 8.062809353131933e-06, + "loss": 0.5593, + "num_tokens": 440085639.0, + "step": 4679 + }, + { + "epoch": 0.7987711213517665, + "grad_norm": 0.5205267324314233, + "learning_rate": 8.055982249530637e-06, + "loss": 0.4823, + "num_tokens": 440157518.0, + "step": 4680 + }, + { + "epoch": 0.798941798941799, + "grad_norm": 0.49875640470294186, + "learning_rate": 8.049155145929341e-06, + "loss": 0.5638, + "num_tokens": 440239322.0, + "step": 4681 + }, + { + "epoch": 0.7991124765318314, + "grad_norm": 0.477046606877634, + "learning_rate": 8.042328042328043e-06, + "loss": 0.5859, + "num_tokens": 440333845.0, + "step": 4682 + }, + { + "epoch": 0.7992831541218638, + "grad_norm": 0.4932278481156632, + "learning_rate": 8.035500938726745e-06, + "loss": 0.5605, + "num_tokens": 440419039.0, + "step": 4683 + }, + { + "epoch": 0.7994538317118962, + "grad_norm": 0.5319684286051337, + "learning_rate": 8.028673835125449e-06, + "loss": 0.6639, + "num_tokens": 440503380.0, + "step": 4684 + }, + { + "epoch": 0.7996245093019286, + "grad_norm": 0.4612513862187699, + "learning_rate": 8.021846731524151e-06, + "loss": 0.5075, + "num_tokens": 440593883.0, + "step": 4685 + }, + { + "epoch": 0.799795186891961, + "grad_norm": 0.4674137788653919, + "learning_rate": 8.015019627922855e-06, + "loss": 0.557, + "num_tokens": 440680930.0, + "step": 4686 + }, + { + "epoch": 0.7999658644819935, + "grad_norm": 0.5515140314394225, + "learning_rate": 8.008192524321557e-06, + "loss": 0.6105, + "num_tokens": 440750111.0, + "step": 4687 + }, + { + "epoch": 0.8001365420720259, + "grad_norm": 0.4320931635748151, + "learning_rate": 8.001365420720259e-06, + "loss": 0.4867, + "num_tokens": 440850875.0, + "step": 4688 + }, + { + "epoch": 0.8003072196620584, + "grad_norm": 0.4793732479138382, + "learning_rate": 7.994538317118963e-06, + "loss": 0.5439, + "num_tokens": 440931081.0, + "step": 4689 + }, + { + "epoch": 0.8004778972520908, + "grad_norm": 0.4896389467587396, + "learning_rate": 7.987711213517666e-06, + "loss": 0.5816, + "num_tokens": 441022067.0, + "step": 4690 + }, + { + "epoch": 0.8006485748421233, + "grad_norm": 0.45087923104646044, + "learning_rate": 7.980884109916368e-06, + "loss": 0.5101, + "num_tokens": 441112705.0, + "step": 4691 + }, + { + "epoch": 0.8008192524321557, + "grad_norm": 0.4434833666927645, + "learning_rate": 7.97405700631507e-06, + "loss": 0.593, + "num_tokens": 441217313.0, + "step": 4692 + }, + { + "epoch": 0.8009899300221881, + "grad_norm": 0.47553182839320374, + "learning_rate": 7.967229902713774e-06, + "loss": 0.6504, + "num_tokens": 441319193.0, + "step": 4693 + }, + { + "epoch": 0.8011606076122205, + "grad_norm": 0.5653476793275856, + "learning_rate": 7.960402799112478e-06, + "loss": 0.6404, + "num_tokens": 441392778.0, + "step": 4694 + }, + { + "epoch": 0.8013312852022529, + "grad_norm": 0.46150968279771093, + "learning_rate": 7.95357569551118e-06, + "loss": 0.5816, + "num_tokens": 441496001.0, + "step": 4695 + }, + { + "epoch": 0.8015019627922854, + "grad_norm": 0.5034864622357038, + "learning_rate": 7.946748591909882e-06, + "loss": 0.5516, + "num_tokens": 441584091.0, + "step": 4696 + }, + { + "epoch": 0.8016726403823178, + "grad_norm": 0.4820755897959334, + "learning_rate": 7.939921488308586e-06, + "loss": 0.5819, + "num_tokens": 441679608.0, + "step": 4697 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.45651068791894567, + "learning_rate": 7.933094384707288e-06, + "loss": 0.485, + "num_tokens": 441771699.0, + "step": 4698 + }, + { + "epoch": 0.8020139955623826, + "grad_norm": 0.5371188820143241, + "learning_rate": 7.926267281105992e-06, + "loss": 0.6009, + "num_tokens": 441840387.0, + "step": 4699 + }, + { + "epoch": 0.802184673152415, + "grad_norm": 0.4347976007642283, + "learning_rate": 7.919440177504696e-06, + "loss": 0.6039, + "num_tokens": 441953085.0, + "step": 4700 + }, + { + "epoch": 0.8023553507424476, + "grad_norm": 0.43895848824732425, + "learning_rate": 7.912613073903398e-06, + "loss": 0.5554, + "num_tokens": 442069085.0, + "step": 4701 + }, + { + "epoch": 0.80252602833248, + "grad_norm": 0.4850531153241374, + "learning_rate": 7.9057859703021e-06, + "loss": 0.6859, + "num_tokens": 442174850.0, + "step": 4702 + }, + { + "epoch": 0.8026967059225124, + "grad_norm": 0.49048536742539856, + "learning_rate": 7.898958866700803e-06, + "loss": 0.5693, + "num_tokens": 442257749.0, + "step": 4703 + }, + { + "epoch": 0.8028673835125448, + "grad_norm": 0.456120240880761, + "learning_rate": 7.892131763099506e-06, + "loss": 0.5, + "num_tokens": 442351312.0, + "step": 4704 + }, + { + "epoch": 0.8030380611025772, + "grad_norm": 0.4851290433809586, + "learning_rate": 7.88530465949821e-06, + "loss": 0.5998, + "num_tokens": 442437700.0, + "step": 4705 + }, + { + "epoch": 0.8032087386926097, + "grad_norm": 0.48505172014800363, + "learning_rate": 7.878477555896911e-06, + "loss": 0.6141, + "num_tokens": 442530534.0, + "step": 4706 + }, + { + "epoch": 0.8033794162826421, + "grad_norm": 0.46628612721411083, + "learning_rate": 7.871650452295613e-06, + "loss": 0.6024, + "num_tokens": 442628867.0, + "step": 4707 + }, + { + "epoch": 0.8035500938726745, + "grad_norm": 0.4085305613049359, + "learning_rate": 7.864823348694317e-06, + "loss": 0.5818, + "num_tokens": 442755406.0, + "step": 4708 + }, + { + "epoch": 0.8037207714627069, + "grad_norm": 0.45995335142639193, + "learning_rate": 7.857996245093021e-06, + "loss": 0.5868, + "num_tokens": 442854076.0, + "step": 4709 + }, + { + "epoch": 0.8038914490527393, + "grad_norm": 0.49194037595173723, + "learning_rate": 7.851169141491723e-06, + "loss": 0.6284, + "num_tokens": 442949230.0, + "step": 4710 + }, + { + "epoch": 0.8040621266427718, + "grad_norm": 0.5070487707463428, + "learning_rate": 7.844342037890425e-06, + "loss": 0.6026, + "num_tokens": 443034232.0, + "step": 4711 + }, + { + "epoch": 0.8042328042328042, + "grad_norm": 0.6064606937520076, + "learning_rate": 7.837514934289129e-06, + "loss": 0.6276, + "num_tokens": 443090295.0, + "step": 4712 + }, + { + "epoch": 0.8044034818228366, + "grad_norm": 0.4822728047704988, + "learning_rate": 7.830687830687831e-06, + "loss": 0.581, + "num_tokens": 443191475.0, + "step": 4713 + }, + { + "epoch": 0.8045741594128691, + "grad_norm": 0.45686565761466347, + "learning_rate": 7.823860727086535e-06, + "loss": 0.548, + "num_tokens": 443291806.0, + "step": 4714 + }, + { + "epoch": 0.8047448370029016, + "grad_norm": 0.5078433389124268, + "learning_rate": 7.817033623485237e-06, + "loss": 0.6716, + "num_tokens": 443392935.0, + "step": 4715 + }, + { + "epoch": 0.804915514592934, + "grad_norm": 0.5297330657328968, + "learning_rate": 7.810206519883939e-06, + "loss": 0.6122, + "num_tokens": 443503219.0, + "step": 4716 + }, + { + "epoch": 0.8050861921829664, + "grad_norm": 0.4442754311519693, + "learning_rate": 7.803379416282643e-06, + "loss": 0.6364, + "num_tokens": 443633907.0, + "step": 4717 + }, + { + "epoch": 0.8052568697729988, + "grad_norm": 0.44523658274735073, + "learning_rate": 7.796552312681346e-06, + "loss": 0.5387, + "num_tokens": 443732581.0, + "step": 4718 + }, + { + "epoch": 0.8054275473630312, + "grad_norm": 0.4668061364369091, + "learning_rate": 7.789725209080048e-06, + "loss": 0.5953, + "num_tokens": 443842089.0, + "step": 4719 + }, + { + "epoch": 0.8055982249530637, + "grad_norm": 0.4527338502699434, + "learning_rate": 7.78289810547875e-06, + "loss": 0.5782, + "num_tokens": 443948404.0, + "step": 4720 + }, + { + "epoch": 0.8057689025430961, + "grad_norm": 0.4349686086567656, + "learning_rate": 7.776071001877454e-06, + "loss": 0.5588, + "num_tokens": 444064646.0, + "step": 4721 + }, + { + "epoch": 0.8059395801331285, + "grad_norm": 0.4178395799324611, + "learning_rate": 7.769243898276156e-06, + "loss": 0.505, + "num_tokens": 444177487.0, + "step": 4722 + }, + { + "epoch": 0.8061102577231609, + "grad_norm": 0.5492339505493958, + "learning_rate": 7.76241679467486e-06, + "loss": 0.5744, + "num_tokens": 444244843.0, + "step": 4723 + }, + { + "epoch": 0.8062809353131933, + "grad_norm": 0.5777868756945832, + "learning_rate": 7.755589691073562e-06, + "loss": 0.6124, + "num_tokens": 444313789.0, + "step": 4724 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.5660753830103177, + "learning_rate": 7.748762587472266e-06, + "loss": 0.648, + "num_tokens": 444451307.0, + "step": 4725 + }, + { + "epoch": 0.8066222904932583, + "grad_norm": 0.565788730801703, + "learning_rate": 7.741935483870968e-06, + "loss": 0.5554, + "num_tokens": 444559978.0, + "step": 4726 + }, + { + "epoch": 0.8067929680832907, + "grad_norm": 0.5077881760250416, + "learning_rate": 7.735108380269672e-06, + "loss": 0.5967, + "num_tokens": 444645394.0, + "step": 4727 + }, + { + "epoch": 0.8069636456733231, + "grad_norm": 0.5251965347626029, + "learning_rate": 7.728281276668374e-06, + "loss": 0.5582, + "num_tokens": 444713209.0, + "step": 4728 + }, + { + "epoch": 0.8071343232633555, + "grad_norm": 0.44665513693365444, + "learning_rate": 7.721454173067076e-06, + "loss": 0.6364, + "num_tokens": 444835380.0, + "step": 4729 + }, + { + "epoch": 0.807305000853388, + "grad_norm": 0.5262151675491631, + "learning_rate": 7.71462706946578e-06, + "loss": 0.6032, + "num_tokens": 444910532.0, + "step": 4730 + }, + { + "epoch": 0.8074756784434204, + "grad_norm": 0.4406408462039059, + "learning_rate": 7.707799965864483e-06, + "loss": 0.5854, + "num_tokens": 445021087.0, + "step": 4731 + }, + { + "epoch": 0.8076463560334528, + "grad_norm": 0.48509872502605433, + "learning_rate": 7.700972862263185e-06, + "loss": 0.5532, + "num_tokens": 445112228.0, + "step": 4732 + }, + { + "epoch": 0.8078170336234852, + "grad_norm": 0.5318598670696898, + "learning_rate": 7.694145758661888e-06, + "loss": 0.5411, + "num_tokens": 445186727.0, + "step": 4733 + }, + { + "epoch": 0.8079877112135176, + "grad_norm": 0.46175591129573507, + "learning_rate": 7.687318655060591e-06, + "loss": 0.5565, + "num_tokens": 445278887.0, + "step": 4734 + }, + { + "epoch": 0.8081583888035501, + "grad_norm": 0.5237995150968546, + "learning_rate": 7.680491551459293e-06, + "loss": 0.6402, + "num_tokens": 445365170.0, + "step": 4735 + }, + { + "epoch": 0.8083290663935825, + "grad_norm": 0.5204115640945969, + "learning_rate": 7.673664447857997e-06, + "loss": 0.592, + "num_tokens": 445448142.0, + "step": 4736 + }, + { + "epoch": 0.8084997439836149, + "grad_norm": 0.46033575180774333, + "learning_rate": 7.666837344256701e-06, + "loss": 0.5187, + "num_tokens": 445540187.0, + "step": 4737 + }, + { + "epoch": 0.8086704215736474, + "grad_norm": 0.5414392392584082, + "learning_rate": 7.660010240655403e-06, + "loss": 0.5108, + "num_tokens": 445600929.0, + "step": 4738 + }, + { + "epoch": 0.8088410991636799, + "grad_norm": 0.49365812499685696, + "learning_rate": 7.653183137054105e-06, + "loss": 0.5595, + "num_tokens": 445687049.0, + "step": 4739 + }, + { + "epoch": 0.8090117767537123, + "grad_norm": 0.4674204671236959, + "learning_rate": 7.646356033452809e-06, + "loss": 0.5302, + "num_tokens": 445779468.0, + "step": 4740 + }, + { + "epoch": 0.8091824543437447, + "grad_norm": 0.463834357586098, + "learning_rate": 7.639528929851511e-06, + "loss": 0.5908, + "num_tokens": 445885688.0, + "step": 4741 + }, + { + "epoch": 0.8093531319337771, + "grad_norm": 0.5180836879607043, + "learning_rate": 7.632701826250215e-06, + "loss": 0.5983, + "num_tokens": 445966849.0, + "step": 4742 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.43733858473361725, + "learning_rate": 7.6258747226489176e-06, + "loss": 0.5648, + "num_tokens": 446069949.0, + "step": 4743 + }, + { + "epoch": 0.809694487113842, + "grad_norm": 0.421098033902334, + "learning_rate": 7.61904761904762e-06, + "loss": 0.5108, + "num_tokens": 446174476.0, + "step": 4744 + }, + { + "epoch": 0.8098651647038744, + "grad_norm": 0.45775977878478563, + "learning_rate": 7.6122205154463225e-06, + "loss": 0.5109, + "num_tokens": 446265575.0, + "step": 4745 + }, + { + "epoch": 0.8100358422939068, + "grad_norm": 0.48272595996728374, + "learning_rate": 7.6053934118450255e-06, + "loss": 0.6086, + "num_tokens": 446381498.0, + "step": 4746 + }, + { + "epoch": 0.8102065198839392, + "grad_norm": 0.4120180210779112, + "learning_rate": 7.5985663082437275e-06, + "loss": 0.5533, + "num_tokens": 446504853.0, + "step": 4747 + }, + { + "epoch": 0.8103771974739716, + "grad_norm": 0.4482648690972832, + "learning_rate": 7.591739204642431e-06, + "loss": 0.5726, + "num_tokens": 446615737.0, + "step": 4748 + }, + { + "epoch": 0.810547875064004, + "grad_norm": 0.44542807746670415, + "learning_rate": 7.584912101041134e-06, + "loss": 0.4972, + "num_tokens": 446709191.0, + "step": 4749 + }, + { + "epoch": 0.8107185526540365, + "grad_norm": 0.5362469969304422, + "learning_rate": 7.578084997439836e-06, + "loss": 0.6435, + "num_tokens": 446786360.0, + "step": 4750 + }, + { + "epoch": 0.810889230244069, + "grad_norm": 0.5082461474855435, + "learning_rate": 7.571257893838539e-06, + "loss": 0.4849, + "num_tokens": 446861908.0, + "step": 4751 + }, + { + "epoch": 0.8110599078341014, + "grad_norm": 0.4681072380534358, + "learning_rate": 7.564430790237243e-06, + "loss": 0.5772, + "num_tokens": 446970830.0, + "step": 4752 + }, + { + "epoch": 0.8112305854241338, + "grad_norm": 0.5151206928149785, + "learning_rate": 7.557603686635945e-06, + "loss": 0.6221, + "num_tokens": 447052470.0, + "step": 4753 + }, + { + "epoch": 0.8114012630141663, + "grad_norm": 0.43845125865735257, + "learning_rate": 7.550776583034648e-06, + "loss": 0.5225, + "num_tokens": 447153197.0, + "step": 4754 + }, + { + "epoch": 0.8115719406041987, + "grad_norm": 0.4815680323746247, + "learning_rate": 7.543949479433351e-06, + "loss": 0.6382, + "num_tokens": 447250490.0, + "step": 4755 + }, + { + "epoch": 0.8117426181942311, + "grad_norm": 0.4961296233278673, + "learning_rate": 7.537122375832055e-06, + "loss": 0.5405, + "num_tokens": 447347047.0, + "step": 4756 + }, + { + "epoch": 0.8119132957842635, + "grad_norm": 0.4658964199326391, + "learning_rate": 7.530295272230757e-06, + "loss": 0.5568, + "num_tokens": 447430936.0, + "step": 4757 + }, + { + "epoch": 0.8120839733742959, + "grad_norm": 0.5062694397834623, + "learning_rate": 7.52346816862946e-06, + "loss": 0.6169, + "num_tokens": 447514403.0, + "step": 4758 + }, + { + "epoch": 0.8122546509643284, + "grad_norm": 0.45468790640144185, + "learning_rate": 7.5166410650281625e-06, + "loss": 0.549, + "num_tokens": 447608957.0, + "step": 4759 + }, + { + "epoch": 0.8124253285543608, + "grad_norm": 0.5002290619327998, + "learning_rate": 7.5098139614268654e-06, + "loss": 0.576, + "num_tokens": 447691666.0, + "step": 4760 + }, + { + "epoch": 0.8125960061443932, + "grad_norm": 0.5088123938297505, + "learning_rate": 7.502986857825568e-06, + "loss": 0.5503, + "num_tokens": 447762242.0, + "step": 4761 + }, + { + "epoch": 0.8127666837344256, + "grad_norm": 0.47884012155608563, + "learning_rate": 7.496159754224271e-06, + "loss": 0.558, + "num_tokens": 447855470.0, + "step": 4762 + }, + { + "epoch": 0.8129373613244582, + "grad_norm": 0.4742363471727603, + "learning_rate": 7.489332650622973e-06, + "loss": 0.5201, + "num_tokens": 447939417.0, + "step": 4763 + }, + { + "epoch": 0.8131080389144906, + "grad_norm": 0.4493923326126821, + "learning_rate": 7.482505547021677e-06, + "loss": 0.607, + "num_tokens": 448048856.0, + "step": 4764 + }, + { + "epoch": 0.813278716504523, + "grad_norm": 0.47240501095533605, + "learning_rate": 7.47567844342038e-06, + "loss": 0.5236, + "num_tokens": 448138324.0, + "step": 4765 + }, + { + "epoch": 0.8134493940945554, + "grad_norm": 0.5250939988068586, + "learning_rate": 7.468851339819082e-06, + "loss": 0.5509, + "num_tokens": 448210602.0, + "step": 4766 + }, + { + "epoch": 0.8136200716845878, + "grad_norm": 0.42710569176693125, + "learning_rate": 7.462024236217785e-06, + "loss": 0.5748, + "num_tokens": 448320843.0, + "step": 4767 + }, + { + "epoch": 0.8137907492746203, + "grad_norm": 0.4589262589434066, + "learning_rate": 7.455197132616489e-06, + "loss": 0.614, + "num_tokens": 448425836.0, + "step": 4768 + }, + { + "epoch": 0.8139614268646527, + "grad_norm": 0.4211510730261692, + "learning_rate": 7.448370029015191e-06, + "loss": 0.5238, + "num_tokens": 448539891.0, + "step": 4769 + }, + { + "epoch": 0.8141321044546851, + "grad_norm": 0.4820658228150474, + "learning_rate": 7.441542925413894e-06, + "loss": 0.6199, + "num_tokens": 448636263.0, + "step": 4770 + }, + { + "epoch": 0.8143027820447175, + "grad_norm": 0.4693880267045123, + "learning_rate": 7.434715821812597e-06, + "loss": 0.4356, + "num_tokens": 448710316.0, + "step": 4771 + }, + { + "epoch": 0.8144734596347499, + "grad_norm": 0.4330983283124619, + "learning_rate": 7.427888718211299e-06, + "loss": 0.5924, + "num_tokens": 448827997.0, + "step": 4772 + }, + { + "epoch": 0.8146441372247823, + "grad_norm": 0.48223230571088793, + "learning_rate": 7.4210616146100025e-06, + "loss": 0.5274, + "num_tokens": 448914434.0, + "step": 4773 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.504133452978231, + "learning_rate": 7.414234511008705e-06, + "loss": 0.5232, + "num_tokens": 448988285.0, + "step": 4774 + }, + { + "epoch": 0.8149854924048472, + "grad_norm": 0.463838356889875, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.5589, + "num_tokens": 449082330.0, + "step": 4775 + }, + { + "epoch": 0.8151561699948797, + "grad_norm": 0.4692978057266646, + "learning_rate": 7.40058030380611e-06, + "loss": 0.5358, + "num_tokens": 449168330.0, + "step": 4776 + }, + { + "epoch": 0.8153268475849121, + "grad_norm": 0.4728371355760739, + "learning_rate": 7.393753200204814e-06, + "loss": 0.5117, + "num_tokens": 449252821.0, + "step": 4777 + }, + { + "epoch": 0.8154975251749446, + "grad_norm": 0.4318585181077747, + "learning_rate": 7.386926096603516e-06, + "loss": 0.5939, + "num_tokens": 449371770.0, + "step": 4778 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.4787517666440525, + "learning_rate": 7.380098993002219e-06, + "loss": 0.5661, + "num_tokens": 449495511.0, + "step": 4779 + }, + { + "epoch": 0.8158388803550094, + "grad_norm": 0.4653853957540983, + "learning_rate": 7.373271889400923e-06, + "loss": 0.561, + "num_tokens": 449594169.0, + "step": 4780 + }, + { + "epoch": 0.8160095579450418, + "grad_norm": 0.5402013617270179, + "learning_rate": 7.366444785799625e-06, + "loss": 0.5873, + "num_tokens": 449665113.0, + "step": 4781 + }, + { + "epoch": 0.8161802355350742, + "grad_norm": 0.42668335797631934, + "learning_rate": 7.359617682198328e-06, + "loss": 0.5343, + "num_tokens": 449772529.0, + "step": 4782 + }, + { + "epoch": 0.8163509131251067, + "grad_norm": 0.5206181911701467, + "learning_rate": 7.352790578597031e-06, + "loss": 0.5275, + "num_tokens": 449846623.0, + "step": 4783 + }, + { + "epoch": 0.8165215907151391, + "grad_norm": 0.5151588232810184, + "learning_rate": 7.345963474995733e-06, + "loss": 0.5633, + "num_tokens": 449926163.0, + "step": 4784 + }, + { + "epoch": 0.8166922683051715, + "grad_norm": 0.5410831794574849, + "learning_rate": 7.339136371394437e-06, + "loss": 0.6205, + "num_tokens": 450012887.0, + "step": 4785 + }, + { + "epoch": 0.8168629458952039, + "grad_norm": 0.43468436465328264, + "learning_rate": 7.3323092677931396e-06, + "loss": 0.4892, + "num_tokens": 450113031.0, + "step": 4786 + }, + { + "epoch": 0.8170336234852363, + "grad_norm": 0.4445283319759447, + "learning_rate": 7.325482164191842e-06, + "loss": 0.5436, + "num_tokens": 450215776.0, + "step": 4787 + }, + { + "epoch": 0.8172043010752689, + "grad_norm": 0.4497836397792133, + "learning_rate": 7.3186550605905445e-06, + "loss": 0.5622, + "num_tokens": 450315234.0, + "step": 4788 + }, + { + "epoch": 0.8173749786653013, + "grad_norm": 0.4964352188705711, + "learning_rate": 7.311827956989248e-06, + "loss": 0.664, + "num_tokens": 450406358.0, + "step": 4789 + }, + { + "epoch": 0.8175456562553337, + "grad_norm": 0.4685021885322408, + "learning_rate": 7.305000853387951e-06, + "loss": 0.58, + "num_tokens": 450497615.0, + "step": 4790 + }, + { + "epoch": 0.8177163338453661, + "grad_norm": 0.5070255442682443, + "learning_rate": 7.298173749786653e-06, + "loss": 0.5447, + "num_tokens": 450577276.0, + "step": 4791 + }, + { + "epoch": 0.8178870114353985, + "grad_norm": 0.4500326196025653, + "learning_rate": 7.291346646185356e-06, + "loss": 0.5952, + "num_tokens": 450688711.0, + "step": 4792 + }, + { + "epoch": 0.818057689025431, + "grad_norm": 0.4642506042277794, + "learning_rate": 7.28451954258406e-06, + "loss": 0.5338, + "num_tokens": 450793962.0, + "step": 4793 + }, + { + "epoch": 0.8182283666154634, + "grad_norm": 0.4404146776648589, + "learning_rate": 7.277692438982762e-06, + "loss": 0.5637, + "num_tokens": 450899438.0, + "step": 4794 + }, + { + "epoch": 0.8183990442054958, + "grad_norm": 0.4531786457009963, + "learning_rate": 7.270865335381465e-06, + "loss": 0.5399, + "num_tokens": 450997663.0, + "step": 4795 + }, + { + "epoch": 0.8185697217955282, + "grad_norm": 0.4559654845396106, + "learning_rate": 7.264038231780169e-06, + "loss": 0.6374, + "num_tokens": 451111021.0, + "step": 4796 + }, + { + "epoch": 0.8187403993855606, + "grad_norm": 0.45223797694737833, + "learning_rate": 7.257211128178871e-06, + "loss": 0.6404, + "num_tokens": 451220567.0, + "step": 4797 + }, + { + "epoch": 0.8189110769755931, + "grad_norm": 0.5659478820177348, + "learning_rate": 7.250384024577574e-06, + "loss": 0.5707, + "num_tokens": 451285684.0, + "step": 4798 + }, + { + "epoch": 0.8190817545656255, + "grad_norm": 0.4936199028734154, + "learning_rate": 7.243556920976277e-06, + "loss": 0.6067, + "num_tokens": 451375747.0, + "step": 4799 + }, + { + "epoch": 0.819252432155658, + "grad_norm": 0.4898395981698786, + "learning_rate": 7.236729817374979e-06, + "loss": 0.6498, + "num_tokens": 451476131.0, + "step": 4800 + }, + { + "epoch": 0.8194231097456904, + "grad_norm": 0.42581410750767, + "learning_rate": 7.2299027137736824e-06, + "loss": 0.6105, + "num_tokens": 451602384.0, + "step": 4801 + }, + { + "epoch": 0.8195937873357229, + "grad_norm": 0.4688277714931043, + "learning_rate": 7.223075610172385e-06, + "loss": 0.5278, + "num_tokens": 451689428.0, + "step": 4802 + }, + { + "epoch": 0.8197644649257553, + "grad_norm": 0.47243035454378807, + "learning_rate": 7.2162485065710874e-06, + "loss": 0.6396, + "num_tokens": 451793984.0, + "step": 4803 + }, + { + "epoch": 0.8199351425157877, + "grad_norm": 0.5004576418263701, + "learning_rate": 7.20942140296979e-06, + "loss": 0.5978, + "num_tokens": 451875464.0, + "step": 4804 + }, + { + "epoch": 0.8201058201058201, + "grad_norm": 0.4556979668009154, + "learning_rate": 7.202594299368494e-06, + "loss": 0.5326, + "num_tokens": 451970819.0, + "step": 4805 + }, + { + "epoch": 0.8202764976958525, + "grad_norm": 0.5140390685139122, + "learning_rate": 7.195767195767196e-06, + "loss": 0.6391, + "num_tokens": 452050765.0, + "step": 4806 + }, + { + "epoch": 0.820447175285885, + "grad_norm": 0.46058445237963563, + "learning_rate": 7.188940092165899e-06, + "loss": 0.4989, + "num_tokens": 452139203.0, + "step": 4807 + }, + { + "epoch": 0.8206178528759174, + "grad_norm": 0.528945618369798, + "learning_rate": 7.182112988564602e-06, + "loss": 0.6143, + "num_tokens": 452210812.0, + "step": 4808 + }, + { + "epoch": 0.8207885304659498, + "grad_norm": 0.798434653757783, + "learning_rate": 7.175285884963304e-06, + "loss": 0.615, + "num_tokens": 452327109.0, + "step": 4809 + }, + { + "epoch": 0.8209592080559822, + "grad_norm": 0.4716228158056162, + "learning_rate": 7.168458781362008e-06, + "loss": 0.6463, + "num_tokens": 452431972.0, + "step": 4810 + }, + { + "epoch": 0.8211298856460146, + "grad_norm": 0.6051937926872181, + "learning_rate": 7.161631677760711e-06, + "loss": 0.5733, + "num_tokens": 452537370.0, + "step": 4811 + }, + { + "epoch": 0.821300563236047, + "grad_norm": 0.45049765504540507, + "learning_rate": 7.154804574159413e-06, + "loss": 0.5656, + "num_tokens": 452639287.0, + "step": 4812 + }, + { + "epoch": 0.8214712408260796, + "grad_norm": 0.46519679103622785, + "learning_rate": 7.147977470558117e-06, + "loss": 0.4908, + "num_tokens": 452722759.0, + "step": 4813 + }, + { + "epoch": 0.821641918416112, + "grad_norm": 0.5351894101038412, + "learning_rate": 7.1411503669568195e-06, + "loss": 0.5778, + "num_tokens": 452787782.0, + "step": 4814 + }, + { + "epoch": 0.8218125960061444, + "grad_norm": 0.5246186968868738, + "learning_rate": 7.1343232633555216e-06, + "loss": 0.5896, + "num_tokens": 452871296.0, + "step": 4815 + }, + { + "epoch": 0.8219832735961768, + "grad_norm": 0.4446435270397607, + "learning_rate": 7.1274961597542245e-06, + "loss": 0.5902, + "num_tokens": 452978802.0, + "step": 4816 + }, + { + "epoch": 0.8221539511862093, + "grad_norm": 0.48912288703458223, + "learning_rate": 7.120669056152928e-06, + "loss": 0.5867, + "num_tokens": 453077100.0, + "step": 4817 + }, + { + "epoch": 0.8223246287762417, + "grad_norm": 0.43483011911778546, + "learning_rate": 7.11384195255163e-06, + "loss": 0.5679, + "num_tokens": 453191106.0, + "step": 4818 + }, + { + "epoch": 0.8224953063662741, + "grad_norm": 0.5466205658766324, + "learning_rate": 7.107014848950333e-06, + "loss": 0.622, + "num_tokens": 453263488.0, + "step": 4819 + }, + { + "epoch": 0.8226659839563065, + "grad_norm": 0.496155463466404, + "learning_rate": 7.100187745349036e-06, + "loss": 0.6355, + "num_tokens": 453356325.0, + "step": 4820 + }, + { + "epoch": 0.8228366615463389, + "grad_norm": 0.4858798399221191, + "learning_rate": 7.09336064174774e-06, + "loss": 0.5324, + "num_tokens": 453443711.0, + "step": 4821 + }, + { + "epoch": 0.8230073391363714, + "grad_norm": 0.548845262048738, + "learning_rate": 7.086533538146442e-06, + "loss": 0.59, + "num_tokens": 453512275.0, + "step": 4822 + }, + { + "epoch": 0.8231780167264038, + "grad_norm": 0.5187960184066911, + "learning_rate": 7.079706434545145e-06, + "loss": 0.6097, + "num_tokens": 453590499.0, + "step": 4823 + }, + { + "epoch": 0.8233486943164362, + "grad_norm": 0.4898898060708721, + "learning_rate": 7.072879330943848e-06, + "loss": 0.5171, + "num_tokens": 453669329.0, + "step": 4824 + }, + { + "epoch": 0.8235193719064687, + "grad_norm": 0.465907450035548, + "learning_rate": 7.06605222734255e-06, + "loss": 0.4843, + "num_tokens": 453767454.0, + "step": 4825 + }, + { + "epoch": 0.8236900494965012, + "grad_norm": 0.536875510762242, + "learning_rate": 7.059225123741254e-06, + "loss": 0.5441, + "num_tokens": 453830766.0, + "step": 4826 + }, + { + "epoch": 0.8238607270865336, + "grad_norm": 0.49558978514735863, + "learning_rate": 7.0523980201399566e-06, + "loss": 0.6641, + "num_tokens": 453928248.0, + "step": 4827 + }, + { + "epoch": 0.824031404676566, + "grad_norm": 0.47327679725439853, + "learning_rate": 7.045570916538659e-06, + "loss": 0.5421, + "num_tokens": 454036402.0, + "step": 4828 + }, + { + "epoch": 0.8242020822665984, + "grad_norm": 0.49705718570462404, + "learning_rate": 7.038743812937362e-06, + "loss": 0.5982, + "num_tokens": 454123024.0, + "step": 4829 + }, + { + "epoch": 0.8243727598566308, + "grad_norm": 0.5111429114746827, + "learning_rate": 7.031916709336065e-06, + "loss": 0.6565, + "num_tokens": 454210331.0, + "step": 4830 + }, + { + "epoch": 0.8245434374466633, + "grad_norm": 0.45674295683881294, + "learning_rate": 7.025089605734767e-06, + "loss": 0.4613, + "num_tokens": 454291839.0, + "step": 4831 + }, + { + "epoch": 0.8247141150366957, + "grad_norm": 0.4588941386872689, + "learning_rate": 7.01826250213347e-06, + "loss": 0.5411, + "num_tokens": 454382421.0, + "step": 4832 + }, + { + "epoch": 0.8248847926267281, + "grad_norm": 0.43356609698131476, + "learning_rate": 7.011435398532174e-06, + "loss": 0.553, + "num_tokens": 454489080.0, + "step": 4833 + }, + { + "epoch": 0.8250554702167605, + "grad_norm": 0.4171367892860194, + "learning_rate": 7.004608294930876e-06, + "loss": 0.5738, + "num_tokens": 454611396.0, + "step": 4834 + }, + { + "epoch": 0.8252261478067929, + "grad_norm": 0.466673602098529, + "learning_rate": 6.997781191329579e-06, + "loss": 0.5307, + "num_tokens": 454706651.0, + "step": 4835 + }, + { + "epoch": 0.8253968253968254, + "grad_norm": 0.4269043972221862, + "learning_rate": 6.990954087728282e-06, + "loss": 0.558, + "num_tokens": 454819769.0, + "step": 4836 + }, + { + "epoch": 0.8255675029868578, + "grad_norm": 0.4977846928121336, + "learning_rate": 6.984126984126984e-06, + "loss": 0.5011, + "num_tokens": 454905376.0, + "step": 4837 + }, + { + "epoch": 0.8257381805768903, + "grad_norm": 0.4069335603089527, + "learning_rate": 6.977299880525688e-06, + "loss": 0.563, + "num_tokens": 455030594.0, + "step": 4838 + }, + { + "epoch": 0.8259088581669227, + "grad_norm": 0.5263199180398532, + "learning_rate": 6.970472776924391e-06, + "loss": 0.5695, + "num_tokens": 455099221.0, + "step": 4839 + }, + { + "epoch": 0.8260795357569551, + "grad_norm": 0.5567499802180942, + "learning_rate": 6.963645673323093e-06, + "loss": 0.5342, + "num_tokens": 455157577.0, + "step": 4840 + }, + { + "epoch": 0.8262502133469876, + "grad_norm": 0.4042203454769082, + "learning_rate": 6.956818569721796e-06, + "loss": 0.5378, + "num_tokens": 455275744.0, + "step": 4841 + }, + { + "epoch": 0.82642089093702, + "grad_norm": 0.44346444903758286, + "learning_rate": 6.9499914661204995e-06, + "loss": 0.585, + "num_tokens": 455382356.0, + "step": 4842 + }, + { + "epoch": 0.8265915685270524, + "grad_norm": 0.45102813126351843, + "learning_rate": 6.9431643625192015e-06, + "loss": 0.5478, + "num_tokens": 455485741.0, + "step": 4843 + }, + { + "epoch": 0.8267622461170848, + "grad_norm": 0.4176332736164302, + "learning_rate": 6.9363372589179044e-06, + "loss": 0.5446, + "num_tokens": 455595873.0, + "step": 4844 + }, + { + "epoch": 0.8269329237071172, + "grad_norm": 0.47813312135428854, + "learning_rate": 6.929510155316607e-06, + "loss": 0.5709, + "num_tokens": 455688174.0, + "step": 4845 + }, + { + "epoch": 0.8271036012971497, + "grad_norm": 0.45853158469849914, + "learning_rate": 6.92268305171531e-06, + "loss": 0.6746, + "num_tokens": 455802832.0, + "step": 4846 + }, + { + "epoch": 0.8272742788871821, + "grad_norm": 0.46402235465960795, + "learning_rate": 6.915855948114013e-06, + "loss": 0.5899, + "num_tokens": 455898606.0, + "step": 4847 + }, + { + "epoch": 0.8274449564772145, + "grad_norm": 0.44783196920901436, + "learning_rate": 6.909028844512716e-06, + "loss": 0.5424, + "num_tokens": 455994500.0, + "step": 4848 + }, + { + "epoch": 0.8276156340672469, + "grad_norm": 0.45684842896180383, + "learning_rate": 6.902201740911418e-06, + "loss": 0.5033, + "num_tokens": 456082504.0, + "step": 4849 + }, + { + "epoch": 0.8277863116572794, + "grad_norm": 0.4769975128151429, + "learning_rate": 6.895374637310122e-06, + "loss": 0.5965, + "num_tokens": 456184210.0, + "step": 4850 + }, + { + "epoch": 0.8279569892473119, + "grad_norm": 0.4595500644144209, + "learning_rate": 6.888547533708825e-06, + "loss": 0.5401, + "num_tokens": 456278287.0, + "step": 4851 + }, + { + "epoch": 0.8281276668373443, + "grad_norm": 0.47758143331727887, + "learning_rate": 6.881720430107528e-06, + "loss": 0.6254, + "num_tokens": 456378473.0, + "step": 4852 + }, + { + "epoch": 0.8282983444273767, + "grad_norm": 0.4846421028944029, + "learning_rate": 6.87489332650623e-06, + "loss": 0.5349, + "num_tokens": 456462522.0, + "step": 4853 + }, + { + "epoch": 0.8284690220174091, + "grad_norm": 0.45793425826420137, + "learning_rate": 6.868066222904934e-06, + "loss": 0.51, + "num_tokens": 456547073.0, + "step": 4854 + }, + { + "epoch": 0.8286396996074415, + "grad_norm": 0.44251707365781306, + "learning_rate": 6.8612391193036365e-06, + "loss": 0.5303, + "num_tokens": 456643179.0, + "step": 4855 + }, + { + "epoch": 0.828810377197474, + "grad_norm": 0.4720750530184715, + "learning_rate": 6.854412015702339e-06, + "loss": 0.5158, + "num_tokens": 456720551.0, + "step": 4856 + }, + { + "epoch": 0.8289810547875064, + "grad_norm": 0.4688793935103529, + "learning_rate": 6.8475849121010415e-06, + "loss": 0.4821, + "num_tokens": 456802285.0, + "step": 4857 + }, + { + "epoch": 0.8291517323775388, + "grad_norm": 0.5133716249893587, + "learning_rate": 6.840757808499745e-06, + "loss": 0.5771, + "num_tokens": 456879334.0, + "step": 4858 + }, + { + "epoch": 0.8293224099675712, + "grad_norm": 0.43160155587366983, + "learning_rate": 6.833930704898447e-06, + "loss": 0.498, + "num_tokens": 456981448.0, + "step": 4859 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.4884227909418608, + "learning_rate": 6.82710360129715e-06, + "loss": 0.5781, + "num_tokens": 457063703.0, + "step": 4860 + }, + { + "epoch": 0.8296637651476361, + "grad_norm": 0.463492975864288, + "learning_rate": 6.820276497695853e-06, + "loss": 0.6675, + "num_tokens": 457176793.0, + "step": 4861 + }, + { + "epoch": 0.8298344427376686, + "grad_norm": 0.5296148485456056, + "learning_rate": 6.813449394094555e-06, + "loss": 0.5339, + "num_tokens": 457262529.0, + "step": 4862 + }, + { + "epoch": 0.830005120327701, + "grad_norm": 0.4585641888563883, + "learning_rate": 6.806622290493259e-06, + "loss": 0.5915, + "num_tokens": 457370926.0, + "step": 4863 + }, + { + "epoch": 0.8301757979177334, + "grad_norm": 0.4750806893059253, + "learning_rate": 6.799795186891962e-06, + "loss": 0.5294, + "num_tokens": 457455513.0, + "step": 4864 + }, + { + "epoch": 0.8303464755077659, + "grad_norm": 0.5243929172846149, + "learning_rate": 6.792968083290664e-06, + "loss": 0.6128, + "num_tokens": 457531823.0, + "step": 4865 + }, + { + "epoch": 0.8305171530977983, + "grad_norm": 0.4680795394118142, + "learning_rate": 6.786140979689368e-06, + "loss": 0.5236, + "num_tokens": 457623886.0, + "step": 4866 + }, + { + "epoch": 0.8306878306878307, + "grad_norm": 0.44195536875726743, + "learning_rate": 6.779313876088071e-06, + "loss": 0.5194, + "num_tokens": 457722717.0, + "step": 4867 + }, + { + "epoch": 0.8308585082778631, + "grad_norm": 0.4256610700649767, + "learning_rate": 6.772486772486773e-06, + "loss": 0.5621, + "num_tokens": 457844712.0, + "step": 4868 + }, + { + "epoch": 0.8310291858678955, + "grad_norm": 0.46001147887070243, + "learning_rate": 6.765659668885476e-06, + "loss": 0.5409, + "num_tokens": 457945351.0, + "step": 4869 + }, + { + "epoch": 0.831199863457928, + "grad_norm": 0.4197927098358501, + "learning_rate": 6.758832565284179e-06, + "loss": 0.5384, + "num_tokens": 458052750.0, + "step": 4870 + }, + { + "epoch": 0.8313705410479604, + "grad_norm": 0.5125765083711391, + "learning_rate": 6.7520054616828815e-06, + "loss": 0.5684, + "num_tokens": 458126923.0, + "step": 4871 + }, + { + "epoch": 0.8315412186379928, + "grad_norm": 0.4993310015441435, + "learning_rate": 6.745178358081584e-06, + "loss": 0.5765, + "num_tokens": 458212318.0, + "step": 4872 + }, + { + "epoch": 0.8317118962280252, + "grad_norm": 0.38768482283394085, + "learning_rate": 6.738351254480287e-06, + "loss": 0.5218, + "num_tokens": 458342394.0, + "step": 4873 + }, + { + "epoch": 0.8318825738180576, + "grad_norm": 0.45266234730272725, + "learning_rate": 6.731524150878989e-06, + "loss": 0.6317, + "num_tokens": 458450990.0, + "step": 4874 + }, + { + "epoch": 0.8320532514080902, + "grad_norm": 0.5019341643074298, + "learning_rate": 6.724697047277693e-06, + "loss": 0.6762, + "num_tokens": 458545705.0, + "step": 4875 + }, + { + "epoch": 0.8322239289981226, + "grad_norm": 0.4635092949774453, + "learning_rate": 6.717869943676396e-06, + "loss": 0.6293, + "num_tokens": 458657078.0, + "step": 4876 + }, + { + "epoch": 0.832394606588155, + "grad_norm": 0.49547570854527484, + "learning_rate": 6.711042840075098e-06, + "loss": 0.4896, + "num_tokens": 458727848.0, + "step": 4877 + }, + { + "epoch": 0.8325652841781874, + "grad_norm": 0.46041723948626934, + "learning_rate": 6.704215736473801e-06, + "loss": 0.5008, + "num_tokens": 458817159.0, + "step": 4878 + }, + { + "epoch": 0.8327359617682198, + "grad_norm": 0.4772888563999419, + "learning_rate": 6.697388632872505e-06, + "loss": 0.5825, + "num_tokens": 458913571.0, + "step": 4879 + }, + { + "epoch": 0.8329066393582523, + "grad_norm": 0.4278034111618673, + "learning_rate": 6.690561529271207e-06, + "loss": 0.558, + "num_tokens": 459036839.0, + "step": 4880 + }, + { + "epoch": 0.8330773169482847, + "grad_norm": 0.5267081756042016, + "learning_rate": 6.68373442566991e-06, + "loss": 0.5451, + "num_tokens": 459104395.0, + "step": 4881 + }, + { + "epoch": 0.8332479945383171, + "grad_norm": 0.5234842925125288, + "learning_rate": 6.6769073220686135e-06, + "loss": 0.5956, + "num_tokens": 459178330.0, + "step": 4882 + }, + { + "epoch": 0.8334186721283495, + "grad_norm": 0.47065981816379104, + "learning_rate": 6.670080218467316e-06, + "loss": 0.6192, + "num_tokens": 459295873.0, + "step": 4883 + }, + { + "epoch": 0.833589349718382, + "grad_norm": 0.46660604333507916, + "learning_rate": 6.6632531148660185e-06, + "loss": 0.5334, + "num_tokens": 459385174.0, + "step": 4884 + }, + { + "epoch": 0.8337600273084144, + "grad_norm": 0.47590211275031086, + "learning_rate": 6.6564260112647214e-06, + "loss": 0.5261, + "num_tokens": 459471019.0, + "step": 4885 + }, + { + "epoch": 0.8339307048984468, + "grad_norm": 0.43785513946431853, + "learning_rate": 6.649598907663425e-06, + "loss": 0.6322, + "num_tokens": 459594813.0, + "step": 4886 + }, + { + "epoch": 0.8341013824884793, + "grad_norm": 0.46254940245315024, + "learning_rate": 6.642771804062127e-06, + "loss": 0.6477, + "num_tokens": 459713402.0, + "step": 4887 + }, + { + "epoch": 0.8342720600785117, + "grad_norm": 0.45354864446248366, + "learning_rate": 6.63594470046083e-06, + "loss": 0.5844, + "num_tokens": 459810418.0, + "step": 4888 + }, + { + "epoch": 0.8344427376685442, + "grad_norm": 0.49575447892918656, + "learning_rate": 6.629117596859533e-06, + "loss": 0.605, + "num_tokens": 459893585.0, + "step": 4889 + }, + { + "epoch": 0.8346134152585766, + "grad_norm": 0.4461403433815446, + "learning_rate": 6.622290493258235e-06, + "loss": 0.6246, + "num_tokens": 460010549.0, + "step": 4890 + }, + { + "epoch": 0.834784092848609, + "grad_norm": 0.4559507367576683, + "learning_rate": 6.615463389656939e-06, + "loss": 0.6008, + "num_tokens": 460122586.0, + "step": 4891 + }, + { + "epoch": 0.8349547704386414, + "grad_norm": 0.45514693170031584, + "learning_rate": 6.608636286055642e-06, + "loss": 0.5554, + "num_tokens": 460222139.0, + "step": 4892 + }, + { + "epoch": 0.8351254480286738, + "grad_norm": 0.45768894432374213, + "learning_rate": 6.601809182454344e-06, + "loss": 0.5459, + "num_tokens": 460317841.0, + "step": 4893 + }, + { + "epoch": 0.8352961256187063, + "grad_norm": 0.45889326263618335, + "learning_rate": 6.594982078853047e-06, + "loss": 0.516, + "num_tokens": 460407812.0, + "step": 4894 + }, + { + "epoch": 0.8354668032087387, + "grad_norm": 0.43652434017337427, + "learning_rate": 6.588154975251751e-06, + "loss": 0.5529, + "num_tokens": 460510645.0, + "step": 4895 + }, + { + "epoch": 0.8356374807987711, + "grad_norm": 0.49162457668122383, + "learning_rate": 6.581327871650453e-06, + "loss": 0.5918, + "num_tokens": 460610174.0, + "step": 4896 + }, + { + "epoch": 0.8358081583888035, + "grad_norm": 0.44168987978768487, + "learning_rate": 6.574500768049156e-06, + "loss": 0.5168, + "num_tokens": 460712044.0, + "step": 4897 + }, + { + "epoch": 0.8359788359788359, + "grad_norm": 0.49307024273375566, + "learning_rate": 6.5676736644478585e-06, + "loss": 0.5706, + "num_tokens": 460819326.0, + "step": 4898 + }, + { + "epoch": 0.8361495135688684, + "grad_norm": 0.4644340676590431, + "learning_rate": 6.560846560846561e-06, + "loss": 0.4931, + "num_tokens": 460911555.0, + "step": 4899 + }, + { + "epoch": 0.8363201911589009, + "grad_norm": 0.5789397454530446, + "learning_rate": 6.554019457245264e-06, + "loss": 0.5677, + "num_tokens": 460976798.0, + "step": 4900 + }, + { + "epoch": 0.8364908687489333, + "grad_norm": 0.5689157133676696, + "learning_rate": 6.547192353643967e-06, + "loss": 0.5731, + "num_tokens": 461039252.0, + "step": 4901 + }, + { + "epoch": 0.8366615463389657, + "grad_norm": 0.49317626129342196, + "learning_rate": 6.540365250042669e-06, + "loss": 0.4933, + "num_tokens": 461116381.0, + "step": 4902 + }, + { + "epoch": 0.8368322239289981, + "grad_norm": 0.4996379619219291, + "learning_rate": 6.533538146441373e-06, + "loss": 0.5076, + "num_tokens": 461186911.0, + "step": 4903 + }, + { + "epoch": 0.8370029015190306, + "grad_norm": 0.5202328474603903, + "learning_rate": 6.526711042840076e-06, + "loss": 0.5431, + "num_tokens": 461264556.0, + "step": 4904 + }, + { + "epoch": 0.837173579109063, + "grad_norm": 0.46886826283801164, + "learning_rate": 6.519883939238778e-06, + "loss": 0.5708, + "num_tokens": 461367082.0, + "step": 4905 + }, + { + "epoch": 0.8373442566990954, + "grad_norm": 0.4780985815017691, + "learning_rate": 6.513056835637481e-06, + "loss": 0.6419, + "num_tokens": 461467087.0, + "step": 4906 + }, + { + "epoch": 0.8375149342891278, + "grad_norm": 0.4491312073825426, + "learning_rate": 6.506229732036185e-06, + "loss": 0.5026, + "num_tokens": 461569150.0, + "step": 4907 + }, + { + "epoch": 0.8376856118791602, + "grad_norm": 0.5005403376201004, + "learning_rate": 6.499402628434887e-06, + "loss": 0.6025, + "num_tokens": 461674459.0, + "step": 4908 + }, + { + "epoch": 0.8378562894691927, + "grad_norm": 0.4801498510730418, + "learning_rate": 6.49257552483359e-06, + "loss": 0.5598, + "num_tokens": 461760327.0, + "step": 4909 + }, + { + "epoch": 0.8380269670592251, + "grad_norm": 0.5348669155580196, + "learning_rate": 6.485748421232293e-06, + "loss": 0.5634, + "num_tokens": 461838988.0, + "step": 4910 + }, + { + "epoch": 0.8381976446492575, + "grad_norm": 0.484848233801489, + "learning_rate": 6.478921317630995e-06, + "loss": 0.5666, + "num_tokens": 461925962.0, + "step": 4911 + }, + { + "epoch": 0.83836832223929, + "grad_norm": 0.4870041308847209, + "learning_rate": 6.4720942140296985e-06, + "loss": 0.546, + "num_tokens": 462007132.0, + "step": 4912 + }, + { + "epoch": 0.8385389998293225, + "grad_norm": 0.4506231970050041, + "learning_rate": 6.465267110428401e-06, + "loss": 0.538, + "num_tokens": 462100750.0, + "step": 4913 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.45791402000264647, + "learning_rate": 6.4584400068271035e-06, + "loss": 0.5177, + "num_tokens": 462196678.0, + "step": 4914 + }, + { + "epoch": 0.8388803550093873, + "grad_norm": 0.4559909143954396, + "learning_rate": 6.451612903225806e-06, + "loss": 0.6158, + "num_tokens": 462300899.0, + "step": 4915 + }, + { + "epoch": 0.8390510325994197, + "grad_norm": 0.45752494712160663, + "learning_rate": 6.44478579962451e-06, + "loss": 0.6186, + "num_tokens": 462418844.0, + "step": 4916 + }, + { + "epoch": 0.8392217101894521, + "grad_norm": 0.48742176321543074, + "learning_rate": 6.437958696023213e-06, + "loss": 0.6164, + "num_tokens": 462508407.0, + "step": 4917 + }, + { + "epoch": 0.8393923877794846, + "grad_norm": 0.5277346440695257, + "learning_rate": 6.431131592421915e-06, + "loss": 0.5599, + "num_tokens": 462578791.0, + "step": 4918 + }, + { + "epoch": 0.839563065369517, + "grad_norm": 0.5175008625968157, + "learning_rate": 6.424304488820619e-06, + "loss": 0.5389, + "num_tokens": 462652299.0, + "step": 4919 + }, + { + "epoch": 0.8397337429595494, + "grad_norm": 0.4072000940839893, + "learning_rate": 6.417477385219322e-06, + "loss": 0.532, + "num_tokens": 462773256.0, + "step": 4920 + }, + { + "epoch": 0.8399044205495818, + "grad_norm": 0.5354207320459092, + "learning_rate": 6.410650281618024e-06, + "loss": 0.5449, + "num_tokens": 462842166.0, + "step": 4921 + }, + { + "epoch": 0.8400750981396142, + "grad_norm": 0.4949535072696954, + "learning_rate": 6.403823178016727e-06, + "loss": 0.5229, + "num_tokens": 462921412.0, + "step": 4922 + }, + { + "epoch": 0.8402457757296466, + "grad_norm": 0.49139832827052105, + "learning_rate": 6.3969960744154306e-06, + "loss": 0.5513, + "num_tokens": 463010111.0, + "step": 4923 + }, + { + "epoch": 0.8404164533196792, + "grad_norm": 0.4888872382939385, + "learning_rate": 6.390168970814133e-06, + "loss": 0.6033, + "num_tokens": 463095694.0, + "step": 4924 + }, + { + "epoch": 0.8405871309097116, + "grad_norm": 0.42173923381028505, + "learning_rate": 6.3833418672128355e-06, + "loss": 0.5945, + "num_tokens": 463218865.0, + "step": 4925 + }, + { + "epoch": 0.840757808499744, + "grad_norm": 0.5172653719442127, + "learning_rate": 6.3765147636115385e-06, + "loss": 0.5019, + "num_tokens": 463289194.0, + "step": 4926 + }, + { + "epoch": 0.8409284860897764, + "grad_norm": 0.48541287746629275, + "learning_rate": 6.3696876600102405e-06, + "loss": 0.5578, + "num_tokens": 463374617.0, + "step": 4927 + }, + { + "epoch": 0.8410991636798089, + "grad_norm": 0.5240388010411126, + "learning_rate": 6.362860556408944e-06, + "loss": 0.5734, + "num_tokens": 463453621.0, + "step": 4928 + }, + { + "epoch": 0.8412698412698413, + "grad_norm": 0.45077644981956383, + "learning_rate": 6.356033452807647e-06, + "loss": 0.5665, + "num_tokens": 463560736.0, + "step": 4929 + }, + { + "epoch": 0.8414405188598737, + "grad_norm": 0.5120195772504303, + "learning_rate": 6.349206349206349e-06, + "loss": 0.575, + "num_tokens": 463639727.0, + "step": 4930 + }, + { + "epoch": 0.8416111964499061, + "grad_norm": 0.4640297197548327, + "learning_rate": 6.342379245605052e-06, + "loss": 0.5508, + "num_tokens": 463732045.0, + "step": 4931 + }, + { + "epoch": 0.8417818740399385, + "grad_norm": 0.44725700211862646, + "learning_rate": 6.335552142003756e-06, + "loss": 0.5851, + "num_tokens": 463844996.0, + "step": 4932 + }, + { + "epoch": 0.841952551629971, + "grad_norm": 0.4937551711871825, + "learning_rate": 6.328725038402458e-06, + "loss": 0.639, + "num_tokens": 463930037.0, + "step": 4933 + }, + { + "epoch": 0.8421232292200034, + "grad_norm": 0.4547325013484731, + "learning_rate": 6.321897934801161e-06, + "loss": 0.5335, + "num_tokens": 464023787.0, + "step": 4934 + }, + { + "epoch": 0.8422939068100358, + "grad_norm": 0.47783226717750654, + "learning_rate": 6.315070831199865e-06, + "loss": 0.5343, + "num_tokens": 464105453.0, + "step": 4935 + }, + { + "epoch": 0.8424645844000682, + "grad_norm": 0.4251037721552533, + "learning_rate": 6.308243727598567e-06, + "loss": 0.5723, + "num_tokens": 464222691.0, + "step": 4936 + }, + { + "epoch": 0.8426352619901007, + "grad_norm": 0.4655710665971173, + "learning_rate": 6.30141662399727e-06, + "loss": 0.5833, + "num_tokens": 464323042.0, + "step": 4937 + }, + { + "epoch": 0.8428059395801332, + "grad_norm": 0.46271848293200707, + "learning_rate": 6.294589520395973e-06, + "loss": 0.6355, + "num_tokens": 464425157.0, + "step": 4938 + }, + { + "epoch": 0.8429766171701656, + "grad_norm": 0.4922589610297303, + "learning_rate": 6.287762416794675e-06, + "loss": 0.4944, + "num_tokens": 464493018.0, + "step": 4939 + }, + { + "epoch": 0.843147294760198, + "grad_norm": 0.5023519731300099, + "learning_rate": 6.2809353131933784e-06, + "loss": 0.5336, + "num_tokens": 464564937.0, + "step": 4940 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 0.4696686845230933, + "learning_rate": 6.274108209592081e-06, + "loss": 0.574, + "num_tokens": 464652488.0, + "step": 4941 + }, + { + "epoch": 0.8434886499402628, + "grad_norm": 0.5304683392675822, + "learning_rate": 6.267281105990783e-06, + "loss": 0.5898, + "num_tokens": 464724292.0, + "step": 4942 + }, + { + "epoch": 0.8436593275302953, + "grad_norm": 0.4534995896410765, + "learning_rate": 6.260454002389486e-06, + "loss": 0.5417, + "num_tokens": 464830028.0, + "step": 4943 + }, + { + "epoch": 0.8438300051203277, + "grad_norm": 0.4647490478110065, + "learning_rate": 6.25362689878819e-06, + "loss": 0.5978, + "num_tokens": 464926119.0, + "step": 4944 + }, + { + "epoch": 0.8440006827103601, + "grad_norm": 0.48664088126400584, + "learning_rate": 6.246799795186892e-06, + "loss": 0.5375, + "num_tokens": 465017000.0, + "step": 4945 + }, + { + "epoch": 0.8441713603003925, + "grad_norm": 0.6206806037389087, + "learning_rate": 6.239972691585595e-06, + "loss": 0.5489, + "num_tokens": 465105612.0, + "step": 4946 + }, + { + "epoch": 0.844342037890425, + "grad_norm": 0.5006934459315261, + "learning_rate": 6.233145587984298e-06, + "loss": 0.509, + "num_tokens": 465182302.0, + "step": 4947 + }, + { + "epoch": 0.8445127154804574, + "grad_norm": 0.4966112852952901, + "learning_rate": 6.226318484383002e-06, + "loss": 0.5672, + "num_tokens": 465261925.0, + "step": 4948 + }, + { + "epoch": 0.8446833930704899, + "grad_norm": 0.41417003503217936, + "learning_rate": 6.219491380781704e-06, + "loss": 0.5812, + "num_tokens": 465390207.0, + "step": 4949 + }, + { + "epoch": 0.8448540706605223, + "grad_norm": 0.4309553425505083, + "learning_rate": 6.212664277180407e-06, + "loss": 0.5551, + "num_tokens": 465501575.0, + "step": 4950 + }, + { + "epoch": 0.8450247482505547, + "grad_norm": 0.45522028900013767, + "learning_rate": 6.2058371735791105e-06, + "loss": 0.6458, + "num_tokens": 465625718.0, + "step": 4951 + }, + { + "epoch": 0.8451954258405872, + "grad_norm": 0.4389599158163193, + "learning_rate": 6.1990100699778126e-06, + "loss": 0.5817, + "num_tokens": 465744338.0, + "step": 4952 + }, + { + "epoch": 0.8453661034306196, + "grad_norm": 0.4227358769393761, + "learning_rate": 6.1921829663765155e-06, + "loss": 0.5868, + "num_tokens": 465863880.0, + "step": 4953 + }, + { + "epoch": 0.845536781020652, + "grad_norm": 0.46635486336138454, + "learning_rate": 6.185355862775218e-06, + "loss": 0.5609, + "num_tokens": 465960241.0, + "step": 4954 + }, + { + "epoch": 0.8457074586106844, + "grad_norm": 0.43872333413666487, + "learning_rate": 6.1785287591739205e-06, + "loss": 0.5368, + "num_tokens": 466067526.0, + "step": 4955 + }, + { + "epoch": 0.8458781362007168, + "grad_norm": 0.5205888789222144, + "learning_rate": 6.171701655572624e-06, + "loss": 0.4672, + "num_tokens": 466134867.0, + "step": 4956 + }, + { + "epoch": 0.8460488137907493, + "grad_norm": 0.5001507171342837, + "learning_rate": 6.164874551971327e-06, + "loss": 0.5511, + "num_tokens": 466211766.0, + "step": 4957 + }, + { + "epoch": 0.8462194913807817, + "grad_norm": 0.48035157635720777, + "learning_rate": 6.158047448370029e-06, + "loss": 0.5221, + "num_tokens": 466292528.0, + "step": 4958 + }, + { + "epoch": 0.8463901689708141, + "grad_norm": 0.4413367146470332, + "learning_rate": 6.151220344768732e-06, + "loss": 0.567, + "num_tokens": 466389040.0, + "step": 4959 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 0.4399020608133824, + "learning_rate": 6.144393241167436e-06, + "loss": 0.5265, + "num_tokens": 466493816.0, + "step": 4960 + }, + { + "epoch": 0.846731524150879, + "grad_norm": 0.4675597861964928, + "learning_rate": 6.137566137566138e-06, + "loss": 0.579, + "num_tokens": 466586881.0, + "step": 4961 + }, + { + "epoch": 0.8469022017409115, + "grad_norm": 0.47014096220311874, + "learning_rate": 6.130739033964841e-06, + "loss": 0.5924, + "num_tokens": 466682551.0, + "step": 4962 + }, + { + "epoch": 0.8470728793309439, + "grad_norm": 0.4255090820324177, + "learning_rate": 6.123911930363544e-06, + "loss": 0.5333, + "num_tokens": 466788729.0, + "step": 4963 + }, + { + "epoch": 0.8472435569209763, + "grad_norm": 0.4540065922427179, + "learning_rate": 6.117084826762246e-06, + "loss": 0.6029, + "num_tokens": 466907074.0, + "step": 4964 + }, + { + "epoch": 0.8474142345110087, + "grad_norm": 0.4472707364464128, + "learning_rate": 6.11025772316095e-06, + "loss": 0.6514, + "num_tokens": 467024702.0, + "step": 4965 + }, + { + "epoch": 0.8475849121010411, + "grad_norm": 0.46190991022935485, + "learning_rate": 6.1034306195596525e-06, + "loss": 0.5853, + "num_tokens": 467128248.0, + "step": 4966 + }, + { + "epoch": 0.8477555896910736, + "grad_norm": 0.4556616361960912, + "learning_rate": 6.096603515958355e-06, + "loss": 0.5171, + "num_tokens": 467218497.0, + "step": 4967 + }, + { + "epoch": 0.847926267281106, + "grad_norm": 0.4853929441269046, + "learning_rate": 6.089776412357058e-06, + "loss": 0.5532, + "num_tokens": 467314327.0, + "step": 4968 + }, + { + "epoch": 0.8480969448711384, + "grad_norm": 0.47987875109464245, + "learning_rate": 6.082949308755761e-06, + "loss": 0.5237, + "num_tokens": 467392646.0, + "step": 4969 + }, + { + "epoch": 0.8482676224611708, + "grad_norm": 0.4258253480551358, + "learning_rate": 6.076122205154463e-06, + "loss": 0.6457, + "num_tokens": 467520947.0, + "step": 4970 + }, + { + "epoch": 0.8484383000512032, + "grad_norm": 0.46253018129612067, + "learning_rate": 6.069295101553166e-06, + "loss": 0.5419, + "num_tokens": 467603638.0, + "step": 4971 + }, + { + "epoch": 0.8486089776412357, + "grad_norm": 0.501106102940788, + "learning_rate": 6.06246799795187e-06, + "loss": 0.606, + "num_tokens": 467691176.0, + "step": 4972 + }, + { + "epoch": 0.8487796552312681, + "grad_norm": 0.5122151911421258, + "learning_rate": 6.055640894350572e-06, + "loss": 0.5561, + "num_tokens": 467762605.0, + "step": 4973 + }, + { + "epoch": 0.8489503328213006, + "grad_norm": 0.45377035383322745, + "learning_rate": 6.048813790749275e-06, + "loss": 0.5974, + "num_tokens": 467863630.0, + "step": 4974 + }, + { + "epoch": 0.849121010411333, + "grad_norm": 0.4693450580532555, + "learning_rate": 6.041986687147978e-06, + "loss": 0.5307, + "num_tokens": 467955460.0, + "step": 4975 + }, + { + "epoch": 0.8492916880013655, + "grad_norm": 0.4641537829880674, + "learning_rate": 6.03515958354668e-06, + "loss": 0.5209, + "num_tokens": 468039799.0, + "step": 4976 + }, + { + "epoch": 0.8494623655913979, + "grad_norm": 0.4478809090685906, + "learning_rate": 6.028332479945384e-06, + "loss": 0.5808, + "num_tokens": 468141335.0, + "step": 4977 + }, + { + "epoch": 0.8496330431814303, + "grad_norm": 0.5412303176121621, + "learning_rate": 6.021505376344087e-06, + "loss": 0.6263, + "num_tokens": 468211472.0, + "step": 4978 + }, + { + "epoch": 0.8498037207714627, + "grad_norm": 0.4300827149476992, + "learning_rate": 6.014678272742789e-06, + "loss": 0.5039, + "num_tokens": 468316831.0, + "step": 4979 + }, + { + "epoch": 0.8499743983614951, + "grad_norm": 0.4523808188898401, + "learning_rate": 6.007851169141492e-06, + "loss": 0.5606, + "num_tokens": 468416586.0, + "step": 4980 + }, + { + "epoch": 0.8501450759515276, + "grad_norm": 0.5163433803432683, + "learning_rate": 6.0010240655401954e-06, + "loss": 0.5752, + "num_tokens": 468499652.0, + "step": 4981 + }, + { + "epoch": 0.85031575354156, + "grad_norm": 0.5049551733618891, + "learning_rate": 5.994196961938898e-06, + "loss": 0.576, + "num_tokens": 468586055.0, + "step": 4982 + }, + { + "epoch": 0.8504864311315924, + "grad_norm": 0.47583352766646003, + "learning_rate": 5.9873698583376e-06, + "loss": 0.6931, + "num_tokens": 468697014.0, + "step": 4983 + }, + { + "epoch": 0.8506571087216248, + "grad_norm": 0.4479022690866181, + "learning_rate": 5.980542754736303e-06, + "loss": 0.559, + "num_tokens": 468800759.0, + "step": 4984 + }, + { + "epoch": 0.8508277863116572, + "grad_norm": 0.47424811701259717, + "learning_rate": 5.973715651135007e-06, + "loss": 0.5227, + "num_tokens": 468884265.0, + "step": 4985 + }, + { + "epoch": 0.8509984639016898, + "grad_norm": 0.502831386046737, + "learning_rate": 5.966888547533709e-06, + "loss": 0.6681, + "num_tokens": 468976618.0, + "step": 4986 + }, + { + "epoch": 0.8511691414917222, + "grad_norm": 0.5019056609504023, + "learning_rate": 5.960061443932412e-06, + "loss": 0.615, + "num_tokens": 469070591.0, + "step": 4987 + }, + { + "epoch": 0.8513398190817546, + "grad_norm": 0.5648291278884545, + "learning_rate": 5.953234340331116e-06, + "loss": 0.5831, + "num_tokens": 469136441.0, + "step": 4988 + }, + { + "epoch": 0.851510496671787, + "grad_norm": 0.48938913872236844, + "learning_rate": 5.946407236729818e-06, + "loss": 0.4827, + "num_tokens": 469207299.0, + "step": 4989 + }, + { + "epoch": 0.8516811742618194, + "grad_norm": 0.49890453742373925, + "learning_rate": 5.939580133128521e-06, + "loss": 0.5718, + "num_tokens": 469290674.0, + "step": 4990 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.43599316653260806, + "learning_rate": 5.932753029527224e-06, + "loss": 0.5188, + "num_tokens": 469388407.0, + "step": 4991 + }, + { + "epoch": 0.8520225294418843, + "grad_norm": 0.46605233461894896, + "learning_rate": 5.925925925925926e-06, + "loss": 0.5241, + "num_tokens": 469481122.0, + "step": 4992 + }, + { + "epoch": 0.8521932070319167, + "grad_norm": 0.4210575017614484, + "learning_rate": 5.91909882232463e-06, + "loss": 0.5678, + "num_tokens": 469602184.0, + "step": 4993 + }, + { + "epoch": 0.8523638846219491, + "grad_norm": 0.44107818211952005, + "learning_rate": 5.9122717187233325e-06, + "loss": 0.5876, + "num_tokens": 469705854.0, + "step": 4994 + }, + { + "epoch": 0.8525345622119815, + "grad_norm": 0.5032672691549811, + "learning_rate": 5.9054446151220346e-06, + "loss": 0.6238, + "num_tokens": 469797995.0, + "step": 4995 + }, + { + "epoch": 0.852705239802014, + "grad_norm": 0.4558838142452927, + "learning_rate": 5.8986175115207375e-06, + "loss": 0.6389, + "num_tokens": 469913776.0, + "step": 4996 + }, + { + "epoch": 0.8528759173920464, + "grad_norm": 0.4845665265015083, + "learning_rate": 5.891790407919441e-06, + "loss": 0.5687, + "num_tokens": 470002163.0, + "step": 4997 + }, + { + "epoch": 0.8530465949820788, + "grad_norm": 0.47434922331172363, + "learning_rate": 5.884963304318143e-06, + "loss": 0.5892, + "num_tokens": 470096706.0, + "step": 4998 + }, + { + "epoch": 0.8532172725721113, + "grad_norm": 0.5156649023741182, + "learning_rate": 5.878136200716846e-06, + "loss": 0.5854, + "num_tokens": 470173344.0, + "step": 4999 + }, + { + "epoch": 0.8533879501621437, + "grad_norm": 0.43543507939606246, + "learning_rate": 5.871309097115549e-06, + "loss": 0.5235, + "num_tokens": 470278287.0, + "step": 5000 + }, + { + "epoch": 0.8535586277521762, + "grad_norm": 0.44249546142736296, + "learning_rate": 5.864481993514251e-06, + "loss": 0.5368, + "num_tokens": 470379820.0, + "step": 5001 + }, + { + "epoch": 0.8537293053422086, + "grad_norm": 0.46547698260223785, + "learning_rate": 5.857654889912955e-06, + "loss": 0.6091, + "num_tokens": 470476787.0, + "step": 5002 + }, + { + "epoch": 0.853899982932241, + "grad_norm": 0.4897880483944473, + "learning_rate": 5.850827786311658e-06, + "loss": 0.6005, + "num_tokens": 470565476.0, + "step": 5003 + }, + { + "epoch": 0.8540706605222734, + "grad_norm": 0.5254322009703806, + "learning_rate": 5.84400068271036e-06, + "loss": 0.5696, + "num_tokens": 470644341.0, + "step": 5004 + }, + { + "epoch": 0.8542413381123058, + "grad_norm": 0.5461866350213606, + "learning_rate": 5.837173579109064e-06, + "loss": 0.5936, + "num_tokens": 470741294.0, + "step": 5005 + }, + { + "epoch": 0.8544120157023383, + "grad_norm": 0.46918546948538264, + "learning_rate": 5.830346475507767e-06, + "loss": 0.5747, + "num_tokens": 470837964.0, + "step": 5006 + }, + { + "epoch": 0.8545826932923707, + "grad_norm": 0.45706594418184043, + "learning_rate": 5.823519371906469e-06, + "loss": 0.5056, + "num_tokens": 470932066.0, + "step": 5007 + }, + { + "epoch": 0.8547533708824031, + "grad_norm": 0.48674917755486047, + "learning_rate": 5.816692268305172e-06, + "loss": 0.6109, + "num_tokens": 471023964.0, + "step": 5008 + }, + { + "epoch": 0.8549240484724355, + "grad_norm": 0.47275294316350747, + "learning_rate": 5.809865164703875e-06, + "loss": 0.6066, + "num_tokens": 471125311.0, + "step": 5009 + }, + { + "epoch": 0.855094726062468, + "grad_norm": 0.45239561109554344, + "learning_rate": 5.8030380611025775e-06, + "loss": 0.5673, + "num_tokens": 471230984.0, + "step": 5010 + }, + { + "epoch": 0.8552654036525005, + "grad_norm": 0.4383600798877033, + "learning_rate": 5.79621095750128e-06, + "loss": 0.5881, + "num_tokens": 471345400.0, + "step": 5011 + }, + { + "epoch": 0.8554360812425329, + "grad_norm": 0.4543568194902383, + "learning_rate": 5.789383853899983e-06, + "loss": 0.4937, + "num_tokens": 471437449.0, + "step": 5012 + }, + { + "epoch": 0.8556067588325653, + "grad_norm": 0.5154552720825754, + "learning_rate": 5.782556750298687e-06, + "loss": 0.6626, + "num_tokens": 471529722.0, + "step": 5013 + }, + { + "epoch": 0.8557774364225977, + "grad_norm": 0.47118380564578627, + "learning_rate": 5.775729646697389e-06, + "loss": 0.4742, + "num_tokens": 471613265.0, + "step": 5014 + }, + { + "epoch": 0.8559481140126302, + "grad_norm": 0.46719513903436755, + "learning_rate": 5.768902543096092e-06, + "loss": 0.5936, + "num_tokens": 471706944.0, + "step": 5015 + }, + { + "epoch": 0.8561187916026626, + "grad_norm": 0.4250630798777261, + "learning_rate": 5.762075439494795e-06, + "loss": 0.5197, + "num_tokens": 471815593.0, + "step": 5016 + }, + { + "epoch": 0.856289469192695, + "grad_norm": 0.5044876755814766, + "learning_rate": 5.755248335893497e-06, + "loss": 0.5881, + "num_tokens": 471902100.0, + "step": 5017 + }, + { + "epoch": 0.8564601467827274, + "grad_norm": 0.4514622170034405, + "learning_rate": 5.748421232292201e-06, + "loss": 0.5855, + "num_tokens": 472012523.0, + "step": 5018 + }, + { + "epoch": 0.8566308243727598, + "grad_norm": 0.45337881382150985, + "learning_rate": 5.741594128690904e-06, + "loss": 0.5983, + "num_tokens": 472113955.0, + "step": 5019 + }, + { + "epoch": 0.8568015019627923, + "grad_norm": 0.4287112962533035, + "learning_rate": 5.734767025089606e-06, + "loss": 0.579, + "num_tokens": 472226845.0, + "step": 5020 + }, + { + "epoch": 0.8569721795528247, + "grad_norm": 0.3819101503996246, + "learning_rate": 5.7279399214883095e-06, + "loss": 0.5729, + "num_tokens": 472372018.0, + "step": 5021 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.45043883023562864, + "learning_rate": 5.7211128178870124e-06, + "loss": 0.636, + "num_tokens": 472490299.0, + "step": 5022 + }, + { + "epoch": 0.8573135347328896, + "grad_norm": 0.4711425498400568, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.5703, + "num_tokens": 472581914.0, + "step": 5023 + }, + { + "epoch": 0.857484212322922, + "grad_norm": 0.4458423843452687, + "learning_rate": 5.7074586106844174e-06, + "loss": 0.5252, + "num_tokens": 472678585.0, + "step": 5024 + }, + { + "epoch": 0.8576548899129545, + "grad_norm": 0.4599096087852025, + "learning_rate": 5.700631507083121e-06, + "loss": 0.6168, + "num_tokens": 472777365.0, + "step": 5025 + }, + { + "epoch": 0.8578255675029869, + "grad_norm": 0.4383181589106577, + "learning_rate": 5.693804403481823e-06, + "loss": 0.575, + "num_tokens": 472889942.0, + "step": 5026 + }, + { + "epoch": 0.8579962450930193, + "grad_norm": 0.4650462446390513, + "learning_rate": 5.686977299880526e-06, + "loss": 0.5644, + "num_tokens": 472977741.0, + "step": 5027 + }, + { + "epoch": 0.8581669226830517, + "grad_norm": 0.4386807517669245, + "learning_rate": 5.680150196279229e-06, + "loss": 0.4953, + "num_tokens": 473069695.0, + "step": 5028 + }, + { + "epoch": 0.8583376002730841, + "grad_norm": 0.4508844514106691, + "learning_rate": 5.673323092677931e-06, + "loss": 0.5474, + "num_tokens": 473166982.0, + "step": 5029 + }, + { + "epoch": 0.8585082778631166, + "grad_norm": 0.4954461985829233, + "learning_rate": 5.666495989076635e-06, + "loss": 0.5562, + "num_tokens": 473246504.0, + "step": 5030 + }, + { + "epoch": 0.858678955453149, + "grad_norm": 0.44626478783752643, + "learning_rate": 5.659668885475338e-06, + "loss": 0.5891, + "num_tokens": 473357725.0, + "step": 5031 + }, + { + "epoch": 0.8588496330431814, + "grad_norm": 0.4624381246607165, + "learning_rate": 5.65284178187404e-06, + "loss": 0.5856, + "num_tokens": 473452618.0, + "step": 5032 + }, + { + "epoch": 0.8590203106332138, + "grad_norm": 0.4991333212855299, + "learning_rate": 5.646014678272743e-06, + "loss": 0.6117, + "num_tokens": 473542174.0, + "step": 5033 + }, + { + "epoch": 0.8591909882232462, + "grad_norm": 0.463581050754311, + "learning_rate": 5.639187574671447e-06, + "loss": 0.5212, + "num_tokens": 473630036.0, + "step": 5034 + }, + { + "epoch": 0.8593616658132787, + "grad_norm": 0.45472577791929064, + "learning_rate": 5.632360471070149e-06, + "loss": 0.575, + "num_tokens": 473727625.0, + "step": 5035 + }, + { + "epoch": 0.8595323434033112, + "grad_norm": 0.4840482040093249, + "learning_rate": 5.6255333674688516e-06, + "loss": 0.5626, + "num_tokens": 473813543.0, + "step": 5036 + }, + { + "epoch": 0.8597030209933436, + "grad_norm": 0.48078993631829914, + "learning_rate": 5.618706263867555e-06, + "loss": 0.5994, + "num_tokens": 473904017.0, + "step": 5037 + }, + { + "epoch": 0.859873698583376, + "grad_norm": 0.47948329612309026, + "learning_rate": 5.611879160266257e-06, + "loss": 0.562, + "num_tokens": 473992770.0, + "step": 5038 + }, + { + "epoch": 0.8600443761734085, + "grad_norm": 0.4781366081574548, + "learning_rate": 5.60505205666496e-06, + "loss": 0.5589, + "num_tokens": 474086398.0, + "step": 5039 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 0.5039575300834054, + "learning_rate": 5.598224953063663e-06, + "loss": 0.5934, + "num_tokens": 474183628.0, + "step": 5040 + }, + { + "epoch": 0.8603857313534733, + "grad_norm": 0.524180301892005, + "learning_rate": 5.591397849462365e-06, + "loss": 0.6644, + "num_tokens": 474270271.0, + "step": 5041 + }, + { + "epoch": 0.8605564089435057, + "grad_norm": 0.4032997186702399, + "learning_rate": 5.584570745861069e-06, + "loss": 0.5654, + "num_tokens": 474394543.0, + "step": 5042 + }, + { + "epoch": 0.8607270865335381, + "grad_norm": 0.44599815880767735, + "learning_rate": 5.577743642259772e-06, + "loss": 0.4685, + "num_tokens": 474484646.0, + "step": 5043 + }, + { + "epoch": 0.8608977641235706, + "grad_norm": 0.4729150594952984, + "learning_rate": 5.570916538658475e-06, + "loss": 0.5447, + "num_tokens": 474583919.0, + "step": 5044 + }, + { + "epoch": 0.861068441713603, + "grad_norm": 0.47626122460640113, + "learning_rate": 5.564089435057177e-06, + "loss": 0.5302, + "num_tokens": 474674086.0, + "step": 5045 + }, + { + "epoch": 0.8612391193036354, + "grad_norm": 0.5171239267577753, + "learning_rate": 5.557262331455881e-06, + "loss": 0.5236, + "num_tokens": 474746757.0, + "step": 5046 + }, + { + "epoch": 0.8614097968936678, + "grad_norm": 0.5043361202024221, + "learning_rate": 5.550435227854584e-06, + "loss": 0.5296, + "num_tokens": 474822009.0, + "step": 5047 + }, + { + "epoch": 0.8615804744837003, + "grad_norm": 0.5048219548229176, + "learning_rate": 5.543608124253286e-06, + "loss": 0.6156, + "num_tokens": 474906183.0, + "step": 5048 + }, + { + "epoch": 0.8617511520737328, + "grad_norm": 0.48131944496639484, + "learning_rate": 5.536781020651989e-06, + "loss": 0.5456, + "num_tokens": 474988670.0, + "step": 5049 + }, + { + "epoch": 0.8619218296637652, + "grad_norm": 0.4666946173757857, + "learning_rate": 5.529953917050692e-06, + "loss": 0.4876, + "num_tokens": 475072347.0, + "step": 5050 + }, + { + "epoch": 0.8620925072537976, + "grad_norm": 0.6065070231880086, + "learning_rate": 5.5231268134493945e-06, + "loss": 0.6143, + "num_tokens": 475164553.0, + "step": 5051 + }, + { + "epoch": 0.86226318484383, + "grad_norm": 0.5333924061453027, + "learning_rate": 5.516299709848097e-06, + "loss": 0.5749, + "num_tokens": 475236136.0, + "step": 5052 + }, + { + "epoch": 0.8624338624338624, + "grad_norm": 0.4642926432490608, + "learning_rate": 5.5094726062468e-06, + "loss": 0.5521, + "num_tokens": 475354074.0, + "step": 5053 + }, + { + "epoch": 0.8626045400238949, + "grad_norm": 0.5017726866657745, + "learning_rate": 5.502645502645503e-06, + "loss": 0.6278, + "num_tokens": 475439808.0, + "step": 5054 + }, + { + "epoch": 0.8627752176139273, + "grad_norm": 0.47093465010391083, + "learning_rate": 5.495818399044206e-06, + "loss": 0.5258, + "num_tokens": 475525814.0, + "step": 5055 + }, + { + "epoch": 0.8629458952039597, + "grad_norm": 0.4174937360392457, + "learning_rate": 5.488991295442909e-06, + "loss": 0.6159, + "num_tokens": 475662397.0, + "step": 5056 + }, + { + "epoch": 0.8631165727939921, + "grad_norm": 0.46949629928368236, + "learning_rate": 5.482164191841611e-06, + "loss": 0.5804, + "num_tokens": 475761304.0, + "step": 5057 + }, + { + "epoch": 0.8632872503840245, + "grad_norm": 0.45780382546308734, + "learning_rate": 5.475337088240315e-06, + "loss": 0.5194, + "num_tokens": 475845764.0, + "step": 5058 + }, + { + "epoch": 0.863457927974057, + "grad_norm": 0.44415266763585676, + "learning_rate": 5.468509984639018e-06, + "loss": 0.5363, + "num_tokens": 475942971.0, + "step": 5059 + }, + { + "epoch": 0.8636286055640894, + "grad_norm": 0.5047677224913351, + "learning_rate": 5.46168288103772e-06, + "loss": 0.6036, + "num_tokens": 476024031.0, + "step": 5060 + }, + { + "epoch": 0.8637992831541219, + "grad_norm": 0.4513986730447121, + "learning_rate": 5.454855777436423e-06, + "loss": 0.6098, + "num_tokens": 476132717.0, + "step": 5061 + }, + { + "epoch": 0.8639699607441543, + "grad_norm": 0.5543797317985505, + "learning_rate": 5.4480286738351265e-06, + "loss": 0.6028, + "num_tokens": 476207637.0, + "step": 5062 + }, + { + "epoch": 0.8641406383341868, + "grad_norm": 0.4821037676775579, + "learning_rate": 5.441201570233829e-06, + "loss": 0.6016, + "num_tokens": 476301915.0, + "step": 5063 + }, + { + "epoch": 0.8643113159242192, + "grad_norm": 0.5054331814699304, + "learning_rate": 5.4343744666325315e-06, + "loss": 0.5083, + "num_tokens": 476378352.0, + "step": 5064 + }, + { + "epoch": 0.8644819935142516, + "grad_norm": 0.4030589092541839, + "learning_rate": 5.4275473630312344e-06, + "loss": 0.5942, + "num_tokens": 476508972.0, + "step": 5065 + }, + { + "epoch": 0.864652671104284, + "grad_norm": 0.525146654964081, + "learning_rate": 5.4207202594299365e-06, + "loss": 0.6508, + "num_tokens": 476591423.0, + "step": 5066 + }, + { + "epoch": 0.8648233486943164, + "grad_norm": 0.4591851357753713, + "learning_rate": 5.41389315582864e-06, + "loss": 0.5491, + "num_tokens": 476686158.0, + "step": 5067 + }, + { + "epoch": 0.8649940262843488, + "grad_norm": 0.4252034832719552, + "learning_rate": 5.407066052227343e-06, + "loss": 0.5279, + "num_tokens": 476796068.0, + "step": 5068 + }, + { + "epoch": 0.8651647038743813, + "grad_norm": 0.41637225263657424, + "learning_rate": 5.400238948626045e-06, + "loss": 0.6395, + "num_tokens": 476924740.0, + "step": 5069 + }, + { + "epoch": 0.8653353814644137, + "grad_norm": 0.48678971671554466, + "learning_rate": 5.393411845024748e-06, + "loss": 0.599, + "num_tokens": 477019493.0, + "step": 5070 + }, + { + "epoch": 0.8655060590544461, + "grad_norm": 0.4446276687029465, + "learning_rate": 5.386584741423452e-06, + "loss": 0.588, + "num_tokens": 477120895.0, + "step": 5071 + }, + { + "epoch": 0.8656767366444785, + "grad_norm": 0.49048930441986355, + "learning_rate": 5.379757637822154e-06, + "loss": 0.5382, + "num_tokens": 477209100.0, + "step": 5072 + }, + { + "epoch": 0.8658474142345111, + "grad_norm": 0.4372359119186143, + "learning_rate": 5.372930534220857e-06, + "loss": 0.5888, + "num_tokens": 477324230.0, + "step": 5073 + }, + { + "epoch": 0.8660180918245435, + "grad_norm": 0.4542502429028389, + "learning_rate": 5.366103430619561e-06, + "loss": 0.5458, + "num_tokens": 477414416.0, + "step": 5074 + }, + { + "epoch": 0.8661887694145759, + "grad_norm": 0.5262258080362914, + "learning_rate": 5.359276327018263e-06, + "loss": 0.6974, + "num_tokens": 477501545.0, + "step": 5075 + }, + { + "epoch": 0.8663594470046083, + "grad_norm": 0.47787152751890183, + "learning_rate": 5.352449223416966e-06, + "loss": 0.5435, + "num_tokens": 477592426.0, + "step": 5076 + }, + { + "epoch": 0.8665301245946407, + "grad_norm": 0.4841429704079997, + "learning_rate": 5.345622119815669e-06, + "loss": 0.5727, + "num_tokens": 477681296.0, + "step": 5077 + }, + { + "epoch": 0.8667008021846732, + "grad_norm": 0.4659974341802965, + "learning_rate": 5.338795016214372e-06, + "loss": 0.5534, + "num_tokens": 477773415.0, + "step": 5078 + }, + { + "epoch": 0.8668714797747056, + "grad_norm": 0.4689832520986852, + "learning_rate": 5.331967912613074e-06, + "loss": 0.5029, + "num_tokens": 477849390.0, + "step": 5079 + }, + { + "epoch": 0.867042157364738, + "grad_norm": 0.4524433640584122, + "learning_rate": 5.325140809011777e-06, + "loss": 0.609, + "num_tokens": 477956312.0, + "step": 5080 + }, + { + "epoch": 0.8672128349547704, + "grad_norm": 0.4848452837410558, + "learning_rate": 5.31831370541048e-06, + "loss": 0.5722, + "num_tokens": 478040737.0, + "step": 5081 + }, + { + "epoch": 0.8673835125448028, + "grad_norm": 0.41595700268898184, + "learning_rate": 5.311486601809182e-06, + "loss": 0.5462, + "num_tokens": 478153199.0, + "step": 5082 + }, + { + "epoch": 0.8675541901348353, + "grad_norm": 0.5067155384647308, + "learning_rate": 5.304659498207886e-06, + "loss": 0.5704, + "num_tokens": 478242470.0, + "step": 5083 + }, + { + "epoch": 0.8677248677248677, + "grad_norm": 0.4546905676335257, + "learning_rate": 5.297832394606589e-06, + "loss": 0.579, + "num_tokens": 478344915.0, + "step": 5084 + }, + { + "epoch": 0.8678955453149002, + "grad_norm": 0.5090824820666209, + "learning_rate": 5.291005291005291e-06, + "loss": 0.6522, + "num_tokens": 478433795.0, + "step": 5085 + }, + { + "epoch": 0.8680662229049326, + "grad_norm": 0.5440080298384428, + "learning_rate": 5.284178187403994e-06, + "loss": 0.5485, + "num_tokens": 478504334.0, + "step": 5086 + }, + { + "epoch": 0.868236900494965, + "grad_norm": 0.4053630201337281, + "learning_rate": 5.277351083802698e-06, + "loss": 0.5015, + "num_tokens": 478622658.0, + "step": 5087 + }, + { + "epoch": 0.8684075780849975, + "grad_norm": 0.4836081839721407, + "learning_rate": 5.2705239802014e-06, + "loss": 0.5713, + "num_tokens": 478716846.0, + "step": 5088 + }, + { + "epoch": 0.8685782556750299, + "grad_norm": 0.41597324395102575, + "learning_rate": 5.263696876600103e-06, + "loss": 0.5227, + "num_tokens": 478832864.0, + "step": 5089 + }, + { + "epoch": 0.8687489332650623, + "grad_norm": 0.4473354623677014, + "learning_rate": 5.2568697729988065e-06, + "loss": 0.5659, + "num_tokens": 478938100.0, + "step": 5090 + }, + { + "epoch": 0.8689196108550947, + "grad_norm": 0.4782342401897195, + "learning_rate": 5.2500426693975086e-06, + "loss": 0.5479, + "num_tokens": 479025377.0, + "step": 5091 + }, + { + "epoch": 0.8690902884451271, + "grad_norm": 0.46976976681840144, + "learning_rate": 5.2432155657962115e-06, + "loss": 0.5106, + "num_tokens": 479110320.0, + "step": 5092 + }, + { + "epoch": 0.8692609660351596, + "grad_norm": 0.49001353057991087, + "learning_rate": 5.236388462194914e-06, + "loss": 0.6295, + "num_tokens": 479211488.0, + "step": 5093 + }, + { + "epoch": 0.869431643625192, + "grad_norm": 0.4655898204242139, + "learning_rate": 5.2295613585936165e-06, + "loss": 0.5328, + "num_tokens": 479300188.0, + "step": 5094 + }, + { + "epoch": 0.8696023212152244, + "grad_norm": 0.42984808881706194, + "learning_rate": 5.22273425499232e-06, + "loss": 0.5288, + "num_tokens": 479408661.0, + "step": 5095 + }, + { + "epoch": 0.8697729988052568, + "grad_norm": 0.4761595750070644, + "learning_rate": 5.215907151391023e-06, + "loss": 0.6271, + "num_tokens": 479513399.0, + "step": 5096 + }, + { + "epoch": 0.8699436763952892, + "grad_norm": 0.4610632420837057, + "learning_rate": 5.209080047789725e-06, + "loss": 0.5423, + "num_tokens": 479603315.0, + "step": 5097 + }, + { + "epoch": 0.8701143539853218, + "grad_norm": 0.4896837422291437, + "learning_rate": 5.202252944188428e-06, + "loss": 0.5894, + "num_tokens": 479691183.0, + "step": 5098 + }, + { + "epoch": 0.8702850315753542, + "grad_norm": 0.4557469082530974, + "learning_rate": 5.195425840587132e-06, + "loss": 0.5119, + "num_tokens": 479780750.0, + "step": 5099 + }, + { + "epoch": 0.8704557091653866, + "grad_norm": 0.4465893628210023, + "learning_rate": 5.188598736985834e-06, + "loss": 0.5317, + "num_tokens": 479876952.0, + "step": 5100 + }, + { + "epoch": 0.870626386755419, + "grad_norm": 0.49317080135429997, + "learning_rate": 5.181771633384537e-06, + "loss": 0.6075, + "num_tokens": 479966622.0, + "step": 5101 + }, + { + "epoch": 0.8707970643454515, + "grad_norm": 0.4779903693083875, + "learning_rate": 5.17494452978324e-06, + "loss": 0.5653, + "num_tokens": 480054953.0, + "step": 5102 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.48485145405251073, + "learning_rate": 5.168117426181942e-06, + "loss": 0.5679, + "num_tokens": 480135299.0, + "step": 5103 + }, + { + "epoch": 0.8711384195255163, + "grad_norm": 0.529942257004737, + "learning_rate": 5.161290322580646e-06, + "loss": 0.5993, + "num_tokens": 480207895.0, + "step": 5104 + }, + { + "epoch": 0.8713090971155487, + "grad_norm": 0.4775816406390114, + "learning_rate": 5.1544632189793485e-06, + "loss": 0.5616, + "num_tokens": 480301502.0, + "step": 5105 + }, + { + "epoch": 0.8714797747055811, + "grad_norm": 0.49828595834758116, + "learning_rate": 5.147636115378051e-06, + "loss": 0.5332, + "num_tokens": 480379226.0, + "step": 5106 + }, + { + "epoch": 0.8716504522956136, + "grad_norm": 0.3850406628514721, + "learning_rate": 5.140809011776754e-06, + "loss": 0.5372, + "num_tokens": 480510506.0, + "step": 5107 + }, + { + "epoch": 0.871821129885646, + "grad_norm": 0.44972464134094686, + "learning_rate": 5.133981908175457e-06, + "loss": 0.5495, + "num_tokens": 480605338.0, + "step": 5108 + }, + { + "epoch": 0.8719918074756784, + "grad_norm": 0.4234185519075699, + "learning_rate": 5.12715480457416e-06, + "loss": 0.5376, + "num_tokens": 480714239.0, + "step": 5109 + }, + { + "epoch": 0.8721624850657109, + "grad_norm": 0.4337246627995907, + "learning_rate": 5.120327700972862e-06, + "loss": 0.4848, + "num_tokens": 480813971.0, + "step": 5110 + }, + { + "epoch": 0.8723331626557433, + "grad_norm": 0.4454840849786393, + "learning_rate": 5.113500597371566e-06, + "loss": 0.5908, + "num_tokens": 480918255.0, + "step": 5111 + }, + { + "epoch": 0.8725038402457758, + "grad_norm": 0.4582709757291227, + "learning_rate": 5.106673493770269e-06, + "loss": 0.5747, + "num_tokens": 481018127.0, + "step": 5112 + }, + { + "epoch": 0.8726745178358082, + "grad_norm": 0.4666246577486718, + "learning_rate": 5.099846390168971e-06, + "loss": 0.4899, + "num_tokens": 481092929.0, + "step": 5113 + }, + { + "epoch": 0.8728451954258406, + "grad_norm": 0.4822541169014824, + "learning_rate": 5.093019286567674e-06, + "loss": 0.517, + "num_tokens": 481174645.0, + "step": 5114 + }, + { + "epoch": 0.873015873015873, + "grad_norm": 0.4362059039926803, + "learning_rate": 5.086192182966378e-06, + "loss": 0.5701, + "num_tokens": 481287685.0, + "step": 5115 + }, + { + "epoch": 0.8731865506059054, + "grad_norm": 0.4509374853165365, + "learning_rate": 5.07936507936508e-06, + "loss": 0.5237, + "num_tokens": 481383744.0, + "step": 5116 + }, + { + "epoch": 0.8733572281959379, + "grad_norm": 0.423699677582795, + "learning_rate": 5.072537975763783e-06, + "loss": 0.5521, + "num_tokens": 481496424.0, + "step": 5117 + }, + { + "epoch": 0.8735279057859703, + "grad_norm": 0.4385807266943721, + "learning_rate": 5.065710872162486e-06, + "loss": 0.6094, + "num_tokens": 481616083.0, + "step": 5118 + }, + { + "epoch": 0.8736985833760027, + "grad_norm": 0.5203189787570264, + "learning_rate": 5.058883768561188e-06, + "loss": 0.5561, + "num_tokens": 481687041.0, + "step": 5119 + }, + { + "epoch": 0.8738692609660351, + "grad_norm": 0.4306905054019209, + "learning_rate": 5.052056664959891e-06, + "loss": 0.4996, + "num_tokens": 481786062.0, + "step": 5120 + }, + { + "epoch": 0.8740399385560675, + "grad_norm": 0.466790971398107, + "learning_rate": 5.045229561358594e-06, + "loss": 0.5019, + "num_tokens": 481869285.0, + "step": 5121 + }, + { + "epoch": 0.8742106161461001, + "grad_norm": 0.49448161390651535, + "learning_rate": 5.038402457757296e-06, + "loss": 0.5551, + "num_tokens": 481948650.0, + "step": 5122 + }, + { + "epoch": 0.8743812937361325, + "grad_norm": 0.4785616574301245, + "learning_rate": 5.031575354155999e-06, + "loss": 0.5643, + "num_tokens": 482039542.0, + "step": 5123 + }, + { + "epoch": 0.8745519713261649, + "grad_norm": 0.529021202278291, + "learning_rate": 5.024748250554703e-06, + "loss": 0.5712, + "num_tokens": 482115314.0, + "step": 5124 + }, + { + "epoch": 0.8747226489161973, + "grad_norm": 0.5367845336472805, + "learning_rate": 5.017921146953405e-06, + "loss": 0.4983, + "num_tokens": 482180853.0, + "step": 5125 + }, + { + "epoch": 0.8748933265062298, + "grad_norm": 0.44855980522529565, + "learning_rate": 5.011094043352108e-06, + "loss": 0.6375, + "num_tokens": 482294976.0, + "step": 5126 + }, + { + "epoch": 0.8750640040962622, + "grad_norm": 0.4378289717249907, + "learning_rate": 5.004266939750812e-06, + "loss": 0.5634, + "num_tokens": 482408470.0, + "step": 5127 + }, + { + "epoch": 0.8752346816862946, + "grad_norm": 0.4323729714032558, + "learning_rate": 4.997439836149514e-06, + "loss": 0.5855, + "num_tokens": 482534070.0, + "step": 5128 + }, + { + "epoch": 0.875405359276327, + "grad_norm": 0.4894003954658156, + "learning_rate": 4.990612732548217e-06, + "loss": 0.5797, + "num_tokens": 482634835.0, + "step": 5129 + }, + { + "epoch": 0.8755760368663594, + "grad_norm": 0.4325870037970568, + "learning_rate": 4.98378562894692e-06, + "loss": 0.5139, + "num_tokens": 482735176.0, + "step": 5130 + }, + { + "epoch": 0.8757467144563919, + "grad_norm": 0.5573914620109769, + "learning_rate": 4.976958525345623e-06, + "loss": 0.5636, + "num_tokens": 482798210.0, + "step": 5131 + }, + { + "epoch": 0.8759173920464243, + "grad_norm": 0.47348521944671706, + "learning_rate": 4.9701314217443256e-06, + "loss": 0.5923, + "num_tokens": 482888660.0, + "step": 5132 + }, + { + "epoch": 0.8760880696364567, + "grad_norm": 0.5017965092056168, + "learning_rate": 4.9633043181430285e-06, + "loss": 0.4953, + "num_tokens": 482962970.0, + "step": 5133 + }, + { + "epoch": 0.8762587472264891, + "grad_norm": 0.4384648598753509, + "learning_rate": 4.956477214541731e-06, + "loss": 0.5354, + "num_tokens": 483068226.0, + "step": 5134 + }, + { + "epoch": 0.8764294248165216, + "grad_norm": 0.4815551154956585, + "learning_rate": 4.9496501109404335e-06, + "loss": 0.5286, + "num_tokens": 483153683.0, + "step": 5135 + }, + { + "epoch": 0.8766001024065541, + "grad_norm": 0.4234594214687516, + "learning_rate": 4.942823007339137e-06, + "loss": 0.5912, + "num_tokens": 483270945.0, + "step": 5136 + }, + { + "epoch": 0.8767707799965865, + "grad_norm": 0.43736742724381233, + "learning_rate": 4.935995903737839e-06, + "loss": 0.5584, + "num_tokens": 483377280.0, + "step": 5137 + }, + { + "epoch": 0.8769414575866189, + "grad_norm": 0.4915416408522379, + "learning_rate": 4.929168800136542e-06, + "loss": 0.5449, + "num_tokens": 483459122.0, + "step": 5138 + }, + { + "epoch": 0.8771121351766513, + "grad_norm": 0.4264220197945617, + "learning_rate": 4.922341696535245e-06, + "loss": 0.644, + "num_tokens": 483581112.0, + "step": 5139 + }, + { + "epoch": 0.8772828127666837, + "grad_norm": 0.4735082888212155, + "learning_rate": 4.915514592933948e-06, + "loss": 0.537, + "num_tokens": 483670473.0, + "step": 5140 + }, + { + "epoch": 0.8774534903567162, + "grad_norm": 0.4193689557052416, + "learning_rate": 4.908687489332651e-06, + "loss": 0.5736, + "num_tokens": 483800077.0, + "step": 5141 + }, + { + "epoch": 0.8776241679467486, + "grad_norm": 0.47271826702118397, + "learning_rate": 4.901860385731354e-06, + "loss": 0.5946, + "num_tokens": 483895151.0, + "step": 5142 + }, + { + "epoch": 0.877794845536781, + "grad_norm": 0.44303890130464607, + "learning_rate": 4.895033282130057e-06, + "loss": 0.5008, + "num_tokens": 483990821.0, + "step": 5143 + }, + { + "epoch": 0.8779655231268134, + "grad_norm": 0.45467908861190803, + "learning_rate": 4.88820617852876e-06, + "loss": 0.5219, + "num_tokens": 484086411.0, + "step": 5144 + }, + { + "epoch": 0.8781362007168458, + "grad_norm": 0.45300161039394576, + "learning_rate": 4.881379074927463e-06, + "loss": 0.5651, + "num_tokens": 484184665.0, + "step": 5145 + }, + { + "epoch": 0.8783068783068783, + "grad_norm": 0.45275418240441667, + "learning_rate": 4.8745519713261655e-06, + "loss": 0.511, + "num_tokens": 484271468.0, + "step": 5146 + }, + { + "epoch": 0.8784775558969108, + "grad_norm": 0.5327310959133266, + "learning_rate": 4.867724867724868e-06, + "loss": 0.5993, + "num_tokens": 484360028.0, + "step": 5147 + }, + { + "epoch": 0.8786482334869432, + "grad_norm": 0.4440921193744442, + "learning_rate": 4.860897764123571e-06, + "loss": 0.5728, + "num_tokens": 484462613.0, + "step": 5148 + }, + { + "epoch": 0.8788189110769756, + "grad_norm": 0.40553349345586953, + "learning_rate": 4.8540706605222734e-06, + "loss": 0.5347, + "num_tokens": 484578994.0, + "step": 5149 + }, + { + "epoch": 0.878989588667008, + "grad_norm": 0.4556558405238685, + "learning_rate": 4.847243556920977e-06, + "loss": 0.5547, + "num_tokens": 484679278.0, + "step": 5150 + }, + { + "epoch": 0.8791602662570405, + "grad_norm": 0.4342408885003157, + "learning_rate": 4.840416453319679e-06, + "loss": 0.5575, + "num_tokens": 484784942.0, + "step": 5151 + }, + { + "epoch": 0.8793309438470729, + "grad_norm": 0.4733049154374729, + "learning_rate": 4.833589349718382e-06, + "loss": 0.5703, + "num_tokens": 484876549.0, + "step": 5152 + }, + { + "epoch": 0.8795016214371053, + "grad_norm": 0.5667701008029796, + "learning_rate": 4.826762246117085e-06, + "loss": 0.6091, + "num_tokens": 484946813.0, + "step": 5153 + }, + { + "epoch": 0.8796722990271377, + "grad_norm": 0.4387924270105301, + "learning_rate": 4.819935142515788e-06, + "loss": 0.5178, + "num_tokens": 485043887.0, + "step": 5154 + }, + { + "epoch": 0.8798429766171701, + "grad_norm": 0.5008457115202295, + "learning_rate": 4.813108038914491e-06, + "loss": 0.6035, + "num_tokens": 485132315.0, + "step": 5155 + }, + { + "epoch": 0.8800136542072026, + "grad_norm": 0.5167369888378072, + "learning_rate": 4.806280935313194e-06, + "loss": 0.5344, + "num_tokens": 485206463.0, + "step": 5156 + }, + { + "epoch": 0.880184331797235, + "grad_norm": 0.44150259759058486, + "learning_rate": 4.799453831711897e-06, + "loss": 0.5676, + "num_tokens": 485309343.0, + "step": 5157 + }, + { + "epoch": 0.8803550093872674, + "grad_norm": 0.42841026847835595, + "learning_rate": 4.7926267281106e-06, + "loss": 0.5257, + "num_tokens": 485406396.0, + "step": 5158 + }, + { + "epoch": 0.8805256869772998, + "grad_norm": 0.47330134787546013, + "learning_rate": 4.785799624509303e-06, + "loss": 0.572, + "num_tokens": 485496664.0, + "step": 5159 + }, + { + "epoch": 0.8806963645673324, + "grad_norm": 0.5313116006445662, + "learning_rate": 4.7789725209080055e-06, + "loss": 0.5853, + "num_tokens": 485567533.0, + "step": 5160 + }, + { + "epoch": 0.8808670421573648, + "grad_norm": 0.45516103078993886, + "learning_rate": 4.772145417306708e-06, + "loss": 0.5952, + "num_tokens": 485664981.0, + "step": 5161 + }, + { + "epoch": 0.8810377197473972, + "grad_norm": 0.5086206062251374, + "learning_rate": 4.765318313705411e-06, + "loss": 0.6891, + "num_tokens": 485782414.0, + "step": 5162 + }, + { + "epoch": 0.8812083973374296, + "grad_norm": 0.4312852476813892, + "learning_rate": 4.758491210104113e-06, + "loss": 0.4988, + "num_tokens": 485877776.0, + "step": 5163 + }, + { + "epoch": 0.881379074927462, + "grad_norm": 0.463616548912973, + "learning_rate": 4.751664106502817e-06, + "loss": 0.6298, + "num_tokens": 485988874.0, + "step": 5164 + }, + { + "epoch": 0.8815497525174945, + "grad_norm": 0.4594135092142407, + "learning_rate": 4.744837002901519e-06, + "loss": 0.5573, + "num_tokens": 486103505.0, + "step": 5165 + }, + { + "epoch": 0.8817204301075269, + "grad_norm": 0.41345097389702745, + "learning_rate": 4.738009899300222e-06, + "loss": 0.55, + "num_tokens": 486223014.0, + "step": 5166 + }, + { + "epoch": 0.8818911076975593, + "grad_norm": 0.46067888544004726, + "learning_rate": 4.731182795698925e-06, + "loss": 0.5137, + "num_tokens": 486311045.0, + "step": 5167 + }, + { + "epoch": 0.8820617852875917, + "grad_norm": 0.5232465912791857, + "learning_rate": 4.724355692097628e-06, + "loss": 0.5379, + "num_tokens": 486378432.0, + "step": 5168 + }, + { + "epoch": 0.8822324628776241, + "grad_norm": 0.4439572720818059, + "learning_rate": 4.717528588496331e-06, + "loss": 0.4917, + "num_tokens": 486471168.0, + "step": 5169 + }, + { + "epoch": 0.8824031404676566, + "grad_norm": 0.42694859689153397, + "learning_rate": 4.710701484895034e-06, + "loss": 0.5874, + "num_tokens": 486592222.0, + "step": 5170 + }, + { + "epoch": 0.882573818057689, + "grad_norm": 0.5342659576222201, + "learning_rate": 4.703874381293737e-06, + "loss": 0.6162, + "num_tokens": 486679549.0, + "step": 5171 + }, + { + "epoch": 0.8827444956477215, + "grad_norm": 0.47485522285058845, + "learning_rate": 4.697047277692439e-06, + "loss": 0.5851, + "num_tokens": 486772356.0, + "step": 5172 + }, + { + "epoch": 0.8829151732377539, + "grad_norm": 0.4671103027886709, + "learning_rate": 4.6902201740911426e-06, + "loss": 0.6925, + "num_tokens": 486883996.0, + "step": 5173 + }, + { + "epoch": 0.8830858508277863, + "grad_norm": 0.4389894040951316, + "learning_rate": 4.683393070489845e-06, + "loss": 0.6119, + "num_tokens": 486998746.0, + "step": 5174 + }, + { + "epoch": 0.8832565284178188, + "grad_norm": 0.5266939283833365, + "learning_rate": 4.6765659668885476e-06, + "loss": 0.6363, + "num_tokens": 487077686.0, + "step": 5175 + }, + { + "epoch": 0.8834272060078512, + "grad_norm": 0.44162934346566235, + "learning_rate": 4.669738863287251e-06, + "loss": 0.4934, + "num_tokens": 487165338.0, + "step": 5176 + }, + { + "epoch": 0.8835978835978836, + "grad_norm": 0.436950663961292, + "learning_rate": 4.662911759685953e-06, + "loss": 0.614, + "num_tokens": 487276540.0, + "step": 5177 + }, + { + "epoch": 0.883768561187916, + "grad_norm": 0.46050386891461514, + "learning_rate": 4.656084656084656e-06, + "loss": 0.5656, + "num_tokens": 487380514.0, + "step": 5178 + }, + { + "epoch": 0.8839392387779484, + "grad_norm": 0.3984561442798695, + "learning_rate": 4.649257552483359e-06, + "loss": 0.5669, + "num_tokens": 487513669.0, + "step": 5179 + }, + { + "epoch": 0.8841099163679809, + "grad_norm": 0.44380955306344655, + "learning_rate": 4.642430448882062e-06, + "loss": 0.6426, + "num_tokens": 487623413.0, + "step": 5180 + }, + { + "epoch": 0.8842805939580133, + "grad_norm": 0.44418346628267796, + "learning_rate": 4.635603345280765e-06, + "loss": 0.5389, + "num_tokens": 487724659.0, + "step": 5181 + }, + { + "epoch": 0.8844512715480457, + "grad_norm": 0.4330570181548016, + "learning_rate": 4.628776241679468e-06, + "loss": 0.5364, + "num_tokens": 487833760.0, + "step": 5182 + }, + { + "epoch": 0.8846219491380781, + "grad_norm": 0.48686337883250086, + "learning_rate": 4.621949138078171e-06, + "loss": 0.5541, + "num_tokens": 487921471.0, + "step": 5183 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.516388280299665, + "learning_rate": 4.615122034476874e-06, + "loss": 0.6371, + "num_tokens": 488014177.0, + "step": 5184 + }, + { + "epoch": 0.8849633043181431, + "grad_norm": 0.4543543527124441, + "learning_rate": 4.608294930875577e-06, + "loss": 0.5004, + "num_tokens": 488098975.0, + "step": 5185 + }, + { + "epoch": 0.8851339819081755, + "grad_norm": 0.5041917245353443, + "learning_rate": 4.601467827274279e-06, + "loss": 0.5184, + "num_tokens": 488173107.0, + "step": 5186 + }, + { + "epoch": 0.8853046594982079, + "grad_norm": 0.5377418122641905, + "learning_rate": 4.5946407236729825e-06, + "loss": 0.5984, + "num_tokens": 488257664.0, + "step": 5187 + }, + { + "epoch": 0.8854753370882403, + "grad_norm": 0.44817041259464097, + "learning_rate": 4.587813620071685e-06, + "loss": 0.5116, + "num_tokens": 488350705.0, + "step": 5188 + }, + { + "epoch": 0.8856460146782728, + "grad_norm": 0.40237793792168597, + "learning_rate": 4.5809865164703875e-06, + "loss": 0.5459, + "num_tokens": 488478605.0, + "step": 5189 + }, + { + "epoch": 0.8858166922683052, + "grad_norm": 0.45395920203691287, + "learning_rate": 4.5741594128690904e-06, + "loss": 0.5686, + "num_tokens": 488580503.0, + "step": 5190 + }, + { + "epoch": 0.8859873698583376, + "grad_norm": 0.4590472144417116, + "learning_rate": 4.567332309267793e-06, + "loss": 0.5171, + "num_tokens": 488670131.0, + "step": 5191 + }, + { + "epoch": 0.88615804744837, + "grad_norm": 0.43362577036135047, + "learning_rate": 4.560505205666496e-06, + "loss": 0.5203, + "num_tokens": 488767790.0, + "step": 5192 + }, + { + "epoch": 0.8863287250384024, + "grad_norm": 0.4726936632256063, + "learning_rate": 4.553678102065199e-06, + "loss": 0.5324, + "num_tokens": 488854311.0, + "step": 5193 + }, + { + "epoch": 0.8864994026284349, + "grad_norm": 0.4892278147377575, + "learning_rate": 4.546850998463902e-06, + "loss": 0.5697, + "num_tokens": 488933585.0, + "step": 5194 + }, + { + "epoch": 0.8866700802184673, + "grad_norm": 0.4277987958800885, + "learning_rate": 4.540023894862605e-06, + "loss": 0.5729, + "num_tokens": 489057672.0, + "step": 5195 + }, + { + "epoch": 0.8868407578084997, + "grad_norm": 0.4273362870251915, + "learning_rate": 4.533196791261308e-06, + "loss": 0.4588, + "num_tokens": 489152209.0, + "step": 5196 + }, + { + "epoch": 0.8870114353985322, + "grad_norm": 0.45181982468692405, + "learning_rate": 4.526369687660011e-06, + "loss": 0.6, + "num_tokens": 489252233.0, + "step": 5197 + }, + { + "epoch": 0.8871821129885646, + "grad_norm": 0.4507474686863346, + "learning_rate": 4.519542584058714e-06, + "loss": 0.6551, + "num_tokens": 489360832.0, + "step": 5198 + }, + { + "epoch": 0.8873527905785971, + "grad_norm": 0.5089691539571304, + "learning_rate": 4.512715480457417e-06, + "loss": 0.5681, + "num_tokens": 489436773.0, + "step": 5199 + }, + { + "epoch": 0.8875234681686295, + "grad_norm": 0.49609657600175555, + "learning_rate": 4.505888376856119e-06, + "loss": 0.5276, + "num_tokens": 489512974.0, + "step": 5200 + }, + { + "epoch": 0.8876941457586619, + "grad_norm": 0.41609160728281275, + "learning_rate": 4.4990612732548225e-06, + "loss": 0.5639, + "num_tokens": 489630736.0, + "step": 5201 + }, + { + "epoch": 0.8878648233486943, + "grad_norm": 0.5014240135080654, + "learning_rate": 4.492234169653525e-06, + "loss": 0.5905, + "num_tokens": 489713462.0, + "step": 5202 + }, + { + "epoch": 0.8880355009387267, + "grad_norm": 0.4406056320818226, + "learning_rate": 4.4854070660522275e-06, + "loss": 0.4697, + "num_tokens": 489801979.0, + "step": 5203 + }, + { + "epoch": 0.8882061785287592, + "grad_norm": 0.5406345893239002, + "learning_rate": 4.47857996245093e-06, + "loss": 0.5338, + "num_tokens": 489869273.0, + "step": 5204 + }, + { + "epoch": 0.8883768561187916, + "grad_norm": 0.4499636318960037, + "learning_rate": 4.471752858849633e-06, + "loss": 0.6095, + "num_tokens": 489978105.0, + "step": 5205 + }, + { + "epoch": 0.888547533708824, + "grad_norm": 0.48130895385973954, + "learning_rate": 4.464925755248336e-06, + "loss": 0.6109, + "num_tokens": 490072731.0, + "step": 5206 + }, + { + "epoch": 0.8887182112988564, + "grad_norm": 0.4394238456280044, + "learning_rate": 4.458098651647039e-06, + "loss": 0.5535, + "num_tokens": 490174153.0, + "step": 5207 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.4274347138809668, + "learning_rate": 4.451271548045742e-06, + "loss": 0.5516, + "num_tokens": 490286333.0, + "step": 5208 + }, + { + "epoch": 0.8890595664789214, + "grad_norm": 0.4913399383469133, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5714, + "num_tokens": 490375160.0, + "step": 5209 + }, + { + "epoch": 0.8892302440689538, + "grad_norm": 0.5176861346811151, + "learning_rate": 4.437617340843148e-06, + "loss": 0.7168, + "num_tokens": 490468360.0, + "step": 5210 + }, + { + "epoch": 0.8894009216589862, + "grad_norm": 0.43732589749366185, + "learning_rate": 4.430790237241851e-06, + "loss": 0.5165, + "num_tokens": 490567370.0, + "step": 5211 + }, + { + "epoch": 0.8895715992490186, + "grad_norm": 0.4305830107383396, + "learning_rate": 4.423963133640554e-06, + "loss": 0.5019, + "num_tokens": 490668013.0, + "step": 5212 + }, + { + "epoch": 0.889742276839051, + "grad_norm": 0.4787488075004882, + "learning_rate": 4.417136030039257e-06, + "loss": 0.5993, + "num_tokens": 490763992.0, + "step": 5213 + }, + { + "epoch": 0.8899129544290835, + "grad_norm": 0.4240549573566833, + "learning_rate": 4.410308926437959e-06, + "loss": 0.5941, + "num_tokens": 490878354.0, + "step": 5214 + }, + { + "epoch": 0.8900836320191159, + "grad_norm": 0.5075261701586948, + "learning_rate": 4.4034818228366625e-06, + "loss": 0.5355, + "num_tokens": 490971534.0, + "step": 5215 + }, + { + "epoch": 0.8902543096091483, + "grad_norm": 0.47472776360807156, + "learning_rate": 4.3966547192353646e-06, + "loss": 0.6198, + "num_tokens": 491073514.0, + "step": 5216 + }, + { + "epoch": 0.8904249871991807, + "grad_norm": 0.53203848905701, + "learning_rate": 4.3898276156340675e-06, + "loss": 0.6302, + "num_tokens": 491168440.0, + "step": 5217 + }, + { + "epoch": 0.8905956647892131, + "grad_norm": 0.46157069897880515, + "learning_rate": 4.38300051203277e-06, + "loss": 0.5471, + "num_tokens": 491263369.0, + "step": 5218 + }, + { + "epoch": 0.8907663423792456, + "grad_norm": 0.48277258060687583, + "learning_rate": 4.376173408431473e-06, + "loss": 0.4805, + "num_tokens": 491344742.0, + "step": 5219 + }, + { + "epoch": 0.890937019969278, + "grad_norm": 0.4132220125294861, + "learning_rate": 4.369346304830176e-06, + "loss": 0.5656, + "num_tokens": 491463315.0, + "step": 5220 + }, + { + "epoch": 0.8911076975593104, + "grad_norm": 0.46498008340596725, + "learning_rate": 4.362519201228879e-06, + "loss": 0.5776, + "num_tokens": 491568578.0, + "step": 5221 + }, + { + "epoch": 0.8912783751493429, + "grad_norm": 0.4171225388760492, + "learning_rate": 4.355692097627582e-06, + "loss": 0.5288, + "num_tokens": 491679440.0, + "step": 5222 + }, + { + "epoch": 0.8914490527393754, + "grad_norm": 0.4755065757006446, + "learning_rate": 4.348864994026284e-06, + "loss": 0.5332, + "num_tokens": 491769931.0, + "step": 5223 + }, + { + "epoch": 0.8916197303294078, + "grad_norm": 0.4464705985392974, + "learning_rate": 4.342037890424988e-06, + "loss": 0.6027, + "num_tokens": 491879616.0, + "step": 5224 + }, + { + "epoch": 0.8917904079194402, + "grad_norm": 0.487494172405644, + "learning_rate": 4.33521078682369e-06, + "loss": 0.6147, + "num_tokens": 491969467.0, + "step": 5225 + }, + { + "epoch": 0.8919610855094726, + "grad_norm": 0.44583834722841853, + "learning_rate": 4.328383683222393e-06, + "loss": 0.5423, + "num_tokens": 492070482.0, + "step": 5226 + }, + { + "epoch": 0.892131763099505, + "grad_norm": 0.4938775498619923, + "learning_rate": 4.321556579621096e-06, + "loss": 0.6052, + "num_tokens": 492152322.0, + "step": 5227 + }, + { + "epoch": 0.8923024406895375, + "grad_norm": 0.483764849720068, + "learning_rate": 4.314729476019799e-06, + "loss": 0.4989, + "num_tokens": 492241238.0, + "step": 5228 + }, + { + "epoch": 0.8924731182795699, + "grad_norm": 0.4584380589858693, + "learning_rate": 4.3079023724185025e-06, + "loss": 0.5825, + "num_tokens": 492344316.0, + "step": 5229 + }, + { + "epoch": 0.8926437958696023, + "grad_norm": 0.46644091808051347, + "learning_rate": 4.3010752688172045e-06, + "loss": 0.5459, + "num_tokens": 492430273.0, + "step": 5230 + }, + { + "epoch": 0.8928144734596347, + "grad_norm": 0.45893163868217035, + "learning_rate": 4.2942481652159075e-06, + "loss": 0.6272, + "num_tokens": 492540977.0, + "step": 5231 + }, + { + "epoch": 0.8929851510496671, + "grad_norm": 0.4483047902349285, + "learning_rate": 4.28742106161461e-06, + "loss": 0.5308, + "num_tokens": 492633609.0, + "step": 5232 + }, + { + "epoch": 0.8931558286396996, + "grad_norm": 0.4673956932626181, + "learning_rate": 4.280593958013313e-06, + "loss": 0.5553, + "num_tokens": 492731714.0, + "step": 5233 + }, + { + "epoch": 0.8933265062297321, + "grad_norm": 0.43646667008655654, + "learning_rate": 4.273766854412016e-06, + "loss": 0.591, + "num_tokens": 492840586.0, + "step": 5234 + }, + { + "epoch": 0.8934971838197645, + "grad_norm": 0.45807847454009676, + "learning_rate": 4.266939750810719e-06, + "loss": 0.4998, + "num_tokens": 492927011.0, + "step": 5235 + }, + { + "epoch": 0.8936678614097969, + "grad_norm": 0.40746145168678793, + "learning_rate": 4.260112647209422e-06, + "loss": 0.5659, + "num_tokens": 493051308.0, + "step": 5236 + }, + { + "epoch": 0.8938385389998293, + "grad_norm": 0.44833343559637356, + "learning_rate": 4.253285543608124e-06, + "loss": 0.612, + "num_tokens": 493154850.0, + "step": 5237 + }, + { + "epoch": 0.8940092165898618, + "grad_norm": 0.4813280187860236, + "learning_rate": 4.246458440006828e-06, + "loss": 0.5435, + "num_tokens": 493239322.0, + "step": 5238 + }, + { + "epoch": 0.8941798941798942, + "grad_norm": 0.4752171813329055, + "learning_rate": 4.23963133640553e-06, + "loss": 0.4726, + "num_tokens": 493314421.0, + "step": 5239 + }, + { + "epoch": 0.8943505717699266, + "grad_norm": 0.46009344356661014, + "learning_rate": 4.232804232804233e-06, + "loss": 0.5306, + "num_tokens": 493405543.0, + "step": 5240 + }, + { + "epoch": 0.894521249359959, + "grad_norm": 0.559319226317564, + "learning_rate": 4.225977129202936e-06, + "loss": 0.6695, + "num_tokens": 493501577.0, + "step": 5241 + }, + { + "epoch": 0.8946919269499914, + "grad_norm": 0.41983807404503964, + "learning_rate": 4.219150025601639e-06, + "loss": 0.5405, + "num_tokens": 493610751.0, + "step": 5242 + }, + { + "epoch": 0.8948626045400239, + "grad_norm": 0.43899402802083015, + "learning_rate": 4.212322922000342e-06, + "loss": 0.5682, + "num_tokens": 493722913.0, + "step": 5243 + }, + { + "epoch": 0.8950332821300563, + "grad_norm": 0.40345130892200903, + "learning_rate": 4.2054958183990445e-06, + "loss": 0.5599, + "num_tokens": 493862749.0, + "step": 5244 + }, + { + "epoch": 0.8952039597200887, + "grad_norm": 0.4421066236081747, + "learning_rate": 4.1986687147977474e-06, + "loss": 0.5539, + "num_tokens": 493965164.0, + "step": 5245 + }, + { + "epoch": 0.8953746373101212, + "grad_norm": 0.43890805251864784, + "learning_rate": 4.19184161119645e-06, + "loss": 0.5338, + "num_tokens": 494064937.0, + "step": 5246 + }, + { + "epoch": 0.8955453149001537, + "grad_norm": 0.4373907115916884, + "learning_rate": 4.185014507595153e-06, + "loss": 0.5507, + "num_tokens": 494169202.0, + "step": 5247 + }, + { + "epoch": 0.8957159924901861, + "grad_norm": 0.5174422958683034, + "learning_rate": 4.178187403993856e-06, + "loss": 0.665, + "num_tokens": 494256878.0, + "step": 5248 + }, + { + "epoch": 0.8958866700802185, + "grad_norm": 0.4986774954926219, + "learning_rate": 4.171360300392559e-06, + "loss": 0.5699, + "num_tokens": 494352086.0, + "step": 5249 + }, + { + "epoch": 0.8960573476702509, + "grad_norm": 0.5031466847793998, + "learning_rate": 4.164533196791262e-06, + "loss": 0.4711, + "num_tokens": 494419676.0, + "step": 5250 + }, + { + "epoch": 0.8962280252602833, + "grad_norm": 0.5197423779542748, + "learning_rate": 4.157706093189964e-06, + "loss": 0.5968, + "num_tokens": 494500896.0, + "step": 5251 + }, + { + "epoch": 0.8963987028503158, + "grad_norm": 0.42510734723035165, + "learning_rate": 4.150878989588668e-06, + "loss": 0.5811, + "num_tokens": 494615638.0, + "step": 5252 + }, + { + "epoch": 0.8965693804403482, + "grad_norm": 0.46290936544695704, + "learning_rate": 4.14405188598737e-06, + "loss": 0.5755, + "num_tokens": 494710824.0, + "step": 5253 + }, + { + "epoch": 0.8967400580303806, + "grad_norm": 0.4911672261363778, + "learning_rate": 4.137224782386073e-06, + "loss": 0.5892, + "num_tokens": 494798321.0, + "step": 5254 + }, + { + "epoch": 0.896910735620413, + "grad_norm": 0.4254330966404156, + "learning_rate": 4.130397678784776e-06, + "loss": 0.5955, + "num_tokens": 494913764.0, + "step": 5255 + }, + { + "epoch": 0.8970814132104454, + "grad_norm": 0.6267557706404799, + "learning_rate": 4.123570575183479e-06, + "loss": 0.5538, + "num_tokens": 494991612.0, + "step": 5256 + }, + { + "epoch": 0.8972520908004779, + "grad_norm": 0.44241023701384286, + "learning_rate": 4.1167434715821816e-06, + "loss": 0.5173, + "num_tokens": 495100635.0, + "step": 5257 + }, + { + "epoch": 0.8974227683905103, + "grad_norm": 0.4520685654065106, + "learning_rate": 4.1099163679808845e-06, + "loss": 0.6038, + "num_tokens": 495208779.0, + "step": 5258 + }, + { + "epoch": 0.8975934459805428, + "grad_norm": 0.45945278134961287, + "learning_rate": 4.103089264379587e-06, + "loss": 0.5905, + "num_tokens": 495304844.0, + "step": 5259 + }, + { + "epoch": 0.8977641235705752, + "grad_norm": 0.5354509501914485, + "learning_rate": 4.09626216077829e-06, + "loss": 0.627, + "num_tokens": 495392226.0, + "step": 5260 + }, + { + "epoch": 0.8979348011606076, + "grad_norm": 0.4032340317534278, + "learning_rate": 4.089435057176993e-06, + "loss": 0.5156, + "num_tokens": 495512637.0, + "step": 5261 + }, + { + "epoch": 0.8981054787506401, + "grad_norm": 0.44771728717162596, + "learning_rate": 4.082607953575696e-06, + "loss": 0.6031, + "num_tokens": 495619513.0, + "step": 5262 + }, + { + "epoch": 0.8982761563406725, + "grad_norm": 0.4150649049796801, + "learning_rate": 4.075780849974399e-06, + "loss": 0.5512, + "num_tokens": 495733717.0, + "step": 5263 + }, + { + "epoch": 0.8984468339307049, + "grad_norm": 0.41658836781015235, + "learning_rate": 4.068953746373102e-06, + "loss": 0.4952, + "num_tokens": 495840630.0, + "step": 5264 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 0.45164505378619707, + "learning_rate": 4.062126642771804e-06, + "loss": 0.5941, + "num_tokens": 495943098.0, + "step": 5265 + }, + { + "epoch": 0.8987881891107697, + "grad_norm": 0.49526836468910035, + "learning_rate": 4.055299539170508e-06, + "loss": 0.6003, + "num_tokens": 496022412.0, + "step": 5266 + }, + { + "epoch": 0.8989588667008022, + "grad_norm": 0.48833819366985004, + "learning_rate": 4.04847243556921e-06, + "loss": 0.5905, + "num_tokens": 496108632.0, + "step": 5267 + }, + { + "epoch": 0.8991295442908346, + "grad_norm": 0.4843890520790355, + "learning_rate": 4.041645331967913e-06, + "loss": 0.5489, + "num_tokens": 496190875.0, + "step": 5268 + }, + { + "epoch": 0.899300221880867, + "grad_norm": 0.5262138519860232, + "learning_rate": 4.034818228366616e-06, + "loss": 0.5388, + "num_tokens": 496260235.0, + "step": 5269 + }, + { + "epoch": 0.8994708994708994, + "grad_norm": 0.4672629422057678, + "learning_rate": 4.027991124765319e-06, + "loss": 0.5886, + "num_tokens": 496364399.0, + "step": 5270 + }, + { + "epoch": 0.899641577060932, + "grad_norm": 0.4330535279412933, + "learning_rate": 4.0211640211640215e-06, + "loss": 0.5384, + "num_tokens": 496462839.0, + "step": 5271 + }, + { + "epoch": 0.8998122546509644, + "grad_norm": 0.4865475903140146, + "learning_rate": 4.0143369175627245e-06, + "loss": 0.5799, + "num_tokens": 496562017.0, + "step": 5272 + }, + { + "epoch": 0.8999829322409968, + "grad_norm": 0.5067583980027336, + "learning_rate": 4.007509813961427e-06, + "loss": 0.5004, + "num_tokens": 496630754.0, + "step": 5273 + }, + { + "epoch": 0.9001536098310292, + "grad_norm": 0.4209393556852556, + "learning_rate": 4.0006827103601294e-06, + "loss": 0.5506, + "num_tokens": 496743497.0, + "step": 5274 + }, + { + "epoch": 0.9003242874210616, + "grad_norm": 0.4730226998249523, + "learning_rate": 3.993855606758833e-06, + "loss": 0.5591, + "num_tokens": 496832662.0, + "step": 5275 + }, + { + "epoch": 0.900494965011094, + "grad_norm": 0.46340366833775526, + "learning_rate": 3.987028503157535e-06, + "loss": 0.5754, + "num_tokens": 496928032.0, + "step": 5276 + }, + { + "epoch": 0.9006656426011265, + "grad_norm": 0.49030775290988543, + "learning_rate": 3.980201399556239e-06, + "loss": 0.5547, + "num_tokens": 497006267.0, + "step": 5277 + }, + { + "epoch": 0.9008363201911589, + "grad_norm": 0.4598990434856833, + "learning_rate": 3.973374295954941e-06, + "loss": 0.5292, + "num_tokens": 497088961.0, + "step": 5278 + }, + { + "epoch": 0.9010069977811913, + "grad_norm": 0.4200127412228858, + "learning_rate": 3.966547192353644e-06, + "loss": 0.5692, + "num_tokens": 497209785.0, + "step": 5279 + }, + { + "epoch": 0.9011776753712237, + "grad_norm": 0.4733149788918243, + "learning_rate": 3.959720088752348e-06, + "loss": 0.5311, + "num_tokens": 497305190.0, + "step": 5280 + }, + { + "epoch": 0.9013483529612561, + "grad_norm": 0.511258204862203, + "learning_rate": 3.95289298515105e-06, + "loss": 0.6011, + "num_tokens": 497391844.0, + "step": 5281 + }, + { + "epoch": 0.9015190305512886, + "grad_norm": 0.4986935448845693, + "learning_rate": 3.946065881549753e-06, + "loss": 0.517, + "num_tokens": 497465332.0, + "step": 5282 + }, + { + "epoch": 0.901689708141321, + "grad_norm": 0.4221048728982547, + "learning_rate": 3.939238777948456e-06, + "loss": 0.5561, + "num_tokens": 497579889.0, + "step": 5283 + }, + { + "epoch": 0.9018603857313535, + "grad_norm": 0.42456218401772416, + "learning_rate": 3.932411674347159e-06, + "loss": 0.4864, + "num_tokens": 497685220.0, + "step": 5284 + }, + { + "epoch": 0.9020310633213859, + "grad_norm": 0.41559796086174405, + "learning_rate": 3.9255845707458615e-06, + "loss": 0.6378, + "num_tokens": 497817766.0, + "step": 5285 + }, + { + "epoch": 0.9022017409114184, + "grad_norm": 0.45055082196088553, + "learning_rate": 3.9187574671445644e-06, + "loss": 0.6185, + "num_tokens": 497922914.0, + "step": 5286 + }, + { + "epoch": 0.9023724185014508, + "grad_norm": 0.44013370605188984, + "learning_rate": 3.911930363543267e-06, + "loss": 0.5069, + "num_tokens": 498023618.0, + "step": 5287 + }, + { + "epoch": 0.9025430960914832, + "grad_norm": 0.42297426614304845, + "learning_rate": 3.905103259941969e-06, + "loss": 0.4891, + "num_tokens": 498125697.0, + "step": 5288 + }, + { + "epoch": 0.9027137736815156, + "grad_norm": 0.4895850031492476, + "learning_rate": 3.898276156340673e-06, + "loss": 0.5902, + "num_tokens": 498218138.0, + "step": 5289 + }, + { + "epoch": 0.902884451271548, + "grad_norm": 0.4748635414974263, + "learning_rate": 3.891449052739375e-06, + "loss": 0.5877, + "num_tokens": 498307557.0, + "step": 5290 + }, + { + "epoch": 0.9030551288615805, + "grad_norm": 0.47349889347471913, + "learning_rate": 3.884621949138078e-06, + "loss": 0.5867, + "num_tokens": 498406404.0, + "step": 5291 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.4947965654885615, + "learning_rate": 3.877794845536781e-06, + "loss": 0.6185, + "num_tokens": 498494104.0, + "step": 5292 + }, + { + "epoch": 0.9033964840416453, + "grad_norm": 0.5195272577243116, + "learning_rate": 3.870967741935484e-06, + "loss": 0.5555, + "num_tokens": 498569990.0, + "step": 5293 + }, + { + "epoch": 0.9035671616316777, + "grad_norm": 0.5879878941000674, + "learning_rate": 3.864140638334187e-06, + "loss": 0.5428, + "num_tokens": 498672054.0, + "step": 5294 + }, + { + "epoch": 0.9037378392217101, + "grad_norm": 0.4678973775788344, + "learning_rate": 3.85731353473289e-06, + "loss": 0.5706, + "num_tokens": 498762496.0, + "step": 5295 + }, + { + "epoch": 0.9039085168117427, + "grad_norm": 0.4541718023087249, + "learning_rate": 3.850486431131593e-06, + "loss": 0.5356, + "num_tokens": 498852834.0, + "step": 5296 + }, + { + "epoch": 0.9040791944017751, + "grad_norm": 0.44342211838314227, + "learning_rate": 3.843659327530296e-06, + "loss": 0.5587, + "num_tokens": 498955265.0, + "step": 5297 + }, + { + "epoch": 0.9042498719918075, + "grad_norm": 0.42282129227179205, + "learning_rate": 3.8368322239289986e-06, + "loss": 0.5154, + "num_tokens": 499059743.0, + "step": 5298 + }, + { + "epoch": 0.9044205495818399, + "grad_norm": 0.4669888274350278, + "learning_rate": 3.8300051203277015e-06, + "loss": 0.5856, + "num_tokens": 499157690.0, + "step": 5299 + }, + { + "epoch": 0.9045912271718723, + "grad_norm": 0.4408748130655521, + "learning_rate": 3.823178016726404e-06, + "loss": 0.513, + "num_tokens": 499251744.0, + "step": 5300 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 0.47752986786073437, + "learning_rate": 3.816350913125107e-06, + "loss": 0.6228, + "num_tokens": 499344975.0, + "step": 5301 + }, + { + "epoch": 0.9049325823519372, + "grad_norm": 0.4966600871599882, + "learning_rate": 3.80952380952381e-06, + "loss": 0.5536, + "num_tokens": 499435579.0, + "step": 5302 + }, + { + "epoch": 0.9051032599419696, + "grad_norm": 0.5158845682359557, + "learning_rate": 3.8026967059225127e-06, + "loss": 0.6439, + "num_tokens": 499531909.0, + "step": 5303 + }, + { + "epoch": 0.905273937532002, + "grad_norm": 0.49805457114656576, + "learning_rate": 3.7958696023212156e-06, + "loss": 0.5132, + "num_tokens": 499605042.0, + "step": 5304 + }, + { + "epoch": 0.9054446151220344, + "grad_norm": 0.4499461462579217, + "learning_rate": 3.789042498719918e-06, + "loss": 0.5871, + "num_tokens": 499709150.0, + "step": 5305 + }, + { + "epoch": 0.9056152927120669, + "grad_norm": 0.439065994718018, + "learning_rate": 3.7822153951186215e-06, + "loss": 0.5882, + "num_tokens": 499815893.0, + "step": 5306 + }, + { + "epoch": 0.9057859703020993, + "grad_norm": 0.4822563399315101, + "learning_rate": 3.775388291517324e-06, + "loss": 0.5449, + "num_tokens": 499901992.0, + "step": 5307 + }, + { + "epoch": 0.9059566478921318, + "grad_norm": 0.4422057863700933, + "learning_rate": 3.7685611879160273e-06, + "loss": 0.5453, + "num_tokens": 500001443.0, + "step": 5308 + }, + { + "epoch": 0.9061273254821642, + "grad_norm": 0.43104621541545146, + "learning_rate": 3.76173408431473e-06, + "loss": 0.5065, + "num_tokens": 500099143.0, + "step": 5309 + }, + { + "epoch": 0.9062980030721967, + "grad_norm": 0.4492082808191896, + "learning_rate": 3.7549069807134327e-06, + "loss": 0.5391, + "num_tokens": 500193172.0, + "step": 5310 + }, + { + "epoch": 0.9064686806622291, + "grad_norm": 0.5138298907923574, + "learning_rate": 3.7480798771121356e-06, + "loss": 0.608, + "num_tokens": 500271856.0, + "step": 5311 + }, + { + "epoch": 0.9066393582522615, + "grad_norm": 0.5003975025638752, + "learning_rate": 3.7412527735108386e-06, + "loss": 0.5244, + "num_tokens": 500349395.0, + "step": 5312 + }, + { + "epoch": 0.9068100358422939, + "grad_norm": 0.4533467488639566, + "learning_rate": 3.734425669909541e-06, + "loss": 0.5453, + "num_tokens": 500444188.0, + "step": 5313 + }, + { + "epoch": 0.9069807134323263, + "grad_norm": 0.45990949927691965, + "learning_rate": 3.7275985663082444e-06, + "loss": 0.6021, + "num_tokens": 500544490.0, + "step": 5314 + }, + { + "epoch": 0.9071513910223588, + "grad_norm": 0.46098139830768714, + "learning_rate": 3.720771462706947e-06, + "loss": 0.5652, + "num_tokens": 500637770.0, + "step": 5315 + }, + { + "epoch": 0.9073220686123912, + "grad_norm": 0.5210042113732044, + "learning_rate": 3.7139443591056494e-06, + "loss": 0.4725, + "num_tokens": 500697719.0, + "step": 5316 + }, + { + "epoch": 0.9074927462024236, + "grad_norm": 0.44512613880827945, + "learning_rate": 3.7071172555043527e-06, + "loss": 0.5612, + "num_tokens": 500797049.0, + "step": 5317 + }, + { + "epoch": 0.907663423792456, + "grad_norm": 0.5158750782437626, + "learning_rate": 3.700290151903055e-06, + "loss": 0.6351, + "num_tokens": 500885808.0, + "step": 5318 + }, + { + "epoch": 0.9078341013824884, + "grad_norm": 0.4267600511761505, + "learning_rate": 3.693463048301758e-06, + "loss": 0.5221, + "num_tokens": 500988810.0, + "step": 5319 + }, + { + "epoch": 0.9080047789725209, + "grad_norm": 0.5552799697604747, + "learning_rate": 3.6866359447004615e-06, + "loss": 0.5437, + "num_tokens": 501054264.0, + "step": 5320 + }, + { + "epoch": 0.9081754565625534, + "grad_norm": 0.46002129527354063, + "learning_rate": 3.679808841099164e-06, + "loss": 0.5802, + "num_tokens": 501159235.0, + "step": 5321 + }, + { + "epoch": 0.9083461341525858, + "grad_norm": 0.4537611988055724, + "learning_rate": 3.6729817374978664e-06, + "loss": 0.515, + "num_tokens": 501255005.0, + "step": 5322 + }, + { + "epoch": 0.9085168117426182, + "grad_norm": 0.5042673160811617, + "learning_rate": 3.6661546338965698e-06, + "loss": 0.475, + "num_tokens": 501318543.0, + "step": 5323 + }, + { + "epoch": 0.9086874893326506, + "grad_norm": 0.5046256632126798, + "learning_rate": 3.6593275302952723e-06, + "loss": 0.5746, + "num_tokens": 501399760.0, + "step": 5324 + }, + { + "epoch": 0.9088581669226831, + "grad_norm": 0.49985995076857975, + "learning_rate": 3.6525004266939756e-06, + "loss": 0.6416, + "num_tokens": 501499039.0, + "step": 5325 + }, + { + "epoch": 0.9090288445127155, + "grad_norm": 0.5066076843312194, + "learning_rate": 3.645673323092678e-06, + "loss": 0.6208, + "num_tokens": 501597290.0, + "step": 5326 + }, + { + "epoch": 0.9091995221027479, + "grad_norm": 0.5612005584788595, + "learning_rate": 3.638846219491381e-06, + "loss": 0.5211, + "num_tokens": 501665830.0, + "step": 5327 + }, + { + "epoch": 0.9093701996927803, + "grad_norm": 0.48459371276387153, + "learning_rate": 3.6320191158900844e-06, + "loss": 0.6183, + "num_tokens": 501758843.0, + "step": 5328 + }, + { + "epoch": 0.9095408772828127, + "grad_norm": 0.4354684606710131, + "learning_rate": 3.625192012288787e-06, + "loss": 0.6265, + "num_tokens": 501889294.0, + "step": 5329 + }, + { + "epoch": 0.9097115548728452, + "grad_norm": 0.4532610157075881, + "learning_rate": 3.6183649086874893e-06, + "loss": 0.5604, + "num_tokens": 501991920.0, + "step": 5330 + }, + { + "epoch": 0.9098822324628776, + "grad_norm": 0.44036428186362525, + "learning_rate": 3.6115378050861927e-06, + "loss": 0.5367, + "num_tokens": 502092757.0, + "step": 5331 + }, + { + "epoch": 0.91005291005291, + "grad_norm": 0.44784260507666557, + "learning_rate": 3.604710701484895e-06, + "loss": 0.5584, + "num_tokens": 502189872.0, + "step": 5332 + }, + { + "epoch": 0.9102235876429425, + "grad_norm": 0.5098483310552882, + "learning_rate": 3.597883597883598e-06, + "loss": 0.5433, + "num_tokens": 502268903.0, + "step": 5333 + }, + { + "epoch": 0.910394265232975, + "grad_norm": 0.5279586718227585, + "learning_rate": 3.591056494282301e-06, + "loss": 0.6459, + "num_tokens": 502346243.0, + "step": 5334 + }, + { + "epoch": 0.9105649428230074, + "grad_norm": 0.467028359145154, + "learning_rate": 3.584229390681004e-06, + "loss": 0.533, + "num_tokens": 502450048.0, + "step": 5335 + }, + { + "epoch": 0.9107356204130398, + "grad_norm": 0.4759699921064004, + "learning_rate": 3.5774022870797064e-06, + "loss": 0.5314, + "num_tokens": 502547782.0, + "step": 5336 + }, + { + "epoch": 0.9109062980030722, + "grad_norm": 0.47551355070149093, + "learning_rate": 3.5705751834784098e-06, + "loss": 0.5232, + "num_tokens": 502636343.0, + "step": 5337 + }, + { + "epoch": 0.9110769755931046, + "grad_norm": 0.46191322105220445, + "learning_rate": 3.5637480798771122e-06, + "loss": 0.5805, + "num_tokens": 502736017.0, + "step": 5338 + }, + { + "epoch": 0.911247653183137, + "grad_norm": 0.4034513223538862, + "learning_rate": 3.556920976275815e-06, + "loss": 0.5138, + "num_tokens": 502858152.0, + "step": 5339 + }, + { + "epoch": 0.9114183307731695, + "grad_norm": 0.48779656399502175, + "learning_rate": 3.550093872674518e-06, + "loss": 0.6483, + "num_tokens": 502960297.0, + "step": 5340 + }, + { + "epoch": 0.9115890083632019, + "grad_norm": 0.5044727072421453, + "learning_rate": 3.543266769073221e-06, + "loss": 0.4858, + "num_tokens": 503027500.0, + "step": 5341 + }, + { + "epoch": 0.9117596859532343, + "grad_norm": 0.44210841959825287, + "learning_rate": 3.536439665471924e-06, + "loss": 0.5938, + "num_tokens": 503141835.0, + "step": 5342 + }, + { + "epoch": 0.9119303635432667, + "grad_norm": 0.4647582847544203, + "learning_rate": 3.529612561870627e-06, + "loss": 0.5936, + "num_tokens": 503240687.0, + "step": 5343 + }, + { + "epoch": 0.9121010411332992, + "grad_norm": 0.4887299124223356, + "learning_rate": 3.5227854582693293e-06, + "loss": 0.5657, + "num_tokens": 503325106.0, + "step": 5344 + }, + { + "epoch": 0.9122717187233317, + "grad_norm": 0.4692336486779531, + "learning_rate": 3.5159583546680327e-06, + "loss": 0.4883, + "num_tokens": 503406563.0, + "step": 5345 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.4344388918316503, + "learning_rate": 3.509131251066735e-06, + "loss": 0.5447, + "num_tokens": 503517910.0, + "step": 5346 + }, + { + "epoch": 0.9126130739033965, + "grad_norm": 0.5001312674113143, + "learning_rate": 3.502304147465438e-06, + "loss": 0.5722, + "num_tokens": 503599299.0, + "step": 5347 + }, + { + "epoch": 0.9127837514934289, + "grad_norm": 0.5042623242992517, + "learning_rate": 3.495477043864141e-06, + "loss": 0.5053, + "num_tokens": 503677789.0, + "step": 5348 + }, + { + "epoch": 0.9129544290834614, + "grad_norm": 0.44027735843265986, + "learning_rate": 3.488649940262844e-06, + "loss": 0.6658, + "num_tokens": 503794962.0, + "step": 5349 + }, + { + "epoch": 0.9131251066734938, + "grad_norm": 0.42622970059420473, + "learning_rate": 3.4818228366615464e-06, + "loss": 0.6041, + "num_tokens": 503914808.0, + "step": 5350 + }, + { + "epoch": 0.9132957842635262, + "grad_norm": 0.45234575808231536, + "learning_rate": 3.4749957330602497e-06, + "loss": 0.5556, + "num_tokens": 504019217.0, + "step": 5351 + }, + { + "epoch": 0.9134664618535586, + "grad_norm": 0.5843350939887356, + "learning_rate": 3.4681686294589522e-06, + "loss": 0.5486, + "num_tokens": 504102075.0, + "step": 5352 + }, + { + "epoch": 0.913637139443591, + "grad_norm": 0.452333424504742, + "learning_rate": 3.461341525857655e-06, + "loss": 0.5682, + "num_tokens": 504209305.0, + "step": 5353 + }, + { + "epoch": 0.9138078170336235, + "grad_norm": 0.5201904006886146, + "learning_rate": 3.454514422256358e-06, + "loss": 0.615, + "num_tokens": 504290293.0, + "step": 5354 + }, + { + "epoch": 0.9139784946236559, + "grad_norm": 0.4671772810853668, + "learning_rate": 3.447687318655061e-06, + "loss": 0.644, + "num_tokens": 504404746.0, + "step": 5355 + }, + { + "epoch": 0.9141491722136883, + "grad_norm": 0.48217747051886156, + "learning_rate": 3.440860215053764e-06, + "loss": 0.6058, + "num_tokens": 504495659.0, + "step": 5356 + }, + { + "epoch": 0.9143198498037207, + "grad_norm": 0.4813663513243222, + "learning_rate": 3.434033111452467e-06, + "loss": 0.5417, + "num_tokens": 504583780.0, + "step": 5357 + }, + { + "epoch": 0.9144905273937532, + "grad_norm": 0.5065771384967627, + "learning_rate": 3.4272060078511693e-06, + "loss": 0.5182, + "num_tokens": 504654568.0, + "step": 5358 + }, + { + "epoch": 0.9146612049837857, + "grad_norm": 0.47327112357492124, + "learning_rate": 3.4203789042498726e-06, + "loss": 0.6349, + "num_tokens": 504757506.0, + "step": 5359 + }, + { + "epoch": 0.9148318825738181, + "grad_norm": 0.4721451244647943, + "learning_rate": 3.413551800648575e-06, + "loss": 0.6801, + "num_tokens": 504869566.0, + "step": 5360 + }, + { + "epoch": 0.9150025601638505, + "grad_norm": 0.4269208440250479, + "learning_rate": 3.4067246970472776e-06, + "loss": 0.4944, + "num_tokens": 504965126.0, + "step": 5361 + }, + { + "epoch": 0.9151732377538829, + "grad_norm": 0.5307606704310299, + "learning_rate": 3.399897593445981e-06, + "loss": 0.5975, + "num_tokens": 505044772.0, + "step": 5362 + }, + { + "epoch": 0.9153439153439153, + "grad_norm": 0.5120139516645136, + "learning_rate": 3.393070489844684e-06, + "loss": 0.5743, + "num_tokens": 505122677.0, + "step": 5363 + }, + { + "epoch": 0.9155145929339478, + "grad_norm": 0.4836326939491452, + "learning_rate": 3.3862433862433864e-06, + "loss": 0.5085, + "num_tokens": 505205454.0, + "step": 5364 + }, + { + "epoch": 0.9156852705239802, + "grad_norm": 0.4628408185250484, + "learning_rate": 3.3794162826420897e-06, + "loss": 0.5299, + "num_tokens": 505297419.0, + "step": 5365 + }, + { + "epoch": 0.9158559481140126, + "grad_norm": 0.4615744884596534, + "learning_rate": 3.372589179040792e-06, + "loss": 0.5992, + "num_tokens": 505391757.0, + "step": 5366 + }, + { + "epoch": 0.916026625704045, + "grad_norm": 0.44355401910644043, + "learning_rate": 3.3657620754394947e-06, + "loss": 0.5813, + "num_tokens": 505506295.0, + "step": 5367 + }, + { + "epoch": 0.9161973032940774, + "grad_norm": 0.508227686957493, + "learning_rate": 3.358934971838198e-06, + "loss": 0.5755, + "num_tokens": 505587306.0, + "step": 5368 + }, + { + "epoch": 0.9163679808841099, + "grad_norm": 0.44736300493398373, + "learning_rate": 3.3521078682369005e-06, + "loss": 0.4977, + "num_tokens": 505678373.0, + "step": 5369 + }, + { + "epoch": 0.9165386584741424, + "grad_norm": 0.4694351177040667, + "learning_rate": 3.3452807646356034e-06, + "loss": 0.5826, + "num_tokens": 505774890.0, + "step": 5370 + }, + { + "epoch": 0.9167093360641748, + "grad_norm": 0.6663876269586804, + "learning_rate": 3.3384536610343068e-06, + "loss": 0.4857, + "num_tokens": 505841467.0, + "step": 5371 + }, + { + "epoch": 0.9168800136542072, + "grad_norm": 0.4545208309977447, + "learning_rate": 3.3316265574330093e-06, + "loss": 0.5744, + "num_tokens": 505935793.0, + "step": 5372 + }, + { + "epoch": 0.9170506912442397, + "grad_norm": 0.43577996536259334, + "learning_rate": 3.3247994538317126e-06, + "loss": 0.526, + "num_tokens": 506044197.0, + "step": 5373 + }, + { + "epoch": 0.9172213688342721, + "grad_norm": 0.4517687197397171, + "learning_rate": 3.317972350230415e-06, + "loss": 0.5096, + "num_tokens": 506126640.0, + "step": 5374 + }, + { + "epoch": 0.9173920464243045, + "grad_norm": 0.4434609696709867, + "learning_rate": 3.3111452466291176e-06, + "loss": 0.5472, + "num_tokens": 506230854.0, + "step": 5375 + }, + { + "epoch": 0.9175627240143369, + "grad_norm": 0.4095116442825416, + "learning_rate": 3.304318143027821e-06, + "loss": 0.4664, + "num_tokens": 506329798.0, + "step": 5376 + }, + { + "epoch": 0.9177334016043693, + "grad_norm": 0.44844797047807383, + "learning_rate": 3.2974910394265234e-06, + "loss": 0.5337, + "num_tokens": 506422757.0, + "step": 5377 + }, + { + "epoch": 0.9179040791944018, + "grad_norm": 0.4606612162017382, + "learning_rate": 3.2906639358252263e-06, + "loss": 0.5425, + "num_tokens": 506513278.0, + "step": 5378 + }, + { + "epoch": 0.9180747567844342, + "grad_norm": 0.4663436776679643, + "learning_rate": 3.2838368322239293e-06, + "loss": 0.6006, + "num_tokens": 506611363.0, + "step": 5379 + }, + { + "epoch": 0.9182454343744666, + "grad_norm": 0.47923633415278416, + "learning_rate": 3.277009728622632e-06, + "loss": 0.5879, + "num_tokens": 506696974.0, + "step": 5380 + }, + { + "epoch": 0.918416111964499, + "grad_norm": 0.4468500000436874, + "learning_rate": 3.2701826250213347e-06, + "loss": 0.4459, + "num_tokens": 506777407.0, + "step": 5381 + }, + { + "epoch": 0.9185867895545314, + "grad_norm": 0.4308993336112712, + "learning_rate": 3.263355521420038e-06, + "loss": 0.5332, + "num_tokens": 506885758.0, + "step": 5382 + }, + { + "epoch": 0.918757467144564, + "grad_norm": 0.5172131972987359, + "learning_rate": 3.2565284178187405e-06, + "loss": 0.5173, + "num_tokens": 506973308.0, + "step": 5383 + }, + { + "epoch": 0.9189281447345964, + "grad_norm": 0.45274002543110164, + "learning_rate": 3.2497013142174434e-06, + "loss": 0.6092, + "num_tokens": 507082105.0, + "step": 5384 + }, + { + "epoch": 0.9190988223246288, + "grad_norm": 0.4607219849849767, + "learning_rate": 3.2428742106161463e-06, + "loss": 0.5345, + "num_tokens": 507173133.0, + "step": 5385 + }, + { + "epoch": 0.9192694999146612, + "grad_norm": 0.47702540063480336, + "learning_rate": 3.2360471070148492e-06, + "loss": 0.6318, + "num_tokens": 507274714.0, + "step": 5386 + }, + { + "epoch": 0.9194401775046936, + "grad_norm": 0.46291282299786557, + "learning_rate": 3.2292200034135517e-06, + "loss": 0.532, + "num_tokens": 507357552.0, + "step": 5387 + }, + { + "epoch": 0.9196108550947261, + "grad_norm": 0.510328737852426, + "learning_rate": 3.222392899812255e-06, + "loss": 0.559, + "num_tokens": 507432380.0, + "step": 5388 + }, + { + "epoch": 0.9197815326847585, + "grad_norm": 0.43221827463855134, + "learning_rate": 3.2155657962109576e-06, + "loss": 0.4916, + "num_tokens": 507524964.0, + "step": 5389 + }, + { + "epoch": 0.9199522102747909, + "grad_norm": 0.49071839606685885, + "learning_rate": 3.208738692609661e-06, + "loss": 0.5689, + "num_tokens": 507609896.0, + "step": 5390 + }, + { + "epoch": 0.9201228878648233, + "grad_norm": 0.4529707337077909, + "learning_rate": 3.2019115890083634e-06, + "loss": 0.5667, + "num_tokens": 507704008.0, + "step": 5391 + }, + { + "epoch": 0.9202935654548557, + "grad_norm": 0.4705618945398495, + "learning_rate": 3.1950844854070663e-06, + "loss": 0.5283, + "num_tokens": 507786472.0, + "step": 5392 + }, + { + "epoch": 0.9204642430448882, + "grad_norm": 0.4368348951514721, + "learning_rate": 3.1882573818057692e-06, + "loss": 0.6467, + "num_tokens": 507906666.0, + "step": 5393 + }, + { + "epoch": 0.9206349206349206, + "grad_norm": 0.4856911365651797, + "learning_rate": 3.181430278204472e-06, + "loss": 0.5725, + "num_tokens": 507995357.0, + "step": 5394 + }, + { + "epoch": 0.9208055982249531, + "grad_norm": 0.480623501903059, + "learning_rate": 3.1746031746031746e-06, + "loss": 0.5847, + "num_tokens": 508081024.0, + "step": 5395 + }, + { + "epoch": 0.9209762758149855, + "grad_norm": 0.4330889110878446, + "learning_rate": 3.167776071001878e-06, + "loss": 0.5984, + "num_tokens": 508204548.0, + "step": 5396 + }, + { + "epoch": 0.921146953405018, + "grad_norm": 0.5496395877686541, + "learning_rate": 3.1609489674005805e-06, + "loss": 0.6269, + "num_tokens": 508282575.0, + "step": 5397 + }, + { + "epoch": 0.9213176309950504, + "grad_norm": 0.5070305830374331, + "learning_rate": 3.1541218637992834e-06, + "loss": 0.572, + "num_tokens": 508368310.0, + "step": 5398 + }, + { + "epoch": 0.9214883085850828, + "grad_norm": 0.4802923825241698, + "learning_rate": 3.1472947601979863e-06, + "loss": 0.6821, + "num_tokens": 508477610.0, + "step": 5399 + }, + { + "epoch": 0.9216589861751152, + "grad_norm": 0.49262242031505504, + "learning_rate": 3.1404676565966892e-06, + "loss": 0.599, + "num_tokens": 508560304.0, + "step": 5400 + }, + { + "epoch": 0.9218296637651476, + "grad_norm": 0.441284349955824, + "learning_rate": 3.1336405529953917e-06, + "loss": 0.5156, + "num_tokens": 508651135.0, + "step": 5401 + }, + { + "epoch": 0.92200034135518, + "grad_norm": 0.46539202495827175, + "learning_rate": 3.126813449394095e-06, + "loss": 0.4922, + "num_tokens": 508731694.0, + "step": 5402 + }, + { + "epoch": 0.9221710189452125, + "grad_norm": 0.47408563692211886, + "learning_rate": 3.1199863457927975e-06, + "loss": 0.5506, + "num_tokens": 508816599.0, + "step": 5403 + }, + { + "epoch": 0.9223416965352449, + "grad_norm": 0.4793262627800329, + "learning_rate": 3.113159242191501e-06, + "loss": 0.5778, + "num_tokens": 508902634.0, + "step": 5404 + }, + { + "epoch": 0.9225123741252773, + "grad_norm": 0.47045120456264994, + "learning_rate": 3.1063321385902034e-06, + "loss": 0.5614, + "num_tokens": 508993785.0, + "step": 5405 + }, + { + "epoch": 0.9226830517153097, + "grad_norm": 0.4143208201064686, + "learning_rate": 3.0995050349889063e-06, + "loss": 0.5811, + "num_tokens": 509126174.0, + "step": 5406 + }, + { + "epoch": 0.9228537293053423, + "grad_norm": 0.45648810354693653, + "learning_rate": 3.092677931387609e-06, + "loss": 0.5624, + "num_tokens": 509222562.0, + "step": 5407 + }, + { + "epoch": 0.9230244068953747, + "grad_norm": 0.49760873402381495, + "learning_rate": 3.085850827786312e-06, + "loss": 0.5931, + "num_tokens": 509304314.0, + "step": 5408 + }, + { + "epoch": 0.9231950844854071, + "grad_norm": 0.44195396307117074, + "learning_rate": 3.0790237241850146e-06, + "loss": 0.626, + "num_tokens": 509416256.0, + "step": 5409 + }, + { + "epoch": 0.9233657620754395, + "grad_norm": 0.5010736235552447, + "learning_rate": 3.072196620583718e-06, + "loss": 0.6003, + "num_tokens": 509497625.0, + "step": 5410 + }, + { + "epoch": 0.9235364396654719, + "grad_norm": 0.43433897628966134, + "learning_rate": 3.0653695169824204e-06, + "loss": 0.5707, + "num_tokens": 509608525.0, + "step": 5411 + }, + { + "epoch": 0.9237071172555044, + "grad_norm": 0.4793365227798253, + "learning_rate": 3.058542413381123e-06, + "loss": 0.5655, + "num_tokens": 509694932.0, + "step": 5412 + }, + { + "epoch": 0.9238777948455368, + "grad_norm": 0.5143717558000056, + "learning_rate": 3.0517153097798263e-06, + "loss": 0.5763, + "num_tokens": 509769200.0, + "step": 5413 + }, + { + "epoch": 0.9240484724355692, + "grad_norm": 0.5395618516712574, + "learning_rate": 3.044888206178529e-06, + "loss": 0.7504, + "num_tokens": 509858776.0, + "step": 5414 + }, + { + "epoch": 0.9242191500256016, + "grad_norm": 0.46635891285169057, + "learning_rate": 3.0380611025772317e-06, + "loss": 0.5125, + "num_tokens": 509943949.0, + "step": 5415 + }, + { + "epoch": 0.924389827615634, + "grad_norm": 0.4494488633703432, + "learning_rate": 3.031233998975935e-06, + "loss": 0.5817, + "num_tokens": 510045590.0, + "step": 5416 + }, + { + "epoch": 0.9245605052056665, + "grad_norm": 0.4550341275664514, + "learning_rate": 3.0244068953746375e-06, + "loss": 0.5502, + "num_tokens": 510143411.0, + "step": 5417 + }, + { + "epoch": 0.9247311827956989, + "grad_norm": 0.4350397067524437, + "learning_rate": 3.01757979177334e-06, + "loss": 0.5585, + "num_tokens": 510252678.0, + "step": 5418 + }, + { + "epoch": 0.9249018603857313, + "grad_norm": 0.42392258307952024, + "learning_rate": 3.0107526881720433e-06, + "loss": 0.5555, + "num_tokens": 510357202.0, + "step": 5419 + }, + { + "epoch": 0.9250725379757638, + "grad_norm": 0.4487421745033821, + "learning_rate": 3.003925584570746e-06, + "loss": 0.5639, + "num_tokens": 510456841.0, + "step": 5420 + }, + { + "epoch": 0.9252432155657963, + "grad_norm": 0.47182627887793044, + "learning_rate": 2.997098480969449e-06, + "loss": 0.6146, + "num_tokens": 510564207.0, + "step": 5421 + }, + { + "epoch": 0.9254138931558287, + "grad_norm": 0.46414321262438124, + "learning_rate": 2.9902713773681517e-06, + "loss": 0.5029, + "num_tokens": 510648743.0, + "step": 5422 + }, + { + "epoch": 0.9255845707458611, + "grad_norm": 0.516516242165001, + "learning_rate": 2.9834442737668546e-06, + "loss": 0.5336, + "num_tokens": 510722560.0, + "step": 5423 + }, + { + "epoch": 0.9257552483358935, + "grad_norm": 0.48174668508465424, + "learning_rate": 2.976617170165558e-06, + "loss": 0.5804, + "num_tokens": 510809998.0, + "step": 5424 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.41912109464123964, + "learning_rate": 2.9697900665642604e-06, + "loss": 0.4858, + "num_tokens": 510912204.0, + "step": 5425 + }, + { + "epoch": 0.9260966035159583, + "grad_norm": 0.503262449455988, + "learning_rate": 2.962962962962963e-06, + "loss": 0.5833, + "num_tokens": 510999449.0, + "step": 5426 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.4779268206076635, + "learning_rate": 2.9561358593616662e-06, + "loss": 0.5649, + "num_tokens": 511086154.0, + "step": 5427 + }, + { + "epoch": 0.9264379586960232, + "grad_norm": 0.49099405726595846, + "learning_rate": 2.9493087557603687e-06, + "loss": 0.6107, + "num_tokens": 511174827.0, + "step": 5428 + }, + { + "epoch": 0.9266086362860556, + "grad_norm": 0.4777903644923014, + "learning_rate": 2.9424816521590717e-06, + "loss": 0.5219, + "num_tokens": 511252690.0, + "step": 5429 + }, + { + "epoch": 0.926779313876088, + "grad_norm": 0.44628146890233444, + "learning_rate": 2.9356545485577746e-06, + "loss": 0.533, + "num_tokens": 511348246.0, + "step": 5430 + }, + { + "epoch": 0.9269499914661204, + "grad_norm": 0.49653448104671577, + "learning_rate": 2.9288274449564775e-06, + "loss": 0.5427, + "num_tokens": 511427311.0, + "step": 5431 + }, + { + "epoch": 0.927120669056153, + "grad_norm": 0.48838384317970834, + "learning_rate": 2.92200034135518e-06, + "loss": 0.5492, + "num_tokens": 511515344.0, + "step": 5432 + }, + { + "epoch": 0.9272913466461854, + "grad_norm": 0.5194174310295145, + "learning_rate": 2.9151732377538833e-06, + "loss": 0.4875, + "num_tokens": 511580785.0, + "step": 5433 + }, + { + "epoch": 0.9274620242362178, + "grad_norm": 0.5165505175198224, + "learning_rate": 2.908346134152586e-06, + "loss": 0.6093, + "num_tokens": 511667288.0, + "step": 5434 + }, + { + "epoch": 0.9276327018262502, + "grad_norm": 0.4270965063043252, + "learning_rate": 2.9015190305512887e-06, + "loss": 0.6399, + "num_tokens": 511796469.0, + "step": 5435 + }, + { + "epoch": 0.9278033794162827, + "grad_norm": 0.47224477620629407, + "learning_rate": 2.8946919269499916e-06, + "loss": 0.5264, + "num_tokens": 511881684.0, + "step": 5436 + }, + { + "epoch": 0.9279740570063151, + "grad_norm": 0.4148512081232373, + "learning_rate": 2.8878648233486946e-06, + "loss": 0.6146, + "num_tokens": 512002294.0, + "step": 5437 + }, + { + "epoch": 0.9281447345963475, + "grad_norm": 0.5120083507604413, + "learning_rate": 2.8810377197473975e-06, + "loss": 0.6223, + "num_tokens": 512084807.0, + "step": 5438 + }, + { + "epoch": 0.9283154121863799, + "grad_norm": 0.49117912841888095, + "learning_rate": 2.8742106161461004e-06, + "loss": 0.5265, + "num_tokens": 512167445.0, + "step": 5439 + }, + { + "epoch": 0.9284860897764123, + "grad_norm": 0.46433127088843723, + "learning_rate": 2.867383512544803e-06, + "loss": 0.5753, + "num_tokens": 512260351.0, + "step": 5440 + }, + { + "epoch": 0.9286567673664448, + "grad_norm": 0.4961786020063404, + "learning_rate": 2.8605564089435062e-06, + "loss": 0.5586, + "num_tokens": 512343980.0, + "step": 5441 + }, + { + "epoch": 0.9288274449564772, + "grad_norm": 0.46220960862882904, + "learning_rate": 2.8537293053422087e-06, + "loss": 0.5009, + "num_tokens": 512434076.0, + "step": 5442 + }, + { + "epoch": 0.9289981225465096, + "grad_norm": 0.4301772561569583, + "learning_rate": 2.8469022017409116e-06, + "loss": 0.5908, + "num_tokens": 512553711.0, + "step": 5443 + }, + { + "epoch": 0.929168800136542, + "grad_norm": 0.5881770692803521, + "learning_rate": 2.8400750981396145e-06, + "loss": 0.4968, + "num_tokens": 512606044.0, + "step": 5444 + }, + { + "epoch": 0.9293394777265745, + "grad_norm": 0.48879189998064143, + "learning_rate": 2.8332479945383175e-06, + "loss": 0.5806, + "num_tokens": 512692371.0, + "step": 5445 + }, + { + "epoch": 0.929510155316607, + "grad_norm": 0.47435363308539635, + "learning_rate": 2.82642089093702e-06, + "loss": 0.5603, + "num_tokens": 512780901.0, + "step": 5446 + }, + { + "epoch": 0.9296808329066394, + "grad_norm": 0.5090768102613537, + "learning_rate": 2.8195937873357233e-06, + "loss": 0.573, + "num_tokens": 512862091.0, + "step": 5447 + }, + { + "epoch": 0.9298515104966718, + "grad_norm": 0.42757385207961474, + "learning_rate": 2.8127666837344258e-06, + "loss": 0.4727, + "num_tokens": 512964575.0, + "step": 5448 + }, + { + "epoch": 0.9300221880867042, + "grad_norm": 0.4612374710946291, + "learning_rate": 2.8059395801331287e-06, + "loss": 0.4902, + "num_tokens": 513046447.0, + "step": 5449 + }, + { + "epoch": 0.9301928656767366, + "grad_norm": 0.4941565602074959, + "learning_rate": 2.7991124765318316e-06, + "loss": 0.5738, + "num_tokens": 513129537.0, + "step": 5450 + }, + { + "epoch": 0.9303635432667691, + "grad_norm": 0.43215244026115285, + "learning_rate": 2.7922853729305345e-06, + "loss": 0.5726, + "num_tokens": 513244599.0, + "step": 5451 + }, + { + "epoch": 0.9305342208568015, + "grad_norm": 0.4877135402998334, + "learning_rate": 2.7854582693292374e-06, + "loss": 0.4889, + "num_tokens": 513319661.0, + "step": 5452 + }, + { + "epoch": 0.9307048984468339, + "grad_norm": 0.4527870201419127, + "learning_rate": 2.7786311657279404e-06, + "loss": 0.5367, + "num_tokens": 513423184.0, + "step": 5453 + }, + { + "epoch": 0.9308755760368663, + "grad_norm": 0.48083522486946056, + "learning_rate": 2.771804062126643e-06, + "loss": 0.6676, + "num_tokens": 513524761.0, + "step": 5454 + }, + { + "epoch": 0.9310462536268987, + "grad_norm": 0.4879883834399059, + "learning_rate": 2.764976958525346e-06, + "loss": 0.5653, + "num_tokens": 513622790.0, + "step": 5455 + }, + { + "epoch": 0.9312169312169312, + "grad_norm": 0.47873073710321423, + "learning_rate": 2.7581498549240487e-06, + "loss": 0.5557, + "num_tokens": 513708444.0, + "step": 5456 + }, + { + "epoch": 0.9313876088069637, + "grad_norm": 0.484117496003213, + "learning_rate": 2.7513227513227516e-06, + "loss": 0.6, + "num_tokens": 513802635.0, + "step": 5457 + }, + { + "epoch": 0.9315582863969961, + "grad_norm": 0.4524305303345735, + "learning_rate": 2.7444956477214545e-06, + "loss": 0.5549, + "num_tokens": 513903567.0, + "step": 5458 + }, + { + "epoch": 0.9317289639870285, + "grad_norm": 0.46421008407117, + "learning_rate": 2.7376685441201574e-06, + "loss": 0.4863, + "num_tokens": 513985335.0, + "step": 5459 + }, + { + "epoch": 0.931899641577061, + "grad_norm": 0.4960144391496249, + "learning_rate": 2.73084144051886e-06, + "loss": 0.5611, + "num_tokens": 514081536.0, + "step": 5460 + }, + { + "epoch": 0.9320703191670934, + "grad_norm": 0.45350122309643115, + "learning_rate": 2.7240143369175633e-06, + "loss": 0.5808, + "num_tokens": 514179894.0, + "step": 5461 + }, + { + "epoch": 0.9322409967571258, + "grad_norm": 0.4659693488161964, + "learning_rate": 2.7171872333162658e-06, + "loss": 0.5001, + "num_tokens": 514262005.0, + "step": 5462 + }, + { + "epoch": 0.9324116743471582, + "grad_norm": 0.4857942488143114, + "learning_rate": 2.7103601297149683e-06, + "loss": 0.5724, + "num_tokens": 514344273.0, + "step": 5463 + }, + { + "epoch": 0.9325823519371906, + "grad_norm": 0.44775546769024827, + "learning_rate": 2.7035330261136716e-06, + "loss": 0.5959, + "num_tokens": 514446663.0, + "step": 5464 + }, + { + "epoch": 0.932753029527223, + "grad_norm": 0.4422306455812576, + "learning_rate": 2.696705922512374e-06, + "loss": 0.5305, + "num_tokens": 514543699.0, + "step": 5465 + }, + { + "epoch": 0.9329237071172555, + "grad_norm": 0.49336792238559185, + "learning_rate": 2.689878818911077e-06, + "loss": 0.563, + "num_tokens": 514631790.0, + "step": 5466 + }, + { + "epoch": 0.9330943847072879, + "grad_norm": 0.4453933129973194, + "learning_rate": 2.6830517153097803e-06, + "loss": 0.6291, + "num_tokens": 514742120.0, + "step": 5467 + }, + { + "epoch": 0.9332650622973203, + "grad_norm": 0.4694408857525028, + "learning_rate": 2.676224611708483e-06, + "loss": 0.5089, + "num_tokens": 514827519.0, + "step": 5468 + }, + { + "epoch": 0.9334357398873528, + "grad_norm": 0.4207233672556042, + "learning_rate": 2.669397508107186e-06, + "loss": 0.5307, + "num_tokens": 514945515.0, + "step": 5469 + }, + { + "epoch": 0.9336064174773853, + "grad_norm": 0.4544195889910741, + "learning_rate": 2.6625704045058887e-06, + "loss": 0.4612, + "num_tokens": 515025309.0, + "step": 5470 + }, + { + "epoch": 0.9337770950674177, + "grad_norm": 0.4816280735807452, + "learning_rate": 2.655743300904591e-06, + "loss": 0.5363, + "num_tokens": 515109291.0, + "step": 5471 + }, + { + "epoch": 0.9339477726574501, + "grad_norm": 0.4536244500638378, + "learning_rate": 2.6489161973032945e-06, + "loss": 0.5253, + "num_tokens": 515200678.0, + "step": 5472 + }, + { + "epoch": 0.9341184502474825, + "grad_norm": 0.45245453556976917, + "learning_rate": 2.642089093701997e-06, + "loss": 0.5615, + "num_tokens": 515307961.0, + "step": 5473 + }, + { + "epoch": 0.9342891278375149, + "grad_norm": 0.42765704307036945, + "learning_rate": 2.6352619901007e-06, + "loss": 0.5418, + "num_tokens": 515409463.0, + "step": 5474 + }, + { + "epoch": 0.9344598054275474, + "grad_norm": 0.450625168074865, + "learning_rate": 2.6284348864994032e-06, + "loss": 0.5456, + "num_tokens": 515500421.0, + "step": 5475 + }, + { + "epoch": 0.9346304830175798, + "grad_norm": 0.4430650353118932, + "learning_rate": 2.6216077828981057e-06, + "loss": 0.5876, + "num_tokens": 515607173.0, + "step": 5476 + }, + { + "epoch": 0.9348011606076122, + "grad_norm": 0.420508080427054, + "learning_rate": 2.6147806792968082e-06, + "loss": 0.5916, + "num_tokens": 515727203.0, + "step": 5477 + }, + { + "epoch": 0.9349718381976446, + "grad_norm": 0.49688577057702865, + "learning_rate": 2.6079535756955116e-06, + "loss": 0.6135, + "num_tokens": 515812621.0, + "step": 5478 + }, + { + "epoch": 0.935142515787677, + "grad_norm": 0.8139477643923653, + "learning_rate": 2.601126472094214e-06, + "loss": 0.6045, + "num_tokens": 515897785.0, + "step": 5479 + }, + { + "epoch": 0.9353131933777095, + "grad_norm": 0.47014765101069933, + "learning_rate": 2.594299368492917e-06, + "loss": 0.6722, + "num_tokens": 516009037.0, + "step": 5480 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 0.4165830135842234, + "learning_rate": 2.58747226489162e-06, + "loss": 0.565, + "num_tokens": 516127182.0, + "step": 5481 + }, + { + "epoch": 0.9356545485577744, + "grad_norm": 0.428945965934401, + "learning_rate": 2.580645161290323e-06, + "loss": 0.4866, + "num_tokens": 516222854.0, + "step": 5482 + }, + { + "epoch": 0.9358252261478068, + "grad_norm": 0.4957030934664764, + "learning_rate": 2.5738180576890253e-06, + "loss": 0.6484, + "num_tokens": 516322068.0, + "step": 5483 + }, + { + "epoch": 0.9359959037378393, + "grad_norm": 0.501171466618241, + "learning_rate": 2.5669909540877286e-06, + "loss": 0.5186, + "num_tokens": 516398968.0, + "step": 5484 + }, + { + "epoch": 0.9361665813278717, + "grad_norm": 0.49839906533849676, + "learning_rate": 2.560163850486431e-06, + "loss": 0.5454, + "num_tokens": 516474998.0, + "step": 5485 + }, + { + "epoch": 0.9363372589179041, + "grad_norm": 0.47009340287576906, + "learning_rate": 2.5533367468851345e-06, + "loss": 0.607, + "num_tokens": 516567257.0, + "step": 5486 + }, + { + "epoch": 0.9365079365079365, + "grad_norm": 0.4625353700066961, + "learning_rate": 2.546509643283837e-06, + "loss": 0.6035, + "num_tokens": 516665359.0, + "step": 5487 + }, + { + "epoch": 0.9366786140979689, + "grad_norm": 0.5255053347477155, + "learning_rate": 2.53968253968254e-06, + "loss": 0.5328, + "num_tokens": 516739692.0, + "step": 5488 + }, + { + "epoch": 0.9368492916880014, + "grad_norm": 0.5305372377777674, + "learning_rate": 2.532855436081243e-06, + "loss": 0.5917, + "num_tokens": 516815373.0, + "step": 5489 + }, + { + "epoch": 0.9370199692780338, + "grad_norm": 0.48897474173761374, + "learning_rate": 2.5260283324799457e-06, + "loss": 0.5402, + "num_tokens": 516900342.0, + "step": 5490 + }, + { + "epoch": 0.9371906468680662, + "grad_norm": 0.5944463411211689, + "learning_rate": 2.519201228878648e-06, + "loss": 0.6885, + "num_tokens": 516988696.0, + "step": 5491 + }, + { + "epoch": 0.9373613244580986, + "grad_norm": 0.4639316545111113, + "learning_rate": 2.5123741252773515e-06, + "loss": 0.5946, + "num_tokens": 517090051.0, + "step": 5492 + }, + { + "epoch": 0.937532002048131, + "grad_norm": 0.4373133140786267, + "learning_rate": 2.505547021676054e-06, + "loss": 0.6699, + "num_tokens": 517227887.0, + "step": 5493 + }, + { + "epoch": 0.9377026796381636, + "grad_norm": 0.41634757077402823, + "learning_rate": 2.498719918074757e-06, + "loss": 0.5652, + "num_tokens": 517347190.0, + "step": 5494 + }, + { + "epoch": 0.937873357228196, + "grad_norm": 0.42750661427894593, + "learning_rate": 2.49189281447346e-06, + "loss": 0.5785, + "num_tokens": 517467357.0, + "step": 5495 + }, + { + "epoch": 0.9380440348182284, + "grad_norm": 0.5010172548743511, + "learning_rate": 2.4850657108721628e-06, + "loss": 0.5726, + "num_tokens": 517548274.0, + "step": 5496 + }, + { + "epoch": 0.9382147124082608, + "grad_norm": 0.46495038823435, + "learning_rate": 2.4782386072708657e-06, + "loss": 0.4931, + "num_tokens": 517633394.0, + "step": 5497 + }, + { + "epoch": 0.9383853899982932, + "grad_norm": 0.4510771739803765, + "learning_rate": 2.4714115036695686e-06, + "loss": 0.4872, + "num_tokens": 517725156.0, + "step": 5498 + }, + { + "epoch": 0.9385560675883257, + "grad_norm": 0.4728633715076294, + "learning_rate": 2.464584400068271e-06, + "loss": 0.6234, + "num_tokens": 517825059.0, + "step": 5499 + }, + { + "epoch": 0.9387267451783581, + "grad_norm": 0.4218492295186224, + "learning_rate": 2.457757296466974e-06, + "loss": 0.5136, + "num_tokens": 517927619.0, + "step": 5500 + }, + { + "epoch": 0.9388974227683905, + "grad_norm": 0.46377855929179773, + "learning_rate": 2.450930192865677e-06, + "loss": 0.6156, + "num_tokens": 518037015.0, + "step": 5501 + }, + { + "epoch": 0.9390681003584229, + "grad_norm": 0.48326162895368574, + "learning_rate": 2.44410308926438e-06, + "loss": 0.5576, + "num_tokens": 518129120.0, + "step": 5502 + }, + { + "epoch": 0.9392387779484553, + "grad_norm": 0.43400996049318297, + "learning_rate": 2.4372759856630828e-06, + "loss": 0.575, + "num_tokens": 518229119.0, + "step": 5503 + }, + { + "epoch": 0.9394094555384878, + "grad_norm": 0.46871091461209224, + "learning_rate": 2.4304488820617857e-06, + "loss": 0.6197, + "num_tokens": 518345966.0, + "step": 5504 + }, + { + "epoch": 0.9395801331285202, + "grad_norm": 0.501733458584167, + "learning_rate": 2.4236217784604886e-06, + "loss": 0.5791, + "num_tokens": 518433646.0, + "step": 5505 + }, + { + "epoch": 0.9397508107185527, + "grad_norm": 0.5568345940324755, + "learning_rate": 2.416794674859191e-06, + "loss": 0.55, + "num_tokens": 518497066.0, + "step": 5506 + }, + { + "epoch": 0.9399214883085851, + "grad_norm": 0.405751066492576, + "learning_rate": 2.409967571257894e-06, + "loss": 0.5551, + "num_tokens": 518627917.0, + "step": 5507 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.47192871976486583, + "learning_rate": 2.403140467656597e-06, + "loss": 0.5578, + "num_tokens": 518715239.0, + "step": 5508 + }, + { + "epoch": 0.94026284348865, + "grad_norm": 0.4569995647564533, + "learning_rate": 2.3963133640553e-06, + "loss": 0.6576, + "num_tokens": 518820145.0, + "step": 5509 + }, + { + "epoch": 0.9404335210786824, + "grad_norm": 0.4621951057723491, + "learning_rate": 2.3894862604540028e-06, + "loss": 0.5909, + "num_tokens": 518920725.0, + "step": 5510 + }, + { + "epoch": 0.9406041986687148, + "grad_norm": 0.5028932636584134, + "learning_rate": 2.3826591568527057e-06, + "loss": 0.5397, + "num_tokens": 518994760.0, + "step": 5511 + }, + { + "epoch": 0.9407748762587472, + "grad_norm": 0.44298377777394027, + "learning_rate": 2.3758320532514086e-06, + "loss": 0.5407, + "num_tokens": 519094351.0, + "step": 5512 + }, + { + "epoch": 0.9409455538487796, + "grad_norm": 0.48128618823381547, + "learning_rate": 2.369004949650111e-06, + "loss": 0.6084, + "num_tokens": 519182564.0, + "step": 5513 + }, + { + "epoch": 0.9411162314388121, + "grad_norm": 0.42169913286935884, + "learning_rate": 2.362177846048814e-06, + "loss": 0.5332, + "num_tokens": 519282494.0, + "step": 5514 + }, + { + "epoch": 0.9412869090288445, + "grad_norm": 0.4284489780730588, + "learning_rate": 2.355350742447517e-06, + "loss": 0.4606, + "num_tokens": 519376380.0, + "step": 5515 + }, + { + "epoch": 0.9414575866188769, + "grad_norm": 0.5117439330195109, + "learning_rate": 2.3485236388462194e-06, + "loss": 0.6465, + "num_tokens": 519463067.0, + "step": 5516 + }, + { + "epoch": 0.9416282642089093, + "grad_norm": 0.4722589415150123, + "learning_rate": 2.3416965352449223e-06, + "loss": 0.5541, + "num_tokens": 519549808.0, + "step": 5517 + }, + { + "epoch": 0.9417989417989417, + "grad_norm": 0.4996821841344559, + "learning_rate": 2.3348694316436257e-06, + "loss": 0.5878, + "num_tokens": 519630463.0, + "step": 5518 + }, + { + "epoch": 0.9419696193889743, + "grad_norm": 0.47087646994119897, + "learning_rate": 2.328042328042328e-06, + "loss": 0.5664, + "num_tokens": 519715254.0, + "step": 5519 + }, + { + "epoch": 0.9421402969790067, + "grad_norm": 0.4431299746142668, + "learning_rate": 2.321215224441031e-06, + "loss": 0.5734, + "num_tokens": 519812761.0, + "step": 5520 + }, + { + "epoch": 0.9423109745690391, + "grad_norm": 0.4520354448215237, + "learning_rate": 2.314388120839734e-06, + "loss": 0.5208, + "num_tokens": 519904175.0, + "step": 5521 + }, + { + "epoch": 0.9424816521590715, + "grad_norm": 0.4332533363041424, + "learning_rate": 2.307561017238437e-06, + "loss": 0.5981, + "num_tokens": 520014990.0, + "step": 5522 + }, + { + "epoch": 0.942652329749104, + "grad_norm": 0.40101880143200513, + "learning_rate": 2.3007339136371394e-06, + "loss": 0.5366, + "num_tokens": 520147213.0, + "step": 5523 + }, + { + "epoch": 0.9428230073391364, + "grad_norm": 0.4742377978890138, + "learning_rate": 2.2939068100358423e-06, + "loss": 0.5774, + "num_tokens": 520235150.0, + "step": 5524 + }, + { + "epoch": 0.9429936849291688, + "grad_norm": 0.41362470867889783, + "learning_rate": 2.2870797064345452e-06, + "loss": 0.511, + "num_tokens": 520340148.0, + "step": 5525 + }, + { + "epoch": 0.9431643625192012, + "grad_norm": 0.4583274958995653, + "learning_rate": 2.280252602833248e-06, + "loss": 0.5545, + "num_tokens": 520442538.0, + "step": 5526 + }, + { + "epoch": 0.9433350401092336, + "grad_norm": 0.43702246580489773, + "learning_rate": 2.273425499231951e-06, + "loss": 0.5123, + "num_tokens": 520537092.0, + "step": 5527 + }, + { + "epoch": 0.943505717699266, + "grad_norm": 0.5082019336358123, + "learning_rate": 2.266598395630654e-06, + "loss": 0.6131, + "num_tokens": 520630112.0, + "step": 5528 + }, + { + "epoch": 0.9436763952892985, + "grad_norm": 0.4446393936744557, + "learning_rate": 2.259771292029357e-06, + "loss": 0.6106, + "num_tokens": 520739107.0, + "step": 5529 + }, + { + "epoch": 0.9438470728793309, + "grad_norm": 0.46418902421865454, + "learning_rate": 2.2529441884280594e-06, + "loss": 0.6185, + "num_tokens": 520841670.0, + "step": 5530 + }, + { + "epoch": 0.9440177504693634, + "grad_norm": 0.49997033013949327, + "learning_rate": 2.2461170848267623e-06, + "loss": 0.6563, + "num_tokens": 520934176.0, + "step": 5531 + }, + { + "epoch": 0.9441884280593958, + "grad_norm": 0.45024575891637125, + "learning_rate": 2.239289981225465e-06, + "loss": 0.5746, + "num_tokens": 521028519.0, + "step": 5532 + }, + { + "epoch": 0.9443591056494283, + "grad_norm": 0.5512217015226967, + "learning_rate": 2.232462877624168e-06, + "loss": 0.5463, + "num_tokens": 521105749.0, + "step": 5533 + }, + { + "epoch": 0.9445297832394607, + "grad_norm": 0.42512112299975735, + "learning_rate": 2.225635774022871e-06, + "loss": 0.5521, + "num_tokens": 521218240.0, + "step": 5534 + }, + { + "epoch": 0.9447004608294931, + "grad_norm": 0.49709913094698155, + "learning_rate": 2.218808670421574e-06, + "loss": 0.5708, + "num_tokens": 521301379.0, + "step": 5535 + }, + { + "epoch": 0.9448711384195255, + "grad_norm": 0.46246395702015397, + "learning_rate": 2.211981566820277e-06, + "loss": 0.5673, + "num_tokens": 521397450.0, + "step": 5536 + }, + { + "epoch": 0.945041816009558, + "grad_norm": 0.46501344254222005, + "learning_rate": 2.2051544632189794e-06, + "loss": 0.6401, + "num_tokens": 521496432.0, + "step": 5537 + }, + { + "epoch": 0.9452124935995904, + "grad_norm": 0.495430024180871, + "learning_rate": 2.1983273596176823e-06, + "loss": 0.6013, + "num_tokens": 521583965.0, + "step": 5538 + }, + { + "epoch": 0.9453831711896228, + "grad_norm": 0.5039497736519625, + "learning_rate": 2.191500256016385e-06, + "loss": 0.526, + "num_tokens": 521657152.0, + "step": 5539 + }, + { + "epoch": 0.9455538487796552, + "grad_norm": 0.48819392177109505, + "learning_rate": 2.184673152415088e-06, + "loss": 0.5072, + "num_tokens": 521740336.0, + "step": 5540 + }, + { + "epoch": 0.9457245263696876, + "grad_norm": 0.47725965824188826, + "learning_rate": 2.177846048813791e-06, + "loss": 0.5258, + "num_tokens": 521825166.0, + "step": 5541 + }, + { + "epoch": 0.94589520395972, + "grad_norm": 0.440913540017944, + "learning_rate": 2.171018945212494e-06, + "loss": 0.58, + "num_tokens": 521930629.0, + "step": 5542 + }, + { + "epoch": 0.9460658815497525, + "grad_norm": 0.43112496451010596, + "learning_rate": 2.1641918416111964e-06, + "loss": 0.5928, + "num_tokens": 522041496.0, + "step": 5543 + }, + { + "epoch": 0.946236559139785, + "grad_norm": 0.47562846547546406, + "learning_rate": 2.1573647380098994e-06, + "loss": 0.5329, + "num_tokens": 522126125.0, + "step": 5544 + }, + { + "epoch": 0.9464072367298174, + "grad_norm": 0.512851084280365, + "learning_rate": 2.1505376344086023e-06, + "loss": 0.5827, + "num_tokens": 522203149.0, + "step": 5545 + }, + { + "epoch": 0.9465779143198498, + "grad_norm": 0.46177327020534625, + "learning_rate": 2.143710530807305e-06, + "loss": 0.567, + "num_tokens": 522291092.0, + "step": 5546 + }, + { + "epoch": 0.9467485919098823, + "grad_norm": 0.4793975122689328, + "learning_rate": 2.136883427206008e-06, + "loss": 0.4875, + "num_tokens": 522365837.0, + "step": 5547 + }, + { + "epoch": 0.9469192694999147, + "grad_norm": 0.44614643563467116, + "learning_rate": 2.130056323604711e-06, + "loss": 0.5119, + "num_tokens": 522452306.0, + "step": 5548 + }, + { + "epoch": 0.9470899470899471, + "grad_norm": 0.43252821019295157, + "learning_rate": 2.123229220003414e-06, + "loss": 0.6229, + "num_tokens": 522572737.0, + "step": 5549 + }, + { + "epoch": 0.9472606246799795, + "grad_norm": 0.5296908526619216, + "learning_rate": 2.1164021164021164e-06, + "loss": 0.5394, + "num_tokens": 522642537.0, + "step": 5550 + }, + { + "epoch": 0.9474313022700119, + "grad_norm": 0.4653246606441094, + "learning_rate": 2.1095750128008193e-06, + "loss": 0.6143, + "num_tokens": 522759404.0, + "step": 5551 + }, + { + "epoch": 0.9476019798600444, + "grad_norm": 0.49726987176830967, + "learning_rate": 2.1027479091995223e-06, + "loss": 0.6634, + "num_tokens": 522852425.0, + "step": 5552 + }, + { + "epoch": 0.9477726574500768, + "grad_norm": 0.5009676008761619, + "learning_rate": 2.095920805598225e-06, + "loss": 0.5427, + "num_tokens": 522937335.0, + "step": 5553 + }, + { + "epoch": 0.9479433350401092, + "grad_norm": 0.4693971345776611, + "learning_rate": 2.089093701996928e-06, + "loss": 0.5637, + "num_tokens": 523026473.0, + "step": 5554 + }, + { + "epoch": 0.9481140126301416, + "grad_norm": 0.49389874934244277, + "learning_rate": 2.082266598395631e-06, + "loss": 0.5033, + "num_tokens": 523100789.0, + "step": 5555 + }, + { + "epoch": 0.9482846902201741, + "grad_norm": 0.5060054378085181, + "learning_rate": 2.075439494794334e-06, + "loss": 0.5729, + "num_tokens": 523184016.0, + "step": 5556 + }, + { + "epoch": 0.9484553678102066, + "grad_norm": 0.4791512480208918, + "learning_rate": 2.0686123911930364e-06, + "loss": 0.5546, + "num_tokens": 523275297.0, + "step": 5557 + }, + { + "epoch": 0.948626045400239, + "grad_norm": 0.5033954265558841, + "learning_rate": 2.0617852875917393e-06, + "loss": 0.621, + "num_tokens": 523354642.0, + "step": 5558 + }, + { + "epoch": 0.9487967229902714, + "grad_norm": 0.46655081557621086, + "learning_rate": 2.0549581839904422e-06, + "loss": 0.5826, + "num_tokens": 523453747.0, + "step": 5559 + }, + { + "epoch": 0.9489674005803038, + "grad_norm": 0.4385954694693952, + "learning_rate": 2.048131080389145e-06, + "loss": 0.533, + "num_tokens": 523548425.0, + "step": 5560 + }, + { + "epoch": 0.9491380781703362, + "grad_norm": 0.48986864227597265, + "learning_rate": 2.041303976787848e-06, + "loss": 0.4933, + "num_tokens": 523626578.0, + "step": 5561 + }, + { + "epoch": 0.9493087557603687, + "grad_norm": 0.47659295770551036, + "learning_rate": 2.034476873186551e-06, + "loss": 0.5329, + "num_tokens": 523714047.0, + "step": 5562 + }, + { + "epoch": 0.9494794333504011, + "grad_norm": 0.47566133340272493, + "learning_rate": 2.027649769585254e-06, + "loss": 0.6278, + "num_tokens": 523807852.0, + "step": 5563 + }, + { + "epoch": 0.9496501109404335, + "grad_norm": 0.4443576342458348, + "learning_rate": 2.0208226659839564e-06, + "loss": 0.6057, + "num_tokens": 523917794.0, + "step": 5564 + }, + { + "epoch": 0.9498207885304659, + "grad_norm": 0.47584692106582427, + "learning_rate": 2.0139955623826593e-06, + "loss": 0.5552, + "num_tokens": 524009716.0, + "step": 5565 + }, + { + "epoch": 0.9499914661204983, + "grad_norm": 0.48995258197172425, + "learning_rate": 2.0071684587813622e-06, + "loss": 0.6287, + "num_tokens": 524100870.0, + "step": 5566 + }, + { + "epoch": 0.9501621437105308, + "grad_norm": 0.428005897737263, + "learning_rate": 2.0003413551800647e-06, + "loss": 0.5838, + "num_tokens": 524216823.0, + "step": 5567 + }, + { + "epoch": 0.9503328213005633, + "grad_norm": 0.49101891365942735, + "learning_rate": 1.9935142515787676e-06, + "loss": 0.5382, + "num_tokens": 524292286.0, + "step": 5568 + }, + { + "epoch": 0.9505034988905957, + "grad_norm": 0.45106099781214676, + "learning_rate": 1.9866871479774706e-06, + "loss": 0.6001, + "num_tokens": 524405206.0, + "step": 5569 + }, + { + "epoch": 0.9506741764806281, + "grad_norm": 0.4408496019973162, + "learning_rate": 1.979860044376174e-06, + "loss": 0.603, + "num_tokens": 524517970.0, + "step": 5570 + }, + { + "epoch": 0.9508448540706606, + "grad_norm": 0.5355440669928219, + "learning_rate": 1.9730329407748764e-06, + "loss": 0.6416, + "num_tokens": 524591804.0, + "step": 5571 + }, + { + "epoch": 0.951015531660693, + "grad_norm": 0.4865943152596659, + "learning_rate": 1.9662058371735793e-06, + "loss": 0.5468, + "num_tokens": 524670952.0, + "step": 5572 + }, + { + "epoch": 0.9511862092507254, + "grad_norm": 0.4435099098753487, + "learning_rate": 1.9593787335722822e-06, + "loss": 0.5563, + "num_tokens": 524767317.0, + "step": 5573 + }, + { + "epoch": 0.9513568868407578, + "grad_norm": 0.5432998010606401, + "learning_rate": 1.9525516299709847e-06, + "loss": 0.6554, + "num_tokens": 524839249.0, + "step": 5574 + }, + { + "epoch": 0.9515275644307902, + "grad_norm": 0.4716508405497249, + "learning_rate": 1.9457245263696876e-06, + "loss": 0.5918, + "num_tokens": 524937565.0, + "step": 5575 + }, + { + "epoch": 0.9516982420208226, + "grad_norm": 0.42541288928304094, + "learning_rate": 1.9388974227683905e-06, + "loss": 0.5053, + "num_tokens": 525043861.0, + "step": 5576 + }, + { + "epoch": 0.9518689196108551, + "grad_norm": 0.42408597910004026, + "learning_rate": 1.9320703191670935e-06, + "loss": 0.5714, + "num_tokens": 525157776.0, + "step": 5577 + }, + { + "epoch": 0.9520395972008875, + "grad_norm": 0.5168049422839908, + "learning_rate": 1.9252432155657964e-06, + "loss": 0.6048, + "num_tokens": 525231587.0, + "step": 5578 + }, + { + "epoch": 0.9522102747909199, + "grad_norm": 0.447444148774997, + "learning_rate": 1.9184161119644993e-06, + "loss": 0.5537, + "num_tokens": 525322855.0, + "step": 5579 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.43272308846181695, + "learning_rate": 1.911589008363202e-06, + "loss": 0.5186, + "num_tokens": 525424176.0, + "step": 5580 + }, + { + "epoch": 0.9525516299709849, + "grad_norm": 0.419078960502091, + "learning_rate": 1.904761904761905e-06, + "loss": 0.5686, + "num_tokens": 525544654.0, + "step": 5581 + }, + { + "epoch": 0.9527223075610173, + "grad_norm": 0.4766283450111657, + "learning_rate": 1.8979348011606078e-06, + "loss": 0.6453, + "num_tokens": 525641410.0, + "step": 5582 + }, + { + "epoch": 0.9528929851510497, + "grad_norm": 0.4534863386777123, + "learning_rate": 1.8911076975593107e-06, + "loss": 0.527, + "num_tokens": 525737390.0, + "step": 5583 + }, + { + "epoch": 0.9530636627410821, + "grad_norm": 0.5166528046982007, + "learning_rate": 1.8842805939580137e-06, + "loss": 0.5414, + "num_tokens": 525811252.0, + "step": 5584 + }, + { + "epoch": 0.9532343403311145, + "grad_norm": 0.48864370693164777, + "learning_rate": 1.8774534903567164e-06, + "loss": 0.4975, + "num_tokens": 525884742.0, + "step": 5585 + }, + { + "epoch": 0.953405017921147, + "grad_norm": 0.41397487645416814, + "learning_rate": 1.8706263867554193e-06, + "loss": 0.5391, + "num_tokens": 526006370.0, + "step": 5586 + }, + { + "epoch": 0.9535756955111794, + "grad_norm": 0.49524068929270776, + "learning_rate": 1.8637992831541222e-06, + "loss": 0.5773, + "num_tokens": 526089919.0, + "step": 5587 + }, + { + "epoch": 0.9537463731012118, + "grad_norm": 0.4665911749008316, + "learning_rate": 1.8569721795528247e-06, + "loss": 0.5314, + "num_tokens": 526173389.0, + "step": 5588 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.5174849130243897, + "learning_rate": 1.8501450759515276e-06, + "loss": 0.6467, + "num_tokens": 526257430.0, + "step": 5589 + }, + { + "epoch": 0.9540877282812766, + "grad_norm": 0.4294553752509148, + "learning_rate": 1.8433179723502307e-06, + "loss": 0.575, + "num_tokens": 526366967.0, + "step": 5590 + }, + { + "epoch": 0.9542584058713091, + "grad_norm": 0.4633548674133098, + "learning_rate": 1.8364908687489332e-06, + "loss": 0.5187, + "num_tokens": 526452751.0, + "step": 5591 + }, + { + "epoch": 0.9544290834613415, + "grad_norm": 0.45292851417477076, + "learning_rate": 1.8296637651476361e-06, + "loss": 0.5684, + "num_tokens": 526552378.0, + "step": 5592 + }, + { + "epoch": 0.954599761051374, + "grad_norm": 0.45599482368180416, + "learning_rate": 1.822836661546339e-06, + "loss": 0.5165, + "num_tokens": 526641249.0, + "step": 5593 + }, + { + "epoch": 0.9547704386414064, + "grad_norm": 0.42265867537011975, + "learning_rate": 1.8160095579450422e-06, + "loss": 0.5685, + "num_tokens": 526758428.0, + "step": 5594 + }, + { + "epoch": 0.9549411162314388, + "grad_norm": 0.48350722710141364, + "learning_rate": 1.8091824543437447e-06, + "loss": 0.5465, + "num_tokens": 526844724.0, + "step": 5595 + }, + { + "epoch": 0.9551117938214713, + "grad_norm": 0.4755454860873545, + "learning_rate": 1.8023553507424476e-06, + "loss": 0.5077, + "num_tokens": 526924053.0, + "step": 5596 + }, + { + "epoch": 0.9552824714115037, + "grad_norm": 0.4857378404402684, + "learning_rate": 1.7955282471411505e-06, + "loss": 0.554, + "num_tokens": 527009473.0, + "step": 5597 + }, + { + "epoch": 0.9554531490015361, + "grad_norm": 0.5268833361598596, + "learning_rate": 1.7887011435398532e-06, + "loss": 0.5772, + "num_tokens": 527082276.0, + "step": 5598 + }, + { + "epoch": 0.9556238265915685, + "grad_norm": 0.46143196554653304, + "learning_rate": 1.7818740399385561e-06, + "loss": 0.5695, + "num_tokens": 527177970.0, + "step": 5599 + }, + { + "epoch": 0.955794504181601, + "grad_norm": 0.4207952237765409, + "learning_rate": 1.775046936337259e-06, + "loss": 0.5508, + "num_tokens": 527298055.0, + "step": 5600 + }, + { + "epoch": 0.9559651817716334, + "grad_norm": 0.45847028570217063, + "learning_rate": 1.768219832735962e-06, + "loss": 0.5684, + "num_tokens": 527402548.0, + "step": 5601 + }, + { + "epoch": 0.9561358593616658, + "grad_norm": 0.3764487117060031, + "learning_rate": 1.7613927291346647e-06, + "loss": 0.5264, + "num_tokens": 527546451.0, + "step": 5602 + }, + { + "epoch": 0.9563065369516982, + "grad_norm": 0.49202400717648787, + "learning_rate": 1.7545656255333676e-06, + "loss": 0.5273, + "num_tokens": 527623874.0, + "step": 5603 + }, + { + "epoch": 0.9564772145417306, + "grad_norm": 0.49392708258510293, + "learning_rate": 1.7477385219320705e-06, + "loss": 0.4943, + "num_tokens": 527706039.0, + "step": 5604 + }, + { + "epoch": 0.956647892131763, + "grad_norm": 0.5234722613134056, + "learning_rate": 1.7409114183307732e-06, + "loss": 0.5409, + "num_tokens": 527788059.0, + "step": 5605 + }, + { + "epoch": 0.9568185697217956, + "grad_norm": 0.487795969948059, + "learning_rate": 1.7340843147294761e-06, + "loss": 0.5715, + "num_tokens": 527878129.0, + "step": 5606 + }, + { + "epoch": 0.956989247311828, + "grad_norm": 0.4605641698677367, + "learning_rate": 1.727257211128179e-06, + "loss": 0.5316, + "num_tokens": 527967610.0, + "step": 5607 + }, + { + "epoch": 0.9571599249018604, + "grad_norm": 0.47227584981124276, + "learning_rate": 1.720430107526882e-06, + "loss": 0.486, + "num_tokens": 528044372.0, + "step": 5608 + }, + { + "epoch": 0.9573306024918928, + "grad_norm": 0.47688294287048427, + "learning_rate": 1.7136030039255846e-06, + "loss": 0.5532, + "num_tokens": 528135537.0, + "step": 5609 + }, + { + "epoch": 0.9575012800819253, + "grad_norm": 0.4626319052288818, + "learning_rate": 1.7067759003242876e-06, + "loss": 0.6085, + "num_tokens": 528240836.0, + "step": 5610 + }, + { + "epoch": 0.9576719576719577, + "grad_norm": 0.4641923907514908, + "learning_rate": 1.6999487967229905e-06, + "loss": 0.58, + "num_tokens": 528343096.0, + "step": 5611 + }, + { + "epoch": 0.9578426352619901, + "grad_norm": 0.41276532314091224, + "learning_rate": 1.6931216931216932e-06, + "loss": 0.5009, + "num_tokens": 528449318.0, + "step": 5612 + }, + { + "epoch": 0.9580133128520225, + "grad_norm": 0.480380739714127, + "learning_rate": 1.686294589520396e-06, + "loss": 0.5409, + "num_tokens": 528540231.0, + "step": 5613 + }, + { + "epoch": 0.9581839904420549, + "grad_norm": 0.45189456729999017, + "learning_rate": 1.679467485919099e-06, + "loss": 0.5637, + "num_tokens": 528642534.0, + "step": 5614 + }, + { + "epoch": 0.9583546680320874, + "grad_norm": 0.5406397137116534, + "learning_rate": 1.6726403823178017e-06, + "loss": 0.6301, + "num_tokens": 528747464.0, + "step": 5615 + }, + { + "epoch": 0.9585253456221198, + "grad_norm": 0.4311526320655303, + "learning_rate": 1.6658132787165046e-06, + "loss": 0.5668, + "num_tokens": 528861750.0, + "step": 5616 + }, + { + "epoch": 0.9586960232121522, + "grad_norm": 0.40478539061303903, + "learning_rate": 1.6589861751152075e-06, + "loss": 0.5725, + "num_tokens": 528985271.0, + "step": 5617 + }, + { + "epoch": 0.9588667008021847, + "grad_norm": 0.45637152506792417, + "learning_rate": 1.6521590715139105e-06, + "loss": 0.5116, + "num_tokens": 529068446.0, + "step": 5618 + }, + { + "epoch": 0.9590373783922171, + "grad_norm": 0.4575763599330476, + "learning_rate": 1.6453319679126132e-06, + "loss": 0.5684, + "num_tokens": 529162785.0, + "step": 5619 + }, + { + "epoch": 0.9592080559822496, + "grad_norm": 0.40702879948585513, + "learning_rate": 1.638504864311316e-06, + "loss": 0.5178, + "num_tokens": 529270887.0, + "step": 5620 + }, + { + "epoch": 0.959378733572282, + "grad_norm": 0.4814516546075914, + "learning_rate": 1.631677760710019e-06, + "loss": 0.5493, + "num_tokens": 529352755.0, + "step": 5621 + }, + { + "epoch": 0.9595494111623144, + "grad_norm": 0.44247451915301417, + "learning_rate": 1.6248506571087217e-06, + "loss": 0.4932, + "num_tokens": 529443903.0, + "step": 5622 + }, + { + "epoch": 0.9597200887523468, + "grad_norm": 0.44847382414494447, + "learning_rate": 1.6180235535074246e-06, + "loss": 0.5516, + "num_tokens": 529550383.0, + "step": 5623 + }, + { + "epoch": 0.9598907663423792, + "grad_norm": 0.46673304931931586, + "learning_rate": 1.6111964499061275e-06, + "loss": 0.5716, + "num_tokens": 529644643.0, + "step": 5624 + }, + { + "epoch": 0.9600614439324117, + "grad_norm": 0.4409170235414359, + "learning_rate": 1.6043693463048305e-06, + "loss": 0.5976, + "num_tokens": 529760840.0, + "step": 5625 + }, + { + "epoch": 0.9602321215224441, + "grad_norm": 0.48975231362875005, + "learning_rate": 1.5975422427035332e-06, + "loss": 0.6236, + "num_tokens": 529853727.0, + "step": 5626 + }, + { + "epoch": 0.9604027991124765, + "grad_norm": 0.48060391454225504, + "learning_rate": 1.590715139102236e-06, + "loss": 0.6625, + "num_tokens": 529956197.0, + "step": 5627 + }, + { + "epoch": 0.9605734767025089, + "grad_norm": 0.44316777356188236, + "learning_rate": 1.583888035500939e-06, + "loss": 0.4665, + "num_tokens": 530039666.0, + "step": 5628 + }, + { + "epoch": 0.9607441542925413, + "grad_norm": 0.4953196781103376, + "learning_rate": 1.5770609318996417e-06, + "loss": 0.6171, + "num_tokens": 530139793.0, + "step": 5629 + }, + { + "epoch": 0.9609148318825739, + "grad_norm": 0.4492436369783889, + "learning_rate": 1.5702338282983446e-06, + "loss": 0.5887, + "num_tokens": 530248268.0, + "step": 5630 + }, + { + "epoch": 0.9610855094726063, + "grad_norm": 0.5272745840121872, + "learning_rate": 1.5634067246970475e-06, + "loss": 0.5742, + "num_tokens": 530319745.0, + "step": 5631 + }, + { + "epoch": 0.9612561870626387, + "grad_norm": 0.5151423123648355, + "learning_rate": 1.5565796210957504e-06, + "loss": 0.5843, + "num_tokens": 530419259.0, + "step": 5632 + }, + { + "epoch": 0.9614268646526711, + "grad_norm": 0.44743610719104593, + "learning_rate": 1.5497525174944531e-06, + "loss": 0.5963, + "num_tokens": 530520088.0, + "step": 5633 + }, + { + "epoch": 0.9615975422427036, + "grad_norm": 0.45047033937413733, + "learning_rate": 1.542925413893156e-06, + "loss": 0.5228, + "num_tokens": 530605193.0, + "step": 5634 + }, + { + "epoch": 0.961768219832736, + "grad_norm": 0.49408633868481094, + "learning_rate": 1.536098310291859e-06, + "loss": 0.4832, + "num_tokens": 530677921.0, + "step": 5635 + }, + { + "epoch": 0.9619388974227684, + "grad_norm": 0.46410768797768687, + "learning_rate": 1.5292712066905615e-06, + "loss": 0.5637, + "num_tokens": 530773277.0, + "step": 5636 + }, + { + "epoch": 0.9621095750128008, + "grad_norm": 0.4694094324968781, + "learning_rate": 1.5224441030892646e-06, + "loss": 0.5081, + "num_tokens": 530857077.0, + "step": 5637 + }, + { + "epoch": 0.9622802526028332, + "grad_norm": 0.5047157936353266, + "learning_rate": 1.5156169994879675e-06, + "loss": 0.5975, + "num_tokens": 530946274.0, + "step": 5638 + }, + { + "epoch": 0.9624509301928657, + "grad_norm": 0.4558146191218209, + "learning_rate": 1.50878989588667e-06, + "loss": 0.5014, + "num_tokens": 531037874.0, + "step": 5639 + }, + { + "epoch": 0.9626216077828981, + "grad_norm": 0.484504162464639, + "learning_rate": 1.501962792285373e-06, + "loss": 0.5178, + "num_tokens": 531120166.0, + "step": 5640 + }, + { + "epoch": 0.9627922853729305, + "grad_norm": 0.4795722578264394, + "learning_rate": 1.4951356886840758e-06, + "loss": 0.5675, + "num_tokens": 531204557.0, + "step": 5641 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.44757949270421316, + "learning_rate": 1.488308585082779e-06, + "loss": 0.5308, + "num_tokens": 531297678.0, + "step": 5642 + }, + { + "epoch": 0.9631336405529954, + "grad_norm": 0.4325471946601958, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.6152, + "num_tokens": 531412443.0, + "step": 5643 + }, + { + "epoch": 0.9633043181430279, + "grad_norm": 0.4587536955262118, + "learning_rate": 1.4746543778801844e-06, + "loss": 0.5543, + "num_tokens": 531506978.0, + "step": 5644 + }, + { + "epoch": 0.9634749957330603, + "grad_norm": 0.4288090357808341, + "learning_rate": 1.4678272742788873e-06, + "loss": 0.5023, + "num_tokens": 531608487.0, + "step": 5645 + }, + { + "epoch": 0.9636456733230927, + "grad_norm": 0.5117093404435008, + "learning_rate": 1.46100017067759e-06, + "loss": 0.5228, + "num_tokens": 531681671.0, + "step": 5646 + }, + { + "epoch": 0.9638163509131251, + "grad_norm": 0.4415944257614078, + "learning_rate": 1.454173067076293e-06, + "loss": 0.6317, + "num_tokens": 531803331.0, + "step": 5647 + }, + { + "epoch": 0.9639870285031575, + "grad_norm": 0.4383931962718212, + "learning_rate": 1.4473459634749958e-06, + "loss": 0.5633, + "num_tokens": 531924738.0, + "step": 5648 + }, + { + "epoch": 0.96415770609319, + "grad_norm": 0.4838150815175649, + "learning_rate": 1.4405188598736987e-06, + "loss": 0.5221, + "num_tokens": 532001180.0, + "step": 5649 + }, + { + "epoch": 0.9643283836832224, + "grad_norm": 0.5247918576144031, + "learning_rate": 1.4336917562724014e-06, + "loss": 0.5681, + "num_tokens": 532073946.0, + "step": 5650 + }, + { + "epoch": 0.9644990612732548, + "grad_norm": 0.4432853465153793, + "learning_rate": 1.4268646526711044e-06, + "loss": 0.5467, + "num_tokens": 532174761.0, + "step": 5651 + }, + { + "epoch": 0.9646697388632872, + "grad_norm": 0.5250680767042077, + "learning_rate": 1.4200375490698073e-06, + "loss": 0.5578, + "num_tokens": 532246723.0, + "step": 5652 + }, + { + "epoch": 0.9648404164533196, + "grad_norm": 0.5861463932629221, + "learning_rate": 1.41321044546851e-06, + "loss": 0.5671, + "num_tokens": 532302299.0, + "step": 5653 + }, + { + "epoch": 0.9650110940433521, + "grad_norm": 0.4758280856222727, + "learning_rate": 1.4063833418672129e-06, + "loss": 0.4624, + "num_tokens": 532376235.0, + "step": 5654 + }, + { + "epoch": 0.9651817716333846, + "grad_norm": 0.4842886593295531, + "learning_rate": 1.3995562382659158e-06, + "loss": 0.614, + "num_tokens": 532465702.0, + "step": 5655 + }, + { + "epoch": 0.965352449223417, + "grad_norm": 0.4684751481092388, + "learning_rate": 1.3927291346646187e-06, + "loss": 0.543, + "num_tokens": 532558346.0, + "step": 5656 + }, + { + "epoch": 0.9655231268134494, + "grad_norm": 0.4869680989245708, + "learning_rate": 1.3859020310633214e-06, + "loss": 0.5977, + "num_tokens": 532647191.0, + "step": 5657 + }, + { + "epoch": 0.9656938044034818, + "grad_norm": 0.5072136320857673, + "learning_rate": 1.3790749274620243e-06, + "loss": 0.6, + "num_tokens": 532733373.0, + "step": 5658 + }, + { + "epoch": 0.9658644819935143, + "grad_norm": 0.44554293330010625, + "learning_rate": 1.3722478238607273e-06, + "loss": 0.5795, + "num_tokens": 532832245.0, + "step": 5659 + }, + { + "epoch": 0.9660351595835467, + "grad_norm": 0.4877720149831054, + "learning_rate": 1.36542072025943e-06, + "loss": 0.5248, + "num_tokens": 532909654.0, + "step": 5660 + }, + { + "epoch": 0.9662058371735791, + "grad_norm": 0.4220016289353896, + "learning_rate": 1.3585936166581329e-06, + "loss": 0.5438, + "num_tokens": 533018119.0, + "step": 5661 + }, + { + "epoch": 0.9663765147636115, + "grad_norm": 0.5356025315410168, + "learning_rate": 1.3517665130568358e-06, + "loss": 0.6078, + "num_tokens": 533091266.0, + "step": 5662 + }, + { + "epoch": 0.966547192353644, + "grad_norm": 0.4580175222272572, + "learning_rate": 1.3449394094555385e-06, + "loss": 0.6379, + "num_tokens": 533191675.0, + "step": 5663 + }, + { + "epoch": 0.9667178699436764, + "grad_norm": 0.4703004071263575, + "learning_rate": 1.3381123058542414e-06, + "loss": 0.5911, + "num_tokens": 533282644.0, + "step": 5664 + }, + { + "epoch": 0.9668885475337088, + "grad_norm": 0.4838847659715829, + "learning_rate": 1.3312852022529443e-06, + "loss": 0.6625, + "num_tokens": 533379790.0, + "step": 5665 + }, + { + "epoch": 0.9670592251237412, + "grad_norm": 0.49566401054822246, + "learning_rate": 1.3244580986516472e-06, + "loss": 0.5244, + "num_tokens": 533461708.0, + "step": 5666 + }, + { + "epoch": 0.9672299027137736, + "grad_norm": 0.45975935577258437, + "learning_rate": 1.31763099505035e-06, + "loss": 0.4884, + "num_tokens": 533541139.0, + "step": 5667 + }, + { + "epoch": 0.9674005803038062, + "grad_norm": 0.5030921341750876, + "learning_rate": 1.3108038914490529e-06, + "loss": 0.6637, + "num_tokens": 533641234.0, + "step": 5668 + }, + { + "epoch": 0.9675712578938386, + "grad_norm": 0.4570482930920054, + "learning_rate": 1.3039767878477558e-06, + "loss": 0.4772, + "num_tokens": 533731232.0, + "step": 5669 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.43142764566693564, + "learning_rate": 1.2971496842464585e-06, + "loss": 0.561, + "num_tokens": 533829447.0, + "step": 5670 + }, + { + "epoch": 0.9679126130739034, + "grad_norm": 0.5020728120515804, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.5327, + "num_tokens": 533904379.0, + "step": 5671 + }, + { + "epoch": 0.9680832906639358, + "grad_norm": 0.4594184343917237, + "learning_rate": 1.2834954770438643e-06, + "loss": 0.5312, + "num_tokens": 533997839.0, + "step": 5672 + }, + { + "epoch": 0.9682539682539683, + "grad_norm": 0.4579616536753627, + "learning_rate": 1.2766683734425672e-06, + "loss": 0.561, + "num_tokens": 534090906.0, + "step": 5673 + }, + { + "epoch": 0.9684246458440007, + "grad_norm": 0.41658895969719983, + "learning_rate": 1.26984126984127e-06, + "loss": 0.4822, + "num_tokens": 534190018.0, + "step": 5674 + }, + { + "epoch": 0.9685953234340331, + "grad_norm": 0.520580958150149, + "learning_rate": 1.2630141662399729e-06, + "loss": 0.5971, + "num_tokens": 534269334.0, + "step": 5675 + }, + { + "epoch": 0.9687660010240655, + "grad_norm": 0.4866720610057144, + "learning_rate": 1.2561870626386758e-06, + "loss": 0.515, + "num_tokens": 534345058.0, + "step": 5676 + }, + { + "epoch": 0.9689366786140979, + "grad_norm": 0.5704383624126329, + "learning_rate": 1.2493599590373785e-06, + "loss": 0.5417, + "num_tokens": 534429056.0, + "step": 5677 + }, + { + "epoch": 0.9691073562041304, + "grad_norm": 0.3952461485245926, + "learning_rate": 1.2425328554360814e-06, + "loss": 0.5545, + "num_tokens": 534566415.0, + "step": 5678 + }, + { + "epoch": 0.9692780337941628, + "grad_norm": 0.4562849690297509, + "learning_rate": 1.2357057518347843e-06, + "loss": 0.5787, + "num_tokens": 534662118.0, + "step": 5679 + }, + { + "epoch": 0.9694487113841953, + "grad_norm": 0.47571830245594016, + "learning_rate": 1.228878648233487e-06, + "loss": 0.6149, + "num_tokens": 534759227.0, + "step": 5680 + }, + { + "epoch": 0.9696193889742277, + "grad_norm": 0.494420180133051, + "learning_rate": 1.22205154463219e-06, + "loss": 0.5587, + "num_tokens": 534843350.0, + "step": 5681 + }, + { + "epoch": 0.9697900665642601, + "grad_norm": 0.48151808070109053, + "learning_rate": 1.2152244410308928e-06, + "loss": 0.5343, + "num_tokens": 534927910.0, + "step": 5682 + }, + { + "epoch": 0.9699607441542926, + "grad_norm": 0.40784385013871205, + "learning_rate": 1.2083973374295955e-06, + "loss": 0.5646, + "num_tokens": 535051802.0, + "step": 5683 + }, + { + "epoch": 0.970131421744325, + "grad_norm": 0.46029053497862027, + "learning_rate": 1.2015702338282985e-06, + "loss": 0.5177, + "num_tokens": 535141352.0, + "step": 5684 + }, + { + "epoch": 0.9703020993343574, + "grad_norm": 0.4484885623629113, + "learning_rate": 1.1947431302270014e-06, + "loss": 0.5609, + "num_tokens": 535244831.0, + "step": 5685 + }, + { + "epoch": 0.9704727769243898, + "grad_norm": 0.5016411494184984, + "learning_rate": 1.1879160266257043e-06, + "loss": 0.6381, + "num_tokens": 535338839.0, + "step": 5686 + }, + { + "epoch": 0.9706434545144222, + "grad_norm": 0.4412107013727558, + "learning_rate": 1.181088923024407e-06, + "loss": 0.5184, + "num_tokens": 535432894.0, + "step": 5687 + }, + { + "epoch": 0.9708141321044547, + "grad_norm": 0.47272565399465755, + "learning_rate": 1.1742618194231097e-06, + "loss": 0.6627, + "num_tokens": 535534603.0, + "step": 5688 + }, + { + "epoch": 0.9709848096944871, + "grad_norm": 0.45607643321560754, + "learning_rate": 1.1674347158218128e-06, + "loss": 0.631, + "num_tokens": 535641543.0, + "step": 5689 + }, + { + "epoch": 0.9711554872845195, + "grad_norm": 0.5151824252068921, + "learning_rate": 1.1606076122205155e-06, + "loss": 0.5654, + "num_tokens": 535710623.0, + "step": 5690 + }, + { + "epoch": 0.9713261648745519, + "grad_norm": 0.4560699379252708, + "learning_rate": 1.1537805086192184e-06, + "loss": 0.6417, + "num_tokens": 535824936.0, + "step": 5691 + }, + { + "epoch": 0.9714968424645845, + "grad_norm": 0.41151083149648027, + "learning_rate": 1.1469534050179212e-06, + "loss": 0.6649, + "num_tokens": 535953736.0, + "step": 5692 + }, + { + "epoch": 0.9716675200546169, + "grad_norm": 0.5417282107267116, + "learning_rate": 1.140126301416624e-06, + "loss": 0.621, + "num_tokens": 536029434.0, + "step": 5693 + }, + { + "epoch": 0.9718381976446493, + "grad_norm": 0.5451225064618821, + "learning_rate": 1.133299197815327e-06, + "loss": 0.5802, + "num_tokens": 536109650.0, + "step": 5694 + }, + { + "epoch": 0.9720088752346817, + "grad_norm": 0.48066667308755695, + "learning_rate": 1.1264720942140297e-06, + "loss": 0.5818, + "num_tokens": 536208382.0, + "step": 5695 + }, + { + "epoch": 0.9721795528247141, + "grad_norm": 0.47177014266548817, + "learning_rate": 1.1196449906127326e-06, + "loss": 0.6056, + "num_tokens": 536302994.0, + "step": 5696 + }, + { + "epoch": 0.9723502304147466, + "grad_norm": 0.45353413211242544, + "learning_rate": 1.1128178870114355e-06, + "loss": 0.5588, + "num_tokens": 536401956.0, + "step": 5697 + }, + { + "epoch": 0.972520908004779, + "grad_norm": 0.44201435809474693, + "learning_rate": 1.1059907834101384e-06, + "loss": 0.5854, + "num_tokens": 536504421.0, + "step": 5698 + }, + { + "epoch": 0.9726915855948114, + "grad_norm": 0.4656896812341324, + "learning_rate": 1.0991636798088411e-06, + "loss": 0.4991, + "num_tokens": 536599614.0, + "step": 5699 + }, + { + "epoch": 0.9728622631848438, + "grad_norm": 0.565612314521484, + "learning_rate": 1.092336576207544e-06, + "loss": 0.5754, + "num_tokens": 536661661.0, + "step": 5700 + }, + { + "epoch": 0.9730329407748762, + "grad_norm": 0.39011205750441497, + "learning_rate": 1.085509472606247e-06, + "loss": 0.5048, + "num_tokens": 536777340.0, + "step": 5701 + }, + { + "epoch": 0.9732036183649087, + "grad_norm": 0.49804649012811286, + "learning_rate": 1.0786823690049497e-06, + "loss": 0.5388, + "num_tokens": 536853740.0, + "step": 5702 + }, + { + "epoch": 0.9733742959549411, + "grad_norm": 0.4513520212697472, + "learning_rate": 1.0718552654036526e-06, + "loss": 0.5836, + "num_tokens": 536949668.0, + "step": 5703 + }, + { + "epoch": 0.9735449735449735, + "grad_norm": 0.4966302071332309, + "learning_rate": 1.0650281618023555e-06, + "loss": 0.5419, + "num_tokens": 537025168.0, + "step": 5704 + }, + { + "epoch": 0.973715651135006, + "grad_norm": 0.5113915399315336, + "learning_rate": 1.0582010582010582e-06, + "loss": 0.6291, + "num_tokens": 537118594.0, + "step": 5705 + }, + { + "epoch": 0.9738863287250384, + "grad_norm": 0.4834927483283794, + "learning_rate": 1.0513739545997611e-06, + "loss": 0.6078, + "num_tokens": 537208405.0, + "step": 5706 + }, + { + "epoch": 0.9740570063150709, + "grad_norm": 0.4280457212694096, + "learning_rate": 1.044546850998464e-06, + "loss": 0.5664, + "num_tokens": 537324659.0, + "step": 5707 + }, + { + "epoch": 0.9742276839051033, + "grad_norm": 0.48672688080291393, + "learning_rate": 1.037719747397167e-06, + "loss": 0.6326, + "num_tokens": 537415315.0, + "step": 5708 + }, + { + "epoch": 0.9743983614951357, + "grad_norm": 0.5116372746174793, + "learning_rate": 1.0308926437958697e-06, + "loss": 0.4659, + "num_tokens": 537476425.0, + "step": 5709 + }, + { + "epoch": 0.9745690390851681, + "grad_norm": 0.505012755386635, + "learning_rate": 1.0240655401945726e-06, + "loss": 0.5854, + "num_tokens": 537554418.0, + "step": 5710 + }, + { + "epoch": 0.9747397166752005, + "grad_norm": 0.49065387418747913, + "learning_rate": 1.0172384365932755e-06, + "loss": 0.4857, + "num_tokens": 537630581.0, + "step": 5711 + }, + { + "epoch": 0.974910394265233, + "grad_norm": 0.5608173574871514, + "learning_rate": 1.0104113329919782e-06, + "loss": 0.5187, + "num_tokens": 537691848.0, + "step": 5712 + }, + { + "epoch": 0.9750810718552654, + "grad_norm": 0.5277119640686474, + "learning_rate": 1.0035842293906811e-06, + "loss": 0.5728, + "num_tokens": 537761086.0, + "step": 5713 + }, + { + "epoch": 0.9752517494452978, + "grad_norm": 0.5272841751284647, + "learning_rate": 9.967571257893838e-07, + "loss": 0.6312, + "num_tokens": 537858113.0, + "step": 5714 + }, + { + "epoch": 0.9754224270353302, + "grad_norm": 0.41892308424228397, + "learning_rate": 9.89930022188087e-07, + "loss": 0.4945, + "num_tokens": 537962812.0, + "step": 5715 + }, + { + "epoch": 0.9755931046253626, + "grad_norm": 0.5162545255352943, + "learning_rate": 9.831029185867897e-07, + "loss": 0.6265, + "num_tokens": 538041629.0, + "step": 5716 + }, + { + "epoch": 0.9757637822153952, + "grad_norm": 0.40947587376113537, + "learning_rate": 9.762758149854924e-07, + "loss": 0.593, + "num_tokens": 538164281.0, + "step": 5717 + }, + { + "epoch": 0.9759344598054276, + "grad_norm": 0.4716062864351842, + "learning_rate": 9.694487113841953e-07, + "loss": 0.6729, + "num_tokens": 538267977.0, + "step": 5718 + }, + { + "epoch": 0.97610513739546, + "grad_norm": 0.43905097672300236, + "learning_rate": 9.626216077828982e-07, + "loss": 0.4962, + "num_tokens": 538361586.0, + "step": 5719 + }, + { + "epoch": 0.9762758149854924, + "grad_norm": 0.5089771541390959, + "learning_rate": 9.55794504181601e-07, + "loss": 0.6195, + "num_tokens": 538447330.0, + "step": 5720 + }, + { + "epoch": 0.9764464925755248, + "grad_norm": 0.4967359761465205, + "learning_rate": 9.489674005803039e-07, + "loss": 0.5667, + "num_tokens": 538527195.0, + "step": 5721 + }, + { + "epoch": 0.9766171701655573, + "grad_norm": 0.4939707004204201, + "learning_rate": 9.421402969790068e-07, + "loss": 0.6059, + "num_tokens": 538628232.0, + "step": 5722 + }, + { + "epoch": 0.9767878477555897, + "grad_norm": 0.47251415292716403, + "learning_rate": 9.353131933777096e-07, + "loss": 0.5415, + "num_tokens": 538713885.0, + "step": 5723 + }, + { + "epoch": 0.9769585253456221, + "grad_norm": 0.4813054970193649, + "learning_rate": 9.284860897764123e-07, + "loss": 0.51, + "num_tokens": 538798735.0, + "step": 5724 + }, + { + "epoch": 0.9771292029356545, + "grad_norm": 0.5132622856620368, + "learning_rate": 9.216589861751154e-07, + "loss": 0.5083, + "num_tokens": 538868700.0, + "step": 5725 + }, + { + "epoch": 0.977299880525687, + "grad_norm": 0.4257725404174097, + "learning_rate": 9.148318825738181e-07, + "loss": 0.5177, + "num_tokens": 538975518.0, + "step": 5726 + }, + { + "epoch": 0.9774705581157194, + "grad_norm": 0.4930256729794848, + "learning_rate": 9.080047789725211e-07, + "loss": 0.511, + "num_tokens": 539051789.0, + "step": 5727 + }, + { + "epoch": 0.9776412357057518, + "grad_norm": 0.4749437304592271, + "learning_rate": 9.011776753712238e-07, + "loss": 0.6154, + "num_tokens": 539148826.0, + "step": 5728 + }, + { + "epoch": 0.9778119132957843, + "grad_norm": 0.4330594693562733, + "learning_rate": 8.943505717699266e-07, + "loss": 0.553, + "num_tokens": 539256740.0, + "step": 5729 + }, + { + "epoch": 0.9779825908858167, + "grad_norm": 0.42186299418530376, + "learning_rate": 8.875234681686295e-07, + "loss": 0.6048, + "num_tokens": 539381242.0, + "step": 5730 + }, + { + "epoch": 0.9781532684758492, + "grad_norm": 0.40627473666857217, + "learning_rate": 8.806963645673323e-07, + "loss": 0.5291, + "num_tokens": 539493047.0, + "step": 5731 + }, + { + "epoch": 0.9783239460658816, + "grad_norm": 0.5380674795865472, + "learning_rate": 8.738692609660352e-07, + "loss": 0.633, + "num_tokens": 539573082.0, + "step": 5732 + }, + { + "epoch": 0.978494623655914, + "grad_norm": 0.41539564320329664, + "learning_rate": 8.670421573647381e-07, + "loss": 0.5632, + "num_tokens": 539691278.0, + "step": 5733 + }, + { + "epoch": 0.9786653012459464, + "grad_norm": 0.417791370840599, + "learning_rate": 8.60215053763441e-07, + "loss": 0.5318, + "num_tokens": 539800829.0, + "step": 5734 + }, + { + "epoch": 0.9788359788359788, + "grad_norm": 0.4942514338848114, + "learning_rate": 8.533879501621438e-07, + "loss": 0.5848, + "num_tokens": 539890285.0, + "step": 5735 + }, + { + "epoch": 0.9790066564260113, + "grad_norm": 0.45628469189496573, + "learning_rate": 8.465608465608466e-07, + "loss": 0.5322, + "num_tokens": 539983069.0, + "step": 5736 + }, + { + "epoch": 0.9791773340160437, + "grad_norm": 0.5127685309066037, + "learning_rate": 8.397337429595495e-07, + "loss": 0.6126, + "num_tokens": 540068106.0, + "step": 5737 + }, + { + "epoch": 0.9793480116060761, + "grad_norm": 0.49773979184798806, + "learning_rate": 8.329066393582523e-07, + "loss": 0.5232, + "num_tokens": 540143233.0, + "step": 5738 + }, + { + "epoch": 0.9795186891961085, + "grad_norm": 0.4765629881692624, + "learning_rate": 8.260795357569552e-07, + "loss": 0.5975, + "num_tokens": 540235250.0, + "step": 5739 + }, + { + "epoch": 0.9796893667861409, + "grad_norm": 0.482298377378701, + "learning_rate": 8.19252432155658e-07, + "loss": 0.5561, + "num_tokens": 540322776.0, + "step": 5740 + }, + { + "epoch": 0.9798600443761734, + "grad_norm": 0.4866030407007398, + "learning_rate": 8.124253285543609e-07, + "loss": 0.5399, + "num_tokens": 540401689.0, + "step": 5741 + }, + { + "epoch": 0.9800307219662059, + "grad_norm": 0.4682685273129796, + "learning_rate": 8.055982249530638e-07, + "loss": 0.5507, + "num_tokens": 540499913.0, + "step": 5742 + }, + { + "epoch": 0.9802013995562383, + "grad_norm": 0.49308372411405366, + "learning_rate": 7.987711213517666e-07, + "loss": 0.6328, + "num_tokens": 540591654.0, + "step": 5743 + }, + { + "epoch": 0.9803720771462707, + "grad_norm": 0.46765176340499853, + "learning_rate": 7.919440177504695e-07, + "loss": 0.576, + "num_tokens": 540686608.0, + "step": 5744 + }, + { + "epoch": 0.9805427547363031, + "grad_norm": 0.46346137468628523, + "learning_rate": 7.851169141491723e-07, + "loss": 0.6127, + "num_tokens": 540794238.0, + "step": 5745 + }, + { + "epoch": 0.9807134323263356, + "grad_norm": 0.5018134267778155, + "learning_rate": 7.782898105478752e-07, + "loss": 0.5912, + "num_tokens": 540877741.0, + "step": 5746 + }, + { + "epoch": 0.980884109916368, + "grad_norm": 0.5234366717426516, + "learning_rate": 7.71462706946578e-07, + "loss": 0.5525, + "num_tokens": 540948394.0, + "step": 5747 + }, + { + "epoch": 0.9810547875064004, + "grad_norm": 0.4717706580707317, + "learning_rate": 7.646356033452807e-07, + "loss": 0.4969, + "num_tokens": 541027028.0, + "step": 5748 + }, + { + "epoch": 0.9812254650964328, + "grad_norm": 0.4286027350232933, + "learning_rate": 7.578084997439838e-07, + "loss": 0.5368, + "num_tokens": 541131877.0, + "step": 5749 + }, + { + "epoch": 0.9813961426864652, + "grad_norm": 0.47432074717566675, + "learning_rate": 7.509813961426865e-07, + "loss": 0.6926, + "num_tokens": 541233764.0, + "step": 5750 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 0.5162730871686685, + "learning_rate": 7.441542925413895e-07, + "loss": 0.5262, + "num_tokens": 541306909.0, + "step": 5751 + }, + { + "epoch": 0.9817374978665301, + "grad_norm": 0.4503363869023697, + "learning_rate": 7.373271889400922e-07, + "loss": 0.4919, + "num_tokens": 541393969.0, + "step": 5752 + }, + { + "epoch": 0.9819081754565625, + "grad_norm": 0.4484652049830725, + "learning_rate": 7.30500085338795e-07, + "loss": 0.5684, + "num_tokens": 541494815.0, + "step": 5753 + }, + { + "epoch": 0.982078853046595, + "grad_norm": 0.5002920159378207, + "learning_rate": 7.236729817374979e-07, + "loss": 0.6664, + "num_tokens": 541595229.0, + "step": 5754 + }, + { + "epoch": 0.9822495306366275, + "grad_norm": 0.41579288377878176, + "learning_rate": 7.168458781362007e-07, + "loss": 0.5086, + "num_tokens": 541708701.0, + "step": 5755 + }, + { + "epoch": 0.9824202082266599, + "grad_norm": 0.41615194292620467, + "learning_rate": 7.100187745349036e-07, + "loss": 0.4533, + "num_tokens": 541801020.0, + "step": 5756 + }, + { + "epoch": 0.9825908858166923, + "grad_norm": 0.4636451974702676, + "learning_rate": 7.031916709336064e-07, + "loss": 0.5349, + "num_tokens": 541886678.0, + "step": 5757 + }, + { + "epoch": 0.9827615634067247, + "grad_norm": 0.47017049624810825, + "learning_rate": 6.963645673323094e-07, + "loss": 0.6183, + "num_tokens": 541986337.0, + "step": 5758 + }, + { + "epoch": 0.9829322409967571, + "grad_norm": 0.41868177143037305, + "learning_rate": 6.895374637310122e-07, + "loss": 0.5683, + "num_tokens": 542108619.0, + "step": 5759 + }, + { + "epoch": 0.9831029185867896, + "grad_norm": 0.5358132995476426, + "learning_rate": 6.82710360129715e-07, + "loss": 0.5343, + "num_tokens": 542171448.0, + "step": 5760 + }, + { + "epoch": 0.983273596176822, + "grad_norm": 0.5440031682509913, + "learning_rate": 6.758832565284179e-07, + "loss": 0.4744, + "num_tokens": 542231619.0, + "step": 5761 + }, + { + "epoch": 0.9834442737668544, + "grad_norm": 0.46607822785376823, + "learning_rate": 6.690561529271207e-07, + "loss": 0.6097, + "num_tokens": 542332055.0, + "step": 5762 + }, + { + "epoch": 0.9836149513568868, + "grad_norm": 0.484963210595089, + "learning_rate": 6.622290493258236e-07, + "loss": 0.554, + "num_tokens": 542422079.0, + "step": 5763 + }, + { + "epoch": 0.9837856289469192, + "grad_norm": 0.471235941690195, + "learning_rate": 6.554019457245264e-07, + "loss": 0.6064, + "num_tokens": 542517877.0, + "step": 5764 + }, + { + "epoch": 0.9839563065369517, + "grad_norm": 0.46796393390041524, + "learning_rate": 6.485748421232292e-07, + "loss": 0.5522, + "num_tokens": 542610836.0, + "step": 5765 + }, + { + "epoch": 0.9841269841269841, + "grad_norm": 0.49361002000212556, + "learning_rate": 6.417477385219322e-07, + "loss": 0.5607, + "num_tokens": 542691617.0, + "step": 5766 + }, + { + "epoch": 0.9842976617170166, + "grad_norm": 0.5254342524247596, + "learning_rate": 6.34920634920635e-07, + "loss": 0.4885, + "num_tokens": 542754656.0, + "step": 5767 + }, + { + "epoch": 0.984468339307049, + "grad_norm": 0.4169663030145664, + "learning_rate": 6.280935313193379e-07, + "loss": 0.501, + "num_tokens": 542863104.0, + "step": 5768 + }, + { + "epoch": 0.9846390168970814, + "grad_norm": 0.40682154763089406, + "learning_rate": 6.212664277180407e-07, + "loss": 0.5601, + "num_tokens": 542983801.0, + "step": 5769 + }, + { + "epoch": 0.9848096944871139, + "grad_norm": 0.48389110528109314, + "learning_rate": 6.144393241167435e-07, + "loss": 0.5844, + "num_tokens": 543068499.0, + "step": 5770 + }, + { + "epoch": 0.9849803720771463, + "grad_norm": 0.47475379851520055, + "learning_rate": 6.076122205154464e-07, + "loss": 0.548, + "num_tokens": 543151943.0, + "step": 5771 + }, + { + "epoch": 0.9851510496671787, + "grad_norm": 0.4130073569056687, + "learning_rate": 6.007851169141492e-07, + "loss": 0.5512, + "num_tokens": 543269160.0, + "step": 5772 + }, + { + "epoch": 0.9853217272572111, + "grad_norm": 0.49448963388140277, + "learning_rate": 5.939580133128521e-07, + "loss": 0.591, + "num_tokens": 543352422.0, + "step": 5773 + }, + { + "epoch": 0.9854924048472435, + "grad_norm": 0.39814397399944024, + "learning_rate": 5.871309097115549e-07, + "loss": 0.5081, + "num_tokens": 543466609.0, + "step": 5774 + }, + { + "epoch": 0.985663082437276, + "grad_norm": 0.4226450071260806, + "learning_rate": 5.803038061102578e-07, + "loss": 0.6391, + "num_tokens": 543599863.0, + "step": 5775 + }, + { + "epoch": 0.9858337600273084, + "grad_norm": 0.4406201146797656, + "learning_rate": 5.734767025089606e-07, + "loss": 0.6155, + "num_tokens": 543724344.0, + "step": 5776 + }, + { + "epoch": 0.9860044376173408, + "grad_norm": 0.511452064055497, + "learning_rate": 5.666495989076635e-07, + "loss": 0.5932, + "num_tokens": 543803705.0, + "step": 5777 + }, + { + "epoch": 0.9861751152073732, + "grad_norm": 0.46468299063228635, + "learning_rate": 5.598224953063663e-07, + "loss": 0.5798, + "num_tokens": 543898260.0, + "step": 5778 + }, + { + "epoch": 0.9863457927974058, + "grad_norm": 0.6020277311027128, + "learning_rate": 5.529953917050692e-07, + "loss": 0.5736, + "num_tokens": 543976120.0, + "step": 5779 + }, + { + "epoch": 0.9865164703874382, + "grad_norm": 0.4113567878084181, + "learning_rate": 5.46168288103772e-07, + "loss": 0.516, + "num_tokens": 544082325.0, + "step": 5780 + }, + { + "epoch": 0.9866871479774706, + "grad_norm": 0.4895290306960513, + "learning_rate": 5.393411845024748e-07, + "loss": 0.6113, + "num_tokens": 544180348.0, + "step": 5781 + }, + { + "epoch": 0.986857825567503, + "grad_norm": 0.4878580478588014, + "learning_rate": 5.325140809011778e-07, + "loss": 0.5637, + "num_tokens": 544260470.0, + "step": 5782 + }, + { + "epoch": 0.9870285031575354, + "grad_norm": 0.44723633074566177, + "learning_rate": 5.256869772998806e-07, + "loss": 0.5144, + "num_tokens": 544351607.0, + "step": 5783 + }, + { + "epoch": 0.9871991807475679, + "grad_norm": 0.45251588478193316, + "learning_rate": 5.188598736985835e-07, + "loss": 0.5047, + "num_tokens": 544440539.0, + "step": 5784 + }, + { + "epoch": 0.9873698583376003, + "grad_norm": 0.4288770163680816, + "learning_rate": 5.120327700972863e-07, + "loss": 0.5199, + "num_tokens": 544539395.0, + "step": 5785 + }, + { + "epoch": 0.9875405359276327, + "grad_norm": 0.4492201778042409, + "learning_rate": 5.052056664959891e-07, + "loss": 0.551, + "num_tokens": 544640403.0, + "step": 5786 + }, + { + "epoch": 0.9877112135176651, + "grad_norm": 0.42358607843684837, + "learning_rate": 4.983785628946919e-07, + "loss": 0.5075, + "num_tokens": 544742428.0, + "step": 5787 + }, + { + "epoch": 0.9878818911076975, + "grad_norm": 0.5008822842478289, + "learning_rate": 4.915514592933948e-07, + "loss": 0.4547, + "num_tokens": 544806439.0, + "step": 5788 + }, + { + "epoch": 0.98805256869773, + "grad_norm": 0.4435975279271749, + "learning_rate": 4.847243556920976e-07, + "loss": 0.6268, + "num_tokens": 544927035.0, + "step": 5789 + }, + { + "epoch": 0.9882232462877624, + "grad_norm": 0.4519129422274784, + "learning_rate": 4.778972520908006e-07, + "loss": 0.6297, + "num_tokens": 545042222.0, + "step": 5790 + }, + { + "epoch": 0.9883939238777949, + "grad_norm": 0.5710619897153714, + "learning_rate": 4.710701484895034e-07, + "loss": 0.5843, + "num_tokens": 545103651.0, + "step": 5791 + }, + { + "epoch": 0.9885646014678273, + "grad_norm": 0.46213765181675537, + "learning_rate": 4.6424304488820617e-07, + "loss": 0.4902, + "num_tokens": 545190924.0, + "step": 5792 + }, + { + "epoch": 0.9887352790578597, + "grad_norm": 0.5240770479956915, + "learning_rate": 4.5741594128690903e-07, + "loss": 0.6508, + "num_tokens": 545277409.0, + "step": 5793 + }, + { + "epoch": 0.9889059566478922, + "grad_norm": 0.38457019910662404, + "learning_rate": 4.505888376856119e-07, + "loss": 0.546, + "num_tokens": 545405529.0, + "step": 5794 + }, + { + "epoch": 0.9890766342379246, + "grad_norm": 0.4744382204003423, + "learning_rate": 4.4376173408431476e-07, + "loss": 0.4748, + "num_tokens": 545484944.0, + "step": 5795 + }, + { + "epoch": 0.989247311827957, + "grad_norm": 0.45362547116823715, + "learning_rate": 4.369346304830176e-07, + "loss": 0.6449, + "num_tokens": 545600678.0, + "step": 5796 + }, + { + "epoch": 0.9894179894179894, + "grad_norm": 0.42979674158179343, + "learning_rate": 4.301075268817205e-07, + "loss": 0.5401, + "num_tokens": 545707393.0, + "step": 5797 + }, + { + "epoch": 0.9895886670080218, + "grad_norm": 0.41168715165075287, + "learning_rate": 4.232804232804233e-07, + "loss": 0.5022, + "num_tokens": 545813567.0, + "step": 5798 + }, + { + "epoch": 0.9897593445980543, + "grad_norm": 0.47554267612887186, + "learning_rate": 4.1645331967912616e-07, + "loss": 0.486, + "num_tokens": 545886852.0, + "step": 5799 + }, + { + "epoch": 0.9899300221880867, + "grad_norm": 0.4934162549922142, + "learning_rate": 4.09626216077829e-07, + "loss": 0.5681, + "num_tokens": 545965519.0, + "step": 5800 + }, + { + "epoch": 0.9901006997781191, + "grad_norm": 0.45737253351999513, + "learning_rate": 4.027991124765319e-07, + "loss": 0.5346, + "num_tokens": 546057977.0, + "step": 5801 + }, + { + "epoch": 0.9902713773681515, + "grad_norm": 0.5084515806894331, + "learning_rate": 3.9597200887523475e-07, + "loss": 0.6432, + "num_tokens": 546151277.0, + "step": 5802 + }, + { + "epoch": 0.9904420549581839, + "grad_norm": 0.4364907527716057, + "learning_rate": 3.891449052739376e-07, + "loss": 0.4923, + "num_tokens": 546237212.0, + "step": 5803 + }, + { + "epoch": 0.9906127325482165, + "grad_norm": 0.4552953867831013, + "learning_rate": 3.8231780167264037e-07, + "loss": 0.5795, + "num_tokens": 546330410.0, + "step": 5804 + }, + { + "epoch": 0.9907834101382489, + "grad_norm": 0.5210594940351512, + "learning_rate": 3.7549069807134323e-07, + "loss": 0.4782, + "num_tokens": 546399925.0, + "step": 5805 + }, + { + "epoch": 0.9909540877282813, + "grad_norm": 0.4549062761605478, + "learning_rate": 3.686635944700461e-07, + "loss": 0.5242, + "num_tokens": 546493457.0, + "step": 5806 + }, + { + "epoch": 0.9911247653183137, + "grad_norm": 0.4825891441764143, + "learning_rate": 3.6183649086874896e-07, + "loss": 0.5735, + "num_tokens": 546580799.0, + "step": 5807 + }, + { + "epoch": 0.9912954429083461, + "grad_norm": 0.4869958287810855, + "learning_rate": 3.550093872674518e-07, + "loss": 0.5979, + "num_tokens": 546669277.0, + "step": 5808 + }, + { + "epoch": 0.9914661204983786, + "grad_norm": 0.4536960150733509, + "learning_rate": 3.481822836661547e-07, + "loss": 0.5801, + "num_tokens": 546766880.0, + "step": 5809 + }, + { + "epoch": 0.991636798088411, + "grad_norm": 0.4868874405663085, + "learning_rate": 3.413551800648575e-07, + "loss": 0.5912, + "num_tokens": 546848734.0, + "step": 5810 + }, + { + "epoch": 0.9918074756784434, + "grad_norm": 0.44502317355334464, + "learning_rate": 3.3452807646356035e-07, + "loss": 0.5233, + "num_tokens": 546946804.0, + "step": 5811 + }, + { + "epoch": 0.9919781532684758, + "grad_norm": 0.4619057757967984, + "learning_rate": 3.277009728622632e-07, + "loss": 0.6044, + "num_tokens": 547059590.0, + "step": 5812 + }, + { + "epoch": 0.9921488308585082, + "grad_norm": 0.4915887660059776, + "learning_rate": 3.208738692609661e-07, + "loss": 0.5442, + "num_tokens": 547151972.0, + "step": 5813 + }, + { + "epoch": 0.9923195084485407, + "grad_norm": 0.6255365390217545, + "learning_rate": 3.1404676565966894e-07, + "loss": 0.6096, + "num_tokens": 547215153.0, + "step": 5814 + }, + { + "epoch": 0.9924901860385731, + "grad_norm": 0.470822034125046, + "learning_rate": 3.0721966205837175e-07, + "loss": 0.5895, + "num_tokens": 547311532.0, + "step": 5815 + }, + { + "epoch": 0.9926608636286056, + "grad_norm": 0.4762608503265472, + "learning_rate": 3.003925584570746e-07, + "loss": 0.5878, + "num_tokens": 547399396.0, + "step": 5816 + }, + { + "epoch": 0.992831541218638, + "grad_norm": 0.47980906982732047, + "learning_rate": 2.935654548557774e-07, + "loss": 0.6335, + "num_tokens": 547499933.0, + "step": 5817 + }, + { + "epoch": 0.9930022188086705, + "grad_norm": 0.5657441826645131, + "learning_rate": 2.867383512544803e-07, + "loss": 0.629, + "num_tokens": 547574103.0, + "step": 5818 + }, + { + "epoch": 0.9931728963987029, + "grad_norm": 0.4444063312886428, + "learning_rate": 2.7991124765318315e-07, + "loss": 0.5804, + "num_tokens": 547672508.0, + "step": 5819 + }, + { + "epoch": 0.9933435739887353, + "grad_norm": 0.448718798103949, + "learning_rate": 2.73084144051886e-07, + "loss": 0.519, + "num_tokens": 547770260.0, + "step": 5820 + }, + { + "epoch": 0.9935142515787677, + "grad_norm": 0.4751972844234836, + "learning_rate": 2.662570404505889e-07, + "loss": 0.6257, + "num_tokens": 547868830.0, + "step": 5821 + }, + { + "epoch": 0.9936849291688001, + "grad_norm": 0.46057589833746626, + "learning_rate": 2.5942993684929174e-07, + "loss": 0.5535, + "num_tokens": 547964765.0, + "step": 5822 + }, + { + "epoch": 0.9938556067588326, + "grad_norm": 0.44560014091650313, + "learning_rate": 2.5260283324799455e-07, + "loss": 0.5509, + "num_tokens": 548058706.0, + "step": 5823 + }, + { + "epoch": 0.994026284348865, + "grad_norm": 0.43514683343912264, + "learning_rate": 2.457757296466974e-07, + "loss": 0.5677, + "num_tokens": 548164612.0, + "step": 5824 + }, + { + "epoch": 0.9941969619388974, + "grad_norm": 0.48262238747703023, + "learning_rate": 2.389486260454003e-07, + "loss": 0.4868, + "num_tokens": 548241160.0, + "step": 5825 + }, + { + "epoch": 0.9943676395289298, + "grad_norm": 0.48249569278875876, + "learning_rate": 2.3212152244410309e-07, + "loss": 0.6833, + "num_tokens": 548350186.0, + "step": 5826 + }, + { + "epoch": 0.9945383171189622, + "grad_norm": 0.4552530328454943, + "learning_rate": 2.2529441884280595e-07, + "loss": 0.5533, + "num_tokens": 548440686.0, + "step": 5827 + }, + { + "epoch": 0.9947089947089947, + "grad_norm": 0.4721088305117028, + "learning_rate": 2.184673152415088e-07, + "loss": 0.5155, + "num_tokens": 548519817.0, + "step": 5828 + }, + { + "epoch": 0.9948796722990272, + "grad_norm": 0.45953226419462434, + "learning_rate": 2.1164021164021165e-07, + "loss": 0.4888, + "num_tokens": 548608305.0, + "step": 5829 + }, + { + "epoch": 0.9950503498890596, + "grad_norm": 0.4932556194701304, + "learning_rate": 2.048131080389145e-07, + "loss": 0.557, + "num_tokens": 548692143.0, + "step": 5830 + }, + { + "epoch": 0.995221027479092, + "grad_norm": 0.40105844015861297, + "learning_rate": 1.9798600443761737e-07, + "loss": 0.5344, + "num_tokens": 548808239.0, + "step": 5831 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4572615492244303, + "learning_rate": 1.9115890083632018e-07, + "loss": 0.5222, + "num_tokens": 548895983.0, + "step": 5832 + }, + { + "epoch": 0.9955623826591569, + "grad_norm": 0.42537586759942986, + "learning_rate": 1.8433179723502305e-07, + "loss": 0.535, + "num_tokens": 549008709.0, + "step": 5833 + }, + { + "epoch": 0.9957330602491893, + "grad_norm": 0.4111588633060106, + "learning_rate": 1.775046936337259e-07, + "loss": 0.5503, + "num_tokens": 549119419.0, + "step": 5834 + }, + { + "epoch": 0.9959037378392217, + "grad_norm": 0.4740006195035117, + "learning_rate": 1.7067759003242875e-07, + "loss": 0.5599, + "num_tokens": 549208768.0, + "step": 5835 + }, + { + "epoch": 0.9960744154292541, + "grad_norm": 0.4680097802183672, + "learning_rate": 1.638504864311316e-07, + "loss": 0.6226, + "num_tokens": 549336406.0, + "step": 5836 + }, + { + "epoch": 0.9962450930192865, + "grad_norm": 0.44807482137235505, + "learning_rate": 1.5702338282983447e-07, + "loss": 0.4866, + "num_tokens": 549427878.0, + "step": 5837 + }, + { + "epoch": 0.996415770609319, + "grad_norm": 0.4510415718016738, + "learning_rate": 1.501962792285373e-07, + "loss": 0.5399, + "num_tokens": 549524270.0, + "step": 5838 + }, + { + "epoch": 0.9965864481993514, + "grad_norm": 0.5170634447715512, + "learning_rate": 1.4336917562724014e-07, + "loss": 0.5634, + "num_tokens": 549593266.0, + "step": 5839 + }, + { + "epoch": 0.9967571257893838, + "grad_norm": 0.4261840906680157, + "learning_rate": 1.36542072025943e-07, + "loss": 0.5331, + "num_tokens": 549699041.0, + "step": 5840 + }, + { + "epoch": 0.9969278033794163, + "grad_norm": 0.4794547615538534, + "learning_rate": 1.2971496842464587e-07, + "loss": 0.6273, + "num_tokens": 549794125.0, + "step": 5841 + }, + { + "epoch": 0.9970984809694488, + "grad_norm": 0.47174951341644733, + "learning_rate": 1.228878648233487e-07, + "loss": 0.5615, + "num_tokens": 549879603.0, + "step": 5842 + }, + { + "epoch": 0.9972691585594812, + "grad_norm": 0.45584229427372397, + "learning_rate": 1.1606076122205154e-07, + "loss": 0.5502, + "num_tokens": 549969185.0, + "step": 5843 + }, + { + "epoch": 0.9974398361495136, + "grad_norm": 0.47408926476678104, + "learning_rate": 1.092336576207544e-07, + "loss": 0.4942, + "num_tokens": 550050967.0, + "step": 5844 + }, + { + "epoch": 0.997610513739546, + "grad_norm": 0.5147101847206296, + "learning_rate": 1.0240655401945726e-07, + "loss": 0.6703, + "num_tokens": 550137084.0, + "step": 5845 + }, + { + "epoch": 0.9977811913295784, + "grad_norm": 0.4030430309754854, + "learning_rate": 9.557945041816009e-08, + "loss": 0.5217, + "num_tokens": 550250609.0, + "step": 5846 + }, + { + "epoch": 0.9979518689196109, + "grad_norm": 0.4289449212846011, + "learning_rate": 8.875234681686295e-08, + "loss": 0.5395, + "num_tokens": 550353200.0, + "step": 5847 + }, + { + "epoch": 0.9981225465096433, + "grad_norm": 0.5082682750568082, + "learning_rate": 8.19252432155658e-08, + "loss": 0.5302, + "num_tokens": 550431497.0, + "step": 5848 + }, + { + "epoch": 0.9982932240996757, + "grad_norm": 0.49830500043947556, + "learning_rate": 7.509813961426865e-08, + "loss": 0.5426, + "num_tokens": 550506850.0, + "step": 5849 + }, + { + "epoch": 0.9984639016897081, + "grad_norm": 0.4512741565282109, + "learning_rate": 6.82710360129715e-08, + "loss": 0.5019, + "num_tokens": 550592752.0, + "step": 5850 + }, + { + "epoch": 0.9986345792797405, + "grad_norm": 0.44212043423918934, + "learning_rate": 6.144393241167435e-08, + "loss": 0.5574, + "num_tokens": 550689952.0, + "step": 5851 + }, + { + "epoch": 0.998805256869773, + "grad_norm": 0.4464405135724566, + "learning_rate": 5.46168288103772e-08, + "loss": 0.5864, + "num_tokens": 550797460.0, + "step": 5852 + }, + { + "epoch": 0.9989759344598055, + "grad_norm": 0.39251959885722437, + "learning_rate": 4.7789725209080046e-08, + "loss": 0.5537, + "num_tokens": 550926143.0, + "step": 5853 + }, + { + "epoch": 0.9991466120498379, + "grad_norm": 0.429358694610898, + "learning_rate": 4.09626216077829e-08, + "loss": 0.5022, + "num_tokens": 551020494.0, + "step": 5854 + }, + { + "epoch": 0.9993172896398703, + "grad_norm": 0.5129746771239546, + "learning_rate": 3.413551800648575e-08, + "loss": 0.612, + "num_tokens": 551096687.0, + "step": 5855 + }, + { + "epoch": 0.9994879672299027, + "grad_norm": 0.47713258367504513, + "learning_rate": 2.73084144051886e-08, + "loss": 0.5658, + "num_tokens": 551186015.0, + "step": 5856 + }, + { + "epoch": 0.9996586448199352, + "grad_norm": 0.43080212263534023, + "learning_rate": 2.048131080389145e-08, + "loss": 0.5923, + "num_tokens": 551297060.0, + "step": 5857 + }, + { + "epoch": 0.9998293224099676, + "grad_norm": 0.4522964254123883, + "learning_rate": 1.36542072025943e-08, + "loss": 0.6076, + "num_tokens": 551403791.0, + "step": 5858 + }, + { + "epoch": 1.0, + "grad_norm": 0.4695761395698389, + "learning_rate": 6.82710360129715e-09, + "loss": 0.499, + "num_tokens": 551458209.0, + "step": 5859 + }, + { + "epoch": 1.0, + "step": 5859, + "total_flos": 438086667841536.0, + "train_loss": 0.610091478802438, + "train_runtime": 7786.7341, + "train_samples_per_second": 12.038, + "train_steps_per_second": 0.752 + } + ], + "logging_steps": 1.0, + "max_steps": 5859, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 438086667841536.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}