diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10043 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.7526811361312866, + "learning_rate": 4.999992104320636e-05, + "loss": 1.0428, + "num_input_tokens_seen": 26624, + "step": 5 + }, + { + "epoch": 0.0016, + "grad_norm": 0.5957724452018738, + "learning_rate": 4.999968417332415e-05, + "loss": 1.1061, + "num_input_tokens_seen": 51408, + "step": 10 + }, + { + "epoch": 0.0024, + "grad_norm": 0.8826403021812439, + "learning_rate": 4.999928939184958e-05, + "loss": 1.0426, + "num_input_tokens_seen": 75040, + "step": 15 + }, + { + "epoch": 0.0032, + "grad_norm": 0.6897421479225159, + "learning_rate": 4.9998736701276295e-05, + "loss": 1.1472, + "num_input_tokens_seen": 98848, + "step": 20 + }, + { + "epoch": 0.004, + "grad_norm": 0.7432862520217896, + "learning_rate": 4.9998026105095405e-05, + "loss": 0.9781, + "num_input_tokens_seen": 123104, + "step": 25 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9830231666564941, + "learning_rate": 4.999715760779541e-05, + "loss": 0.9889, + "num_input_tokens_seen": 152144, + "step": 30 + }, + { + "epoch": 0.0056, + "grad_norm": 0.9177331924438477, + "learning_rate": 4.999613121486222e-05, + "loss": 0.9345, + "num_input_tokens_seen": 177216, + "step": 35 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6646199822425842, + "learning_rate": 4.999494693277907e-05, + "loss": 0.8539, + "num_input_tokens_seen": 203152, + "step": 40 + }, + { + "epoch": 0.0072, + "grad_norm": 0.5822590589523315, + "learning_rate": 4.999360476902656e-05, + "loss": 0.9183, + "num_input_tokens_seen": 233568, + "step": 45 + }, + { + "epoch": 0.008, + "grad_norm": 0.7686595916748047, + "learning_rate": 4.99921047320825e-05, + "loss": 0.873, + "num_input_tokens_seen": 264752, + "step": 50 + }, + { + "epoch": 0.0088, + "grad_norm": 0.729837954044342, + "learning_rate": 4.9990446831421955e-05, + "loss": 0.8676, + "num_input_tokens_seen": 291040, + "step": 55 + }, + { + "epoch": 0.0096, + "grad_norm": 0.9523835778236389, + "learning_rate": 4.998863107751711e-05, + "loss": 0.9004, + "num_input_tokens_seen": 321760, + "step": 60 + }, + { + "epoch": 0.0104, + "grad_norm": 0.6720367670059204, + "learning_rate": 4.9986657481837277e-05, + "loss": 0.8536, + "num_input_tokens_seen": 347168, + "step": 65 + }, + { + "epoch": 0.0112, + "grad_norm": 0.4336840808391571, + "learning_rate": 4.998452605684874e-05, + "loss": 0.8027, + "num_input_tokens_seen": 373888, + "step": 70 + }, + { + "epoch": 0.012, + "grad_norm": 0.808559238910675, + "learning_rate": 4.998223681601473e-05, + "loss": 0.8075, + "num_input_tokens_seen": 398752, + "step": 75 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5663979053497314, + "learning_rate": 4.997978977379536e-05, + "loss": 0.7919, + "num_input_tokens_seen": 421344, + "step": 80 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5677878260612488, + "learning_rate": 4.9977184945647473e-05, + "loss": 0.7512, + "num_input_tokens_seen": 451296, + "step": 85 + }, + { + "epoch": 0.0144, + "grad_norm": 0.674132227897644, + "learning_rate": 4.997442234802456e-05, + "loss": 0.7713, + "num_input_tokens_seen": 482416, + "step": 90 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5088427662849426, + "learning_rate": 4.997150199837671e-05, + "loss": 0.7965, + "num_input_tokens_seen": 513008, + "step": 95 + }, + { + "epoch": 0.016, + "grad_norm": 0.6657032370567322, + "learning_rate": 4.996842391515044e-05, + "loss": 0.8623, + "num_input_tokens_seen": 537984, + "step": 100 + }, + { + "epoch": 0.0168, + "grad_norm": 0.6862130761146545, + "learning_rate": 4.996518811778858e-05, + "loss": 0.7797, + "num_input_tokens_seen": 564528, + "step": 105 + }, + { + "epoch": 0.0176, + "grad_norm": 0.6449868083000183, + "learning_rate": 4.99617946267302e-05, + "loss": 0.7732, + "num_input_tokens_seen": 588608, + "step": 110 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5512914657592773, + "learning_rate": 4.9958243463410414e-05, + "loss": 0.7478, + "num_input_tokens_seen": 620752, + "step": 115 + }, + { + "epoch": 0.0192, + "grad_norm": 0.7411808371543884, + "learning_rate": 4.995453465026032e-05, + "loss": 0.7194, + "num_input_tokens_seen": 649200, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.9926447868347168, + "learning_rate": 4.995066821070679e-05, + "loss": 0.7506, + "num_input_tokens_seen": 679200, + "step": 125 + }, + { + "epoch": 0.0208, + "grad_norm": 0.7455246448516846, + "learning_rate": 4.9946644169172355e-05, + "loss": 0.6886, + "num_input_tokens_seen": 702144, + "step": 130 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5429201126098633, + "learning_rate": 4.9942462551075056e-05, + "loss": 0.8481, + "num_input_tokens_seen": 730128, + "step": 135 + }, + { + "epoch": 0.0224, + "grad_norm": 0.49708107113838196, + "learning_rate": 4.993812338282826e-05, + "loss": 0.7999, + "num_input_tokens_seen": 757248, + "step": 140 + }, + { + "epoch": 0.0232, + "grad_norm": 0.6150819063186646, + "learning_rate": 4.993362669184051e-05, + "loss": 0.7877, + "num_input_tokens_seen": 786096, + "step": 145 + }, + { + "epoch": 0.024, + "grad_norm": 0.6751016974449158, + "learning_rate": 4.992897250651535e-05, + "loss": 0.9312, + "num_input_tokens_seen": 814192, + "step": 150 + }, + { + "epoch": 0.0248, + "grad_norm": 0.6245042085647583, + "learning_rate": 4.992416085625115e-05, + "loss": 0.8767, + "num_input_tokens_seen": 840144, + "step": 155 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5385093688964844, + "learning_rate": 4.9919191771440905e-05, + "loss": 0.8646, + "num_input_tokens_seen": 870368, + "step": 160 + }, + { + "epoch": 0.0264, + "grad_norm": 0.5674800276756287, + "learning_rate": 4.991406528347206e-05, + "loss": 0.7159, + "num_input_tokens_seen": 897296, + "step": 165 + }, + { + "epoch": 0.0272, + "grad_norm": 0.3683065176010132, + "learning_rate": 4.990878142472628e-05, + "loss": 0.7573, + "num_input_tokens_seen": 924576, + "step": 170 + }, + { + "epoch": 0.028, + "grad_norm": 0.5186157822608948, + "learning_rate": 4.990334022857932e-05, + "loss": 0.8083, + "num_input_tokens_seen": 945856, + "step": 175 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5749205350875854, + "learning_rate": 4.9897741729400705e-05, + "loss": 0.7074, + "num_input_tokens_seen": 970416, + "step": 180 + }, + { + "epoch": 0.0296, + "grad_norm": 0.5518808960914612, + "learning_rate": 4.9891985962553606e-05, + "loss": 0.7709, + "num_input_tokens_seen": 994288, + "step": 185 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5436009764671326, + "learning_rate": 4.988607296439458e-05, + "loss": 0.7, + "num_input_tokens_seen": 1029872, + "step": 190 + }, + { + "epoch": 0.0312, + "grad_norm": 0.4599573612213135, + "learning_rate": 4.988000277227334e-05, + "loss": 0.8251, + "num_input_tokens_seen": 1061072, + "step": 195 + }, + { + "epoch": 0.032, + "grad_norm": 0.4540678858757019, + "learning_rate": 4.987377542453251e-05, + "loss": 0.6707, + "num_input_tokens_seen": 1089312, + "step": 200 + }, + { + "epoch": 0.0328, + "grad_norm": 0.5874660611152649, + "learning_rate": 4.98673909605074e-05, + "loss": 0.7264, + "num_input_tokens_seen": 1114272, + "step": 205 + }, + { + "epoch": 0.0336, + "grad_norm": 0.6792047619819641, + "learning_rate": 4.9860849420525766e-05, + "loss": 0.7906, + "num_input_tokens_seen": 1139808, + "step": 210 + }, + { + "epoch": 0.0344, + "grad_norm": 0.5740874409675598, + "learning_rate": 4.985415084590752e-05, + "loss": 0.8062, + "num_input_tokens_seen": 1163072, + "step": 215 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5089894533157349, + "learning_rate": 4.9847295278964514e-05, + "loss": 0.7432, + "num_input_tokens_seen": 1193936, + "step": 220 + }, + { + "epoch": 0.036, + "grad_norm": 0.7231270670890808, + "learning_rate": 4.984028276300021e-05, + "loss": 0.7586, + "num_input_tokens_seen": 1219696, + "step": 225 + }, + { + "epoch": 0.0368, + "grad_norm": 0.6494696140289307, + "learning_rate": 4.98331133423095e-05, + "loss": 0.7532, + "num_input_tokens_seen": 1248096, + "step": 230 + }, + { + "epoch": 0.0376, + "grad_norm": 0.6063010692596436, + "learning_rate": 4.9825787062178315e-05, + "loss": 0.786, + "num_input_tokens_seen": 1276624, + "step": 235 + }, + { + "epoch": 0.0384, + "grad_norm": 0.8775933384895325, + "learning_rate": 4.981830396888344e-05, + "loss": 0.7947, + "num_input_tokens_seen": 1303472, + "step": 240 + }, + { + "epoch": 0.0392, + "grad_norm": 0.7558068633079529, + "learning_rate": 4.981066410969215e-05, + "loss": 0.6988, + "num_input_tokens_seen": 1326816, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 0.8880596160888672, + "learning_rate": 4.980286753286195e-05, + "loss": 0.7078, + "num_input_tokens_seen": 1351008, + "step": 250 + }, + { + "epoch": 0.0408, + "grad_norm": 0.661428689956665, + "learning_rate": 4.979491428764026e-05, + "loss": 0.7491, + "num_input_tokens_seen": 1374656, + "step": 255 + }, + { + "epoch": 0.0416, + "grad_norm": 0.7624719738960266, + "learning_rate": 4.9786804424264085e-05, + "loss": 0.75, + "num_input_tokens_seen": 1399264, + "step": 260 + }, + { + "epoch": 0.0424, + "grad_norm": 0.6995192170143127, + "learning_rate": 4.977853799395976e-05, + "loss": 0.798, + "num_input_tokens_seen": 1422304, + "step": 265 + }, + { + "epoch": 0.0432, + "grad_norm": 0.5227561593055725, + "learning_rate": 4.977011504894252e-05, + "loss": 0.8814, + "num_input_tokens_seen": 1447184, + "step": 270 + }, + { + "epoch": 0.044, + "grad_norm": 0.7046292424201965, + "learning_rate": 4.976153564241628e-05, + "loss": 0.7203, + "num_input_tokens_seen": 1474304, + "step": 275 + }, + { + "epoch": 0.0448, + "grad_norm": 0.7567644119262695, + "learning_rate": 4.975279982857324e-05, + "loss": 0.6936, + "num_input_tokens_seen": 1500896, + "step": 280 + }, + { + "epoch": 0.0456, + "grad_norm": 0.6787880063056946, + "learning_rate": 4.9743907662593524e-05, + "loss": 0.7872, + "num_input_tokens_seen": 1528688, + "step": 285 + }, + { + "epoch": 0.0464, + "grad_norm": 0.5113949775695801, + "learning_rate": 4.9734859200644905e-05, + "loss": 0.7517, + "num_input_tokens_seen": 1561328, + "step": 290 + }, + { + "epoch": 0.0472, + "grad_norm": 0.7206217050552368, + "learning_rate": 4.972565449988239e-05, + "loss": 0.6726, + "num_input_tokens_seen": 1589088, + "step": 295 + }, + { + "epoch": 0.048, + "grad_norm": 0.602922797203064, + "learning_rate": 4.971629361844785e-05, + "loss": 0.7259, + "num_input_tokens_seen": 1615712, + "step": 300 + }, + { + "epoch": 0.0488, + "grad_norm": 0.7673738598823547, + "learning_rate": 4.9706776615469716e-05, + "loss": 0.8337, + "num_input_tokens_seen": 1638640, + "step": 305 + }, + { + "epoch": 0.0496, + "grad_norm": 0.7302682995796204, + "learning_rate": 4.9697103551062556e-05, + "loss": 0.731, + "num_input_tokens_seen": 1664304, + "step": 310 + }, + { + "epoch": 0.0504, + "grad_norm": 0.45416679978370667, + "learning_rate": 4.968727448632669e-05, + "loss": 0.7285, + "num_input_tokens_seen": 1697648, + "step": 315 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5968911051750183, + "learning_rate": 4.967728948334784e-05, + "loss": 0.723, + "num_input_tokens_seen": 1726608, + "step": 320 + }, + { + "epoch": 0.052, + "grad_norm": 0.6134063601493835, + "learning_rate": 4.96671486051967e-05, + "loss": 0.7918, + "num_input_tokens_seen": 1750912, + "step": 325 + }, + { + "epoch": 0.0528, + "grad_norm": 0.5388225317001343, + "learning_rate": 4.965685191592859e-05, + "loss": 0.6448, + "num_input_tokens_seen": 1782912, + "step": 330 + }, + { + "epoch": 0.0536, + "grad_norm": 0.6615162491798401, + "learning_rate": 4.964639948058297e-05, + "loss": 0.7874, + "num_input_tokens_seen": 1804704, + "step": 335 + }, + { + "epoch": 0.0544, + "grad_norm": 0.8656606078147888, + "learning_rate": 4.963579136518312e-05, + "loss": 0.7025, + "num_input_tokens_seen": 1827248, + "step": 340 + }, + { + "epoch": 0.0552, + "grad_norm": 0.7784980535507202, + "learning_rate": 4.962502763673565e-05, + "loss": 0.6676, + "num_input_tokens_seen": 1854304, + "step": 345 + }, + { + "epoch": 0.056, + "grad_norm": 0.847607433795929, + "learning_rate": 4.9614108363230135e-05, + "loss": 0.7774, + "num_input_tokens_seen": 1878768, + "step": 350 + }, + { + "epoch": 0.0568, + "grad_norm": 0.5412896275520325, + "learning_rate": 4.9603033613638626e-05, + "loss": 0.7641, + "num_input_tokens_seen": 1905744, + "step": 355 + }, + { + "epoch": 0.0576, + "grad_norm": 0.5192331671714783, + "learning_rate": 4.959180345791528e-05, + "loss": 0.7169, + "num_input_tokens_seen": 1931392, + "step": 360 + }, + { + "epoch": 0.0584, + "grad_norm": 0.7992143630981445, + "learning_rate": 4.958041796699583e-05, + "loss": 0.7033, + "num_input_tokens_seen": 1954304, + "step": 365 + }, + { + "epoch": 0.0592, + "grad_norm": 0.49692437052726746, + "learning_rate": 4.956887721279726e-05, + "loss": 0.6569, + "num_input_tokens_seen": 1987264, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 0.7032391428947449, + "learning_rate": 4.9557181268217227e-05, + "loss": 0.7809, + "num_input_tokens_seen": 2010160, + "step": 375 + }, + { + "epoch": 0.0608, + "grad_norm": 0.780989944934845, + "learning_rate": 4.9545330207133664e-05, + "loss": 0.811, + "num_input_tokens_seen": 2038880, + "step": 380 + }, + { + "epoch": 0.0616, + "grad_norm": 0.819433867931366, + "learning_rate": 4.953332410440435e-05, + "loss": 0.825, + "num_input_tokens_seen": 2065344, + "step": 385 + }, + { + "epoch": 0.0624, + "grad_norm": 0.7076752781867981, + "learning_rate": 4.952116303586631e-05, + "loss": 0.7479, + "num_input_tokens_seen": 2092064, + "step": 390 + }, + { + "epoch": 0.0632, + "grad_norm": 0.6264218688011169, + "learning_rate": 4.9508847078335495e-05, + "loss": 0.7246, + "num_input_tokens_seen": 2119360, + "step": 395 + }, + { + "epoch": 0.064, + "grad_norm": 0.5829480290412903, + "learning_rate": 4.949637630960617e-05, + "loss": 0.6956, + "num_input_tokens_seen": 2146560, + "step": 400 + }, + { + "epoch": 0.0648, + "grad_norm": 0.5653419494628906, + "learning_rate": 4.94837508084505e-05, + "loss": 0.7315, + "num_input_tokens_seen": 2169232, + "step": 405 + }, + { + "epoch": 0.0656, + "grad_norm": 1.0192047357559204, + "learning_rate": 4.947097065461801e-05, + "loss": 0.7075, + "num_input_tokens_seen": 2192224, + "step": 410 + }, + { + "epoch": 0.0664, + "grad_norm": 0.7392141819000244, + "learning_rate": 4.945803592883509e-05, + "loss": 0.811, + "num_input_tokens_seen": 2216784, + "step": 415 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6470807194709778, + "learning_rate": 4.9444946712804494e-05, + "loss": 0.7835, + "num_input_tokens_seen": 2243120, + "step": 420 + }, + { + "epoch": 0.068, + "grad_norm": 0.5305742025375366, + "learning_rate": 4.943170308920484e-05, + "loss": 0.7211, + "num_input_tokens_seen": 2270896, + "step": 425 + }, + { + "epoch": 0.0688, + "grad_norm": 0.8647666573524475, + "learning_rate": 4.941830514169004e-05, + "loss": 0.72, + "num_input_tokens_seen": 2298528, + "step": 430 + }, + { + "epoch": 0.0696, + "grad_norm": 0.6244668364524841, + "learning_rate": 4.9404752954888824e-05, + "loss": 0.7206, + "num_input_tokens_seen": 2328080, + "step": 435 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6552883386611938, + "learning_rate": 4.939104661440415e-05, + "loss": 0.8018, + "num_input_tokens_seen": 2355776, + "step": 440 + }, + { + "epoch": 0.0712, + "grad_norm": 0.8276055455207825, + "learning_rate": 4.937718620681273e-05, + "loss": 0.8267, + "num_input_tokens_seen": 2379056, + "step": 445 + }, + { + "epoch": 0.072, + "grad_norm": 0.6930189728736877, + "learning_rate": 4.9363171819664434e-05, + "loss": 0.8961, + "num_input_tokens_seen": 2401664, + "step": 450 + }, + { + "epoch": 0.0728, + "grad_norm": 0.7441433668136597, + "learning_rate": 4.934900354148173e-05, + "loss": 0.6942, + "num_input_tokens_seen": 2427456, + "step": 455 + }, + { + "epoch": 0.0736, + "grad_norm": 0.5929616093635559, + "learning_rate": 4.933468146175918e-05, + "loss": 0.7874, + "num_input_tokens_seen": 2450752, + "step": 460 + }, + { + "epoch": 0.0744, + "grad_norm": 0.5789006948471069, + "learning_rate": 4.9320205670962814e-05, + "loss": 0.7162, + "num_input_tokens_seen": 2473856, + "step": 465 + }, + { + "epoch": 0.0752, + "grad_norm": 0.6359069347381592, + "learning_rate": 4.9305576260529607e-05, + "loss": 0.7434, + "num_input_tokens_seen": 2502928, + "step": 470 + }, + { + "epoch": 0.076, + "grad_norm": 0.6155191659927368, + "learning_rate": 4.929079332286685e-05, + "loss": 0.6932, + "num_input_tokens_seen": 2536144, + "step": 475 + }, + { + "epoch": 0.0768, + "grad_norm": 0.6511387228965759, + "learning_rate": 4.927585695135162e-05, + "loss": 0.8053, + "num_input_tokens_seen": 2562688, + "step": 480 + }, + { + "epoch": 0.0776, + "grad_norm": 0.5791414976119995, + "learning_rate": 4.926076724033016e-05, + "loss": 0.7482, + "num_input_tokens_seen": 2594480, + "step": 485 + }, + { + "epoch": 0.0784, + "grad_norm": 0.5258495807647705, + "learning_rate": 4.9245524285117274e-05, + "loss": 0.7075, + "num_input_tokens_seen": 2624736, + "step": 490 + }, + { + "epoch": 0.0792, + "grad_norm": 0.5191717743873596, + "learning_rate": 4.923012818199576e-05, + "loss": 0.6132, + "num_input_tokens_seen": 2648880, + "step": 495 + }, + { + "epoch": 0.08, + "grad_norm": 0.8281647562980652, + "learning_rate": 4.9214579028215776e-05, + "loss": 0.6679, + "num_input_tokens_seen": 2675888, + "step": 500 + }, + { + "epoch": 0.0808, + "grad_norm": 0.588010847568512, + "learning_rate": 4.919887692199423e-05, + "loss": 0.7016, + "num_input_tokens_seen": 2699392, + "step": 505 + }, + { + "epoch": 0.0816, + "grad_norm": 0.8409311771392822, + "learning_rate": 4.918302196251415e-05, + "loss": 0.7216, + "num_input_tokens_seen": 2726432, + "step": 510 + }, + { + "epoch": 0.0824, + "grad_norm": 0.6029579639434814, + "learning_rate": 4.9167014249924075e-05, + "loss": 0.6602, + "num_input_tokens_seen": 2756336, + "step": 515 + }, + { + "epoch": 0.0832, + "grad_norm": 0.7269614934921265, + "learning_rate": 4.9150853885337426e-05, + "loss": 0.6956, + "num_input_tokens_seen": 2781648, + "step": 520 + }, + { + "epoch": 0.084, + "grad_norm": 0.5419861674308777, + "learning_rate": 4.913454097083185e-05, + "loss": 0.6427, + "num_input_tokens_seen": 2810336, + "step": 525 + }, + { + "epoch": 0.0848, + "grad_norm": 0.9006750583648682, + "learning_rate": 4.911807560944858e-05, + "loss": 0.8328, + "num_input_tokens_seen": 2836432, + "step": 530 + }, + { + "epoch": 0.0856, + "grad_norm": 0.7180121541023254, + "learning_rate": 4.9101457905191774e-05, + "loss": 0.8104, + "num_input_tokens_seen": 2863616, + "step": 535 + }, + { + "epoch": 0.0864, + "grad_norm": 0.6724479794502258, + "learning_rate": 4.9084687963027894e-05, + "loss": 0.6858, + "num_input_tokens_seen": 2891264, + "step": 540 + }, + { + "epoch": 0.0872, + "grad_norm": 0.7073305249214172, + "learning_rate": 4.906776588888502e-05, + "loss": 0.7271, + "num_input_tokens_seen": 2916256, + "step": 545 + }, + { + "epoch": 0.088, + "grad_norm": 0.7945154309272766, + "learning_rate": 4.905069178965215e-05, + "loss": 0.7527, + "num_input_tokens_seen": 2944112, + "step": 550 + }, + { + "epoch": 0.0888, + "grad_norm": 0.5791934728622437, + "learning_rate": 4.903346577317859e-05, + "loss": 0.7341, + "num_input_tokens_seen": 2972512, + "step": 555 + }, + { + "epoch": 0.0896, + "grad_norm": 0.8222031593322754, + "learning_rate": 4.90160879482732e-05, + "loss": 0.7339, + "num_input_tokens_seen": 2997168, + "step": 560 + }, + { + "epoch": 0.0904, + "grad_norm": 0.6719418168067932, + "learning_rate": 4.89985584247038e-05, + "loss": 0.768, + "num_input_tokens_seen": 3020880, + "step": 565 + }, + { + "epoch": 0.0912, + "grad_norm": 0.7740746140480042, + "learning_rate": 4.898087731319636e-05, + "loss": 0.7014, + "num_input_tokens_seen": 3044224, + "step": 570 + }, + { + "epoch": 0.092, + "grad_norm": 0.5642164945602417, + "learning_rate": 4.89630447254344e-05, + "loss": 0.6203, + "num_input_tokens_seen": 3071680, + "step": 575 + }, + { + "epoch": 0.0928, + "grad_norm": 1.4719825983047485, + "learning_rate": 4.894506077405824e-05, + "loss": 0.7461, + "num_input_tokens_seen": 3099088, + "step": 580 + }, + { + "epoch": 0.0936, + "grad_norm": 0.6961272954940796, + "learning_rate": 4.892692557266429e-05, + "loss": 0.7357, + "num_input_tokens_seen": 3127728, + "step": 585 + }, + { + "epoch": 0.0944, + "grad_norm": 0.686820924282074, + "learning_rate": 4.8908639235804324e-05, + "loss": 0.7819, + "num_input_tokens_seen": 3154336, + "step": 590 + }, + { + "epoch": 0.0952, + "grad_norm": 0.7145109176635742, + "learning_rate": 4.8890201878984796e-05, + "loss": 0.7121, + "num_input_tokens_seen": 3178768, + "step": 595 + }, + { + "epoch": 0.096, + "grad_norm": 0.6159213781356812, + "learning_rate": 4.887161361866608e-05, + "loss": 0.6698, + "num_input_tokens_seen": 3211968, + "step": 600 + }, + { + "epoch": 0.0968, + "grad_norm": 0.8054212927818298, + "learning_rate": 4.885287457226172e-05, + "loss": 0.7606, + "num_input_tokens_seen": 3238272, + "step": 605 + }, + { + "epoch": 0.0976, + "grad_norm": 1.1526386737823486, + "learning_rate": 4.8833984858137715e-05, + "loss": 0.7694, + "num_input_tokens_seen": 3270208, + "step": 610 + }, + { + "epoch": 0.0984, + "grad_norm": 0.5728780031204224, + "learning_rate": 4.8814944595611776e-05, + "loss": 0.7227, + "num_input_tokens_seen": 3296192, + "step": 615 + }, + { + "epoch": 0.0992, + "grad_norm": 0.6360820531845093, + "learning_rate": 4.8795753904952534e-05, + "loss": 0.7275, + "num_input_tokens_seen": 3321232, + "step": 620 + }, + { + "epoch": 0.1, + "grad_norm": 0.6169213056564331, + "learning_rate": 4.877641290737884e-05, + "loss": 0.6746, + "num_input_tokens_seen": 3343984, + "step": 625 + }, + { + "epoch": 0.1008, + "grad_norm": 0.8000876307487488, + "learning_rate": 4.8756921725058934e-05, + "loss": 0.8223, + "num_input_tokens_seen": 3367824, + "step": 630 + }, + { + "epoch": 0.1016, + "grad_norm": 0.5983218550682068, + "learning_rate": 4.8737280481109724e-05, + "loss": 0.8487, + "num_input_tokens_seen": 3394800, + "step": 635 + }, + { + "epoch": 0.1024, + "grad_norm": 0.9402346014976501, + "learning_rate": 4.871748929959598e-05, + "loss": 0.7441, + "num_input_tokens_seen": 3421360, + "step": 640 + }, + { + "epoch": 0.1032, + "grad_norm": 0.6266387104988098, + "learning_rate": 4.869754830552956e-05, + "loss": 0.7631, + "num_input_tokens_seen": 3449584, + "step": 645 + }, + { + "epoch": 0.104, + "grad_norm": 0.7829232215881348, + "learning_rate": 4.867745762486861e-05, + "loss": 0.7793, + "num_input_tokens_seen": 3477168, + "step": 650 + }, + { + "epoch": 0.1048, + "grad_norm": 0.7125943303108215, + "learning_rate": 4.86572173845168e-05, + "loss": 0.7415, + "num_input_tokens_seen": 3505056, + "step": 655 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6520003080368042, + "learning_rate": 4.863682771232248e-05, + "loss": 0.7157, + "num_input_tokens_seen": 3534576, + "step": 660 + }, + { + "epoch": 0.1064, + "grad_norm": 0.5907071828842163, + "learning_rate": 4.861628873707792e-05, + "loss": 0.7287, + "num_input_tokens_seen": 3560688, + "step": 665 + }, + { + "epoch": 0.1072, + "grad_norm": 0.8829016089439392, + "learning_rate": 4.859560058851844e-05, + "loss": 0.7351, + "num_input_tokens_seen": 3586176, + "step": 670 + }, + { + "epoch": 0.108, + "grad_norm": 0.917322039604187, + "learning_rate": 4.8574763397321614e-05, + "loss": 0.6213, + "num_input_tokens_seen": 3615472, + "step": 675 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6344768404960632, + "learning_rate": 4.855377729510648e-05, + "loss": 0.729, + "num_input_tokens_seen": 3638256, + "step": 680 + }, + { + "epoch": 0.1096, + "grad_norm": 0.7305799722671509, + "learning_rate": 4.8532642414432674e-05, + "loss": 0.7242, + "num_input_tokens_seen": 3667824, + "step": 685 + }, + { + "epoch": 0.1104, + "grad_norm": 0.7569555044174194, + "learning_rate": 4.851135888879958e-05, + "loss": 0.7831, + "num_input_tokens_seen": 3695408, + "step": 690 + }, + { + "epoch": 0.1112, + "grad_norm": 0.7566932439804077, + "learning_rate": 4.8489926852645505e-05, + "loss": 0.7181, + "num_input_tokens_seen": 3719888, + "step": 695 + }, + { + "epoch": 0.112, + "grad_norm": 0.7932357788085938, + "learning_rate": 4.846834644134686e-05, + "loss": 0.7961, + "num_input_tokens_seen": 3744512, + "step": 700 + }, + { + "epoch": 0.1128, + "grad_norm": 0.708210825920105, + "learning_rate": 4.844661779121722e-05, + "loss": 0.8362, + "num_input_tokens_seen": 3771968, + "step": 705 + }, + { + "epoch": 0.1136, + "grad_norm": 0.7361094951629639, + "learning_rate": 4.8424741039506575e-05, + "loss": 0.7645, + "num_input_tokens_seen": 3801680, + "step": 710 + }, + { + "epoch": 0.1144, + "grad_norm": 0.48908814787864685, + "learning_rate": 4.840271632440038e-05, + "loss": 0.7042, + "num_input_tokens_seen": 3833952, + "step": 715 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6167788505554199, + "learning_rate": 4.8380543785018677e-05, + "loss": 0.7476, + "num_input_tokens_seen": 3860144, + "step": 720 + }, + { + "epoch": 0.116, + "grad_norm": 0.68650883436203, + "learning_rate": 4.8358223561415304e-05, + "loss": 0.7415, + "num_input_tokens_seen": 3890304, + "step": 725 + }, + { + "epoch": 0.1168, + "grad_norm": 0.7059746384620667, + "learning_rate": 4.833575579457691e-05, + "loss": 0.6717, + "num_input_tokens_seen": 3914560, + "step": 730 + }, + { + "epoch": 0.1176, + "grad_norm": 0.8362336158752441, + "learning_rate": 4.8313140626422125e-05, + "loss": 0.7545, + "num_input_tokens_seen": 3940128, + "step": 735 + }, + { + "epoch": 0.1184, + "grad_norm": 0.5400592684745789, + "learning_rate": 4.829037819980065e-05, + "loss": 0.7809, + "num_input_tokens_seen": 3970608, + "step": 740 + }, + { + "epoch": 0.1192, + "grad_norm": 1.0431326627731323, + "learning_rate": 4.8267468658492335e-05, + "loss": 0.7904, + "num_input_tokens_seen": 3996960, + "step": 745 + }, + { + "epoch": 0.12, + "grad_norm": 0.5358662605285645, + "learning_rate": 4.8244412147206284e-05, + "loss": 0.7688, + "num_input_tokens_seen": 4021488, + "step": 750 + }, + { + "epoch": 0.1208, + "grad_norm": 0.8147661685943604, + "learning_rate": 4.822120881157998e-05, + "loss": 0.7819, + "num_input_tokens_seen": 4047136, + "step": 755 + }, + { + "epoch": 0.1216, + "grad_norm": 0.6247139573097229, + "learning_rate": 4.819785879817827e-05, + "loss": 0.6757, + "num_input_tokens_seen": 4072256, + "step": 760 + }, + { + "epoch": 0.1224, + "grad_norm": 0.8849884271621704, + "learning_rate": 4.817436225449255e-05, + "loss": 0.8952, + "num_input_tokens_seen": 4095328, + "step": 765 + }, + { + "epoch": 0.1232, + "grad_norm": 0.8693557977676392, + "learning_rate": 4.8150719328939755e-05, + "loss": 0.6998, + "num_input_tokens_seen": 4118896, + "step": 770 + }, + { + "epoch": 0.124, + "grad_norm": 0.9492819905281067, + "learning_rate": 4.812693017086145e-05, + "loss": 0.7675, + "num_input_tokens_seen": 4144576, + "step": 775 + }, + { + "epoch": 0.1248, + "grad_norm": 0.8479375243186951, + "learning_rate": 4.810299493052289e-05, + "loss": 0.7332, + "num_input_tokens_seen": 4172448, + "step": 780 + }, + { + "epoch": 0.1256, + "grad_norm": 0.7956748008728027, + "learning_rate": 4.8078913759112066e-05, + "loss": 0.6942, + "num_input_tokens_seen": 4196032, + "step": 785 + }, + { + "epoch": 0.1264, + "grad_norm": 0.6426162123680115, + "learning_rate": 4.805468680873874e-05, + "loss": 0.7536, + "num_input_tokens_seen": 4224320, + "step": 790 + }, + { + "epoch": 0.1272, + "grad_norm": 0.6501713991165161, + "learning_rate": 4.803031423243349e-05, + "loss": 0.6722, + "num_input_tokens_seen": 4252752, + "step": 795 + }, + { + "epoch": 0.128, + "grad_norm": 0.773551881313324, + "learning_rate": 4.800579618414676e-05, + "loss": 0.7651, + "num_input_tokens_seen": 4278480, + "step": 800 + }, + { + "epoch": 0.1288, + "grad_norm": 0.6473078727722168, + "learning_rate": 4.7981132818747876e-05, + "loss": 0.6626, + "num_input_tokens_seen": 4305920, + "step": 805 + }, + { + "epoch": 0.1296, + "grad_norm": 0.5944277048110962, + "learning_rate": 4.795632429202405e-05, + "loss": 0.8511, + "num_input_tokens_seen": 4330448, + "step": 810 + }, + { + "epoch": 0.1304, + "grad_norm": 0.6878964900970459, + "learning_rate": 4.793137076067942e-05, + "loss": 0.7524, + "num_input_tokens_seen": 4356880, + "step": 815 + }, + { + "epoch": 0.1312, + "grad_norm": 0.9247101545333862, + "learning_rate": 4.790627238233405e-05, + "loss": 0.8498, + "num_input_tokens_seen": 4383744, + "step": 820 + }, + { + "epoch": 0.132, + "grad_norm": 0.9401747584342957, + "learning_rate": 4.788102931552294e-05, + "loss": 0.647, + "num_input_tokens_seen": 4411120, + "step": 825 + }, + { + "epoch": 0.1328, + "grad_norm": 0.765521764755249, + "learning_rate": 4.7855641719695023e-05, + "loss": 0.7766, + "num_input_tokens_seen": 4435920, + "step": 830 + }, + { + "epoch": 0.1336, + "grad_norm": 0.8985924124717712, + "learning_rate": 4.783010975521216e-05, + "loss": 0.7426, + "num_input_tokens_seen": 4462768, + "step": 835 + }, + { + "epoch": 0.1344, + "grad_norm": 0.8223104476928711, + "learning_rate": 4.78044335833481e-05, + "loss": 0.6919, + "num_input_tokens_seen": 4493232, + "step": 840 + }, + { + "epoch": 0.1352, + "grad_norm": 0.6721159219741821, + "learning_rate": 4.7778613366287505e-05, + "loss": 0.7221, + "num_input_tokens_seen": 4520048, + "step": 845 + }, + { + "epoch": 0.136, + "grad_norm": 0.6136463284492493, + "learning_rate": 4.775264926712489e-05, + "loss": 0.7344, + "num_input_tokens_seen": 4545984, + "step": 850 + }, + { + "epoch": 0.1368, + "grad_norm": 0.7662776708602905, + "learning_rate": 4.772654144986364e-05, + "loss": 0.6648, + "num_input_tokens_seen": 4577296, + "step": 855 + }, + { + "epoch": 0.1376, + "grad_norm": 0.8455452919006348, + "learning_rate": 4.7700290079414896e-05, + "loss": 0.7513, + "num_input_tokens_seen": 4602272, + "step": 860 + }, + { + "epoch": 0.1384, + "grad_norm": 0.8613116145133972, + "learning_rate": 4.767389532159659e-05, + "loss": 0.7792, + "num_input_tokens_seen": 4631008, + "step": 865 + }, + { + "epoch": 0.1392, + "grad_norm": 0.5791791677474976, + "learning_rate": 4.764735734313236e-05, + "loss": 0.7529, + "num_input_tokens_seen": 4660112, + "step": 870 + }, + { + "epoch": 0.14, + "grad_norm": 0.8197824954986572, + "learning_rate": 4.762067631165049e-05, + "loss": 0.6728, + "num_input_tokens_seen": 4689504, + "step": 875 + }, + { + "epoch": 0.1408, + "grad_norm": 0.9617213606834412, + "learning_rate": 4.759385239568289e-05, + "loss": 0.6935, + "num_input_tokens_seen": 4715312, + "step": 880 + }, + { + "epoch": 0.1416, + "grad_norm": 0.7933773398399353, + "learning_rate": 4.756688576466398e-05, + "loss": 0.8062, + "num_input_tokens_seen": 4735936, + "step": 885 + }, + { + "epoch": 0.1424, + "grad_norm": 0.975724458694458, + "learning_rate": 4.753977658892967e-05, + "loss": 0.7149, + "num_input_tokens_seen": 4760256, + "step": 890 + }, + { + "epoch": 0.1432, + "grad_norm": 0.755167543888092, + "learning_rate": 4.751252503971624e-05, + "loss": 0.7062, + "num_input_tokens_seen": 4789264, + "step": 895 + }, + { + "epoch": 0.144, + "grad_norm": 0.84686279296875, + "learning_rate": 4.7485131289159276e-05, + "loss": 0.837, + "num_input_tokens_seen": 4815344, + "step": 900 + }, + { + "epoch": 0.1448, + "grad_norm": 0.9440627098083496, + "learning_rate": 4.745759551029261e-05, + "loss": 0.6907, + "num_input_tokens_seen": 4840528, + "step": 905 + }, + { + "epoch": 0.1456, + "grad_norm": 0.7293935418128967, + "learning_rate": 4.742991787704719e-05, + "loss": 0.7192, + "num_input_tokens_seen": 4868032, + "step": 910 + }, + { + "epoch": 0.1464, + "grad_norm": 0.6401370763778687, + "learning_rate": 4.7402098564249974e-05, + "loss": 0.7223, + "num_input_tokens_seen": 4893376, + "step": 915 + }, + { + "epoch": 0.1472, + "grad_norm": 0.8882667422294617, + "learning_rate": 4.737413774762287e-05, + "loss": 0.6847, + "num_input_tokens_seen": 4918288, + "step": 920 + }, + { + "epoch": 0.148, + "grad_norm": 0.7663973569869995, + "learning_rate": 4.73460356037816e-05, + "loss": 0.7273, + "num_input_tokens_seen": 4944688, + "step": 925 + }, + { + "epoch": 0.1488, + "grad_norm": 0.7966660857200623, + "learning_rate": 4.731779231023456e-05, + "loss": 0.7087, + "num_input_tokens_seen": 4969744, + "step": 930 + }, + { + "epoch": 0.1496, + "grad_norm": 1.3590271472930908, + "learning_rate": 4.728940804538176e-05, + "loss": 0.7771, + "num_input_tokens_seen": 4997072, + "step": 935 + }, + { + "epoch": 0.1504, + "grad_norm": 0.8245935440063477, + "learning_rate": 4.7260882988513624e-05, + "loss": 0.7598, + "num_input_tokens_seen": 5024672, + "step": 940 + }, + { + "epoch": 0.1512, + "grad_norm": 0.6972777247428894, + "learning_rate": 4.723221731980993e-05, + "loss": 0.7961, + "num_input_tokens_seen": 5051952, + "step": 945 + }, + { + "epoch": 0.152, + "grad_norm": 0.7691385746002197, + "learning_rate": 4.720341122033862e-05, + "loss": 0.773, + "num_input_tokens_seen": 5074528, + "step": 950 + }, + { + "epoch": 0.1528, + "grad_norm": 1.4361584186553955, + "learning_rate": 4.717446487205466e-05, + "loss": 0.7216, + "num_input_tokens_seen": 5099840, + "step": 955 + }, + { + "epoch": 0.1536, + "grad_norm": 0.9640448689460754, + "learning_rate": 4.714537845779894e-05, + "loss": 0.6569, + "num_input_tokens_seen": 5122848, + "step": 960 + }, + { + "epoch": 0.1544, + "grad_norm": 0.8036720156669617, + "learning_rate": 4.7116152161297045e-05, + "loss": 0.7994, + "num_input_tokens_seen": 5152320, + "step": 965 + }, + { + "epoch": 0.1552, + "grad_norm": 0.7904760241508484, + "learning_rate": 4.708678616715815e-05, + "loss": 0.7259, + "num_input_tokens_seen": 5178816, + "step": 970 + }, + { + "epoch": 0.156, + "grad_norm": 0.7007213830947876, + "learning_rate": 4.7057280660873835e-05, + "loss": 0.747, + "num_input_tokens_seen": 5208112, + "step": 975 + }, + { + "epoch": 0.1568, + "grad_norm": 0.8959905505180359, + "learning_rate": 4.702763582881692e-05, + "loss": 0.8487, + "num_input_tokens_seen": 5231200, + "step": 980 + }, + { + "epoch": 0.1576, + "grad_norm": 0.8828222155570984, + "learning_rate": 4.699785185824026e-05, + "loss": 0.7654, + "num_input_tokens_seen": 5257312, + "step": 985 + }, + { + "epoch": 0.1584, + "grad_norm": 1.0120501518249512, + "learning_rate": 4.696792893727562e-05, + "loss": 0.7748, + "num_input_tokens_seen": 5280288, + "step": 990 + }, + { + "epoch": 0.1592, + "grad_norm": 0.8728295564651489, + "learning_rate": 4.693786725493242e-05, + "loss": 0.6957, + "num_input_tokens_seen": 5308272, + "step": 995 + }, + { + "epoch": 0.16, + "grad_norm": 1.252682089805603, + "learning_rate": 4.690766700109659e-05, + "loss": 0.7418, + "num_input_tokens_seen": 5335568, + "step": 1000 + }, + { + "epoch": 0.1608, + "grad_norm": 1.0933836698532104, + "learning_rate": 4.6877328366529346e-05, + "loss": 0.8225, + "num_input_tokens_seen": 5361872, + "step": 1005 + }, + { + "epoch": 0.1616, + "grad_norm": 0.8167702555656433, + "learning_rate": 4.684685154286599e-05, + "loss": 0.8552, + "num_input_tokens_seen": 5387456, + "step": 1010 + }, + { + "epoch": 0.1624, + "grad_norm": 0.6640987396240234, + "learning_rate": 4.681623672261469e-05, + "loss": 0.6654, + "num_input_tokens_seen": 5411472, + "step": 1015 + }, + { + "epoch": 0.1632, + "grad_norm": 0.9311304688453674, + "learning_rate": 4.678548409915532e-05, + "loss": 0.7339, + "num_input_tokens_seen": 5439648, + "step": 1020 + }, + { + "epoch": 0.164, + "grad_norm": 0.9733570218086243, + "learning_rate": 4.675459386673815e-05, + "loss": 0.7324, + "num_input_tokens_seen": 5468416, + "step": 1025 + }, + { + "epoch": 0.1648, + "grad_norm": 0.6521669030189514, + "learning_rate": 4.6723566220482664e-05, + "loss": 0.7065, + "num_input_tokens_seen": 5498800, + "step": 1030 + }, + { + "epoch": 0.1656, + "grad_norm": 0.6702024340629578, + "learning_rate": 4.669240135637635e-05, + "loss": 0.6822, + "num_input_tokens_seen": 5527856, + "step": 1035 + }, + { + "epoch": 0.1664, + "grad_norm": 1.3757870197296143, + "learning_rate": 4.666109947127343e-05, + "loss": 0.7554, + "num_input_tokens_seen": 5550848, + "step": 1040 + }, + { + "epoch": 0.1672, + "grad_norm": 0.7441242933273315, + "learning_rate": 4.662966076289362e-05, + "loss": 0.6784, + "num_input_tokens_seen": 5581552, + "step": 1045 + }, + { + "epoch": 0.168, + "grad_norm": 0.7709234356880188, + "learning_rate": 4.659808542982088e-05, + "loss": 0.8294, + "num_input_tokens_seen": 5604288, + "step": 1050 + }, + { + "epoch": 0.1688, + "grad_norm": 0.5358073115348816, + "learning_rate": 4.6566373671502196e-05, + "loss": 0.6633, + "num_input_tokens_seen": 5630336, + "step": 1055 + }, + { + "epoch": 0.1696, + "grad_norm": 0.5856006741523743, + "learning_rate": 4.653452568824625e-05, + "loss": 0.6684, + "num_input_tokens_seen": 5662480, + "step": 1060 + }, + { + "epoch": 0.1704, + "grad_norm": 0.7003797292709351, + "learning_rate": 4.650254168122222e-05, + "loss": 0.7109, + "num_input_tokens_seen": 5687376, + "step": 1065 + }, + { + "epoch": 0.1712, + "grad_norm": 0.7874431014060974, + "learning_rate": 4.647042185245847e-05, + "loss": 0.8036, + "num_input_tokens_seen": 5714896, + "step": 1070 + }, + { + "epoch": 0.172, + "grad_norm": 0.6988087296485901, + "learning_rate": 4.643816640484131e-05, + "loss": 0.6575, + "num_input_tokens_seen": 5740192, + "step": 1075 + }, + { + "epoch": 0.1728, + "grad_norm": 0.982477068901062, + "learning_rate": 4.640577554211366e-05, + "loss": 0.7477, + "num_input_tokens_seen": 5768656, + "step": 1080 + }, + { + "epoch": 0.1736, + "grad_norm": 1.1265698671340942, + "learning_rate": 4.6373249468873833e-05, + "loss": 0.7555, + "num_input_tokens_seen": 5794576, + "step": 1085 + }, + { + "epoch": 0.1744, + "grad_norm": 0.6747913360595703, + "learning_rate": 4.634058839057417e-05, + "loss": 0.6695, + "num_input_tokens_seen": 5823296, + "step": 1090 + }, + { + "epoch": 0.1752, + "grad_norm": 0.8027223348617554, + "learning_rate": 4.63077925135198e-05, + "loss": 0.6948, + "num_input_tokens_seen": 5846928, + "step": 1095 + }, + { + "epoch": 0.176, + "grad_norm": 0.7862293720245361, + "learning_rate": 4.6274862044867304e-05, + "loss": 0.7728, + "num_input_tokens_seen": 5871968, + "step": 1100 + }, + { + "epoch": 0.1768, + "grad_norm": 0.7790197134017944, + "learning_rate": 4.624179719262342e-05, + "loss": 0.765, + "num_input_tokens_seen": 5900304, + "step": 1105 + }, + { + "epoch": 0.1776, + "grad_norm": 0.8996221423149109, + "learning_rate": 4.6208598165643715e-05, + "loss": 0.6515, + "num_input_tokens_seen": 5925792, + "step": 1110 + }, + { + "epoch": 0.1784, + "grad_norm": 0.7972677946090698, + "learning_rate": 4.61752651736313e-05, + "loss": 0.75, + "num_input_tokens_seen": 5950672, + "step": 1115 + }, + { + "epoch": 0.1792, + "grad_norm": 0.6896753907203674, + "learning_rate": 4.614179842713547e-05, + "loss": 0.6592, + "num_input_tokens_seen": 5985552, + "step": 1120 + }, + { + "epoch": 0.18, + "grad_norm": 0.823128342628479, + "learning_rate": 4.610819813755038e-05, + "loss": 0.8463, + "num_input_tokens_seen": 6009904, + "step": 1125 + }, + { + "epoch": 0.1808, + "grad_norm": 0.8550837635993958, + "learning_rate": 4.607446451711372e-05, + "loss": 0.7349, + "num_input_tokens_seen": 6034160, + "step": 1130 + }, + { + "epoch": 0.1816, + "grad_norm": 0.8120406270027161, + "learning_rate": 4.604059777890537e-05, + "loss": 0.6396, + "num_input_tokens_seen": 6056544, + "step": 1135 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6196752786636353, + "learning_rate": 4.6006598136846056e-05, + "loss": 0.6164, + "num_input_tokens_seen": 6083920, + "step": 1140 + }, + { + "epoch": 0.1832, + "grad_norm": 0.6641353368759155, + "learning_rate": 4.5972465805695996e-05, + "loss": 0.6775, + "num_input_tokens_seen": 6111520, + "step": 1145 + }, + { + "epoch": 0.184, + "grad_norm": 0.7323867082595825, + "learning_rate": 4.593820100105355e-05, + "loss": 0.6295, + "num_input_tokens_seen": 6141056, + "step": 1150 + }, + { + "epoch": 0.1848, + "grad_norm": 0.6919586658477783, + "learning_rate": 4.590380393935383e-05, + "loss": 0.7429, + "num_input_tokens_seen": 6163408, + "step": 1155 + }, + { + "epoch": 0.1856, + "grad_norm": 0.9530206322669983, + "learning_rate": 4.5869274837867394e-05, + "loss": 0.7516, + "num_input_tokens_seen": 6188816, + "step": 1160 + }, + { + "epoch": 0.1864, + "grad_norm": 0.9966915845870972, + "learning_rate": 4.583461391469879e-05, + "loss": 0.7524, + "num_input_tokens_seen": 6216800, + "step": 1165 + }, + { + "epoch": 0.1872, + "grad_norm": 1.096708059310913, + "learning_rate": 4.579982138878527e-05, + "loss": 0.7337, + "num_input_tokens_seen": 6245888, + "step": 1170 + }, + { + "epoch": 0.188, + "grad_norm": 0.8707526326179504, + "learning_rate": 4.5764897479895317e-05, + "loss": 0.7891, + "num_input_tokens_seen": 6275120, + "step": 1175 + }, + { + "epoch": 0.1888, + "grad_norm": 0.7489879727363586, + "learning_rate": 4.5729842408627334e-05, + "loss": 0.79, + "num_input_tokens_seen": 6299760, + "step": 1180 + }, + { + "epoch": 0.1896, + "grad_norm": 0.7835171222686768, + "learning_rate": 4.5694656396408195e-05, + "loss": 0.7506, + "num_input_tokens_seen": 6326720, + "step": 1185 + }, + { + "epoch": 0.1904, + "grad_norm": 0.7588552832603455, + "learning_rate": 4.565933966549189e-05, + "loss": 0.6294, + "num_input_tokens_seen": 6353728, + "step": 1190 + }, + { + "epoch": 0.1912, + "grad_norm": 0.6706573367118835, + "learning_rate": 4.5623892438958074e-05, + "loss": 0.7564, + "num_input_tokens_seen": 6379536, + "step": 1195 + }, + { + "epoch": 0.192, + "grad_norm": 0.7340586185455322, + "learning_rate": 4.558831494071069e-05, + "loss": 0.7683, + "num_input_tokens_seen": 6407152, + "step": 1200 + }, + { + "epoch": 0.1928, + "grad_norm": 0.735789954662323, + "learning_rate": 4.555260739547657e-05, + "loss": 0.7701, + "num_input_tokens_seen": 6434480, + "step": 1205 + }, + { + "epoch": 0.1936, + "grad_norm": 0.8325262069702148, + "learning_rate": 4.5516770028803954e-05, + "loss": 0.694, + "num_input_tokens_seen": 6463424, + "step": 1210 + }, + { + "epoch": 0.1944, + "grad_norm": 0.7930346727371216, + "learning_rate": 4.548080306706114e-05, + "loss": 0.7322, + "num_input_tokens_seen": 6487136, + "step": 1215 + }, + { + "epoch": 0.1952, + "grad_norm": 0.7683930397033691, + "learning_rate": 4.5444706737435014e-05, + "loss": 0.7616, + "num_input_tokens_seen": 6513120, + "step": 1220 + }, + { + "epoch": 0.196, + "grad_norm": 0.600136399269104, + "learning_rate": 4.5408481267929605e-05, + "loss": 0.6743, + "num_input_tokens_seen": 6543040, + "step": 1225 + }, + { + "epoch": 0.1968, + "grad_norm": 0.9069085121154785, + "learning_rate": 4.5372126887364655e-05, + "loss": 0.7377, + "num_input_tokens_seen": 6572432, + "step": 1230 + }, + { + "epoch": 0.1976, + "grad_norm": 0.9226580262184143, + "learning_rate": 4.533564382537421e-05, + "loss": 0.7766, + "num_input_tokens_seen": 6593136, + "step": 1235 + }, + { + "epoch": 0.1984, + "grad_norm": 0.7376300096511841, + "learning_rate": 4.529903231240511e-05, + "loss": 0.7873, + "num_input_tokens_seen": 6621024, + "step": 1240 + }, + { + "epoch": 0.1992, + "grad_norm": 0.6371731162071228, + "learning_rate": 4.5262292579715556e-05, + "loss": 0.7096, + "num_input_tokens_seen": 6646480, + "step": 1245 + }, + { + "epoch": 0.2, + "grad_norm": 0.8643271327018738, + "learning_rate": 4.522542485937369e-05, + "loss": 0.8187, + "num_input_tokens_seen": 6674032, + "step": 1250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.8012109398841858, + "learning_rate": 4.518842938425605e-05, + "loss": 0.772, + "num_input_tokens_seen": 6700112, + "step": 1255 + }, + { + "epoch": 0.2016, + "grad_norm": 0.7719143033027649, + "learning_rate": 4.5151306388046175e-05, + "loss": 0.6796, + "num_input_tokens_seen": 6727008, + "step": 1260 + }, + { + "epoch": 0.2024, + "grad_norm": 0.8668113946914673, + "learning_rate": 4.511405610523309e-05, + "loss": 0.7177, + "num_input_tokens_seen": 6752768, + "step": 1265 + }, + { + "epoch": 0.2032, + "grad_norm": 0.8964220285415649, + "learning_rate": 4.5076678771109815e-05, + "loss": 0.7078, + "num_input_tokens_seen": 6778112, + "step": 1270 + }, + { + "epoch": 0.204, + "grad_norm": 0.7097613215446472, + "learning_rate": 4.503917462177192e-05, + "loss": 0.6496, + "num_input_tokens_seen": 6804432, + "step": 1275 + }, + { + "epoch": 0.2048, + "grad_norm": 0.842675507068634, + "learning_rate": 4.5001543894115975e-05, + "loss": 0.6802, + "num_input_tokens_seen": 6829824, + "step": 1280 + }, + { + "epoch": 0.2056, + "grad_norm": 0.7390193343162537, + "learning_rate": 4.496378682583813e-05, + "loss": 0.7187, + "num_input_tokens_seen": 6858480, + "step": 1285 + }, + { + "epoch": 0.2064, + "grad_norm": 0.5758505463600159, + "learning_rate": 4.492590365543253e-05, + "loss": 0.6198, + "num_input_tokens_seen": 6886960, + "step": 1290 + }, + { + "epoch": 0.2072, + "grad_norm": 0.9554662108421326, + "learning_rate": 4.488789462218987e-05, + "loss": 0.6105, + "num_input_tokens_seen": 6912560, + "step": 1295 + }, + { + "epoch": 0.208, + "grad_norm": 0.9423254728317261, + "learning_rate": 4.484975996619589e-05, + "loss": 0.7671, + "num_input_tokens_seen": 6938912, + "step": 1300 + }, + { + "epoch": 0.2088, + "grad_norm": 0.7120509743690491, + "learning_rate": 4.481149992832977e-05, + "loss": 0.6833, + "num_input_tokens_seen": 6967616, + "step": 1305 + }, + { + "epoch": 0.2096, + "grad_norm": 0.9409400224685669, + "learning_rate": 4.477311475026271e-05, + "loss": 0.7547, + "num_input_tokens_seen": 6993872, + "step": 1310 + }, + { + "epoch": 0.2104, + "grad_norm": 0.8102442026138306, + "learning_rate": 4.473460467445637e-05, + "loss": 0.7479, + "num_input_tokens_seen": 7020784, + "step": 1315 + }, + { + "epoch": 0.2112, + "grad_norm": 0.787486732006073, + "learning_rate": 4.46959699441613e-05, + "loss": 0.761, + "num_input_tokens_seen": 7045024, + "step": 1320 + }, + { + "epoch": 0.212, + "grad_norm": 0.8877683877944946, + "learning_rate": 4.465721080341547e-05, + "loss": 0.7612, + "num_input_tokens_seen": 7072448, + "step": 1325 + }, + { + "epoch": 0.2128, + "grad_norm": 0.7483372688293457, + "learning_rate": 4.461832749704268e-05, + "loss": 0.6792, + "num_input_tokens_seen": 7097776, + "step": 1330 + }, + { + "epoch": 0.2136, + "grad_norm": 0.7852973341941833, + "learning_rate": 4.457932027065102e-05, + "loss": 0.7357, + "num_input_tokens_seen": 7123568, + "step": 1335 + }, + { + "epoch": 0.2144, + "grad_norm": 0.7306565642356873, + "learning_rate": 4.4540189370631315e-05, + "loss": 0.6676, + "num_input_tokens_seen": 7151728, + "step": 1340 + }, + { + "epoch": 0.2152, + "grad_norm": 0.7990534901618958, + "learning_rate": 4.4500935044155626e-05, + "loss": 0.7394, + "num_input_tokens_seen": 7181664, + "step": 1345 + }, + { + "epoch": 0.216, + "grad_norm": 1.287644863128662, + "learning_rate": 4.4461557539175594e-05, + "loss": 0.8017, + "num_input_tokens_seen": 7210336, + "step": 1350 + }, + { + "epoch": 0.2168, + "grad_norm": 0.7476962208747864, + "learning_rate": 4.4422057104420946e-05, + "loss": 0.6533, + "num_input_tokens_seen": 7240992, + "step": 1355 + }, + { + "epoch": 0.2176, + "grad_norm": 0.8233410120010376, + "learning_rate": 4.4382433989397895e-05, + "loss": 0.7029, + "num_input_tokens_seen": 7268048, + "step": 1360 + }, + { + "epoch": 0.2184, + "grad_norm": 0.609846293926239, + "learning_rate": 4.434268844438758e-05, + "loss": 0.7096, + "num_input_tokens_seen": 7297616, + "step": 1365 + }, + { + "epoch": 0.2192, + "grad_norm": 1.010886549949646, + "learning_rate": 4.4302820720444456e-05, + "loss": 0.8103, + "num_input_tokens_seen": 7326912, + "step": 1370 + }, + { + "epoch": 0.22, + "grad_norm": 0.7681688070297241, + "learning_rate": 4.426283106939474e-05, + "loss": 0.6238, + "num_input_tokens_seen": 7355136, + "step": 1375 + }, + { + "epoch": 0.2208, + "grad_norm": 0.7759270071983337, + "learning_rate": 4.422271974383479e-05, + "loss": 0.6625, + "num_input_tokens_seen": 7377584, + "step": 1380 + }, + { + "epoch": 0.2216, + "grad_norm": 0.831362783908844, + "learning_rate": 4.418248699712955e-05, + "loss": 0.6831, + "num_input_tokens_seen": 7405552, + "step": 1385 + }, + { + "epoch": 0.2224, + "grad_norm": 0.7530121207237244, + "learning_rate": 4.414213308341092e-05, + "loss": 0.7664, + "num_input_tokens_seen": 7430960, + "step": 1390 + }, + { + "epoch": 0.2232, + "grad_norm": 0.8572810292243958, + "learning_rate": 4.410165825757613e-05, + "loss": 0.7273, + "num_input_tokens_seen": 7457136, + "step": 1395 + }, + { + "epoch": 0.224, + "grad_norm": 0.7553160190582275, + "learning_rate": 4.40610627752862e-05, + "loss": 0.6607, + "num_input_tokens_seen": 7482208, + "step": 1400 + }, + { + "epoch": 0.2248, + "grad_norm": 0.6897515058517456, + "learning_rate": 4.4020346892964246e-05, + "loss": 0.731, + "num_input_tokens_seen": 7515760, + "step": 1405 + }, + { + "epoch": 0.2256, + "grad_norm": 0.7974178791046143, + "learning_rate": 4.3979510867793917e-05, + "loss": 0.7258, + "num_input_tokens_seen": 7542944, + "step": 1410 + }, + { + "epoch": 0.2264, + "grad_norm": 0.8745766282081604, + "learning_rate": 4.393855495771774e-05, + "loss": 0.6566, + "num_input_tokens_seen": 7573760, + "step": 1415 + }, + { + "epoch": 0.2272, + "grad_norm": 0.749857485294342, + "learning_rate": 4.38974794214355e-05, + "loss": 0.7433, + "num_input_tokens_seen": 7606592, + "step": 1420 + }, + { + "epoch": 0.228, + "grad_norm": 0.7722298502922058, + "learning_rate": 4.3856284518402594e-05, + "loss": 0.7452, + "num_input_tokens_seen": 7628672, + "step": 1425 + }, + { + "epoch": 0.2288, + "grad_norm": 0.8768362998962402, + "learning_rate": 4.381497050882845e-05, + "loss": 0.7077, + "num_input_tokens_seen": 7658528, + "step": 1430 + }, + { + "epoch": 0.2296, + "grad_norm": 0.7979273796081543, + "learning_rate": 4.377353765367479e-05, + "loss": 0.6274, + "num_input_tokens_seen": 7685248, + "step": 1435 + }, + { + "epoch": 0.2304, + "grad_norm": 0.988314151763916, + "learning_rate": 4.3731986214654035e-05, + "loss": 0.6845, + "num_input_tokens_seen": 7713616, + "step": 1440 + }, + { + "epoch": 0.2312, + "grad_norm": 0.7991346120834351, + "learning_rate": 4.3690316454227674e-05, + "loss": 0.7115, + "num_input_tokens_seen": 7740304, + "step": 1445 + }, + { + "epoch": 0.232, + "grad_norm": 1.072383999824524, + "learning_rate": 4.3648528635604556e-05, + "loss": 0.7209, + "num_input_tokens_seen": 7766848, + "step": 1450 + }, + { + "epoch": 0.2328, + "grad_norm": 1.357325792312622, + "learning_rate": 4.360662302273924e-05, + "loss": 0.8239, + "num_input_tokens_seen": 7791888, + "step": 1455 + }, + { + "epoch": 0.2336, + "grad_norm": 0.6083495020866394, + "learning_rate": 4.3564599880330385e-05, + "loss": 0.6199, + "num_input_tokens_seen": 7822448, + "step": 1460 + }, + { + "epoch": 0.2344, + "grad_norm": 0.7359746098518372, + "learning_rate": 4.352245947381898e-05, + "loss": 0.7481, + "num_input_tokens_seen": 7848464, + "step": 1465 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9160847067832947, + "learning_rate": 4.348020206938672e-05, + "loss": 0.7235, + "num_input_tokens_seen": 7877216, + "step": 1470 + }, + { + "epoch": 0.236, + "grad_norm": 0.7445215582847595, + "learning_rate": 4.343782793395435e-05, + "loss": 0.7345, + "num_input_tokens_seen": 7904368, + "step": 1475 + }, + { + "epoch": 0.2368, + "grad_norm": 0.8324536681175232, + "learning_rate": 4.3395337335179945e-05, + "loss": 0.7532, + "num_input_tokens_seen": 7931520, + "step": 1480 + }, + { + "epoch": 0.2376, + "grad_norm": 1.0249683856964111, + "learning_rate": 4.335273054145722e-05, + "loss": 0.6902, + "num_input_tokens_seen": 7953296, + "step": 1485 + }, + { + "epoch": 0.2384, + "grad_norm": 0.6565669775009155, + "learning_rate": 4.3310007821913836e-05, + "loss": 0.7329, + "num_input_tokens_seen": 7978832, + "step": 1490 + }, + { + "epoch": 0.2392, + "grad_norm": 0.8256237506866455, + "learning_rate": 4.32671694464097e-05, + "loss": 0.6693, + "num_input_tokens_seen": 8004992, + "step": 1495 + }, + { + "epoch": 0.24, + "grad_norm": 0.9722650051116943, + "learning_rate": 4.3224215685535294e-05, + "loss": 0.7418, + "num_input_tokens_seen": 8027824, + "step": 1500 + }, + { + "epoch": 0.2408, + "grad_norm": 0.599818229675293, + "learning_rate": 4.31811468106099e-05, + "loss": 0.6157, + "num_input_tokens_seen": 8058528, + "step": 1505 + }, + { + "epoch": 0.2416, + "grad_norm": 1.0976861715316772, + "learning_rate": 4.3137963093679945e-05, + "loss": 0.6302, + "num_input_tokens_seen": 8081984, + "step": 1510 + }, + { + "epoch": 0.2424, + "grad_norm": 0.5699600577354431, + "learning_rate": 4.309466480751726e-05, + "loss": 0.628, + "num_input_tokens_seen": 8113216, + "step": 1515 + }, + { + "epoch": 0.2432, + "grad_norm": 0.8899049758911133, + "learning_rate": 4.305125222561736e-05, + "loss": 0.635, + "num_input_tokens_seen": 8142080, + "step": 1520 + }, + { + "epoch": 0.244, + "grad_norm": 0.9494242072105408, + "learning_rate": 4.3007725622197674e-05, + "loss": 0.8114, + "num_input_tokens_seen": 8171008, + "step": 1525 + }, + { + "epoch": 0.2448, + "grad_norm": 0.9237959384918213, + "learning_rate": 4.296408527219592e-05, + "loss": 0.6678, + "num_input_tokens_seen": 8197696, + "step": 1530 + }, + { + "epoch": 0.2456, + "grad_norm": 0.8756378889083862, + "learning_rate": 4.292033145126825e-05, + "loss": 0.8364, + "num_input_tokens_seen": 8225552, + "step": 1535 + }, + { + "epoch": 0.2464, + "grad_norm": 0.9631836414337158, + "learning_rate": 4.287646443578758e-05, + "loss": 0.7312, + "num_input_tokens_seen": 8257120, + "step": 1540 + }, + { + "epoch": 0.2472, + "grad_norm": 0.920713484287262, + "learning_rate": 4.283248450284182e-05, + "loss": 0.8067, + "num_input_tokens_seen": 8282400, + "step": 1545 + }, + { + "epoch": 0.248, + "grad_norm": 1.0773414373397827, + "learning_rate": 4.2788391930232136e-05, + "loss": 0.7109, + "num_input_tokens_seen": 8309568, + "step": 1550 + }, + { + "epoch": 0.2488, + "grad_norm": 0.5621623396873474, + "learning_rate": 4.2744186996471174e-05, + "loss": 0.6543, + "num_input_tokens_seen": 8338864, + "step": 1555 + }, + { + "epoch": 0.2496, + "grad_norm": 0.8737258315086365, + "learning_rate": 4.269986998078132e-05, + "loss": 0.7401, + "num_input_tokens_seen": 8364592, + "step": 1560 + }, + { + "epoch": 0.2504, + "grad_norm": 0.8454060554504395, + "learning_rate": 4.265544116309294e-05, + "loss": 0.7538, + "num_input_tokens_seen": 8391120, + "step": 1565 + }, + { + "epoch": 0.2512, + "grad_norm": 0.8107228875160217, + "learning_rate": 4.261090082404258e-05, + "loss": 0.7705, + "num_input_tokens_seen": 8418320, + "step": 1570 + }, + { + "epoch": 0.252, + "grad_norm": 0.7339603304862976, + "learning_rate": 4.256624924497123e-05, + "loss": 0.6846, + "num_input_tokens_seen": 8446640, + "step": 1575 + }, + { + "epoch": 0.2528, + "grad_norm": 1.0036543607711792, + "learning_rate": 4.252148670792254e-05, + "loss": 0.8502, + "num_input_tokens_seen": 8470416, + "step": 1580 + }, + { + "epoch": 0.2536, + "grad_norm": 0.8186982870101929, + "learning_rate": 4.2476613495641026e-05, + "loss": 0.6987, + "num_input_tokens_seen": 8498160, + "step": 1585 + }, + { + "epoch": 0.2544, + "grad_norm": 0.9724487066268921, + "learning_rate": 4.2431629891570266e-05, + "loss": 0.6461, + "num_input_tokens_seen": 8525904, + "step": 1590 + }, + { + "epoch": 0.2552, + "grad_norm": 0.5958553552627563, + "learning_rate": 4.238653617985118e-05, + "loss": 0.7143, + "num_input_tokens_seen": 8551872, + "step": 1595 + }, + { + "epoch": 0.256, + "grad_norm": 1.0192784070968628, + "learning_rate": 4.234133264532012e-05, + "loss": 0.7215, + "num_input_tokens_seen": 8583440, + "step": 1600 + }, + { + "epoch": 0.2568, + "grad_norm": 0.7806874513626099, + "learning_rate": 4.229601957350722e-05, + "loss": 0.8008, + "num_input_tokens_seen": 8609632, + "step": 1605 + }, + { + "epoch": 0.2576, + "grad_norm": 1.086475133895874, + "learning_rate": 4.225059725063444e-05, + "loss": 0.6612, + "num_input_tokens_seen": 8633888, + "step": 1610 + }, + { + "epoch": 0.2584, + "grad_norm": 0.6213988065719604, + "learning_rate": 4.2205065963613864e-05, + "loss": 0.7544, + "num_input_tokens_seen": 8660288, + "step": 1615 + }, + { + "epoch": 0.2592, + "grad_norm": 1.0608100891113281, + "learning_rate": 4.2159426000045854e-05, + "loss": 0.7569, + "num_input_tokens_seen": 8689184, + "step": 1620 + }, + { + "epoch": 0.26, + "grad_norm": 0.7601464986801147, + "learning_rate": 4.211367764821722e-05, + "loss": 0.8161, + "num_input_tokens_seen": 8713504, + "step": 1625 + }, + { + "epoch": 0.2608, + "grad_norm": 0.9310168623924255, + "learning_rate": 4.206782119709942e-05, + "loss": 0.8283, + "num_input_tokens_seen": 8741088, + "step": 1630 + }, + { + "epoch": 0.2616, + "grad_norm": 0.6408126354217529, + "learning_rate": 4.20218569363467e-05, + "loss": 0.5745, + "num_input_tokens_seen": 8767456, + "step": 1635 + }, + { + "epoch": 0.2624, + "grad_norm": 1.1697090864181519, + "learning_rate": 4.197578515629435e-05, + "loss": 0.7525, + "num_input_tokens_seen": 8791952, + "step": 1640 + }, + { + "epoch": 0.2632, + "grad_norm": 0.9160236716270447, + "learning_rate": 4.192960614795675e-05, + "loss": 0.7991, + "num_input_tokens_seen": 8816080, + "step": 1645 + }, + { + "epoch": 0.264, + "grad_norm": 1.0530091524124146, + "learning_rate": 4.188332020302561e-05, + "loss": 0.7297, + "num_input_tokens_seen": 8841536, + "step": 1650 + }, + { + "epoch": 0.2648, + "grad_norm": 0.8888834118843079, + "learning_rate": 4.183692761386813e-05, + "loss": 0.6276, + "num_input_tokens_seen": 8869872, + "step": 1655 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6144154667854309, + "learning_rate": 4.179042867352511e-05, + "loss": 0.7127, + "num_input_tokens_seen": 8893152, + "step": 1660 + }, + { + "epoch": 0.2664, + "grad_norm": 0.814166247844696, + "learning_rate": 4.174382367570912e-05, + "loss": 0.7712, + "num_input_tokens_seen": 8923040, + "step": 1665 + }, + { + "epoch": 0.2672, + "grad_norm": 0.8960988521575928, + "learning_rate": 4.169711291480266e-05, + "loss": 0.8388, + "num_input_tokens_seen": 8945856, + "step": 1670 + }, + { + "epoch": 0.268, + "grad_norm": 0.8164514303207397, + "learning_rate": 4.165029668585629e-05, + "loss": 0.7538, + "num_input_tokens_seen": 8971664, + "step": 1675 + }, + { + "epoch": 0.2688, + "grad_norm": 0.8044324517250061, + "learning_rate": 4.160337528458676e-05, + "loss": 0.708, + "num_input_tokens_seen": 8996064, + "step": 1680 + }, + { + "epoch": 0.2696, + "grad_norm": 0.7704948782920837, + "learning_rate": 4.155634900737513e-05, + "loss": 0.668, + "num_input_tokens_seen": 9022416, + "step": 1685 + }, + { + "epoch": 0.2704, + "grad_norm": 0.8315603137016296, + "learning_rate": 4.150921815126493e-05, + "loss": 0.752, + "num_input_tokens_seen": 9052480, + "step": 1690 + }, + { + "epoch": 0.2712, + "grad_norm": 0.7516652345657349, + "learning_rate": 4.1461983013960245e-05, + "loss": 0.6534, + "num_input_tokens_seen": 9079760, + "step": 1695 + }, + { + "epoch": 0.272, + "grad_norm": 0.7449467182159424, + "learning_rate": 4.1414643893823914e-05, + "loss": 0.6808, + "num_input_tokens_seen": 9109424, + "step": 1700 + }, + { + "epoch": 0.2728, + "grad_norm": 0.6889111995697021, + "learning_rate": 4.136720108987552e-05, + "loss": 0.7627, + "num_input_tokens_seen": 9132128, + "step": 1705 + }, + { + "epoch": 0.2736, + "grad_norm": 0.9195050597190857, + "learning_rate": 4.131965490178959e-05, + "loss": 0.6527, + "num_input_tokens_seen": 9160960, + "step": 1710 + }, + { + "epoch": 0.2744, + "grad_norm": 0.9934877157211304, + "learning_rate": 4.1272005629893714e-05, + "loss": 0.7102, + "num_input_tokens_seen": 9190992, + "step": 1715 + }, + { + "epoch": 0.2752, + "grad_norm": 0.8816946148872375, + "learning_rate": 4.122425357516658e-05, + "loss": 0.67, + "num_input_tokens_seen": 9218320, + "step": 1720 + }, + { + "epoch": 0.276, + "grad_norm": 0.7904371619224548, + "learning_rate": 4.1176399039236116e-05, + "loss": 0.7159, + "num_input_tokens_seen": 9246304, + "step": 1725 + }, + { + "epoch": 0.2768, + "grad_norm": 0.795921266078949, + "learning_rate": 4.112844232437757e-05, + "loss": 0.8248, + "num_input_tokens_seen": 9271856, + "step": 1730 + }, + { + "epoch": 0.2776, + "grad_norm": 0.8109453320503235, + "learning_rate": 4.108038373351163e-05, + "loss": 0.7264, + "num_input_tokens_seen": 9297152, + "step": 1735 + }, + { + "epoch": 0.2784, + "grad_norm": 0.8012672066688538, + "learning_rate": 4.1032223570202474e-05, + "loss": 0.7368, + "num_input_tokens_seen": 9326896, + "step": 1740 + }, + { + "epoch": 0.2792, + "grad_norm": 0.8711723685264587, + "learning_rate": 4.0983962138655873e-05, + "loss": 0.6245, + "num_input_tokens_seen": 9351680, + "step": 1745 + }, + { + "epoch": 0.28, + "grad_norm": 1.034636378288269, + "learning_rate": 4.093559974371725e-05, + "loss": 0.8033, + "num_input_tokens_seen": 9374896, + "step": 1750 + }, + { + "epoch": 0.2808, + "grad_norm": 0.8999419808387756, + "learning_rate": 4.088713669086977e-05, + "loss": 0.6803, + "num_input_tokens_seen": 9400592, + "step": 1755 + }, + { + "epoch": 0.2816, + "grad_norm": 0.5961094498634338, + "learning_rate": 4.083857328623243e-05, + "loss": 0.7384, + "num_input_tokens_seen": 9429280, + "step": 1760 + }, + { + "epoch": 0.2824, + "grad_norm": 1.194028377532959, + "learning_rate": 4.078990983655807e-05, + "loss": 0.8149, + "num_input_tokens_seen": 9454736, + "step": 1765 + }, + { + "epoch": 0.2832, + "grad_norm": 0.904292643070221, + "learning_rate": 4.0741146649231504e-05, + "loss": 0.7243, + "num_input_tokens_seen": 9479648, + "step": 1770 + }, + { + "epoch": 0.284, + "grad_norm": 0.8501243591308594, + "learning_rate": 4.0692284032267516e-05, + "loss": 0.7639, + "num_input_tokens_seen": 9504432, + "step": 1775 + }, + { + "epoch": 0.2848, + "grad_norm": 1.0718458890914917, + "learning_rate": 4.064332229430895e-05, + "loss": 0.6857, + "num_input_tokens_seen": 9528880, + "step": 1780 + }, + { + "epoch": 0.2856, + "grad_norm": 0.7065584063529968, + "learning_rate": 4.059426174462476e-05, + "loss": 0.69, + "num_input_tokens_seen": 9557360, + "step": 1785 + }, + { + "epoch": 0.2864, + "grad_norm": 1.0800750255584717, + "learning_rate": 4.054510269310803e-05, + "loss": 0.704, + "num_input_tokens_seen": 9580608, + "step": 1790 + }, + { + "epoch": 0.2872, + "grad_norm": 0.5907096862792969, + "learning_rate": 4.0495845450274064e-05, + "loss": 0.8015, + "num_input_tokens_seen": 9611376, + "step": 1795 + }, + { + "epoch": 0.288, + "grad_norm": 0.9455146789550781, + "learning_rate": 4.044649032725836e-05, + "loss": 0.7382, + "num_input_tokens_seen": 9640784, + "step": 1800 + }, + { + "epoch": 0.2888, + "grad_norm": 1.0408939123153687, + "learning_rate": 4.039703763581472e-05, + "loss": 0.7299, + "num_input_tokens_seen": 9667120, + "step": 1805 + }, + { + "epoch": 0.2896, + "grad_norm": 0.8098856806755066, + "learning_rate": 4.0347487688313194e-05, + "loss": 0.6402, + "num_input_tokens_seen": 9696832, + "step": 1810 + }, + { + "epoch": 0.2904, + "grad_norm": 0.695599377155304, + "learning_rate": 4.02978407977382e-05, + "loss": 0.711, + "num_input_tokens_seen": 9722080, + "step": 1815 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6605217456817627, + "learning_rate": 4.024809727768648e-05, + "loss": 0.6587, + "num_input_tokens_seen": 9748096, + "step": 1820 + }, + { + "epoch": 0.292, + "grad_norm": 0.9249849915504456, + "learning_rate": 4.019825744236514e-05, + "loss": 0.6656, + "num_input_tokens_seen": 9774128, + "step": 1825 + }, + { + "epoch": 0.2928, + "grad_norm": 0.8226694464683533, + "learning_rate": 4.0148321606589656e-05, + "loss": 0.7143, + "num_input_tokens_seen": 9805488, + "step": 1830 + }, + { + "epoch": 0.2936, + "grad_norm": 1.0425550937652588, + "learning_rate": 4.009829008578192e-05, + "loss": 0.6735, + "num_input_tokens_seen": 9828480, + "step": 1835 + }, + { + "epoch": 0.2944, + "grad_norm": 0.6911535263061523, + "learning_rate": 4.0048163195968214e-05, + "loss": 0.7395, + "num_input_tokens_seen": 9863648, + "step": 1840 + }, + { + "epoch": 0.2952, + "grad_norm": 0.8600900769233704, + "learning_rate": 3.999794125377721e-05, + "loss": 0.729, + "num_input_tokens_seen": 9893184, + "step": 1845 + }, + { + "epoch": 0.296, + "grad_norm": 1.009696364402771, + "learning_rate": 3.9947624576437975e-05, + "loss": 0.6565, + "num_input_tokens_seen": 9922464, + "step": 1850 + }, + { + "epoch": 0.2968, + "grad_norm": 0.916327178478241, + "learning_rate": 3.9897213481778006e-05, + "loss": 0.691, + "num_input_tokens_seen": 9948384, + "step": 1855 + }, + { + "epoch": 0.2976, + "grad_norm": 0.9392701387405396, + "learning_rate": 3.984670828822118e-05, + "loss": 0.7408, + "num_input_tokens_seen": 9973760, + "step": 1860 + }, + { + "epoch": 0.2984, + "grad_norm": 0.9044517278671265, + "learning_rate": 3.979610931478574e-05, + "loss": 0.761, + "num_input_tokens_seen": 10001648, + "step": 1865 + }, + { + "epoch": 0.2992, + "grad_norm": 0.9471223950386047, + "learning_rate": 3.97454168810823e-05, + "loss": 0.8524, + "num_input_tokens_seen": 10024912, + "step": 1870 + }, + { + "epoch": 0.3, + "grad_norm": 1.0985262393951416, + "learning_rate": 3.969463130731183e-05, + "loss": 0.7221, + "num_input_tokens_seen": 10049872, + "step": 1875 + }, + { + "epoch": 0.3008, + "grad_norm": 0.8284273147583008, + "learning_rate": 3.964375291426361e-05, + "loss": 0.7708, + "num_input_tokens_seen": 10073568, + "step": 1880 + }, + { + "epoch": 0.3016, + "grad_norm": 0.7012784481048584, + "learning_rate": 3.959278202331322e-05, + "loss": 0.6842, + "num_input_tokens_seen": 10098448, + "step": 1885 + }, + { + "epoch": 0.3024, + "grad_norm": 1.1056398153305054, + "learning_rate": 3.954171895642052e-05, + "loss": 0.772, + "num_input_tokens_seen": 10123168, + "step": 1890 + }, + { + "epoch": 0.3032, + "grad_norm": 1.0128076076507568, + "learning_rate": 3.949056403612758e-05, + "loss": 0.6993, + "num_input_tokens_seen": 10149440, + "step": 1895 + }, + { + "epoch": 0.304, + "grad_norm": 0.7793564796447754, + "learning_rate": 3.943931758555669e-05, + "loss": 0.7672, + "num_input_tokens_seen": 10174496, + "step": 1900 + }, + { + "epoch": 0.3048, + "grad_norm": 0.909677267074585, + "learning_rate": 3.938797992840828e-05, + "loss": 0.6716, + "num_input_tokens_seen": 10199648, + "step": 1905 + }, + { + "epoch": 0.3056, + "grad_norm": 0.8851680159568787, + "learning_rate": 3.933655138895889e-05, + "loss": 0.7062, + "num_input_tokens_seen": 10221840, + "step": 1910 + }, + { + "epoch": 0.3064, + "grad_norm": 0.9452556371688843, + "learning_rate": 3.928503229205913e-05, + "loss": 0.6748, + "num_input_tokens_seen": 10247504, + "step": 1915 + }, + { + "epoch": 0.3072, + "grad_norm": 0.8891339302062988, + "learning_rate": 3.9233422963131616e-05, + "loss": 0.6331, + "num_input_tokens_seen": 10277984, + "step": 1920 + }, + { + "epoch": 0.308, + "grad_norm": 0.9662081599235535, + "learning_rate": 3.9181723728168916e-05, + "loss": 0.779, + "num_input_tokens_seen": 10300400, + "step": 1925 + }, + { + "epoch": 0.3088, + "grad_norm": 0.9517924785614014, + "learning_rate": 3.91299349137315e-05, + "loss": 0.722, + "num_input_tokens_seen": 10326672, + "step": 1930 + }, + { + "epoch": 0.3096, + "grad_norm": 0.755901575088501, + "learning_rate": 3.907805684694566e-05, + "loss": 0.6321, + "num_input_tokens_seen": 10356864, + "step": 1935 + }, + { + "epoch": 0.3104, + "grad_norm": 0.8272456526756287, + "learning_rate": 3.902608985550147e-05, + "loss": 0.6077, + "num_input_tokens_seen": 10388032, + "step": 1940 + }, + { + "epoch": 0.3112, + "grad_norm": 1.138036847114563, + "learning_rate": 3.897403426765069e-05, + "loss": 0.6726, + "num_input_tokens_seen": 10417152, + "step": 1945 + }, + { + "epoch": 0.312, + "grad_norm": 0.8155280351638794, + "learning_rate": 3.8921890412204705e-05, + "loss": 0.741, + "num_input_tokens_seen": 10448128, + "step": 1950 + }, + { + "epoch": 0.3128, + "grad_norm": 0.7004032135009766, + "learning_rate": 3.886965861853244e-05, + "loss": 0.6555, + "num_input_tokens_seen": 10474960, + "step": 1955 + }, + { + "epoch": 0.3136, + "grad_norm": 0.9554680585861206, + "learning_rate": 3.881733921655829e-05, + "loss": 0.75, + "num_input_tokens_seen": 10502848, + "step": 1960 + }, + { + "epoch": 0.3144, + "grad_norm": 0.8525771498680115, + "learning_rate": 3.876493253676004e-05, + "loss": 0.7042, + "num_input_tokens_seen": 10532640, + "step": 1965 + }, + { + "epoch": 0.3152, + "grad_norm": 0.8739621043205261, + "learning_rate": 3.871243891016676e-05, + "loss": 0.6188, + "num_input_tokens_seen": 10560096, + "step": 1970 + }, + { + "epoch": 0.316, + "grad_norm": 0.9146223068237305, + "learning_rate": 3.865985866835673e-05, + "loss": 0.8165, + "num_input_tokens_seen": 10585520, + "step": 1975 + }, + { + "epoch": 0.3168, + "grad_norm": 1.1149648427963257, + "learning_rate": 3.8607192143455326e-05, + "loss": 0.7437, + "num_input_tokens_seen": 10614560, + "step": 1980 + }, + { + "epoch": 0.3176, + "grad_norm": 0.9382626414299011, + "learning_rate": 3.8554439668132946e-05, + "loss": 0.7758, + "num_input_tokens_seen": 10637344, + "step": 1985 + }, + { + "epoch": 0.3184, + "grad_norm": 0.9469596743583679, + "learning_rate": 3.85016015756029e-05, + "loss": 0.7593, + "num_input_tokens_seen": 10663440, + "step": 1990 + }, + { + "epoch": 0.3192, + "grad_norm": 0.8701977133750916, + "learning_rate": 3.844867819961928e-05, + "loss": 0.6535, + "num_input_tokens_seen": 10693392, + "step": 1995 + }, + { + "epoch": 0.32, + "grad_norm": 0.7110251188278198, + "learning_rate": 3.8395669874474915e-05, + "loss": 0.8263, + "num_input_tokens_seen": 10719232, + "step": 2000 + }, + { + "epoch": 0.3208, + "grad_norm": 0.8518005609512329, + "learning_rate": 3.8342576934999184e-05, + "loss": 0.7992, + "num_input_tokens_seen": 10746560, + "step": 2005 + }, + { + "epoch": 0.3216, + "grad_norm": 0.9604689478874207, + "learning_rate": 3.828939971655595e-05, + "loss": 0.7513, + "num_input_tokens_seen": 10768512, + "step": 2010 + }, + { + "epoch": 0.3224, + "grad_norm": 0.8639784455299377, + "learning_rate": 3.8236138555041434e-05, + "loss": 0.6775, + "num_input_tokens_seen": 10803648, + "step": 2015 + }, + { + "epoch": 0.3232, + "grad_norm": 0.8527281880378723, + "learning_rate": 3.8182793786882065e-05, + "loss": 0.7856, + "num_input_tokens_seen": 10830640, + "step": 2020 + }, + { + "epoch": 0.324, + "grad_norm": 0.7717742919921875, + "learning_rate": 3.81293657490324e-05, + "loss": 0.6793, + "num_input_tokens_seen": 10860272, + "step": 2025 + }, + { + "epoch": 0.3248, + "grad_norm": 0.6685821413993835, + "learning_rate": 3.8075854778972955e-05, + "loss": 0.6546, + "num_input_tokens_seen": 10887664, + "step": 2030 + }, + { + "epoch": 0.3256, + "grad_norm": 0.9813340306282043, + "learning_rate": 3.802226121470811e-05, + "loss": 0.6673, + "num_input_tokens_seen": 10912000, + "step": 2035 + }, + { + "epoch": 0.3264, + "grad_norm": 1.0419212579727173, + "learning_rate": 3.796858539476394e-05, + "loss": 0.6933, + "num_input_tokens_seen": 10936704, + "step": 2040 + }, + { + "epoch": 0.3272, + "grad_norm": 0.851434588432312, + "learning_rate": 3.7914827658186103e-05, + "loss": 0.6593, + "num_input_tokens_seen": 10960464, + "step": 2045 + }, + { + "epoch": 0.328, + "grad_norm": 0.7272098660469055, + "learning_rate": 3.786098834453766e-05, + "loss": 0.6246, + "num_input_tokens_seen": 10989680, + "step": 2050 + }, + { + "epoch": 0.3288, + "grad_norm": 0.7740225791931152, + "learning_rate": 3.780706779389701e-05, + "loss": 0.7029, + "num_input_tokens_seen": 11014928, + "step": 2055 + }, + { + "epoch": 0.3296, + "grad_norm": 0.963455080986023, + "learning_rate": 3.775306634685562e-05, + "loss": 0.7331, + "num_input_tokens_seen": 11041920, + "step": 2060 + }, + { + "epoch": 0.3304, + "grad_norm": 0.7765479683876038, + "learning_rate": 3.7698984344515997e-05, + "loss": 0.6624, + "num_input_tokens_seen": 11070304, + "step": 2065 + }, + { + "epoch": 0.3312, + "grad_norm": 0.8283601999282837, + "learning_rate": 3.764482212848948e-05, + "loss": 0.7505, + "num_input_tokens_seen": 11099520, + "step": 2070 + }, + { + "epoch": 0.332, + "grad_norm": 0.5610854029655457, + "learning_rate": 3.759058004089402e-05, + "loss": 0.6908, + "num_input_tokens_seen": 11129008, + "step": 2075 + }, + { + "epoch": 0.3328, + "grad_norm": 0.8462053537368774, + "learning_rate": 3.753625842435216e-05, + "loss": 0.7062, + "num_input_tokens_seen": 11151600, + "step": 2080 + }, + { + "epoch": 0.3336, + "grad_norm": 0.8926122188568115, + "learning_rate": 3.748185762198873e-05, + "loss": 0.7177, + "num_input_tokens_seen": 11176784, + "step": 2085 + }, + { + "epoch": 0.3344, + "grad_norm": 0.6711943745613098, + "learning_rate": 3.742737797742878e-05, + "loss": 0.7504, + "num_input_tokens_seen": 11205008, + "step": 2090 + }, + { + "epoch": 0.3352, + "grad_norm": 1.014253854751587, + "learning_rate": 3.7372819834795335e-05, + "loss": 0.7144, + "num_input_tokens_seen": 11229872, + "step": 2095 + }, + { + "epoch": 0.336, + "grad_norm": 0.7249706983566284, + "learning_rate": 3.731818353870729e-05, + "loss": 0.6876, + "num_input_tokens_seen": 11253296, + "step": 2100 + }, + { + "epoch": 0.3368, + "grad_norm": 0.8249915838241577, + "learning_rate": 3.726346943427719e-05, + "loss": 0.7102, + "num_input_tokens_seen": 11279408, + "step": 2105 + }, + { + "epoch": 0.3376, + "grad_norm": 1.027541995048523, + "learning_rate": 3.720867786710904e-05, + "loss": 0.7708, + "num_input_tokens_seen": 11304176, + "step": 2110 + }, + { + "epoch": 0.3384, + "grad_norm": 0.7004812955856323, + "learning_rate": 3.7153809183296176e-05, + "loss": 0.5882, + "num_input_tokens_seen": 11330944, + "step": 2115 + }, + { + "epoch": 0.3392, + "grad_norm": 1.1122636795043945, + "learning_rate": 3.7098863729419e-05, + "loss": 0.6127, + "num_input_tokens_seen": 11354064, + "step": 2120 + }, + { + "epoch": 0.34, + "grad_norm": 0.925553560256958, + "learning_rate": 3.704384185254288e-05, + "loss": 0.7732, + "num_input_tokens_seen": 11376288, + "step": 2125 + }, + { + "epoch": 0.3408, + "grad_norm": 0.6940233707427979, + "learning_rate": 3.6988743900215894e-05, + "loss": 0.7334, + "num_input_tokens_seen": 11405472, + "step": 2130 + }, + { + "epoch": 0.3416, + "grad_norm": 0.7634669542312622, + "learning_rate": 3.693357022046665e-05, + "loss": 0.8137, + "num_input_tokens_seen": 11431552, + "step": 2135 + }, + { + "epoch": 0.3424, + "grad_norm": 0.804530680179596, + "learning_rate": 3.68783211618021e-05, + "loss": 0.6987, + "num_input_tokens_seen": 11459152, + "step": 2140 + }, + { + "epoch": 0.3432, + "grad_norm": 1.1058536767959595, + "learning_rate": 3.682299707320532e-05, + "loss": 0.6614, + "num_input_tokens_seen": 11487552, + "step": 2145 + }, + { + "epoch": 0.344, + "grad_norm": 0.6808910369873047, + "learning_rate": 3.6767598304133324e-05, + "loss": 0.688, + "num_input_tokens_seen": 11515792, + "step": 2150 + }, + { + "epoch": 0.3448, + "grad_norm": 1.0619826316833496, + "learning_rate": 3.671212520451484e-05, + "loss": 0.7897, + "num_input_tokens_seen": 11541280, + "step": 2155 + }, + { + "epoch": 0.3456, + "grad_norm": 0.8404290080070496, + "learning_rate": 3.665657812474812e-05, + "loss": 0.7086, + "num_input_tokens_seen": 11569440, + "step": 2160 + }, + { + "epoch": 0.3464, + "grad_norm": 1.316372036933899, + "learning_rate": 3.660095741569871e-05, + "loss": 0.7421, + "num_input_tokens_seen": 11597792, + "step": 2165 + }, + { + "epoch": 0.3472, + "grad_norm": 0.7798539400100708, + "learning_rate": 3.654526342869724e-05, + "loss": 0.6954, + "num_input_tokens_seen": 11622864, + "step": 2170 + }, + { + "epoch": 0.348, + "grad_norm": 0.7101672887802124, + "learning_rate": 3.6489496515537204e-05, + "loss": 0.6764, + "num_input_tokens_seen": 11651280, + "step": 2175 + }, + { + "epoch": 0.3488, + "grad_norm": 0.8456715941429138, + "learning_rate": 3.643365702847272e-05, + "loss": 0.705, + "num_input_tokens_seen": 11680048, + "step": 2180 + }, + { + "epoch": 0.3496, + "grad_norm": 0.9790185689926147, + "learning_rate": 3.6377745320216346e-05, + "loss": 0.7433, + "num_input_tokens_seen": 11702144, + "step": 2185 + }, + { + "epoch": 0.3504, + "grad_norm": 0.9205552935600281, + "learning_rate": 3.632176174393682e-05, + "loss": 0.653, + "num_input_tokens_seen": 11728816, + "step": 2190 + }, + { + "epoch": 0.3512, + "grad_norm": 0.8499376177787781, + "learning_rate": 3.626570665325684e-05, + "loss": 0.6381, + "num_input_tokens_seen": 11756688, + "step": 2195 + }, + { + "epoch": 0.352, + "grad_norm": 0.7778225541114807, + "learning_rate": 3.6209580402250815e-05, + "loss": 0.7347, + "num_input_tokens_seen": 11781664, + "step": 2200 + }, + { + "epoch": 0.3528, + "grad_norm": 0.8913766145706177, + "learning_rate": 3.615338334544265e-05, + "loss": 0.8036, + "num_input_tokens_seen": 11808352, + "step": 2205 + }, + { + "epoch": 0.3536, + "grad_norm": 1.0191758871078491, + "learning_rate": 3.6097115837803505e-05, + "loss": 0.7486, + "num_input_tokens_seen": 11836400, + "step": 2210 + }, + { + "epoch": 0.3544, + "grad_norm": 0.7858436703681946, + "learning_rate": 3.604077823474954e-05, + "loss": 0.7885, + "num_input_tokens_seen": 11862608, + "step": 2215 + }, + { + "epoch": 0.3552, + "grad_norm": 0.6349871158599854, + "learning_rate": 3.5984370892139666e-05, + "loss": 0.7005, + "num_input_tokens_seen": 11886528, + "step": 2220 + }, + { + "epoch": 0.356, + "grad_norm": 0.8877844214439392, + "learning_rate": 3.592789416627332e-05, + "loss": 0.607, + "num_input_tokens_seen": 11915040, + "step": 2225 + }, + { + "epoch": 0.3568, + "grad_norm": 1.1504970788955688, + "learning_rate": 3.5871348413888204e-05, + "loss": 0.6723, + "num_input_tokens_seen": 11942768, + "step": 2230 + }, + { + "epoch": 0.3576, + "grad_norm": 0.7394466400146484, + "learning_rate": 3.581473399215802e-05, + "loss": 0.7302, + "num_input_tokens_seen": 11978464, + "step": 2235 + }, + { + "epoch": 0.3584, + "grad_norm": 1.1570250988006592, + "learning_rate": 3.575805125869022e-05, + "loss": 0.6867, + "num_input_tokens_seen": 12001392, + "step": 2240 + }, + { + "epoch": 0.3592, + "grad_norm": 0.8141620755195618, + "learning_rate": 3.5701300571523755e-05, + "loss": 0.7346, + "num_input_tokens_seen": 12030352, + "step": 2245 + }, + { + "epoch": 0.36, + "grad_norm": 0.8653257489204407, + "learning_rate": 3.564448228912682e-05, + "loss": 0.6381, + "num_input_tokens_seen": 12062384, + "step": 2250 + }, + { + "epoch": 0.3608, + "grad_norm": 0.8065868020057678, + "learning_rate": 3.558759677039455e-05, + "loss": 0.7679, + "num_input_tokens_seen": 12089408, + "step": 2255 + }, + { + "epoch": 0.3616, + "grad_norm": 0.7610428929328918, + "learning_rate": 3.5530644374646815e-05, + "loss": 0.668, + "num_input_tokens_seen": 12114656, + "step": 2260 + }, + { + "epoch": 0.3624, + "grad_norm": 0.8063391447067261, + "learning_rate": 3.547362546162588e-05, + "loss": 0.7454, + "num_input_tokens_seen": 12144832, + "step": 2265 + }, + { + "epoch": 0.3632, + "grad_norm": 1.0300970077514648, + "learning_rate": 3.54165403914942e-05, + "loss": 0.7513, + "num_input_tokens_seen": 12170096, + "step": 2270 + }, + { + "epoch": 0.364, + "grad_norm": 1.1293412446975708, + "learning_rate": 3.535938952483211e-05, + "loss": 0.7881, + "num_input_tokens_seen": 12191104, + "step": 2275 + }, + { + "epoch": 0.3648, + "grad_norm": 0.8911874294281006, + "learning_rate": 3.5302173222635524e-05, + "loss": 0.7253, + "num_input_tokens_seen": 12214416, + "step": 2280 + }, + { + "epoch": 0.3656, + "grad_norm": 1.0665303468704224, + "learning_rate": 3.5244891846313736e-05, + "loss": 0.8122, + "num_input_tokens_seen": 12241344, + "step": 2285 + }, + { + "epoch": 0.3664, + "grad_norm": 0.6204916834831238, + "learning_rate": 3.5187545757687015e-05, + "loss": 0.6188, + "num_input_tokens_seen": 12269376, + "step": 2290 + }, + { + "epoch": 0.3672, + "grad_norm": 0.7871102094650269, + "learning_rate": 3.5130135318984456e-05, + "loss": 0.7138, + "num_input_tokens_seen": 12294960, + "step": 2295 + }, + { + "epoch": 0.368, + "grad_norm": 0.7584692239761353, + "learning_rate": 3.507266089284157e-05, + "loss": 0.7425, + "num_input_tokens_seen": 12318864, + "step": 2300 + }, + { + "epoch": 0.3688, + "grad_norm": 0.6678550839424133, + "learning_rate": 3.501512284229807e-05, + "loss": 0.7238, + "num_input_tokens_seen": 12345520, + "step": 2305 + }, + { + "epoch": 0.3696, + "grad_norm": 0.9825206398963928, + "learning_rate": 3.495752153079557e-05, + "loss": 0.684, + "num_input_tokens_seen": 12369600, + "step": 2310 + }, + { + "epoch": 0.3704, + "grad_norm": 0.8038123250007629, + "learning_rate": 3.489985732217525e-05, + "loss": 0.707, + "num_input_tokens_seen": 12394400, + "step": 2315 + }, + { + "epoch": 0.3712, + "grad_norm": 1.158873438835144, + "learning_rate": 3.484213058067559e-05, + "loss": 0.5843, + "num_input_tokens_seen": 12420848, + "step": 2320 + }, + { + "epoch": 0.372, + "grad_norm": 0.8114385604858398, + "learning_rate": 3.4784341670930065e-05, + "loss": 0.7014, + "num_input_tokens_seen": 12446192, + "step": 2325 + }, + { + "epoch": 0.3728, + "grad_norm": 0.8132364749908447, + "learning_rate": 3.4726490957964834e-05, + "loss": 0.777, + "num_input_tokens_seen": 12472960, + "step": 2330 + }, + { + "epoch": 0.3736, + "grad_norm": 0.7918152213096619, + "learning_rate": 3.466857880719645e-05, + "loss": 0.6856, + "num_input_tokens_seen": 12504256, + "step": 2335 + }, + { + "epoch": 0.3744, + "grad_norm": 0.8399984240531921, + "learning_rate": 3.461060558442952e-05, + "loss": 0.7742, + "num_input_tokens_seen": 12529872, + "step": 2340 + }, + { + "epoch": 0.3752, + "grad_norm": 1.0398231744766235, + "learning_rate": 3.455257165585444e-05, + "loss": 0.6815, + "num_input_tokens_seen": 12552368, + "step": 2345 + }, + { + "epoch": 0.376, + "grad_norm": 0.9708042144775391, + "learning_rate": 3.4494477388045035e-05, + "loss": 0.677, + "num_input_tokens_seen": 12576720, + "step": 2350 + }, + { + "epoch": 0.3768, + "grad_norm": 0.8928380012512207, + "learning_rate": 3.443632314795627e-05, + "loss": 0.6239, + "num_input_tokens_seen": 12606096, + "step": 2355 + }, + { + "epoch": 0.3776, + "grad_norm": 1.3437156677246094, + "learning_rate": 3.437810930292195e-05, + "loss": 0.7379, + "num_input_tokens_seen": 12631376, + "step": 2360 + }, + { + "epoch": 0.3784, + "grad_norm": 0.9309334754943848, + "learning_rate": 3.4319836220652335e-05, + "loss": 0.7315, + "num_input_tokens_seen": 12662096, + "step": 2365 + }, + { + "epoch": 0.3792, + "grad_norm": 1.4636520147323608, + "learning_rate": 3.4261504269231904e-05, + "loss": 0.7738, + "num_input_tokens_seen": 12691696, + "step": 2370 + }, + { + "epoch": 0.38, + "grad_norm": 0.8436228632926941, + "learning_rate": 3.4203113817116957e-05, + "loss": 0.7307, + "num_input_tokens_seen": 12718368, + "step": 2375 + }, + { + "epoch": 0.3808, + "grad_norm": 0.877709150314331, + "learning_rate": 3.414466523313332e-05, + "loss": 0.7119, + "num_input_tokens_seen": 12743664, + "step": 2380 + }, + { + "epoch": 0.3816, + "grad_norm": 1.2288016080856323, + "learning_rate": 3.408615888647402e-05, + "loss": 0.781, + "num_input_tokens_seen": 12775088, + "step": 2385 + }, + { + "epoch": 0.3824, + "grad_norm": 0.8335594534873962, + "learning_rate": 3.402759514669694e-05, + "loss": 0.6256, + "num_input_tokens_seen": 12802576, + "step": 2390 + }, + { + "epoch": 0.3832, + "grad_norm": 1.0417710542678833, + "learning_rate": 3.3968974383722495e-05, + "loss": 0.7672, + "num_input_tokens_seen": 12831280, + "step": 2395 + }, + { + "epoch": 0.384, + "grad_norm": 1.1079373359680176, + "learning_rate": 3.3910296967831266e-05, + "loss": 0.7665, + "num_input_tokens_seen": 12853744, + "step": 2400 + }, + { + "epoch": 0.3848, + "grad_norm": 0.870614230632782, + "learning_rate": 3.3851563269661726e-05, + "loss": 0.6321, + "num_input_tokens_seen": 12883408, + "step": 2405 + }, + { + "epoch": 0.3856, + "grad_norm": 1.090280294418335, + "learning_rate": 3.379277366020782e-05, + "loss": 0.7086, + "num_input_tokens_seen": 12914592, + "step": 2410 + }, + { + "epoch": 0.3864, + "grad_norm": 0.8816367983818054, + "learning_rate": 3.373392851081668e-05, + "loss": 0.7712, + "num_input_tokens_seen": 12936832, + "step": 2415 + }, + { + "epoch": 0.3872, + "grad_norm": 0.8722823858261108, + "learning_rate": 3.367502819318624e-05, + "loss": 0.6844, + "num_input_tokens_seen": 12962864, + "step": 2420 + }, + { + "epoch": 0.388, + "grad_norm": 0.9704541563987732, + "learning_rate": 3.3616073079362926e-05, + "loss": 0.6877, + "num_input_tokens_seen": 12992560, + "step": 2425 + }, + { + "epoch": 0.3888, + "grad_norm": 0.8094004988670349, + "learning_rate": 3.355706354173928e-05, + "loss": 0.8139, + "num_input_tokens_seen": 13015440, + "step": 2430 + }, + { + "epoch": 0.3896, + "grad_norm": 0.8286037445068359, + "learning_rate": 3.349799995305162e-05, + "loss": 0.6696, + "num_input_tokens_seen": 13039008, + "step": 2435 + }, + { + "epoch": 0.3904, + "grad_norm": 0.985637366771698, + "learning_rate": 3.343888268637765e-05, + "loss": 0.7001, + "num_input_tokens_seen": 13067648, + "step": 2440 + }, + { + "epoch": 0.3912, + "grad_norm": 0.8938013911247253, + "learning_rate": 3.337971211513417e-05, + "loss": 0.8036, + "num_input_tokens_seen": 13090064, + "step": 2445 + }, + { + "epoch": 0.392, + "grad_norm": 0.7293727397918701, + "learning_rate": 3.332048861307467e-05, + "loss": 0.7405, + "num_input_tokens_seen": 13119856, + "step": 2450 + }, + { + "epoch": 0.3928, + "grad_norm": 0.5999038219451904, + "learning_rate": 3.3261212554286975e-05, + "loss": 0.6975, + "num_input_tokens_seen": 13148288, + "step": 2455 + }, + { + "epoch": 0.3936, + "grad_norm": 0.8091318607330322, + "learning_rate": 3.320188431319088e-05, + "loss": 0.6809, + "num_input_tokens_seen": 13175616, + "step": 2460 + }, + { + "epoch": 0.3944, + "grad_norm": 1.0293824672698975, + "learning_rate": 3.3142504264535804e-05, + "loss": 0.7749, + "num_input_tokens_seen": 13199280, + "step": 2465 + }, + { + "epoch": 0.3952, + "grad_norm": 0.793485701084137, + "learning_rate": 3.3083072783398416e-05, + "loss": 0.6873, + "num_input_tokens_seen": 13224640, + "step": 2470 + }, + { + "epoch": 0.396, + "grad_norm": 0.8636240363121033, + "learning_rate": 3.302359024518024e-05, + "loss": 0.7554, + "num_input_tokens_seen": 13250448, + "step": 2475 + }, + { + "epoch": 0.3968, + "grad_norm": 0.9471914172172546, + "learning_rate": 3.296405702560532e-05, + "loss": 0.7112, + "num_input_tokens_seen": 13273472, + "step": 2480 + }, + { + "epoch": 0.3976, + "grad_norm": 1.1579172611236572, + "learning_rate": 3.2904473500717824e-05, + "loss": 0.8207, + "num_input_tokens_seen": 13300608, + "step": 2485 + }, + { + "epoch": 0.3984, + "grad_norm": 1.022197961807251, + "learning_rate": 3.2844840046879686e-05, + "loss": 0.693, + "num_input_tokens_seen": 13326976, + "step": 2490 + }, + { + "epoch": 0.3992, + "grad_norm": 0.7574387788772583, + "learning_rate": 3.278515704076821e-05, + "loss": 0.6826, + "num_input_tokens_seen": 13358528, + "step": 2495 + }, + { + "epoch": 0.4, + "grad_norm": 0.7097072005271912, + "learning_rate": 3.272542485937369e-05, + "loss": 0.6714, + "num_input_tokens_seen": 13384096, + "step": 2500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.8780053853988647, + "learning_rate": 3.2665643879997056e-05, + "loss": 0.7387, + "num_input_tokens_seen": 13417120, + "step": 2505 + }, + { + "epoch": 0.4016, + "grad_norm": 0.8968010544776917, + "learning_rate": 3.260581448024745e-05, + "loss": 0.6875, + "num_input_tokens_seen": 13444832, + "step": 2510 + }, + { + "epoch": 0.4024, + "grad_norm": 0.9647771716117859, + "learning_rate": 3.25459370380399e-05, + "loss": 0.834, + "num_input_tokens_seen": 13472304, + "step": 2515 + }, + { + "epoch": 0.4032, + "grad_norm": 0.9738301038742065, + "learning_rate": 3.248601193159287e-05, + "loss": 0.7144, + "num_input_tokens_seen": 13495984, + "step": 2520 + }, + { + "epoch": 0.404, + "grad_norm": 1.03775155544281, + "learning_rate": 3.2426039539425876e-05, + "loss": 0.7171, + "num_input_tokens_seen": 13523360, + "step": 2525 + }, + { + "epoch": 0.4048, + "grad_norm": 1.3964909315109253, + "learning_rate": 3.236602024035716e-05, + "loss": 0.7197, + "num_input_tokens_seen": 13550016, + "step": 2530 + }, + { + "epoch": 0.4056, + "grad_norm": 1.0805152654647827, + "learning_rate": 3.230595441350125e-05, + "loss": 0.7997, + "num_input_tokens_seen": 13575088, + "step": 2535 + }, + { + "epoch": 0.4064, + "grad_norm": 0.9613687992095947, + "learning_rate": 3.2245842438266526e-05, + "loss": 0.7847, + "num_input_tokens_seen": 13600832, + "step": 2540 + }, + { + "epoch": 0.4072, + "grad_norm": 0.9843304753303528, + "learning_rate": 3.2185684694352916e-05, + "loss": 0.7213, + "num_input_tokens_seen": 13627328, + "step": 2545 + }, + { + "epoch": 0.408, + "grad_norm": 0.7906083464622498, + "learning_rate": 3.21254815617494e-05, + "loss": 0.633, + "num_input_tokens_seen": 13651664, + "step": 2550 + }, + { + "epoch": 0.4088, + "grad_norm": 0.788149893283844, + "learning_rate": 3.206523342073172e-05, + "loss": 0.7512, + "num_input_tokens_seen": 13677248, + "step": 2555 + }, + { + "epoch": 0.4096, + "grad_norm": 0.7680060863494873, + "learning_rate": 3.2004940651859844e-05, + "loss": 0.703, + "num_input_tokens_seen": 13705904, + "step": 2560 + }, + { + "epoch": 0.4104, + "grad_norm": 0.8078610301017761, + "learning_rate": 3.194460363597569e-05, + "loss": 0.7212, + "num_input_tokens_seen": 13731520, + "step": 2565 + }, + { + "epoch": 0.4112, + "grad_norm": 1.2152231931686401, + "learning_rate": 3.1884222754200625e-05, + "loss": 0.7009, + "num_input_tokens_seen": 13753840, + "step": 2570 + }, + { + "epoch": 0.412, + "grad_norm": 0.8687548637390137, + "learning_rate": 3.1823798387933134e-05, + "loss": 0.718, + "num_input_tokens_seen": 13777504, + "step": 2575 + }, + { + "epoch": 0.4128, + "grad_norm": 1.1128169298171997, + "learning_rate": 3.176333091884635e-05, + "loss": 0.6796, + "num_input_tokens_seen": 13805392, + "step": 2580 + }, + { + "epoch": 0.4136, + "grad_norm": 0.6620244383811951, + "learning_rate": 3.170282072888566e-05, + "loss": 0.6632, + "num_input_tokens_seen": 13835600, + "step": 2585 + }, + { + "epoch": 0.4144, + "grad_norm": 1.0803226232528687, + "learning_rate": 3.1642268200266317e-05, + "loss": 0.743, + "num_input_tokens_seen": 13862528, + "step": 2590 + }, + { + "epoch": 0.4152, + "grad_norm": 0.8314620852470398, + "learning_rate": 3.1581673715471006e-05, + "loss": 0.7091, + "num_input_tokens_seen": 13890272, + "step": 2595 + }, + { + "epoch": 0.416, + "grad_norm": 1.0047166347503662, + "learning_rate": 3.152103765724743e-05, + "loss": 0.8011, + "num_input_tokens_seen": 13913328, + "step": 2600 + }, + { + "epoch": 0.4168, + "grad_norm": 0.9856431484222412, + "learning_rate": 3.1460360408605866e-05, + "loss": 0.7569, + "num_input_tokens_seen": 13943040, + "step": 2605 + }, + { + "epoch": 0.4176, + "grad_norm": 0.8467027544975281, + "learning_rate": 3.139964235281682e-05, + "loss": 0.6976, + "num_input_tokens_seen": 13971872, + "step": 2610 + }, + { + "epoch": 0.4184, + "grad_norm": 1.2195795774459839, + "learning_rate": 3.1338883873408516e-05, + "loss": 0.7039, + "num_input_tokens_seen": 13997456, + "step": 2615 + }, + { + "epoch": 0.4192, + "grad_norm": 0.832929253578186, + "learning_rate": 3.127808535416454e-05, + "loss": 0.7153, + "num_input_tokens_seen": 14024656, + "step": 2620 + }, + { + "epoch": 0.42, + "grad_norm": 0.8261767625808716, + "learning_rate": 3.121724717912138e-05, + "loss": 0.7317, + "num_input_tokens_seen": 14053680, + "step": 2625 + }, + { + "epoch": 0.4208, + "grad_norm": 0.8690986633300781, + "learning_rate": 3.1156369732566006e-05, + "loss": 0.6991, + "num_input_tokens_seen": 14080096, + "step": 2630 + }, + { + "epoch": 0.4216, + "grad_norm": 1.041561484336853, + "learning_rate": 3.1095453399033466e-05, + "loss": 0.7442, + "num_input_tokens_seen": 14108080, + "step": 2635 + }, + { + "epoch": 0.4224, + "grad_norm": 1.1139183044433594, + "learning_rate": 3.103449856330443e-05, + "loss": 0.7026, + "num_input_tokens_seen": 14132448, + "step": 2640 + }, + { + "epoch": 0.4232, + "grad_norm": 0.9388411045074463, + "learning_rate": 3.0973505610402765e-05, + "loss": 0.6425, + "num_input_tokens_seen": 14157312, + "step": 2645 + }, + { + "epoch": 0.424, + "grad_norm": 0.8923696279525757, + "learning_rate": 3.091247492559312e-05, + "loss": 0.7421, + "num_input_tokens_seen": 14184288, + "step": 2650 + }, + { + "epoch": 0.4248, + "grad_norm": 0.9683478474617004, + "learning_rate": 3.085140689437846e-05, + "loss": 0.7044, + "num_input_tokens_seen": 14207920, + "step": 2655 + }, + { + "epoch": 0.4256, + "grad_norm": 0.7942652106285095, + "learning_rate": 3.0790301902497666e-05, + "loss": 0.6892, + "num_input_tokens_seen": 14235504, + "step": 2660 + }, + { + "epoch": 0.4264, + "grad_norm": 0.9955897331237793, + "learning_rate": 3.072916033592307e-05, + "loss": 0.6595, + "num_input_tokens_seen": 14259280, + "step": 2665 + }, + { + "epoch": 0.4272, + "grad_norm": 0.9912785291671753, + "learning_rate": 3.0667982580858044e-05, + "loss": 0.6948, + "num_input_tokens_seen": 14286592, + "step": 2670 + }, + { + "epoch": 0.428, + "grad_norm": 1.352742314338684, + "learning_rate": 3.0606769023734536e-05, + "loss": 0.7009, + "num_input_tokens_seen": 14309280, + "step": 2675 + }, + { + "epoch": 0.4288, + "grad_norm": 1.183185338973999, + "learning_rate": 3.054552005121064e-05, + "loss": 0.6814, + "num_input_tokens_seen": 14335984, + "step": 2680 + }, + { + "epoch": 0.4296, + "grad_norm": 1.2679824829101562, + "learning_rate": 3.0484236050168153e-05, + "loss": 0.7468, + "num_input_tokens_seen": 14361024, + "step": 2685 + }, + { + "epoch": 0.4304, + "grad_norm": 1.1353107690811157, + "learning_rate": 3.0422917407710137e-05, + "loss": 0.629, + "num_input_tokens_seen": 14391440, + "step": 2690 + }, + { + "epoch": 0.4312, + "grad_norm": 1.1603094339370728, + "learning_rate": 3.0361564511158457e-05, + "loss": 0.7106, + "num_input_tokens_seen": 14417952, + "step": 2695 + }, + { + "epoch": 0.432, + "grad_norm": 0.9477285146713257, + "learning_rate": 3.0300177748051373e-05, + "loss": 0.7136, + "num_input_tokens_seen": 14446752, + "step": 2700 + }, + { + "epoch": 0.4328, + "grad_norm": 0.9295204281806946, + "learning_rate": 3.0238757506141012e-05, + "loss": 0.6269, + "num_input_tokens_seen": 14475280, + "step": 2705 + }, + { + "epoch": 0.4336, + "grad_norm": 0.8617603182792664, + "learning_rate": 3.0177304173391037e-05, + "loss": 0.6517, + "num_input_tokens_seen": 14498112, + "step": 2710 + }, + { + "epoch": 0.4344, + "grad_norm": 0.962295413017273, + "learning_rate": 3.0115818137974067e-05, + "loss": 0.6903, + "num_input_tokens_seen": 14525664, + "step": 2715 + }, + { + "epoch": 0.4352, + "grad_norm": 0.7317754030227661, + "learning_rate": 3.005429978826934e-05, + "loss": 0.7302, + "num_input_tokens_seen": 14551536, + "step": 2720 + }, + { + "epoch": 0.436, + "grad_norm": 0.9604383111000061, + "learning_rate": 2.9992749512860173e-05, + "loss": 0.7126, + "num_input_tokens_seen": 14574560, + "step": 2725 + }, + { + "epoch": 0.4368, + "grad_norm": 0.9363977313041687, + "learning_rate": 2.9931167700531578e-05, + "loss": 0.6701, + "num_input_tokens_seen": 14602384, + "step": 2730 + }, + { + "epoch": 0.4376, + "grad_norm": 1.0513427257537842, + "learning_rate": 2.9869554740267724e-05, + "loss": 0.5816, + "num_input_tokens_seen": 14633728, + "step": 2735 + }, + { + "epoch": 0.4384, + "grad_norm": 1.0142287015914917, + "learning_rate": 2.9807911021249573e-05, + "loss": 0.7965, + "num_input_tokens_seen": 14662752, + "step": 2740 + }, + { + "epoch": 0.4392, + "grad_norm": 0.8593106269836426, + "learning_rate": 2.9746236932852355e-05, + "loss": 0.6396, + "num_input_tokens_seen": 14690896, + "step": 2745 + }, + { + "epoch": 0.44, + "grad_norm": 0.912413477897644, + "learning_rate": 2.9684532864643122e-05, + "loss": 0.6914, + "num_input_tokens_seen": 14717680, + "step": 2750 + }, + { + "epoch": 0.4408, + "grad_norm": 1.1753630638122559, + "learning_rate": 2.9622799206378305e-05, + "loss": 0.7188, + "num_input_tokens_seen": 14744176, + "step": 2755 + }, + { + "epoch": 0.4416, + "grad_norm": 1.0383411645889282, + "learning_rate": 2.956103634800126e-05, + "loss": 0.6936, + "num_input_tokens_seen": 14772464, + "step": 2760 + }, + { + "epoch": 0.4424, + "grad_norm": 0.8875827789306641, + "learning_rate": 2.949924467963975e-05, + "loss": 0.709, + "num_input_tokens_seen": 14800896, + "step": 2765 + }, + { + "epoch": 0.4432, + "grad_norm": 1.0359493494033813, + "learning_rate": 2.943742459160354e-05, + "loss": 0.6361, + "num_input_tokens_seen": 14826624, + "step": 2770 + }, + { + "epoch": 0.444, + "grad_norm": 0.7070389986038208, + "learning_rate": 2.9375576474381905e-05, + "loss": 0.6062, + "num_input_tokens_seen": 14859392, + "step": 2775 + }, + { + "epoch": 0.4448, + "grad_norm": 1.0716419219970703, + "learning_rate": 2.9313700718641167e-05, + "loss": 0.7882, + "num_input_tokens_seen": 14882336, + "step": 2780 + }, + { + "epoch": 0.4456, + "grad_norm": 0.8054667115211487, + "learning_rate": 2.925179771522223e-05, + "loss": 0.7978, + "num_input_tokens_seen": 14911312, + "step": 2785 + }, + { + "epoch": 0.4464, + "grad_norm": 0.9502078294754028, + "learning_rate": 2.9189867855138103e-05, + "loss": 0.6835, + "num_input_tokens_seen": 14938400, + "step": 2790 + }, + { + "epoch": 0.4472, + "grad_norm": 0.8377355933189392, + "learning_rate": 2.912791152957145e-05, + "loss": 0.6566, + "num_input_tokens_seen": 14965424, + "step": 2795 + }, + { + "epoch": 0.448, + "grad_norm": 0.8674115538597107, + "learning_rate": 2.9065929129872094e-05, + "loss": 0.6616, + "num_input_tokens_seen": 14994368, + "step": 2800 + }, + { + "epoch": 0.4488, + "grad_norm": 0.9967759251594543, + "learning_rate": 2.900392104755455e-05, + "loss": 0.8051, + "num_input_tokens_seen": 15018480, + "step": 2805 + }, + { + "epoch": 0.4496, + "grad_norm": 1.04585862159729, + "learning_rate": 2.894188767429557e-05, + "loss": 0.6961, + "num_input_tokens_seen": 15045840, + "step": 2810 + }, + { + "epoch": 0.4504, + "grad_norm": 0.8065064549446106, + "learning_rate": 2.8879829401931652e-05, + "loss": 0.6898, + "num_input_tokens_seen": 15070832, + "step": 2815 + }, + { + "epoch": 0.4512, + "grad_norm": 0.7077392935752869, + "learning_rate": 2.881774662245658e-05, + "loss": 0.6789, + "num_input_tokens_seen": 15097008, + "step": 2820 + }, + { + "epoch": 0.452, + "grad_norm": 1.068467378616333, + "learning_rate": 2.875563972801893e-05, + "loss": 0.6684, + "num_input_tokens_seen": 15120080, + "step": 2825 + }, + { + "epoch": 0.4528, + "grad_norm": 0.7860395312309265, + "learning_rate": 2.8693509110919598e-05, + "loss": 0.6577, + "num_input_tokens_seen": 15144976, + "step": 2830 + }, + { + "epoch": 0.4536, + "grad_norm": 0.86238032579422, + "learning_rate": 2.863135516360932e-05, + "loss": 0.7893, + "num_input_tokens_seen": 15174640, + "step": 2835 + }, + { + "epoch": 0.4544, + "grad_norm": 0.8910583257675171, + "learning_rate": 2.856917827868622e-05, + "loss": 0.7377, + "num_input_tokens_seen": 15198128, + "step": 2840 + }, + { + "epoch": 0.4552, + "grad_norm": 0.9576541781425476, + "learning_rate": 2.8506978848893302e-05, + "loss": 0.821, + "num_input_tokens_seen": 15222224, + "step": 2845 + }, + { + "epoch": 0.456, + "grad_norm": 1.114388108253479, + "learning_rate": 2.844475726711595e-05, + "loss": 0.695, + "num_input_tokens_seen": 15246640, + "step": 2850 + }, + { + "epoch": 0.4568, + "grad_norm": 0.8997412323951721, + "learning_rate": 2.8382513926379504e-05, + "loss": 0.7175, + "num_input_tokens_seen": 15277728, + "step": 2855 + }, + { + "epoch": 0.4576, + "grad_norm": 1.1595414876937866, + "learning_rate": 2.832024921984674e-05, + "loss": 0.6505, + "num_input_tokens_seen": 15307040, + "step": 2860 + }, + { + "epoch": 0.4584, + "grad_norm": 0.7592776417732239, + "learning_rate": 2.825796354081537e-05, + "loss": 0.6686, + "num_input_tokens_seen": 15334176, + "step": 2865 + }, + { + "epoch": 0.4592, + "grad_norm": 1.0087366104125977, + "learning_rate": 2.8195657282715594e-05, + "loss": 0.6365, + "num_input_tokens_seen": 15360496, + "step": 2870 + }, + { + "epoch": 0.46, + "grad_norm": 0.9191427826881409, + "learning_rate": 2.8133330839107608e-05, + "loss": 0.6518, + "num_input_tokens_seen": 15381328, + "step": 2875 + }, + { + "epoch": 0.4608, + "grad_norm": 1.0468344688415527, + "learning_rate": 2.8070984603679107e-05, + "loss": 0.6262, + "num_input_tokens_seen": 15409936, + "step": 2880 + }, + { + "epoch": 0.4616, + "grad_norm": 1.1070493459701538, + "learning_rate": 2.800861897024279e-05, + "loss": 0.6684, + "num_input_tokens_seen": 15436848, + "step": 2885 + }, + { + "epoch": 0.4624, + "grad_norm": 1.3349978923797607, + "learning_rate": 2.79462343327339e-05, + "loss": 0.6978, + "num_input_tokens_seen": 15463328, + "step": 2890 + }, + { + "epoch": 0.4632, + "grad_norm": 0.7566163539886475, + "learning_rate": 2.7883831085207707e-05, + "loss": 0.7062, + "num_input_tokens_seen": 15489232, + "step": 2895 + }, + { + "epoch": 0.464, + "grad_norm": 0.7610609531402588, + "learning_rate": 2.782140962183704e-05, + "loss": 0.6642, + "num_input_tokens_seen": 15516224, + "step": 2900 + }, + { + "epoch": 0.4648, + "grad_norm": 0.7749585509300232, + "learning_rate": 2.7758970336909795e-05, + "loss": 0.6287, + "num_input_tokens_seen": 15545584, + "step": 2905 + }, + { + "epoch": 0.4656, + "grad_norm": 1.0202007293701172, + "learning_rate": 2.769651362482642e-05, + "loss": 0.6672, + "num_input_tokens_seen": 15571216, + "step": 2910 + }, + { + "epoch": 0.4664, + "grad_norm": 0.7980359792709351, + "learning_rate": 2.763403988009746e-05, + "loss": 0.737, + "num_input_tokens_seen": 15597744, + "step": 2915 + }, + { + "epoch": 0.4672, + "grad_norm": 0.942456841468811, + "learning_rate": 2.7571549497341042e-05, + "loss": 0.7715, + "num_input_tokens_seen": 15622496, + "step": 2920 + }, + { + "epoch": 0.468, + "grad_norm": 0.7782229781150818, + "learning_rate": 2.7509042871280372e-05, + "loss": 0.7435, + "num_input_tokens_seen": 15647344, + "step": 2925 + }, + { + "epoch": 0.4688, + "grad_norm": 1.0889509916305542, + "learning_rate": 2.744652039674129e-05, + "loss": 0.6946, + "num_input_tokens_seen": 15672672, + "step": 2930 + }, + { + "epoch": 0.4696, + "grad_norm": 1.0606461763381958, + "learning_rate": 2.7383982468649714e-05, + "loss": 0.7523, + "num_input_tokens_seen": 15696144, + "step": 2935 + }, + { + "epoch": 0.4704, + "grad_norm": 0.942613959312439, + "learning_rate": 2.73214294820292e-05, + "loss": 0.7218, + "num_input_tokens_seen": 15723600, + "step": 2940 + }, + { + "epoch": 0.4712, + "grad_norm": 0.8650354743003845, + "learning_rate": 2.7258861831998388e-05, + "loss": 0.6736, + "num_input_tokens_seen": 15749680, + "step": 2945 + }, + { + "epoch": 0.472, + "grad_norm": 0.8573226928710938, + "learning_rate": 2.7196279913768584e-05, + "loss": 0.6314, + "num_input_tokens_seen": 15776768, + "step": 2950 + }, + { + "epoch": 0.4728, + "grad_norm": 0.9692303538322449, + "learning_rate": 2.713368412264118e-05, + "loss": 0.7035, + "num_input_tokens_seen": 15801376, + "step": 2955 + }, + { + "epoch": 0.4736, + "grad_norm": 1.2111790180206299, + "learning_rate": 2.707107485400521e-05, + "loss": 0.6785, + "num_input_tokens_seen": 15828416, + "step": 2960 + }, + { + "epoch": 0.4744, + "grad_norm": 1.0816082954406738, + "learning_rate": 2.7008452503334858e-05, + "loss": 0.7672, + "num_input_tokens_seen": 15852720, + "step": 2965 + }, + { + "epoch": 0.4752, + "grad_norm": 0.980903148651123, + "learning_rate": 2.6945817466186912e-05, + "loss": 0.7723, + "num_input_tokens_seen": 15880624, + "step": 2970 + }, + { + "epoch": 0.476, + "grad_norm": 1.012623906135559, + "learning_rate": 2.6883170138198323e-05, + "loss": 0.6105, + "num_input_tokens_seen": 15912176, + "step": 2975 + }, + { + "epoch": 0.4768, + "grad_norm": 1.069486141204834, + "learning_rate": 2.6820510915083648e-05, + "loss": 0.6941, + "num_input_tokens_seen": 15944384, + "step": 2980 + }, + { + "epoch": 0.4776, + "grad_norm": 0.8411433100700378, + "learning_rate": 2.6757840192632598e-05, + "loss": 0.6669, + "num_input_tokens_seen": 15969680, + "step": 2985 + }, + { + "epoch": 0.4784, + "grad_norm": 0.6902319192886353, + "learning_rate": 2.6695158366707522e-05, + "loss": 0.6814, + "num_input_tokens_seen": 15997264, + "step": 2990 + }, + { + "epoch": 0.4792, + "grad_norm": 0.7844128012657166, + "learning_rate": 2.6632465833240893e-05, + "loss": 0.5641, + "num_input_tokens_seen": 16029664, + "step": 2995 + }, + { + "epoch": 0.48, + "grad_norm": 1.1021162271499634, + "learning_rate": 2.656976298823284e-05, + "loss": 0.7797, + "num_input_tokens_seen": 16054864, + "step": 3000 + }, + { + "epoch": 0.4808, + "grad_norm": 1.2080223560333252, + "learning_rate": 2.650705022774859e-05, + "loss": 0.6778, + "num_input_tokens_seen": 16079552, + "step": 3005 + }, + { + "epoch": 0.4816, + "grad_norm": 1.1015046834945679, + "learning_rate": 2.6444327947916036e-05, + "loss": 0.6806, + "num_input_tokens_seen": 16105632, + "step": 3010 + }, + { + "epoch": 0.4824, + "grad_norm": 1.0269590616226196, + "learning_rate": 2.638159654492318e-05, + "loss": 0.7589, + "num_input_tokens_seen": 16134688, + "step": 3015 + }, + { + "epoch": 0.4832, + "grad_norm": 0.8565163612365723, + "learning_rate": 2.6318856415015664e-05, + "loss": 0.677, + "num_input_tokens_seen": 16163152, + "step": 3020 + }, + { + "epoch": 0.484, + "grad_norm": 0.8519279956817627, + "learning_rate": 2.6256107954494242e-05, + "loss": 0.6136, + "num_input_tokens_seen": 16189248, + "step": 3025 + }, + { + "epoch": 0.4848, + "grad_norm": 0.9466794729232788, + "learning_rate": 2.6193351559712292e-05, + "loss": 0.6369, + "num_input_tokens_seen": 16214832, + "step": 3030 + }, + { + "epoch": 0.4856, + "grad_norm": 0.9867402911186218, + "learning_rate": 2.6130587627073315e-05, + "loss": 0.7202, + "num_input_tokens_seen": 16244736, + "step": 3035 + }, + { + "epoch": 0.4864, + "grad_norm": 0.8893384337425232, + "learning_rate": 2.606781655302843e-05, + "loss": 0.7057, + "num_input_tokens_seen": 16272064, + "step": 3040 + }, + { + "epoch": 0.4872, + "grad_norm": 1.2341115474700928, + "learning_rate": 2.6005038734073833e-05, + "loss": 0.6715, + "num_input_tokens_seen": 16301344, + "step": 3045 + }, + { + "epoch": 0.488, + "grad_norm": 1.0158292055130005, + "learning_rate": 2.594225456674837e-05, + "loss": 0.7479, + "num_input_tokens_seen": 16325872, + "step": 3050 + }, + { + "epoch": 0.4888, + "grad_norm": 0.9316710233688354, + "learning_rate": 2.5879464447630946e-05, + "loss": 0.6581, + "num_input_tokens_seen": 16352272, + "step": 3055 + }, + { + "epoch": 0.4896, + "grad_norm": 0.9104299545288086, + "learning_rate": 2.5816668773338098e-05, + "loss": 0.691, + "num_input_tokens_seen": 16380464, + "step": 3060 + }, + { + "epoch": 0.4904, + "grad_norm": 0.8835129737854004, + "learning_rate": 2.575386794052142e-05, + "loss": 0.6606, + "num_input_tokens_seen": 16408736, + "step": 3065 + }, + { + "epoch": 0.4912, + "grad_norm": 0.869504451751709, + "learning_rate": 2.569106234586511e-05, + "loss": 0.729, + "num_input_tokens_seen": 16436352, + "step": 3070 + }, + { + "epoch": 0.492, + "grad_norm": 1.0879722833633423, + "learning_rate": 2.562825238608344e-05, + "loss": 0.7137, + "num_input_tokens_seen": 16464624, + "step": 3075 + }, + { + "epoch": 0.4928, + "grad_norm": 0.9328833818435669, + "learning_rate": 2.5565438457918244e-05, + "loss": 0.7238, + "num_input_tokens_seen": 16496720, + "step": 3080 + }, + { + "epoch": 0.4936, + "grad_norm": 0.7433749437332153, + "learning_rate": 2.5502620958136443e-05, + "loss": 0.7019, + "num_input_tokens_seen": 16524208, + "step": 3085 + }, + { + "epoch": 0.4944, + "grad_norm": 0.7768478989601135, + "learning_rate": 2.5439800283527494e-05, + "loss": 0.5851, + "num_input_tokens_seen": 16552192, + "step": 3090 + }, + { + "epoch": 0.4952, + "grad_norm": 1.139993667602539, + "learning_rate": 2.537697683090093e-05, + "loss": 0.7357, + "num_input_tokens_seen": 16578144, + "step": 3095 + }, + { + "epoch": 0.496, + "grad_norm": 0.9104892611503601, + "learning_rate": 2.531415099708382e-05, + "loss": 0.6254, + "num_input_tokens_seen": 16608288, + "step": 3100 + }, + { + "epoch": 0.4968, + "grad_norm": 0.6912931799888611, + "learning_rate": 2.5251323178918268e-05, + "loss": 0.7284, + "num_input_tokens_seen": 16636176, + "step": 3105 + }, + { + "epoch": 0.4976, + "grad_norm": 0.8370018601417542, + "learning_rate": 2.518849377325893e-05, + "loss": 0.8136, + "num_input_tokens_seen": 16659168, + "step": 3110 + }, + { + "epoch": 0.4984, + "grad_norm": 1.049914836883545, + "learning_rate": 2.5125663176970476e-05, + "loss": 0.7334, + "num_input_tokens_seen": 16687344, + "step": 3115 + }, + { + "epoch": 0.4992, + "grad_norm": 1.0298138856887817, + "learning_rate": 2.5062831786925102e-05, + "loss": 0.7599, + "num_input_tokens_seen": 16714496, + "step": 3120 + }, + { + "epoch": 0.5, + "grad_norm": 0.7912611961364746, + "learning_rate": 2.5e-05, + "loss": 0.6517, + "num_input_tokens_seen": 16742528, + "step": 3125 + }, + { + "epoch": 0.5008, + "grad_norm": 1.0854965448379517, + "learning_rate": 2.4937168213074907e-05, + "loss": 0.6778, + "num_input_tokens_seen": 16771248, + "step": 3130 + }, + { + "epoch": 0.5016, + "grad_norm": 0.9674849510192871, + "learning_rate": 2.4874336823029526e-05, + "loss": 0.6847, + "num_input_tokens_seen": 16799136, + "step": 3135 + }, + { + "epoch": 0.5024, + "grad_norm": 0.8434900641441345, + "learning_rate": 2.481150622674108e-05, + "loss": 0.6638, + "num_input_tokens_seen": 16825648, + "step": 3140 + }, + { + "epoch": 0.5032, + "grad_norm": 0.8714620471000671, + "learning_rate": 2.4748676821081738e-05, + "loss": 0.7139, + "num_input_tokens_seen": 16852240, + "step": 3145 + }, + { + "epoch": 0.504, + "grad_norm": 0.8312164545059204, + "learning_rate": 2.4685849002916183e-05, + "loss": 0.7507, + "num_input_tokens_seen": 16878624, + "step": 3150 + }, + { + "epoch": 0.5048, + "grad_norm": 1.1353472471237183, + "learning_rate": 2.4623023169099073e-05, + "loss": 0.6951, + "num_input_tokens_seen": 16906864, + "step": 3155 + }, + { + "epoch": 0.5056, + "grad_norm": 1.1486365795135498, + "learning_rate": 2.4560199716472508e-05, + "loss": 0.733, + "num_input_tokens_seen": 16930080, + "step": 3160 + }, + { + "epoch": 0.5064, + "grad_norm": 0.9651095867156982, + "learning_rate": 2.449737904186357e-05, + "loss": 0.7517, + "num_input_tokens_seen": 16952240, + "step": 3165 + }, + { + "epoch": 0.5072, + "grad_norm": 0.8921483755111694, + "learning_rate": 2.4434561542081762e-05, + "loss": 0.7472, + "num_input_tokens_seen": 16985408, + "step": 3170 + }, + { + "epoch": 0.508, + "grad_norm": 1.0625066757202148, + "learning_rate": 2.4371747613916566e-05, + "loss": 0.7514, + "num_input_tokens_seen": 17013776, + "step": 3175 + }, + { + "epoch": 0.5088, + "grad_norm": 1.10313081741333, + "learning_rate": 2.4308937654134893e-05, + "loss": 0.7633, + "num_input_tokens_seen": 17039120, + "step": 3180 + }, + { + "epoch": 0.5096, + "grad_norm": 1.115670084953308, + "learning_rate": 2.4246132059478578e-05, + "loss": 0.6606, + "num_input_tokens_seen": 17065296, + "step": 3185 + }, + { + "epoch": 0.5104, + "grad_norm": 1.0417555570602417, + "learning_rate": 2.418333122666191e-05, + "loss": 0.764, + "num_input_tokens_seen": 17089264, + "step": 3190 + }, + { + "epoch": 0.5112, + "grad_norm": 0.9926926493644714, + "learning_rate": 2.412053555236906e-05, + "loss": 0.751, + "num_input_tokens_seen": 17117488, + "step": 3195 + }, + { + "epoch": 0.512, + "grad_norm": 1.1716359853744507, + "learning_rate": 2.4057745433251635e-05, + "loss": 0.7067, + "num_input_tokens_seen": 17141232, + "step": 3200 + }, + { + "epoch": 0.5128, + "grad_norm": 1.0248827934265137, + "learning_rate": 2.3994961265926166e-05, + "loss": 0.6432, + "num_input_tokens_seen": 17171632, + "step": 3205 + }, + { + "epoch": 0.5136, + "grad_norm": 0.8832619190216064, + "learning_rate": 2.3932183446971583e-05, + "loss": 0.6373, + "num_input_tokens_seen": 17198640, + "step": 3210 + }, + { + "epoch": 0.5144, + "grad_norm": 0.8581608533859253, + "learning_rate": 2.3869412372926687e-05, + "loss": 0.7347, + "num_input_tokens_seen": 17228240, + "step": 3215 + }, + { + "epoch": 0.5152, + "grad_norm": 1.0683339834213257, + "learning_rate": 2.3806648440287714e-05, + "loss": 0.6789, + "num_input_tokens_seen": 17259392, + "step": 3220 + }, + { + "epoch": 0.516, + "grad_norm": 0.9491643309593201, + "learning_rate": 2.3743892045505764e-05, + "loss": 0.7548, + "num_input_tokens_seen": 17287808, + "step": 3225 + }, + { + "epoch": 0.5168, + "grad_norm": 0.8620545864105225, + "learning_rate": 2.368114358498434e-05, + "loss": 0.7792, + "num_input_tokens_seen": 17311520, + "step": 3230 + }, + { + "epoch": 0.5176, + "grad_norm": 1.0956013202667236, + "learning_rate": 2.361840345507683e-05, + "loss": 0.6575, + "num_input_tokens_seen": 17340816, + "step": 3235 + }, + { + "epoch": 0.5184, + "grad_norm": 1.0029551982879639, + "learning_rate": 2.355567205208397e-05, + "loss": 0.6414, + "num_input_tokens_seen": 17363408, + "step": 3240 + }, + { + "epoch": 0.5192, + "grad_norm": 1.0472480058670044, + "learning_rate": 2.3492949772251414e-05, + "loss": 0.7161, + "num_input_tokens_seen": 17393248, + "step": 3245 + }, + { + "epoch": 0.52, + "grad_norm": 0.8757247924804688, + "learning_rate": 2.3430237011767167e-05, + "loss": 0.6957, + "num_input_tokens_seen": 17425232, + "step": 3250 + }, + { + "epoch": 0.5208, + "grad_norm": 1.0374081134796143, + "learning_rate": 2.3367534166759102e-05, + "loss": 0.7615, + "num_input_tokens_seen": 17446864, + "step": 3255 + }, + { + "epoch": 0.5216, + "grad_norm": 1.0572500228881836, + "learning_rate": 2.3304841633292487e-05, + "loss": 0.6994, + "num_input_tokens_seen": 17470896, + "step": 3260 + }, + { + "epoch": 0.5224, + "grad_norm": 1.0209540128707886, + "learning_rate": 2.3242159807367408e-05, + "loss": 0.7116, + "num_input_tokens_seen": 17501488, + "step": 3265 + }, + { + "epoch": 0.5232, + "grad_norm": 1.0854222774505615, + "learning_rate": 2.3179489084916358e-05, + "loss": 0.7583, + "num_input_tokens_seen": 17526032, + "step": 3270 + }, + { + "epoch": 0.524, + "grad_norm": 1.0327874422073364, + "learning_rate": 2.3116829861801686e-05, + "loss": 0.7302, + "num_input_tokens_seen": 17550144, + "step": 3275 + }, + { + "epoch": 0.5248, + "grad_norm": 1.186990737915039, + "learning_rate": 2.3054182533813087e-05, + "loss": 0.6794, + "num_input_tokens_seen": 17575600, + "step": 3280 + }, + { + "epoch": 0.5256, + "grad_norm": 1.000475287437439, + "learning_rate": 2.2991547496665148e-05, + "loss": 0.7294, + "num_input_tokens_seen": 17601408, + "step": 3285 + }, + { + "epoch": 0.5264, + "grad_norm": 1.1120193004608154, + "learning_rate": 2.2928925145994794e-05, + "loss": 0.6333, + "num_input_tokens_seen": 17624752, + "step": 3290 + }, + { + "epoch": 0.5272, + "grad_norm": 0.7765217423439026, + "learning_rate": 2.286631587735883e-05, + "loss": 0.7779, + "num_input_tokens_seen": 17651040, + "step": 3295 + }, + { + "epoch": 0.528, + "grad_norm": 0.9403998255729675, + "learning_rate": 2.280372008623142e-05, + "loss": 0.7035, + "num_input_tokens_seen": 17678288, + "step": 3300 + }, + { + "epoch": 0.5288, + "grad_norm": 1.019305944442749, + "learning_rate": 2.2741138168001608e-05, + "loss": 0.719, + "num_input_tokens_seen": 17702816, + "step": 3305 + }, + { + "epoch": 0.5296, + "grad_norm": 1.0804177522659302, + "learning_rate": 2.267857051797081e-05, + "loss": 0.7134, + "num_input_tokens_seen": 17728848, + "step": 3310 + }, + { + "epoch": 0.5304, + "grad_norm": 0.7340876460075378, + "learning_rate": 2.2616017531350288e-05, + "loss": 0.6916, + "num_input_tokens_seen": 17756240, + "step": 3315 + }, + { + "epoch": 0.5312, + "grad_norm": 0.9618902802467346, + "learning_rate": 2.255347960325871e-05, + "loss": 0.6389, + "num_input_tokens_seen": 17781104, + "step": 3320 + }, + { + "epoch": 0.532, + "grad_norm": 0.9528229832649231, + "learning_rate": 2.2490957128719624e-05, + "loss": 0.6648, + "num_input_tokens_seen": 17808816, + "step": 3325 + }, + { + "epoch": 0.5328, + "grad_norm": 1.043328881263733, + "learning_rate": 2.2428450502658967e-05, + "loss": 0.6683, + "num_input_tokens_seen": 17834496, + "step": 3330 + }, + { + "epoch": 0.5336, + "grad_norm": 0.8162310719490051, + "learning_rate": 2.2365960119902545e-05, + "loss": 0.7686, + "num_input_tokens_seen": 17862880, + "step": 3335 + }, + { + "epoch": 0.5344, + "grad_norm": 0.8925397396087646, + "learning_rate": 2.2303486375173585e-05, + "loss": 0.7073, + "num_input_tokens_seen": 17890064, + "step": 3340 + }, + { + "epoch": 0.5352, + "grad_norm": 1.0610705614089966, + "learning_rate": 2.224102966309021e-05, + "loss": 0.642, + "num_input_tokens_seen": 17918144, + "step": 3345 + }, + { + "epoch": 0.536, + "grad_norm": 1.1452162265777588, + "learning_rate": 2.217859037816296e-05, + "loss": 0.7078, + "num_input_tokens_seen": 17945344, + "step": 3350 + }, + { + "epoch": 0.5368, + "grad_norm": 0.8698946833610535, + "learning_rate": 2.2116168914792292e-05, + "loss": 0.7437, + "num_input_tokens_seen": 17970096, + "step": 3355 + }, + { + "epoch": 0.5376, + "grad_norm": 1.1551156044006348, + "learning_rate": 2.205376566726611e-05, + "loss": 0.7606, + "num_input_tokens_seen": 17997328, + "step": 3360 + }, + { + "epoch": 0.5384, + "grad_norm": 1.3479046821594238, + "learning_rate": 2.1991381029757215e-05, + "loss": 0.6824, + "num_input_tokens_seen": 18022464, + "step": 3365 + }, + { + "epoch": 0.5392, + "grad_norm": 0.9218052625656128, + "learning_rate": 2.19290153963209e-05, + "loss": 0.7262, + "num_input_tokens_seen": 18052176, + "step": 3370 + }, + { + "epoch": 0.54, + "grad_norm": 1.302252173423767, + "learning_rate": 2.186666916089239e-05, + "loss": 0.7491, + "num_input_tokens_seen": 18079008, + "step": 3375 + }, + { + "epoch": 0.5408, + "grad_norm": 1.4532941579818726, + "learning_rate": 2.1804342717284415e-05, + "loss": 0.6246, + "num_input_tokens_seen": 18102784, + "step": 3380 + }, + { + "epoch": 0.5416, + "grad_norm": 0.7572783827781677, + "learning_rate": 2.174203645918464e-05, + "loss": 0.6712, + "num_input_tokens_seen": 18130688, + "step": 3385 + }, + { + "epoch": 0.5424, + "grad_norm": 1.0954492092132568, + "learning_rate": 2.1679750780153267e-05, + "loss": 0.7238, + "num_input_tokens_seen": 18159200, + "step": 3390 + }, + { + "epoch": 0.5432, + "grad_norm": 1.1352787017822266, + "learning_rate": 2.1617486073620498e-05, + "loss": 0.663, + "num_input_tokens_seen": 18188736, + "step": 3395 + }, + { + "epoch": 0.544, + "grad_norm": 1.012987732887268, + "learning_rate": 2.155524273288405e-05, + "loss": 0.6928, + "num_input_tokens_seen": 18217856, + "step": 3400 + }, + { + "epoch": 0.5448, + "grad_norm": 0.8638446927070618, + "learning_rate": 2.1493021151106703e-05, + "loss": 0.7373, + "num_input_tokens_seen": 18247616, + "step": 3405 + }, + { + "epoch": 0.5456, + "grad_norm": 1.2647075653076172, + "learning_rate": 2.1430821721313782e-05, + "loss": 0.7593, + "num_input_tokens_seen": 18274416, + "step": 3410 + }, + { + "epoch": 0.5464, + "grad_norm": 0.8533580899238586, + "learning_rate": 2.1368644836390684e-05, + "loss": 0.6718, + "num_input_tokens_seen": 18298720, + "step": 3415 + }, + { + "epoch": 0.5472, + "grad_norm": 0.8091197609901428, + "learning_rate": 2.130649088908041e-05, + "loss": 0.7303, + "num_input_tokens_seen": 18326160, + "step": 3420 + }, + { + "epoch": 0.548, + "grad_norm": 0.886374294757843, + "learning_rate": 2.1244360271981073e-05, + "loss": 0.74, + "num_input_tokens_seen": 18351344, + "step": 3425 + }, + { + "epoch": 0.5488, + "grad_norm": 0.8284346461296082, + "learning_rate": 2.1182253377543425e-05, + "loss": 0.6448, + "num_input_tokens_seen": 18374752, + "step": 3430 + }, + { + "epoch": 0.5496, + "grad_norm": 0.9252715706825256, + "learning_rate": 2.112017059806835e-05, + "loss": 0.6759, + "num_input_tokens_seen": 18402432, + "step": 3435 + }, + { + "epoch": 0.5504, + "grad_norm": 1.0463306903839111, + "learning_rate": 2.1058112325704436e-05, + "loss": 0.7327, + "num_input_tokens_seen": 18428656, + "step": 3440 + }, + { + "epoch": 0.5512, + "grad_norm": 0.999754011631012, + "learning_rate": 2.0996078952445452e-05, + "loss": 0.7214, + "num_input_tokens_seen": 18451744, + "step": 3445 + }, + { + "epoch": 0.552, + "grad_norm": 0.759445071220398, + "learning_rate": 2.0934070870127912e-05, + "loss": 0.7192, + "num_input_tokens_seen": 18476960, + "step": 3450 + }, + { + "epoch": 0.5528, + "grad_norm": 0.757986843585968, + "learning_rate": 2.0872088470428553e-05, + "loss": 0.6481, + "num_input_tokens_seen": 18507280, + "step": 3455 + }, + { + "epoch": 0.5536, + "grad_norm": 1.555863857269287, + "learning_rate": 2.08101321448619e-05, + "loss": 0.6906, + "num_input_tokens_seen": 18531264, + "step": 3460 + }, + { + "epoch": 0.5544, + "grad_norm": 0.9648059606552124, + "learning_rate": 2.0748202284777777e-05, + "loss": 0.6691, + "num_input_tokens_seen": 18559552, + "step": 3465 + }, + { + "epoch": 0.5552, + "grad_norm": 0.7556829452514648, + "learning_rate": 2.0686299281358835e-05, + "loss": 0.743, + "num_input_tokens_seen": 18587408, + "step": 3470 + }, + { + "epoch": 0.556, + "grad_norm": 1.2063502073287964, + "learning_rate": 2.0624423525618098e-05, + "loss": 0.6896, + "num_input_tokens_seen": 18616384, + "step": 3475 + }, + { + "epoch": 0.5568, + "grad_norm": 1.0923407077789307, + "learning_rate": 2.056257540839647e-05, + "loss": 0.7799, + "num_input_tokens_seen": 18640432, + "step": 3480 + }, + { + "epoch": 0.5576, + "grad_norm": 0.9768779873847961, + "learning_rate": 2.050075532036026e-05, + "loss": 0.6796, + "num_input_tokens_seen": 18661696, + "step": 3485 + }, + { + "epoch": 0.5584, + "grad_norm": 1.0903396606445312, + "learning_rate": 2.0438963651998747e-05, + "loss": 0.6601, + "num_input_tokens_seen": 18689280, + "step": 3490 + }, + { + "epoch": 0.5592, + "grad_norm": 1.0859960317611694, + "learning_rate": 2.037720079362169e-05, + "loss": 0.7247, + "num_input_tokens_seen": 18713776, + "step": 3495 + }, + { + "epoch": 0.56, + "grad_norm": 0.979040801525116, + "learning_rate": 2.031546713535688e-05, + "loss": 0.7113, + "num_input_tokens_seen": 18739632, + "step": 3500 + }, + { + "epoch": 0.5608, + "grad_norm": 0.9411745667457581, + "learning_rate": 2.0253763067147657e-05, + "loss": 0.604, + "num_input_tokens_seen": 18767504, + "step": 3505 + }, + { + "epoch": 0.5616, + "grad_norm": 1.0369850397109985, + "learning_rate": 2.0192088978750433e-05, + "loss": 0.6292, + "num_input_tokens_seen": 18794976, + "step": 3510 + }, + { + "epoch": 0.5624, + "grad_norm": 1.033288598060608, + "learning_rate": 2.0130445259732285e-05, + "loss": 0.7227, + "num_input_tokens_seen": 18823456, + "step": 3515 + }, + { + "epoch": 0.5632, + "grad_norm": 1.0590053796768188, + "learning_rate": 2.0068832299468428e-05, + "loss": 0.6536, + "num_input_tokens_seen": 18851104, + "step": 3520 + }, + { + "epoch": 0.564, + "grad_norm": 0.8934491276741028, + "learning_rate": 2.000725048713983e-05, + "loss": 0.6729, + "num_input_tokens_seen": 18882096, + "step": 3525 + }, + { + "epoch": 0.5648, + "grad_norm": 1.3166091442108154, + "learning_rate": 1.994570021173067e-05, + "loss": 0.8178, + "num_input_tokens_seen": 18903520, + "step": 3530 + }, + { + "epoch": 0.5656, + "grad_norm": 0.7813395857810974, + "learning_rate": 1.988418186202594e-05, + "loss": 0.6575, + "num_input_tokens_seen": 18937200, + "step": 3535 + }, + { + "epoch": 0.5664, + "grad_norm": 1.1987985372543335, + "learning_rate": 1.9822695826608972e-05, + "loss": 0.7789, + "num_input_tokens_seen": 18965424, + "step": 3540 + }, + { + "epoch": 0.5672, + "grad_norm": 0.9326161742210388, + "learning_rate": 1.9761242493858987e-05, + "loss": 0.6699, + "num_input_tokens_seen": 18989456, + "step": 3545 + }, + { + "epoch": 0.568, + "grad_norm": 0.736609160900116, + "learning_rate": 1.969982225194864e-05, + "loss": 0.7095, + "num_input_tokens_seen": 19017808, + "step": 3550 + }, + { + "epoch": 0.5688, + "grad_norm": 1.0842061042785645, + "learning_rate": 1.9638435488841546e-05, + "loss": 0.7191, + "num_input_tokens_seen": 19046496, + "step": 3555 + }, + { + "epoch": 0.5696, + "grad_norm": 0.9748028516769409, + "learning_rate": 1.957708259228987e-05, + "loss": 0.7016, + "num_input_tokens_seen": 19072128, + "step": 3560 + }, + { + "epoch": 0.5704, + "grad_norm": 1.0534452199935913, + "learning_rate": 1.951576394983185e-05, + "loss": 0.6903, + "num_input_tokens_seen": 19096528, + "step": 3565 + }, + { + "epoch": 0.5712, + "grad_norm": 0.860016405582428, + "learning_rate": 1.945447994878937e-05, + "loss": 0.6144, + "num_input_tokens_seen": 19126240, + "step": 3570 + }, + { + "epoch": 0.572, + "grad_norm": 0.9540638327598572, + "learning_rate": 1.9393230976265473e-05, + "loss": 0.6755, + "num_input_tokens_seen": 19152752, + "step": 3575 + }, + { + "epoch": 0.5728, + "grad_norm": 0.8391373157501221, + "learning_rate": 1.9332017419141962e-05, + "loss": 0.748, + "num_input_tokens_seen": 19179296, + "step": 3580 + }, + { + "epoch": 0.5736, + "grad_norm": 1.1639857292175293, + "learning_rate": 1.9270839664076936e-05, + "loss": 0.7011, + "num_input_tokens_seen": 19205616, + "step": 3585 + }, + { + "epoch": 0.5744, + "grad_norm": 0.8739202618598938, + "learning_rate": 1.920969809750234e-05, + "loss": 0.6672, + "num_input_tokens_seen": 19231440, + "step": 3590 + }, + { + "epoch": 0.5752, + "grad_norm": 0.8280954360961914, + "learning_rate": 1.914859310562154e-05, + "loss": 0.7261, + "num_input_tokens_seen": 19258288, + "step": 3595 + }, + { + "epoch": 0.576, + "grad_norm": 0.671859622001648, + "learning_rate": 1.908752507440689e-05, + "loss": 0.7838, + "num_input_tokens_seen": 19284464, + "step": 3600 + }, + { + "epoch": 0.5768, + "grad_norm": 0.7985883951187134, + "learning_rate": 1.9026494389597238e-05, + "loss": 0.6574, + "num_input_tokens_seen": 19312272, + "step": 3605 + }, + { + "epoch": 0.5776, + "grad_norm": 1.1776115894317627, + "learning_rate": 1.8965501436695577e-05, + "loss": 0.7692, + "num_input_tokens_seen": 19335408, + "step": 3610 + }, + { + "epoch": 0.5784, + "grad_norm": 0.7614251971244812, + "learning_rate": 1.890454660096654e-05, + "loss": 0.7165, + "num_input_tokens_seen": 19360768, + "step": 3615 + }, + { + "epoch": 0.5792, + "grad_norm": 1.0146969556808472, + "learning_rate": 1.8843630267434e-05, + "loss": 0.8187, + "num_input_tokens_seen": 19386016, + "step": 3620 + }, + { + "epoch": 0.58, + "grad_norm": 0.8127625584602356, + "learning_rate": 1.8782752820878634e-05, + "loss": 0.6587, + "num_input_tokens_seen": 19408560, + "step": 3625 + }, + { + "epoch": 0.5808, + "grad_norm": 1.102415680885315, + "learning_rate": 1.872191464583547e-05, + "loss": 0.6268, + "num_input_tokens_seen": 19436592, + "step": 3630 + }, + { + "epoch": 0.5816, + "grad_norm": 0.8009477853775024, + "learning_rate": 1.866111612659149e-05, + "loss": 0.6977, + "num_input_tokens_seen": 19463440, + "step": 3635 + }, + { + "epoch": 0.5824, + "grad_norm": 0.9613442420959473, + "learning_rate": 1.8600357647183185e-05, + "loss": 0.6292, + "num_input_tokens_seen": 19493360, + "step": 3640 + }, + { + "epoch": 0.5832, + "grad_norm": 1.1276973485946655, + "learning_rate": 1.8539639591394133e-05, + "loss": 0.6547, + "num_input_tokens_seen": 19521392, + "step": 3645 + }, + { + "epoch": 0.584, + "grad_norm": 1.2128732204437256, + "learning_rate": 1.8478962342752583e-05, + "loss": 0.6717, + "num_input_tokens_seen": 19550336, + "step": 3650 + }, + { + "epoch": 0.5848, + "grad_norm": 1.1931806802749634, + "learning_rate": 1.8418326284528996e-05, + "loss": 0.7065, + "num_input_tokens_seen": 19575776, + "step": 3655 + }, + { + "epoch": 0.5856, + "grad_norm": 0.921335756778717, + "learning_rate": 1.8357731799733686e-05, + "loss": 0.7029, + "num_input_tokens_seen": 19598128, + "step": 3660 + }, + { + "epoch": 0.5864, + "grad_norm": 0.8000009655952454, + "learning_rate": 1.8297179271114346e-05, + "loss": 0.7311, + "num_input_tokens_seen": 19625648, + "step": 3665 + }, + { + "epoch": 0.5872, + "grad_norm": 1.0933367013931274, + "learning_rate": 1.8236669081153657e-05, + "loss": 0.7296, + "num_input_tokens_seen": 19649952, + "step": 3670 + }, + { + "epoch": 0.588, + "grad_norm": 0.8331469297409058, + "learning_rate": 1.817620161206687e-05, + "loss": 0.7534, + "num_input_tokens_seen": 19677680, + "step": 3675 + }, + { + "epoch": 0.5888, + "grad_norm": 1.3450491428375244, + "learning_rate": 1.811577724579938e-05, + "loss": 0.6995, + "num_input_tokens_seen": 19711904, + "step": 3680 + }, + { + "epoch": 0.5896, + "grad_norm": 1.0697826147079468, + "learning_rate": 1.8055396364024317e-05, + "loss": 0.7517, + "num_input_tokens_seen": 19734272, + "step": 3685 + }, + { + "epoch": 0.5904, + "grad_norm": 0.9218750596046448, + "learning_rate": 1.7995059348140165e-05, + "loss": 0.7048, + "num_input_tokens_seen": 19761136, + "step": 3690 + }, + { + "epoch": 0.5912, + "grad_norm": 0.7037175297737122, + "learning_rate": 1.7934766579268292e-05, + "loss": 0.6385, + "num_input_tokens_seen": 19784880, + "step": 3695 + }, + { + "epoch": 0.592, + "grad_norm": 0.9812880754470825, + "learning_rate": 1.7874518438250597e-05, + "loss": 0.8177, + "num_input_tokens_seen": 19811456, + "step": 3700 + }, + { + "epoch": 0.5928, + "grad_norm": 1.0128806829452515, + "learning_rate": 1.7814315305647093e-05, + "loss": 0.7373, + "num_input_tokens_seen": 19839168, + "step": 3705 + }, + { + "epoch": 0.5936, + "grad_norm": 0.8506542444229126, + "learning_rate": 1.7754157561733476e-05, + "loss": 0.723, + "num_input_tokens_seen": 19865584, + "step": 3710 + }, + { + "epoch": 0.5944, + "grad_norm": 0.8591898679733276, + "learning_rate": 1.7694045586498752e-05, + "loss": 0.6315, + "num_input_tokens_seen": 19893232, + "step": 3715 + }, + { + "epoch": 0.5952, + "grad_norm": 0.9761216640472412, + "learning_rate": 1.7633979759642844e-05, + "loss": 0.6184, + "num_input_tokens_seen": 19918512, + "step": 3720 + }, + { + "epoch": 0.596, + "grad_norm": 0.9515823721885681, + "learning_rate": 1.7573960460574133e-05, + "loss": 0.682, + "num_input_tokens_seen": 19944544, + "step": 3725 + }, + { + "epoch": 0.5968, + "grad_norm": 1.2393804788589478, + "learning_rate": 1.7513988068407146e-05, + "loss": 0.6738, + "num_input_tokens_seen": 19971104, + "step": 3730 + }, + { + "epoch": 0.5976, + "grad_norm": 1.2483285665512085, + "learning_rate": 1.74540629619601e-05, + "loss": 0.6895, + "num_input_tokens_seen": 19996352, + "step": 3735 + }, + { + "epoch": 0.5984, + "grad_norm": 1.162599802017212, + "learning_rate": 1.7394185519752545e-05, + "loss": 0.7436, + "num_input_tokens_seen": 20021744, + "step": 3740 + }, + { + "epoch": 0.5992, + "grad_norm": 0.8526731133460999, + "learning_rate": 1.7334356120002957e-05, + "loss": 0.7612, + "num_input_tokens_seen": 20046560, + "step": 3745 + }, + { + "epoch": 0.6, + "grad_norm": 1.1033904552459717, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.7139, + "num_input_tokens_seen": 20072560, + "step": 3750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.9515017867088318, + "learning_rate": 1.7214842959231794e-05, + "loss": 0.6556, + "num_input_tokens_seen": 20103488, + "step": 3755 + }, + { + "epoch": 0.6016, + "grad_norm": 1.1830626726150513, + "learning_rate": 1.7155159953120313e-05, + "loss": 0.6884, + "num_input_tokens_seen": 20130784, + "step": 3760 + }, + { + "epoch": 0.6024, + "grad_norm": 1.1456624269485474, + "learning_rate": 1.7095526499282172e-05, + "loss": 0.7729, + "num_input_tokens_seen": 20158720, + "step": 3765 + }, + { + "epoch": 0.6032, + "grad_norm": 0.8993046879768372, + "learning_rate": 1.703594297439469e-05, + "loss": 0.7427, + "num_input_tokens_seen": 20180736, + "step": 3770 + }, + { + "epoch": 0.604, + "grad_norm": 1.0378142595291138, + "learning_rate": 1.6976409754819767e-05, + "loss": 0.6831, + "num_input_tokens_seen": 20203744, + "step": 3775 + }, + { + "epoch": 0.6048, + "grad_norm": 1.0006003379821777, + "learning_rate": 1.6916927216601593e-05, + "loss": 0.6098, + "num_input_tokens_seen": 20232784, + "step": 3780 + }, + { + "epoch": 0.6056, + "grad_norm": 0.9714456796646118, + "learning_rate": 1.6857495735464195e-05, + "loss": 0.812, + "num_input_tokens_seen": 20262256, + "step": 3785 + }, + { + "epoch": 0.6064, + "grad_norm": 1.226090431213379, + "learning_rate": 1.6798115686809125e-05, + "loss": 0.6337, + "num_input_tokens_seen": 20290720, + "step": 3790 + }, + { + "epoch": 0.6072, + "grad_norm": 1.2579602003097534, + "learning_rate": 1.6738787445713037e-05, + "loss": 0.7105, + "num_input_tokens_seen": 20314368, + "step": 3795 + }, + { + "epoch": 0.608, + "grad_norm": 0.9636203050613403, + "learning_rate": 1.6679511386925337e-05, + "loss": 0.7776, + "num_input_tokens_seen": 20337648, + "step": 3800 + }, + { + "epoch": 0.6088, + "grad_norm": 0.8365712761878967, + "learning_rate": 1.662028788486583e-05, + "loss": 0.6626, + "num_input_tokens_seen": 20367344, + "step": 3805 + }, + { + "epoch": 0.6096, + "grad_norm": 1.340468168258667, + "learning_rate": 1.656111731362236e-05, + "loss": 0.6983, + "num_input_tokens_seen": 20391616, + "step": 3810 + }, + { + "epoch": 0.6104, + "grad_norm": 0.878955066204071, + "learning_rate": 1.650200004694839e-05, + "loss": 0.6948, + "num_input_tokens_seen": 20419520, + "step": 3815 + }, + { + "epoch": 0.6112, + "grad_norm": 0.9543726444244385, + "learning_rate": 1.644293645826072e-05, + "loss": 0.8154, + "num_input_tokens_seen": 20446048, + "step": 3820 + }, + { + "epoch": 0.612, + "grad_norm": 1.2340530157089233, + "learning_rate": 1.6383926920637077e-05, + "loss": 0.7234, + "num_input_tokens_seen": 20472960, + "step": 3825 + }, + { + "epoch": 0.6128, + "grad_norm": 0.8794097900390625, + "learning_rate": 1.6324971806813767e-05, + "loss": 0.668, + "num_input_tokens_seen": 20502816, + "step": 3830 + }, + { + "epoch": 0.6136, + "grad_norm": 1.0433683395385742, + "learning_rate": 1.6266071489183327e-05, + "loss": 0.6936, + "num_input_tokens_seen": 20529056, + "step": 3835 + }, + { + "epoch": 0.6144, + "grad_norm": 1.3372730016708374, + "learning_rate": 1.620722633979219e-05, + "loss": 0.7988, + "num_input_tokens_seen": 20555392, + "step": 3840 + }, + { + "epoch": 0.6152, + "grad_norm": 1.0201383829116821, + "learning_rate": 1.614843673033828e-05, + "loss": 0.7752, + "num_input_tokens_seen": 20583888, + "step": 3845 + }, + { + "epoch": 0.616, + "grad_norm": 0.7360509634017944, + "learning_rate": 1.6089703032168733e-05, + "loss": 0.6784, + "num_input_tokens_seen": 20612112, + "step": 3850 + }, + { + "epoch": 0.6168, + "grad_norm": 0.8650053143501282, + "learning_rate": 1.603102561627751e-05, + "loss": 0.6483, + "num_input_tokens_seen": 20639296, + "step": 3855 + }, + { + "epoch": 0.6176, + "grad_norm": 0.9596717953681946, + "learning_rate": 1.5972404853303062e-05, + "loss": 0.6876, + "num_input_tokens_seen": 20663680, + "step": 3860 + }, + { + "epoch": 0.6184, + "grad_norm": 0.9903700947761536, + "learning_rate": 1.5913841113525992e-05, + "loss": 0.7651, + "num_input_tokens_seen": 20690592, + "step": 3865 + }, + { + "epoch": 0.6192, + "grad_norm": 1.0361056327819824, + "learning_rate": 1.585533476686669e-05, + "loss": 0.6692, + "num_input_tokens_seen": 20716944, + "step": 3870 + }, + { + "epoch": 0.62, + "grad_norm": 1.3285175561904907, + "learning_rate": 1.5796886182883053e-05, + "loss": 0.708, + "num_input_tokens_seen": 20742128, + "step": 3875 + }, + { + "epoch": 0.6208, + "grad_norm": 0.9742456078529358, + "learning_rate": 1.5738495730768105e-05, + "loss": 0.6734, + "num_input_tokens_seen": 20769344, + "step": 3880 + }, + { + "epoch": 0.6216, + "grad_norm": 0.9866467118263245, + "learning_rate": 1.5680163779347667e-05, + "loss": 0.7442, + "num_input_tokens_seen": 20793920, + "step": 3885 + }, + { + "epoch": 0.6224, + "grad_norm": 1.2530503273010254, + "learning_rate": 1.562189069707807e-05, + "loss": 0.786, + "num_input_tokens_seen": 20819616, + "step": 3890 + }, + { + "epoch": 0.6232, + "grad_norm": 1.0180388689041138, + "learning_rate": 1.556367685204374e-05, + "loss": 0.6565, + "num_input_tokens_seen": 20843056, + "step": 3895 + }, + { + "epoch": 0.624, + "grad_norm": 1.1570924520492554, + "learning_rate": 1.5505522611954975e-05, + "loss": 0.8403, + "num_input_tokens_seen": 20870320, + "step": 3900 + }, + { + "epoch": 0.6248, + "grad_norm": 0.9555189609527588, + "learning_rate": 1.5447428344145563e-05, + "loss": 0.717, + "num_input_tokens_seen": 20894448, + "step": 3905 + }, + { + "epoch": 0.6256, + "grad_norm": 0.7047298550605774, + "learning_rate": 1.538939441557048e-05, + "loss": 0.6563, + "num_input_tokens_seen": 20926800, + "step": 3910 + }, + { + "epoch": 0.6264, + "grad_norm": 1.1212091445922852, + "learning_rate": 1.5331421192803565e-05, + "loss": 0.7742, + "num_input_tokens_seen": 20954016, + "step": 3915 + }, + { + "epoch": 0.6272, + "grad_norm": 0.9030645489692688, + "learning_rate": 1.5273509042035172e-05, + "loss": 0.6654, + "num_input_tokens_seen": 20982512, + "step": 3920 + }, + { + "epoch": 0.628, + "grad_norm": 0.9414677619934082, + "learning_rate": 1.521565832906994e-05, + "loss": 0.6737, + "num_input_tokens_seen": 21008768, + "step": 3925 + }, + { + "epoch": 0.6288, + "grad_norm": 1.1415228843688965, + "learning_rate": 1.515786941932441e-05, + "loss": 0.7259, + "num_input_tokens_seen": 21038144, + "step": 3930 + }, + { + "epoch": 0.6296, + "grad_norm": 1.0087826251983643, + "learning_rate": 1.5100142677824753e-05, + "loss": 0.6793, + "num_input_tokens_seen": 21065120, + "step": 3935 + }, + { + "epoch": 0.6304, + "grad_norm": 1.3329883813858032, + "learning_rate": 1.5042478469204435e-05, + "loss": 0.6934, + "num_input_tokens_seen": 21091296, + "step": 3940 + }, + { + "epoch": 0.6312, + "grad_norm": 0.9850925803184509, + "learning_rate": 1.4984877157701932e-05, + "loss": 0.7746, + "num_input_tokens_seen": 21117568, + "step": 3945 + }, + { + "epoch": 0.632, + "grad_norm": 0.8925058245658875, + "learning_rate": 1.4927339107158437e-05, + "loss": 0.6311, + "num_input_tokens_seen": 21146640, + "step": 3950 + }, + { + "epoch": 0.6328, + "grad_norm": 1.2707431316375732, + "learning_rate": 1.486986468101555e-05, + "loss": 0.7614, + "num_input_tokens_seen": 21169680, + "step": 3955 + }, + { + "epoch": 0.6336, + "grad_norm": 1.0344791412353516, + "learning_rate": 1.4812454242312979e-05, + "loss": 0.7291, + "num_input_tokens_seen": 21195360, + "step": 3960 + }, + { + "epoch": 0.6344, + "grad_norm": 0.8999541997909546, + "learning_rate": 1.4755108153686275e-05, + "loss": 0.7421, + "num_input_tokens_seen": 21218896, + "step": 3965 + }, + { + "epoch": 0.6352, + "grad_norm": 1.3539083003997803, + "learning_rate": 1.4697826777364477e-05, + "loss": 0.7831, + "num_input_tokens_seen": 21244080, + "step": 3970 + }, + { + "epoch": 0.636, + "grad_norm": 0.9629884362220764, + "learning_rate": 1.4640610475167898e-05, + "loss": 0.6907, + "num_input_tokens_seen": 21271024, + "step": 3975 + }, + { + "epoch": 0.6368, + "grad_norm": 0.9040243625640869, + "learning_rate": 1.4583459608505801e-05, + "loss": 0.7001, + "num_input_tokens_seen": 21298992, + "step": 3980 + }, + { + "epoch": 0.6376, + "grad_norm": 1.122290849685669, + "learning_rate": 1.4526374538374132e-05, + "loss": 0.6729, + "num_input_tokens_seen": 21324032, + "step": 3985 + }, + { + "epoch": 0.6384, + "grad_norm": 0.8082273602485657, + "learning_rate": 1.4469355625353198e-05, + "loss": 0.6636, + "num_input_tokens_seen": 21354256, + "step": 3990 + }, + { + "epoch": 0.6392, + "grad_norm": 0.7639278173446655, + "learning_rate": 1.4412403229605454e-05, + "loss": 0.6349, + "num_input_tokens_seen": 21382144, + "step": 3995 + }, + { + "epoch": 0.64, + "grad_norm": 1.0013383626937866, + "learning_rate": 1.4355517710873184e-05, + "loss": 0.6892, + "num_input_tokens_seen": 21408112, + "step": 4000 + }, + { + "epoch": 0.6408, + "grad_norm": 0.918889582157135, + "learning_rate": 1.4298699428476236e-05, + "loss": 0.6602, + "num_input_tokens_seen": 21438800, + "step": 4005 + }, + { + "epoch": 0.6416, + "grad_norm": 0.9448719620704651, + "learning_rate": 1.4241948741309782e-05, + "loss": 0.6613, + "num_input_tokens_seen": 21466464, + "step": 4010 + }, + { + "epoch": 0.6424, + "grad_norm": 1.1950000524520874, + "learning_rate": 1.418526600784198e-05, + "loss": 0.6821, + "num_input_tokens_seen": 21496864, + "step": 4015 + }, + { + "epoch": 0.6432, + "grad_norm": 1.0359631776809692, + "learning_rate": 1.412865158611179e-05, + "loss": 0.6698, + "num_input_tokens_seen": 21523456, + "step": 4020 + }, + { + "epoch": 0.644, + "grad_norm": 0.9636697769165039, + "learning_rate": 1.4072105833726684e-05, + "loss": 0.5917, + "num_input_tokens_seen": 21554320, + "step": 4025 + }, + { + "epoch": 0.6448, + "grad_norm": 0.8568287491798401, + "learning_rate": 1.401562910786034e-05, + "loss": 0.7332, + "num_input_tokens_seen": 21584496, + "step": 4030 + }, + { + "epoch": 0.6456, + "grad_norm": 0.7950714230537415, + "learning_rate": 1.3959221765250469e-05, + "loss": 0.6826, + "num_input_tokens_seen": 21615104, + "step": 4035 + }, + { + "epoch": 0.6464, + "grad_norm": 0.9343571662902832, + "learning_rate": 1.3902884162196508e-05, + "loss": 0.7349, + "num_input_tokens_seen": 21642144, + "step": 4040 + }, + { + "epoch": 0.6472, + "grad_norm": 0.8434100151062012, + "learning_rate": 1.3846616654557362e-05, + "loss": 0.6341, + "num_input_tokens_seen": 21671408, + "step": 4045 + }, + { + "epoch": 0.648, + "grad_norm": 1.1461455821990967, + "learning_rate": 1.3790419597749199e-05, + "loss": 0.7531, + "num_input_tokens_seen": 21698880, + "step": 4050 + }, + { + "epoch": 0.6488, + "grad_norm": 1.261234164237976, + "learning_rate": 1.3734293346743168e-05, + "loss": 0.6284, + "num_input_tokens_seen": 21727280, + "step": 4055 + }, + { + "epoch": 0.6496, + "grad_norm": 1.0802944898605347, + "learning_rate": 1.367823825606319e-05, + "loss": 0.7148, + "num_input_tokens_seen": 21751824, + "step": 4060 + }, + { + "epoch": 0.6504, + "grad_norm": 1.1353379487991333, + "learning_rate": 1.3622254679783663e-05, + "loss": 0.7022, + "num_input_tokens_seen": 21782080, + "step": 4065 + }, + { + "epoch": 0.6512, + "grad_norm": 1.0912383794784546, + "learning_rate": 1.3566342971527291e-05, + "loss": 0.7308, + "num_input_tokens_seen": 21809376, + "step": 4070 + }, + { + "epoch": 0.652, + "grad_norm": 1.0142539739608765, + "learning_rate": 1.3510503484462805e-05, + "loss": 0.7338, + "num_input_tokens_seen": 21836240, + "step": 4075 + }, + { + "epoch": 0.6528, + "grad_norm": 1.0957950353622437, + "learning_rate": 1.3454736571302763e-05, + "loss": 0.6941, + "num_input_tokens_seen": 21862768, + "step": 4080 + }, + { + "epoch": 0.6536, + "grad_norm": 0.9035006761550903, + "learning_rate": 1.3399042584301298e-05, + "loss": 0.7218, + "num_input_tokens_seen": 21890304, + "step": 4085 + }, + { + "epoch": 0.6544, + "grad_norm": 1.0284723043441772, + "learning_rate": 1.3343421875251888e-05, + "loss": 0.8144, + "num_input_tokens_seen": 21912192, + "step": 4090 + }, + { + "epoch": 0.6552, + "grad_norm": 1.0489941835403442, + "learning_rate": 1.3287874795485167e-05, + "loss": 0.8236, + "num_input_tokens_seen": 21939984, + "step": 4095 + }, + { + "epoch": 0.656, + "grad_norm": 0.9491598606109619, + "learning_rate": 1.3232401695866687e-05, + "loss": 0.6512, + "num_input_tokens_seen": 21967168, + "step": 4100 + }, + { + "epoch": 0.6568, + "grad_norm": 1.0019705295562744, + "learning_rate": 1.3177002926794685e-05, + "loss": 0.7271, + "num_input_tokens_seen": 21999904, + "step": 4105 + }, + { + "epoch": 0.6576, + "grad_norm": 1.0153288841247559, + "learning_rate": 1.3121678838197909e-05, + "loss": 0.635, + "num_input_tokens_seen": 22028272, + "step": 4110 + }, + { + "epoch": 0.6584, + "grad_norm": 0.8823714852333069, + "learning_rate": 1.3066429779533351e-05, + "loss": 0.6708, + "num_input_tokens_seen": 22052224, + "step": 4115 + }, + { + "epoch": 0.6592, + "grad_norm": 0.8675245642662048, + "learning_rate": 1.3011256099784103e-05, + "loss": 0.5916, + "num_input_tokens_seen": 22081360, + "step": 4120 + }, + { + "epoch": 0.66, + "grad_norm": 1.1484395265579224, + "learning_rate": 1.2956158147457115e-05, + "loss": 0.7399, + "num_input_tokens_seen": 22112080, + "step": 4125 + }, + { + "epoch": 0.6608, + "grad_norm": 1.2655909061431885, + "learning_rate": 1.2901136270580993e-05, + "loss": 0.6543, + "num_input_tokens_seen": 22139792, + "step": 4130 + }, + { + "epoch": 0.6616, + "grad_norm": 1.2049787044525146, + "learning_rate": 1.2846190816703835e-05, + "loss": 0.6808, + "num_input_tokens_seen": 22163136, + "step": 4135 + }, + { + "epoch": 0.6624, + "grad_norm": 0.7781268358230591, + "learning_rate": 1.279132213289096e-05, + "loss": 0.6939, + "num_input_tokens_seen": 22192464, + "step": 4140 + }, + { + "epoch": 0.6632, + "grad_norm": 1.1952673196792603, + "learning_rate": 1.273653056572282e-05, + "loss": 0.6628, + "num_input_tokens_seen": 22219424, + "step": 4145 + }, + { + "epoch": 0.664, + "grad_norm": 1.0534050464630127, + "learning_rate": 1.2681816461292715e-05, + "loss": 0.686, + "num_input_tokens_seen": 22244496, + "step": 4150 + }, + { + "epoch": 0.6648, + "grad_norm": 1.18624746799469, + "learning_rate": 1.2627180165204671e-05, + "loss": 0.7278, + "num_input_tokens_seen": 22271600, + "step": 4155 + }, + { + "epoch": 0.6656, + "grad_norm": 0.8680349588394165, + "learning_rate": 1.257262202257124e-05, + "loss": 0.6531, + "num_input_tokens_seen": 22298080, + "step": 4160 + }, + { + "epoch": 0.6664, + "grad_norm": 0.90425705909729, + "learning_rate": 1.251814237801128e-05, + "loss": 0.5756, + "num_input_tokens_seen": 22324832, + "step": 4165 + }, + { + "epoch": 0.6672, + "grad_norm": 1.0510259866714478, + "learning_rate": 1.246374157564785e-05, + "loss": 0.647, + "num_input_tokens_seen": 22353728, + "step": 4170 + }, + { + "epoch": 0.668, + "grad_norm": 1.1430630683898926, + "learning_rate": 1.2409419959105981e-05, + "loss": 0.7024, + "num_input_tokens_seen": 22374880, + "step": 4175 + }, + { + "epoch": 0.6688, + "grad_norm": 0.8265404105186462, + "learning_rate": 1.2355177871510538e-05, + "loss": 0.7661, + "num_input_tokens_seen": 22402288, + "step": 4180 + }, + { + "epoch": 0.6696, + "grad_norm": 0.8584622144699097, + "learning_rate": 1.2301015655484006e-05, + "loss": 0.6462, + "num_input_tokens_seen": 22430240, + "step": 4185 + }, + { + "epoch": 0.6704, + "grad_norm": 1.0526131391525269, + "learning_rate": 1.2246933653144385e-05, + "loss": 0.6487, + "num_input_tokens_seen": 22454800, + "step": 4190 + }, + { + "epoch": 0.6712, + "grad_norm": 1.1912094354629517, + "learning_rate": 1.2192932206103e-05, + "loss": 0.7369, + "num_input_tokens_seen": 22482528, + "step": 4195 + }, + { + "epoch": 0.672, + "grad_norm": 0.8804354071617126, + "learning_rate": 1.2139011655462337e-05, + "loss": 0.6942, + "num_input_tokens_seen": 22508976, + "step": 4200 + }, + { + "epoch": 0.6728, + "grad_norm": 0.9333446025848389, + "learning_rate": 1.2085172341813911e-05, + "loss": 0.7691, + "num_input_tokens_seen": 22538976, + "step": 4205 + }, + { + "epoch": 0.6736, + "grad_norm": 0.8501102328300476, + "learning_rate": 1.2031414605236066e-05, + "loss": 0.5865, + "num_input_tokens_seen": 22566368, + "step": 4210 + }, + { + "epoch": 0.6744, + "grad_norm": 0.89410001039505, + "learning_rate": 1.1977738785291895e-05, + "loss": 0.6916, + "num_input_tokens_seen": 22592656, + "step": 4215 + }, + { + "epoch": 0.6752, + "grad_norm": 0.9672756195068359, + "learning_rate": 1.1924145221027047e-05, + "loss": 0.7436, + "num_input_tokens_seen": 22619872, + "step": 4220 + }, + { + "epoch": 0.676, + "grad_norm": 0.8680210113525391, + "learning_rate": 1.1870634250967605e-05, + "loss": 0.6728, + "num_input_tokens_seen": 22650320, + "step": 4225 + }, + { + "epoch": 0.6768, + "grad_norm": 1.0170356035232544, + "learning_rate": 1.1817206213117946e-05, + "loss": 0.728, + "num_input_tokens_seen": 22676896, + "step": 4230 + }, + { + "epoch": 0.6776, + "grad_norm": 1.0950289964675903, + "learning_rate": 1.1763861444958573e-05, + "loss": 0.6696, + "num_input_tokens_seen": 22702352, + "step": 4235 + }, + { + "epoch": 0.6784, + "grad_norm": 1.2183908224105835, + "learning_rate": 1.1710600283444047e-05, + "loss": 0.7827, + "num_input_tokens_seen": 22728288, + "step": 4240 + }, + { + "epoch": 0.6792, + "grad_norm": 0.9134287238121033, + "learning_rate": 1.1657423065000811e-05, + "loss": 0.7166, + "num_input_tokens_seen": 22757632, + "step": 4245 + }, + { + "epoch": 0.68, + "grad_norm": 1.0772439241409302, + "learning_rate": 1.1604330125525079e-05, + "loss": 0.7143, + "num_input_tokens_seen": 22783440, + "step": 4250 + }, + { + "epoch": 0.6808, + "grad_norm": 1.003915786743164, + "learning_rate": 1.155132180038072e-05, + "loss": 0.82, + "num_input_tokens_seen": 22809616, + "step": 4255 + }, + { + "epoch": 0.6816, + "grad_norm": 0.9822829961776733, + "learning_rate": 1.1498398424397106e-05, + "loss": 0.7297, + "num_input_tokens_seen": 22835792, + "step": 4260 + }, + { + "epoch": 0.6824, + "grad_norm": 0.855888307094574, + "learning_rate": 1.1445560331867053e-05, + "loss": 0.6956, + "num_input_tokens_seen": 22864560, + "step": 4265 + }, + { + "epoch": 0.6832, + "grad_norm": 0.850237250328064, + "learning_rate": 1.1392807856544683e-05, + "loss": 0.7157, + "num_input_tokens_seen": 22892912, + "step": 4270 + }, + { + "epoch": 0.684, + "grad_norm": 1.4631861448287964, + "learning_rate": 1.1340141331643276e-05, + "loss": 0.753, + "num_input_tokens_seen": 22912640, + "step": 4275 + }, + { + "epoch": 0.6848, + "grad_norm": 1.1579556465148926, + "learning_rate": 1.1287561089833248e-05, + "loss": 0.7247, + "num_input_tokens_seen": 22937072, + "step": 4280 + }, + { + "epoch": 0.6856, + "grad_norm": 1.3090944290161133, + "learning_rate": 1.1235067463239967e-05, + "loss": 0.7671, + "num_input_tokens_seen": 22961104, + "step": 4285 + }, + { + "epoch": 0.6864, + "grad_norm": 0.9593985676765442, + "learning_rate": 1.1182660783441718e-05, + "loss": 0.6771, + "num_input_tokens_seen": 22983744, + "step": 4290 + }, + { + "epoch": 0.6872, + "grad_norm": 0.6641414165496826, + "learning_rate": 1.1130341381467569e-05, + "loss": 0.7179, + "num_input_tokens_seen": 23010208, + "step": 4295 + }, + { + "epoch": 0.688, + "grad_norm": 0.6973831057548523, + "learning_rate": 1.107810958779531e-05, + "loss": 0.5792, + "num_input_tokens_seen": 23043392, + "step": 4300 + }, + { + "epoch": 0.6888, + "grad_norm": 0.9109097123146057, + "learning_rate": 1.1025965732349316e-05, + "loss": 0.6574, + "num_input_tokens_seen": 23074928, + "step": 4305 + }, + { + "epoch": 0.6896, + "grad_norm": 0.8845970034599304, + "learning_rate": 1.0973910144498534e-05, + "loss": 0.6544, + "num_input_tokens_seen": 23105728, + "step": 4310 + }, + { + "epoch": 0.6904, + "grad_norm": 1.181096076965332, + "learning_rate": 1.0921943153054343e-05, + "loss": 0.6503, + "num_input_tokens_seen": 23132768, + "step": 4315 + }, + { + "epoch": 0.6912, + "grad_norm": 0.785658061504364, + "learning_rate": 1.0870065086268505e-05, + "loss": 0.6502, + "num_input_tokens_seen": 23160080, + "step": 4320 + }, + { + "epoch": 0.692, + "grad_norm": 1.1847856044769287, + "learning_rate": 1.0818276271831093e-05, + "loss": 0.7114, + "num_input_tokens_seen": 23187696, + "step": 4325 + }, + { + "epoch": 0.6928, + "grad_norm": 1.1500554084777832, + "learning_rate": 1.0766577036868395e-05, + "loss": 0.6546, + "num_input_tokens_seen": 23211904, + "step": 4330 + }, + { + "epoch": 0.6936, + "grad_norm": 0.9399601221084595, + "learning_rate": 1.0714967707940875e-05, + "loss": 0.6965, + "num_input_tokens_seen": 23238144, + "step": 4335 + }, + { + "epoch": 0.6944, + "grad_norm": 0.928415060043335, + "learning_rate": 1.0663448611041113e-05, + "loss": 0.6232, + "num_input_tokens_seen": 23267104, + "step": 4340 + }, + { + "epoch": 0.6952, + "grad_norm": 1.2702407836914062, + "learning_rate": 1.0612020071591722e-05, + "loss": 0.6686, + "num_input_tokens_seen": 23298976, + "step": 4345 + }, + { + "epoch": 0.696, + "grad_norm": 1.1251682043075562, + "learning_rate": 1.0560682414443315e-05, + "loss": 0.6975, + "num_input_tokens_seen": 23329552, + "step": 4350 + }, + { + "epoch": 0.6968, + "grad_norm": 0.8569183945655823, + "learning_rate": 1.0509435963872422e-05, + "loss": 0.7017, + "num_input_tokens_seen": 23359664, + "step": 4355 + }, + { + "epoch": 0.6976, + "grad_norm": 0.9474520683288574, + "learning_rate": 1.0458281043579482e-05, + "loss": 0.6856, + "num_input_tokens_seen": 23386320, + "step": 4360 + }, + { + "epoch": 0.6984, + "grad_norm": 1.3578598499298096, + "learning_rate": 1.0407217976686775e-05, + "loss": 0.6739, + "num_input_tokens_seen": 23416512, + "step": 4365 + }, + { + "epoch": 0.6992, + "grad_norm": 1.1748965978622437, + "learning_rate": 1.0356247085736386e-05, + "loss": 0.6803, + "num_input_tokens_seen": 23439904, + "step": 4370 + }, + { + "epoch": 0.7, + "grad_norm": 0.7888918519020081, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.7111, + "num_input_tokens_seen": 23466256, + "step": 4375 + }, + { + "epoch": 0.7008, + "grad_norm": 1.137215256690979, + "learning_rate": 1.0254583118917698e-05, + "loss": 0.762, + "num_input_tokens_seen": 23493536, + "step": 4380 + }, + { + "epoch": 0.7016, + "grad_norm": 1.337811827659607, + "learning_rate": 1.020389068521426e-05, + "loss": 0.6206, + "num_input_tokens_seen": 23515632, + "step": 4385 + }, + { + "epoch": 0.7024, + "grad_norm": 1.0022634267807007, + "learning_rate": 1.0153291711778826e-05, + "loss": 0.6711, + "num_input_tokens_seen": 23541152, + "step": 4390 + }, + { + "epoch": 0.7032, + "grad_norm": 1.339572548866272, + "learning_rate": 1.0102786518221997e-05, + "loss": 0.7606, + "num_input_tokens_seen": 23565424, + "step": 4395 + }, + { + "epoch": 0.704, + "grad_norm": 0.9340786337852478, + "learning_rate": 1.0052375423562038e-05, + "loss": 0.7306, + "num_input_tokens_seen": 23592256, + "step": 4400 + }, + { + "epoch": 0.7048, + "grad_norm": 1.1283642053604126, + "learning_rate": 1.0002058746222806e-05, + "loss": 0.6121, + "num_input_tokens_seen": 23622240, + "step": 4405 + }, + { + "epoch": 0.7056, + "grad_norm": 0.7319700717926025, + "learning_rate": 9.951836804031794e-06, + "loss": 0.6273, + "num_input_tokens_seen": 23650544, + "step": 4410 + }, + { + "epoch": 0.7064, + "grad_norm": 1.1631896495819092, + "learning_rate": 9.90170991421808e-06, + "loss": 0.7261, + "num_input_tokens_seen": 23676016, + "step": 4415 + }, + { + "epoch": 0.7072, + "grad_norm": 0.7371265292167664, + "learning_rate": 9.851678393410343e-06, + "loss": 0.8013, + "num_input_tokens_seen": 23701232, + "step": 4420 + }, + { + "epoch": 0.708, + "grad_norm": 0.8485360741615295, + "learning_rate": 9.801742557634872e-06, + "loss": 0.7189, + "num_input_tokens_seen": 23731984, + "step": 4425 + }, + { + "epoch": 0.7088, + "grad_norm": 0.958996057510376, + "learning_rate": 9.751902722313527e-06, + "loss": 0.7397, + "num_input_tokens_seen": 23756560, + "step": 4430 + }, + { + "epoch": 0.7096, + "grad_norm": 0.9431763887405396, + "learning_rate": 9.702159202261801e-06, + "loss": 0.625, + "num_input_tokens_seen": 23785504, + "step": 4435 + }, + { + "epoch": 0.7104, + "grad_norm": 1.2089407444000244, + "learning_rate": 9.652512311686809e-06, + "loss": 0.7958, + "num_input_tokens_seen": 23811840, + "step": 4440 + }, + { + "epoch": 0.7112, + "grad_norm": 1.212649941444397, + "learning_rate": 9.602962364185286e-06, + "loss": 0.7092, + "num_input_tokens_seen": 23834688, + "step": 4445 + }, + { + "epoch": 0.712, + "grad_norm": 0.8656122088432312, + "learning_rate": 9.553509672741645e-06, + "loss": 0.6516, + "num_input_tokens_seen": 23858736, + "step": 4450 + }, + { + "epoch": 0.7128, + "grad_norm": 0.8871904611587524, + "learning_rate": 9.504154549725943e-06, + "loss": 0.6276, + "num_input_tokens_seen": 23883696, + "step": 4455 + }, + { + "epoch": 0.7136, + "grad_norm": 0.8539274334907532, + "learning_rate": 9.454897306891972e-06, + "loss": 0.6741, + "num_input_tokens_seen": 23909904, + "step": 4460 + }, + { + "epoch": 0.7144, + "grad_norm": 1.0730023384094238, + "learning_rate": 9.405738255375244e-06, + "loss": 0.7054, + "num_input_tokens_seen": 23933056, + "step": 4465 + }, + { + "epoch": 0.7152, + "grad_norm": 1.2047233581542969, + "learning_rate": 9.356677705691058e-06, + "loss": 0.7593, + "num_input_tokens_seen": 23957440, + "step": 4470 + }, + { + "epoch": 0.716, + "grad_norm": 0.8580662608146667, + "learning_rate": 9.307715967732491e-06, + "loss": 0.6264, + "num_input_tokens_seen": 23985088, + "step": 4475 + }, + { + "epoch": 0.7168, + "grad_norm": 0.884903073310852, + "learning_rate": 9.258853350768499e-06, + "loss": 0.6596, + "num_input_tokens_seen": 24010448, + "step": 4480 + }, + { + "epoch": 0.7176, + "grad_norm": 0.9185977578163147, + "learning_rate": 9.210090163441929e-06, + "loss": 0.7053, + "num_input_tokens_seen": 24035040, + "step": 4485 + }, + { + "epoch": 0.7184, + "grad_norm": 1.3897747993469238, + "learning_rate": 9.161426713767574e-06, + "loss": 0.6769, + "num_input_tokens_seen": 24057872, + "step": 4490 + }, + { + "epoch": 0.7192, + "grad_norm": 1.220688819885254, + "learning_rate": 9.112863309130235e-06, + "loss": 0.7486, + "num_input_tokens_seen": 24077920, + "step": 4495 + }, + { + "epoch": 0.72, + "grad_norm": 0.9036649465560913, + "learning_rate": 9.064400256282757e-06, + "loss": 0.765, + "num_input_tokens_seen": 24104320, + "step": 4500 + }, + { + "epoch": 0.7208, + "grad_norm": 0.7980133891105652, + "learning_rate": 9.016037861344129e-06, + "loss": 0.653, + "num_input_tokens_seen": 24134144, + "step": 4505 + }, + { + "epoch": 0.7216, + "grad_norm": 0.7849147915840149, + "learning_rate": 8.967776429797528e-06, + "loss": 0.6412, + "num_input_tokens_seen": 24164576, + "step": 4510 + }, + { + "epoch": 0.7224, + "grad_norm": 0.8543937802314758, + "learning_rate": 8.919616266488373e-06, + "loss": 0.7113, + "num_input_tokens_seen": 24192736, + "step": 4515 + }, + { + "epoch": 0.7232, + "grad_norm": 0.9191213250160217, + "learning_rate": 8.871557675622441e-06, + "loss": 0.8171, + "num_input_tokens_seen": 24218064, + "step": 4520 + }, + { + "epoch": 0.724, + "grad_norm": 1.1177440881729126, + "learning_rate": 8.8236009607639e-06, + "loss": 0.7845, + "num_input_tokens_seen": 24244832, + "step": 4525 + }, + { + "epoch": 0.7248, + "grad_norm": 0.899111807346344, + "learning_rate": 8.775746424833427e-06, + "loss": 0.7025, + "num_input_tokens_seen": 24272848, + "step": 4530 + }, + { + "epoch": 0.7256, + "grad_norm": 1.1424217224121094, + "learning_rate": 8.727994370106288e-06, + "loss": 0.868, + "num_input_tokens_seen": 24298240, + "step": 4535 + }, + { + "epoch": 0.7264, + "grad_norm": 0.9559049010276794, + "learning_rate": 8.680345098210408e-06, + "loss": 0.6285, + "num_input_tokens_seen": 24327776, + "step": 4540 + }, + { + "epoch": 0.7272, + "grad_norm": 0.9032924771308899, + "learning_rate": 8.632798910124492e-06, + "loss": 0.6583, + "num_input_tokens_seen": 24355424, + "step": 4545 + }, + { + "epoch": 0.728, + "grad_norm": 1.056780457496643, + "learning_rate": 8.585356106176094e-06, + "loss": 0.756, + "num_input_tokens_seen": 24381104, + "step": 4550 + }, + { + "epoch": 0.7288, + "grad_norm": 1.2001997232437134, + "learning_rate": 8.538016986039754e-06, + "loss": 0.7739, + "num_input_tokens_seen": 24403760, + "step": 4555 + }, + { + "epoch": 0.7296, + "grad_norm": 1.1103582382202148, + "learning_rate": 8.49078184873508e-06, + "loss": 0.6998, + "num_input_tokens_seen": 24431280, + "step": 4560 + }, + { + "epoch": 0.7304, + "grad_norm": 1.0271393060684204, + "learning_rate": 8.443650992624877e-06, + "loss": 0.723, + "num_input_tokens_seen": 24459120, + "step": 4565 + }, + { + "epoch": 0.7312, + "grad_norm": 0.7871257066726685, + "learning_rate": 8.39662471541325e-06, + "loss": 0.7225, + "num_input_tokens_seen": 24485152, + "step": 4570 + }, + { + "epoch": 0.732, + "grad_norm": 0.8319628238677979, + "learning_rate": 8.34970331414371e-06, + "loss": 0.5801, + "num_input_tokens_seen": 24512416, + "step": 4575 + }, + { + "epoch": 0.7328, + "grad_norm": 0.7009981274604797, + "learning_rate": 8.302887085197341e-06, + "loss": 0.6043, + "num_input_tokens_seen": 24543328, + "step": 4580 + }, + { + "epoch": 0.7336, + "grad_norm": 1.0223398208618164, + "learning_rate": 8.256176324290885e-06, + "loss": 0.6533, + "num_input_tokens_seen": 24566000, + "step": 4585 + }, + { + "epoch": 0.7344, + "grad_norm": 1.127424955368042, + "learning_rate": 8.209571326474896e-06, + "loss": 0.6906, + "num_input_tokens_seen": 24594080, + "step": 4590 + }, + { + "epoch": 0.7352, + "grad_norm": 0.9771124124526978, + "learning_rate": 8.163072386131876e-06, + "loss": 0.6661, + "num_input_tokens_seen": 24621424, + "step": 4595 + }, + { + "epoch": 0.736, + "grad_norm": 0.859312117099762, + "learning_rate": 8.116679796974392e-06, + "loss": 0.6663, + "num_input_tokens_seen": 24644288, + "step": 4600 + }, + { + "epoch": 0.7368, + "grad_norm": 1.3819899559020996, + "learning_rate": 8.070393852043251e-06, + "loss": 0.7064, + "num_input_tokens_seen": 24674048, + "step": 4605 + }, + { + "epoch": 0.7376, + "grad_norm": 1.034734845161438, + "learning_rate": 8.024214843705646e-06, + "loss": 0.6837, + "num_input_tokens_seen": 24696320, + "step": 4610 + }, + { + "epoch": 0.7384, + "grad_norm": 0.9610295295715332, + "learning_rate": 7.978143063653298e-06, + "loss": 0.5342, + "num_input_tokens_seen": 24729280, + "step": 4615 + }, + { + "epoch": 0.7392, + "grad_norm": 1.166585922241211, + "learning_rate": 7.93217880290059e-06, + "loss": 0.6907, + "num_input_tokens_seen": 24758080, + "step": 4620 + }, + { + "epoch": 0.74, + "grad_norm": 1.1341148614883423, + "learning_rate": 7.886322351782783e-06, + "loss": 0.6856, + "num_input_tokens_seen": 24787472, + "step": 4625 + }, + { + "epoch": 0.7408, + "grad_norm": 0.9481520056724548, + "learning_rate": 7.840573999954153e-06, + "loss": 0.713, + "num_input_tokens_seen": 24815936, + "step": 4630 + }, + { + "epoch": 0.7416, + "grad_norm": 1.2403899431228638, + "learning_rate": 7.79493403638614e-06, + "loss": 0.7692, + "num_input_tokens_seen": 24840096, + "step": 4635 + }, + { + "epoch": 0.7424, + "grad_norm": 0.9576728343963623, + "learning_rate": 7.749402749365572e-06, + "loss": 0.7177, + "num_input_tokens_seen": 24866480, + "step": 4640 + }, + { + "epoch": 0.7432, + "grad_norm": 1.0239994525909424, + "learning_rate": 7.703980426492791e-06, + "loss": 0.7124, + "num_input_tokens_seen": 24889456, + "step": 4645 + }, + { + "epoch": 0.744, + "grad_norm": 1.0492584705352783, + "learning_rate": 7.658667354679883e-06, + "loss": 0.7038, + "num_input_tokens_seen": 24913824, + "step": 4650 + }, + { + "epoch": 0.7448, + "grad_norm": 1.1247596740722656, + "learning_rate": 7.613463820148831e-06, + "loss": 0.6662, + "num_input_tokens_seen": 24940880, + "step": 4655 + }, + { + "epoch": 0.7456, + "grad_norm": 1.2390748262405396, + "learning_rate": 7.568370108429732e-06, + "loss": 0.7949, + "num_input_tokens_seen": 24965696, + "step": 4660 + }, + { + "epoch": 0.7464, + "grad_norm": 0.7792567610740662, + "learning_rate": 7.523386504358984e-06, + "loss": 0.7146, + "num_input_tokens_seen": 24992096, + "step": 4665 + }, + { + "epoch": 0.7472, + "grad_norm": 0.9417341351509094, + "learning_rate": 7.478513292077463e-06, + "loss": 0.669, + "num_input_tokens_seen": 25024320, + "step": 4670 + }, + { + "epoch": 0.748, + "grad_norm": 1.226563572883606, + "learning_rate": 7.433750755028773e-06, + "loss": 0.7789, + "num_input_tokens_seen": 25049152, + "step": 4675 + }, + { + "epoch": 0.7488, + "grad_norm": 0.8685075640678406, + "learning_rate": 7.389099175957429e-06, + "loss": 0.6992, + "num_input_tokens_seen": 25077328, + "step": 4680 + }, + { + "epoch": 0.7496, + "grad_norm": 0.7221574187278748, + "learning_rate": 7.344558836907067e-06, + "loss": 0.6421, + "num_input_tokens_seen": 25105008, + "step": 4685 + }, + { + "epoch": 0.7504, + "grad_norm": 0.8624604940414429, + "learning_rate": 7.300130019218687e-06, + "loss": 0.7656, + "num_input_tokens_seen": 25131392, + "step": 4690 + }, + { + "epoch": 0.7512, + "grad_norm": 1.0061527490615845, + "learning_rate": 7.255813003528833e-06, + "loss": 0.6506, + "num_input_tokens_seen": 25159984, + "step": 4695 + }, + { + "epoch": 0.752, + "grad_norm": 1.0879433155059814, + "learning_rate": 7.211608069767867e-06, + "loss": 0.6253, + "num_input_tokens_seen": 25188192, + "step": 4700 + }, + { + "epoch": 0.7528, + "grad_norm": 1.2521827220916748, + "learning_rate": 7.1675154971581785e-06, + "loss": 0.6776, + "num_input_tokens_seen": 25215360, + "step": 4705 + }, + { + "epoch": 0.7536, + "grad_norm": 1.4772545099258423, + "learning_rate": 7.123535564212422e-06, + "loss": 0.8286, + "num_input_tokens_seen": 25240384, + "step": 4710 + }, + { + "epoch": 0.7544, + "grad_norm": 1.0587224960327148, + "learning_rate": 7.079668548731758e-06, + "loss": 0.6152, + "num_input_tokens_seen": 25263104, + "step": 4715 + }, + { + "epoch": 0.7552, + "grad_norm": 1.562467336654663, + "learning_rate": 7.035914727804085e-06, + "loss": 0.7227, + "num_input_tokens_seen": 25288176, + "step": 4720 + }, + { + "epoch": 0.756, + "grad_norm": 1.3081474304199219, + "learning_rate": 6.992274377802327e-06, + "loss": 0.6808, + "num_input_tokens_seen": 25313536, + "step": 4725 + }, + { + "epoch": 0.7568, + "grad_norm": 1.1339465379714966, + "learning_rate": 6.94874777438265e-06, + "loss": 0.7039, + "num_input_tokens_seen": 25339744, + "step": 4730 + }, + { + "epoch": 0.7576, + "grad_norm": 0.8575751185417175, + "learning_rate": 6.905335192482737e-06, + "loss": 0.7081, + "num_input_tokens_seen": 25367440, + "step": 4735 + }, + { + "epoch": 0.7584, + "grad_norm": 0.9493206143379211, + "learning_rate": 6.862036906320058e-06, + "loss": 0.6139, + "num_input_tokens_seen": 25395952, + "step": 4740 + }, + { + "epoch": 0.7592, + "grad_norm": 1.0198074579238892, + "learning_rate": 6.818853189390104e-06, + "loss": 0.8142, + "num_input_tokens_seen": 25421744, + "step": 4745 + }, + { + "epoch": 0.76, + "grad_norm": 0.8722714185714722, + "learning_rate": 6.775784314464717e-06, + "loss": 0.6538, + "num_input_tokens_seen": 25448944, + "step": 4750 + }, + { + "epoch": 0.7608, + "grad_norm": 0.750995397567749, + "learning_rate": 6.732830553590305e-06, + "loss": 0.6409, + "num_input_tokens_seen": 25476640, + "step": 4755 + }, + { + "epoch": 0.7616, + "grad_norm": 1.152645230293274, + "learning_rate": 6.689992178086174e-06, + "loss": 0.6814, + "num_input_tokens_seen": 25503328, + "step": 4760 + }, + { + "epoch": 0.7624, + "grad_norm": 0.9856323599815369, + "learning_rate": 6.647269458542793e-06, + "loss": 0.739, + "num_input_tokens_seen": 25530384, + "step": 4765 + }, + { + "epoch": 0.7632, + "grad_norm": 1.0248849391937256, + "learning_rate": 6.604662664820063e-06, + "loss": 0.6775, + "num_input_tokens_seen": 25558880, + "step": 4770 + }, + { + "epoch": 0.764, + "grad_norm": 1.1603997945785522, + "learning_rate": 6.562172066045655e-06, + "loss": 0.8087, + "num_input_tokens_seen": 25584016, + "step": 4775 + }, + { + "epoch": 0.7648, + "grad_norm": 1.0015392303466797, + "learning_rate": 6.519797930613289e-06, + "loss": 0.6836, + "num_input_tokens_seen": 25611712, + "step": 4780 + }, + { + "epoch": 0.7656, + "grad_norm": 0.929892897605896, + "learning_rate": 6.4775405261810364e-06, + "loss": 0.7174, + "num_input_tokens_seen": 25640928, + "step": 4785 + }, + { + "epoch": 0.7664, + "grad_norm": 1.0972721576690674, + "learning_rate": 6.435400119669618e-06, + "loss": 0.6151, + "num_input_tokens_seen": 25667376, + "step": 4790 + }, + { + "epoch": 0.7672, + "grad_norm": 1.1308437585830688, + "learning_rate": 6.3933769772607535e-06, + "loss": 0.7291, + "num_input_tokens_seen": 25697136, + "step": 4795 + }, + { + "epoch": 0.768, + "grad_norm": 0.9035334587097168, + "learning_rate": 6.3514713643954475e-06, + "loss": 0.7215, + "num_input_tokens_seen": 25718912, + "step": 4800 + }, + { + "epoch": 0.7688, + "grad_norm": 0.7563897371292114, + "learning_rate": 6.309683545772327e-06, + "loss": 0.7092, + "num_input_tokens_seen": 25746400, + "step": 4805 + }, + { + "epoch": 0.7696, + "grad_norm": 0.8460260629653931, + "learning_rate": 6.268013785345969e-06, + "loss": 0.6675, + "num_input_tokens_seen": 25772192, + "step": 4810 + }, + { + "epoch": 0.7704, + "grad_norm": 0.8550633788108826, + "learning_rate": 6.226462346325221e-06, + "loss": 0.7428, + "num_input_tokens_seen": 25802256, + "step": 4815 + }, + { + "epoch": 0.7712, + "grad_norm": 1.0745741128921509, + "learning_rate": 6.185029491171554e-06, + "loss": 0.7039, + "num_input_tokens_seen": 25829952, + "step": 4820 + }, + { + "epoch": 0.772, + "grad_norm": 0.9149506688117981, + "learning_rate": 6.143715481597404e-06, + "loss": 0.6733, + "num_input_tokens_seen": 25854752, + "step": 4825 + }, + { + "epoch": 0.7728, + "grad_norm": 1.1725239753723145, + "learning_rate": 6.102520578564508e-06, + "loss": 0.7872, + "num_input_tokens_seen": 25881264, + "step": 4830 + }, + { + "epoch": 0.7736, + "grad_norm": 1.1597212553024292, + "learning_rate": 6.061445042282271e-06, + "loss": 0.7586, + "num_input_tokens_seen": 25906064, + "step": 4835 + }, + { + "epoch": 0.7744, + "grad_norm": 0.9395809173583984, + "learning_rate": 6.020489132206089e-06, + "loss": 0.7269, + "num_input_tokens_seen": 25931280, + "step": 4840 + }, + { + "epoch": 0.7752, + "grad_norm": 0.8174002170562744, + "learning_rate": 5.979653107035757e-06, + "loss": 0.6304, + "num_input_tokens_seen": 25958880, + "step": 4845 + }, + { + "epoch": 0.776, + "grad_norm": 0.9226968884468079, + "learning_rate": 5.9389372247138e-06, + "loss": 0.6855, + "num_input_tokens_seen": 25984528, + "step": 4850 + }, + { + "epoch": 0.7768, + "grad_norm": 1.0765284299850464, + "learning_rate": 5.898341742423865e-06, + "loss": 0.7141, + "num_input_tokens_seen": 26014272, + "step": 4855 + }, + { + "epoch": 0.7776, + "grad_norm": 0.7940208315849304, + "learning_rate": 5.857866916589089e-06, + "loss": 0.668, + "num_input_tokens_seen": 26045888, + "step": 4860 + }, + { + "epoch": 0.7784, + "grad_norm": 0.9069024920463562, + "learning_rate": 5.81751300287045e-06, + "loss": 0.7404, + "num_input_tokens_seen": 26069232, + "step": 4865 + }, + { + "epoch": 0.7792, + "grad_norm": 1.2687326669692993, + "learning_rate": 5.777280256165218e-06, + "loss": 0.633, + "num_input_tokens_seen": 26095936, + "step": 4870 + }, + { + "epoch": 0.78, + "grad_norm": 1.0579140186309814, + "learning_rate": 5.737168930605272e-06, + "loss": 0.6365, + "num_input_tokens_seen": 26121184, + "step": 4875 + }, + { + "epoch": 0.7808, + "grad_norm": 0.8767179846763611, + "learning_rate": 5.6971792795555505e-06, + "loss": 0.6427, + "num_input_tokens_seen": 26147504, + "step": 4880 + }, + { + "epoch": 0.7816, + "grad_norm": 0.9713358283042908, + "learning_rate": 5.6573115556124325e-06, + "loss": 0.6509, + "num_input_tokens_seen": 26174208, + "step": 4885 + }, + { + "epoch": 0.7824, + "grad_norm": 0.7532449960708618, + "learning_rate": 5.617566010602113e-06, + "loss": 0.7382, + "num_input_tokens_seen": 26200112, + "step": 4890 + }, + { + "epoch": 0.7832, + "grad_norm": 1.191658854484558, + "learning_rate": 5.577942895579064e-06, + "loss": 0.7537, + "num_input_tokens_seen": 26227952, + "step": 4895 + }, + { + "epoch": 0.784, + "grad_norm": 0.9605312943458557, + "learning_rate": 5.538442460824417e-06, + "loss": 0.673, + "num_input_tokens_seen": 26259392, + "step": 4900 + }, + { + "epoch": 0.7848, + "grad_norm": 1.0074589252471924, + "learning_rate": 5.499064955844382e-06, + "loss": 0.6684, + "num_input_tokens_seen": 26285456, + "step": 4905 + }, + { + "epoch": 0.7856, + "grad_norm": 0.8559053540229797, + "learning_rate": 5.4598106293686916e-06, + "loss": 0.7051, + "num_input_tokens_seen": 26316544, + "step": 4910 + }, + { + "epoch": 0.7864, + "grad_norm": 1.0885223150253296, + "learning_rate": 5.420679729348993e-06, + "loss": 0.6481, + "num_input_tokens_seen": 26341840, + "step": 4915 + }, + { + "epoch": 0.7872, + "grad_norm": 1.0094472169876099, + "learning_rate": 5.381672502957324e-06, + "loss": 0.7953, + "num_input_tokens_seen": 26371008, + "step": 4920 + }, + { + "epoch": 0.788, + "grad_norm": 1.1841135025024414, + "learning_rate": 5.342789196584527e-06, + "loss": 0.6966, + "num_input_tokens_seen": 26400048, + "step": 4925 + }, + { + "epoch": 0.7888, + "grad_norm": 0.9742115139961243, + "learning_rate": 5.304030055838705e-06, + "loss": 0.6804, + "num_input_tokens_seen": 26425408, + "step": 4930 + }, + { + "epoch": 0.7896, + "grad_norm": 1.327427625656128, + "learning_rate": 5.26539532554364e-06, + "loss": 0.6282, + "num_input_tokens_seen": 26452352, + "step": 4935 + }, + { + "epoch": 0.7904, + "grad_norm": 0.9497706890106201, + "learning_rate": 5.226885249737293e-06, + "loss": 0.588, + "num_input_tokens_seen": 26479456, + "step": 4940 + }, + { + "epoch": 0.7912, + "grad_norm": 1.0179611444473267, + "learning_rate": 5.1885000716702355e-06, + "loss": 0.738, + "num_input_tokens_seen": 26504912, + "step": 4945 + }, + { + "epoch": 0.792, + "grad_norm": 1.0633511543273926, + "learning_rate": 5.150240033804116e-06, + "loss": 0.661, + "num_input_tokens_seen": 26528320, + "step": 4950 + }, + { + "epoch": 0.7928, + "grad_norm": 0.887589693069458, + "learning_rate": 5.112105377810128e-06, + "loss": 0.8033, + "num_input_tokens_seen": 26553984, + "step": 4955 + }, + { + "epoch": 0.7936, + "grad_norm": 1.0450173616409302, + "learning_rate": 5.074096344567475e-06, + "loss": 0.6161, + "num_input_tokens_seen": 26582768, + "step": 4960 + }, + { + "epoch": 0.7944, + "grad_norm": 1.2726836204528809, + "learning_rate": 5.036213174161877e-06, + "loss": 0.7286, + "num_input_tokens_seen": 26610272, + "step": 4965 + }, + { + "epoch": 0.7952, + "grad_norm": 1.0096362829208374, + "learning_rate": 4.998456105884025e-06, + "loss": 0.7065, + "num_input_tokens_seen": 26636352, + "step": 4970 + }, + { + "epoch": 0.796, + "grad_norm": 1.1659733057022095, + "learning_rate": 4.960825378228082e-06, + "loss": 0.6842, + "num_input_tokens_seen": 26667824, + "step": 4975 + }, + { + "epoch": 0.7968, + "grad_norm": 0.9078534245491028, + "learning_rate": 4.9233212288901845e-06, + "loss": 0.7069, + "num_input_tokens_seen": 26698272, + "step": 4980 + }, + { + "epoch": 0.7976, + "grad_norm": 0.8440881967544556, + "learning_rate": 4.885943894766909e-06, + "loss": 0.5942, + "num_input_tokens_seen": 26725984, + "step": 4985 + }, + { + "epoch": 0.7984, + "grad_norm": 1.0457020998001099, + "learning_rate": 4.848693611953825e-06, + "loss": 0.8419, + "num_input_tokens_seen": 26751360, + "step": 4990 + }, + { + "epoch": 0.7992, + "grad_norm": 0.947726309299469, + "learning_rate": 4.811570615743952e-06, + "loss": 0.5888, + "num_input_tokens_seen": 26782672, + "step": 4995 + }, + { + "epoch": 0.8, + "grad_norm": 1.1138640642166138, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.6504, + "num_input_tokens_seen": 26809120, + "step": 5000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.9211150407791138, + "learning_rate": 4.737707420284451e-06, + "loss": 0.6603, + "num_input_tokens_seen": 26839552, + "step": 5005 + }, + { + "epoch": 0.8016, + "grad_norm": 1.2926892042160034, + "learning_rate": 4.700967687594901e-06, + "loss": 0.627, + "num_input_tokens_seen": 26864416, + "step": 5010 + }, + { + "epoch": 0.8024, + "grad_norm": 0.9436898827552795, + "learning_rate": 4.664356174625795e-06, + "loss": 0.6509, + "num_input_tokens_seen": 26890368, + "step": 5015 + }, + { + "epoch": 0.8032, + "grad_norm": 0.8215711712837219, + "learning_rate": 4.627873112635345e-06, + "loss": 0.6673, + "num_input_tokens_seen": 26916064, + "step": 5020 + }, + { + "epoch": 0.804, + "grad_norm": 0.9311307072639465, + "learning_rate": 4.591518732070402e-06, + "loss": 0.7972, + "num_input_tokens_seen": 26940528, + "step": 5025 + }, + { + "epoch": 0.8048, + "grad_norm": 1.1058831214904785, + "learning_rate": 4.5552932625649944e-06, + "loss": 0.6977, + "num_input_tokens_seen": 26965296, + "step": 5030 + }, + { + "epoch": 0.8056, + "grad_norm": 1.2519973516464233, + "learning_rate": 4.5191969329388625e-06, + "loss": 0.8094, + "num_input_tokens_seen": 26988240, + "step": 5035 + }, + { + "epoch": 0.8064, + "grad_norm": 0.9225587248802185, + "learning_rate": 4.483229971196054e-06, + "loss": 0.7441, + "num_input_tokens_seen": 27015632, + "step": 5040 + }, + { + "epoch": 0.8072, + "grad_norm": 0.8636001944541931, + "learning_rate": 4.44739260452344e-06, + "loss": 0.6618, + "num_input_tokens_seen": 27040528, + "step": 5045 + }, + { + "epoch": 0.808, + "grad_norm": 1.094529151916504, + "learning_rate": 4.411685059289314e-06, + "loss": 0.7527, + "num_input_tokens_seen": 27066560, + "step": 5050 + }, + { + "epoch": 0.8088, + "grad_norm": 0.9794814586639404, + "learning_rate": 4.376107561041937e-06, + "loss": 0.7844, + "num_input_tokens_seen": 27089408, + "step": 5055 + }, + { + "epoch": 0.8096, + "grad_norm": 0.8608947396278381, + "learning_rate": 4.340660334508115e-06, + "loss": 0.7598, + "num_input_tokens_seen": 27114832, + "step": 5060 + }, + { + "epoch": 0.8104, + "grad_norm": 0.8481171727180481, + "learning_rate": 4.305343603591802e-06, + "loss": 0.6645, + "num_input_tokens_seen": 27140320, + "step": 5065 + }, + { + "epoch": 0.8112, + "grad_norm": 0.9115588068962097, + "learning_rate": 4.270157591372667e-06, + "loss": 0.7065, + "num_input_tokens_seen": 27171200, + "step": 5070 + }, + { + "epoch": 0.812, + "grad_norm": 1.4465726613998413, + "learning_rate": 4.235102520104681e-06, + "loss": 0.7481, + "num_input_tokens_seen": 27195056, + "step": 5075 + }, + { + "epoch": 0.8128, + "grad_norm": 1.1063685417175293, + "learning_rate": 4.200178611214736e-06, + "loss": 0.6154, + "num_input_tokens_seen": 27220816, + "step": 5080 + }, + { + "epoch": 0.8136, + "grad_norm": 1.3894023895263672, + "learning_rate": 4.165386085301212e-06, + "loss": 0.6661, + "num_input_tokens_seen": 27246400, + "step": 5085 + }, + { + "epoch": 0.8144, + "grad_norm": 1.0136696100234985, + "learning_rate": 4.130725162132612e-06, + "loss": 0.7043, + "num_input_tokens_seen": 27269936, + "step": 5090 + }, + { + "epoch": 0.8152, + "grad_norm": 1.36388099193573, + "learning_rate": 4.096196060646168e-06, + "loss": 0.8173, + "num_input_tokens_seen": 27293488, + "step": 5095 + }, + { + "epoch": 0.816, + "grad_norm": 0.9864152669906616, + "learning_rate": 4.061798998946459e-06, + "loss": 0.7154, + "num_input_tokens_seen": 27318592, + "step": 5100 + }, + { + "epoch": 0.8168, + "grad_norm": 0.9855313301086426, + "learning_rate": 4.027534194304005e-06, + "loss": 0.6336, + "num_input_tokens_seen": 27343616, + "step": 5105 + }, + { + "epoch": 0.8176, + "grad_norm": 0.9487363696098328, + "learning_rate": 3.99340186315395e-06, + "loss": 0.7355, + "num_input_tokens_seen": 27369216, + "step": 5110 + }, + { + "epoch": 0.8184, + "grad_norm": 0.8174062371253967, + "learning_rate": 3.959402221094635e-06, + "loss": 0.6034, + "num_input_tokens_seen": 27398704, + "step": 5115 + }, + { + "epoch": 0.8192, + "grad_norm": 1.17109215259552, + "learning_rate": 3.925535482886286e-06, + "loss": 0.7962, + "num_input_tokens_seen": 27424176, + "step": 5120 + }, + { + "epoch": 0.82, + "grad_norm": 0.9322016835212708, + "learning_rate": 3.891801862449629e-06, + "loss": 0.7289, + "num_input_tokens_seen": 27452656, + "step": 5125 + }, + { + "epoch": 0.8208, + "grad_norm": 0.8374980688095093, + "learning_rate": 3.858201572864537e-06, + "loss": 0.6644, + "num_input_tokens_seen": 27478656, + "step": 5130 + }, + { + "epoch": 0.8216, + "grad_norm": 0.8750137686729431, + "learning_rate": 3.824734826368703e-06, + "loss": 0.7519, + "num_input_tokens_seen": 27507184, + "step": 5135 + }, + { + "epoch": 0.8224, + "grad_norm": 0.935739278793335, + "learning_rate": 3.7914018343562895e-06, + "loss": 0.7611, + "num_input_tokens_seen": 27536112, + "step": 5140 + }, + { + "epoch": 0.8232, + "grad_norm": 1.0115796327590942, + "learning_rate": 3.75820280737659e-06, + "loss": 0.6713, + "num_input_tokens_seen": 27563728, + "step": 5145 + }, + { + "epoch": 0.824, + "grad_norm": 1.1365923881530762, + "learning_rate": 3.725137955132707e-06, + "loss": 0.6522, + "num_input_tokens_seen": 27587120, + "step": 5150 + }, + { + "epoch": 0.8248, + "grad_norm": 1.2187998294830322, + "learning_rate": 3.692207486480209e-06, + "loss": 0.7707, + "num_input_tokens_seen": 27608240, + "step": 5155 + }, + { + "epoch": 0.8256, + "grad_norm": 1.2192776203155518, + "learning_rate": 3.6594116094258337e-06, + "loss": 0.6148, + "num_input_tokens_seen": 27637840, + "step": 5160 + }, + { + "epoch": 0.8264, + "grad_norm": 0.8527853488922119, + "learning_rate": 3.626750531126169e-06, + "loss": 0.6884, + "num_input_tokens_seen": 27662144, + "step": 5165 + }, + { + "epoch": 0.8272, + "grad_norm": 1.293915033340454, + "learning_rate": 3.594224457886336e-06, + "loss": 0.6954, + "num_input_tokens_seen": 27691184, + "step": 5170 + }, + { + "epoch": 0.828, + "grad_norm": 0.9655611515045166, + "learning_rate": 3.561833595158698e-06, + "loss": 0.6736, + "num_input_tokens_seen": 27719376, + "step": 5175 + }, + { + "epoch": 0.8288, + "grad_norm": 0.8014153838157654, + "learning_rate": 3.529578147541532e-06, + "loss": 0.7575, + "num_input_tokens_seen": 27749248, + "step": 5180 + }, + { + "epoch": 0.8296, + "grad_norm": 0.8693012595176697, + "learning_rate": 3.4974583187777852e-06, + "loss": 0.6607, + "num_input_tokens_seen": 27779136, + "step": 5185 + }, + { + "epoch": 0.8304, + "grad_norm": 1.2928762435913086, + "learning_rate": 3.4654743117537524e-06, + "loss": 0.844, + "num_input_tokens_seen": 27802592, + "step": 5190 + }, + { + "epoch": 0.8312, + "grad_norm": 0.9536007046699524, + "learning_rate": 3.433626328497805e-06, + "loss": 0.6596, + "num_input_tokens_seen": 27826864, + "step": 5195 + }, + { + "epoch": 0.832, + "grad_norm": 1.01909601688385, + "learning_rate": 3.4019145701791184e-06, + "loss": 0.7834, + "num_input_tokens_seen": 27851680, + "step": 5200 + }, + { + "epoch": 0.8328, + "grad_norm": 0.8944137096405029, + "learning_rate": 3.3703392371063845e-06, + "loss": 0.6874, + "num_input_tokens_seen": 27880208, + "step": 5205 + }, + { + "epoch": 0.8336, + "grad_norm": 1.2721880674362183, + "learning_rate": 3.338900528726571e-06, + "loss": 0.6468, + "num_input_tokens_seen": 27907392, + "step": 5210 + }, + { + "epoch": 0.8344, + "grad_norm": 1.3406318426132202, + "learning_rate": 3.3075986436236493e-06, + "loss": 0.6675, + "num_input_tokens_seen": 27934560, + "step": 5215 + }, + { + "epoch": 0.8352, + "grad_norm": 0.9489244818687439, + "learning_rate": 3.2764337795173435e-06, + "loss": 0.6704, + "num_input_tokens_seen": 27963248, + "step": 5220 + }, + { + "epoch": 0.836, + "grad_norm": 0.9695754647254944, + "learning_rate": 3.245406133261858e-06, + "loss": 0.6903, + "num_input_tokens_seen": 27989872, + "step": 5225 + }, + { + "epoch": 0.8368, + "grad_norm": 0.7802848219871521, + "learning_rate": 3.2145159008446807e-06, + "loss": 0.7441, + "num_input_tokens_seen": 28012208, + "step": 5230 + }, + { + "epoch": 0.8376, + "grad_norm": 1.2693451642990112, + "learning_rate": 3.1837632773853098e-06, + "loss": 0.6636, + "num_input_tokens_seen": 28041200, + "step": 5235 + }, + { + "epoch": 0.8384, + "grad_norm": 1.1437602043151855, + "learning_rate": 3.15314845713402e-06, + "loss": 0.7246, + "num_input_tokens_seen": 28068272, + "step": 5240 + }, + { + "epoch": 0.8392, + "grad_norm": 0.8950793743133545, + "learning_rate": 3.122671633470664e-06, + "loss": 0.6583, + "num_input_tokens_seen": 28092768, + "step": 5245 + }, + { + "epoch": 0.84, + "grad_norm": 1.010606288909912, + "learning_rate": 3.0923329989034132e-06, + "loss": 0.6823, + "num_input_tokens_seen": 28122944, + "step": 5250 + }, + { + "epoch": 0.8408, + "grad_norm": 1.0807864665985107, + "learning_rate": 3.062132745067581e-06, + "loss": 0.7467, + "num_input_tokens_seen": 28151424, + "step": 5255 + }, + { + "epoch": 0.8416, + "grad_norm": 1.1845099925994873, + "learning_rate": 3.0320710627243813e-06, + "loss": 0.7541, + "num_input_tokens_seen": 28176752, + "step": 5260 + }, + { + "epoch": 0.8424, + "grad_norm": 0.7790080904960632, + "learning_rate": 3.002148141759739e-06, + "loss": 0.6829, + "num_input_tokens_seen": 28205456, + "step": 5265 + }, + { + "epoch": 0.8432, + "grad_norm": 1.364443302154541, + "learning_rate": 2.97236417118309e-06, + "loss": 0.692, + "num_input_tokens_seen": 28230304, + "step": 5270 + }, + { + "epoch": 0.844, + "grad_norm": 1.1289632320404053, + "learning_rate": 2.942719339126171e-06, + "loss": 0.7628, + "num_input_tokens_seen": 28255536, + "step": 5275 + }, + { + "epoch": 0.8448, + "grad_norm": 1.3984529972076416, + "learning_rate": 2.9132138328418573e-06, + "loss": 0.6972, + "num_input_tokens_seen": 28279600, + "step": 5280 + }, + { + "epoch": 0.8456, + "grad_norm": 1.0336651802062988, + "learning_rate": 2.8838478387029606e-06, + "loss": 0.6801, + "num_input_tokens_seen": 28304688, + "step": 5285 + }, + { + "epoch": 0.8464, + "grad_norm": 0.8985416293144226, + "learning_rate": 2.8546215422010638e-06, + "loss": 0.6697, + "num_input_tokens_seen": 28331584, + "step": 5290 + }, + { + "epoch": 0.8472, + "grad_norm": 1.3255959749221802, + "learning_rate": 2.8255351279453446e-06, + "loss": 0.6816, + "num_input_tokens_seen": 28360256, + "step": 5295 + }, + { + "epoch": 0.848, + "grad_norm": 0.9332570433616638, + "learning_rate": 2.7965887796613884e-06, + "loss": 0.7763, + "num_input_tokens_seen": 28385168, + "step": 5300 + }, + { + "epoch": 0.8488, + "grad_norm": 1.041506290435791, + "learning_rate": 2.767782680190073e-06, + "loss": 0.7837, + "num_input_tokens_seen": 28407248, + "step": 5305 + }, + { + "epoch": 0.8496, + "grad_norm": 1.075434923171997, + "learning_rate": 2.739117011486378e-06, + "loss": 0.6975, + "num_input_tokens_seen": 28434304, + "step": 5310 + }, + { + "epoch": 0.8504, + "grad_norm": 1.0231233835220337, + "learning_rate": 2.710591954618247e-06, + "loss": 0.7119, + "num_input_tokens_seen": 28465424, + "step": 5315 + }, + { + "epoch": 0.8512, + "grad_norm": 1.0655337572097778, + "learning_rate": 2.6822076897654452e-06, + "loss": 0.7644, + "num_input_tokens_seen": 28494416, + "step": 5320 + }, + { + "epoch": 0.852, + "grad_norm": 1.177964448928833, + "learning_rate": 2.6539643962184057e-06, + "loss": 0.644, + "num_input_tokens_seen": 28519552, + "step": 5325 + }, + { + "epoch": 0.8528, + "grad_norm": 1.0055785179138184, + "learning_rate": 2.6258622523771287e-06, + "loss": 0.7378, + "num_input_tokens_seen": 28545632, + "step": 5330 + }, + { + "epoch": 0.8536, + "grad_norm": 1.2302112579345703, + "learning_rate": 2.5979014357500248e-06, + "loss": 0.7267, + "num_input_tokens_seen": 28571440, + "step": 5335 + }, + { + "epoch": 0.8544, + "grad_norm": 1.2346506118774414, + "learning_rate": 2.570082122952816e-06, + "loss": 0.6015, + "num_input_tokens_seen": 28599472, + "step": 5340 + }, + { + "epoch": 0.8552, + "grad_norm": 0.8938889503479004, + "learning_rate": 2.5424044897073895e-06, + "loss": 0.6327, + "num_input_tokens_seen": 28629136, + "step": 5345 + }, + { + "epoch": 0.856, + "grad_norm": 0.8958183526992798, + "learning_rate": 2.514868710840723e-06, + "loss": 0.739, + "num_input_tokens_seen": 28648928, + "step": 5350 + }, + { + "epoch": 0.8568, + "grad_norm": 1.0484833717346191, + "learning_rate": 2.4874749602837697e-06, + "loss": 0.7279, + "num_input_tokens_seen": 28675056, + "step": 5355 + }, + { + "epoch": 0.8576, + "grad_norm": 0.887750506401062, + "learning_rate": 2.4602234110703364e-06, + "loss": 0.726, + "num_input_tokens_seen": 28698416, + "step": 5360 + }, + { + "epoch": 0.8584, + "grad_norm": 0.7482147216796875, + "learning_rate": 2.43311423533602e-06, + "loss": 0.7046, + "num_input_tokens_seen": 28729856, + "step": 5365 + }, + { + "epoch": 0.8592, + "grad_norm": 1.003188967704773, + "learning_rate": 2.406147604317119e-06, + "loss": 0.6922, + "num_input_tokens_seen": 28757360, + "step": 5370 + }, + { + "epoch": 0.86, + "grad_norm": 1.5129293203353882, + "learning_rate": 2.379323688349516e-06, + "loss": 0.6664, + "num_input_tokens_seen": 28780624, + "step": 5375 + }, + { + "epoch": 0.8608, + "grad_norm": 0.9118067622184753, + "learning_rate": 2.3526426568676483e-06, + "loss": 0.6532, + "num_input_tokens_seen": 28805616, + "step": 5380 + }, + { + "epoch": 0.8616, + "grad_norm": 1.007717251777649, + "learning_rate": 2.326104678403415e-06, + "loss": 0.6678, + "num_input_tokens_seen": 28833504, + "step": 5385 + }, + { + "epoch": 0.8624, + "grad_norm": 0.9576060771942139, + "learning_rate": 2.299709920585108e-06, + "loss": 0.6152, + "num_input_tokens_seen": 28862704, + "step": 5390 + }, + { + "epoch": 0.8632, + "grad_norm": 1.1482651233673096, + "learning_rate": 2.2734585501363673e-06, + "loss": 0.7131, + "num_input_tokens_seen": 28886224, + "step": 5395 + }, + { + "epoch": 0.864, + "grad_norm": 1.0614038705825806, + "learning_rate": 2.2473507328751086e-06, + "loss": 0.735, + "num_input_tokens_seen": 28911760, + "step": 5400 + }, + { + "epoch": 0.8648, + "grad_norm": 1.0017966032028198, + "learning_rate": 2.2213866337125022e-06, + "loss": 0.6706, + "num_input_tokens_seen": 28941360, + "step": 5405 + }, + { + "epoch": 0.8656, + "grad_norm": 0.963431179523468, + "learning_rate": 2.1955664166519036e-06, + "loss": 0.7683, + "num_input_tokens_seen": 28965568, + "step": 5410 + }, + { + "epoch": 0.8664, + "grad_norm": 1.1716382503509521, + "learning_rate": 2.1698902447878477e-06, + "loss": 0.623, + "num_input_tokens_seen": 28994432, + "step": 5415 + }, + { + "epoch": 0.8672, + "grad_norm": 1.2640058994293213, + "learning_rate": 2.1443582803049755e-06, + "loss": 0.7774, + "num_input_tokens_seen": 29016560, + "step": 5420 + }, + { + "epoch": 0.868, + "grad_norm": 0.9828547239303589, + "learning_rate": 2.118970684477062e-06, + "loss": 0.6332, + "num_input_tokens_seen": 29043920, + "step": 5425 + }, + { + "epoch": 0.8688, + "grad_norm": 0.9180524349212646, + "learning_rate": 2.093727617665955e-06, + "loss": 0.6658, + "num_input_tokens_seen": 29073840, + "step": 5430 + }, + { + "epoch": 0.8696, + "grad_norm": 1.1137315034866333, + "learning_rate": 2.068629239320588e-06, + "loss": 0.7078, + "num_input_tokens_seen": 29102752, + "step": 5435 + }, + { + "epoch": 0.8704, + "grad_norm": 1.1765251159667969, + "learning_rate": 2.043675707975959e-06, + "loss": 0.7049, + "num_input_tokens_seen": 29126576, + "step": 5440 + }, + { + "epoch": 0.8712, + "grad_norm": 0.9444310665130615, + "learning_rate": 2.0188671812521292e-06, + "loss": 0.7931, + "num_input_tokens_seen": 29153120, + "step": 5445 + }, + { + "epoch": 0.872, + "grad_norm": 0.914959192276001, + "learning_rate": 1.9942038158532407e-06, + "loss": 0.8394, + "num_input_tokens_seen": 29182192, + "step": 5450 + }, + { + "epoch": 0.8728, + "grad_norm": 1.21523916721344, + "learning_rate": 1.969685767566512e-06, + "loss": 0.6915, + "num_input_tokens_seen": 29206368, + "step": 5455 + }, + { + "epoch": 0.8736, + "grad_norm": 0.8198549151420593, + "learning_rate": 1.9453131912612694e-06, + "loss": 0.6627, + "num_input_tokens_seen": 29235984, + "step": 5460 + }, + { + "epoch": 0.8744, + "grad_norm": 0.9284049868583679, + "learning_rate": 1.921086240887937e-06, + "loss": 0.6671, + "num_input_tokens_seen": 29260672, + "step": 5465 + }, + { + "epoch": 0.8752, + "grad_norm": 0.9254517555236816, + "learning_rate": 1.8970050694771064e-06, + "loss": 0.665, + "num_input_tokens_seen": 29287792, + "step": 5470 + }, + { + "epoch": 0.876, + "grad_norm": 0.8524356484413147, + "learning_rate": 1.8730698291385518e-06, + "loss": 0.663, + "num_input_tokens_seen": 29314656, + "step": 5475 + }, + { + "epoch": 0.8768, + "grad_norm": 0.8038460612297058, + "learning_rate": 1.8492806710602496e-06, + "loss": 0.7054, + "num_input_tokens_seen": 29338976, + "step": 5480 + }, + { + "epoch": 0.8776, + "grad_norm": 0.8914519548416138, + "learning_rate": 1.8256377455074525e-06, + "loss": 0.6905, + "num_input_tokens_seen": 29364912, + "step": 5485 + }, + { + "epoch": 0.8784, + "grad_norm": 0.8928874135017395, + "learning_rate": 1.802141201821736e-06, + "loss": 0.7641, + "num_input_tokens_seen": 29392960, + "step": 5490 + }, + { + "epoch": 0.8792, + "grad_norm": 0.7842042446136475, + "learning_rate": 1.7787911884200314e-06, + "loss": 0.6918, + "num_input_tokens_seen": 29416848, + "step": 5495 + }, + { + "epoch": 0.88, + "grad_norm": 1.335999846458435, + "learning_rate": 1.7555878527937164e-06, + "loss": 0.6281, + "num_input_tokens_seen": 29445856, + "step": 5500 + }, + { + "epoch": 0.8808, + "grad_norm": 0.981039822101593, + "learning_rate": 1.7325313415076705e-06, + "loss": 0.7199, + "num_input_tokens_seen": 29474400, + "step": 5505 + }, + { + "epoch": 0.8816, + "grad_norm": 1.003205418586731, + "learning_rate": 1.7096218001993513e-06, + "loss": 0.7022, + "num_input_tokens_seen": 29501312, + "step": 5510 + }, + { + "epoch": 0.8824, + "grad_norm": 0.804409921169281, + "learning_rate": 1.686859373577876e-06, + "loss": 0.635, + "num_input_tokens_seen": 29530160, + "step": 5515 + }, + { + "epoch": 0.8832, + "grad_norm": 0.6613283157348633, + "learning_rate": 1.6642442054230934e-06, + "loss": 0.6752, + "num_input_tokens_seen": 29557168, + "step": 5520 + }, + { + "epoch": 0.884, + "grad_norm": 1.0624265670776367, + "learning_rate": 1.6417764385846996e-06, + "loss": 0.7824, + "num_input_tokens_seen": 29584832, + "step": 5525 + }, + { + "epoch": 0.8848, + "grad_norm": 1.0792144536972046, + "learning_rate": 1.6194562149813242e-06, + "loss": 0.6823, + "num_input_tokens_seen": 29609504, + "step": 5530 + }, + { + "epoch": 0.8856, + "grad_norm": 1.0641052722930908, + "learning_rate": 1.5972836755996285e-06, + "loss": 0.6777, + "num_input_tokens_seen": 29636768, + "step": 5535 + }, + { + "epoch": 0.8864, + "grad_norm": 1.2235264778137207, + "learning_rate": 1.5752589604934255e-06, + "loss": 0.7372, + "num_input_tokens_seen": 29660496, + "step": 5540 + }, + { + "epoch": 0.8872, + "grad_norm": 0.9904717206954956, + "learning_rate": 1.5533822087827805e-06, + "loss": 0.7126, + "num_input_tokens_seen": 29686928, + "step": 5545 + }, + { + "epoch": 0.888, + "grad_norm": 1.2209672927856445, + "learning_rate": 1.5316535586531483e-06, + "loss": 0.6564, + "num_input_tokens_seen": 29714800, + "step": 5550 + }, + { + "epoch": 0.8888, + "grad_norm": 1.2465245723724365, + "learning_rate": 1.5100731473544933e-06, + "loss": 0.8006, + "num_input_tokens_seen": 29741808, + "step": 5555 + }, + { + "epoch": 0.8896, + "grad_norm": 1.0243083238601685, + "learning_rate": 1.4886411112004255e-06, + "loss": 0.7322, + "num_input_tokens_seen": 29763088, + "step": 5560 + }, + { + "epoch": 0.8904, + "grad_norm": 0.9879063367843628, + "learning_rate": 1.4673575855673277e-06, + "loss": 0.7243, + "num_input_tokens_seen": 29791520, + "step": 5565 + }, + { + "epoch": 0.8912, + "grad_norm": 0.8593109250068665, + "learning_rate": 1.4462227048935183e-06, + "loss": 0.6955, + "num_input_tokens_seen": 29817600, + "step": 5570 + }, + { + "epoch": 0.892, + "grad_norm": 0.9585959911346436, + "learning_rate": 1.425236602678387e-06, + "loss": 0.6658, + "num_input_tokens_seen": 29843136, + "step": 5575 + }, + { + "epoch": 0.8928, + "grad_norm": 1.2681745290756226, + "learning_rate": 1.4043994114815661e-06, + "loss": 0.7943, + "num_input_tokens_seen": 29864864, + "step": 5580 + }, + { + "epoch": 0.8936, + "grad_norm": 0.8207817673683167, + "learning_rate": 1.38371126292208e-06, + "loss": 0.7734, + "num_input_tokens_seen": 29890416, + "step": 5585 + }, + { + "epoch": 0.8944, + "grad_norm": 1.0749483108520508, + "learning_rate": 1.3631722876775138e-06, + "loss": 0.7008, + "num_input_tokens_seen": 29916400, + "step": 5590 + }, + { + "epoch": 0.8952, + "grad_norm": 1.1091291904449463, + "learning_rate": 1.3427826154832042e-06, + "loss": 0.6434, + "num_input_tokens_seen": 29944304, + "step": 5595 + }, + { + "epoch": 0.896, + "grad_norm": 1.3164221048355103, + "learning_rate": 1.3225423751313942e-06, + "loss": 0.6952, + "num_input_tokens_seen": 29967648, + "step": 5600 + }, + { + "epoch": 0.8968, + "grad_norm": 1.2402032613754272, + "learning_rate": 1.3024516944704496e-06, + "loss": 0.6331, + "num_input_tokens_seen": 29989312, + "step": 5605 + }, + { + "epoch": 0.8976, + "grad_norm": 1.1296372413635254, + "learning_rate": 1.2825107004040272e-06, + "loss": 0.7894, + "num_input_tokens_seen": 30012384, + "step": 5610 + }, + { + "epoch": 0.8984, + "grad_norm": 1.109983205795288, + "learning_rate": 1.2627195188902791e-06, + "loss": 0.6819, + "num_input_tokens_seen": 30042656, + "step": 5615 + }, + { + "epoch": 0.8992, + "grad_norm": 1.0760643482208252, + "learning_rate": 1.2430782749410673e-06, + "loss": 0.8208, + "num_input_tokens_seen": 30068464, + "step": 5620 + }, + { + "epoch": 0.9, + "grad_norm": 1.1865812540054321, + "learning_rate": 1.2235870926211619e-06, + "loss": 0.7203, + "num_input_tokens_seen": 30093216, + "step": 5625 + }, + { + "epoch": 0.9008, + "grad_norm": 1.0090526342391968, + "learning_rate": 1.2042460950474648e-06, + "loss": 0.7368, + "num_input_tokens_seen": 30117488, + "step": 5630 + }, + { + "epoch": 0.9016, + "grad_norm": 0.8406433463096619, + "learning_rate": 1.1850554043882328e-06, + "loss": 0.7681, + "num_input_tokens_seen": 30144016, + "step": 5635 + }, + { + "epoch": 0.9024, + "grad_norm": 1.057853102684021, + "learning_rate": 1.1660151418622922e-06, + "loss": 0.6962, + "num_input_tokens_seen": 30177184, + "step": 5640 + }, + { + "epoch": 0.9032, + "grad_norm": 0.9097471237182617, + "learning_rate": 1.1471254277382881e-06, + "loss": 0.7056, + "num_input_tokens_seen": 30206048, + "step": 5645 + }, + { + "epoch": 0.904, + "grad_norm": 1.0608327388763428, + "learning_rate": 1.1283863813339263e-06, + "loss": 0.7089, + "num_input_tokens_seen": 30229936, + "step": 5650 + }, + { + "epoch": 0.9048, + "grad_norm": 1.1075376272201538, + "learning_rate": 1.1097981210152043e-06, + "loss": 0.7794, + "num_input_tokens_seen": 30257760, + "step": 5655 + }, + { + "epoch": 0.9056, + "grad_norm": 0.9509792327880859, + "learning_rate": 1.0913607641956841e-06, + "loss": 0.77, + "num_input_tokens_seen": 30286464, + "step": 5660 + }, + { + "epoch": 0.9064, + "grad_norm": 0.981621265411377, + "learning_rate": 1.0730744273357213e-06, + "loss": 0.708, + "num_input_tokens_seen": 30317040, + "step": 5665 + }, + { + "epoch": 0.9072, + "grad_norm": 0.8428750038146973, + "learning_rate": 1.0549392259417646e-06, + "loss": 0.6423, + "num_input_tokens_seen": 30342672, + "step": 5670 + }, + { + "epoch": 0.908, + "grad_norm": 0.9921499490737915, + "learning_rate": 1.0369552745656013e-06, + "loss": 0.8176, + "num_input_tokens_seen": 30369952, + "step": 5675 + }, + { + "epoch": 0.9088, + "grad_norm": 0.9219129681587219, + "learning_rate": 1.0191226868036418e-06, + "loss": 0.6924, + "num_input_tokens_seen": 30400800, + "step": 5680 + }, + { + "epoch": 0.9096, + "grad_norm": 1.451660394668579, + "learning_rate": 1.001441575296208e-06, + "loss": 0.576, + "num_input_tokens_seen": 30436240, + "step": 5685 + }, + { + "epoch": 0.9104, + "grad_norm": 0.9631555676460266, + "learning_rate": 9.839120517267985e-07, + "loss": 0.6206, + "num_input_tokens_seen": 30465232, + "step": 5690 + }, + { + "epoch": 0.9112, + "grad_norm": 1.125351071357727, + "learning_rate": 9.665342268214166e-07, + "loss": 0.7424, + "num_input_tokens_seen": 30489776, + "step": 5695 + }, + { + "epoch": 0.912, + "grad_norm": 1.0404316186904907, + "learning_rate": 9.493082103478517e-07, + "loss": 0.6868, + "num_input_tokens_seen": 30514592, + "step": 5700 + }, + { + "epoch": 0.9128, + "grad_norm": 0.9020105004310608, + "learning_rate": 9.322341111149852e-07, + "loss": 0.7017, + "num_input_tokens_seen": 30544112, + "step": 5705 + }, + { + "epoch": 0.9136, + "grad_norm": 1.0047924518585205, + "learning_rate": 9.153120369721046e-07, + "loss": 0.6429, + "num_input_tokens_seen": 30577440, + "step": 5710 + }, + { + "epoch": 0.9144, + "grad_norm": 1.087547779083252, + "learning_rate": 8.985420948082329e-07, + "loss": 0.6507, + "num_input_tokens_seen": 30602704, + "step": 5715 + }, + { + "epoch": 0.9152, + "grad_norm": 0.9647343754768372, + "learning_rate": 8.819243905514308e-07, + "loss": 0.7508, + "num_input_tokens_seen": 30627360, + "step": 5720 + }, + { + "epoch": 0.916, + "grad_norm": 0.9557966589927673, + "learning_rate": 8.65459029168153e-07, + "loss": 0.6506, + "num_input_tokens_seen": 30657216, + "step": 5725 + }, + { + "epoch": 0.9168, + "grad_norm": 0.991233229637146, + "learning_rate": 8.491461146625773e-07, + "loss": 0.6426, + "num_input_tokens_seen": 30683472, + "step": 5730 + }, + { + "epoch": 0.9176, + "grad_norm": 0.8405401706695557, + "learning_rate": 8.329857500759292e-07, + "loss": 0.6227, + "num_input_tokens_seen": 30707392, + "step": 5735 + }, + { + "epoch": 0.9184, + "grad_norm": 1.2772481441497803, + "learning_rate": 8.169780374858577e-07, + "loss": 0.839, + "num_input_tokens_seen": 30732160, + "step": 5740 + }, + { + "epoch": 0.9192, + "grad_norm": 0.8571876883506775, + "learning_rate": 8.011230780057749e-07, + "loss": 0.6817, + "num_input_tokens_seen": 30760336, + "step": 5745 + }, + { + "epoch": 0.92, + "grad_norm": 1.000186562538147, + "learning_rate": 7.854209717842231e-07, + "loss": 0.7016, + "num_input_tokens_seen": 30788800, + "step": 5750 + }, + { + "epoch": 0.9208, + "grad_norm": 1.0842921733856201, + "learning_rate": 7.698718180042392e-07, + "loss": 0.702, + "num_input_tokens_seen": 30813632, + "step": 5755 + }, + { + "epoch": 0.9216, + "grad_norm": 1.0399980545043945, + "learning_rate": 7.544757148827297e-07, + "loss": 0.7203, + "num_input_tokens_seen": 30840816, + "step": 5760 + }, + { + "epoch": 0.9224, + "grad_norm": 1.1896955966949463, + "learning_rate": 7.392327596698473e-07, + "loss": 0.6873, + "num_input_tokens_seen": 30861664, + "step": 5765 + }, + { + "epoch": 0.9232, + "grad_norm": 1.3494455814361572, + "learning_rate": 7.241430486483819e-07, + "loss": 0.6975, + "num_input_tokens_seen": 30886064, + "step": 5770 + }, + { + "epoch": 0.924, + "grad_norm": 1.1318798065185547, + "learning_rate": 7.092066771331507e-07, + "loss": 0.6058, + "num_input_tokens_seen": 30910608, + "step": 5775 + }, + { + "epoch": 0.9248, + "grad_norm": 1.230055332183838, + "learning_rate": 6.944237394703984e-07, + "loss": 0.8128, + "num_input_tokens_seen": 30935008, + "step": 5780 + }, + { + "epoch": 0.9256, + "grad_norm": 1.0150400400161743, + "learning_rate": 6.797943290371839e-07, + "loss": 0.7329, + "num_input_tokens_seen": 30959792, + "step": 5785 + }, + { + "epoch": 0.9264, + "grad_norm": 0.9498345255851746, + "learning_rate": 6.653185382408194e-07, + "loss": 0.673, + "num_input_tokens_seen": 30985856, + "step": 5790 + }, + { + "epoch": 0.9272, + "grad_norm": 1.047587275505066, + "learning_rate": 6.509964585182687e-07, + "loss": 0.7395, + "num_input_tokens_seen": 31013888, + "step": 5795 + }, + { + "epoch": 0.928, + "grad_norm": 0.9112536907196045, + "learning_rate": 6.368281803355691e-07, + "loss": 0.753, + "num_input_tokens_seen": 31038352, + "step": 5800 + }, + { + "epoch": 0.9288, + "grad_norm": 0.9714504480361938, + "learning_rate": 6.228137931872713e-07, + "loss": 0.7573, + "num_input_tokens_seen": 31066624, + "step": 5805 + }, + { + "epoch": 0.9296, + "grad_norm": 1.1300855875015259, + "learning_rate": 6.089533855958507e-07, + "loss": 0.759, + "num_input_tokens_seen": 31093184, + "step": 5810 + }, + { + "epoch": 0.9304, + "grad_norm": 1.0004905462265015, + "learning_rate": 5.95247045111183e-07, + "loss": 0.7482, + "num_input_tokens_seen": 31118352, + "step": 5815 + }, + { + "epoch": 0.9312, + "grad_norm": 0.8432052731513977, + "learning_rate": 5.816948583099613e-07, + "loss": 0.6295, + "num_input_tokens_seen": 31145616, + "step": 5820 + }, + { + "epoch": 0.932, + "grad_norm": 0.923195481300354, + "learning_rate": 5.68296910795163e-07, + "loss": 0.7596, + "num_input_tokens_seen": 31167088, + "step": 5825 + }, + { + "epoch": 0.9328, + "grad_norm": 1.0346819162368774, + "learning_rate": 5.550532871955061e-07, + "loss": 0.689, + "num_input_tokens_seen": 31192672, + "step": 5830 + }, + { + "epoch": 0.9336, + "grad_norm": 0.8770239949226379, + "learning_rate": 5.419640711649188e-07, + "loss": 0.6387, + "num_input_tokens_seen": 31224016, + "step": 5835 + }, + { + "epoch": 0.9344, + "grad_norm": 1.2829992771148682, + "learning_rate": 5.290293453819955e-07, + "loss": 0.7316, + "num_input_tokens_seen": 31247008, + "step": 5840 + }, + { + "epoch": 0.9352, + "grad_norm": 0.9933931231498718, + "learning_rate": 5.162491915495005e-07, + "loss": 0.7255, + "num_input_tokens_seen": 31273232, + "step": 5845 + }, + { + "epoch": 0.936, + "grad_norm": 0.8756529092788696, + "learning_rate": 5.036236903938285e-07, + "loss": 0.7188, + "num_input_tokens_seen": 31299504, + "step": 5850 + }, + { + "epoch": 0.9368, + "grad_norm": 0.85035240650177, + "learning_rate": 4.911529216645088e-07, + "loss": 0.6763, + "num_input_tokens_seen": 31325792, + "step": 5855 + }, + { + "epoch": 0.9376, + "grad_norm": 0.9068401455879211, + "learning_rate": 4.788369641336943e-07, + "loss": 0.6109, + "num_input_tokens_seen": 31351216, + "step": 5860 + }, + { + "epoch": 0.9384, + "grad_norm": 1.079689860343933, + "learning_rate": 4.666758955956613e-07, + "loss": 0.7778, + "num_input_tokens_seen": 31376464, + "step": 5865 + }, + { + "epoch": 0.9392, + "grad_norm": 0.964074969291687, + "learning_rate": 4.546697928663357e-07, + "loss": 0.6315, + "num_input_tokens_seen": 31408832, + "step": 5870 + }, + { + "epoch": 0.94, + "grad_norm": 1.1026054620742798, + "learning_rate": 4.4281873178278475e-07, + "loss": 0.7918, + "num_input_tokens_seen": 31432352, + "step": 5875 + }, + { + "epoch": 0.9408, + "grad_norm": 0.914069652557373, + "learning_rate": 4.311227872027479e-07, + "loss": 0.6983, + "num_input_tokens_seen": 31457392, + "step": 5880 + }, + { + "epoch": 0.9416, + "grad_norm": 0.989115834236145, + "learning_rate": 4.1958203300417054e-07, + "loss": 0.7233, + "num_input_tokens_seen": 31482704, + "step": 5885 + }, + { + "epoch": 0.9424, + "grad_norm": 1.034597396850586, + "learning_rate": 4.0819654208472947e-07, + "loss": 0.6402, + "num_input_tokens_seen": 31512368, + "step": 5890 + }, + { + "epoch": 0.9432, + "grad_norm": 1.3309321403503418, + "learning_rate": 3.9696638636137206e-07, + "loss": 0.6942, + "num_input_tokens_seen": 31539040, + "step": 5895 + }, + { + "epoch": 0.944, + "grad_norm": 1.2237857580184937, + "learning_rate": 3.8589163676986674e-07, + "loss": 0.7119, + "num_input_tokens_seen": 31563712, + "step": 5900 + }, + { + "epoch": 0.9448, + "grad_norm": 1.0268304347991943, + "learning_rate": 3.7497236326434757e-07, + "loss": 0.6575, + "num_input_tokens_seen": 31587760, + "step": 5905 + }, + { + "epoch": 0.9456, + "grad_norm": 1.0060738325119019, + "learning_rate": 3.6420863481688437e-07, + "loss": 0.704, + "num_input_tokens_seen": 31612976, + "step": 5910 + }, + { + "epoch": 0.9464, + "grad_norm": 0.9219969511032104, + "learning_rate": 3.536005194170328e-07, + "loss": 0.7876, + "num_input_tokens_seen": 31639472, + "step": 5915 + }, + { + "epoch": 0.9472, + "grad_norm": 0.8863883018493652, + "learning_rate": 3.431480840714152e-07, + "loss": 0.7033, + "num_input_tokens_seen": 31670768, + "step": 5920 + }, + { + "epoch": 0.948, + "grad_norm": 0.9394556879997253, + "learning_rate": 3.328513948032991e-07, + "loss": 0.7095, + "num_input_tokens_seen": 31696624, + "step": 5925 + }, + { + "epoch": 0.9488, + "grad_norm": 0.8008967638015747, + "learning_rate": 3.227105166521638e-07, + "loss": 0.6629, + "num_input_tokens_seen": 31723840, + "step": 5930 + }, + { + "epoch": 0.9496, + "grad_norm": 0.9910029172897339, + "learning_rate": 3.127255136733093e-07, + "loss": 0.591, + "num_input_tokens_seen": 31752736, + "step": 5935 + }, + { + "epoch": 0.9504, + "grad_norm": 0.9355325698852539, + "learning_rate": 3.0289644893744527e-07, + "loss": 0.6641, + "num_input_tokens_seen": 31777760, + "step": 5940 + }, + { + "epoch": 0.9512, + "grad_norm": 0.9911002516746521, + "learning_rate": 2.9322338453028066e-07, + "loss": 0.6156, + "num_input_tokens_seen": 31805264, + "step": 5945 + }, + { + "epoch": 0.952, + "grad_norm": 1.4127229452133179, + "learning_rate": 2.8370638155215123e-07, + "loss": 0.7834, + "num_input_tokens_seen": 31828656, + "step": 5950 + }, + { + "epoch": 0.9528, + "grad_norm": 1.0222047567367554, + "learning_rate": 2.743455001176176e-07, + "loss": 0.6998, + "num_input_tokens_seen": 31855424, + "step": 5955 + }, + { + "epoch": 0.9536, + "grad_norm": 0.9893736839294434, + "learning_rate": 2.6514079935509584e-07, + "loss": 0.7458, + "num_input_tokens_seen": 31879168, + "step": 5960 + }, + { + "epoch": 0.9544, + "grad_norm": 0.8317204713821411, + "learning_rate": 2.560923374064772e-07, + "loss": 0.7061, + "num_input_tokens_seen": 31903824, + "step": 5965 + }, + { + "epoch": 0.9552, + "grad_norm": 1.3220785856246948, + "learning_rate": 2.472001714267674e-07, + "loss": 0.8603, + "num_input_tokens_seen": 31927184, + "step": 5970 + }, + { + "epoch": 0.956, + "grad_norm": 0.8110103607177734, + "learning_rate": 2.384643575837203e-07, + "loss": 0.6273, + "num_input_tokens_seen": 31955104, + "step": 5975 + }, + { + "epoch": 0.9568, + "grad_norm": 0.6332679390907288, + "learning_rate": 2.298849510574824e-07, + "loss": 0.714, + "num_input_tokens_seen": 31985888, + "step": 5980 + }, + { + "epoch": 0.9576, + "grad_norm": 0.9290034174919128, + "learning_rate": 2.2146200604024613e-07, + "loss": 0.6899, + "num_input_tokens_seen": 32013520, + "step": 5985 + }, + { + "epoch": 0.9584, + "grad_norm": 1.0509424209594727, + "learning_rate": 2.1319557573591108e-07, + "loss": 0.677, + "num_input_tokens_seen": 32038880, + "step": 5990 + }, + { + "epoch": 0.9592, + "grad_norm": 1.0169018507003784, + "learning_rate": 2.050857123597455e-07, + "loss": 0.7033, + "num_input_tokens_seen": 32062160, + "step": 5995 + }, + { + "epoch": 0.96, + "grad_norm": 1.053408145904541, + "learning_rate": 1.9713246713805588e-07, + "loss": 0.6431, + "num_input_tokens_seen": 32085712, + "step": 6000 + }, + { + "epoch": 0.9608, + "grad_norm": 1.1077343225479126, + "learning_rate": 1.8933589030785682e-07, + "loss": 0.683, + "num_input_tokens_seen": 32115232, + "step": 6005 + }, + { + "epoch": 0.9616, + "grad_norm": 1.2601428031921387, + "learning_rate": 1.8169603111656552e-07, + "loss": 0.751, + "num_input_tokens_seen": 32142992, + "step": 6010 + }, + { + "epoch": 0.9624, + "grad_norm": 0.7372344136238098, + "learning_rate": 1.7421293782168835e-07, + "loss": 0.5808, + "num_input_tokens_seen": 32176176, + "step": 6015 + }, + { + "epoch": 0.9632, + "grad_norm": 0.856760561466217, + "learning_rate": 1.6688665769050703e-07, + "loss": 0.6852, + "num_input_tokens_seen": 32204992, + "step": 6020 + }, + { + "epoch": 0.964, + "grad_norm": 1.110574722290039, + "learning_rate": 1.5971723699979013e-07, + "loss": 0.6778, + "num_input_tokens_seen": 32232960, + "step": 6025 + }, + { + "epoch": 0.9648, + "grad_norm": 1.084190845489502, + "learning_rate": 1.5270472103549315e-07, + "loss": 0.7036, + "num_input_tokens_seen": 32262672, + "step": 6030 + }, + { + "epoch": 0.9656, + "grad_norm": 0.9454313516616821, + "learning_rate": 1.4584915409248112e-07, + "loss": 0.655, + "num_input_tokens_seen": 32285504, + "step": 6035 + }, + { + "epoch": 0.9664, + "grad_norm": 0.9206419587135315, + "learning_rate": 1.3915057947423705e-07, + "loss": 0.7324, + "num_input_tokens_seen": 32312288, + "step": 6040 + }, + { + "epoch": 0.9672, + "grad_norm": 0.9567137956619263, + "learning_rate": 1.3260903949260107e-07, + "loss": 0.7166, + "num_input_tokens_seen": 32339424, + "step": 6045 + }, + { + "epoch": 0.968, + "grad_norm": 1.0180697441101074, + "learning_rate": 1.2622457546749567e-07, + "loss": 0.7, + "num_input_tokens_seen": 32362848, + "step": 6050 + }, + { + "epoch": 0.9688, + "grad_norm": 1.2073848247528076, + "learning_rate": 1.1999722772666476e-07, + "loss": 0.7519, + "num_input_tokens_seen": 32393264, + "step": 6055 + }, + { + "epoch": 0.9696, + "grad_norm": 1.020180583000183, + "learning_rate": 1.1392703560542117e-07, + "loss": 0.7524, + "num_input_tokens_seen": 32418464, + "step": 6060 + }, + { + "epoch": 0.9704, + "grad_norm": 1.0939137935638428, + "learning_rate": 1.080140374463967e-07, + "loss": 0.5829, + "num_input_tokens_seen": 32449248, + "step": 6065 + }, + { + "epoch": 0.9712, + "grad_norm": 1.2307384014129639, + "learning_rate": 1.0225827059930083e-07, + "loss": 0.7017, + "num_input_tokens_seen": 32477312, + "step": 6070 + }, + { + "epoch": 0.972, + "grad_norm": 1.0141756534576416, + "learning_rate": 9.665977142068738e-08, + "loss": 0.6852, + "num_input_tokens_seen": 32505024, + "step": 6075 + }, + { + "epoch": 0.9728, + "grad_norm": 1.0366077423095703, + "learning_rate": 9.121857527372158e-08, + "loss": 0.733, + "num_input_tokens_seen": 32530080, + "step": 6080 + }, + { + "epoch": 0.9736, + "grad_norm": 0.9831274151802063, + "learning_rate": 8.593471652794949e-08, + "loss": 0.6721, + "num_input_tokens_seen": 32557488, + "step": 6085 + }, + { + "epoch": 0.9744, + "grad_norm": 1.0170478820800781, + "learning_rate": 8.080822855909831e-08, + "loss": 0.6572, + "num_input_tokens_seen": 32589072, + "step": 6090 + }, + { + "epoch": 0.9752, + "grad_norm": 1.0840100049972534, + "learning_rate": 7.583914374885426e-08, + "loss": 0.7535, + "num_input_tokens_seen": 32613296, + "step": 6095 + }, + { + "epoch": 0.976, + "grad_norm": 1.1899126768112183, + "learning_rate": 7.102749348465165e-08, + "loss": 0.639, + "num_input_tokens_seen": 32642512, + "step": 6100 + }, + { + "epoch": 0.9768, + "grad_norm": 1.0756986141204834, + "learning_rate": 6.637330815949527e-08, + "loss": 0.7558, + "num_input_tokens_seen": 32666064, + "step": 6105 + }, + { + "epoch": 0.9776, + "grad_norm": 0.9403240084648132, + "learning_rate": 6.187661717174386e-08, + "loss": 0.7228, + "num_input_tokens_seen": 32690016, + "step": 6110 + }, + { + "epoch": 0.9784, + "grad_norm": 0.9194949269294739, + "learning_rate": 5.753744892494639e-08, + "loss": 0.7079, + "num_input_tokens_seen": 32716240, + "step": 6115 + }, + { + "epoch": 0.9792, + "grad_norm": 0.9947624206542969, + "learning_rate": 5.335583082764495e-08, + "loss": 0.7692, + "num_input_tokens_seen": 32741648, + "step": 6120 + }, + { + "epoch": 0.98, + "grad_norm": 1.2828369140625, + "learning_rate": 4.9331789293211026e-08, + "loss": 0.6285, + "num_input_tokens_seen": 32770224, + "step": 6125 + }, + { + "epoch": 0.9808, + "grad_norm": 1.0066205263137817, + "learning_rate": 4.546534973968175e-08, + "loss": 0.7464, + "num_input_tokens_seen": 32798864, + "step": 6130 + }, + { + "epoch": 0.9816, + "grad_norm": 1.3146965503692627, + "learning_rate": 4.1756536589585004e-08, + "loss": 0.6632, + "num_input_tokens_seen": 32829136, + "step": 6135 + }, + { + "epoch": 0.9824, + "grad_norm": 0.8514100909233093, + "learning_rate": 3.820537326980622e-08, + "loss": 0.7378, + "num_input_tokens_seen": 32858976, + "step": 6140 + }, + { + "epoch": 0.9832, + "grad_norm": 0.9065835475921631, + "learning_rate": 3.481188221142184e-08, + "loss": 0.7125, + "num_input_tokens_seen": 32886208, + "step": 6145 + }, + { + "epoch": 0.984, + "grad_norm": 1.2251099348068237, + "learning_rate": 3.157608484956332e-08, + "loss": 0.7723, + "num_input_tokens_seen": 32912960, + "step": 6150 + }, + { + "epoch": 0.9848, + "grad_norm": 1.0463021993637085, + "learning_rate": 2.8498001623286642e-08, + "loss": 0.6472, + "num_input_tokens_seen": 32941072, + "step": 6155 + }, + { + "epoch": 0.9856, + "grad_norm": 1.001555323600769, + "learning_rate": 2.557765197543638e-08, + "loss": 0.7462, + "num_input_tokens_seen": 32968304, + "step": 6160 + }, + { + "epoch": 0.9864, + "grad_norm": 1.0437195301055908, + "learning_rate": 2.281505435253184e-08, + "loss": 0.7079, + "num_input_tokens_seen": 32993008, + "step": 6165 + }, + { + "epoch": 0.9872, + "grad_norm": 1.1048009395599365, + "learning_rate": 2.0210226204639414e-08, + "loss": 0.7058, + "num_input_tokens_seen": 33018128, + "step": 6170 + }, + { + "epoch": 0.988, + "grad_norm": 1.1593177318572998, + "learning_rate": 1.7763183985269883e-08, + "loss": 0.5942, + "num_input_tokens_seen": 33047968, + "step": 6175 + }, + { + "epoch": 0.9888, + "grad_norm": 1.3036433458328247, + "learning_rate": 1.5473943151270153e-08, + "loss": 0.7738, + "num_input_tokens_seen": 33072560, + "step": 6180 + }, + { + "epoch": 0.9896, + "grad_norm": 1.0337815284729004, + "learning_rate": 1.3342518162728912e-08, + "loss": 0.781, + "num_input_tokens_seen": 33096928, + "step": 6185 + }, + { + "epoch": 0.9904, + "grad_norm": 1.0598255395889282, + "learning_rate": 1.136892248288779e-08, + "loss": 0.6607, + "num_input_tokens_seen": 33123808, + "step": 6190 + }, + { + "epoch": 0.9912, + "grad_norm": 1.658722162246704, + "learning_rate": 9.553168578049775e-09, + "loss": 0.7506, + "num_input_tokens_seen": 33148688, + "step": 6195 + }, + { + "epoch": 0.992, + "grad_norm": 0.9460881352424622, + "learning_rate": 7.895267917501504e-09, + "loss": 0.6521, + "num_input_tokens_seen": 33176624, + "step": 6200 + }, + { + "epoch": 0.9928, + "grad_norm": 0.9642547369003296, + "learning_rate": 6.395230973443856e-09, + "loss": 0.709, + "num_input_tokens_seen": 33198160, + "step": 6205 + }, + { + "epoch": 0.9936, + "grad_norm": 1.1588774919509888, + "learning_rate": 5.053067220925356e-09, + "loss": 0.6685, + "num_input_tokens_seen": 33226336, + "step": 6210 + }, + { + "epoch": 0.9944, + "grad_norm": 0.7818155288696289, + "learning_rate": 3.868785137786657e-09, + "loss": 0.6672, + "num_input_tokens_seen": 33245824, + "step": 6215 + }, + { + "epoch": 0.9952, + "grad_norm": 0.8517420887947083, + "learning_rate": 2.842392204591149e-09, + "loss": 0.7053, + "num_input_tokens_seen": 33274176, + "step": 6220 + }, + { + "epoch": 0.996, + "grad_norm": 1.1486226320266724, + "learning_rate": 1.973894904597207e-09, + "loss": 0.7184, + "num_input_tokens_seen": 33302528, + "step": 6225 + }, + { + "epoch": 0.9968, + "grad_norm": 0.9121309518814087, + "learning_rate": 1.2632987237054528e-09, + "loss": 0.7092, + "num_input_tokens_seen": 33330384, + "step": 6230 + }, + { + "epoch": 0.9976, + "grad_norm": 1.1165632009506226, + "learning_rate": 7.106081504254514e-10, + "loss": 0.6142, + "num_input_tokens_seen": 33357968, + "step": 6235 + }, + { + "epoch": 0.9984, + "grad_norm": 1.0336644649505615, + "learning_rate": 3.158266758562789e-10, + "loss": 0.7147, + "num_input_tokens_seen": 33381536, + "step": 6240 + }, + { + "epoch": 0.9992, + "grad_norm": 1.1494331359863281, + "learning_rate": 7.89567936476665e-11, + "loss": 0.7005, + "num_input_tokens_seen": 33409984, + "step": 6245 + }, + { + "epoch": 1.0, + "grad_norm": 1.0414918661117554, + "learning_rate": 0.0, + "loss": 0.7516, + "num_input_tokens_seen": 33437856, + "step": 6250 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 33437856, + "step": 6250, + "total_flos": 7.1914395644928e+16, + "train_loss": 0.7151971128082275, + "train_runtime": 36754.4929, + "train_samples_per_second": 2.721, + "train_steps_per_second": 0.17 + } + ], + "logging_steps": 5, + "max_steps": 6250, + "num_input_tokens_seen": 33437856, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.1914395644928e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}