diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.13995801259622112, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013995801259622114, + "grad_norm": 1.6205339948497404, + "learning_rate": 0.0, + "loss": 1.8649, + "step": 1 + }, + { + "epoch": 0.00027991602519244227, + "grad_norm": 1.8823543727463894, + "learning_rate": 2.3255813953488374e-07, + "loss": 1.8156, + "step": 2 + }, + { + "epoch": 0.0004198740377886634, + "grad_norm": 1.5734521895417912, + "learning_rate": 4.651162790697675e-07, + "loss": 1.8395, + "step": 3 + }, + { + "epoch": 0.0005598320503848845, + "grad_norm": 1.884881935408653, + "learning_rate": 6.976744186046511e-07, + "loss": 1.8379, + "step": 4 + }, + { + "epoch": 0.0006997900629811056, + "grad_norm": 1.570942303353113, + "learning_rate": 9.30232558139535e-07, + "loss": 1.7683, + "step": 5 + }, + { + "epoch": 0.0008397480755773268, + "grad_norm": 2.1232694447925367, + "learning_rate": 1.1627906976744186e-06, + "loss": 1.834, + "step": 6 + }, + { + "epoch": 0.000979706088173548, + "grad_norm": 1.5460826364849427, + "learning_rate": 1.3953488372093023e-06, + "loss": 1.8431, + "step": 7 + }, + { + "epoch": 0.001119664100769769, + "grad_norm": 1.5383810682727819, + "learning_rate": 1.627906976744186e-06, + "loss": 1.7739, + "step": 8 + }, + { + "epoch": 0.0012596221133659902, + "grad_norm": 2.321401184160743, + "learning_rate": 1.86046511627907e-06, + "loss": 1.7974, + "step": 9 + }, + { + "epoch": 0.0013995801259622112, + "grad_norm": 1.5959823871905185, + "learning_rate": 2.0930232558139536e-06, + "loss": 1.802, + "step": 10 + }, + { + "epoch": 0.0015395381385584325, + "grad_norm": 1.4312201376848954, + "learning_rate": 2.325581395348837e-06, + "loss": 1.8292, + "step": 11 + }, + { + "epoch": 0.0016794961511546536, + "grad_norm": 1.3848325251343343, + "learning_rate": 2.558139534883721e-06, + "loss": 1.7843, + "step": 12 + }, + { + "epoch": 0.0018194541637508747, + "grad_norm": 1.5003345129280181, + "learning_rate": 2.7906976744186046e-06, + "loss": 1.8619, + "step": 13 + }, + { + "epoch": 0.001959412176347096, + "grad_norm": 1.5397334321812373, + "learning_rate": 3.0232558139534885e-06, + "loss": 1.8203, + "step": 14 + }, + { + "epoch": 0.002099370188943317, + "grad_norm": 1.5024026903634549, + "learning_rate": 3.255813953488372e-06, + "loss": 1.7945, + "step": 15 + }, + { + "epoch": 0.002239328201539538, + "grad_norm": 1.4537582898628276, + "learning_rate": 3.488372093023256e-06, + "loss": 1.7954, + "step": 16 + }, + { + "epoch": 0.0023792862141357595, + "grad_norm": 1.4392959782369958, + "learning_rate": 3.72093023255814e-06, + "loss": 1.7539, + "step": 17 + }, + { + "epoch": 0.0025192442267319803, + "grad_norm": 1.4318028272838452, + "learning_rate": 3.953488372093024e-06, + "loss": 1.7287, + "step": 18 + }, + { + "epoch": 0.0026592022393282016, + "grad_norm": 1.5567916289389137, + "learning_rate": 4.186046511627907e-06, + "loss": 1.7651, + "step": 19 + }, + { + "epoch": 0.0027991602519244225, + "grad_norm": 2.133786571708112, + "learning_rate": 4.418604651162791e-06, + "loss": 1.7431, + "step": 20 + }, + { + "epoch": 0.0029391182645206438, + "grad_norm": 1.3887907218945514, + "learning_rate": 4.651162790697674e-06, + "loss": 1.769, + "step": 21 + }, + { + "epoch": 0.003079076277116865, + "grad_norm": 1.4440046752202067, + "learning_rate": 4.883720930232559e-06, + "loss": 1.6852, + "step": 22 + }, + { + "epoch": 0.003219034289713086, + "grad_norm": 1.7560763608783, + "learning_rate": 5.116279069767442e-06, + "loss": 1.7515, + "step": 23 + }, + { + "epoch": 0.0033589923023093072, + "grad_norm": 1.4866717560719882, + "learning_rate": 5.348837209302326e-06, + "loss": 1.7103, + "step": 24 + }, + { + "epoch": 0.0034989503149055285, + "grad_norm": 1.3191066550690622, + "learning_rate": 5.581395348837209e-06, + "loss": 1.6115, + "step": 25 + }, + { + "epoch": 0.0036389083275017494, + "grad_norm": 3.5279348783477205, + "learning_rate": 5.8139534883720935e-06, + "loss": 1.639, + "step": 26 + }, + { + "epoch": 0.0037788663400979707, + "grad_norm": 1.4814448441008254, + "learning_rate": 6.046511627906977e-06, + "loss": 1.6249, + "step": 27 + }, + { + "epoch": 0.003918824352694192, + "grad_norm": 1.2456228376106218, + "learning_rate": 6.279069767441861e-06, + "loss": 1.5937, + "step": 28 + }, + { + "epoch": 0.004058782365290413, + "grad_norm": 1.2032083110394567, + "learning_rate": 6.511627906976744e-06, + "loss": 1.57, + "step": 29 + }, + { + "epoch": 0.004198740377886634, + "grad_norm": 1.1873685890041599, + "learning_rate": 6.744186046511629e-06, + "loss": 1.5443, + "step": 30 + }, + { + "epoch": 0.0043386983904828555, + "grad_norm": 1.1206050573785102, + "learning_rate": 6.976744186046512e-06, + "loss": 1.4658, + "step": 31 + }, + { + "epoch": 0.004478656403079076, + "grad_norm": 1.613173262015054, + "learning_rate": 7.209302325581396e-06, + "loss": 1.5061, + "step": 32 + }, + { + "epoch": 0.004618614415675297, + "grad_norm": 1.0997443478827273, + "learning_rate": 7.44186046511628e-06, + "loss": 1.4946, + "step": 33 + }, + { + "epoch": 0.004758572428271519, + "grad_norm": 1.0976998793927069, + "learning_rate": 7.674418604651164e-06, + "loss": 1.4857, + "step": 34 + }, + { + "epoch": 0.00489853044086774, + "grad_norm": 0.9524795998625615, + "learning_rate": 7.906976744186048e-06, + "loss": 1.4493, + "step": 35 + }, + { + "epoch": 0.005038488453463961, + "grad_norm": 0.9642671767042398, + "learning_rate": 8.139534883720931e-06, + "loss": 1.4039, + "step": 36 + }, + { + "epoch": 0.005178446466060182, + "grad_norm": 0.8572170901816704, + "learning_rate": 8.372093023255815e-06, + "loss": 1.3529, + "step": 37 + }, + { + "epoch": 0.005318404478656403, + "grad_norm": 0.8008007221794986, + "learning_rate": 8.604651162790698e-06, + "loss": 1.3514, + "step": 38 + }, + { + "epoch": 0.005458362491252624, + "grad_norm": 1.484907272279964, + "learning_rate": 8.837209302325582e-06, + "loss": 1.3637, + "step": 39 + }, + { + "epoch": 0.005598320503848845, + "grad_norm": 1.3633541730503271, + "learning_rate": 9.069767441860467e-06, + "loss": 1.3263, + "step": 40 + }, + { + "epoch": 0.005738278516445067, + "grad_norm": 0.6081019616299234, + "learning_rate": 9.302325581395349e-06, + "loss": 1.333, + "step": 41 + }, + { + "epoch": 0.0058782365290412876, + "grad_norm": 0.6853488905437116, + "learning_rate": 9.534883720930234e-06, + "loss": 1.2903, + "step": 42 + }, + { + "epoch": 0.006018194541637508, + "grad_norm": 0.8466578955626367, + "learning_rate": 9.767441860465117e-06, + "loss": 1.3079, + "step": 43 + }, + { + "epoch": 0.00615815255423373, + "grad_norm": 0.567927473680037, + "learning_rate": 1e-05, + "loss": 1.2735, + "step": 44 + }, + { + "epoch": 0.006298110566829951, + "grad_norm": 0.6150202452547671, + "learning_rate": 1.0232558139534884e-05, + "loss": 1.2115, + "step": 45 + }, + { + "epoch": 0.006438068579426172, + "grad_norm": 0.4989582290513981, + "learning_rate": 1.0465116279069768e-05, + "loss": 1.2315, + "step": 46 + }, + { + "epoch": 0.006578026592022394, + "grad_norm": 0.671517817392481, + "learning_rate": 1.0697674418604651e-05, + "loss": 1.211, + "step": 47 + }, + { + "epoch": 0.0067179846046186145, + "grad_norm": 0.4743826379981072, + "learning_rate": 1.0930232558139537e-05, + "loss": 1.2323, + "step": 48 + }, + { + "epoch": 0.006857942617214835, + "grad_norm": 0.6130428027193264, + "learning_rate": 1.1162790697674418e-05, + "loss": 1.1871, + "step": 49 + }, + { + "epoch": 0.006997900629811057, + "grad_norm": 0.5556426649020297, + "learning_rate": 1.1395348837209304e-05, + "loss": 1.1869, + "step": 50 + }, + { + "epoch": 0.007137858642407278, + "grad_norm": 1.896863162731097, + "learning_rate": 1.1627906976744187e-05, + "loss": 1.1683, + "step": 51 + }, + { + "epoch": 0.007277816655003499, + "grad_norm": 0.8412820077594921, + "learning_rate": 1.186046511627907e-05, + "loss": 1.1453, + "step": 52 + }, + { + "epoch": 0.00741777466759972, + "grad_norm": 0.5453872617764336, + "learning_rate": 1.2093023255813954e-05, + "loss": 1.1449, + "step": 53 + }, + { + "epoch": 0.007557732680195941, + "grad_norm": 0.5447374444716608, + "learning_rate": 1.2325581395348838e-05, + "loss": 1.0974, + "step": 54 + }, + { + "epoch": 0.007697690692792162, + "grad_norm": 0.4102706328599184, + "learning_rate": 1.2558139534883723e-05, + "loss": 1.1077, + "step": 55 + }, + { + "epoch": 0.007837648705388384, + "grad_norm": 3.5443873887390946, + "learning_rate": 1.2790697674418606e-05, + "loss": 1.1119, + "step": 56 + }, + { + "epoch": 0.007977606717984604, + "grad_norm": 0.46404403603707844, + "learning_rate": 1.3023255813953488e-05, + "loss": 1.074, + "step": 57 + }, + { + "epoch": 0.008117564730580826, + "grad_norm": 0.39045250619813876, + "learning_rate": 1.3255813953488372e-05, + "loss": 1.1102, + "step": 58 + }, + { + "epoch": 0.008257522743177047, + "grad_norm": 1.343556937733491, + "learning_rate": 1.3488372093023258e-05, + "loss": 1.1127, + "step": 59 + }, + { + "epoch": 0.008397480755773267, + "grad_norm": 0.7610889883379984, + "learning_rate": 1.372093023255814e-05, + "loss": 1.0093, + "step": 60 + }, + { + "epoch": 0.00853743876836949, + "grad_norm": 0.4556554186064198, + "learning_rate": 1.3953488372093024e-05, + "loss": 1.0192, + "step": 61 + }, + { + "epoch": 0.008677396780965711, + "grad_norm": 0.501444946545551, + "learning_rate": 1.4186046511627907e-05, + "loss": 1.0403, + "step": 62 + }, + { + "epoch": 0.008817354793561931, + "grad_norm": 0.35988075895590804, + "learning_rate": 1.4418604651162792e-05, + "loss": 1.1192, + "step": 63 + }, + { + "epoch": 0.008957312806158153, + "grad_norm": 0.6921457564720671, + "learning_rate": 1.4651162790697676e-05, + "loss": 1.056, + "step": 64 + }, + { + "epoch": 0.009097270818754374, + "grad_norm": 0.37626208734508315, + "learning_rate": 1.488372093023256e-05, + "loss": 1.0427, + "step": 65 + }, + { + "epoch": 0.009237228831350594, + "grad_norm": 0.4227916388896408, + "learning_rate": 1.5116279069767441e-05, + "loss": 1.0649, + "step": 66 + }, + { + "epoch": 0.009377186843946816, + "grad_norm": 0.43865399337074507, + "learning_rate": 1.5348837209302328e-05, + "loss": 1.0342, + "step": 67 + }, + { + "epoch": 0.009517144856543038, + "grad_norm": 1.1110606201155102, + "learning_rate": 1.558139534883721e-05, + "loss": 1.079, + "step": 68 + }, + { + "epoch": 0.009657102869139258, + "grad_norm": 0.3429950483555357, + "learning_rate": 1.5813953488372095e-05, + "loss": 1.0354, + "step": 69 + }, + { + "epoch": 0.00979706088173548, + "grad_norm": 0.500911837092983, + "learning_rate": 1.6046511627906977e-05, + "loss": 1.0117, + "step": 70 + }, + { + "epoch": 0.009937018894331701, + "grad_norm": 0.5054192697417574, + "learning_rate": 1.6279069767441862e-05, + "loss": 1.0435, + "step": 71 + }, + { + "epoch": 0.010076976906927921, + "grad_norm": 0.4024563344236819, + "learning_rate": 1.6511627906976744e-05, + "loss": 1.045, + "step": 72 + }, + { + "epoch": 0.010216934919524143, + "grad_norm": 0.4790998578083277, + "learning_rate": 1.674418604651163e-05, + "loss": 1.0253, + "step": 73 + }, + { + "epoch": 0.010356892932120365, + "grad_norm": 0.45731467968480566, + "learning_rate": 1.697674418604651e-05, + "loss": 1.0088, + "step": 74 + }, + { + "epoch": 0.010496850944716585, + "grad_norm": 0.4617808389918637, + "learning_rate": 1.7209302325581396e-05, + "loss": 0.9946, + "step": 75 + }, + { + "epoch": 0.010636808957312806, + "grad_norm": 0.4274246905900338, + "learning_rate": 1.744186046511628e-05, + "loss": 1.0078, + "step": 76 + }, + { + "epoch": 0.010776766969909026, + "grad_norm": 0.4859617233595146, + "learning_rate": 1.7674418604651163e-05, + "loss": 1.0292, + "step": 77 + }, + { + "epoch": 0.010916724982505248, + "grad_norm": 0.5837878294485921, + "learning_rate": 1.7906976744186045e-05, + "loss": 1.0338, + "step": 78 + }, + { + "epoch": 0.01105668299510147, + "grad_norm": 1.7173283109289441, + "learning_rate": 1.8139534883720934e-05, + "loss": 1.0126, + "step": 79 + }, + { + "epoch": 0.01119664100769769, + "grad_norm": 0.5256113477743695, + "learning_rate": 1.8372093023255815e-05, + "loss": 0.9907, + "step": 80 + }, + { + "epoch": 0.011336599020293912, + "grad_norm": 0.4483569419385721, + "learning_rate": 1.8604651162790697e-05, + "loss": 0.9643, + "step": 81 + }, + { + "epoch": 0.011476557032890133, + "grad_norm": 0.4539441530526643, + "learning_rate": 1.8837209302325582e-05, + "loss": 1.0462, + "step": 82 + }, + { + "epoch": 0.011616515045486353, + "grad_norm": 0.33426240052710565, + "learning_rate": 1.9069767441860468e-05, + "loss": 0.954, + "step": 83 + }, + { + "epoch": 0.011756473058082575, + "grad_norm": 0.3396314689877589, + "learning_rate": 1.930232558139535e-05, + "loss": 0.9857, + "step": 84 + }, + { + "epoch": 0.011896431070678797, + "grad_norm": 0.395155279481088, + "learning_rate": 1.9534883720930235e-05, + "loss": 1.004, + "step": 85 + }, + { + "epoch": 0.012036389083275017, + "grad_norm": 0.38928902258671755, + "learning_rate": 1.9767441860465116e-05, + "loss": 0.9253, + "step": 86 + }, + { + "epoch": 0.012176347095871239, + "grad_norm": 0.3964184151155477, + "learning_rate": 2e-05, + "loss": 0.9562, + "step": 87 + }, + { + "epoch": 0.01231630510846746, + "grad_norm": 0.41507841516147836, + "learning_rate": 2.0232558139534883e-05, + "loss": 0.9857, + "step": 88 + }, + { + "epoch": 0.01245626312106368, + "grad_norm": 0.3303740429955767, + "learning_rate": 2.046511627906977e-05, + "loss": 0.96, + "step": 89 + }, + { + "epoch": 0.012596221133659902, + "grad_norm": 0.35729090571512173, + "learning_rate": 2.069767441860465e-05, + "loss": 0.9453, + "step": 90 + }, + { + "epoch": 0.012736179146256124, + "grad_norm": 0.34652306094213037, + "learning_rate": 2.0930232558139536e-05, + "loss": 0.9478, + "step": 91 + }, + { + "epoch": 0.012876137158852344, + "grad_norm": 0.3781323029481633, + "learning_rate": 2.116279069767442e-05, + "loss": 0.9232, + "step": 92 + }, + { + "epoch": 0.013016095171448566, + "grad_norm": 0.3615380548631367, + "learning_rate": 2.1395348837209303e-05, + "loss": 0.903, + "step": 93 + }, + { + "epoch": 0.013156053184044787, + "grad_norm": 0.4096150963666029, + "learning_rate": 2.1627906976744184e-05, + "loss": 0.9599, + "step": 94 + }, + { + "epoch": 0.013296011196641007, + "grad_norm": 0.38313805223400516, + "learning_rate": 2.1860465116279073e-05, + "loss": 0.9512, + "step": 95 + }, + { + "epoch": 0.013435969209237229, + "grad_norm": 0.4249487976331709, + "learning_rate": 2.2093023255813955e-05, + "loss": 0.888, + "step": 96 + }, + { + "epoch": 0.01357592722183345, + "grad_norm": 0.3958713012909362, + "learning_rate": 2.2325581395348837e-05, + "loss": 0.9366, + "step": 97 + }, + { + "epoch": 0.01371588523442967, + "grad_norm": 0.3666415905158996, + "learning_rate": 2.2558139534883722e-05, + "loss": 0.9161, + "step": 98 + }, + { + "epoch": 0.013855843247025892, + "grad_norm": 0.4821438335731706, + "learning_rate": 2.2790697674418607e-05, + "loss": 0.9352, + "step": 99 + }, + { + "epoch": 0.013995801259622114, + "grad_norm": 0.3878615433598532, + "learning_rate": 2.302325581395349e-05, + "loss": 0.9557, + "step": 100 + }, + { + "epoch": 0.014135759272218334, + "grad_norm": 5.323533163270388, + "learning_rate": 2.3255813953488374e-05, + "loss": 0.9419, + "step": 101 + }, + { + "epoch": 0.014275717284814556, + "grad_norm": 0.6042685685685957, + "learning_rate": 2.3488372093023256e-05, + "loss": 0.9249, + "step": 102 + }, + { + "epoch": 0.014415675297410778, + "grad_norm": 0.4137056327381694, + "learning_rate": 2.372093023255814e-05, + "loss": 0.9179, + "step": 103 + }, + { + "epoch": 0.014555633310006998, + "grad_norm": 0.3772048144992787, + "learning_rate": 2.3953488372093026e-05, + "loss": 0.9144, + "step": 104 + }, + { + "epoch": 0.01469559132260322, + "grad_norm": 0.4175730821083398, + "learning_rate": 2.4186046511627908e-05, + "loss": 0.9227, + "step": 105 + }, + { + "epoch": 0.01483554933519944, + "grad_norm": 0.40061188528050645, + "learning_rate": 2.441860465116279e-05, + "loss": 0.9278, + "step": 106 + }, + { + "epoch": 0.014975507347795661, + "grad_norm": 0.3978558565443655, + "learning_rate": 2.4651162790697675e-05, + "loss": 0.8924, + "step": 107 + }, + { + "epoch": 0.015115465360391883, + "grad_norm": 0.8747382632018756, + "learning_rate": 2.488372093023256e-05, + "loss": 0.9015, + "step": 108 + }, + { + "epoch": 0.015255423372988103, + "grad_norm": 0.39737512145413006, + "learning_rate": 2.5116279069767445e-05, + "loss": 0.9201, + "step": 109 + }, + { + "epoch": 0.015395381385584325, + "grad_norm": 0.41650482599802, + "learning_rate": 2.5348837209302327e-05, + "loss": 0.8626, + "step": 110 + }, + { + "epoch": 0.015535339398180546, + "grad_norm": 0.42011360836146605, + "learning_rate": 2.5581395348837212e-05, + "loss": 0.8862, + "step": 111 + }, + { + "epoch": 0.015675297410776768, + "grad_norm": 0.4680549878413964, + "learning_rate": 2.5813953488372094e-05, + "loss": 0.9105, + "step": 112 + }, + { + "epoch": 0.01581525542337299, + "grad_norm": 0.49968103218957055, + "learning_rate": 2.6046511627906976e-05, + "loss": 0.8913, + "step": 113 + }, + { + "epoch": 0.015955213435969208, + "grad_norm": 0.49556276645971464, + "learning_rate": 2.627906976744186e-05, + "loss": 0.9266, + "step": 114 + }, + { + "epoch": 0.01609517144856543, + "grad_norm": 0.40655693565880857, + "learning_rate": 2.6511627906976743e-05, + "loss": 0.8941, + "step": 115 + }, + { + "epoch": 0.01623512946116165, + "grad_norm": 0.39000307235707055, + "learning_rate": 2.674418604651163e-05, + "loss": 0.8655, + "step": 116 + }, + { + "epoch": 0.016375087473757873, + "grad_norm": 0.4264245544215563, + "learning_rate": 2.6976744186046517e-05, + "loss": 0.837, + "step": 117 + }, + { + "epoch": 0.016515045486354095, + "grad_norm": 0.42674207968546857, + "learning_rate": 2.72093023255814e-05, + "loss": 0.8974, + "step": 118 + }, + { + "epoch": 0.016655003498950313, + "grad_norm": 0.46006604179365407, + "learning_rate": 2.744186046511628e-05, + "loss": 0.9413, + "step": 119 + }, + { + "epoch": 0.016794961511546535, + "grad_norm": 0.41235716420572693, + "learning_rate": 2.7674418604651166e-05, + "loss": 0.8682, + "step": 120 + }, + { + "epoch": 0.016934919524142757, + "grad_norm": 0.41352483122836786, + "learning_rate": 2.7906976744186048e-05, + "loss": 0.8926, + "step": 121 + }, + { + "epoch": 0.01707487753673898, + "grad_norm": 0.4135656272062978, + "learning_rate": 2.813953488372093e-05, + "loss": 0.8664, + "step": 122 + }, + { + "epoch": 0.0172148355493352, + "grad_norm": 0.40087350807045763, + "learning_rate": 2.8372093023255815e-05, + "loss": 0.8399, + "step": 123 + }, + { + "epoch": 0.017354793561931422, + "grad_norm": 0.4308300834648659, + "learning_rate": 2.8604651162790696e-05, + "loss": 0.9084, + "step": 124 + }, + { + "epoch": 0.01749475157452764, + "grad_norm": 0.48776022790516643, + "learning_rate": 2.8837209302325585e-05, + "loss": 0.911, + "step": 125 + }, + { + "epoch": 0.017634709587123862, + "grad_norm": 0.4111163096942089, + "learning_rate": 2.9069767441860467e-05, + "loss": 0.8848, + "step": 126 + }, + { + "epoch": 0.017774667599720084, + "grad_norm": 0.4381509329328657, + "learning_rate": 2.9302325581395352e-05, + "loss": 0.8876, + "step": 127 + }, + { + "epoch": 0.017914625612316305, + "grad_norm": 0.4459710226095744, + "learning_rate": 2.9534883720930234e-05, + "loss": 0.914, + "step": 128 + }, + { + "epoch": 0.018054583624912527, + "grad_norm": 0.7227835408304066, + "learning_rate": 2.976744186046512e-05, + "loss": 0.8587, + "step": 129 + }, + { + "epoch": 0.01819454163750875, + "grad_norm": 0.43926461194413513, + "learning_rate": 3e-05, + "loss": 0.8779, + "step": 130 + }, + { + "epoch": 0.018334499650104967, + "grad_norm": 0.44123229572799033, + "learning_rate": 3.0232558139534883e-05, + "loss": 0.8371, + "step": 131 + }, + { + "epoch": 0.01847445766270119, + "grad_norm": 0.5044627843039927, + "learning_rate": 3.0465116279069768e-05, + "loss": 0.8745, + "step": 132 + }, + { + "epoch": 0.01861441567529741, + "grad_norm": 0.4802286671204574, + "learning_rate": 3.0697674418604656e-05, + "loss": 0.8163, + "step": 133 + }, + { + "epoch": 0.018754373687893632, + "grad_norm": 0.4753543186959038, + "learning_rate": 3.093023255813954e-05, + "loss": 0.8499, + "step": 134 + }, + { + "epoch": 0.018894331700489854, + "grad_norm": 0.4957303847669158, + "learning_rate": 3.116279069767442e-05, + "loss": 0.8485, + "step": 135 + }, + { + "epoch": 0.019034289713086076, + "grad_norm": 0.4550945161127889, + "learning_rate": 3.13953488372093e-05, + "loss": 0.7769, + "step": 136 + }, + { + "epoch": 0.019174247725682294, + "grad_norm": 0.6087163747209936, + "learning_rate": 3.162790697674419e-05, + "loss": 0.8363, + "step": 137 + }, + { + "epoch": 0.019314205738278516, + "grad_norm": 0.500988900697958, + "learning_rate": 3.186046511627907e-05, + "loss": 0.8602, + "step": 138 + }, + { + "epoch": 0.019454163750874737, + "grad_norm": 0.5444272083147511, + "learning_rate": 3.2093023255813954e-05, + "loss": 0.8407, + "step": 139 + }, + { + "epoch": 0.01959412176347096, + "grad_norm": 0.5954883412818768, + "learning_rate": 3.2325581395348836e-05, + "loss": 0.8497, + "step": 140 + }, + { + "epoch": 0.01973407977606718, + "grad_norm": 0.5375949124547408, + "learning_rate": 3.2558139534883724e-05, + "loss": 0.8361, + "step": 141 + }, + { + "epoch": 0.019874037788663403, + "grad_norm": 0.5552189636185006, + "learning_rate": 3.2790697674418606e-05, + "loss": 0.8342, + "step": 142 + }, + { + "epoch": 0.02001399580125962, + "grad_norm": 0.5148019418462507, + "learning_rate": 3.302325581395349e-05, + "loss": 0.8261, + "step": 143 + }, + { + "epoch": 0.020153953813855843, + "grad_norm": 0.5700734210055411, + "learning_rate": 3.3255813953488377e-05, + "loss": 0.8323, + "step": 144 + }, + { + "epoch": 0.020293911826452064, + "grad_norm": 0.48554968620911604, + "learning_rate": 3.348837209302326e-05, + "loss": 0.8401, + "step": 145 + }, + { + "epoch": 0.020433869839048286, + "grad_norm": 0.46690708567053424, + "learning_rate": 3.372093023255814e-05, + "loss": 0.832, + "step": 146 + }, + { + "epoch": 0.020573827851644508, + "grad_norm": 0.4830729701490898, + "learning_rate": 3.395348837209302e-05, + "loss": 0.8425, + "step": 147 + }, + { + "epoch": 0.02071378586424073, + "grad_norm": 0.48191860044948637, + "learning_rate": 3.4186046511627904e-05, + "loss": 0.8274, + "step": 148 + }, + { + "epoch": 0.020853743876836948, + "grad_norm": 0.5142113223620266, + "learning_rate": 3.441860465116279e-05, + "loss": 0.8972, + "step": 149 + }, + { + "epoch": 0.02099370188943317, + "grad_norm": 0.4790668173486759, + "learning_rate": 3.465116279069768e-05, + "loss": 0.8315, + "step": 150 + }, + { + "epoch": 0.02113365990202939, + "grad_norm": 0.46716122441432334, + "learning_rate": 3.488372093023256e-05, + "loss": 0.8027, + "step": 151 + }, + { + "epoch": 0.021273617914625613, + "grad_norm": 0.5138958015599178, + "learning_rate": 3.5116279069767445e-05, + "loss": 0.7866, + "step": 152 + }, + { + "epoch": 0.021413575927221835, + "grad_norm": 0.5846032834190681, + "learning_rate": 3.5348837209302326e-05, + "loss": 0.7656, + "step": 153 + }, + { + "epoch": 0.021553533939818053, + "grad_norm": 0.4931365055890448, + "learning_rate": 3.558139534883721e-05, + "loss": 0.7875, + "step": 154 + }, + { + "epoch": 0.021693491952414275, + "grad_norm": 0.49665243607020065, + "learning_rate": 3.581395348837209e-05, + "loss": 0.8285, + "step": 155 + }, + { + "epoch": 0.021833449965010496, + "grad_norm": 0.4851409756713546, + "learning_rate": 3.604651162790698e-05, + "loss": 0.8395, + "step": 156 + }, + { + "epoch": 0.021973407977606718, + "grad_norm": 0.4915230656047339, + "learning_rate": 3.627906976744187e-05, + "loss": 0.8381, + "step": 157 + }, + { + "epoch": 0.02211336599020294, + "grad_norm": 0.4892154916147055, + "learning_rate": 3.651162790697675e-05, + "loss": 0.8372, + "step": 158 + }, + { + "epoch": 0.02225332400279916, + "grad_norm": 0.48541705485781506, + "learning_rate": 3.674418604651163e-05, + "loss": 0.8512, + "step": 159 + }, + { + "epoch": 0.02239328201539538, + "grad_norm": 0.490142048396627, + "learning_rate": 3.697674418604651e-05, + "loss": 0.8185, + "step": 160 + }, + { + "epoch": 0.0225332400279916, + "grad_norm": 0.5674374097426846, + "learning_rate": 3.7209302325581394e-05, + "loss": 0.8101, + "step": 161 + }, + { + "epoch": 0.022673198040587823, + "grad_norm": 0.617554041890815, + "learning_rate": 3.7441860465116276e-05, + "loss": 0.8273, + "step": 162 + }, + { + "epoch": 0.022813156053184045, + "grad_norm": 0.5446295352802258, + "learning_rate": 3.7674418604651165e-05, + "loss": 0.867, + "step": 163 + }, + { + "epoch": 0.022953114065780267, + "grad_norm": 0.5370506605400496, + "learning_rate": 3.790697674418605e-05, + "loss": 0.7857, + "step": 164 + }, + { + "epoch": 0.02309307207837649, + "grad_norm": 0.5011901313523426, + "learning_rate": 3.8139534883720935e-05, + "loss": 0.7731, + "step": 165 + }, + { + "epoch": 0.023233030090972707, + "grad_norm": 0.5329852231835671, + "learning_rate": 3.837209302325582e-05, + "loss": 0.7844, + "step": 166 + }, + { + "epoch": 0.02337298810356893, + "grad_norm": 0.5409808244274257, + "learning_rate": 3.86046511627907e-05, + "loss": 0.8565, + "step": 167 + }, + { + "epoch": 0.02351294611616515, + "grad_norm": 0.4698618846754603, + "learning_rate": 3.883720930232558e-05, + "loss": 0.8027, + "step": 168 + }, + { + "epoch": 0.023652904128761372, + "grad_norm": 0.5265079885633662, + "learning_rate": 3.906976744186047e-05, + "loss": 0.7934, + "step": 169 + }, + { + "epoch": 0.023792862141357594, + "grad_norm": 0.5066786460553115, + "learning_rate": 3.930232558139535e-05, + "loss": 0.8472, + "step": 170 + }, + { + "epoch": 0.023932820153953815, + "grad_norm": 0.4895093560469156, + "learning_rate": 3.953488372093023e-05, + "loss": 0.8038, + "step": 171 + }, + { + "epoch": 0.024072778166550034, + "grad_norm": 0.4722527785641036, + "learning_rate": 3.9767441860465115e-05, + "loss": 0.7926, + "step": 172 + }, + { + "epoch": 0.024212736179146255, + "grad_norm": 0.5273651829904968, + "learning_rate": 4e-05, + "loss": 0.8107, + "step": 173 + }, + { + "epoch": 0.024352694191742477, + "grad_norm": 0.5381629057966655, + "learning_rate": 4.0232558139534885e-05, + "loss": 0.7478, + "step": 174 + }, + { + "epoch": 0.0244926522043387, + "grad_norm": 0.5109213734104476, + "learning_rate": 4.046511627906977e-05, + "loss": 0.7436, + "step": 175 + }, + { + "epoch": 0.02463261021693492, + "grad_norm": 0.5144280803181732, + "learning_rate": 4.0697674418604655e-05, + "loss": 0.8024, + "step": 176 + }, + { + "epoch": 0.024772568229531142, + "grad_norm": 0.5119285784380162, + "learning_rate": 4.093023255813954e-05, + "loss": 0.8339, + "step": 177 + }, + { + "epoch": 0.02491252624212736, + "grad_norm": 0.5343750927263191, + "learning_rate": 4.116279069767442e-05, + "loss": 0.7744, + "step": 178 + }, + { + "epoch": 0.025052484254723582, + "grad_norm": 0.534909362676503, + "learning_rate": 4.13953488372093e-05, + "loss": 0.7942, + "step": 179 + }, + { + "epoch": 0.025192442267319804, + "grad_norm": 0.5393369737565247, + "learning_rate": 4.162790697674418e-05, + "loss": 0.7677, + "step": 180 + }, + { + "epoch": 0.025332400279916026, + "grad_norm": 0.4985423936875566, + "learning_rate": 4.186046511627907e-05, + "loss": 0.8088, + "step": 181 + }, + { + "epoch": 0.025472358292512248, + "grad_norm": 0.5167443499358275, + "learning_rate": 4.209302325581396e-05, + "loss": 0.7999, + "step": 182 + }, + { + "epoch": 0.025612316305108466, + "grad_norm": 0.5170409826954023, + "learning_rate": 4.232558139534884e-05, + "loss": 0.7601, + "step": 183 + }, + { + "epoch": 0.025752274317704688, + "grad_norm": 0.5219678512296154, + "learning_rate": 4.2558139534883724e-05, + "loss": 0.8032, + "step": 184 + }, + { + "epoch": 0.02589223233030091, + "grad_norm": 0.5652005338899992, + "learning_rate": 4.2790697674418605e-05, + "loss": 0.8096, + "step": 185 + }, + { + "epoch": 0.02603219034289713, + "grad_norm": 0.5199050303840674, + "learning_rate": 4.302325581395349e-05, + "loss": 0.7953, + "step": 186 + }, + { + "epoch": 0.026172148355493353, + "grad_norm": 0.5389317228680723, + "learning_rate": 4.325581395348837e-05, + "loss": 0.7972, + "step": 187 + }, + { + "epoch": 0.026312106368089574, + "grad_norm": 0.5496576941977979, + "learning_rate": 4.348837209302326e-05, + "loss": 0.7948, + "step": 188 + }, + { + "epoch": 0.026452064380685793, + "grad_norm": 0.4991575270339626, + "learning_rate": 4.3720930232558146e-05, + "loss": 0.7805, + "step": 189 + }, + { + "epoch": 0.026592022393282014, + "grad_norm": 0.5848638237399637, + "learning_rate": 4.395348837209303e-05, + "loss": 0.8075, + "step": 190 + }, + { + "epoch": 0.026731980405878236, + "grad_norm": 0.5521242621533461, + "learning_rate": 4.418604651162791e-05, + "loss": 0.8142, + "step": 191 + }, + { + "epoch": 0.026871938418474458, + "grad_norm": 0.5435127966309655, + "learning_rate": 4.441860465116279e-05, + "loss": 0.7825, + "step": 192 + }, + { + "epoch": 0.02701189643107068, + "grad_norm": 0.5470711522807221, + "learning_rate": 4.465116279069767e-05, + "loss": 0.7823, + "step": 193 + }, + { + "epoch": 0.0271518544436669, + "grad_norm": 0.5527490475528084, + "learning_rate": 4.488372093023256e-05, + "loss": 0.7346, + "step": 194 + }, + { + "epoch": 0.02729181245626312, + "grad_norm": 0.5440674129311732, + "learning_rate": 4.5116279069767444e-05, + "loss": 0.7794, + "step": 195 + }, + { + "epoch": 0.02743177046885934, + "grad_norm": 0.5470033960615739, + "learning_rate": 4.5348837209302326e-05, + "loss": 0.7778, + "step": 196 + }, + { + "epoch": 0.027571728481455563, + "grad_norm": 0.5297715369487911, + "learning_rate": 4.5581395348837214e-05, + "loss": 0.7817, + "step": 197 + }, + { + "epoch": 0.027711686494051785, + "grad_norm": 0.5904204687588617, + "learning_rate": 4.5813953488372096e-05, + "loss": 0.741, + "step": 198 + }, + { + "epoch": 0.027851644506648007, + "grad_norm": 0.5344813639779119, + "learning_rate": 4.604651162790698e-05, + "loss": 0.7695, + "step": 199 + }, + { + "epoch": 0.02799160251924423, + "grad_norm": 0.5332713262025963, + "learning_rate": 4.627906976744186e-05, + "loss": 0.7648, + "step": 200 + }, + { + "epoch": 0.028131560531840447, + "grad_norm": 0.5503858696857928, + "learning_rate": 4.651162790697675e-05, + "loss": 0.7835, + "step": 201 + }, + { + "epoch": 0.02827151854443667, + "grad_norm": 0.5358241040816709, + "learning_rate": 4.674418604651163e-05, + "loss": 0.8045, + "step": 202 + }, + { + "epoch": 0.02841147655703289, + "grad_norm": 0.5486918473034686, + "learning_rate": 4.697674418604651e-05, + "loss": 0.7938, + "step": 203 + }, + { + "epoch": 0.028551434569629112, + "grad_norm": 0.5435329407519798, + "learning_rate": 4.7209302325581394e-05, + "loss": 0.8122, + "step": 204 + }, + { + "epoch": 0.028691392582225334, + "grad_norm": 0.5353685402018145, + "learning_rate": 4.744186046511628e-05, + "loss": 0.7831, + "step": 205 + }, + { + "epoch": 0.028831350594821555, + "grad_norm": 0.5707995107234539, + "learning_rate": 4.7674418604651164e-05, + "loss": 0.7409, + "step": 206 + }, + { + "epoch": 0.028971308607417774, + "grad_norm": 0.5604394624011331, + "learning_rate": 4.790697674418605e-05, + "loss": 0.7517, + "step": 207 + }, + { + "epoch": 0.029111266620013995, + "grad_norm": 0.5515428854045646, + "learning_rate": 4.8139534883720934e-05, + "loss": 0.7812, + "step": 208 + }, + { + "epoch": 0.029251224632610217, + "grad_norm": 0.4865993670607851, + "learning_rate": 4.8372093023255816e-05, + "loss": 0.7767, + "step": 209 + }, + { + "epoch": 0.02939118264520644, + "grad_norm": 0.5336012601695919, + "learning_rate": 4.86046511627907e-05, + "loss": 0.7463, + "step": 210 + }, + { + "epoch": 0.02953114065780266, + "grad_norm": 0.5759753656135316, + "learning_rate": 4.883720930232558e-05, + "loss": 0.7899, + "step": 211 + }, + { + "epoch": 0.02967109867039888, + "grad_norm": 0.6037700025251694, + "learning_rate": 4.906976744186046e-05, + "loss": 0.7493, + "step": 212 + }, + { + "epoch": 0.0298110566829951, + "grad_norm": 0.5451464405404893, + "learning_rate": 4.930232558139535e-05, + "loss": 0.7501, + "step": 213 + }, + { + "epoch": 0.029951014695591322, + "grad_norm": 0.5084395290000155, + "learning_rate": 4.953488372093024e-05, + "loss": 0.8, + "step": 214 + }, + { + "epoch": 0.030090972708187544, + "grad_norm": 0.5375968031425802, + "learning_rate": 4.976744186046512e-05, + "loss": 0.7644, + "step": 215 + }, + { + "epoch": 0.030230930720783766, + "grad_norm": 0.5232124591965054, + "learning_rate": 5e-05, + "loss": 0.7661, + "step": 216 + }, + { + "epoch": 0.030370888733379987, + "grad_norm": 0.53168165514151, + "learning_rate": 4.999999743112317e-05, + "loss": 0.7469, + "step": 217 + }, + { + "epoch": 0.030510846745976206, + "grad_norm": 0.5682427706170695, + "learning_rate": 4.9999989724493205e-05, + "loss": 0.723, + "step": 218 + }, + { + "epoch": 0.030650804758572427, + "grad_norm": 0.5709911717094754, + "learning_rate": 4.99999768801117e-05, + "loss": 0.8116, + "step": 219 + }, + { + "epoch": 0.03079076277116865, + "grad_norm": 0.5259086859276223, + "learning_rate": 4.999995889798127e-05, + "loss": 0.7802, + "step": 220 + }, + { + "epoch": 0.03093072078376487, + "grad_norm": 0.5855912789813319, + "learning_rate": 4.999993577810563e-05, + "loss": 0.7523, + "step": 221 + }, + { + "epoch": 0.031070678796361093, + "grad_norm": 0.5703052221422155, + "learning_rate": 4.999990752048953e-05, + "loss": 0.7753, + "step": 222 + }, + { + "epoch": 0.031210636808957314, + "grad_norm": 0.5108767700185519, + "learning_rate": 4.999987412513878e-05, + "loss": 0.7634, + "step": 223 + }, + { + "epoch": 0.031350594821553536, + "grad_norm": 0.5527712966540788, + "learning_rate": 4.999983559206023e-05, + "loss": 0.7776, + "step": 224 + }, + { + "epoch": 0.031490552834149754, + "grad_norm": 0.5490103596581906, + "learning_rate": 4.999979192126181e-05, + "loss": 0.8043, + "step": 225 + }, + { + "epoch": 0.03163051084674598, + "grad_norm": 0.5298825659589008, + "learning_rate": 4.9999743112752485e-05, + "loss": 0.7873, + "step": 226 + }, + { + "epoch": 0.0317704688593422, + "grad_norm": 0.5312004706999939, + "learning_rate": 4.9999689166542295e-05, + "loss": 0.7243, + "step": 227 + }, + { + "epoch": 0.031910426871938416, + "grad_norm": 0.5282314986009607, + "learning_rate": 4.9999630082642325e-05, + "loss": 0.7747, + "step": 228 + }, + { + "epoch": 0.03205038488453464, + "grad_norm": 0.5605656711144292, + "learning_rate": 4.999956586106472e-05, + "loss": 0.8079, + "step": 229 + }, + { + "epoch": 0.03219034289713086, + "grad_norm": 0.5457865205370973, + "learning_rate": 4.999949650182266e-05, + "loss": 0.7413, + "step": 230 + }, + { + "epoch": 0.032330300909727085, + "grad_norm": 0.5870565643730205, + "learning_rate": 4.999942200493043e-05, + "loss": 0.696, + "step": 231 + }, + { + "epoch": 0.0324702589223233, + "grad_norm": 0.5334906454266579, + "learning_rate": 4.9999342370403316e-05, + "loss": 0.7259, + "step": 232 + }, + { + "epoch": 0.03261021693491952, + "grad_norm": 0.5740266159557987, + "learning_rate": 4.999925759825768e-05, + "loss": 0.7221, + "step": 233 + }, + { + "epoch": 0.032750174947515746, + "grad_norm": 0.613668330354887, + "learning_rate": 4.999916768851096e-05, + "loss": 0.7756, + "step": 234 + }, + { + "epoch": 0.032890132960111965, + "grad_norm": 0.5903235415127636, + "learning_rate": 4.999907264118163e-05, + "loss": 0.7386, + "step": 235 + }, + { + "epoch": 0.03303009097270819, + "grad_norm": 0.534428463431133, + "learning_rate": 4.9998972456289226e-05, + "loss": 0.7299, + "step": 236 + }, + { + "epoch": 0.03317004898530441, + "grad_norm": 0.5238837539032761, + "learning_rate": 4.999886713385432e-05, + "loss": 0.7432, + "step": 237 + }, + { + "epoch": 0.033310006997900626, + "grad_norm": 0.5371791485677779, + "learning_rate": 4.999875667389858e-05, + "loss": 0.7918, + "step": 238 + }, + { + "epoch": 0.03344996501049685, + "grad_norm": 0.5835373561208981, + "learning_rate": 4.999864107644469e-05, + "loss": 0.7534, + "step": 239 + }, + { + "epoch": 0.03358992302309307, + "grad_norm": 0.552016602574312, + "learning_rate": 4.999852034151641e-05, + "loss": 0.6981, + "step": 240 + }, + { + "epoch": 0.033729881035689295, + "grad_norm": 0.5154017381634809, + "learning_rate": 4.999839446913855e-05, + "loss": 0.6855, + "step": 241 + }, + { + "epoch": 0.03386983904828551, + "grad_norm": 0.567904246402228, + "learning_rate": 4.999826345933699e-05, + "loss": 0.7559, + "step": 242 + }, + { + "epoch": 0.03400979706088174, + "grad_norm": 0.5334580095149974, + "learning_rate": 4.999812731213864e-05, + "loss": 0.7427, + "step": 243 + }, + { + "epoch": 0.03414975507347796, + "grad_norm": 0.5705684901629905, + "learning_rate": 4.9997986027571485e-05, + "loss": 0.7097, + "step": 244 + }, + { + "epoch": 0.034289713086074175, + "grad_norm": 0.5308751498286732, + "learning_rate": 4.9997839605664564e-05, + "loss": 0.7885, + "step": 245 + }, + { + "epoch": 0.0344296710986704, + "grad_norm": 0.5655649609764111, + "learning_rate": 4.999768804644796e-05, + "loss": 0.7829, + "step": 246 + }, + { + "epoch": 0.03456962911126662, + "grad_norm": 0.5251341000071817, + "learning_rate": 4.999753134995283e-05, + "loss": 0.7406, + "step": 247 + }, + { + "epoch": 0.034709587123862844, + "grad_norm": 0.5176591391474799, + "learning_rate": 4.999736951621137e-05, + "loss": 0.7466, + "step": 248 + }, + { + "epoch": 0.03484954513645906, + "grad_norm": 0.510003241623571, + "learning_rate": 4.999720254525684e-05, + "loss": 0.7625, + "step": 249 + }, + { + "epoch": 0.03498950314905528, + "grad_norm": 0.5472293188653408, + "learning_rate": 4.999703043712355e-05, + "loss": 0.7486, + "step": 250 + }, + { + "epoch": 0.035129461161651505, + "grad_norm": 0.513397158699985, + "learning_rate": 4.9996853191846885e-05, + "loss": 0.7676, + "step": 251 + }, + { + "epoch": 0.035269419174247724, + "grad_norm": 0.5227735996669705, + "learning_rate": 4.999667080946324e-05, + "loss": 0.7421, + "step": 252 + }, + { + "epoch": 0.03540937718684395, + "grad_norm": 0.5706154309444754, + "learning_rate": 4.999648329001013e-05, + "loss": 0.7591, + "step": 253 + }, + { + "epoch": 0.03554933519944017, + "grad_norm": 0.5258959198504635, + "learning_rate": 4.999629063352608e-05, + "loss": 0.7478, + "step": 254 + }, + { + "epoch": 0.03568929321203639, + "grad_norm": 0.5487756790231113, + "learning_rate": 4.999609284005068e-05, + "loss": 0.7605, + "step": 255 + }, + { + "epoch": 0.03582925122463261, + "grad_norm": 0.5536825461384619, + "learning_rate": 4.999588990962458e-05, + "loss": 0.743, + "step": 256 + }, + { + "epoch": 0.03596920923722883, + "grad_norm": 0.5272600123907261, + "learning_rate": 4.9995681842289476e-05, + "loss": 0.6928, + "step": 257 + }, + { + "epoch": 0.036109167249825054, + "grad_norm": 0.5932574229764146, + "learning_rate": 4.999546863808815e-05, + "loss": 0.7398, + "step": 258 + }, + { + "epoch": 0.03624912526242127, + "grad_norm": 0.5318314385969402, + "learning_rate": 4.999525029706439e-05, + "loss": 0.7392, + "step": 259 + }, + { + "epoch": 0.0363890832750175, + "grad_norm": 0.548901725278328, + "learning_rate": 4.999502681926309e-05, + "loss": 0.7501, + "step": 260 + }, + { + "epoch": 0.036529041287613716, + "grad_norm": 0.5288753436227192, + "learning_rate": 4.9994798204730166e-05, + "loss": 0.6955, + "step": 261 + }, + { + "epoch": 0.036668999300209934, + "grad_norm": 0.5694503048772611, + "learning_rate": 4.99945644535126e-05, + "loss": 0.723, + "step": 262 + }, + { + "epoch": 0.03680895731280616, + "grad_norm": 0.5665031858281824, + "learning_rate": 4.999432556565843e-05, + "loss": 0.6754, + "step": 263 + }, + { + "epoch": 0.03694891532540238, + "grad_norm": 0.5647585870166252, + "learning_rate": 4.999408154121676e-05, + "loss": 0.7434, + "step": 264 + }, + { + "epoch": 0.0370888733379986, + "grad_norm": 0.5198420793446112, + "learning_rate": 4.9993832380237735e-05, + "loss": 0.7398, + "step": 265 + }, + { + "epoch": 0.03722883135059482, + "grad_norm": 0.5250695619803318, + "learning_rate": 4.999357808277255e-05, + "loss": 0.7355, + "step": 266 + }, + { + "epoch": 0.03736878936319104, + "grad_norm": 0.582781926595669, + "learning_rate": 4.999331864887347e-05, + "loss": 0.794, + "step": 267 + }, + { + "epoch": 0.037508747375787264, + "grad_norm": 0.5564844235496481, + "learning_rate": 4.9993054078593824e-05, + "loss": 0.7205, + "step": 268 + }, + { + "epoch": 0.03764870538838348, + "grad_norm": 0.5165771944533392, + "learning_rate": 4.9992784371987966e-05, + "loss": 0.723, + "step": 269 + }, + { + "epoch": 0.03778866340097971, + "grad_norm": 0.528823368400942, + "learning_rate": 4.999250952911133e-05, + "loss": 0.752, + "step": 270 + }, + { + "epoch": 0.037928621413575926, + "grad_norm": 0.5429179446888814, + "learning_rate": 4.999222955002041e-05, + "loss": 0.723, + "step": 271 + }, + { + "epoch": 0.03806857942617215, + "grad_norm": 0.5587199303859207, + "learning_rate": 4.9991944434772734e-05, + "loss": 0.7417, + "step": 272 + }, + { + "epoch": 0.03820853743876837, + "grad_norm": 0.5640839472568651, + "learning_rate": 4.999165418342689e-05, + "loss": 0.7231, + "step": 273 + }, + { + "epoch": 0.03834849545136459, + "grad_norm": 0.5679111247364553, + "learning_rate": 4.9991358796042535e-05, + "loss": 0.7469, + "step": 274 + }, + { + "epoch": 0.03848845346396081, + "grad_norm": 0.5333389832026874, + "learning_rate": 4.999105827268038e-05, + "loss": 0.7593, + "step": 275 + }, + { + "epoch": 0.03862841147655703, + "grad_norm": 0.576446977047569, + "learning_rate": 4.999075261340218e-05, + "loss": 0.7442, + "step": 276 + }, + { + "epoch": 0.03876836948915326, + "grad_norm": 0.5832905654806, + "learning_rate": 4.9990441818270745e-05, + "loss": 0.7631, + "step": 277 + }, + { + "epoch": 0.038908327501749475, + "grad_norm": 0.5606985999591908, + "learning_rate": 4.9990125887349956e-05, + "loss": 0.7525, + "step": 278 + }, + { + "epoch": 0.03904828551434569, + "grad_norm": 0.5397877770051253, + "learning_rate": 4.9989804820704735e-05, + "loss": 0.677, + "step": 279 + }, + { + "epoch": 0.03918824352694192, + "grad_norm": 0.5715063431844626, + "learning_rate": 4.998947861840106e-05, + "loss": 0.7539, + "step": 280 + }, + { + "epoch": 0.039328201539538137, + "grad_norm": 0.542818827148529, + "learning_rate": 4.998914728050598e-05, + "loss": 0.7505, + "step": 281 + }, + { + "epoch": 0.03946815955213436, + "grad_norm": 0.5465408569180555, + "learning_rate": 4.9988810807087584e-05, + "loss": 0.7119, + "step": 282 + }, + { + "epoch": 0.03960811756473058, + "grad_norm": 0.5093766924471657, + "learning_rate": 4.998846919821502e-05, + "loss": 0.7062, + "step": 283 + }, + { + "epoch": 0.039748075577326805, + "grad_norm": 0.5138282632852486, + "learning_rate": 4.998812245395849e-05, + "loss": 0.7453, + "step": 284 + }, + { + "epoch": 0.03988803358992302, + "grad_norm": 0.49293443848284324, + "learning_rate": 4.998777057438926e-05, + "loss": 0.7277, + "step": 285 + }, + { + "epoch": 0.04002799160251924, + "grad_norm": 0.5162138996089223, + "learning_rate": 4.9987413559579636e-05, + "loss": 0.7804, + "step": 286 + }, + { + "epoch": 0.04016794961511547, + "grad_norm": 0.5357676179971859, + "learning_rate": 4.998705140960299e-05, + "loss": 0.7399, + "step": 287 + }, + { + "epoch": 0.040307907627711685, + "grad_norm": 0.518163900018162, + "learning_rate": 4.998668412453374e-05, + "loss": 0.6914, + "step": 288 + }, + { + "epoch": 0.04044786564030791, + "grad_norm": 0.5334432804565435, + "learning_rate": 4.9986311704447395e-05, + "loss": 0.7186, + "step": 289 + }, + { + "epoch": 0.04058782365290413, + "grad_norm": 0.5466256273934306, + "learning_rate": 4.9985934149420466e-05, + "loss": 0.7361, + "step": 290 + }, + { + "epoch": 0.04072778166550035, + "grad_norm": 0.5299994895086962, + "learning_rate": 4.998555145953054e-05, + "loss": 0.7223, + "step": 291 + }, + { + "epoch": 0.04086773967809657, + "grad_norm": 0.5350513478260928, + "learning_rate": 4.998516363485629e-05, + "loss": 0.725, + "step": 292 + }, + { + "epoch": 0.04100769769069279, + "grad_norm": 0.47179695277188055, + "learning_rate": 4.99847706754774e-05, + "loss": 0.7441, + "step": 293 + }, + { + "epoch": 0.041147655703289016, + "grad_norm": 0.53626273737559, + "learning_rate": 4.998437258147462e-05, + "loss": 0.7643, + "step": 294 + }, + { + "epoch": 0.041287613715885234, + "grad_norm": 0.5542158039437199, + "learning_rate": 4.9983969352929786e-05, + "loss": 0.6996, + "step": 295 + }, + { + "epoch": 0.04142757172848146, + "grad_norm": 0.513139127485963, + "learning_rate": 4.9983560989925736e-05, + "loss": 0.7176, + "step": 296 + }, + { + "epoch": 0.04156752974107768, + "grad_norm": 0.5464627329044695, + "learning_rate": 4.9983147492546414e-05, + "loss": 0.6829, + "step": 297 + }, + { + "epoch": 0.041707487753673896, + "grad_norm": 0.4970498064260649, + "learning_rate": 4.9982728860876794e-05, + "loss": 0.6962, + "step": 298 + }, + { + "epoch": 0.04184744576627012, + "grad_norm": 0.5892603998956238, + "learning_rate": 4.998230509500291e-05, + "loss": 0.7526, + "step": 299 + }, + { + "epoch": 0.04198740377886634, + "grad_norm": 0.5501443462930937, + "learning_rate": 4.9981876195011844e-05, + "loss": 0.7133, + "step": 300 + }, + { + "epoch": 0.042127361791462564, + "grad_norm": 0.5498252845378556, + "learning_rate": 4.998144216099174e-05, + "loss": 0.7257, + "step": 301 + }, + { + "epoch": 0.04226731980405878, + "grad_norm": 0.5513802243572777, + "learning_rate": 4.99810029930318e-05, + "loss": 0.7403, + "step": 302 + }, + { + "epoch": 0.042407277816655, + "grad_norm": 0.5188265748765649, + "learning_rate": 4.998055869122228e-05, + "loss": 0.7302, + "step": 303 + }, + { + "epoch": 0.042547235829251226, + "grad_norm": 0.5540179540190183, + "learning_rate": 4.998010925565448e-05, + "loss": 0.7517, + "step": 304 + }, + { + "epoch": 0.042687193841847444, + "grad_norm": 0.5281059249590544, + "learning_rate": 4.9979654686420775e-05, + "loss": 0.7317, + "step": 305 + }, + { + "epoch": 0.04282715185444367, + "grad_norm": 0.5359394569524711, + "learning_rate": 4.997919498361457e-05, + "loss": 0.7344, + "step": 306 + }, + { + "epoch": 0.04296710986703989, + "grad_norm": 0.5074397919192178, + "learning_rate": 4.9978730147330355e-05, + "loss": 0.7245, + "step": 307 + }, + { + "epoch": 0.043107067879636106, + "grad_norm": 0.49809746671209315, + "learning_rate": 4.997826017766364e-05, + "loss": 0.7181, + "step": 308 + }, + { + "epoch": 0.04324702589223233, + "grad_norm": 0.5593359787877635, + "learning_rate": 4.997778507471102e-05, + "loss": 0.6895, + "step": 309 + }, + { + "epoch": 0.04338698390482855, + "grad_norm": 0.5259196612830013, + "learning_rate": 4.997730483857014e-05, + "loss": 0.7075, + "step": 310 + }, + { + "epoch": 0.043526941917424775, + "grad_norm": 0.5158600291227199, + "learning_rate": 4.997681946933967e-05, + "loss": 0.7071, + "step": 311 + }, + { + "epoch": 0.04366689993002099, + "grad_norm": 0.534021017035884, + "learning_rate": 4.997632896711939e-05, + "loss": 0.6599, + "step": 312 + }, + { + "epoch": 0.04380685794261722, + "grad_norm": 0.5669098102151614, + "learning_rate": 4.997583333201008e-05, + "loss": 0.6998, + "step": 313 + }, + { + "epoch": 0.043946815955213436, + "grad_norm": 0.5277846431661678, + "learning_rate": 4.99753325641136e-05, + "loss": 0.7232, + "step": 314 + }, + { + "epoch": 0.044086773967809655, + "grad_norm": 0.5834444960242834, + "learning_rate": 4.997482666353287e-05, + "loss": 0.7281, + "step": 315 + }, + { + "epoch": 0.04422673198040588, + "grad_norm": 0.5050930819325137, + "learning_rate": 4.9974315630371855e-05, + "loss": 0.7042, + "step": 316 + }, + { + "epoch": 0.0443666899930021, + "grad_norm": 0.5077463810437512, + "learning_rate": 4.997379946473557e-05, + "loss": 0.7043, + "step": 317 + }, + { + "epoch": 0.04450664800559832, + "grad_norm": 0.552209585789916, + "learning_rate": 4.9973278166730106e-05, + "loss": 0.7197, + "step": 318 + }, + { + "epoch": 0.04464660601819454, + "grad_norm": 0.6653079515801595, + "learning_rate": 4.99727517364626e-05, + "loss": 0.7359, + "step": 319 + }, + { + "epoch": 0.04478656403079076, + "grad_norm": 0.5255861840491078, + "learning_rate": 4.9972220174041205e-05, + "loss": 0.6736, + "step": 320 + }, + { + "epoch": 0.044926522043386985, + "grad_norm": 0.5400381711080283, + "learning_rate": 4.99716834795752e-05, + "loss": 0.7683, + "step": 321 + }, + { + "epoch": 0.0450664800559832, + "grad_norm": 0.5297848029054303, + "learning_rate": 4.997114165317486e-05, + "loss": 0.7252, + "step": 322 + }, + { + "epoch": 0.04520643806857943, + "grad_norm": 0.5344509508263574, + "learning_rate": 4.997059469495155e-05, + "loss": 0.6991, + "step": 323 + }, + { + "epoch": 0.04534639608117565, + "grad_norm": 0.5037923428143048, + "learning_rate": 4.997004260501766e-05, + "loss": 0.7393, + "step": 324 + }, + { + "epoch": 0.04548635409377187, + "grad_norm": 0.5338819883439406, + "learning_rate": 4.996948538348666e-05, + "loss": 0.711, + "step": 325 + }, + { + "epoch": 0.04562631210636809, + "grad_norm": 0.542939872517023, + "learning_rate": 4.996892303047306e-05, + "loss": 0.7, + "step": 326 + }, + { + "epoch": 0.04576627011896431, + "grad_norm": 0.5211829437872917, + "learning_rate": 4.996835554609244e-05, + "loss": 0.7185, + "step": 327 + }, + { + "epoch": 0.045906228131560534, + "grad_norm": 0.6756691366488846, + "learning_rate": 4.996778293046141e-05, + "loss": 0.7205, + "step": 328 + }, + { + "epoch": 0.04604618614415675, + "grad_norm": 0.5016052820529759, + "learning_rate": 4.996720518369764e-05, + "loss": 0.6912, + "step": 329 + }, + { + "epoch": 0.04618614415675298, + "grad_norm": 0.5201008063930296, + "learning_rate": 4.996662230591989e-05, + "loss": 0.7341, + "step": 330 + }, + { + "epoch": 0.046326102169349195, + "grad_norm": 0.567491502617686, + "learning_rate": 4.996603429724793e-05, + "loss": 0.7365, + "step": 331 + }, + { + "epoch": 0.046466060181945414, + "grad_norm": 0.5449583987974782, + "learning_rate": 4.996544115780261e-05, + "loss": 0.7075, + "step": 332 + }, + { + "epoch": 0.04660601819454164, + "grad_norm": 0.5492024616709544, + "learning_rate": 4.996484288770582e-05, + "loss": 0.7896, + "step": 333 + }, + { + "epoch": 0.04674597620713786, + "grad_norm": 0.5076286169172769, + "learning_rate": 4.996423948708051e-05, + "loss": 0.7394, + "step": 334 + }, + { + "epoch": 0.04688593421973408, + "grad_norm": 0.550162273407423, + "learning_rate": 4.996363095605069e-05, + "loss": 0.7049, + "step": 335 + }, + { + "epoch": 0.0470258922323303, + "grad_norm": 0.6075217637750909, + "learning_rate": 4.9963017294741407e-05, + "loss": 0.656, + "step": 336 + }, + { + "epoch": 0.04716585024492652, + "grad_norm": 0.5456579494737005, + "learning_rate": 4.99623985032788e-05, + "loss": 0.6872, + "step": 337 + }, + { + "epoch": 0.047305808257522744, + "grad_norm": 0.661719363460849, + "learning_rate": 4.996177458179001e-05, + "loss": 0.7, + "step": 338 + }, + { + "epoch": 0.04744576627011896, + "grad_norm": 0.6073637233523712, + "learning_rate": 4.996114553040328e-05, + "loss": 0.7034, + "step": 339 + }, + { + "epoch": 0.04758572428271519, + "grad_norm": 0.627821201647191, + "learning_rate": 4.996051134924786e-05, + "loss": 0.6996, + "step": 340 + }, + { + "epoch": 0.047725682295311406, + "grad_norm": 0.5557264017341775, + "learning_rate": 4.99598720384541e-05, + "loss": 0.7071, + "step": 341 + }, + { + "epoch": 0.04786564030790763, + "grad_norm": 0.5980687216959104, + "learning_rate": 4.995922759815339e-05, + "loss": 0.7101, + "step": 342 + }, + { + "epoch": 0.04800559832050385, + "grad_norm": 0.6227145204853264, + "learning_rate": 4.995857802847816e-05, + "loss": 0.7248, + "step": 343 + }, + { + "epoch": 0.04814555633310007, + "grad_norm": 0.9190740737396743, + "learning_rate": 4.9957923329561907e-05, + "loss": 0.709, + "step": 344 + }, + { + "epoch": 0.04828551434569629, + "grad_norm": 0.5382237071595833, + "learning_rate": 4.9957263501539174e-05, + "loss": 0.6756, + "step": 345 + }, + { + "epoch": 0.04842547235829251, + "grad_norm": 0.7390793234122391, + "learning_rate": 4.9956598544545566e-05, + "loss": 0.7184, + "step": 346 + }, + { + "epoch": 0.048565430370888736, + "grad_norm": 0.5896661276064007, + "learning_rate": 4.9955928458717723e-05, + "loss": 0.7022, + "step": 347 + }, + { + "epoch": 0.048705388383484954, + "grad_norm": 0.5284578098126113, + "learning_rate": 4.9955253244193375e-05, + "loss": 0.7238, + "step": 348 + }, + { + "epoch": 0.04884534639608117, + "grad_norm": 0.5539013371779543, + "learning_rate": 4.9954572901111286e-05, + "loss": 0.693, + "step": 349 + }, + { + "epoch": 0.0489853044086774, + "grad_norm": 0.7749019171598228, + "learning_rate": 4.9953887429611256e-05, + "loss": 0.6873, + "step": 350 + }, + { + "epoch": 0.049125262421273616, + "grad_norm": 0.5416373773112516, + "learning_rate": 4.995319682983418e-05, + "loss": 0.7418, + "step": 351 + }, + { + "epoch": 0.04926522043386984, + "grad_norm": 0.5962345555842343, + "learning_rate": 4.995250110192195e-05, + "loss": 0.7048, + "step": 352 + }, + { + "epoch": 0.04940517844646606, + "grad_norm": 0.6171440633433287, + "learning_rate": 4.995180024601758e-05, + "loss": 0.719, + "step": 353 + }, + { + "epoch": 0.049545136459062285, + "grad_norm": 0.5613359206707593, + "learning_rate": 4.995109426226508e-05, + "loss": 0.7374, + "step": 354 + }, + { + "epoch": 0.0496850944716585, + "grad_norm": 0.5557877380798822, + "learning_rate": 4.995038315080954e-05, + "loss": 0.7149, + "step": 355 + }, + { + "epoch": 0.04982505248425472, + "grad_norm": 0.5681067506858839, + "learning_rate": 4.994966691179711e-05, + "loss": 0.74, + "step": 356 + }, + { + "epoch": 0.049965010496850946, + "grad_norm": 0.5671390578619896, + "learning_rate": 4.994894554537498e-05, + "loss": 0.743, + "step": 357 + }, + { + "epoch": 0.050104968509447165, + "grad_norm": 0.5040150906914191, + "learning_rate": 4.9948219051691394e-05, + "loss": 0.7411, + "step": 358 + }, + { + "epoch": 0.05024492652204339, + "grad_norm": 0.5000433753838786, + "learning_rate": 4.994748743089566e-05, + "loss": 0.709, + "step": 359 + }, + { + "epoch": 0.05038488453463961, + "grad_norm": 0.5669390690482634, + "learning_rate": 4.9946750683138134e-05, + "loss": 0.7157, + "step": 360 + }, + { + "epoch": 0.050524842547235826, + "grad_norm": 0.5348574517172033, + "learning_rate": 4.994600880857022e-05, + "loss": 0.6982, + "step": 361 + }, + { + "epoch": 0.05066480055983205, + "grad_norm": 0.5519610052382385, + "learning_rate": 4.9945261807344376e-05, + "loss": 0.701, + "step": 362 + }, + { + "epoch": 0.05080475857242827, + "grad_norm": 0.5267779469247078, + "learning_rate": 4.994450967961413e-05, + "loss": 0.7164, + "step": 363 + }, + { + "epoch": 0.050944716585024495, + "grad_norm": 0.5168765165006579, + "learning_rate": 4.994375242553405e-05, + "loss": 0.7321, + "step": 364 + }, + { + "epoch": 0.05108467459762071, + "grad_norm": 0.6298046212170411, + "learning_rate": 4.994299004525975e-05, + "loss": 0.6847, + "step": 365 + }, + { + "epoch": 0.05122463261021693, + "grad_norm": 0.557578502569354, + "learning_rate": 4.994222253894791e-05, + "loss": 0.7152, + "step": 366 + }, + { + "epoch": 0.05136459062281316, + "grad_norm": 0.5335254526570578, + "learning_rate": 4.994144990675627e-05, + "loss": 0.7247, + "step": 367 + }, + { + "epoch": 0.051504548635409375, + "grad_norm": 0.5400373413925982, + "learning_rate": 4.99406721488436e-05, + "loss": 0.6837, + "step": 368 + }, + { + "epoch": 0.0516445066480056, + "grad_norm": 0.4935375590306368, + "learning_rate": 4.993988926536975e-05, + "loss": 0.6861, + "step": 369 + }, + { + "epoch": 0.05178446466060182, + "grad_norm": 0.6275855125475005, + "learning_rate": 4.993910125649561e-05, + "loss": 0.6829, + "step": 370 + }, + { + "epoch": 0.051924422673198044, + "grad_norm": 1.019947219371485, + "learning_rate": 4.993830812238311e-05, + "loss": 0.7568, + "step": 371 + }, + { + "epoch": 0.05206438068579426, + "grad_norm": 0.5311851634325208, + "learning_rate": 4.9937509863195256e-05, + "loss": 0.6854, + "step": 372 + }, + { + "epoch": 0.05220433869839048, + "grad_norm": 0.5202083034030205, + "learning_rate": 4.993670647909611e-05, + "loss": 0.7297, + "step": 373 + }, + { + "epoch": 0.052344296710986706, + "grad_norm": 0.5639019664915923, + "learning_rate": 4.9935897970250745e-05, + "loss": 0.7471, + "step": 374 + }, + { + "epoch": 0.052484254723582924, + "grad_norm": 0.5482982154303914, + "learning_rate": 4.993508433682535e-05, + "loss": 0.6802, + "step": 375 + }, + { + "epoch": 0.05262421273617915, + "grad_norm": 0.576536365667398, + "learning_rate": 4.993426557898711e-05, + "loss": 0.6996, + "step": 376 + }, + { + "epoch": 0.05276417074877537, + "grad_norm": 0.4997211624617034, + "learning_rate": 4.993344169690431e-05, + "loss": 0.6907, + "step": 377 + }, + { + "epoch": 0.052904128761371585, + "grad_norm": 0.4948774132141246, + "learning_rate": 4.993261269074625e-05, + "loss": 0.6869, + "step": 378 + }, + { + "epoch": 0.05304408677396781, + "grad_norm": 0.49620741460648987, + "learning_rate": 4.9931778560683304e-05, + "loss": 0.7045, + "step": 379 + }, + { + "epoch": 0.05318404478656403, + "grad_norm": 0.5687949523084871, + "learning_rate": 4.99309393068869e-05, + "loss": 0.6876, + "step": 380 + }, + { + "epoch": 0.053324002799160254, + "grad_norm": 1.3449476384226542, + "learning_rate": 4.9930094929529506e-05, + "loss": 0.6932, + "step": 381 + }, + { + "epoch": 0.05346396081175647, + "grad_norm": 0.5421720343729313, + "learning_rate": 4.992924542878465e-05, + "loss": 0.6636, + "step": 382 + }, + { + "epoch": 0.0536039188243527, + "grad_norm": 0.5574373007750801, + "learning_rate": 4.9928390804826916e-05, + "loss": 0.7171, + "step": 383 + }, + { + "epoch": 0.053743876836948916, + "grad_norm": 0.5165807349158404, + "learning_rate": 4.992753105783194e-05, + "loss": 0.6979, + "step": 384 + }, + { + "epoch": 0.053883834849545134, + "grad_norm": 0.5207307553346301, + "learning_rate": 4.99266661879764e-05, + "loss": 0.6768, + "step": 385 + }, + { + "epoch": 0.05402379286214136, + "grad_norm": 0.7774982969038746, + "learning_rate": 4.9925796195438044e-05, + "loss": 0.6667, + "step": 386 + }, + { + "epoch": 0.05416375087473758, + "grad_norm": 0.7031427692671837, + "learning_rate": 4.992492108039566e-05, + "loss": 0.7051, + "step": 387 + }, + { + "epoch": 0.0543037088873338, + "grad_norm": 0.5217628043633614, + "learning_rate": 4.99240408430291e-05, + "loss": 0.6427, + "step": 388 + }, + { + "epoch": 0.05444366689993002, + "grad_norm": 0.8333810612085015, + "learning_rate": 4.992315548351925e-05, + "loss": 0.7245, + "step": 389 + }, + { + "epoch": 0.05458362491252624, + "grad_norm": 0.629384361310055, + "learning_rate": 4.992226500204808e-05, + "loss": 0.6932, + "step": 390 + }, + { + "epoch": 0.054723582925122465, + "grad_norm": 1.215348456810311, + "learning_rate": 4.992136939879856e-05, + "loss": 0.6962, + "step": 391 + }, + { + "epoch": 0.05486354093771868, + "grad_norm": 0.5807472413206434, + "learning_rate": 4.992046867395478e-05, + "loss": 0.7176, + "step": 392 + }, + { + "epoch": 0.05500349895031491, + "grad_norm": 0.5439919445778956, + "learning_rate": 4.9919562827701824e-05, + "loss": 0.716, + "step": 393 + }, + { + "epoch": 0.055143456962911126, + "grad_norm": 0.5602331962806683, + "learning_rate": 4.9918651860225864e-05, + "loss": 0.6776, + "step": 394 + }, + { + "epoch": 0.055283414975507345, + "grad_norm": 0.522684609371295, + "learning_rate": 4.9917735771714114e-05, + "loss": 0.6646, + "step": 395 + }, + { + "epoch": 0.05542337298810357, + "grad_norm": 0.6622073701270422, + "learning_rate": 4.991681456235483e-05, + "loss": 0.6714, + "step": 396 + }, + { + "epoch": 0.05556333100069979, + "grad_norm": 0.6427418148993167, + "learning_rate": 4.991588823233735e-05, + "loss": 0.7056, + "step": 397 + }, + { + "epoch": 0.05570328901329601, + "grad_norm": 0.5725098170561623, + "learning_rate": 4.991495678185202e-05, + "loss": 0.6672, + "step": 398 + }, + { + "epoch": 0.05584324702589223, + "grad_norm": 0.5585281717216761, + "learning_rate": 4.991402021109027e-05, + "loss": 0.7064, + "step": 399 + }, + { + "epoch": 0.05598320503848846, + "grad_norm": 1.7745368005124815, + "learning_rate": 4.991307852024458e-05, + "loss": 0.7246, + "step": 400 + }, + { + "epoch": 0.056123163051084675, + "grad_norm": 0.5282756777124027, + "learning_rate": 4.991213170950848e-05, + "loss": 0.6971, + "step": 401 + }, + { + "epoch": 0.05626312106368089, + "grad_norm": 0.7218574920206103, + "learning_rate": 4.9911179779076544e-05, + "loss": 0.7173, + "step": 402 + }, + { + "epoch": 0.05640307907627712, + "grad_norm": 0.5451854604938866, + "learning_rate": 4.99102227291444e-05, + "loss": 0.7146, + "step": 403 + }, + { + "epoch": 0.05654303708887334, + "grad_norm": 0.5389963653898296, + "learning_rate": 4.990926055990873e-05, + "loss": 0.6745, + "step": 404 + }, + { + "epoch": 0.05668299510146956, + "grad_norm": 0.5101195028542534, + "learning_rate": 4.9908293271567286e-05, + "loss": 0.7158, + "step": 405 + }, + { + "epoch": 0.05682295311406578, + "grad_norm": 0.5324266228237025, + "learning_rate": 4.990732086431884e-05, + "loss": 0.6766, + "step": 406 + }, + { + "epoch": 0.056962911126662, + "grad_norm": 0.5056404230280876, + "learning_rate": 4.990634333836324e-05, + "loss": 0.6859, + "step": 407 + }, + { + "epoch": 0.057102869139258224, + "grad_norm": 0.553015466117812, + "learning_rate": 4.990536069390136e-05, + "loss": 0.7476, + "step": 408 + }, + { + "epoch": 0.05724282715185444, + "grad_norm": 0.6468042464648912, + "learning_rate": 4.9904372931135167e-05, + "loss": 0.6926, + "step": 409 + }, + { + "epoch": 0.05738278516445067, + "grad_norm": 0.5169803890591222, + "learning_rate": 4.990338005026764e-05, + "loss": 0.6948, + "step": 410 + }, + { + "epoch": 0.057522743177046885, + "grad_norm": 0.7826976195286862, + "learning_rate": 4.990238205150284e-05, + "loss": 0.693, + "step": 411 + }, + { + "epoch": 0.05766270118964311, + "grad_norm": 0.5898380237885452, + "learning_rate": 4.990137893504585e-05, + "loss": 0.7025, + "step": 412 + }, + { + "epoch": 0.05780265920223933, + "grad_norm": 0.6044914970349515, + "learning_rate": 4.990037070110283e-05, + "loss": 0.6642, + "step": 413 + }, + { + "epoch": 0.05794261721483555, + "grad_norm": 0.6647502597764803, + "learning_rate": 4.989935734988098e-05, + "loss": 0.7141, + "step": 414 + }, + { + "epoch": 0.05808257522743177, + "grad_norm": 0.578032490799264, + "learning_rate": 4.989833888158856e-05, + "loss": 0.7525, + "step": 415 + }, + { + "epoch": 0.05822253324002799, + "grad_norm": 0.5446056431076586, + "learning_rate": 4.989731529643486e-05, + "loss": 0.7152, + "step": 416 + }, + { + "epoch": 0.058362491252624216, + "grad_norm": 0.5520058248905833, + "learning_rate": 4.9896286594630255e-05, + "loss": 0.7104, + "step": 417 + }, + { + "epoch": 0.058502449265220434, + "grad_norm": 0.5358722455002061, + "learning_rate": 4.989525277638614e-05, + "loss": 0.6877, + "step": 418 + }, + { + "epoch": 0.05864240727781665, + "grad_norm": 0.5211574683086443, + "learning_rate": 4.989421384191499e-05, + "loss": 0.7235, + "step": 419 + }, + { + "epoch": 0.05878236529041288, + "grad_norm": 0.5620604980498491, + "learning_rate": 4.98931697914303e-05, + "loss": 0.6579, + "step": 420 + }, + { + "epoch": 0.058922323303009096, + "grad_norm": 0.5591196420226805, + "learning_rate": 4.989212062514664e-05, + "loss": 0.6848, + "step": 421 + }, + { + "epoch": 0.05906228131560532, + "grad_norm": 0.593542823435001, + "learning_rate": 4.989106634327963e-05, + "loss": 0.6769, + "step": 422 + }, + { + "epoch": 0.05920223932820154, + "grad_norm": 0.4993385226927883, + "learning_rate": 4.989000694604593e-05, + "loss": 0.7045, + "step": 423 + }, + { + "epoch": 0.05934219734079776, + "grad_norm": 0.8610343379419788, + "learning_rate": 4.9888942433663255e-05, + "loss": 0.6875, + "step": 424 + }, + { + "epoch": 0.05948215535339398, + "grad_norm": 0.48976014126417133, + "learning_rate": 4.988787280635038e-05, + "loss": 0.7148, + "step": 425 + }, + { + "epoch": 0.0596221133659902, + "grad_norm": 0.4952300036598977, + "learning_rate": 4.988679806432712e-05, + "loss": 0.6947, + "step": 426 + }, + { + "epoch": 0.059762071378586426, + "grad_norm": 0.46474336456699533, + "learning_rate": 4.9885718207814335e-05, + "loss": 0.6794, + "step": 427 + }, + { + "epoch": 0.059902029391182644, + "grad_norm": 0.5147006168979844, + "learning_rate": 4.988463323703397e-05, + "loss": 0.6711, + "step": 428 + }, + { + "epoch": 0.06004198740377887, + "grad_norm": 0.637107834123775, + "learning_rate": 4.988354315220898e-05, + "loss": 0.6715, + "step": 429 + }, + { + "epoch": 0.06018194541637509, + "grad_norm": 0.5511914486791841, + "learning_rate": 4.988244795356339e-05, + "loss": 0.649, + "step": 430 + }, + { + "epoch": 0.060321903428971306, + "grad_norm": 0.4830610595446907, + "learning_rate": 4.9881347641322277e-05, + "loss": 0.6591, + "step": 431 + }, + { + "epoch": 0.06046186144156753, + "grad_norm": 0.5705178740705275, + "learning_rate": 4.988024221571177e-05, + "loss": 0.6686, + "step": 432 + }, + { + "epoch": 0.06060181945416375, + "grad_norm": 0.7215515180250442, + "learning_rate": 4.987913167695904e-05, + "loss": 0.7433, + "step": 433 + }, + { + "epoch": 0.060741777466759975, + "grad_norm": 0.7060924726627863, + "learning_rate": 4.9878016025292305e-05, + "loss": 0.6936, + "step": 434 + }, + { + "epoch": 0.06088173547935619, + "grad_norm": 0.5381947781056515, + "learning_rate": 4.987689526094087e-05, + "loss": 0.6751, + "step": 435 + }, + { + "epoch": 0.06102169349195241, + "grad_norm": 0.4984642961815355, + "learning_rate": 4.987576938413504e-05, + "loss": 0.6551, + "step": 436 + }, + { + "epoch": 0.061161651504548636, + "grad_norm": 0.6995103632612955, + "learning_rate": 4.98746383951062e-05, + "loss": 0.6852, + "step": 437 + }, + { + "epoch": 0.061301609517144855, + "grad_norm": 0.5321506444103643, + "learning_rate": 4.9873502294086785e-05, + "loss": 0.7047, + "step": 438 + }, + { + "epoch": 0.06144156752974108, + "grad_norm": 0.5373668549791867, + "learning_rate": 4.987236108131026e-05, + "loss": 0.6476, + "step": 439 + }, + { + "epoch": 0.0615815255423373, + "grad_norm": 0.5531880070215052, + "learning_rate": 4.9871214757011176e-05, + "loss": 0.6695, + "step": 440 + }, + { + "epoch": 0.06172148355493352, + "grad_norm": 0.5377302222338248, + "learning_rate": 4.9870063321425105e-05, + "loss": 0.6522, + "step": 441 + }, + { + "epoch": 0.06186144156752974, + "grad_norm": 0.4885102130170266, + "learning_rate": 4.986890677478867e-05, + "loss": 0.6693, + "step": 442 + }, + { + "epoch": 0.06200139958012596, + "grad_norm": 0.5111003799643065, + "learning_rate": 4.986774511733957e-05, + "loss": 0.6578, + "step": 443 + }, + { + "epoch": 0.062141357592722185, + "grad_norm": 0.45757845171169315, + "learning_rate": 4.986657834931653e-05, + "loss": 0.701, + "step": 444 + }, + { + "epoch": 0.0622813156053184, + "grad_norm": 0.6187591152576597, + "learning_rate": 4.986540647095933e-05, + "loss": 0.671, + "step": 445 + }, + { + "epoch": 0.06242127361791463, + "grad_norm": 0.5054481649682342, + "learning_rate": 4.9864229482508804e-05, + "loss": 0.7186, + "step": 446 + }, + { + "epoch": 0.06256123163051085, + "grad_norm": 0.5370871184063281, + "learning_rate": 4.9863047384206835e-05, + "loss": 0.6889, + "step": 447 + }, + { + "epoch": 0.06270118964310707, + "grad_norm": 0.5046075184933217, + "learning_rate": 4.986186017629636e-05, + "loss": 0.6968, + "step": 448 + }, + { + "epoch": 0.06284114765570328, + "grad_norm": 0.5017707016018389, + "learning_rate": 4.986066785902136e-05, + "loss": 0.7472, + "step": 449 + }, + { + "epoch": 0.06298110566829951, + "grad_norm": 0.488992352862207, + "learning_rate": 4.985947043262686e-05, + "loss": 0.6916, + "step": 450 + }, + { + "epoch": 0.06312106368089573, + "grad_norm": 0.5153973631708659, + "learning_rate": 4.9858267897358956e-05, + "loss": 0.7105, + "step": 451 + }, + { + "epoch": 0.06326102169349196, + "grad_norm": 0.5351052897634555, + "learning_rate": 4.985706025346477e-05, + "loss": 0.7257, + "step": 452 + }, + { + "epoch": 0.06340097970608817, + "grad_norm": 0.5382494705510661, + "learning_rate": 4.98558475011925e-05, + "loss": 0.6587, + "step": 453 + }, + { + "epoch": 0.0635409377186844, + "grad_norm": 0.5164087894936398, + "learning_rate": 4.985462964079137e-05, + "loss": 0.6732, + "step": 454 + }, + { + "epoch": 0.06368089573128062, + "grad_norm": 0.5018878284351228, + "learning_rate": 4.985340667251166e-05, + "loss": 0.6436, + "step": 455 + }, + { + "epoch": 0.06382085374387683, + "grad_norm": 0.6387432517988891, + "learning_rate": 4.9852178596604705e-05, + "loss": 0.6874, + "step": 456 + }, + { + "epoch": 0.06396081175647306, + "grad_norm": 0.525361815670854, + "learning_rate": 4.985094541332288e-05, + "loss": 0.6899, + "step": 457 + }, + { + "epoch": 0.06410076976906928, + "grad_norm": 0.5252701657432063, + "learning_rate": 4.984970712291963e-05, + "loss": 0.6788, + "step": 458 + }, + { + "epoch": 0.0642407277816655, + "grad_norm": 0.5268887795524961, + "learning_rate": 4.984846372564943e-05, + "loss": 0.6915, + "step": 459 + }, + { + "epoch": 0.06438068579426172, + "grad_norm": 0.5762015280618955, + "learning_rate": 4.9847215221767815e-05, + "loss": 0.7023, + "step": 460 + }, + { + "epoch": 0.06452064380685794, + "grad_norm": 0.5228367157993173, + "learning_rate": 4.984596161153136e-05, + "loss": 0.7088, + "step": 461 + }, + { + "epoch": 0.06466060181945417, + "grad_norm": 0.5157276830814348, + "learning_rate": 4.984470289519769e-05, + "loss": 0.7164, + "step": 462 + }, + { + "epoch": 0.06480055983205038, + "grad_norm": 0.5405166337013064, + "learning_rate": 4.9843439073025486e-05, + "loss": 0.7111, + "step": 463 + }, + { + "epoch": 0.0649405178446466, + "grad_norm": 0.5812948989732744, + "learning_rate": 4.984217014527449e-05, + "loss": 0.7043, + "step": 464 + }, + { + "epoch": 0.06508047585724283, + "grad_norm": 0.49548874056024383, + "learning_rate": 4.984089611220547e-05, + "loss": 0.6428, + "step": 465 + }, + { + "epoch": 0.06522043386983904, + "grad_norm": 0.5197419283605628, + "learning_rate": 4.9839616974080246e-05, + "loss": 0.7102, + "step": 466 + }, + { + "epoch": 0.06536039188243527, + "grad_norm": 0.6462028524249613, + "learning_rate": 4.9838332731161694e-05, + "loss": 0.664, + "step": 467 + }, + { + "epoch": 0.06550034989503149, + "grad_norm": 0.4883003485107369, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.6494, + "step": 468 + }, + { + "epoch": 0.06564030790762772, + "grad_norm": 0.510996545798241, + "learning_rate": 4.983574893200139e-05, + "loss": 0.6762, + "step": 469 + }, + { + "epoch": 0.06578026592022393, + "grad_norm": 0.5132612521238953, + "learning_rate": 4.9834449376290625e-05, + "loss": 0.6646, + "step": 470 + }, + { + "epoch": 0.06592022393282015, + "grad_norm": 0.5184693525295171, + "learning_rate": 4.983314471684853e-05, + "loss": 0.6709, + "step": 471 + }, + { + "epoch": 0.06606018194541638, + "grad_norm": 0.48530849719789637, + "learning_rate": 4.9831834953943236e-05, + "loss": 0.6798, + "step": 472 + }, + { + "epoch": 0.06620013995801259, + "grad_norm": 0.5169999057943723, + "learning_rate": 4.9830520087843894e-05, + "loss": 0.6949, + "step": 473 + }, + { + "epoch": 0.06634009797060882, + "grad_norm": 0.563401180326856, + "learning_rate": 4.982920011882074e-05, + "loss": 0.7226, + "step": 474 + }, + { + "epoch": 0.06648005598320504, + "grad_norm": 0.5359095901003381, + "learning_rate": 4.982787504714503e-05, + "loss": 0.6834, + "step": 475 + }, + { + "epoch": 0.06662001399580125, + "grad_norm": 0.7237453631577152, + "learning_rate": 4.982654487308908e-05, + "loss": 0.6794, + "step": 476 + }, + { + "epoch": 0.06675997200839748, + "grad_norm": 0.5209505752877471, + "learning_rate": 4.982520959692626e-05, + "loss": 0.6533, + "step": 477 + }, + { + "epoch": 0.0668999300209937, + "grad_norm": 0.5103023397815947, + "learning_rate": 4.982386921893098e-05, + "loss": 0.6633, + "step": 478 + }, + { + "epoch": 0.06703988803358993, + "grad_norm": 0.5060683439354244, + "learning_rate": 4.98225237393787e-05, + "loss": 0.6369, + "step": 479 + }, + { + "epoch": 0.06717984604618614, + "grad_norm": 0.5296294502024173, + "learning_rate": 4.9821173158545936e-05, + "loss": 0.6394, + "step": 480 + }, + { + "epoch": 0.06731980405878236, + "grad_norm": 0.5333731273318062, + "learning_rate": 4.981981747671024e-05, + "loss": 0.6792, + "step": 481 + }, + { + "epoch": 0.06745976207137859, + "grad_norm": 0.5350960162399097, + "learning_rate": 4.981845669415022e-05, + "loss": 0.6378, + "step": 482 + }, + { + "epoch": 0.0675997200839748, + "grad_norm": 0.4966051310703327, + "learning_rate": 4.9817090811145524e-05, + "loss": 0.6664, + "step": 483 + }, + { + "epoch": 0.06773967809657103, + "grad_norm": 0.5117571057914638, + "learning_rate": 4.9815719827976864e-05, + "loss": 0.674, + "step": 484 + }, + { + "epoch": 0.06787963610916725, + "grad_norm": 0.5325656306807487, + "learning_rate": 4.9814343744925984e-05, + "loss": 0.7459, + "step": 485 + }, + { + "epoch": 0.06801959412176348, + "grad_norm": 0.5363735435396061, + "learning_rate": 4.981296256227569e-05, + "loss": 0.6356, + "step": 486 + }, + { + "epoch": 0.06815955213435969, + "grad_norm": 0.5880213444397324, + "learning_rate": 4.981157628030984e-05, + "loss": 0.7152, + "step": 487 + }, + { + "epoch": 0.06829951014695591, + "grad_norm": 0.5622620490135608, + "learning_rate": 4.9810184899313294e-05, + "loss": 0.6904, + "step": 488 + }, + { + "epoch": 0.06843946815955214, + "grad_norm": 0.5387407294746895, + "learning_rate": 4.980878841957203e-05, + "loss": 0.6417, + "step": 489 + }, + { + "epoch": 0.06857942617214835, + "grad_norm": 0.5810694092783647, + "learning_rate": 4.9807386841373014e-05, + "loss": 0.7077, + "step": 490 + }, + { + "epoch": 0.06871938418474458, + "grad_norm": 0.5468672188592382, + "learning_rate": 4.9805980165004304e-05, + "loss": 0.6831, + "step": 491 + }, + { + "epoch": 0.0688593421973408, + "grad_norm": 0.5009452260143993, + "learning_rate": 4.9804568390754974e-05, + "loss": 0.6618, + "step": 492 + }, + { + "epoch": 0.06899930020993703, + "grad_norm": 0.5198589285355217, + "learning_rate": 4.980315151891516e-05, + "loss": 0.7322, + "step": 493 + }, + { + "epoch": 0.06913925822253324, + "grad_norm": 0.5255753639976765, + "learning_rate": 4.980172954977605e-05, + "loss": 0.667, + "step": 494 + }, + { + "epoch": 0.06927921623512946, + "grad_norm": 0.514064283357064, + "learning_rate": 4.980030248362987e-05, + "loss": 0.7107, + "step": 495 + }, + { + "epoch": 0.06941917424772569, + "grad_norm": 0.5532838372693374, + "learning_rate": 4.9798870320769886e-05, + "loss": 0.6811, + "step": 496 + }, + { + "epoch": 0.0695591322603219, + "grad_norm": 0.602307520111878, + "learning_rate": 4.9797433061490434e-05, + "loss": 0.655, + "step": 497 + }, + { + "epoch": 0.06969909027291812, + "grad_norm": 1.1081876267570516, + "learning_rate": 4.979599070608688e-05, + "loss": 0.7107, + "step": 498 + }, + { + "epoch": 0.06983904828551435, + "grad_norm": 0.529722398927933, + "learning_rate": 4.979454325485565e-05, + "loss": 0.7186, + "step": 499 + }, + { + "epoch": 0.06997900629811056, + "grad_norm": 0.688076194733635, + "learning_rate": 4.97930907080942e-05, + "loss": 0.6966, + "step": 500 + }, + { + "epoch": 0.07011896431070679, + "grad_norm": 0.5241126214650118, + "learning_rate": 4.979163306610105e-05, + "loss": 0.6648, + "step": 501 + }, + { + "epoch": 0.07025892232330301, + "grad_norm": 0.6545221839203675, + "learning_rate": 4.9790170329175754e-05, + "loss": 0.6634, + "step": 502 + }, + { + "epoch": 0.07039888033589924, + "grad_norm": 0.5292755961917179, + "learning_rate": 4.978870249761893e-05, + "loss": 0.6554, + "step": 503 + }, + { + "epoch": 0.07053883834849545, + "grad_norm": 0.5071567724521334, + "learning_rate": 4.978722957173222e-05, + "loss": 0.6741, + "step": 504 + }, + { + "epoch": 0.07067879636109167, + "grad_norm": 0.6987987933626845, + "learning_rate": 4.9785751551818336e-05, + "loss": 0.6845, + "step": 505 + }, + { + "epoch": 0.0708187543736879, + "grad_norm": 0.6505466788008574, + "learning_rate": 4.9784268438181016e-05, + "loss": 0.6632, + "step": 506 + }, + { + "epoch": 0.07095871238628411, + "grad_norm": 0.5780869067311343, + "learning_rate": 4.978278023112506e-05, + "loss": 0.7101, + "step": 507 + }, + { + "epoch": 0.07109867039888033, + "grad_norm": 0.5668906286656329, + "learning_rate": 4.978128693095632e-05, + "loss": 0.6727, + "step": 508 + }, + { + "epoch": 0.07123862841147656, + "grad_norm": 0.5631385798045004, + "learning_rate": 4.977978853798166e-05, + "loss": 0.6936, + "step": 509 + }, + { + "epoch": 0.07137858642407278, + "grad_norm": 0.563836937443064, + "learning_rate": 4.977828505250903e-05, + "loss": 0.6646, + "step": 510 + }, + { + "epoch": 0.071518544436669, + "grad_norm": 0.49771256088584315, + "learning_rate": 4.977677647484741e-05, + "loss": 0.6543, + "step": 511 + }, + { + "epoch": 0.07165850244926522, + "grad_norm": 0.813746974710708, + "learning_rate": 4.977526280530684e-05, + "loss": 0.6579, + "step": 512 + }, + { + "epoch": 0.07179846046186145, + "grad_norm": 0.6705561238466075, + "learning_rate": 4.977374404419837e-05, + "loss": 0.6773, + "step": 513 + }, + { + "epoch": 0.07193841847445766, + "grad_norm": 0.6410722179382549, + "learning_rate": 4.977222019183414e-05, + "loss": 0.7036, + "step": 514 + }, + { + "epoch": 0.07207837648705388, + "grad_norm": 0.49081412125581536, + "learning_rate": 4.977069124852731e-05, + "loss": 0.6944, + "step": 515 + }, + { + "epoch": 0.07221833449965011, + "grad_norm": 4.072584572211915, + "learning_rate": 4.976915721459209e-05, + "loss": 0.6678, + "step": 516 + }, + { + "epoch": 0.07235829251224632, + "grad_norm": 0.5026288646761555, + "learning_rate": 4.9767618090343745e-05, + "loss": 0.6564, + "step": 517 + }, + { + "epoch": 0.07249825052484254, + "grad_norm": 0.5165366931023583, + "learning_rate": 4.976607387609858e-05, + "loss": 0.6766, + "step": 518 + }, + { + "epoch": 0.07263820853743877, + "grad_norm": 0.5086677754272171, + "learning_rate": 4.976452457217394e-05, + "loss": 0.6468, + "step": 519 + }, + { + "epoch": 0.072778166550035, + "grad_norm": 0.5430265340871115, + "learning_rate": 4.976297017888824e-05, + "loss": 0.6543, + "step": 520 + }, + { + "epoch": 0.0729181245626312, + "grad_norm": 0.5436880201922443, + "learning_rate": 4.976141069656091e-05, + "loss": 0.6712, + "step": 521 + }, + { + "epoch": 0.07305808257522743, + "grad_norm": 0.5178999941373105, + "learning_rate": 4.975984612551243e-05, + "loss": 0.6676, + "step": 522 + }, + { + "epoch": 0.07319804058782366, + "grad_norm": 0.5814901447061906, + "learning_rate": 4.975827646606436e-05, + "loss": 0.6959, + "step": 523 + }, + { + "epoch": 0.07333799860041987, + "grad_norm": 0.5577926798263833, + "learning_rate": 4.975670171853926e-05, + "loss": 0.6691, + "step": 524 + }, + { + "epoch": 0.0734779566130161, + "grad_norm": 0.5368192741447849, + "learning_rate": 4.975512188326077e-05, + "loss": 0.7067, + "step": 525 + }, + { + "epoch": 0.07361791462561232, + "grad_norm": 0.8965061390425338, + "learning_rate": 4.9753536960553545e-05, + "loss": 0.6601, + "step": 526 + }, + { + "epoch": 0.07375787263820854, + "grad_norm": 0.8372680909100517, + "learning_rate": 4.975194695074333e-05, + "loss": 0.7079, + "step": 527 + }, + { + "epoch": 0.07389783065080475, + "grad_norm": 0.610185754465145, + "learning_rate": 4.9750351854156864e-05, + "loss": 0.6345, + "step": 528 + }, + { + "epoch": 0.07403778866340098, + "grad_norm": 0.5906986023369948, + "learning_rate": 4.9748751671121964e-05, + "loss": 0.6875, + "step": 529 + }, + { + "epoch": 0.0741777466759972, + "grad_norm": 0.5382974215866725, + "learning_rate": 4.9747146401967484e-05, + "loss": 0.6763, + "step": 530 + }, + { + "epoch": 0.07431770468859342, + "grad_norm": 0.5118350667595736, + "learning_rate": 4.9745536047023324e-05, + "loss": 0.693, + "step": 531 + }, + { + "epoch": 0.07445766270118964, + "grad_norm": 0.5342169754955468, + "learning_rate": 4.974392060662042e-05, + "loss": 0.6564, + "step": 532 + }, + { + "epoch": 0.07459762071378587, + "grad_norm": 0.494630398936876, + "learning_rate": 4.9742300081090774e-05, + "loss": 0.651, + "step": 533 + }, + { + "epoch": 0.07473757872638208, + "grad_norm": 0.5020465590373978, + "learning_rate": 4.974067447076742e-05, + "loss": 0.6924, + "step": 534 + }, + { + "epoch": 0.0748775367389783, + "grad_norm": 0.4949817767414362, + "learning_rate": 4.973904377598443e-05, + "loss": 0.6757, + "step": 535 + }, + { + "epoch": 0.07501749475157453, + "grad_norm": 0.503752121503341, + "learning_rate": 4.973740799707692e-05, + "loss": 0.6316, + "step": 536 + }, + { + "epoch": 0.07515745276417075, + "grad_norm": 0.5134153985614953, + "learning_rate": 4.973576713438108e-05, + "loss": 0.6768, + "step": 537 + }, + { + "epoch": 0.07529741077676697, + "grad_norm": 0.5293943650054048, + "learning_rate": 4.973412118823412e-05, + "loss": 0.6915, + "step": 538 + }, + { + "epoch": 0.07543736878936319, + "grad_norm": 0.5248241753853723, + "learning_rate": 4.973247015897428e-05, + "loss": 0.6878, + "step": 539 + }, + { + "epoch": 0.07557732680195942, + "grad_norm": 0.536619020426942, + "learning_rate": 4.973081404694088e-05, + "loss": 0.6913, + "step": 540 + }, + { + "epoch": 0.07571728481455563, + "grad_norm": 0.4893689470456731, + "learning_rate": 4.972915285247426e-05, + "loss": 0.6593, + "step": 541 + }, + { + "epoch": 0.07585724282715185, + "grad_norm": 0.5456811185536261, + "learning_rate": 4.9727486575915823e-05, + "loss": 0.6797, + "step": 542 + }, + { + "epoch": 0.07599720083974808, + "grad_norm": 0.547854073554072, + "learning_rate": 4.9725815217607994e-05, + "loss": 0.6729, + "step": 543 + }, + { + "epoch": 0.0761371588523443, + "grad_norm": 0.5192786810194786, + "learning_rate": 4.972413877789426e-05, + "loss": 0.6241, + "step": 544 + }, + { + "epoch": 0.07627711686494051, + "grad_norm": 0.47716140290292464, + "learning_rate": 4.972245725711914e-05, + "loss": 0.662, + "step": 545 + }, + { + "epoch": 0.07641707487753674, + "grad_norm": 0.5273241129689946, + "learning_rate": 4.972077065562821e-05, + "loss": 0.6603, + "step": 546 + }, + { + "epoch": 0.07655703289013296, + "grad_norm": 0.5220860206770274, + "learning_rate": 4.971907897376809e-05, + "loss": 0.6862, + "step": 547 + }, + { + "epoch": 0.07669699090272918, + "grad_norm": 0.4954377301463657, + "learning_rate": 4.971738221188643e-05, + "loss": 0.6357, + "step": 548 + }, + { + "epoch": 0.0768369489153254, + "grad_norm": 0.5031579615121048, + "learning_rate": 4.9715680370331926e-05, + "loss": 0.7242, + "step": 549 + }, + { + "epoch": 0.07697690692792163, + "grad_norm": 0.4883815277737791, + "learning_rate": 4.9713973449454335e-05, + "loss": 0.6764, + "step": 550 + }, + { + "epoch": 0.07711686494051785, + "grad_norm": 0.5324244397343785, + "learning_rate": 4.971226144960443e-05, + "loss": 0.7038, + "step": 551 + }, + { + "epoch": 0.07725682295311406, + "grad_norm": 0.5072266809492816, + "learning_rate": 4.971054437113406e-05, + "loss": 0.6778, + "step": 552 + }, + { + "epoch": 0.07739678096571029, + "grad_norm": 0.48810600146086575, + "learning_rate": 4.97088222143961e-05, + "loss": 0.6719, + "step": 553 + }, + { + "epoch": 0.07753673897830651, + "grad_norm": 0.5236886736407549, + "learning_rate": 4.970709497974447e-05, + "loss": 0.6472, + "step": 554 + }, + { + "epoch": 0.07767669699090272, + "grad_norm": 0.521109484700154, + "learning_rate": 4.9705362667534126e-05, + "loss": 0.637, + "step": 555 + }, + { + "epoch": 0.07781665500349895, + "grad_norm": 0.4825990181972682, + "learning_rate": 4.970362527812109e-05, + "loss": 0.6906, + "step": 556 + }, + { + "epoch": 0.07795661301609517, + "grad_norm": 0.5261802281784332, + "learning_rate": 4.970188281186241e-05, + "loss": 0.6585, + "step": 557 + }, + { + "epoch": 0.07809657102869139, + "grad_norm": 0.4913754461237493, + "learning_rate": 4.970013526911617e-05, + "loss": 0.6868, + "step": 558 + }, + { + "epoch": 0.07823652904128761, + "grad_norm": 0.5026456765855343, + "learning_rate": 4.969838265024151e-05, + "loss": 0.7251, + "step": 559 + }, + { + "epoch": 0.07837648705388384, + "grad_norm": 0.500311336929775, + "learning_rate": 4.969662495559862e-05, + "loss": 0.6589, + "step": 560 + }, + { + "epoch": 0.07851644506648006, + "grad_norm": 0.5085930979997164, + "learning_rate": 4.969486218554871e-05, + "loss": 0.6596, + "step": 561 + }, + { + "epoch": 0.07865640307907627, + "grad_norm": 0.5160633817299983, + "learning_rate": 4.9693094340454055e-05, + "loss": 0.6414, + "step": 562 + }, + { + "epoch": 0.0787963610916725, + "grad_norm": 0.5037304772878324, + "learning_rate": 4.969132142067797e-05, + "loss": 0.6766, + "step": 563 + }, + { + "epoch": 0.07893631910426872, + "grad_norm": 0.4848388430973453, + "learning_rate": 4.96895434265848e-05, + "loss": 0.6928, + "step": 564 + }, + { + "epoch": 0.07907627711686493, + "grad_norm": 0.5085119763538183, + "learning_rate": 4.968776035853996e-05, + "loss": 0.6998, + "step": 565 + }, + { + "epoch": 0.07921623512946116, + "grad_norm": 0.9757272802585409, + "learning_rate": 4.968597221690986e-05, + "loss": 0.6367, + "step": 566 + }, + { + "epoch": 0.07935619314205739, + "grad_norm": 0.49870651337329763, + "learning_rate": 4.9684179002062e-05, + "loss": 0.6814, + "step": 567 + }, + { + "epoch": 0.07949615115465361, + "grad_norm": 0.5044807130296783, + "learning_rate": 4.9682380714364897e-05, + "loss": 0.6108, + "step": 568 + }, + { + "epoch": 0.07963610916724982, + "grad_norm": 0.708447636643108, + "learning_rate": 4.968057735418812e-05, + "loss": 0.6621, + "step": 569 + }, + { + "epoch": 0.07977606717984605, + "grad_norm": 0.5176890020141557, + "learning_rate": 4.967876892190227e-05, + "loss": 0.6712, + "step": 570 + }, + { + "epoch": 0.07991602519244227, + "grad_norm": 0.5165402375548048, + "learning_rate": 4.967695541787901e-05, + "loss": 0.6516, + "step": 571 + }, + { + "epoch": 0.08005598320503848, + "grad_norm": 0.5023727855771402, + "learning_rate": 4.967513684249103e-05, + "loss": 0.6761, + "step": 572 + }, + { + "epoch": 0.08019594121763471, + "grad_norm": 0.5273373598923502, + "learning_rate": 4.967331319611206e-05, + "loss": 0.6513, + "step": 573 + }, + { + "epoch": 0.08033589923023093, + "grad_norm": 0.5038764795022782, + "learning_rate": 4.967148447911688e-05, + "loss": 0.6332, + "step": 574 + }, + { + "epoch": 0.08047585724282715, + "grad_norm": 0.5050569409944193, + "learning_rate": 4.966965069188132e-05, + "loss": 0.6617, + "step": 575 + }, + { + "epoch": 0.08061581525542337, + "grad_norm": 0.48919260228995454, + "learning_rate": 4.9667811834782224e-05, + "loss": 0.6654, + "step": 576 + }, + { + "epoch": 0.0807557732680196, + "grad_norm": 0.5353958441988039, + "learning_rate": 4.9665967908197506e-05, + "loss": 0.6592, + "step": 577 + }, + { + "epoch": 0.08089573128061582, + "grad_norm": 0.5361921590438847, + "learning_rate": 4.966411891250612e-05, + "loss": 0.674, + "step": 578 + }, + { + "epoch": 0.08103568929321203, + "grad_norm": 0.5342727473087102, + "learning_rate": 4.9662264848088034e-05, + "loss": 0.6885, + "step": 579 + }, + { + "epoch": 0.08117564730580826, + "grad_norm": 0.539903295130481, + "learning_rate": 4.96604057153243e-05, + "loss": 0.6662, + "step": 580 + }, + { + "epoch": 0.08131560531840448, + "grad_norm": 0.49180697521465716, + "learning_rate": 4.965854151459697e-05, + "loss": 0.6423, + "step": 581 + }, + { + "epoch": 0.0814555633310007, + "grad_norm": 0.5088216974014643, + "learning_rate": 4.965667224628916e-05, + "loss": 0.696, + "step": 582 + }, + { + "epoch": 0.08159552134359692, + "grad_norm": 0.49968640535207737, + "learning_rate": 4.965479791078502e-05, + "loss": 0.6543, + "step": 583 + }, + { + "epoch": 0.08173547935619314, + "grad_norm": 0.4859083259348156, + "learning_rate": 4.965291850846976e-05, + "loss": 0.6729, + "step": 584 + }, + { + "epoch": 0.08187543736878937, + "grad_norm": 0.4993128882253819, + "learning_rate": 4.96510340397296e-05, + "loss": 0.6521, + "step": 585 + }, + { + "epoch": 0.08201539538138558, + "grad_norm": 0.5193117598814236, + "learning_rate": 4.964914450495183e-05, + "loss": 0.6525, + "step": 586 + }, + { + "epoch": 0.0821553533939818, + "grad_norm": 0.5719370954745235, + "learning_rate": 4.964724990452476e-05, + "loss": 0.6622, + "step": 587 + }, + { + "epoch": 0.08229531140657803, + "grad_norm": 0.4700969122107685, + "learning_rate": 4.964535023883776e-05, + "loss": 0.6954, + "step": 588 + }, + { + "epoch": 0.08243526941917424, + "grad_norm": 0.484560721731152, + "learning_rate": 4.964344550828122e-05, + "loss": 0.6869, + "step": 589 + }, + { + "epoch": 0.08257522743177047, + "grad_norm": 0.4925497467039491, + "learning_rate": 4.964153571324658e-05, + "loss": 0.6675, + "step": 590 + }, + { + "epoch": 0.08271518544436669, + "grad_norm": 0.48466585548673363, + "learning_rate": 4.9639620854126326e-05, + "loss": 0.6593, + "step": 591 + }, + { + "epoch": 0.08285514345696292, + "grad_norm": 0.48477008272792094, + "learning_rate": 4.963770093131399e-05, + "loss": 0.7009, + "step": 592 + }, + { + "epoch": 0.08299510146955913, + "grad_norm": 0.5346681837659809, + "learning_rate": 4.963577594520412e-05, + "loss": 0.6611, + "step": 593 + }, + { + "epoch": 0.08313505948215535, + "grad_norm": 0.5094553222628161, + "learning_rate": 4.963384589619233e-05, + "loss": 0.6488, + "step": 594 + }, + { + "epoch": 0.08327501749475158, + "grad_norm": 0.5908935657202756, + "learning_rate": 4.9631910784675265e-05, + "loss": 0.6724, + "step": 595 + }, + { + "epoch": 0.08341497550734779, + "grad_norm": 0.515639184329749, + "learning_rate": 4.96299706110506e-05, + "loss": 0.6859, + "step": 596 + }, + { + "epoch": 0.08355493351994402, + "grad_norm": 0.55185217811717, + "learning_rate": 4.962802537571707e-05, + "loss": 0.6623, + "step": 597 + }, + { + "epoch": 0.08369489153254024, + "grad_norm": 0.49915022546087007, + "learning_rate": 4.962607507907444e-05, + "loss": 0.6648, + "step": 598 + }, + { + "epoch": 0.08383484954513645, + "grad_norm": 0.5284448258033697, + "learning_rate": 4.962411972152352e-05, + "loss": 0.6342, + "step": 599 + }, + { + "epoch": 0.08397480755773268, + "grad_norm": 0.4774184931571651, + "learning_rate": 4.9622159303466144e-05, + "loss": 0.686, + "step": 600 + }, + { + "epoch": 0.0841147655703289, + "grad_norm": 0.49606716653441585, + "learning_rate": 4.962019382530521e-05, + "loss": 0.6664, + "step": 601 + }, + { + "epoch": 0.08425472358292513, + "grad_norm": 0.5155895908215095, + "learning_rate": 4.9618223287444624e-05, + "loss": 0.693, + "step": 602 + }, + { + "epoch": 0.08439468159552134, + "grad_norm": 0.48698005510262393, + "learning_rate": 4.9616247690289375e-05, + "loss": 0.6824, + "step": 603 + }, + { + "epoch": 0.08453463960811756, + "grad_norm": 0.5096390231519464, + "learning_rate": 4.9614267034245454e-05, + "loss": 0.6709, + "step": 604 + }, + { + "epoch": 0.08467459762071379, + "grad_norm": 0.47879312610079056, + "learning_rate": 4.961228131971991e-05, + "loss": 0.6191, + "step": 605 + }, + { + "epoch": 0.08481455563331, + "grad_norm": 0.5036694948790285, + "learning_rate": 4.9610290547120835e-05, + "loss": 0.6677, + "step": 606 + }, + { + "epoch": 0.08495451364590623, + "grad_norm": 0.5387911945134125, + "learning_rate": 4.960829471685734e-05, + "loss": 0.7108, + "step": 607 + }, + { + "epoch": 0.08509447165850245, + "grad_norm": 0.48418619588152695, + "learning_rate": 4.9606293829339595e-05, + "loss": 0.6581, + "step": 608 + }, + { + "epoch": 0.08523442967109868, + "grad_norm": 0.46835555910688126, + "learning_rate": 4.9604287884978803e-05, + "loss": 0.6285, + "step": 609 + }, + { + "epoch": 0.08537438768369489, + "grad_norm": 0.49536583738424655, + "learning_rate": 4.9602276884187206e-05, + "loss": 0.6862, + "step": 610 + }, + { + "epoch": 0.08551434569629111, + "grad_norm": 0.5219805753845016, + "learning_rate": 4.9600260827378074e-05, + "loss": 0.7262, + "step": 611 + }, + { + "epoch": 0.08565430370888734, + "grad_norm": 0.5105322442141146, + "learning_rate": 4.959823971496574e-05, + "loss": 0.6818, + "step": 612 + }, + { + "epoch": 0.08579426172148355, + "grad_norm": 0.5306990676448388, + "learning_rate": 4.9596213547365566e-05, + "loss": 0.7141, + "step": 613 + }, + { + "epoch": 0.08593421973407978, + "grad_norm": 0.518971911880668, + "learning_rate": 4.959418232499394e-05, + "loss": 0.6447, + "step": 614 + }, + { + "epoch": 0.086074177746676, + "grad_norm": 0.48723426006398907, + "learning_rate": 4.959214604826831e-05, + "loss": 0.6691, + "step": 615 + }, + { + "epoch": 0.08621413575927221, + "grad_norm": 0.47908534192693764, + "learning_rate": 4.9590104717607135e-05, + "loss": 0.7039, + "step": 616 + }, + { + "epoch": 0.08635409377186844, + "grad_norm": 0.47477383891868286, + "learning_rate": 4.958805833342994e-05, + "loss": 0.6664, + "step": 617 + }, + { + "epoch": 0.08649405178446466, + "grad_norm": 0.4984833292893388, + "learning_rate": 4.958600689615728e-05, + "loss": 0.6829, + "step": 618 + }, + { + "epoch": 0.08663400979706089, + "grad_norm": 0.49546276658635413, + "learning_rate": 4.958395040621073e-05, + "loss": 0.6482, + "step": 619 + }, + { + "epoch": 0.0867739678096571, + "grad_norm": 0.51460494631631, + "learning_rate": 4.958188886401295e-05, + "loss": 0.684, + "step": 620 + }, + { + "epoch": 0.08691392582225332, + "grad_norm": 0.5030849606038066, + "learning_rate": 4.9579822269987574e-05, + "loss": 0.6708, + "step": 621 + }, + { + "epoch": 0.08705388383484955, + "grad_norm": 0.50101210673068, + "learning_rate": 4.957775062455933e-05, + "loss": 0.6639, + "step": 622 + }, + { + "epoch": 0.08719384184744576, + "grad_norm": 0.46852010673845496, + "learning_rate": 4.9575673928153957e-05, + "loss": 0.6603, + "step": 623 + }, + { + "epoch": 0.08733379986004199, + "grad_norm": 0.48823643441327075, + "learning_rate": 4.957359218119824e-05, + "loss": 0.6719, + "step": 624 + }, + { + "epoch": 0.08747375787263821, + "grad_norm": 0.4891031572821987, + "learning_rate": 4.957150538411999e-05, + "loss": 0.6656, + "step": 625 + }, + { + "epoch": 0.08761371588523444, + "grad_norm": 0.563701350648365, + "learning_rate": 4.956941353734807e-05, + "loss": 0.6663, + "step": 626 + }, + { + "epoch": 0.08775367389783065, + "grad_norm": 0.48687644197380436, + "learning_rate": 4.956731664131238e-05, + "loss": 0.6913, + "step": 627 + }, + { + "epoch": 0.08789363191042687, + "grad_norm": 0.4907175129397562, + "learning_rate": 4.956521469644384e-05, + "loss": 0.706, + "step": 628 + }, + { + "epoch": 0.0880335899230231, + "grad_norm": 0.5313788653762662, + "learning_rate": 4.9563107703174436e-05, + "loss": 0.6644, + "step": 629 + }, + { + "epoch": 0.08817354793561931, + "grad_norm": 0.5033134935760109, + "learning_rate": 4.956099566193717e-05, + "loss": 0.6654, + "step": 630 + }, + { + "epoch": 0.08831350594821553, + "grad_norm": 0.4589870741107357, + "learning_rate": 4.955887857316609e-05, + "loss": 0.6417, + "step": 631 + }, + { + "epoch": 0.08845346396081176, + "grad_norm": 0.48309495215754206, + "learning_rate": 4.955675643729628e-05, + "loss": 0.665, + "step": 632 + }, + { + "epoch": 0.08859342197340797, + "grad_norm": 0.5259807553857797, + "learning_rate": 4.955462925476385e-05, + "loss": 0.6945, + "step": 633 + }, + { + "epoch": 0.0887333799860042, + "grad_norm": 0.5163974444977445, + "learning_rate": 4.9552497026005974e-05, + "loss": 0.6972, + "step": 634 + }, + { + "epoch": 0.08887333799860042, + "grad_norm": 0.4973389672383331, + "learning_rate": 4.955035975146084e-05, + "loss": 0.6551, + "step": 635 + }, + { + "epoch": 0.08901329601119665, + "grad_norm": 0.47160543121321047, + "learning_rate": 4.9548217431567665e-05, + "loss": 0.6499, + "step": 636 + }, + { + "epoch": 0.08915325402379286, + "grad_norm": 0.4981939411665721, + "learning_rate": 4.954607006676675e-05, + "loss": 0.6541, + "step": 637 + }, + { + "epoch": 0.08929321203638908, + "grad_norm": 0.4495878467816103, + "learning_rate": 4.954391765749936e-05, + "loss": 0.6113, + "step": 638 + }, + { + "epoch": 0.08943317004898531, + "grad_norm": 0.47812065178812635, + "learning_rate": 4.954176020420788e-05, + "loss": 0.6269, + "step": 639 + }, + { + "epoch": 0.08957312806158152, + "grad_norm": 0.5080513968505054, + "learning_rate": 4.953959770733565e-05, + "loss": 0.7068, + "step": 640 + }, + { + "epoch": 0.08971308607417774, + "grad_norm": 0.5272266307181286, + "learning_rate": 4.95374301673271e-05, + "loss": 0.6986, + "step": 641 + }, + { + "epoch": 0.08985304408677397, + "grad_norm": 0.5013094523061675, + "learning_rate": 4.953525758462769e-05, + "loss": 0.6807, + "step": 642 + }, + { + "epoch": 0.0899930020993702, + "grad_norm": 0.6486797331723868, + "learning_rate": 4.95330799596839e-05, + "loss": 0.6414, + "step": 643 + }, + { + "epoch": 0.0901329601119664, + "grad_norm": 0.4872602306676202, + "learning_rate": 4.953089729294326e-05, + "loss": 0.6689, + "step": 644 + }, + { + "epoch": 0.09027291812456263, + "grad_norm": 0.5040432352418639, + "learning_rate": 4.952870958485432e-05, + "loss": 0.6865, + "step": 645 + }, + { + "epoch": 0.09041287613715886, + "grad_norm": 0.46173609966688006, + "learning_rate": 4.952651683586668e-05, + "loss": 0.6481, + "step": 646 + }, + { + "epoch": 0.09055283414975507, + "grad_norm": 0.45658386339702023, + "learning_rate": 4.952431904643097e-05, + "loss": 0.6539, + "step": 647 + }, + { + "epoch": 0.0906927921623513, + "grad_norm": 0.503432394082156, + "learning_rate": 4.952211621699887e-05, + "loss": 0.7104, + "step": 648 + }, + { + "epoch": 0.09083275017494752, + "grad_norm": 0.46517000786275325, + "learning_rate": 4.951990834802307e-05, + "loss": 0.6525, + "step": 649 + }, + { + "epoch": 0.09097270818754374, + "grad_norm": 0.5062356868953571, + "learning_rate": 4.951769543995731e-05, + "loss": 0.6823, + "step": 650 + }, + { + "epoch": 0.09111266620013996, + "grad_norm": 0.5119419441222778, + "learning_rate": 4.951547749325638e-05, + "loss": 0.6635, + "step": 651 + }, + { + "epoch": 0.09125262421273618, + "grad_norm": 0.46695225104802485, + "learning_rate": 4.951325450837607e-05, + "loss": 0.6971, + "step": 652 + }, + { + "epoch": 0.0913925822253324, + "grad_norm": 0.49434009422947434, + "learning_rate": 4.951102648577324e-05, + "loss": 0.6914, + "step": 653 + }, + { + "epoch": 0.09153254023792862, + "grad_norm": 0.4604670405526644, + "learning_rate": 4.950879342590577e-05, + "loss": 0.6519, + "step": 654 + }, + { + "epoch": 0.09167249825052484, + "grad_norm": 0.5184837228643522, + "learning_rate": 4.9506555329232574e-05, + "loss": 0.6886, + "step": 655 + }, + { + "epoch": 0.09181245626312107, + "grad_norm": 0.48694633832428874, + "learning_rate": 4.9504312196213596e-05, + "loss": 0.6636, + "step": 656 + }, + { + "epoch": 0.09195241427571728, + "grad_norm": 0.45470193445575985, + "learning_rate": 4.9502064027309836e-05, + "loss": 0.6402, + "step": 657 + }, + { + "epoch": 0.0920923722883135, + "grad_norm": 0.49263294738528823, + "learning_rate": 4.9499810822983314e-05, + "loss": 0.6513, + "step": 658 + }, + { + "epoch": 0.09223233030090973, + "grad_norm": 0.48137872453923874, + "learning_rate": 4.949755258369707e-05, + "loss": 0.6387, + "step": 659 + }, + { + "epoch": 0.09237228831350595, + "grad_norm": 0.5244416908958357, + "learning_rate": 4.949528930991522e-05, + "loss": 0.6634, + "step": 660 + }, + { + "epoch": 0.09251224632610217, + "grad_norm": 0.4835025657565747, + "learning_rate": 4.949302100210287e-05, + "loss": 0.6582, + "step": 661 + }, + { + "epoch": 0.09265220433869839, + "grad_norm": 0.4601647504352115, + "learning_rate": 4.9490747660726186e-05, + "loss": 0.633, + "step": 662 + }, + { + "epoch": 0.09279216235129462, + "grad_norm": 0.4807820429695223, + "learning_rate": 4.948846928625236e-05, + "loss": 0.6484, + "step": 663 + }, + { + "epoch": 0.09293212036389083, + "grad_norm": 0.4621826268890677, + "learning_rate": 4.948618587914963e-05, + "loss": 0.6201, + "step": 664 + }, + { + "epoch": 0.09307207837648705, + "grad_norm": 0.5355555893122882, + "learning_rate": 4.9483897439887256e-05, + "loss": 0.6939, + "step": 665 + }, + { + "epoch": 0.09321203638908328, + "grad_norm": 0.48631181937028944, + "learning_rate": 4.948160396893553e-05, + "loss": 0.6463, + "step": 666 + }, + { + "epoch": 0.0933519944016795, + "grad_norm": 0.4699689810513121, + "learning_rate": 4.947930546676579e-05, + "loss": 0.6417, + "step": 667 + }, + { + "epoch": 0.09349195241427571, + "grad_norm": 0.5070622795052616, + "learning_rate": 4.94770019338504e-05, + "loss": 0.6536, + "step": 668 + }, + { + "epoch": 0.09363191042687194, + "grad_norm": 0.5074909040261282, + "learning_rate": 4.947469337066275e-05, + "loss": 0.6313, + "step": 669 + }, + { + "epoch": 0.09377186843946816, + "grad_norm": 0.5145354321514217, + "learning_rate": 4.9472379777677287e-05, + "loss": 0.6679, + "step": 670 + }, + { + "epoch": 0.09391182645206438, + "grad_norm": 0.45833527492215737, + "learning_rate": 4.947006115536947e-05, + "loss": 0.6604, + "step": 671 + }, + { + "epoch": 0.0940517844646606, + "grad_norm": 0.45998883685850256, + "learning_rate": 4.9467737504215805e-05, + "loss": 0.6484, + "step": 672 + }, + { + "epoch": 0.09419174247725683, + "grad_norm": 0.46701832176416125, + "learning_rate": 4.946540882469381e-05, + "loss": 0.6622, + "step": 673 + }, + { + "epoch": 0.09433170048985304, + "grad_norm": 0.4937012715997403, + "learning_rate": 4.946307511728208e-05, + "loss": 0.6423, + "step": 674 + }, + { + "epoch": 0.09447165850244926, + "grad_norm": 0.45944259724419273, + "learning_rate": 4.9460736382460195e-05, + "loss": 0.6263, + "step": 675 + }, + { + "epoch": 0.09461161651504549, + "grad_norm": 0.4953211118671407, + "learning_rate": 4.945839262070879e-05, + "loss": 0.6401, + "step": 676 + }, + { + "epoch": 0.09475157452764171, + "grad_norm": 0.4954445380856775, + "learning_rate": 4.9456043832509544e-05, + "loss": 0.6766, + "step": 677 + }, + { + "epoch": 0.09489153254023792, + "grad_norm": 0.4756639472671179, + "learning_rate": 4.9453690018345144e-05, + "loss": 0.6516, + "step": 678 + }, + { + "epoch": 0.09503149055283415, + "grad_norm": 0.5216854708766936, + "learning_rate": 4.9451331178699324e-05, + "loss": 0.6233, + "step": 679 + }, + { + "epoch": 0.09517144856543037, + "grad_norm": 0.5149096513895491, + "learning_rate": 4.944896731405686e-05, + "loss": 0.6561, + "step": 680 + }, + { + "epoch": 0.09531140657802659, + "grad_norm": 0.4884876631431996, + "learning_rate": 4.944659842490354e-05, + "loss": 0.6594, + "step": 681 + }, + { + "epoch": 0.09545136459062281, + "grad_norm": 0.4960871887261993, + "learning_rate": 4.944422451172619e-05, + "loss": 0.6391, + "step": 682 + }, + { + "epoch": 0.09559132260321904, + "grad_norm": 0.49440117471489403, + "learning_rate": 4.944184557501269e-05, + "loss": 0.6714, + "step": 683 + }, + { + "epoch": 0.09573128061581526, + "grad_norm": 0.47570363710472247, + "learning_rate": 4.943946161525192e-05, + "loss": 0.6786, + "step": 684 + }, + { + "epoch": 0.09587123862841147, + "grad_norm": 0.48841469758633105, + "learning_rate": 4.9437072632933814e-05, + "loss": 0.6932, + "step": 685 + }, + { + "epoch": 0.0960111966410077, + "grad_norm": 0.47367349893999716, + "learning_rate": 4.943467862854934e-05, + "loss": 0.6331, + "step": 686 + }, + { + "epoch": 0.09615115465360392, + "grad_norm": 0.4966192406884947, + "learning_rate": 4.943227960259048e-05, + "loss": 0.6632, + "step": 687 + }, + { + "epoch": 0.09629111266620013, + "grad_norm": 0.46145851197097154, + "learning_rate": 4.942987555555026e-05, + "loss": 0.6428, + "step": 688 + }, + { + "epoch": 0.09643107067879636, + "grad_norm": 0.47559863610819186, + "learning_rate": 4.942746648792274e-05, + "loss": 0.6499, + "step": 689 + }, + { + "epoch": 0.09657102869139259, + "grad_norm": 0.46991178970104935, + "learning_rate": 4.9425052400203e-05, + "loss": 0.6134, + "step": 690 + }, + { + "epoch": 0.0967109867039888, + "grad_norm": 0.486056912045251, + "learning_rate": 4.942263329288716e-05, + "loss": 0.6594, + "step": 691 + }, + { + "epoch": 0.09685094471658502, + "grad_norm": 0.5045620121297718, + "learning_rate": 4.942020916647238e-05, + "loss": 0.6606, + "step": 692 + }, + { + "epoch": 0.09699090272918125, + "grad_norm": 0.49357726954668424, + "learning_rate": 4.941778002145684e-05, + "loss": 0.6436, + "step": 693 + }, + { + "epoch": 0.09713086074177747, + "grad_norm": 0.44767321778450064, + "learning_rate": 4.941534585833975e-05, + "loss": 0.6662, + "step": 694 + }, + { + "epoch": 0.09727081875437368, + "grad_norm": 0.49515786663501604, + "learning_rate": 4.941290667762136e-05, + "loss": 0.6441, + "step": 695 + }, + { + "epoch": 0.09741077676696991, + "grad_norm": 0.5193186895726914, + "learning_rate": 4.9410462479802945e-05, + "loss": 0.6667, + "step": 696 + }, + { + "epoch": 0.09755073477956613, + "grad_norm": 0.48819228314517343, + "learning_rate": 4.94080132653868e-05, + "loss": 0.6525, + "step": 697 + }, + { + "epoch": 0.09769069279216235, + "grad_norm": 0.47504662151869315, + "learning_rate": 4.940555903487628e-05, + "loss": 0.6507, + "step": 698 + }, + { + "epoch": 0.09783065080475857, + "grad_norm": 0.5087429079412467, + "learning_rate": 4.9403099788775754e-05, + "loss": 0.6885, + "step": 699 + }, + { + "epoch": 0.0979706088173548, + "grad_norm": 0.4991373858117139, + "learning_rate": 4.940063552759061e-05, + "loss": 0.6614, + "step": 700 + }, + { + "epoch": 0.09811056682995102, + "grad_norm": 0.5030115759681447, + "learning_rate": 4.939816625182729e-05, + "loss": 0.6415, + "step": 701 + }, + { + "epoch": 0.09825052484254723, + "grad_norm": 0.47371776821243294, + "learning_rate": 4.939569196199325e-05, + "loss": 0.6391, + "step": 702 + }, + { + "epoch": 0.09839048285514346, + "grad_norm": 0.4881159343567594, + "learning_rate": 4.9393212658596976e-05, + "loss": 0.6244, + "step": 703 + }, + { + "epoch": 0.09853044086773968, + "grad_norm": 0.49870488221888004, + "learning_rate": 4.9390728342148006e-05, + "loss": 0.6616, + "step": 704 + }, + { + "epoch": 0.0986703988803359, + "grad_norm": 0.4768129759031237, + "learning_rate": 4.938823901315688e-05, + "loss": 0.6559, + "step": 705 + }, + { + "epoch": 0.09881035689293212, + "grad_norm": 0.4729780422275052, + "learning_rate": 4.938574467213518e-05, + "loss": 0.6256, + "step": 706 + }, + { + "epoch": 0.09895031490552834, + "grad_norm": 0.49227905209471745, + "learning_rate": 4.9383245319595514e-05, + "loss": 0.6564, + "step": 707 + }, + { + "epoch": 0.09909027291812457, + "grad_norm": 0.49477666423952565, + "learning_rate": 4.9380740956051545e-05, + "loss": 0.6031, + "step": 708 + }, + { + "epoch": 0.09923023093072078, + "grad_norm": 0.4711224186000299, + "learning_rate": 4.9378231582017926e-05, + "loss": 0.6302, + "step": 709 + }, + { + "epoch": 0.099370188943317, + "grad_norm": 0.49742927700008693, + "learning_rate": 4.9375717198010366e-05, + "loss": 0.6732, + "step": 710 + }, + { + "epoch": 0.09951014695591323, + "grad_norm": 0.4750405851316029, + "learning_rate": 4.937319780454559e-05, + "loss": 0.6852, + "step": 711 + }, + { + "epoch": 0.09965010496850944, + "grad_norm": 0.5319507911786844, + "learning_rate": 4.937067340214137e-05, + "loss": 0.6426, + "step": 712 + }, + { + "epoch": 0.09979006298110567, + "grad_norm": 0.5222169180449785, + "learning_rate": 4.936814399131648e-05, + "loss": 0.6738, + "step": 713 + }, + { + "epoch": 0.09993002099370189, + "grad_norm": 0.5105935227729471, + "learning_rate": 4.936560957259077e-05, + "loss": 0.6668, + "step": 714 + }, + { + "epoch": 0.1000699790062981, + "grad_norm": 0.5163958109604428, + "learning_rate": 4.9363070146485044e-05, + "loss": 0.6986, + "step": 715 + }, + { + "epoch": 0.10020993701889433, + "grad_norm": 0.5021401060702906, + "learning_rate": 4.936052571352122e-05, + "loss": 0.6468, + "step": 716 + }, + { + "epoch": 0.10034989503149055, + "grad_norm": 0.4685192631602669, + "learning_rate": 4.9357976274222185e-05, + "loss": 0.6609, + "step": 717 + }, + { + "epoch": 0.10048985304408678, + "grad_norm": 0.48726170019620224, + "learning_rate": 4.935542182911188e-05, + "loss": 0.6761, + "step": 718 + }, + { + "epoch": 0.10062981105668299, + "grad_norm": 0.48175436628421253, + "learning_rate": 4.935286237871527e-05, + "loss": 0.645, + "step": 719 + }, + { + "epoch": 0.10076976906927922, + "grad_norm": 0.49385381693887187, + "learning_rate": 4.935029792355834e-05, + "loss": 0.6243, + "step": 720 + }, + { + "epoch": 0.10090972708187544, + "grad_norm": 0.4743721480397386, + "learning_rate": 4.934772846416812e-05, + "loss": 0.6999, + "step": 721 + }, + { + "epoch": 0.10104968509447165, + "grad_norm": 0.49107710855800335, + "learning_rate": 4.934515400107266e-05, + "loss": 0.6637, + "step": 722 + }, + { + "epoch": 0.10118964310706788, + "grad_norm": 0.49159205634657455, + "learning_rate": 4.934257453480103e-05, + "loss": 0.6218, + "step": 723 + }, + { + "epoch": 0.1013296011196641, + "grad_norm": 0.49141594694855456, + "learning_rate": 4.933999006588335e-05, + "loss": 0.674, + "step": 724 + }, + { + "epoch": 0.10146955913226033, + "grad_norm": 0.44737735842488807, + "learning_rate": 4.933740059485075e-05, + "loss": 0.6446, + "step": 725 + }, + { + "epoch": 0.10160951714485654, + "grad_norm": 0.4856033404998713, + "learning_rate": 4.9334806122235376e-05, + "loss": 0.6557, + "step": 726 + }, + { + "epoch": 0.10174947515745277, + "grad_norm": 0.48253432462131046, + "learning_rate": 4.933220664857044e-05, + "loss": 0.6452, + "step": 727 + }, + { + "epoch": 0.10188943317004899, + "grad_norm": 0.5065000807014411, + "learning_rate": 4.9329602174390153e-05, + "loss": 0.6936, + "step": 728 + }, + { + "epoch": 0.1020293911826452, + "grad_norm": 0.46004243943931644, + "learning_rate": 4.932699270022976e-05, + "loss": 0.6666, + "step": 729 + }, + { + "epoch": 0.10216934919524143, + "grad_norm": 0.4729412035182845, + "learning_rate": 4.932437822662553e-05, + "loss": 0.619, + "step": 730 + }, + { + "epoch": 0.10230930720783765, + "grad_norm": 0.4829984181448435, + "learning_rate": 4.932175875411478e-05, + "loss": 0.6387, + "step": 731 + }, + { + "epoch": 0.10244926522043386, + "grad_norm": 0.4826696857482811, + "learning_rate": 4.931913428323581e-05, + "loss": 0.6433, + "step": 732 + }, + { + "epoch": 0.10258922323303009, + "grad_norm": 0.45648188028698533, + "learning_rate": 4.931650481452801e-05, + "loss": 0.6035, + "step": 733 + }, + { + "epoch": 0.10272918124562631, + "grad_norm": 0.4691725557215748, + "learning_rate": 4.931387034853173e-05, + "loss": 0.6362, + "step": 734 + }, + { + "epoch": 0.10286913925822254, + "grad_norm": 0.49511735363849857, + "learning_rate": 4.93112308857884e-05, + "loss": 0.6128, + "step": 735 + }, + { + "epoch": 0.10300909727081875, + "grad_norm": 0.5067311529916844, + "learning_rate": 4.9308586426840454e-05, + "loss": 0.6581, + "step": 736 + }, + { + "epoch": 0.10314905528341498, + "grad_norm": 0.49461941413672095, + "learning_rate": 4.9305936972231346e-05, + "loss": 0.6545, + "step": 737 + }, + { + "epoch": 0.1032890132960112, + "grad_norm": 0.46083627671042743, + "learning_rate": 4.9303282522505565e-05, + "loss": 0.6384, + "step": 738 + }, + { + "epoch": 0.10342897130860741, + "grad_norm": 0.48817979810198836, + "learning_rate": 4.930062307820865e-05, + "loss": 0.6392, + "step": 739 + }, + { + "epoch": 0.10356892932120364, + "grad_norm": 0.4835520562427775, + "learning_rate": 4.9297958639887116e-05, + "loss": 0.6333, + "step": 740 + }, + { + "epoch": 0.10370888733379986, + "grad_norm": 0.5167572191592301, + "learning_rate": 4.929528920808854e-05, + "loss": 0.635, + "step": 741 + }, + { + "epoch": 0.10384884534639609, + "grad_norm": 0.4595745343183344, + "learning_rate": 4.9292614783361536e-05, + "loss": 0.6376, + "step": 742 + }, + { + "epoch": 0.1039888033589923, + "grad_norm": 0.43859471943688627, + "learning_rate": 4.92899353662557e-05, + "loss": 0.6585, + "step": 743 + }, + { + "epoch": 0.10412876137158852, + "grad_norm": 0.4953753280512198, + "learning_rate": 4.928725095732169e-05, + "loss": 0.6396, + "step": 744 + }, + { + "epoch": 0.10426871938418475, + "grad_norm": 0.4880410414717604, + "learning_rate": 4.928456155711117e-05, + "loss": 0.6403, + "step": 745 + }, + { + "epoch": 0.10440867739678096, + "grad_norm": 0.4737693275569935, + "learning_rate": 4.928186716617686e-05, + "loss": 0.6301, + "step": 746 + }, + { + "epoch": 0.10454863540937719, + "grad_norm": 0.5034731514446417, + "learning_rate": 4.927916778507248e-05, + "loss": 0.6729, + "step": 747 + }, + { + "epoch": 0.10468859342197341, + "grad_norm": 0.46500169965537136, + "learning_rate": 4.9276463414352757e-05, + "loss": 0.6296, + "step": 748 + }, + { + "epoch": 0.10482855143456962, + "grad_norm": 0.48081470356503164, + "learning_rate": 4.9273754054573496e-05, + "loss": 0.614, + "step": 749 + }, + { + "epoch": 0.10496850944716585, + "grad_norm": 0.4506016223155311, + "learning_rate": 4.927103970629148e-05, + "loss": 0.6313, + "step": 750 + }, + { + "epoch": 0.10510846745976207, + "grad_norm": 0.4806095102495363, + "learning_rate": 4.926832037006453e-05, + "loss": 0.6862, + "step": 751 + }, + { + "epoch": 0.1052484254723583, + "grad_norm": 0.5264224078411822, + "learning_rate": 4.926559604645152e-05, + "loss": 0.6387, + "step": 752 + }, + { + "epoch": 0.10538838348495451, + "grad_norm": 0.4993673425121446, + "learning_rate": 4.9262866736012304e-05, + "loss": 0.6761, + "step": 753 + }, + { + "epoch": 0.10552834149755073, + "grad_norm": 0.5130179673023382, + "learning_rate": 4.926013243930779e-05, + "loss": 0.6582, + "step": 754 + }, + { + "epoch": 0.10566829951014696, + "grad_norm": 0.5161547254751858, + "learning_rate": 4.925739315689991e-05, + "loss": 0.65, + "step": 755 + }, + { + "epoch": 0.10580825752274317, + "grad_norm": 3.5932005324517964, + "learning_rate": 4.925464888935162e-05, + "loss": 0.6722, + "step": 756 + }, + { + "epoch": 0.1059482155353394, + "grad_norm": 0.47238634638015287, + "learning_rate": 4.925189963722687e-05, + "loss": 0.6734, + "step": 757 + }, + { + "epoch": 0.10608817354793562, + "grad_norm": 0.4958610236105621, + "learning_rate": 4.924914540109068e-05, + "loss": 0.705, + "step": 758 + }, + { + "epoch": 0.10622813156053185, + "grad_norm": 0.47904033540016167, + "learning_rate": 4.924638618150906e-05, + "loss": 0.6611, + "step": 759 + }, + { + "epoch": 0.10636808957312806, + "grad_norm": 0.4595734849503931, + "learning_rate": 4.924362197904908e-05, + "loss": 0.637, + "step": 760 + }, + { + "epoch": 0.10650804758572428, + "grad_norm": 0.4706049650368603, + "learning_rate": 4.924085279427879e-05, + "loss": 0.607, + "step": 761 + }, + { + "epoch": 0.10664800559832051, + "grad_norm": 0.4829263701881818, + "learning_rate": 4.923807862776728e-05, + "loss": 0.6311, + "step": 762 + }, + { + "epoch": 0.10678796361091672, + "grad_norm": 0.4658603465917324, + "learning_rate": 4.92352994800847e-05, + "loss": 0.6575, + "step": 763 + }, + { + "epoch": 0.10692792162351294, + "grad_norm": 0.48975156023608535, + "learning_rate": 4.9232515351802166e-05, + "loss": 0.6754, + "step": 764 + }, + { + "epoch": 0.10706787963610917, + "grad_norm": 0.4574370958416068, + "learning_rate": 4.922972624349185e-05, + "loss": 0.6556, + "step": 765 + }, + { + "epoch": 0.1072078376487054, + "grad_norm": 0.47320816538424315, + "learning_rate": 4.922693215572695e-05, + "loss": 0.6808, + "step": 766 + }, + { + "epoch": 0.1073477956613016, + "grad_norm": 0.4938979904546163, + "learning_rate": 4.9224133089081675e-05, + "loss": 0.6396, + "step": 767 + }, + { + "epoch": 0.10748775367389783, + "grad_norm": 0.4815436050811902, + "learning_rate": 4.922132904413126e-05, + "loss": 0.6885, + "step": 768 + }, + { + "epoch": 0.10762771168649406, + "grad_norm": 0.4526917995255154, + "learning_rate": 4.921852002145196e-05, + "loss": 0.6613, + "step": 769 + }, + { + "epoch": 0.10776766969909027, + "grad_norm": 0.48451528868058824, + "learning_rate": 4.921570602162108e-05, + "loss": 0.6273, + "step": 770 + }, + { + "epoch": 0.1079076277116865, + "grad_norm": 0.44910249964176424, + "learning_rate": 4.921288704521689e-05, + "loss": 0.6445, + "step": 771 + }, + { + "epoch": 0.10804758572428272, + "grad_norm": 0.4803448598027863, + "learning_rate": 4.9210063092818755e-05, + "loss": 0.6369, + "step": 772 + }, + { + "epoch": 0.10818754373687893, + "grad_norm": 0.500963530964286, + "learning_rate": 4.9207234165007e-05, + "loss": 0.6572, + "step": 773 + }, + { + "epoch": 0.10832750174947516, + "grad_norm": 0.4929298522075444, + "learning_rate": 4.920440026236301e-05, + "loss": 0.6346, + "step": 774 + }, + { + "epoch": 0.10846745976207138, + "grad_norm": 0.45662320216713365, + "learning_rate": 4.920156138546917e-05, + "loss": 0.6593, + "step": 775 + }, + { + "epoch": 0.1086074177746676, + "grad_norm": 0.46973861985195237, + "learning_rate": 4.919871753490891e-05, + "loss": 0.6906, + "step": 776 + }, + { + "epoch": 0.10874737578726382, + "grad_norm": 0.47814740177046505, + "learning_rate": 4.919586871126667e-05, + "loss": 0.646, + "step": 777 + }, + { + "epoch": 0.10888733379986004, + "grad_norm": 0.5173386479220408, + "learning_rate": 4.91930149151279e-05, + "loss": 0.6629, + "step": 778 + }, + { + "epoch": 0.10902729181245627, + "grad_norm": 0.47384818148671254, + "learning_rate": 4.9190156147079094e-05, + "loss": 0.6458, + "step": 779 + }, + { + "epoch": 0.10916724982505248, + "grad_norm": 0.4998121952757577, + "learning_rate": 4.918729240770775e-05, + "loss": 0.6627, + "step": 780 + }, + { + "epoch": 0.1093072078376487, + "grad_norm": 0.49480732704046404, + "learning_rate": 4.918442369760241e-05, + "loss": 0.6503, + "step": 781 + }, + { + "epoch": 0.10944716585024493, + "grad_norm": 0.48089249137852297, + "learning_rate": 4.9181550017352615e-05, + "loss": 0.6334, + "step": 782 + }, + { + "epoch": 0.10958712386284115, + "grad_norm": 0.46605303630427286, + "learning_rate": 4.917867136754893e-05, + "loss": 0.598, + "step": 783 + }, + { + "epoch": 0.10972708187543737, + "grad_norm": 0.4766867229123813, + "learning_rate": 4.9175787748782955e-05, + "loss": 0.6478, + "step": 784 + }, + { + "epoch": 0.10986703988803359, + "grad_norm": 0.47706432841684887, + "learning_rate": 4.9172899161647295e-05, + "loss": 0.6579, + "step": 785 + }, + { + "epoch": 0.11000699790062982, + "grad_norm": 0.47350025007351243, + "learning_rate": 4.9170005606735594e-05, + "loss": 0.622, + "step": 786 + }, + { + "epoch": 0.11014695591322603, + "grad_norm": 0.4737951373989515, + "learning_rate": 4.9167107084642496e-05, + "loss": 0.6186, + "step": 787 + }, + { + "epoch": 0.11028691392582225, + "grad_norm": 0.4973982793226407, + "learning_rate": 4.916420359596368e-05, + "loss": 0.6041, + "step": 788 + }, + { + "epoch": 0.11042687193841848, + "grad_norm": 0.45566949261190215, + "learning_rate": 4.916129514129585e-05, + "loss": 0.6196, + "step": 789 + }, + { + "epoch": 0.11056682995101469, + "grad_norm": 0.47407807015226433, + "learning_rate": 4.915838172123671e-05, + "loss": 0.5968, + "step": 790 + }, + { + "epoch": 0.11070678796361091, + "grad_norm": 0.4508773406494618, + "learning_rate": 4.915546333638501e-05, + "loss": 0.6558, + "step": 791 + }, + { + "epoch": 0.11084674597620714, + "grad_norm": 0.47903719726456023, + "learning_rate": 4.915253998734051e-05, + "loss": 0.6281, + "step": 792 + }, + { + "epoch": 0.11098670398880336, + "grad_norm": 0.4864004849243157, + "learning_rate": 4.914961167470396e-05, + "loss": 0.6632, + "step": 793 + }, + { + "epoch": 0.11112666200139958, + "grad_norm": 0.4828968005003782, + "learning_rate": 4.9146678399077196e-05, + "loss": 0.6641, + "step": 794 + }, + { + "epoch": 0.1112666200139958, + "grad_norm": 0.5082783883297906, + "learning_rate": 4.9143740161063015e-05, + "loss": 0.6881, + "step": 795 + }, + { + "epoch": 0.11140657802659203, + "grad_norm": 0.4811155277843086, + "learning_rate": 4.914079696126526e-05, + "loss": 0.6542, + "step": 796 + }, + { + "epoch": 0.11154653603918824, + "grad_norm": 0.4697422705172665, + "learning_rate": 4.913784880028878e-05, + "loss": 0.6398, + "step": 797 + }, + { + "epoch": 0.11168649405178446, + "grad_norm": 0.48609960976538413, + "learning_rate": 4.9134895678739456e-05, + "loss": 0.6361, + "step": 798 + }, + { + "epoch": 0.11182645206438069, + "grad_norm": 0.4505917244706054, + "learning_rate": 4.9131937597224185e-05, + "loss": 0.6531, + "step": 799 + }, + { + "epoch": 0.11196641007697691, + "grad_norm": 0.4609463571323594, + "learning_rate": 4.912897455635089e-05, + "loss": 0.6091, + "step": 800 + }, + { + "epoch": 0.11210636808957312, + "grad_norm": 0.4633046052805601, + "learning_rate": 4.91260065567285e-05, + "loss": 0.6326, + "step": 801 + }, + { + "epoch": 0.11224632610216935, + "grad_norm": 0.48176393499918024, + "learning_rate": 4.912303359896697e-05, + "loss": 0.6256, + "step": 802 + }, + { + "epoch": 0.11238628411476558, + "grad_norm": 0.44238158654462206, + "learning_rate": 4.912005568367727e-05, + "loss": 0.617, + "step": 803 + }, + { + "epoch": 0.11252624212736179, + "grad_norm": 0.46784130274686425, + "learning_rate": 4.91170728114714e-05, + "loss": 0.6194, + "step": 804 + }, + { + "epoch": 0.11266620013995801, + "grad_norm": 0.4813594149903505, + "learning_rate": 4.9114084982962356e-05, + "loss": 0.6809, + "step": 805 + }, + { + "epoch": 0.11280615815255424, + "grad_norm": 0.4746068016576439, + "learning_rate": 4.911109219876417e-05, + "loss": 0.6482, + "step": 806 + }, + { + "epoch": 0.11294611616515045, + "grad_norm": 0.4837726952509842, + "learning_rate": 4.9108094459491916e-05, + "loss": 0.593, + "step": 807 + }, + { + "epoch": 0.11308607417774667, + "grad_norm": 0.4939387206527619, + "learning_rate": 4.910509176576162e-05, + "loss": 0.6372, + "step": 808 + }, + { + "epoch": 0.1132260321903429, + "grad_norm": 0.49309033390988954, + "learning_rate": 4.910208411819039e-05, + "loss": 0.6212, + "step": 809 + }, + { + "epoch": 0.11336599020293912, + "grad_norm": 0.48931195891148005, + "learning_rate": 4.909907151739633e-05, + "loss": 0.6668, + "step": 810 + }, + { + "epoch": 0.11350594821553533, + "grad_norm": 0.47504035399989575, + "learning_rate": 4.909605396399856e-05, + "loss": 0.66, + "step": 811 + }, + { + "epoch": 0.11364590622813156, + "grad_norm": 0.475509126808091, + "learning_rate": 4.90930314586172e-05, + "loss": 0.6693, + "step": 812 + }, + { + "epoch": 0.11378586424072779, + "grad_norm": 0.46395502226780827, + "learning_rate": 4.909000400187341e-05, + "loss": 0.6028, + "step": 813 + }, + { + "epoch": 0.113925822253324, + "grad_norm": 0.48820830409318683, + "learning_rate": 4.908697159438937e-05, + "loss": 0.62, + "step": 814 + }, + { + "epoch": 0.11406578026592022, + "grad_norm": 0.4909040802882437, + "learning_rate": 4.908393423678829e-05, + "loss": 0.6261, + "step": 815 + }, + { + "epoch": 0.11420573827851645, + "grad_norm": 0.4853877606631391, + "learning_rate": 4.908089192969434e-05, + "loss": 0.6574, + "step": 816 + }, + { + "epoch": 0.11434569629111267, + "grad_norm": 0.4604083047219368, + "learning_rate": 4.907784467373277e-05, + "loss": 0.6091, + "step": 817 + }, + { + "epoch": 0.11448565430370888, + "grad_norm": 0.4533069892751884, + "learning_rate": 4.9074792469529815e-05, + "loss": 0.6021, + "step": 818 + }, + { + "epoch": 0.11462561231630511, + "grad_norm": 0.4520978783038299, + "learning_rate": 4.907173531771273e-05, + "loss": 0.6427, + "step": 819 + }, + { + "epoch": 0.11476557032890133, + "grad_norm": 0.45052658755884484, + "learning_rate": 4.9068673218909796e-05, + "loss": 0.6294, + "step": 820 + }, + { + "epoch": 0.11490552834149755, + "grad_norm": 0.4793207729603878, + "learning_rate": 4.90656061737503e-05, + "loss": 0.6575, + "step": 821 + }, + { + "epoch": 0.11504548635409377, + "grad_norm": 0.4713321633366388, + "learning_rate": 4.906253418286456e-05, + "loss": 0.6507, + "step": 822 + }, + { + "epoch": 0.11518544436669, + "grad_norm": 0.4933569292067938, + "learning_rate": 4.90594572468839e-05, + "loss": 0.6084, + "step": 823 + }, + { + "epoch": 0.11532540237928622, + "grad_norm": 0.4727488359524651, + "learning_rate": 4.9056375366440654e-05, + "loss": 0.6047, + "step": 824 + }, + { + "epoch": 0.11546536039188243, + "grad_norm": 0.4790836053521045, + "learning_rate": 4.9053288542168185e-05, + "loss": 0.6496, + "step": 825 + }, + { + "epoch": 0.11560531840447866, + "grad_norm": 0.4832677203184707, + "learning_rate": 4.905019677470086e-05, + "loss": 0.6247, + "step": 826 + }, + { + "epoch": 0.11574527641707488, + "grad_norm": 0.48906934804664143, + "learning_rate": 4.9047100064674076e-05, + "loss": 0.6419, + "step": 827 + }, + { + "epoch": 0.1158852344296711, + "grad_norm": 0.44450056169083824, + "learning_rate": 4.904399841272423e-05, + "loss": 0.6284, + "step": 828 + }, + { + "epoch": 0.11602519244226732, + "grad_norm": 0.4736815158989831, + "learning_rate": 4.9040891819488766e-05, + "loss": 0.6254, + "step": 829 + }, + { + "epoch": 0.11616515045486354, + "grad_norm": 0.44945213217864644, + "learning_rate": 4.903778028560609e-05, + "loss": 0.6441, + "step": 830 + }, + { + "epoch": 0.11630510846745976, + "grad_norm": 0.5007544417322715, + "learning_rate": 4.903466381171568e-05, + "loss": 0.6898, + "step": 831 + }, + { + "epoch": 0.11644506648005598, + "grad_norm": 0.5281054716405869, + "learning_rate": 4.9031542398457974e-05, + "loss": 0.6338, + "step": 832 + }, + { + "epoch": 0.1165850244926522, + "grad_norm": 0.484478263124759, + "learning_rate": 4.902841604647448e-05, + "loss": 0.6164, + "step": 833 + }, + { + "epoch": 0.11672498250524843, + "grad_norm": 0.44994143276964366, + "learning_rate": 4.902528475640768e-05, + "loss": 0.622, + "step": 834 + }, + { + "epoch": 0.11686494051784464, + "grad_norm": 0.4865019036883684, + "learning_rate": 4.902214852890109e-05, + "loss": 0.6106, + "step": 835 + }, + { + "epoch": 0.11700489853044087, + "grad_norm": 0.4846126418090665, + "learning_rate": 4.9019007364599246e-05, + "loss": 0.6271, + "step": 836 + }, + { + "epoch": 0.11714485654303709, + "grad_norm": 0.48440494490884967, + "learning_rate": 4.901586126414768e-05, + "loss": 0.6221, + "step": 837 + }, + { + "epoch": 0.1172848145556333, + "grad_norm": 0.5014410717293758, + "learning_rate": 4.9012710228192946e-05, + "loss": 0.6326, + "step": 838 + }, + { + "epoch": 0.11742477256822953, + "grad_norm": 0.463246332033912, + "learning_rate": 4.9009554257382616e-05, + "loss": 0.5973, + "step": 839 + }, + { + "epoch": 0.11756473058082575, + "grad_norm": 0.5021414470675243, + "learning_rate": 4.900639335236527e-05, + "loss": 0.6163, + "step": 840 + }, + { + "epoch": 0.11770468859342198, + "grad_norm": 0.4751809205646141, + "learning_rate": 4.900322751379052e-05, + "loss": 0.5995, + "step": 841 + }, + { + "epoch": 0.11784464660601819, + "grad_norm": 0.495594879328081, + "learning_rate": 4.900005674230896e-05, + "loss": 0.6666, + "step": 842 + }, + { + "epoch": 0.11798460461861442, + "grad_norm": 0.46838396983160246, + "learning_rate": 4.899688103857223e-05, + "loss": 0.6391, + "step": 843 + }, + { + "epoch": 0.11812456263121064, + "grad_norm": 0.48479222227164487, + "learning_rate": 4.899370040323295e-05, + "loss": 0.6425, + "step": 844 + }, + { + "epoch": 0.11826452064380685, + "grad_norm": 0.4758054425265797, + "learning_rate": 4.899051483694481e-05, + "loss": 0.6722, + "step": 845 + }, + { + "epoch": 0.11840447865640308, + "grad_norm": 0.4517669680918226, + "learning_rate": 4.898732434036244e-05, + "loss": 0.6417, + "step": 846 + }, + { + "epoch": 0.1185444366689993, + "grad_norm": 0.4637034408072003, + "learning_rate": 4.898412891414153e-05, + "loss": 0.6472, + "step": 847 + }, + { + "epoch": 0.11868439468159551, + "grad_norm": 0.4483579373594681, + "learning_rate": 4.8980928558938774e-05, + "loss": 0.6631, + "step": 848 + }, + { + "epoch": 0.11882435269419174, + "grad_norm": 0.4607581614793959, + "learning_rate": 4.897772327541188e-05, + "loss": 0.6564, + "step": 849 + }, + { + "epoch": 0.11896431070678797, + "grad_norm": 0.4627218264459414, + "learning_rate": 4.8974513064219564e-05, + "loss": 0.6421, + "step": 850 + }, + { + "epoch": 0.11910426871938419, + "grad_norm": 0.4381385195746151, + "learning_rate": 4.897129792602156e-05, + "loss": 0.6443, + "step": 851 + }, + { + "epoch": 0.1192442267319804, + "grad_norm": 0.44805468338464166, + "learning_rate": 4.8968077861478606e-05, + "loss": 0.6289, + "step": 852 + }, + { + "epoch": 0.11938418474457663, + "grad_norm": 0.4419209751144886, + "learning_rate": 4.896485287125246e-05, + "loss": 0.6148, + "step": 853 + }, + { + "epoch": 0.11952414275717285, + "grad_norm": 0.46951206043414845, + "learning_rate": 4.896162295600589e-05, + "loss": 0.6552, + "step": 854 + }, + { + "epoch": 0.11966410076976906, + "grad_norm": 0.6796790512869556, + "learning_rate": 4.8958388116402685e-05, + "loss": 0.6547, + "step": 855 + }, + { + "epoch": 0.11980405878236529, + "grad_norm": 0.4900439088338446, + "learning_rate": 4.8955148353107625e-05, + "loss": 0.6618, + "step": 856 + }, + { + "epoch": 0.11994401679496151, + "grad_norm": 0.5670961555372047, + "learning_rate": 4.8951903666786514e-05, + "loss": 0.6234, + "step": 857 + }, + { + "epoch": 0.12008397480755774, + "grad_norm": 0.4709993547132971, + "learning_rate": 4.894865405810618e-05, + "loss": 0.6528, + "step": 858 + }, + { + "epoch": 0.12022393282015395, + "grad_norm": 0.5029700711110836, + "learning_rate": 4.8945399527734436e-05, + "loss": 0.6508, + "step": 859 + }, + { + "epoch": 0.12036389083275018, + "grad_norm": 0.471348795948391, + "learning_rate": 4.8942140076340135e-05, + "loss": 0.6303, + "step": 860 + }, + { + "epoch": 0.1205038488453464, + "grad_norm": 0.47522173933299294, + "learning_rate": 4.893887570459312e-05, + "loss": 0.6503, + "step": 861 + }, + { + "epoch": 0.12064380685794261, + "grad_norm": 0.44742680670952745, + "learning_rate": 4.893560641316425e-05, + "loss": 0.6306, + "step": 862 + }, + { + "epoch": 0.12078376487053884, + "grad_norm": 0.46472651593001396, + "learning_rate": 4.89323322027254e-05, + "loss": 0.6322, + "step": 863 + }, + { + "epoch": 0.12092372288313506, + "grad_norm": 0.4573234609819833, + "learning_rate": 4.8929053073949456e-05, + "loss": 0.6014, + "step": 864 + }, + { + "epoch": 0.12106368089573127, + "grad_norm": 0.5281892171792254, + "learning_rate": 4.892576902751031e-05, + "loss": 0.666, + "step": 865 + }, + { + "epoch": 0.1212036389083275, + "grad_norm": 0.46580727241797515, + "learning_rate": 4.8922480064082864e-05, + "loss": 0.634, + "step": 866 + }, + { + "epoch": 0.12134359692092372, + "grad_norm": 0.46889330922237543, + "learning_rate": 4.8919186184343046e-05, + "loss": 0.6171, + "step": 867 + }, + { + "epoch": 0.12148355493351995, + "grad_norm": 0.4593728936842765, + "learning_rate": 4.891588738896776e-05, + "loss": 0.569, + "step": 868 + }, + { + "epoch": 0.12162351294611616, + "grad_norm": 0.4554124468260883, + "learning_rate": 4.891258367863497e-05, + "loss": 0.6426, + "step": 869 + }, + { + "epoch": 0.12176347095871239, + "grad_norm": 0.4613451131787414, + "learning_rate": 4.890927505402359e-05, + "loss": 0.6738, + "step": 870 + }, + { + "epoch": 0.12190342897130861, + "grad_norm": 0.4775869179312642, + "learning_rate": 4.8905961515813604e-05, + "loss": 0.6403, + "step": 871 + }, + { + "epoch": 0.12204338698390482, + "grad_norm": 0.4836931023229791, + "learning_rate": 4.890264306468596e-05, + "loss": 0.6472, + "step": 872 + }, + { + "epoch": 0.12218334499650105, + "grad_norm": 0.4661703058075851, + "learning_rate": 4.8899319701322646e-05, + "loss": 0.6656, + "step": 873 + }, + { + "epoch": 0.12232330300909727, + "grad_norm": 0.46602771617049493, + "learning_rate": 4.889599142640663e-05, + "loss": 0.6471, + "step": 874 + }, + { + "epoch": 0.1224632610216935, + "grad_norm": 0.46986425878775284, + "learning_rate": 4.889265824062193e-05, + "loss": 0.6538, + "step": 875 + }, + { + "epoch": 0.12260321903428971, + "grad_norm": 0.4350487912771349, + "learning_rate": 4.888932014465352e-05, + "loss": 0.5965, + "step": 876 + }, + { + "epoch": 0.12274317704688593, + "grad_norm": 0.45434867025136694, + "learning_rate": 4.888597713918743e-05, + "loss": 0.6091, + "step": 877 + }, + { + "epoch": 0.12288313505948216, + "grad_norm": 0.45288960332540396, + "learning_rate": 4.888262922491069e-05, + "loss": 0.6633, + "step": 878 + }, + { + "epoch": 0.12302309307207837, + "grad_norm": 0.46432887825126745, + "learning_rate": 4.887927640251132e-05, + "loss": 0.6597, + "step": 879 + }, + { + "epoch": 0.1231630510846746, + "grad_norm": 0.45520528507797114, + "learning_rate": 4.887591867267836e-05, + "loss": 0.6403, + "step": 880 + }, + { + "epoch": 0.12330300909727082, + "grad_norm": 0.4575189583703599, + "learning_rate": 4.887255603610185e-05, + "loss": 0.6531, + "step": 881 + }, + { + "epoch": 0.12344296710986705, + "grad_norm": 0.46480538006139166, + "learning_rate": 4.8869188493472854e-05, + "loss": 0.5969, + "step": 882 + }, + { + "epoch": 0.12358292512246326, + "grad_norm": 0.46565682699578237, + "learning_rate": 4.886581604548344e-05, + "loss": 0.6334, + "step": 883 + }, + { + "epoch": 0.12372288313505948, + "grad_norm": 0.4615157170447177, + "learning_rate": 4.8862438692826675e-05, + "loss": 0.6317, + "step": 884 + }, + { + "epoch": 0.12386284114765571, + "grad_norm": 0.45719602979674634, + "learning_rate": 4.885905643619664e-05, + "loss": 0.6615, + "step": 885 + }, + { + "epoch": 0.12400279916025192, + "grad_norm": 0.4666726722691908, + "learning_rate": 4.885566927628842e-05, + "loss": 0.6192, + "step": 886 + }, + { + "epoch": 0.12414275717284814, + "grad_norm": 0.49622217670592067, + "learning_rate": 4.8852277213798106e-05, + "loss": 0.6263, + "step": 887 + }, + { + "epoch": 0.12428271518544437, + "grad_norm": 0.47891535770456606, + "learning_rate": 4.8848880249422815e-05, + "loss": 0.6435, + "step": 888 + }, + { + "epoch": 0.12442267319804058, + "grad_norm": 0.478301172680431, + "learning_rate": 4.884547838386065e-05, + "loss": 0.671, + "step": 889 + }, + { + "epoch": 0.1245626312106368, + "grad_norm": 0.44791777443248404, + "learning_rate": 4.884207161781074e-05, + "loss": 0.5783, + "step": 890 + }, + { + "epoch": 0.12470258922323303, + "grad_norm": 0.4476426364893327, + "learning_rate": 4.883865995197319e-05, + "loss": 0.6164, + "step": 891 + }, + { + "epoch": 0.12484254723582926, + "grad_norm": 0.47381113823924553, + "learning_rate": 4.8835243387049144e-05, + "loss": 0.6427, + "step": 892 + }, + { + "epoch": 0.12498250524842547, + "grad_norm": 0.47937356676709797, + "learning_rate": 4.8831821923740745e-05, + "loss": 0.6783, + "step": 893 + }, + { + "epoch": 0.1251224632610217, + "grad_norm": 0.4634097903429328, + "learning_rate": 4.882839556275113e-05, + "loss": 0.6131, + "step": 894 + }, + { + "epoch": 0.12526242127361792, + "grad_norm": 0.4867484756619001, + "learning_rate": 4.8824964304784446e-05, + "loss": 0.649, + "step": 895 + }, + { + "epoch": 0.12540237928621414, + "grad_norm": 0.450881363852382, + "learning_rate": 4.882152815054587e-05, + "loss": 0.6561, + "step": 896 + }, + { + "epoch": 0.12554233729881037, + "grad_norm": 0.46114414970423057, + "learning_rate": 4.881808710074155e-05, + "loss": 0.6104, + "step": 897 + }, + { + "epoch": 0.12568229531140657, + "grad_norm": 0.45063006067640693, + "learning_rate": 4.881464115607865e-05, + "loss": 0.6445, + "step": 898 + }, + { + "epoch": 0.1258222533240028, + "grad_norm": 0.47221223029390763, + "learning_rate": 4.8811190317265376e-05, + "loss": 0.6246, + "step": 899 + }, + { + "epoch": 0.12596221133659902, + "grad_norm": 0.4781011312710143, + "learning_rate": 4.880773458501089e-05, + "loss": 0.6222, + "step": 900 + }, + { + "epoch": 0.12610216934919524, + "grad_norm": 0.45326509799123327, + "learning_rate": 4.8804273960025376e-05, + "loss": 0.651, + "step": 901 + }, + { + "epoch": 0.12624212736179147, + "grad_norm": 0.482160143883641, + "learning_rate": 4.880080844302004e-05, + "loss": 0.6286, + "step": 902 + }, + { + "epoch": 0.1263820853743877, + "grad_norm": 0.4630226354527779, + "learning_rate": 4.879733803470707e-05, + "loss": 0.636, + "step": 903 + }, + { + "epoch": 0.12652204338698392, + "grad_norm": 0.4731355871687041, + "learning_rate": 4.8793862735799676e-05, + "loss": 0.6418, + "step": 904 + }, + { + "epoch": 0.12666200139958012, + "grad_norm": 0.467195164605955, + "learning_rate": 4.879038254701207e-05, + "loss": 0.6306, + "step": 905 + }, + { + "epoch": 0.12680195941217634, + "grad_norm": 0.441351930375645, + "learning_rate": 4.878689746905946e-05, + "loss": 0.5974, + "step": 906 + }, + { + "epoch": 0.12694191742477257, + "grad_norm": 0.4714053662846153, + "learning_rate": 4.878340750265807e-05, + "loss": 0.6506, + "step": 907 + }, + { + "epoch": 0.1270818754373688, + "grad_norm": 0.4459074638142588, + "learning_rate": 4.877991264852512e-05, + "loss": 0.6474, + "step": 908 + }, + { + "epoch": 0.12722183344996502, + "grad_norm": 0.469617227274652, + "learning_rate": 4.877641290737884e-05, + "loss": 0.6656, + "step": 909 + }, + { + "epoch": 0.12736179146256124, + "grad_norm": 0.4557573307622375, + "learning_rate": 4.8772908279938464e-05, + "loss": 0.6908, + "step": 910 + }, + { + "epoch": 0.12750174947515747, + "grad_norm": 0.46053886117091863, + "learning_rate": 4.8769398766924226e-05, + "loss": 0.6619, + "step": 911 + }, + { + "epoch": 0.12764170748775366, + "grad_norm": 0.5409321614433314, + "learning_rate": 4.876588436905736e-05, + "loss": 0.6306, + "step": 912 + }, + { + "epoch": 0.1277816655003499, + "grad_norm": 0.4525734524473738, + "learning_rate": 4.8762365087060117e-05, + "loss": 0.6163, + "step": 913 + }, + { + "epoch": 0.12792162351294611, + "grad_norm": 0.44910287840912927, + "learning_rate": 4.8758840921655744e-05, + "loss": 0.5945, + "step": 914 + }, + { + "epoch": 0.12806158152554234, + "grad_norm": 0.4451813661618601, + "learning_rate": 4.8755311873568505e-05, + "loss": 0.613, + "step": 915 + }, + { + "epoch": 0.12820153953813856, + "grad_norm": 0.46559156418594866, + "learning_rate": 4.8751777943523634e-05, + "loss": 0.6414, + "step": 916 + }, + { + "epoch": 0.1283414975507348, + "grad_norm": 0.4947913830806307, + "learning_rate": 4.874823913224741e-05, + "loss": 0.6461, + "step": 917 + }, + { + "epoch": 0.128481455563331, + "grad_norm": 0.4786066687436742, + "learning_rate": 4.874469544046707e-05, + "loss": 0.6474, + "step": 918 + }, + { + "epoch": 0.1286214135759272, + "grad_norm": 0.4603973225093796, + "learning_rate": 4.8741146868910906e-05, + "loss": 0.66, + "step": 919 + }, + { + "epoch": 0.12876137158852344, + "grad_norm": 0.48095107605482856, + "learning_rate": 4.8737593418308156e-05, + "loss": 0.6534, + "step": 920 + }, + { + "epoch": 0.12890132960111966, + "grad_norm": 0.4623235801737886, + "learning_rate": 4.8734035089389115e-05, + "loss": 0.628, + "step": 921 + }, + { + "epoch": 0.1290412876137159, + "grad_norm": 0.4930237562486678, + "learning_rate": 4.873047188288505e-05, + "loss": 0.6409, + "step": 922 + }, + { + "epoch": 0.1291812456263121, + "grad_norm": 0.46495991811324616, + "learning_rate": 4.8726903799528234e-05, + "loss": 0.6293, + "step": 923 + }, + { + "epoch": 0.12932120363890834, + "grad_norm": 0.46304983740233435, + "learning_rate": 4.872333084005194e-05, + "loss": 0.6472, + "step": 924 + }, + { + "epoch": 0.12946116165150454, + "grad_norm": 0.44884446203631584, + "learning_rate": 4.871975300519045e-05, + "loss": 0.6371, + "step": 925 + }, + { + "epoch": 0.12960111966410076, + "grad_norm": 0.4675721819496966, + "learning_rate": 4.8716170295679053e-05, + "loss": 0.6653, + "step": 926 + }, + { + "epoch": 0.129741077676697, + "grad_norm": 0.47573888293206407, + "learning_rate": 4.8712582712254016e-05, + "loss": 0.6267, + "step": 927 + }, + { + "epoch": 0.1298810356892932, + "grad_norm": 0.46375806207869347, + "learning_rate": 4.870899025565264e-05, + "loss": 0.6467, + "step": 928 + }, + { + "epoch": 0.13002099370188944, + "grad_norm": 0.457474154948321, + "learning_rate": 4.8705392926613205e-05, + "loss": 0.6156, + "step": 929 + }, + { + "epoch": 0.13016095171448566, + "grad_norm": 0.44866595177001034, + "learning_rate": 4.870179072587499e-05, + "loss": 0.5962, + "step": 930 + }, + { + "epoch": 0.1303009097270819, + "grad_norm": 0.4703083744568931, + "learning_rate": 4.86981836541783e-05, + "loss": 0.6309, + "step": 931 + }, + { + "epoch": 0.13044086773967808, + "grad_norm": 0.6018616664400933, + "learning_rate": 4.869457171226441e-05, + "loss": 0.642, + "step": 932 + }, + { + "epoch": 0.1305808257522743, + "grad_norm": 0.46944918935149577, + "learning_rate": 4.869095490087562e-05, + "loss": 0.6273, + "step": 933 + }, + { + "epoch": 0.13072078376487054, + "grad_norm": 0.44148958617555917, + "learning_rate": 4.868733322075522e-05, + "loss": 0.6378, + "step": 934 + }, + { + "epoch": 0.13086074177746676, + "grad_norm": 0.4395583432649483, + "learning_rate": 4.86837066726475e-05, + "loss": 0.6202, + "step": 935 + }, + { + "epoch": 0.13100069979006299, + "grad_norm": 0.44852952705339616, + "learning_rate": 4.868007525729775e-05, + "loss": 0.6416, + "step": 936 + }, + { + "epoch": 0.1311406578026592, + "grad_norm": 0.4487713642870549, + "learning_rate": 4.8676438975452274e-05, + "loss": 0.6263, + "step": 937 + }, + { + "epoch": 0.13128061581525544, + "grad_norm": 0.4648656677758751, + "learning_rate": 4.8672797827858355e-05, + "loss": 0.6164, + "step": 938 + }, + { + "epoch": 0.13142057382785163, + "grad_norm": 0.4608329526156638, + "learning_rate": 4.866915181526428e-05, + "loss": 0.6692, + "step": 939 + }, + { + "epoch": 0.13156053184044786, + "grad_norm": 0.4654710571009042, + "learning_rate": 4.866550093841936e-05, + "loss": 0.6222, + "step": 940 + }, + { + "epoch": 0.13170048985304408, + "grad_norm": 0.4603807269880903, + "learning_rate": 4.866184519807387e-05, + "loss": 0.6165, + "step": 941 + }, + { + "epoch": 0.1318404478656403, + "grad_norm": 0.47496860741108315, + "learning_rate": 4.865818459497911e-05, + "loss": 0.6267, + "step": 942 + }, + { + "epoch": 0.13198040587823653, + "grad_norm": 0.46532589078199804, + "learning_rate": 4.8654519129887364e-05, + "loss": 0.6293, + "step": 943 + }, + { + "epoch": 0.13212036389083276, + "grad_norm": 0.4590484934275629, + "learning_rate": 4.865084880355193e-05, + "loss": 0.6231, + "step": 944 + }, + { + "epoch": 0.13226032190342898, + "grad_norm": 0.47530046553555577, + "learning_rate": 4.86471736167271e-05, + "loss": 0.6133, + "step": 945 + }, + { + "epoch": 0.13240027991602518, + "grad_norm": 0.4527393981074044, + "learning_rate": 4.864349357016815e-05, + "loss": 0.6124, + "step": 946 + }, + { + "epoch": 0.1325402379286214, + "grad_norm": 0.46113362427703974, + "learning_rate": 4.863980866463138e-05, + "loss": 0.6456, + "step": 947 + }, + { + "epoch": 0.13268019594121763, + "grad_norm": 0.509871579156769, + "learning_rate": 4.8636118900874064e-05, + "loss": 0.6229, + "step": 948 + }, + { + "epoch": 0.13282015395381386, + "grad_norm": 0.4784635752003754, + "learning_rate": 4.86324242796545e-05, + "loss": 0.6596, + "step": 949 + }, + { + "epoch": 0.13296011196641008, + "grad_norm": 0.4414714405275805, + "learning_rate": 4.862872480173195e-05, + "loss": 0.6459, + "step": 950 + }, + { + "epoch": 0.1331000699790063, + "grad_norm": 0.4587139434548765, + "learning_rate": 4.862502046786671e-05, + "loss": 0.6126, + "step": 951 + }, + { + "epoch": 0.1332400279916025, + "grad_norm": 0.42461073662577176, + "learning_rate": 4.8621311278820056e-05, + "loss": 0.6353, + "step": 952 + }, + { + "epoch": 0.13337998600419873, + "grad_norm": 0.4324208829491985, + "learning_rate": 4.861759723535426e-05, + "loss": 0.6499, + "step": 953 + }, + { + "epoch": 0.13351994401679496, + "grad_norm": 0.4640550519654056, + "learning_rate": 4.86138783382326e-05, + "loss": 0.5896, + "step": 954 + }, + { + "epoch": 0.13365990202939118, + "grad_norm": 0.4638965920279924, + "learning_rate": 4.8610154588219345e-05, + "loss": 0.6201, + "step": 955 + }, + { + "epoch": 0.1337998600419874, + "grad_norm": 0.4695822317863163, + "learning_rate": 4.860642598607976e-05, + "loss": 0.6751, + "step": 956 + }, + { + "epoch": 0.13393981805458363, + "grad_norm": 0.4600752660391693, + "learning_rate": 4.860269253258012e-05, + "loss": 0.6654, + "step": 957 + }, + { + "epoch": 0.13407977606717986, + "grad_norm": 0.47652178074145596, + "learning_rate": 4.859895422848767e-05, + "loss": 0.659, + "step": 958 + }, + { + "epoch": 0.13421973407977605, + "grad_norm": 0.445979113412429, + "learning_rate": 4.859521107457069e-05, + "loss": 0.632, + "step": 959 + }, + { + "epoch": 0.13435969209237228, + "grad_norm": 0.45688889669250304, + "learning_rate": 4.859146307159842e-05, + "loss": 0.5705, + "step": 960 + }, + { + "epoch": 0.1344996501049685, + "grad_norm": 0.4567888614063869, + "learning_rate": 4.858771022034112e-05, + "loss": 0.6607, + "step": 961 + }, + { + "epoch": 0.13463960811756473, + "grad_norm": 0.4495345209098684, + "learning_rate": 4.858395252157004e-05, + "loss": 0.6415, + "step": 962 + }, + { + "epoch": 0.13477956613016095, + "grad_norm": 0.48532112670125027, + "learning_rate": 4.858018997605742e-05, + "loss": 0.6088, + "step": 963 + }, + { + "epoch": 0.13491952414275718, + "grad_norm": 0.440987511045589, + "learning_rate": 4.8576422584576514e-05, + "loss": 0.6129, + "step": 964 + }, + { + "epoch": 0.1350594821553534, + "grad_norm": 0.4669491385905085, + "learning_rate": 4.8572650347901544e-05, + "loss": 0.6191, + "step": 965 + }, + { + "epoch": 0.1351994401679496, + "grad_norm": 0.4665408837691419, + "learning_rate": 4.856887326680774e-05, + "loss": 0.6317, + "step": 966 + }, + { + "epoch": 0.13533939818054583, + "grad_norm": 0.48893208349101197, + "learning_rate": 4.856509134207136e-05, + "loss": 0.595, + "step": 967 + }, + { + "epoch": 0.13547935619314205, + "grad_norm": 0.4438150829055967, + "learning_rate": 4.856130457446959e-05, + "loss": 0.6065, + "step": 968 + }, + { + "epoch": 0.13561931420573828, + "grad_norm": 0.5115425422953542, + "learning_rate": 4.8557512964780674e-05, + "loss": 0.6448, + "step": 969 + }, + { + "epoch": 0.1357592722183345, + "grad_norm": 0.44504395647634876, + "learning_rate": 4.855371651378382e-05, + "loss": 0.6164, + "step": 970 + }, + { + "epoch": 0.13589923023093073, + "grad_norm": 0.5076347594918873, + "learning_rate": 4.854991522225923e-05, + "loss": 0.6207, + "step": 971 + }, + { + "epoch": 0.13603918824352695, + "grad_norm": 0.4602778279540183, + "learning_rate": 4.854610909098812e-05, + "loss": 0.5923, + "step": 972 + }, + { + "epoch": 0.13617914625612315, + "grad_norm": 0.49106399964431596, + "learning_rate": 4.8542298120752684e-05, + "loss": 0.5914, + "step": 973 + }, + { + "epoch": 0.13631910426871938, + "grad_norm": 0.44686877889055854, + "learning_rate": 4.85384823123361e-05, + "loss": 0.6163, + "step": 974 + }, + { + "epoch": 0.1364590622813156, + "grad_norm": 0.46546574820842096, + "learning_rate": 4.8534661666522584e-05, + "loss": 0.6625, + "step": 975 + }, + { + "epoch": 0.13659902029391183, + "grad_norm": 0.6072066700290619, + "learning_rate": 4.8530836184097297e-05, + "loss": 0.6048, + "step": 976 + }, + { + "epoch": 0.13673897830650805, + "grad_norm": 0.49583731751388566, + "learning_rate": 4.852700586584642e-05, + "loss": 0.6445, + "step": 977 + }, + { + "epoch": 0.13687893631910428, + "grad_norm": 0.4600581543242378, + "learning_rate": 4.852317071255712e-05, + "loss": 0.6276, + "step": 978 + }, + { + "epoch": 0.1370188943317005, + "grad_norm": 0.47143173605603533, + "learning_rate": 4.851933072501756e-05, + "loss": 0.6707, + "step": 979 + }, + { + "epoch": 0.1371588523442967, + "grad_norm": 0.43226562343994834, + "learning_rate": 4.85154859040169e-05, + "loss": 0.5993, + "step": 980 + }, + { + "epoch": 0.13729881035689293, + "grad_norm": 0.5032598563086278, + "learning_rate": 4.8511636250345294e-05, + "loss": 0.6388, + "step": 981 + }, + { + "epoch": 0.13743876836948915, + "grad_norm": 0.6566397034726049, + "learning_rate": 4.850778176479387e-05, + "loss": 0.626, + "step": 982 + }, + { + "epoch": 0.13757872638208538, + "grad_norm": 0.45320474044463904, + "learning_rate": 4.850392244815478e-05, + "loss": 0.6222, + "step": 983 + }, + { + "epoch": 0.1377186843946816, + "grad_norm": 0.4637289824351019, + "learning_rate": 4.8500058301221144e-05, + "loss": 0.6357, + "step": 984 + }, + { + "epoch": 0.13785864240727783, + "grad_norm": 0.4502718394375495, + "learning_rate": 4.849618932478708e-05, + "loss": 0.6161, + "step": 985 + }, + { + "epoch": 0.13799860041987405, + "grad_norm": 0.45481795684352794, + "learning_rate": 4.849231551964771e-05, + "loss": 0.6251, + "step": 986 + }, + { + "epoch": 0.13813855843247025, + "grad_norm": 0.505134985018315, + "learning_rate": 4.8488436886599144e-05, + "loss": 0.6358, + "step": 987 + }, + { + "epoch": 0.13827851644506647, + "grad_norm": 0.48619324349330834, + "learning_rate": 4.8484553426438464e-05, + "loss": 0.6115, + "step": 988 + }, + { + "epoch": 0.1384184744576627, + "grad_norm": 0.4650935755290328, + "learning_rate": 4.8480665139963774e-05, + "loss": 0.6004, + "step": 989 + }, + { + "epoch": 0.13855843247025892, + "grad_norm": 0.44207287204019446, + "learning_rate": 4.847677202797415e-05, + "loss": 0.597, + "step": 990 + }, + { + "epoch": 0.13869839048285515, + "grad_norm": 0.4629268299609161, + "learning_rate": 4.8472874091269674e-05, + "loss": 0.6414, + "step": 991 + }, + { + "epoch": 0.13883834849545137, + "grad_norm": 0.46784298534877244, + "learning_rate": 4.84689713306514e-05, + "loss": 0.6263, + "step": 992 + }, + { + "epoch": 0.13897830650804757, + "grad_norm": 0.448599541259752, + "learning_rate": 4.8465063746921395e-05, + "loss": 0.6395, + "step": 993 + }, + { + "epoch": 0.1391182645206438, + "grad_norm": 0.4582611197304845, + "learning_rate": 4.8461151340882706e-05, + "loss": 0.6533, + "step": 994 + }, + { + "epoch": 0.13925822253324002, + "grad_norm": 0.49137392446019373, + "learning_rate": 4.845723411333936e-05, + "loss": 0.6327, + "step": 995 + }, + { + "epoch": 0.13939818054583625, + "grad_norm": 0.4651834345736373, + "learning_rate": 4.84533120650964e-05, + "loss": 0.6444, + "step": 996 + }, + { + "epoch": 0.13953813855843247, + "grad_norm": 0.43613025151896734, + "learning_rate": 4.844938519695984e-05, + "loss": 0.6036, + "step": 997 + }, + { + "epoch": 0.1396780965710287, + "grad_norm": 0.4547252123073849, + "learning_rate": 4.84454535097367e-05, + "loss": 0.6715, + "step": 998 + }, + { + "epoch": 0.13981805458362492, + "grad_norm": 0.4411221787105127, + "learning_rate": 4.8441517004234975e-05, + "loss": 0.6207, + "step": 999 + }, + { + "epoch": 0.13995801259622112, + "grad_norm": 0.4652981257021806, + "learning_rate": 4.8437575681263656e-05, + "loss": 0.628, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 7145, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 175094048915456.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}