diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,57730 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 16482, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036406662419222717, + "grad_norm": 2.203125, + "learning_rate": 1.2500000000000002e-07, + "loss": 1.4439427852630615, + "step": 2 + }, + { + "epoch": 0.0007281332483844543, + "grad_norm": 25.875, + "learning_rate": 3.75e-07, + "loss": 1.8903988599777222, + "step": 4 + }, + { + "epoch": 0.0010921998725766816, + "grad_norm": 13.1875, + "learning_rate": 6.25e-07, + "loss": 1.9823973178863525, + "step": 6 + }, + { + "epoch": 0.0014562664967689087, + "grad_norm": 7.84375, + "learning_rate": 8.75e-07, + "loss": 1.108595848083496, + "step": 8 + }, + { + "epoch": 0.0018203331209611358, + "grad_norm": 54.5, + "learning_rate": 1.125e-06, + "loss": 1.6908321380615234, + "step": 10 + }, + { + "epoch": 0.002184399745153363, + "grad_norm": 82.5, + "learning_rate": 1.3750000000000002e-06, + "loss": 2.864623546600342, + "step": 12 + }, + { + "epoch": 0.0025484663693455902, + "grad_norm": 8.8125, + "learning_rate": 1.6250000000000001e-06, + "loss": 1.8119561672210693, + "step": 14 + }, + { + "epoch": 0.0029125329935378174, + "grad_norm": 4.0, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.419490933418274, + "step": 16 + }, + { + "epoch": 0.0032765996177300445, + "grad_norm": 20.25, + "learning_rate": 2.125e-06, + "loss": 1.774035930633545, + "step": 18 + }, + { + "epoch": 0.0036406662419222716, + "grad_norm": 49.25, + "learning_rate": 2.375e-06, + "loss": 2.2708888053894043, + "step": 20 + }, + { + "epoch": 0.004004732866114499, + "grad_norm": 115.5, + "learning_rate": 2.6250000000000003e-06, + "loss": 2.69008207321167, + "step": 22 + }, + { + "epoch": 0.004368799490306726, + "grad_norm": 23.125, + "learning_rate": 2.875e-06, + "loss": 1.6948392391204834, + "step": 24 + }, + { + "epoch": 0.004732866114498953, + "grad_norm": 11.375, + "learning_rate": 3.125e-06, + "loss": 1.5246765613555908, + "step": 26 + }, + { + "epoch": 0.0050969327386911805, + "grad_norm": 17.875, + "learning_rate": 3.3750000000000003e-06, + "loss": 1.863516092300415, + "step": 28 + }, + { + "epoch": 0.005460999362883408, + "grad_norm": 15.25, + "learning_rate": 3.625e-06, + "loss": 1.9239016771316528, + "step": 30 + }, + { + "epoch": 0.005825065987075635, + "grad_norm": 16.875, + "learning_rate": 3.875e-06, + "loss": 1.7198164463043213, + "step": 32 + }, + { + "epoch": 0.006189132611267862, + "grad_norm": 35.0, + "learning_rate": 4.125e-06, + "loss": 2.3988637924194336, + "step": 34 + }, + { + "epoch": 0.006553199235460089, + "grad_norm": 14.3125, + "learning_rate": 4.3750000000000005e-06, + "loss": 1.200141429901123, + "step": 36 + }, + { + "epoch": 0.006917265859652316, + "grad_norm": 18.75, + "learning_rate": 4.625000000000001e-06, + "loss": 1.9250798225402832, + "step": 38 + }, + { + "epoch": 0.007281332483844543, + "grad_norm": 12.875, + "learning_rate": 4.875e-06, + "loss": 1.8268136978149414, + "step": 40 + }, + { + "epoch": 0.007645399108036771, + "grad_norm": 10.5, + "learning_rate": 5.125e-06, + "loss": 1.8823270797729492, + "step": 42 + }, + { + "epoch": 0.008009465732228998, + "grad_norm": 9.75, + "learning_rate": 5.375e-06, + "loss": 1.8128286600112915, + "step": 44 + }, + { + "epoch": 0.008373532356421225, + "grad_norm": 20.0, + "learning_rate": 5.625e-06, + "loss": 1.9367401599884033, + "step": 46 + }, + { + "epoch": 0.008737598980613452, + "grad_norm": 11.8125, + "learning_rate": 5.8750000000000005e-06, + "loss": 1.669650912284851, + "step": 48 + }, + { + "epoch": 0.00910166560480568, + "grad_norm": 37.5, + "learning_rate": 6.125000000000001e-06, + "loss": 1.978642225265503, + "step": 50 + }, + { + "epoch": 0.009465732228997907, + "grad_norm": 12.4375, + "learning_rate": 6.375e-06, + "loss": 1.313441276550293, + "step": 52 + }, + { + "epoch": 0.009829798853190134, + "grad_norm": 2.71875, + "learning_rate": 6.625e-06, + "loss": 1.225853443145752, + "step": 54 + }, + { + "epoch": 0.010193865477382361, + "grad_norm": 3.625, + "learning_rate": 6.875e-06, + "loss": 1.0497552156448364, + "step": 56 + }, + { + "epoch": 0.010557932101574588, + "grad_norm": 26.125, + "learning_rate": 7.125e-06, + "loss": 1.9384973049163818, + "step": 58 + }, + { + "epoch": 0.010921998725766815, + "grad_norm": 5.125, + "learning_rate": 7.375000000000001e-06, + "loss": 1.4457396268844604, + "step": 60 + }, + { + "epoch": 0.011286065349959042, + "grad_norm": 12.75, + "learning_rate": 7.625e-06, + "loss": 1.929830551147461, + "step": 62 + }, + { + "epoch": 0.01165013197415127, + "grad_norm": 4.96875, + "learning_rate": 7.875e-06, + "loss": 1.4362810850143433, + "step": 64 + }, + { + "epoch": 0.012014198598343497, + "grad_norm": 13.6875, + "learning_rate": 8.125000000000001e-06, + "loss": 1.7963954210281372, + "step": 66 + }, + { + "epoch": 0.012378265222535724, + "grad_norm": 14.6875, + "learning_rate": 8.375e-06, + "loss": 1.2056946754455566, + "step": 68 + }, + { + "epoch": 0.01274233184672795, + "grad_norm": 7.625, + "learning_rate": 8.625000000000001e-06, + "loss": 0.9177781343460083, + "step": 70 + }, + { + "epoch": 0.013106398470920178, + "grad_norm": 15.3125, + "learning_rate": 8.875e-06, + "loss": 1.7942477464675903, + "step": 72 + }, + { + "epoch": 0.013470465095112405, + "grad_norm": 30.125, + "learning_rate": 9.125e-06, + "loss": 1.7084472179412842, + "step": 74 + }, + { + "epoch": 0.013834531719304632, + "grad_norm": 9.5625, + "learning_rate": 9.375000000000001e-06, + "loss": 1.5940330028533936, + "step": 76 + }, + { + "epoch": 0.01419859834349686, + "grad_norm": 23.125, + "learning_rate": 9.625e-06, + "loss": 1.7003000974655151, + "step": 78 + }, + { + "epoch": 0.014562664967689086, + "grad_norm": 15.3125, + "learning_rate": 9.875000000000001e-06, + "loss": 1.2153486013412476, + "step": 80 + }, + { + "epoch": 0.014926731591881313, + "grad_norm": 11.625, + "learning_rate": 9.999999926627027e-06, + "loss": 1.7893983125686646, + "step": 82 + }, + { + "epoch": 0.015290798216073542, + "grad_norm": 3.53125, + "learning_rate": 9.999999339643249e-06, + "loss": 1.1524908542633057, + "step": 84 + }, + { + "epoch": 0.015654864840265768, + "grad_norm": 8.125, + "learning_rate": 9.999998165675777e-06, + "loss": 1.632883071899414, + "step": 86 + }, + { + "epoch": 0.016018931464457997, + "grad_norm": 10.4375, + "learning_rate": 9.999996404724785e-06, + "loss": 1.738683819770813, + "step": 88 + }, + { + "epoch": 0.016382998088650222, + "grad_norm": 14.9375, + "learning_rate": 9.999994056790531e-06, + "loss": 1.5707422494888306, + "step": 90 + }, + { + "epoch": 0.01674706471284245, + "grad_norm": 7.90625, + "learning_rate": 9.999991121873363e-06, + "loss": 1.6899995803833008, + "step": 92 + }, + { + "epoch": 0.017111131337034676, + "grad_norm": 11.5, + "learning_rate": 9.999987599973705e-06, + "loss": 1.8284931182861328, + "step": 94 + }, + { + "epoch": 0.017475197961226905, + "grad_norm": 8.4375, + "learning_rate": 9.999983491092077e-06, + "loss": 1.0054672956466675, + "step": 96 + }, + { + "epoch": 0.01783926458541913, + "grad_norm": 22.625, + "learning_rate": 9.999978795229084e-06, + "loss": 2.263137102127075, + "step": 98 + }, + { + "epoch": 0.01820333120961136, + "grad_norm": 6.84375, + "learning_rate": 9.999973512385412e-06, + "loss": 1.502677321434021, + "step": 100 + }, + { + "epoch": 0.018567397833803585, + "grad_norm": 10.6875, + "learning_rate": 9.999967642561839e-06, + "loss": 1.5737406015396118, + "step": 102 + }, + { + "epoch": 0.018931464457995813, + "grad_norm": 11.5625, + "learning_rate": 9.999961185759224e-06, + "loss": 1.5635281801223755, + "step": 104 + }, + { + "epoch": 0.01929553108218804, + "grad_norm": 83.5, + "learning_rate": 9.999954141978516e-06, + "loss": 1.48868989944458, + "step": 106 + }, + { + "epoch": 0.019659597706380268, + "grad_norm": 34.0, + "learning_rate": 9.999946511220748e-06, + "loss": 1.5162310600280762, + "step": 108 + }, + { + "epoch": 0.020023664330572493, + "grad_norm": 9.5, + "learning_rate": 9.99993829348704e-06, + "loss": 1.5604157447814941, + "step": 110 + }, + { + "epoch": 0.020387730954764722, + "grad_norm": 10.75, + "learning_rate": 9.999929488778595e-06, + "loss": 1.359964370727539, + "step": 112 + }, + { + "epoch": 0.02075179757895695, + "grad_norm": 10.5625, + "learning_rate": 9.999920097096712e-06, + "loss": 1.7113839387893677, + "step": 114 + }, + { + "epoch": 0.021115864203149176, + "grad_norm": 23.0, + "learning_rate": 9.999910118442761e-06, + "loss": 1.6603569984436035, + "step": 116 + }, + { + "epoch": 0.021479930827341405, + "grad_norm": 35.25, + "learning_rate": 9.999899552818212e-06, + "loss": 1.904317855834961, + "step": 118 + }, + { + "epoch": 0.02184399745153363, + "grad_norm": 9.25, + "learning_rate": 9.999888400224615e-06, + "loss": 1.069483757019043, + "step": 120 + }, + { + "epoch": 0.02220806407572586, + "grad_norm": 5.1875, + "learning_rate": 9.999876660663605e-06, + "loss": 1.128413438796997, + "step": 122 + }, + { + "epoch": 0.022572130699918085, + "grad_norm": 5.03125, + "learning_rate": 9.999864334136904e-06, + "loss": 1.5015290975570679, + "step": 124 + }, + { + "epoch": 0.022936197324110313, + "grad_norm": 13.6875, + "learning_rate": 9.999851420646323e-06, + "loss": 1.1502019166946411, + "step": 126 + }, + { + "epoch": 0.02330026394830254, + "grad_norm": 11.6875, + "learning_rate": 9.999837920193756e-06, + "loss": 1.7315152883529663, + "step": 128 + }, + { + "epoch": 0.023664330572494768, + "grad_norm": 16.75, + "learning_rate": 9.999823832781184e-06, + "loss": 1.6791443824768066, + "step": 130 + }, + { + "epoch": 0.024028397196686993, + "grad_norm": 8.625, + "learning_rate": 9.999809158410674e-06, + "loss": 1.6817721128463745, + "step": 132 + }, + { + "epoch": 0.024392463820879222, + "grad_norm": 17.625, + "learning_rate": 9.99979389708438e-06, + "loss": 1.598803997039795, + "step": 134 + }, + { + "epoch": 0.024756530445071447, + "grad_norm": 16.0, + "learning_rate": 9.999778048804541e-06, + "loss": 1.5391294956207275, + "step": 136 + }, + { + "epoch": 0.025120597069263676, + "grad_norm": 12.5, + "learning_rate": 9.999761613573484e-06, + "loss": 1.6425312757492065, + "step": 138 + }, + { + "epoch": 0.0254846636934559, + "grad_norm": 10.3125, + "learning_rate": 9.999744591393619e-06, + "loss": 1.5416018962860107, + "step": 140 + }, + { + "epoch": 0.02584873031764813, + "grad_norm": 7.6875, + "learning_rate": 9.999726982267444e-06, + "loss": 0.9722456932067871, + "step": 142 + }, + { + "epoch": 0.026212796941840356, + "grad_norm": 9.8125, + "learning_rate": 9.999708786197546e-06, + "loss": 1.7667213678359985, + "step": 144 + }, + { + "epoch": 0.026576863566032585, + "grad_norm": 22.25, + "learning_rate": 9.99969000318659e-06, + "loss": 1.0348119735717773, + "step": 146 + }, + { + "epoch": 0.02694093019022481, + "grad_norm": 4.3125, + "learning_rate": 9.999670633237337e-06, + "loss": 0.9323269128799438, + "step": 148 + }, + { + "epoch": 0.02730499681441704, + "grad_norm": 3.8125, + "learning_rate": 9.99965067635263e-06, + "loss": 0.9375461339950562, + "step": 150 + }, + { + "epoch": 0.027669063438609264, + "grad_norm": 30.5, + "learning_rate": 9.999630132535391e-06, + "loss": 1.8954745531082153, + "step": 152 + }, + { + "epoch": 0.028033130062801493, + "grad_norm": 13.6875, + "learning_rate": 9.999609001788643e-06, + "loss": 1.5002610683441162, + "step": 154 + }, + { + "epoch": 0.02839719668699372, + "grad_norm": 7.59375, + "learning_rate": 9.999587284115482e-06, + "loss": 1.7262972593307495, + "step": 156 + }, + { + "epoch": 0.028761263311185947, + "grad_norm": 7.03125, + "learning_rate": 9.999564979519097e-06, + "loss": 1.6420605182647705, + "step": 158 + }, + { + "epoch": 0.029125329935378173, + "grad_norm": 8.375, + "learning_rate": 9.999542088002755e-06, + "loss": 1.707388162612915, + "step": 160 + }, + { + "epoch": 0.0294893965595704, + "grad_norm": 7.84375, + "learning_rate": 9.999518609569824e-06, + "loss": 1.618377923965454, + "step": 162 + }, + { + "epoch": 0.029853463183762627, + "grad_norm": 3.359375, + "learning_rate": 9.999494544223747e-06, + "loss": 1.1561410427093506, + "step": 164 + }, + { + "epoch": 0.030217529807954856, + "grad_norm": 24.25, + "learning_rate": 9.999469891968052e-06, + "loss": 1.6138756275177002, + "step": 166 + }, + { + "epoch": 0.030581596432147085, + "grad_norm": 9.125, + "learning_rate": 9.999444652806361e-06, + "loss": 0.8504073619842529, + "step": 168 + }, + { + "epoch": 0.03094566305633931, + "grad_norm": 8.9375, + "learning_rate": 9.999418826742373e-06, + "loss": 1.608824610710144, + "step": 170 + }, + { + "epoch": 0.031309729680531535, + "grad_norm": 14.25, + "learning_rate": 9.999392413779883e-06, + "loss": 1.5359747409820557, + "step": 172 + }, + { + "epoch": 0.031673796304723764, + "grad_norm": 9.5, + "learning_rate": 9.999365413922762e-06, + "loss": 1.133074164390564, + "step": 174 + }, + { + "epoch": 0.03203786292891599, + "grad_norm": 34.25, + "learning_rate": 9.999337827174975e-06, + "loss": 1.6861637830734253, + "step": 176 + }, + { + "epoch": 0.03240192955310822, + "grad_norm": 13.25, + "learning_rate": 9.99930965354057e-06, + "loss": 1.4610446691513062, + "step": 178 + }, + { + "epoch": 0.032765996177300444, + "grad_norm": 8.8125, + "learning_rate": 9.999280893023682e-06, + "loss": 1.3413668870925903, + "step": 180 + }, + { + "epoch": 0.03313006280149267, + "grad_norm": 46.75, + "learning_rate": 9.99925154562853e-06, + "loss": 2.047168254852295, + "step": 182 + }, + { + "epoch": 0.0334941294256849, + "grad_norm": 2.484375, + "learning_rate": 9.99922161135942e-06, + "loss": 1.0822950601577759, + "step": 184 + }, + { + "epoch": 0.03385819604987713, + "grad_norm": 11.75, + "learning_rate": 9.999191090220748e-06, + "loss": 1.5762875080108643, + "step": 186 + }, + { + "epoch": 0.03422226267406935, + "grad_norm": 4.375, + "learning_rate": 9.99915998221699e-06, + "loss": 1.0849459171295166, + "step": 188 + }, + { + "epoch": 0.03458632929826158, + "grad_norm": 41.0, + "learning_rate": 9.99912828735271e-06, + "loss": 1.8276569843292236, + "step": 190 + }, + { + "epoch": 0.03495039592245381, + "grad_norm": 7.0625, + "learning_rate": 9.999096005632565e-06, + "loss": 1.6723151206970215, + "step": 192 + }, + { + "epoch": 0.03531446254664604, + "grad_norm": 35.5, + "learning_rate": 9.999063137061284e-06, + "loss": 0.9587855935096741, + "step": 194 + }, + { + "epoch": 0.03567852917083826, + "grad_norm": 13.3125, + "learning_rate": 9.999029681643694e-06, + "loss": 1.6144752502441406, + "step": 196 + }, + { + "epoch": 0.03604259579503049, + "grad_norm": 22.75, + "learning_rate": 9.998995639384709e-06, + "loss": 1.3902058601379395, + "step": 198 + }, + { + "epoch": 0.03640666241922272, + "grad_norm": 18.0, + "learning_rate": 9.99896101028932e-06, + "loss": 1.6762981414794922, + "step": 200 + }, + { + "epoch": 0.03677072904341495, + "grad_norm": 13.1875, + "learning_rate": 9.998925794362606e-06, + "loss": 0.8191450238227844, + "step": 202 + }, + { + "epoch": 0.03713479566760717, + "grad_norm": 16.25, + "learning_rate": 9.99888999160974e-06, + "loss": 0.9685258865356445, + "step": 204 + }, + { + "epoch": 0.0374988622917994, + "grad_norm": 17.0, + "learning_rate": 9.998853602035974e-06, + "loss": 1.5055614709854126, + "step": 206 + }, + { + "epoch": 0.03786292891599163, + "grad_norm": 13.1875, + "learning_rate": 9.998816625646646e-06, + "loss": 1.3083559274673462, + "step": 208 + }, + { + "epoch": 0.038226995540183856, + "grad_norm": 7.6875, + "learning_rate": 9.998779062447183e-06, + "loss": 1.0685100555419922, + "step": 210 + }, + { + "epoch": 0.03859106216437608, + "grad_norm": 11.5, + "learning_rate": 9.9987409124431e-06, + "loss": 1.6704621315002441, + "step": 212 + }, + { + "epoch": 0.038955128788568306, + "grad_norm": 14.625, + "learning_rate": 9.998702175639997e-06, + "loss": 0.9525822401046753, + "step": 214 + }, + { + "epoch": 0.039319195412760535, + "grad_norm": 4.25, + "learning_rate": 9.998662852043551e-06, + "loss": 1.2631767988204956, + "step": 216 + }, + { + "epoch": 0.039683262036952764, + "grad_norm": 21.875, + "learning_rate": 9.998622941659538e-06, + "loss": 1.7770329713821411, + "step": 218 + }, + { + "epoch": 0.040047328661144986, + "grad_norm": 8.3125, + "learning_rate": 9.998582444493812e-06, + "loss": 1.6833420991897583, + "step": 220 + }, + { + "epoch": 0.040411395285337215, + "grad_norm": 3.28125, + "learning_rate": 9.998541360552318e-06, + "loss": 1.1433168649673462, + "step": 222 + }, + { + "epoch": 0.040775461909529444, + "grad_norm": 12.8125, + "learning_rate": 9.998499689841084e-06, + "loss": 1.5833598375320435, + "step": 224 + }, + { + "epoch": 0.04113952853372167, + "grad_norm": 38.0, + "learning_rate": 9.998457432366225e-06, + "loss": 1.5597223043441772, + "step": 226 + }, + { + "epoch": 0.0415035951579139, + "grad_norm": 9.25, + "learning_rate": 9.998414588133943e-06, + "loss": 1.6640405654907227, + "step": 228 + }, + { + "epoch": 0.04186766178210612, + "grad_norm": 5.09375, + "learning_rate": 9.998371157150522e-06, + "loss": 1.2285068035125732, + "step": 230 + }, + { + "epoch": 0.04223172840629835, + "grad_norm": 8.625, + "learning_rate": 9.998327139422339e-06, + "loss": 1.5521577596664429, + "step": 232 + }, + { + "epoch": 0.04259579503049058, + "grad_norm": 9.625, + "learning_rate": 9.998282534955851e-06, + "loss": 1.3443025350570679, + "step": 234 + }, + { + "epoch": 0.04295986165468281, + "grad_norm": 21.75, + "learning_rate": 9.998237343757606e-06, + "loss": 2.068732738494873, + "step": 236 + }, + { + "epoch": 0.04332392827887503, + "grad_norm": 5.3125, + "learning_rate": 9.998191565834235e-06, + "loss": 0.8551888465881348, + "step": 238 + }, + { + "epoch": 0.04368799490306726, + "grad_norm": 14.25, + "learning_rate": 9.998145201192453e-06, + "loss": 1.6422497034072876, + "step": 240 + }, + { + "epoch": 0.04405206152725949, + "grad_norm": 6.25, + "learning_rate": 9.998098249839067e-06, + "loss": 0.9000154733657837, + "step": 242 + }, + { + "epoch": 0.04441612815145172, + "grad_norm": 26.125, + "learning_rate": 9.998050711780964e-06, + "loss": 1.3616223335266113, + "step": 244 + }, + { + "epoch": 0.04478019477564394, + "grad_norm": 24.5, + "learning_rate": 9.998002587025124e-06, + "loss": 1.8066420555114746, + "step": 246 + }, + { + "epoch": 0.04514426139983617, + "grad_norm": 12.875, + "learning_rate": 9.997953875578608e-06, + "loss": 1.7318825721740723, + "step": 248 + }, + { + "epoch": 0.0455083280240284, + "grad_norm": 11.6875, + "learning_rate": 9.997904577448561e-06, + "loss": 1.0848517417907715, + "step": 250 + }, + { + "epoch": 0.04587239464822063, + "grad_norm": 9.8125, + "learning_rate": 9.99785469264222e-06, + "loss": 1.163558006286621, + "step": 252 + }, + { + "epoch": 0.04623646127241285, + "grad_norm": 12.4375, + "learning_rate": 9.997804221166903e-06, + "loss": 2.1979146003723145, + "step": 254 + }, + { + "epoch": 0.04660052789660508, + "grad_norm": 17.75, + "learning_rate": 9.99775316303002e-06, + "loss": 1.7131332159042358, + "step": 256 + }, + { + "epoch": 0.046964594520797306, + "grad_norm": 16.625, + "learning_rate": 9.99770151823906e-06, + "loss": 0.8940047025680542, + "step": 258 + }, + { + "epoch": 0.047328661144989535, + "grad_norm": 10.3125, + "learning_rate": 9.997649286801605e-06, + "loss": 1.5393319129943848, + "step": 260 + }, + { + "epoch": 0.04769272776918176, + "grad_norm": 10.0625, + "learning_rate": 9.997596468725319e-06, + "loss": 2.0852997303009033, + "step": 262 + }, + { + "epoch": 0.048056794393373986, + "grad_norm": 2.578125, + "learning_rate": 9.997543064017949e-06, + "loss": 1.1634950637817383, + "step": 264 + }, + { + "epoch": 0.048420861017566215, + "grad_norm": 41.25, + "learning_rate": 9.997489072687338e-06, + "loss": 0.6052025556564331, + "step": 266 + }, + { + "epoch": 0.048784927641758444, + "grad_norm": 10.5, + "learning_rate": 9.997434494741406e-06, + "loss": 1.594040036201477, + "step": 268 + }, + { + "epoch": 0.049148994265950666, + "grad_norm": 9.875, + "learning_rate": 9.997379330188159e-06, + "loss": 0.9371297359466553, + "step": 270 + }, + { + "epoch": 0.049513060890142895, + "grad_norm": 9.5625, + "learning_rate": 9.997323579035698e-06, + "loss": 1.5109306573867798, + "step": 272 + }, + { + "epoch": 0.04987712751433512, + "grad_norm": 5.6875, + "learning_rate": 9.997267241292202e-06, + "loss": 0.7313521504402161, + "step": 274 + }, + { + "epoch": 0.05024119413852735, + "grad_norm": 2.390625, + "learning_rate": 9.997210316965935e-06, + "loss": 0.7774074077606201, + "step": 276 + }, + { + "epoch": 0.05060526076271958, + "grad_norm": 5.84375, + "learning_rate": 9.997152806065255e-06, + "loss": 1.225516676902771, + "step": 278 + }, + { + "epoch": 0.0509693273869118, + "grad_norm": 23.25, + "learning_rate": 9.9970947085986e-06, + "loss": 0.763280987739563, + "step": 280 + }, + { + "epoch": 0.05133339401110403, + "grad_norm": 50.0, + "learning_rate": 9.997036024574495e-06, + "loss": 1.6328344345092773, + "step": 282 + }, + { + "epoch": 0.05169746063529626, + "grad_norm": 22.875, + "learning_rate": 9.996976754001552e-06, + "loss": 1.7208383083343506, + "step": 284 + }, + { + "epoch": 0.05206152725948849, + "grad_norm": 17.375, + "learning_rate": 9.99691689688847e-06, + "loss": 1.9400101900100708, + "step": 286 + }, + { + "epoch": 0.05242559388368071, + "grad_norm": 3.328125, + "learning_rate": 9.996856453244029e-06, + "loss": 1.0347318649291992, + "step": 288 + }, + { + "epoch": 0.05278966050787294, + "grad_norm": 53.75, + "learning_rate": 9.996795423077101e-06, + "loss": 2.3298144340515137, + "step": 290 + }, + { + "epoch": 0.05315372713206517, + "grad_norm": 26.125, + "learning_rate": 9.996733806396646e-06, + "loss": 2.0908212661743164, + "step": 292 + }, + { + "epoch": 0.0535177937562574, + "grad_norm": 23.375, + "learning_rate": 9.996671603211699e-06, + "loss": 1.9290943145751953, + "step": 294 + }, + { + "epoch": 0.05388186038044962, + "grad_norm": 6.84375, + "learning_rate": 9.996608813531392e-06, + "loss": 1.1384938955307007, + "step": 296 + }, + { + "epoch": 0.05424592700464185, + "grad_norm": 9.25, + "learning_rate": 9.99654543736494e-06, + "loss": 1.6162941455841064, + "step": 298 + }, + { + "epoch": 0.05460999362883408, + "grad_norm": 11.8125, + "learning_rate": 9.996481474721638e-06, + "loss": 1.863888144493103, + "step": 300 + }, + { + "epoch": 0.054974060253026306, + "grad_norm": 4.15625, + "learning_rate": 9.99641692561088e-06, + "loss": 1.0767062902450562, + "step": 302 + }, + { + "epoch": 0.05533812687721853, + "grad_norm": 3.78125, + "learning_rate": 9.996351790042132e-06, + "loss": 1.0521377325057983, + "step": 304 + }, + { + "epoch": 0.05570219350141076, + "grad_norm": 18.75, + "learning_rate": 9.996286068024956e-06, + "loss": 1.1314442157745361, + "step": 306 + }, + { + "epoch": 0.056066260125602986, + "grad_norm": 37.5, + "learning_rate": 9.996219759568992e-06, + "loss": 1.8040153980255127, + "step": 308 + }, + { + "epoch": 0.056430326749795215, + "grad_norm": 8.375, + "learning_rate": 9.996152864683977e-06, + "loss": 1.5515364408493042, + "step": 310 + }, + { + "epoch": 0.05679439337398744, + "grad_norm": 36.75, + "learning_rate": 9.996085383379724e-06, + "loss": 0.8420185446739197, + "step": 312 + }, + { + "epoch": 0.057158459998179666, + "grad_norm": 18.625, + "learning_rate": 9.996017315666134e-06, + "loss": 1.5096814632415771, + "step": 314 + }, + { + "epoch": 0.057522526622371895, + "grad_norm": 19.0, + "learning_rate": 9.995948661553196e-06, + "loss": 1.18393874168396, + "step": 316 + }, + { + "epoch": 0.05788659324656412, + "grad_norm": 26.375, + "learning_rate": 9.995879421050989e-06, + "loss": 2.3062312602996826, + "step": 318 + }, + { + "epoch": 0.058250659870756345, + "grad_norm": 11.8125, + "learning_rate": 9.99580959416967e-06, + "loss": 1.6267517805099487, + "step": 320 + }, + { + "epoch": 0.058614726494948574, + "grad_norm": 31.625, + "learning_rate": 9.995739180919487e-06, + "loss": 0.3946562707424164, + "step": 322 + }, + { + "epoch": 0.0589787931191408, + "grad_norm": 26.5, + "learning_rate": 9.99566818131077e-06, + "loss": 1.8692868947982788, + "step": 324 + }, + { + "epoch": 0.05934285974333303, + "grad_norm": 55.25, + "learning_rate": 9.995596595353943e-06, + "loss": 1.4824076890945435, + "step": 326 + }, + { + "epoch": 0.059706926367525254, + "grad_norm": 60.5, + "learning_rate": 9.995524423059508e-06, + "loss": 1.873770833015442, + "step": 328 + }, + { + "epoch": 0.06007099299171748, + "grad_norm": 22.875, + "learning_rate": 9.995451664438057e-06, + "loss": 2.0384387969970703, + "step": 330 + }, + { + "epoch": 0.06043505961590971, + "grad_norm": 11.875, + "learning_rate": 9.995378319500264e-06, + "loss": 1.7458500862121582, + "step": 332 + }, + { + "epoch": 0.06079912624010194, + "grad_norm": 18.375, + "learning_rate": 9.995304388256898e-06, + "loss": 1.488701343536377, + "step": 334 + }, + { + "epoch": 0.06116319286429417, + "grad_norm": 30.625, + "learning_rate": 9.9952298707188e-06, + "loss": 1.4240211248397827, + "step": 336 + }, + { + "epoch": 0.06152725948848639, + "grad_norm": 28.0, + "learning_rate": 9.995154766896913e-06, + "loss": 1.5667370557785034, + "step": 338 + }, + { + "epoch": 0.06189132611267862, + "grad_norm": 14.5, + "learning_rate": 9.995079076802254e-06, + "loss": 1.5257785320281982, + "step": 340 + }, + { + "epoch": 0.06225539273687085, + "grad_norm": 5.6875, + "learning_rate": 9.995002800445932e-06, + "loss": 1.3201079368591309, + "step": 342 + }, + { + "epoch": 0.06261945936106307, + "grad_norm": 15.0, + "learning_rate": 9.99492593783914e-06, + "loss": 1.531526803970337, + "step": 344 + }, + { + "epoch": 0.0629835259852553, + "grad_norm": 12.5, + "learning_rate": 9.994848488993155e-06, + "loss": 1.606310486793518, + "step": 346 + }, + { + "epoch": 0.06334759260944753, + "grad_norm": 22.5, + "learning_rate": 9.994770453919343e-06, + "loss": 1.9906249046325684, + "step": 348 + }, + { + "epoch": 0.06371165923363975, + "grad_norm": 14.75, + "learning_rate": 9.994691832629157e-06, + "loss": 1.6271324157714844, + "step": 350 + }, + { + "epoch": 0.06407572585783199, + "grad_norm": 22.0, + "learning_rate": 9.994612625134134e-06, + "loss": 1.7152941226959229, + "step": 352 + }, + { + "epoch": 0.06443979248202421, + "grad_norm": 6.21875, + "learning_rate": 9.994532831445898e-06, + "loss": 1.098881721496582, + "step": 354 + }, + { + "epoch": 0.06480385910621644, + "grad_norm": 9.25, + "learning_rate": 9.994452451576155e-06, + "loss": 1.5961560010910034, + "step": 356 + }, + { + "epoch": 0.06516792573040867, + "grad_norm": 14.125, + "learning_rate": 9.994371485536705e-06, + "loss": 1.7563493251800537, + "step": 358 + }, + { + "epoch": 0.06553199235460089, + "grad_norm": 10.6875, + "learning_rate": 9.994289933339426e-06, + "loss": 1.4499320983886719, + "step": 360 + }, + { + "epoch": 0.06589605897879312, + "grad_norm": 8.8125, + "learning_rate": 9.994207794996289e-06, + "loss": 1.6752084493637085, + "step": 362 + }, + { + "epoch": 0.06626012560298535, + "grad_norm": 6.0625, + "learning_rate": 9.994125070519343e-06, + "loss": 1.204414963722229, + "step": 364 + }, + { + "epoch": 0.06662419222717757, + "grad_norm": 9.375, + "learning_rate": 9.99404175992073e-06, + "loss": 1.204558253288269, + "step": 366 + }, + { + "epoch": 0.0669882588513698, + "grad_norm": 11.1875, + "learning_rate": 9.993957863212676e-06, + "loss": 1.3985226154327393, + "step": 368 + }, + { + "epoch": 0.06735232547556202, + "grad_norm": 12.875, + "learning_rate": 9.993873380407491e-06, + "loss": 0.8727314472198486, + "step": 370 + }, + { + "epoch": 0.06771639209975426, + "grad_norm": 11.3125, + "learning_rate": 9.993788311517574e-06, + "loss": 1.6418206691741943, + "step": 372 + }, + { + "epoch": 0.06808045872394648, + "grad_norm": 17.875, + "learning_rate": 9.993702656555406e-06, + "loss": 1.4980840682983398, + "step": 374 + }, + { + "epoch": 0.0684445253481387, + "grad_norm": 9.0625, + "learning_rate": 9.99361641553356e-06, + "loss": 1.5730842351913452, + "step": 376 + }, + { + "epoch": 0.06880859197233094, + "grad_norm": 9.1875, + "learning_rate": 9.993529588464688e-06, + "loss": 1.6968036890029907, + "step": 378 + }, + { + "epoch": 0.06917265859652316, + "grad_norm": 14.375, + "learning_rate": 9.993442175361534e-06, + "loss": 1.8771188259124756, + "step": 380 + }, + { + "epoch": 0.06953672522071538, + "grad_norm": 12.5, + "learning_rate": 9.993354176236925e-06, + "loss": 1.5471888780593872, + "step": 382 + }, + { + "epoch": 0.06990079184490762, + "grad_norm": 15.625, + "learning_rate": 9.993265591103774e-06, + "loss": 1.749687910079956, + "step": 384 + }, + { + "epoch": 0.07026485846909984, + "grad_norm": 4.6875, + "learning_rate": 9.993176419975082e-06, + "loss": 1.1482858657836914, + "step": 386 + }, + { + "epoch": 0.07062892509329208, + "grad_norm": 44.75, + "learning_rate": 9.993086662863931e-06, + "loss": 0.842327356338501, + "step": 388 + }, + { + "epoch": 0.0709929917174843, + "grad_norm": 11.625, + "learning_rate": 9.992996319783496e-06, + "loss": 1.451005458831787, + "step": 390 + }, + { + "epoch": 0.07135705834167652, + "grad_norm": 10.75, + "learning_rate": 9.992905390747035e-06, + "loss": 1.5364468097686768, + "step": 392 + }, + { + "epoch": 0.07172112496586876, + "grad_norm": 17.0, + "learning_rate": 9.992813875767889e-06, + "loss": 1.2617599964141846, + "step": 394 + }, + { + "epoch": 0.07208519159006098, + "grad_norm": 10.4375, + "learning_rate": 9.992721774859487e-06, + "loss": 1.686112403869629, + "step": 396 + }, + { + "epoch": 0.07244925821425321, + "grad_norm": 10.0, + "learning_rate": 9.992629088035344e-06, + "loss": 1.6116350889205933, + "step": 398 + }, + { + "epoch": 0.07281332483844544, + "grad_norm": 7.65625, + "learning_rate": 9.992535815309065e-06, + "loss": 0.8564869165420532, + "step": 400 + }, + { + "epoch": 0.07317739146263766, + "grad_norm": 22.25, + "learning_rate": 9.992441956694337e-06, + "loss": 2.0098888874053955, + "step": 402 + }, + { + "epoch": 0.0735414580868299, + "grad_norm": 10.1875, + "learning_rate": 9.992347512204929e-06, + "loss": 1.6706129312515259, + "step": 404 + }, + { + "epoch": 0.07390552471102212, + "grad_norm": 90.0, + "learning_rate": 9.992252481854705e-06, + "loss": 1.5262184143066406, + "step": 406 + }, + { + "epoch": 0.07426959133521434, + "grad_norm": 8.125, + "learning_rate": 9.992156865657608e-06, + "loss": 1.7168821096420288, + "step": 408 + }, + { + "epoch": 0.07463365795940657, + "grad_norm": 12.1875, + "learning_rate": 9.992060663627669e-06, + "loss": 1.509940505027771, + "step": 410 + }, + { + "epoch": 0.0749977245835988, + "grad_norm": 10.5, + "learning_rate": 9.991963875779007e-06, + "loss": 1.7270442247390747, + "step": 412 + }, + { + "epoch": 0.07536179120779103, + "grad_norm": 6.1875, + "learning_rate": 9.991866502125822e-06, + "loss": 1.1227549314498901, + "step": 414 + }, + { + "epoch": 0.07572585783198325, + "grad_norm": 37.5, + "learning_rate": 9.991768542682409e-06, + "loss": 1.3501100540161133, + "step": 416 + }, + { + "epoch": 0.07608992445617548, + "grad_norm": 26.375, + "learning_rate": 9.991669997463139e-06, + "loss": 2.197963237762451, + "step": 418 + }, + { + "epoch": 0.07645399108036771, + "grad_norm": 9.75, + "learning_rate": 9.991570866482471e-06, + "loss": 1.4900336265563965, + "step": 420 + }, + { + "epoch": 0.07681805770455993, + "grad_norm": 7.15625, + "learning_rate": 9.991471149754957e-06, + "loss": 1.4062421321868896, + "step": 422 + }, + { + "epoch": 0.07718212432875216, + "grad_norm": 8.375, + "learning_rate": 9.991370847295228e-06, + "loss": 1.5279009342193604, + "step": 424 + }, + { + "epoch": 0.07754619095294439, + "grad_norm": 18.375, + "learning_rate": 9.991269959118002e-06, + "loss": 1.131893515586853, + "step": 426 + }, + { + "epoch": 0.07791025757713661, + "grad_norm": 15.625, + "learning_rate": 9.991168485238083e-06, + "loss": 1.4996836185455322, + "step": 428 + }, + { + "epoch": 0.07827432420132885, + "grad_norm": 15.0625, + "learning_rate": 9.991066425670365e-06, + "loss": 1.921349048614502, + "step": 430 + }, + { + "epoch": 0.07863839082552107, + "grad_norm": 9.0, + "learning_rate": 9.990963780429824e-06, + "loss": 1.4701411724090576, + "step": 432 + }, + { + "epoch": 0.07900245744971329, + "grad_norm": 9.5625, + "learning_rate": 9.990860549531522e-06, + "loss": 1.564692735671997, + "step": 434 + }, + { + "epoch": 0.07936652407390553, + "grad_norm": 12.625, + "learning_rate": 9.990756732990607e-06, + "loss": 1.6308060884475708, + "step": 436 + }, + { + "epoch": 0.07973059069809775, + "grad_norm": 9.125, + "learning_rate": 9.990652330822315e-06, + "loss": 1.5474727153778076, + "step": 438 + }, + { + "epoch": 0.08009465732228997, + "grad_norm": 22.0, + "learning_rate": 9.990547343041968e-06, + "loss": 1.6923563480377197, + "step": 440 + }, + { + "epoch": 0.08045872394648221, + "grad_norm": 3.25, + "learning_rate": 9.990441769664969e-06, + "loss": 1.285474419593811, + "step": 442 + }, + { + "epoch": 0.08082279057067443, + "grad_norm": 25.75, + "learning_rate": 9.990335610706812e-06, + "loss": 2.203094005584717, + "step": 444 + }, + { + "epoch": 0.08118685719486667, + "grad_norm": 6.21875, + "learning_rate": 9.990228866183076e-06, + "loss": 1.5876837968826294, + "step": 446 + }, + { + "epoch": 0.08155092381905889, + "grad_norm": 16.125, + "learning_rate": 9.990121536109423e-06, + "loss": 0.9697806239128113, + "step": 448 + }, + { + "epoch": 0.08191499044325111, + "grad_norm": 15.0, + "learning_rate": 9.990013620501609e-06, + "loss": 1.0794289112091064, + "step": 450 + }, + { + "epoch": 0.08227905706744335, + "grad_norm": 10.0, + "learning_rate": 9.989905119375463e-06, + "loss": 1.4529541730880737, + "step": 452 + }, + { + "epoch": 0.08264312369163557, + "grad_norm": 53.75, + "learning_rate": 9.98979603274691e-06, + "loss": 1.9725878238677979, + "step": 454 + }, + { + "epoch": 0.0830071903158278, + "grad_norm": 12.375, + "learning_rate": 9.98968636063196e-06, + "loss": 1.691248893737793, + "step": 456 + }, + { + "epoch": 0.08337125694002002, + "grad_norm": 15.0625, + "learning_rate": 9.989576103046706e-06, + "loss": 0.7653573751449585, + "step": 458 + }, + { + "epoch": 0.08373532356421225, + "grad_norm": 7.96875, + "learning_rate": 9.989465260007326e-06, + "loss": 1.3986103534698486, + "step": 460 + }, + { + "epoch": 0.08409939018840448, + "grad_norm": 32.5, + "learning_rate": 9.989353831530089e-06, + "loss": 2.173577070236206, + "step": 462 + }, + { + "epoch": 0.0844634568125967, + "grad_norm": 9.125, + "learning_rate": 9.989241817631344e-06, + "loss": 1.509488821029663, + "step": 464 + }, + { + "epoch": 0.08482752343678893, + "grad_norm": 27.25, + "learning_rate": 9.989129218327526e-06, + "loss": 2.010396957397461, + "step": 466 + }, + { + "epoch": 0.08519159006098116, + "grad_norm": 8.375, + "learning_rate": 9.989016033635164e-06, + "loss": 1.5934710502624512, + "step": 468 + }, + { + "epoch": 0.08555565668517338, + "grad_norm": 9.625, + "learning_rate": 9.988902263570865e-06, + "loss": 1.3932366371154785, + "step": 470 + }, + { + "epoch": 0.08591972330936562, + "grad_norm": 35.25, + "learning_rate": 9.988787908151326e-06, + "loss": 0.9453686475753784, + "step": 472 + }, + { + "epoch": 0.08628378993355784, + "grad_norm": 14.625, + "learning_rate": 9.988672967393325e-06, + "loss": 1.237920880317688, + "step": 474 + }, + { + "epoch": 0.08664785655775006, + "grad_norm": 14.1875, + "learning_rate": 9.98855744131373e-06, + "loss": 0.9060868620872498, + "step": 476 + }, + { + "epoch": 0.0870119231819423, + "grad_norm": 14.0625, + "learning_rate": 9.988441329929497e-06, + "loss": 1.101029872894287, + "step": 478 + }, + { + "epoch": 0.08737598980613452, + "grad_norm": 10.875, + "learning_rate": 9.98832463325766e-06, + "loss": 1.4590922594070435, + "step": 480 + }, + { + "epoch": 0.08774005643032674, + "grad_norm": 13.25, + "learning_rate": 9.988207351315349e-06, + "loss": 1.5289921760559082, + "step": 482 + }, + { + "epoch": 0.08810412305451898, + "grad_norm": 8.25, + "learning_rate": 9.98808948411977e-06, + "loss": 1.443752646446228, + "step": 484 + }, + { + "epoch": 0.0884681896787112, + "grad_norm": 27.875, + "learning_rate": 9.987971031688221e-06, + "loss": 1.3074227571487427, + "step": 486 + }, + { + "epoch": 0.08883225630290344, + "grad_norm": 13.75, + "learning_rate": 9.987851994038088e-06, + "loss": 1.5410394668579102, + "step": 488 + }, + { + "epoch": 0.08919632292709566, + "grad_norm": 13.4375, + "learning_rate": 9.987732371186834e-06, + "loss": 1.531282901763916, + "step": 490 + }, + { + "epoch": 0.08956038955128788, + "grad_norm": 24.625, + "learning_rate": 9.987612163152014e-06, + "loss": 2.0960986614227295, + "step": 492 + }, + { + "epoch": 0.08992445617548012, + "grad_norm": 19.5, + "learning_rate": 9.987491369951271e-06, + "loss": 1.941217064857483, + "step": 494 + }, + { + "epoch": 0.09028852279967234, + "grad_norm": 21.375, + "learning_rate": 9.987369991602329e-06, + "loss": 2.0323238372802734, + "step": 496 + }, + { + "epoch": 0.09065258942386457, + "grad_norm": 11.5625, + "learning_rate": 9.987248028123003e-06, + "loss": 0.8621390461921692, + "step": 498 + }, + { + "epoch": 0.0910166560480568, + "grad_norm": 18.5, + "learning_rate": 9.987125479531186e-06, + "loss": 1.1050454378128052, + "step": 500 + }, + { + "epoch": 0.09138072267224902, + "grad_norm": 30.5, + "learning_rate": 9.98700234584486e-06, + "loss": 1.3388925790786743, + "step": 502 + }, + { + "epoch": 0.09174478929644125, + "grad_norm": 5.28125, + "learning_rate": 9.986878627082102e-06, + "loss": 0.9674100875854492, + "step": 504 + }, + { + "epoch": 0.09210885592063348, + "grad_norm": 14.3125, + "learning_rate": 9.986754323261061e-06, + "loss": 1.6390407085418701, + "step": 506 + }, + { + "epoch": 0.0924729225448257, + "grad_norm": 20.0, + "learning_rate": 9.98662943439998e-06, + "loss": 0.7929630875587463, + "step": 508 + }, + { + "epoch": 0.09283698916901793, + "grad_norm": 16.75, + "learning_rate": 9.986503960517185e-06, + "loss": 1.8872112035751343, + "step": 510 + }, + { + "epoch": 0.09320105579321016, + "grad_norm": 15.875, + "learning_rate": 9.98637790163109e-06, + "loss": 1.5856091976165771, + "step": 512 + }, + { + "epoch": 0.09356512241740239, + "grad_norm": 12.5625, + "learning_rate": 9.986251257760195e-06, + "loss": 1.5681712627410889, + "step": 514 + }, + { + "epoch": 0.09392918904159461, + "grad_norm": 15.375, + "learning_rate": 9.986124028923083e-06, + "loss": 1.0393840074539185, + "step": 516 + }, + { + "epoch": 0.09429325566578683, + "grad_norm": 11.0625, + "learning_rate": 9.985996215138423e-06, + "loss": 1.6617491245269775, + "step": 518 + }, + { + "epoch": 0.09465732228997907, + "grad_norm": 9.3125, + "learning_rate": 9.98586781642497e-06, + "loss": 1.6009297370910645, + "step": 520 + }, + { + "epoch": 0.09502138891417129, + "grad_norm": 11.75, + "learning_rate": 9.98573883280157e-06, + "loss": 1.736532211303711, + "step": 522 + }, + { + "epoch": 0.09538545553836351, + "grad_norm": 24.875, + "learning_rate": 9.98560926428715e-06, + "loss": 2.1928296089172363, + "step": 524 + }, + { + "epoch": 0.09574952216255575, + "grad_norm": 4.40625, + "learning_rate": 9.985479110900721e-06, + "loss": 1.0732488632202148, + "step": 526 + }, + { + "epoch": 0.09611358878674797, + "grad_norm": 9.0625, + "learning_rate": 9.985348372661388e-06, + "loss": 1.6667879819869995, + "step": 528 + }, + { + "epoch": 0.09647765541094021, + "grad_norm": 13.9375, + "learning_rate": 9.98521704958833e-06, + "loss": 1.4967536926269531, + "step": 530 + }, + { + "epoch": 0.09684172203513243, + "grad_norm": 4.0625, + "learning_rate": 9.98508514170082e-06, + "loss": 1.0845718383789062, + "step": 532 + }, + { + "epoch": 0.09720578865932465, + "grad_norm": 3.265625, + "learning_rate": 9.984952649018215e-06, + "loss": 1.1927682161331177, + "step": 534 + }, + { + "epoch": 0.09756985528351689, + "grad_norm": 11.125, + "learning_rate": 9.98481957155996e-06, + "loss": 1.3530960083007812, + "step": 536 + }, + { + "epoch": 0.09793392190770911, + "grad_norm": 9.3125, + "learning_rate": 9.984685909345582e-06, + "loss": 1.5236634016036987, + "step": 538 + }, + { + "epoch": 0.09829798853190133, + "grad_norm": 5.3125, + "learning_rate": 9.984551662394695e-06, + "loss": 1.1593369245529175, + "step": 540 + }, + { + "epoch": 0.09866205515609357, + "grad_norm": 5.90625, + "learning_rate": 9.984416830727e-06, + "loss": 1.307875156402588, + "step": 542 + }, + { + "epoch": 0.09902612178028579, + "grad_norm": 3.8125, + "learning_rate": 9.98428141436228e-06, + "loss": 1.5498743057250977, + "step": 544 + }, + { + "epoch": 0.09939018840447802, + "grad_norm": 5.125, + "learning_rate": 9.984145413320412e-06, + "loss": 1.4448132514953613, + "step": 546 + }, + { + "epoch": 0.09975425502867025, + "grad_norm": 2.765625, + "learning_rate": 9.984008827621349e-06, + "loss": 1.1951658725738525, + "step": 548 + }, + { + "epoch": 0.10011832165286247, + "grad_norm": 15.375, + "learning_rate": 9.983871657285138e-06, + "loss": 1.38310706615448, + "step": 550 + }, + { + "epoch": 0.1004823882770547, + "grad_norm": 70.5, + "learning_rate": 9.983733902331907e-06, + "loss": 0.7291164994239807, + "step": 552 + }, + { + "epoch": 0.10084645490124693, + "grad_norm": 6.53125, + "learning_rate": 9.98359556278187e-06, + "loss": 1.401672124862671, + "step": 554 + }, + { + "epoch": 0.10121052152543916, + "grad_norm": 9.25, + "learning_rate": 9.983456638655327e-06, + "loss": 1.8654245138168335, + "step": 556 + }, + { + "epoch": 0.10157458814963138, + "grad_norm": 15.0625, + "learning_rate": 9.983317129972667e-06, + "loss": 1.8804476261138916, + "step": 558 + }, + { + "epoch": 0.1019386547738236, + "grad_norm": 10.75, + "learning_rate": 9.98317703675436e-06, + "loss": 1.4194706678390503, + "step": 560 + }, + { + "epoch": 0.10230272139801584, + "grad_norm": 26.75, + "learning_rate": 9.983036359020965e-06, + "loss": 1.7774877548217773, + "step": 562 + }, + { + "epoch": 0.10266678802220806, + "grad_norm": 17.375, + "learning_rate": 9.982895096793128e-06, + "loss": 1.0727057456970215, + "step": 564 + }, + { + "epoch": 0.10303085464640029, + "grad_norm": 16.75, + "learning_rate": 9.982753250091577e-06, + "loss": 1.36929190158844, + "step": 566 + }, + { + "epoch": 0.10339492127059252, + "grad_norm": 10.6875, + "learning_rate": 9.982610818937124e-06, + "loss": 1.3780345916748047, + "step": 568 + }, + { + "epoch": 0.10375898789478474, + "grad_norm": 15.0625, + "learning_rate": 9.982467803350675e-06, + "loss": 0.8375037908554077, + "step": 570 + }, + { + "epoch": 0.10412305451897698, + "grad_norm": 20.5, + "learning_rate": 9.982324203353217e-06, + "loss": 1.4844985008239746, + "step": 572 + }, + { + "epoch": 0.1044871211431692, + "grad_norm": 19.125, + "learning_rate": 9.98218001896582e-06, + "loss": 1.5376529693603516, + "step": 574 + }, + { + "epoch": 0.10485118776736142, + "grad_norm": 18.375, + "learning_rate": 9.982035250209642e-06, + "loss": 1.4483085870742798, + "step": 576 + }, + { + "epoch": 0.10521525439155366, + "grad_norm": 10.5, + "learning_rate": 9.981889897105932e-06, + "loss": 1.7304250001907349, + "step": 578 + }, + { + "epoch": 0.10557932101574588, + "grad_norm": 26.375, + "learning_rate": 9.981743959676016e-06, + "loss": 2.1831817626953125, + "step": 580 + }, + { + "epoch": 0.1059433876399381, + "grad_norm": 10.5, + "learning_rate": 9.981597437941309e-06, + "loss": 1.4532653093338013, + "step": 582 + }, + { + "epoch": 0.10630745426413034, + "grad_norm": 11.0, + "learning_rate": 9.981450331923315e-06, + "loss": 1.6760830879211426, + "step": 584 + }, + { + "epoch": 0.10667152088832256, + "grad_norm": 45.75, + "learning_rate": 9.98130264164362e-06, + "loss": 1.4925655126571655, + "step": 586 + }, + { + "epoch": 0.1070355875125148, + "grad_norm": 11.6875, + "learning_rate": 9.981154367123898e-06, + "loss": 1.2994002103805542, + "step": 588 + }, + { + "epoch": 0.10739965413670702, + "grad_norm": 7.8125, + "learning_rate": 9.981005508385904e-06, + "loss": 1.09633207321167, + "step": 590 + }, + { + "epoch": 0.10776372076089924, + "grad_norm": 14.5, + "learning_rate": 9.980856065451487e-06, + "loss": 1.5942790508270264, + "step": 592 + }, + { + "epoch": 0.10812778738509148, + "grad_norm": 11.5, + "learning_rate": 9.980706038342575e-06, + "loss": 1.3079761266708374, + "step": 594 + }, + { + "epoch": 0.1084918540092837, + "grad_norm": 27.625, + "learning_rate": 9.980555427081187e-06, + "loss": 1.7374396324157715, + "step": 596 + }, + { + "epoch": 0.10885592063347592, + "grad_norm": 22.125, + "learning_rate": 9.980404231689418e-06, + "loss": 1.8327726125717163, + "step": 598 + }, + { + "epoch": 0.10921998725766816, + "grad_norm": 5.21875, + "learning_rate": 9.98025245218946e-06, + "loss": 1.228814959526062, + "step": 600 + }, + { + "epoch": 0.10958405388186038, + "grad_norm": 8.9375, + "learning_rate": 9.980100088603588e-06, + "loss": 1.3061808347702026, + "step": 602 + }, + { + "epoch": 0.10994812050605261, + "grad_norm": 8.1875, + "learning_rate": 9.979947140954156e-06, + "loss": 1.4228764772415161, + "step": 604 + }, + { + "epoch": 0.11031218713024483, + "grad_norm": 7.875, + "learning_rate": 9.979793609263609e-06, + "loss": 1.4661059379577637, + "step": 606 + }, + { + "epoch": 0.11067625375443706, + "grad_norm": 11.25, + "learning_rate": 9.97963949355448e-06, + "loss": 1.5948870182037354, + "step": 608 + }, + { + "epoch": 0.11104032037862929, + "grad_norm": 46.5, + "learning_rate": 9.979484793849383e-06, + "loss": 1.5271118879318237, + "step": 610 + }, + { + "epoch": 0.11140438700282151, + "grad_norm": 7.9375, + "learning_rate": 9.979329510171021e-06, + "loss": 1.2037501335144043, + "step": 612 + }, + { + "epoch": 0.11176845362701375, + "grad_norm": 6.53125, + "learning_rate": 9.979173642542179e-06, + "loss": 1.2872017621994019, + "step": 614 + }, + { + "epoch": 0.11213252025120597, + "grad_norm": 7.0, + "learning_rate": 9.979017190985732e-06, + "loss": 1.5186052322387695, + "step": 616 + }, + { + "epoch": 0.1124965868753982, + "grad_norm": 3.796875, + "learning_rate": 9.978860155524637e-06, + "loss": 1.2106845378875732, + "step": 618 + }, + { + "epoch": 0.11286065349959043, + "grad_norm": 25.875, + "learning_rate": 9.978702536181939e-06, + "loss": 1.4363855123519897, + "step": 620 + }, + { + "epoch": 0.11322472012378265, + "grad_norm": 22.875, + "learning_rate": 9.978544332980769e-06, + "loss": 1.5833772420883179, + "step": 622 + }, + { + "epoch": 0.11358878674797487, + "grad_norm": 29.75, + "learning_rate": 9.97838554594434e-06, + "loss": 1.4303171634674072, + "step": 624 + }, + { + "epoch": 0.11395285337216711, + "grad_norm": 23.0, + "learning_rate": 9.978226175095957e-06, + "loss": 1.8701369762420654, + "step": 626 + }, + { + "epoch": 0.11431691999635933, + "grad_norm": 31.5, + "learning_rate": 9.978066220459004e-06, + "loss": 1.5721532106399536, + "step": 628 + }, + { + "epoch": 0.11468098662055157, + "grad_norm": 11.375, + "learning_rate": 9.977905682056957e-06, + "loss": 1.6384358406066895, + "step": 630 + }, + { + "epoch": 0.11504505324474379, + "grad_norm": 59.0, + "learning_rate": 9.977744559913369e-06, + "loss": 1.4425804615020752, + "step": 632 + }, + { + "epoch": 0.11540911986893601, + "grad_norm": 21.25, + "learning_rate": 9.977582854051887e-06, + "loss": 1.9026998281478882, + "step": 634 + }, + { + "epoch": 0.11577318649312825, + "grad_norm": 8.625, + "learning_rate": 9.977420564496244e-06, + "loss": 1.4961724281311035, + "step": 636 + }, + { + "epoch": 0.11613725311732047, + "grad_norm": 14.25, + "learning_rate": 9.97725769127025e-06, + "loss": 1.5527920722961426, + "step": 638 + }, + { + "epoch": 0.11650131974151269, + "grad_norm": 12.1875, + "learning_rate": 9.977094234397811e-06, + "loss": 1.6305630207061768, + "step": 640 + }, + { + "epoch": 0.11686538636570493, + "grad_norm": 12.5625, + "learning_rate": 9.976930193902909e-06, + "loss": 1.5491584539413452, + "step": 642 + }, + { + "epoch": 0.11722945298989715, + "grad_norm": 14.0, + "learning_rate": 9.97676556980962e-06, + "loss": 1.3492447137832642, + "step": 644 + }, + { + "epoch": 0.11759351961408938, + "grad_norm": 15.375, + "learning_rate": 9.976600362142095e-06, + "loss": 1.4404855966567993, + "step": 646 + }, + { + "epoch": 0.1179575862382816, + "grad_norm": 13.3125, + "learning_rate": 9.976434570924587e-06, + "loss": 1.5379244089126587, + "step": 648 + }, + { + "epoch": 0.11832165286247383, + "grad_norm": 5.6875, + "learning_rate": 9.976268196181418e-06, + "loss": 0.9439575672149658, + "step": 650 + }, + { + "epoch": 0.11868571948666606, + "grad_norm": 7.8125, + "learning_rate": 9.976101237937008e-06, + "loss": 1.5202707052230835, + "step": 652 + }, + { + "epoch": 0.11904978611085829, + "grad_norm": 20.125, + "learning_rate": 9.975933696215854e-06, + "loss": 2.09470796585083, + "step": 654 + }, + { + "epoch": 0.11941385273505051, + "grad_norm": 7.625, + "learning_rate": 9.975765571042543e-06, + "loss": 0.9947249889373779, + "step": 656 + }, + { + "epoch": 0.11977791935924274, + "grad_norm": 9.3125, + "learning_rate": 9.975596862441748e-06, + "loss": 0.9893943071365356, + "step": 658 + }, + { + "epoch": 0.12014198598343497, + "grad_norm": 6.875, + "learning_rate": 9.975427570438223e-06, + "loss": 0.8843866586685181, + "step": 660 + }, + { + "epoch": 0.1205060526076272, + "grad_norm": 3.265625, + "learning_rate": 9.975257695056815e-06, + "loss": 0.9782657027244568, + "step": 662 + }, + { + "epoch": 0.12087011923181942, + "grad_norm": 25.625, + "learning_rate": 9.97508723632245e-06, + "loss": 1.4665272235870361, + "step": 664 + }, + { + "epoch": 0.12123418585601164, + "grad_norm": 13.5625, + "learning_rate": 9.974916194260143e-06, + "loss": 1.4590774774551392, + "step": 666 + }, + { + "epoch": 0.12159825248020388, + "grad_norm": 13.625, + "learning_rate": 9.974744568894991e-06, + "loss": 1.5867903232574463, + "step": 668 + }, + { + "epoch": 0.1219623191043961, + "grad_norm": 14.5, + "learning_rate": 9.974572360252185e-06, + "loss": 1.555790901184082, + "step": 670 + }, + { + "epoch": 0.12232638572858834, + "grad_norm": 30.375, + "learning_rate": 9.974399568356991e-06, + "loss": 2.1648361682891846, + "step": 672 + }, + { + "epoch": 0.12269045235278056, + "grad_norm": 16.25, + "learning_rate": 9.974226193234768e-06, + "loss": 1.1645123958587646, + "step": 674 + }, + { + "epoch": 0.12305451897697278, + "grad_norm": 11.5625, + "learning_rate": 9.974052234910958e-06, + "loss": 1.2378907203674316, + "step": 676 + }, + { + "epoch": 0.12341858560116502, + "grad_norm": 9.4375, + "learning_rate": 9.973877693411088e-06, + "loss": 1.3132832050323486, + "step": 678 + }, + { + "epoch": 0.12378265222535724, + "grad_norm": 11.5625, + "learning_rate": 9.973702568760768e-06, + "loss": 1.4829702377319336, + "step": 680 + }, + { + "epoch": 0.12414671884954946, + "grad_norm": 7.3125, + "learning_rate": 9.973526860985702e-06, + "loss": 1.3518272638320923, + "step": 682 + }, + { + "epoch": 0.1245107854737417, + "grad_norm": 8.0, + "learning_rate": 9.973350570111673e-06, + "loss": 0.964803159236908, + "step": 684 + }, + { + "epoch": 0.12487485209793392, + "grad_norm": 7.8125, + "learning_rate": 9.973173696164549e-06, + "loss": 1.5220879316329956, + "step": 686 + }, + { + "epoch": 0.12523891872212614, + "grad_norm": 7.09375, + "learning_rate": 9.97299623917029e-06, + "loss": 1.4920923709869385, + "step": 688 + }, + { + "epoch": 0.12560298534631836, + "grad_norm": 7.3125, + "learning_rate": 9.97281819915493e-06, + "loss": 1.5284004211425781, + "step": 690 + }, + { + "epoch": 0.1259670519705106, + "grad_norm": 7.3125, + "learning_rate": 9.972639576144603e-06, + "loss": 1.1986876726150513, + "step": 692 + }, + { + "epoch": 0.12633111859470283, + "grad_norm": 4.90625, + "learning_rate": 9.972460370165516e-06, + "loss": 1.2006864547729492, + "step": 694 + }, + { + "epoch": 0.12669518521889506, + "grad_norm": 17.125, + "learning_rate": 9.97228058124397e-06, + "loss": 1.4673051834106445, + "step": 696 + }, + { + "epoch": 0.12705925184308728, + "grad_norm": 11.5, + "learning_rate": 9.972100209406345e-06, + "loss": 1.475710153579712, + "step": 698 + }, + { + "epoch": 0.1274233184672795, + "grad_norm": 31.125, + "learning_rate": 9.971919254679113e-06, + "loss": 2.0274956226348877, + "step": 700 + }, + { + "epoch": 0.12778738509147175, + "grad_norm": 14.875, + "learning_rate": 9.971737717088826e-06, + "loss": 1.4405767917633057, + "step": 702 + }, + { + "epoch": 0.12815145171566397, + "grad_norm": 14.0, + "learning_rate": 9.971555596662125e-06, + "loss": 1.1153233051300049, + "step": 704 + }, + { + "epoch": 0.1285155183398562, + "grad_norm": 9.875, + "learning_rate": 9.971372893425739e-06, + "loss": 1.256136417388916, + "step": 706 + }, + { + "epoch": 0.12887958496404842, + "grad_norm": 15.9375, + "learning_rate": 9.971189607406473e-06, + "loss": 1.79349684715271, + "step": 708 + }, + { + "epoch": 0.12924365158824064, + "grad_norm": 4.34375, + "learning_rate": 9.971005738631226e-06, + "loss": 0.9817097783088684, + "step": 710 + }, + { + "epoch": 0.1296077182124329, + "grad_norm": 8.0625, + "learning_rate": 9.97082128712698e-06, + "loss": 1.4649547338485718, + "step": 712 + }, + { + "epoch": 0.1299717848366251, + "grad_norm": 17.125, + "learning_rate": 9.970636252920802e-06, + "loss": 0.6673729419708252, + "step": 714 + }, + { + "epoch": 0.13033585146081733, + "grad_norm": 6.9375, + "learning_rate": 9.970450636039846e-06, + "loss": 1.397263765335083, + "step": 716 + }, + { + "epoch": 0.13069991808500955, + "grad_norm": 23.75, + "learning_rate": 9.97026443651135e-06, + "loss": 1.0178101062774658, + "step": 718 + }, + { + "epoch": 0.13106398470920178, + "grad_norm": 9.5, + "learning_rate": 9.970077654362637e-06, + "loss": 1.8943710327148438, + "step": 720 + }, + { + "epoch": 0.13142805133339402, + "grad_norm": 41.75, + "learning_rate": 9.969890289621117e-06, + "loss": 1.5627148151397705, + "step": 722 + }, + { + "epoch": 0.13179211795758625, + "grad_norm": 9.625, + "learning_rate": 9.969702342314289e-06, + "loss": 1.4578361511230469, + "step": 724 + }, + { + "epoch": 0.13215618458177847, + "grad_norm": 11.0625, + "learning_rate": 9.969513812469726e-06, + "loss": 1.7569128274917603, + "step": 726 + }, + { + "epoch": 0.1325202512059707, + "grad_norm": 24.375, + "learning_rate": 9.969324700115101e-06, + "loss": 1.8174906969070435, + "step": 728 + }, + { + "epoch": 0.1328843178301629, + "grad_norm": 6.21875, + "learning_rate": 9.969135005278164e-06, + "loss": 0.8841165900230408, + "step": 730 + }, + { + "epoch": 0.13324838445435513, + "grad_norm": 3.921875, + "learning_rate": 9.968944727986746e-06, + "loss": 1.0754631757736206, + "step": 732 + }, + { + "epoch": 0.13361245107854738, + "grad_norm": 21.25, + "learning_rate": 9.968753868268776e-06, + "loss": 1.3698331117630005, + "step": 734 + }, + { + "epoch": 0.1339765177027396, + "grad_norm": 6.28125, + "learning_rate": 9.96856242615226e-06, + "loss": 1.5272889137268066, + "step": 736 + }, + { + "epoch": 0.13434058432693183, + "grad_norm": 18.5, + "learning_rate": 9.96837040166529e-06, + "loss": 1.4165997505187988, + "step": 738 + }, + { + "epoch": 0.13470465095112405, + "grad_norm": 20.5, + "learning_rate": 9.968177794836047e-06, + "loss": 2.025388240814209, + "step": 740 + }, + { + "epoch": 0.13506871757531627, + "grad_norm": 8.375, + "learning_rate": 9.967984605692796e-06, + "loss": 1.4579286575317383, + "step": 742 + }, + { + "epoch": 0.13543278419950852, + "grad_norm": 6.5625, + "learning_rate": 9.967790834263882e-06, + "loss": 1.4546328783035278, + "step": 744 + }, + { + "epoch": 0.13579685082370074, + "grad_norm": 20.5, + "learning_rate": 9.967596480577744e-06, + "loss": 0.7280870079994202, + "step": 746 + }, + { + "epoch": 0.13616091744789297, + "grad_norm": 20.75, + "learning_rate": 9.967401544662902e-06, + "loss": 1.4604501724243164, + "step": 748 + }, + { + "epoch": 0.1365249840720852, + "grad_norm": 4.0, + "learning_rate": 9.967206026547962e-06, + "loss": 1.1921180486679077, + "step": 750 + }, + { + "epoch": 0.1368890506962774, + "grad_norm": 11.9375, + "learning_rate": 9.967009926261615e-06, + "loss": 1.5692627429962158, + "step": 752 + }, + { + "epoch": 0.13725311732046966, + "grad_norm": 13.5, + "learning_rate": 9.966813243832638e-06, + "loss": 1.506689429283142, + "step": 754 + }, + { + "epoch": 0.13761718394466188, + "grad_norm": 14.6875, + "learning_rate": 9.966615979289893e-06, + "loss": 1.4427322149276733, + "step": 756 + }, + { + "epoch": 0.1379812505688541, + "grad_norm": 4.09375, + "learning_rate": 9.96641813266233e-06, + "loss": 1.2432501316070557, + "step": 758 + }, + { + "epoch": 0.13834531719304632, + "grad_norm": 8.625, + "learning_rate": 9.966219703978979e-06, + "loss": 1.504516839981079, + "step": 760 + }, + { + "epoch": 0.13870938381723855, + "grad_norm": 30.75, + "learning_rate": 9.966020693268961e-06, + "loss": 1.0999618768692017, + "step": 762 + }, + { + "epoch": 0.13907345044143077, + "grad_norm": 6.1875, + "learning_rate": 9.965821100561479e-06, + "loss": 1.4633733034133911, + "step": 764 + }, + { + "epoch": 0.13943751706562302, + "grad_norm": 8.6875, + "learning_rate": 9.965620925885822e-06, + "loss": 1.5323455333709717, + "step": 766 + }, + { + "epoch": 0.13980158368981524, + "grad_norm": 9.375, + "learning_rate": 9.965420169271367e-06, + "loss": 1.4056990146636963, + "step": 768 + }, + { + "epoch": 0.14016565031400746, + "grad_norm": 6.28125, + "learning_rate": 9.96521883074757e-06, + "loss": 1.4805994033813477, + "step": 770 + }, + { + "epoch": 0.14052971693819968, + "grad_norm": 10.0625, + "learning_rate": 9.96501691034398e-06, + "loss": 1.500640869140625, + "step": 772 + }, + { + "epoch": 0.1408937835623919, + "grad_norm": 10.125, + "learning_rate": 9.964814408090228e-06, + "loss": 1.409245252609253, + "step": 774 + }, + { + "epoch": 0.14125785018658416, + "grad_norm": 17.25, + "learning_rate": 9.96461132401603e-06, + "loss": 1.5120081901550293, + "step": 776 + }, + { + "epoch": 0.14162191681077638, + "grad_norm": 39.5, + "learning_rate": 9.964407658151188e-06, + "loss": 1.4226701259613037, + "step": 778 + }, + { + "epoch": 0.1419859834349686, + "grad_norm": 11.1875, + "learning_rate": 9.964203410525585e-06, + "loss": 1.6625443696975708, + "step": 780 + }, + { + "epoch": 0.14235005005916082, + "grad_norm": 9.75, + "learning_rate": 9.963998581169201e-06, + "loss": 1.989743947982788, + "step": 782 + }, + { + "epoch": 0.14271411668335304, + "grad_norm": 22.5, + "learning_rate": 9.963793170112089e-06, + "loss": 1.6876050233840942, + "step": 784 + }, + { + "epoch": 0.1430781833075453, + "grad_norm": 8.1875, + "learning_rate": 9.963587177384391e-06, + "loss": 1.217936635017395, + "step": 786 + }, + { + "epoch": 0.14344224993173751, + "grad_norm": 9.375, + "learning_rate": 9.963380603016339e-06, + "loss": 1.0292983055114746, + "step": 788 + }, + { + "epoch": 0.14380631655592974, + "grad_norm": 7.3125, + "learning_rate": 9.963173447038246e-06, + "loss": 1.520676612854004, + "step": 790 + }, + { + "epoch": 0.14417038318012196, + "grad_norm": 26.0, + "learning_rate": 9.96296570948051e-06, + "loss": 1.4777098894119263, + "step": 792 + }, + { + "epoch": 0.14453444980431418, + "grad_norm": 22.5, + "learning_rate": 9.962757390373616e-06, + "loss": 1.348071813583374, + "step": 794 + }, + { + "epoch": 0.14489851642850643, + "grad_norm": 23.0, + "learning_rate": 9.962548489748138e-06, + "loss": 1.1797630786895752, + "step": 796 + }, + { + "epoch": 0.14526258305269865, + "grad_norm": 14.8125, + "learning_rate": 9.962339007634724e-06, + "loss": 2.2863168716430664, + "step": 798 + }, + { + "epoch": 0.14562664967689087, + "grad_norm": 4.25, + "learning_rate": 9.962128944064123e-06, + "loss": 1.5526676177978516, + "step": 800 + }, + { + "epoch": 0.1459907163010831, + "grad_norm": 11.1875, + "learning_rate": 9.961918299067152e-06, + "loss": 1.201155662536621, + "step": 802 + }, + { + "epoch": 0.14635478292527532, + "grad_norm": 3.015625, + "learning_rate": 9.961707072674731e-06, + "loss": 1.3782280683517456, + "step": 804 + }, + { + "epoch": 0.14671884954946754, + "grad_norm": 258.0, + "learning_rate": 9.961495264917849e-06, + "loss": 1.1102190017700195, + "step": 806 + }, + { + "epoch": 0.1470829161736598, + "grad_norm": 35.75, + "learning_rate": 9.961282875827593e-06, + "loss": 1.995614767074585, + "step": 808 + }, + { + "epoch": 0.147446982797852, + "grad_norm": 76.5, + "learning_rate": 9.961069905435127e-06, + "loss": 2.0287728309631348, + "step": 810 + }, + { + "epoch": 0.14781104942204423, + "grad_norm": 38.25, + "learning_rate": 9.960856353771709e-06, + "loss": 0.8975493311882019, + "step": 812 + }, + { + "epoch": 0.14817511604623645, + "grad_norm": 16.5, + "learning_rate": 9.96064222086867e-06, + "loss": 1.629402756690979, + "step": 814 + }, + { + "epoch": 0.14853918267042868, + "grad_norm": 8.0625, + "learning_rate": 9.960427506757438e-06, + "loss": 1.3786468505859375, + "step": 816 + }, + { + "epoch": 0.14890324929462093, + "grad_norm": 16.5, + "learning_rate": 9.960212211469518e-06, + "loss": 1.452179193496704, + "step": 818 + }, + { + "epoch": 0.14926731591881315, + "grad_norm": 12.1875, + "learning_rate": 9.959996335036507e-06, + "loss": 1.387854814529419, + "step": 820 + }, + { + "epoch": 0.14963138254300537, + "grad_norm": 9.1875, + "learning_rate": 9.959779877490079e-06, + "loss": 1.5545549392700195, + "step": 822 + }, + { + "epoch": 0.1499954491671976, + "grad_norm": 23.625, + "learning_rate": 9.959562838862003e-06, + "loss": 1.6043356657028198, + "step": 824 + }, + { + "epoch": 0.15035951579138981, + "grad_norm": 9.875, + "learning_rate": 9.959345219184128e-06, + "loss": 1.181017518043518, + "step": 826 + }, + { + "epoch": 0.15072358241558206, + "grad_norm": 48.25, + "learning_rate": 9.959127018488388e-06, + "loss": 0.8593971133232117, + "step": 828 + }, + { + "epoch": 0.15108764903977429, + "grad_norm": 13.125, + "learning_rate": 9.958908236806801e-06, + "loss": 1.6113003492355347, + "step": 830 + }, + { + "epoch": 0.1514517156639665, + "grad_norm": 3.296875, + "learning_rate": 9.958688874171475e-06, + "loss": 1.1172258853912354, + "step": 832 + }, + { + "epoch": 0.15181578228815873, + "grad_norm": 40.75, + "learning_rate": 9.958468930614601e-06, + "loss": 1.400185227394104, + "step": 834 + }, + { + "epoch": 0.15217984891235095, + "grad_norm": 4.65625, + "learning_rate": 9.958248406168456e-06, + "loss": 0.9719647169113159, + "step": 836 + }, + { + "epoch": 0.1525439155365432, + "grad_norm": 37.5, + "learning_rate": 9.958027300865395e-06, + "loss": 2.0113043785095215, + "step": 838 + }, + { + "epoch": 0.15290798216073542, + "grad_norm": 6.28125, + "learning_rate": 9.95780561473787e-06, + "loss": 1.525176763534546, + "step": 840 + }, + { + "epoch": 0.15327204878492764, + "grad_norm": 7.875, + "learning_rate": 9.95758334781841e-06, + "loss": 1.3904681205749512, + "step": 842 + }, + { + "epoch": 0.15363611540911987, + "grad_norm": 9.0, + "learning_rate": 9.957360500139633e-06, + "loss": 1.3753796815872192, + "step": 844 + }, + { + "epoch": 0.1540001820333121, + "grad_norm": 9.0, + "learning_rate": 9.957137071734239e-06, + "loss": 1.5650053024291992, + "step": 846 + }, + { + "epoch": 0.1543642486575043, + "grad_norm": 15.875, + "learning_rate": 9.956913062635017e-06, + "loss": 1.6978940963745117, + "step": 848 + }, + { + "epoch": 0.15472831528169656, + "grad_norm": 33.0, + "learning_rate": 9.956688472874838e-06, + "loss": 2.079392433166504, + "step": 850 + }, + { + "epoch": 0.15509238190588878, + "grad_norm": 8.875, + "learning_rate": 9.956463302486662e-06, + "loss": 1.4703980684280396, + "step": 852 + }, + { + "epoch": 0.155456448530081, + "grad_norm": 7.75, + "learning_rate": 9.95623755150353e-06, + "loss": 1.3990546464920044, + "step": 854 + }, + { + "epoch": 0.15582051515427323, + "grad_norm": 17.25, + "learning_rate": 9.956011219958572e-06, + "loss": 1.414876937866211, + "step": 856 + }, + { + "epoch": 0.15618458177846545, + "grad_norm": 20.125, + "learning_rate": 9.955784307884999e-06, + "loss": 1.403910517692566, + "step": 858 + }, + { + "epoch": 0.1565486484026577, + "grad_norm": 15.5625, + "learning_rate": 9.95555681531611e-06, + "loss": 1.23539400100708, + "step": 860 + }, + { + "epoch": 0.15691271502684992, + "grad_norm": 9.3125, + "learning_rate": 9.955328742285288e-06, + "loss": 1.3026257753372192, + "step": 862 + }, + { + "epoch": 0.15727678165104214, + "grad_norm": 25.375, + "learning_rate": 9.955100088826005e-06, + "loss": 0.5582795143127441, + "step": 864 + }, + { + "epoch": 0.15764084827523436, + "grad_norm": 5.875, + "learning_rate": 9.954870854971809e-06, + "loss": 1.4719241857528687, + "step": 866 + }, + { + "epoch": 0.15800491489942659, + "grad_norm": 4.75, + "learning_rate": 9.954641040756346e-06, + "loss": 1.4069263935089111, + "step": 868 + }, + { + "epoch": 0.15836898152361883, + "grad_norm": 4.5625, + "learning_rate": 9.954410646213334e-06, + "loss": 1.4353796243667603, + "step": 870 + }, + { + "epoch": 0.15873304814781106, + "grad_norm": 5.875, + "learning_rate": 9.954179671376589e-06, + "loss": 1.2221503257751465, + "step": 872 + }, + { + "epoch": 0.15909711477200328, + "grad_norm": 11.8125, + "learning_rate": 9.953948116280001e-06, + "loss": 1.591544270515442, + "step": 874 + }, + { + "epoch": 0.1594611813961955, + "grad_norm": 6.75, + "learning_rate": 9.95371598095755e-06, + "loss": 1.3720848560333252, + "step": 876 + }, + { + "epoch": 0.15982524802038772, + "grad_norm": 90.5, + "learning_rate": 9.953483265443303e-06, + "loss": 1.4762324094772339, + "step": 878 + }, + { + "epoch": 0.16018931464457994, + "grad_norm": 14.375, + "learning_rate": 9.953249969771408e-06, + "loss": 1.6112408638000488, + "step": 880 + }, + { + "epoch": 0.1605533812687722, + "grad_norm": 11.5, + "learning_rate": 9.9530160939761e-06, + "loss": 1.8050957918167114, + "step": 882 + }, + { + "epoch": 0.16091744789296442, + "grad_norm": 18.5, + "learning_rate": 9.952781638091702e-06, + "loss": 1.5882316827774048, + "step": 884 + }, + { + "epoch": 0.16128151451715664, + "grad_norm": 34.5, + "learning_rate": 9.952546602152618e-06, + "loss": 2.116976737976074, + "step": 886 + }, + { + "epoch": 0.16164558114134886, + "grad_norm": 32.5, + "learning_rate": 9.952310986193337e-06, + "loss": 1.351901650428772, + "step": 888 + }, + { + "epoch": 0.16200964776554108, + "grad_norm": 17.5, + "learning_rate": 9.952074790248436e-06, + "loss": 1.0881125926971436, + "step": 890 + }, + { + "epoch": 0.16237371438973333, + "grad_norm": 5.96875, + "learning_rate": 9.951838014352578e-06, + "loss": 1.182440996170044, + "step": 892 + }, + { + "epoch": 0.16273778101392555, + "grad_norm": 11.1875, + "learning_rate": 9.951600658540503e-06, + "loss": 0.9450311660766602, + "step": 894 + }, + { + "epoch": 0.16310184763811778, + "grad_norm": 9.4375, + "learning_rate": 9.951362722847048e-06, + "loss": 1.0391665697097778, + "step": 896 + }, + { + "epoch": 0.16346591426231, + "grad_norm": 8.375, + "learning_rate": 9.951124207307127e-06, + "loss": 1.4324138164520264, + "step": 898 + }, + { + "epoch": 0.16382998088650222, + "grad_norm": 22.75, + "learning_rate": 9.950885111955742e-06, + "loss": 1.5406783819198608, + "step": 900 + }, + { + "epoch": 0.16419404751069447, + "grad_norm": 11.5625, + "learning_rate": 9.950645436827977e-06, + "loss": 1.5500184297561646, + "step": 902 + }, + { + "epoch": 0.1645581141348867, + "grad_norm": 14.4375, + "learning_rate": 9.950405181959007e-06, + "loss": 1.4855787754058838, + "step": 904 + }, + { + "epoch": 0.1649221807590789, + "grad_norm": 38.0, + "learning_rate": 9.950164347384083e-06, + "loss": 1.4765028953552246, + "step": 906 + }, + { + "epoch": 0.16528624738327113, + "grad_norm": 9.5, + "learning_rate": 9.949922933138552e-06, + "loss": 1.1264008283615112, + "step": 908 + }, + { + "epoch": 0.16565031400746336, + "grad_norm": 8.25, + "learning_rate": 9.949680939257838e-06, + "loss": 1.071408987045288, + "step": 910 + }, + { + "epoch": 0.1660143806316556, + "grad_norm": 12.8125, + "learning_rate": 9.94943836577745e-06, + "loss": 1.0403443574905396, + "step": 912 + }, + { + "epoch": 0.16637844725584783, + "grad_norm": 5.21875, + "learning_rate": 9.94919521273299e-06, + "loss": 1.491145372390747, + "step": 914 + }, + { + "epoch": 0.16674251388004005, + "grad_norm": 18.375, + "learning_rate": 9.948951480160138e-06, + "loss": 1.5384421348571777, + "step": 916 + }, + { + "epoch": 0.16710658050423227, + "grad_norm": 19.625, + "learning_rate": 9.948707168094658e-06, + "loss": 1.332274317741394, + "step": 918 + }, + { + "epoch": 0.1674706471284245, + "grad_norm": 8.4375, + "learning_rate": 9.948462276572404e-06, + "loss": 1.6329753398895264, + "step": 920 + }, + { + "epoch": 0.16783471375261672, + "grad_norm": 18.875, + "learning_rate": 9.948216805629314e-06, + "loss": 1.5353933572769165, + "step": 922 + }, + { + "epoch": 0.16819878037680897, + "grad_norm": 10.1875, + "learning_rate": 9.947970755301408e-06, + "loss": 1.2155869007110596, + "step": 924 + }, + { + "epoch": 0.1685628470010012, + "grad_norm": 9.75, + "learning_rate": 9.947724125624793e-06, + "loss": 1.4639822244644165, + "step": 926 + }, + { + "epoch": 0.1689269136251934, + "grad_norm": 13.125, + "learning_rate": 9.94747691663566e-06, + "loss": 1.5783990621566772, + "step": 928 + }, + { + "epoch": 0.16929098024938563, + "grad_norm": 21.75, + "learning_rate": 9.947229128370289e-06, + "loss": 1.7080756425857544, + "step": 930 + }, + { + "epoch": 0.16965504687357785, + "grad_norm": 21.0, + "learning_rate": 9.94698076086504e-06, + "loss": 1.9166209697723389, + "step": 932 + }, + { + "epoch": 0.1700191134977701, + "grad_norm": 7.96875, + "learning_rate": 9.946731814156358e-06, + "loss": 1.4945251941680908, + "step": 934 + }, + { + "epoch": 0.17038318012196232, + "grad_norm": 8.5625, + "learning_rate": 9.946482288280782e-06, + "loss": 1.2176241874694824, + "step": 936 + }, + { + "epoch": 0.17074724674615455, + "grad_norm": 13.1875, + "learning_rate": 9.946232183274918e-06, + "loss": 1.4079389572143555, + "step": 938 + }, + { + "epoch": 0.17111131337034677, + "grad_norm": 16.75, + "learning_rate": 9.945981499175476e-06, + "loss": 1.3811269998550415, + "step": 940 + }, + { + "epoch": 0.171475379994539, + "grad_norm": 13.0625, + "learning_rate": 9.945730236019242e-06, + "loss": 1.600013017654419, + "step": 942 + }, + { + "epoch": 0.17183944661873124, + "grad_norm": 28.5, + "learning_rate": 9.945478393843086e-06, + "loss": 1.5965791940689087, + "step": 944 + }, + { + "epoch": 0.17220351324292346, + "grad_norm": 12.4375, + "learning_rate": 9.945225972683965e-06, + "loss": 1.4906278848648071, + "step": 946 + }, + { + "epoch": 0.17256757986711568, + "grad_norm": 6.46875, + "learning_rate": 9.944972972578921e-06, + "loss": 1.0816751718521118, + "step": 948 + }, + { + "epoch": 0.1729316464913079, + "grad_norm": 7.75, + "learning_rate": 9.94471939356508e-06, + "loss": 1.4420520067214966, + "step": 950 + }, + { + "epoch": 0.17329571311550013, + "grad_norm": 11.5, + "learning_rate": 9.944465235679657e-06, + "loss": 1.559586524963379, + "step": 952 + }, + { + "epoch": 0.17365977973969238, + "grad_norm": 7.0625, + "learning_rate": 9.944210498959943e-06, + "loss": 1.3783751726150513, + "step": 954 + }, + { + "epoch": 0.1740238463638846, + "grad_norm": 12.875, + "learning_rate": 9.943955183443325e-06, + "loss": 1.5681862831115723, + "step": 956 + }, + { + "epoch": 0.17438791298807682, + "grad_norm": 12.75, + "learning_rate": 9.943699289167265e-06, + "loss": 1.6779510974884033, + "step": 958 + }, + { + "epoch": 0.17475197961226904, + "grad_norm": 19.625, + "learning_rate": 9.94344281616932e-06, + "loss": 1.9288862943649292, + "step": 960 + }, + { + "epoch": 0.17511604623646126, + "grad_norm": 7.71875, + "learning_rate": 9.943185764487122e-06, + "loss": 1.4111822843551636, + "step": 962 + }, + { + "epoch": 0.1754801128606535, + "grad_norm": 18.5, + "learning_rate": 9.942928134158392e-06, + "loss": 1.4696686267852783, + "step": 964 + }, + { + "epoch": 0.17584417948484574, + "grad_norm": 47.25, + "learning_rate": 9.942669925220938e-06, + "loss": 1.5688788890838623, + "step": 966 + }, + { + "epoch": 0.17620824610903796, + "grad_norm": 39.5, + "learning_rate": 9.942411137712651e-06, + "loss": 2.28415584564209, + "step": 968 + }, + { + "epoch": 0.17657231273323018, + "grad_norm": 6.5, + "learning_rate": 9.942151771671506e-06, + "loss": 1.3252819776535034, + "step": 970 + }, + { + "epoch": 0.1769363793574224, + "grad_norm": 6.4375, + "learning_rate": 9.941891827135568e-06, + "loss": 1.2047700881958008, + "step": 972 + }, + { + "epoch": 0.17730044598161462, + "grad_norm": 8.125, + "learning_rate": 9.941631304142976e-06, + "loss": 1.415588617324829, + "step": 974 + }, + { + "epoch": 0.17766451260580687, + "grad_norm": 10.5625, + "learning_rate": 9.941370202731966e-06, + "loss": 1.6344401836395264, + "step": 976 + }, + { + "epoch": 0.1780285792299991, + "grad_norm": 43.75, + "learning_rate": 9.94110852294085e-06, + "loss": 1.6803901195526123, + "step": 978 + }, + { + "epoch": 0.17839264585419132, + "grad_norm": 9.9375, + "learning_rate": 9.940846264808031e-06, + "loss": 2.081049919128418, + "step": 980 + }, + { + "epoch": 0.17875671247838354, + "grad_norm": 7.125, + "learning_rate": 9.940583428371993e-06, + "loss": 1.3472354412078857, + "step": 982 + }, + { + "epoch": 0.17912077910257576, + "grad_norm": 11.5625, + "learning_rate": 9.94032001367131e-06, + "loss": 1.3411457538604736, + "step": 984 + }, + { + "epoch": 0.179484845726768, + "grad_norm": 8.0625, + "learning_rate": 9.940056020744628e-06, + "loss": 1.1788313388824463, + "step": 986 + }, + { + "epoch": 0.17984891235096023, + "grad_norm": 16.0, + "learning_rate": 9.939791449630696e-06, + "loss": 1.1841378211975098, + "step": 988 + }, + { + "epoch": 0.18021297897515245, + "grad_norm": 8.75, + "learning_rate": 9.939526300368337e-06, + "loss": 1.285358190536499, + "step": 990 + }, + { + "epoch": 0.18057704559934468, + "grad_norm": 12.0625, + "learning_rate": 9.939260572996456e-06, + "loss": 1.534583330154419, + "step": 992 + }, + { + "epoch": 0.1809411122235369, + "grad_norm": 10.5625, + "learning_rate": 9.93899426755405e-06, + "loss": 1.439420223236084, + "step": 994 + }, + { + "epoch": 0.18130517884772915, + "grad_norm": 12.0625, + "learning_rate": 9.938727384080201e-06, + "loss": 1.6394792795181274, + "step": 996 + }, + { + "epoch": 0.18166924547192137, + "grad_norm": 18.375, + "learning_rate": 9.938459922614069e-06, + "loss": 2.0668153762817383, + "step": 998 + }, + { + "epoch": 0.1820333120961136, + "grad_norm": 11.0625, + "learning_rate": 9.938191883194906e-06, + "loss": 1.2434909343719482, + "step": 1000 + }, + { + "epoch": 0.18239737872030581, + "grad_norm": 8.625, + "learning_rate": 9.937923265862041e-06, + "loss": 1.194491982460022, + "step": 1002 + }, + { + "epoch": 0.18276144534449804, + "grad_norm": 12.75, + "learning_rate": 9.937654070654898e-06, + "loss": 0.9255943298339844, + "step": 1004 + }, + { + "epoch": 0.18312551196869026, + "grad_norm": 9.25, + "learning_rate": 9.93738429761298e-06, + "loss": 1.0794122219085693, + "step": 1006 + }, + { + "epoch": 0.1834895785928825, + "grad_norm": 11.3125, + "learning_rate": 9.93711394677587e-06, + "loss": 0.2285841405391693, + "step": 1008 + }, + { + "epoch": 0.18385364521707473, + "grad_norm": 9.8125, + "learning_rate": 9.936843018183245e-06, + "loss": 0.5525633096694946, + "step": 1010 + }, + { + "epoch": 0.18421771184126695, + "grad_norm": 39.0, + "learning_rate": 9.936571511874863e-06, + "loss": 1.6073859930038452, + "step": 1012 + }, + { + "epoch": 0.18458177846545917, + "grad_norm": 38.5, + "learning_rate": 9.936299427890563e-06, + "loss": 1.7834596633911133, + "step": 1014 + }, + { + "epoch": 0.1849458450896514, + "grad_norm": 10.9375, + "learning_rate": 9.936026766270274e-06, + "loss": 1.6835075616836548, + "step": 1016 + }, + { + "epoch": 0.18530991171384364, + "grad_norm": 22.75, + "learning_rate": 9.93575352705401e-06, + "loss": 2.135343074798584, + "step": 1018 + }, + { + "epoch": 0.18567397833803587, + "grad_norm": 7.8125, + "learning_rate": 9.935479710281867e-06, + "loss": 1.7683391571044922, + "step": 1020 + }, + { + "epoch": 0.1860380449622281, + "grad_norm": 18.75, + "learning_rate": 9.935205315994025e-06, + "loss": 1.1597998142242432, + "step": 1022 + }, + { + "epoch": 0.1864021115864203, + "grad_norm": 11.5, + "learning_rate": 9.934930344230748e-06, + "loss": 1.8588007688522339, + "step": 1024 + }, + { + "epoch": 0.18676617821061253, + "grad_norm": 7.46875, + "learning_rate": 9.934654795032394e-06, + "loss": 1.4659507274627686, + "step": 1026 + }, + { + "epoch": 0.18713024483480478, + "grad_norm": 276.0, + "learning_rate": 9.934378668439394e-06, + "loss": 1.1973302364349365, + "step": 1028 + }, + { + "epoch": 0.187494311458997, + "grad_norm": 10.3125, + "learning_rate": 9.934101964492266e-06, + "loss": 1.6813024282455444, + "step": 1030 + }, + { + "epoch": 0.18785837808318923, + "grad_norm": 18.75, + "learning_rate": 9.93382468323162e-06, + "loss": 1.5307891368865967, + "step": 1032 + }, + { + "epoch": 0.18822244470738145, + "grad_norm": 10.3125, + "learning_rate": 9.933546824698145e-06, + "loss": 1.5396398305892944, + "step": 1034 + }, + { + "epoch": 0.18858651133157367, + "grad_norm": 13.1875, + "learning_rate": 9.933268388932612e-06, + "loss": 1.411933422088623, + "step": 1036 + }, + { + "epoch": 0.1889505779557659, + "grad_norm": 10.25, + "learning_rate": 9.932989375975888e-06, + "loss": 1.581071138381958, + "step": 1038 + }, + { + "epoch": 0.18931464457995814, + "grad_norm": 31.0, + "learning_rate": 9.932709785868908e-06, + "loss": 1.5011827945709229, + "step": 1040 + }, + { + "epoch": 0.18967871120415036, + "grad_norm": 30.75, + "learning_rate": 9.932429618652706e-06, + "loss": 1.411431074142456, + "step": 1042 + }, + { + "epoch": 0.19004277782834259, + "grad_norm": 7.96875, + "learning_rate": 9.932148874368395e-06, + "loss": 1.2337169647216797, + "step": 1044 + }, + { + "epoch": 0.1904068444525348, + "grad_norm": 10.9375, + "learning_rate": 9.931867553057171e-06, + "loss": 1.2204968929290771, + "step": 1046 + }, + { + "epoch": 0.19077091107672703, + "grad_norm": 15.5, + "learning_rate": 9.931585654760318e-06, + "loss": 1.4795103073120117, + "step": 1048 + }, + { + "epoch": 0.19113497770091928, + "grad_norm": 21.375, + "learning_rate": 9.931303179519205e-06, + "loss": 1.5027167797088623, + "step": 1050 + }, + { + "epoch": 0.1914990443251115, + "grad_norm": 12.5625, + "learning_rate": 9.931020127375281e-06, + "loss": 1.405234456062317, + "step": 1052 + }, + { + "epoch": 0.19186311094930372, + "grad_norm": 36.75, + "learning_rate": 9.930736498370086e-06, + "loss": 1.4587359428405762, + "step": 1054 + }, + { + "epoch": 0.19222717757349594, + "grad_norm": 11.0625, + "learning_rate": 9.930452292545239e-06, + "loss": 1.5163779258728027, + "step": 1056 + }, + { + "epoch": 0.19259124419768817, + "grad_norm": 13.875, + "learning_rate": 9.930167509942445e-06, + "loss": 1.6596927642822266, + "step": 1058 + }, + { + "epoch": 0.19295531082188042, + "grad_norm": 18.375, + "learning_rate": 9.929882150603499e-06, + "loss": 1.8548587560653687, + "step": 1060 + }, + { + "epoch": 0.19331937744607264, + "grad_norm": 8.0625, + "learning_rate": 9.929596214570272e-06, + "loss": 1.2847120761871338, + "step": 1062 + }, + { + "epoch": 0.19368344407026486, + "grad_norm": 2.71875, + "learning_rate": 9.929309701884725e-06, + "loss": 0.8553503751754761, + "step": 1064 + }, + { + "epoch": 0.19404751069445708, + "grad_norm": 5.0625, + "learning_rate": 9.929022612588908e-06, + "loss": 1.2123963832855225, + "step": 1066 + }, + { + "epoch": 0.1944115773186493, + "grad_norm": 19.125, + "learning_rate": 9.92873494672494e-06, + "loss": 1.4959981441497803, + "step": 1068 + }, + { + "epoch": 0.19477564394284155, + "grad_norm": 8.25, + "learning_rate": 9.928446704335044e-06, + "loss": 1.4451946020126343, + "step": 1070 + }, + { + "epoch": 0.19513971056703378, + "grad_norm": 6.96875, + "learning_rate": 9.928157885461514e-06, + "loss": 1.088136911392212, + "step": 1072 + }, + { + "epoch": 0.195503777191226, + "grad_norm": 10.125, + "learning_rate": 9.927868490146732e-06, + "loss": 1.2119640111923218, + "step": 1074 + }, + { + "epoch": 0.19586784381541822, + "grad_norm": 5.4375, + "learning_rate": 9.927578518433167e-06, + "loss": 1.1197935342788696, + "step": 1076 + }, + { + "epoch": 0.19623191043961044, + "grad_norm": 43.0, + "learning_rate": 9.927287970363375e-06, + "loss": 1.6458243131637573, + "step": 1078 + }, + { + "epoch": 0.19659597706380266, + "grad_norm": 11.25, + "learning_rate": 9.926996845979986e-06, + "loss": 1.4721394777297974, + "step": 1080 + }, + { + "epoch": 0.1969600436879949, + "grad_norm": 15.0, + "learning_rate": 9.926705145325729e-06, + "loss": 1.404916763305664, + "step": 1082 + }, + { + "epoch": 0.19732411031218713, + "grad_norm": 6.90625, + "learning_rate": 9.9264128684434e-06, + "loss": 1.3604183197021484, + "step": 1084 + }, + { + "epoch": 0.19768817693637936, + "grad_norm": 7.5625, + "learning_rate": 9.926120015375897e-06, + "loss": 1.0928027629852295, + "step": 1086 + }, + { + "epoch": 0.19805224356057158, + "grad_norm": 4.53125, + "learning_rate": 9.925826586166193e-06, + "loss": 1.4573407173156738, + "step": 1088 + }, + { + "epoch": 0.1984163101847638, + "grad_norm": 4.34375, + "learning_rate": 9.925532580857349e-06, + "loss": 0.8799431324005127, + "step": 1090 + }, + { + "epoch": 0.19878037680895605, + "grad_norm": 16.625, + "learning_rate": 9.925237999492505e-06, + "loss": 1.3133742809295654, + "step": 1092 + }, + { + "epoch": 0.19914444343314827, + "grad_norm": 11.9375, + "learning_rate": 9.924942842114895e-06, + "loss": 1.619940161705017, + "step": 1094 + }, + { + "epoch": 0.1995085100573405, + "grad_norm": 15.4375, + "learning_rate": 9.924647108767826e-06, + "loss": 1.6402709484100342, + "step": 1096 + }, + { + "epoch": 0.19987257668153272, + "grad_norm": 18.75, + "learning_rate": 9.924350799494701e-06, + "loss": 1.0292584896087646, + "step": 1098 + }, + { + "epoch": 0.20023664330572494, + "grad_norm": 7.375, + "learning_rate": 9.924053914339e-06, + "loss": 1.3446272611618042, + "step": 1100 + }, + { + "epoch": 0.2006007099299172, + "grad_norm": 46.0, + "learning_rate": 9.923756453344291e-06, + "loss": 1.4331274032592773, + "step": 1102 + }, + { + "epoch": 0.2009647765541094, + "grad_norm": 9.0625, + "learning_rate": 9.923458416554221e-06, + "loss": 1.8346397876739502, + "step": 1104 + }, + { + "epoch": 0.20132884317830163, + "grad_norm": 29.875, + "learning_rate": 9.923159804012531e-06, + "loss": 1.6610716581344604, + "step": 1106 + }, + { + "epoch": 0.20169290980249385, + "grad_norm": 10.5625, + "learning_rate": 9.922860615763039e-06, + "loss": 1.5934720039367676, + "step": 1108 + }, + { + "epoch": 0.20205697642668607, + "grad_norm": 14.0, + "learning_rate": 9.92256085184965e-06, + "loss": 1.4009087085723877, + "step": 1110 + }, + { + "epoch": 0.20242104305087832, + "grad_norm": 10.0, + "learning_rate": 9.922260512316352e-06, + "loss": 1.3968048095703125, + "step": 1112 + }, + { + "epoch": 0.20278510967507055, + "grad_norm": 3.703125, + "learning_rate": 9.92195959720722e-06, + "loss": 1.24936842918396, + "step": 1114 + }, + { + "epoch": 0.20314917629926277, + "grad_norm": 10.25, + "learning_rate": 9.92165810656641e-06, + "loss": 1.251719355583191, + "step": 1116 + }, + { + "epoch": 0.203513242923455, + "grad_norm": 14.4375, + "learning_rate": 9.921356040438165e-06, + "loss": 1.711976408958435, + "step": 1118 + }, + { + "epoch": 0.2038773095476472, + "grad_norm": 14.5, + "learning_rate": 9.921053398866816e-06, + "loss": 1.462119698524475, + "step": 1120 + }, + { + "epoch": 0.20424137617183943, + "grad_norm": 24.0, + "learning_rate": 9.92075018189677e-06, + "loss": 1.8700501918792725, + "step": 1122 + }, + { + "epoch": 0.20460544279603168, + "grad_norm": 15.875, + "learning_rate": 9.920446389572523e-06, + "loss": 1.4322757720947266, + "step": 1124 + }, + { + "epoch": 0.2049695094202239, + "grad_norm": 22.25, + "learning_rate": 9.92014202193866e-06, + "loss": 1.3662235736846924, + "step": 1126 + }, + { + "epoch": 0.20533357604441613, + "grad_norm": 12.75, + "learning_rate": 9.919837079039838e-06, + "loss": 0.9338915944099426, + "step": 1128 + }, + { + "epoch": 0.20569764266860835, + "grad_norm": 35.5, + "learning_rate": 9.919531560920812e-06, + "loss": 1.1594756841659546, + "step": 1130 + }, + { + "epoch": 0.20606170929280057, + "grad_norm": 6.1875, + "learning_rate": 9.919225467626414e-06, + "loss": 0.9257606267929077, + "step": 1132 + }, + { + "epoch": 0.20642577591699282, + "grad_norm": 20.125, + "learning_rate": 9.918918799201563e-06, + "loss": 1.1696546077728271, + "step": 1134 + }, + { + "epoch": 0.20678984254118504, + "grad_norm": 8.875, + "learning_rate": 9.918611555691258e-06, + "loss": 1.4146943092346191, + "step": 1136 + }, + { + "epoch": 0.20715390916537726, + "grad_norm": 18.25, + "learning_rate": 9.91830373714059e-06, + "loss": 1.5787546634674072, + "step": 1138 + }, + { + "epoch": 0.2075179757895695, + "grad_norm": 7.71875, + "learning_rate": 9.917995343594726e-06, + "loss": 1.4461215734481812, + "step": 1140 + }, + { + "epoch": 0.2078820424137617, + "grad_norm": 13.8125, + "learning_rate": 9.917686375098925e-06, + "loss": 1.4070762395858765, + "step": 1142 + }, + { + "epoch": 0.20824610903795396, + "grad_norm": 28.25, + "learning_rate": 9.917376831698526e-06, + "loss": 1.3918594121932983, + "step": 1144 + }, + { + "epoch": 0.20861017566214618, + "grad_norm": 14.125, + "learning_rate": 9.917066713438952e-06, + "loss": 1.3972517251968384, + "step": 1146 + }, + { + "epoch": 0.2089742422863384, + "grad_norm": 9.3125, + "learning_rate": 9.916756020365712e-06, + "loss": 1.2324825525283813, + "step": 1148 + }, + { + "epoch": 0.20933830891053062, + "grad_norm": 8.6875, + "learning_rate": 9.916444752524398e-06, + "loss": 0.8400664925575256, + "step": 1150 + }, + { + "epoch": 0.20970237553472285, + "grad_norm": 7.75, + "learning_rate": 9.91613290996069e-06, + "loss": 0.996133029460907, + "step": 1152 + }, + { + "epoch": 0.21006644215891507, + "grad_norm": 6.625, + "learning_rate": 9.91582049272035e-06, + "loss": 0.4878230690956116, + "step": 1154 + }, + { + "epoch": 0.21043050878310732, + "grad_norm": 11.6875, + "learning_rate": 9.915507500849219e-06, + "loss": 1.4658123254776, + "step": 1156 + }, + { + "epoch": 0.21079457540729954, + "grad_norm": 4.09375, + "learning_rate": 9.915193934393232e-06, + "loss": 0.9831236600875854, + "step": 1158 + }, + { + "epoch": 0.21115864203149176, + "grad_norm": 13.0, + "learning_rate": 9.914879793398402e-06, + "loss": 1.397628903388977, + "step": 1160 + }, + { + "epoch": 0.21152270865568398, + "grad_norm": 22.0, + "learning_rate": 9.914565077910827e-06, + "loss": 1.3963582515716553, + "step": 1162 + }, + { + "epoch": 0.2118867752798762, + "grad_norm": 61.5, + "learning_rate": 9.914249787976694e-06, + "loss": 1.5733776092529297, + "step": 1164 + }, + { + "epoch": 0.21225084190406845, + "grad_norm": 39.75, + "learning_rate": 9.913933923642263e-06, + "loss": 0.5213334560394287, + "step": 1166 + }, + { + "epoch": 0.21261490852826068, + "grad_norm": 4.09375, + "learning_rate": 9.913617484953895e-06, + "loss": 1.096614122390747, + "step": 1168 + }, + { + "epoch": 0.2129789751524529, + "grad_norm": 12.25, + "learning_rate": 9.913300471958019e-06, + "loss": 1.6260157823562622, + "step": 1170 + }, + { + "epoch": 0.21334304177664512, + "grad_norm": 11.9375, + "learning_rate": 9.912982884701157e-06, + "loss": 1.5148996114730835, + "step": 1172 + }, + { + "epoch": 0.21370710840083734, + "grad_norm": 10.25, + "learning_rate": 9.912664723229916e-06, + "loss": 1.464977741241455, + "step": 1174 + }, + { + "epoch": 0.2140711750250296, + "grad_norm": 7.375, + "learning_rate": 9.912345987590985e-06, + "loss": 1.406298041343689, + "step": 1176 + }, + { + "epoch": 0.2144352416492218, + "grad_norm": 3.0, + "learning_rate": 9.912026677831135e-06, + "loss": 1.0304433107376099, + "step": 1178 + }, + { + "epoch": 0.21479930827341404, + "grad_norm": 5.3125, + "learning_rate": 9.911706793997223e-06, + "loss": 1.3358741998672485, + "step": 1180 + }, + { + "epoch": 0.21516337489760626, + "grad_norm": 64.5, + "learning_rate": 9.91138633613619e-06, + "loss": 1.5699183940887451, + "step": 1182 + }, + { + "epoch": 0.21552744152179848, + "grad_norm": 28.0, + "learning_rate": 9.911065304295067e-06, + "loss": 1.9996180534362793, + "step": 1184 + }, + { + "epoch": 0.21589150814599073, + "grad_norm": 17.0, + "learning_rate": 9.910743698520959e-06, + "loss": 1.4907023906707764, + "step": 1186 + }, + { + "epoch": 0.21625557477018295, + "grad_norm": 17.625, + "learning_rate": 9.910421518861062e-06, + "loss": 1.4150099754333496, + "step": 1188 + }, + { + "epoch": 0.21661964139437517, + "grad_norm": 9.3125, + "learning_rate": 9.910098765362657e-06, + "loss": 1.4136031866073608, + "step": 1190 + }, + { + "epoch": 0.2169837080185674, + "grad_norm": 10.8125, + "learning_rate": 9.909775438073102e-06, + "loss": 1.5777891874313354, + "step": 1192 + }, + { + "epoch": 0.21734777464275962, + "grad_norm": 58.0, + "learning_rate": 9.909451537039847e-06, + "loss": 1.3451259136199951, + "step": 1194 + }, + { + "epoch": 0.21771184126695184, + "grad_norm": 8.0625, + "learning_rate": 9.909127062310422e-06, + "loss": 1.3976876735687256, + "step": 1196 + }, + { + "epoch": 0.2180759078911441, + "grad_norm": 12.5, + "learning_rate": 9.908802013932443e-06, + "loss": 1.479409098625183, + "step": 1198 + }, + { + "epoch": 0.2184399745153363, + "grad_norm": 12.3125, + "learning_rate": 9.90847639195361e-06, + "loss": 1.1923532485961914, + "step": 1200 + }, + { + "epoch": 0.21880404113952853, + "grad_norm": 28.625, + "learning_rate": 9.908150196421707e-06, + "loss": 0.7754479050636292, + "step": 1202 + }, + { + "epoch": 0.21916810776372075, + "grad_norm": 11.5, + "learning_rate": 9.9078234273846e-06, + "loss": 0.6064488291740417, + "step": 1204 + }, + { + "epoch": 0.21953217438791298, + "grad_norm": 9.9375, + "learning_rate": 9.907496084890242e-06, + "loss": 1.1636220216751099, + "step": 1206 + }, + { + "epoch": 0.21989624101210523, + "grad_norm": 15.0625, + "learning_rate": 9.90716816898667e-06, + "loss": 1.0404331684112549, + "step": 1208 + }, + { + "epoch": 0.22026030763629745, + "grad_norm": 37.75, + "learning_rate": 9.906839679722002e-06, + "loss": 1.5967075824737549, + "step": 1210 + }, + { + "epoch": 0.22062437426048967, + "grad_norm": 14.8125, + "learning_rate": 9.906510617144448e-06, + "loss": 1.4154984951019287, + "step": 1212 + }, + { + "epoch": 0.2209884408846819, + "grad_norm": 10.5, + "learning_rate": 9.906180981302286e-06, + "loss": 1.3427590131759644, + "step": 1214 + }, + { + "epoch": 0.2213525075088741, + "grad_norm": 6.03125, + "learning_rate": 9.905850772243901e-06, + "loss": 1.381961464881897, + "step": 1216 + }, + { + "epoch": 0.22171657413306636, + "grad_norm": 29.0, + "learning_rate": 9.905519990017742e-06, + "loss": 1.1825193166732788, + "step": 1218 + }, + { + "epoch": 0.22208064075725858, + "grad_norm": 7.40625, + "learning_rate": 9.905188634672352e-06, + "loss": 1.1286404132843018, + "step": 1220 + }, + { + "epoch": 0.2224447073814508, + "grad_norm": 9.875, + "learning_rate": 9.904856706256356e-06, + "loss": 1.1375209093093872, + "step": 1222 + }, + { + "epoch": 0.22280877400564303, + "grad_norm": 68.0, + "learning_rate": 9.904524204818464e-06, + "loss": 0.8734057545661926, + "step": 1224 + }, + { + "epoch": 0.22317284062983525, + "grad_norm": 33.0, + "learning_rate": 9.90419113040747e-06, + "loss": 1.3355128765106201, + "step": 1226 + }, + { + "epoch": 0.2235369072540275, + "grad_norm": 9.6875, + "learning_rate": 9.903857483072247e-06, + "loss": 1.3340837955474854, + "step": 1228 + }, + { + "epoch": 0.22390097387821972, + "grad_norm": 6.75, + "learning_rate": 9.903523262861763e-06, + "loss": 1.293943166732788, + "step": 1230 + }, + { + "epoch": 0.22426504050241194, + "grad_norm": 16.25, + "learning_rate": 9.903188469825057e-06, + "loss": 1.562679409980774, + "step": 1232 + }, + { + "epoch": 0.22462910712660417, + "grad_norm": 18.75, + "learning_rate": 9.902853104011261e-06, + "loss": 1.3159688711166382, + "step": 1234 + }, + { + "epoch": 0.2249931737507964, + "grad_norm": 14.0625, + "learning_rate": 9.902517165469589e-06, + "loss": 1.25418221950531, + "step": 1236 + }, + { + "epoch": 0.2253572403749886, + "grad_norm": 15.5, + "learning_rate": 9.90218065424934e-06, + "loss": 0.9642902612686157, + "step": 1238 + }, + { + "epoch": 0.22572130699918086, + "grad_norm": 11.6875, + "learning_rate": 9.901843570399895e-06, + "loss": 1.0084102153778076, + "step": 1240 + }, + { + "epoch": 0.22608537362337308, + "grad_norm": 15.625, + "learning_rate": 9.901505913970716e-06, + "loss": 1.6591867208480835, + "step": 1242 + }, + { + "epoch": 0.2264494402475653, + "grad_norm": 7.71875, + "learning_rate": 9.901167685011358e-06, + "loss": 1.203679084777832, + "step": 1244 + }, + { + "epoch": 0.22681350687175753, + "grad_norm": 10.5, + "learning_rate": 9.90082888357145e-06, + "loss": 1.0881551504135132, + "step": 1246 + }, + { + "epoch": 0.22717757349594975, + "grad_norm": 10.8125, + "learning_rate": 9.900489509700713e-06, + "loss": 1.368391513824463, + "step": 1248 + }, + { + "epoch": 0.227541640120142, + "grad_norm": 5.625, + "learning_rate": 9.900149563448947e-06, + "loss": 1.1805520057678223, + "step": 1250 + }, + { + "epoch": 0.22790570674433422, + "grad_norm": 78.0, + "learning_rate": 9.89980904486604e-06, + "loss": 0.828658938407898, + "step": 1252 + }, + { + "epoch": 0.22826977336852644, + "grad_norm": 7.8125, + "learning_rate": 9.89946795400196e-06, + "loss": 0.939128041267395, + "step": 1254 + }, + { + "epoch": 0.22863383999271866, + "grad_norm": 11.0625, + "learning_rate": 9.89912629090676e-06, + "loss": 1.503872275352478, + "step": 1256 + }, + { + "epoch": 0.22899790661691088, + "grad_norm": 17.0, + "learning_rate": 9.898784055630576e-06, + "loss": 1.7752597332000732, + "step": 1258 + }, + { + "epoch": 0.22936197324110313, + "grad_norm": 6.59375, + "learning_rate": 9.898441248223638e-06, + "loss": 1.3853998184204102, + "step": 1260 + }, + { + "epoch": 0.22972603986529536, + "grad_norm": 10.125, + "learning_rate": 9.898097868736243e-06, + "loss": 1.2807327508926392, + "step": 1262 + }, + { + "epoch": 0.23009010648948758, + "grad_norm": 14.375, + "learning_rate": 9.897753917218782e-06, + "loss": 1.4517203569412231, + "step": 1264 + }, + { + "epoch": 0.2304541731136798, + "grad_norm": 10.75, + "learning_rate": 9.897409393721731e-06, + "loss": 1.5580300092697144, + "step": 1266 + }, + { + "epoch": 0.23081823973787202, + "grad_norm": 6.3125, + "learning_rate": 9.897064298295646e-06, + "loss": 1.2930718660354614, + "step": 1268 + }, + { + "epoch": 0.23118230636206427, + "grad_norm": 13.125, + "learning_rate": 9.89671863099117e-06, + "loss": 1.1210490465164185, + "step": 1270 + }, + { + "epoch": 0.2315463729862565, + "grad_norm": 10.1875, + "learning_rate": 9.896372391859026e-06, + "loss": 1.4651648998260498, + "step": 1272 + }, + { + "epoch": 0.23191043961044872, + "grad_norm": 7.59375, + "learning_rate": 9.896025580950027e-06, + "loss": 0.7894704341888428, + "step": 1274 + }, + { + "epoch": 0.23227450623464094, + "grad_norm": 8.4375, + "learning_rate": 9.895678198315057e-06, + "loss": 1.3474678993225098, + "step": 1276 + }, + { + "epoch": 0.23263857285883316, + "grad_norm": 7.75, + "learning_rate": 9.895330244005105e-06, + "loss": 0.9716839790344238, + "step": 1278 + }, + { + "epoch": 0.23300263948302538, + "grad_norm": 9.4375, + "learning_rate": 9.894981718071225e-06, + "loss": 1.635238766670227, + "step": 1280 + }, + { + "epoch": 0.23336670610721763, + "grad_norm": 7.96875, + "learning_rate": 9.89463262056456e-06, + "loss": 1.4350322484970093, + "step": 1282 + }, + { + "epoch": 0.23373077273140985, + "grad_norm": 7.15625, + "learning_rate": 9.894282951536342e-06, + "loss": 1.4168959856033325, + "step": 1284 + }, + { + "epoch": 0.23409483935560207, + "grad_norm": 26.375, + "learning_rate": 9.893932711037885e-06, + "loss": 1.4836479425430298, + "step": 1286 + }, + { + "epoch": 0.2344589059797943, + "grad_norm": 105.5, + "learning_rate": 9.893581899120582e-06, + "loss": 0.9905297756195068, + "step": 1288 + }, + { + "epoch": 0.23482297260398652, + "grad_norm": 5.21875, + "learning_rate": 9.893230515835915e-06, + "loss": 0.49923619627952576, + "step": 1290 + }, + { + "epoch": 0.23518703922817877, + "grad_norm": 6.625, + "learning_rate": 9.892878561235448e-06, + "loss": 1.3243517875671387, + "step": 1292 + }, + { + "epoch": 0.235551105852371, + "grad_norm": 9.1875, + "learning_rate": 9.892526035370829e-06, + "loss": 1.0172383785247803, + "step": 1294 + }, + { + "epoch": 0.2359151724765632, + "grad_norm": 26.875, + "learning_rate": 9.89217293829379e-06, + "loss": 1.0750057697296143, + "step": 1296 + }, + { + "epoch": 0.23627923910075543, + "grad_norm": 55.75, + "learning_rate": 9.891819270056143e-06, + "loss": 0.7329437732696533, + "step": 1298 + }, + { + "epoch": 0.23664330572494766, + "grad_norm": 22.625, + "learning_rate": 9.891465030709792e-06, + "loss": 1.4872076511383057, + "step": 1300 + }, + { + "epoch": 0.2370073723491399, + "grad_norm": 9.3125, + "learning_rate": 9.891110220306717e-06, + "loss": 1.3877533674240112, + "step": 1302 + }, + { + "epoch": 0.23737143897333213, + "grad_norm": 14.5625, + "learning_rate": 9.890754838898988e-06, + "loss": 1.449622631072998, + "step": 1304 + }, + { + "epoch": 0.23773550559752435, + "grad_norm": 14.0, + "learning_rate": 9.890398886538754e-06, + "loss": 1.658739686012268, + "step": 1306 + }, + { + "epoch": 0.23809957222171657, + "grad_norm": 11.375, + "learning_rate": 9.890042363278252e-06, + "loss": 1.7893016338348389, + "step": 1308 + }, + { + "epoch": 0.2384636388459088, + "grad_norm": 2.15625, + "learning_rate": 9.889685269169795e-06, + "loss": 0.8570665121078491, + "step": 1310 + }, + { + "epoch": 0.23882770547010101, + "grad_norm": 14.25, + "learning_rate": 9.889327604265789e-06, + "loss": 1.2315696477890015, + "step": 1312 + }, + { + "epoch": 0.23919177209429326, + "grad_norm": 40.25, + "learning_rate": 9.88896936861872e-06, + "loss": 1.3997361660003662, + "step": 1314 + }, + { + "epoch": 0.2395558387184855, + "grad_norm": 7.6875, + "learning_rate": 9.888610562281156e-06, + "loss": 1.5080149173736572, + "step": 1316 + }, + { + "epoch": 0.2399199053426777, + "grad_norm": 8.0625, + "learning_rate": 9.888251185305751e-06, + "loss": 1.3581037521362305, + "step": 1318 + }, + { + "epoch": 0.24028397196686993, + "grad_norm": 11.3125, + "learning_rate": 9.887891237745243e-06, + "loss": 0.9687387347221375, + "step": 1320 + }, + { + "epoch": 0.24064803859106215, + "grad_norm": 20.875, + "learning_rate": 9.887530719652452e-06, + "loss": 1.9171817302703857, + "step": 1322 + }, + { + "epoch": 0.2410121052152544, + "grad_norm": 17.375, + "learning_rate": 9.887169631080282e-06, + "loss": 1.739109754562378, + "step": 1324 + }, + { + "epoch": 0.24137617183944662, + "grad_norm": 15.9375, + "learning_rate": 9.886807972081723e-06, + "loss": 1.485357642173767, + "step": 1326 + }, + { + "epoch": 0.24174023846363885, + "grad_norm": 7.3125, + "learning_rate": 9.886445742709844e-06, + "loss": 1.400372862815857, + "step": 1328 + }, + { + "epoch": 0.24210430508783107, + "grad_norm": 23.0, + "learning_rate": 9.886082943017804e-06, + "loss": 1.637008547782898, + "step": 1330 + }, + { + "epoch": 0.2424683717120233, + "grad_norm": 6.9375, + "learning_rate": 9.88571957305884e-06, + "loss": 1.6923367977142334, + "step": 1332 + }, + { + "epoch": 0.24283243833621554, + "grad_norm": 50.5, + "learning_rate": 9.885355632886278e-06, + "loss": 1.4662747383117676, + "step": 1334 + }, + { + "epoch": 0.24319650496040776, + "grad_norm": 11.875, + "learning_rate": 9.884991122553522e-06, + "loss": 1.4708256721496582, + "step": 1336 + }, + { + "epoch": 0.24356057158459998, + "grad_norm": 15.5625, + "learning_rate": 9.884626042114063e-06, + "loss": 1.5708904266357422, + "step": 1338 + }, + { + "epoch": 0.2439246382087922, + "grad_norm": 74.0, + "learning_rate": 9.884260391621477e-06, + "loss": 1.4456716775894165, + "step": 1340 + }, + { + "epoch": 0.24428870483298443, + "grad_norm": 15.0, + "learning_rate": 9.883894171129418e-06, + "loss": 1.475473403930664, + "step": 1342 + }, + { + "epoch": 0.24465277145717668, + "grad_norm": 6.9375, + "learning_rate": 9.883527380691628e-06, + "loss": 1.3831150531768799, + "step": 1344 + }, + { + "epoch": 0.2450168380813689, + "grad_norm": 23.125, + "learning_rate": 9.883160020361938e-06, + "loss": 1.4802080392837524, + "step": 1346 + }, + { + "epoch": 0.24538090470556112, + "grad_norm": 40.75, + "learning_rate": 9.882792090194248e-06, + "loss": 1.7584106922149658, + "step": 1348 + }, + { + "epoch": 0.24574497132975334, + "grad_norm": 12.0, + "learning_rate": 9.882423590242556e-06, + "loss": 1.4928392171859741, + "step": 1350 + }, + { + "epoch": 0.24610903795394556, + "grad_norm": 13.375, + "learning_rate": 9.882054520560936e-06, + "loss": 1.6194241046905518, + "step": 1352 + }, + { + "epoch": 0.24647310457813779, + "grad_norm": 10.75, + "learning_rate": 9.881684881203547e-06, + "loss": 1.4315046072006226, + "step": 1354 + }, + { + "epoch": 0.24683717120233004, + "grad_norm": 8.125, + "learning_rate": 9.881314672224634e-06, + "loss": 1.2643805742263794, + "step": 1356 + }, + { + "epoch": 0.24720123782652226, + "grad_norm": 21.75, + "learning_rate": 9.880943893678523e-06, + "loss": 1.4243457317352295, + "step": 1358 + }, + { + "epoch": 0.24756530445071448, + "grad_norm": 6.96875, + "learning_rate": 9.880572545619622e-06, + "loss": 1.4940712451934814, + "step": 1360 + }, + { + "epoch": 0.2479293710749067, + "grad_norm": 3.65625, + "learning_rate": 9.880200628102427e-06, + "loss": 0.874103844165802, + "step": 1362 + }, + { + "epoch": 0.24829343769909892, + "grad_norm": 23.5, + "learning_rate": 9.879828141181515e-06, + "loss": 1.1623986959457397, + "step": 1364 + }, + { + "epoch": 0.24865750432329117, + "grad_norm": 12.5, + "learning_rate": 9.879455084911547e-06, + "loss": 1.583655834197998, + "step": 1366 + }, + { + "epoch": 0.2490215709474834, + "grad_norm": 9.3125, + "learning_rate": 9.879081459347268e-06, + "loss": 1.4835476875305176, + "step": 1368 + }, + { + "epoch": 0.24938563757167562, + "grad_norm": 18.75, + "learning_rate": 9.878707264543504e-06, + "loss": 1.3770345449447632, + "step": 1370 + }, + { + "epoch": 0.24974970419586784, + "grad_norm": 17.875, + "learning_rate": 9.878332500555168e-06, + "loss": 1.46439528465271, + "step": 1372 + }, + { + "epoch": 0.2501137708200601, + "grad_norm": 31.375, + "learning_rate": 9.877957167437258e-06, + "loss": 1.293486475944519, + "step": 1374 + }, + { + "epoch": 0.2504778374442523, + "grad_norm": 9.3125, + "learning_rate": 9.877581265244847e-06, + "loss": 1.4575697183609009, + "step": 1376 + }, + { + "epoch": 0.25084190406844453, + "grad_norm": 12.1875, + "learning_rate": 9.8772047940331e-06, + "loss": 1.5639668703079224, + "step": 1378 + }, + { + "epoch": 0.2512059706926367, + "grad_norm": 8.75, + "learning_rate": 9.87682775385726e-06, + "loss": 1.3468657732009888, + "step": 1380 + }, + { + "epoch": 0.251570037316829, + "grad_norm": 11.3125, + "learning_rate": 9.876450144772663e-06, + "loss": 1.6191599369049072, + "step": 1382 + }, + { + "epoch": 0.2519341039410212, + "grad_norm": 7.9375, + "learning_rate": 9.876071966834715e-06, + "loss": 1.406794548034668, + "step": 1384 + }, + { + "epoch": 0.2522981705652134, + "grad_norm": 237.0, + "learning_rate": 9.875693220098915e-06, + "loss": 1.6290373802185059, + "step": 1386 + }, + { + "epoch": 0.25266223718940567, + "grad_norm": 9.6875, + "learning_rate": 9.87531390462084e-06, + "loss": 1.4231597185134888, + "step": 1388 + }, + { + "epoch": 0.25302630381359786, + "grad_norm": 9.375, + "learning_rate": 9.874934020456158e-06, + "loss": 1.5313165187835693, + "step": 1390 + }, + { + "epoch": 0.2533903704377901, + "grad_norm": 10.75, + "learning_rate": 9.874553567660607e-06, + "loss": 1.6047847270965576, + "step": 1392 + }, + { + "epoch": 0.25375443706198236, + "grad_norm": 16.375, + "learning_rate": 9.874172546290028e-06, + "loss": 1.9918204545974731, + "step": 1394 + }, + { + "epoch": 0.25411850368617456, + "grad_norm": 11.0, + "learning_rate": 9.873790956400325e-06, + "loss": 1.765520691871643, + "step": 1396 + }, + { + "epoch": 0.2544825703103668, + "grad_norm": 9.25, + "learning_rate": 9.873408798047498e-06, + "loss": 1.5310132503509521, + "step": 1398 + }, + { + "epoch": 0.254846636934559, + "grad_norm": 7.5625, + "learning_rate": 9.873026071287627e-06, + "loss": 1.395627737045288, + "step": 1400 + }, + { + "epoch": 0.25521070355875125, + "grad_norm": 31.625, + "learning_rate": 9.87264277617688e-06, + "loss": 1.5362725257873535, + "step": 1402 + }, + { + "epoch": 0.2555747701829435, + "grad_norm": 33.5, + "learning_rate": 9.872258912771497e-06, + "loss": 1.6543318033218384, + "step": 1404 + }, + { + "epoch": 0.2559388368071357, + "grad_norm": 8.3125, + "learning_rate": 9.871874481127813e-06, + "loss": 1.240380883216858, + "step": 1406 + }, + { + "epoch": 0.25630290343132794, + "grad_norm": 28.625, + "learning_rate": 9.871489481302239e-06, + "loss": 1.4505254030227661, + "step": 1408 + }, + { + "epoch": 0.25666697005552014, + "grad_norm": 27.5, + "learning_rate": 9.871103913351273e-06, + "loss": 1.2581229209899902, + "step": 1410 + }, + { + "epoch": 0.2570310366797124, + "grad_norm": 9.875, + "learning_rate": 9.870717777331497e-06, + "loss": 1.4536426067352295, + "step": 1412 + }, + { + "epoch": 0.25739510330390464, + "grad_norm": 17.75, + "learning_rate": 9.870331073299576e-06, + "loss": 1.31911039352417, + "step": 1414 + }, + { + "epoch": 0.25775916992809683, + "grad_norm": 5.4375, + "learning_rate": 9.86994380131225e-06, + "loss": 1.2973837852478027, + "step": 1416 + }, + { + "epoch": 0.2581232365522891, + "grad_norm": 12.0, + "learning_rate": 9.869555961426356e-06, + "loss": 1.1395800113677979, + "step": 1418 + }, + { + "epoch": 0.2584873031764813, + "grad_norm": 6.96875, + "learning_rate": 9.869167553698809e-06, + "loss": 1.3528611660003662, + "step": 1420 + }, + { + "epoch": 0.2588513698006735, + "grad_norm": 14.9375, + "learning_rate": 9.868778578186602e-06, + "loss": 1.6839865446090698, + "step": 1422 + }, + { + "epoch": 0.2592154364248658, + "grad_norm": 11.375, + "learning_rate": 9.86838903494682e-06, + "loss": 2.011813163757324, + "step": 1424 + }, + { + "epoch": 0.25957950304905797, + "grad_norm": 12.875, + "learning_rate": 9.867998924036622e-06, + "loss": 1.4388245344161987, + "step": 1426 + }, + { + "epoch": 0.2599435696732502, + "grad_norm": 12.375, + "learning_rate": 9.867608245513256e-06, + "loss": 1.5373148918151855, + "step": 1428 + }, + { + "epoch": 0.2603076362974424, + "grad_norm": 10.5, + "learning_rate": 9.867216999434057e-06, + "loss": 1.4208545684814453, + "step": 1430 + }, + { + "epoch": 0.26067170292163466, + "grad_norm": 14.125, + "learning_rate": 9.866825185856435e-06, + "loss": 0.8961284756660461, + "step": 1432 + }, + { + "epoch": 0.2610357695458269, + "grad_norm": 16.25, + "learning_rate": 9.866432804837886e-06, + "loss": 0.7015911936759949, + "step": 1434 + }, + { + "epoch": 0.2613998361700191, + "grad_norm": 11.3125, + "learning_rate": 9.866039856435994e-06, + "loss": 1.6991215944290161, + "step": 1436 + }, + { + "epoch": 0.26176390279421136, + "grad_norm": 6.9375, + "learning_rate": 9.865646340708422e-06, + "loss": 1.207765817642212, + "step": 1438 + }, + { + "epoch": 0.26212796941840355, + "grad_norm": 48.75, + "learning_rate": 9.865252257712914e-06, + "loss": 1.382554531097412, + "step": 1440 + }, + { + "epoch": 0.2624920360425958, + "grad_norm": 5.09375, + "learning_rate": 9.864857607507302e-06, + "loss": 1.594775676727295, + "step": 1442 + }, + { + "epoch": 0.26285610266678805, + "grad_norm": 19.125, + "learning_rate": 9.864462390149499e-06, + "loss": 1.1088712215423584, + "step": 1444 + }, + { + "epoch": 0.26322016929098024, + "grad_norm": 8.4375, + "learning_rate": 9.8640666056975e-06, + "loss": 1.2831988334655762, + "step": 1446 + }, + { + "epoch": 0.2635842359151725, + "grad_norm": 9.875, + "learning_rate": 9.863670254209388e-06, + "loss": 1.4446523189544678, + "step": 1448 + }, + { + "epoch": 0.2639483025393647, + "grad_norm": 9.125, + "learning_rate": 9.863273335743324e-06, + "loss": 1.4284683465957642, + "step": 1450 + }, + { + "epoch": 0.26431236916355694, + "grad_norm": 11.875, + "learning_rate": 9.862875850357553e-06, + "loss": 1.5124616622924805, + "step": 1452 + }, + { + "epoch": 0.26467643578774913, + "grad_norm": 3.859375, + "learning_rate": 9.862477798110408e-06, + "loss": 1.2909436225891113, + "step": 1454 + }, + { + "epoch": 0.2650405024119414, + "grad_norm": 9.25, + "learning_rate": 9.862079179060298e-06, + "loss": 1.0865919589996338, + "step": 1456 + }, + { + "epoch": 0.26540456903613363, + "grad_norm": 34.0, + "learning_rate": 9.86167999326572e-06, + "loss": 1.8287204504013062, + "step": 1458 + }, + { + "epoch": 0.2657686356603258, + "grad_norm": 17.375, + "learning_rate": 9.861280240785257e-06, + "loss": 1.4243782758712769, + "step": 1460 + }, + { + "epoch": 0.2661327022845181, + "grad_norm": 11.0, + "learning_rate": 9.860879921677561e-06, + "loss": 1.4999065399169922, + "step": 1462 + }, + { + "epoch": 0.26649676890871027, + "grad_norm": 6.53125, + "learning_rate": 9.860479036001386e-06, + "loss": 1.460152268409729, + "step": 1464 + }, + { + "epoch": 0.2668608355329025, + "grad_norm": 10.1875, + "learning_rate": 9.86007758381556e-06, + "loss": 1.4036602973937988, + "step": 1466 + }, + { + "epoch": 0.26722490215709477, + "grad_norm": 21.125, + "learning_rate": 9.859675565178988e-06, + "loss": 1.90579354763031, + "step": 1468 + }, + { + "epoch": 0.26758896878128696, + "grad_norm": 24.125, + "learning_rate": 9.859272980150669e-06, + "loss": 1.1752175092697144, + "step": 1470 + }, + { + "epoch": 0.2679530354054792, + "grad_norm": 11.625, + "learning_rate": 9.858869828789684e-06, + "loss": 1.4299908876419067, + "step": 1472 + }, + { + "epoch": 0.2683171020296714, + "grad_norm": 10.0625, + "learning_rate": 9.858466111155188e-06, + "loss": 1.4527727365493774, + "step": 1474 + }, + { + "epoch": 0.26868116865386366, + "grad_norm": 7.375, + "learning_rate": 9.858061827306427e-06, + "loss": 1.5154187679290771, + "step": 1476 + }, + { + "epoch": 0.2690452352780559, + "grad_norm": 13.4375, + "learning_rate": 9.857656977302727e-06, + "loss": 1.2620866298675537, + "step": 1478 + }, + { + "epoch": 0.2694093019022481, + "grad_norm": 8.8125, + "learning_rate": 9.857251561203503e-06, + "loss": 1.7393077611923218, + "step": 1480 + }, + { + "epoch": 0.26977336852644035, + "grad_norm": 13.375, + "learning_rate": 9.856845579068242e-06, + "loss": 1.2178714275360107, + "step": 1482 + }, + { + "epoch": 0.27013743515063254, + "grad_norm": 15.125, + "learning_rate": 9.856439030956521e-06, + "loss": 1.5004127025604248, + "step": 1484 + }, + { + "epoch": 0.2705015017748248, + "grad_norm": 14.875, + "learning_rate": 9.856031916928004e-06, + "loss": 1.3529198169708252, + "step": 1486 + }, + { + "epoch": 0.27086556839901704, + "grad_norm": 15.1875, + "learning_rate": 9.85562423704243e-06, + "loss": 1.7885589599609375, + "step": 1488 + }, + { + "epoch": 0.27122963502320924, + "grad_norm": 12.8125, + "learning_rate": 9.855215991359624e-06, + "loss": 1.7847390174865723, + "step": 1490 + }, + { + "epoch": 0.2715937016474015, + "grad_norm": 31.875, + "learning_rate": 9.854807179939493e-06, + "loss": 1.7920869588851929, + "step": 1492 + }, + { + "epoch": 0.2719577682715937, + "grad_norm": 10.6875, + "learning_rate": 9.854397802842036e-06, + "loss": 1.6153438091278076, + "step": 1494 + }, + { + "epoch": 0.27232183489578593, + "grad_norm": 23.0, + "learning_rate": 9.853987860127316e-06, + "loss": 1.0620189905166626, + "step": 1496 + }, + { + "epoch": 0.2726859015199782, + "grad_norm": 30.875, + "learning_rate": 9.8535773518555e-06, + "loss": 1.0283299684524536, + "step": 1498 + }, + { + "epoch": 0.2730499681441704, + "grad_norm": 21.0, + "learning_rate": 9.853166278086823e-06, + "loss": 1.4579250812530518, + "step": 1500 + }, + { + "epoch": 0.2734140347683626, + "grad_norm": 6.96875, + "learning_rate": 9.852754638881611e-06, + "loss": 0.9517868757247925, + "step": 1502 + }, + { + "epoch": 0.2737781013925548, + "grad_norm": 26.375, + "learning_rate": 9.85234243430027e-06, + "loss": 1.4180676937103271, + "step": 1504 + }, + { + "epoch": 0.27414216801674707, + "grad_norm": 17.125, + "learning_rate": 9.851929664403288e-06, + "loss": 1.859014630317688, + "step": 1506 + }, + { + "epoch": 0.2745062346409393, + "grad_norm": 26.0, + "learning_rate": 9.851516329251238e-06, + "loss": 1.5463061332702637, + "step": 1508 + }, + { + "epoch": 0.2748703012651315, + "grad_norm": 15.4375, + "learning_rate": 9.851102428904777e-06, + "loss": 1.5154248476028442, + "step": 1510 + }, + { + "epoch": 0.27523436788932376, + "grad_norm": 11.0625, + "learning_rate": 9.85068796342464e-06, + "loss": 1.578622579574585, + "step": 1512 + }, + { + "epoch": 0.27559843451351596, + "grad_norm": 13.3125, + "learning_rate": 9.850272932871652e-06, + "loss": 1.9917669296264648, + "step": 1514 + }, + { + "epoch": 0.2759625011377082, + "grad_norm": 14.125, + "learning_rate": 9.849857337306715e-06, + "loss": 1.6202292442321777, + "step": 1516 + }, + { + "epoch": 0.27632656776190045, + "grad_norm": 9.4375, + "learning_rate": 9.849441176790812e-06, + "loss": 1.3205862045288086, + "step": 1518 + }, + { + "epoch": 0.27669063438609265, + "grad_norm": 4.40625, + "learning_rate": 9.849024451385019e-06, + "loss": 1.511529803276062, + "step": 1520 + }, + { + "epoch": 0.2770547010102849, + "grad_norm": 4.21875, + "learning_rate": 9.848607161150488e-06, + "loss": 1.0927166938781738, + "step": 1522 + }, + { + "epoch": 0.2774187676344771, + "grad_norm": 13.5625, + "learning_rate": 9.848189306148453e-06, + "loss": 1.257932186126709, + "step": 1524 + }, + { + "epoch": 0.27778283425866934, + "grad_norm": 15.0, + "learning_rate": 9.847770886440229e-06, + "loss": 1.434149980545044, + "step": 1526 + }, + { + "epoch": 0.27814690088286154, + "grad_norm": 18.25, + "learning_rate": 9.847351902087225e-06, + "loss": 1.5815212726593018, + "step": 1528 + }, + { + "epoch": 0.2785109675070538, + "grad_norm": 39.5, + "learning_rate": 9.84693235315092e-06, + "loss": 1.7690224647521973, + "step": 1530 + }, + { + "epoch": 0.27887503413124604, + "grad_norm": 17.375, + "learning_rate": 9.846512239692883e-06, + "loss": 1.7450687885284424, + "step": 1532 + }, + { + "epoch": 0.27923910075543823, + "grad_norm": 14.5, + "learning_rate": 9.846091561774762e-06, + "loss": 1.1304049491882324, + "step": 1534 + }, + { + "epoch": 0.2796031673796305, + "grad_norm": 25.625, + "learning_rate": 9.84567031945829e-06, + "loss": 1.400388479232788, + "step": 1536 + }, + { + "epoch": 0.2799672340038227, + "grad_norm": 26.25, + "learning_rate": 9.845248512805288e-06, + "loss": 1.3323603868484497, + "step": 1538 + }, + { + "epoch": 0.2803313006280149, + "grad_norm": 9.1875, + "learning_rate": 9.844826141877646e-06, + "loss": 0.9121460318565369, + "step": 1540 + }, + { + "epoch": 0.2806953672522072, + "grad_norm": 78.0, + "learning_rate": 9.844403206737352e-06, + "loss": 1.4342448711395264, + "step": 1542 + }, + { + "epoch": 0.28105943387639937, + "grad_norm": 9.75, + "learning_rate": 9.843979707446468e-06, + "loss": 1.3524324893951416, + "step": 1544 + }, + { + "epoch": 0.2814235005005916, + "grad_norm": 18.5, + "learning_rate": 9.843555644067138e-06, + "loss": 0.9413845539093018, + "step": 1546 + }, + { + "epoch": 0.2817875671247838, + "grad_norm": 8.0625, + "learning_rate": 9.843131016661594e-06, + "loss": 1.6236538887023926, + "step": 1548 + }, + { + "epoch": 0.28215163374897606, + "grad_norm": 16.75, + "learning_rate": 9.84270582529215e-06, + "loss": 1.3235279321670532, + "step": 1550 + }, + { + "epoch": 0.2825157003731683, + "grad_norm": 17.875, + "learning_rate": 9.842280070021198e-06, + "loss": 1.366626501083374, + "step": 1552 + }, + { + "epoch": 0.2828797669973605, + "grad_norm": 15.75, + "learning_rate": 9.841853750911216e-06, + "loss": 1.478304386138916, + "step": 1554 + }, + { + "epoch": 0.28324383362155275, + "grad_norm": 20.375, + "learning_rate": 9.841426868024768e-06, + "loss": 1.4233434200286865, + "step": 1556 + }, + { + "epoch": 0.28360790024574495, + "grad_norm": 7.75, + "learning_rate": 9.840999421424494e-06, + "loss": 1.4747873544692993, + "step": 1558 + }, + { + "epoch": 0.2839719668699372, + "grad_norm": 9.1875, + "learning_rate": 9.84057141117312e-06, + "loss": 1.232305645942688, + "step": 1560 + }, + { + "epoch": 0.28433603349412945, + "grad_norm": 8.25, + "learning_rate": 9.840142837333457e-06, + "loss": 1.073418378829956, + "step": 1562 + }, + { + "epoch": 0.28470010011832164, + "grad_norm": 4.125, + "learning_rate": 9.839713699968396e-06, + "loss": 1.256094217300415, + "step": 1564 + }, + { + "epoch": 0.2850641667425139, + "grad_norm": 41.0, + "learning_rate": 9.839283999140909e-06, + "loss": 1.3534482717514038, + "step": 1566 + }, + { + "epoch": 0.2854282333667061, + "grad_norm": 20.25, + "learning_rate": 9.838853734914055e-06, + "loss": 1.4696502685546875, + "step": 1568 + }, + { + "epoch": 0.28579229999089834, + "grad_norm": 6.0, + "learning_rate": 9.838422907350972e-06, + "loss": 1.550337314605713, + "step": 1570 + }, + { + "epoch": 0.2861563666150906, + "grad_norm": 14.3125, + "learning_rate": 9.837991516514886e-06, + "loss": 1.1209887266159058, + "step": 1572 + }, + { + "epoch": 0.2865204332392828, + "grad_norm": 13.0625, + "learning_rate": 9.837559562469096e-06, + "loss": 1.3484582901000977, + "step": 1574 + }, + { + "epoch": 0.28688449986347503, + "grad_norm": 8.9375, + "learning_rate": 9.837127045276996e-06, + "loss": 1.2887582778930664, + "step": 1576 + }, + { + "epoch": 0.2872485664876672, + "grad_norm": 10.125, + "learning_rate": 9.83669396500205e-06, + "loss": 1.5810158252716064, + "step": 1578 + }, + { + "epoch": 0.2876126331118595, + "grad_norm": 17.5, + "learning_rate": 9.836260321707813e-06, + "loss": 1.284956932067871, + "step": 1580 + }, + { + "epoch": 0.2879766997360517, + "grad_norm": 12.75, + "learning_rate": 9.835826115457922e-06, + "loss": 1.606035590171814, + "step": 1582 + }, + { + "epoch": 0.2883407663602439, + "grad_norm": 8.5625, + "learning_rate": 9.835391346316093e-06, + "loss": 1.323326587677002, + "step": 1584 + }, + { + "epoch": 0.28870483298443617, + "grad_norm": 10.3125, + "learning_rate": 9.834956014346127e-06, + "loss": 1.0132527351379395, + "step": 1586 + }, + { + "epoch": 0.28906889960862836, + "grad_norm": 12.25, + "learning_rate": 9.834520119611908e-06, + "loss": 1.1451250314712524, + "step": 1588 + }, + { + "epoch": 0.2894329662328206, + "grad_norm": 3.671875, + "learning_rate": 9.834083662177403e-06, + "loss": 1.3297022581100464, + "step": 1590 + }, + { + "epoch": 0.28979703285701286, + "grad_norm": 9.8125, + "learning_rate": 9.833646642106657e-06, + "loss": 1.092158555984497, + "step": 1592 + }, + { + "epoch": 0.29016109948120505, + "grad_norm": 11.125, + "learning_rate": 9.833209059463804e-06, + "loss": 1.427994728088379, + "step": 1594 + }, + { + "epoch": 0.2905251661053973, + "grad_norm": 10.625, + "learning_rate": 9.832770914313055e-06, + "loss": 1.4076528549194336, + "step": 1596 + }, + { + "epoch": 0.2908892327295895, + "grad_norm": 11.25, + "learning_rate": 9.832332206718706e-06, + "loss": 1.4831396341323853, + "step": 1598 + }, + { + "epoch": 0.29125329935378175, + "grad_norm": 6.4375, + "learning_rate": 9.83189293674514e-06, + "loss": 1.3140579462051392, + "step": 1600 + }, + { + "epoch": 0.291617365977974, + "grad_norm": 19.125, + "learning_rate": 9.831453104456812e-06, + "loss": 1.156083345413208, + "step": 1602 + }, + { + "epoch": 0.2919814326021662, + "grad_norm": 8.3125, + "learning_rate": 9.83101270991827e-06, + "loss": 0.5887618064880371, + "step": 1604 + }, + { + "epoch": 0.29234549922635844, + "grad_norm": 6.125, + "learning_rate": 9.830571753194137e-06, + "loss": 1.233492374420166, + "step": 1606 + }, + { + "epoch": 0.29270956585055063, + "grad_norm": 8.875, + "learning_rate": 9.830130234349127e-06, + "loss": 1.5636699199676514, + "step": 1608 + }, + { + "epoch": 0.2930736324747429, + "grad_norm": 15.5, + "learning_rate": 9.829688153448023e-06, + "loss": 1.3661271333694458, + "step": 1610 + }, + { + "epoch": 0.2934376990989351, + "grad_norm": 9.0625, + "learning_rate": 9.829245510555704e-06, + "loss": 1.3555999994277954, + "step": 1612 + }, + { + "epoch": 0.29380176572312733, + "grad_norm": 11.125, + "learning_rate": 9.828802305737127e-06, + "loss": 1.480827808380127, + "step": 1614 + }, + { + "epoch": 0.2941658323473196, + "grad_norm": 13.3125, + "learning_rate": 9.828358539057325e-06, + "loss": 1.3503471612930298, + "step": 1616 + }, + { + "epoch": 0.29452989897151177, + "grad_norm": 5.625, + "learning_rate": 9.827914210581425e-06, + "loss": 1.2063119411468506, + "step": 1618 + }, + { + "epoch": 0.294893965595704, + "grad_norm": 6.21875, + "learning_rate": 9.827469320374627e-06, + "loss": 1.237090826034546, + "step": 1620 + }, + { + "epoch": 0.2952580322198962, + "grad_norm": 19.75, + "learning_rate": 9.827023868502218e-06, + "loss": 1.2755967378616333, + "step": 1622 + }, + { + "epoch": 0.29562209884408847, + "grad_norm": 12.5625, + "learning_rate": 9.826577855029564e-06, + "loss": 1.341290831565857, + "step": 1624 + }, + { + "epoch": 0.2959861654682807, + "grad_norm": 6.90625, + "learning_rate": 9.82613128002212e-06, + "loss": 1.2220969200134277, + "step": 1626 + }, + { + "epoch": 0.2963502320924729, + "grad_norm": 14.25, + "learning_rate": 9.825684143545416e-06, + "loss": 1.5383570194244385, + "step": 1628 + }, + { + "epoch": 0.29671429871666516, + "grad_norm": 8.9375, + "learning_rate": 9.825236445665068e-06, + "loss": 1.8045324087142944, + "step": 1630 + }, + { + "epoch": 0.29707836534085735, + "grad_norm": 9.625, + "learning_rate": 9.824788186446771e-06, + "loss": 1.0440577268600464, + "step": 1632 + }, + { + "epoch": 0.2974424319650496, + "grad_norm": 15.3125, + "learning_rate": 9.824339365956313e-06, + "loss": 1.281540036201477, + "step": 1634 + }, + { + "epoch": 0.29780649858924185, + "grad_norm": 7.625, + "learning_rate": 9.823889984259546e-06, + "loss": 0.13898517191410065, + "step": 1636 + }, + { + "epoch": 0.29817056521343405, + "grad_norm": 21.0, + "learning_rate": 9.823440041422424e-06, + "loss": 0.4255257546901703, + "step": 1638 + }, + { + "epoch": 0.2985346318376263, + "grad_norm": 19.75, + "learning_rate": 9.822989537510975e-06, + "loss": 1.3400230407714844, + "step": 1640 + }, + { + "epoch": 0.2988986984618185, + "grad_norm": 63.5, + "learning_rate": 9.8225384725913e-06, + "loss": 0.9882588386535645, + "step": 1642 + }, + { + "epoch": 0.29926276508601074, + "grad_norm": 11.8125, + "learning_rate": 9.822086846729595e-06, + "loss": 1.5584814548492432, + "step": 1644 + }, + { + "epoch": 0.299626831710203, + "grad_norm": 8.9375, + "learning_rate": 9.821634659992137e-06, + "loss": 1.435099482536316, + "step": 1646 + }, + { + "epoch": 0.2999908983343952, + "grad_norm": 6.3125, + "learning_rate": 9.821181912445278e-06, + "loss": 0.9147496819496155, + "step": 1648 + }, + { + "epoch": 0.30035496495858743, + "grad_norm": 9.0625, + "learning_rate": 9.820728604155461e-06, + "loss": 1.538068175315857, + "step": 1650 + }, + { + "epoch": 0.30071903158277963, + "grad_norm": 21.25, + "learning_rate": 9.820274735189203e-06, + "loss": 1.2914254665374756, + "step": 1652 + }, + { + "epoch": 0.3010830982069719, + "grad_norm": 14.1875, + "learning_rate": 9.819820305613113e-06, + "loss": 1.0761820077896118, + "step": 1654 + }, + { + "epoch": 0.3014471648311641, + "grad_norm": 23.5, + "learning_rate": 9.819365315493871e-06, + "loss": 1.4292621612548828, + "step": 1656 + }, + { + "epoch": 0.3018112314553563, + "grad_norm": 33.5, + "learning_rate": 9.818909764898251e-06, + "loss": 1.3318498134613037, + "step": 1658 + }, + { + "epoch": 0.30217529807954857, + "grad_norm": 17.625, + "learning_rate": 9.818453653893097e-06, + "loss": 0.6170845627784729, + "step": 1660 + }, + { + "epoch": 0.30253936470374077, + "grad_norm": 28.125, + "learning_rate": 9.817996982545346e-06, + "loss": 1.8151825666427612, + "step": 1662 + }, + { + "epoch": 0.302903431327933, + "grad_norm": 8.1875, + "learning_rate": 9.81753975092201e-06, + "loss": 1.3903858661651611, + "step": 1664 + }, + { + "epoch": 0.30326749795212526, + "grad_norm": 10.1875, + "learning_rate": 9.817081959090184e-06, + "loss": 1.486268401145935, + "step": 1666 + }, + { + "epoch": 0.30363156457631746, + "grad_norm": 12.9375, + "learning_rate": 9.816623607117053e-06, + "loss": 1.3959635496139526, + "step": 1668 + }, + { + "epoch": 0.3039956312005097, + "grad_norm": 19.125, + "learning_rate": 9.816164695069874e-06, + "loss": 1.384684443473816, + "step": 1670 + }, + { + "epoch": 0.3043596978247019, + "grad_norm": 9.9375, + "learning_rate": 9.81570522301599e-06, + "loss": 0.8759585022926331, + "step": 1672 + }, + { + "epoch": 0.30472376444889415, + "grad_norm": 18.625, + "learning_rate": 9.815245191022832e-06, + "loss": 1.1100056171417236, + "step": 1674 + }, + { + "epoch": 0.3050878310730864, + "grad_norm": 5.1875, + "learning_rate": 9.8147845991579e-06, + "loss": 0.5108053684234619, + "step": 1676 + }, + { + "epoch": 0.3054518976972786, + "grad_norm": 16.0, + "learning_rate": 9.81432344748879e-06, + "loss": 1.3095556497573853, + "step": 1678 + }, + { + "epoch": 0.30581596432147085, + "grad_norm": 16.375, + "learning_rate": 9.813861736083172e-06, + "loss": 1.2732908725738525, + "step": 1680 + }, + { + "epoch": 0.30618003094566304, + "grad_norm": 8.625, + "learning_rate": 9.813399465008802e-06, + "loss": 1.582122802734375, + "step": 1682 + }, + { + "epoch": 0.3065440975698553, + "grad_norm": 4.90625, + "learning_rate": 9.812936634333512e-06, + "loss": 0.9677426218986511, + "step": 1684 + }, + { + "epoch": 0.3069081641940475, + "grad_norm": 9.4375, + "learning_rate": 9.812473244125225e-06, + "loss": 1.2957812547683716, + "step": 1686 + }, + { + "epoch": 0.30727223081823973, + "grad_norm": 7.3125, + "learning_rate": 9.812009294451939e-06, + "loss": 1.3681244850158691, + "step": 1688 + }, + { + "epoch": 0.307636297442432, + "grad_norm": 10.25, + "learning_rate": 9.811544785381738e-06, + "loss": 1.2751357555389404, + "step": 1690 + }, + { + "epoch": 0.3080003640666242, + "grad_norm": 55.0, + "learning_rate": 9.811079716982787e-06, + "loss": 1.1589667797088623, + "step": 1692 + }, + { + "epoch": 0.3083644306908164, + "grad_norm": 11.75, + "learning_rate": 9.810614089323333e-06, + "loss": 1.3476650714874268, + "step": 1694 + }, + { + "epoch": 0.3087284973150086, + "grad_norm": 60.25, + "learning_rate": 9.810147902471706e-06, + "loss": 1.2627832889556885, + "step": 1696 + }, + { + "epoch": 0.30909256393920087, + "grad_norm": 64.0, + "learning_rate": 9.809681156496313e-06, + "loss": 0.7154263854026794, + "step": 1698 + }, + { + "epoch": 0.3094566305633931, + "grad_norm": 7.0, + "learning_rate": 9.809213851465652e-06, + "loss": 1.081131100654602, + "step": 1700 + }, + { + "epoch": 0.3098206971875853, + "grad_norm": 15.625, + "learning_rate": 9.808745987448292e-06, + "loss": 1.3883049488067627, + "step": 1702 + }, + { + "epoch": 0.31018476381177756, + "grad_norm": 26.5, + "learning_rate": 9.808277564512896e-06, + "loss": 1.5548325777053833, + "step": 1704 + }, + { + "epoch": 0.31054883043596976, + "grad_norm": 13.4375, + "learning_rate": 9.8078085827282e-06, + "loss": 1.3953886032104492, + "step": 1706 + }, + { + "epoch": 0.310912897060162, + "grad_norm": 6.375, + "learning_rate": 9.807339042163027e-06, + "loss": 1.2681118249893188, + "step": 1708 + }, + { + "epoch": 0.31127696368435426, + "grad_norm": 20.25, + "learning_rate": 9.80686894288628e-06, + "loss": 1.185044765472412, + "step": 1710 + }, + { + "epoch": 0.31164103030854645, + "grad_norm": 17.75, + "learning_rate": 9.806398284966943e-06, + "loss": 1.1058142185211182, + "step": 1712 + }, + { + "epoch": 0.3120050969327387, + "grad_norm": 4.375, + "learning_rate": 9.805927068474083e-06, + "loss": 1.2485921382904053, + "step": 1714 + }, + { + "epoch": 0.3123691635569309, + "grad_norm": 15.875, + "learning_rate": 9.805455293476848e-06, + "loss": 1.3581323623657227, + "step": 1716 + }, + { + "epoch": 0.31273323018112315, + "grad_norm": 17.875, + "learning_rate": 9.804982960044475e-06, + "loss": 1.1913976669311523, + "step": 1718 + }, + { + "epoch": 0.3130972968053154, + "grad_norm": 9.5625, + "learning_rate": 9.804510068246271e-06, + "loss": 1.160415530204773, + "step": 1720 + }, + { + "epoch": 0.3134613634295076, + "grad_norm": 35.5, + "learning_rate": 9.804036618151633e-06, + "loss": 1.5496083498001099, + "step": 1722 + }, + { + "epoch": 0.31382543005369984, + "grad_norm": 23.875, + "learning_rate": 9.803562609830037e-06, + "loss": 1.7188440561294556, + "step": 1724 + }, + { + "epoch": 0.31418949667789203, + "grad_norm": 14.0625, + "learning_rate": 9.803088043351043e-06, + "loss": 1.419533610343933, + "step": 1726 + }, + { + "epoch": 0.3145535633020843, + "grad_norm": 28.375, + "learning_rate": 9.802612918784291e-06, + "loss": 1.4445359706878662, + "step": 1728 + }, + { + "epoch": 0.31491762992627653, + "grad_norm": 6.59375, + "learning_rate": 9.802137236199505e-06, + "loss": 1.1445637941360474, + "step": 1730 + }, + { + "epoch": 0.3152816965504687, + "grad_norm": 11.5, + "learning_rate": 9.801660995666486e-06, + "loss": 1.4296021461486816, + "step": 1732 + }, + { + "epoch": 0.315645763174661, + "grad_norm": 16.75, + "learning_rate": 9.801184197255125e-06, + "loss": 1.4084744453430176, + "step": 1734 + }, + { + "epoch": 0.31600982979885317, + "grad_norm": 6.03125, + "learning_rate": 9.800706841035385e-06, + "loss": 1.0689120292663574, + "step": 1736 + }, + { + "epoch": 0.3163738964230454, + "grad_norm": 9.0625, + "learning_rate": 9.800228927077322e-06, + "loss": 1.274327039718628, + "step": 1738 + }, + { + "epoch": 0.31673796304723767, + "grad_norm": 14.1875, + "learning_rate": 9.799750455451065e-06, + "loss": 1.524749994277954, + "step": 1740 + }, + { + "epoch": 0.31710202967142986, + "grad_norm": 13.125, + "learning_rate": 9.799271426226823e-06, + "loss": 1.7396551370620728, + "step": 1742 + }, + { + "epoch": 0.3174660962956221, + "grad_norm": 22.0, + "learning_rate": 9.798791839474902e-06, + "loss": 1.7445507049560547, + "step": 1744 + }, + { + "epoch": 0.3178301629198143, + "grad_norm": 39.75, + "learning_rate": 9.798311695265672e-06, + "loss": 1.9367200136184692, + "step": 1746 + }, + { + "epoch": 0.31819422954400656, + "grad_norm": 3.28125, + "learning_rate": 9.797830993669592e-06, + "loss": 1.2434825897216797, + "step": 1748 + }, + { + "epoch": 0.3185582961681988, + "grad_norm": 7.1875, + "learning_rate": 9.797349734757206e-06, + "loss": 1.0539696216583252, + "step": 1750 + }, + { + "epoch": 0.318922362792391, + "grad_norm": 9.9375, + "learning_rate": 9.796867918599138e-06, + "loss": 1.5123122930526733, + "step": 1752 + }, + { + "epoch": 0.31928642941658325, + "grad_norm": 9.75, + "learning_rate": 9.796385545266086e-06, + "loss": 1.2620917558670044, + "step": 1754 + }, + { + "epoch": 0.31965049604077544, + "grad_norm": 15.0, + "learning_rate": 9.795902614828846e-06, + "loss": 1.4678680896759033, + "step": 1756 + }, + { + "epoch": 0.3200145626649677, + "grad_norm": 10.4375, + "learning_rate": 9.795419127358276e-06, + "loss": 1.2457855939865112, + "step": 1758 + }, + { + "epoch": 0.3203786292891599, + "grad_norm": 6.15625, + "learning_rate": 9.794935082925333e-06, + "loss": 1.2296313047409058, + "step": 1760 + }, + { + "epoch": 0.32074269591335214, + "grad_norm": 6.15625, + "learning_rate": 9.794450481601046e-06, + "loss": 1.0726215839385986, + "step": 1762 + }, + { + "epoch": 0.3211067625375444, + "grad_norm": 11.0, + "learning_rate": 9.793965323456526e-06, + "loss": 1.5971686840057373, + "step": 1764 + }, + { + "epoch": 0.3214708291617366, + "grad_norm": 5.3125, + "learning_rate": 9.793479608562972e-06, + "loss": 1.3719642162322998, + "step": 1766 + }, + { + "epoch": 0.32183489578592883, + "grad_norm": 7.3125, + "learning_rate": 9.79299333699166e-06, + "loss": 1.2214471101760864, + "step": 1768 + }, + { + "epoch": 0.322198962410121, + "grad_norm": 6.5625, + "learning_rate": 9.792506508813946e-06, + "loss": 1.3700158596038818, + "step": 1770 + }, + { + "epoch": 0.3225630290343133, + "grad_norm": 9.8125, + "learning_rate": 9.792019124101273e-06, + "loss": 1.152277946472168, + "step": 1772 + }, + { + "epoch": 0.3229270956585055, + "grad_norm": 17.25, + "learning_rate": 9.79153118292516e-06, + "loss": 0.443513959646225, + "step": 1774 + }, + { + "epoch": 0.3232911622826977, + "grad_norm": 11.5, + "learning_rate": 9.791042685357212e-06, + "loss": 1.2890838384628296, + "step": 1776 + }, + { + "epoch": 0.32365522890688997, + "grad_norm": 7.71875, + "learning_rate": 9.790553631469116e-06, + "loss": 1.5680713653564453, + "step": 1778 + }, + { + "epoch": 0.32401929553108216, + "grad_norm": 23.5, + "learning_rate": 9.790064021332633e-06, + "loss": 1.4733153581619263, + "step": 1780 + }, + { + "epoch": 0.3243833621552744, + "grad_norm": 15.4375, + "learning_rate": 9.789573855019616e-06, + "loss": 1.6985194683074951, + "step": 1782 + }, + { + "epoch": 0.32474742877946666, + "grad_norm": 9.8125, + "learning_rate": 9.789083132601992e-06, + "loss": 1.5019042491912842, + "step": 1784 + }, + { + "epoch": 0.32511149540365886, + "grad_norm": 15.125, + "learning_rate": 9.788591854151777e-06, + "loss": 1.2202986478805542, + "step": 1786 + }, + { + "epoch": 0.3254755620278511, + "grad_norm": 764.0, + "learning_rate": 9.788100019741059e-06, + "loss": 1.676939606666565, + "step": 1788 + }, + { + "epoch": 0.3258396286520433, + "grad_norm": 17.25, + "learning_rate": 9.787607629442015e-06, + "loss": 1.8803229331970215, + "step": 1790 + }, + { + "epoch": 0.32620369527623555, + "grad_norm": 15.0625, + "learning_rate": 9.787114683326903e-06, + "loss": 1.6080893278121948, + "step": 1792 + }, + { + "epoch": 0.3265677619004278, + "grad_norm": 5.78125, + "learning_rate": 9.786621181468057e-06, + "loss": 1.2275793552398682, + "step": 1794 + }, + { + "epoch": 0.32693182852462, + "grad_norm": 17.125, + "learning_rate": 9.786127123937901e-06, + "loss": 0.8695844411849976, + "step": 1796 + }, + { + "epoch": 0.32729589514881224, + "grad_norm": 21.125, + "learning_rate": 9.78563251080893e-06, + "loss": 0.6618102788925171, + "step": 1798 + }, + { + "epoch": 0.32765996177300444, + "grad_norm": 15.875, + "learning_rate": 9.785137342153733e-06, + "loss": 1.7897183895111084, + "step": 1800 + }, + { + "epoch": 0.3280240283971967, + "grad_norm": 12.5625, + "learning_rate": 9.784641618044968e-06, + "loss": 1.4144792556762695, + "step": 1802 + }, + { + "epoch": 0.32838809502138894, + "grad_norm": 11.3125, + "learning_rate": 9.784145338555384e-06, + "loss": 1.4410724639892578, + "step": 1804 + }, + { + "epoch": 0.32875216164558113, + "grad_norm": 6.125, + "learning_rate": 9.783648503757809e-06, + "loss": 1.3081823587417603, + "step": 1806 + }, + { + "epoch": 0.3291162282697734, + "grad_norm": 3.671875, + "learning_rate": 9.783151113725148e-06, + "loss": 1.0756101608276367, + "step": 1808 + }, + { + "epoch": 0.3294802948939656, + "grad_norm": 11.9375, + "learning_rate": 9.782653168530397e-06, + "loss": 0.8786813020706177, + "step": 1810 + }, + { + "epoch": 0.3298443615181578, + "grad_norm": 20.75, + "learning_rate": 9.78215466824662e-06, + "loss": 1.1601600646972656, + "step": 1812 + }, + { + "epoch": 0.3302084281423501, + "grad_norm": 29.375, + "learning_rate": 9.781655612946971e-06, + "loss": 1.689734935760498, + "step": 1814 + }, + { + "epoch": 0.33057249476654227, + "grad_norm": 11.8125, + "learning_rate": 9.78115600270469e-06, + "loss": 1.6082452535629272, + "step": 1816 + }, + { + "epoch": 0.3309365613907345, + "grad_norm": 10.9375, + "learning_rate": 9.780655837593087e-06, + "loss": 1.4208040237426758, + "step": 1818 + }, + { + "epoch": 0.3313006280149267, + "grad_norm": 15.4375, + "learning_rate": 9.780155117685564e-06, + "loss": 1.8875669240951538, + "step": 1820 + }, + { + "epoch": 0.33166469463911896, + "grad_norm": 9.25, + "learning_rate": 9.779653843055594e-06, + "loss": 1.3704841136932373, + "step": 1822 + }, + { + "epoch": 0.3320287612633112, + "grad_norm": 16.125, + "learning_rate": 9.779152013776743e-06, + "loss": 1.2594153881072998, + "step": 1824 + }, + { + "epoch": 0.3323928278875034, + "grad_norm": 54.5, + "learning_rate": 9.778649629922647e-06, + "loss": 0.7861361503601074, + "step": 1826 + }, + { + "epoch": 0.33275689451169566, + "grad_norm": 2.28125, + "learning_rate": 9.778146691567034e-06, + "loss": 0.9009575843811035, + "step": 1828 + }, + { + "epoch": 0.33312096113588785, + "grad_norm": 9.9375, + "learning_rate": 9.777643198783703e-06, + "loss": 1.0703434944152832, + "step": 1830 + }, + { + "epoch": 0.3334850277600801, + "grad_norm": 6.625, + "learning_rate": 9.777139151646545e-06, + "loss": 1.5458961725234985, + "step": 1832 + }, + { + "epoch": 0.33384909438427235, + "grad_norm": 11.1875, + "learning_rate": 9.776634550229523e-06, + "loss": 1.2762070894241333, + "step": 1834 + }, + { + "epoch": 0.33421316100846454, + "grad_norm": 25.0, + "learning_rate": 9.776129394606684e-06, + "loss": 1.1095106601715088, + "step": 1836 + }, + { + "epoch": 0.3345772276326568, + "grad_norm": 21.875, + "learning_rate": 9.77562368485216e-06, + "loss": 1.7597427368164062, + "step": 1838 + }, + { + "epoch": 0.334941294256849, + "grad_norm": 3.46875, + "learning_rate": 9.775117421040163e-06, + "loss": 0.9171950817108154, + "step": 1840 + }, + { + "epoch": 0.33530536088104124, + "grad_norm": 14.0625, + "learning_rate": 9.774610603244983e-06, + "loss": 1.3025262355804443, + "step": 1842 + }, + { + "epoch": 0.33566942750523343, + "grad_norm": 9.3125, + "learning_rate": 9.774103231540995e-06, + "loss": 1.7508900165557861, + "step": 1844 + }, + { + "epoch": 0.3360334941294257, + "grad_norm": 8.3125, + "learning_rate": 9.773595306002652e-06, + "loss": 1.1705724000930786, + "step": 1846 + }, + { + "epoch": 0.33639756075361793, + "grad_norm": 10.375, + "learning_rate": 9.77308682670449e-06, + "loss": 1.5776920318603516, + "step": 1848 + }, + { + "epoch": 0.3367616273778101, + "grad_norm": 14.75, + "learning_rate": 9.77257779372113e-06, + "loss": 1.6449886560440063, + "step": 1850 + }, + { + "epoch": 0.3371256940020024, + "grad_norm": 14.0625, + "learning_rate": 9.772068207127265e-06, + "loss": 1.361955165863037, + "step": 1852 + }, + { + "epoch": 0.33748976062619457, + "grad_norm": 10.625, + "learning_rate": 9.771558066997677e-06, + "loss": 1.4854588508605957, + "step": 1854 + }, + { + "epoch": 0.3378538272503868, + "grad_norm": 9.625, + "learning_rate": 9.77104737340723e-06, + "loss": 1.811299204826355, + "step": 1856 + }, + { + "epoch": 0.33821789387457907, + "grad_norm": 8.9375, + "learning_rate": 9.770536126430861e-06, + "loss": 1.18406081199646, + "step": 1858 + }, + { + "epoch": 0.33858196049877126, + "grad_norm": 18.125, + "learning_rate": 9.7700243261436e-06, + "loss": 0.7963888049125671, + "step": 1860 + }, + { + "epoch": 0.3389460271229635, + "grad_norm": 19.125, + "learning_rate": 9.769511972620542e-06, + "loss": 1.499111294746399, + "step": 1862 + }, + { + "epoch": 0.3393100937471557, + "grad_norm": 18.375, + "learning_rate": 9.768999065936883e-06, + "loss": 1.6931930780410767, + "step": 1864 + }, + { + "epoch": 0.33967416037134796, + "grad_norm": 12.0, + "learning_rate": 9.768485606167886e-06, + "loss": 1.2183482646942139, + "step": 1866 + }, + { + "epoch": 0.3400382269955402, + "grad_norm": 44.5, + "learning_rate": 9.767971593388897e-06, + "loss": 1.1583776473999023, + "step": 1868 + }, + { + "epoch": 0.3404022936197324, + "grad_norm": 6.96875, + "learning_rate": 9.767457027675345e-06, + "loss": 1.2160083055496216, + "step": 1870 + }, + { + "epoch": 0.34076636024392465, + "grad_norm": 30.625, + "learning_rate": 9.766941909102746e-06, + "loss": 1.2918589115142822, + "step": 1872 + }, + { + "epoch": 0.34113042686811684, + "grad_norm": 16.125, + "learning_rate": 9.766426237746685e-06, + "loss": 1.6872920989990234, + "step": 1874 + }, + { + "epoch": 0.3414944934923091, + "grad_norm": 26.25, + "learning_rate": 9.765910013682838e-06, + "loss": 2.0386478900909424, + "step": 1876 + }, + { + "epoch": 0.34185856011650134, + "grad_norm": 6.125, + "learning_rate": 9.76539323698696e-06, + "loss": 1.350124478340149, + "step": 1878 + }, + { + "epoch": 0.34222262674069354, + "grad_norm": 74.0, + "learning_rate": 9.764875907734883e-06, + "loss": 1.3060801029205322, + "step": 1880 + }, + { + "epoch": 0.3425866933648858, + "grad_norm": 23.5, + "learning_rate": 9.764358026002523e-06, + "loss": 2.2640185356140137, + "step": 1882 + }, + { + "epoch": 0.342950759989078, + "grad_norm": 30.375, + "learning_rate": 9.763839591865881e-06, + "loss": 1.190342903137207, + "step": 1884 + }, + { + "epoch": 0.34331482661327023, + "grad_norm": 20.375, + "learning_rate": 9.763320605401032e-06, + "loss": 0.9115470051765442, + "step": 1886 + }, + { + "epoch": 0.3436788932374625, + "grad_norm": 10.875, + "learning_rate": 9.762801066684136e-06, + "loss": 1.5014320611953735, + "step": 1888 + }, + { + "epoch": 0.3440429598616547, + "grad_norm": 12.75, + "learning_rate": 9.76228097579143e-06, + "loss": 1.5600974559783936, + "step": 1890 + }, + { + "epoch": 0.3444070264858469, + "grad_norm": 16.625, + "learning_rate": 9.761760332799239e-06, + "loss": 1.4339325428009033, + "step": 1892 + }, + { + "epoch": 0.3447710931100391, + "grad_norm": 11.8125, + "learning_rate": 9.761239137783964e-06, + "loss": 1.575667381286621, + "step": 1894 + }, + { + "epoch": 0.34513515973423137, + "grad_norm": 9.0625, + "learning_rate": 9.76071739082209e-06, + "loss": 1.4520823955535889, + "step": 1896 + }, + { + "epoch": 0.3454992263584236, + "grad_norm": 11.3125, + "learning_rate": 9.760195091990178e-06, + "loss": 0.9596503376960754, + "step": 1898 + }, + { + "epoch": 0.3458632929826158, + "grad_norm": 20.125, + "learning_rate": 9.759672241364877e-06, + "loss": 0.9361091256141663, + "step": 1900 + }, + { + "epoch": 0.34622735960680806, + "grad_norm": 7.71875, + "learning_rate": 9.759148839022912e-06, + "loss": 0.8034407496452332, + "step": 1902 + }, + { + "epoch": 0.34659142623100025, + "grad_norm": 6.625, + "learning_rate": 9.758624885041087e-06, + "loss": 1.4066028594970703, + "step": 1904 + }, + { + "epoch": 0.3469554928551925, + "grad_norm": 12.6875, + "learning_rate": 9.758100379496294e-06, + "loss": 1.470516562461853, + "step": 1906 + }, + { + "epoch": 0.34731955947938475, + "grad_norm": 18.5, + "learning_rate": 9.757575322465498e-06, + "loss": 1.3514392375946045, + "step": 1908 + }, + { + "epoch": 0.34768362610357695, + "grad_norm": 9.3125, + "learning_rate": 9.757049714025755e-06, + "loss": 1.326603889465332, + "step": 1910 + }, + { + "epoch": 0.3480476927277692, + "grad_norm": 48.75, + "learning_rate": 9.75652355425419e-06, + "loss": 1.2181487083435059, + "step": 1912 + }, + { + "epoch": 0.3484117593519614, + "grad_norm": 2.703125, + "learning_rate": 9.755996843228018e-06, + "loss": 0.715548038482666, + "step": 1914 + }, + { + "epoch": 0.34877582597615364, + "grad_norm": 5.4375, + "learning_rate": 9.75546958102453e-06, + "loss": 1.374284029006958, + "step": 1916 + }, + { + "epoch": 0.34913989260034584, + "grad_norm": 5.75, + "learning_rate": 9.754941767721103e-06, + "loss": 1.1102149486541748, + "step": 1918 + }, + { + "epoch": 0.3495039592245381, + "grad_norm": 11.5, + "learning_rate": 9.754413403395187e-06, + "loss": 1.2851014137268066, + "step": 1920 + }, + { + "epoch": 0.34986802584873034, + "grad_norm": 15.0, + "learning_rate": 9.753884488124321e-06, + "loss": 1.2167428731918335, + "step": 1922 + }, + { + "epoch": 0.35023209247292253, + "grad_norm": 4.9375, + "learning_rate": 9.753355021986116e-06, + "loss": 0.9661097526550293, + "step": 1924 + }, + { + "epoch": 0.3505961590971148, + "grad_norm": 11.875, + "learning_rate": 9.752825005058277e-06, + "loss": 1.509225606918335, + "step": 1926 + }, + { + "epoch": 0.350960225721307, + "grad_norm": 7.28125, + "learning_rate": 9.752294437418575e-06, + "loss": 1.3005475997924805, + "step": 1928 + }, + { + "epoch": 0.3513242923454992, + "grad_norm": 8.125, + "learning_rate": 9.751763319144871e-06, + "loss": 1.2685401439666748, + "step": 1930 + }, + { + "epoch": 0.3516883589696915, + "grad_norm": 5.09375, + "learning_rate": 9.751231650315106e-06, + "loss": 1.2690961360931396, + "step": 1932 + }, + { + "epoch": 0.35205242559388367, + "grad_norm": 17.75, + "learning_rate": 9.750699431007296e-06, + "loss": 1.5271693468093872, + "step": 1934 + }, + { + "epoch": 0.3524164922180759, + "grad_norm": 22.125, + "learning_rate": 9.750166661299548e-06, + "loss": 1.6990630626678467, + "step": 1936 + }, + { + "epoch": 0.3527805588422681, + "grad_norm": 21.625, + "learning_rate": 9.749633341270038e-06, + "loss": 1.6855173110961914, + "step": 1938 + }, + { + "epoch": 0.35314462546646036, + "grad_norm": 14.25, + "learning_rate": 9.749099470997033e-06, + "loss": 0.9684444069862366, + "step": 1940 + }, + { + "epoch": 0.3535086920906526, + "grad_norm": 40.5, + "learning_rate": 9.748565050558871e-06, + "loss": 1.6541390419006348, + "step": 1942 + }, + { + "epoch": 0.3538727587148448, + "grad_norm": 25.875, + "learning_rate": 9.748030080033982e-06, + "loss": 2.1384077072143555, + "step": 1944 + }, + { + "epoch": 0.35423682533903705, + "grad_norm": 9.0, + "learning_rate": 9.747494559500869e-06, + "loss": 1.5122464895248413, + "step": 1946 + }, + { + "epoch": 0.35460089196322925, + "grad_norm": 9.375, + "learning_rate": 9.746958489038116e-06, + "loss": 1.494816541671753, + "step": 1948 + }, + { + "epoch": 0.3549649585874215, + "grad_norm": 18.375, + "learning_rate": 9.74642186872439e-06, + "loss": 1.6494117975234985, + "step": 1950 + }, + { + "epoch": 0.35532902521161375, + "grad_norm": 13.25, + "learning_rate": 9.745884698638437e-06, + "loss": 1.7236353158950806, + "step": 1952 + }, + { + "epoch": 0.35569309183580594, + "grad_norm": 10.0, + "learning_rate": 9.745346978859084e-06, + "loss": 1.5397441387176514, + "step": 1954 + }, + { + "epoch": 0.3560571584599982, + "grad_norm": 13.5, + "learning_rate": 9.744808709465243e-06, + "loss": 1.2179691791534424, + "step": 1956 + }, + { + "epoch": 0.3564212250841904, + "grad_norm": 14.3125, + "learning_rate": 9.7442698905359e-06, + "loss": 1.4973336458206177, + "step": 1958 + }, + { + "epoch": 0.35678529170838263, + "grad_norm": 8.0625, + "learning_rate": 9.743730522150123e-06, + "loss": 1.3711413145065308, + "step": 1960 + }, + { + "epoch": 0.3571493583325749, + "grad_norm": 14.875, + "learning_rate": 9.743190604387066e-06, + "loss": 1.5691016912460327, + "step": 1962 + }, + { + "epoch": 0.3575134249567671, + "grad_norm": 16.25, + "learning_rate": 9.742650137325956e-06, + "loss": 1.8776967525482178, + "step": 1964 + }, + { + "epoch": 0.35787749158095933, + "grad_norm": 9.375, + "learning_rate": 9.742109121046106e-06, + "loss": 1.3278359174728394, + "step": 1966 + }, + { + "epoch": 0.3582415582051515, + "grad_norm": 9.4375, + "learning_rate": 9.741567555626908e-06, + "loss": 1.5787303447723389, + "step": 1968 + }, + { + "epoch": 0.35860562482934377, + "grad_norm": 11.3125, + "learning_rate": 9.741025441147836e-06, + "loss": 1.1342029571533203, + "step": 1970 + }, + { + "epoch": 0.358969691453536, + "grad_norm": 10.9375, + "learning_rate": 9.74048277768844e-06, + "loss": 0.539789080619812, + "step": 1972 + }, + { + "epoch": 0.3593337580777282, + "grad_norm": 10.375, + "learning_rate": 9.739939565328356e-06, + "loss": 1.2300916910171509, + "step": 1974 + }, + { + "epoch": 0.35969782470192047, + "grad_norm": 9.375, + "learning_rate": 9.739395804147296e-06, + "loss": 1.6378649473190308, + "step": 1976 + }, + { + "epoch": 0.36006189132611266, + "grad_norm": 12.0, + "learning_rate": 9.738851494225056e-06, + "loss": 1.1425185203552246, + "step": 1978 + }, + { + "epoch": 0.3604259579503049, + "grad_norm": 31.0, + "learning_rate": 9.738306635641514e-06, + "loss": 1.4143235683441162, + "step": 1980 + }, + { + "epoch": 0.36079002457449716, + "grad_norm": 10.5625, + "learning_rate": 9.737761228476621e-06, + "loss": 1.4866759777069092, + "step": 1982 + }, + { + "epoch": 0.36115409119868935, + "grad_norm": 11.125, + "learning_rate": 9.737215272810417e-06, + "loss": 1.732095718383789, + "step": 1984 + }, + { + "epoch": 0.3615181578228816, + "grad_norm": 17.0, + "learning_rate": 9.736668768723017e-06, + "loss": 1.595767617225647, + "step": 1986 + }, + { + "epoch": 0.3618822244470738, + "grad_norm": 9.375, + "learning_rate": 9.736121716294617e-06, + "loss": 1.0042572021484375, + "step": 1988 + }, + { + "epoch": 0.36224629107126605, + "grad_norm": 24.625, + "learning_rate": 9.735574115605499e-06, + "loss": 1.4346060752868652, + "step": 1990 + }, + { + "epoch": 0.3626103576954583, + "grad_norm": 10.75, + "learning_rate": 9.735025966736019e-06, + "loss": 0.6375983953475952, + "step": 1992 + }, + { + "epoch": 0.3629744243196505, + "grad_norm": 18.25, + "learning_rate": 9.73447726976661e-06, + "loss": 1.3740500211715698, + "step": 1994 + }, + { + "epoch": 0.36333849094384274, + "grad_norm": 24.75, + "learning_rate": 9.7339280247778e-06, + "loss": 1.8298540115356445, + "step": 1996 + }, + { + "epoch": 0.36370255756803493, + "grad_norm": 10.9375, + "learning_rate": 9.733378231850186e-06, + "loss": 1.511721134185791, + "step": 1998 + }, + { + "epoch": 0.3640666241922272, + "grad_norm": 11.25, + "learning_rate": 9.732827891064442e-06, + "loss": 1.127073049545288, + "step": 2000 + }, + { + "epoch": 0.3644306908164194, + "grad_norm": 17.75, + "learning_rate": 9.732277002501338e-06, + "loss": 0.941523551940918, + "step": 2002 + }, + { + "epoch": 0.36479475744061163, + "grad_norm": 7.0, + "learning_rate": 9.731725566241705e-06, + "loss": 1.2926925420761108, + "step": 2004 + }, + { + "epoch": 0.3651588240648039, + "grad_norm": 4.4375, + "learning_rate": 9.731173582366472e-06, + "loss": 1.0289406776428223, + "step": 2006 + }, + { + "epoch": 0.36552289068899607, + "grad_norm": 11.0625, + "learning_rate": 9.730621050956635e-06, + "loss": 1.2006124258041382, + "step": 2008 + }, + { + "epoch": 0.3658869573131883, + "grad_norm": 9.3125, + "learning_rate": 9.730067972093277e-06, + "loss": 1.3921442031860352, + "step": 2010 + }, + { + "epoch": 0.3662510239373805, + "grad_norm": 6.125, + "learning_rate": 9.729514345857563e-06, + "loss": 1.3311762809753418, + "step": 2012 + }, + { + "epoch": 0.36661509056157277, + "grad_norm": 8.1875, + "learning_rate": 9.72896017233073e-06, + "loss": 1.4179387092590332, + "step": 2014 + }, + { + "epoch": 0.366979157185765, + "grad_norm": 7.3125, + "learning_rate": 9.728405451594107e-06, + "loss": 1.2038400173187256, + "step": 2016 + }, + { + "epoch": 0.3673432238099572, + "grad_norm": 29.25, + "learning_rate": 9.727850183729094e-06, + "loss": 1.4478435516357422, + "step": 2018 + }, + { + "epoch": 0.36770729043414946, + "grad_norm": 17.875, + "learning_rate": 9.72729436881717e-06, + "loss": 1.4033602476119995, + "step": 2020 + }, + { + "epoch": 0.36807135705834165, + "grad_norm": 15.875, + "learning_rate": 9.726738006939907e-06, + "loss": 1.0183924436569214, + "step": 2022 + }, + { + "epoch": 0.3684354236825339, + "grad_norm": 8.8125, + "learning_rate": 9.726181098178943e-06, + "loss": 1.345322608947754, + "step": 2024 + }, + { + "epoch": 0.36879949030672615, + "grad_norm": 6.28125, + "learning_rate": 9.725623642616004e-06, + "loss": 1.3124163150787354, + "step": 2026 + }, + { + "epoch": 0.36916355693091835, + "grad_norm": 8.9375, + "learning_rate": 9.725065640332893e-06, + "loss": 1.3291378021240234, + "step": 2028 + }, + { + "epoch": 0.3695276235551106, + "grad_norm": 11.375, + "learning_rate": 9.7245070914115e-06, + "loss": 1.4275076389312744, + "step": 2030 + }, + { + "epoch": 0.3698916901793028, + "grad_norm": 18.25, + "learning_rate": 9.723947995933781e-06, + "loss": 1.418353796005249, + "step": 2032 + }, + { + "epoch": 0.37025575680349504, + "grad_norm": 8.5, + "learning_rate": 9.723388353981787e-06, + "loss": 1.358997106552124, + "step": 2034 + }, + { + "epoch": 0.3706198234276873, + "grad_norm": 9.3125, + "learning_rate": 9.72282816563764e-06, + "loss": 1.1635044813156128, + "step": 2036 + }, + { + "epoch": 0.3709838900518795, + "grad_norm": 10.5, + "learning_rate": 9.722267430983547e-06, + "loss": 1.261069655418396, + "step": 2038 + }, + { + "epoch": 0.37134795667607173, + "grad_norm": 15.375, + "learning_rate": 9.721706150101797e-06, + "loss": 1.3923027515411377, + "step": 2040 + }, + { + "epoch": 0.3717120233002639, + "grad_norm": 17.25, + "learning_rate": 9.721144323074749e-06, + "loss": 1.47236967086792, + "step": 2042 + }, + { + "epoch": 0.3720760899244562, + "grad_norm": 19.625, + "learning_rate": 9.720581949984853e-06, + "loss": 1.6056426763534546, + "step": 2044 + }, + { + "epoch": 0.3724401565486484, + "grad_norm": 9.8125, + "learning_rate": 9.720019030914633e-06, + "loss": 1.3068888187408447, + "step": 2046 + }, + { + "epoch": 0.3728042231728406, + "grad_norm": 65.5, + "learning_rate": 9.719455565946698e-06, + "loss": 1.5941617488861084, + "step": 2048 + }, + { + "epoch": 0.37316828979703287, + "grad_norm": 6.71875, + "learning_rate": 9.718891555163732e-06, + "loss": 1.2337017059326172, + "step": 2050 + }, + { + "epoch": 0.37353235642122506, + "grad_norm": 39.75, + "learning_rate": 9.718326998648502e-06, + "loss": 1.029727578163147, + "step": 2052 + }, + { + "epoch": 0.3738964230454173, + "grad_norm": 10.125, + "learning_rate": 9.717761896483853e-06, + "loss": 1.1859915256500244, + "step": 2054 + }, + { + "epoch": 0.37426048966960956, + "grad_norm": 8.0625, + "learning_rate": 9.717196248752712e-06, + "loss": 1.510350227355957, + "step": 2056 + }, + { + "epoch": 0.37462455629380176, + "grad_norm": 6.96875, + "learning_rate": 9.716630055538088e-06, + "loss": 1.4485962390899658, + "step": 2058 + }, + { + "epoch": 0.374988622917994, + "grad_norm": 17.125, + "learning_rate": 9.716063316923063e-06, + "loss": 1.7793612480163574, + "step": 2060 + }, + { + "epoch": 0.3753526895421862, + "grad_norm": 13.4375, + "learning_rate": 9.715496032990808e-06, + "loss": 1.738204002380371, + "step": 2062 + }, + { + "epoch": 0.37571675616637845, + "grad_norm": 9.4375, + "learning_rate": 9.714928203824564e-06, + "loss": 1.8245890140533447, + "step": 2064 + }, + { + "epoch": 0.3760808227905707, + "grad_norm": 11.125, + "learning_rate": 9.714359829507667e-06, + "loss": 1.3317662477493286, + "step": 2066 + }, + { + "epoch": 0.3764448894147629, + "grad_norm": 16.5, + "learning_rate": 9.713790910123515e-06, + "loss": 1.6469428539276123, + "step": 2068 + }, + { + "epoch": 0.37680895603895515, + "grad_norm": 142.0, + "learning_rate": 9.713221445755598e-06, + "loss": 1.8514375686645508, + "step": 2070 + }, + { + "epoch": 0.37717302266314734, + "grad_norm": 12.5, + "learning_rate": 9.712651436487484e-06, + "loss": 1.4966158866882324, + "step": 2072 + }, + { + "epoch": 0.3775370892873396, + "grad_norm": 43.0, + "learning_rate": 9.712080882402814e-06, + "loss": 1.633608341217041, + "step": 2074 + }, + { + "epoch": 0.3779011559115318, + "grad_norm": 14.25, + "learning_rate": 9.71150978358532e-06, + "loss": 0.921191394329071, + "step": 2076 + }, + { + "epoch": 0.37826522253572403, + "grad_norm": 9.4375, + "learning_rate": 9.710938140118807e-06, + "loss": 0.6324692964553833, + "step": 2078 + }, + { + "epoch": 0.3786292891599163, + "grad_norm": 4.59375, + "learning_rate": 9.710365952087163e-06, + "loss": 1.0460968017578125, + "step": 2080 + }, + { + "epoch": 0.3789933557841085, + "grad_norm": 8.4375, + "learning_rate": 9.709793219574347e-06, + "loss": 1.064198613166809, + "step": 2082 + }, + { + "epoch": 0.3793574224083007, + "grad_norm": 13.5, + "learning_rate": 9.709219942664416e-06, + "loss": 1.372009038925171, + "step": 2084 + }, + { + "epoch": 0.3797214890324929, + "grad_norm": 10.4375, + "learning_rate": 9.708646121441487e-06, + "loss": 1.4485632181167603, + "step": 2086 + }, + { + "epoch": 0.38008555565668517, + "grad_norm": 10.0625, + "learning_rate": 9.708071755989772e-06, + "loss": 1.4089813232421875, + "step": 2088 + }, + { + "epoch": 0.3804496222808774, + "grad_norm": 43.25, + "learning_rate": 9.707496846393553e-06, + "loss": 0.8870041370391846, + "step": 2090 + }, + { + "epoch": 0.3808136889050696, + "grad_norm": 11.875, + "learning_rate": 9.706921392737198e-06, + "loss": 1.2658190727233887, + "step": 2092 + }, + { + "epoch": 0.38117775552926186, + "grad_norm": 14.625, + "learning_rate": 9.70634539510515e-06, + "loss": 1.7558362483978271, + "step": 2094 + }, + { + "epoch": 0.38154182215345406, + "grad_norm": 13.5, + "learning_rate": 9.705768853581937e-06, + "loss": 1.5499252080917358, + "step": 2096 + }, + { + "epoch": 0.3819058887776463, + "grad_norm": 8.25, + "learning_rate": 9.705191768252163e-06, + "loss": 1.5761405229568481, + "step": 2098 + }, + { + "epoch": 0.38226995540183856, + "grad_norm": 9.8125, + "learning_rate": 9.704614139200512e-06, + "loss": 1.324540376663208, + "step": 2100 + }, + { + "epoch": 0.38263402202603075, + "grad_norm": 12.25, + "learning_rate": 9.704035966511748e-06, + "loss": 1.4139460325241089, + "step": 2102 + }, + { + "epoch": 0.382998088650223, + "grad_norm": 14.0625, + "learning_rate": 9.703457250270721e-06, + "loss": 1.282622218132019, + "step": 2104 + }, + { + "epoch": 0.3833621552744152, + "grad_norm": 15.5, + "learning_rate": 9.702877990562347e-06, + "loss": 0.9831581115722656, + "step": 2106 + }, + { + "epoch": 0.38372622189860744, + "grad_norm": 8.8125, + "learning_rate": 9.702298187471637e-06, + "loss": 1.4399539232254028, + "step": 2108 + }, + { + "epoch": 0.3840902885227997, + "grad_norm": 12.375, + "learning_rate": 9.701717841083671e-06, + "loss": 1.5087871551513672, + "step": 2110 + }, + { + "epoch": 0.3844543551469919, + "grad_norm": 14.9375, + "learning_rate": 9.701136951483614e-06, + "loss": 1.056596040725708, + "step": 2112 + }, + { + "epoch": 0.38481842177118414, + "grad_norm": 59.5, + "learning_rate": 9.70055551875671e-06, + "loss": 1.0874437093734741, + "step": 2114 + }, + { + "epoch": 0.38518248839537633, + "grad_norm": 11.8125, + "learning_rate": 9.699973542988278e-06, + "loss": 1.6652891635894775, + "step": 2116 + }, + { + "epoch": 0.3855465550195686, + "grad_norm": 12.1875, + "learning_rate": 9.699391024263727e-06, + "loss": 1.5830656290054321, + "step": 2118 + }, + { + "epoch": 0.38591062164376083, + "grad_norm": 16.875, + "learning_rate": 9.698807962668534e-06, + "loss": 1.6655910015106201, + "step": 2120 + }, + { + "epoch": 0.386274688267953, + "grad_norm": 20.75, + "learning_rate": 9.698224358288259e-06, + "loss": 1.8198539018630981, + "step": 2122 + }, + { + "epoch": 0.3866387548921453, + "grad_norm": 8.75, + "learning_rate": 9.69764021120855e-06, + "loss": 1.5100162029266357, + "step": 2124 + }, + { + "epoch": 0.38700282151633747, + "grad_norm": 10.1875, + "learning_rate": 9.697055521515127e-06, + "loss": 1.4647231101989746, + "step": 2126 + }, + { + "epoch": 0.3873668881405297, + "grad_norm": 14.25, + "learning_rate": 9.696470289293785e-06, + "loss": 1.421520471572876, + "step": 2128 + }, + { + "epoch": 0.38773095476472197, + "grad_norm": 4.9375, + "learning_rate": 9.695884514630411e-06, + "loss": 1.1075094938278198, + "step": 2130 + }, + { + "epoch": 0.38809502138891416, + "grad_norm": 8.3125, + "learning_rate": 9.695298197610963e-06, + "loss": 1.5915309190750122, + "step": 2132 + }, + { + "epoch": 0.3884590880131064, + "grad_norm": 8.3125, + "learning_rate": 9.694711338321479e-06, + "loss": 1.5105695724487305, + "step": 2134 + }, + { + "epoch": 0.3888231546372986, + "grad_norm": 15.6875, + "learning_rate": 9.69412393684808e-06, + "loss": 1.4969050884246826, + "step": 2136 + }, + { + "epoch": 0.38918722126149086, + "grad_norm": 14.0, + "learning_rate": 9.693535993276964e-06, + "loss": 1.765354871749878, + "step": 2138 + }, + { + "epoch": 0.3895512878856831, + "grad_norm": 33.0, + "learning_rate": 9.692947507694408e-06, + "loss": 1.1231988668441772, + "step": 2140 + }, + { + "epoch": 0.3899153545098753, + "grad_norm": 12.75, + "learning_rate": 9.692358480186775e-06, + "loss": 1.1892549991607666, + "step": 2142 + }, + { + "epoch": 0.39027942113406755, + "grad_norm": 18.125, + "learning_rate": 9.691768910840495e-06, + "loss": 1.389365792274475, + "step": 2144 + }, + { + "epoch": 0.39064348775825974, + "grad_norm": 9.875, + "learning_rate": 9.691178799742091e-06, + "loss": 1.8792126178741455, + "step": 2146 + }, + { + "epoch": 0.391007554382452, + "grad_norm": 9.1875, + "learning_rate": 9.690588146978157e-06, + "loss": 1.4509637355804443, + "step": 2148 + }, + { + "epoch": 0.39137162100664424, + "grad_norm": 9.1875, + "learning_rate": 9.68999695263537e-06, + "loss": 1.119098424911499, + "step": 2150 + }, + { + "epoch": 0.39173568763083644, + "grad_norm": 7.0, + "learning_rate": 9.689405216800483e-06, + "loss": 1.4772535562515259, + "step": 2152 + }, + { + "epoch": 0.3920997542550287, + "grad_norm": 10.4375, + "learning_rate": 9.688812939560332e-06, + "loss": 1.401158332824707, + "step": 2154 + }, + { + "epoch": 0.3924638208792209, + "grad_norm": 15.875, + "learning_rate": 9.688220121001832e-06, + "loss": 1.2952908277511597, + "step": 2156 + }, + { + "epoch": 0.39282788750341313, + "grad_norm": 9.5625, + "learning_rate": 9.687626761211979e-06, + "loss": 1.1576765775680542, + "step": 2158 + }, + { + "epoch": 0.3931919541276053, + "grad_norm": 18.625, + "learning_rate": 9.68703286027784e-06, + "loss": 1.9631154537200928, + "step": 2160 + }, + { + "epoch": 0.3935560207517976, + "grad_norm": 8.0, + "learning_rate": 9.686438418286572e-06, + "loss": 1.681584358215332, + "step": 2162 + }, + { + "epoch": 0.3939200873759898, + "grad_norm": 22.75, + "learning_rate": 9.685843435325406e-06, + "loss": 1.2069242000579834, + "step": 2164 + }, + { + "epoch": 0.394284154000182, + "grad_norm": 36.5, + "learning_rate": 9.685247911481652e-06, + "loss": 1.852647304534912, + "step": 2166 + }, + { + "epoch": 0.39464822062437427, + "grad_norm": 5.3125, + "learning_rate": 9.684651846842705e-06, + "loss": 1.0551722049713135, + "step": 2168 + }, + { + "epoch": 0.39501228724856646, + "grad_norm": 13.9375, + "learning_rate": 9.684055241496028e-06, + "loss": 1.378616452217102, + "step": 2170 + }, + { + "epoch": 0.3953763538727587, + "grad_norm": 10.3125, + "learning_rate": 9.683458095529179e-06, + "loss": 1.5073012113571167, + "step": 2172 + }, + { + "epoch": 0.39574042049695096, + "grad_norm": 14.25, + "learning_rate": 9.68286040902978e-06, + "loss": 0.7733011245727539, + "step": 2174 + }, + { + "epoch": 0.39610448712114316, + "grad_norm": 7.03125, + "learning_rate": 9.682262182085541e-06, + "loss": 0.7533259391784668, + "step": 2176 + }, + { + "epoch": 0.3964685537453354, + "grad_norm": 10.625, + "learning_rate": 9.68166341478425e-06, + "loss": 1.4501830339431763, + "step": 2178 + }, + { + "epoch": 0.3968326203695276, + "grad_norm": 19.0, + "learning_rate": 9.681064107213774e-06, + "loss": 1.541663408279419, + "step": 2180 + }, + { + "epoch": 0.39719668699371985, + "grad_norm": 11.625, + "learning_rate": 9.680464259462056e-06, + "loss": 1.8518545627593994, + "step": 2182 + }, + { + "epoch": 0.3975607536179121, + "grad_norm": 18.5, + "learning_rate": 9.679863871617126e-06, + "loss": 1.6773710250854492, + "step": 2184 + }, + { + "epoch": 0.3979248202421043, + "grad_norm": 23.875, + "learning_rate": 9.679262943767085e-06, + "loss": 0.9477849006652832, + "step": 2186 + }, + { + "epoch": 0.39828888686629654, + "grad_norm": 10.5, + "learning_rate": 9.67866147600012e-06, + "loss": 1.4844529628753662, + "step": 2188 + }, + { + "epoch": 0.39865295349048874, + "grad_norm": 8.4375, + "learning_rate": 9.678059468404488e-06, + "loss": 1.2404515743255615, + "step": 2190 + }, + { + "epoch": 0.399017020114681, + "grad_norm": 3.796875, + "learning_rate": 9.677456921068538e-06, + "loss": 1.119039535522461, + "step": 2192 + }, + { + "epoch": 0.39938108673887324, + "grad_norm": 29.375, + "learning_rate": 9.676853834080685e-06, + "loss": 1.2887297868728638, + "step": 2194 + }, + { + "epoch": 0.39974515336306543, + "grad_norm": 23.125, + "learning_rate": 9.676250207529434e-06, + "loss": 1.5945512056350708, + "step": 2196 + }, + { + "epoch": 0.4001092199872577, + "grad_norm": 14.0625, + "learning_rate": 9.675646041503366e-06, + "loss": 0.6604722738265991, + "step": 2198 + }, + { + "epoch": 0.4004732866114499, + "grad_norm": 10.25, + "learning_rate": 9.675041336091135e-06, + "loss": 1.4796134233474731, + "step": 2200 + }, + { + "epoch": 0.4008373532356421, + "grad_norm": 10.5625, + "learning_rate": 9.674436091381482e-06, + "loss": 1.4008347988128662, + "step": 2202 + }, + { + "epoch": 0.4012014198598344, + "grad_norm": 6.9375, + "learning_rate": 9.673830307463225e-06, + "loss": 1.2272157669067383, + "step": 2204 + }, + { + "epoch": 0.40156548648402657, + "grad_norm": 4.96875, + "learning_rate": 9.673223984425258e-06, + "loss": 1.2955687046051025, + "step": 2206 + }, + { + "epoch": 0.4019295531082188, + "grad_norm": 17.125, + "learning_rate": 9.672617122356558e-06, + "loss": 1.1196295022964478, + "step": 2208 + }, + { + "epoch": 0.402293619732411, + "grad_norm": 20.125, + "learning_rate": 9.672009721346178e-06, + "loss": 1.329920768737793, + "step": 2210 + }, + { + "epoch": 0.40265768635660326, + "grad_norm": 8.125, + "learning_rate": 9.671401781483254e-06, + "loss": 1.5276387929916382, + "step": 2212 + }, + { + "epoch": 0.4030217529807955, + "grad_norm": 21.625, + "learning_rate": 9.670793302856998e-06, + "loss": 1.7229703664779663, + "step": 2214 + }, + { + "epoch": 0.4033858196049877, + "grad_norm": 16.5, + "learning_rate": 9.670184285556698e-06, + "loss": 1.8548630475997925, + "step": 2216 + }, + { + "epoch": 0.40374988622917996, + "grad_norm": 27.625, + "learning_rate": 9.669574729671732e-06, + "loss": 1.9700976610183716, + "step": 2218 + }, + { + "epoch": 0.40411395285337215, + "grad_norm": 9.5625, + "learning_rate": 9.668964635291544e-06, + "loss": 1.4679195880889893, + "step": 2220 + }, + { + "epoch": 0.4044780194775644, + "grad_norm": 15.375, + "learning_rate": 9.668354002505664e-06, + "loss": 1.486782193183899, + "step": 2222 + }, + { + "epoch": 0.40484208610175665, + "grad_norm": 20.75, + "learning_rate": 9.667742831403704e-06, + "loss": 1.2617809772491455, + "step": 2224 + }, + { + "epoch": 0.40520615272594884, + "grad_norm": 9.6875, + "learning_rate": 9.667131122075345e-06, + "loss": 1.2526487112045288, + "step": 2226 + }, + { + "epoch": 0.4055702193501411, + "grad_norm": 8.75, + "learning_rate": 9.666518874610355e-06, + "loss": 1.2761998176574707, + "step": 2228 + }, + { + "epoch": 0.4059342859743333, + "grad_norm": 14.75, + "learning_rate": 9.66590608909858e-06, + "loss": 1.1356678009033203, + "step": 2230 + }, + { + "epoch": 0.40629835259852554, + "grad_norm": 7.5, + "learning_rate": 9.665292765629944e-06, + "loss": 1.4666557312011719, + "step": 2232 + }, + { + "epoch": 0.40666241922271773, + "grad_norm": 18.0, + "learning_rate": 9.664678904294447e-06, + "loss": 1.1282765865325928, + "step": 2234 + }, + { + "epoch": 0.40702648584691, + "grad_norm": 13.3125, + "learning_rate": 9.664064505182174e-06, + "loss": 1.6259841918945312, + "step": 2236 + }, + { + "epoch": 0.40739055247110223, + "grad_norm": 7.4375, + "learning_rate": 9.663449568383282e-06, + "loss": 1.4740502834320068, + "step": 2238 + }, + { + "epoch": 0.4077546190952944, + "grad_norm": 10.8125, + "learning_rate": 9.662834093988014e-06, + "loss": 1.4135972261428833, + "step": 2240 + }, + { + "epoch": 0.4081186857194867, + "grad_norm": 9.75, + "learning_rate": 9.662218082086688e-06, + "loss": 1.4066673517227173, + "step": 2242 + }, + { + "epoch": 0.40848275234367887, + "grad_norm": 12.5625, + "learning_rate": 9.661601532769697e-06, + "loss": 1.3687559366226196, + "step": 2244 + }, + { + "epoch": 0.4088468189678711, + "grad_norm": 14.875, + "learning_rate": 9.66098444612752e-06, + "loss": 1.6247143745422363, + "step": 2246 + }, + { + "epoch": 0.40921088559206337, + "grad_norm": 7.46875, + "learning_rate": 9.660366822250717e-06, + "loss": 1.6085401773452759, + "step": 2248 + }, + { + "epoch": 0.40957495221625556, + "grad_norm": 8.875, + "learning_rate": 9.659748661229912e-06, + "loss": 1.3979735374450684, + "step": 2250 + }, + { + "epoch": 0.4099390188404478, + "grad_norm": 36.0, + "learning_rate": 9.659129963155826e-06, + "loss": 1.095952033996582, + "step": 2252 + }, + { + "epoch": 0.41030308546464, + "grad_norm": 11.5, + "learning_rate": 9.658510728119245e-06, + "loss": 1.641335368156433, + "step": 2254 + }, + { + "epoch": 0.41066715208883225, + "grad_norm": 16.125, + "learning_rate": 9.657890956211043e-06, + "loss": 1.5438529253005981, + "step": 2256 + }, + { + "epoch": 0.4110312187130245, + "grad_norm": 10.0625, + "learning_rate": 9.657270647522166e-06, + "loss": 1.3864002227783203, + "step": 2258 + }, + { + "epoch": 0.4113952853372167, + "grad_norm": 12.5625, + "learning_rate": 9.656649802143646e-06, + "loss": 1.7747098207473755, + "step": 2260 + }, + { + "epoch": 0.41175935196140895, + "grad_norm": 7.875, + "learning_rate": 9.656028420166584e-06, + "loss": 1.4297223091125488, + "step": 2262 + }, + { + "epoch": 0.41212341858560114, + "grad_norm": 13.9375, + "learning_rate": 9.655406501682167e-06, + "loss": 1.1971871852874756, + "step": 2264 + }, + { + "epoch": 0.4124874852097934, + "grad_norm": 130.0, + "learning_rate": 9.654784046781661e-06, + "loss": 1.7497791051864624, + "step": 2266 + }, + { + "epoch": 0.41285155183398564, + "grad_norm": 6.8125, + "learning_rate": 9.654161055556408e-06, + "loss": 1.375710129737854, + "step": 2268 + }, + { + "epoch": 0.41321561845817784, + "grad_norm": 9.1875, + "learning_rate": 9.653537528097827e-06, + "loss": 1.2497614622116089, + "step": 2270 + }, + { + "epoch": 0.4135796850823701, + "grad_norm": 12.4375, + "learning_rate": 9.652913464497423e-06, + "loss": 1.4670453071594238, + "step": 2272 + }, + { + "epoch": 0.4139437517065623, + "grad_norm": 19.125, + "learning_rate": 9.652288864846773e-06, + "loss": 1.6561918258666992, + "step": 2274 + }, + { + "epoch": 0.41430781833075453, + "grad_norm": 20.0, + "learning_rate": 9.65166372923753e-06, + "loss": 1.6369695663452148, + "step": 2276 + }, + { + "epoch": 0.4146718849549468, + "grad_norm": 12.3125, + "learning_rate": 9.651038057761435e-06, + "loss": 1.2019511461257935, + "step": 2278 + }, + { + "epoch": 0.415035951579139, + "grad_norm": 11.4375, + "learning_rate": 9.650411850510302e-06, + "loss": 1.3143885135650635, + "step": 2280 + }, + { + "epoch": 0.4154000182033312, + "grad_norm": 13.3125, + "learning_rate": 9.649785107576025e-06, + "loss": 1.4684603214263916, + "step": 2282 + }, + { + "epoch": 0.4157640848275234, + "grad_norm": 9.875, + "learning_rate": 9.649157829050573e-06, + "loss": 1.6451226472854614, + "step": 2284 + }, + { + "epoch": 0.41612815145171567, + "grad_norm": 6.3125, + "learning_rate": 9.648530015025998e-06, + "loss": 1.2578718662261963, + "step": 2286 + }, + { + "epoch": 0.4164922180759079, + "grad_norm": 11.125, + "learning_rate": 9.64790166559443e-06, + "loss": 1.4395158290863037, + "step": 2288 + }, + { + "epoch": 0.4168562847001001, + "grad_norm": 6.9375, + "learning_rate": 9.647272780848076e-06, + "loss": 1.3975218534469604, + "step": 2290 + }, + { + "epoch": 0.41722035132429236, + "grad_norm": 8.4375, + "learning_rate": 9.646643360879222e-06, + "loss": 1.0757924318313599, + "step": 2292 + }, + { + "epoch": 0.41758441794848455, + "grad_norm": 11.3125, + "learning_rate": 9.646013405780235e-06, + "loss": 1.3886827230453491, + "step": 2294 + }, + { + "epoch": 0.4179484845726768, + "grad_norm": 3.734375, + "learning_rate": 9.645382915643554e-06, + "loss": 1.2899380922317505, + "step": 2296 + }, + { + "epoch": 0.41831255119686905, + "grad_norm": 48.5, + "learning_rate": 9.644751890561708e-06, + "loss": 1.2573360204696655, + "step": 2298 + }, + { + "epoch": 0.41867661782106125, + "grad_norm": 8.5625, + "learning_rate": 9.64412033062729e-06, + "loss": 1.3277581930160522, + "step": 2300 + }, + { + "epoch": 0.4190406844452535, + "grad_norm": 71.5, + "learning_rate": 9.643488235932981e-06, + "loss": 1.5124677419662476, + "step": 2302 + }, + { + "epoch": 0.4194047510694457, + "grad_norm": 13.5625, + "learning_rate": 9.642855606571541e-06, + "loss": 1.6766806840896606, + "step": 2304 + }, + { + "epoch": 0.41976881769363794, + "grad_norm": 10.0625, + "learning_rate": 9.642222442635802e-06, + "loss": 1.4139695167541504, + "step": 2306 + }, + { + "epoch": 0.42013288431783014, + "grad_norm": 86.0, + "learning_rate": 9.641588744218684e-06, + "loss": 1.4274101257324219, + "step": 2308 + }, + { + "epoch": 0.4204969509420224, + "grad_norm": 8.5, + "learning_rate": 9.640954511413171e-06, + "loss": 1.2447904348373413, + "step": 2310 + }, + { + "epoch": 0.42086101756621463, + "grad_norm": 8.5625, + "learning_rate": 9.640319744312344e-06, + "loss": 0.9159026145935059, + "step": 2312 + }, + { + "epoch": 0.42122508419040683, + "grad_norm": 5.78125, + "learning_rate": 9.639684443009343e-06, + "loss": 1.0781114101409912, + "step": 2314 + }, + { + "epoch": 0.4215891508145991, + "grad_norm": 43.5, + "learning_rate": 9.6390486075974e-06, + "loss": 1.3290879726409912, + "step": 2316 + }, + { + "epoch": 0.4219532174387913, + "grad_norm": 13.5625, + "learning_rate": 9.638412238169825e-06, + "loss": 0.6265374422073364, + "step": 2318 + }, + { + "epoch": 0.4223172840629835, + "grad_norm": 10.0625, + "learning_rate": 9.637775334819999e-06, + "loss": 1.3973679542541504, + "step": 2320 + }, + { + "epoch": 0.42268135068717577, + "grad_norm": 4.46875, + "learning_rate": 9.637137897641385e-06, + "loss": 1.2640068531036377, + "step": 2322 + }, + { + "epoch": 0.42304541731136797, + "grad_norm": 4.15625, + "learning_rate": 9.63649992672752e-06, + "loss": 0.9691958427429199, + "step": 2324 + }, + { + "epoch": 0.4234094839355602, + "grad_norm": 19.375, + "learning_rate": 9.635861422172034e-06, + "loss": 1.5447862148284912, + "step": 2326 + }, + { + "epoch": 0.4237735505597524, + "grad_norm": 7.6875, + "learning_rate": 9.635222384068617e-06, + "loss": 1.8208239078521729, + "step": 2328 + }, + { + "epoch": 0.42413761718394466, + "grad_norm": 3.984375, + "learning_rate": 9.634582812511049e-06, + "loss": 1.0587718486785889, + "step": 2330 + }, + { + "epoch": 0.4245016838081369, + "grad_norm": 9.625, + "learning_rate": 9.63394270759318e-06, + "loss": 1.0779318809509277, + "step": 2332 + }, + { + "epoch": 0.4248657504323291, + "grad_norm": 9.5625, + "learning_rate": 9.633302069408948e-06, + "loss": 1.7597366571426392, + "step": 2334 + }, + { + "epoch": 0.42522981705652135, + "grad_norm": 10.625, + "learning_rate": 9.63266089805236e-06, + "loss": 1.526566982269287, + "step": 2336 + }, + { + "epoch": 0.42559388368071355, + "grad_norm": 12.375, + "learning_rate": 9.632019193617507e-06, + "loss": 1.5721865892410278, + "step": 2338 + }, + { + "epoch": 0.4259579503049058, + "grad_norm": 74.0, + "learning_rate": 9.631376956198559e-06, + "loss": 0.6878648400306702, + "step": 2340 + }, + { + "epoch": 0.42632201692909805, + "grad_norm": 53.0, + "learning_rate": 9.630734185889756e-06, + "loss": 0.45763444900512695, + "step": 2342 + }, + { + "epoch": 0.42668608355329024, + "grad_norm": 15.8125, + "learning_rate": 9.630090882785431e-06, + "loss": 0.8998202085494995, + "step": 2344 + }, + { + "epoch": 0.4270501501774825, + "grad_norm": 8.5, + "learning_rate": 9.629447046979974e-06, + "loss": 1.385998010635376, + "step": 2346 + }, + { + "epoch": 0.4274142168016747, + "grad_norm": 9.9375, + "learning_rate": 9.628802678567874e-06, + "loss": 1.6371874809265137, + "step": 2348 + }, + { + "epoch": 0.42777828342586693, + "grad_norm": 9.5, + "learning_rate": 9.628157777643687e-06, + "loss": 1.8729541301727295, + "step": 2350 + }, + { + "epoch": 0.4281423500500592, + "grad_norm": 25.625, + "learning_rate": 9.627512344302052e-06, + "loss": 1.4513651132583618, + "step": 2352 + }, + { + "epoch": 0.4285064166742514, + "grad_norm": 78.5, + "learning_rate": 9.626866378637681e-06, + "loss": 1.4546449184417725, + "step": 2354 + }, + { + "epoch": 0.4288704832984436, + "grad_norm": 6.6875, + "learning_rate": 9.626219880745365e-06, + "loss": 1.1034903526306152, + "step": 2356 + }, + { + "epoch": 0.4292345499226358, + "grad_norm": 36.5, + "learning_rate": 9.625572850719978e-06, + "loss": 0.9412211179733276, + "step": 2358 + }, + { + "epoch": 0.42959861654682807, + "grad_norm": 8.4375, + "learning_rate": 9.624925288656469e-06, + "loss": 1.3844599723815918, + "step": 2360 + }, + { + "epoch": 0.4299626831710203, + "grad_norm": 8.875, + "learning_rate": 9.624277194649864e-06, + "loss": 1.4201388359069824, + "step": 2362 + }, + { + "epoch": 0.4303267497952125, + "grad_norm": 5.09375, + "learning_rate": 9.623628568795269e-06, + "loss": 1.3087117671966553, + "step": 2364 + }, + { + "epoch": 0.43069081641940477, + "grad_norm": 9.5625, + "learning_rate": 9.622979411187867e-06, + "loss": 1.365851879119873, + "step": 2366 + }, + { + "epoch": 0.43105488304359696, + "grad_norm": 8.25, + "learning_rate": 9.622329721922919e-06, + "loss": 1.383945107460022, + "step": 2368 + }, + { + "epoch": 0.4314189496677892, + "grad_norm": 5.3125, + "learning_rate": 9.621679501095764e-06, + "loss": 1.314424753189087, + "step": 2370 + }, + { + "epoch": 0.43178301629198146, + "grad_norm": 11.0, + "learning_rate": 9.62102874880182e-06, + "loss": 1.4450368881225586, + "step": 2372 + }, + { + "epoch": 0.43214708291617365, + "grad_norm": 30.25, + "learning_rate": 9.620377465136582e-06, + "loss": 1.0981154441833496, + "step": 2374 + }, + { + "epoch": 0.4325111495403659, + "grad_norm": 14.75, + "learning_rate": 9.619725650195621e-06, + "loss": 0.416414350271225, + "step": 2376 + }, + { + "epoch": 0.4328752161645581, + "grad_norm": 6.90625, + "learning_rate": 9.619073304074591e-06, + "loss": 1.3250190019607544, + "step": 2378 + }, + { + "epoch": 0.43323928278875035, + "grad_norm": 11.0625, + "learning_rate": 9.618420426869222e-06, + "loss": 1.3769481182098389, + "step": 2380 + }, + { + "epoch": 0.4336033494129426, + "grad_norm": 54.25, + "learning_rate": 9.617767018675319e-06, + "loss": 1.4135537147521973, + "step": 2382 + }, + { + "epoch": 0.4339674160371348, + "grad_norm": 13.75, + "learning_rate": 9.617113079588766e-06, + "loss": 1.5625133514404297, + "step": 2384 + }, + { + "epoch": 0.43433148266132704, + "grad_norm": 9.6875, + "learning_rate": 9.61645860970553e-06, + "loss": 1.414522647857666, + "step": 2386 + }, + { + "epoch": 0.43469554928551923, + "grad_norm": 47.75, + "learning_rate": 9.615803609121649e-06, + "loss": 1.3832049369812012, + "step": 2388 + }, + { + "epoch": 0.4350596159097115, + "grad_norm": 34.25, + "learning_rate": 9.61514807793324e-06, + "loss": 1.6867897510528564, + "step": 2390 + }, + { + "epoch": 0.4354236825339037, + "grad_norm": 18.0, + "learning_rate": 9.614492016236502e-06, + "loss": 2.0114636421203613, + "step": 2392 + }, + { + "epoch": 0.4357877491580959, + "grad_norm": 9.3125, + "learning_rate": 9.613835424127711e-06, + "loss": 1.6052539348602295, + "step": 2394 + }, + { + "epoch": 0.4361518157822882, + "grad_norm": 12.3125, + "learning_rate": 9.613178301703215e-06, + "loss": 1.3947083950042725, + "step": 2396 + }, + { + "epoch": 0.43651588240648037, + "grad_norm": 11.9375, + "learning_rate": 9.612520649059444e-06, + "loss": 1.4418244361877441, + "step": 2398 + }, + { + "epoch": 0.4368799490306726, + "grad_norm": 18.5, + "learning_rate": 9.611862466292914e-06, + "loss": 1.5792224407196045, + "step": 2400 + }, + { + "epoch": 0.4372440156548648, + "grad_norm": 11.875, + "learning_rate": 9.6112037535002e-06, + "loss": 1.3662711381912231, + "step": 2402 + }, + { + "epoch": 0.43760808227905706, + "grad_norm": 26.5, + "learning_rate": 9.61054451077797e-06, + "loss": 1.4911441802978516, + "step": 2404 + }, + { + "epoch": 0.4379721489032493, + "grad_norm": 19.5, + "learning_rate": 9.609884738222968e-06, + "loss": 1.6761562824249268, + "step": 2406 + }, + { + "epoch": 0.4383362155274415, + "grad_norm": 11.75, + "learning_rate": 9.60922443593201e-06, + "loss": 1.0963339805603027, + "step": 2408 + }, + { + "epoch": 0.43870028215163376, + "grad_norm": 79.0, + "learning_rate": 9.60856360400199e-06, + "loss": 1.265076994895935, + "step": 2410 + }, + { + "epoch": 0.43906434877582595, + "grad_norm": 12.5, + "learning_rate": 9.607902242529887e-06, + "loss": 1.6833877563476562, + "step": 2412 + }, + { + "epoch": 0.4394284154000182, + "grad_norm": 16.75, + "learning_rate": 9.607240351612754e-06, + "loss": 1.2937580347061157, + "step": 2414 + }, + { + "epoch": 0.43979248202421045, + "grad_norm": 26.375, + "learning_rate": 9.606577931347714e-06, + "loss": 1.891810417175293, + "step": 2416 + }, + { + "epoch": 0.44015654864840265, + "grad_norm": 16.25, + "learning_rate": 9.60591498183198e-06, + "loss": 1.2742931842803955, + "step": 2418 + }, + { + "epoch": 0.4405206152725949, + "grad_norm": 66.5, + "learning_rate": 9.605251503162838e-06, + "loss": 0.997144341468811, + "step": 2420 + }, + { + "epoch": 0.4408846818967871, + "grad_norm": 22.25, + "learning_rate": 9.604587495437647e-06, + "loss": 2.052816390991211, + "step": 2422 + }, + { + "epoch": 0.44124874852097934, + "grad_norm": 21.25, + "learning_rate": 9.60392295875385e-06, + "loss": 1.2586523294448853, + "step": 2424 + }, + { + "epoch": 0.4416128151451716, + "grad_norm": 18.0, + "learning_rate": 9.603257893208964e-06, + "loss": 1.330334186553955, + "step": 2426 + }, + { + "epoch": 0.4419768817693638, + "grad_norm": 10.3125, + "learning_rate": 9.602592298900587e-06, + "loss": 1.0452346801757812, + "step": 2428 + }, + { + "epoch": 0.44234094839355603, + "grad_norm": 9.3125, + "learning_rate": 9.601926175926386e-06, + "loss": 1.7862670421600342, + "step": 2430 + }, + { + "epoch": 0.4427050150177482, + "grad_norm": 11.3125, + "learning_rate": 9.601259524384117e-06, + "loss": 1.168826937675476, + "step": 2432 + }, + { + "epoch": 0.4430690816419405, + "grad_norm": 6.65625, + "learning_rate": 9.600592344371608e-06, + "loss": 1.142594575881958, + "step": 2434 + }, + { + "epoch": 0.4434331482661327, + "grad_norm": 9.0625, + "learning_rate": 9.599924635986764e-06, + "loss": 1.5105631351470947, + "step": 2436 + }, + { + "epoch": 0.4437972148903249, + "grad_norm": 15.9375, + "learning_rate": 9.59925639932757e-06, + "loss": 1.7031049728393555, + "step": 2438 + }, + { + "epoch": 0.44416128151451717, + "grad_norm": 13.5, + "learning_rate": 9.598587634492087e-06, + "loss": 1.6640347242355347, + "step": 2440 + }, + { + "epoch": 0.44452534813870936, + "grad_norm": 3.875, + "learning_rate": 9.59791834157845e-06, + "loss": 1.2849209308624268, + "step": 2442 + }, + { + "epoch": 0.4448894147629016, + "grad_norm": 5.21875, + "learning_rate": 9.597248520684878e-06, + "loss": 1.2987831830978394, + "step": 2444 + }, + { + "epoch": 0.44525348138709386, + "grad_norm": 7.78125, + "learning_rate": 9.596578171909665e-06, + "loss": 1.2319151163101196, + "step": 2446 + }, + { + "epoch": 0.44561754801128606, + "grad_norm": 6.0625, + "learning_rate": 9.59590729535118e-06, + "loss": 1.290708303451538, + "step": 2448 + }, + { + "epoch": 0.4459816146354783, + "grad_norm": 9.3125, + "learning_rate": 9.595235891107873e-06, + "loss": 1.2799159288406372, + "step": 2450 + }, + { + "epoch": 0.4463456812596705, + "grad_norm": 11.625, + "learning_rate": 9.594563959278267e-06, + "loss": 2.1087443828582764, + "step": 2452 + }, + { + "epoch": 0.44670974788386275, + "grad_norm": 7.96875, + "learning_rate": 9.59389149996097e-06, + "loss": 1.3041532039642334, + "step": 2454 + }, + { + "epoch": 0.447073814508055, + "grad_norm": 8.5, + "learning_rate": 9.59321851325466e-06, + "loss": 1.4496657848358154, + "step": 2456 + }, + { + "epoch": 0.4474378811322472, + "grad_norm": 11.75, + "learning_rate": 9.592544999258096e-06, + "loss": 1.35652756690979, + "step": 2458 + }, + { + "epoch": 0.44780194775643944, + "grad_norm": 7.125, + "learning_rate": 9.59187095807011e-06, + "loss": 1.4326095581054688, + "step": 2460 + }, + { + "epoch": 0.44816601438063164, + "grad_norm": 9.0, + "learning_rate": 9.591196389789619e-06, + "loss": 1.1687068939208984, + "step": 2462 + }, + { + "epoch": 0.4485300810048239, + "grad_norm": 21.375, + "learning_rate": 9.59052129451561e-06, + "loss": 1.209844946861267, + "step": 2464 + }, + { + "epoch": 0.4488941476290161, + "grad_norm": 20.0, + "learning_rate": 9.589845672347153e-06, + "loss": 1.7998831272125244, + "step": 2466 + }, + { + "epoch": 0.44925821425320833, + "grad_norm": 6.4375, + "learning_rate": 9.589169523383393e-06, + "loss": 1.1922943592071533, + "step": 2468 + }, + { + "epoch": 0.4496222808774006, + "grad_norm": 6.9375, + "learning_rate": 9.588492847723551e-06, + "loss": 1.3234403133392334, + "step": 2470 + }, + { + "epoch": 0.4499863475015928, + "grad_norm": 6.84375, + "learning_rate": 9.587815645466927e-06, + "loss": 1.366612434387207, + "step": 2472 + }, + { + "epoch": 0.450350414125785, + "grad_norm": 13.3125, + "learning_rate": 9.587137916712896e-06, + "loss": 1.3607661724090576, + "step": 2474 + }, + { + "epoch": 0.4507144807499772, + "grad_norm": 22.875, + "learning_rate": 9.586459661560913e-06, + "loss": 1.454066514968872, + "step": 2476 + }, + { + "epoch": 0.45107854737416947, + "grad_norm": 34.0, + "learning_rate": 9.58578088011051e-06, + "loss": 1.3590030670166016, + "step": 2478 + }, + { + "epoch": 0.4514426139983617, + "grad_norm": 11.8125, + "learning_rate": 9.585101572461293e-06, + "loss": 1.268595576286316, + "step": 2480 + }, + { + "epoch": 0.4518066806225539, + "grad_norm": 20.25, + "learning_rate": 9.584421738712953e-06, + "loss": 1.4530681371688843, + "step": 2482 + }, + { + "epoch": 0.45217074724674616, + "grad_norm": 9.25, + "learning_rate": 9.583741378965246e-06, + "loss": 1.4298181533813477, + "step": 2484 + }, + { + "epoch": 0.45253481387093836, + "grad_norm": 7.03125, + "learning_rate": 9.583060493318015e-06, + "loss": 1.1290262937545776, + "step": 2486 + }, + { + "epoch": 0.4528988804951306, + "grad_norm": 8.8125, + "learning_rate": 9.582379081871178e-06, + "loss": 1.4337834119796753, + "step": 2488 + }, + { + "epoch": 0.45326294711932286, + "grad_norm": 11.625, + "learning_rate": 9.58169714472473e-06, + "loss": 1.0604631900787354, + "step": 2490 + }, + { + "epoch": 0.45362701374351505, + "grad_norm": 10.9375, + "learning_rate": 9.581014681978742e-06, + "loss": 1.4299339056015015, + "step": 2492 + }, + { + "epoch": 0.4539910803677073, + "grad_norm": 9.5, + "learning_rate": 9.58033169373336e-06, + "loss": 1.0158357620239258, + "step": 2494 + }, + { + "epoch": 0.4543551469918995, + "grad_norm": 15.9375, + "learning_rate": 9.579648180088814e-06, + "loss": 0.505800724029541, + "step": 2496 + }, + { + "epoch": 0.45471921361609174, + "grad_norm": 11.25, + "learning_rate": 9.578964141145404e-06, + "loss": 1.334416389465332, + "step": 2498 + }, + { + "epoch": 0.455083280240284, + "grad_norm": 15.75, + "learning_rate": 9.57827957700351e-06, + "loss": 1.4583282470703125, + "step": 2500 + }, + { + "epoch": 0.4554473468644762, + "grad_norm": 96.0, + "learning_rate": 9.577594487763589e-06, + "loss": 1.6680463552474976, + "step": 2502 + }, + { + "epoch": 0.45581141348866844, + "grad_norm": 32.0, + "learning_rate": 9.576908873526176e-06, + "loss": 1.1395843029022217, + "step": 2504 + }, + { + "epoch": 0.45617548011286063, + "grad_norm": 9.9375, + "learning_rate": 9.576222734391882e-06, + "loss": 1.6111465692520142, + "step": 2506 + }, + { + "epoch": 0.4565395467370529, + "grad_norm": 12.9375, + "learning_rate": 9.575536070461393e-06, + "loss": 2.0324113368988037, + "step": 2508 + }, + { + "epoch": 0.45690361336124513, + "grad_norm": 22.875, + "learning_rate": 9.57484888183548e-06, + "loss": 1.335809350013733, + "step": 2510 + }, + { + "epoch": 0.4572676799854373, + "grad_norm": 22.5, + "learning_rate": 9.57416116861498e-06, + "loss": 1.6101138591766357, + "step": 2512 + }, + { + "epoch": 0.4576317466096296, + "grad_norm": 15.4375, + "learning_rate": 9.57347293090081e-06, + "loss": 1.3389840126037598, + "step": 2514 + }, + { + "epoch": 0.45799581323382177, + "grad_norm": 14.0625, + "learning_rate": 9.572784168793972e-06, + "loss": 1.2342911958694458, + "step": 2516 + }, + { + "epoch": 0.458359879858014, + "grad_norm": 4.5, + "learning_rate": 9.572094882395537e-06, + "loss": 1.1682186126708984, + "step": 2518 + }, + { + "epoch": 0.45872394648220627, + "grad_norm": 7.09375, + "learning_rate": 9.571405071806652e-06, + "loss": 1.5737831592559814, + "step": 2520 + }, + { + "epoch": 0.45908801310639846, + "grad_norm": 14.875, + "learning_rate": 9.57071473712855e-06, + "loss": 1.350905179977417, + "step": 2522 + }, + { + "epoch": 0.4594520797305907, + "grad_norm": 6.875, + "learning_rate": 9.57002387846253e-06, + "loss": 1.218022346496582, + "step": 2524 + }, + { + "epoch": 0.4598161463547829, + "grad_norm": 13.0, + "learning_rate": 9.569332495909972e-06, + "loss": 1.3421099185943604, + "step": 2526 + }, + { + "epoch": 0.46018021297897516, + "grad_norm": 19.75, + "learning_rate": 9.568640589572336e-06, + "loss": 1.123950481414795, + "step": 2528 + }, + { + "epoch": 0.4605442796031674, + "grad_norm": 13.5, + "learning_rate": 9.567948159551158e-06, + "loss": 1.0482577085494995, + "step": 2530 + }, + { + "epoch": 0.4609083462273596, + "grad_norm": 21.25, + "learning_rate": 9.567255205948046e-06, + "loss": 1.5531996488571167, + "step": 2532 + }, + { + "epoch": 0.46127241285155185, + "grad_norm": 14.25, + "learning_rate": 9.566561728864688e-06, + "loss": 1.5404894351959229, + "step": 2534 + }, + { + "epoch": 0.46163647947574404, + "grad_norm": 8.0, + "learning_rate": 9.565867728402851e-06, + "loss": 1.4660441875457764, + "step": 2536 + }, + { + "epoch": 0.4620005460999363, + "grad_norm": 5.4375, + "learning_rate": 9.565173204664375e-06, + "loss": 1.4605087041854858, + "step": 2538 + }, + { + "epoch": 0.46236461272412854, + "grad_norm": 7.1875, + "learning_rate": 9.564478157751182e-06, + "loss": 1.2721070051193237, + "step": 2540 + }, + { + "epoch": 0.46272867934832074, + "grad_norm": 12.125, + "learning_rate": 9.563782587765263e-06, + "loss": 1.4563473463058472, + "step": 2542 + }, + { + "epoch": 0.463092745972513, + "grad_norm": 25.875, + "learning_rate": 9.563086494808694e-06, + "loss": 1.4034043550491333, + "step": 2544 + }, + { + "epoch": 0.4634568125967052, + "grad_norm": 13.9375, + "learning_rate": 9.56238987898362e-06, + "loss": 1.4014735221862793, + "step": 2546 + }, + { + "epoch": 0.46382087922089743, + "grad_norm": 8.75, + "learning_rate": 9.561692740392268e-06, + "loss": 1.389622449874878, + "step": 2548 + }, + { + "epoch": 0.4641849458450896, + "grad_norm": 12.125, + "learning_rate": 9.560995079136942e-06, + "loss": 1.4004762172698975, + "step": 2550 + }, + { + "epoch": 0.4645490124692819, + "grad_norm": 5.34375, + "learning_rate": 9.56029689532002e-06, + "loss": 1.3371957540512085, + "step": 2552 + }, + { + "epoch": 0.4649130790934741, + "grad_norm": 13.875, + "learning_rate": 9.559598189043958e-06, + "loss": 1.3006490468978882, + "step": 2554 + }, + { + "epoch": 0.4652771457176663, + "grad_norm": 21.0, + "learning_rate": 9.558898960411284e-06, + "loss": 1.2520625591278076, + "step": 2556 + }, + { + "epoch": 0.46564121234185857, + "grad_norm": 30.25, + "learning_rate": 9.558199209524613e-06, + "loss": 2.064906120300293, + "step": 2558 + }, + { + "epoch": 0.46600527896605076, + "grad_norm": 10.6875, + "learning_rate": 9.557498936486627e-06, + "loss": 1.0734522342681885, + "step": 2560 + }, + { + "epoch": 0.466369345590243, + "grad_norm": 7.6875, + "learning_rate": 9.55679814140009e-06, + "loss": 1.417957067489624, + "step": 2562 + }, + { + "epoch": 0.46673341221443526, + "grad_norm": 11.0, + "learning_rate": 9.55609682436784e-06, + "loss": 1.2966688871383667, + "step": 2564 + }, + { + "epoch": 0.46709747883862746, + "grad_norm": 6.96875, + "learning_rate": 9.555394985492794e-06, + "loss": 1.1198680400848389, + "step": 2566 + }, + { + "epoch": 0.4674615454628197, + "grad_norm": 19.25, + "learning_rate": 9.55469262487794e-06, + "loss": 1.3935253620147705, + "step": 2568 + }, + { + "epoch": 0.4678256120870119, + "grad_norm": 14.6875, + "learning_rate": 9.55398974262635e-06, + "loss": 1.3144853115081787, + "step": 2570 + }, + { + "epoch": 0.46818967871120415, + "grad_norm": 11.4375, + "learning_rate": 9.55328633884117e-06, + "loss": 1.0120824575424194, + "step": 2572 + }, + { + "epoch": 0.4685537453353964, + "grad_norm": 13.0625, + "learning_rate": 9.552582413625619e-06, + "loss": 1.8596464395523071, + "step": 2574 + }, + { + "epoch": 0.4689178119595886, + "grad_norm": 16.5, + "learning_rate": 9.551877967082996e-06, + "loss": 1.6003751754760742, + "step": 2576 + }, + { + "epoch": 0.46928187858378084, + "grad_norm": 8.375, + "learning_rate": 9.551172999316675e-06, + "loss": 1.1527396440505981, + "step": 2578 + }, + { + "epoch": 0.46964594520797304, + "grad_norm": 11.75, + "learning_rate": 9.55046751043011e-06, + "loss": 1.185123324394226, + "step": 2580 + }, + { + "epoch": 0.4700100118321653, + "grad_norm": 11.125, + "learning_rate": 9.549761500526827e-06, + "loss": 1.2402656078338623, + "step": 2582 + }, + { + "epoch": 0.47037407845635754, + "grad_norm": 20.125, + "learning_rate": 9.549054969710427e-06, + "loss": 1.3258576393127441, + "step": 2584 + }, + { + "epoch": 0.47073814508054973, + "grad_norm": 15.6875, + "learning_rate": 9.548347918084595e-06, + "loss": 1.0669920444488525, + "step": 2586 + }, + { + "epoch": 0.471102211704742, + "grad_norm": 20.875, + "learning_rate": 9.547640345753087e-06, + "loss": 1.544162631034851, + "step": 2588 + }, + { + "epoch": 0.4714662783289342, + "grad_norm": 6.53125, + "learning_rate": 9.546932252819732e-06, + "loss": 1.3497596979141235, + "step": 2590 + }, + { + "epoch": 0.4718303449531264, + "grad_norm": 14.3125, + "learning_rate": 9.546223639388448e-06, + "loss": 1.206697940826416, + "step": 2592 + }, + { + "epoch": 0.4721944115773187, + "grad_norm": 3.703125, + "learning_rate": 9.545514505563214e-06, + "loss": 1.1008825302124023, + "step": 2594 + }, + { + "epoch": 0.47255847820151087, + "grad_norm": 6.15625, + "learning_rate": 9.544804851448094e-06, + "loss": 1.3963027000427246, + "step": 2596 + }, + { + "epoch": 0.4729225448257031, + "grad_norm": 36.5, + "learning_rate": 9.54409467714723e-06, + "loss": 1.3875467777252197, + "step": 2598 + }, + { + "epoch": 0.4732866114498953, + "grad_norm": 7.1875, + "learning_rate": 9.543383982764833e-06, + "loss": 1.1912996768951416, + "step": 2600 + }, + { + "epoch": 0.47365067807408756, + "grad_norm": 3.046875, + "learning_rate": 9.542672768405199e-06, + "loss": 1.1987833976745605, + "step": 2602 + }, + { + "epoch": 0.4740147446982798, + "grad_norm": 2.703125, + "learning_rate": 9.541961034172692e-06, + "loss": 0.8739246726036072, + "step": 2604 + }, + { + "epoch": 0.474378811322472, + "grad_norm": 9.125, + "learning_rate": 9.541248780171757e-06, + "loss": 0.3598101735115051, + "step": 2606 + }, + { + "epoch": 0.47474287794666425, + "grad_norm": 8.25, + "learning_rate": 9.540536006506917e-06, + "loss": 0.901496171951294, + "step": 2608 + }, + { + "epoch": 0.47510694457085645, + "grad_norm": 6.25, + "learning_rate": 9.539822713282765e-06, + "loss": 1.2627109289169312, + "step": 2610 + }, + { + "epoch": 0.4754710111950487, + "grad_norm": 9.5625, + "learning_rate": 9.539108900603975e-06, + "loss": 1.4826383590698242, + "step": 2612 + }, + { + "epoch": 0.47583507781924095, + "grad_norm": 21.25, + "learning_rate": 9.538394568575298e-06, + "loss": 1.5900036096572876, + "step": 2614 + }, + { + "epoch": 0.47619914444343314, + "grad_norm": 4.84375, + "learning_rate": 9.537679717301558e-06, + "loss": 1.3371520042419434, + "step": 2616 + }, + { + "epoch": 0.4765632110676254, + "grad_norm": 3.1875, + "learning_rate": 9.536964346887656e-06, + "loss": 0.876775324344635, + "step": 2618 + }, + { + "epoch": 0.4769272776918176, + "grad_norm": 14.125, + "learning_rate": 9.536248457438568e-06, + "loss": 1.3802015781402588, + "step": 2620 + }, + { + "epoch": 0.47729134431600984, + "grad_norm": 12.8125, + "learning_rate": 9.535532049059353e-06, + "loss": 1.3966376781463623, + "step": 2622 + }, + { + "epoch": 0.47765541094020203, + "grad_norm": 15.75, + "learning_rate": 9.534815121855137e-06, + "loss": 1.4105513095855713, + "step": 2624 + }, + { + "epoch": 0.4780194775643943, + "grad_norm": 24.5, + "learning_rate": 9.534097675931127e-06, + "loss": 1.8112739324569702, + "step": 2626 + }, + { + "epoch": 0.47838354418858653, + "grad_norm": 10.625, + "learning_rate": 9.533379711392605e-06, + "loss": 1.3103275299072266, + "step": 2628 + }, + { + "epoch": 0.4787476108127787, + "grad_norm": 12.0, + "learning_rate": 9.53266122834493e-06, + "loss": 1.1173889636993408, + "step": 2630 + }, + { + "epoch": 0.479111677436971, + "grad_norm": 7.0, + "learning_rate": 9.531942226893537e-06, + "loss": 1.347985863685608, + "step": 2632 + }, + { + "epoch": 0.47947574406116317, + "grad_norm": 7.65625, + "learning_rate": 9.531222707143936e-06, + "loss": 1.4570860862731934, + "step": 2634 + }, + { + "epoch": 0.4798398106853554, + "grad_norm": 2.671875, + "learning_rate": 9.530502669201716e-06, + "loss": 0.9980091452598572, + "step": 2636 + }, + { + "epoch": 0.48020387730954767, + "grad_norm": 3.25, + "learning_rate": 9.529782113172532e-06, + "loss": 0.9306013584136963, + "step": 2638 + }, + { + "epoch": 0.48056794393373986, + "grad_norm": 7.15625, + "learning_rate": 9.529061039162131e-06, + "loss": 1.2191468477249146, + "step": 2640 + }, + { + "epoch": 0.4809320105579321, + "grad_norm": 6.34375, + "learning_rate": 9.528339447276325e-06, + "loss": 1.4673439264297485, + "step": 2642 + }, + { + "epoch": 0.4812960771821243, + "grad_norm": 8.0625, + "learning_rate": 9.527617337621002e-06, + "loss": 1.4516894817352295, + "step": 2644 + }, + { + "epoch": 0.48166014380631655, + "grad_norm": 5.53125, + "learning_rate": 9.526894710302133e-06, + "loss": 1.3467499017715454, + "step": 2646 + }, + { + "epoch": 0.4820242104305088, + "grad_norm": 8.4375, + "learning_rate": 9.526171565425757e-06, + "loss": 1.574650526046753, + "step": 2648 + }, + { + "epoch": 0.482388277054701, + "grad_norm": 19.75, + "learning_rate": 9.525447903097996e-06, + "loss": 1.150609016418457, + "step": 2650 + }, + { + "epoch": 0.48275234367889325, + "grad_norm": 8.125, + "learning_rate": 9.52472372342504e-06, + "loss": 1.2525157928466797, + "step": 2652 + }, + { + "epoch": 0.48311641030308544, + "grad_norm": 10.5, + "learning_rate": 9.523999026513164e-06, + "loss": 1.4114595651626587, + "step": 2654 + }, + { + "epoch": 0.4834804769272777, + "grad_norm": 10.3125, + "learning_rate": 9.523273812468713e-06, + "loss": 1.3584535121917725, + "step": 2656 + }, + { + "epoch": 0.48384454355146994, + "grad_norm": 22.0, + "learning_rate": 9.522548081398106e-06, + "loss": 1.293147325515747, + "step": 2658 + }, + { + "epoch": 0.48420861017566214, + "grad_norm": 21.25, + "learning_rate": 9.521821833407845e-06, + "loss": 1.4591537714004517, + "step": 2660 + }, + { + "epoch": 0.4845726767998544, + "grad_norm": 7.75, + "learning_rate": 9.521095068604504e-06, + "loss": 1.287800908088684, + "step": 2662 + }, + { + "epoch": 0.4849367434240466, + "grad_norm": 10.4375, + "learning_rate": 9.520367787094728e-06, + "loss": 1.5418848991394043, + "step": 2664 + }, + { + "epoch": 0.48530081004823883, + "grad_norm": 12.4375, + "learning_rate": 9.51963998898525e-06, + "loss": 1.5308781862258911, + "step": 2666 + }, + { + "epoch": 0.4856648766724311, + "grad_norm": 9.25, + "learning_rate": 9.518911674382865e-06, + "loss": 1.5935556888580322, + "step": 2668 + }, + { + "epoch": 0.4860289432966233, + "grad_norm": 17.25, + "learning_rate": 9.518182843394455e-06, + "loss": 1.4845272302627563, + "step": 2670 + }, + { + "epoch": 0.4863930099208155, + "grad_norm": 22.5, + "learning_rate": 9.517453496126967e-06, + "loss": 1.4654523134231567, + "step": 2672 + }, + { + "epoch": 0.4867570765450077, + "grad_norm": 9.1875, + "learning_rate": 9.516723632687434e-06, + "loss": 1.3871456384658813, + "step": 2674 + }, + { + "epoch": 0.48712114316919997, + "grad_norm": 9.875, + "learning_rate": 9.515993253182962e-06, + "loss": 1.3925297260284424, + "step": 2676 + }, + { + "epoch": 0.4874852097933922, + "grad_norm": 10.8125, + "learning_rate": 9.51526235772073e-06, + "loss": 1.2590036392211914, + "step": 2678 + }, + { + "epoch": 0.4878492764175844, + "grad_norm": 22.625, + "learning_rate": 9.514530946407992e-06, + "loss": 1.259558916091919, + "step": 2680 + }, + { + "epoch": 0.48821334304177666, + "grad_norm": 10.625, + "learning_rate": 9.51379901935208e-06, + "loss": 0.9127806425094604, + "step": 2682 + }, + { + "epoch": 0.48857740966596885, + "grad_norm": 11.4375, + "learning_rate": 9.513066576660404e-06, + "loss": 1.3500789403915405, + "step": 2684 + }, + { + "epoch": 0.4889414762901611, + "grad_norm": 13.0625, + "learning_rate": 9.512333618440441e-06, + "loss": 1.4418582916259766, + "step": 2686 + }, + { + "epoch": 0.48930554291435335, + "grad_norm": 15.125, + "learning_rate": 9.511600144799758e-06, + "loss": 1.7348606586456299, + "step": 2688 + }, + { + "epoch": 0.48966960953854555, + "grad_norm": 14.625, + "learning_rate": 9.510866155845984e-06, + "loss": 1.5145047903060913, + "step": 2690 + }, + { + "epoch": 0.4900336761627378, + "grad_norm": 21.5, + "learning_rate": 9.510131651686826e-06, + "loss": 1.2467162609100342, + "step": 2692 + }, + { + "epoch": 0.49039774278693, + "grad_norm": 20.5, + "learning_rate": 9.509396632430079e-06, + "loss": 0.8618804216384888, + "step": 2694 + }, + { + "epoch": 0.49076180941112224, + "grad_norm": 5.5625, + "learning_rate": 9.508661098183596e-06, + "loss": 1.282824158668518, + "step": 2696 + }, + { + "epoch": 0.49112587603531443, + "grad_norm": 5.4375, + "learning_rate": 9.507925049055316e-06, + "loss": 1.109830379486084, + "step": 2698 + }, + { + "epoch": 0.4914899426595067, + "grad_norm": 8.4375, + "learning_rate": 9.507188485153252e-06, + "loss": 1.4412422180175781, + "step": 2700 + }, + { + "epoch": 0.49185400928369893, + "grad_norm": 7.1875, + "learning_rate": 9.50645140658549e-06, + "loss": 1.339979648590088, + "step": 2702 + }, + { + "epoch": 0.49221807590789113, + "grad_norm": 6.59375, + "learning_rate": 9.505713813460195e-06, + "loss": 1.2000776529312134, + "step": 2704 + }, + { + "epoch": 0.4925821425320834, + "grad_norm": 11.6875, + "learning_rate": 9.504975705885606e-06, + "loss": 1.5046942234039307, + "step": 2706 + }, + { + "epoch": 0.49294620915627557, + "grad_norm": 9.5625, + "learning_rate": 9.504237083970038e-06, + "loss": 1.4285600185394287, + "step": 2708 + }, + { + "epoch": 0.4933102757804678, + "grad_norm": 6.375, + "learning_rate": 9.503497947821879e-06, + "loss": 1.1578221321105957, + "step": 2710 + }, + { + "epoch": 0.49367434240466007, + "grad_norm": 15.75, + "learning_rate": 9.502758297549593e-06, + "loss": 1.4977189302444458, + "step": 2712 + }, + { + "epoch": 0.49403840902885227, + "grad_norm": 23.125, + "learning_rate": 9.50201813326172e-06, + "loss": 1.241639256477356, + "step": 2714 + }, + { + "epoch": 0.4944024756530445, + "grad_norm": 10.625, + "learning_rate": 9.501277455066884e-06, + "loss": 0.7195055484771729, + "step": 2716 + }, + { + "epoch": 0.4947665422772367, + "grad_norm": 21.125, + "learning_rate": 9.500536263073768e-06, + "loss": 1.4854540824890137, + "step": 2718 + }, + { + "epoch": 0.49513060890142896, + "grad_norm": 10.4375, + "learning_rate": 9.49979455739114e-06, + "loss": 1.5270678997039795, + "step": 2720 + }, + { + "epoch": 0.4954946755256212, + "grad_norm": 7.09375, + "learning_rate": 9.499052338127845e-06, + "loss": 1.1615458726882935, + "step": 2722 + }, + { + "epoch": 0.4958587421498134, + "grad_norm": 10.125, + "learning_rate": 9.4983096053928e-06, + "loss": 1.2840888500213623, + "step": 2724 + }, + { + "epoch": 0.49622280877400565, + "grad_norm": 7.5625, + "learning_rate": 9.497566359295e-06, + "loss": 1.2300231456756592, + "step": 2726 + }, + { + "epoch": 0.49658687539819785, + "grad_norm": 118.5, + "learning_rate": 9.49682259994351e-06, + "loss": 1.7407028675079346, + "step": 2728 + }, + { + "epoch": 0.4969509420223901, + "grad_norm": 18.875, + "learning_rate": 9.496078327447476e-06, + "loss": 1.362412929534912, + "step": 2730 + }, + { + "epoch": 0.49731500864658235, + "grad_norm": 8.375, + "learning_rate": 9.495333541916114e-06, + "loss": 1.2506790161132812, + "step": 2732 + }, + { + "epoch": 0.49767907527077454, + "grad_norm": 8.0625, + "learning_rate": 9.49458824345872e-06, + "loss": 1.0845894813537598, + "step": 2734 + }, + { + "epoch": 0.4980431418949668, + "grad_norm": 6.5, + "learning_rate": 9.493842432184664e-06, + "loss": 1.4774537086486816, + "step": 2736 + }, + { + "epoch": 0.498407208519159, + "grad_norm": 9.375, + "learning_rate": 9.49309610820339e-06, + "loss": 1.234175443649292, + "step": 2738 + }, + { + "epoch": 0.49877127514335123, + "grad_norm": 15.0, + "learning_rate": 9.49234927162442e-06, + "loss": 1.5128930807113647, + "step": 2740 + }, + { + "epoch": 0.4991353417675435, + "grad_norm": 7.6875, + "learning_rate": 9.491601922557346e-06, + "loss": 1.6760063171386719, + "step": 2742 + }, + { + "epoch": 0.4994994083917357, + "grad_norm": 28.125, + "learning_rate": 9.490854061111838e-06, + "loss": 1.2086949348449707, + "step": 2744 + }, + { + "epoch": 0.4998634750159279, + "grad_norm": 9.5, + "learning_rate": 9.490105687397648e-06, + "loss": 1.7749273777008057, + "step": 2746 + }, + { + "epoch": 0.5002275416401202, + "grad_norm": 14.6875, + "learning_rate": 9.489356801524592e-06, + "loss": 1.8374942541122437, + "step": 2748 + }, + { + "epoch": 0.5005916082643124, + "grad_norm": 7.0, + "learning_rate": 9.488607403602563e-06, + "loss": 1.4177943468093872, + "step": 2750 + }, + { + "epoch": 0.5009556748885046, + "grad_norm": 19.375, + "learning_rate": 9.48785749374154e-06, + "loss": 1.1259864568710327, + "step": 2752 + }, + { + "epoch": 0.5013197415126969, + "grad_norm": 44.25, + "learning_rate": 9.487107072051562e-06, + "loss": 1.4717812538146973, + "step": 2754 + }, + { + "epoch": 0.5016838081368891, + "grad_norm": 14.8125, + "learning_rate": 9.486356138642753e-06, + "loss": 1.581669569015503, + "step": 2756 + }, + { + "epoch": 0.5020478747610813, + "grad_norm": 20.25, + "learning_rate": 9.485604693625311e-06, + "loss": 1.6590213775634766, + "step": 2758 + }, + { + "epoch": 0.5024119413852735, + "grad_norm": 12.5, + "learning_rate": 9.484852737109504e-06, + "loss": 1.4304105043411255, + "step": 2760 + }, + { + "epoch": 0.5027760080094658, + "grad_norm": 22.375, + "learning_rate": 9.484100269205685e-06, + "loss": 1.0655628442764282, + "step": 2762 + }, + { + "epoch": 0.503140074633658, + "grad_norm": 17.75, + "learning_rate": 9.483347290024267e-06, + "loss": 1.2159664630889893, + "step": 2764 + }, + { + "epoch": 0.5035041412578501, + "grad_norm": 7.4375, + "learning_rate": 9.482593799675754e-06, + "loss": 1.2526659965515137, + "step": 2766 + }, + { + "epoch": 0.5038682078820425, + "grad_norm": 13.375, + "learning_rate": 9.481839798270714e-06, + "loss": 1.2676928043365479, + "step": 2768 + }, + { + "epoch": 0.5042322745062346, + "grad_norm": 6.90625, + "learning_rate": 9.481085285919794e-06, + "loss": 1.1000837087631226, + "step": 2770 + }, + { + "epoch": 0.5045963411304268, + "grad_norm": 9.625, + "learning_rate": 9.480330262733715e-06, + "loss": 1.4362484216690063, + "step": 2772 + }, + { + "epoch": 0.5049604077546191, + "grad_norm": 11.1875, + "learning_rate": 9.479574728823276e-06, + "loss": 1.7629673480987549, + "step": 2774 + }, + { + "epoch": 0.5053244743788113, + "grad_norm": 17.5, + "learning_rate": 9.478818684299345e-06, + "loss": 1.3608262538909912, + "step": 2776 + }, + { + "epoch": 0.5056885410030035, + "grad_norm": 6.5625, + "learning_rate": 9.478062129272872e-06, + "loss": 0.9855809807777405, + "step": 2778 + }, + { + "epoch": 0.5060526076271957, + "grad_norm": 8.0, + "learning_rate": 9.477305063854877e-06, + "loss": 1.3332241773605347, + "step": 2780 + }, + { + "epoch": 0.506416674251388, + "grad_norm": 33.0, + "learning_rate": 9.476547488156453e-06, + "loss": 1.4193193912506104, + "step": 2782 + }, + { + "epoch": 0.5067807408755802, + "grad_norm": 7.3125, + "learning_rate": 9.475789402288778e-06, + "loss": 1.3930517435073853, + "step": 2784 + }, + { + "epoch": 0.5071448074997724, + "grad_norm": 4.25, + "learning_rate": 9.475030806363093e-06, + "loss": 1.0984879732131958, + "step": 2786 + }, + { + "epoch": 0.5075088741239647, + "grad_norm": 13.75, + "learning_rate": 9.47427170049072e-06, + "loss": 1.243513584136963, + "step": 2788 + }, + { + "epoch": 0.5078729407481569, + "grad_norm": 18.625, + "learning_rate": 9.473512084783054e-06, + "loss": 1.5150600671768188, + "step": 2790 + }, + { + "epoch": 0.5082370073723491, + "grad_norm": 9.75, + "learning_rate": 9.472751959351569e-06, + "loss": 1.5471346378326416, + "step": 2792 + }, + { + "epoch": 0.5086010739965414, + "grad_norm": 5.46875, + "learning_rate": 9.471991324307808e-06, + "loss": 1.2661476135253906, + "step": 2794 + }, + { + "epoch": 0.5089651406207336, + "grad_norm": 10.1875, + "learning_rate": 9.471230179763389e-06, + "loss": 1.4615060091018677, + "step": 2796 + }, + { + "epoch": 0.5093292072449258, + "grad_norm": 11.8125, + "learning_rate": 9.470468525830008e-06, + "loss": 1.3388614654541016, + "step": 2798 + }, + { + "epoch": 0.509693273869118, + "grad_norm": 8.875, + "learning_rate": 9.469706362619438e-06, + "loss": 1.2015575170516968, + "step": 2800 + }, + { + "epoch": 0.5100573404933103, + "grad_norm": 16.75, + "learning_rate": 9.468943690243518e-06, + "loss": 0.8708786964416504, + "step": 2802 + }, + { + "epoch": 0.5104214071175025, + "grad_norm": 11.75, + "learning_rate": 9.468180508814173e-06, + "loss": 1.2993409633636475, + "step": 2804 + }, + { + "epoch": 0.5107854737416947, + "grad_norm": 16.5, + "learning_rate": 9.46741681844339e-06, + "loss": 1.4893848896026611, + "step": 2806 + }, + { + "epoch": 0.511149540365887, + "grad_norm": 22.5, + "learning_rate": 9.466652619243244e-06, + "loss": 1.7756671905517578, + "step": 2808 + }, + { + "epoch": 0.5115136069900792, + "grad_norm": 8.1875, + "learning_rate": 9.465887911325875e-06, + "loss": 1.0276234149932861, + "step": 2810 + }, + { + "epoch": 0.5118776736142714, + "grad_norm": 6.09375, + "learning_rate": 9.465122694803502e-06, + "loss": 1.35276460647583, + "step": 2812 + }, + { + "epoch": 0.5122417402384637, + "grad_norm": 8.75, + "learning_rate": 9.464356969788413e-06, + "loss": 0.9041693210601807, + "step": 2814 + }, + { + "epoch": 0.5126058068626559, + "grad_norm": 24.625, + "learning_rate": 9.46359073639298e-06, + "loss": 0.9895302653312683, + "step": 2816 + }, + { + "epoch": 0.5129698734868481, + "grad_norm": 12.5625, + "learning_rate": 9.462823994729643e-06, + "loss": 1.7848765850067139, + "step": 2818 + }, + { + "epoch": 0.5133339401110403, + "grad_norm": 14.75, + "learning_rate": 9.46205674491092e-06, + "loss": 1.70426607131958, + "step": 2820 + }, + { + "epoch": 0.5136980067352326, + "grad_norm": 17.875, + "learning_rate": 9.4612889870494e-06, + "loss": 2.081897258758545, + "step": 2822 + }, + { + "epoch": 0.5140620733594248, + "grad_norm": 16.875, + "learning_rate": 9.460520721257747e-06, + "loss": 2.092416763305664, + "step": 2824 + }, + { + "epoch": 0.514426139983617, + "grad_norm": 29.625, + "learning_rate": 9.459751947648701e-06, + "loss": 1.7710515260696411, + "step": 2826 + }, + { + "epoch": 0.5147902066078093, + "grad_norm": 64.0, + "learning_rate": 9.458982666335081e-06, + "loss": 1.5571202039718628, + "step": 2828 + }, + { + "epoch": 0.5151542732320015, + "grad_norm": 22.5, + "learning_rate": 9.458212877429771e-06, + "loss": 0.5372373461723328, + "step": 2830 + }, + { + "epoch": 0.5155183398561937, + "grad_norm": 190.0, + "learning_rate": 9.457442581045737e-06, + "loss": 1.3746743202209473, + "step": 2832 + }, + { + "epoch": 0.5158824064803859, + "grad_norm": 27.75, + "learning_rate": 9.456671777296016e-06, + "loss": 1.4118127822875977, + "step": 2834 + }, + { + "epoch": 0.5162464731045782, + "grad_norm": 10.5625, + "learning_rate": 9.45590046629372e-06, + "loss": 1.48917818069458, + "step": 2836 + }, + { + "epoch": 0.5166105397287704, + "grad_norm": 19.5, + "learning_rate": 9.455128648152037e-06, + "loss": 1.4884352684020996, + "step": 2838 + }, + { + "epoch": 0.5169746063529626, + "grad_norm": 32.5, + "learning_rate": 9.454356322984225e-06, + "loss": 0.9315375685691833, + "step": 2840 + }, + { + "epoch": 0.5173386729771549, + "grad_norm": 24.375, + "learning_rate": 9.453583490903624e-06, + "loss": 1.5715090036392212, + "step": 2842 + }, + { + "epoch": 0.517702739601347, + "grad_norm": 7.4375, + "learning_rate": 9.452810152023641e-06, + "loss": 1.3380179405212402, + "step": 2844 + }, + { + "epoch": 0.5180668062255392, + "grad_norm": 5.96875, + "learning_rate": 9.45203630645776e-06, + "loss": 1.4265273809432983, + "step": 2846 + }, + { + "epoch": 0.5184308728497315, + "grad_norm": 10.5, + "learning_rate": 9.451261954319543e-06, + "loss": 0.9977113008499146, + "step": 2848 + }, + { + "epoch": 0.5187949394739237, + "grad_norm": 21.0, + "learning_rate": 9.45048709572262e-06, + "loss": 1.4064290523529053, + "step": 2850 + }, + { + "epoch": 0.5191590060981159, + "grad_norm": 10.0625, + "learning_rate": 9.4497117307807e-06, + "loss": 1.428276777267456, + "step": 2852 + }, + { + "epoch": 0.5195230727223081, + "grad_norm": 7.4375, + "learning_rate": 9.448935859607564e-06, + "loss": 1.0143812894821167, + "step": 2854 + }, + { + "epoch": 0.5198871393465004, + "grad_norm": 49.0, + "learning_rate": 9.448159482317067e-06, + "loss": 1.3358533382415771, + "step": 2856 + }, + { + "epoch": 0.5202512059706926, + "grad_norm": 20.0, + "learning_rate": 9.44738259902314e-06, + "loss": 1.1163548231124878, + "step": 2858 + }, + { + "epoch": 0.5206152725948848, + "grad_norm": 36.5, + "learning_rate": 9.44660520983979e-06, + "loss": 1.5802149772644043, + "step": 2860 + }, + { + "epoch": 0.5209793392190771, + "grad_norm": 11.1875, + "learning_rate": 9.44582731488109e-06, + "loss": 1.4620944261550903, + "step": 2862 + }, + { + "epoch": 0.5213434058432693, + "grad_norm": 17.375, + "learning_rate": 9.445048914261198e-06, + "loss": 1.5336461067199707, + "step": 2864 + }, + { + "epoch": 0.5217074724674615, + "grad_norm": 27.625, + "learning_rate": 9.44427000809434e-06, + "loss": 1.6409046649932861, + "step": 2866 + }, + { + "epoch": 0.5220715390916538, + "grad_norm": 33.0, + "learning_rate": 9.443490596494816e-06, + "loss": 1.742478847503662, + "step": 2868 + }, + { + "epoch": 0.522435605715846, + "grad_norm": 20.75, + "learning_rate": 9.442710679577003e-06, + "loss": 1.4603480100631714, + "step": 2870 + }, + { + "epoch": 0.5227996723400382, + "grad_norm": 13.1875, + "learning_rate": 9.441930257455348e-06, + "loss": 1.4711697101593018, + "step": 2872 + }, + { + "epoch": 0.5231637389642304, + "grad_norm": 17.125, + "learning_rate": 9.44114933024438e-06, + "loss": 1.5037627220153809, + "step": 2874 + }, + { + "epoch": 0.5235278055884227, + "grad_norm": 33.75, + "learning_rate": 9.440367898058688e-06, + "loss": 1.4397495985031128, + "step": 2876 + }, + { + "epoch": 0.5238918722126149, + "grad_norm": 17.125, + "learning_rate": 9.439585961012954e-06, + "loss": 1.0517516136169434, + "step": 2878 + }, + { + "epoch": 0.5242559388368071, + "grad_norm": 12.875, + "learning_rate": 9.438803519221917e-06, + "loss": 1.8930423259735107, + "step": 2880 + }, + { + "epoch": 0.5246200054609994, + "grad_norm": 17.125, + "learning_rate": 9.438020572800401e-06, + "loss": 1.791634202003479, + "step": 2882 + }, + { + "epoch": 0.5249840720851916, + "grad_norm": 9.3125, + "learning_rate": 9.4372371218633e-06, + "loss": 1.128345012664795, + "step": 2884 + }, + { + "epoch": 0.5253481387093838, + "grad_norm": 14.4375, + "learning_rate": 9.436453166525581e-06, + "loss": 1.4580602645874023, + "step": 2886 + }, + { + "epoch": 0.5257122053335761, + "grad_norm": 47.25, + "learning_rate": 9.435668706902286e-06, + "loss": 1.5464787483215332, + "step": 2888 + }, + { + "epoch": 0.5260762719577683, + "grad_norm": 7.03125, + "learning_rate": 9.434883743108532e-06, + "loss": 1.2771549224853516, + "step": 2890 + }, + { + "epoch": 0.5264403385819605, + "grad_norm": 29.5, + "learning_rate": 9.434098275259507e-06, + "loss": 1.0110841989517212, + "step": 2892 + }, + { + "epoch": 0.5268044052061527, + "grad_norm": 218.0, + "learning_rate": 9.433312303470481e-06, + "loss": 0.9715045094490051, + "step": 2894 + }, + { + "epoch": 0.527168471830345, + "grad_norm": 6.78125, + "learning_rate": 9.432525827856787e-06, + "loss": 1.327849268913269, + "step": 2896 + }, + { + "epoch": 0.5275325384545372, + "grad_norm": 5.78125, + "learning_rate": 9.431738848533838e-06, + "loss": 1.2787411212921143, + "step": 2898 + }, + { + "epoch": 0.5278966050787294, + "grad_norm": 11.375, + "learning_rate": 9.43095136561712e-06, + "loss": 1.470754861831665, + "step": 2900 + }, + { + "epoch": 0.5282606717029217, + "grad_norm": 12.8125, + "learning_rate": 9.430163379222194e-06, + "loss": 1.4690332412719727, + "step": 2902 + }, + { + "epoch": 0.5286247383271139, + "grad_norm": 17.375, + "learning_rate": 9.429374889464696e-06, + "loss": 1.7040634155273438, + "step": 2904 + }, + { + "epoch": 0.5289888049513061, + "grad_norm": 35.0, + "learning_rate": 9.428585896460327e-06, + "loss": 1.6173440217971802, + "step": 2906 + }, + { + "epoch": 0.5293528715754983, + "grad_norm": 18.875, + "learning_rate": 9.427796400324873e-06, + "loss": 1.873227834701538, + "step": 2908 + }, + { + "epoch": 0.5297169381996906, + "grad_norm": 14.0, + "learning_rate": 9.42700640117419e-06, + "loss": 1.515747308731079, + "step": 2910 + }, + { + "epoch": 0.5300810048238828, + "grad_norm": 10.5625, + "learning_rate": 9.426215899124207e-06, + "loss": 1.3949079513549805, + "step": 2912 + }, + { + "epoch": 0.530445071448075, + "grad_norm": 8.1875, + "learning_rate": 9.425424894290925e-06, + "loss": 1.3828617334365845, + "step": 2914 + }, + { + "epoch": 0.5308091380722673, + "grad_norm": 14.3125, + "learning_rate": 9.424633386790422e-06, + "loss": 1.37305748462677, + "step": 2916 + }, + { + "epoch": 0.5311732046964595, + "grad_norm": 10.4375, + "learning_rate": 9.423841376738849e-06, + "loss": 1.4214001893997192, + "step": 2918 + }, + { + "epoch": 0.5315372713206516, + "grad_norm": 16.75, + "learning_rate": 9.423048864252428e-06, + "loss": 1.463394284248352, + "step": 2920 + }, + { + "epoch": 0.531901337944844, + "grad_norm": 15.375, + "learning_rate": 9.422255849447459e-06, + "loss": 1.5125001668930054, + "step": 2922 + }, + { + "epoch": 0.5322654045690361, + "grad_norm": 11.9375, + "learning_rate": 9.421462332440314e-06, + "loss": 1.5132102966308594, + "step": 2924 + }, + { + "epoch": 0.5326294711932283, + "grad_norm": 3.390625, + "learning_rate": 9.420668313347439e-06, + "loss": 0.818705677986145, + "step": 2926 + }, + { + "epoch": 0.5329935378174205, + "grad_norm": 59.25, + "learning_rate": 9.41987379228535e-06, + "loss": 0.3280879259109497, + "step": 2928 + }, + { + "epoch": 0.5333576044416128, + "grad_norm": 48.25, + "learning_rate": 9.419078769370642e-06, + "loss": 0.5533462166786194, + "step": 2930 + }, + { + "epoch": 0.533721671065805, + "grad_norm": 39.5, + "learning_rate": 9.41828324471998e-06, + "loss": 0.6874912977218628, + "step": 2932 + }, + { + "epoch": 0.5340857376899972, + "grad_norm": 11.3125, + "learning_rate": 9.417487218450106e-06, + "loss": 1.105255365371704, + "step": 2934 + }, + { + "epoch": 0.5344498043141895, + "grad_norm": 17.875, + "learning_rate": 9.416690690677833e-06, + "loss": 1.3310904502868652, + "step": 2936 + }, + { + "epoch": 0.5348138709383817, + "grad_norm": 19.0, + "learning_rate": 9.415893661520047e-06, + "loss": 1.4222571849822998, + "step": 2938 + }, + { + "epoch": 0.5351779375625739, + "grad_norm": 10.875, + "learning_rate": 9.415096131093708e-06, + "loss": 1.4867838621139526, + "step": 2940 + }, + { + "epoch": 0.5355420041867662, + "grad_norm": 8.9375, + "learning_rate": 9.414298099515853e-06, + "loss": 1.5833359956741333, + "step": 2942 + }, + { + "epoch": 0.5359060708109584, + "grad_norm": 9.3125, + "learning_rate": 9.41349956690359e-06, + "loss": 1.4017274379730225, + "step": 2944 + }, + { + "epoch": 0.5362701374351506, + "grad_norm": 10.375, + "learning_rate": 9.412700533374098e-06, + "loss": 1.6258537769317627, + "step": 2946 + }, + { + "epoch": 0.5366342040593428, + "grad_norm": 10.8125, + "learning_rate": 9.411900999044635e-06, + "loss": 1.232919692993164, + "step": 2948 + }, + { + "epoch": 0.5369982706835351, + "grad_norm": 20.5, + "learning_rate": 9.411100964032524e-06, + "loss": 1.5530389547348022, + "step": 2950 + }, + { + "epoch": 0.5373623373077273, + "grad_norm": 9.0625, + "learning_rate": 9.410300428455174e-06, + "loss": 1.3744899034500122, + "step": 2952 + }, + { + "epoch": 0.5377264039319195, + "grad_norm": 24.0, + "learning_rate": 9.409499392430057e-06, + "loss": 1.5849263668060303, + "step": 2954 + }, + { + "epoch": 0.5380904705561118, + "grad_norm": 55.25, + "learning_rate": 9.40869785607472e-06, + "loss": 1.63058602809906, + "step": 2956 + }, + { + "epoch": 0.538454537180304, + "grad_norm": 9.125, + "learning_rate": 9.407895819506787e-06, + "loss": 1.4717212915420532, + "step": 2958 + }, + { + "epoch": 0.5388186038044962, + "grad_norm": 5.40625, + "learning_rate": 9.407093282843953e-06, + "loss": 1.30521821975708, + "step": 2960 + }, + { + "epoch": 0.5391826704286885, + "grad_norm": 62.5, + "learning_rate": 9.406290246203988e-06, + "loss": 1.3819918632507324, + "step": 2962 + }, + { + "epoch": 0.5395467370528807, + "grad_norm": 12.625, + "learning_rate": 9.405486709704734e-06, + "loss": 1.3954023122787476, + "step": 2964 + }, + { + "epoch": 0.5399108036770729, + "grad_norm": 6.34375, + "learning_rate": 9.404682673464108e-06, + "loss": 1.1037077903747559, + "step": 2966 + }, + { + "epoch": 0.5402748703012651, + "grad_norm": 60.25, + "learning_rate": 9.403878137600095e-06, + "loss": 1.2842849493026733, + "step": 2968 + }, + { + "epoch": 0.5406389369254574, + "grad_norm": 22.375, + "learning_rate": 9.403073102230762e-06, + "loss": 1.7027831077575684, + "step": 2970 + }, + { + "epoch": 0.5410030035496496, + "grad_norm": 8.1875, + "learning_rate": 9.402267567474242e-06, + "loss": 1.579690933227539, + "step": 2972 + }, + { + "epoch": 0.5413670701738418, + "grad_norm": 48.0, + "learning_rate": 9.401461533448744e-06, + "loss": 1.7669086456298828, + "step": 2974 + }, + { + "epoch": 0.5417311367980341, + "grad_norm": 6.78125, + "learning_rate": 9.400655000272551e-06, + "loss": 1.4734055995941162, + "step": 2976 + }, + { + "epoch": 0.5420952034222263, + "grad_norm": 13.375, + "learning_rate": 9.39984796806402e-06, + "loss": 1.4527418613433838, + "step": 2978 + }, + { + "epoch": 0.5424592700464185, + "grad_norm": 9.6875, + "learning_rate": 9.399040436941577e-06, + "loss": 1.5013947486877441, + "step": 2980 + }, + { + "epoch": 0.5428233366706107, + "grad_norm": 14.75, + "learning_rate": 9.398232407023724e-06, + "loss": 1.5700006484985352, + "step": 2982 + }, + { + "epoch": 0.543187403294803, + "grad_norm": 32.5, + "learning_rate": 9.397423878429037e-06, + "loss": 2.04573655128479, + "step": 2984 + }, + { + "epoch": 0.5435514699189952, + "grad_norm": 45.0, + "learning_rate": 9.396614851276166e-06, + "loss": 1.7349227666854858, + "step": 2986 + }, + { + "epoch": 0.5439155365431874, + "grad_norm": 15.3125, + "learning_rate": 9.39580532568383e-06, + "loss": 1.1179993152618408, + "step": 2988 + }, + { + "epoch": 0.5442796031673797, + "grad_norm": 13.0625, + "learning_rate": 9.394995301770826e-06, + "loss": 1.492805004119873, + "step": 2990 + }, + { + "epoch": 0.5446436697915719, + "grad_norm": 12.0, + "learning_rate": 9.39418477965602e-06, + "loss": 1.3820233345031738, + "step": 2992 + }, + { + "epoch": 0.545007736415764, + "grad_norm": 44.5, + "learning_rate": 9.393373759458351e-06, + "loss": 1.1499322652816772, + "step": 2994 + }, + { + "epoch": 0.5453718030399564, + "grad_norm": 7.03125, + "learning_rate": 9.392562241296837e-06, + "loss": 0.344651997089386, + "step": 2996 + }, + { + "epoch": 0.5457358696641486, + "grad_norm": 7.0, + "learning_rate": 9.391750225290561e-06, + "loss": 1.1447712182998657, + "step": 2998 + }, + { + "epoch": 0.5460999362883407, + "grad_norm": 10.375, + "learning_rate": 9.390937711558685e-06, + "loss": 1.3334729671478271, + "step": 3000 + }, + { + "epoch": 0.5464640029125329, + "grad_norm": 14.125, + "learning_rate": 9.390124700220442e-06, + "loss": 1.356000304222107, + "step": 3002 + }, + { + "epoch": 0.5468280695367252, + "grad_norm": 10.125, + "learning_rate": 9.389311191395141e-06, + "loss": 1.5714797973632812, + "step": 3004 + }, + { + "epoch": 0.5471921361609174, + "grad_norm": 12.875, + "learning_rate": 9.388497185202155e-06, + "loss": 1.2656877040863037, + "step": 3006 + }, + { + "epoch": 0.5475562027851096, + "grad_norm": 11.125, + "learning_rate": 9.387682681760941e-06, + "loss": 0.8870418071746826, + "step": 3008 + }, + { + "epoch": 0.5479202694093019, + "grad_norm": 5.46875, + "learning_rate": 9.386867681191023e-06, + "loss": 1.3088362216949463, + "step": 3010 + }, + { + "epoch": 0.5482843360334941, + "grad_norm": 14.9375, + "learning_rate": 9.386052183611998e-06, + "loss": 1.3964647054672241, + "step": 3012 + }, + { + "epoch": 0.5486484026576863, + "grad_norm": 5.34375, + "learning_rate": 9.385236189143538e-06, + "loss": 1.2509205341339111, + "step": 3014 + }, + { + "epoch": 0.5490124692818786, + "grad_norm": 24.75, + "learning_rate": 9.384419697905385e-06, + "loss": 1.3296830654144287, + "step": 3016 + }, + { + "epoch": 0.5493765359060708, + "grad_norm": 12.0625, + "learning_rate": 9.383602710017358e-06, + "loss": 1.932960033416748, + "step": 3018 + }, + { + "epoch": 0.549740602530263, + "grad_norm": 7.21875, + "learning_rate": 9.382785225599346e-06, + "loss": 1.3641269207000732, + "step": 3020 + }, + { + "epoch": 0.5501046691544552, + "grad_norm": 9.1875, + "learning_rate": 9.381967244771311e-06, + "loss": 1.4412264823913574, + "step": 3022 + }, + { + "epoch": 0.5504687357786475, + "grad_norm": 7.8125, + "learning_rate": 9.38114876765329e-06, + "loss": 1.327149748802185, + "step": 3024 + }, + { + "epoch": 0.5508328024028397, + "grad_norm": 13.75, + "learning_rate": 9.380329794365389e-06, + "loss": 1.6417878866195679, + "step": 3026 + }, + { + "epoch": 0.5511968690270319, + "grad_norm": 10.0, + "learning_rate": 9.37951032502779e-06, + "loss": 1.334957480430603, + "step": 3028 + }, + { + "epoch": 0.5515609356512242, + "grad_norm": 3.875, + "learning_rate": 9.378690359760747e-06, + "loss": 1.0583679676055908, + "step": 3030 + }, + { + "epoch": 0.5519250022754164, + "grad_norm": 8.375, + "learning_rate": 9.377869898684587e-06, + "loss": 1.087319016456604, + "step": 3032 + }, + { + "epoch": 0.5522890688996086, + "grad_norm": 25.375, + "learning_rate": 9.377048941919706e-06, + "loss": 1.6336027383804321, + "step": 3034 + }, + { + "epoch": 0.5526531355238009, + "grad_norm": 25.375, + "learning_rate": 9.37622748958658e-06, + "loss": 1.5905839204788208, + "step": 3036 + }, + { + "epoch": 0.5530172021479931, + "grad_norm": 9.5625, + "learning_rate": 9.375405541805753e-06, + "loss": 1.389833688735962, + "step": 3038 + }, + { + "epoch": 0.5533812687721853, + "grad_norm": 7.78125, + "learning_rate": 9.374583098697843e-06, + "loss": 1.4018820524215698, + "step": 3040 + }, + { + "epoch": 0.5537453353963775, + "grad_norm": 12.875, + "learning_rate": 9.373760160383538e-06, + "loss": 1.2264093160629272, + "step": 3042 + }, + { + "epoch": 0.5541094020205698, + "grad_norm": 13.8125, + "learning_rate": 9.372936726983604e-06, + "loss": 0.9025511741638184, + "step": 3044 + }, + { + "epoch": 0.554473468644762, + "grad_norm": 20.25, + "learning_rate": 9.372112798618872e-06, + "loss": 1.039534568786621, + "step": 3046 + }, + { + "epoch": 0.5548375352689542, + "grad_norm": 16.125, + "learning_rate": 9.371288375410254e-06, + "loss": 1.5039215087890625, + "step": 3048 + }, + { + "epoch": 0.5552016018931465, + "grad_norm": 11.8125, + "learning_rate": 9.370463457478729e-06, + "loss": 1.6696616411209106, + "step": 3050 + }, + { + "epoch": 0.5555656685173387, + "grad_norm": 12.0, + "learning_rate": 9.369638044945354e-06, + "loss": 1.4033293724060059, + "step": 3052 + }, + { + "epoch": 0.5559297351415309, + "grad_norm": 23.25, + "learning_rate": 9.368812137931247e-06, + "loss": 1.0766596794128418, + "step": 3054 + }, + { + "epoch": 0.5562938017657231, + "grad_norm": 9.25, + "learning_rate": 9.367985736557614e-06, + "loss": 0.966900646686554, + "step": 3056 + }, + { + "epoch": 0.5566578683899154, + "grad_norm": 20.875, + "learning_rate": 9.367158840945722e-06, + "loss": 0.7809659242630005, + "step": 3058 + }, + { + "epoch": 0.5570219350141076, + "grad_norm": 8.375, + "learning_rate": 9.36633145121692e-06, + "loss": 0.9630832076072693, + "step": 3060 + }, + { + "epoch": 0.5573860016382998, + "grad_norm": 28.25, + "learning_rate": 9.365503567492615e-06, + "loss": 1.0313334465026855, + "step": 3062 + }, + { + "epoch": 0.5577500682624921, + "grad_norm": 24.5, + "learning_rate": 9.364675189894304e-06, + "loss": 1.5308561325073242, + "step": 3064 + }, + { + "epoch": 0.5581141348866843, + "grad_norm": 23.0, + "learning_rate": 9.36384631854354e-06, + "loss": 1.5186749696731567, + "step": 3066 + }, + { + "epoch": 0.5584782015108765, + "grad_norm": 11.875, + "learning_rate": 9.363016953561967e-06, + "loss": 1.5328071117401123, + "step": 3068 + }, + { + "epoch": 0.5588422681350688, + "grad_norm": 16.0, + "learning_rate": 9.362187095071282e-06, + "loss": 1.3448283672332764, + "step": 3070 + }, + { + "epoch": 0.559206334759261, + "grad_norm": 117.5, + "learning_rate": 9.361356743193269e-06, + "loss": 1.5575233697891235, + "step": 3072 + }, + { + "epoch": 0.5595704013834532, + "grad_norm": 8.25, + "learning_rate": 9.360525898049772e-06, + "loss": 1.38517165184021, + "step": 3074 + }, + { + "epoch": 0.5599344680076453, + "grad_norm": 32.5, + "learning_rate": 9.359694559762722e-06, + "loss": 2.0679996013641357, + "step": 3076 + }, + { + "epoch": 0.5602985346318377, + "grad_norm": 8.875, + "learning_rate": 9.35886272845411e-06, + "loss": 1.4488943815231323, + "step": 3078 + }, + { + "epoch": 0.5606626012560298, + "grad_norm": 10.6875, + "learning_rate": 9.358030404246006e-06, + "loss": 1.427444577217102, + "step": 3080 + }, + { + "epoch": 0.561026667880222, + "grad_norm": 9.375, + "learning_rate": 9.357197587260549e-06, + "loss": 1.4008798599243164, + "step": 3082 + }, + { + "epoch": 0.5613907345044143, + "grad_norm": 7.5, + "learning_rate": 9.356364277619952e-06, + "loss": 1.3714756965637207, + "step": 3084 + }, + { + "epoch": 0.5617548011286065, + "grad_norm": 14.25, + "learning_rate": 9.355530475446494e-06, + "loss": 0.8201741576194763, + "step": 3086 + }, + { + "epoch": 0.5621188677527987, + "grad_norm": 14.3125, + "learning_rate": 9.354696180862543e-06, + "loss": 0.45517778396606445, + "step": 3088 + }, + { + "epoch": 0.562482934376991, + "grad_norm": 15.4375, + "learning_rate": 9.353861393990522e-06, + "loss": 1.5810420513153076, + "step": 3090 + }, + { + "epoch": 0.5628470010011832, + "grad_norm": 80.5, + "learning_rate": 9.353026114952935e-06, + "loss": 1.2946951389312744, + "step": 3092 + }, + { + "epoch": 0.5632110676253754, + "grad_norm": 6.0625, + "learning_rate": 9.352190343872352e-06, + "loss": 1.4217023849487305, + "step": 3094 + }, + { + "epoch": 0.5635751342495676, + "grad_norm": 61.5, + "learning_rate": 9.35135408087142e-06, + "loss": 1.041006326675415, + "step": 3096 + }, + { + "epoch": 0.5639392008737599, + "grad_norm": 8.75, + "learning_rate": 9.350517326072861e-06, + "loss": 1.5310918092727661, + "step": 3098 + }, + { + "epoch": 0.5643032674979521, + "grad_norm": 4.9375, + "learning_rate": 9.349680079599462e-06, + "loss": 1.3874033689498901, + "step": 3100 + }, + { + "epoch": 0.5646673341221443, + "grad_norm": 11.625, + "learning_rate": 9.348842341574085e-06, + "loss": 1.1787211894989014, + "step": 3102 + }, + { + "epoch": 0.5650314007463366, + "grad_norm": 10.5625, + "learning_rate": 9.348004112119666e-06, + "loss": 1.3066539764404297, + "step": 3104 + }, + { + "epoch": 0.5653954673705288, + "grad_norm": 15.5, + "learning_rate": 9.347165391359214e-06, + "loss": 1.4900389909744263, + "step": 3106 + }, + { + "epoch": 0.565759533994721, + "grad_norm": 20.125, + "learning_rate": 9.346326179415805e-06, + "loss": 1.2628798484802246, + "step": 3108 + }, + { + "epoch": 0.5661236006189133, + "grad_norm": 17.375, + "learning_rate": 9.34548647641259e-06, + "loss": 1.6182929277420044, + "step": 3110 + }, + { + "epoch": 0.5664876672431055, + "grad_norm": 24.375, + "learning_rate": 9.344646282472794e-06, + "loss": 1.743363618850708, + "step": 3112 + }, + { + "epoch": 0.5668517338672977, + "grad_norm": 11.1875, + "learning_rate": 9.343805597719711e-06, + "loss": 1.09340500831604, + "step": 3114 + }, + { + "epoch": 0.5672158004914899, + "grad_norm": 13.5625, + "learning_rate": 9.342964422276705e-06, + "loss": 1.409019112586975, + "step": 3116 + }, + { + "epoch": 0.5675798671156822, + "grad_norm": 7.21875, + "learning_rate": 9.34212275626722e-06, + "loss": 1.4036802053451538, + "step": 3118 + }, + { + "epoch": 0.5679439337398744, + "grad_norm": 4.1875, + "learning_rate": 9.341280599814764e-06, + "loss": 1.0053377151489258, + "step": 3120 + }, + { + "epoch": 0.5683080003640666, + "grad_norm": 4.84375, + "learning_rate": 9.340437953042923e-06, + "loss": 1.2106422185897827, + "step": 3122 + }, + { + "epoch": 0.5686720669882589, + "grad_norm": 12.75, + "learning_rate": 9.339594816075348e-06, + "loss": 1.238559365272522, + "step": 3124 + }, + { + "epoch": 0.5690361336124511, + "grad_norm": 14.8125, + "learning_rate": 9.338751189035769e-06, + "loss": 1.2562676668167114, + "step": 3126 + }, + { + "epoch": 0.5694002002366433, + "grad_norm": 8.4375, + "learning_rate": 9.337907072047982e-06, + "loss": 1.0469386577606201, + "step": 3128 + }, + { + "epoch": 0.5697642668608355, + "grad_norm": 13.6875, + "learning_rate": 9.337062465235862e-06, + "loss": 1.5876796245574951, + "step": 3130 + }, + { + "epoch": 0.5701283334850278, + "grad_norm": 18.625, + "learning_rate": 9.336217368723346e-06, + "loss": 0.9844861626625061, + "step": 3132 + }, + { + "epoch": 0.57049240010922, + "grad_norm": 14.6875, + "learning_rate": 9.335371782634455e-06, + "loss": 1.2141327857971191, + "step": 3134 + }, + { + "epoch": 0.5708564667334122, + "grad_norm": 17.375, + "learning_rate": 9.334525707093269e-06, + "loss": 1.1805936098098755, + "step": 3136 + }, + { + "epoch": 0.5712205333576045, + "grad_norm": 14.375, + "learning_rate": 9.33367914222395e-06, + "loss": 1.9679069519042969, + "step": 3138 + }, + { + "epoch": 0.5715845999817967, + "grad_norm": 12.3125, + "learning_rate": 9.33283208815073e-06, + "loss": 1.9321767091751099, + "step": 3140 + }, + { + "epoch": 0.5719486666059889, + "grad_norm": 16.125, + "learning_rate": 9.331984544997904e-06, + "loss": 1.0903077125549316, + "step": 3142 + }, + { + "epoch": 0.5723127332301812, + "grad_norm": 6.03125, + "learning_rate": 9.331136512889852e-06, + "loss": 1.1446666717529297, + "step": 3144 + }, + { + "epoch": 0.5726767998543734, + "grad_norm": 103.0, + "learning_rate": 9.330287991951015e-06, + "loss": 1.2337350845336914, + "step": 3146 + }, + { + "epoch": 0.5730408664785656, + "grad_norm": 11.375, + "learning_rate": 9.329438982305911e-06, + "loss": 1.6238542795181274, + "step": 3148 + }, + { + "epoch": 0.5734049331027578, + "grad_norm": 8.875, + "learning_rate": 9.328589484079134e-06, + "loss": 1.4405121803283691, + "step": 3150 + }, + { + "epoch": 0.5737689997269501, + "grad_norm": 2.796875, + "learning_rate": 9.327739497395333e-06, + "loss": 1.1899447441101074, + "step": 3152 + }, + { + "epoch": 0.5741330663511423, + "grad_norm": 11.625, + "learning_rate": 9.326889022379253e-06, + "loss": 0.8332608342170715, + "step": 3154 + }, + { + "epoch": 0.5744971329753344, + "grad_norm": 13.5625, + "learning_rate": 9.326038059155689e-06, + "loss": 1.0307068824768066, + "step": 3156 + }, + { + "epoch": 0.5748611995995268, + "grad_norm": 8.625, + "learning_rate": 9.325186607849518e-06, + "loss": 1.2585155963897705, + "step": 3158 + }, + { + "epoch": 0.575225266223719, + "grad_norm": 12.0, + "learning_rate": 9.32433466858569e-06, + "loss": 1.5540335178375244, + "step": 3160 + }, + { + "epoch": 0.5755893328479111, + "grad_norm": 27.375, + "learning_rate": 9.323482241489221e-06, + "loss": 1.4413390159606934, + "step": 3162 + }, + { + "epoch": 0.5759533994721034, + "grad_norm": 73.5, + "learning_rate": 9.322629326685202e-06, + "loss": 0.5993403196334839, + "step": 3164 + }, + { + "epoch": 0.5763174660962956, + "grad_norm": 11.375, + "learning_rate": 9.321775924298794e-06, + "loss": 1.529767394065857, + "step": 3166 + }, + { + "epoch": 0.5766815327204878, + "grad_norm": 6.5, + "learning_rate": 9.320922034455233e-06, + "loss": 1.3970431089401245, + "step": 3168 + }, + { + "epoch": 0.57704559934468, + "grad_norm": 4.875, + "learning_rate": 9.320067657279819e-06, + "loss": 1.2680381536483765, + "step": 3170 + }, + { + "epoch": 0.5774096659688723, + "grad_norm": 16.75, + "learning_rate": 9.319212792897933e-06, + "loss": 1.3864164352416992, + "step": 3172 + }, + { + "epoch": 0.5777737325930645, + "grad_norm": 17.625, + "learning_rate": 9.318357441435021e-06, + "loss": 1.337921380996704, + "step": 3174 + }, + { + "epoch": 0.5781377992172567, + "grad_norm": 23.75, + "learning_rate": 9.317501603016604e-06, + "loss": 1.7786891460418701, + "step": 3176 + }, + { + "epoch": 0.578501865841449, + "grad_norm": 12.375, + "learning_rate": 9.31664527776827e-06, + "loss": 1.1613082885742188, + "step": 3178 + }, + { + "epoch": 0.5788659324656412, + "grad_norm": 11.6875, + "learning_rate": 9.315788465815683e-06, + "loss": 1.5284299850463867, + "step": 3180 + }, + { + "epoch": 0.5792299990898334, + "grad_norm": 7.375, + "learning_rate": 9.314931167284575e-06, + "loss": 1.23740553855896, + "step": 3182 + }, + { + "epoch": 0.5795940657140257, + "grad_norm": 3.96875, + "learning_rate": 9.314073382300754e-06, + "loss": 1.274847388267517, + "step": 3184 + }, + { + "epoch": 0.5799581323382179, + "grad_norm": 6.125, + "learning_rate": 9.313215110990097e-06, + "loss": 1.0943043231964111, + "step": 3186 + }, + { + "epoch": 0.5803221989624101, + "grad_norm": 22.0, + "learning_rate": 9.312356353478547e-06, + "loss": 1.4497488737106323, + "step": 3188 + }, + { + "epoch": 0.5806862655866023, + "grad_norm": 4.4375, + "learning_rate": 9.311497109892127e-06, + "loss": 1.0539054870605469, + "step": 3190 + }, + { + "epoch": 0.5810503322107946, + "grad_norm": 63.25, + "learning_rate": 9.310637380356924e-06, + "loss": 1.0131137371063232, + "step": 3192 + }, + { + "epoch": 0.5814143988349868, + "grad_norm": 14.0625, + "learning_rate": 9.309777164999103e-06, + "loss": 0.5613528490066528, + "step": 3194 + }, + { + "epoch": 0.581778465459179, + "grad_norm": 10.375, + "learning_rate": 9.3089164639449e-06, + "loss": 1.598435878753662, + "step": 3196 + }, + { + "epoch": 0.5821425320833713, + "grad_norm": 7.21875, + "learning_rate": 9.308055277320611e-06, + "loss": 1.3231985569000244, + "step": 3198 + }, + { + "epoch": 0.5825065987075635, + "grad_norm": 14.375, + "learning_rate": 9.30719360525262e-06, + "loss": 1.137812852859497, + "step": 3200 + }, + { + "epoch": 0.5828706653317557, + "grad_norm": 11.4375, + "learning_rate": 9.306331447867369e-06, + "loss": 1.5650297403335571, + "step": 3202 + }, + { + "epoch": 0.583234731955948, + "grad_norm": 7.78125, + "learning_rate": 9.305468805291377e-06, + "loss": 1.8017995357513428, + "step": 3204 + }, + { + "epoch": 0.5835987985801402, + "grad_norm": 4.28125, + "learning_rate": 9.304605677651234e-06, + "loss": 1.0382485389709473, + "step": 3206 + }, + { + "epoch": 0.5839628652043324, + "grad_norm": 10.375, + "learning_rate": 9.3037420650736e-06, + "loss": 1.7024339437484741, + "step": 3208 + }, + { + "epoch": 0.5843269318285246, + "grad_norm": 19.625, + "learning_rate": 9.302877967685209e-06, + "loss": 1.4393378496170044, + "step": 3210 + }, + { + "epoch": 0.5846909984527169, + "grad_norm": 15.0625, + "learning_rate": 9.302013385612858e-06, + "loss": 1.771433711051941, + "step": 3212 + }, + { + "epoch": 0.5850550650769091, + "grad_norm": 6.90625, + "learning_rate": 9.301148318983425e-06, + "loss": 1.4224319458007812, + "step": 3214 + }, + { + "epoch": 0.5854191317011013, + "grad_norm": 11.625, + "learning_rate": 9.300282767923858e-06, + "loss": 1.3252739906311035, + "step": 3216 + }, + { + "epoch": 0.5857831983252936, + "grad_norm": 12.125, + "learning_rate": 9.299416732561169e-06, + "loss": 1.5036948919296265, + "step": 3218 + }, + { + "epoch": 0.5861472649494858, + "grad_norm": 8.125, + "learning_rate": 9.298550213022443e-06, + "loss": 1.2930983304977417, + "step": 3220 + }, + { + "epoch": 0.586511331573678, + "grad_norm": 13.125, + "learning_rate": 9.297683209434842e-06, + "loss": 1.3540523052215576, + "step": 3222 + }, + { + "epoch": 0.5868753981978702, + "grad_norm": 24.25, + "learning_rate": 9.296815721925596e-06, + "loss": 1.4716181755065918, + "step": 3224 + }, + { + "epoch": 0.5872394648220625, + "grad_norm": 7.84375, + "learning_rate": 9.295947750622003e-06, + "loss": 1.4196969270706177, + "step": 3226 + }, + { + "epoch": 0.5876035314462547, + "grad_norm": 26.375, + "learning_rate": 9.295079295651432e-06, + "loss": 1.579772710800171, + "step": 3228 + }, + { + "epoch": 0.5879675980704469, + "grad_norm": 34.25, + "learning_rate": 9.294210357141333e-06, + "loss": 1.7943522930145264, + "step": 3230 + }, + { + "epoch": 0.5883316646946392, + "grad_norm": 28.0, + "learning_rate": 9.29334093521921e-06, + "loss": 1.5552234649658203, + "step": 3232 + }, + { + "epoch": 0.5886957313188314, + "grad_norm": 7.34375, + "learning_rate": 9.292471030012656e-06, + "loss": 1.3278955221176147, + "step": 3234 + }, + { + "epoch": 0.5890597979430235, + "grad_norm": 20.375, + "learning_rate": 9.291600641649319e-06, + "loss": 1.1258277893066406, + "step": 3236 + }, + { + "epoch": 0.5894238645672158, + "grad_norm": 39.75, + "learning_rate": 9.290729770256925e-06, + "loss": 1.332985281944275, + "step": 3238 + }, + { + "epoch": 0.589787931191408, + "grad_norm": 5.25, + "learning_rate": 9.289858415963278e-06, + "loss": 1.2252001762390137, + "step": 3240 + }, + { + "epoch": 0.5901519978156002, + "grad_norm": 9.0625, + "learning_rate": 9.288986578896237e-06, + "loss": 1.2293524742126465, + "step": 3242 + }, + { + "epoch": 0.5905160644397924, + "grad_norm": 12.0625, + "learning_rate": 9.288114259183747e-06, + "loss": 0.7665857076644897, + "step": 3244 + }, + { + "epoch": 0.5908801310639847, + "grad_norm": 6.4375, + "learning_rate": 9.287241456953814e-06, + "loss": 1.017699956893921, + "step": 3246 + }, + { + "epoch": 0.5912441976881769, + "grad_norm": 9.6875, + "learning_rate": 9.28636817233452e-06, + "loss": 1.5378084182739258, + "step": 3248 + }, + { + "epoch": 0.5916082643123691, + "grad_norm": 9.9375, + "learning_rate": 9.285494405454016e-06, + "loss": 1.4946824312210083, + "step": 3250 + }, + { + "epoch": 0.5919723309365614, + "grad_norm": 29.875, + "learning_rate": 9.284620156440523e-06, + "loss": 1.6368951797485352, + "step": 3252 + }, + { + "epoch": 0.5923363975607536, + "grad_norm": 15.1875, + "learning_rate": 9.283745425422332e-06, + "loss": 1.6273603439331055, + "step": 3254 + }, + { + "epoch": 0.5927004641849458, + "grad_norm": 23.875, + "learning_rate": 9.282870212527809e-06, + "loss": 1.7658079862594604, + "step": 3256 + }, + { + "epoch": 0.5930645308091381, + "grad_norm": 12.625, + "learning_rate": 9.281994517885384e-06, + "loss": 1.7158167362213135, + "step": 3258 + }, + { + "epoch": 0.5934285974333303, + "grad_norm": 14.8125, + "learning_rate": 9.281118341623567e-06, + "loss": 1.436009168624878, + "step": 3260 + }, + { + "epoch": 0.5937926640575225, + "grad_norm": 7.21875, + "learning_rate": 9.28024168387093e-06, + "loss": 1.3976292610168457, + "step": 3262 + }, + { + "epoch": 0.5941567306817147, + "grad_norm": 22.125, + "learning_rate": 9.27936454475612e-06, + "loss": 1.0721571445465088, + "step": 3264 + }, + { + "epoch": 0.594520797305907, + "grad_norm": 11.625, + "learning_rate": 9.278486924407853e-06, + "loss": 0.6458092331886292, + "step": 3266 + }, + { + "epoch": 0.5948848639300992, + "grad_norm": 8.3125, + "learning_rate": 9.277608822954914e-06, + "loss": 1.586045742034912, + "step": 3268 + }, + { + "epoch": 0.5952489305542914, + "grad_norm": 4.96875, + "learning_rate": 9.276730240526167e-06, + "loss": 1.1255725622177124, + "step": 3270 + }, + { + "epoch": 0.5956129971784837, + "grad_norm": 14.1875, + "learning_rate": 9.275851177250533e-06, + "loss": 1.4628227949142456, + "step": 3272 + }, + { + "epoch": 0.5959770638026759, + "grad_norm": 28.375, + "learning_rate": 9.274971633257014e-06, + "loss": 1.8455495834350586, + "step": 3274 + }, + { + "epoch": 0.5963411304268681, + "grad_norm": 4.6875, + "learning_rate": 9.274091608674685e-06, + "loss": 0.8988832235336304, + "step": 3276 + }, + { + "epoch": 0.5967051970510604, + "grad_norm": 7.8125, + "learning_rate": 9.273211103632678e-06, + "loss": 1.1780006885528564, + "step": 3278 + }, + { + "epoch": 0.5970692636752526, + "grad_norm": 12.125, + "learning_rate": 9.272330118260207e-06, + "loss": 1.5535714626312256, + "step": 3280 + }, + { + "epoch": 0.5974333302994448, + "grad_norm": 15.875, + "learning_rate": 9.271448652686552e-06, + "loss": 1.5006792545318604, + "step": 3282 + }, + { + "epoch": 0.597797396923637, + "grad_norm": 16.375, + "learning_rate": 9.270566707041067e-06, + "loss": 0.8949829339981079, + "step": 3284 + }, + { + "epoch": 0.5981614635478293, + "grad_norm": 22.25, + "learning_rate": 9.269684281453172e-06, + "loss": 1.794499158859253, + "step": 3286 + }, + { + "epoch": 0.5985255301720215, + "grad_norm": 11.25, + "learning_rate": 9.268801376052358e-06, + "loss": 1.3735519647598267, + "step": 3288 + }, + { + "epoch": 0.5988895967962137, + "grad_norm": 21.5, + "learning_rate": 9.26791799096819e-06, + "loss": 1.250535249710083, + "step": 3290 + }, + { + "epoch": 0.599253663420406, + "grad_norm": 28.75, + "learning_rate": 9.267034126330301e-06, + "loss": 1.287517786026001, + "step": 3292 + }, + { + "epoch": 0.5996177300445982, + "grad_norm": 53.25, + "learning_rate": 9.266149782268395e-06, + "loss": 1.5526132583618164, + "step": 3294 + }, + { + "epoch": 0.5999817966687904, + "grad_norm": 8.625, + "learning_rate": 9.265264958912243e-06, + "loss": 1.298661231994629, + "step": 3296 + }, + { + "epoch": 0.6003458632929826, + "grad_norm": 17.875, + "learning_rate": 9.264379656391694e-06, + "loss": 1.7273969650268555, + "step": 3298 + }, + { + "epoch": 0.6007099299171749, + "grad_norm": 11.6875, + "learning_rate": 9.263493874836656e-06, + "loss": 1.4521604776382446, + "step": 3300 + }, + { + "epoch": 0.6010739965413671, + "grad_norm": 4.75, + "learning_rate": 9.26260761437712e-06, + "loss": 1.0998653173446655, + "step": 3302 + }, + { + "epoch": 0.6014380631655593, + "grad_norm": 9.75, + "learning_rate": 9.26172087514314e-06, + "loss": 1.437475323677063, + "step": 3304 + }, + { + "epoch": 0.6018021297897516, + "grad_norm": 6.5, + "learning_rate": 9.260833657264836e-06, + "loss": 1.114046335220337, + "step": 3306 + }, + { + "epoch": 0.6021661964139438, + "grad_norm": 8.6875, + "learning_rate": 9.259945960872409e-06, + "loss": 1.4759052991867065, + "step": 3308 + }, + { + "epoch": 0.602530263038136, + "grad_norm": 8.875, + "learning_rate": 9.259057786096126e-06, + "loss": 1.436745047569275, + "step": 3310 + }, + { + "epoch": 0.6028943296623283, + "grad_norm": 20.25, + "learning_rate": 9.258169133066322e-06, + "loss": 1.6244862079620361, + "step": 3312 + }, + { + "epoch": 0.6032583962865204, + "grad_norm": 10.0, + "learning_rate": 9.257280001913397e-06, + "loss": 1.7222747802734375, + "step": 3314 + }, + { + "epoch": 0.6036224629107126, + "grad_norm": 3.4375, + "learning_rate": 9.256390392767835e-06, + "loss": 0.9695895910263062, + "step": 3316 + }, + { + "epoch": 0.6039865295349048, + "grad_norm": 3.734375, + "learning_rate": 9.255500305760181e-06, + "loss": 0.9903299808502197, + "step": 3318 + }, + { + "epoch": 0.6043505961590971, + "grad_norm": 6.75, + "learning_rate": 9.254609741021047e-06, + "loss": 1.1582101583480835, + "step": 3320 + }, + { + "epoch": 0.6047146627832893, + "grad_norm": 11.625, + "learning_rate": 9.253718698681127e-06, + "loss": 1.4803917407989502, + "step": 3322 + }, + { + "epoch": 0.6050787294074815, + "grad_norm": 18.375, + "learning_rate": 9.252827178871172e-06, + "loss": 1.4825176000595093, + "step": 3324 + }, + { + "epoch": 0.6054427960316738, + "grad_norm": 29.375, + "learning_rate": 9.251935181722014e-06, + "loss": 1.2989507913589478, + "step": 3326 + }, + { + "epoch": 0.605806862655866, + "grad_norm": 33.75, + "learning_rate": 9.251042707364544e-06, + "loss": 1.761730432510376, + "step": 3328 + }, + { + "epoch": 0.6061709292800582, + "grad_norm": 12.0, + "learning_rate": 9.250149755929733e-06, + "loss": 1.545709490776062, + "step": 3330 + }, + { + "epoch": 0.6065349959042505, + "grad_norm": 138.0, + "learning_rate": 9.249256327548617e-06, + "loss": 1.9335663318634033, + "step": 3332 + }, + { + "epoch": 0.6068990625284427, + "grad_norm": 9.3125, + "learning_rate": 9.248362422352302e-06, + "loss": 1.6645519733428955, + "step": 3334 + }, + { + "epoch": 0.6072631291526349, + "grad_norm": 16.75, + "learning_rate": 9.247468040471968e-06, + "loss": 1.3717955350875854, + "step": 3336 + }, + { + "epoch": 0.6076271957768271, + "grad_norm": 18.125, + "learning_rate": 9.246573182038858e-06, + "loss": 1.4449280500411987, + "step": 3338 + }, + { + "epoch": 0.6079912624010194, + "grad_norm": 152.0, + "learning_rate": 9.24567784718429e-06, + "loss": 1.7102694511413574, + "step": 3340 + }, + { + "epoch": 0.6083553290252116, + "grad_norm": 14.6875, + "learning_rate": 9.244782036039655e-06, + "loss": 1.5718226432800293, + "step": 3342 + }, + { + "epoch": 0.6087193956494038, + "grad_norm": 7.625, + "learning_rate": 9.243885748736404e-06, + "loss": 0.9751600623130798, + "step": 3344 + }, + { + "epoch": 0.6090834622735961, + "grad_norm": 7.15625, + "learning_rate": 9.242988985406065e-06, + "loss": 0.9214791059494019, + "step": 3346 + }, + { + "epoch": 0.6094475288977883, + "grad_norm": 10.0625, + "learning_rate": 9.242091746180237e-06, + "loss": 0.6493130922317505, + "step": 3348 + }, + { + "epoch": 0.6098115955219805, + "grad_norm": 11.9375, + "learning_rate": 9.241194031190581e-06, + "loss": 1.479999303817749, + "step": 3350 + }, + { + "epoch": 0.6101756621461728, + "grad_norm": 9.8125, + "learning_rate": 9.24029584056884e-06, + "loss": 1.6433162689208984, + "step": 3352 + }, + { + "epoch": 0.610539728770365, + "grad_norm": 21.875, + "learning_rate": 9.239397174446815e-06, + "loss": 1.396240472793579, + "step": 3354 + }, + { + "epoch": 0.6109037953945572, + "grad_norm": 11.5, + "learning_rate": 9.238498032956383e-06, + "loss": 1.2425209283828735, + "step": 3356 + }, + { + "epoch": 0.6112678620187494, + "grad_norm": 9.375, + "learning_rate": 9.237598416229487e-06, + "loss": 1.254563808441162, + "step": 3358 + }, + { + "epoch": 0.6116319286429417, + "grad_norm": 15.4375, + "learning_rate": 9.236698324398147e-06, + "loss": 1.3713421821594238, + "step": 3360 + }, + { + "epoch": 0.6119959952671339, + "grad_norm": 15.625, + "learning_rate": 9.235797757594443e-06, + "loss": 1.393410563468933, + "step": 3362 + }, + { + "epoch": 0.6123600618913261, + "grad_norm": 7.40625, + "learning_rate": 9.234896715950534e-06, + "loss": 1.4773261547088623, + "step": 3364 + }, + { + "epoch": 0.6127241285155184, + "grad_norm": 12.375, + "learning_rate": 9.23399519959864e-06, + "loss": 1.3584930896759033, + "step": 3366 + }, + { + "epoch": 0.6130881951397106, + "grad_norm": 17.25, + "learning_rate": 9.233093208671058e-06, + "loss": 1.3461037874221802, + "step": 3368 + }, + { + "epoch": 0.6134522617639028, + "grad_norm": 10.0625, + "learning_rate": 9.23219074330015e-06, + "loss": 1.556216835975647, + "step": 3370 + }, + { + "epoch": 0.613816328388095, + "grad_norm": 12.75, + "learning_rate": 9.231287803618347e-06, + "loss": 1.533297061920166, + "step": 3372 + }, + { + "epoch": 0.6141803950122873, + "grad_norm": 17.125, + "learning_rate": 9.230384389758155e-06, + "loss": 1.4484983682632446, + "step": 3374 + }, + { + "epoch": 0.6145444616364795, + "grad_norm": 15.1875, + "learning_rate": 9.229480501852148e-06, + "loss": 1.4798980951309204, + "step": 3376 + }, + { + "epoch": 0.6149085282606717, + "grad_norm": 6.25, + "learning_rate": 9.228576140032963e-06, + "loss": 0.9490472674369812, + "step": 3378 + }, + { + "epoch": 0.615272594884864, + "grad_norm": 13.4375, + "learning_rate": 9.227671304433315e-06, + "loss": 1.0812265872955322, + "step": 3380 + }, + { + "epoch": 0.6156366615090562, + "grad_norm": 16.0, + "learning_rate": 9.226765995185983e-06, + "loss": 1.0332449674606323, + "step": 3382 + }, + { + "epoch": 0.6160007281332484, + "grad_norm": 29.875, + "learning_rate": 9.225860212423816e-06, + "loss": 1.5068589448928833, + "step": 3384 + }, + { + "epoch": 0.6163647947574407, + "grad_norm": 11.625, + "learning_rate": 9.224953956279739e-06, + "loss": 1.6702684164047241, + "step": 3386 + }, + { + "epoch": 0.6167288613816329, + "grad_norm": 7.09375, + "learning_rate": 9.224047226886737e-06, + "loss": 1.2706372737884521, + "step": 3388 + }, + { + "epoch": 0.617092928005825, + "grad_norm": 18.625, + "learning_rate": 9.223140024377872e-06, + "loss": 1.6000615358352661, + "step": 3390 + }, + { + "epoch": 0.6174569946300172, + "grad_norm": 17.375, + "learning_rate": 9.222232348886268e-06, + "loss": 1.853668212890625, + "step": 3392 + }, + { + "epoch": 0.6178210612542095, + "grad_norm": 12.3125, + "learning_rate": 9.221324200545128e-06, + "loss": 1.2229450941085815, + "step": 3394 + }, + { + "epoch": 0.6181851278784017, + "grad_norm": 9.125, + "learning_rate": 9.220415579487716e-06, + "loss": 0.6639102101325989, + "step": 3396 + }, + { + "epoch": 0.6185491945025939, + "grad_norm": 12.625, + "learning_rate": 9.219506485847367e-06, + "loss": 1.2761417627334595, + "step": 3398 + }, + { + "epoch": 0.6189132611267862, + "grad_norm": 11.125, + "learning_rate": 9.21859691975749e-06, + "loss": 1.670899510383606, + "step": 3400 + }, + { + "epoch": 0.6192773277509784, + "grad_norm": 6.78125, + "learning_rate": 9.21768688135156e-06, + "loss": 0.8308597207069397, + "step": 3402 + }, + { + "epoch": 0.6196413943751706, + "grad_norm": 11.25, + "learning_rate": 9.216776370763118e-06, + "loss": 1.4510284662246704, + "step": 3404 + }, + { + "epoch": 0.6200054609993629, + "grad_norm": 3.09375, + "learning_rate": 9.215865388125782e-06, + "loss": 0.9976418614387512, + "step": 3406 + }, + { + "epoch": 0.6203695276235551, + "grad_norm": 17.125, + "learning_rate": 9.214953933573232e-06, + "loss": 1.0372593402862549, + "step": 3408 + }, + { + "epoch": 0.6207335942477473, + "grad_norm": 15.875, + "learning_rate": 9.214042007239223e-06, + "loss": 1.172553539276123, + "step": 3410 + }, + { + "epoch": 0.6210976608719395, + "grad_norm": 7.9375, + "learning_rate": 9.213129609257574e-06, + "loss": 1.231721043586731, + "step": 3412 + }, + { + "epoch": 0.6214617274961318, + "grad_norm": 7.0, + "learning_rate": 9.212216739762174e-06, + "loss": 1.1757313013076782, + "step": 3414 + }, + { + "epoch": 0.621825794120324, + "grad_norm": 262.0, + "learning_rate": 9.211303398886988e-06, + "loss": 1.4580628871917725, + "step": 3416 + }, + { + "epoch": 0.6221898607445162, + "grad_norm": 9.25, + "learning_rate": 9.210389586766042e-06, + "loss": 1.1615405082702637, + "step": 3418 + }, + { + "epoch": 0.6225539273687085, + "grad_norm": 3.3125, + "learning_rate": 9.209475303533435e-06, + "loss": 0.8812665939331055, + "step": 3420 + }, + { + "epoch": 0.6229179939929007, + "grad_norm": 18.375, + "learning_rate": 9.208560549323334e-06, + "loss": 1.1766237020492554, + "step": 3422 + }, + { + "epoch": 0.6232820606170929, + "grad_norm": 12.75, + "learning_rate": 9.207645324269977e-06, + "loss": 1.4328970909118652, + "step": 3424 + }, + { + "epoch": 0.6236461272412852, + "grad_norm": 5.90625, + "learning_rate": 9.206729628507665e-06, + "loss": 1.4967670440673828, + "step": 3426 + }, + { + "epoch": 0.6240101938654774, + "grad_norm": 11.75, + "learning_rate": 9.205813462170776e-06, + "loss": 1.3680963516235352, + "step": 3428 + }, + { + "epoch": 0.6243742604896696, + "grad_norm": 20.75, + "learning_rate": 9.204896825393754e-06, + "loss": 1.1024887561798096, + "step": 3430 + }, + { + "epoch": 0.6247383271138618, + "grad_norm": 8.8125, + "learning_rate": 9.203979718311113e-06, + "loss": 0.7795675992965698, + "step": 3432 + }, + { + "epoch": 0.6251023937380541, + "grad_norm": 7.8125, + "learning_rate": 9.203062141057431e-06, + "loss": 1.5429625511169434, + "step": 3434 + }, + { + "epoch": 0.6254664603622463, + "grad_norm": 11.125, + "learning_rate": 9.202144093767362e-06, + "loss": 1.4142343997955322, + "step": 3436 + }, + { + "epoch": 0.6258305269864385, + "grad_norm": 15.75, + "learning_rate": 9.201225576575623e-06, + "loss": 1.8705246448516846, + "step": 3438 + }, + { + "epoch": 0.6261945936106308, + "grad_norm": 10.375, + "learning_rate": 9.200306589617006e-06, + "loss": 1.3505696058273315, + "step": 3440 + }, + { + "epoch": 0.626558660234823, + "grad_norm": 27.5, + "learning_rate": 9.199387133026365e-06, + "loss": 1.9452415704727173, + "step": 3442 + }, + { + "epoch": 0.6269227268590152, + "grad_norm": 11.0625, + "learning_rate": 9.198467206938628e-06, + "loss": 1.3088587522506714, + "step": 3444 + }, + { + "epoch": 0.6272867934832074, + "grad_norm": 82.5, + "learning_rate": 9.197546811488794e-06, + "loss": 1.4850128889083862, + "step": 3446 + }, + { + "epoch": 0.6276508601073997, + "grad_norm": 15.375, + "learning_rate": 9.196625946811918e-06, + "loss": 1.3140945434570312, + "step": 3448 + }, + { + "epoch": 0.6280149267315919, + "grad_norm": 13.25, + "learning_rate": 9.195704613043143e-06, + "loss": 0.8738912343978882, + "step": 3450 + }, + { + "epoch": 0.6283789933557841, + "grad_norm": 8.875, + "learning_rate": 9.194782810317667e-06, + "loss": 1.1542036533355713, + "step": 3452 + }, + { + "epoch": 0.6287430599799764, + "grad_norm": 11.125, + "learning_rate": 9.19386053877076e-06, + "loss": 1.4024494886398315, + "step": 3454 + }, + { + "epoch": 0.6291071266041686, + "grad_norm": 13.25, + "learning_rate": 9.192937798537764e-06, + "loss": 1.2433480024337769, + "step": 3456 + }, + { + "epoch": 0.6294711932283608, + "grad_norm": 11.9375, + "learning_rate": 9.192014589754083e-06, + "loss": 1.5170204639434814, + "step": 3458 + }, + { + "epoch": 0.6298352598525531, + "grad_norm": 9.9375, + "learning_rate": 9.191090912555201e-06, + "loss": 1.5104825496673584, + "step": 3460 + }, + { + "epoch": 0.6301993264767453, + "grad_norm": 11.1875, + "learning_rate": 9.190166767076658e-06, + "loss": 1.1128158569335938, + "step": 3462 + }, + { + "epoch": 0.6305633931009375, + "grad_norm": 15.625, + "learning_rate": 9.18924215345407e-06, + "loss": 1.4777809381484985, + "step": 3464 + }, + { + "epoch": 0.6309274597251296, + "grad_norm": 6.78125, + "learning_rate": 9.18831707182312e-06, + "loss": 1.2820425033569336, + "step": 3466 + }, + { + "epoch": 0.631291526349322, + "grad_norm": 5.28125, + "learning_rate": 9.187391522319562e-06, + "loss": 1.0210084915161133, + "step": 3468 + }, + { + "epoch": 0.6316555929735141, + "grad_norm": 10.1875, + "learning_rate": 9.186465505079216e-06, + "loss": 1.4332205057144165, + "step": 3470 + }, + { + "epoch": 0.6320196595977063, + "grad_norm": 14.0, + "learning_rate": 9.18553902023797e-06, + "loss": 1.7701436281204224, + "step": 3472 + }, + { + "epoch": 0.6323837262218986, + "grad_norm": 20.125, + "learning_rate": 9.184612067931784e-06, + "loss": 1.5954891443252563, + "step": 3474 + }, + { + "epoch": 0.6327477928460908, + "grad_norm": 12.25, + "learning_rate": 9.183684648296683e-06, + "loss": 1.7572270631790161, + "step": 3476 + }, + { + "epoch": 0.633111859470283, + "grad_norm": 10.4375, + "learning_rate": 9.182756761468761e-06, + "loss": 1.467654824256897, + "step": 3478 + }, + { + "epoch": 0.6334759260944753, + "grad_norm": 19.0, + "learning_rate": 9.181828407584181e-06, + "loss": 1.4725662469863892, + "step": 3480 + }, + { + "epoch": 0.6338399927186675, + "grad_norm": 12.25, + "learning_rate": 9.18089958677918e-06, + "loss": 1.3248523473739624, + "step": 3482 + }, + { + "epoch": 0.6342040593428597, + "grad_norm": 4.65625, + "learning_rate": 9.179970299190055e-06, + "loss": 1.4300415515899658, + "step": 3484 + }, + { + "epoch": 0.6345681259670519, + "grad_norm": 7.4375, + "learning_rate": 9.179040544953176e-06, + "loss": 0.9890764355659485, + "step": 3486 + }, + { + "epoch": 0.6349321925912442, + "grad_norm": 107.0, + "learning_rate": 9.178110324204981e-06, + "loss": 1.3034687042236328, + "step": 3488 + }, + { + "epoch": 0.6352962592154364, + "grad_norm": 10.25, + "learning_rate": 9.177179637081974e-06, + "loss": 0.8682063817977905, + "step": 3490 + }, + { + "epoch": 0.6356603258396286, + "grad_norm": 10.1875, + "learning_rate": 9.176248483720731e-06, + "loss": 1.4274332523345947, + "step": 3492 + }, + { + "epoch": 0.6360243924638209, + "grad_norm": 7.625, + "learning_rate": 9.175316864257896e-06, + "loss": 1.0677056312561035, + "step": 3494 + }, + { + "epoch": 0.6363884590880131, + "grad_norm": 7.15625, + "learning_rate": 9.17438477883018e-06, + "loss": 1.1895233392715454, + "step": 3496 + }, + { + "epoch": 0.6367525257122053, + "grad_norm": 12.25, + "learning_rate": 9.173452227574365e-06, + "loss": 1.4755933284759521, + "step": 3498 + }, + { + "epoch": 0.6371165923363976, + "grad_norm": 42.25, + "learning_rate": 9.172519210627293e-06, + "loss": 1.356042742729187, + "step": 3500 + }, + { + "epoch": 0.6374806589605898, + "grad_norm": 13.6875, + "learning_rate": 9.171585728125886e-06, + "loss": 1.1299794912338257, + "step": 3502 + }, + { + "epoch": 0.637844725584782, + "grad_norm": 5.9375, + "learning_rate": 9.170651780207123e-06, + "loss": 1.3135566711425781, + "step": 3504 + }, + { + "epoch": 0.6382087922089742, + "grad_norm": 10.5, + "learning_rate": 9.169717367008064e-06, + "loss": 1.4078896045684814, + "step": 3506 + }, + { + "epoch": 0.6385728588331665, + "grad_norm": 9.75, + "learning_rate": 9.168782488665827e-06, + "loss": 1.4428621530532837, + "step": 3508 + }, + { + "epoch": 0.6389369254573587, + "grad_norm": 6.25, + "learning_rate": 9.167847145317602e-06, + "loss": 1.2242130041122437, + "step": 3510 + }, + { + "epoch": 0.6393009920815509, + "grad_norm": 9.0625, + "learning_rate": 9.166911337100643e-06, + "loss": 1.3666901588439941, + "step": 3512 + }, + { + "epoch": 0.6396650587057432, + "grad_norm": 9.25, + "learning_rate": 9.165975064152283e-06, + "loss": 1.274733304977417, + "step": 3514 + }, + { + "epoch": 0.6400291253299354, + "grad_norm": 13.0, + "learning_rate": 9.165038326609913e-06, + "loss": 1.3937983512878418, + "step": 3516 + }, + { + "epoch": 0.6403931919541276, + "grad_norm": 13.5, + "learning_rate": 9.164101124610993e-06, + "loss": 1.6175740957260132, + "step": 3518 + }, + { + "epoch": 0.6407572585783198, + "grad_norm": 26.125, + "learning_rate": 9.163163458293059e-06, + "loss": 1.2710429430007935, + "step": 3520 + }, + { + "epoch": 0.6411213252025121, + "grad_norm": 18.125, + "learning_rate": 9.162225327793706e-06, + "loss": 1.8064460754394531, + "step": 3522 + }, + { + "epoch": 0.6414853918267043, + "grad_norm": 9.625, + "learning_rate": 9.161286733250601e-06, + "loss": 1.56912100315094, + "step": 3524 + }, + { + "epoch": 0.6418494584508965, + "grad_norm": 7.625, + "learning_rate": 9.160347674801477e-06, + "loss": 1.2091684341430664, + "step": 3526 + }, + { + "epoch": 0.6422135250750888, + "grad_norm": 10.1875, + "learning_rate": 9.159408152584143e-06, + "loss": 1.3263113498687744, + "step": 3528 + }, + { + "epoch": 0.642577591699281, + "grad_norm": 11.125, + "learning_rate": 9.158468166736465e-06, + "loss": 1.3968076705932617, + "step": 3530 + }, + { + "epoch": 0.6429416583234732, + "grad_norm": 6.0625, + "learning_rate": 9.157527717396383e-06, + "loss": 1.2390064001083374, + "step": 3532 + }, + { + "epoch": 0.6433057249476655, + "grad_norm": 11.0, + "learning_rate": 9.156586804701908e-06, + "loss": 1.1571060419082642, + "step": 3534 + }, + { + "epoch": 0.6436697915718577, + "grad_norm": 18.75, + "learning_rate": 9.15564542879111e-06, + "loss": 1.441100001335144, + "step": 3536 + }, + { + "epoch": 0.6440338581960499, + "grad_norm": 13.6875, + "learning_rate": 9.154703589802132e-06, + "loss": 1.4943218231201172, + "step": 3538 + }, + { + "epoch": 0.644397924820242, + "grad_norm": 6.875, + "learning_rate": 9.153761287873189e-06, + "loss": 1.366776466369629, + "step": 3540 + }, + { + "epoch": 0.6447619914444344, + "grad_norm": 7.59375, + "learning_rate": 9.152818523142557e-06, + "loss": 1.526563286781311, + "step": 3542 + }, + { + "epoch": 0.6451260580686266, + "grad_norm": 8.25, + "learning_rate": 9.151875295748587e-06, + "loss": 1.320901870727539, + "step": 3544 + }, + { + "epoch": 0.6454901246928187, + "grad_norm": 16.125, + "learning_rate": 9.150931605829688e-06, + "loss": 1.2511060237884521, + "step": 3546 + }, + { + "epoch": 0.645854191317011, + "grad_norm": 12.375, + "learning_rate": 9.149987453524345e-06, + "loss": 1.428252100944519, + "step": 3548 + }, + { + "epoch": 0.6462182579412032, + "grad_norm": 8.5, + "learning_rate": 9.149042838971111e-06, + "loss": 1.5096672773361206, + "step": 3550 + }, + { + "epoch": 0.6465823245653954, + "grad_norm": 20.5, + "learning_rate": 9.1480977623086e-06, + "loss": 1.6028022766113281, + "step": 3552 + }, + { + "epoch": 0.6469463911895877, + "grad_norm": 5.21875, + "learning_rate": 9.147152223675504e-06, + "loss": 1.3338195085525513, + "step": 3554 + }, + { + "epoch": 0.6473104578137799, + "grad_norm": 9.5, + "learning_rate": 9.146206223210572e-06, + "loss": 1.3280006647109985, + "step": 3556 + }, + { + "epoch": 0.6476745244379721, + "grad_norm": 3.234375, + "learning_rate": 9.145259761052625e-06, + "loss": 1.1066222190856934, + "step": 3558 + }, + { + "epoch": 0.6480385910621643, + "grad_norm": 25.125, + "learning_rate": 9.144312837340557e-06, + "loss": 1.0565403699874878, + "step": 3560 + }, + { + "epoch": 0.6484026576863566, + "grad_norm": 12.4375, + "learning_rate": 9.143365452213322e-06, + "loss": 0.8064821362495422, + "step": 3562 + }, + { + "epoch": 0.6487667243105488, + "grad_norm": 5.09375, + "learning_rate": 9.142417605809945e-06, + "loss": 1.0452728271484375, + "step": 3564 + }, + { + "epoch": 0.649130790934741, + "grad_norm": 15.6875, + "learning_rate": 9.141469298269522e-06, + "loss": 1.308931827545166, + "step": 3566 + }, + { + "epoch": 0.6494948575589333, + "grad_norm": 18.0, + "learning_rate": 9.14052052973121e-06, + "loss": 1.3293876647949219, + "step": 3568 + }, + { + "epoch": 0.6498589241831255, + "grad_norm": 8.625, + "learning_rate": 9.139571300334238e-06, + "loss": 1.2970260381698608, + "step": 3570 + }, + { + "epoch": 0.6502229908073177, + "grad_norm": 2.625, + "learning_rate": 9.138621610217899e-06, + "loss": 1.0420702695846558, + "step": 3572 + }, + { + "epoch": 0.65058705743151, + "grad_norm": 8.375, + "learning_rate": 9.13767145952156e-06, + "loss": 0.9667816162109375, + "step": 3574 + }, + { + "epoch": 0.6509511240557022, + "grad_norm": 16.625, + "learning_rate": 9.13672084838465e-06, + "loss": 1.4660228490829468, + "step": 3576 + }, + { + "epoch": 0.6513151906798944, + "grad_norm": 30.625, + "learning_rate": 9.135769776946666e-06, + "loss": 1.392289400100708, + "step": 3578 + }, + { + "epoch": 0.6516792573040866, + "grad_norm": 15.1875, + "learning_rate": 9.134818245347176e-06, + "loss": 1.5141706466674805, + "step": 3580 + }, + { + "epoch": 0.6520433239282789, + "grad_norm": 14.625, + "learning_rate": 9.133866253725813e-06, + "loss": 1.7205846309661865, + "step": 3582 + }, + { + "epoch": 0.6524073905524711, + "grad_norm": 10.3125, + "learning_rate": 9.132913802222278e-06, + "loss": 1.2909421920776367, + "step": 3584 + }, + { + "epoch": 0.6527714571766633, + "grad_norm": 12.25, + "learning_rate": 9.131960890976341e-06, + "loss": 1.3672901391983032, + "step": 3586 + }, + { + "epoch": 0.6531355238008556, + "grad_norm": 28.0, + "learning_rate": 9.131007520127836e-06, + "loss": 1.5591942071914673, + "step": 3588 + }, + { + "epoch": 0.6534995904250478, + "grad_norm": 78.0, + "learning_rate": 9.130053689816662e-06, + "loss": 1.5478873252868652, + "step": 3590 + }, + { + "epoch": 0.65386365704924, + "grad_norm": 7.46875, + "learning_rate": 9.129099400182797e-06, + "loss": 1.6789886951446533, + "step": 3592 + }, + { + "epoch": 0.6542277236734323, + "grad_norm": 11.0625, + "learning_rate": 9.128144651366277e-06, + "loss": 1.4372708797454834, + "step": 3594 + }, + { + "epoch": 0.6545917902976245, + "grad_norm": 11.3125, + "learning_rate": 9.127189443507205e-06, + "loss": 1.0014150142669678, + "step": 3596 + }, + { + "epoch": 0.6549558569218167, + "grad_norm": 39.0, + "learning_rate": 9.126233776745756e-06, + "loss": 1.4076837301254272, + "step": 3598 + }, + { + "epoch": 0.6553199235460089, + "grad_norm": 6.28125, + "learning_rate": 9.125277651222168e-06, + "loss": 1.1437819004058838, + "step": 3600 + }, + { + "epoch": 0.6556839901702012, + "grad_norm": 11.25, + "learning_rate": 9.124321067076753e-06, + "loss": 1.3973109722137451, + "step": 3602 + }, + { + "epoch": 0.6560480567943934, + "grad_norm": 11.0, + "learning_rate": 9.123364024449883e-06, + "loss": 1.3617955446243286, + "step": 3604 + }, + { + "epoch": 0.6564121234185856, + "grad_norm": 23.0, + "learning_rate": 9.122406523482e-06, + "loss": 1.5899008512496948, + "step": 3606 + }, + { + "epoch": 0.6567761900427779, + "grad_norm": 19.375, + "learning_rate": 9.121448564313612e-06, + "loss": 1.8046138286590576, + "step": 3608 + }, + { + "epoch": 0.6571402566669701, + "grad_norm": 9.1875, + "learning_rate": 9.120490147085299e-06, + "loss": 1.3494240045547485, + "step": 3610 + }, + { + "epoch": 0.6575043232911623, + "grad_norm": 7.5625, + "learning_rate": 9.119531271937703e-06, + "loss": 1.0726373195648193, + "step": 3612 + }, + { + "epoch": 0.6578683899153545, + "grad_norm": 6.53125, + "learning_rate": 9.118571939011535e-06, + "loss": 0.8167750239372253, + "step": 3614 + }, + { + "epoch": 0.6582324565395468, + "grad_norm": 10.1875, + "learning_rate": 9.117612148447574e-06, + "loss": 1.3734815120697021, + "step": 3616 + }, + { + "epoch": 0.658596523163739, + "grad_norm": 13.875, + "learning_rate": 9.116651900386665e-06, + "loss": 1.6158146858215332, + "step": 3618 + }, + { + "epoch": 0.6589605897879312, + "grad_norm": 21.875, + "learning_rate": 9.115691194969719e-06, + "loss": 1.5301512479782104, + "step": 3620 + }, + { + "epoch": 0.6593246564121235, + "grad_norm": 5.1875, + "learning_rate": 9.114730032337717e-06, + "loss": 0.2065422236919403, + "step": 3622 + }, + { + "epoch": 0.6596887230363156, + "grad_norm": 18.75, + "learning_rate": 9.113768412631705e-06, + "loss": 1.3237202167510986, + "step": 3624 + }, + { + "epoch": 0.6600527896605078, + "grad_norm": 17.625, + "learning_rate": 9.112806335992798e-06, + "loss": 1.7446810007095337, + "step": 3626 + }, + { + "epoch": 0.6604168562847001, + "grad_norm": 21.5, + "learning_rate": 9.111843802562178e-06, + "loss": 1.4303135871887207, + "step": 3628 + }, + { + "epoch": 0.6607809229088923, + "grad_norm": 11.0, + "learning_rate": 9.110880812481089e-06, + "loss": 1.577548861503601, + "step": 3630 + }, + { + "epoch": 0.6611449895330845, + "grad_norm": 11.8125, + "learning_rate": 9.10991736589085e-06, + "loss": 1.5449628829956055, + "step": 3632 + }, + { + "epoch": 0.6615090561572767, + "grad_norm": 10.625, + "learning_rate": 9.108953462932839e-06, + "loss": 1.6511759757995605, + "step": 3634 + }, + { + "epoch": 0.661873122781469, + "grad_norm": 18.625, + "learning_rate": 9.107989103748508e-06, + "loss": 1.8719981908798218, + "step": 3636 + }, + { + "epoch": 0.6622371894056612, + "grad_norm": 18.75, + "learning_rate": 9.107024288479371e-06, + "loss": 1.4480392932891846, + "step": 3638 + }, + { + "epoch": 0.6626012560298534, + "grad_norm": 21.375, + "learning_rate": 9.106059017267013e-06, + "loss": 1.9537347555160522, + "step": 3640 + }, + { + "epoch": 0.6629653226540457, + "grad_norm": 17.75, + "learning_rate": 9.10509329025308e-06, + "loss": 1.4034967422485352, + "step": 3642 + }, + { + "epoch": 0.6633293892782379, + "grad_norm": 20.75, + "learning_rate": 9.10412710757929e-06, + "loss": 1.2126206159591675, + "step": 3644 + }, + { + "epoch": 0.6636934559024301, + "grad_norm": 9.625, + "learning_rate": 9.10316046938743e-06, + "loss": 0.8758245706558228, + "step": 3646 + }, + { + "epoch": 0.6640575225266224, + "grad_norm": 14.0625, + "learning_rate": 9.102193375819344e-06, + "loss": 1.525219440460205, + "step": 3648 + }, + { + "epoch": 0.6644215891508146, + "grad_norm": 18.25, + "learning_rate": 9.101225827016956e-06, + "loss": 2.0014102458953857, + "step": 3650 + }, + { + "epoch": 0.6647856557750068, + "grad_norm": 9.25, + "learning_rate": 9.100257823122241e-06, + "loss": 1.4483447074890137, + "step": 3652 + }, + { + "epoch": 0.665149722399199, + "grad_norm": 13.125, + "learning_rate": 9.099289364277257e-06, + "loss": 1.418277382850647, + "step": 3654 + }, + { + "epoch": 0.6655137890233913, + "grad_norm": 12.9375, + "learning_rate": 9.09832045062412e-06, + "loss": 1.392439842224121, + "step": 3656 + }, + { + "epoch": 0.6658778556475835, + "grad_norm": 16.5, + "learning_rate": 9.097351082305012e-06, + "loss": 1.9772604703903198, + "step": 3658 + }, + { + "epoch": 0.6662419222717757, + "grad_norm": 11.3125, + "learning_rate": 9.096381259462187e-06, + "loss": 1.629359483718872, + "step": 3660 + }, + { + "epoch": 0.666605988895968, + "grad_norm": 3.640625, + "learning_rate": 9.095410982237957e-06, + "loss": 0.9926884174346924, + "step": 3662 + }, + { + "epoch": 0.6669700555201602, + "grad_norm": 10.125, + "learning_rate": 9.094440250774712e-06, + "loss": 1.2300328016281128, + "step": 3664 + }, + { + "epoch": 0.6673341221443524, + "grad_norm": 9.875, + "learning_rate": 9.093469065214903e-06, + "loss": 1.2856507301330566, + "step": 3666 + }, + { + "epoch": 0.6676981887685447, + "grad_norm": 30.0, + "learning_rate": 9.092497425701043e-06, + "loss": 1.8362234830856323, + "step": 3668 + }, + { + "epoch": 0.6680622553927369, + "grad_norm": 15.0625, + "learning_rate": 9.09152533237572e-06, + "loss": 1.8722002506256104, + "step": 3670 + }, + { + "epoch": 0.6684263220169291, + "grad_norm": 37.75, + "learning_rate": 9.09055278538158e-06, + "loss": 1.195242166519165, + "step": 3672 + }, + { + "epoch": 0.6687903886411213, + "grad_norm": 21.0, + "learning_rate": 9.089579784861348e-06, + "loss": 1.375098466873169, + "step": 3674 + }, + { + "epoch": 0.6691544552653136, + "grad_norm": 14.0, + "learning_rate": 9.088606330957803e-06, + "loss": 1.2577656507492065, + "step": 3676 + }, + { + "epoch": 0.6695185218895058, + "grad_norm": 10.375, + "learning_rate": 9.087632423813792e-06, + "loss": 1.4363480806350708, + "step": 3678 + }, + { + "epoch": 0.669882588513698, + "grad_norm": 23.375, + "learning_rate": 9.08665806357224e-06, + "loss": 1.4257678985595703, + "step": 3680 + }, + { + "epoch": 0.6702466551378903, + "grad_norm": 6.5, + "learning_rate": 9.085683250376124e-06, + "loss": 1.5641443729400635, + "step": 3682 + }, + { + "epoch": 0.6706107217620825, + "grad_norm": 23.0, + "learning_rate": 9.084707984368496e-06, + "loss": 1.6387417316436768, + "step": 3684 + }, + { + "epoch": 0.6709747883862747, + "grad_norm": 36.5, + "learning_rate": 9.083732265692475e-06, + "loss": 2.193417549133301, + "step": 3686 + }, + { + "epoch": 0.6713388550104669, + "grad_norm": 15.375, + "learning_rate": 9.08275609449124e-06, + "loss": 1.682459831237793, + "step": 3688 + }, + { + "epoch": 0.6717029216346592, + "grad_norm": 27.375, + "learning_rate": 9.08177947090804e-06, + "loss": 1.7272855043411255, + "step": 3690 + }, + { + "epoch": 0.6720669882588514, + "grad_norm": 42.5, + "learning_rate": 9.080802395086194e-06, + "loss": 0.8551642894744873, + "step": 3692 + }, + { + "epoch": 0.6724310548830436, + "grad_norm": 9.8125, + "learning_rate": 9.079824867169082e-06, + "loss": 1.4094687700271606, + "step": 3694 + }, + { + "epoch": 0.6727951215072359, + "grad_norm": 10.125, + "learning_rate": 9.078846887300153e-06, + "loss": 0.6505305171012878, + "step": 3696 + }, + { + "epoch": 0.673159188131428, + "grad_norm": 18.0, + "learning_rate": 9.077868455622918e-06, + "loss": 1.4937613010406494, + "step": 3698 + }, + { + "epoch": 0.6735232547556202, + "grad_norm": 12.125, + "learning_rate": 9.076889572280961e-06, + "loss": 1.5752208232879639, + "step": 3700 + }, + { + "epoch": 0.6738873213798126, + "grad_norm": 17.5, + "learning_rate": 9.07591023741793e-06, + "loss": 1.1342289447784424, + "step": 3702 + }, + { + "epoch": 0.6742513880040047, + "grad_norm": 12.125, + "learning_rate": 9.074930451177538e-06, + "loss": 0.5768145322799683, + "step": 3704 + }, + { + "epoch": 0.6746154546281969, + "grad_norm": 41.0, + "learning_rate": 9.073950213703561e-06, + "loss": 1.5090428590774536, + "step": 3706 + }, + { + "epoch": 0.6749795212523891, + "grad_norm": 12.25, + "learning_rate": 9.072969525139849e-06, + "loss": 1.757763385772705, + "step": 3708 + }, + { + "epoch": 0.6753435878765814, + "grad_norm": 6.8125, + "learning_rate": 9.071988385630316e-06, + "loss": 1.5267927646636963, + "step": 3710 + }, + { + "epoch": 0.6757076545007736, + "grad_norm": 16.75, + "learning_rate": 9.071006795318933e-06, + "loss": 1.0630428791046143, + "step": 3712 + }, + { + "epoch": 0.6760717211249658, + "grad_norm": 11.875, + "learning_rate": 9.07002475434975e-06, + "loss": 1.157039761543274, + "step": 3714 + }, + { + "epoch": 0.6764357877491581, + "grad_norm": 18.0, + "learning_rate": 9.069042262866876e-06, + "loss": 1.0162842273712158, + "step": 3716 + }, + { + "epoch": 0.6767998543733503, + "grad_norm": 16.25, + "learning_rate": 9.068059321014489e-06, + "loss": 1.2372663021087646, + "step": 3718 + }, + { + "epoch": 0.6771639209975425, + "grad_norm": 8.6875, + "learning_rate": 9.067075928936829e-06, + "loss": 1.5702767372131348, + "step": 3720 + }, + { + "epoch": 0.6775279876217348, + "grad_norm": 16.875, + "learning_rate": 9.066092086778205e-06, + "loss": 2.2045841217041016, + "step": 3722 + }, + { + "epoch": 0.677892054245927, + "grad_norm": 4.5625, + "learning_rate": 9.065107794682994e-06, + "loss": 1.0491530895233154, + "step": 3724 + }, + { + "epoch": 0.6782561208701192, + "grad_norm": 6.25, + "learning_rate": 9.064123052795636e-06, + "loss": 1.4416320323944092, + "step": 3726 + }, + { + "epoch": 0.6786201874943114, + "grad_norm": 12.5, + "learning_rate": 9.063137861260639e-06, + "loss": 1.3921982049942017, + "step": 3728 + }, + { + "epoch": 0.6789842541185037, + "grad_norm": 14.125, + "learning_rate": 9.062152220222572e-06, + "loss": 1.5467690229415894, + "step": 3730 + }, + { + "epoch": 0.6793483207426959, + "grad_norm": 8.0, + "learning_rate": 9.06116612982608e-06, + "loss": 1.4096386432647705, + "step": 3732 + }, + { + "epoch": 0.6797123873668881, + "grad_norm": 6.40625, + "learning_rate": 9.060179590215862e-06, + "loss": 1.107706904411316, + "step": 3734 + }, + { + "epoch": 0.6800764539910804, + "grad_norm": 8.8125, + "learning_rate": 9.059192601536691e-06, + "loss": 1.1766926050186157, + "step": 3736 + }, + { + "epoch": 0.6804405206152726, + "grad_norm": 4.65625, + "learning_rate": 9.058205163933404e-06, + "loss": 0.695428192615509, + "step": 3738 + }, + { + "epoch": 0.6808045872394648, + "grad_norm": 20.5, + "learning_rate": 9.057217277550903e-06, + "loss": 1.3012020587921143, + "step": 3740 + }, + { + "epoch": 0.6811686538636571, + "grad_norm": 22.625, + "learning_rate": 9.056228942534158e-06, + "loss": 0.7615830302238464, + "step": 3742 + }, + { + "epoch": 0.6815327204878493, + "grad_norm": 17.125, + "learning_rate": 9.055240159028198e-06, + "loss": 1.4642246961593628, + "step": 3744 + }, + { + "epoch": 0.6818967871120415, + "grad_norm": 7.34375, + "learning_rate": 9.05425092717813e-06, + "loss": 1.075600266456604, + "step": 3746 + }, + { + "epoch": 0.6822608537362337, + "grad_norm": 16.625, + "learning_rate": 9.053261247129113e-06, + "loss": 1.5341835021972656, + "step": 3748 + }, + { + "epoch": 0.682624920360426, + "grad_norm": 12.125, + "learning_rate": 9.052271119026383e-06, + "loss": 1.0074820518493652, + "step": 3750 + }, + { + "epoch": 0.6829889869846182, + "grad_norm": 11.9375, + "learning_rate": 9.051280543015238e-06, + "loss": 1.3445367813110352, + "step": 3752 + }, + { + "epoch": 0.6833530536088104, + "grad_norm": 10.625, + "learning_rate": 9.050289519241036e-06, + "loss": 1.3881233930587769, + "step": 3754 + }, + { + "epoch": 0.6837171202330027, + "grad_norm": 6.9375, + "learning_rate": 9.04929804784921e-06, + "loss": 1.3323017358779907, + "step": 3756 + }, + { + "epoch": 0.6840811868571949, + "grad_norm": 10.6875, + "learning_rate": 9.048306128985253e-06, + "loss": 1.4439759254455566, + "step": 3758 + }, + { + "epoch": 0.6844452534813871, + "grad_norm": 50.25, + "learning_rate": 9.047313762794727e-06, + "loss": 1.2597166299819946, + "step": 3760 + }, + { + "epoch": 0.6848093201055793, + "grad_norm": 6.4375, + "learning_rate": 9.046320949423254e-06, + "loss": 1.0838193893432617, + "step": 3762 + }, + { + "epoch": 0.6851733867297716, + "grad_norm": 11.6875, + "learning_rate": 9.045327689016527e-06, + "loss": 1.339174747467041, + "step": 3764 + }, + { + "epoch": 0.6855374533539638, + "grad_norm": 11.125, + "learning_rate": 9.044333981720306e-06, + "loss": 1.1985255479812622, + "step": 3766 + }, + { + "epoch": 0.685901519978156, + "grad_norm": 15.75, + "learning_rate": 9.043339827680408e-06, + "loss": 1.1497808694839478, + "step": 3768 + }, + { + "epoch": 0.6862655866023483, + "grad_norm": 11.3125, + "learning_rate": 9.042345227042726e-06, + "loss": 1.723909616470337, + "step": 3770 + }, + { + "epoch": 0.6866296532265405, + "grad_norm": 9.8125, + "learning_rate": 9.04135017995321e-06, + "loss": 1.6765867471694946, + "step": 3772 + }, + { + "epoch": 0.6869937198507327, + "grad_norm": 6.625, + "learning_rate": 9.040354686557881e-06, + "loss": 1.2211887836456299, + "step": 3774 + }, + { + "epoch": 0.687357786474925, + "grad_norm": 7.9375, + "learning_rate": 9.039358747002824e-06, + "loss": 0.9901754856109619, + "step": 3776 + }, + { + "epoch": 0.6877218530991172, + "grad_norm": 30.75, + "learning_rate": 9.038362361434186e-06, + "loss": 1.4585082530975342, + "step": 3778 + }, + { + "epoch": 0.6880859197233093, + "grad_norm": 14.9375, + "learning_rate": 9.037365529998185e-06, + "loss": 1.7494745254516602, + "step": 3780 + }, + { + "epoch": 0.6884499863475015, + "grad_norm": 12.625, + "learning_rate": 9.036368252841106e-06, + "loss": 1.340724229812622, + "step": 3782 + }, + { + "epoch": 0.6888140529716938, + "grad_norm": 7.03125, + "learning_rate": 9.035370530109288e-06, + "loss": 1.5212626457214355, + "step": 3784 + }, + { + "epoch": 0.689178119595886, + "grad_norm": 8.1875, + "learning_rate": 9.034372361949146e-06, + "loss": 1.4433444738388062, + "step": 3786 + }, + { + "epoch": 0.6895421862200782, + "grad_norm": 12.875, + "learning_rate": 9.033373748507157e-06, + "loss": 1.622555136680603, + "step": 3788 + }, + { + "epoch": 0.6899062528442705, + "grad_norm": 10.0625, + "learning_rate": 9.032374689929864e-06, + "loss": 1.35758376121521, + "step": 3790 + }, + { + "epoch": 0.6902703194684627, + "grad_norm": 6.625, + "learning_rate": 9.031375186363875e-06, + "loss": 1.3290655612945557, + "step": 3792 + }, + { + "epoch": 0.6906343860926549, + "grad_norm": 7.09375, + "learning_rate": 9.030375237955862e-06, + "loss": 1.209246277809143, + "step": 3794 + }, + { + "epoch": 0.6909984527168472, + "grad_norm": 17.125, + "learning_rate": 9.029374844852565e-06, + "loss": 1.3822636604309082, + "step": 3796 + }, + { + "epoch": 0.6913625193410394, + "grad_norm": 4.84375, + "learning_rate": 9.028374007200785e-06, + "loss": 1.1453665494918823, + "step": 3798 + }, + { + "epoch": 0.6917265859652316, + "grad_norm": 11.0625, + "learning_rate": 9.027372725147392e-06, + "loss": 1.2516539096832275, + "step": 3800 + }, + { + "epoch": 0.6920906525894238, + "grad_norm": 6.15625, + "learning_rate": 9.026370998839322e-06, + "loss": 1.0178422927856445, + "step": 3802 + }, + { + "epoch": 0.6924547192136161, + "grad_norm": 20.375, + "learning_rate": 9.025368828423573e-06, + "loss": 1.292572259902954, + "step": 3804 + }, + { + "epoch": 0.6928187858378083, + "grad_norm": 23.25, + "learning_rate": 9.024366214047206e-06, + "loss": 1.7667193412780762, + "step": 3806 + }, + { + "epoch": 0.6931828524620005, + "grad_norm": 23.875, + "learning_rate": 9.023363155857357e-06, + "loss": 1.1921138763427734, + "step": 3808 + }, + { + "epoch": 0.6935469190861928, + "grad_norm": 19.875, + "learning_rate": 9.022359654001216e-06, + "loss": 0.8805833458900452, + "step": 3810 + }, + { + "epoch": 0.693910985710385, + "grad_norm": 13.25, + "learning_rate": 9.021355708626046e-06, + "loss": 1.5749866962432861, + "step": 3812 + }, + { + "epoch": 0.6942750523345772, + "grad_norm": 4.3125, + "learning_rate": 9.020351319879169e-06, + "loss": 1.334184169769287, + "step": 3814 + }, + { + "epoch": 0.6946391189587695, + "grad_norm": 6.8125, + "learning_rate": 9.019346487907977e-06, + "loss": 1.4905041456222534, + "step": 3816 + }, + { + "epoch": 0.6950031855829617, + "grad_norm": 22.0, + "learning_rate": 9.018341212859922e-06, + "loss": 1.3692952394485474, + "step": 3818 + }, + { + "epoch": 0.6953672522071539, + "grad_norm": 16.25, + "learning_rate": 9.017335494882528e-06, + "loss": 1.5577635765075684, + "step": 3820 + }, + { + "epoch": 0.6957313188313461, + "grad_norm": 20.375, + "learning_rate": 9.016329334123377e-06, + "loss": 1.3878321647644043, + "step": 3822 + }, + { + "epoch": 0.6960953854555384, + "grad_norm": 22.25, + "learning_rate": 9.015322730730123e-06, + "loss": 1.6337788105010986, + "step": 3824 + }, + { + "epoch": 0.6964594520797306, + "grad_norm": 13.5, + "learning_rate": 9.014315684850477e-06, + "loss": 1.3739981651306152, + "step": 3826 + }, + { + "epoch": 0.6968235187039228, + "grad_norm": 9.875, + "learning_rate": 9.013308196632218e-06, + "loss": 1.1458913087844849, + "step": 3828 + }, + { + "epoch": 0.6971875853281151, + "grad_norm": 44.0, + "learning_rate": 9.012300266223196e-06, + "loss": 0.8456411361694336, + "step": 3830 + }, + { + "epoch": 0.6975516519523073, + "grad_norm": 11.625, + "learning_rate": 9.011291893771317e-06, + "loss": 1.2588180303573608, + "step": 3832 + }, + { + "epoch": 0.6979157185764995, + "grad_norm": 8.9375, + "learning_rate": 9.010283079424556e-06, + "loss": 1.327724814414978, + "step": 3834 + }, + { + "epoch": 0.6982797852006917, + "grad_norm": 12.625, + "learning_rate": 9.009273823330951e-06, + "loss": 1.160435438156128, + "step": 3836 + }, + { + "epoch": 0.698643851824884, + "grad_norm": 23.875, + "learning_rate": 9.008264125638611e-06, + "loss": 0.8293675184249878, + "step": 3838 + }, + { + "epoch": 0.6990079184490762, + "grad_norm": 193.0, + "learning_rate": 9.007253986495701e-06, + "loss": 0.7560205459594727, + "step": 3840 + }, + { + "epoch": 0.6993719850732684, + "grad_norm": 6.96875, + "learning_rate": 9.006243406050454e-06, + "loss": 1.1688603162765503, + "step": 3842 + }, + { + "epoch": 0.6997360516974607, + "grad_norm": 12.0, + "learning_rate": 9.005232384451172e-06, + "loss": 1.6441826820373535, + "step": 3844 + }, + { + "epoch": 0.7001001183216529, + "grad_norm": 10.6875, + "learning_rate": 9.004220921846217e-06, + "loss": 1.4990862607955933, + "step": 3846 + }, + { + "epoch": 0.7004641849458451, + "grad_norm": 8.8125, + "learning_rate": 9.003209018384017e-06, + "loss": 1.2281641960144043, + "step": 3848 + }, + { + "epoch": 0.7008282515700374, + "grad_norm": 9.0, + "learning_rate": 9.002196674213065e-06, + "loss": 1.1953091621398926, + "step": 3850 + }, + { + "epoch": 0.7011923181942296, + "grad_norm": 7.375, + "learning_rate": 9.001183889481915e-06, + "loss": 1.1360362768173218, + "step": 3852 + }, + { + "epoch": 0.7015563848184218, + "grad_norm": 6.1875, + "learning_rate": 9.000170664339191e-06, + "loss": 1.31300950050354, + "step": 3854 + }, + { + "epoch": 0.701920451442614, + "grad_norm": 18.125, + "learning_rate": 8.999156998933585e-06, + "loss": 1.570727825164795, + "step": 3856 + }, + { + "epoch": 0.7022845180668063, + "grad_norm": 17.5, + "learning_rate": 8.998142893413842e-06, + "loss": 1.5525283813476562, + "step": 3858 + }, + { + "epoch": 0.7026485846909984, + "grad_norm": 25.75, + "learning_rate": 8.99712834792878e-06, + "loss": 1.5696039199829102, + "step": 3860 + }, + { + "epoch": 0.7030126513151906, + "grad_norm": 7.9375, + "learning_rate": 8.996113362627279e-06, + "loss": 1.2184580564498901, + "step": 3862 + }, + { + "epoch": 0.703376717939383, + "grad_norm": 7.0625, + "learning_rate": 8.995097937658286e-06, + "loss": 1.4596872329711914, + "step": 3864 + }, + { + "epoch": 0.7037407845635751, + "grad_norm": 12.8125, + "learning_rate": 8.994082073170807e-06, + "loss": 1.5328449010849, + "step": 3866 + }, + { + "epoch": 0.7041048511877673, + "grad_norm": 9.75, + "learning_rate": 8.993065769313915e-06, + "loss": 1.5280065536499023, + "step": 3868 + }, + { + "epoch": 0.7044689178119596, + "grad_norm": 17.125, + "learning_rate": 8.992049026236756e-06, + "loss": 1.651501178741455, + "step": 3870 + }, + { + "epoch": 0.7048329844361518, + "grad_norm": 5.40625, + "learning_rate": 8.991031844088528e-06, + "loss": 1.0046759843826294, + "step": 3872 + }, + { + "epoch": 0.705197051060344, + "grad_norm": 11.3125, + "learning_rate": 8.990014223018495e-06, + "loss": 1.5836578607559204, + "step": 3874 + }, + { + "epoch": 0.7055611176845362, + "grad_norm": 16.0, + "learning_rate": 8.988996163175994e-06, + "loss": 1.4356216192245483, + "step": 3876 + }, + { + "epoch": 0.7059251843087285, + "grad_norm": 8.25, + "learning_rate": 8.98797766471042e-06, + "loss": 1.3767637014389038, + "step": 3878 + }, + { + "epoch": 0.7062892509329207, + "grad_norm": 14.875, + "learning_rate": 8.98695872777123e-06, + "loss": 1.3272817134857178, + "step": 3880 + }, + { + "epoch": 0.7066533175571129, + "grad_norm": 7.875, + "learning_rate": 8.985939352507955e-06, + "loss": 1.3519479036331177, + "step": 3882 + }, + { + "epoch": 0.7070173841813052, + "grad_norm": 10.875, + "learning_rate": 8.98491953907018e-06, + "loss": 1.2547365427017212, + "step": 3884 + }, + { + "epoch": 0.7073814508054974, + "grad_norm": 11.125, + "learning_rate": 8.98389928760756e-06, + "loss": 1.3547316789627075, + "step": 3886 + }, + { + "epoch": 0.7077455174296896, + "grad_norm": 17.125, + "learning_rate": 8.982878598269811e-06, + "loss": 1.4341374635696411, + "step": 3888 + }, + { + "epoch": 0.7081095840538819, + "grad_norm": 13.4375, + "learning_rate": 8.981857471206716e-06, + "loss": 1.963982343673706, + "step": 3890 + }, + { + "epoch": 0.7084736506780741, + "grad_norm": 15.3125, + "learning_rate": 8.980835906568125e-06, + "loss": 1.3414711952209473, + "step": 3892 + }, + { + "epoch": 0.7088377173022663, + "grad_norm": 15.5625, + "learning_rate": 8.97981390450394e-06, + "loss": 1.1575798988342285, + "step": 3894 + }, + { + "epoch": 0.7092017839264585, + "grad_norm": 9.75, + "learning_rate": 8.978791465164145e-06, + "loss": 1.2165459394454956, + "step": 3896 + }, + { + "epoch": 0.7095658505506508, + "grad_norm": 13.0, + "learning_rate": 8.977768588698772e-06, + "loss": 1.3495299816131592, + "step": 3898 + }, + { + "epoch": 0.709929917174843, + "grad_norm": 22.875, + "learning_rate": 8.976745275257925e-06, + "loss": 1.4521515369415283, + "step": 3900 + }, + { + "epoch": 0.7102939837990352, + "grad_norm": 15.5625, + "learning_rate": 8.975721524991777e-06, + "loss": 1.0500729084014893, + "step": 3902 + }, + { + "epoch": 0.7106580504232275, + "grad_norm": 9.6875, + "learning_rate": 8.97469733805055e-06, + "loss": 0.7728732824325562, + "step": 3904 + }, + { + "epoch": 0.7110221170474197, + "grad_norm": 14.25, + "learning_rate": 8.973672714584547e-06, + "loss": 1.131068468093872, + "step": 3906 + }, + { + "epoch": 0.7113861836716119, + "grad_norm": 10.9375, + "learning_rate": 8.972647654744125e-06, + "loss": 1.590749740600586, + "step": 3908 + }, + { + "epoch": 0.7117502502958041, + "grad_norm": 8.3125, + "learning_rate": 8.971622158679704e-06, + "loss": 1.2511940002441406, + "step": 3910 + }, + { + "epoch": 0.7121143169199964, + "grad_norm": 8.1875, + "learning_rate": 8.970596226541775e-06, + "loss": 1.4950616359710693, + "step": 3912 + }, + { + "epoch": 0.7124783835441886, + "grad_norm": 11.5625, + "learning_rate": 8.96956985848089e-06, + "loss": 1.2371429204940796, + "step": 3914 + }, + { + "epoch": 0.7128424501683808, + "grad_norm": 16.375, + "learning_rate": 8.968543054647662e-06, + "loss": 1.5337557792663574, + "step": 3916 + }, + { + "epoch": 0.7132065167925731, + "grad_norm": 10.5625, + "learning_rate": 8.967515815192772e-06, + "loss": 1.0237574577331543, + "step": 3918 + }, + { + "epoch": 0.7135705834167653, + "grad_norm": 11.875, + "learning_rate": 8.96648814026696e-06, + "loss": 1.3167476654052734, + "step": 3920 + }, + { + "epoch": 0.7139346500409575, + "grad_norm": 40.5, + "learning_rate": 8.965460030021038e-06, + "loss": 0.9302914142608643, + "step": 3922 + }, + { + "epoch": 0.7142987166651498, + "grad_norm": 16.25, + "learning_rate": 8.964431484605874e-06, + "loss": 1.4194064140319824, + "step": 3924 + }, + { + "epoch": 0.714662783289342, + "grad_norm": 12.0, + "learning_rate": 8.963402504172403e-06, + "loss": 1.2571460008621216, + "step": 3926 + }, + { + "epoch": 0.7150268499135342, + "grad_norm": 10.5, + "learning_rate": 8.962373088871624e-06, + "loss": 1.4063042402267456, + "step": 3928 + }, + { + "epoch": 0.7153909165377264, + "grad_norm": 14.5625, + "learning_rate": 8.9613432388546e-06, + "loss": 1.3174469470977783, + "step": 3930 + }, + { + "epoch": 0.7157549831619187, + "grad_norm": 19.0, + "learning_rate": 8.960312954272457e-06, + "loss": 0.9649209976196289, + "step": 3932 + }, + { + "epoch": 0.7161190497861109, + "grad_norm": 13.4375, + "learning_rate": 8.959282235276386e-06, + "loss": 1.6821236610412598, + "step": 3934 + }, + { + "epoch": 0.716483116410303, + "grad_norm": 12.5, + "learning_rate": 8.958251082017637e-06, + "loss": 1.40818190574646, + "step": 3936 + }, + { + "epoch": 0.7168471830344953, + "grad_norm": 55.75, + "learning_rate": 8.957219494647534e-06, + "loss": 1.3784319162368774, + "step": 3938 + }, + { + "epoch": 0.7172112496586875, + "grad_norm": 5.15625, + "learning_rate": 8.95618747331745e-06, + "loss": 0.9126887321472168, + "step": 3940 + }, + { + "epoch": 0.7175753162828797, + "grad_norm": 26.125, + "learning_rate": 8.955155018178839e-06, + "loss": 0.997277557849884, + "step": 3942 + }, + { + "epoch": 0.717939382907072, + "grad_norm": 16.875, + "learning_rate": 8.954122129383205e-06, + "loss": 0.6627995371818542, + "step": 3944 + }, + { + "epoch": 0.7183034495312642, + "grad_norm": 10.0, + "learning_rate": 8.95308880708212e-06, + "loss": 1.61794114112854, + "step": 3946 + }, + { + "epoch": 0.7186675161554564, + "grad_norm": 13.0, + "learning_rate": 8.95205505142722e-06, + "loss": 1.6856334209442139, + "step": 3948 + }, + { + "epoch": 0.7190315827796486, + "grad_norm": 10.25, + "learning_rate": 8.951020862570204e-06, + "loss": 1.308815836906433, + "step": 3950 + }, + { + "epoch": 0.7193956494038409, + "grad_norm": 8.1875, + "learning_rate": 8.949986240662835e-06, + "loss": 1.3072153329849243, + "step": 3952 + }, + { + "epoch": 0.7197597160280331, + "grad_norm": 10.0625, + "learning_rate": 8.948951185856943e-06, + "loss": 1.3817613124847412, + "step": 3954 + }, + { + "epoch": 0.7201237826522253, + "grad_norm": 14.5, + "learning_rate": 8.947915698304415e-06, + "loss": 1.75496506690979, + "step": 3956 + }, + { + "epoch": 0.7204878492764176, + "grad_norm": 4.46875, + "learning_rate": 8.946879778157203e-06, + "loss": 1.1558823585510254, + "step": 3958 + }, + { + "epoch": 0.7208519159006098, + "grad_norm": 9.0625, + "learning_rate": 8.94584342556733e-06, + "loss": 1.1663873195648193, + "step": 3960 + }, + { + "epoch": 0.721215982524802, + "grad_norm": 19.75, + "learning_rate": 8.944806640686869e-06, + "loss": 1.1661570072174072, + "step": 3962 + }, + { + "epoch": 0.7215800491489943, + "grad_norm": 14.9375, + "learning_rate": 8.94376942366797e-06, + "loss": 1.5698572397232056, + "step": 3964 + }, + { + "epoch": 0.7219441157731865, + "grad_norm": 17.875, + "learning_rate": 8.942731774662837e-06, + "loss": 1.1468448638916016, + "step": 3966 + }, + { + "epoch": 0.7223081823973787, + "grad_norm": 18.0, + "learning_rate": 8.94169369382374e-06, + "loss": 1.856109619140625, + "step": 3968 + }, + { + "epoch": 0.7226722490215709, + "grad_norm": 5.46875, + "learning_rate": 8.940655181303019e-06, + "loss": 1.5610462427139282, + "step": 3970 + }, + { + "epoch": 0.7230363156457632, + "grad_norm": 23.125, + "learning_rate": 8.939616237253068e-06, + "loss": 1.1868939399719238, + "step": 3972 + }, + { + "epoch": 0.7234003822699554, + "grad_norm": 12.0625, + "learning_rate": 8.938576861826344e-06, + "loss": 0.9291361570358276, + "step": 3974 + }, + { + "epoch": 0.7237644488941476, + "grad_norm": 5.375, + "learning_rate": 8.937537055175375e-06, + "loss": 1.2235300540924072, + "step": 3976 + }, + { + "epoch": 0.7241285155183399, + "grad_norm": 16.75, + "learning_rate": 8.936496817452752e-06, + "loss": 1.2607587575912476, + "step": 3978 + }, + { + "epoch": 0.7244925821425321, + "grad_norm": 16.25, + "learning_rate": 8.935456148811116e-06, + "loss": 1.295597791671753, + "step": 3980 + }, + { + "epoch": 0.7248566487667243, + "grad_norm": 14.3125, + "learning_rate": 8.93441504940319e-06, + "loss": 1.4838569164276123, + "step": 3982 + }, + { + "epoch": 0.7252207153909166, + "grad_norm": 64.5, + "learning_rate": 8.933373519381748e-06, + "loss": 1.0721181631088257, + "step": 3984 + }, + { + "epoch": 0.7255847820151088, + "grad_norm": 15.75, + "learning_rate": 8.932331558899627e-06, + "loss": 1.4383556842803955, + "step": 3986 + }, + { + "epoch": 0.725948848639301, + "grad_norm": 7.125, + "learning_rate": 8.931289168109737e-06, + "loss": 1.3305082321166992, + "step": 3988 + }, + { + "epoch": 0.7263129152634932, + "grad_norm": 12.1875, + "learning_rate": 8.930246347165038e-06, + "loss": 1.8584890365600586, + "step": 3990 + }, + { + "epoch": 0.7266769818876855, + "grad_norm": 11.625, + "learning_rate": 8.929203096218561e-06, + "loss": 1.5628858804702759, + "step": 3992 + }, + { + "epoch": 0.7270410485118777, + "grad_norm": 22.0, + "learning_rate": 8.928159415423406e-06, + "loss": 1.3630568981170654, + "step": 3994 + }, + { + "epoch": 0.7274051151360699, + "grad_norm": 30.5, + "learning_rate": 8.92711530493272e-06, + "loss": 1.333431363105774, + "step": 3996 + }, + { + "epoch": 0.7277691817602622, + "grad_norm": 22.625, + "learning_rate": 8.926070764899729e-06, + "loss": 1.5804121494293213, + "step": 3998 + }, + { + "epoch": 0.7281332483844544, + "grad_norm": 16.875, + "learning_rate": 8.92502579547771e-06, + "loss": 1.5849170684814453, + "step": 4000 + }, + { + "epoch": 0.7284973150086466, + "grad_norm": 10.125, + "learning_rate": 8.923980396820006e-06, + "loss": 1.6163780689239502, + "step": 4002 + }, + { + "epoch": 0.7288613816328388, + "grad_norm": 14.9375, + "learning_rate": 8.922934569080033e-06, + "loss": 1.7660216093063354, + "step": 4004 + }, + { + "epoch": 0.7292254482570311, + "grad_norm": 13.4375, + "learning_rate": 8.921888312411256e-06, + "loss": 1.4897310733795166, + "step": 4006 + }, + { + "epoch": 0.7295895148812233, + "grad_norm": 7.46875, + "learning_rate": 8.92084162696721e-06, + "loss": 1.14540433883667, + "step": 4008 + }, + { + "epoch": 0.7299535815054154, + "grad_norm": 8.5, + "learning_rate": 8.919794512901495e-06, + "loss": 1.2850167751312256, + "step": 4010 + }, + { + "epoch": 0.7303176481296078, + "grad_norm": 13.8125, + "learning_rate": 8.918746970367764e-06, + "loss": 1.3529331684112549, + "step": 4012 + }, + { + "epoch": 0.7306817147538, + "grad_norm": 16.625, + "learning_rate": 8.917698999519746e-06, + "loss": 1.7041479349136353, + "step": 4014 + }, + { + "epoch": 0.7310457813779921, + "grad_norm": 15.8125, + "learning_rate": 8.916650600511225e-06, + "loss": 1.6238069534301758, + "step": 4016 + }, + { + "epoch": 0.7314098480021844, + "grad_norm": 24.125, + "learning_rate": 8.915601773496048e-06, + "loss": 0.6529870629310608, + "step": 4018 + }, + { + "epoch": 0.7317739146263766, + "grad_norm": 43.75, + "learning_rate": 8.914552518628126e-06, + "loss": 0.9866514205932617, + "step": 4020 + }, + { + "epoch": 0.7321379812505688, + "grad_norm": 20.75, + "learning_rate": 8.913502836061434e-06, + "loss": 1.5501985549926758, + "step": 4022 + }, + { + "epoch": 0.732502047874761, + "grad_norm": 12.125, + "learning_rate": 8.912452725950008e-06, + "loss": 1.9918923377990723, + "step": 4024 + }, + { + "epoch": 0.7328661144989533, + "grad_norm": 19.625, + "learning_rate": 8.911402188447946e-06, + "loss": 1.2583119869232178, + "step": 4026 + }, + { + "epoch": 0.7332301811231455, + "grad_norm": 9.0, + "learning_rate": 8.910351223709416e-06, + "loss": 1.0755724906921387, + "step": 4028 + }, + { + "epoch": 0.7335942477473377, + "grad_norm": 18.5, + "learning_rate": 8.909299831888634e-06, + "loss": 1.4027138948440552, + "step": 4030 + }, + { + "epoch": 0.73395831437153, + "grad_norm": 11.375, + "learning_rate": 8.908248013139895e-06, + "loss": 0.8530186414718628, + "step": 4032 + }, + { + "epoch": 0.7343223809957222, + "grad_norm": 8.25, + "learning_rate": 8.907195767617545e-06, + "loss": 1.2342077493667603, + "step": 4034 + }, + { + "epoch": 0.7346864476199144, + "grad_norm": 5.03125, + "learning_rate": 8.906143095475999e-06, + "loss": 0.518145740032196, + "step": 4036 + }, + { + "epoch": 0.7350505142441067, + "grad_norm": 11.4375, + "learning_rate": 8.905089996869729e-06, + "loss": 1.4473772048950195, + "step": 4038 + }, + { + "epoch": 0.7354145808682989, + "grad_norm": 16.0, + "learning_rate": 8.904036471953277e-06, + "loss": 1.4251716136932373, + "step": 4040 + }, + { + "epoch": 0.7357786474924911, + "grad_norm": 8.875, + "learning_rate": 8.90298252088124e-06, + "loss": 1.3303751945495605, + "step": 4042 + }, + { + "epoch": 0.7361427141166833, + "grad_norm": 9.5625, + "learning_rate": 8.901928143808285e-06, + "loss": 1.4375848770141602, + "step": 4044 + }, + { + "epoch": 0.7365067807408756, + "grad_norm": 32.75, + "learning_rate": 8.900873340889136e-06, + "loss": 1.4494879245758057, + "step": 4046 + }, + { + "epoch": 0.7368708473650678, + "grad_norm": 24.25, + "learning_rate": 8.899818112278578e-06, + "loss": 1.4735255241394043, + "step": 4048 + }, + { + "epoch": 0.73723491398926, + "grad_norm": 20.625, + "learning_rate": 8.898762458131467e-06, + "loss": 1.40456223487854, + "step": 4050 + }, + { + "epoch": 0.7375989806134523, + "grad_norm": 7.53125, + "learning_rate": 8.897706378602708e-06, + "loss": 1.2281516790390015, + "step": 4052 + }, + { + "epoch": 0.7379630472376445, + "grad_norm": 15.375, + "learning_rate": 8.896649873847286e-06, + "loss": 1.433131456375122, + "step": 4054 + }, + { + "epoch": 0.7383271138618367, + "grad_norm": 27.75, + "learning_rate": 8.895592944020232e-06, + "loss": 1.968809962272644, + "step": 4056 + }, + { + "epoch": 0.738691180486029, + "grad_norm": 24.375, + "learning_rate": 8.894535589276649e-06, + "loss": 1.0219695568084717, + "step": 4058 + }, + { + "epoch": 0.7390552471102212, + "grad_norm": 15.75, + "learning_rate": 8.893477809771698e-06, + "loss": 1.2095305919647217, + "step": 4060 + }, + { + "epoch": 0.7394193137344134, + "grad_norm": 12.125, + "learning_rate": 8.892419605660606e-06, + "loss": 1.8842158317565918, + "step": 4062 + }, + { + "epoch": 0.7397833803586056, + "grad_norm": 10.9375, + "learning_rate": 8.891360977098658e-06, + "loss": 1.4887069463729858, + "step": 4064 + }, + { + "epoch": 0.7401474469827979, + "grad_norm": 7.03125, + "learning_rate": 8.890301924241203e-06, + "loss": 1.483014464378357, + "step": 4066 + }, + { + "epoch": 0.7405115136069901, + "grad_norm": 17.0, + "learning_rate": 8.889242447243655e-06, + "loss": 1.4747929573059082, + "step": 4068 + }, + { + "epoch": 0.7408755802311823, + "grad_norm": 7.8125, + "learning_rate": 8.888182546261488e-06, + "loss": 1.4071441888809204, + "step": 4070 + }, + { + "epoch": 0.7412396468553746, + "grad_norm": 19.125, + "learning_rate": 8.887122221450235e-06, + "loss": 1.1633793115615845, + "step": 4072 + }, + { + "epoch": 0.7416037134795668, + "grad_norm": 16.0, + "learning_rate": 8.8860614729655e-06, + "loss": 1.3279290199279785, + "step": 4074 + }, + { + "epoch": 0.741967780103759, + "grad_norm": 13.9375, + "learning_rate": 8.885000300962936e-06, + "loss": 1.4762736558914185, + "step": 4076 + }, + { + "epoch": 0.7423318467279512, + "grad_norm": 12.1875, + "learning_rate": 8.883938705598271e-06, + "loss": 1.5852270126342773, + "step": 4078 + }, + { + "epoch": 0.7426959133521435, + "grad_norm": 13.3125, + "learning_rate": 8.882876687027289e-06, + "loss": 1.5529594421386719, + "step": 4080 + }, + { + "epoch": 0.7430599799763357, + "grad_norm": 29.75, + "learning_rate": 8.881814245405838e-06, + "loss": 1.4649889469146729, + "step": 4082 + }, + { + "epoch": 0.7434240466005279, + "grad_norm": 5.6875, + "learning_rate": 8.880751380889822e-06, + "loss": 0.5284292697906494, + "step": 4084 + }, + { + "epoch": 0.7437881132247202, + "grad_norm": 13.75, + "learning_rate": 8.879688093635218e-06, + "loss": 1.3982738256454468, + "step": 4086 + }, + { + "epoch": 0.7441521798489124, + "grad_norm": 13.9375, + "learning_rate": 8.878624383798056e-06, + "loss": 1.694324254989624, + "step": 4088 + }, + { + "epoch": 0.7445162464731045, + "grad_norm": 5.0, + "learning_rate": 8.877560251534431e-06, + "loss": 1.3337924480438232, + "step": 4090 + }, + { + "epoch": 0.7448803130972969, + "grad_norm": 16.25, + "learning_rate": 8.876495697000502e-06, + "loss": 1.4946918487548828, + "step": 4092 + }, + { + "epoch": 0.745244379721489, + "grad_norm": 9.5625, + "learning_rate": 8.875430720352487e-06, + "loss": 1.9913033246994019, + "step": 4094 + }, + { + "epoch": 0.7456084463456812, + "grad_norm": 20.375, + "learning_rate": 8.874365321746668e-06, + "loss": 1.2863436937332153, + "step": 4096 + }, + { + "epoch": 0.7459725129698734, + "grad_norm": 14.6875, + "learning_rate": 8.873299501339383e-06, + "loss": 1.560056209564209, + "step": 4098 + }, + { + "epoch": 0.7463365795940657, + "grad_norm": 9.1875, + "learning_rate": 8.872233259287044e-06, + "loss": 1.3569194078445435, + "step": 4100 + }, + { + "epoch": 0.7467006462182579, + "grad_norm": 12.375, + "learning_rate": 8.871166595746113e-06, + "loss": 1.429707407951355, + "step": 4102 + }, + { + "epoch": 0.7470647128424501, + "grad_norm": 16.0, + "learning_rate": 8.87009951087312e-06, + "loss": 1.4371904134750366, + "step": 4104 + }, + { + "epoch": 0.7474287794666424, + "grad_norm": 8.5625, + "learning_rate": 8.869032004824656e-06, + "loss": 1.375626564025879, + "step": 4106 + }, + { + "epoch": 0.7477928460908346, + "grad_norm": 4.6875, + "learning_rate": 8.867964077757372e-06, + "loss": 1.330753207206726, + "step": 4108 + }, + { + "epoch": 0.7481569127150268, + "grad_norm": 5.9375, + "learning_rate": 8.866895729827983e-06, + "loss": 0.913055956363678, + "step": 4110 + }, + { + "epoch": 0.7485209793392191, + "grad_norm": 36.0, + "learning_rate": 8.865826961193263e-06, + "loss": 1.1803151369094849, + "step": 4112 + }, + { + "epoch": 0.7488850459634113, + "grad_norm": 10.1875, + "learning_rate": 8.86475777201005e-06, + "loss": 0.8327276110649109, + "step": 4114 + }, + { + "epoch": 0.7492491125876035, + "grad_norm": 11.125, + "learning_rate": 8.863688162435244e-06, + "loss": 1.4408912658691406, + "step": 4116 + }, + { + "epoch": 0.7496131792117957, + "grad_norm": 23.0, + "learning_rate": 8.862618132625806e-06, + "loss": 1.602658987045288, + "step": 4118 + }, + { + "epoch": 0.749977245835988, + "grad_norm": 82.0, + "learning_rate": 8.86154768273876e-06, + "loss": 1.4650168418884277, + "step": 4120 + }, + { + "epoch": 0.7503413124601802, + "grad_norm": 8.1875, + "learning_rate": 8.860476812931188e-06, + "loss": 1.435798168182373, + "step": 4122 + }, + { + "epoch": 0.7507053790843724, + "grad_norm": 9.3125, + "learning_rate": 8.859405523360234e-06, + "loss": 1.2188894748687744, + "step": 4124 + }, + { + "epoch": 0.7510694457085647, + "grad_norm": 17.25, + "learning_rate": 8.858333814183109e-06, + "loss": 1.763692021369934, + "step": 4126 + }, + { + "epoch": 0.7514335123327569, + "grad_norm": 13.75, + "learning_rate": 8.857261685557079e-06, + "loss": 1.8446099758148193, + "step": 4128 + }, + { + "epoch": 0.7517975789569491, + "grad_norm": 23.875, + "learning_rate": 8.85618913763948e-06, + "loss": 1.2607200145721436, + "step": 4130 + }, + { + "epoch": 0.7521616455811414, + "grad_norm": 14.0, + "learning_rate": 8.855116170587697e-06, + "loss": 0.876882016658783, + "step": 4132 + }, + { + "epoch": 0.7525257122053336, + "grad_norm": 6.0625, + "learning_rate": 8.854042784559187e-06, + "loss": 1.0581748485565186, + "step": 4134 + }, + { + "epoch": 0.7528897788295258, + "grad_norm": 24.0, + "learning_rate": 8.852968979711465e-06, + "loss": 1.3951890468597412, + "step": 4136 + }, + { + "epoch": 0.753253845453718, + "grad_norm": 8.125, + "learning_rate": 8.851894756202109e-06, + "loss": 1.4047353267669678, + "step": 4138 + }, + { + "epoch": 0.7536179120779103, + "grad_norm": 9.0, + "learning_rate": 8.850820114188754e-06, + "loss": 1.1785528659820557, + "step": 4140 + }, + { + "epoch": 0.7539819787021025, + "grad_norm": 21.875, + "learning_rate": 8.8497450538291e-06, + "loss": 1.7320830821990967, + "step": 4142 + }, + { + "epoch": 0.7543460453262947, + "grad_norm": 9.6875, + "learning_rate": 8.848669575280907e-06, + "loss": 1.4284143447875977, + "step": 4144 + }, + { + "epoch": 0.754710111950487, + "grad_norm": 9.5, + "learning_rate": 8.847593678702002e-06, + "loss": 1.2777068614959717, + "step": 4146 + }, + { + "epoch": 0.7550741785746792, + "grad_norm": 18.375, + "learning_rate": 8.846517364250265e-06, + "loss": 1.6407257318496704, + "step": 4148 + }, + { + "epoch": 0.7554382451988714, + "grad_norm": 8.375, + "learning_rate": 8.845440632083637e-06, + "loss": 0.8985050320625305, + "step": 4150 + }, + { + "epoch": 0.7558023118230636, + "grad_norm": 39.25, + "learning_rate": 8.84436348236013e-06, + "loss": 1.2232482433319092, + "step": 4152 + }, + { + "epoch": 0.7561663784472559, + "grad_norm": 19.25, + "learning_rate": 8.84328591523781e-06, + "loss": 1.7907624244689941, + "step": 4154 + }, + { + "epoch": 0.7565304450714481, + "grad_norm": 7.125, + "learning_rate": 8.842207930874802e-06, + "loss": 1.2141859531402588, + "step": 4156 + }, + { + "epoch": 0.7568945116956403, + "grad_norm": 6.90625, + "learning_rate": 8.841129529429299e-06, + "loss": 1.3649156093597412, + "step": 4158 + }, + { + "epoch": 0.7572585783198326, + "grad_norm": 19.25, + "learning_rate": 8.840050711059556e-06, + "loss": 1.3847813606262207, + "step": 4160 + }, + { + "epoch": 0.7576226449440248, + "grad_norm": 13.125, + "learning_rate": 8.838971475923876e-06, + "loss": 1.7742314338684082, + "step": 4162 + }, + { + "epoch": 0.757986711568217, + "grad_norm": 10.9375, + "learning_rate": 8.83789182418064e-06, + "loss": 1.4739961624145508, + "step": 4164 + }, + { + "epoch": 0.7583507781924093, + "grad_norm": 20.375, + "learning_rate": 8.83681175598828e-06, + "loss": 1.5541844367980957, + "step": 4166 + }, + { + "epoch": 0.7587148448166015, + "grad_norm": 326.0, + "learning_rate": 8.835731271505289e-06, + "loss": 1.3313920497894287, + "step": 4168 + }, + { + "epoch": 0.7590789114407936, + "grad_norm": 6.8125, + "learning_rate": 8.834650370890227e-06, + "loss": 1.3126857280731201, + "step": 4170 + }, + { + "epoch": 0.7594429780649858, + "grad_norm": 16.125, + "learning_rate": 8.833569054301712e-06, + "loss": 1.2136203050613403, + "step": 4172 + }, + { + "epoch": 0.7598070446891781, + "grad_norm": 8.5625, + "learning_rate": 8.83248732189842e-06, + "loss": 1.5499008893966675, + "step": 4174 + }, + { + "epoch": 0.7601711113133703, + "grad_norm": 17.75, + "learning_rate": 8.831405173839094e-06, + "loss": 1.3748478889465332, + "step": 4176 + }, + { + "epoch": 0.7605351779375625, + "grad_norm": 9.375, + "learning_rate": 8.830322610282533e-06, + "loss": 1.2371443510055542, + "step": 4178 + }, + { + "epoch": 0.7608992445617548, + "grad_norm": 16.625, + "learning_rate": 8.829239631387598e-06, + "loss": 1.2147772312164307, + "step": 4180 + }, + { + "epoch": 0.761263311185947, + "grad_norm": 15.1875, + "learning_rate": 8.828156237313215e-06, + "loss": 1.9534438848495483, + "step": 4182 + }, + { + "epoch": 0.7616273778101392, + "grad_norm": 15.75, + "learning_rate": 8.827072428218366e-06, + "loss": 1.5573246479034424, + "step": 4184 + }, + { + "epoch": 0.7619914444343315, + "grad_norm": 12.75, + "learning_rate": 8.825988204262092e-06, + "loss": 1.4831327199935913, + "step": 4186 + }, + { + "epoch": 0.7623555110585237, + "grad_norm": 8.9375, + "learning_rate": 8.824903565603507e-06, + "loss": 1.2221503257751465, + "step": 4188 + }, + { + "epoch": 0.7627195776827159, + "grad_norm": 7.46875, + "learning_rate": 8.823818512401768e-06, + "loss": 1.3167275190353394, + "step": 4190 + }, + { + "epoch": 0.7630836443069081, + "grad_norm": 8.125, + "learning_rate": 8.822733044816108e-06, + "loss": 1.0497537851333618, + "step": 4192 + }, + { + "epoch": 0.7634477109311004, + "grad_norm": 10.625, + "learning_rate": 8.821647163005811e-06, + "loss": 1.4484376907348633, + "step": 4194 + }, + { + "epoch": 0.7638117775552926, + "grad_norm": 11.1875, + "learning_rate": 8.82056086713023e-06, + "loss": 1.0506078004837036, + "step": 4196 + }, + { + "epoch": 0.7641758441794848, + "grad_norm": 12.0625, + "learning_rate": 8.819474157348774e-06, + "loss": 1.6282072067260742, + "step": 4198 + }, + { + "epoch": 0.7645399108036771, + "grad_norm": 10.875, + "learning_rate": 8.818387033820907e-06, + "loss": 1.5184917449951172, + "step": 4200 + }, + { + "epoch": 0.7649039774278693, + "grad_norm": 10.6875, + "learning_rate": 8.817299496706166e-06, + "loss": 1.6753480434417725, + "step": 4202 + }, + { + "epoch": 0.7652680440520615, + "grad_norm": 12.25, + "learning_rate": 8.816211546164145e-06, + "loss": 1.4577313661575317, + "step": 4204 + }, + { + "epoch": 0.7656321106762538, + "grad_norm": 12.1875, + "learning_rate": 8.81512318235449e-06, + "loss": 1.4209911823272705, + "step": 4206 + }, + { + "epoch": 0.765996177300446, + "grad_norm": 11.125, + "learning_rate": 8.814034405436918e-06, + "loss": 1.5116889476776123, + "step": 4208 + }, + { + "epoch": 0.7663602439246382, + "grad_norm": 10.875, + "learning_rate": 8.812945215571198e-06, + "loss": 1.1695141792297363, + "step": 4210 + }, + { + "epoch": 0.7667243105488304, + "grad_norm": 11.75, + "learning_rate": 8.811855612917172e-06, + "loss": 1.3789341449737549, + "step": 4212 + }, + { + "epoch": 0.7670883771730227, + "grad_norm": 10.125, + "learning_rate": 8.810765597634728e-06, + "loss": 1.172945261001587, + "step": 4214 + }, + { + "epoch": 0.7674524437972149, + "grad_norm": 25.5, + "learning_rate": 8.809675169883823e-06, + "loss": 1.0605055093765259, + "step": 4216 + }, + { + "epoch": 0.7678165104214071, + "grad_norm": 11.375, + "learning_rate": 8.808584329824474e-06, + "loss": 1.2311534881591797, + "step": 4218 + }, + { + "epoch": 0.7681805770455994, + "grad_norm": 8.4375, + "learning_rate": 8.807493077616757e-06, + "loss": 1.150730013847351, + "step": 4220 + }, + { + "epoch": 0.7685446436697916, + "grad_norm": 10.5625, + "learning_rate": 8.806401413420809e-06, + "loss": 1.649304986000061, + "step": 4222 + }, + { + "epoch": 0.7689087102939838, + "grad_norm": 28.875, + "learning_rate": 8.805309337396826e-06, + "loss": 1.9404007196426392, + "step": 4224 + }, + { + "epoch": 0.769272776918176, + "grad_norm": 5.8125, + "learning_rate": 8.804216849705067e-06, + "loss": 1.3319504261016846, + "step": 4226 + }, + { + "epoch": 0.7696368435423683, + "grad_norm": 19.375, + "learning_rate": 8.803123950505852e-06, + "loss": 1.5371342897415161, + "step": 4228 + }, + { + "epoch": 0.7700009101665605, + "grad_norm": 19.25, + "learning_rate": 8.802030639959553e-06, + "loss": 1.4116675853729248, + "step": 4230 + }, + { + "epoch": 0.7703649767907527, + "grad_norm": 15.0625, + "learning_rate": 8.800936918226616e-06, + "loss": 1.0641881227493286, + "step": 4232 + }, + { + "epoch": 0.770729043414945, + "grad_norm": 19.0, + "learning_rate": 8.799842785467538e-06, + "loss": 0.5184696316719055, + "step": 4234 + }, + { + "epoch": 0.7710931100391372, + "grad_norm": 14.75, + "learning_rate": 8.798748241842878e-06, + "loss": 1.5277142524719238, + "step": 4236 + }, + { + "epoch": 0.7714571766633294, + "grad_norm": 9.0, + "learning_rate": 8.797653287513256e-06, + "loss": 1.4668095111846924, + "step": 4238 + }, + { + "epoch": 0.7718212432875217, + "grad_norm": 5.6875, + "learning_rate": 8.796557922639347e-06, + "loss": 1.599367380142212, + "step": 4240 + }, + { + "epoch": 0.7721853099117139, + "grad_norm": 6.90625, + "learning_rate": 8.795462147381902e-06, + "loss": 1.2175581455230713, + "step": 4242 + }, + { + "epoch": 0.772549376535906, + "grad_norm": 12.125, + "learning_rate": 8.794365961901714e-06, + "loss": 1.287764310836792, + "step": 4244 + }, + { + "epoch": 0.7729134431600982, + "grad_norm": 8.3125, + "learning_rate": 8.793269366359645e-06, + "loss": 1.232977271080017, + "step": 4246 + }, + { + "epoch": 0.7732775097842906, + "grad_norm": 18.875, + "learning_rate": 8.792172360916618e-06, + "loss": 1.4290766716003418, + "step": 4248 + }, + { + "epoch": 0.7736415764084827, + "grad_norm": 20.625, + "learning_rate": 8.79107494573361e-06, + "loss": 1.7165418863296509, + "step": 4250 + }, + { + "epoch": 0.7740056430326749, + "grad_norm": 8.625, + "learning_rate": 8.78997712097167e-06, + "loss": 1.4682202339172363, + "step": 4252 + }, + { + "epoch": 0.7743697096568672, + "grad_norm": 28.625, + "learning_rate": 8.788878886791889e-06, + "loss": 1.5918464660644531, + "step": 4254 + }, + { + "epoch": 0.7747337762810594, + "grad_norm": 10.6875, + "learning_rate": 8.787780243355437e-06, + "loss": 1.5947998762130737, + "step": 4256 + }, + { + "epoch": 0.7750978429052516, + "grad_norm": 28.5, + "learning_rate": 8.786681190823531e-06, + "loss": 1.198481798171997, + "step": 4258 + }, + { + "epoch": 0.7754619095294439, + "grad_norm": 7.0625, + "learning_rate": 8.785581729357456e-06, + "loss": 1.3257670402526855, + "step": 4260 + }, + { + "epoch": 0.7758259761536361, + "grad_norm": 101.0, + "learning_rate": 8.784481859118547e-06, + "loss": 1.5759248733520508, + "step": 4262 + }, + { + "epoch": 0.7761900427778283, + "grad_norm": 12.5625, + "learning_rate": 8.78338158026821e-06, + "loss": 1.9172375202178955, + "step": 4264 + }, + { + "epoch": 0.7765541094020205, + "grad_norm": 7.5, + "learning_rate": 8.782280892967909e-06, + "loss": 1.3287369012832642, + "step": 4266 + }, + { + "epoch": 0.7769181760262128, + "grad_norm": 7.65625, + "learning_rate": 8.781179797379162e-06, + "loss": 1.1690568923950195, + "step": 4268 + }, + { + "epoch": 0.777282242650405, + "grad_norm": 12.25, + "learning_rate": 8.78007829366355e-06, + "loss": 1.3491458892822266, + "step": 4270 + }, + { + "epoch": 0.7776463092745972, + "grad_norm": 9.375, + "learning_rate": 8.778976381982716e-06, + "loss": 1.4549459218978882, + "step": 4272 + }, + { + "epoch": 0.7780103758987895, + "grad_norm": 9.875, + "learning_rate": 8.77787406249836e-06, + "loss": 1.206111192703247, + "step": 4274 + }, + { + "epoch": 0.7783744425229817, + "grad_norm": 8.0625, + "learning_rate": 8.77677133537224e-06, + "loss": 0.9205513000488281, + "step": 4276 + }, + { + "epoch": 0.7787385091471739, + "grad_norm": 10.375, + "learning_rate": 8.775668200766186e-06, + "loss": 1.348332405090332, + "step": 4278 + }, + { + "epoch": 0.7791025757713662, + "grad_norm": 6.5, + "learning_rate": 8.774564658842066e-06, + "loss": 1.286337971687317, + "step": 4280 + }, + { + "epoch": 0.7794666423955584, + "grad_norm": 37.5, + "learning_rate": 8.773460709761831e-06, + "loss": 1.2027983665466309, + "step": 4282 + }, + { + "epoch": 0.7798307090197506, + "grad_norm": 56.75, + "learning_rate": 8.772356353687474e-06, + "loss": 1.7031192779541016, + "step": 4284 + }, + { + "epoch": 0.7801947756439428, + "grad_norm": 13.25, + "learning_rate": 8.771251590781059e-06, + "loss": 1.289367437362671, + "step": 4286 + }, + { + "epoch": 0.7805588422681351, + "grad_norm": 14.125, + "learning_rate": 8.770146421204704e-06, + "loss": 1.0499001741409302, + "step": 4288 + }, + { + "epoch": 0.7809229088923273, + "grad_norm": 7.0, + "learning_rate": 8.769040845120587e-06, + "loss": 1.2087839841842651, + "step": 4290 + }, + { + "epoch": 0.7812869755165195, + "grad_norm": 12.125, + "learning_rate": 8.767934862690948e-06, + "loss": 1.2550642490386963, + "step": 4292 + }, + { + "epoch": 0.7816510421407118, + "grad_norm": 12.875, + "learning_rate": 8.766828474078087e-06, + "loss": 1.7575970888137817, + "step": 4294 + }, + { + "epoch": 0.782015108764904, + "grad_norm": 8.5, + "learning_rate": 8.765721679444359e-06, + "loss": 1.30368173122406, + "step": 4296 + }, + { + "epoch": 0.7823791753890962, + "grad_norm": 8.1875, + "learning_rate": 8.764614478952185e-06, + "loss": 1.3250945806503296, + "step": 4298 + }, + { + "epoch": 0.7827432420132885, + "grad_norm": 9.0625, + "learning_rate": 8.763506872764036e-06, + "loss": 1.5625274181365967, + "step": 4300 + }, + { + "epoch": 0.7831073086374807, + "grad_norm": 12.0, + "learning_rate": 8.762398861042456e-06, + "loss": 1.3666670322418213, + "step": 4302 + }, + { + "epoch": 0.7834713752616729, + "grad_norm": 13.125, + "learning_rate": 8.761290443950037e-06, + "loss": 1.488893747329712, + "step": 4304 + }, + { + "epoch": 0.7838354418858651, + "grad_norm": 25.0, + "learning_rate": 8.760181621649438e-06, + "loss": 1.437034249305725, + "step": 4306 + }, + { + "epoch": 0.7841995085100574, + "grad_norm": 26.875, + "learning_rate": 8.75907239430337e-06, + "loss": 1.2303574085235596, + "step": 4308 + }, + { + "epoch": 0.7845635751342496, + "grad_norm": 7.90625, + "learning_rate": 8.75796276207461e-06, + "loss": 1.3229930400848389, + "step": 4310 + }, + { + "epoch": 0.7849276417584418, + "grad_norm": 8.9375, + "learning_rate": 8.756852725125993e-06, + "loss": 1.1679694652557373, + "step": 4312 + }, + { + "epoch": 0.7852917083826341, + "grad_norm": 22.125, + "learning_rate": 8.75574228362041e-06, + "loss": 1.4020464420318604, + "step": 4314 + }, + { + "epoch": 0.7856557750068263, + "grad_norm": 2.21875, + "learning_rate": 8.754631437720814e-06, + "loss": 1.1125102043151855, + "step": 4316 + }, + { + "epoch": 0.7860198416310185, + "grad_norm": 11.125, + "learning_rate": 8.753520187590222e-06, + "loss": 1.3107123374938965, + "step": 4318 + }, + { + "epoch": 0.7863839082552107, + "grad_norm": 10.5625, + "learning_rate": 8.752408533391697e-06, + "loss": 1.4200538396835327, + "step": 4320 + }, + { + "epoch": 0.786747974879403, + "grad_norm": 12.75, + "learning_rate": 8.751296475288375e-06, + "loss": 1.6238305568695068, + "step": 4322 + }, + { + "epoch": 0.7871120415035952, + "grad_norm": 18.25, + "learning_rate": 8.750184013443445e-06, + "loss": 1.7460156679153442, + "step": 4324 + }, + { + "epoch": 0.7874761081277873, + "grad_norm": 19.625, + "learning_rate": 8.749071148020159e-06, + "loss": 0.9765514135360718, + "step": 4326 + }, + { + "epoch": 0.7878401747519796, + "grad_norm": 13.9375, + "learning_rate": 8.74795787918182e-06, + "loss": 1.1897203922271729, + "step": 4328 + }, + { + "epoch": 0.7882042413761718, + "grad_norm": 9.75, + "learning_rate": 8.7468442070918e-06, + "loss": 1.2098305225372314, + "step": 4330 + }, + { + "epoch": 0.788568308000364, + "grad_norm": 9.375, + "learning_rate": 8.745730131913525e-06, + "loss": 0.8845806121826172, + "step": 4332 + }, + { + "epoch": 0.7889323746245563, + "grad_norm": 5.8125, + "learning_rate": 8.744615653810482e-06, + "loss": 0.9948340654373169, + "step": 4334 + }, + { + "epoch": 0.7892964412487485, + "grad_norm": 4.875, + "learning_rate": 8.743500772946215e-06, + "loss": 1.226870059967041, + "step": 4336 + }, + { + "epoch": 0.7896605078729407, + "grad_norm": 18.625, + "learning_rate": 8.742385489484325e-06, + "loss": 1.5136758089065552, + "step": 4338 + }, + { + "epoch": 0.7900245744971329, + "grad_norm": 16.875, + "learning_rate": 8.741269803588479e-06, + "loss": 0.9573846459388733, + "step": 4340 + }, + { + "epoch": 0.7903886411213252, + "grad_norm": 13.125, + "learning_rate": 8.7401537154224e-06, + "loss": 1.5381865501403809, + "step": 4342 + }, + { + "epoch": 0.7907527077455174, + "grad_norm": 31.875, + "learning_rate": 8.739037225149867e-06, + "loss": 1.2576684951782227, + "step": 4344 + }, + { + "epoch": 0.7911167743697096, + "grad_norm": 45.0, + "learning_rate": 8.737920332934724e-06, + "loss": 1.0145173072814941, + "step": 4346 + }, + { + "epoch": 0.7914808409939019, + "grad_norm": 14.0, + "learning_rate": 8.736803038940867e-06, + "loss": 1.7330098152160645, + "step": 4348 + }, + { + "epoch": 0.7918449076180941, + "grad_norm": 6.15625, + "learning_rate": 8.735685343332251e-06, + "loss": 0.963650643825531, + "step": 4350 + }, + { + "epoch": 0.7922089742422863, + "grad_norm": 19.125, + "learning_rate": 8.734567246272902e-06, + "loss": 1.4850554466247559, + "step": 4352 + }, + { + "epoch": 0.7925730408664786, + "grad_norm": 11.9375, + "learning_rate": 8.733448747926892e-06, + "loss": 1.444158911705017, + "step": 4354 + }, + { + "epoch": 0.7929371074906708, + "grad_norm": 8.1875, + "learning_rate": 8.732329848458357e-06, + "loss": 1.3289103507995605, + "step": 4356 + }, + { + "epoch": 0.793301174114863, + "grad_norm": 7.96875, + "learning_rate": 8.731210548031485e-06, + "loss": 1.4776029586791992, + "step": 4358 + }, + { + "epoch": 0.7936652407390552, + "grad_norm": 8.5625, + "learning_rate": 8.730090846810537e-06, + "loss": 1.4532073736190796, + "step": 4360 + }, + { + "epoch": 0.7940293073632475, + "grad_norm": 7.8125, + "learning_rate": 8.72897074495982e-06, + "loss": 1.2802387475967407, + "step": 4362 + }, + { + "epoch": 0.7943933739874397, + "grad_norm": 18.375, + "learning_rate": 8.727850242643707e-06, + "loss": 1.2998063564300537, + "step": 4364 + }, + { + "epoch": 0.7947574406116319, + "grad_norm": 24.125, + "learning_rate": 8.726729340026625e-06, + "loss": 1.371389627456665, + "step": 4366 + }, + { + "epoch": 0.7951215072358242, + "grad_norm": 6.46875, + "learning_rate": 8.725608037273063e-06, + "loss": 1.3968528509140015, + "step": 4368 + }, + { + "epoch": 0.7954855738600164, + "grad_norm": 8.125, + "learning_rate": 8.724486334547568e-06, + "loss": 1.310551404953003, + "step": 4370 + }, + { + "epoch": 0.7958496404842086, + "grad_norm": 7.6875, + "learning_rate": 8.723364232014744e-06, + "loss": 1.35434889793396, + "step": 4372 + }, + { + "epoch": 0.7962137071084009, + "grad_norm": 17.875, + "learning_rate": 8.722241729839257e-06, + "loss": 1.2660176753997803, + "step": 4374 + }, + { + "epoch": 0.7965777737325931, + "grad_norm": 12.0, + "learning_rate": 8.721118828185828e-06, + "loss": 1.4760253429412842, + "step": 4376 + }, + { + "epoch": 0.7969418403567853, + "grad_norm": 9.5625, + "learning_rate": 8.719995527219238e-06, + "loss": 1.267617106437683, + "step": 4378 + }, + { + "epoch": 0.7973059069809775, + "grad_norm": 8.6875, + "learning_rate": 8.718871827104327e-06, + "loss": 1.4835522174835205, + "step": 4380 + }, + { + "epoch": 0.7976699736051698, + "grad_norm": 10.5, + "learning_rate": 8.717747728005997e-06, + "loss": 1.3898547887802124, + "step": 4382 + }, + { + "epoch": 0.798034040229362, + "grad_norm": 9.125, + "learning_rate": 8.716623230089199e-06, + "loss": 1.2879462242126465, + "step": 4384 + }, + { + "epoch": 0.7983981068535542, + "grad_norm": 8.3125, + "learning_rate": 8.715498333518953e-06, + "loss": 1.351919412612915, + "step": 4386 + }, + { + "epoch": 0.7987621734777465, + "grad_norm": 11.125, + "learning_rate": 8.71437303846033e-06, + "loss": 1.5388047695159912, + "step": 4388 + }, + { + "epoch": 0.7991262401019387, + "grad_norm": 38.75, + "learning_rate": 8.713247345078467e-06, + "loss": 1.153308629989624, + "step": 4390 + }, + { + "epoch": 0.7994903067261309, + "grad_norm": 11.5, + "learning_rate": 8.712121253538549e-06, + "loss": 0.690681517124176, + "step": 4392 + }, + { + "epoch": 0.7998543733503231, + "grad_norm": 11.5, + "learning_rate": 8.71099476400583e-06, + "loss": 1.2446155548095703, + "step": 4394 + }, + { + "epoch": 0.8002184399745154, + "grad_norm": 10.375, + "learning_rate": 8.709867876645613e-06, + "loss": 1.4488739967346191, + "step": 4396 + }, + { + "epoch": 0.8005825065987076, + "grad_norm": 8.5625, + "learning_rate": 8.70874059162327e-06, + "loss": 1.2444548606872559, + "step": 4398 + }, + { + "epoch": 0.8009465732228997, + "grad_norm": 8.9375, + "learning_rate": 8.707612909104222e-06, + "loss": 1.2452828884124756, + "step": 4400 + }, + { + "epoch": 0.801310639847092, + "grad_norm": 36.25, + "learning_rate": 8.706484829253954e-06, + "loss": 1.4948606491088867, + "step": 4402 + }, + { + "epoch": 0.8016747064712842, + "grad_norm": 15.125, + "learning_rate": 8.705356352238003e-06, + "loss": 1.2883861064910889, + "step": 4404 + }, + { + "epoch": 0.8020387730954764, + "grad_norm": 38.5, + "learning_rate": 8.704227478221974e-06, + "loss": 1.2535079717636108, + "step": 4406 + }, + { + "epoch": 0.8024028397196687, + "grad_norm": 10.0625, + "learning_rate": 8.70309820737152e-06, + "loss": 0.8813364505767822, + "step": 4408 + }, + { + "epoch": 0.8027669063438609, + "grad_norm": 26.75, + "learning_rate": 8.70196853985236e-06, + "loss": 1.6842833757400513, + "step": 4410 + }, + { + "epoch": 0.8031309729680531, + "grad_norm": 9.9375, + "learning_rate": 8.700838475830267e-06, + "loss": 1.0274977684020996, + "step": 4412 + }, + { + "epoch": 0.8034950395922453, + "grad_norm": 3.546875, + "learning_rate": 8.699708015471071e-06, + "loss": 0.6947841644287109, + "step": 4414 + }, + { + "epoch": 0.8038591062164376, + "grad_norm": 4.0625, + "learning_rate": 8.698577158940666e-06, + "loss": 1.1942253112792969, + "step": 4416 + }, + { + "epoch": 0.8042231728406298, + "grad_norm": 12.25, + "learning_rate": 8.697445906405e-06, + "loss": 1.1912000179290771, + "step": 4418 + }, + { + "epoch": 0.804587239464822, + "grad_norm": 2.828125, + "learning_rate": 8.696314258030078e-06, + "loss": 1.1682474613189697, + "step": 4420 + }, + { + "epoch": 0.8049513060890143, + "grad_norm": 11.3125, + "learning_rate": 8.695182213981968e-06, + "loss": 1.1248276233673096, + "step": 4422 + }, + { + "epoch": 0.8053153727132065, + "grad_norm": 6.71875, + "learning_rate": 8.694049774426786e-06, + "loss": 1.4576165676116943, + "step": 4424 + }, + { + "epoch": 0.8056794393373987, + "grad_norm": 8.875, + "learning_rate": 8.692916939530722e-06, + "loss": 1.3074344396591187, + "step": 4426 + }, + { + "epoch": 0.806043505961591, + "grad_norm": 4.4375, + "learning_rate": 8.69178370946001e-06, + "loss": 1.23238205909729, + "step": 4428 + }, + { + "epoch": 0.8064075725857832, + "grad_norm": 18.625, + "learning_rate": 8.690650084380946e-06, + "loss": 1.8036983013153076, + "step": 4430 + }, + { + "epoch": 0.8067716392099754, + "grad_norm": 19.375, + "learning_rate": 8.689516064459886e-06, + "loss": 1.278721809387207, + "step": 4432 + }, + { + "epoch": 0.8071357058341676, + "grad_norm": 10.0625, + "learning_rate": 8.688381649863245e-06, + "loss": 1.2096893787384033, + "step": 4434 + }, + { + "epoch": 0.8074997724583599, + "grad_norm": 7.90625, + "learning_rate": 8.68724684075749e-06, + "loss": 1.3531429767608643, + "step": 4436 + }, + { + "epoch": 0.8078638390825521, + "grad_norm": 6.0, + "learning_rate": 8.686111637309153e-06, + "loss": 1.2938752174377441, + "step": 4438 + }, + { + "epoch": 0.8082279057067443, + "grad_norm": 3.328125, + "learning_rate": 8.68497603968482e-06, + "loss": 1.0541632175445557, + "step": 4440 + }, + { + "epoch": 0.8085919723309366, + "grad_norm": 17.625, + "learning_rate": 8.683840048051133e-06, + "loss": 1.6006433963775635, + "step": 4442 + }, + { + "epoch": 0.8089560389551288, + "grad_norm": 22.125, + "learning_rate": 8.682703662574796e-06, + "loss": 1.6809207201004028, + "step": 4444 + }, + { + "epoch": 0.809320105579321, + "grad_norm": 14.9375, + "learning_rate": 8.681566883422567e-06, + "loss": 1.2290083169937134, + "step": 4446 + }, + { + "epoch": 0.8096841722035133, + "grad_norm": 7.96875, + "learning_rate": 8.680429710761269e-06, + "loss": 1.5254971981048584, + "step": 4448 + }, + { + "epoch": 0.8100482388277055, + "grad_norm": 5.59375, + "learning_rate": 8.679292144757771e-06, + "loss": 0.9327143430709839, + "step": 4450 + }, + { + "epoch": 0.8104123054518977, + "grad_norm": 19.5, + "learning_rate": 8.67815418557901e-06, + "loss": 1.0140594244003296, + "step": 4452 + }, + { + "epoch": 0.8107763720760899, + "grad_norm": 14.4375, + "learning_rate": 8.677015833391976e-06, + "loss": 0.7433057427406311, + "step": 4454 + }, + { + "epoch": 0.8111404387002822, + "grad_norm": 4.6875, + "learning_rate": 8.675877088363715e-06, + "loss": 0.9882104396820068, + "step": 4456 + }, + { + "epoch": 0.8115045053244744, + "grad_norm": 14.375, + "learning_rate": 8.674737950661338e-06, + "loss": 1.3083375692367554, + "step": 4458 + }, + { + "epoch": 0.8118685719486666, + "grad_norm": 34.75, + "learning_rate": 8.673598420452006e-06, + "loss": 1.9675068855285645, + "step": 4460 + }, + { + "epoch": 0.8122326385728589, + "grad_norm": 13.4375, + "learning_rate": 8.672458497902943e-06, + "loss": 1.9735381603240967, + "step": 4462 + }, + { + "epoch": 0.8125967051970511, + "grad_norm": 11.375, + "learning_rate": 8.671318183181422e-06, + "loss": 2.0083773136138916, + "step": 4464 + }, + { + "epoch": 0.8129607718212433, + "grad_norm": 8.5, + "learning_rate": 8.670177476454787e-06, + "loss": 1.4760448932647705, + "step": 4466 + }, + { + "epoch": 0.8133248384454355, + "grad_norm": 7.5, + "learning_rate": 8.669036377890425e-06, + "loss": 1.4627360105514526, + "step": 4468 + }, + { + "epoch": 0.8136889050696278, + "grad_norm": 16.5, + "learning_rate": 8.667894887655794e-06, + "loss": 1.3630309104919434, + "step": 4470 + }, + { + "epoch": 0.81405297169382, + "grad_norm": 14.5, + "learning_rate": 8.6667530059184e-06, + "loss": 1.4635146856307983, + "step": 4472 + }, + { + "epoch": 0.8144170383180122, + "grad_norm": 9.9375, + "learning_rate": 8.665610732845809e-06, + "loss": 1.494768500328064, + "step": 4474 + }, + { + "epoch": 0.8147811049422045, + "grad_norm": 10.25, + "learning_rate": 8.664468068605648e-06, + "loss": 1.092423915863037, + "step": 4476 + }, + { + "epoch": 0.8151451715663967, + "grad_norm": 14.0625, + "learning_rate": 8.663325013365591e-06, + "loss": 1.326206922531128, + "step": 4478 + }, + { + "epoch": 0.8155092381905888, + "grad_norm": 6.4375, + "learning_rate": 8.662181567293386e-06, + "loss": 1.8527684211730957, + "step": 4480 + }, + { + "epoch": 0.8158733048147812, + "grad_norm": 9.5, + "learning_rate": 8.661037730556823e-06, + "loss": 1.338208556175232, + "step": 4482 + }, + { + "epoch": 0.8162373714389733, + "grad_norm": 9.4375, + "learning_rate": 8.65989350332376e-06, + "loss": 1.6235486268997192, + "step": 4484 + }, + { + "epoch": 0.8166014380631655, + "grad_norm": 15.6875, + "learning_rate": 8.658748885762103e-06, + "loss": 1.3146204948425293, + "step": 4486 + }, + { + "epoch": 0.8169655046873577, + "grad_norm": 28.375, + "learning_rate": 8.65760387803982e-06, + "loss": 1.09885835647583, + "step": 4488 + }, + { + "epoch": 0.81732957131155, + "grad_norm": 47.5, + "learning_rate": 8.656458480324942e-06, + "loss": 0.6355609893798828, + "step": 4490 + }, + { + "epoch": 0.8176936379357422, + "grad_norm": 15.75, + "learning_rate": 8.655312692785545e-06, + "loss": 1.4863014221191406, + "step": 4492 + }, + { + "epoch": 0.8180577045599344, + "grad_norm": 29.75, + "learning_rate": 8.654166515589773e-06, + "loss": 1.5574688911437988, + "step": 4494 + }, + { + "epoch": 0.8184217711841267, + "grad_norm": 11.9375, + "learning_rate": 8.653019948905819e-06, + "loss": 1.5795235633850098, + "step": 4496 + }, + { + "epoch": 0.8187858378083189, + "grad_norm": 12.5625, + "learning_rate": 8.651872992901942e-06, + "loss": 1.678736686706543, + "step": 4498 + }, + { + "epoch": 0.8191499044325111, + "grad_norm": 13.8125, + "learning_rate": 8.650725647746449e-06, + "loss": 1.5295724868774414, + "step": 4500 + }, + { + "epoch": 0.8195139710567034, + "grad_norm": 5.1875, + "learning_rate": 8.649577913607709e-06, + "loss": 1.323065996170044, + "step": 4502 + }, + { + "epoch": 0.8198780376808956, + "grad_norm": 8.0, + "learning_rate": 8.648429790654149e-06, + "loss": 1.0482020378112793, + "step": 4504 + }, + { + "epoch": 0.8202421043050878, + "grad_norm": 13.4375, + "learning_rate": 8.647281279054251e-06, + "loss": 1.4844077825546265, + "step": 4506 + }, + { + "epoch": 0.82060617092928, + "grad_norm": 15.625, + "learning_rate": 8.646132378976553e-06, + "loss": 1.617138147354126, + "step": 4508 + }, + { + "epoch": 0.8209702375534723, + "grad_norm": 51.75, + "learning_rate": 8.644983090589651e-06, + "loss": 0.5058410167694092, + "step": 4510 + }, + { + "epoch": 0.8213343041776645, + "grad_norm": 13.625, + "learning_rate": 8.643833414062202e-06, + "loss": 1.4589542150497437, + "step": 4512 + }, + { + "epoch": 0.8216983708018567, + "grad_norm": 6.9375, + "learning_rate": 8.642683349562913e-06, + "loss": 1.2672033309936523, + "step": 4514 + }, + { + "epoch": 0.822062437426049, + "grad_norm": 19.25, + "learning_rate": 8.641532897260552e-06, + "loss": 1.3366519212722778, + "step": 4516 + }, + { + "epoch": 0.8224265040502412, + "grad_norm": 21.75, + "learning_rate": 8.640382057323944e-06, + "loss": 1.3999751806259155, + "step": 4518 + }, + { + "epoch": 0.8227905706744334, + "grad_norm": 90.0, + "learning_rate": 8.639230829921968e-06, + "loss": 1.2534772157669067, + "step": 4520 + }, + { + "epoch": 0.8231546372986257, + "grad_norm": 8.25, + "learning_rate": 8.638079215223564e-06, + "loss": 0.44437363743782043, + "step": 4522 + }, + { + "epoch": 0.8235187039228179, + "grad_norm": 7.75, + "learning_rate": 8.63692721339773e-06, + "loss": 1.2233778238296509, + "step": 4524 + }, + { + "epoch": 0.8238827705470101, + "grad_norm": 15.6875, + "learning_rate": 8.63577482461351e-06, + "loss": 1.2223553657531738, + "step": 4526 + }, + { + "epoch": 0.8242468371712023, + "grad_norm": 9.5, + "learning_rate": 8.634622049040018e-06, + "loss": 1.6178956031799316, + "step": 4528 + }, + { + "epoch": 0.8246109037953946, + "grad_norm": 14.4375, + "learning_rate": 8.633468886846417e-06, + "loss": 1.471444845199585, + "step": 4530 + }, + { + "epoch": 0.8249749704195868, + "grad_norm": 6.9375, + "learning_rate": 8.632315338201929e-06, + "loss": 1.151768684387207, + "step": 4532 + }, + { + "epoch": 0.825339037043779, + "grad_norm": 7.96875, + "learning_rate": 8.631161403275833e-06, + "loss": 1.2900487184524536, + "step": 4534 + }, + { + "epoch": 0.8257031036679713, + "grad_norm": 8.75, + "learning_rate": 8.630007082237466e-06, + "loss": 1.1465637683868408, + "step": 4536 + }, + { + "epoch": 0.8260671702921635, + "grad_norm": 12.75, + "learning_rate": 8.628852375256216e-06, + "loss": 1.3564552068710327, + "step": 4538 + }, + { + "epoch": 0.8264312369163557, + "grad_norm": 13.6875, + "learning_rate": 8.627697282501535e-06, + "loss": 1.4555790424346924, + "step": 4540 + }, + { + "epoch": 0.8267953035405479, + "grad_norm": 12.625, + "learning_rate": 8.626541804142926e-06, + "loss": 1.5490326881408691, + "step": 4542 + }, + { + "epoch": 0.8271593701647402, + "grad_norm": 6.90625, + "learning_rate": 8.625385940349953e-06, + "loss": 1.2507295608520508, + "step": 4544 + }, + { + "epoch": 0.8275234367889324, + "grad_norm": 15.75, + "learning_rate": 8.624229691292232e-06, + "loss": 1.327358365058899, + "step": 4546 + }, + { + "epoch": 0.8278875034131246, + "grad_norm": 69.5, + "learning_rate": 8.62307305713944e-06, + "loss": 1.1240493059158325, + "step": 4548 + }, + { + "epoch": 0.8282515700373169, + "grad_norm": 3.3125, + "learning_rate": 8.621916038061304e-06, + "loss": 0.9504419565200806, + "step": 4550 + }, + { + "epoch": 0.8286156366615091, + "grad_norm": 19.0, + "learning_rate": 8.620758634227617e-06, + "loss": 1.206761121749878, + "step": 4552 + }, + { + "epoch": 0.8289797032857013, + "grad_norm": 17.875, + "learning_rate": 8.619600845808222e-06, + "loss": 1.7506375312805176, + "step": 4554 + }, + { + "epoch": 0.8293437699098936, + "grad_norm": 16.25, + "learning_rate": 8.61844267297302e-06, + "loss": 1.412170648574829, + "step": 4556 + }, + { + "epoch": 0.8297078365340858, + "grad_norm": 9.8125, + "learning_rate": 8.617284115891967e-06, + "loss": 1.3583259582519531, + "step": 4558 + }, + { + "epoch": 0.830071903158278, + "grad_norm": 14.875, + "learning_rate": 8.616125174735074e-06, + "loss": 1.4089040756225586, + "step": 4560 + }, + { + "epoch": 0.8304359697824701, + "grad_norm": 9.875, + "learning_rate": 8.614965849672416e-06, + "loss": 1.405224084854126, + "step": 4562 + }, + { + "epoch": 0.8308000364066624, + "grad_norm": 19.5, + "learning_rate": 8.613806140874119e-06, + "loss": 0.9720988273620605, + "step": 4564 + }, + { + "epoch": 0.8311641030308546, + "grad_norm": 18.5, + "learning_rate": 8.61264604851036e-06, + "loss": 1.3065283298492432, + "step": 4566 + }, + { + "epoch": 0.8315281696550468, + "grad_norm": 10.3125, + "learning_rate": 8.611485572751386e-06, + "loss": 1.6876684427261353, + "step": 4568 + }, + { + "epoch": 0.8318922362792391, + "grad_norm": 9.3125, + "learning_rate": 8.610324713767484e-06, + "loss": 1.3253494501113892, + "step": 4570 + }, + { + "epoch": 0.8322563029034313, + "grad_norm": 11.4375, + "learning_rate": 8.609163471729009e-06, + "loss": 1.3207170963287354, + "step": 4572 + }, + { + "epoch": 0.8326203695276235, + "grad_norm": 6.0625, + "learning_rate": 8.608001846806372e-06, + "loss": 1.431268334388733, + "step": 4574 + }, + { + "epoch": 0.8329844361518158, + "grad_norm": 2.21875, + "learning_rate": 8.606839839170029e-06, + "loss": 1.201312780380249, + "step": 4576 + }, + { + "epoch": 0.833348502776008, + "grad_norm": 14.375, + "learning_rate": 8.605677448990507e-06, + "loss": 1.3415477275848389, + "step": 4578 + }, + { + "epoch": 0.8337125694002002, + "grad_norm": 21.125, + "learning_rate": 8.604514676438377e-06, + "loss": 1.891241192817688, + "step": 4580 + }, + { + "epoch": 0.8340766360243924, + "grad_norm": 18.875, + "learning_rate": 8.603351521684276e-06, + "loss": 1.4462015628814697, + "step": 4582 + }, + { + "epoch": 0.8344407026485847, + "grad_norm": 29.625, + "learning_rate": 8.60218798489889e-06, + "loss": 1.554612159729004, + "step": 4584 + }, + { + "epoch": 0.8348047692727769, + "grad_norm": 10.0, + "learning_rate": 8.601024066252959e-06, + "loss": 1.2380497455596924, + "step": 4586 + }, + { + "epoch": 0.8351688358969691, + "grad_norm": 16.875, + "learning_rate": 8.599859765917291e-06, + "loss": 1.8688032627105713, + "step": 4588 + }, + { + "epoch": 0.8355329025211614, + "grad_norm": 13.5625, + "learning_rate": 8.598695084062735e-06, + "loss": 1.8470561504364014, + "step": 4590 + }, + { + "epoch": 0.8358969691453536, + "grad_norm": 3.25, + "learning_rate": 8.59753002086021e-06, + "loss": 1.0626814365386963, + "step": 4592 + }, + { + "epoch": 0.8362610357695458, + "grad_norm": 3.515625, + "learning_rate": 8.59636457648068e-06, + "loss": 1.0638459920883179, + "step": 4594 + }, + { + "epoch": 0.8366251023937381, + "grad_norm": 6.15625, + "learning_rate": 8.595198751095171e-06, + "loss": 0.9944407343864441, + "step": 4596 + }, + { + "epoch": 0.8369891690179303, + "grad_norm": 20.5, + "learning_rate": 8.594032544874764e-06, + "loss": 1.381744623184204, + "step": 4598 + }, + { + "epoch": 0.8373532356421225, + "grad_norm": 45.75, + "learning_rate": 8.592865957990592e-06, + "loss": 1.9402213096618652, + "step": 4600 + }, + { + "epoch": 0.8377173022663147, + "grad_norm": 18.25, + "learning_rate": 8.591698990613848e-06, + "loss": 1.8793973922729492, + "step": 4602 + }, + { + "epoch": 0.838081368890507, + "grad_norm": 7.03125, + "learning_rate": 8.590531642915783e-06, + "loss": 1.4722939729690552, + "step": 4604 + }, + { + "epoch": 0.8384454355146992, + "grad_norm": 10.25, + "learning_rate": 8.589363915067697e-06, + "loss": 1.2915740013122559, + "step": 4606 + }, + { + "epoch": 0.8388095021388914, + "grad_norm": 117.5, + "learning_rate": 8.588195807240949e-06, + "loss": 1.3434512615203857, + "step": 4608 + }, + { + "epoch": 0.8391735687630837, + "grad_norm": 22.375, + "learning_rate": 8.587027319606956e-06, + "loss": 1.4871494770050049, + "step": 4610 + }, + { + "epoch": 0.8395376353872759, + "grad_norm": 8.8125, + "learning_rate": 8.585858452337188e-06, + "loss": 1.6669485569000244, + "step": 4612 + }, + { + "epoch": 0.8399017020114681, + "grad_norm": 3.375, + "learning_rate": 8.584689205603171e-06, + "loss": 1.0657391548156738, + "step": 4614 + }, + { + "epoch": 0.8402657686356603, + "grad_norm": 9.75, + "learning_rate": 8.583519579576489e-06, + "loss": 1.0997756719589233, + "step": 4616 + }, + { + "epoch": 0.8406298352598526, + "grad_norm": 21.625, + "learning_rate": 8.58234957442878e-06, + "loss": 1.145671010017395, + "step": 4618 + }, + { + "epoch": 0.8409939018840448, + "grad_norm": 17.125, + "learning_rate": 8.581179190331735e-06, + "loss": 1.6979058980941772, + "step": 4620 + }, + { + "epoch": 0.841357968508237, + "grad_norm": 8.125, + "learning_rate": 8.580008427457102e-06, + "loss": 1.4544659852981567, + "step": 4622 + }, + { + "epoch": 0.8417220351324293, + "grad_norm": 15.6875, + "learning_rate": 8.578837285976691e-06, + "loss": 1.3736255168914795, + "step": 4624 + }, + { + "epoch": 0.8420861017566215, + "grad_norm": 15.1875, + "learning_rate": 8.57766576606236e-06, + "loss": 1.7360119819641113, + "step": 4626 + }, + { + "epoch": 0.8424501683808137, + "grad_norm": 13.4375, + "learning_rate": 8.576493867886022e-06, + "loss": 1.564675211906433, + "step": 4628 + }, + { + "epoch": 0.842814235005006, + "grad_norm": 16.25, + "learning_rate": 8.575321591619652e-06, + "loss": 1.6518443822860718, + "step": 4630 + }, + { + "epoch": 0.8431783016291982, + "grad_norm": 11.3125, + "learning_rate": 8.574148937435274e-06, + "loss": 1.9839905500411987, + "step": 4632 + }, + { + "epoch": 0.8435423682533904, + "grad_norm": 19.125, + "learning_rate": 8.572975905504972e-06, + "loss": 1.3030701875686646, + "step": 4634 + }, + { + "epoch": 0.8439064348775825, + "grad_norm": 42.25, + "learning_rate": 8.571802496000885e-06, + "loss": 0.9104695916175842, + "step": 4636 + }, + { + "epoch": 0.8442705015017749, + "grad_norm": 10.5, + "learning_rate": 8.570628709095203e-06, + "loss": 1.4311996698379517, + "step": 4638 + }, + { + "epoch": 0.844634568125967, + "grad_norm": 9.1875, + "learning_rate": 8.569454544960174e-06, + "loss": 1.2529677152633667, + "step": 4640 + }, + { + "epoch": 0.8449986347501592, + "grad_norm": 11.3125, + "learning_rate": 8.568280003768107e-06, + "loss": 1.3084317445755005, + "step": 4642 + }, + { + "epoch": 0.8453627013743515, + "grad_norm": 7.6875, + "learning_rate": 8.567105085691357e-06, + "loss": 1.383557915687561, + "step": 4644 + }, + { + "epoch": 0.8457267679985437, + "grad_norm": 16.75, + "learning_rate": 8.565929790902337e-06, + "loss": 1.4977948665618896, + "step": 4646 + }, + { + "epoch": 0.8460908346227359, + "grad_norm": 10.4375, + "learning_rate": 8.564754119573519e-06, + "loss": 1.5442651510238647, + "step": 4648 + }, + { + "epoch": 0.8464549012469282, + "grad_norm": 56.5, + "learning_rate": 8.563578071877429e-06, + "loss": 0.5610999464988708, + "step": 4650 + }, + { + "epoch": 0.8468189678711204, + "grad_norm": 33.5, + "learning_rate": 8.562401647986646e-06, + "loss": 1.4163718223571777, + "step": 4652 + }, + { + "epoch": 0.8471830344953126, + "grad_norm": 8.25, + "learning_rate": 8.561224848073808e-06, + "loss": 1.2096872329711914, + "step": 4654 + }, + { + "epoch": 0.8475471011195048, + "grad_norm": 14.0, + "learning_rate": 8.560047672311602e-06, + "loss": 1.3381762504577637, + "step": 4656 + }, + { + "epoch": 0.8479111677436971, + "grad_norm": 5.34375, + "learning_rate": 8.558870120872774e-06, + "loss": 1.2475321292877197, + "step": 4658 + }, + { + "epoch": 0.8482752343678893, + "grad_norm": 1504.0, + "learning_rate": 8.557692193930125e-06, + "loss": 1.5093698501586914, + "step": 4660 + }, + { + "epoch": 0.8486393009920815, + "grad_norm": 10.0625, + "learning_rate": 8.556513891656516e-06, + "loss": 1.805031180381775, + "step": 4662 + }, + { + "epoch": 0.8490033676162738, + "grad_norm": 8.3125, + "learning_rate": 8.55533521422485e-06, + "loss": 0.82941734790802, + "step": 4664 + }, + { + "epoch": 0.849367434240466, + "grad_norm": 10.0625, + "learning_rate": 8.554156161808099e-06, + "loss": 0.9767967462539673, + "step": 4666 + }, + { + "epoch": 0.8497315008646582, + "grad_norm": 14.625, + "learning_rate": 8.552976734579281e-06, + "loss": 1.5947502851486206, + "step": 4668 + }, + { + "epoch": 0.8500955674888505, + "grad_norm": 21.75, + "learning_rate": 8.551796932711476e-06, + "loss": 1.6526554822921753, + "step": 4670 + }, + { + "epoch": 0.8504596341130427, + "grad_norm": 27.25, + "learning_rate": 8.550616756377809e-06, + "loss": 1.4393222332000732, + "step": 4672 + }, + { + "epoch": 0.8508237007372349, + "grad_norm": 11.8125, + "learning_rate": 8.549436205751474e-06, + "loss": 1.4700366258621216, + "step": 4674 + }, + { + "epoch": 0.8511877673614271, + "grad_norm": 15.5, + "learning_rate": 8.548255281005704e-06, + "loss": 1.7356404066085815, + "step": 4676 + }, + { + "epoch": 0.8515518339856194, + "grad_norm": 19.5, + "learning_rate": 8.5470739823138e-06, + "loss": 1.289542317390442, + "step": 4678 + }, + { + "epoch": 0.8519159006098116, + "grad_norm": 8.5625, + "learning_rate": 8.545892309849113e-06, + "loss": 1.308435320854187, + "step": 4680 + }, + { + "epoch": 0.8522799672340038, + "grad_norm": 9.875, + "learning_rate": 8.544710263785046e-06, + "loss": 1.412814736366272, + "step": 4682 + }, + { + "epoch": 0.8526440338581961, + "grad_norm": 9.25, + "learning_rate": 8.543527844295062e-06, + "loss": 1.3229460716247559, + "step": 4684 + }, + { + "epoch": 0.8530081004823883, + "grad_norm": 2.625, + "learning_rate": 8.542345051552672e-06, + "loss": 1.1141674518585205, + "step": 4686 + }, + { + "epoch": 0.8533721671065805, + "grad_norm": 12.4375, + "learning_rate": 8.54116188573145e-06, + "loss": 1.0369725227355957, + "step": 4688 + }, + { + "epoch": 0.8537362337307728, + "grad_norm": 14.0, + "learning_rate": 8.53997834700502e-06, + "loss": 1.4032094478607178, + "step": 4690 + }, + { + "epoch": 0.854100300354965, + "grad_norm": 8.625, + "learning_rate": 8.538794435547063e-06, + "loss": 1.7089297771453857, + "step": 4692 + }, + { + "epoch": 0.8544643669791572, + "grad_norm": 16.625, + "learning_rate": 8.537610151531308e-06, + "loss": 1.9842157363891602, + "step": 4694 + }, + { + "epoch": 0.8548284336033494, + "grad_norm": 24.25, + "learning_rate": 8.536425495131548e-06, + "loss": 1.698715329170227, + "step": 4696 + }, + { + "epoch": 0.8551925002275417, + "grad_norm": 11.6875, + "learning_rate": 8.535240466521628e-06, + "loss": 1.3599730730056763, + "step": 4698 + }, + { + "epoch": 0.8555565668517339, + "grad_norm": 9.9375, + "learning_rate": 8.534055065875442e-06, + "loss": 1.3171544075012207, + "step": 4700 + }, + { + "epoch": 0.8559206334759261, + "grad_norm": 17.875, + "learning_rate": 8.532869293366945e-06, + "loss": 1.4855996370315552, + "step": 4702 + }, + { + "epoch": 0.8562847001001184, + "grad_norm": 7.03125, + "learning_rate": 8.531683149170144e-06, + "loss": 1.3304357528686523, + "step": 4704 + }, + { + "epoch": 0.8566487667243106, + "grad_norm": 9.75, + "learning_rate": 8.530496633459102e-06, + "loss": 1.072676420211792, + "step": 4706 + }, + { + "epoch": 0.8570128333485028, + "grad_norm": 25.75, + "learning_rate": 8.529309746407935e-06, + "loss": 1.4439506530761719, + "step": 4708 + }, + { + "epoch": 0.857376899972695, + "grad_norm": 33.75, + "learning_rate": 8.528122488190811e-06, + "loss": 1.3170230388641357, + "step": 4710 + }, + { + "epoch": 0.8577409665968873, + "grad_norm": 31.375, + "learning_rate": 8.526934858981957e-06, + "loss": 0.8731761574745178, + "step": 4712 + }, + { + "epoch": 0.8581050332210794, + "grad_norm": 6.03125, + "learning_rate": 8.525746858955657e-06, + "loss": 0.3805575370788574, + "step": 4714 + }, + { + "epoch": 0.8584690998452716, + "grad_norm": 28.5, + "learning_rate": 8.524558488286239e-06, + "loss": 1.6718380451202393, + "step": 4716 + }, + { + "epoch": 0.858833166469464, + "grad_norm": 10.1875, + "learning_rate": 8.523369747148094e-06, + "loss": 1.316070318222046, + "step": 4718 + }, + { + "epoch": 0.8591972330936561, + "grad_norm": 14.0625, + "learning_rate": 8.522180635715662e-06, + "loss": 1.9480071067810059, + "step": 4720 + }, + { + "epoch": 0.8595612997178483, + "grad_norm": 17.375, + "learning_rate": 8.520991154163448e-06, + "loss": 1.8747416734695435, + "step": 4722 + }, + { + "epoch": 0.8599253663420406, + "grad_norm": 10.6875, + "learning_rate": 8.519801302665996e-06, + "loss": 1.327930212020874, + "step": 4724 + }, + { + "epoch": 0.8602894329662328, + "grad_norm": 20.375, + "learning_rate": 8.518611081397917e-06, + "loss": 1.5011496543884277, + "step": 4726 + }, + { + "epoch": 0.860653499590425, + "grad_norm": 11.6875, + "learning_rate": 8.517420490533865e-06, + "loss": 1.3953161239624023, + "step": 4728 + }, + { + "epoch": 0.8610175662146172, + "grad_norm": 10.8125, + "learning_rate": 8.51622953024856e-06, + "loss": 1.2196745872497559, + "step": 4730 + }, + { + "epoch": 0.8613816328388095, + "grad_norm": 9.375, + "learning_rate": 8.51503820071677e-06, + "loss": 1.3489603996276855, + "step": 4732 + }, + { + "epoch": 0.8617456994630017, + "grad_norm": 11.3125, + "learning_rate": 8.513846502113317e-06, + "loss": 1.4766356945037842, + "step": 4734 + }, + { + "epoch": 0.8621097660871939, + "grad_norm": 10.6875, + "learning_rate": 8.512654434613074e-06, + "loss": 1.4373071193695068, + "step": 4736 + }, + { + "epoch": 0.8624738327113862, + "grad_norm": 6.1875, + "learning_rate": 8.51146199839098e-06, + "loss": 1.2352737188339233, + "step": 4738 + }, + { + "epoch": 0.8628378993355784, + "grad_norm": 21.75, + "learning_rate": 8.510269193622014e-06, + "loss": 1.0454188585281372, + "step": 4740 + }, + { + "epoch": 0.8632019659597706, + "grad_norm": 47.0, + "learning_rate": 8.509076020481217e-06, + "loss": 0.6144300103187561, + "step": 4742 + }, + { + "epoch": 0.8635660325839629, + "grad_norm": 9.6875, + "learning_rate": 8.507882479143681e-06, + "loss": 1.545186996459961, + "step": 4744 + }, + { + "epoch": 0.8639300992081551, + "grad_norm": 9.75, + "learning_rate": 8.506688569784557e-06, + "loss": 1.4921015501022339, + "step": 4746 + }, + { + "epoch": 0.8642941658323473, + "grad_norm": 9.3125, + "learning_rate": 8.505494292579041e-06, + "loss": 1.3835653066635132, + "step": 4748 + }, + { + "epoch": 0.8646582324565395, + "grad_norm": 47.5, + "learning_rate": 8.504299647702396e-06, + "loss": 1.2491490840911865, + "step": 4750 + }, + { + "epoch": 0.8650222990807318, + "grad_norm": 10.375, + "learning_rate": 8.503104635329924e-06, + "loss": 1.6203796863555908, + "step": 4752 + }, + { + "epoch": 0.865386365704924, + "grad_norm": 14.375, + "learning_rate": 8.50190925563699e-06, + "loss": 1.4374362230300903, + "step": 4754 + }, + { + "epoch": 0.8657504323291162, + "grad_norm": 18.5, + "learning_rate": 8.500713508799014e-06, + "loss": 1.4110661745071411, + "step": 4756 + }, + { + "epoch": 0.8661144989533085, + "grad_norm": 7.46875, + "learning_rate": 8.499517394991466e-06, + "loss": 1.1272107362747192, + "step": 4758 + }, + { + "epoch": 0.8664785655775007, + "grad_norm": 9.6875, + "learning_rate": 8.498320914389865e-06, + "loss": 1.2199032306671143, + "step": 4760 + }, + { + "epoch": 0.8668426322016929, + "grad_norm": 6.3125, + "learning_rate": 8.4971240671698e-06, + "loss": 1.4588050842285156, + "step": 4762 + }, + { + "epoch": 0.8672066988258852, + "grad_norm": 8.5625, + "learning_rate": 8.495926853506897e-06, + "loss": 1.4851653575897217, + "step": 4764 + }, + { + "epoch": 0.8675707654500774, + "grad_norm": 34.0, + "learning_rate": 8.494729273576842e-06, + "loss": 1.281112790107727, + "step": 4766 + }, + { + "epoch": 0.8679348320742696, + "grad_norm": 12.3125, + "learning_rate": 8.493531327555378e-06, + "loss": 1.2939958572387695, + "step": 4768 + }, + { + "epoch": 0.8682988986984618, + "grad_norm": 10.375, + "learning_rate": 8.492333015618295e-06, + "loss": 1.711450219154358, + "step": 4770 + }, + { + "epoch": 0.8686629653226541, + "grad_norm": 9.9375, + "learning_rate": 8.491134337941442e-06, + "loss": 1.1526232957839966, + "step": 4772 + }, + { + "epoch": 0.8690270319468463, + "grad_norm": 8.4375, + "learning_rate": 8.489935294700722e-06, + "loss": 1.4241366386413574, + "step": 4774 + }, + { + "epoch": 0.8693910985710385, + "grad_norm": 6.5625, + "learning_rate": 8.48873588607209e-06, + "loss": 1.183090090751648, + "step": 4776 + }, + { + "epoch": 0.8697551651952308, + "grad_norm": 21.5, + "learning_rate": 8.487536112231548e-06, + "loss": 1.3121047019958496, + "step": 4778 + }, + { + "epoch": 0.870119231819423, + "grad_norm": 10.4375, + "learning_rate": 8.486335973355168e-06, + "loss": 1.1925883293151855, + "step": 4780 + }, + { + "epoch": 0.8704832984436152, + "grad_norm": 11.4375, + "learning_rate": 8.485135469619058e-06, + "loss": 1.3835647106170654, + "step": 4782 + }, + { + "epoch": 0.8708473650678074, + "grad_norm": 12.625, + "learning_rate": 8.483934601199391e-06, + "loss": 1.4478524923324585, + "step": 4784 + }, + { + "epoch": 0.8712114316919997, + "grad_norm": 10.875, + "learning_rate": 8.482733368272385e-06, + "loss": 1.408987045288086, + "step": 4786 + }, + { + "epoch": 0.8715754983161919, + "grad_norm": 9.25, + "learning_rate": 8.48153177101432e-06, + "loss": 1.3271706104278564, + "step": 4788 + }, + { + "epoch": 0.871939564940384, + "grad_norm": 14.625, + "learning_rate": 8.480329809601521e-06, + "loss": 1.3561227321624756, + "step": 4790 + }, + { + "epoch": 0.8723036315645764, + "grad_norm": 7.875, + "learning_rate": 8.47912748421038e-06, + "loss": 1.4736913442611694, + "step": 4792 + }, + { + "epoch": 0.8726676981887685, + "grad_norm": 20.25, + "learning_rate": 8.477924795017324e-06, + "loss": 1.3741657733917236, + "step": 4794 + }, + { + "epoch": 0.8730317648129607, + "grad_norm": 17.25, + "learning_rate": 8.476721742198848e-06, + "loss": 2.122314929962158, + "step": 4796 + }, + { + "epoch": 0.873395831437153, + "grad_norm": 7.15625, + "learning_rate": 8.475518325931493e-06, + "loss": 1.1818575859069824, + "step": 4798 + }, + { + "epoch": 0.8737598980613452, + "grad_norm": 21.25, + "learning_rate": 8.474314546391855e-06, + "loss": 1.8577296733856201, + "step": 4800 + }, + { + "epoch": 0.8741239646855374, + "grad_norm": 9.75, + "learning_rate": 8.473110403756585e-06, + "loss": 1.5991967916488647, + "step": 4802 + }, + { + "epoch": 0.8744880313097296, + "grad_norm": 6.5, + "learning_rate": 8.47190589820239e-06, + "loss": 1.3095811605453491, + "step": 4804 + }, + { + "epoch": 0.8748520979339219, + "grad_norm": 9.375, + "learning_rate": 8.470701029906019e-06, + "loss": 1.3215951919555664, + "step": 4806 + }, + { + "epoch": 0.8752161645581141, + "grad_norm": 12.25, + "learning_rate": 8.469495799044284e-06, + "loss": 1.3220648765563965, + "step": 4808 + }, + { + "epoch": 0.8755802311823063, + "grad_norm": 17.125, + "learning_rate": 8.46829020579405e-06, + "loss": 0.9009023904800415, + "step": 4810 + }, + { + "epoch": 0.8759442978064986, + "grad_norm": 5.875, + "learning_rate": 8.467084250332231e-06, + "loss": 1.3488471508026123, + "step": 4812 + }, + { + "epoch": 0.8763083644306908, + "grad_norm": 11.3125, + "learning_rate": 8.465877932835796e-06, + "loss": 1.3471102714538574, + "step": 4814 + }, + { + "epoch": 0.876672431054883, + "grad_norm": 38.25, + "learning_rate": 8.464671253481766e-06, + "loss": 1.6553246974945068, + "step": 4816 + }, + { + "epoch": 0.8770364976790753, + "grad_norm": 20.5, + "learning_rate": 8.463464212447221e-06, + "loss": 1.4767603874206543, + "step": 4818 + }, + { + "epoch": 0.8774005643032675, + "grad_norm": 26.625, + "learning_rate": 8.462256809909285e-06, + "loss": 1.8642398118972778, + "step": 4820 + }, + { + "epoch": 0.8777646309274597, + "grad_norm": 17.0, + "learning_rate": 8.461049046045143e-06, + "loss": 1.7345826625823975, + "step": 4822 + }, + { + "epoch": 0.8781286975516519, + "grad_norm": 84.5, + "learning_rate": 8.459840921032025e-06, + "loss": 1.211651086807251, + "step": 4824 + }, + { + "epoch": 0.8784927641758442, + "grad_norm": 23.25, + "learning_rate": 8.458632435047221e-06, + "loss": 1.039841651916504, + "step": 4826 + }, + { + "epoch": 0.8788568308000364, + "grad_norm": 18.125, + "learning_rate": 8.45742358826807e-06, + "loss": 1.2840505838394165, + "step": 4828 + }, + { + "epoch": 0.8792208974242286, + "grad_norm": 25.75, + "learning_rate": 8.456214380871968e-06, + "loss": 1.78365159034729, + "step": 4830 + }, + { + "epoch": 0.8795849640484209, + "grad_norm": 8.625, + "learning_rate": 8.45500481303636e-06, + "loss": 0.8666900396347046, + "step": 4832 + }, + { + "epoch": 0.8799490306726131, + "grad_norm": 24.375, + "learning_rate": 8.453794884938745e-06, + "loss": 1.4163602590560913, + "step": 4834 + }, + { + "epoch": 0.8803130972968053, + "grad_norm": 4.3125, + "learning_rate": 8.452584596756674e-06, + "loss": 0.9222185611724854, + "step": 4836 + }, + { + "epoch": 0.8806771639209976, + "grad_norm": 13.6875, + "learning_rate": 8.451373948667754e-06, + "loss": 1.1016875505447388, + "step": 4838 + }, + { + "epoch": 0.8810412305451898, + "grad_norm": 9.0, + "learning_rate": 8.45016294084964e-06, + "loss": 1.2642927169799805, + "step": 4840 + }, + { + "epoch": 0.881405297169382, + "grad_norm": 25.25, + "learning_rate": 8.448951573480044e-06, + "loss": 1.3859792947769165, + "step": 4842 + }, + { + "epoch": 0.8817693637935742, + "grad_norm": 10.9375, + "learning_rate": 8.447739846736732e-06, + "loss": 0.9050328731536865, + "step": 4844 + }, + { + "epoch": 0.8821334304177665, + "grad_norm": 8.4375, + "learning_rate": 8.446527760797514e-06, + "loss": 1.337114930152893, + "step": 4846 + }, + { + "epoch": 0.8824974970419587, + "grad_norm": 9.625, + "learning_rate": 8.445315315840263e-06, + "loss": 0.8304621577262878, + "step": 4848 + }, + { + "epoch": 0.8828615636661509, + "grad_norm": 9.875, + "learning_rate": 8.4441025120429e-06, + "loss": 1.2512221336364746, + "step": 4850 + }, + { + "epoch": 0.8832256302903432, + "grad_norm": 17.875, + "learning_rate": 8.4428893495834e-06, + "loss": 1.6167973279953003, + "step": 4852 + }, + { + "epoch": 0.8835896969145354, + "grad_norm": 13.875, + "learning_rate": 8.441675828639785e-06, + "loss": 0.732928991317749, + "step": 4854 + }, + { + "epoch": 0.8839537635387276, + "grad_norm": 10.8125, + "learning_rate": 8.44046194939014e-06, + "loss": 1.6956959962844849, + "step": 4856 + }, + { + "epoch": 0.8843178301629198, + "grad_norm": 12.625, + "learning_rate": 8.439247712012593e-06, + "loss": 1.5149354934692383, + "step": 4858 + }, + { + "epoch": 0.8846818967871121, + "grad_norm": 28.375, + "learning_rate": 8.438033116685329e-06, + "loss": 1.8743064403533936, + "step": 4860 + }, + { + "epoch": 0.8850459634113043, + "grad_norm": 13.6875, + "learning_rate": 8.436818163586588e-06, + "loss": 1.7524183988571167, + "step": 4862 + }, + { + "epoch": 0.8854100300354965, + "grad_norm": 4.375, + "learning_rate": 8.435602852894656e-06, + "loss": 1.2591854333877563, + "step": 4864 + }, + { + "epoch": 0.8857740966596888, + "grad_norm": 5.875, + "learning_rate": 8.434387184787874e-06, + "loss": 0.7776451706886292, + "step": 4866 + }, + { + "epoch": 0.886138163283881, + "grad_norm": 11.125, + "learning_rate": 8.43317115944464e-06, + "loss": 0.9579576253890991, + "step": 4868 + }, + { + "epoch": 0.8865022299080731, + "grad_norm": 10.6875, + "learning_rate": 8.431954777043398e-06, + "loss": 0.9780623912811279, + "step": 4870 + }, + { + "epoch": 0.8868662965322655, + "grad_norm": 6.0, + "learning_rate": 8.430738037762651e-06, + "loss": 1.3000987768173218, + "step": 4872 + }, + { + "epoch": 0.8872303631564576, + "grad_norm": 15.25, + "learning_rate": 8.429520941780946e-06, + "loss": 1.4808517694473267, + "step": 4874 + }, + { + "epoch": 0.8875944297806498, + "grad_norm": 11.125, + "learning_rate": 8.428303489276888e-06, + "loss": 1.4657047986984253, + "step": 4876 + }, + { + "epoch": 0.887958496404842, + "grad_norm": 7.96875, + "learning_rate": 8.427085680429137e-06, + "loss": 1.5792397260665894, + "step": 4878 + }, + { + "epoch": 0.8883225630290343, + "grad_norm": 7.96875, + "learning_rate": 8.425867515416396e-06, + "loss": 1.1995391845703125, + "step": 4880 + }, + { + "epoch": 0.8886866296532265, + "grad_norm": 6.78125, + "learning_rate": 8.424648994417427e-06, + "loss": 1.1995643377304077, + "step": 4882 + }, + { + "epoch": 0.8890506962774187, + "grad_norm": 5.53125, + "learning_rate": 8.423430117611047e-06, + "loss": 1.083752155303955, + "step": 4884 + }, + { + "epoch": 0.889414762901611, + "grad_norm": 3.4375, + "learning_rate": 8.42221088517612e-06, + "loss": 1.5045748949050903, + "step": 4886 + }, + { + "epoch": 0.8897788295258032, + "grad_norm": 10.9375, + "learning_rate": 8.420991297291556e-06, + "loss": 1.0758121013641357, + "step": 4888 + }, + { + "epoch": 0.8901428961499954, + "grad_norm": 9.75, + "learning_rate": 8.419771354136335e-06, + "loss": 1.478236198425293, + "step": 4890 + }, + { + "epoch": 0.8905069627741877, + "grad_norm": 52.25, + "learning_rate": 8.418551055889472e-06, + "loss": 1.5452797412872314, + "step": 4892 + }, + { + "epoch": 0.8908710293983799, + "grad_norm": 9.875, + "learning_rate": 8.417330402730047e-06, + "loss": 1.2707470655441284, + "step": 4894 + }, + { + "epoch": 0.8912350960225721, + "grad_norm": 8.3125, + "learning_rate": 8.416109394837178e-06, + "loss": 0.7832297086715698, + "step": 4896 + }, + { + "epoch": 0.8915991626467643, + "grad_norm": 15.9375, + "learning_rate": 8.414888032390049e-06, + "loss": 0.6458851099014282, + "step": 4898 + }, + { + "epoch": 0.8919632292709566, + "grad_norm": 3.515625, + "learning_rate": 8.413666315567888e-06, + "loss": 0.9119938611984253, + "step": 4900 + }, + { + "epoch": 0.8923272958951488, + "grad_norm": 9.9375, + "learning_rate": 8.412444244549975e-06, + "loss": 1.2867916822433472, + "step": 4902 + }, + { + "epoch": 0.892691362519341, + "grad_norm": 44.75, + "learning_rate": 8.411221819515646e-06, + "loss": 1.285341739654541, + "step": 4904 + }, + { + "epoch": 0.8930554291435333, + "grad_norm": 24.75, + "learning_rate": 8.40999904064429e-06, + "loss": 1.0806200504302979, + "step": 4906 + }, + { + "epoch": 0.8934194957677255, + "grad_norm": 10.1875, + "learning_rate": 8.408775908115339e-06, + "loss": 1.166571021080017, + "step": 4908 + }, + { + "epoch": 0.8937835623919177, + "grad_norm": 14.8125, + "learning_rate": 8.407552422108287e-06, + "loss": 1.7610280513763428, + "step": 4910 + }, + { + "epoch": 0.89414762901611, + "grad_norm": 6.46875, + "learning_rate": 8.406328582802672e-06, + "loss": 1.1678876876831055, + "step": 4912 + }, + { + "epoch": 0.8945116956403022, + "grad_norm": 10.4375, + "learning_rate": 8.405104390378091e-06, + "loss": 1.4904696941375732, + "step": 4914 + }, + { + "epoch": 0.8948757622644944, + "grad_norm": 4.90625, + "learning_rate": 8.403879845014187e-06, + "loss": 1.3808884620666504, + "step": 4916 + }, + { + "epoch": 0.8952398288886866, + "grad_norm": 19.125, + "learning_rate": 8.402654946890658e-06, + "loss": 1.1424446105957031, + "step": 4918 + }, + { + "epoch": 0.8956038955128789, + "grad_norm": 24.5, + "learning_rate": 8.401429696187253e-06, + "loss": 1.9523061513900757, + "step": 4920 + }, + { + "epoch": 0.8959679621370711, + "grad_norm": 46.25, + "learning_rate": 8.400204093083773e-06, + "loss": 1.4287670850753784, + "step": 4922 + }, + { + "epoch": 0.8963320287612633, + "grad_norm": 13.625, + "learning_rate": 8.398978137760068e-06, + "loss": 1.08514404296875, + "step": 4924 + }, + { + "epoch": 0.8966960953854556, + "grad_norm": 9.4375, + "learning_rate": 8.397751830396042e-06, + "loss": 1.5198423862457275, + "step": 4926 + }, + { + "epoch": 0.8970601620096478, + "grad_norm": 11.0, + "learning_rate": 8.396525171171654e-06, + "loss": 1.3252668380737305, + "step": 4928 + }, + { + "epoch": 0.89742422863384, + "grad_norm": 12.0625, + "learning_rate": 8.395298160266911e-06, + "loss": 1.685268759727478, + "step": 4930 + }, + { + "epoch": 0.8977882952580322, + "grad_norm": 12.3125, + "learning_rate": 8.39407079786187e-06, + "loss": 1.306122064590454, + "step": 4932 + }, + { + "epoch": 0.8981523618822245, + "grad_norm": 7.59375, + "learning_rate": 8.39284308413664e-06, + "loss": 1.524824857711792, + "step": 4934 + }, + { + "epoch": 0.8985164285064167, + "grad_norm": 6.8125, + "learning_rate": 8.391615019271384e-06, + "loss": 1.1539874076843262, + "step": 4936 + }, + { + "epoch": 0.8988804951306089, + "grad_norm": 5.0625, + "learning_rate": 8.390386603446316e-06, + "loss": 1.2209997177124023, + "step": 4938 + }, + { + "epoch": 0.8992445617548012, + "grad_norm": 12.3125, + "learning_rate": 8.389157836841704e-06, + "loss": 1.5792509317398071, + "step": 4940 + }, + { + "epoch": 0.8996086283789934, + "grad_norm": 15.875, + "learning_rate": 8.387928719637862e-06, + "loss": 1.7660202980041504, + "step": 4942 + }, + { + "epoch": 0.8999726950031856, + "grad_norm": 6.1875, + "learning_rate": 8.386699252015156e-06, + "loss": 1.128103256225586, + "step": 4944 + }, + { + "epoch": 0.9003367616273779, + "grad_norm": 6.5625, + "learning_rate": 8.385469434154008e-06, + "loss": 1.2405339479446411, + "step": 4946 + }, + { + "epoch": 0.90070082825157, + "grad_norm": 9.625, + "learning_rate": 8.384239266234887e-06, + "loss": 1.3091473579406738, + "step": 4948 + }, + { + "epoch": 0.9010648948757622, + "grad_norm": 16.5, + "learning_rate": 8.383008748438317e-06, + "loss": 1.3025203943252563, + "step": 4950 + }, + { + "epoch": 0.9014289614999544, + "grad_norm": 95.0, + "learning_rate": 8.38177788094487e-06, + "loss": 1.8716806173324585, + "step": 4952 + }, + { + "epoch": 0.9017930281241467, + "grad_norm": 12.5, + "learning_rate": 8.380546663935171e-06, + "loss": 0.988882303237915, + "step": 4954 + }, + { + "epoch": 0.9021570947483389, + "grad_norm": 7.71875, + "learning_rate": 8.379315097589897e-06, + "loss": 1.0238853693008423, + "step": 4956 + }, + { + "epoch": 0.9025211613725311, + "grad_norm": 9.875, + "learning_rate": 8.378083182089778e-06, + "loss": 1.3713093996047974, + "step": 4958 + }, + { + "epoch": 0.9028852279967234, + "grad_norm": 13.4375, + "learning_rate": 8.376850917615587e-06, + "loss": 1.4977836608886719, + "step": 4960 + }, + { + "epoch": 0.9032492946209156, + "grad_norm": 6.53125, + "learning_rate": 8.375618304348156e-06, + "loss": 1.1415998935699463, + "step": 4962 + }, + { + "epoch": 0.9036133612451078, + "grad_norm": 15.5625, + "learning_rate": 8.374385342468365e-06, + "loss": 1.337787389755249, + "step": 4964 + }, + { + "epoch": 0.9039774278693001, + "grad_norm": 11.0, + "learning_rate": 8.37315203215715e-06, + "loss": 1.2822262048721313, + "step": 4966 + }, + { + "epoch": 0.9043414944934923, + "grad_norm": 15.6875, + "learning_rate": 8.371918373595494e-06, + "loss": 0.9648714065551758, + "step": 4968 + }, + { + "epoch": 0.9047055611176845, + "grad_norm": 20.25, + "learning_rate": 8.370684366964426e-06, + "loss": 1.1467806100845337, + "step": 4970 + }, + { + "epoch": 0.9050696277418767, + "grad_norm": 15.1875, + "learning_rate": 8.369450012445033e-06, + "loss": 1.447210431098938, + "step": 4972 + }, + { + "epoch": 0.905433694366069, + "grad_norm": 13.0625, + "learning_rate": 8.368215310218454e-06, + "loss": 1.545515537261963, + "step": 4974 + }, + { + "epoch": 0.9057977609902612, + "grad_norm": 16.25, + "learning_rate": 8.36698026046588e-06, + "loss": 1.4063040018081665, + "step": 4976 + }, + { + "epoch": 0.9061618276144534, + "grad_norm": 10.5, + "learning_rate": 8.36574486336854e-06, + "loss": 1.3798906803131104, + "step": 4978 + }, + { + "epoch": 0.9065258942386457, + "grad_norm": 9.5, + "learning_rate": 8.364509119107734e-06, + "loss": 1.3341408967971802, + "step": 4980 + }, + { + "epoch": 0.9068899608628379, + "grad_norm": 15.3125, + "learning_rate": 8.363273027864793e-06, + "loss": 1.2749433517456055, + "step": 4982 + }, + { + "epoch": 0.9072540274870301, + "grad_norm": 29.0, + "learning_rate": 8.362036589821114e-06, + "loss": 1.9989795684814453, + "step": 4984 + }, + { + "epoch": 0.9076180941112224, + "grad_norm": 420.0, + "learning_rate": 8.360799805158139e-06, + "loss": 1.1506237983703613, + "step": 4986 + }, + { + "epoch": 0.9079821607354146, + "grad_norm": 10.9375, + "learning_rate": 8.35956267405736e-06, + "loss": 1.7607243061065674, + "step": 4988 + }, + { + "epoch": 0.9083462273596068, + "grad_norm": 19.0, + "learning_rate": 8.358325196700318e-06, + "loss": 1.412642002105713, + "step": 4990 + }, + { + "epoch": 0.908710293983799, + "grad_norm": 16.875, + "learning_rate": 8.357087373268613e-06, + "loss": 1.9293653964996338, + "step": 4992 + }, + { + "epoch": 0.9090743606079913, + "grad_norm": 22.0, + "learning_rate": 8.355849203943888e-06, + "loss": 1.420868992805481, + "step": 4994 + }, + { + "epoch": 0.9094384272321835, + "grad_norm": 12.0, + "learning_rate": 8.354610688907843e-06, + "loss": 1.3737428188323975, + "step": 4996 + }, + { + "epoch": 0.9098024938563757, + "grad_norm": 14.3125, + "learning_rate": 8.353371828342218e-06, + "loss": 0.9121301174163818, + "step": 4998 + }, + { + "epoch": 0.910166560480568, + "grad_norm": 6.84375, + "learning_rate": 8.352132622428814e-06, + "loss": 0.9797143936157227, + "step": 5000 + }, + { + "epoch": 0.9105306271047602, + "grad_norm": 12.0625, + "learning_rate": 8.350893071349484e-06, + "loss": 0.5149597525596619, + "step": 5002 + }, + { + "epoch": 0.9108946937289524, + "grad_norm": 8.0625, + "learning_rate": 8.349653175286122e-06, + "loss": 1.4142628908157349, + "step": 5004 + }, + { + "epoch": 0.9112587603531446, + "grad_norm": 26.375, + "learning_rate": 8.348412934420675e-06, + "loss": 1.8623378276824951, + "step": 5006 + }, + { + "epoch": 0.9116228269773369, + "grad_norm": 8.5625, + "learning_rate": 8.34717234893515e-06, + "loss": 1.1840145587921143, + "step": 5008 + }, + { + "epoch": 0.9119868936015291, + "grad_norm": 97.0, + "learning_rate": 8.345931419011594e-06, + "loss": 1.4677081108093262, + "step": 5010 + }, + { + "epoch": 0.9123509602257213, + "grad_norm": 6.84375, + "learning_rate": 8.344690144832113e-06, + "loss": 1.0516201257705688, + "step": 5012 + }, + { + "epoch": 0.9127150268499136, + "grad_norm": 9.3125, + "learning_rate": 8.343448526578852e-06, + "loss": 1.4309465885162354, + "step": 5014 + }, + { + "epoch": 0.9130790934741058, + "grad_norm": 6.84375, + "learning_rate": 8.342206564434017e-06, + "loss": 1.171036720275879, + "step": 5016 + }, + { + "epoch": 0.913443160098298, + "grad_norm": 9.25, + "learning_rate": 8.340964258579862e-06, + "loss": 1.3203787803649902, + "step": 5018 + }, + { + "epoch": 0.9138072267224903, + "grad_norm": 7.125, + "learning_rate": 8.339721609198688e-06, + "loss": 1.3614780902862549, + "step": 5020 + }, + { + "epoch": 0.9141712933466825, + "grad_norm": 8.3125, + "learning_rate": 8.338478616472849e-06, + "loss": 1.4372724294662476, + "step": 5022 + }, + { + "epoch": 0.9145353599708747, + "grad_norm": 10.625, + "learning_rate": 8.337235280584752e-06, + "loss": 1.2682194709777832, + "step": 5024 + }, + { + "epoch": 0.9148994265950668, + "grad_norm": 11.4375, + "learning_rate": 8.33599160171685e-06, + "loss": 1.185659646987915, + "step": 5026 + }, + { + "epoch": 0.9152634932192591, + "grad_norm": 17.5, + "learning_rate": 8.334747580051647e-06, + "loss": 1.4867969751358032, + "step": 5028 + }, + { + "epoch": 0.9156275598434513, + "grad_norm": 20.5, + "learning_rate": 8.333503215771696e-06, + "loss": 1.5580549240112305, + "step": 5030 + }, + { + "epoch": 0.9159916264676435, + "grad_norm": 9.0625, + "learning_rate": 8.332258509059608e-06, + "loss": 1.5288459062576294, + "step": 5032 + }, + { + "epoch": 0.9163556930918358, + "grad_norm": 26.125, + "learning_rate": 8.331013460098034e-06, + "loss": 1.522063136100769, + "step": 5034 + }, + { + "epoch": 0.916719759716028, + "grad_norm": 8.8125, + "learning_rate": 8.329768069069684e-06, + "loss": 1.3480898141860962, + "step": 5036 + }, + { + "epoch": 0.9170838263402202, + "grad_norm": 9.9375, + "learning_rate": 8.328522336157309e-06, + "loss": 1.5097423791885376, + "step": 5038 + }, + { + "epoch": 0.9174478929644125, + "grad_norm": 11.875, + "learning_rate": 8.32727626154372e-06, + "loss": 1.3834683895111084, + "step": 5040 + }, + { + "epoch": 0.9178119595886047, + "grad_norm": 5.625, + "learning_rate": 8.326029845411769e-06, + "loss": 1.2475342750549316, + "step": 5042 + }, + { + "epoch": 0.9181760262127969, + "grad_norm": 9.5, + "learning_rate": 8.324783087944365e-06, + "loss": 0.9466933012008667, + "step": 5044 + }, + { + "epoch": 0.9185400928369891, + "grad_norm": 13.0625, + "learning_rate": 8.323535989324465e-06, + "loss": 1.868581771850586, + "step": 5046 + }, + { + "epoch": 0.9189041594611814, + "grad_norm": 12.1875, + "learning_rate": 8.322288549735076e-06, + "loss": 1.1781412363052368, + "step": 5048 + }, + { + "epoch": 0.9192682260853736, + "grad_norm": 3.640625, + "learning_rate": 8.321040769359252e-06, + "loss": 0.8578981757164001, + "step": 5050 + }, + { + "epoch": 0.9196322927095658, + "grad_norm": 8.125, + "learning_rate": 8.3197926483801e-06, + "loss": 1.571542739868164, + "step": 5052 + }, + { + "epoch": 0.9199963593337581, + "grad_norm": 4.4375, + "learning_rate": 8.318544186980782e-06, + "loss": 0.9482288360595703, + "step": 5054 + }, + { + "epoch": 0.9203604259579503, + "grad_norm": 9.6875, + "learning_rate": 8.317295385344499e-06, + "loss": 1.2681465148925781, + "step": 5056 + }, + { + "epoch": 0.9207244925821425, + "grad_norm": 6.5625, + "learning_rate": 8.31604624365451e-06, + "loss": 1.228989839553833, + "step": 5058 + }, + { + "epoch": 0.9210885592063348, + "grad_norm": 6.75, + "learning_rate": 8.31479676209412e-06, + "loss": 1.2084938287734985, + "step": 5060 + }, + { + "epoch": 0.921452625830527, + "grad_norm": 7.34375, + "learning_rate": 8.313546940846686e-06, + "loss": 1.2084535360336304, + "step": 5062 + }, + { + "epoch": 0.9218166924547192, + "grad_norm": 4.59375, + "learning_rate": 8.312296780095617e-06, + "loss": 1.1716161966323853, + "step": 5064 + }, + { + "epoch": 0.9221807590789114, + "grad_norm": 14.875, + "learning_rate": 8.311046280024364e-06, + "loss": 1.3594609498977661, + "step": 5066 + }, + { + "epoch": 0.9225448257031037, + "grad_norm": 34.25, + "learning_rate": 8.309795440816435e-06, + "loss": 1.4242632389068604, + "step": 5068 + }, + { + "epoch": 0.9229088923272959, + "grad_norm": 11.0625, + "learning_rate": 8.308544262655387e-06, + "loss": 1.5514527559280396, + "step": 5070 + }, + { + "epoch": 0.9232729589514881, + "grad_norm": 12.9375, + "learning_rate": 8.307292745724823e-06, + "loss": 0.4746064841747284, + "step": 5072 + }, + { + "epoch": 0.9236370255756804, + "grad_norm": 26.75, + "learning_rate": 8.3060408902084e-06, + "loss": 1.3490463495254517, + "step": 5074 + }, + { + "epoch": 0.9240010921998726, + "grad_norm": 20.625, + "learning_rate": 8.304788696289824e-06, + "loss": 1.0141253471374512, + "step": 5076 + }, + { + "epoch": 0.9243651588240648, + "grad_norm": 4.625, + "learning_rate": 8.303536164152843e-06, + "loss": 0.9631630778312683, + "step": 5078 + }, + { + "epoch": 0.9247292254482571, + "grad_norm": 7.5, + "learning_rate": 8.302283293981265e-06, + "loss": 1.2772789001464844, + "step": 5080 + }, + { + "epoch": 0.9250932920724493, + "grad_norm": 10.0625, + "learning_rate": 8.301030085958948e-06, + "loss": 1.2958730459213257, + "step": 5082 + }, + { + "epoch": 0.9254573586966415, + "grad_norm": 4.75, + "learning_rate": 8.29977654026979e-06, + "loss": 1.0446795225143433, + "step": 5084 + }, + { + "epoch": 0.9258214253208337, + "grad_norm": 16.25, + "learning_rate": 8.298522657097746e-06, + "loss": 1.114783763885498, + "step": 5086 + }, + { + "epoch": 0.926185491945026, + "grad_norm": 13.9375, + "learning_rate": 8.297268436626812e-06, + "loss": 1.363879680633545, + "step": 5088 + }, + { + "epoch": 0.9265495585692182, + "grad_norm": 9.0, + "learning_rate": 8.296013879041049e-06, + "loss": 1.2302645444869995, + "step": 5090 + }, + { + "epoch": 0.9269136251934104, + "grad_norm": 13.5, + "learning_rate": 8.294758984524556e-06, + "loss": 1.7460567951202393, + "step": 5092 + }, + { + "epoch": 0.9272776918176027, + "grad_norm": 12.4375, + "learning_rate": 8.293503753261478e-06, + "loss": 1.9477344751358032, + "step": 5094 + }, + { + "epoch": 0.9276417584417949, + "grad_norm": 6.34375, + "learning_rate": 8.29224818543602e-06, + "loss": 1.330409049987793, + "step": 5096 + }, + { + "epoch": 0.9280058250659871, + "grad_norm": 27.375, + "learning_rate": 8.290992281232434e-06, + "loss": 1.1722042560577393, + "step": 5098 + }, + { + "epoch": 0.9283698916901792, + "grad_norm": 3.796875, + "learning_rate": 8.289736040835011e-06, + "loss": 1.0856956243515015, + "step": 5100 + }, + { + "epoch": 0.9287339583143716, + "grad_norm": 5.125, + "learning_rate": 8.288479464428104e-06, + "loss": 1.2190728187561035, + "step": 5102 + }, + { + "epoch": 0.9290980249385637, + "grad_norm": 16.25, + "learning_rate": 8.28722255219611e-06, + "loss": 1.3171327114105225, + "step": 5104 + }, + { + "epoch": 0.9294620915627559, + "grad_norm": 15.75, + "learning_rate": 8.285965304323477e-06, + "loss": 1.6932451725006104, + "step": 5106 + }, + { + "epoch": 0.9298261581869482, + "grad_norm": 7.375, + "learning_rate": 8.2847077209947e-06, + "loss": 1.3218982219696045, + "step": 5108 + }, + { + "epoch": 0.9301902248111404, + "grad_norm": 3.625, + "learning_rate": 8.283449802394323e-06, + "loss": 1.330397129058838, + "step": 5110 + }, + { + "epoch": 0.9305542914353326, + "grad_norm": 7.84375, + "learning_rate": 8.28219154870694e-06, + "loss": 1.1778228282928467, + "step": 5112 + }, + { + "epoch": 0.9309183580595249, + "grad_norm": 13.3125, + "learning_rate": 8.2809329601172e-06, + "loss": 0.8992382287979126, + "step": 5114 + }, + { + "epoch": 0.9312824246837171, + "grad_norm": 17.75, + "learning_rate": 8.27967403680979e-06, + "loss": 1.3484028577804565, + "step": 5116 + }, + { + "epoch": 0.9316464913079093, + "grad_norm": 15.75, + "learning_rate": 8.278414778969454e-06, + "loss": 1.9711592197418213, + "step": 5118 + }, + { + "epoch": 0.9320105579321015, + "grad_norm": 5.3125, + "learning_rate": 8.277155186780983e-06, + "loss": 1.1555635929107666, + "step": 5120 + }, + { + "epoch": 0.9323746245562938, + "grad_norm": 9.1875, + "learning_rate": 8.275895260429217e-06, + "loss": 1.209521770477295, + "step": 5122 + }, + { + "epoch": 0.932738691180486, + "grad_norm": 20.25, + "learning_rate": 8.274635000099043e-06, + "loss": 1.5935063362121582, + "step": 5124 + }, + { + "epoch": 0.9331027578046782, + "grad_norm": 8.6875, + "learning_rate": 8.273374405975402e-06, + "loss": 1.1172841787338257, + "step": 5126 + }, + { + "epoch": 0.9334668244288705, + "grad_norm": 6.75, + "learning_rate": 8.272113478243281e-06, + "loss": 1.2846150398254395, + "step": 5128 + }, + { + "epoch": 0.9338308910530627, + "grad_norm": 6.84375, + "learning_rate": 8.270852217087715e-06, + "loss": 0.9405727386474609, + "step": 5130 + }, + { + "epoch": 0.9341949576772549, + "grad_norm": 10.1875, + "learning_rate": 8.269590622693788e-06, + "loss": 1.3887560367584229, + "step": 5132 + }, + { + "epoch": 0.9345590243014472, + "grad_norm": 65.5, + "learning_rate": 8.268328695246637e-06, + "loss": 1.0281797647476196, + "step": 5134 + }, + { + "epoch": 0.9349230909256394, + "grad_norm": 15.5, + "learning_rate": 8.267066434931441e-06, + "loss": 1.4884922504425049, + "step": 5136 + }, + { + "epoch": 0.9352871575498316, + "grad_norm": 32.25, + "learning_rate": 8.265803841933432e-06, + "loss": 1.8731805086135864, + "step": 5138 + }, + { + "epoch": 0.9356512241740238, + "grad_norm": 11.25, + "learning_rate": 8.264540916437893e-06, + "loss": 1.7738471031188965, + "step": 5140 + }, + { + "epoch": 0.9360152907982161, + "grad_norm": 9.625, + "learning_rate": 8.263277658630153e-06, + "loss": 1.4231477975845337, + "step": 5142 + }, + { + "epoch": 0.9363793574224083, + "grad_norm": 22.0, + "learning_rate": 8.26201406869559e-06, + "loss": 1.5946168899536133, + "step": 5144 + }, + { + "epoch": 0.9367434240466005, + "grad_norm": 8.3125, + "learning_rate": 8.260750146819628e-06, + "loss": 1.3846397399902344, + "step": 5146 + }, + { + "epoch": 0.9371074906707928, + "grad_norm": 6.34375, + "learning_rate": 8.259485893187744e-06, + "loss": 0.9462764263153076, + "step": 5148 + }, + { + "epoch": 0.937471557294985, + "grad_norm": 6.3125, + "learning_rate": 8.25822130798546e-06, + "loss": 0.9857575297355652, + "step": 5150 + }, + { + "epoch": 0.9378356239191772, + "grad_norm": 7.875, + "learning_rate": 8.256956391398352e-06, + "loss": 1.2868921756744385, + "step": 5152 + }, + { + "epoch": 0.9381996905433695, + "grad_norm": 11.9375, + "learning_rate": 8.25569114361204e-06, + "loss": 1.2789064645767212, + "step": 5154 + }, + { + "epoch": 0.9385637571675617, + "grad_norm": 15.75, + "learning_rate": 8.254425564812196e-06, + "loss": 1.760619878768921, + "step": 5156 + }, + { + "epoch": 0.9389278237917539, + "grad_norm": 7.15625, + "learning_rate": 8.253159655184537e-06, + "loss": 1.2983062267303467, + "step": 5158 + }, + { + "epoch": 0.9392918904159461, + "grad_norm": 11.875, + "learning_rate": 8.25189341491483e-06, + "loss": 1.3249213695526123, + "step": 5160 + }, + { + "epoch": 0.9396559570401384, + "grad_norm": 17.125, + "learning_rate": 8.250626844188886e-06, + "loss": 1.342602014541626, + "step": 5162 + }, + { + "epoch": 0.9400200236643306, + "grad_norm": 9.5625, + "learning_rate": 8.249359943192578e-06, + "loss": 1.1832472085952759, + "step": 5164 + }, + { + "epoch": 0.9403840902885228, + "grad_norm": 14.0625, + "learning_rate": 8.248092712111813e-06, + "loss": 1.0097612142562866, + "step": 5166 + }, + { + "epoch": 0.9407481569127151, + "grad_norm": 15.875, + "learning_rate": 8.246825151132552e-06, + "loss": 1.484398365020752, + "step": 5168 + }, + { + "epoch": 0.9411122235369073, + "grad_norm": 22.75, + "learning_rate": 8.245557260440807e-06, + "loss": 1.6879181861877441, + "step": 5170 + }, + { + "epoch": 0.9414762901610995, + "grad_norm": 13.3125, + "learning_rate": 8.244289040222633e-06, + "loss": 1.4201488494873047, + "step": 5172 + }, + { + "epoch": 0.9418403567852917, + "grad_norm": 9.0, + "learning_rate": 8.24302049066414e-06, + "loss": 1.4941177368164062, + "step": 5174 + }, + { + "epoch": 0.942204423409484, + "grad_norm": 17.5, + "learning_rate": 8.241751611951481e-06, + "loss": 1.1575415134429932, + "step": 5176 + }, + { + "epoch": 0.9425684900336762, + "grad_norm": 14.5, + "learning_rate": 8.240482404270856e-06, + "loss": 1.3664989471435547, + "step": 5178 + }, + { + "epoch": 0.9429325566578683, + "grad_norm": 10.8125, + "learning_rate": 8.239212867808518e-06, + "loss": 1.367465615272522, + "step": 5180 + }, + { + "epoch": 0.9432966232820607, + "grad_norm": 9.875, + "learning_rate": 8.237943002750765e-06, + "loss": 1.3932271003723145, + "step": 5182 + }, + { + "epoch": 0.9436606899062528, + "grad_norm": 15.0, + "learning_rate": 8.236672809283945e-06, + "loss": 1.0750181674957275, + "step": 5184 + }, + { + "epoch": 0.944024756530445, + "grad_norm": 26.5, + "learning_rate": 8.235402287594458e-06, + "loss": 0.897424042224884, + "step": 5186 + }, + { + "epoch": 0.9443888231546373, + "grad_norm": 12.0625, + "learning_rate": 8.234131437868745e-06, + "loss": 1.3413605690002441, + "step": 5188 + }, + { + "epoch": 0.9447528897788295, + "grad_norm": 35.75, + "learning_rate": 8.232860260293297e-06, + "loss": 1.293330430984497, + "step": 5190 + }, + { + "epoch": 0.9451169564030217, + "grad_norm": 4.75, + "learning_rate": 8.231588755054654e-06, + "loss": 1.2065048217773438, + "step": 5192 + }, + { + "epoch": 0.9454810230272139, + "grad_norm": 9.75, + "learning_rate": 8.230316922339406e-06, + "loss": 1.4518396854400635, + "step": 5194 + }, + { + "epoch": 0.9458450896514062, + "grad_norm": 14.0625, + "learning_rate": 8.229044762334187e-06, + "loss": 1.380263090133667, + "step": 5196 + }, + { + "epoch": 0.9462091562755984, + "grad_norm": 12.5, + "learning_rate": 8.22777227522568e-06, + "loss": 1.5777246952056885, + "step": 5198 + }, + { + "epoch": 0.9465732228997906, + "grad_norm": 13.8125, + "learning_rate": 8.226499461200623e-06, + "loss": 1.9536418914794922, + "step": 5200 + }, + { + "epoch": 0.9469372895239829, + "grad_norm": 10.5, + "learning_rate": 8.225226320445795e-06, + "loss": 1.1971180438995361, + "step": 5202 + }, + { + "epoch": 0.9473013561481751, + "grad_norm": 14.3125, + "learning_rate": 8.22395285314802e-06, + "loss": 1.3106062412261963, + "step": 5204 + }, + { + "epoch": 0.9476654227723673, + "grad_norm": 10.6875, + "learning_rate": 8.222679059494179e-06, + "loss": 1.810975432395935, + "step": 5206 + }, + { + "epoch": 0.9480294893965596, + "grad_norm": 8.5625, + "learning_rate": 8.221404939671192e-06, + "loss": 1.3279353380203247, + "step": 5208 + }, + { + "epoch": 0.9483935560207518, + "grad_norm": 18.5, + "learning_rate": 8.220130493866033e-06, + "loss": 0.9554933309555054, + "step": 5210 + }, + { + "epoch": 0.948757622644944, + "grad_norm": 16.125, + "learning_rate": 8.218855722265721e-06, + "loss": 1.6110066175460815, + "step": 5212 + }, + { + "epoch": 0.9491216892691362, + "grad_norm": 7.9375, + "learning_rate": 8.217580625057324e-06, + "loss": 1.5206242799758911, + "step": 5214 + }, + { + "epoch": 0.9494857558933285, + "grad_norm": 12.5625, + "learning_rate": 8.216305202427959e-06, + "loss": 1.3869340419769287, + "step": 5216 + }, + { + "epoch": 0.9498498225175207, + "grad_norm": 25.125, + "learning_rate": 8.215029454564788e-06, + "loss": 1.1318068504333496, + "step": 5218 + }, + { + "epoch": 0.9502138891417129, + "grad_norm": 27.625, + "learning_rate": 8.213753381655017e-06, + "loss": 0.851793110370636, + "step": 5220 + }, + { + "epoch": 0.9505779557659052, + "grad_norm": 9.8125, + "learning_rate": 8.212476983885912e-06, + "loss": 1.0434181690216064, + "step": 5222 + }, + { + "epoch": 0.9509420223900974, + "grad_norm": 3.359375, + "learning_rate": 8.211200261444775e-06, + "loss": 0.9184368848800659, + "step": 5224 + }, + { + "epoch": 0.9513060890142896, + "grad_norm": 5.71875, + "learning_rate": 8.209923214518962e-06, + "loss": 1.1558799743652344, + "step": 5226 + }, + { + "epoch": 0.9516701556384819, + "grad_norm": 11.1875, + "learning_rate": 8.208645843295873e-06, + "loss": 1.3522435426712036, + "step": 5228 + }, + { + "epoch": 0.9520342222626741, + "grad_norm": 7.34375, + "learning_rate": 8.207368147962955e-06, + "loss": 1.3889973163604736, + "step": 5230 + }, + { + "epoch": 0.9523982888868663, + "grad_norm": 11.25, + "learning_rate": 8.206090128707709e-06, + "loss": 1.2717233896255493, + "step": 5232 + }, + { + "epoch": 0.9527623555110585, + "grad_norm": 69.0, + "learning_rate": 8.204811785717677e-06, + "loss": 1.466750144958496, + "step": 5234 + }, + { + "epoch": 0.9531264221352508, + "grad_norm": 12.5625, + "learning_rate": 8.203533119180452e-06, + "loss": 2.098508596420288, + "step": 5236 + }, + { + "epoch": 0.953490488759443, + "grad_norm": 9.625, + "learning_rate": 8.202254129283669e-06, + "loss": 1.3959881067276, + "step": 5238 + }, + { + "epoch": 0.9538545553836352, + "grad_norm": 9.875, + "learning_rate": 8.20097481621502e-06, + "loss": 1.5155279636383057, + "step": 5240 + }, + { + "epoch": 0.9542186220078275, + "grad_norm": 14.5, + "learning_rate": 8.199695180162234e-06, + "loss": 1.3899306058883667, + "step": 5242 + }, + { + "epoch": 0.9545826886320197, + "grad_norm": 12.0, + "learning_rate": 8.198415221313096e-06, + "loss": 1.476435899734497, + "step": 5244 + }, + { + "epoch": 0.9549467552562119, + "grad_norm": 10.0, + "learning_rate": 8.197134939855435e-06, + "loss": 1.5116914510726929, + "step": 5246 + }, + { + "epoch": 0.9553108218804041, + "grad_norm": 24.25, + "learning_rate": 8.195854335977124e-06, + "loss": 1.25270676612854, + "step": 5248 + }, + { + "epoch": 0.9556748885045964, + "grad_norm": 7.8125, + "learning_rate": 8.19457340986609e-06, + "loss": 0.954714298248291, + "step": 5250 + }, + { + "epoch": 0.9560389551287886, + "grad_norm": 8.25, + "learning_rate": 8.1932921617103e-06, + "loss": 1.685788869857788, + "step": 5252 + }, + { + "epoch": 0.9564030217529808, + "grad_norm": 21.125, + "learning_rate": 8.192010591697777e-06, + "loss": 1.696622371673584, + "step": 5254 + }, + { + "epoch": 0.9567670883771731, + "grad_norm": 10.5625, + "learning_rate": 8.190728700016579e-06, + "loss": 1.5226070880889893, + "step": 5256 + }, + { + "epoch": 0.9571311550013653, + "grad_norm": 16.75, + "learning_rate": 8.189446486854827e-06, + "loss": 1.7219946384429932, + "step": 5258 + }, + { + "epoch": 0.9574952216255574, + "grad_norm": 17.25, + "learning_rate": 8.188163952400672e-06, + "loss": 1.4439606666564941, + "step": 5260 + }, + { + "epoch": 0.9578592882497498, + "grad_norm": 24.0, + "learning_rate": 8.186881096842325e-06, + "loss": 1.0094411373138428, + "step": 5262 + }, + { + "epoch": 0.958223354873942, + "grad_norm": 12.1875, + "learning_rate": 8.185597920368042e-06, + "loss": 1.5339179039001465, + "step": 5264 + }, + { + "epoch": 0.9585874214981341, + "grad_norm": 12.6875, + "learning_rate": 8.184314423166123e-06, + "loss": 1.436272144317627, + "step": 5266 + }, + { + "epoch": 0.9589514881223263, + "grad_norm": 7.9375, + "learning_rate": 8.183030605424912e-06, + "loss": 1.4084436893463135, + "step": 5268 + }, + { + "epoch": 0.9593155547465186, + "grad_norm": 3.375, + "learning_rate": 8.181746467332804e-06, + "loss": 1.054578185081482, + "step": 5270 + }, + { + "epoch": 0.9596796213707108, + "grad_norm": 10.5625, + "learning_rate": 8.18046200907825e-06, + "loss": 1.0280214548110962, + "step": 5272 + }, + { + "epoch": 0.960043687994903, + "grad_norm": 12.5625, + "learning_rate": 8.17917723084973e-06, + "loss": 1.4819560050964355, + "step": 5274 + }, + { + "epoch": 0.9604077546190953, + "grad_norm": 31.875, + "learning_rate": 8.177892132835781e-06, + "loss": 1.3622816801071167, + "step": 5276 + }, + { + "epoch": 0.9607718212432875, + "grad_norm": 9.375, + "learning_rate": 8.176606715224989e-06, + "loss": 1.3200557231903076, + "step": 5278 + }, + { + "epoch": 0.9611358878674797, + "grad_norm": 26.625, + "learning_rate": 8.175320978205983e-06, + "loss": 1.4435665607452393, + "step": 5280 + }, + { + "epoch": 0.961499954491672, + "grad_norm": 15.875, + "learning_rate": 8.17403492196744e-06, + "loss": 1.4555540084838867, + "step": 5282 + }, + { + "epoch": 0.9618640211158642, + "grad_norm": 5.40625, + "learning_rate": 8.172748546698082e-06, + "loss": 1.0828317403793335, + "step": 5284 + }, + { + "epoch": 0.9622280877400564, + "grad_norm": 7.8125, + "learning_rate": 8.17146185258668e-06, + "loss": 1.1645407676696777, + "step": 5286 + }, + { + "epoch": 0.9625921543642486, + "grad_norm": 3.765625, + "learning_rate": 8.170174839822051e-06, + "loss": 1.3565809726715088, + "step": 5288 + }, + { + "epoch": 0.9629562209884409, + "grad_norm": 23.375, + "learning_rate": 8.168887508593058e-06, + "loss": 0.8393096923828125, + "step": 5290 + }, + { + "epoch": 0.9633202876126331, + "grad_norm": 13.25, + "learning_rate": 8.167599859088615e-06, + "loss": 0.9749665260314941, + "step": 5292 + }, + { + "epoch": 0.9636843542368253, + "grad_norm": 26.125, + "learning_rate": 8.166311891497678e-06, + "loss": 1.551950216293335, + "step": 5294 + }, + { + "epoch": 0.9640484208610176, + "grad_norm": 18.625, + "learning_rate": 8.165023606009248e-06, + "loss": 1.8238662481307983, + "step": 5296 + }, + { + "epoch": 0.9644124874852098, + "grad_norm": 7.46875, + "learning_rate": 8.163735002812378e-06, + "loss": 0.9817075729370117, + "step": 5298 + }, + { + "epoch": 0.964776554109402, + "grad_norm": 10.1875, + "learning_rate": 8.162446082096167e-06, + "loss": 0.9963616132736206, + "step": 5300 + }, + { + "epoch": 0.9651406207335943, + "grad_norm": 11.5, + "learning_rate": 8.161156844049755e-06, + "loss": 1.4401131868362427, + "step": 5302 + }, + { + "epoch": 0.9655046873577865, + "grad_norm": 100.0, + "learning_rate": 8.159867288862336e-06, + "loss": 1.1715326309204102, + "step": 5304 + }, + { + "epoch": 0.9658687539819787, + "grad_norm": 11.625, + "learning_rate": 8.158577416723143e-06, + "loss": 1.2917958498001099, + "step": 5306 + }, + { + "epoch": 0.9662328206061709, + "grad_norm": 25.0, + "learning_rate": 8.157287227821466e-06, + "loss": 1.5508229732513428, + "step": 5308 + }, + { + "epoch": 0.9665968872303632, + "grad_norm": 11.9375, + "learning_rate": 8.155996722346628e-06, + "loss": 1.7019456624984741, + "step": 5310 + }, + { + "epoch": 0.9669609538545554, + "grad_norm": 21.75, + "learning_rate": 8.154705900488011e-06, + "loss": 1.0934464931488037, + "step": 5312 + }, + { + "epoch": 0.9673250204787476, + "grad_norm": 14.8125, + "learning_rate": 8.153414762435032e-06, + "loss": 1.3818862438201904, + "step": 5314 + }, + { + "epoch": 0.9676890871029399, + "grad_norm": 15.4375, + "learning_rate": 8.152123308377167e-06, + "loss": 1.3036679029464722, + "step": 5316 + }, + { + "epoch": 0.9680531537271321, + "grad_norm": 12.75, + "learning_rate": 8.150831538503927e-06, + "loss": 1.5211178064346313, + "step": 5318 + }, + { + "epoch": 0.9684172203513243, + "grad_norm": 9.3125, + "learning_rate": 8.149539453004876e-06, + "loss": 1.3985037803649902, + "step": 5320 + }, + { + "epoch": 0.9687812869755165, + "grad_norm": 13.125, + "learning_rate": 8.14824705206962e-06, + "loss": 1.1710278987884521, + "step": 5322 + }, + { + "epoch": 0.9691453535997088, + "grad_norm": 21.0, + "learning_rate": 8.146954335887816e-06, + "loss": 1.7147796154022217, + "step": 5324 + }, + { + "epoch": 0.969509420223901, + "grad_norm": 12.625, + "learning_rate": 8.145661304649164e-06, + "loss": 1.3703334331512451, + "step": 5326 + }, + { + "epoch": 0.9698734868480932, + "grad_norm": 10.25, + "learning_rate": 8.144367958543411e-06, + "loss": 1.2839322090148926, + "step": 5328 + }, + { + "epoch": 0.9702375534722855, + "grad_norm": 7.3125, + "learning_rate": 8.143074297760349e-06, + "loss": 1.2957262992858887, + "step": 5330 + }, + { + "epoch": 0.9706016200964777, + "grad_norm": 16.875, + "learning_rate": 8.141780322489821e-06, + "loss": 1.485806941986084, + "step": 5332 + }, + { + "epoch": 0.9709656867206699, + "grad_norm": 23.125, + "learning_rate": 8.14048603292171e-06, + "loss": 1.864945411682129, + "step": 5334 + }, + { + "epoch": 0.9713297533448622, + "grad_norm": 15.1875, + "learning_rate": 8.139191429245949e-06, + "loss": 1.2725948095321655, + "step": 5336 + }, + { + "epoch": 0.9716938199690544, + "grad_norm": 14.0625, + "learning_rate": 8.137896511652515e-06, + "loss": 1.4446682929992676, + "step": 5338 + }, + { + "epoch": 0.9720578865932465, + "grad_norm": 14.8125, + "learning_rate": 8.136601280331431e-06, + "loss": 1.7600085735321045, + "step": 5340 + }, + { + "epoch": 0.9724219532174387, + "grad_norm": 10.375, + "learning_rate": 8.135305735472769e-06, + "loss": 1.6449780464172363, + "step": 5342 + }, + { + "epoch": 0.972786019841631, + "grad_norm": 12.9375, + "learning_rate": 8.134009877266645e-06, + "loss": 0.9663881063461304, + "step": 5344 + }, + { + "epoch": 0.9731500864658232, + "grad_norm": 19.5, + "learning_rate": 8.132713705903218e-06, + "loss": 1.1315466165542603, + "step": 5346 + }, + { + "epoch": 0.9735141530900154, + "grad_norm": 9.625, + "learning_rate": 8.131417221572697e-06, + "loss": 1.482200026512146, + "step": 5348 + }, + { + "epoch": 0.9738782197142077, + "grad_norm": 11.875, + "learning_rate": 8.130120424465337e-06, + "loss": 1.3849362134933472, + "step": 5350 + }, + { + "epoch": 0.9742422863383999, + "grad_norm": 6.71875, + "learning_rate": 8.128823314771438e-06, + "loss": 1.348287582397461, + "step": 5352 + }, + { + "epoch": 0.9746063529625921, + "grad_norm": 3.65625, + "learning_rate": 8.127525892681347e-06, + "loss": 1.2137531042099, + "step": 5354 + }, + { + "epoch": 0.9749704195867844, + "grad_norm": 4.375, + "learning_rate": 8.126228158385453e-06, + "loss": 1.0482590198516846, + "step": 5356 + }, + { + "epoch": 0.9753344862109766, + "grad_norm": 6.84375, + "learning_rate": 8.12493011207419e-06, + "loss": 1.190213680267334, + "step": 5358 + }, + { + "epoch": 0.9756985528351688, + "grad_norm": 19.0, + "learning_rate": 8.123631753938046e-06, + "loss": 1.2104922533035278, + "step": 5360 + }, + { + "epoch": 0.976062619459361, + "grad_norm": 10.3125, + "learning_rate": 8.12233308416755e-06, + "loss": 1.0388247966766357, + "step": 5362 + }, + { + "epoch": 0.9764266860835533, + "grad_norm": 24.5, + "learning_rate": 8.121034102953274e-06, + "loss": 1.3831732273101807, + "step": 5364 + }, + { + "epoch": 0.9767907527077455, + "grad_norm": 14.4375, + "learning_rate": 8.11973481048584e-06, + "loss": 1.5067081451416016, + "step": 5366 + }, + { + "epoch": 0.9771548193319377, + "grad_norm": 13.5, + "learning_rate": 8.11843520695591e-06, + "loss": 1.541979193687439, + "step": 5368 + }, + { + "epoch": 0.97751888595613, + "grad_norm": 6.34375, + "learning_rate": 8.117135292554202e-06, + "loss": 1.4077279567718506, + "step": 5370 + }, + { + "epoch": 0.9778829525803222, + "grad_norm": 15.1875, + "learning_rate": 8.115835067471468e-06, + "loss": 1.1266310214996338, + "step": 5372 + }, + { + "epoch": 0.9782470192045144, + "grad_norm": 32.25, + "learning_rate": 8.114534531898515e-06, + "loss": 2.0582756996154785, + "step": 5374 + }, + { + "epoch": 0.9786110858287067, + "grad_norm": 18.0, + "learning_rate": 8.113233686026188e-06, + "loss": 1.877091646194458, + "step": 5376 + }, + { + "epoch": 0.9789751524528989, + "grad_norm": 13.125, + "learning_rate": 8.111932530045378e-06, + "loss": 1.180967092514038, + "step": 5378 + }, + { + "epoch": 0.9793392190770911, + "grad_norm": 20.625, + "learning_rate": 8.110631064147035e-06, + "loss": 0.9638977646827698, + "step": 5380 + }, + { + "epoch": 0.9797032857012833, + "grad_norm": 10.1875, + "learning_rate": 8.10932928852213e-06, + "loss": 1.4034727811813354, + "step": 5382 + }, + { + "epoch": 0.9800673523254756, + "grad_norm": 11.875, + "learning_rate": 8.108027203361704e-06, + "loss": 1.380660891532898, + "step": 5384 + }, + { + "epoch": 0.9804314189496678, + "grad_norm": 14.5, + "learning_rate": 8.106724808856829e-06, + "loss": 1.2414599657058716, + "step": 5386 + }, + { + "epoch": 0.98079548557386, + "grad_norm": 9.5, + "learning_rate": 8.105422105198626e-06, + "loss": 0.574763298034668, + "step": 5388 + }, + { + "epoch": 0.9811595521980523, + "grad_norm": 10.6875, + "learning_rate": 8.10411909257826e-06, + "loss": 0.8744221925735474, + "step": 5390 + }, + { + "epoch": 0.9815236188222445, + "grad_norm": 7.4375, + "learning_rate": 8.102815771186946e-06, + "loss": 1.4710537195205688, + "step": 5392 + }, + { + "epoch": 0.9818876854464367, + "grad_norm": 6.09375, + "learning_rate": 8.101512141215939e-06, + "loss": 0.9895541071891785, + "step": 5394 + }, + { + "epoch": 0.9822517520706289, + "grad_norm": 6.875, + "learning_rate": 8.100208202856542e-06, + "loss": 1.1242363452911377, + "step": 5396 + }, + { + "epoch": 0.9826158186948212, + "grad_norm": 9.625, + "learning_rate": 8.098903956300104e-06, + "loss": 1.4189202785491943, + "step": 5398 + }, + { + "epoch": 0.9829798853190134, + "grad_norm": 8.625, + "learning_rate": 8.097599401738014e-06, + "loss": 1.5487031936645508, + "step": 5400 + }, + { + "epoch": 0.9833439519432056, + "grad_norm": 5.5625, + "learning_rate": 8.096294539361713e-06, + "loss": 1.016351342201233, + "step": 5402 + }, + { + "epoch": 0.9837080185673979, + "grad_norm": 9.4375, + "learning_rate": 8.094989369362685e-06, + "loss": 1.4842634201049805, + "step": 5404 + }, + { + "epoch": 0.9840720851915901, + "grad_norm": 29.625, + "learning_rate": 8.093683891932458e-06, + "loss": 1.4828146696090698, + "step": 5406 + }, + { + "epoch": 0.9844361518157823, + "grad_norm": 25.75, + "learning_rate": 8.092378107262603e-06, + "loss": 1.6820260286331177, + "step": 5408 + }, + { + "epoch": 0.9848002184399746, + "grad_norm": 59.0, + "learning_rate": 8.091072015544743e-06, + "loss": 1.2868156433105469, + "step": 5410 + }, + { + "epoch": 0.9851642850641668, + "grad_norm": 17.25, + "learning_rate": 8.089765616970534e-06, + "loss": 1.1464850902557373, + "step": 5412 + }, + { + "epoch": 0.985528351688359, + "grad_norm": 12.6875, + "learning_rate": 8.088458911731696e-06, + "loss": 1.3672478199005127, + "step": 5414 + }, + { + "epoch": 0.9858924183125511, + "grad_norm": 7.0, + "learning_rate": 8.087151900019975e-06, + "loss": 1.2416682243347168, + "step": 5416 + }, + { + "epoch": 0.9862564849367434, + "grad_norm": 13.1875, + "learning_rate": 8.085844582027168e-06, + "loss": 0.6675612926483154, + "step": 5418 + }, + { + "epoch": 0.9866205515609356, + "grad_norm": 10.875, + "learning_rate": 8.084536957945124e-06, + "loss": 0.32999187707901, + "step": 5420 + }, + { + "epoch": 0.9869846181851278, + "grad_norm": 13.4375, + "learning_rate": 8.083229027965728e-06, + "loss": 1.7137484550476074, + "step": 5422 + }, + { + "epoch": 0.9873486848093201, + "grad_norm": 17.75, + "learning_rate": 8.081920792280915e-06, + "loss": 1.1099997758865356, + "step": 5424 + }, + { + "epoch": 0.9877127514335123, + "grad_norm": 9.9375, + "learning_rate": 8.080612251082664e-06, + "loss": 1.3685656785964966, + "step": 5426 + }, + { + "epoch": 0.9880768180577045, + "grad_norm": 5.90625, + "learning_rate": 8.079303404562997e-06, + "loss": 1.3203872442245483, + "step": 5428 + }, + { + "epoch": 0.9884408846818968, + "grad_norm": 21.375, + "learning_rate": 8.077994252913984e-06, + "loss": 1.183948278427124, + "step": 5430 + }, + { + "epoch": 0.988804951306089, + "grad_norm": 15.0625, + "learning_rate": 8.076684796327732e-06, + "loss": 1.546653151512146, + "step": 5432 + }, + { + "epoch": 0.9891690179302812, + "grad_norm": 8.4375, + "learning_rate": 8.075375034996405e-06, + "loss": 1.072319746017456, + "step": 5434 + }, + { + "epoch": 0.9895330845544734, + "grad_norm": 7.71875, + "learning_rate": 8.074064969112199e-06, + "loss": 1.4160891771316528, + "step": 5436 + }, + { + "epoch": 0.9898971511786657, + "grad_norm": 9.4375, + "learning_rate": 8.072754598867367e-06, + "loss": 1.2624622583389282, + "step": 5438 + }, + { + "epoch": 0.9902612178028579, + "grad_norm": 11.25, + "learning_rate": 8.071443924454196e-06, + "loss": 1.232493281364441, + "step": 5440 + }, + { + "epoch": 0.9906252844270501, + "grad_norm": 15.5625, + "learning_rate": 8.070132946065026e-06, + "loss": 0.8610067367553711, + "step": 5442 + }, + { + "epoch": 0.9909893510512424, + "grad_norm": 12.0, + "learning_rate": 8.068821663892234e-06, + "loss": 1.402634859085083, + "step": 5444 + }, + { + "epoch": 0.9913534176754346, + "grad_norm": 14.125, + "learning_rate": 8.067510078128248e-06, + "loss": 1.3765374422073364, + "step": 5446 + }, + { + "epoch": 0.9917174842996268, + "grad_norm": 23.75, + "learning_rate": 8.066198188965538e-06, + "loss": 1.3468719720840454, + "step": 5448 + }, + { + "epoch": 0.9920815509238191, + "grad_norm": 8.4375, + "learning_rate": 8.064885996596616e-06, + "loss": 1.3279544115066528, + "step": 5450 + }, + { + "epoch": 0.9924456175480113, + "grad_norm": 8.6875, + "learning_rate": 8.063573501214042e-06, + "loss": 1.4619123935699463, + "step": 5452 + }, + { + "epoch": 0.9928096841722035, + "grad_norm": 16.625, + "learning_rate": 8.06226070301042e-06, + "loss": 1.315147876739502, + "step": 5454 + }, + { + "epoch": 0.9931737507963957, + "grad_norm": 17.125, + "learning_rate": 8.060947602178397e-06, + "loss": 1.395255208015442, + "step": 5456 + }, + { + "epoch": 0.993537817420588, + "grad_norm": 23.5, + "learning_rate": 8.059634198910666e-06, + "loss": 1.3836113214492798, + "step": 5458 + }, + { + "epoch": 0.9939018840447802, + "grad_norm": 40.75, + "learning_rate": 8.058320493399965e-06, + "loss": 1.9325013160705566, + "step": 5460 + }, + { + "epoch": 0.9942659506689724, + "grad_norm": 11.625, + "learning_rate": 8.057006485839071e-06, + "loss": 1.4436297416687012, + "step": 5462 + }, + { + "epoch": 0.9946300172931647, + "grad_norm": 21.875, + "learning_rate": 8.055692176420813e-06, + "loss": 1.2843307256698608, + "step": 5464 + }, + { + "epoch": 0.9949940839173569, + "grad_norm": 5.3125, + "learning_rate": 8.054377565338057e-06, + "loss": 1.1902520656585693, + "step": 5466 + }, + { + "epoch": 0.9953581505415491, + "grad_norm": 3.609375, + "learning_rate": 8.05306265278372e-06, + "loss": 1.211944341659546, + "step": 5468 + }, + { + "epoch": 0.9957222171657414, + "grad_norm": 7.71875, + "learning_rate": 8.051747438950759e-06, + "loss": 1.4459137916564941, + "step": 5470 + }, + { + "epoch": 0.9960862837899336, + "grad_norm": 14.8125, + "learning_rate": 8.050431924032176e-06, + "loss": 1.3181432485580444, + "step": 5472 + }, + { + "epoch": 0.9964503504141258, + "grad_norm": 92.5, + "learning_rate": 8.049116108221018e-06, + "loss": 0.8120696544647217, + "step": 5474 + }, + { + "epoch": 0.996814417038318, + "grad_norm": 7.375, + "learning_rate": 8.047799991710376e-06, + "loss": 1.399395227432251, + "step": 5476 + }, + { + "epoch": 0.9971784836625103, + "grad_norm": 5.90625, + "learning_rate": 8.046483574693384e-06, + "loss": 1.252817988395691, + "step": 5478 + }, + { + "epoch": 0.9975425502867025, + "grad_norm": 10.1875, + "learning_rate": 8.045166857363223e-06, + "loss": 1.1775716543197632, + "step": 5480 + }, + { + "epoch": 0.9979066169108947, + "grad_norm": 7.9375, + "learning_rate": 8.043849839913112e-06, + "loss": 1.3445779085159302, + "step": 5482 + }, + { + "epoch": 0.998270683535087, + "grad_norm": 14.3125, + "learning_rate": 8.04253252253632e-06, + "loss": 1.2628525495529175, + "step": 5484 + }, + { + "epoch": 0.9986347501592792, + "grad_norm": 7.8125, + "learning_rate": 8.041214905426155e-06, + "loss": 1.3635976314544678, + "step": 5486 + }, + { + "epoch": 0.9989988167834714, + "grad_norm": 5.09375, + "learning_rate": 8.039896988775979e-06, + "loss": 1.1852967739105225, + "step": 5488 + }, + { + "epoch": 0.9993628834076635, + "grad_norm": 16.375, + "learning_rate": 8.038578772779186e-06, + "loss": 1.8710843324661255, + "step": 5490 + }, + { + "epoch": 0.9997269500318559, + "grad_norm": 22.75, + "learning_rate": 8.03726025762922e-06, + "loss": 1.5698821544647217, + "step": 5492 + }, + { + "epoch": 1.0, + "grad_norm": 21.875, + "learning_rate": 8.035941443519568e-06, + "loss": 1.4250874519348145, + "step": 5494 + }, + { + "epoch": 1.0003640666241922, + "grad_norm": 6.125, + "learning_rate": 8.034622330643759e-06, + "loss": 1.3944168090820312, + "step": 5496 + }, + { + "epoch": 1.0007281332483844, + "grad_norm": 36.5, + "learning_rate": 8.033302919195369e-06, + "loss": 0.8998700976371765, + "step": 5498 + }, + { + "epoch": 1.0010921998725766, + "grad_norm": 12.1875, + "learning_rate": 8.031983209368015e-06, + "loss": 1.4408015012741089, + "step": 5500 + }, + { + "epoch": 1.001456266496769, + "grad_norm": 9.3125, + "learning_rate": 8.030663201355359e-06, + "loss": 1.031071424484253, + "step": 5502 + }, + { + "epoch": 1.0018203331209612, + "grad_norm": 111.5, + "learning_rate": 8.029342895351111e-06, + "loss": 1.486803650856018, + "step": 5504 + }, + { + "epoch": 1.0021843997451534, + "grad_norm": 25.75, + "learning_rate": 8.028022291549015e-06, + "loss": 0.2996535897254944, + "step": 5506 + }, + { + "epoch": 1.0025484663693456, + "grad_norm": 10.6875, + "learning_rate": 8.026701390142867e-06, + "loss": 1.3124223947525024, + "step": 5508 + }, + { + "epoch": 1.0029125329935378, + "grad_norm": 6.40625, + "learning_rate": 8.025380191326506e-06, + "loss": 1.3028819561004639, + "step": 5510 + }, + { + "epoch": 1.00327659961773, + "grad_norm": 12.875, + "learning_rate": 8.024058695293807e-06, + "loss": 1.241189956665039, + "step": 5512 + }, + { + "epoch": 1.0036406662419222, + "grad_norm": 16.75, + "learning_rate": 8.0227369022387e-06, + "loss": 1.8382549285888672, + "step": 5514 + }, + { + "epoch": 1.0040047328661146, + "grad_norm": 84.5, + "learning_rate": 8.021414812355146e-06, + "loss": 1.1665067672729492, + "step": 5516 + }, + { + "epoch": 1.0043687994903068, + "grad_norm": 13.8125, + "learning_rate": 8.020092425837162e-06, + "loss": 1.4099467992782593, + "step": 5518 + }, + { + "epoch": 1.004732866114499, + "grad_norm": 5.1875, + "learning_rate": 8.018769742878802e-06, + "loss": 1.3823487758636475, + "step": 5520 + }, + { + "epoch": 1.0050969327386912, + "grad_norm": 16.625, + "learning_rate": 8.017446763674165e-06, + "loss": 1.4213688373565674, + "step": 5522 + }, + { + "epoch": 1.0054609993628834, + "grad_norm": 9.4375, + "learning_rate": 8.016123488417389e-06, + "loss": 1.474638819694519, + "step": 5524 + }, + { + "epoch": 1.0058250659870756, + "grad_norm": 14.1875, + "learning_rate": 8.014799917302662e-06, + "loss": 1.3464412689208984, + "step": 5526 + }, + { + "epoch": 1.006189132611268, + "grad_norm": 14.6875, + "learning_rate": 8.013476050524212e-06, + "loss": 1.8093068599700928, + "step": 5528 + }, + { + "epoch": 1.0065531992354602, + "grad_norm": 7.8125, + "learning_rate": 8.012151888276313e-06, + "loss": 1.0855789184570312, + "step": 5530 + }, + { + "epoch": 1.0069172658596524, + "grad_norm": 9.375, + "learning_rate": 8.01082743075328e-06, + "loss": 1.3014432191848755, + "step": 5532 + }, + { + "epoch": 1.0072813324838445, + "grad_norm": 10.8125, + "learning_rate": 8.009502678149467e-06, + "loss": 1.377088189125061, + "step": 5534 + }, + { + "epoch": 1.0076453991080367, + "grad_norm": 9.0, + "learning_rate": 8.008177630659282e-06, + "loss": 1.4790308475494385, + "step": 5536 + }, + { + "epoch": 1.008009465732229, + "grad_norm": 8.1875, + "learning_rate": 8.006852288477167e-06, + "loss": 1.417969822883606, + "step": 5538 + }, + { + "epoch": 1.0083735323564211, + "grad_norm": 19.75, + "learning_rate": 8.005526651797615e-06, + "loss": 1.2785357236862183, + "step": 5540 + }, + { + "epoch": 1.0087375989806135, + "grad_norm": 10.875, + "learning_rate": 8.004200720815152e-06, + "loss": 1.297044277191162, + "step": 5542 + }, + { + "epoch": 1.0091016656048057, + "grad_norm": 31.5, + "learning_rate": 8.002874495724355e-06, + "loss": 1.5670397281646729, + "step": 5544 + }, + { + "epoch": 1.009465732228998, + "grad_norm": 5.28125, + "learning_rate": 8.001547976719844e-06, + "loss": 1.1834278106689453, + "step": 5546 + }, + { + "epoch": 1.0098297988531901, + "grad_norm": 2.828125, + "learning_rate": 8.000221163996277e-06, + "loss": 1.164804458618164, + "step": 5548 + }, + { + "epoch": 1.0101938654773823, + "grad_norm": 3.53125, + "learning_rate": 7.998894057748361e-06, + "loss": 0.9811059832572937, + "step": 5550 + }, + { + "epoch": 1.0105579321015745, + "grad_norm": 11.4375, + "learning_rate": 7.997566658170843e-06, + "loss": 1.6221890449523926, + "step": 5552 + }, + { + "epoch": 1.0109219987257667, + "grad_norm": 25.75, + "learning_rate": 7.996238965458516e-06, + "loss": 1.3792487382888794, + "step": 5554 + }, + { + "epoch": 1.0112860653499591, + "grad_norm": 12.375, + "learning_rate": 7.994910979806208e-06, + "loss": 1.555490493774414, + "step": 5556 + }, + { + "epoch": 1.0116501319741513, + "grad_norm": 6.6875, + "learning_rate": 7.9935827014088e-06, + "loss": 1.3472093343734741, + "step": 5558 + }, + { + "epoch": 1.0120141985983435, + "grad_norm": 9.625, + "learning_rate": 7.992254130461208e-06, + "loss": 1.4074777364730835, + "step": 5560 + }, + { + "epoch": 1.0123782652225357, + "grad_norm": 21.875, + "learning_rate": 7.990925267158398e-06, + "loss": 1.1080771684646606, + "step": 5562 + }, + { + "epoch": 1.012742331846728, + "grad_norm": 4.34375, + "learning_rate": 7.989596111695373e-06, + "loss": 0.7663303017616272, + "step": 5564 + }, + { + "epoch": 1.01310639847092, + "grad_norm": 18.875, + "learning_rate": 7.988266664267181e-06, + "loss": 1.478060245513916, + "step": 5566 + }, + { + "epoch": 1.0134704650951125, + "grad_norm": 9.8125, + "learning_rate": 7.986936925068913e-06, + "loss": 1.3367947340011597, + "step": 5568 + }, + { + "epoch": 1.0138345317193047, + "grad_norm": 9.8125, + "learning_rate": 7.985606894295705e-06, + "loss": 1.3023066520690918, + "step": 5570 + }, + { + "epoch": 1.014198598343497, + "grad_norm": 26.0, + "learning_rate": 7.984276572142733e-06, + "loss": 1.348774790763855, + "step": 5572 + }, + { + "epoch": 1.014562664967689, + "grad_norm": 5.71875, + "learning_rate": 7.982945958805215e-06, + "loss": 1.1589783430099487, + "step": 5574 + }, + { + "epoch": 1.0149267315918813, + "grad_norm": 35.0, + "learning_rate": 7.981615054478412e-06, + "loss": 1.486038327217102, + "step": 5576 + }, + { + "epoch": 1.0152907982160735, + "grad_norm": 5.21875, + "learning_rate": 7.980283859357633e-06, + "loss": 1.1041972637176514, + "step": 5578 + }, + { + "epoch": 1.0156548648402657, + "grad_norm": 9.5, + "learning_rate": 7.978952373638222e-06, + "loss": 1.3358068466186523, + "step": 5580 + }, + { + "epoch": 1.016018931464458, + "grad_norm": 11.9375, + "learning_rate": 7.97762059751557e-06, + "loss": 1.4987894296646118, + "step": 5582 + }, + { + "epoch": 1.0163829980886503, + "grad_norm": 4.40625, + "learning_rate": 7.976288531185112e-06, + "loss": 1.5133428573608398, + "step": 5584 + }, + { + "epoch": 1.0167470647128425, + "grad_norm": 7.71875, + "learning_rate": 7.974956174842319e-06, + "loss": 1.4151054620742798, + "step": 5586 + }, + { + "epoch": 1.0171111313370347, + "grad_norm": 11.625, + "learning_rate": 7.97362352868271e-06, + "loss": 1.5820132493972778, + "step": 5588 + }, + { + "epoch": 1.0174751979612269, + "grad_norm": 4.34375, + "learning_rate": 7.972290592901847e-06, + "loss": 0.930628776550293, + "step": 5590 + }, + { + "epoch": 1.017839264585419, + "grad_norm": 21.375, + "learning_rate": 7.970957367695335e-06, + "loss": 2.016417980194092, + "step": 5592 + }, + { + "epoch": 1.0182033312096113, + "grad_norm": 9.6875, + "learning_rate": 7.969623853258816e-06, + "loss": 1.2663755416870117, + "step": 5594 + }, + { + "epoch": 1.0185673978338037, + "grad_norm": 10.6875, + "learning_rate": 7.968290049787979e-06, + "loss": 1.3435295820236206, + "step": 5596 + }, + { + "epoch": 1.0189314644579959, + "grad_norm": 12.8125, + "learning_rate": 7.966955957478553e-06, + "loss": 1.3806339502334595, + "step": 5598 + }, + { + "epoch": 1.019295531082188, + "grad_norm": 32.0, + "learning_rate": 7.96562157652631e-06, + "loss": 1.3372187614440918, + "step": 5600 + }, + { + "epoch": 1.0196595977063803, + "grad_norm": 17.125, + "learning_rate": 7.96428690712707e-06, + "loss": 1.2605324983596802, + "step": 5602 + }, + { + "epoch": 1.0200236643305725, + "grad_norm": 28.625, + "learning_rate": 7.962951949476685e-06, + "loss": 1.2971853017807007, + "step": 5604 + }, + { + "epoch": 1.0203877309547646, + "grad_norm": 6.46875, + "learning_rate": 7.961616703771055e-06, + "loss": 1.1381902694702148, + "step": 5606 + }, + { + "epoch": 1.0207517975789568, + "grad_norm": 16.875, + "learning_rate": 7.960281170206128e-06, + "loss": 1.4832932949066162, + "step": 5608 + }, + { + "epoch": 1.0211158642031493, + "grad_norm": 16.625, + "learning_rate": 7.95894534897788e-06, + "loss": 1.4112111330032349, + "step": 5610 + }, + { + "epoch": 1.0214799308273415, + "grad_norm": 32.25, + "learning_rate": 7.957609240282342e-06, + "loss": 1.7598168849945068, + "step": 5612 + }, + { + "epoch": 1.0218439974515336, + "grad_norm": 3.828125, + "learning_rate": 7.95627284431558e-06, + "loss": 1.01741623878479, + "step": 5614 + }, + { + "epoch": 1.0222080640757258, + "grad_norm": 7.90625, + "learning_rate": 7.954936161273707e-06, + "loss": 1.0698068141937256, + "step": 5616 + }, + { + "epoch": 1.022572130699918, + "grad_norm": 7.59375, + "learning_rate": 7.953599191352876e-06, + "loss": 1.4497947692871094, + "step": 5618 + }, + { + "epoch": 1.0229361973241102, + "grad_norm": 6.71875, + "learning_rate": 7.95226193474928e-06, + "loss": 1.1009907722473145, + "step": 5620 + }, + { + "epoch": 1.0233002639483026, + "grad_norm": 11.625, + "learning_rate": 7.950924391659159e-06, + "loss": 1.5086032152175903, + "step": 5622 + }, + { + "epoch": 1.0236643305724948, + "grad_norm": 16.125, + "learning_rate": 7.949586562278788e-06, + "loss": 1.419341802597046, + "step": 5624 + }, + { + "epoch": 1.024028397196687, + "grad_norm": 10.625, + "learning_rate": 7.948248446804492e-06, + "loss": 1.4290971755981445, + "step": 5626 + }, + { + "epoch": 1.0243924638208792, + "grad_norm": 18.875, + "learning_rate": 7.946910045432627e-06, + "loss": 1.3870618343353271, + "step": 5628 + }, + { + "epoch": 1.0247565304450714, + "grad_norm": 7.78125, + "learning_rate": 7.945571358359607e-06, + "loss": 1.3220620155334473, + "step": 5630 + }, + { + "epoch": 1.0251205970692636, + "grad_norm": 7.90625, + "learning_rate": 7.944232385781874e-06, + "loss": 1.4463415145874023, + "step": 5632 + }, + { + "epoch": 1.0254846636934558, + "grad_norm": 11.25, + "learning_rate": 7.94289312789592e-06, + "loss": 1.3423161506652832, + "step": 5634 + }, + { + "epoch": 1.0258487303176482, + "grad_norm": 4.4375, + "learning_rate": 7.94155358489827e-06, + "loss": 0.9303983449935913, + "step": 5636 + }, + { + "epoch": 1.0262127969418404, + "grad_norm": 11.6875, + "learning_rate": 7.940213756985503e-06, + "loss": 1.551761269569397, + "step": 5638 + }, + { + "epoch": 1.0265768635660326, + "grad_norm": 23.5, + "learning_rate": 7.938873644354225e-06, + "loss": 0.8018133640289307, + "step": 5640 + }, + { + "epoch": 1.0269409301902248, + "grad_norm": 8.625, + "learning_rate": 7.9375332472011e-06, + "loss": 0.8830986022949219, + "step": 5642 + }, + { + "epoch": 1.027304996814417, + "grad_norm": 24.125, + "learning_rate": 7.936192565722824e-06, + "loss": 0.9009889364242554, + "step": 5644 + }, + { + "epoch": 1.0276690634386092, + "grad_norm": 24.125, + "learning_rate": 7.934851600116136e-06, + "loss": 1.7313756942749023, + "step": 5646 + }, + { + "epoch": 1.0280331300628014, + "grad_norm": 12.375, + "learning_rate": 7.933510350577816e-06, + "loss": 1.3145555257797241, + "step": 5648 + }, + { + "epoch": 1.0283971966869938, + "grad_norm": 7.28125, + "learning_rate": 7.932168817304689e-06, + "loss": 1.5072616338729858, + "step": 5650 + }, + { + "epoch": 1.028761263311186, + "grad_norm": 7.84375, + "learning_rate": 7.930827000493614e-06, + "loss": 1.4100126028060913, + "step": 5652 + }, + { + "epoch": 1.0291253299353782, + "grad_norm": 7.5, + "learning_rate": 7.929484900341507e-06, + "loss": 1.5347495079040527, + "step": 5654 + }, + { + "epoch": 1.0294893965595704, + "grad_norm": 12.1875, + "learning_rate": 7.928142517045307e-06, + "loss": 1.370579481124878, + "step": 5656 + }, + { + "epoch": 1.0298534631837626, + "grad_norm": 14.125, + "learning_rate": 7.926799850802009e-06, + "loss": 1.1086434125900269, + "step": 5658 + }, + { + "epoch": 1.0302175298079548, + "grad_norm": 14.4375, + "learning_rate": 7.925456901808642e-06, + "loss": 1.4568206071853638, + "step": 5660 + }, + { + "epoch": 1.030581596432147, + "grad_norm": 8.25, + "learning_rate": 7.924113670262277e-06, + "loss": 0.6029893159866333, + "step": 5662 + }, + { + "epoch": 1.0309456630563394, + "grad_norm": 8.8125, + "learning_rate": 7.922770156360027e-06, + "loss": 1.4496461153030396, + "step": 5664 + }, + { + "epoch": 1.0313097296805316, + "grad_norm": 9.0, + "learning_rate": 7.921426360299052e-06, + "loss": 1.3289098739624023, + "step": 5666 + }, + { + "epoch": 1.0316737963047238, + "grad_norm": 7.40625, + "learning_rate": 7.920082282276542e-06, + "loss": 1.0863322019577026, + "step": 5668 + }, + { + "epoch": 1.032037862928916, + "grad_norm": 17.125, + "learning_rate": 7.918737922489741e-06, + "loss": 1.4817003011703491, + "step": 5670 + }, + { + "epoch": 1.0324019295531082, + "grad_norm": 10.3125, + "learning_rate": 7.917393281135923e-06, + "loss": 1.361006736755371, + "step": 5672 + }, + { + "epoch": 1.0327659961773004, + "grad_norm": 10.1875, + "learning_rate": 7.916048358412413e-06, + "loss": 1.1761387586593628, + "step": 5674 + }, + { + "epoch": 1.0331300628014928, + "grad_norm": 24.625, + "learning_rate": 7.914703154516572e-06, + "loss": 1.8891961574554443, + "step": 5676 + }, + { + "epoch": 1.033494129425685, + "grad_norm": 2.453125, + "learning_rate": 7.913357669645803e-06, + "loss": 1.049350380897522, + "step": 5678 + }, + { + "epoch": 1.0338581960498772, + "grad_norm": 10.0, + "learning_rate": 7.91201190399755e-06, + "loss": 1.3955374956130981, + "step": 5680 + }, + { + "epoch": 1.0342222626740694, + "grad_norm": 2.78125, + "learning_rate": 7.910665857769298e-06, + "loss": 1.0531591176986694, + "step": 5682 + }, + { + "epoch": 1.0345863292982616, + "grad_norm": 12.0625, + "learning_rate": 7.909319531158577e-06, + "loss": 1.6384118795394897, + "step": 5684 + }, + { + "epoch": 1.0349503959224537, + "grad_norm": 13.125, + "learning_rate": 7.90797292436295e-06, + "loss": 1.4781283140182495, + "step": 5686 + }, + { + "epoch": 1.035314462546646, + "grad_norm": 17.0, + "learning_rate": 7.906626037580027e-06, + "loss": 0.19681823253631592, + "step": 5688 + }, + { + "epoch": 1.0356785291708384, + "grad_norm": 8.1875, + "learning_rate": 7.905278871007465e-06, + "loss": 1.3952281475067139, + "step": 5690 + }, + { + "epoch": 1.0360425957950306, + "grad_norm": 6.90625, + "learning_rate": 7.903931424842946e-06, + "loss": 1.3457505702972412, + "step": 5692 + }, + { + "epoch": 1.0364066624192227, + "grad_norm": 13.75, + "learning_rate": 7.902583699284207e-06, + "loss": 1.4581931829452515, + "step": 5694 + }, + { + "epoch": 1.036770729043415, + "grad_norm": 6.78125, + "learning_rate": 7.901235694529021e-06, + "loss": 0.7864412069320679, + "step": 5696 + }, + { + "epoch": 1.0371347956676071, + "grad_norm": 10.1875, + "learning_rate": 7.899887410775202e-06, + "loss": 0.9320163130760193, + "step": 5698 + }, + { + "epoch": 1.0374988622917993, + "grad_norm": 17.875, + "learning_rate": 7.898538848220607e-06, + "loss": 1.3165302276611328, + "step": 5700 + }, + { + "epoch": 1.0378629289159915, + "grad_norm": 10.4375, + "learning_rate": 7.897190007063129e-06, + "loss": 1.2565268278121948, + "step": 5702 + }, + { + "epoch": 1.038226995540184, + "grad_norm": 4.3125, + "learning_rate": 7.895840887500703e-06, + "loss": 1.0381525754928589, + "step": 5704 + }, + { + "epoch": 1.0385910621643761, + "grad_norm": 17.625, + "learning_rate": 7.894491489731313e-06, + "loss": 1.5520533323287964, + "step": 5706 + }, + { + "epoch": 1.0389551287885683, + "grad_norm": 7.375, + "learning_rate": 7.893141813952973e-06, + "loss": 0.8928835391998291, + "step": 5708 + }, + { + "epoch": 1.0393191954127605, + "grad_norm": 3.734375, + "learning_rate": 7.891791860363747e-06, + "loss": 1.226557731628418, + "step": 5710 + }, + { + "epoch": 1.0396832620369527, + "grad_norm": 20.125, + "learning_rate": 7.89044162916173e-06, + "loss": 1.661374568939209, + "step": 5712 + }, + { + "epoch": 1.040047328661145, + "grad_norm": 12.375, + "learning_rate": 7.889091120545064e-06, + "loss": 1.4480359554290771, + "step": 5714 + }, + { + "epoch": 1.040411395285337, + "grad_norm": 4.5, + "learning_rate": 7.887740334711935e-06, + "loss": 1.0804306268692017, + "step": 5716 + }, + { + "epoch": 1.0407754619095295, + "grad_norm": 14.375, + "learning_rate": 7.88638927186056e-06, + "loss": 1.383379578590393, + "step": 5718 + }, + { + "epoch": 1.0411395285337217, + "grad_norm": 16.5, + "learning_rate": 7.885037932189206e-06, + "loss": 1.3997722864151, + "step": 5720 + }, + { + "epoch": 1.041503595157914, + "grad_norm": 8.5625, + "learning_rate": 7.883686315896173e-06, + "loss": 1.4699680805206299, + "step": 5722 + }, + { + "epoch": 1.041867661782106, + "grad_norm": 13.75, + "learning_rate": 7.882334423179807e-06, + "loss": 1.2019716501235962, + "step": 5724 + }, + { + "epoch": 1.0422317284062983, + "grad_norm": 9.0625, + "learning_rate": 7.880982254238495e-06, + "loss": 1.3388502597808838, + "step": 5726 + }, + { + "epoch": 1.0425957950304905, + "grad_norm": 10.5, + "learning_rate": 7.879629809270657e-06, + "loss": 1.1779283285140991, + "step": 5728 + }, + { + "epoch": 1.042959861654683, + "grad_norm": 19.625, + "learning_rate": 7.878277088474764e-06, + "loss": 1.9388954639434814, + "step": 5730 + }, + { + "epoch": 1.043323928278875, + "grad_norm": 2.96875, + "learning_rate": 7.87692409204932e-06, + "loss": 0.8207886815071106, + "step": 5732 + }, + { + "epoch": 1.0436879949030673, + "grad_norm": 31.625, + "learning_rate": 7.875570820192873e-06, + "loss": 1.4753812551498413, + "step": 5734 + }, + { + "epoch": 1.0440520615272595, + "grad_norm": 5.75, + "learning_rate": 7.874217273104008e-06, + "loss": 0.8696913719177246, + "step": 5736 + }, + { + "epoch": 1.0444161281514517, + "grad_norm": 16.75, + "learning_rate": 7.872863450981352e-06, + "loss": 1.227979302406311, + "step": 5738 + }, + { + "epoch": 1.0447801947756439, + "grad_norm": 11.3125, + "learning_rate": 7.871509354023577e-06, + "loss": 1.7013131380081177, + "step": 5740 + }, + { + "epoch": 1.045144261399836, + "grad_norm": 14.125, + "learning_rate": 7.870154982429387e-06, + "loss": 1.4951039552688599, + "step": 5742 + }, + { + "epoch": 1.0455083280240285, + "grad_norm": 3.703125, + "learning_rate": 7.868800336397536e-06, + "loss": 1.0460208654403687, + "step": 5744 + }, + { + "epoch": 1.0458723946482207, + "grad_norm": 5.90625, + "learning_rate": 7.867445416126804e-06, + "loss": 1.1345962285995483, + "step": 5746 + }, + { + "epoch": 1.0462364612724129, + "grad_norm": 16.125, + "learning_rate": 7.86609022181603e-06, + "loss": 2.0216028690338135, + "step": 5748 + }, + { + "epoch": 1.046600527896605, + "grad_norm": 15.125, + "learning_rate": 7.864734753664076e-06, + "loss": 1.5517833232879639, + "step": 5750 + }, + { + "epoch": 1.0469645945207973, + "grad_norm": 15.25, + "learning_rate": 7.863379011869856e-06, + "loss": 0.74480140209198, + "step": 5752 + }, + { + "epoch": 1.0473286611449895, + "grad_norm": 18.25, + "learning_rate": 7.862022996632315e-06, + "loss": 1.3665461540222168, + "step": 5754 + }, + { + "epoch": 1.0476927277691817, + "grad_norm": 12.5625, + "learning_rate": 7.860666708150447e-06, + "loss": 2.0109081268310547, + "step": 5756 + }, + { + "epoch": 1.048056794393374, + "grad_norm": 4.09375, + "learning_rate": 7.85931014662328e-06, + "loss": 1.1212888956069946, + "step": 5758 + }, + { + "epoch": 1.0484208610175663, + "grad_norm": 36.0, + "learning_rate": 7.857953312249885e-06, + "loss": 0.11577693372964859, + "step": 5760 + }, + { + "epoch": 1.0487849276417585, + "grad_norm": 10.5625, + "learning_rate": 7.85659620522937e-06, + "loss": 1.418005347251892, + "step": 5762 + }, + { + "epoch": 1.0491489942659507, + "grad_norm": 45.5, + "learning_rate": 7.855238825760885e-06, + "loss": 0.9027985334396362, + "step": 5764 + }, + { + "epoch": 1.0495130608901428, + "grad_norm": 90.5, + "learning_rate": 7.853881174043623e-06, + "loss": 1.3457250595092773, + "step": 5766 + }, + { + "epoch": 1.049877127514335, + "grad_norm": 3.359375, + "learning_rate": 7.852523250276809e-06, + "loss": 0.7009027004241943, + "step": 5768 + }, + { + "epoch": 1.0502411941385275, + "grad_norm": 2.65625, + "learning_rate": 7.85116505465972e-06, + "loss": 0.751906156539917, + "step": 5770 + }, + { + "epoch": 1.0506052607627197, + "grad_norm": 6.0, + "learning_rate": 7.849806587391657e-06, + "loss": 1.1912050247192383, + "step": 5772 + }, + { + "epoch": 1.0509693273869118, + "grad_norm": 20.375, + "learning_rate": 7.848447848671976e-06, + "loss": 0.5299519896507263, + "step": 5774 + }, + { + "epoch": 1.051333394011104, + "grad_norm": 26.125, + "learning_rate": 7.847088838700066e-06, + "loss": 1.519481897354126, + "step": 5776 + }, + { + "epoch": 1.0516974606352962, + "grad_norm": 28.25, + "learning_rate": 7.84572955767535e-06, + "loss": 1.52915358543396, + "step": 5778 + }, + { + "epoch": 1.0520615272594884, + "grad_norm": 14.875, + "learning_rate": 7.844370005797304e-06, + "loss": 1.8322023153305054, + "step": 5780 + }, + { + "epoch": 1.0524255938836806, + "grad_norm": 2.5, + "learning_rate": 7.843010183265436e-06, + "loss": 1.0097967386245728, + "step": 5782 + }, + { + "epoch": 1.052789660507873, + "grad_norm": 34.25, + "learning_rate": 7.841650090279292e-06, + "loss": 2.2194175720214844, + "step": 5784 + }, + { + "epoch": 1.0531537271320652, + "grad_norm": 35.0, + "learning_rate": 7.840289727038457e-06, + "loss": 1.9527595043182373, + "step": 5786 + }, + { + "epoch": 1.0535177937562574, + "grad_norm": 20.625, + "learning_rate": 7.838929093742566e-06, + "loss": 1.8302507400512695, + "step": 5788 + }, + { + "epoch": 1.0538818603804496, + "grad_norm": 4.59375, + "learning_rate": 7.837568190591283e-06, + "loss": 1.1075001955032349, + "step": 5790 + }, + { + "epoch": 1.0542459270046418, + "grad_norm": 11.5, + "learning_rate": 7.836207017784314e-06, + "loss": 1.4602190256118774, + "step": 5792 + }, + { + "epoch": 1.054609993628834, + "grad_norm": 15.0625, + "learning_rate": 7.834845575521408e-06, + "loss": 1.7976443767547607, + "step": 5794 + }, + { + "epoch": 1.0549740602530262, + "grad_norm": 30.25, + "learning_rate": 7.833483864002347e-06, + "loss": 1.040452480316162, + "step": 5796 + }, + { + "epoch": 1.0553381268772186, + "grad_norm": 5.0625, + "learning_rate": 7.832121883426961e-06, + "loss": 1.0199024677276611, + "step": 5798 + }, + { + "epoch": 1.0557021935014108, + "grad_norm": 8.875, + "learning_rate": 7.830759633995116e-06, + "loss": 1.1002187728881836, + "step": 5800 + }, + { + "epoch": 1.056066260125603, + "grad_norm": 10.875, + "learning_rate": 7.82939711590671e-06, + "loss": 1.657292366027832, + "step": 5802 + }, + { + "epoch": 1.0564303267497952, + "grad_norm": 13.125, + "learning_rate": 7.828034329361694e-06, + "loss": 1.3729122877120972, + "step": 5804 + }, + { + "epoch": 1.0567943933739874, + "grad_norm": 33.5, + "learning_rate": 7.826671274560048e-06, + "loss": 0.6674918532371521, + "step": 5806 + }, + { + "epoch": 1.0571584599981796, + "grad_norm": 13.6875, + "learning_rate": 7.825307951701795e-06, + "loss": 1.3292901515960693, + "step": 5808 + }, + { + "epoch": 1.057522526622372, + "grad_norm": 16.0, + "learning_rate": 7.823944360986997e-06, + "loss": 1.1440261602401733, + "step": 5810 + }, + { + "epoch": 1.0578865932465642, + "grad_norm": 54.5, + "learning_rate": 7.822580502615755e-06, + "loss": 2.157811403274536, + "step": 5812 + }, + { + "epoch": 1.0582506598707564, + "grad_norm": 17.125, + "learning_rate": 7.821216376788211e-06, + "loss": 1.4427753686904907, + "step": 5814 + }, + { + "epoch": 1.0586147264949486, + "grad_norm": 8.9375, + "learning_rate": 7.819851983704548e-06, + "loss": 0.21290744841098785, + "step": 5816 + }, + { + "epoch": 1.0589787931191408, + "grad_norm": 10.5, + "learning_rate": 7.818487323564976e-06, + "loss": 1.7550071477890015, + "step": 5818 + }, + { + "epoch": 1.059342859743333, + "grad_norm": 46.75, + "learning_rate": 7.817122396569762e-06, + "loss": 1.3410418033599854, + "step": 5820 + }, + { + "epoch": 1.0597069263675252, + "grad_norm": 15.8125, + "learning_rate": 7.8157572029192e-06, + "loss": 1.7078181505203247, + "step": 5822 + }, + { + "epoch": 1.0600709929917176, + "grad_norm": 22.5, + "learning_rate": 7.814391742813627e-06, + "loss": 1.946914792060852, + "step": 5824 + }, + { + "epoch": 1.0604350596159098, + "grad_norm": 11.0625, + "learning_rate": 7.81302601645342e-06, + "loss": 1.6175501346588135, + "step": 5826 + }, + { + "epoch": 1.060799126240102, + "grad_norm": 9.3125, + "learning_rate": 7.811660024038992e-06, + "loss": 1.3057560920715332, + "step": 5828 + }, + { + "epoch": 1.0611631928642942, + "grad_norm": 16.125, + "learning_rate": 7.810293765770798e-06, + "loss": 1.3452696800231934, + "step": 5830 + }, + { + "epoch": 1.0615272594884864, + "grad_norm": 18.375, + "learning_rate": 7.80892724184933e-06, + "loss": 1.3865573406219482, + "step": 5832 + }, + { + "epoch": 1.0618913261126786, + "grad_norm": 9.6875, + "learning_rate": 7.807560452475117e-06, + "loss": 1.3722175359725952, + "step": 5834 + }, + { + "epoch": 1.0622553927368708, + "grad_norm": 4.21875, + "learning_rate": 7.806193397848735e-06, + "loss": 1.2886347770690918, + "step": 5836 + }, + { + "epoch": 1.0626194593610632, + "grad_norm": 18.875, + "learning_rate": 7.804826078170795e-06, + "loss": 1.3787620067596436, + "step": 5838 + }, + { + "epoch": 1.0629835259852554, + "grad_norm": 10.125, + "learning_rate": 7.80345849364194e-06, + "loss": 1.409644365310669, + "step": 5840 + }, + { + "epoch": 1.0633475926094476, + "grad_norm": 18.5, + "learning_rate": 7.802090644462858e-06, + "loss": 1.8887813091278076, + "step": 5842 + }, + { + "epoch": 1.0637116592336398, + "grad_norm": 11.375, + "learning_rate": 7.800722530834279e-06, + "loss": 1.4753693342208862, + "step": 5844 + }, + { + "epoch": 1.064075725857832, + "grad_norm": 12.875, + "learning_rate": 7.799354152956967e-06, + "loss": 1.5493706464767456, + "step": 5846 + }, + { + "epoch": 1.0644397924820241, + "grad_norm": 12.375, + "learning_rate": 7.797985511031724e-06, + "loss": 1.069637656211853, + "step": 5848 + }, + { + "epoch": 1.0648038591062163, + "grad_norm": 6.4375, + "learning_rate": 7.796616605259395e-06, + "loss": 1.4298814535140991, + "step": 5850 + }, + { + "epoch": 1.0651679257304087, + "grad_norm": 10.375, + "learning_rate": 7.79524743584086e-06, + "loss": 1.6018315553665161, + "step": 5852 + }, + { + "epoch": 1.065531992354601, + "grad_norm": 13.25, + "learning_rate": 7.793878002977038e-06, + "loss": 1.3345156908035278, + "step": 5854 + }, + { + "epoch": 1.0658960589787931, + "grad_norm": 9.625, + "learning_rate": 7.792508306868889e-06, + "loss": 1.5287530422210693, + "step": 5856 + }, + { + "epoch": 1.0662601256029853, + "grad_norm": 6.34375, + "learning_rate": 7.79113834771741e-06, + "loss": 1.1813935041427612, + "step": 5858 + }, + { + "epoch": 1.0666241922271775, + "grad_norm": 4.59375, + "learning_rate": 7.78976812572364e-06, + "loss": 1.168440818786621, + "step": 5860 + }, + { + "epoch": 1.0669882588513697, + "grad_norm": 30.25, + "learning_rate": 7.78839764108865e-06, + "loss": 1.3632657527923584, + "step": 5862 + }, + { + "epoch": 1.0673523254755621, + "grad_norm": 3.46875, + "learning_rate": 7.787026894013551e-06, + "loss": 0.8462130427360535, + "step": 5864 + }, + { + "epoch": 1.0677163920997543, + "grad_norm": 14.0, + "learning_rate": 7.7856558846995e-06, + "loss": 1.4883511066436768, + "step": 5866 + }, + { + "epoch": 1.0680804587239465, + "grad_norm": 17.875, + "learning_rate": 7.784284613347684e-06, + "loss": 1.332727074623108, + "step": 5868 + }, + { + "epoch": 1.0684445253481387, + "grad_norm": 13.25, + "learning_rate": 7.782913080159334e-06, + "loss": 1.416504979133606, + "step": 5870 + }, + { + "epoch": 1.068808591972331, + "grad_norm": 11.25, + "learning_rate": 7.781541285335712e-06, + "loss": 1.5033235549926758, + "step": 5872 + }, + { + "epoch": 1.069172658596523, + "grad_norm": 12.0, + "learning_rate": 7.780169229078127e-06, + "loss": 1.7850062847137451, + "step": 5874 + }, + { + "epoch": 1.0695367252207153, + "grad_norm": 21.0, + "learning_rate": 7.778796911587923e-06, + "loss": 1.450186014175415, + "step": 5876 + }, + { + "epoch": 1.0699007918449077, + "grad_norm": 27.125, + "learning_rate": 7.77742433306648e-06, + "loss": 1.6427778005599976, + "step": 5878 + }, + { + "epoch": 1.0702648584691, + "grad_norm": 4.125, + "learning_rate": 7.776051493715223e-06, + "loss": 1.117126703262329, + "step": 5880 + }, + { + "epoch": 1.070628925093292, + "grad_norm": 26.5, + "learning_rate": 7.774678393735602e-06, + "loss": 0.5835259556770325, + "step": 5882 + }, + { + "epoch": 1.0709929917174843, + "grad_norm": 14.5625, + "learning_rate": 7.773305033329121e-06, + "loss": 1.3859399557113647, + "step": 5884 + }, + { + "epoch": 1.0713570583416765, + "grad_norm": 6.3125, + "learning_rate": 7.771931412697314e-06, + "loss": 1.3661203384399414, + "step": 5886 + }, + { + "epoch": 1.0717211249658687, + "grad_norm": 18.125, + "learning_rate": 7.770557532041752e-06, + "loss": 1.1463693380355835, + "step": 5888 + }, + { + "epoch": 1.0720851915900609, + "grad_norm": 10.1875, + "learning_rate": 7.769183391564047e-06, + "loss": 1.524759292602539, + "step": 5890 + }, + { + "epoch": 1.0724492582142533, + "grad_norm": 16.375, + "learning_rate": 7.76780899146585e-06, + "loss": 1.4006747007369995, + "step": 5892 + }, + { + "epoch": 1.0728133248384455, + "grad_norm": 5.78125, + "learning_rate": 7.766434331948846e-06, + "loss": 0.8311281204223633, + "step": 5894 + }, + { + "epoch": 1.0731773914626377, + "grad_norm": 20.0, + "learning_rate": 7.765059413214767e-06, + "loss": 1.905245304107666, + "step": 5896 + }, + { + "epoch": 1.0735414580868299, + "grad_norm": 16.25, + "learning_rate": 7.763684235465367e-06, + "loss": 1.507002353668213, + "step": 5898 + }, + { + "epoch": 1.073905524711022, + "grad_norm": 10.25, + "learning_rate": 7.762308798902455e-06, + "loss": 1.3457531929016113, + "step": 5900 + }, + { + "epoch": 1.0742695913352143, + "grad_norm": 7.59375, + "learning_rate": 7.76093310372787e-06, + "loss": 1.5727198123931885, + "step": 5902 + }, + { + "epoch": 1.0746336579594065, + "grad_norm": 10.3125, + "learning_rate": 7.759557150143488e-06, + "loss": 1.3533425331115723, + "step": 5904 + }, + { + "epoch": 1.0749977245835989, + "grad_norm": 10.1875, + "learning_rate": 7.758180938351225e-06, + "loss": 1.562125563621521, + "step": 5906 + }, + { + "epoch": 1.075361791207791, + "grad_norm": 8.0625, + "learning_rate": 7.756804468553033e-06, + "loss": 1.0933337211608887, + "step": 5908 + }, + { + "epoch": 1.0757258578319833, + "grad_norm": 45.0, + "learning_rate": 7.755427740950908e-06, + "loss": 1.2925703525543213, + "step": 5910 + }, + { + "epoch": 1.0760899244561755, + "grad_norm": 37.0, + "learning_rate": 7.754050755746874e-06, + "loss": 2.092103958129883, + "step": 5912 + }, + { + "epoch": 1.0764539910803677, + "grad_norm": 15.3125, + "learning_rate": 7.752673513143e-06, + "loss": 1.3334004878997803, + "step": 5914 + }, + { + "epoch": 1.0768180577045599, + "grad_norm": 7.8125, + "learning_rate": 7.751296013341391e-06, + "loss": 1.2754844427108765, + "step": 5916 + }, + { + "epoch": 1.0771821243287523, + "grad_norm": 9.125, + "learning_rate": 7.749918256544192e-06, + "loss": 1.4102667570114136, + "step": 5918 + }, + { + "epoch": 1.0775461909529445, + "grad_norm": 18.0, + "learning_rate": 7.748540242953577e-06, + "loss": 1.1046556234359741, + "step": 5920 + }, + { + "epoch": 1.0779102575771367, + "grad_norm": 7.96875, + "learning_rate": 7.747161972771769e-06, + "loss": 1.364701271057129, + "step": 5922 + }, + { + "epoch": 1.0782743242013288, + "grad_norm": 15.5, + "learning_rate": 7.745783446201024e-06, + "loss": 1.830952763557434, + "step": 5924 + }, + { + "epoch": 1.078638390825521, + "grad_norm": 10.125, + "learning_rate": 7.74440466344363e-06, + "loss": 1.3364646434783936, + "step": 5926 + }, + { + "epoch": 1.0790024574497132, + "grad_norm": 30.375, + "learning_rate": 7.743025624701924e-06, + "loss": 1.4164223670959473, + "step": 5928 + }, + { + "epoch": 1.0793665240739054, + "grad_norm": 13.875, + "learning_rate": 7.741646330178269e-06, + "loss": 1.4897363185882568, + "step": 5930 + }, + { + "epoch": 1.0797305906980978, + "grad_norm": 9.1875, + "learning_rate": 7.740266780075074e-06, + "loss": 1.3814654350280762, + "step": 5932 + }, + { + "epoch": 1.08009465732229, + "grad_norm": 12.0, + "learning_rate": 7.738886974594784e-06, + "loss": 1.5612090826034546, + "step": 5934 + }, + { + "epoch": 1.0804587239464822, + "grad_norm": 3.9375, + "learning_rate": 7.737506913939875e-06, + "loss": 1.2420921325683594, + "step": 5936 + }, + { + "epoch": 1.0808227905706744, + "grad_norm": 14.25, + "learning_rate": 7.736126598312867e-06, + "loss": 2.12004017829895, + "step": 5938 + }, + { + "epoch": 1.0811868571948666, + "grad_norm": 10.5625, + "learning_rate": 7.734746027916319e-06, + "loss": 1.4516373872756958, + "step": 5940 + }, + { + "epoch": 1.0815509238190588, + "grad_norm": 4.0625, + "learning_rate": 7.73336520295282e-06, + "loss": 0.9513915777206421, + "step": 5942 + }, + { + "epoch": 1.081914990443251, + "grad_norm": 21.0, + "learning_rate": 7.731984123625002e-06, + "loss": 0.9905973672866821, + "step": 5944 + }, + { + "epoch": 1.0822790570674434, + "grad_norm": 13.9375, + "learning_rate": 7.730602790135536e-06, + "loss": 1.3266266584396362, + "step": 5946 + }, + { + "epoch": 1.0826431236916356, + "grad_norm": 27.625, + "learning_rate": 7.729221202687123e-06, + "loss": 1.8169238567352295, + "step": 5948 + }, + { + "epoch": 1.0830071903158278, + "grad_norm": 15.875, + "learning_rate": 7.727839361482505e-06, + "loss": 1.5460846424102783, + "step": 5950 + }, + { + "epoch": 1.08337125694002, + "grad_norm": 16.0, + "learning_rate": 7.726457266724463e-06, + "loss": 0.6210437417030334, + "step": 5952 + }, + { + "epoch": 1.0837353235642122, + "grad_norm": 7.65625, + "learning_rate": 7.725074918615816e-06, + "loss": 1.2871100902557373, + "step": 5954 + }, + { + "epoch": 1.0840993901884044, + "grad_norm": 25.875, + "learning_rate": 7.723692317359413e-06, + "loss": 2.0566678047180176, + "step": 5956 + }, + { + "epoch": 1.0844634568125966, + "grad_norm": 14.125, + "learning_rate": 7.72230946315815e-06, + "loss": 1.345217227935791, + "step": 5958 + }, + { + "epoch": 1.084827523436789, + "grad_norm": 46.75, + "learning_rate": 7.720926356214951e-06, + "loss": 1.9433022737503052, + "step": 5960 + }, + { + "epoch": 1.0851915900609812, + "grad_norm": 19.0, + "learning_rate": 7.719542996732784e-06, + "loss": 1.4428740739822388, + "step": 5962 + }, + { + "epoch": 1.0855556566851734, + "grad_norm": 12.0, + "learning_rate": 7.718159384914654e-06, + "loss": 1.2566156387329102, + "step": 5964 + }, + { + "epoch": 1.0859197233093656, + "grad_norm": 18.0, + "learning_rate": 7.716775520963595e-06, + "loss": 0.7551980018615723, + "step": 5966 + }, + { + "epoch": 1.0862837899335578, + "grad_norm": 10.875, + "learning_rate": 7.715391405082686e-06, + "loss": 1.1641606092453003, + "step": 5968 + }, + { + "epoch": 1.08664785655775, + "grad_norm": 16.625, + "learning_rate": 7.71400703747504e-06, + "loss": 0.7905727028846741, + "step": 5970 + }, + { + "epoch": 1.0870119231819424, + "grad_norm": 6.53125, + "learning_rate": 7.712622418343808e-06, + "loss": 1.0725716352462769, + "step": 5972 + }, + { + "epoch": 1.0873759898061346, + "grad_norm": 19.25, + "learning_rate": 7.711237547892174e-06, + "loss": 1.3367869853973389, + "step": 5974 + }, + { + "epoch": 1.0877400564303268, + "grad_norm": 10.625, + "learning_rate": 7.709852426323367e-06, + "loss": 1.3810782432556152, + "step": 5976 + }, + { + "epoch": 1.088104123054519, + "grad_norm": 11.1875, + "learning_rate": 7.708467053840647e-06, + "loss": 1.3132565021514893, + "step": 5978 + }, + { + "epoch": 1.0884681896787112, + "grad_norm": 29.25, + "learning_rate": 7.70708143064731e-06, + "loss": 1.1596884727478027, + "step": 5980 + }, + { + "epoch": 1.0888322563029034, + "grad_norm": 15.0625, + "learning_rate": 7.70569555694669e-06, + "loss": 1.4741616249084473, + "step": 5982 + }, + { + "epoch": 1.0891963229270956, + "grad_norm": 14.0, + "learning_rate": 7.704309432942161e-06, + "loss": 1.375129222869873, + "step": 5984 + }, + { + "epoch": 1.089560389551288, + "grad_norm": 31.25, + "learning_rate": 7.702923058837131e-06, + "loss": 2.0185065269470215, + "step": 5986 + }, + { + "epoch": 1.0899244561754802, + "grad_norm": 17.5, + "learning_rate": 7.701536434835042e-06, + "loss": 1.766796588897705, + "step": 5988 + }, + { + "epoch": 1.0902885227996724, + "grad_norm": 19.5, + "learning_rate": 7.700149561139377e-06, + "loss": 1.9443023204803467, + "step": 5990 + }, + { + "epoch": 1.0906525894238646, + "grad_norm": 11.6875, + "learning_rate": 7.698762437953653e-06, + "loss": 0.7731174230575562, + "step": 5992 + }, + { + "epoch": 1.0910166560480568, + "grad_norm": 11.625, + "learning_rate": 7.697375065481425e-06, + "loss": 1.0690741539001465, + "step": 5994 + }, + { + "epoch": 1.091380722672249, + "grad_norm": 20.75, + "learning_rate": 7.695987443926286e-06, + "loss": 1.2377794981002808, + "step": 5996 + }, + { + "epoch": 1.0917447892964414, + "grad_norm": 4.6875, + "learning_rate": 7.694599573491863e-06, + "loss": 0.9404780864715576, + "step": 5998 + }, + { + "epoch": 1.0921088559206336, + "grad_norm": 17.375, + "learning_rate": 7.693211454381822e-06, + "loss": 1.5230134725570679, + "step": 6000 + }, + { + "epoch": 1.0924729225448258, + "grad_norm": 19.75, + "learning_rate": 7.691823086799862e-06, + "loss": 0.7093430161476135, + "step": 6002 + }, + { + "epoch": 1.092836989169018, + "grad_norm": 15.0, + "learning_rate": 7.690434470949717e-06, + "loss": 1.7909890413284302, + "step": 6004 + }, + { + "epoch": 1.0932010557932101, + "grad_norm": 12.75, + "learning_rate": 7.689045607035166e-06, + "loss": 1.454171895980835, + "step": 6006 + }, + { + "epoch": 1.0935651224174023, + "grad_norm": 20.5, + "learning_rate": 7.68765649526002e-06, + "loss": 1.4441622495651245, + "step": 6008 + }, + { + "epoch": 1.0939291890415945, + "grad_norm": 18.625, + "learning_rate": 7.686267135828119e-06, + "loss": 0.9444335699081421, + "step": 6010 + }, + { + "epoch": 1.0942932556657867, + "grad_norm": 11.0, + "learning_rate": 7.684877528943348e-06, + "loss": 1.5426393747329712, + "step": 6012 + }, + { + "epoch": 1.0946573222899791, + "grad_norm": 12.6875, + "learning_rate": 7.68348767480963e-06, + "loss": 1.4582114219665527, + "step": 6014 + }, + { + "epoch": 1.0950213889141713, + "grad_norm": 24.5, + "learning_rate": 7.68209757363092e-06, + "loss": 1.5973546504974365, + "step": 6016 + }, + { + "epoch": 1.0953854555383635, + "grad_norm": 128.0, + "learning_rate": 7.680707225611208e-06, + "loss": 2.097869873046875, + "step": 6018 + }, + { + "epoch": 1.0957495221625557, + "grad_norm": 25.0, + "learning_rate": 7.67931663095452e-06, + "loss": 1.0489354133605957, + "step": 6020 + }, + { + "epoch": 1.096113588786748, + "grad_norm": 10.3125, + "learning_rate": 7.677925789864923e-06, + "loss": 1.5369552373886108, + "step": 6022 + }, + { + "epoch": 1.09647765541094, + "grad_norm": 19.625, + "learning_rate": 7.676534702546516e-06, + "loss": 1.3601114749908447, + "step": 6024 + }, + { + "epoch": 1.0968417220351325, + "grad_norm": 10.1875, + "learning_rate": 7.675143369203437e-06, + "loss": 1.0489931106567383, + "step": 6026 + }, + { + "epoch": 1.0972057886593247, + "grad_norm": 3.546875, + "learning_rate": 7.673751790039856e-06, + "loss": 1.155547022819519, + "step": 6028 + }, + { + "epoch": 1.097569855283517, + "grad_norm": 23.75, + "learning_rate": 7.672359965259984e-06, + "loss": 1.2132055759429932, + "step": 6030 + }, + { + "epoch": 1.097933921907709, + "grad_norm": 12.5, + "learning_rate": 7.670967895068065e-06, + "loss": 1.3862146139144897, + "step": 6032 + }, + { + "epoch": 1.0982979885319013, + "grad_norm": 4.375, + "learning_rate": 7.669575579668375e-06, + "loss": 1.0918357372283936, + "step": 6034 + }, + { + "epoch": 1.0986620551560935, + "grad_norm": 8.375, + "learning_rate": 7.668183019265238e-06, + "loss": 1.2735404968261719, + "step": 6036 + }, + { + "epoch": 1.0990261217802857, + "grad_norm": 15.0, + "learning_rate": 7.666790214063005e-06, + "loss": 1.5066826343536377, + "step": 6038 + }, + { + "epoch": 1.099390188404478, + "grad_norm": 13.75, + "learning_rate": 7.665397164266061e-06, + "loss": 1.4150091409683228, + "step": 6040 + }, + { + "epoch": 1.0997542550286703, + "grad_norm": 2.6875, + "learning_rate": 7.664003870078833e-06, + "loss": 1.1389790773391724, + "step": 6042 + }, + { + "epoch": 1.1001183216528625, + "grad_norm": 13.5625, + "learning_rate": 7.662610331705782e-06, + "loss": 1.240532398223877, + "step": 6044 + }, + { + "epoch": 1.1004823882770547, + "grad_norm": 124.0, + "learning_rate": 7.6612165493514e-06, + "loss": 0.4803674519062042, + "step": 6046 + }, + { + "epoch": 1.1008464549012469, + "grad_norm": 10.25, + "learning_rate": 7.659822523220225e-06, + "loss": 1.2698731422424316, + "step": 6048 + }, + { + "epoch": 1.101210521525439, + "grad_norm": 12.0, + "learning_rate": 7.658428253516818e-06, + "loss": 1.7715986967086792, + "step": 6050 + }, + { + "epoch": 1.1015745881496315, + "grad_norm": 76.5, + "learning_rate": 7.657033740445787e-06, + "loss": 1.7982748746871948, + "step": 6052 + }, + { + "epoch": 1.1019386547738237, + "grad_norm": 10.6875, + "learning_rate": 7.65563898421177e-06, + "loss": 1.2796062231063843, + "step": 6054 + }, + { + "epoch": 1.1023027213980159, + "grad_norm": 41.75, + "learning_rate": 7.654243985019442e-06, + "loss": 1.7099835872650146, + "step": 6056 + }, + { + "epoch": 1.102666788022208, + "grad_norm": 26.25, + "learning_rate": 7.652848743073512e-06, + "loss": 0.9476800560951233, + "step": 6058 + }, + { + "epoch": 1.1030308546464003, + "grad_norm": 12.5625, + "learning_rate": 7.651453258578731e-06, + "loss": 1.2216453552246094, + "step": 6060 + }, + { + "epoch": 1.1033949212705925, + "grad_norm": 9.75, + "learning_rate": 7.650057531739873e-06, + "loss": 1.2758268117904663, + "step": 6062 + }, + { + "epoch": 1.1037589878947847, + "grad_norm": 27.375, + "learning_rate": 7.648661562761763e-06, + "loss": 0.8147430419921875, + "step": 6064 + }, + { + "epoch": 1.104123054518977, + "grad_norm": 21.5, + "learning_rate": 7.647265351849246e-06, + "loss": 1.325575828552246, + "step": 6066 + }, + { + "epoch": 1.1044871211431693, + "grad_norm": 9.0, + "learning_rate": 7.645868899207219e-06, + "loss": 1.3994100093841553, + "step": 6068 + }, + { + "epoch": 1.1048511877673615, + "grad_norm": 42.0, + "learning_rate": 7.644472205040598e-06, + "loss": 1.3932067155838013, + "step": 6070 + }, + { + "epoch": 1.1052152543915537, + "grad_norm": 10.0625, + "learning_rate": 7.643075269554345e-06, + "loss": 1.6284995079040527, + "step": 6072 + }, + { + "epoch": 1.1055793210157459, + "grad_norm": 33.25, + "learning_rate": 7.641678092953456e-06, + "loss": 2.114983558654785, + "step": 6074 + }, + { + "epoch": 1.105943387639938, + "grad_norm": 17.75, + "learning_rate": 7.640280675442962e-06, + "loss": 1.3591622114181519, + "step": 6076 + }, + { + "epoch": 1.1063074542641302, + "grad_norm": 9.6875, + "learning_rate": 7.638883017227924e-06, + "loss": 1.5362677574157715, + "step": 6078 + }, + { + "epoch": 1.1066715208883227, + "grad_norm": 26.125, + "learning_rate": 7.637485118513447e-06, + "loss": 1.373355507850647, + "step": 6080 + }, + { + "epoch": 1.1070355875125149, + "grad_norm": 23.375, + "learning_rate": 7.636086979504663e-06, + "loss": 1.2211226224899292, + "step": 6082 + }, + { + "epoch": 1.107399654136707, + "grad_norm": 9.0, + "learning_rate": 7.634688600406745e-06, + "loss": 1.0737813711166382, + "step": 6084 + }, + { + "epoch": 1.1077637207608992, + "grad_norm": 14.75, + "learning_rate": 7.6332899814249e-06, + "loss": 1.4608807563781738, + "step": 6086 + }, + { + "epoch": 1.1081277873850914, + "grad_norm": 4.21875, + "learning_rate": 7.63189112276437e-06, + "loss": 1.29204523563385, + "step": 6088 + }, + { + "epoch": 1.1084918540092836, + "grad_norm": 16.375, + "learning_rate": 7.630492024630431e-06, + "loss": 1.6481047868728638, + "step": 6090 + }, + { + "epoch": 1.1088559206334758, + "grad_norm": 15.8125, + "learning_rate": 7.629092687228395e-06, + "loss": 1.7501280307769775, + "step": 6092 + }, + { + "epoch": 1.1092199872576682, + "grad_norm": 8.25, + "learning_rate": 7.62769311076361e-06, + "loss": 1.2036296129226685, + "step": 6094 + }, + { + "epoch": 1.1095840538818604, + "grad_norm": 9.125, + "learning_rate": 7.626293295441456e-06, + "loss": 1.2800333499908447, + "step": 6096 + }, + { + "epoch": 1.1099481205060526, + "grad_norm": 46.75, + "learning_rate": 7.624893241467353e-06, + "loss": 1.3343054056167603, + "step": 6098 + }, + { + "epoch": 1.1103121871302448, + "grad_norm": 20.0, + "learning_rate": 7.623492949046752e-06, + "loss": 1.3392642736434937, + "step": 6100 + }, + { + "epoch": 1.110676253754437, + "grad_norm": 11.75, + "learning_rate": 7.622092418385139e-06, + "loss": 1.4595372676849365, + "step": 6102 + }, + { + "epoch": 1.1110403203786292, + "grad_norm": 9.0625, + "learning_rate": 7.620691649688039e-06, + "loss": 1.3964455127716064, + "step": 6104 + }, + { + "epoch": 1.1114043870028216, + "grad_norm": 11.1875, + "learning_rate": 7.619290643161006e-06, + "loss": 1.163963794708252, + "step": 6106 + }, + { + "epoch": 1.1117684536270138, + "grad_norm": 8.625, + "learning_rate": 7.617889399009635e-06, + "loss": 1.1876957416534424, + "step": 6108 + }, + { + "epoch": 1.112132520251206, + "grad_norm": 9.375, + "learning_rate": 7.61648791743955e-06, + "loss": 1.3723902702331543, + "step": 6110 + }, + { + "epoch": 1.1124965868753982, + "grad_norm": 3.6875, + "learning_rate": 7.615086198656414e-06, + "loss": 1.1908726692199707, + "step": 6112 + }, + { + "epoch": 1.1128606534995904, + "grad_norm": 8.5, + "learning_rate": 7.613684242865924e-06, + "loss": 1.3020076751708984, + "step": 6114 + }, + { + "epoch": 1.1132247201237826, + "grad_norm": 14.8125, + "learning_rate": 7.612282050273812e-06, + "loss": 1.5242159366607666, + "step": 6116 + }, + { + "epoch": 1.1135887867479748, + "grad_norm": 17.0, + "learning_rate": 7.6108796210858425e-06, + "loss": 1.308373212814331, + "step": 6118 + }, + { + "epoch": 1.1139528533721672, + "grad_norm": 21.25, + "learning_rate": 7.6094769555078175e-06, + "loss": 1.6335344314575195, + "step": 6120 + }, + { + "epoch": 1.1143169199963594, + "grad_norm": 15.8125, + "learning_rate": 7.608074053745571e-06, + "loss": 1.460789680480957, + "step": 6122 + }, + { + "epoch": 1.1146809866205516, + "grad_norm": 13.875, + "learning_rate": 7.606670916004975e-06, + "loss": 1.5344470739364624, + "step": 6124 + }, + { + "epoch": 1.1150450532447438, + "grad_norm": 15.8125, + "learning_rate": 7.605267542491932e-06, + "loss": 1.3683898448944092, + "step": 6126 + }, + { + "epoch": 1.115409119868936, + "grad_norm": 24.25, + "learning_rate": 7.603863933412385e-06, + "loss": 1.812874436378479, + "step": 6128 + }, + { + "epoch": 1.1157731864931282, + "grad_norm": 8.5, + "learning_rate": 7.602460088972303e-06, + "loss": 1.38902747631073, + "step": 6130 + }, + { + "epoch": 1.1161372531173204, + "grad_norm": 10.9375, + "learning_rate": 7.601056009377699e-06, + "loss": 1.4262350797653198, + "step": 6132 + }, + { + "epoch": 1.1165013197415128, + "grad_norm": 18.875, + "learning_rate": 7.5996516948346135e-06, + "loss": 1.5712898969650269, + "step": 6134 + }, + { + "epoch": 1.116865386365705, + "grad_norm": 13.3125, + "learning_rate": 7.5982471455491244e-06, + "loss": 1.4469237327575684, + "step": 6136 + }, + { + "epoch": 1.1172294529898972, + "grad_norm": 9.4375, + "learning_rate": 7.596842361727346e-06, + "loss": 1.220457673072815, + "step": 6138 + }, + { + "epoch": 1.1175935196140894, + "grad_norm": 14.375, + "learning_rate": 7.595437343575421e-06, + "loss": 1.317972183227539, + "step": 6140 + }, + { + "epoch": 1.1179575862382816, + "grad_norm": 10.3125, + "learning_rate": 7.5940320912995304e-06, + "loss": 1.4042589664459229, + "step": 6142 + }, + { + "epoch": 1.1183216528624738, + "grad_norm": 11.0625, + "learning_rate": 7.592626605105891e-06, + "loss": 0.9241873025894165, + "step": 6144 + }, + { + "epoch": 1.118685719486666, + "grad_norm": 10.6875, + "learning_rate": 7.591220885200752e-06, + "loss": 1.404044270515442, + "step": 6146 + }, + { + "epoch": 1.1190497861108584, + "grad_norm": 12.3125, + "learning_rate": 7.589814931790396e-06, + "loss": 2.0191550254821777, + "step": 6148 + }, + { + "epoch": 1.1194138527350506, + "grad_norm": 11.125, + "learning_rate": 7.5884087450811414e-06, + "loss": 0.9703912734985352, + "step": 6150 + }, + { + "epoch": 1.1197779193592428, + "grad_norm": 12.0, + "learning_rate": 7.587002325279342e-06, + "loss": 0.9269189834594727, + "step": 6152 + }, + { + "epoch": 1.120141985983435, + "grad_norm": 4.8125, + "learning_rate": 7.585595672591382e-06, + "loss": 0.8698933720588684, + "step": 6154 + }, + { + "epoch": 1.1205060526076271, + "grad_norm": 4.75, + "learning_rate": 7.584188787223683e-06, + "loss": 0.9515825510025024, + "step": 6156 + }, + { + "epoch": 1.1208701192318193, + "grad_norm": 12.5, + "learning_rate": 7.5827816693827e-06, + "loss": 1.3375169038772583, + "step": 6158 + }, + { + "epoch": 1.1212341858560118, + "grad_norm": 15.5625, + "learning_rate": 7.581374319274924e-06, + "loss": 1.3456029891967773, + "step": 6160 + }, + { + "epoch": 1.121598252480204, + "grad_norm": 28.75, + "learning_rate": 7.579966737106872e-06, + "loss": 1.425205111503601, + "step": 6162 + }, + { + "epoch": 1.1219623191043961, + "grad_norm": 10.375, + "learning_rate": 7.578558923085104e-06, + "loss": 1.4412906169891357, + "step": 6164 + }, + { + "epoch": 1.1223263857285883, + "grad_norm": 28.5, + "learning_rate": 7.577150877416213e-06, + "loss": 2.089081287384033, + "step": 6166 + }, + { + "epoch": 1.1226904523527805, + "grad_norm": 7.21875, + "learning_rate": 7.575742600306821e-06, + "loss": 1.1414821147918701, + "step": 6168 + }, + { + "epoch": 1.1230545189769727, + "grad_norm": 8.875, + "learning_rate": 7.574334091963591e-06, + "loss": 1.154106616973877, + "step": 6170 + }, + { + "epoch": 1.123418585601165, + "grad_norm": 33.25, + "learning_rate": 7.572925352593212e-06, + "loss": 1.2359026670455933, + "step": 6172 + }, + { + "epoch": 1.1237826522253573, + "grad_norm": 8.625, + "learning_rate": 7.571516382402411e-06, + "loss": 1.3775465488433838, + "step": 6174 + }, + { + "epoch": 1.1241467188495495, + "grad_norm": 13.25, + "learning_rate": 7.57010718159795e-06, + "loss": 1.2567527294158936, + "step": 6176 + }, + { + "epoch": 1.1245107854737417, + "grad_norm": 2.28125, + "learning_rate": 7.5686977503866245e-06, + "loss": 0.9441748857498169, + "step": 6178 + }, + { + "epoch": 1.124874852097934, + "grad_norm": 9.625, + "learning_rate": 7.56728808897526e-06, + "loss": 1.390716791152954, + "step": 6180 + }, + { + "epoch": 1.1252389187221261, + "grad_norm": 10.0625, + "learning_rate": 7.565878197570719e-06, + "loss": 1.4376145601272583, + "step": 6182 + }, + { + "epoch": 1.1256029853463183, + "grad_norm": 12.125, + "learning_rate": 7.5644680763799e-06, + "loss": 1.4083622694015503, + "step": 6184 + }, + { + "epoch": 1.1259670519705107, + "grad_norm": 11.8125, + "learning_rate": 7.563057725609733e-06, + "loss": 1.1680452823638916, + "step": 6186 + }, + { + "epoch": 1.126331118594703, + "grad_norm": 3.796875, + "learning_rate": 7.561647145467177e-06, + "loss": 1.1869986057281494, + "step": 6188 + }, + { + "epoch": 1.1266951852188951, + "grad_norm": 20.625, + "learning_rate": 7.56023633615923e-06, + "loss": 1.40728759765625, + "step": 6190 + }, + { + "epoch": 1.1270592518430873, + "grad_norm": 5.75, + "learning_rate": 7.558825297892927e-06, + "loss": 1.409633755683899, + "step": 6192 + }, + { + "epoch": 1.1274233184672795, + "grad_norm": 37.75, + "learning_rate": 7.557414030875325e-06, + "loss": 1.965872049331665, + "step": 6194 + }, + { + "epoch": 1.1277873850914717, + "grad_norm": 29.5, + "learning_rate": 7.556002535313529e-06, + "loss": 1.337764859199524, + "step": 6196 + }, + { + "epoch": 1.1281514517156639, + "grad_norm": 25.375, + "learning_rate": 7.554590811414666e-06, + "loss": 1.0945656299591064, + "step": 6198 + }, + { + "epoch": 1.128515518339856, + "grad_norm": 7.4375, + "learning_rate": 7.553178859385901e-06, + "loss": 1.225588083267212, + "step": 6200 + }, + { + "epoch": 1.1288795849640485, + "grad_norm": 25.125, + "learning_rate": 7.551766679434433e-06, + "loss": 1.7256288528442383, + "step": 6202 + }, + { + "epoch": 1.1292436515882407, + "grad_norm": 4.625, + "learning_rate": 7.550354271767495e-06, + "loss": 0.9670466184616089, + "step": 6204 + }, + { + "epoch": 1.1296077182124329, + "grad_norm": 7.875, + "learning_rate": 7.5489416365923485e-06, + "loss": 1.3122233152389526, + "step": 6206 + }, + { + "epoch": 1.129971784836625, + "grad_norm": 12.75, + "learning_rate": 7.547528774116295e-06, + "loss": 0.5349314212799072, + "step": 6208 + }, + { + "epoch": 1.1303358514608173, + "grad_norm": 18.625, + "learning_rate": 7.546115684546667e-06, + "loss": 1.282565951347351, + "step": 6210 + }, + { + "epoch": 1.1306999180850095, + "grad_norm": 8.875, + "learning_rate": 7.544702368090826e-06, + "loss": 0.9437564611434937, + "step": 6212 + }, + { + "epoch": 1.1310639847092019, + "grad_norm": 24.25, + "learning_rate": 7.543288824956172e-06, + "loss": 1.8608406782150269, + "step": 6214 + }, + { + "epoch": 1.131428051333394, + "grad_norm": 12.1875, + "learning_rate": 7.541875055350138e-06, + "loss": 1.468324899673462, + "step": 6216 + }, + { + "epoch": 1.1317921179575863, + "grad_norm": 20.125, + "learning_rate": 7.540461059480191e-06, + "loss": 1.348197102546692, + "step": 6218 + }, + { + "epoch": 1.1321561845817785, + "grad_norm": 17.875, + "learning_rate": 7.539046837553823e-06, + "loss": 1.6343202590942383, + "step": 6220 + }, + { + "epoch": 1.1325202512059707, + "grad_norm": 10.4375, + "learning_rate": 7.537632389778571e-06, + "loss": 1.754242181777954, + "step": 6222 + }, + { + "epoch": 1.1328843178301629, + "grad_norm": 8.375, + "learning_rate": 7.536217716361995e-06, + "loss": 0.8625420928001404, + "step": 6224 + }, + { + "epoch": 1.133248384454355, + "grad_norm": 4.5, + "learning_rate": 7.5348028175116975e-06, + "loss": 1.041172742843628, + "step": 6226 + }, + { + "epoch": 1.1336124510785475, + "grad_norm": 23.25, + "learning_rate": 7.533387693435305e-06, + "loss": 1.2522087097167969, + "step": 6228 + }, + { + "epoch": 1.1339765177027397, + "grad_norm": 7.5625, + "learning_rate": 7.531972344340483e-06, + "loss": 1.4327716827392578, + "step": 6230 + }, + { + "epoch": 1.1343405843269319, + "grad_norm": 15.0625, + "learning_rate": 7.5305567704349294e-06, + "loss": 1.3205615282058716, + "step": 6232 + }, + { + "epoch": 1.134704650951124, + "grad_norm": 13.125, + "learning_rate": 7.5291409719263696e-06, + "loss": 1.954979419708252, + "step": 6234 + }, + { + "epoch": 1.1350687175753162, + "grad_norm": 8.3125, + "learning_rate": 7.527724949022572e-06, + "loss": 1.3674852848052979, + "step": 6236 + }, + { + "epoch": 1.1354327841995084, + "grad_norm": 6.40625, + "learning_rate": 7.526308701931328e-06, + "loss": 1.3474839925765991, + "step": 6238 + }, + { + "epoch": 1.1357968508237009, + "grad_norm": 14.1875, + "learning_rate": 7.52489223086047e-06, + "loss": 0.5992215275764465, + "step": 6240 + }, + { + "epoch": 1.136160917447893, + "grad_norm": 24.875, + "learning_rate": 7.523475536017855e-06, + "loss": 1.400850772857666, + "step": 6242 + }, + { + "epoch": 1.1365249840720852, + "grad_norm": 9.875, + "learning_rate": 7.522058617611379e-06, + "loss": 1.1672260761260986, + "step": 6244 + }, + { + "epoch": 1.1368890506962774, + "grad_norm": 9.375, + "learning_rate": 7.520641475848969e-06, + "loss": 1.4604942798614502, + "step": 6246 + }, + { + "epoch": 1.1372531173204696, + "grad_norm": 10.0625, + "learning_rate": 7.519224110938583e-06, + "loss": 1.3994964361190796, + "step": 6248 + }, + { + "epoch": 1.1376171839446618, + "grad_norm": 7.9375, + "learning_rate": 7.5178065230882205e-06, + "loss": 1.3514472246170044, + "step": 6250 + }, + { + "epoch": 1.137981250568854, + "grad_norm": 9.5625, + "learning_rate": 7.5163887125058975e-06, + "loss": 1.2155338525772095, + "step": 6252 + }, + { + "epoch": 1.1383453171930462, + "grad_norm": 8.875, + "learning_rate": 7.514970679399678e-06, + "loss": 1.4023786783218384, + "step": 6254 + }, + { + "epoch": 1.1387093838172386, + "grad_norm": 38.75, + "learning_rate": 7.5135524239776525e-06, + "loss": 1.0800318717956543, + "step": 6256 + }, + { + "epoch": 1.1390734504414308, + "grad_norm": 8.5625, + "learning_rate": 7.5121339464479395e-06, + "loss": 1.371211290359497, + "step": 6258 + }, + { + "epoch": 1.139437517065623, + "grad_norm": 10.125, + "learning_rate": 7.510715247018701e-06, + "loss": 1.4171255826950073, + "step": 6260 + }, + { + "epoch": 1.1398015836898152, + "grad_norm": 11.125, + "learning_rate": 7.509296325898121e-06, + "loss": 1.3335071802139282, + "step": 6262 + }, + { + "epoch": 1.1401656503140074, + "grad_norm": 9.625, + "learning_rate": 7.507877183294423e-06, + "loss": 1.3642916679382324, + "step": 6264 + }, + { + "epoch": 1.1405297169381996, + "grad_norm": 10.8125, + "learning_rate": 7.506457819415858e-06, + "loss": 1.3994640111923218, + "step": 6266 + }, + { + "epoch": 1.140893783562392, + "grad_norm": 15.8125, + "learning_rate": 7.505038234470712e-06, + "loss": 1.3100051879882812, + "step": 6268 + }, + { + "epoch": 1.1412578501865842, + "grad_norm": 19.125, + "learning_rate": 7.503618428667308e-06, + "loss": 1.3815580606460571, + "step": 6270 + }, + { + "epoch": 1.1416219168107764, + "grad_norm": 15.375, + "learning_rate": 7.502198402213989e-06, + "loss": 1.344904899597168, + "step": 6272 + }, + { + "epoch": 1.1419859834349686, + "grad_norm": 13.5, + "learning_rate": 7.500778155319146e-06, + "loss": 1.5381231307983398, + "step": 6274 + }, + { + "epoch": 1.1423500500591608, + "grad_norm": 10.875, + "learning_rate": 7.499357688191189e-06, + "loss": 1.9140883684158325, + "step": 6276 + }, + { + "epoch": 1.142714116683353, + "grad_norm": 14.125, + "learning_rate": 7.497937001038567e-06, + "loss": 1.6217155456542969, + "step": 6278 + }, + { + "epoch": 1.1430781833075452, + "grad_norm": 17.5, + "learning_rate": 7.496516094069761e-06, + "loss": 1.1441524028778076, + "step": 6280 + }, + { + "epoch": 1.1434422499317376, + "grad_norm": 47.25, + "learning_rate": 7.495094967493286e-06, + "loss": 1.0068614482879639, + "step": 6282 + }, + { + "epoch": 1.1438063165559298, + "grad_norm": 7.9375, + "learning_rate": 7.493673621517681e-06, + "loss": 1.4224953651428223, + "step": 6284 + }, + { + "epoch": 1.144170383180122, + "grad_norm": 20.0, + "learning_rate": 7.492252056351525e-06, + "loss": 1.3754998445510864, + "step": 6286 + }, + { + "epoch": 1.1445344498043142, + "grad_norm": 10.625, + "learning_rate": 7.4908302722034286e-06, + "loss": 1.2591030597686768, + "step": 6288 + }, + { + "epoch": 1.1448985164285064, + "grad_norm": 8.625, + "learning_rate": 7.48940826928203e-06, + "loss": 1.1596477031707764, + "step": 6290 + }, + { + "epoch": 1.1452625830526986, + "grad_norm": 14.1875, + "learning_rate": 7.487986047796005e-06, + "loss": 2.2149932384490967, + "step": 6292 + }, + { + "epoch": 1.145626649676891, + "grad_norm": 5.25, + "learning_rate": 7.486563607954058e-06, + "loss": 1.5302165746688843, + "step": 6294 + }, + { + "epoch": 1.1459907163010832, + "grad_norm": 5.75, + "learning_rate": 7.485140949964926e-06, + "loss": 1.1800148487091064, + "step": 6296 + }, + { + "epoch": 1.1463547829252754, + "grad_norm": 3.203125, + "learning_rate": 7.483718074037376e-06, + "loss": 1.3644418716430664, + "step": 6298 + }, + { + "epoch": 1.1467188495494676, + "grad_norm": 17.125, + "learning_rate": 7.482294980380216e-06, + "loss": 0.949133038520813, + "step": 6300 + }, + { + "epoch": 1.1470829161736598, + "grad_norm": 39.75, + "learning_rate": 7.480871669202272e-06, + "loss": 1.9215071201324463, + "step": 6302 + }, + { + "epoch": 1.147446982797852, + "grad_norm": 13.5, + "learning_rate": 7.479448140712413e-06, + "loss": 1.995676040649414, + "step": 6304 + }, + { + "epoch": 1.1478110494220441, + "grad_norm": 34.5, + "learning_rate": 7.478024395119534e-06, + "loss": 0.7667779922485352, + "step": 6306 + }, + { + "epoch": 1.1481751160462363, + "grad_norm": 46.75, + "learning_rate": 7.476600432632564e-06, + "loss": 1.5224629640579224, + "step": 6308 + }, + { + "epoch": 1.1485391826704288, + "grad_norm": 7.625, + "learning_rate": 7.475176253460466e-06, + "loss": 1.2831366062164307, + "step": 6310 + }, + { + "epoch": 1.148903249294621, + "grad_norm": 11.8125, + "learning_rate": 7.473751857812232e-06, + "loss": 1.342350721359253, + "step": 6312 + }, + { + "epoch": 1.1492673159188131, + "grad_norm": 37.5, + "learning_rate": 7.472327245896884e-06, + "loss": 1.3044123649597168, + "step": 6314 + }, + { + "epoch": 1.1496313825430053, + "grad_norm": 156.0, + "learning_rate": 7.4709024179234824e-06, + "loss": 1.45254647731781, + "step": 6316 + }, + { + "epoch": 1.1499954491671975, + "grad_norm": 21.5, + "learning_rate": 7.469477374101108e-06, + "loss": 1.532165288925171, + "step": 6318 + }, + { + "epoch": 1.1503595157913897, + "grad_norm": 8.6875, + "learning_rate": 7.468052114638886e-06, + "loss": 1.1146246194839478, + "step": 6320 + }, + { + "epoch": 1.1507235824155821, + "grad_norm": 11.625, + "learning_rate": 7.466626639745966e-06, + "loss": 0.7958694100379944, + "step": 6322 + }, + { + "epoch": 1.1510876490397743, + "grad_norm": 17.625, + "learning_rate": 7.46520094963153e-06, + "loss": 1.5166926383972168, + "step": 6324 + }, + { + "epoch": 1.1514517156639665, + "grad_norm": 6.21875, + "learning_rate": 7.463775044504793e-06, + "loss": 1.0953150987625122, + "step": 6326 + }, + { + "epoch": 1.1518157822881587, + "grad_norm": 13.25, + "learning_rate": 7.462348924575e-06, + "loss": 1.311816930770874, + "step": 6328 + }, + { + "epoch": 1.152179848912351, + "grad_norm": 13.8125, + "learning_rate": 7.460922590051427e-06, + "loss": 0.9509832262992859, + "step": 6330 + }, + { + "epoch": 1.1525439155365431, + "grad_norm": 18.5, + "learning_rate": 7.459496041143388e-06, + "loss": 1.926865816116333, + "step": 6332 + }, + { + "epoch": 1.1529079821607353, + "grad_norm": 8.5625, + "learning_rate": 7.4580692780602185e-06, + "loss": 1.442535638809204, + "step": 6334 + }, + { + "epoch": 1.1532720487849277, + "grad_norm": 8.125, + "learning_rate": 7.45664230101129e-06, + "loss": 1.306257724761963, + "step": 6336 + }, + { + "epoch": 1.15363611540912, + "grad_norm": 10.0625, + "learning_rate": 7.455215110206006e-06, + "loss": 1.2971405982971191, + "step": 6338 + }, + { + "epoch": 1.1540001820333121, + "grad_norm": 12.75, + "learning_rate": 7.453787705853805e-06, + "loss": 1.462487816810608, + "step": 6340 + }, + { + "epoch": 1.1543642486575043, + "grad_norm": 13.1875, + "learning_rate": 7.45236008816415e-06, + "loss": 1.573169469833374, + "step": 6342 + }, + { + "epoch": 1.1547283152816965, + "grad_norm": 17.125, + "learning_rate": 7.450932257346537e-06, + "loss": 2.021141529083252, + "step": 6344 + }, + { + "epoch": 1.1550923819058887, + "grad_norm": 13.5625, + "learning_rate": 7.449504213610494e-06, + "loss": 1.3924634456634521, + "step": 6346 + }, + { + "epoch": 1.1554564485300811, + "grad_norm": 17.0, + "learning_rate": 7.448075957165584e-06, + "loss": 1.3201172351837158, + "step": 6348 + }, + { + "epoch": 1.1558205151542733, + "grad_norm": 14.875, + "learning_rate": 7.446647488221394e-06, + "loss": 1.371399998664856, + "step": 6350 + }, + { + "epoch": 1.1561845817784655, + "grad_norm": 11.9375, + "learning_rate": 7.445218806987551e-06, + "loss": 1.3123196363449097, + "step": 6352 + }, + { + "epoch": 1.1565486484026577, + "grad_norm": 3.734375, + "learning_rate": 7.443789913673702e-06, + "loss": 1.211283564567566, + "step": 6354 + }, + { + "epoch": 1.15691271502685, + "grad_norm": 10.0, + "learning_rate": 7.442360808489535e-06, + "loss": 1.22006094455719, + "step": 6356 + }, + { + "epoch": 1.157276781651042, + "grad_norm": 31.375, + "learning_rate": 7.440931491644765e-06, + "loss": 0.45160406827926636, + "step": 6358 + }, + { + "epoch": 1.1576408482752343, + "grad_norm": 9.5625, + "learning_rate": 7.439501963349139e-06, + "loss": 1.3940057754516602, + "step": 6360 + }, + { + "epoch": 1.1580049148994265, + "grad_norm": 5.0, + "learning_rate": 7.438072223812434e-06, + "loss": 1.384699821472168, + "step": 6362 + }, + { + "epoch": 1.158368981523619, + "grad_norm": 4.25, + "learning_rate": 7.436642273244457e-06, + "loss": 1.4117671251296997, + "step": 6364 + }, + { + "epoch": 1.158733048147811, + "grad_norm": 11.4375, + "learning_rate": 7.435212111855048e-06, + "loss": 1.1795032024383545, + "step": 6366 + }, + { + "epoch": 1.1590971147720033, + "grad_norm": 18.0, + "learning_rate": 7.4337817398540775e-06, + "loss": 1.5126287937164307, + "step": 6368 + }, + { + "epoch": 1.1594611813961955, + "grad_norm": 9.125, + "learning_rate": 7.432351157451447e-06, + "loss": 1.2650301456451416, + "step": 6370 + }, + { + "epoch": 1.1598252480203877, + "grad_norm": 11.0, + "learning_rate": 7.430920364857092e-06, + "loss": 1.3520147800445557, + "step": 6372 + }, + { + "epoch": 1.1601893146445799, + "grad_norm": 18.75, + "learning_rate": 7.429489362280971e-06, + "loss": 1.5229713916778564, + "step": 6374 + }, + { + "epoch": 1.1605533812687723, + "grad_norm": 11.375, + "learning_rate": 7.428058149933079e-06, + "loss": 1.7363439798355103, + "step": 6376 + }, + { + "epoch": 1.1609174478929645, + "grad_norm": 12.4375, + "learning_rate": 7.4266267280234405e-06, + "loss": 1.5316307544708252, + "step": 6378 + }, + { + "epoch": 1.1612815145171567, + "grad_norm": 30.0, + "learning_rate": 7.4251950967621125e-06, + "loss": 1.9637465476989746, + "step": 6380 + }, + { + "epoch": 1.1616455811413489, + "grad_norm": 34.75, + "learning_rate": 7.42376325635918e-06, + "loss": 1.2630438804626465, + "step": 6382 + }, + { + "epoch": 1.162009647765541, + "grad_norm": 3.296875, + "learning_rate": 7.422331207024757e-06, + "loss": 1.0641576051712036, + "step": 6384 + }, + { + "epoch": 1.1623737143897332, + "grad_norm": 6.65625, + "learning_rate": 7.420898948968995e-06, + "loss": 1.1182897090911865, + "step": 6386 + }, + { + "epoch": 1.1627377810139254, + "grad_norm": 17.875, + "learning_rate": 7.419466482402071e-06, + "loss": 0.9243950843811035, + "step": 6388 + }, + { + "epoch": 1.1631018476381179, + "grad_norm": 9.75, + "learning_rate": 7.418033807534193e-06, + "loss": 1.0089322328567505, + "step": 6390 + }, + { + "epoch": 1.16346591426231, + "grad_norm": 8.5, + "learning_rate": 7.416600924575604e-06, + "loss": 1.3400102853775024, + "step": 6392 + }, + { + "epoch": 1.1638299808865022, + "grad_norm": 7.0, + "learning_rate": 7.415167833736565e-06, + "loss": 1.4119009971618652, + "step": 6394 + }, + { + "epoch": 1.1641940475106944, + "grad_norm": 14.0625, + "learning_rate": 7.413734535227386e-06, + "loss": 1.4507344961166382, + "step": 6396 + }, + { + "epoch": 1.1645581141348866, + "grad_norm": 25.125, + "learning_rate": 7.412301029258389e-06, + "loss": 1.4036744832992554, + "step": 6398 + }, + { + "epoch": 1.1649221807590788, + "grad_norm": 9.25, + "learning_rate": 7.410867316039943e-06, + "loss": 1.3667032718658447, + "step": 6400 + }, + { + "epoch": 1.1652862473832712, + "grad_norm": 4.0625, + "learning_rate": 7.409433395782433e-06, + "loss": 1.1058387756347656, + "step": 6402 + }, + { + "epoch": 1.1656503140074634, + "grad_norm": 8.375, + "learning_rate": 7.407999268696287e-06, + "loss": 1.0475088357925415, + "step": 6404 + }, + { + "epoch": 1.1660143806316556, + "grad_norm": 10.625, + "learning_rate": 7.406564934991953e-06, + "loss": 0.9343958497047424, + "step": 6406 + }, + { + "epoch": 1.1663784472558478, + "grad_norm": 8.125, + "learning_rate": 7.4051303948799135e-06, + "loss": 1.419680118560791, + "step": 6408 + }, + { + "epoch": 1.16674251388004, + "grad_norm": 47.0, + "learning_rate": 7.403695648570685e-06, + "loss": 1.5038814544677734, + "step": 6410 + }, + { + "epoch": 1.1671065805042322, + "grad_norm": 17.625, + "learning_rate": 7.40226069627481e-06, + "loss": 1.248150110244751, + "step": 6412 + }, + { + "epoch": 1.1674706471284244, + "grad_norm": 9.25, + "learning_rate": 7.400825538202861e-06, + "loss": 1.5459160804748535, + "step": 6414 + }, + { + "epoch": 1.1678347137526166, + "grad_norm": 21.25, + "learning_rate": 7.399390174565438e-06, + "loss": 1.4511265754699707, + "step": 6416 + }, + { + "epoch": 1.168198780376809, + "grad_norm": 12.1875, + "learning_rate": 7.3979546055731784e-06, + "loss": 1.095632553100586, + "step": 6418 + }, + { + "epoch": 1.1685628470010012, + "grad_norm": 15.1875, + "learning_rate": 7.396518831436748e-06, + "loss": 1.3852022886276245, + "step": 6420 + }, + { + "epoch": 1.1689269136251934, + "grad_norm": 13.1875, + "learning_rate": 7.395082852366837e-06, + "loss": 1.4753453731536865, + "step": 6422 + }, + { + "epoch": 1.1692909802493856, + "grad_norm": 23.0, + "learning_rate": 7.393646668574172e-06, + "loss": 1.6241109371185303, + "step": 6424 + }, + { + "epoch": 1.1696550468735778, + "grad_norm": 16.125, + "learning_rate": 7.392210280269507e-06, + "loss": 1.8432515859603882, + "step": 6426 + }, + { + "epoch": 1.1700191134977702, + "grad_norm": 10.5, + "learning_rate": 7.390773687663626e-06, + "loss": 1.3995847702026367, + "step": 6428 + }, + { + "epoch": 1.1703831801219624, + "grad_norm": 5.90625, + "learning_rate": 7.38933689096734e-06, + "loss": 1.1835557222366333, + "step": 6430 + }, + { + "epoch": 1.1707472467461546, + "grad_norm": 17.25, + "learning_rate": 7.387899890391499e-06, + "loss": 1.3437449932098389, + "step": 6432 + }, + { + "epoch": 1.1711113133703468, + "grad_norm": 9.25, + "learning_rate": 7.386462686146971e-06, + "loss": 1.2797107696533203, + "step": 6434 + }, + { + "epoch": 1.171475379994539, + "grad_norm": 12.8125, + "learning_rate": 7.385025278444664e-06, + "loss": 1.510704517364502, + "step": 6436 + }, + { + "epoch": 1.1718394466187312, + "grad_norm": 16.75, + "learning_rate": 7.3835876674955085e-06, + "loss": 1.4765868186950684, + "step": 6438 + }, + { + "epoch": 1.1722035132429234, + "grad_norm": 13.5625, + "learning_rate": 7.3821498535104715e-06, + "loss": 1.404543399810791, + "step": 6440 + }, + { + "epoch": 1.1725675798671156, + "grad_norm": 40.5, + "learning_rate": 7.380711836700547e-06, + "loss": 1.0411429405212402, + "step": 6442 + }, + { + "epoch": 1.172931646491308, + "grad_norm": 9.75, + "learning_rate": 7.379273617276755e-06, + "loss": 1.3396193981170654, + "step": 6444 + }, + { + "epoch": 1.1732957131155002, + "grad_norm": 15.0625, + "learning_rate": 7.377835195450147e-06, + "loss": 1.489682912826538, + "step": 6446 + }, + { + "epoch": 1.1736597797396924, + "grad_norm": 9.875, + "learning_rate": 7.376396571431808e-06, + "loss": 1.307582974433899, + "step": 6448 + }, + { + "epoch": 1.1740238463638846, + "grad_norm": 14.125, + "learning_rate": 7.374957745432853e-06, + "loss": 1.4828547239303589, + "step": 6450 + }, + { + "epoch": 1.1743879129880768, + "grad_norm": 14.75, + "learning_rate": 7.373518717664418e-06, + "loss": 1.586402416229248, + "step": 6452 + }, + { + "epoch": 1.174751979612269, + "grad_norm": 116.0, + "learning_rate": 7.37207948833768e-06, + "loss": 1.8769099712371826, + "step": 6454 + }, + { + "epoch": 1.1751160462364614, + "grad_norm": 12.5625, + "learning_rate": 7.3706400576638385e-06, + "loss": 1.3780337572097778, + "step": 6456 + }, + { + "epoch": 1.1754801128606536, + "grad_norm": 14.25, + "learning_rate": 7.369200425854119e-06, + "loss": 1.3824583292007446, + "step": 6458 + }, + { + "epoch": 1.1758441794848458, + "grad_norm": 34.75, + "learning_rate": 7.367760593119788e-06, + "loss": 1.509294867515564, + "step": 6460 + }, + { + "epoch": 1.176208246109038, + "grad_norm": 37.0, + "learning_rate": 7.366320559672136e-06, + "loss": 2.1823012828826904, + "step": 6462 + }, + { + "epoch": 1.1765723127332302, + "grad_norm": 6.96875, + "learning_rate": 7.364880325722474e-06, + "loss": 1.2436455488204956, + "step": 6464 + }, + { + "epoch": 1.1769363793574223, + "grad_norm": 5.40625, + "learning_rate": 7.3634398914821604e-06, + "loss": 1.1496533155441284, + "step": 6466 + }, + { + "epoch": 1.1773004459816145, + "grad_norm": 19.625, + "learning_rate": 7.361999257162564e-06, + "loss": 1.324145793914795, + "step": 6468 + }, + { + "epoch": 1.177664512605807, + "grad_norm": 15.8125, + "learning_rate": 7.360558422975099e-06, + "loss": 1.561884880065918, + "step": 6470 + }, + { + "epoch": 1.1780285792299992, + "grad_norm": 26.5, + "learning_rate": 7.3591173891311985e-06, + "loss": 1.5190938711166382, + "step": 6472 + }, + { + "epoch": 1.1783926458541913, + "grad_norm": 11.6875, + "learning_rate": 7.35767615584233e-06, + "loss": 1.820457935333252, + "step": 6474 + }, + { + "epoch": 1.1787567124783835, + "grad_norm": 10.125, + "learning_rate": 7.356234723319986e-06, + "loss": 1.3109452724456787, + "step": 6476 + }, + { + "epoch": 1.1791207791025757, + "grad_norm": 6.0625, + "learning_rate": 7.354793091775694e-06, + "loss": 1.2659820318222046, + "step": 6478 + }, + { + "epoch": 1.179484845726768, + "grad_norm": 10.125, + "learning_rate": 7.353351261421005e-06, + "loss": 1.1349012851715088, + "step": 6480 + }, + { + "epoch": 1.1798489123509603, + "grad_norm": 17.5, + "learning_rate": 7.351909232467505e-06, + "loss": 1.104724407196045, + "step": 6482 + }, + { + "epoch": 1.1802129789751525, + "grad_norm": 13.875, + "learning_rate": 7.350467005126802e-06, + "loss": 1.198919415473938, + "step": 6484 + }, + { + "epoch": 1.1805770455993447, + "grad_norm": 13.125, + "learning_rate": 7.349024579610542e-06, + "loss": 1.4486275911331177, + "step": 6486 + }, + { + "epoch": 1.180941112223537, + "grad_norm": 8.6875, + "learning_rate": 7.347581956130387e-06, + "loss": 1.340187430381775, + "step": 6488 + }, + { + "epoch": 1.1813051788477291, + "grad_norm": 14.5625, + "learning_rate": 7.346139134898045e-06, + "loss": 1.5551060438156128, + "step": 6490 + }, + { + "epoch": 1.1816692454719213, + "grad_norm": 14.8125, + "learning_rate": 7.344696116125241e-06, + "loss": 2.0186920166015625, + "step": 6492 + }, + { + "epoch": 1.1820333120961135, + "grad_norm": 7.5, + "learning_rate": 7.34325290002373e-06, + "loss": 1.1832094192504883, + "step": 6494 + }, + { + "epoch": 1.1823973787203057, + "grad_norm": 8.375, + "learning_rate": 7.3418094868053e-06, + "loss": 1.1233105659484863, + "step": 6496 + }, + { + "epoch": 1.1827614453444981, + "grad_norm": 27.0, + "learning_rate": 7.340365876681763e-06, + "loss": 0.8370498418807983, + "step": 6498 + }, + { + "epoch": 1.1831255119686903, + "grad_norm": 21.0, + "learning_rate": 7.3389220698649685e-06, + "loss": 0.9888896346092224, + "step": 6500 + }, + { + "epoch": 1.1834895785928825, + "grad_norm": 10.9375, + "learning_rate": 7.337478066566787e-06, + "loss": 0.07581327855587006, + "step": 6502 + }, + { + "epoch": 1.1838536452170747, + "grad_norm": 11.1875, + "learning_rate": 7.336033866999119e-06, + "loss": 0.45197027921676636, + "step": 6504 + }, + { + "epoch": 1.184217711841267, + "grad_norm": 11.125, + "learning_rate": 7.334589471373894e-06, + "loss": 1.5227274894714355, + "step": 6506 + }, + { + "epoch": 1.184581778465459, + "grad_norm": 21.125, + "learning_rate": 7.3331448799030735e-06, + "loss": 1.7064509391784668, + "step": 6508 + }, + { + "epoch": 1.1849458450896515, + "grad_norm": 12.0, + "learning_rate": 7.331700092798646e-06, + "loss": 1.5975301265716553, + "step": 6510 + }, + { + "epoch": 1.1853099117138437, + "grad_norm": 27.25, + "learning_rate": 7.330255110272626e-06, + "loss": 2.0628128051757812, + "step": 6512 + }, + { + "epoch": 1.185673978338036, + "grad_norm": 11.4375, + "learning_rate": 7.32880993253706e-06, + "loss": 1.7338371276855469, + "step": 6514 + }, + { + "epoch": 1.186038044962228, + "grad_norm": 14.6875, + "learning_rate": 7.32736455980402e-06, + "loss": 1.141144871711731, + "step": 6516 + }, + { + "epoch": 1.1864021115864203, + "grad_norm": 11.3125, + "learning_rate": 7.32591899228561e-06, + "loss": 1.7801835536956787, + "step": 6518 + }, + { + "epoch": 1.1867661782106125, + "grad_norm": 6.4375, + "learning_rate": 7.3244732301939625e-06, + "loss": 1.399308204650879, + "step": 6520 + }, + { + "epoch": 1.1871302448348047, + "grad_norm": 11.1875, + "learning_rate": 7.323027273741237e-06, + "loss": 1.177535057067871, + "step": 6522 + }, + { + "epoch": 1.187494311458997, + "grad_norm": 13.5, + "learning_rate": 7.32158112313962e-06, + "loss": 1.6235177516937256, + "step": 6524 + }, + { + "epoch": 1.1878583780831893, + "grad_norm": 11.5, + "learning_rate": 7.320134778601329e-06, + "loss": 1.449987769126892, + "step": 6526 + }, + { + "epoch": 1.1882224447073815, + "grad_norm": 14.3125, + "learning_rate": 7.318688240338607e-06, + "loss": 1.4296740293502808, + "step": 6528 + }, + { + "epoch": 1.1885865113315737, + "grad_norm": 16.75, + "learning_rate": 7.317241508563733e-06, + "loss": 1.3188289403915405, + "step": 6530 + }, + { + "epoch": 1.1889505779557659, + "grad_norm": 11.75, + "learning_rate": 7.315794583489006e-06, + "loss": 1.4860177040100098, + "step": 6532 + }, + { + "epoch": 1.189314644579958, + "grad_norm": 26.5, + "learning_rate": 7.314347465326757e-06, + "loss": 1.426046371459961, + "step": 6534 + }, + { + "epoch": 1.1896787112041505, + "grad_norm": 25.75, + "learning_rate": 7.31290015428934e-06, + "loss": 1.3271759748458862, + "step": 6536 + }, + { + "epoch": 1.1900427778283427, + "grad_norm": 11.1875, + "learning_rate": 7.311452650589148e-06, + "loss": 1.1695268154144287, + "step": 6538 + }, + { + "epoch": 1.1904068444525349, + "grad_norm": 13.1875, + "learning_rate": 7.310004954438594e-06, + "loss": 1.1791671514511108, + "step": 6540 + }, + { + "epoch": 1.190770911076727, + "grad_norm": 15.125, + "learning_rate": 7.308557066050126e-06, + "loss": 1.3848741054534912, + "step": 6542 + }, + { + "epoch": 1.1911349777009193, + "grad_norm": 9.1875, + "learning_rate": 7.307108985636206e-06, + "loss": 1.4438623189926147, + "step": 6544 + }, + { + "epoch": 1.1914990443251114, + "grad_norm": 12.0, + "learning_rate": 7.305660713409343e-06, + "loss": 1.3221287727355957, + "step": 6546 + }, + { + "epoch": 1.1918631109493036, + "grad_norm": 13.375, + "learning_rate": 7.304212249582059e-06, + "loss": 1.3651946783065796, + "step": 6548 + }, + { + "epoch": 1.1922271775734958, + "grad_norm": 9.875, + "learning_rate": 7.302763594366915e-06, + "loss": 1.4340474605560303, + "step": 6550 + }, + { + "epoch": 1.1925912441976882, + "grad_norm": 15.6875, + "learning_rate": 7.3013147479764936e-06, + "loss": 1.5670411586761475, + "step": 6552 + }, + { + "epoch": 1.1929553108218804, + "grad_norm": 16.625, + "learning_rate": 7.299865710623406e-06, + "loss": 1.788050651550293, + "step": 6554 + }, + { + "epoch": 1.1933193774460726, + "grad_norm": 8.25, + "learning_rate": 7.298416482520294e-06, + "loss": 1.226399540901184, + "step": 6556 + }, + { + "epoch": 1.1936834440702648, + "grad_norm": 22.625, + "learning_rate": 7.296967063879823e-06, + "loss": 0.8390741348266602, + "step": 6558 + }, + { + "epoch": 1.194047510694457, + "grad_norm": 5.21875, + "learning_rate": 7.295517454914694e-06, + "loss": 1.1856259107589722, + "step": 6560 + }, + { + "epoch": 1.1944115773186492, + "grad_norm": 17.125, + "learning_rate": 7.294067655837629e-06, + "loss": 1.42365562915802, + "step": 6562 + }, + { + "epoch": 1.1947756439428416, + "grad_norm": 8.0625, + "learning_rate": 7.292617666861377e-06, + "loss": 1.3596998453140259, + "step": 6564 + }, + { + "epoch": 1.1951397105670338, + "grad_norm": 10.1875, + "learning_rate": 7.291167488198723e-06, + "loss": 1.0605143308639526, + "step": 6566 + }, + { + "epoch": 1.195503777191226, + "grad_norm": 9.3125, + "learning_rate": 7.289717120062471e-06, + "loss": 1.1264477968215942, + "step": 6568 + }, + { + "epoch": 1.1958678438154182, + "grad_norm": 8.5625, + "learning_rate": 7.28826656266546e-06, + "loss": 1.0780694484710693, + "step": 6570 + }, + { + "epoch": 1.1962319104396104, + "grad_norm": 7.1875, + "learning_rate": 7.286815816220551e-06, + "loss": 1.5786877870559692, + "step": 6572 + }, + { + "epoch": 1.1965959770638026, + "grad_norm": 13.9375, + "learning_rate": 7.285364880940637e-06, + "loss": 1.3698921203613281, + "step": 6574 + }, + { + "epoch": 1.1969600436879948, + "grad_norm": 10.8125, + "learning_rate": 7.283913757038636e-06, + "loss": 1.3198425769805908, + "step": 6576 + }, + { + "epoch": 1.1973241103121872, + "grad_norm": 13.375, + "learning_rate": 7.282462444727492e-06, + "loss": 1.289044737815857, + "step": 6578 + }, + { + "epoch": 1.1976881769363794, + "grad_norm": 6.375, + "learning_rate": 7.281010944220184e-06, + "loss": 1.0620331764221191, + "step": 6580 + }, + { + "epoch": 1.1980522435605716, + "grad_norm": 147.0, + "learning_rate": 7.279559255729711e-06, + "loss": 1.3790700435638428, + "step": 6582 + }, + { + "epoch": 1.1984163101847638, + "grad_norm": 3.765625, + "learning_rate": 7.278107379469103e-06, + "loss": 0.8620489835739136, + "step": 6584 + }, + { + "epoch": 1.198780376808956, + "grad_norm": 12.1875, + "learning_rate": 7.276655315651415e-06, + "loss": 1.2727453708648682, + "step": 6586 + }, + { + "epoch": 1.1991444434331482, + "grad_norm": 11.25, + "learning_rate": 7.275203064489735e-06, + "loss": 1.532165288925171, + "step": 6588 + }, + { + "epoch": 1.1995085100573406, + "grad_norm": 12.5, + "learning_rate": 7.273750626197173e-06, + "loss": 1.5772807598114014, + "step": 6590 + }, + { + "epoch": 1.1998725766815328, + "grad_norm": 9.375, + "learning_rate": 7.272298000986868e-06, + "loss": 0.9821051359176636, + "step": 6592 + }, + { + "epoch": 1.200236643305725, + "grad_norm": 6.0, + "learning_rate": 7.270845189071989e-06, + "loss": 1.3076342344284058, + "step": 6594 + }, + { + "epoch": 1.2006007099299172, + "grad_norm": 14.9375, + "learning_rate": 7.269392190665727e-06, + "loss": 1.369358777999878, + "step": 6596 + }, + { + "epoch": 1.2009647765541094, + "grad_norm": 7.90625, + "learning_rate": 7.267939005981306e-06, + "loss": 1.7717223167419434, + "step": 6598 + }, + { + "epoch": 1.2013288431783016, + "grad_norm": 8.625, + "learning_rate": 7.266485635231975e-06, + "loss": 1.5751750469207764, + "step": 6600 + }, + { + "epoch": 1.2016929098024938, + "grad_norm": 13.125, + "learning_rate": 7.2650320786310095e-06, + "loss": 1.4999746084213257, + "step": 6602 + }, + { + "epoch": 1.202056976426686, + "grad_norm": 9.875, + "learning_rate": 7.2635783363917125e-06, + "loss": 1.3215546607971191, + "step": 6604 + }, + { + "epoch": 1.2024210430508784, + "grad_norm": 18.5, + "learning_rate": 7.262124408727416e-06, + "loss": 1.3302925825119019, + "step": 6606 + }, + { + "epoch": 1.2027851096750706, + "grad_norm": 4.25, + "learning_rate": 7.2606702958514775e-06, + "loss": 1.225448727607727, + "step": 6608 + }, + { + "epoch": 1.2031491762992628, + "grad_norm": 13.0, + "learning_rate": 7.259215997977282e-06, + "loss": 1.221543312072754, + "step": 6610 + }, + { + "epoch": 1.203513242923455, + "grad_norm": 27.0, + "learning_rate": 7.257761515318243e-06, + "loss": 1.6428560018539429, + "step": 6612 + }, + { + "epoch": 1.2038773095476472, + "grad_norm": 20.5, + "learning_rate": 7.256306848087797e-06, + "loss": 1.39435613155365, + "step": 6614 + }, + { + "epoch": 1.2042413761718394, + "grad_norm": 19.125, + "learning_rate": 7.254851996499412e-06, + "loss": 1.7749639749526978, + "step": 6616 + }, + { + "epoch": 1.2046054427960318, + "grad_norm": 14.875, + "learning_rate": 7.253396960766583e-06, + "loss": 1.3632102012634277, + "step": 6618 + }, + { + "epoch": 1.204969509420224, + "grad_norm": 11.375, + "learning_rate": 7.25194174110283e-06, + "loss": 1.3126219511032104, + "step": 6620 + }, + { + "epoch": 1.2053335760444162, + "grad_norm": 9.5625, + "learning_rate": 7.2504863377217e-06, + "loss": 0.8591881990432739, + "step": 6622 + }, + { + "epoch": 1.2056976426686083, + "grad_norm": 5.5625, + "learning_rate": 7.249030750836767e-06, + "loss": 1.1397168636322021, + "step": 6624 + }, + { + "epoch": 1.2060617092928005, + "grad_norm": 11.75, + "learning_rate": 7.247574980661635e-06, + "loss": 0.9087386131286621, + "step": 6626 + }, + { + "epoch": 1.2064257759169927, + "grad_norm": 25.375, + "learning_rate": 7.246119027409928e-06, + "loss": 1.1396706104278564, + "step": 6628 + }, + { + "epoch": 1.206789842541185, + "grad_norm": 22.375, + "learning_rate": 7.244662891295307e-06, + "loss": 1.324886679649353, + "step": 6630 + }, + { + "epoch": 1.2071539091653773, + "grad_norm": 9.9375, + "learning_rate": 7.24320657253145e-06, + "loss": 1.511910080909729, + "step": 6632 + }, + { + "epoch": 1.2075179757895695, + "grad_norm": 11.4375, + "learning_rate": 7.241750071332065e-06, + "loss": 1.3533015251159668, + "step": 6634 + }, + { + "epoch": 1.2078820424137617, + "grad_norm": 18.25, + "learning_rate": 7.2402933879108905e-06, + "loss": 1.3343080282211304, + "step": 6636 + }, + { + "epoch": 1.208246109037954, + "grad_norm": 10.3125, + "learning_rate": 7.238836522481687e-06, + "loss": 1.3347954750061035, + "step": 6638 + }, + { + "epoch": 1.2086101756621461, + "grad_norm": 17.0, + "learning_rate": 7.237379475258244e-06, + "loss": 1.3202630281448364, + "step": 6640 + }, + { + "epoch": 1.2089742422863383, + "grad_norm": 7.625, + "learning_rate": 7.235922246454381e-06, + "loss": 1.1637362241744995, + "step": 6642 + }, + { + "epoch": 1.2093383089105307, + "grad_norm": 3.703125, + "learning_rate": 7.234464836283935e-06, + "loss": 0.8209491968154907, + "step": 6644 + }, + { + "epoch": 1.209702375534723, + "grad_norm": 3.90625, + "learning_rate": 7.233007244960775e-06, + "loss": 0.9462370276451111, + "step": 6646 + }, + { + "epoch": 1.2100664421589151, + "grad_norm": 10.6875, + "learning_rate": 7.2315494726988e-06, + "loss": 0.38805603981018066, + "step": 6648 + }, + { + "epoch": 1.2104305087831073, + "grad_norm": 9.75, + "learning_rate": 7.23009151971193e-06, + "loss": 1.400206208229065, + "step": 6650 + }, + { + "epoch": 1.2107945754072995, + "grad_norm": 4.0625, + "learning_rate": 7.228633386214119e-06, + "loss": 0.9574282169342041, + "step": 6652 + }, + { + "epoch": 1.2111586420314917, + "grad_norm": 10.0625, + "learning_rate": 7.227175072419335e-06, + "loss": 1.3425127267837524, + "step": 6654 + }, + { + "epoch": 1.211522708655684, + "grad_norm": 75.0, + "learning_rate": 7.225716578541582e-06, + "loss": 1.3184183835983276, + "step": 6656 + }, + { + "epoch": 1.211886775279876, + "grad_norm": 31.375, + "learning_rate": 7.224257904794887e-06, + "loss": 1.531571626663208, + "step": 6658 + }, + { + "epoch": 1.2122508419040685, + "grad_norm": 151.0, + "learning_rate": 7.222799051393308e-06, + "loss": 0.5535272359848022, + "step": 6660 + }, + { + "epoch": 1.2126149085282607, + "grad_norm": 5.46875, + "learning_rate": 7.2213400185509245e-06, + "loss": 1.0640232563018799, + "step": 6662 + }, + { + "epoch": 1.212978975152453, + "grad_norm": 32.25, + "learning_rate": 7.2198808064818425e-06, + "loss": 1.5700925588607788, + "step": 6664 + }, + { + "epoch": 1.213343041776645, + "grad_norm": 9.0, + "learning_rate": 7.2184214154001965e-06, + "loss": 1.416872501373291, + "step": 6666 + }, + { + "epoch": 1.2137071084008373, + "grad_norm": 23.75, + "learning_rate": 7.216961845520143e-06, + "loss": 1.3927534818649292, + "step": 6668 + }, + { + "epoch": 1.2140711750250297, + "grad_norm": 10.4375, + "learning_rate": 7.215502097055871e-06, + "loss": 1.3334345817565918, + "step": 6670 + }, + { + "epoch": 1.214435241649222, + "grad_norm": 4.84375, + "learning_rate": 7.214042170221596e-06, + "loss": 1.010744333267212, + "step": 6672 + }, + { + "epoch": 1.214799308273414, + "grad_norm": 34.0, + "learning_rate": 7.212582065231548e-06, + "loss": 1.3117693662643433, + "step": 6674 + }, + { + "epoch": 1.2151633748976063, + "grad_norm": 22.75, + "learning_rate": 7.211121782299999e-06, + "loss": 1.4588491916656494, + "step": 6676 + }, + { + "epoch": 1.2155274415217985, + "grad_norm": 21.125, + "learning_rate": 7.209661321641235e-06, + "loss": 1.8186008930206299, + "step": 6678 + }, + { + "epoch": 1.2158915081459907, + "grad_norm": 31.125, + "learning_rate": 7.208200683469575e-06, + "loss": 1.4026299715042114, + "step": 6680 + }, + { + "epoch": 1.2162555747701829, + "grad_norm": 23.75, + "learning_rate": 7.2067398679993615e-06, + "loss": 1.360709309577942, + "step": 6682 + }, + { + "epoch": 1.216619641394375, + "grad_norm": 8.75, + "learning_rate": 7.205278875444963e-06, + "loss": 1.3849159479141235, + "step": 6684 + }, + { + "epoch": 1.2169837080185675, + "grad_norm": 9.375, + "learning_rate": 7.203817706020773e-06, + "loss": 1.5182673931121826, + "step": 6686 + }, + { + "epoch": 1.2173477746427597, + "grad_norm": 18.75, + "learning_rate": 7.202356359941214e-06, + "loss": 1.2989633083343506, + "step": 6688 + }, + { + "epoch": 1.2177118412669519, + "grad_norm": 7.5625, + "learning_rate": 7.200894837420731e-06, + "loss": 1.3390648365020752, + "step": 6690 + }, + { + "epoch": 1.218075907891144, + "grad_norm": 18.875, + "learning_rate": 7.199433138673799e-06, + "loss": 1.4473764896392822, + "step": 6692 + }, + { + "epoch": 1.2184399745153363, + "grad_norm": 8.25, + "learning_rate": 7.197971263914916e-06, + "loss": 1.149372935295105, + "step": 6694 + }, + { + "epoch": 1.2188040411395284, + "grad_norm": 11.0, + "learning_rate": 7.196509213358602e-06, + "loss": 0.6737701296806335, + "step": 6696 + }, + { + "epoch": 1.2191681077637209, + "grad_norm": 16.875, + "learning_rate": 7.1950469872194095e-06, + "loss": 0.5430116057395935, + "step": 6698 + }, + { + "epoch": 1.219532174387913, + "grad_norm": 12.5625, + "learning_rate": 7.193584585711917e-06, + "loss": 1.1017062664031982, + "step": 6700 + }, + { + "epoch": 1.2198962410121053, + "grad_norm": 12.9375, + "learning_rate": 7.1921220090507235e-06, + "loss": 0.9444236755371094, + "step": 6702 + }, + { + "epoch": 1.2202603076362974, + "grad_norm": 13.4375, + "learning_rate": 7.190659257450454e-06, + "loss": 1.5194355249404907, + "step": 6704 + }, + { + "epoch": 1.2206243742604896, + "grad_norm": 36.5, + "learning_rate": 7.1891963311257675e-06, + "loss": 1.3496087789535522, + "step": 6706 + }, + { + "epoch": 1.2209884408846818, + "grad_norm": 17.875, + "learning_rate": 7.187733230291336e-06, + "loss": 1.2691317796707153, + "step": 6708 + }, + { + "epoch": 1.221352507508874, + "grad_norm": 34.0, + "learning_rate": 7.186269955161867e-06, + "loss": 1.3138542175292969, + "step": 6710 + }, + { + "epoch": 1.2217165741330664, + "grad_norm": 3.5625, + "learning_rate": 7.184806505952091e-06, + "loss": 1.1663285493850708, + "step": 6712 + }, + { + "epoch": 1.2220806407572586, + "grad_norm": 5.34375, + "learning_rate": 7.18334288287676e-06, + "loss": 1.0912249088287354, + "step": 6714 + }, + { + "epoch": 1.2224447073814508, + "grad_norm": 13.5625, + "learning_rate": 7.181879086150658e-06, + "loss": 1.0867218971252441, + "step": 6716 + }, + { + "epoch": 1.222808774005643, + "grad_norm": 268.0, + "learning_rate": 7.180415115988588e-06, + "loss": 0.7946165800094604, + "step": 6718 + }, + { + "epoch": 1.2231728406298352, + "grad_norm": 15.0, + "learning_rate": 7.178950972605385e-06, + "loss": 1.2604389190673828, + "step": 6720 + }, + { + "epoch": 1.2235369072540274, + "grad_norm": 6.40625, + "learning_rate": 7.177486656215906e-06, + "loss": 1.27571439743042, + "step": 6722 + }, + { + "epoch": 1.2239009738782198, + "grad_norm": 11.1875, + "learning_rate": 7.176022167035031e-06, + "loss": 1.2737550735473633, + "step": 6724 + }, + { + "epoch": 1.224265040502412, + "grad_norm": 19.0, + "learning_rate": 7.174557505277669e-06, + "loss": 1.4829754829406738, + "step": 6726 + }, + { + "epoch": 1.2246291071266042, + "grad_norm": 18.25, + "learning_rate": 7.17309267115875e-06, + "loss": 1.2702488899230957, + "step": 6728 + }, + { + "epoch": 1.2249931737507964, + "grad_norm": 65.5, + "learning_rate": 7.171627664893239e-06, + "loss": 1.223417043685913, + "step": 6730 + }, + { + "epoch": 1.2253572403749886, + "grad_norm": 20.875, + "learning_rate": 7.170162486696117e-06, + "loss": 0.8921594619750977, + "step": 6732 + }, + { + "epoch": 1.2257213069991808, + "grad_norm": 18.75, + "learning_rate": 7.16869713678239e-06, + "loss": 0.931276261806488, + "step": 6734 + }, + { + "epoch": 1.226085373623373, + "grad_norm": 16.25, + "learning_rate": 7.167231615367096e-06, + "loss": 1.604603886604309, + "step": 6736 + }, + { + "epoch": 1.2264494402475652, + "grad_norm": 4.96875, + "learning_rate": 7.1657659226652935e-06, + "loss": 1.1619961261749268, + "step": 6738 + }, + { + "epoch": 1.2268135068717576, + "grad_norm": 8.625, + "learning_rate": 7.164300058892064e-06, + "loss": 1.0576145648956299, + "step": 6740 + }, + { + "epoch": 1.2271775734959498, + "grad_norm": 13.0625, + "learning_rate": 7.162834024262522e-06, + "loss": 1.3183400630950928, + "step": 6742 + }, + { + "epoch": 1.227541640120142, + "grad_norm": 15.5, + "learning_rate": 7.161367818991796e-06, + "loss": 1.1441850662231445, + "step": 6744 + }, + { + "epoch": 1.2279057067443342, + "grad_norm": 12.75, + "learning_rate": 7.159901443295052e-06, + "loss": 0.7580921053886414, + "step": 6746 + }, + { + "epoch": 1.2282697733685264, + "grad_norm": 14.625, + "learning_rate": 7.15843489738747e-06, + "loss": 0.856759786605835, + "step": 6748 + }, + { + "epoch": 1.2286338399927186, + "grad_norm": 13.0625, + "learning_rate": 7.156968181484263e-06, + "loss": 1.4152673482894897, + "step": 6750 + }, + { + "epoch": 1.228997906616911, + "grad_norm": 15.125, + "learning_rate": 7.155501295800664e-06, + "loss": 1.6950697898864746, + "step": 6752 + }, + { + "epoch": 1.2293619732411032, + "grad_norm": 8.375, + "learning_rate": 7.154034240551933e-06, + "loss": 1.3276128768920898, + "step": 6754 + }, + { + "epoch": 1.2297260398652954, + "grad_norm": 7.53125, + "learning_rate": 7.152567015953354e-06, + "loss": 1.2459003925323486, + "step": 6756 + }, + { + "epoch": 1.2300901064894876, + "grad_norm": 12.3125, + "learning_rate": 7.151099622220234e-06, + "loss": 1.3761324882507324, + "step": 6758 + }, + { + "epoch": 1.2304541731136798, + "grad_norm": 16.75, + "learning_rate": 7.149632059567912e-06, + "loss": 1.483574628829956, + "step": 6760 + }, + { + "epoch": 1.230818239737872, + "grad_norm": 7.875, + "learning_rate": 7.1481643282117445e-06, + "loss": 1.220634937286377, + "step": 6762 + }, + { + "epoch": 1.2311823063620642, + "grad_norm": 16.625, + "learning_rate": 7.146696428367115e-06, + "loss": 1.0857765674591064, + "step": 6764 + }, + { + "epoch": 1.2315463729862566, + "grad_norm": 15.6875, + "learning_rate": 7.14522836024943e-06, + "loss": 1.382385015487671, + "step": 6766 + }, + { + "epoch": 1.2319104396104488, + "grad_norm": 7.0625, + "learning_rate": 7.143760124074124e-06, + "loss": 0.7194342613220215, + "step": 6768 + }, + { + "epoch": 1.232274506234641, + "grad_norm": 23.0, + "learning_rate": 7.142291720056655e-06, + "loss": 1.3154785633087158, + "step": 6770 + }, + { + "epoch": 1.2326385728588332, + "grad_norm": 7.6875, + "learning_rate": 7.140823148412508e-06, + "loss": 0.8829584121704102, + "step": 6772 + }, + { + "epoch": 1.2330026394830254, + "grad_norm": 10.75, + "learning_rate": 7.139354409357187e-06, + "loss": 1.566400170326233, + "step": 6774 + }, + { + "epoch": 1.2333667061072175, + "grad_norm": 9.875, + "learning_rate": 7.137885503106223e-06, + "loss": 1.353212833404541, + "step": 6776 + }, + { + "epoch": 1.23373077273141, + "grad_norm": 12.0625, + "learning_rate": 7.136416429875172e-06, + "loss": 1.356162190437317, + "step": 6778 + }, + { + "epoch": 1.2340948393556022, + "grad_norm": 7.9375, + "learning_rate": 7.134947189879615e-06, + "loss": 1.4055001735687256, + "step": 6780 + }, + { + "epoch": 1.2344589059797944, + "grad_norm": 23.125, + "learning_rate": 7.133477783335159e-06, + "loss": 0.9110021591186523, + "step": 6782 + }, + { + "epoch": 1.2348229726039865, + "grad_norm": 27.25, + "learning_rate": 7.132008210457433e-06, + "loss": 0.4519428014755249, + "step": 6784 + }, + { + "epoch": 1.2351870392281787, + "grad_norm": 6.5, + "learning_rate": 7.130538471462087e-06, + "loss": 1.2690303325653076, + "step": 6786 + }, + { + "epoch": 1.235551105852371, + "grad_norm": 10.375, + "learning_rate": 7.1290685665648005e-06, + "loss": 0.9783637523651123, + "step": 6788 + }, + { + "epoch": 1.2359151724765631, + "grad_norm": 44.25, + "learning_rate": 7.127598495981283e-06, + "loss": 0.9811915159225464, + "step": 6790 + }, + { + "epoch": 1.2362792391007553, + "grad_norm": 24.125, + "learning_rate": 7.126128259927252e-06, + "loss": 0.6517534255981445, + "step": 6792 + }, + { + "epoch": 1.2366433057249477, + "grad_norm": 18.0, + "learning_rate": 7.1246578586184645e-06, + "loss": 1.4276130199432373, + "step": 6794 + }, + { + "epoch": 1.23700737234914, + "grad_norm": 5.96875, + "learning_rate": 7.123187292270695e-06, + "loss": 1.326648473739624, + "step": 6796 + }, + { + "epoch": 1.2373714389733321, + "grad_norm": 8.0625, + "learning_rate": 7.121716561099738e-06, + "loss": 1.4149478673934937, + "step": 6798 + }, + { + "epoch": 1.2377355055975243, + "grad_norm": 15.1875, + "learning_rate": 7.1202456653214236e-06, + "loss": 1.5907409191131592, + "step": 6800 + }, + { + "epoch": 1.2380995722217165, + "grad_norm": 14.75, + "learning_rate": 7.118774605151599e-06, + "loss": 1.7426486015319824, + "step": 6802 + }, + { + "epoch": 1.2384636388459087, + "grad_norm": 2.28125, + "learning_rate": 7.117303380806135e-06, + "loss": 0.8373295068740845, + "step": 6804 + }, + { + "epoch": 1.2388277054701011, + "grad_norm": 15.125, + "learning_rate": 7.115831992500928e-06, + "loss": 1.1873797178268433, + "step": 6806 + }, + { + "epoch": 1.2391917720942933, + "grad_norm": 14.875, + "learning_rate": 7.114360440451895e-06, + "loss": 1.3135144710540771, + "step": 6808 + }, + { + "epoch": 1.2395558387184855, + "grad_norm": 9.1875, + "learning_rate": 7.112888724874987e-06, + "loss": 1.4339075088500977, + "step": 6810 + }, + { + "epoch": 1.2399199053426777, + "grad_norm": 13.9375, + "learning_rate": 7.111416845986168e-06, + "loss": 1.3428232669830322, + "step": 6812 + }, + { + "epoch": 1.24028397196687, + "grad_norm": 14.5, + "learning_rate": 7.109944804001432e-06, + "loss": 0.9346180558204651, + "step": 6814 + }, + { + "epoch": 1.240648038591062, + "grad_norm": 12.0, + "learning_rate": 7.108472599136793e-06, + "loss": 1.8763840198516846, + "step": 6816 + }, + { + "epoch": 1.2410121052152543, + "grad_norm": 39.25, + "learning_rate": 7.107000231608292e-06, + "loss": 1.7168290615081787, + "step": 6818 + }, + { + "epoch": 1.2413761718394467, + "grad_norm": 13.8125, + "learning_rate": 7.105527701631994e-06, + "loss": 1.448686957359314, + "step": 6820 + }, + { + "epoch": 1.241740238463639, + "grad_norm": 9.8125, + "learning_rate": 7.104055009423985e-06, + "loss": 1.3222360610961914, + "step": 6822 + }, + { + "epoch": 1.242104305087831, + "grad_norm": 9.4375, + "learning_rate": 7.102582155200379e-06, + "loss": 1.5577512979507446, + "step": 6824 + }, + { + "epoch": 1.2424683717120233, + "grad_norm": 8.5625, + "learning_rate": 7.101109139177309e-06, + "loss": 1.632058024406433, + "step": 6826 + }, + { + "epoch": 1.2428324383362155, + "grad_norm": 19.875, + "learning_rate": 7.099635961570934e-06, + "loss": 1.3941521644592285, + "step": 6828 + }, + { + "epoch": 1.2431965049604077, + "grad_norm": 14.75, + "learning_rate": 7.09816262259744e-06, + "loss": 1.3943427801132202, + "step": 6830 + }, + { + "epoch": 1.2435605715846, + "grad_norm": 12.0625, + "learning_rate": 7.096689122473033e-06, + "loss": 1.5009510517120361, + "step": 6832 + }, + { + "epoch": 1.2439246382087923, + "grad_norm": 26.125, + "learning_rate": 7.09521546141394e-06, + "loss": 1.3903570175170898, + "step": 6834 + }, + { + "epoch": 1.2442887048329845, + "grad_norm": 12.3125, + "learning_rate": 7.093741639636418e-06, + "loss": 1.4346972703933716, + "step": 6836 + }, + { + "epoch": 1.2446527714571767, + "grad_norm": 4.78125, + "learning_rate": 7.0922676573567395e-06, + "loss": 1.3681694269180298, + "step": 6838 + }, + { + "epoch": 1.2450168380813689, + "grad_norm": 15.5, + "learning_rate": 7.0907935147912125e-06, + "loss": 1.421314001083374, + "step": 6840 + }, + { + "epoch": 1.245380904705561, + "grad_norm": 17.0, + "learning_rate": 7.089319212156156e-06, + "loss": 1.6846474409103394, + "step": 6842 + }, + { + "epoch": 1.2457449713297533, + "grad_norm": 9.1875, + "learning_rate": 7.08784474966792e-06, + "loss": 1.434504508972168, + "step": 6844 + }, + { + "epoch": 1.2461090379539455, + "grad_norm": 12.0, + "learning_rate": 7.086370127542876e-06, + "loss": 1.5476224422454834, + "step": 6846 + }, + { + "epoch": 1.2464731045781379, + "grad_norm": 16.5, + "learning_rate": 7.084895345997418e-06, + "loss": 1.3673689365386963, + "step": 6848 + }, + { + "epoch": 1.24683717120233, + "grad_norm": 7.0625, + "learning_rate": 7.083420405247965e-06, + "loss": 1.2037729024887085, + "step": 6850 + }, + { + "epoch": 1.2472012378265223, + "grad_norm": 17.0, + "learning_rate": 7.081945305510958e-06, + "loss": 1.3675122261047363, + "step": 6852 + }, + { + "epoch": 1.2475653044507145, + "grad_norm": 10.6875, + "learning_rate": 7.0804700470028635e-06, + "loss": 1.4422657489776611, + "step": 6854 + }, + { + "epoch": 1.2479293710749066, + "grad_norm": 4.9375, + "learning_rate": 7.078994629940166e-06, + "loss": 0.8554068803787231, + "step": 6856 + }, + { + "epoch": 1.2482934376990988, + "grad_norm": 19.125, + "learning_rate": 7.077519054539379e-06, + "loss": 1.142959475517273, + "step": 6858 + }, + { + "epoch": 1.2486575043232913, + "grad_norm": 18.75, + "learning_rate": 7.076043321017041e-06, + "loss": 1.5435700416564941, + "step": 6860 + }, + { + "epoch": 1.2490215709474835, + "grad_norm": 13.75, + "learning_rate": 7.074567429589703e-06, + "loss": 1.4103915691375732, + "step": 6862 + }, + { + "epoch": 1.2493856375716756, + "grad_norm": 9.4375, + "learning_rate": 7.073091380473951e-06, + "loss": 1.32807457447052, + "step": 6864 + }, + { + "epoch": 1.2497497041958678, + "grad_norm": 15.25, + "learning_rate": 7.071615173886388e-06, + "loss": 1.4018455743789673, + "step": 6866 + }, + { + "epoch": 1.25011377082006, + "grad_norm": 11.75, + "learning_rate": 7.070138810043641e-06, + "loss": 1.264844536781311, + "step": 6868 + }, + { + "epoch": 1.2504778374442522, + "grad_norm": 9.125, + "learning_rate": 7.06866228916236e-06, + "loss": 1.3887810707092285, + "step": 6870 + }, + { + "epoch": 1.2508419040684444, + "grad_norm": 11.375, + "learning_rate": 7.06718561145922e-06, + "loss": 1.4866645336151123, + "step": 6872 + }, + { + "epoch": 1.2512059706926366, + "grad_norm": 22.25, + "learning_rate": 7.065708777150917e-06, + "loss": 1.288819432258606, + "step": 6874 + }, + { + "epoch": 1.251570037316829, + "grad_norm": 10.0625, + "learning_rate": 7.064231786454168e-06, + "loss": 1.5728092193603516, + "step": 6876 + }, + { + "epoch": 1.2519341039410212, + "grad_norm": 13.125, + "learning_rate": 7.062754639585716e-06, + "loss": 1.339041829109192, + "step": 6878 + }, + { + "epoch": 1.2522981705652134, + "grad_norm": 52.75, + "learning_rate": 7.061277336762331e-06, + "loss": 1.5896389484405518, + "step": 6880 + }, + { + "epoch": 1.2526622371894056, + "grad_norm": 10.9375, + "learning_rate": 7.059799878200797e-06, + "loss": 1.3721157312393188, + "step": 6882 + }, + { + "epoch": 1.2530263038135978, + "grad_norm": 10.5, + "learning_rate": 7.058322264117925e-06, + "loss": 1.472983717918396, + "step": 6884 + }, + { + "epoch": 1.2533903704377902, + "grad_norm": 16.125, + "learning_rate": 7.0568444947305504e-06, + "loss": 1.5336334705352783, + "step": 6886 + }, + { + "epoch": 1.2537544370619824, + "grad_norm": 11.25, + "learning_rate": 7.0553665702555286e-06, + "loss": 1.7266628742218018, + "step": 6888 + }, + { + "epoch": 1.2541185036861746, + "grad_norm": 9.375, + "learning_rate": 7.0538884909097395e-06, + "loss": 1.6979012489318848, + "step": 6890 + }, + { + "epoch": 1.2544825703103668, + "grad_norm": 9.75, + "learning_rate": 7.052410256910085e-06, + "loss": 1.4501219987869263, + "step": 6892 + }, + { + "epoch": 1.254846636934559, + "grad_norm": 38.0, + "learning_rate": 7.050931868473492e-06, + "loss": 1.315745234489441, + "step": 6894 + }, + { + "epoch": 1.2552107035587512, + "grad_norm": 15.0625, + "learning_rate": 7.0494533258169065e-06, + "loss": 1.4888767004013062, + "step": 6896 + }, + { + "epoch": 1.2555747701829434, + "grad_norm": 15.75, + "learning_rate": 7.047974629157297e-06, + "loss": 1.5987426042556763, + "step": 6898 + }, + { + "epoch": 1.2559388368071356, + "grad_norm": 8.5625, + "learning_rate": 7.0464957787116575e-06, + "loss": 1.172560214996338, + "step": 6900 + }, + { + "epoch": 1.256302903431328, + "grad_norm": 11.4375, + "learning_rate": 7.045016774697004e-06, + "loss": 1.3965671062469482, + "step": 6902 + }, + { + "epoch": 1.2566669700555202, + "grad_norm": 115.0, + "learning_rate": 7.043537617330376e-06, + "loss": 1.2098493576049805, + "step": 6904 + }, + { + "epoch": 1.2570310366797124, + "grad_norm": 8.75, + "learning_rate": 7.042058306828829e-06, + "loss": 1.3815102577209473, + "step": 6906 + }, + { + "epoch": 1.2573951033039046, + "grad_norm": 15.125, + "learning_rate": 7.040578843409449e-06, + "loss": 1.2485014200210571, + "step": 6908 + }, + { + "epoch": 1.2577591699280968, + "grad_norm": 13.375, + "learning_rate": 7.039099227289341e-06, + "loss": 1.252556324005127, + "step": 6910 + }, + { + "epoch": 1.2581232365522892, + "grad_norm": 17.125, + "learning_rate": 7.037619458685634e-06, + "loss": 1.1102707386016846, + "step": 6912 + }, + { + "epoch": 1.2584873031764814, + "grad_norm": 11.875, + "learning_rate": 7.036139537815476e-06, + "loss": 1.3073909282684326, + "step": 6914 + }, + { + "epoch": 1.2588513698006736, + "grad_norm": 11.3125, + "learning_rate": 7.034659464896039e-06, + "loss": 1.6627213954925537, + "step": 6916 + }, + { + "epoch": 1.2592154364248658, + "grad_norm": 11.375, + "learning_rate": 7.0331792401445165e-06, + "loss": 1.945440411567688, + "step": 6918 + }, + { + "epoch": 1.259579503049058, + "grad_norm": 24.5, + "learning_rate": 7.031698863778132e-06, + "loss": 1.3761060237884521, + "step": 6920 + }, + { + "epoch": 1.2599435696732502, + "grad_norm": 11.75, + "learning_rate": 7.030218336014119e-06, + "loss": 1.4903113842010498, + "step": 6922 + }, + { + "epoch": 1.2603076362974424, + "grad_norm": 5.0625, + "learning_rate": 7.0287376570697395e-06, + "loss": 1.3640656471252441, + "step": 6924 + }, + { + "epoch": 1.2606717029216346, + "grad_norm": 17.0, + "learning_rate": 7.027256827162279e-06, + "loss": 0.8654718995094299, + "step": 6926 + }, + { + "epoch": 1.261035769545827, + "grad_norm": 15.0, + "learning_rate": 7.02577584650904e-06, + "loss": 0.591568112373352, + "step": 6928 + }, + { + "epoch": 1.2613998361700192, + "grad_norm": 18.5, + "learning_rate": 7.024294715327353e-06, + "loss": 1.6607918739318848, + "step": 6930 + }, + { + "epoch": 1.2617639027942114, + "grad_norm": 5.65625, + "learning_rate": 7.0228134338345695e-06, + "loss": 1.1746143102645874, + "step": 6932 + }, + { + "epoch": 1.2621279694184036, + "grad_norm": 8.3125, + "learning_rate": 7.02133200224806e-06, + "loss": 1.3098299503326416, + "step": 6934 + }, + { + "epoch": 1.2624920360425957, + "grad_norm": 11.875, + "learning_rate": 7.019850420785217e-06, + "loss": 1.531429409980774, + "step": 6936 + }, + { + "epoch": 1.2628561026667882, + "grad_norm": 7.0, + "learning_rate": 7.018368689663457e-06, + "loss": 1.0970335006713867, + "step": 6938 + }, + { + "epoch": 1.2632201692909804, + "grad_norm": 15.875, + "learning_rate": 7.016886809100219e-06, + "loss": 1.2609821557998657, + "step": 6940 + }, + { + "epoch": 1.2635842359151725, + "grad_norm": 9.1875, + "learning_rate": 7.0154047793129646e-06, + "loss": 1.3810995817184448, + "step": 6942 + }, + { + "epoch": 1.2639483025393647, + "grad_norm": 21.375, + "learning_rate": 7.013922600519174e-06, + "loss": 1.3613288402557373, + "step": 6944 + }, + { + "epoch": 1.264312369163557, + "grad_norm": 17.375, + "learning_rate": 7.0124402729363496e-06, + "loss": 1.465685486793518, + "step": 6946 + }, + { + "epoch": 1.2646764357877491, + "grad_norm": 6.09375, + "learning_rate": 7.0109577967820165e-06, + "loss": 1.2737704515457153, + "step": 6948 + }, + { + "epoch": 1.2650405024119413, + "grad_norm": 13.4375, + "learning_rate": 7.009475172273725e-06, + "loss": 1.0695053339004517, + "step": 6950 + }, + { + "epoch": 1.2654045690361335, + "grad_norm": 31.75, + "learning_rate": 7.0079923996290445e-06, + "loss": 1.7231273651123047, + "step": 6952 + }, + { + "epoch": 1.2657686356603257, + "grad_norm": 42.75, + "learning_rate": 7.0065094790655645e-06, + "loss": 1.360650658607483, + "step": 6954 + }, + { + "epoch": 1.2661327022845181, + "grad_norm": 14.875, + "learning_rate": 7.005026410800897e-06, + "loss": 1.4140663146972656, + "step": 6956 + }, + { + "epoch": 1.2664967689087103, + "grad_norm": 8.0625, + "learning_rate": 7.003543195052675e-06, + "loss": 1.3838446140289307, + "step": 6958 + }, + { + "epoch": 1.2668608355329025, + "grad_norm": 11.875, + "learning_rate": 7.002059832038557e-06, + "loss": 1.3637138605117798, + "step": 6960 + }, + { + "epoch": 1.2672249021570947, + "grad_norm": 23.875, + "learning_rate": 7.00057632197622e-06, + "loss": 1.8511745929718018, + "step": 6962 + }, + { + "epoch": 1.267588968781287, + "grad_norm": 33.25, + "learning_rate": 6.9990926650833646e-06, + "loss": 1.1482704877853394, + "step": 6964 + }, + { + "epoch": 1.2679530354054793, + "grad_norm": 8.8125, + "learning_rate": 6.997608861577707e-06, + "loss": 1.3576314449310303, + "step": 6966 + }, + { + "epoch": 1.2683171020296715, + "grad_norm": 8.9375, + "learning_rate": 6.996124911676991e-06, + "loss": 1.3747107982635498, + "step": 6968 + }, + { + "epoch": 1.2686811686538637, + "grad_norm": 9.0, + "learning_rate": 6.994640815598983e-06, + "loss": 1.4651124477386475, + "step": 6970 + }, + { + "epoch": 1.269045235278056, + "grad_norm": 20.5, + "learning_rate": 6.993156573561466e-06, + "loss": 1.2367103099822998, + "step": 6972 + }, + { + "epoch": 1.269409301902248, + "grad_norm": 7.84375, + "learning_rate": 6.991672185782248e-06, + "loss": 1.6774638891220093, + "step": 6974 + }, + { + "epoch": 1.2697733685264403, + "grad_norm": 16.875, + "learning_rate": 6.990187652479155e-06, + "loss": 1.187595009803772, + "step": 6976 + }, + { + "epoch": 1.2701374351506325, + "grad_norm": 22.375, + "learning_rate": 6.988702973870035e-06, + "loss": 1.4416632652282715, + "step": 6978 + }, + { + "epoch": 1.2705015017748247, + "grad_norm": 15.5625, + "learning_rate": 6.987218150172763e-06, + "loss": 1.2919238805770874, + "step": 6980 + }, + { + "epoch": 1.270865568399017, + "grad_norm": 12.0, + "learning_rate": 6.985733181605227e-06, + "loss": 1.7377707958221436, + "step": 6982 + }, + { + "epoch": 1.2712296350232093, + "grad_norm": 24.0, + "learning_rate": 6.984248068385342e-06, + "loss": 1.735353708267212, + "step": 6984 + }, + { + "epoch": 1.2715937016474015, + "grad_norm": 20.125, + "learning_rate": 6.982762810731041e-06, + "loss": 1.7191202640533447, + "step": 6986 + }, + { + "epoch": 1.2719577682715937, + "grad_norm": 18.375, + "learning_rate": 6.981277408860279e-06, + "loss": 1.555033802986145, + "step": 6988 + }, + { + "epoch": 1.2723218348957859, + "grad_norm": 10.125, + "learning_rate": 6.979791862991037e-06, + "loss": 0.9993138313293457, + "step": 6990 + }, + { + "epoch": 1.2726859015199783, + "grad_norm": 29.25, + "learning_rate": 6.978306173341307e-06, + "loss": 0.9250248074531555, + "step": 6992 + }, + { + "epoch": 1.2730499681441705, + "grad_norm": 6.5625, + "learning_rate": 6.976820340129114e-06, + "loss": 1.4093331098556519, + "step": 6994 + }, + { + "epoch": 1.2734140347683627, + "grad_norm": 6.25, + "learning_rate": 6.975334363572492e-06, + "loss": 0.9326533079147339, + "step": 6996 + }, + { + "epoch": 1.2737781013925549, + "grad_norm": 17.75, + "learning_rate": 6.973848243889506e-06, + "loss": 1.358811855316162, + "step": 6998 + }, + { + "epoch": 1.274142168016747, + "grad_norm": 27.5, + "learning_rate": 6.9723619812982365e-06, + "loss": 1.8066385984420776, + "step": 7000 + }, + { + "epoch": 1.2745062346409393, + "grad_norm": 11.5625, + "learning_rate": 6.9708755760167865e-06, + "loss": 1.475766897201538, + "step": 7002 + }, + { + "epoch": 1.2748703012651315, + "grad_norm": 43.5, + "learning_rate": 6.9693890282632826e-06, + "loss": 1.4680222272872925, + "step": 7004 + }, + { + "epoch": 1.2752343678893237, + "grad_norm": 15.1875, + "learning_rate": 6.967902338255865e-06, + "loss": 1.514327049255371, + "step": 7006 + }, + { + "epoch": 1.2755984345135158, + "grad_norm": 10.4375, + "learning_rate": 6.966415506212703e-06, + "loss": 1.9397776126861572, + "step": 7008 + }, + { + "epoch": 1.2759625011377083, + "grad_norm": 11.5625, + "learning_rate": 6.96492853235198e-06, + "loss": 1.5866397619247437, + "step": 7010 + }, + { + "epoch": 1.2763265677619005, + "grad_norm": 12.25, + "learning_rate": 6.9634414168919075e-06, + "loss": 1.25542414188385, + "step": 7012 + }, + { + "epoch": 1.2766906343860926, + "grad_norm": 5.84375, + "learning_rate": 6.961954160050712e-06, + "loss": 1.4755549430847168, + "step": 7014 + }, + { + "epoch": 1.2770547010102848, + "grad_norm": 3.90625, + "learning_rate": 6.96046676204664e-06, + "loss": 1.080859899520874, + "step": 7016 + }, + { + "epoch": 1.277418767634477, + "grad_norm": 23.75, + "learning_rate": 6.958979223097964e-06, + "loss": 1.2284001111984253, + "step": 7018 + }, + { + "epoch": 1.2777828342586695, + "grad_norm": 13.375, + "learning_rate": 6.957491543422974e-06, + "loss": 1.3465055227279663, + "step": 7020 + }, + { + "epoch": 1.2781469008828616, + "grad_norm": 12.8125, + "learning_rate": 6.956003723239979e-06, + "loss": 1.5102200508117676, + "step": 7022 + }, + { + "epoch": 1.2785109675070538, + "grad_norm": 24.625, + "learning_rate": 6.954515762767316e-06, + "loss": 1.5981968641281128, + "step": 7024 + }, + { + "epoch": 1.278875034131246, + "grad_norm": 18.5, + "learning_rate": 6.953027662223329e-06, + "loss": 1.6490386724472046, + "step": 7026 + }, + { + "epoch": 1.2792391007554382, + "grad_norm": 18.25, + "learning_rate": 6.951539421826394e-06, + "loss": 1.101287603378296, + "step": 7028 + }, + { + "epoch": 1.2796031673796304, + "grad_norm": 10.375, + "learning_rate": 6.950051041794908e-06, + "loss": 1.3279919624328613, + "step": 7030 + }, + { + "epoch": 1.2799672340038226, + "grad_norm": 7.71875, + "learning_rate": 6.948562522347279e-06, + "loss": 1.2891145944595337, + "step": 7032 + }, + { + "epoch": 1.2803313006280148, + "grad_norm": 10.3125, + "learning_rate": 6.947073863701948e-06, + "loss": 0.885400652885437, + "step": 7034 + }, + { + "epoch": 1.2806953672522072, + "grad_norm": 30.875, + "learning_rate": 6.945585066077363e-06, + "loss": 1.391412615776062, + "step": 7036 + }, + { + "epoch": 1.2810594338763994, + "grad_norm": 10.75, + "learning_rate": 6.944096129692002e-06, + "loss": 1.3074579238891602, + "step": 7038 + }, + { + "epoch": 1.2814235005005916, + "grad_norm": 12.5, + "learning_rate": 6.94260705476436e-06, + "loss": 0.8693200349807739, + "step": 7040 + }, + { + "epoch": 1.2817875671247838, + "grad_norm": 9.25, + "learning_rate": 6.941117841512952e-06, + "loss": 1.5484583377838135, + "step": 7042 + }, + { + "epoch": 1.282151633748976, + "grad_norm": 17.875, + "learning_rate": 6.939628490156317e-06, + "loss": 1.2619388103485107, + "step": 7044 + }, + { + "epoch": 1.2825157003731684, + "grad_norm": 9.1875, + "learning_rate": 6.938139000913009e-06, + "loss": 1.298797607421875, + "step": 7046 + }, + { + "epoch": 1.2828797669973606, + "grad_norm": 10.75, + "learning_rate": 6.936649374001603e-06, + "loss": 1.4176117181777954, + "step": 7048 + }, + { + "epoch": 1.2832438336215528, + "grad_norm": 9.6875, + "learning_rate": 6.935159609640696e-06, + "loss": 1.3727424144744873, + "step": 7050 + }, + { + "epoch": 1.283607900245745, + "grad_norm": 8.3125, + "learning_rate": 6.933669708048909e-06, + "loss": 1.4126691818237305, + "step": 7052 + }, + { + "epoch": 1.2839719668699372, + "grad_norm": 9.5, + "learning_rate": 6.932179669444875e-06, + "loss": 1.187784194946289, + "step": 7054 + }, + { + "epoch": 1.2843360334941294, + "grad_norm": 6.59375, + "learning_rate": 6.9306894940472515e-06, + "loss": 1.0585685968399048, + "step": 7056 + }, + { + "epoch": 1.2847001001183216, + "grad_norm": 7.28125, + "learning_rate": 6.929199182074717e-06, + "loss": 1.2256271839141846, + "step": 7058 + }, + { + "epoch": 1.2850641667425138, + "grad_norm": 14.0, + "learning_rate": 6.927708733745968e-06, + "loss": 1.2778189182281494, + "step": 7060 + }, + { + "epoch": 1.285428233366706, + "grad_norm": 9.875, + "learning_rate": 6.926218149279723e-06, + "loss": 1.4143060445785522, + "step": 7062 + }, + { + "epoch": 1.2857922999908984, + "grad_norm": 7.34375, + "learning_rate": 6.924727428894718e-06, + "loss": 1.478899598121643, + "step": 7064 + }, + { + "epoch": 1.2861563666150906, + "grad_norm": 8.8125, + "learning_rate": 6.92323657280971e-06, + "loss": 1.0721986293792725, + "step": 7066 + }, + { + "epoch": 1.2865204332392828, + "grad_norm": 15.125, + "learning_rate": 6.921745581243477e-06, + "loss": 1.3027867078781128, + "step": 7068 + }, + { + "epoch": 1.286884499863475, + "grad_norm": 11.375, + "learning_rate": 6.920254454414814e-06, + "loss": 1.2663110494613647, + "step": 7070 + }, + { + "epoch": 1.2872485664876672, + "grad_norm": 10.5, + "learning_rate": 6.918763192542542e-06, + "loss": 1.5463697910308838, + "step": 7072 + }, + { + "epoch": 1.2876126331118596, + "grad_norm": 10.0625, + "learning_rate": 6.917271795845492e-06, + "loss": 1.2594268321990967, + "step": 7074 + }, + { + "epoch": 1.2879766997360518, + "grad_norm": 9.3125, + "learning_rate": 6.915780264542526e-06, + "loss": 1.5509223937988281, + "step": 7076 + }, + { + "epoch": 1.288340766360244, + "grad_norm": 6.8125, + "learning_rate": 6.9142885988525145e-06, + "loss": 1.2842024564743042, + "step": 7078 + }, + { + "epoch": 1.2887048329844362, + "grad_norm": 5.21875, + "learning_rate": 6.912796798994359e-06, + "loss": 0.9973230361938477, + "step": 7080 + }, + { + "epoch": 1.2890688996086284, + "grad_norm": 3.5625, + "learning_rate": 6.911304865186972e-06, + "loss": 1.1330246925354004, + "step": 7082 + }, + { + "epoch": 1.2894329662328206, + "grad_norm": 4.15625, + "learning_rate": 6.909812797649289e-06, + "loss": 1.3135521411895752, + "step": 7084 + }, + { + "epoch": 1.2897970328570127, + "grad_norm": 9.4375, + "learning_rate": 6.9083205966002645e-06, + "loss": 1.0735105276107788, + "step": 7086 + }, + { + "epoch": 1.290161099481205, + "grad_norm": 12.375, + "learning_rate": 6.9068282622588735e-06, + "loss": 1.3692142963409424, + "step": 7088 + }, + { + "epoch": 1.2905251661053974, + "grad_norm": 8.9375, + "learning_rate": 6.9053357948441105e-06, + "loss": 1.3653674125671387, + "step": 7090 + }, + { + "epoch": 1.2908892327295896, + "grad_norm": 7.5, + "learning_rate": 6.9038431945749885e-06, + "loss": 1.455733060836792, + "step": 7092 + }, + { + "epoch": 1.2912532993537817, + "grad_norm": 11.875, + "learning_rate": 6.902350461670542e-06, + "loss": 1.2619668245315552, + "step": 7094 + }, + { + "epoch": 1.291617365977974, + "grad_norm": 15.625, + "learning_rate": 6.9008575963498206e-06, + "loss": 1.0950919389724731, + "step": 7096 + }, + { + "epoch": 1.2919814326021661, + "grad_norm": 11.1875, + "learning_rate": 6.8993645988318965e-06, + "loss": 0.5333462953567505, + "step": 7098 + }, + { + "epoch": 1.2923454992263586, + "grad_norm": 9.0625, + "learning_rate": 6.897871469335864e-06, + "loss": 1.2034614086151123, + "step": 7100 + }, + { + "epoch": 1.2927095658505507, + "grad_norm": 13.3125, + "learning_rate": 6.896378208080832e-06, + "loss": 1.4908605813980103, + "step": 7102 + }, + { + "epoch": 1.293073632474743, + "grad_norm": 17.25, + "learning_rate": 6.8948848152859316e-06, + "loss": 1.31070077419281, + "step": 7104 + }, + { + "epoch": 1.2934376990989351, + "grad_norm": 11.1875, + "learning_rate": 6.89339129117031e-06, + "loss": 1.2852356433868408, + "step": 7106 + }, + { + "epoch": 1.2938017657231273, + "grad_norm": 8.1875, + "learning_rate": 6.8918976359531366e-06, + "loss": 1.4274932146072388, + "step": 7108 + }, + { + "epoch": 1.2941658323473195, + "grad_norm": 8.1875, + "learning_rate": 6.890403849853601e-06, + "loss": 1.3167959451675415, + "step": 7110 + }, + { + "epoch": 1.2945298989715117, + "grad_norm": 4.53125, + "learning_rate": 6.888909933090908e-06, + "loss": 1.193117618560791, + "step": 7112 + }, + { + "epoch": 1.294893965595704, + "grad_norm": 5.28125, + "learning_rate": 6.887415885884286e-06, + "loss": 1.2229421138763428, + "step": 7114 + }, + { + "epoch": 1.295258032219896, + "grad_norm": 7.84375, + "learning_rate": 6.885921708452978e-06, + "loss": 1.2449531555175781, + "step": 7116 + }, + { + "epoch": 1.2956220988440885, + "grad_norm": 12.6875, + "learning_rate": 6.884427401016249e-06, + "loss": 1.2884869575500488, + "step": 7118 + }, + { + "epoch": 1.2959861654682807, + "grad_norm": 12.0625, + "learning_rate": 6.882932963793384e-06, + "loss": 1.187927484512329, + "step": 7120 + }, + { + "epoch": 1.296350232092473, + "grad_norm": 11.3125, + "learning_rate": 6.881438397003684e-06, + "loss": 1.4766616821289062, + "step": 7122 + }, + { + "epoch": 1.296714298716665, + "grad_norm": 8.8125, + "learning_rate": 6.879943700866474e-06, + "loss": 1.7511173486709595, + "step": 7124 + }, + { + "epoch": 1.2970783653408573, + "grad_norm": 9.3125, + "learning_rate": 6.878448875601089e-06, + "loss": 1.0145858526229858, + "step": 7126 + }, + { + "epoch": 1.2974424319650497, + "grad_norm": 34.75, + "learning_rate": 6.876953921426892e-06, + "loss": 1.1911375522613525, + "step": 7128 + }, + { + "epoch": 1.297806498589242, + "grad_norm": 4.25, + "learning_rate": 6.875458838563263e-06, + "loss": 0.0983295738697052, + "step": 7130 + }, + { + "epoch": 1.298170565213434, + "grad_norm": 27.5, + "learning_rate": 6.873963627229595e-06, + "loss": 0.3917846977710724, + "step": 7132 + }, + { + "epoch": 1.2985346318376263, + "grad_norm": 28.5, + "learning_rate": 6.872468287645308e-06, + "loss": 1.3198145627975464, + "step": 7134 + }, + { + "epoch": 1.2988986984618185, + "grad_norm": 77.5, + "learning_rate": 6.870972820029835e-06, + "loss": 0.9498114585876465, + "step": 7136 + }, + { + "epoch": 1.2992627650860107, + "grad_norm": 12.0, + "learning_rate": 6.86947722460263e-06, + "loss": 1.508488416671753, + "step": 7138 + }, + { + "epoch": 1.2996268317102029, + "grad_norm": 22.25, + "learning_rate": 6.867981501583168e-06, + "loss": 1.3845430612564087, + "step": 7140 + }, + { + "epoch": 1.299990898334395, + "grad_norm": 8.9375, + "learning_rate": 6.866485651190937e-06, + "loss": 0.8970973491668701, + "step": 7142 + }, + { + "epoch": 1.3003549649585875, + "grad_norm": 18.0, + "learning_rate": 6.864989673645448e-06, + "loss": 1.5058352947235107, + "step": 7144 + }, + { + "epoch": 1.3007190315827797, + "grad_norm": 10.4375, + "learning_rate": 6.8634935691662305e-06, + "loss": 1.2339940071105957, + "step": 7146 + }, + { + "epoch": 1.3010830982069719, + "grad_norm": 23.25, + "learning_rate": 6.86199733797283e-06, + "loss": 0.9971280694007874, + "step": 7148 + }, + { + "epoch": 1.301447164831164, + "grad_norm": 26.25, + "learning_rate": 6.860500980284814e-06, + "loss": 1.3777186870574951, + "step": 7150 + }, + { + "epoch": 1.3018112314553563, + "grad_norm": 11.6875, + "learning_rate": 6.859004496321766e-06, + "loss": 1.247565746307373, + "step": 7152 + }, + { + "epoch": 1.3021752980795487, + "grad_norm": 17.375, + "learning_rate": 6.857507886303292e-06, + "loss": 0.5558710694313049, + "step": 7154 + }, + { + "epoch": 1.3025393647037409, + "grad_norm": 20.375, + "learning_rate": 6.856011150449009e-06, + "loss": 1.7666443586349487, + "step": 7156 + }, + { + "epoch": 1.302903431327933, + "grad_norm": 15.0625, + "learning_rate": 6.854514288978558e-06, + "loss": 1.3393206596374512, + "step": 7158 + }, + { + "epoch": 1.3032674979521253, + "grad_norm": 11.9375, + "learning_rate": 6.853017302111597e-06, + "loss": 1.424713134765625, + "step": 7160 + }, + { + "epoch": 1.3036315645763175, + "grad_norm": 10.0, + "learning_rate": 6.851520190067806e-06, + "loss": 1.3376874923706055, + "step": 7162 + }, + { + "epoch": 1.3039956312005097, + "grad_norm": 14.75, + "learning_rate": 6.850022953066879e-06, + "loss": 1.335485577583313, + "step": 7164 + }, + { + "epoch": 1.3043596978247018, + "grad_norm": 40.0, + "learning_rate": 6.848525591328528e-06, + "loss": 0.8418655395507812, + "step": 7166 + }, + { + "epoch": 1.304723764448894, + "grad_norm": 26.875, + "learning_rate": 6.847028105072483e-06, + "loss": 1.0361363887786865, + "step": 7168 + }, + { + "epoch": 1.3050878310730865, + "grad_norm": 3.078125, + "learning_rate": 6.8455304945184975e-06, + "loss": 0.450295090675354, + "step": 7170 + }, + { + "epoch": 1.3054518976972787, + "grad_norm": 14.375, + "learning_rate": 6.844032759886339e-06, + "loss": 1.2860678434371948, + "step": 7172 + }, + { + "epoch": 1.3058159643214708, + "grad_norm": 14.125, + "learning_rate": 6.842534901395794e-06, + "loss": 1.2256741523742676, + "step": 7174 + }, + { + "epoch": 1.306180030945663, + "grad_norm": 28.5, + "learning_rate": 6.841036919266666e-06, + "loss": 1.5396909713745117, + "step": 7176 + }, + { + "epoch": 1.3065440975698552, + "grad_norm": 4.53125, + "learning_rate": 6.839538813718778e-06, + "loss": 0.9447013139724731, + "step": 7178 + }, + { + "epoch": 1.3069081641940474, + "grad_norm": 11.6875, + "learning_rate": 6.838040584971972e-06, + "loss": 1.2523603439331055, + "step": 7180 + }, + { + "epoch": 1.3072722308182398, + "grad_norm": 11.5625, + "learning_rate": 6.836542233246106e-06, + "loss": 1.3193187713623047, + "step": 7182 + }, + { + "epoch": 1.307636297442432, + "grad_norm": 10.75, + "learning_rate": 6.8350437587610594e-06, + "loss": 1.2456010580062866, + "step": 7184 + }, + { + "epoch": 1.3080003640666242, + "grad_norm": 11.8125, + "learning_rate": 6.833545161736724e-06, + "loss": 1.1271644830703735, + "step": 7186 + }, + { + "epoch": 1.3083644306908164, + "grad_norm": 9.5, + "learning_rate": 6.8320464423930145e-06, + "loss": 1.2888271808624268, + "step": 7188 + }, + { + "epoch": 1.3087284973150086, + "grad_norm": 107.5, + "learning_rate": 6.830547600949859e-06, + "loss": 1.1926859617233276, + "step": 7190 + }, + { + "epoch": 1.3090925639392008, + "grad_norm": 23.875, + "learning_rate": 6.829048637627212e-06, + "loss": 0.6636847853660583, + "step": 7192 + }, + { + "epoch": 1.309456630563393, + "grad_norm": 16.875, + "learning_rate": 6.827549552645037e-06, + "loss": 1.0432558059692383, + "step": 7194 + }, + { + "epoch": 1.3098206971875852, + "grad_norm": 11.25, + "learning_rate": 6.826050346223318e-06, + "loss": 1.3708641529083252, + "step": 7196 + }, + { + "epoch": 1.3101847638117776, + "grad_norm": 15.0625, + "learning_rate": 6.82455101858206e-06, + "loss": 1.497727632522583, + "step": 7198 + }, + { + "epoch": 1.3105488304359698, + "grad_norm": 13.0625, + "learning_rate": 6.823051569941279e-06, + "loss": 1.3389322757720947, + "step": 7200 + }, + { + "epoch": 1.310912897060162, + "grad_norm": 10.625, + "learning_rate": 6.821552000521017e-06, + "loss": 1.2424613237380981, + "step": 7202 + }, + { + "epoch": 1.3112769636843542, + "grad_norm": 30.0, + "learning_rate": 6.82005231054133e-06, + "loss": 1.1347570419311523, + "step": 7204 + }, + { + "epoch": 1.3116410303085464, + "grad_norm": 11.625, + "learning_rate": 6.818552500222286e-06, + "loss": 1.0207314491271973, + "step": 7206 + }, + { + "epoch": 1.3120050969327388, + "grad_norm": 6.90625, + "learning_rate": 6.817052569783982e-06, + "loss": 1.2210968732833862, + "step": 7208 + }, + { + "epoch": 1.312369163556931, + "grad_norm": 7.3125, + "learning_rate": 6.815552519446524e-06, + "loss": 1.3219958543777466, + "step": 7210 + }, + { + "epoch": 1.3127332301811232, + "grad_norm": 112.5, + "learning_rate": 6.814052349430041e-06, + "loss": 1.1523452997207642, + "step": 7212 + }, + { + "epoch": 1.3130972968053154, + "grad_norm": 7.96875, + "learning_rate": 6.8125520599546735e-06, + "loss": 1.1410261392593384, + "step": 7214 + }, + { + "epoch": 1.3134613634295076, + "grad_norm": 38.75, + "learning_rate": 6.811051651240585e-06, + "loss": 1.4542841911315918, + "step": 7216 + }, + { + "epoch": 1.3138254300536998, + "grad_norm": 16.875, + "learning_rate": 6.809551123507951e-06, + "loss": 1.584712028503418, + "step": 7218 + }, + { + "epoch": 1.314189496677892, + "grad_norm": 12.9375, + "learning_rate": 6.8080504769769725e-06, + "loss": 1.3592419624328613, + "step": 7220 + }, + { + "epoch": 1.3145535633020842, + "grad_norm": 8.3125, + "learning_rate": 6.80654971186786e-06, + "loss": 1.3977789878845215, + "step": 7222 + }, + { + "epoch": 1.3149176299262766, + "grad_norm": 7.09375, + "learning_rate": 6.805048828400849e-06, + "loss": 1.1130414009094238, + "step": 7224 + }, + { + "epoch": 1.3152816965504688, + "grad_norm": 12.9375, + "learning_rate": 6.803547826796182e-06, + "loss": 1.3731980323791504, + "step": 7226 + }, + { + "epoch": 1.315645763174661, + "grad_norm": 9.125, + "learning_rate": 6.802046707274128e-06, + "loss": 1.3402268886566162, + "step": 7228 + }, + { + "epoch": 1.3160098297988532, + "grad_norm": 6.09375, + "learning_rate": 6.800545470054971e-06, + "loss": 1.042785406112671, + "step": 7230 + }, + { + "epoch": 1.3163738964230454, + "grad_norm": 18.625, + "learning_rate": 6.79904411535901e-06, + "loss": 1.2189916372299194, + "step": 7232 + }, + { + "epoch": 1.3167379630472378, + "grad_norm": 13.125, + "learning_rate": 6.797542643406565e-06, + "loss": 1.481350302696228, + "step": 7234 + }, + { + "epoch": 1.31710202967143, + "grad_norm": 73.5, + "learning_rate": 6.7960410544179674e-06, + "loss": 1.6615064144134521, + "step": 7236 + }, + { + "epoch": 1.3174660962956222, + "grad_norm": 17.75, + "learning_rate": 6.794539348613571e-06, + "loss": 1.7068772315979004, + "step": 7238 + }, + { + "epoch": 1.3178301629198144, + "grad_norm": 11.5, + "learning_rate": 6.793037526213746e-06, + "loss": 1.9035048484802246, + "step": 7240 + }, + { + "epoch": 1.3181942295440066, + "grad_norm": 3.5, + "learning_rate": 6.791535587438878e-06, + "loss": 1.2285220623016357, + "step": 7242 + }, + { + "epoch": 1.3185582961681988, + "grad_norm": 22.75, + "learning_rate": 6.79003353250937e-06, + "loss": 1.0185660123825073, + "step": 7244 + }, + { + "epoch": 1.318922362792391, + "grad_norm": 20.125, + "learning_rate": 6.788531361645644e-06, + "loss": 1.4466512203216553, + "step": 7246 + }, + { + "epoch": 1.3192864294165831, + "grad_norm": 15.3125, + "learning_rate": 6.787029075068135e-06, + "loss": 1.2173423767089844, + "step": 7248 + }, + { + "epoch": 1.3196504960407753, + "grad_norm": 13.0625, + "learning_rate": 6.785526672997298e-06, + "loss": 1.4120014905929565, + "step": 7250 + }, + { + "epoch": 1.3200145626649677, + "grad_norm": 26.25, + "learning_rate": 6.7840241556536064e-06, + "loss": 1.1994625329971313, + "step": 7252 + }, + { + "epoch": 1.32037862928916, + "grad_norm": 8.25, + "learning_rate": 6.782521523257548e-06, + "loss": 1.2093708515167236, + "step": 7254 + }, + { + "epoch": 1.3207426959133521, + "grad_norm": 6.15625, + "learning_rate": 6.781018776029626e-06, + "loss": 1.0442777872085571, + "step": 7256 + }, + { + "epoch": 1.3211067625375443, + "grad_norm": 12.4375, + "learning_rate": 6.779515914190365e-06, + "loss": 1.5330231189727783, + "step": 7258 + }, + { + "epoch": 1.3214708291617365, + "grad_norm": 5.125, + "learning_rate": 6.778012937960301e-06, + "loss": 1.3253178596496582, + "step": 7260 + }, + { + "epoch": 1.321834895785929, + "grad_norm": 10.8125, + "learning_rate": 6.776509847559993e-06, + "loss": 1.2081809043884277, + "step": 7262 + }, + { + "epoch": 1.3221989624101211, + "grad_norm": 19.25, + "learning_rate": 6.775006643210012e-06, + "loss": 1.3396077156066895, + "step": 7264 + }, + { + "epoch": 1.3225630290343133, + "grad_norm": 25.0, + "learning_rate": 6.773503325130946e-06, + "loss": 1.098000168800354, + "step": 7266 + }, + { + "epoch": 1.3229270956585055, + "grad_norm": 21.0, + "learning_rate": 6.771999893543401e-06, + "loss": 0.40661901235580444, + "step": 7268 + }, + { + "epoch": 1.3232911622826977, + "grad_norm": 9.4375, + "learning_rate": 6.770496348668001e-06, + "loss": 1.2449018955230713, + "step": 7270 + }, + { + "epoch": 1.32365522890689, + "grad_norm": 10.625, + "learning_rate": 6.768992690725384e-06, + "loss": 1.5068029165267944, + "step": 7272 + }, + { + "epoch": 1.324019295531082, + "grad_norm": 780.0, + "learning_rate": 6.767488919936208e-06, + "loss": 1.423985481262207, + "step": 7274 + }, + { + "epoch": 1.3243833621552743, + "grad_norm": 15.625, + "learning_rate": 6.765985036521143e-06, + "loss": 1.6611640453338623, + "step": 7276 + }, + { + "epoch": 1.3247474287794667, + "grad_norm": 8.625, + "learning_rate": 6.764481040700877e-06, + "loss": 1.4383623600006104, + "step": 7278 + }, + { + "epoch": 1.325111495403659, + "grad_norm": 12.0, + "learning_rate": 6.762976932696116e-06, + "loss": 1.1659802198410034, + "step": 7280 + }, + { + "epoch": 1.325475562027851, + "grad_norm": 17.0, + "learning_rate": 6.7614727127275815e-06, + "loss": 1.6228660345077515, + "step": 7282 + }, + { + "epoch": 1.3258396286520433, + "grad_norm": 17.125, + "learning_rate": 6.759968381016016e-06, + "loss": 1.8521240949630737, + "step": 7284 + }, + { + "epoch": 1.3262036952762355, + "grad_norm": 13.9375, + "learning_rate": 6.7584639377821686e-06, + "loss": 1.559242844581604, + "step": 7286 + }, + { + "epoch": 1.326567761900428, + "grad_norm": 6.28125, + "learning_rate": 6.75695938324681e-06, + "loss": 1.2156175374984741, + "step": 7288 + }, + { + "epoch": 1.32693182852462, + "grad_norm": 19.0, + "learning_rate": 6.755454717630732e-06, + "loss": 0.848728358745575, + "step": 7290 + }, + { + "epoch": 1.3272958951488123, + "grad_norm": 17.0, + "learning_rate": 6.753949941154734e-06, + "loss": 0.6268037557601929, + "step": 7292 + }, + { + "epoch": 1.3276599617730045, + "grad_norm": 20.5, + "learning_rate": 6.7524450540396395e-06, + "loss": 1.7513140439987183, + "step": 7294 + }, + { + "epoch": 1.3280240283971967, + "grad_norm": 9.5, + "learning_rate": 6.750940056506282e-06, + "loss": 1.3441169261932373, + "step": 7296 + }, + { + "epoch": 1.3283880950213889, + "grad_norm": 13.4375, + "learning_rate": 6.749434948775514e-06, + "loss": 1.365588903427124, + "step": 7298 + }, + { + "epoch": 1.328752161645581, + "grad_norm": 53.0, + "learning_rate": 6.747929731068205e-06, + "loss": 1.261444330215454, + "step": 7300 + }, + { + "epoch": 1.3291162282697733, + "grad_norm": 7.9375, + "learning_rate": 6.746424403605238e-06, + "loss": 1.058774709701538, + "step": 7302 + }, + { + "epoch": 1.3294802948939655, + "grad_norm": 15.375, + "learning_rate": 6.7449189666075166e-06, + "loss": 0.8494008779525757, + "step": 7304 + }, + { + "epoch": 1.3298443615181579, + "grad_norm": 22.375, + "learning_rate": 6.7434134202959555e-06, + "loss": 1.0898377895355225, + "step": 7306 + }, + { + "epoch": 1.33020842814235, + "grad_norm": 27.5, + "learning_rate": 6.7419077648914865e-06, + "loss": 1.6385815143585205, + "step": 7308 + }, + { + "epoch": 1.3305724947665423, + "grad_norm": 14.3125, + "learning_rate": 6.740402000615061e-06, + "loss": 1.5671327114105225, + "step": 7310 + }, + { + "epoch": 1.3309365613907345, + "grad_norm": 16.375, + "learning_rate": 6.738896127687642e-06, + "loss": 1.370741605758667, + "step": 7312 + }, + { + "epoch": 1.3313006280149267, + "grad_norm": 58.5, + "learning_rate": 6.7373901463302096e-06, + "loss": 1.8039097785949707, + "step": 7314 + }, + { + "epoch": 1.331664694639119, + "grad_norm": 8.5, + "learning_rate": 6.735884056763763e-06, + "loss": 1.320874571800232, + "step": 7316 + }, + { + "epoch": 1.3320287612633113, + "grad_norm": 12.75, + "learning_rate": 6.734377859209313e-06, + "loss": 1.1930052042007446, + "step": 7318 + }, + { + "epoch": 1.3323928278875035, + "grad_norm": 29.0, + "learning_rate": 6.732871553887888e-06, + "loss": 0.7306788563728333, + "step": 7320 + }, + { + "epoch": 1.3327568945116957, + "grad_norm": 2.5, + "learning_rate": 6.731365141020531e-06, + "loss": 0.8885293006896973, + "step": 7322 + }, + { + "epoch": 1.3331209611358878, + "grad_norm": 8.3125, + "learning_rate": 6.729858620828307e-06, + "loss": 1.0472424030303955, + "step": 7324 + }, + { + "epoch": 1.33348502776008, + "grad_norm": 19.375, + "learning_rate": 6.728351993532287e-06, + "loss": 1.5017237663269043, + "step": 7326 + }, + { + "epoch": 1.3338490943842722, + "grad_norm": 15.75, + "learning_rate": 6.726845259353563e-06, + "loss": 1.2247464656829834, + "step": 7328 + }, + { + "epoch": 1.3342131610084644, + "grad_norm": 17.75, + "learning_rate": 6.725338418513243e-06, + "loss": 1.053222894668579, + "step": 7330 + }, + { + "epoch": 1.3345772276326568, + "grad_norm": 14.25, + "learning_rate": 6.723831471232449e-06, + "loss": 1.7148617506027222, + "step": 7332 + }, + { + "epoch": 1.334941294256849, + "grad_norm": 10.0, + "learning_rate": 6.722324417732321e-06, + "loss": 0.9061832427978516, + "step": 7334 + }, + { + "epoch": 1.3353053608810412, + "grad_norm": 620.0, + "learning_rate": 6.720817258234014e-06, + "loss": 1.288906216621399, + "step": 7336 + }, + { + "epoch": 1.3356694275052334, + "grad_norm": 21.25, + "learning_rate": 6.719309992958691e-06, + "loss": 1.7107911109924316, + "step": 7338 + }, + { + "epoch": 1.3360334941294256, + "grad_norm": 8.125, + "learning_rate": 6.7178026221275435e-06, + "loss": 1.1392858028411865, + "step": 7340 + }, + { + "epoch": 1.336397560753618, + "grad_norm": 13.5625, + "learning_rate": 6.716295145961771e-06, + "loss": 1.528315544128418, + "step": 7342 + }, + { + "epoch": 1.3367616273778102, + "grad_norm": 17.125, + "learning_rate": 6.714787564682589e-06, + "loss": 1.5959678888320923, + "step": 7344 + }, + { + "epoch": 1.3371256940020024, + "grad_norm": 11.0625, + "learning_rate": 6.713279878511227e-06, + "loss": 1.3057136535644531, + "step": 7346 + }, + { + "epoch": 1.3374897606261946, + "grad_norm": 40.75, + "learning_rate": 6.711772087668935e-06, + "loss": 1.4564380645751953, + "step": 7348 + }, + { + "epoch": 1.3378538272503868, + "grad_norm": 10.9375, + "learning_rate": 6.710264192376974e-06, + "loss": 1.7661343812942505, + "step": 7350 + }, + { + "epoch": 1.338217893874579, + "grad_norm": 109.5, + "learning_rate": 6.70875619285662e-06, + "loss": 1.1260857582092285, + "step": 7352 + }, + { + "epoch": 1.3385819604987712, + "grad_norm": 13.75, + "learning_rate": 6.7072480893291665e-06, + "loss": 0.750861644744873, + "step": 7354 + }, + { + "epoch": 1.3389460271229634, + "grad_norm": 13.375, + "learning_rate": 6.705739882015925e-06, + "loss": 1.440624713897705, + "step": 7356 + }, + { + "epoch": 1.3393100937471556, + "grad_norm": 13.875, + "learning_rate": 6.704231571138213e-06, + "loss": 1.6715214252471924, + "step": 7358 + }, + { + "epoch": 1.339674160371348, + "grad_norm": 7.5, + "learning_rate": 6.702723156917372e-06, + "loss": 1.191943883895874, + "step": 7360 + }, + { + "epoch": 1.3400382269955402, + "grad_norm": 14.1875, + "learning_rate": 6.701214639574758e-06, + "loss": 1.1247186660766602, + "step": 7362 + }, + { + "epoch": 1.3404022936197324, + "grad_norm": 10.1875, + "learning_rate": 6.699706019331736e-06, + "loss": 1.1612498760223389, + "step": 7364 + }, + { + "epoch": 1.3407663602439246, + "grad_norm": 9.8125, + "learning_rate": 6.698197296409692e-06, + "loss": 1.2469960451126099, + "step": 7366 + }, + { + "epoch": 1.3411304268681168, + "grad_norm": 23.0, + "learning_rate": 6.696688471030023e-06, + "loss": 1.6345014572143555, + "step": 7368 + }, + { + "epoch": 1.3414944934923092, + "grad_norm": 54.25, + "learning_rate": 6.695179543414144e-06, + "loss": 1.997577428817749, + "step": 7370 + }, + { + "epoch": 1.3418585601165014, + "grad_norm": 20.125, + "learning_rate": 6.693670513783486e-06, + "loss": 1.311200737953186, + "step": 7372 + }, + { + "epoch": 1.3422226267406936, + "grad_norm": 54.0, + "learning_rate": 6.69216138235949e-06, + "loss": 1.2803378105163574, + "step": 7374 + }, + { + "epoch": 1.3425866933648858, + "grad_norm": 35.0, + "learning_rate": 6.690652149363619e-06, + "loss": 2.219374179840088, + "step": 7376 + }, + { + "epoch": 1.342950759989078, + "grad_norm": 22.75, + "learning_rate": 6.68914281501734e-06, + "loss": 1.1470637321472168, + "step": 7378 + }, + { + "epoch": 1.3433148266132702, + "grad_norm": 34.75, + "learning_rate": 6.687633379542148e-06, + "loss": 0.8603953719139099, + "step": 7380 + }, + { + "epoch": 1.3436788932374624, + "grad_norm": 10.4375, + "learning_rate": 6.686123843159543e-06, + "loss": 1.4480098485946655, + "step": 7382 + }, + { + "epoch": 1.3440429598616546, + "grad_norm": 12.125, + "learning_rate": 6.684614206091047e-06, + "loss": 1.4918415546417236, + "step": 7384 + }, + { + "epoch": 1.344407026485847, + "grad_norm": 21.25, + "learning_rate": 6.683104468558188e-06, + "loss": 1.3915154933929443, + "step": 7386 + }, + { + "epoch": 1.3447710931100392, + "grad_norm": 74.0, + "learning_rate": 6.681594630782518e-06, + "loss": 1.5127147436141968, + "step": 7388 + }, + { + "epoch": 1.3451351597342314, + "grad_norm": 16.0, + "learning_rate": 6.680084692985598e-06, + "loss": 1.4008169174194336, + "step": 7390 + }, + { + "epoch": 1.3454992263584236, + "grad_norm": 6.3125, + "learning_rate": 6.678574655389005e-06, + "loss": 0.9470155835151672, + "step": 7392 + }, + { + "epoch": 1.3458632929826158, + "grad_norm": 30.0, + "learning_rate": 6.677064518214333e-06, + "loss": 0.9067777991294861, + "step": 7394 + }, + { + "epoch": 1.3462273596068082, + "grad_norm": 6.4375, + "learning_rate": 6.675554281683185e-06, + "loss": 0.7488901615142822, + "step": 7396 + }, + { + "epoch": 1.3465914262310004, + "grad_norm": 7.53125, + "learning_rate": 6.6740439460171845e-06, + "loss": 1.3801857233047485, + "step": 7398 + }, + { + "epoch": 1.3469554928551926, + "grad_norm": 11.75, + "learning_rate": 6.672533511437966e-06, + "loss": 1.4196380376815796, + "step": 7400 + }, + { + "epoch": 1.3473195594793848, + "grad_norm": 16.625, + "learning_rate": 6.6710229781671834e-06, + "loss": 1.3047449588775635, + "step": 7402 + }, + { + "epoch": 1.347683626103577, + "grad_norm": 12.0, + "learning_rate": 6.669512346426495e-06, + "loss": 1.254878282546997, + "step": 7404 + }, + { + "epoch": 1.3480476927277691, + "grad_norm": 20.25, + "learning_rate": 6.6680016164375834e-06, + "loss": 1.1717556715011597, + "step": 7406 + }, + { + "epoch": 1.3484117593519613, + "grad_norm": 4.375, + "learning_rate": 6.666490788422142e-06, + "loss": 0.6975076198577881, + "step": 7408 + }, + { + "epoch": 1.3487758259761535, + "grad_norm": 9.9375, + "learning_rate": 6.664979862601879e-06, + "loss": 1.3274766206741333, + "step": 7410 + }, + { + "epoch": 1.3491398926003457, + "grad_norm": 5.5, + "learning_rate": 6.663468839198516e-06, + "loss": 1.0753499269485474, + "step": 7412 + }, + { + "epoch": 1.3495039592245381, + "grad_norm": 18.25, + "learning_rate": 6.661957718433789e-06, + "loss": 1.2462944984436035, + "step": 7414 + }, + { + "epoch": 1.3498680258487303, + "grad_norm": 17.875, + "learning_rate": 6.660446500529449e-06, + "loss": 1.1585508584976196, + "step": 7416 + }, + { + "epoch": 1.3502320924729225, + "grad_norm": 7.4375, + "learning_rate": 6.658935185707262e-06, + "loss": 0.9352970123291016, + "step": 7418 + }, + { + "epoch": 1.3505961590971147, + "grad_norm": 11.875, + "learning_rate": 6.657423774189009e-06, + "loss": 1.4495834112167358, + "step": 7420 + }, + { + "epoch": 1.350960225721307, + "grad_norm": 9.6875, + "learning_rate": 6.655912266196481e-06, + "loss": 1.2501970529556274, + "step": 7422 + }, + { + "epoch": 1.3513242923454993, + "grad_norm": 11.25, + "learning_rate": 6.654400661951483e-06, + "loss": 1.217156171798706, + "step": 7424 + }, + { + "epoch": 1.3516883589696915, + "grad_norm": 5.21875, + "learning_rate": 6.652888961675843e-06, + "loss": 1.2445909976959229, + "step": 7426 + }, + { + "epoch": 1.3520524255938837, + "grad_norm": 17.375, + "learning_rate": 6.651377165591393e-06, + "loss": 1.4784386157989502, + "step": 7428 + }, + { + "epoch": 1.352416492218076, + "grad_norm": 16.5, + "learning_rate": 6.649865273919982e-06, + "loss": 1.6582244634628296, + "step": 7430 + }, + { + "epoch": 1.352780558842268, + "grad_norm": 21.0, + "learning_rate": 6.648353286883477e-06, + "loss": 1.6432421207427979, + "step": 7432 + }, + { + "epoch": 1.3531446254664603, + "grad_norm": 18.75, + "learning_rate": 6.646841204703755e-06, + "loss": 0.926926851272583, + "step": 7434 + }, + { + "epoch": 1.3535086920906525, + "grad_norm": 24.75, + "learning_rate": 6.64532902760271e-06, + "loss": 1.580348253250122, + "step": 7436 + }, + { + "epoch": 1.3538727587148447, + "grad_norm": 17.125, + "learning_rate": 6.643816755802241e-06, + "loss": 2.0164577960968018, + "step": 7438 + }, + { + "epoch": 1.354236825339037, + "grad_norm": 23.625, + "learning_rate": 6.642304389524274e-06, + "loss": 1.4507524967193604, + "step": 7440 + }, + { + "epoch": 1.3546008919632293, + "grad_norm": 13.0, + "learning_rate": 6.640791928990742e-06, + "loss": 1.440001368522644, + "step": 7442 + }, + { + "epoch": 1.3549649585874215, + "grad_norm": 12.875, + "learning_rate": 6.6392793744235885e-06, + "loss": 1.5932947397232056, + "step": 7444 + }, + { + "epoch": 1.3553290252116137, + "grad_norm": 12.875, + "learning_rate": 6.637766726044781e-06, + "loss": 1.6701114177703857, + "step": 7446 + }, + { + "epoch": 1.3556930918358059, + "grad_norm": 12.0, + "learning_rate": 6.636253984076288e-06, + "loss": 1.492882251739502, + "step": 7448 + }, + { + "epoch": 1.3560571584599983, + "grad_norm": 14.8125, + "learning_rate": 6.6347411487401035e-06, + "loss": 1.1613523960113525, + "step": 7450 + }, + { + "epoch": 1.3564212250841905, + "grad_norm": 21.75, + "learning_rate": 6.633228220258228e-06, + "loss": 1.4374372959136963, + "step": 7452 + }, + { + "epoch": 1.3567852917083827, + "grad_norm": 17.375, + "learning_rate": 6.6317151988526766e-06, + "loss": 1.3111393451690674, + "step": 7454 + }, + { + "epoch": 1.3571493583325749, + "grad_norm": 26.75, + "learning_rate": 6.6302020847454805e-06, + "loss": 1.4557454586029053, + "step": 7456 + }, + { + "epoch": 1.357513424956767, + "grad_norm": 11.3125, + "learning_rate": 6.628688878158681e-06, + "loss": 1.6682789325714111, + "step": 7458 + }, + { + "epoch": 1.3578774915809593, + "grad_norm": 11.875, + "learning_rate": 6.627175579314338e-06, + "loss": 1.2872165441513062, + "step": 7460 + }, + { + "epoch": 1.3582415582051515, + "grad_norm": 18.375, + "learning_rate": 6.625662188434518e-06, + "loss": 1.525971531867981, + "step": 7462 + }, + { + "epoch": 1.3586056248293437, + "grad_norm": 6.34375, + "learning_rate": 6.624148705741311e-06, + "loss": 1.0821521282196045, + "step": 7464 + }, + { + "epoch": 1.358969691453536, + "grad_norm": 12.0, + "learning_rate": 6.6226351314568095e-06, + "loss": 0.4714857339859009, + "step": 7466 + }, + { + "epoch": 1.3593337580777283, + "grad_norm": 9.125, + "learning_rate": 6.621121465803124e-06, + "loss": 1.1857128143310547, + "step": 7468 + }, + { + "epoch": 1.3596978247019205, + "grad_norm": 25.125, + "learning_rate": 6.619607709002383e-06, + "loss": 1.5799140930175781, + "step": 7470 + }, + { + "epoch": 1.3600618913261127, + "grad_norm": 13.6875, + "learning_rate": 6.61809386127672e-06, + "loss": 1.1052372455596924, + "step": 7472 + }, + { + "epoch": 1.3604259579503049, + "grad_norm": 18.25, + "learning_rate": 6.61657992284829e-06, + "loss": 1.3505126237869263, + "step": 7474 + }, + { + "epoch": 1.3607900245744973, + "grad_norm": 15.0, + "learning_rate": 6.615065893939254e-06, + "loss": 1.4402521848678589, + "step": 7476 + }, + { + "epoch": 1.3611540911986895, + "grad_norm": 21.125, + "learning_rate": 6.61355177477179e-06, + "loss": 1.6833484172821045, + "step": 7478 + }, + { + "epoch": 1.3615181578228817, + "grad_norm": 12.9375, + "learning_rate": 6.612037565568088e-06, + "loss": 1.5567283630371094, + "step": 7480 + }, + { + "epoch": 1.3618822244470739, + "grad_norm": 10.125, + "learning_rate": 6.610523266550357e-06, + "loss": 0.9774007797241211, + "step": 7482 + }, + { + "epoch": 1.362246291071266, + "grad_norm": 15.75, + "learning_rate": 6.609008877940808e-06, + "loss": 1.3714394569396973, + "step": 7484 + }, + { + "epoch": 1.3626103576954582, + "grad_norm": 10.9375, + "learning_rate": 6.607494399961677e-06, + "loss": 0.5788627862930298, + "step": 7486 + }, + { + "epoch": 1.3629744243196504, + "grad_norm": 10.875, + "learning_rate": 6.605979832835203e-06, + "loss": 1.324271559715271, + "step": 7488 + }, + { + "epoch": 1.3633384909438426, + "grad_norm": 12.625, + "learning_rate": 6.604465176783645e-06, + "loss": 1.7733110189437866, + "step": 7490 + }, + { + "epoch": 1.3637025575680348, + "grad_norm": 12.75, + "learning_rate": 6.602950432029272e-06, + "loss": 1.4646718502044678, + "step": 7492 + }, + { + "epoch": 1.3640666241922272, + "grad_norm": 12.5625, + "learning_rate": 6.601435598794366e-06, + "loss": 1.0780293941497803, + "step": 7494 + }, + { + "epoch": 1.3644306908164194, + "grad_norm": 14.3125, + "learning_rate": 6.599920677301224e-06, + "loss": 0.8972785472869873, + "step": 7496 + }, + { + "epoch": 1.3647947574406116, + "grad_norm": 9.3125, + "learning_rate": 6.5984056677721516e-06, + "loss": 1.251992106437683, + "step": 7498 + }, + { + "epoch": 1.3651588240648038, + "grad_norm": 2.625, + "learning_rate": 6.596890570429475e-06, + "loss": 1.0147993564605713, + "step": 7500 + }, + { + "epoch": 1.365522890688996, + "grad_norm": 10.4375, + "learning_rate": 6.595375385495526e-06, + "loss": 1.1783654689788818, + "step": 7502 + }, + { + "epoch": 1.3658869573131884, + "grad_norm": 58.0, + "learning_rate": 6.593860113192652e-06, + "loss": 1.3592965602874756, + "step": 7504 + }, + { + "epoch": 1.3662510239373806, + "grad_norm": 8.1875, + "learning_rate": 6.592344753743214e-06, + "loss": 1.3051929473876953, + "step": 7506 + }, + { + "epoch": 1.3666150905615728, + "grad_norm": 17.0, + "learning_rate": 6.590829307369582e-06, + "loss": 1.3840627670288086, + "step": 7508 + }, + { + "epoch": 1.366979157185765, + "grad_norm": 18.625, + "learning_rate": 6.589313774294144e-06, + "loss": 1.184469223022461, + "step": 7510 + }, + { + "epoch": 1.3673432238099572, + "grad_norm": 29.625, + "learning_rate": 6.5877981547392985e-06, + "loss": 1.4012384414672852, + "step": 7512 + }, + { + "epoch": 1.3677072904341494, + "grad_norm": 13.5625, + "learning_rate": 6.586282448927456e-06, + "loss": 1.3458597660064697, + "step": 7514 + }, + { + "epoch": 1.3680713570583416, + "grad_norm": 18.375, + "learning_rate": 6.584766657081041e-06, + "loss": 0.9641464948654175, + "step": 7516 + }, + { + "epoch": 1.3684354236825338, + "grad_norm": 11.75, + "learning_rate": 6.583250779422485e-06, + "loss": 1.3214313983917236, + "step": 7518 + }, + { + "epoch": 1.3687994903067262, + "grad_norm": 7.625, + "learning_rate": 6.581734816174244e-06, + "loss": 1.255645513534546, + "step": 7520 + }, + { + "epoch": 1.3691635569309184, + "grad_norm": 18.25, + "learning_rate": 6.580218767558777e-06, + "loss": 1.2830095291137695, + "step": 7522 + }, + { + "epoch": 1.3695276235551106, + "grad_norm": 14.125, + "learning_rate": 6.578702633798555e-06, + "loss": 1.395407795906067, + "step": 7524 + }, + { + "epoch": 1.3698916901793028, + "grad_norm": 14.375, + "learning_rate": 6.5771864151160705e-06, + "loss": 1.375747561454773, + "step": 7526 + }, + { + "epoch": 1.370255756803495, + "grad_norm": 9.1875, + "learning_rate": 6.575670111733814e-06, + "loss": 1.302952527999878, + "step": 7528 + }, + { + "epoch": 1.3706198234276874, + "grad_norm": 7.46875, + "learning_rate": 6.574153723874304e-06, + "loss": 1.146970272064209, + "step": 7530 + }, + { + "epoch": 1.3709838900518796, + "grad_norm": 7.0625, + "learning_rate": 6.572637251760061e-06, + "loss": 1.2203412055969238, + "step": 7532 + }, + { + "epoch": 1.3713479566760718, + "grad_norm": 10.1875, + "learning_rate": 6.571120695613623e-06, + "loss": 1.3703068494796753, + "step": 7534 + }, + { + "epoch": 1.371712023300264, + "grad_norm": 15.8125, + "learning_rate": 6.569604055657538e-06, + "loss": 1.4244000911712646, + "step": 7536 + }, + { + "epoch": 1.3720760899244562, + "grad_norm": 28.125, + "learning_rate": 6.568087332114363e-06, + "loss": 1.5603991746902466, + "step": 7538 + }, + { + "epoch": 1.3724401565486484, + "grad_norm": 14.25, + "learning_rate": 6.5665705252066765e-06, + "loss": 1.2575956583023071, + "step": 7540 + }, + { + "epoch": 1.3728042231728406, + "grad_norm": 15.5, + "learning_rate": 6.5650536351570625e-06, + "loss": 1.565503478050232, + "step": 7542 + }, + { + "epoch": 1.3731682897970328, + "grad_norm": 10.8125, + "learning_rate": 6.563536662188117e-06, + "loss": 1.1888121366500854, + "step": 7544 + }, + { + "epoch": 1.373532356421225, + "grad_norm": 15.125, + "learning_rate": 6.562019606522449e-06, + "loss": 0.9999662637710571, + "step": 7546 + }, + { + "epoch": 1.3738964230454174, + "grad_norm": 7.25, + "learning_rate": 6.560502468382682e-06, + "loss": 1.159920334815979, + "step": 7548 + }, + { + "epoch": 1.3742604896696096, + "grad_norm": 12.625, + "learning_rate": 6.558985247991449e-06, + "loss": 1.4574644565582275, + "step": 7550 + }, + { + "epoch": 1.3746245562938018, + "grad_norm": 16.75, + "learning_rate": 6.557467945571399e-06, + "loss": 1.4080231189727783, + "step": 7552 + }, + { + "epoch": 1.374988622917994, + "grad_norm": 15.375, + "learning_rate": 6.555950561345184e-06, + "loss": 1.7421507835388184, + "step": 7554 + }, + { + "epoch": 1.3753526895421861, + "grad_norm": 12.75, + "learning_rate": 6.554433095535479e-06, + "loss": 1.697135090827942, + "step": 7556 + }, + { + "epoch": 1.3757167561663786, + "grad_norm": 24.25, + "learning_rate": 6.552915548364962e-06, + "loss": 1.6388407945632935, + "step": 7558 + }, + { + "epoch": 1.3760808227905708, + "grad_norm": 10.3125, + "learning_rate": 6.551397920056331e-06, + "loss": 1.2764458656311035, + "step": 7560 + }, + { + "epoch": 1.376444889414763, + "grad_norm": 31.0, + "learning_rate": 6.549880210832289e-06, + "loss": 1.5978236198425293, + "step": 7562 + }, + { + "epoch": 1.3768089560389551, + "grad_norm": 11.625, + "learning_rate": 6.548362420915554e-06, + "loss": 1.7277586460113525, + "step": 7564 + }, + { + "epoch": 1.3771730226631473, + "grad_norm": 11.25, + "learning_rate": 6.546844550528857e-06, + "loss": 1.4749855995178223, + "step": 7566 + }, + { + "epoch": 1.3775370892873395, + "grad_norm": 25.0, + "learning_rate": 6.545326599894936e-06, + "loss": 1.6118240356445312, + "step": 7568 + }, + { + "epoch": 1.3779011559115317, + "grad_norm": 8.9375, + "learning_rate": 6.543808569236549e-06, + "loss": 0.887904167175293, + "step": 7570 + }, + { + "epoch": 1.378265222535724, + "grad_norm": 9.75, + "learning_rate": 6.542290458776459e-06, + "loss": 0.5731221437454224, + "step": 7572 + }, + { + "epoch": 1.3786292891599163, + "grad_norm": 3.15625, + "learning_rate": 6.540772268737441e-06, + "loss": 1.0332300662994385, + "step": 7574 + }, + { + "epoch": 1.3789933557841085, + "grad_norm": 16.375, + "learning_rate": 6.539253999342283e-06, + "loss": 1.033430576324463, + "step": 7576 + }, + { + "epoch": 1.3793574224083007, + "grad_norm": 15.625, + "learning_rate": 6.537735650813788e-06, + "loss": 1.3279283046722412, + "step": 7578 + }, + { + "epoch": 1.379721489032493, + "grad_norm": 11.625, + "learning_rate": 6.5362172233747635e-06, + "loss": 1.3901052474975586, + "step": 7580 + }, + { + "epoch": 1.3800855556566851, + "grad_norm": 16.125, + "learning_rate": 6.534698717248036e-06, + "loss": 1.350856900215149, + "step": 7582 + }, + { + "epoch": 1.3804496222808775, + "grad_norm": 13.5625, + "learning_rate": 6.533180132656438e-06, + "loss": 0.8734549880027771, + "step": 7584 + }, + { + "epoch": 1.3808136889050697, + "grad_norm": 5.875, + "learning_rate": 6.531661469822817e-06, + "loss": 1.246248722076416, + "step": 7586 + }, + { + "epoch": 1.381177755529262, + "grad_norm": 11.1875, + "learning_rate": 6.5301427289700284e-06, + "loss": 1.71853768825531, + "step": 7588 + }, + { + "epoch": 1.3815418221534541, + "grad_norm": 13.5, + "learning_rate": 6.5286239103209435e-06, + "loss": 1.510822057723999, + "step": 7590 + }, + { + "epoch": 1.3819058887776463, + "grad_norm": 7.4375, + "learning_rate": 6.5271050140984425e-06, + "loss": 1.5353522300720215, + "step": 7592 + }, + { + "epoch": 1.3822699554018385, + "grad_norm": 8.875, + "learning_rate": 6.525586040525415e-06, + "loss": 1.2981197834014893, + "step": 7594 + }, + { + "epoch": 1.3826340220260307, + "grad_norm": 12.75, + "learning_rate": 6.52406698982477e-06, + "loss": 1.3658808469772339, + "step": 7596 + }, + { + "epoch": 1.382998088650223, + "grad_norm": 8.3125, + "learning_rate": 6.5225478622194146e-06, + "loss": 1.2242591381072998, + "step": 7598 + }, + { + "epoch": 1.383362155274415, + "grad_norm": 10.0625, + "learning_rate": 6.52102865793228e-06, + "loss": 0.9514259099960327, + "step": 7600 + }, + { + "epoch": 1.3837262218986075, + "grad_norm": 13.4375, + "learning_rate": 6.519509377186302e-06, + "loss": 1.3797001838684082, + "step": 7602 + }, + { + "epoch": 1.3840902885227997, + "grad_norm": 16.25, + "learning_rate": 6.517990020204425e-06, + "loss": 1.4543248414993286, + "step": 7604 + }, + { + "epoch": 1.3844543551469919, + "grad_norm": 10.3125, + "learning_rate": 6.516470587209616e-06, + "loss": 1.024301290512085, + "step": 7606 + }, + { + "epoch": 1.384818421771184, + "grad_norm": 8.875, + "learning_rate": 6.514951078424838e-06, + "loss": 1.0712511539459229, + "step": 7608 + }, + { + "epoch": 1.3851824883953763, + "grad_norm": 28.375, + "learning_rate": 6.513431494073077e-06, + "loss": 1.6277577877044678, + "step": 7610 + }, + { + "epoch": 1.3855465550195687, + "grad_norm": 25.875, + "learning_rate": 6.511911834377326e-06, + "loss": 1.533146619796753, + "step": 7612 + }, + { + "epoch": 1.3859106216437609, + "grad_norm": 18.25, + "learning_rate": 6.510392099560587e-06, + "loss": 1.6026009321212769, + "step": 7614 + }, + { + "epoch": 1.386274688267953, + "grad_norm": 17.125, + "learning_rate": 6.508872289845878e-06, + "loss": 1.763692855834961, + "step": 7616 + }, + { + "epoch": 1.3866387548921453, + "grad_norm": 44.75, + "learning_rate": 6.5073524054562185e-06, + "loss": 1.4544414281845093, + "step": 7618 + }, + { + "epoch": 1.3870028215163375, + "grad_norm": 9.875, + "learning_rate": 6.505832446614653e-06, + "loss": 1.4079960584640503, + "step": 7620 + }, + { + "epoch": 1.3873668881405297, + "grad_norm": 11.8125, + "learning_rate": 6.504312413544225e-06, + "loss": 1.3743195533752441, + "step": 7622 + }, + { + "epoch": 1.3877309547647219, + "grad_norm": 4.53125, + "learning_rate": 6.502792306467993e-06, + "loss": 1.0719960927963257, + "step": 7624 + }, + { + "epoch": 1.388095021388914, + "grad_norm": 20.5, + "learning_rate": 6.5012721256090306e-06, + "loss": 1.5415546894073486, + "step": 7626 + }, + { + "epoch": 1.3884590880131065, + "grad_norm": 18.75, + "learning_rate": 6.499751871190412e-06, + "loss": 1.4556267261505127, + "step": 7628 + }, + { + "epoch": 1.3888231546372987, + "grad_norm": 12.4375, + "learning_rate": 6.4982315434352314e-06, + "loss": 1.4458072185516357, + "step": 7630 + }, + { + "epoch": 1.3891872212614909, + "grad_norm": 15.4375, + "learning_rate": 6.496711142566594e-06, + "loss": 1.720502495765686, + "step": 7632 + }, + { + "epoch": 1.389551287885683, + "grad_norm": 18.625, + "learning_rate": 6.4951906688076075e-06, + "loss": 1.0731414556503296, + "step": 7634 + }, + { + "epoch": 1.3899153545098752, + "grad_norm": 17.5, + "learning_rate": 6.4936701223813995e-06, + "loss": 1.1338034868240356, + "step": 7636 + }, + { + "epoch": 1.3902794211340677, + "grad_norm": 18.875, + "learning_rate": 6.492149503511099e-06, + "loss": 1.3427939414978027, + "step": 7638 + }, + { + "epoch": 1.3906434877582599, + "grad_norm": 14.5, + "learning_rate": 6.490628812419852e-06, + "loss": 1.8454996347427368, + "step": 7640 + }, + { + "epoch": 1.391007554382452, + "grad_norm": 10.3125, + "learning_rate": 6.489108049330819e-06, + "loss": 1.4115500450134277, + "step": 7642 + }, + { + "epoch": 1.3913716210066442, + "grad_norm": 8.5, + "learning_rate": 6.487587214467159e-06, + "loss": 1.09733247756958, + "step": 7644 + }, + { + "epoch": 1.3917356876308364, + "grad_norm": 11.8125, + "learning_rate": 6.486066308052055e-06, + "loss": 1.4169155359268188, + "step": 7646 + }, + { + "epoch": 1.3920997542550286, + "grad_norm": 11.1875, + "learning_rate": 6.484545330308686e-06, + "loss": 1.3464170694351196, + "step": 7648 + }, + { + "epoch": 1.3924638208792208, + "grad_norm": 8.1875, + "learning_rate": 6.483024281460255e-06, + "loss": 1.254568099975586, + "step": 7650 + }, + { + "epoch": 1.392827887503413, + "grad_norm": 20.75, + "learning_rate": 6.481503161729969e-06, + "loss": 1.137978196144104, + "step": 7652 + }, + { + "epoch": 1.3931919541276052, + "grad_norm": 18.625, + "learning_rate": 6.479981971341045e-06, + "loss": 1.923954963684082, + "step": 7654 + }, + { + "epoch": 1.3935560207517976, + "grad_norm": 9.9375, + "learning_rate": 6.478460710516712e-06, + "loss": 1.641863226890564, + "step": 7656 + }, + { + "epoch": 1.3939200873759898, + "grad_norm": 15.875, + "learning_rate": 6.476939379480207e-06, + "loss": 1.1812974214553833, + "step": 7658 + }, + { + "epoch": 1.394284154000182, + "grad_norm": 18.125, + "learning_rate": 6.4754179784547835e-06, + "loss": 1.8159241676330566, + "step": 7660 + }, + { + "epoch": 1.3946482206243742, + "grad_norm": 31.75, + "learning_rate": 6.473896507663697e-06, + "loss": 1.0422096252441406, + "step": 7662 + }, + { + "epoch": 1.3950122872485664, + "grad_norm": 8.1875, + "learning_rate": 6.472374967330217e-06, + "loss": 1.3442343473434448, + "step": 7664 + }, + { + "epoch": 1.3953763538727588, + "grad_norm": 16.5, + "learning_rate": 6.470853357677629e-06, + "loss": 1.4701565504074097, + "step": 7666 + }, + { + "epoch": 1.395740420496951, + "grad_norm": 171.0, + "learning_rate": 6.469331678929216e-06, + "loss": 0.7476240396499634, + "step": 7668 + }, + { + "epoch": 1.3961044871211432, + "grad_norm": 14.6875, + "learning_rate": 6.467809931308281e-06, + "loss": 0.70750492811203, + "step": 7670 + }, + { + "epoch": 1.3964685537453354, + "grad_norm": 10.0, + "learning_rate": 6.4662881150381356e-06, + "loss": 1.3857115507125854, + "step": 7672 + }, + { + "epoch": 1.3968326203695276, + "grad_norm": 17.25, + "learning_rate": 6.464766230342099e-06, + "loss": 1.4983041286468506, + "step": 7674 + }, + { + "epoch": 1.3971966869937198, + "grad_norm": 13.0, + "learning_rate": 6.463244277443501e-06, + "loss": 1.8194019794464111, + "step": 7676 + }, + { + "epoch": 1.397560753617912, + "grad_norm": 46.5, + "learning_rate": 6.461722256565683e-06, + "loss": 1.6128919124603271, + "step": 7678 + }, + { + "epoch": 1.3979248202421042, + "grad_norm": 22.75, + "learning_rate": 6.4602001679319944e-06, + "loss": 0.8991611003875732, + "step": 7680 + }, + { + "epoch": 1.3982888868662966, + "grad_norm": 8.875, + "learning_rate": 6.458678011765798e-06, + "loss": 1.4467920064926147, + "step": 7682 + }, + { + "epoch": 1.3986529534904888, + "grad_norm": 8.625, + "learning_rate": 6.457155788290459e-06, + "loss": 1.2151093482971191, + "step": 7684 + }, + { + "epoch": 1.399017020114681, + "grad_norm": 8.5625, + "learning_rate": 6.455633497729365e-06, + "loss": 1.107391595840454, + "step": 7686 + }, + { + "epoch": 1.3993810867388732, + "grad_norm": 20.625, + "learning_rate": 6.454111140305899e-06, + "loss": 1.2695037126541138, + "step": 7688 + }, + { + "epoch": 1.3997451533630654, + "grad_norm": 24.375, + "learning_rate": 6.452588716243463e-06, + "loss": 1.5658984184265137, + "step": 7690 + }, + { + "epoch": 1.4001092199872578, + "grad_norm": 10.0, + "learning_rate": 6.451066225765468e-06, + "loss": 0.6073099374771118, + "step": 7692 + }, + { + "epoch": 1.40047328661145, + "grad_norm": 9.3125, + "learning_rate": 6.4495436690953324e-06, + "loss": 1.4313074350357056, + "step": 7694 + }, + { + "epoch": 1.4008373532356422, + "grad_norm": 9.25, + "learning_rate": 6.448021046456487e-06, + "loss": 1.3558878898620605, + "step": 7696 + }, + { + "epoch": 1.4012014198598344, + "grad_norm": 16.25, + "learning_rate": 6.4464983580723665e-06, + "loss": 1.1899185180664062, + "step": 7698 + }, + { + "epoch": 1.4015654864840266, + "grad_norm": 8.25, + "learning_rate": 6.444975604166421e-06, + "loss": 1.286263346672058, + "step": 7700 + }, + { + "epoch": 1.4019295531082188, + "grad_norm": 12.875, + "learning_rate": 6.44345278496211e-06, + "loss": 1.10990571975708, + "step": 7702 + }, + { + "epoch": 1.402293619732411, + "grad_norm": 59.0, + "learning_rate": 6.4419299006829e-06, + "loss": 1.305938482284546, + "step": 7704 + }, + { + "epoch": 1.4026576863566032, + "grad_norm": 9.75, + "learning_rate": 6.4404069515522686e-06, + "loss": 1.478846788406372, + "step": 7706 + }, + { + "epoch": 1.4030217529807956, + "grad_norm": 22.125, + "learning_rate": 6.4388839377937e-06, + "loss": 1.6847670078277588, + "step": 7708 + }, + { + "epoch": 1.4033858196049878, + "grad_norm": 14.4375, + "learning_rate": 6.437360859630692e-06, + "loss": 1.8299840688705444, + "step": 7710 + }, + { + "epoch": 1.40374988622918, + "grad_norm": 38.5, + "learning_rate": 6.435837717286753e-06, + "loss": 1.9381146430969238, + "step": 7712 + }, + { + "epoch": 1.4041139528533721, + "grad_norm": 15.1875, + "learning_rate": 6.434314510985393e-06, + "loss": 1.411583662033081, + "step": 7714 + }, + { + "epoch": 1.4044780194775643, + "grad_norm": 5.9375, + "learning_rate": 6.432791240950141e-06, + "loss": 1.4456018209457397, + "step": 7716 + }, + { + "epoch": 1.4048420861017568, + "grad_norm": 9.3125, + "learning_rate": 6.431267907404526e-06, + "loss": 1.244000792503357, + "step": 7718 + }, + { + "epoch": 1.405206152725949, + "grad_norm": 13.1875, + "learning_rate": 6.429744510572093e-06, + "loss": 1.2111845016479492, + "step": 7720 + }, + { + "epoch": 1.4055702193501411, + "grad_norm": 12.625, + "learning_rate": 6.428221050676398e-06, + "loss": 1.2410328388214111, + "step": 7722 + }, + { + "epoch": 1.4059342859743333, + "grad_norm": 25.25, + "learning_rate": 6.426697527940997e-06, + "loss": 1.1156731843948364, + "step": 7724 + }, + { + "epoch": 1.4062983525985255, + "grad_norm": 9.125, + "learning_rate": 6.4251739425894626e-06, + "loss": 1.451101303100586, + "step": 7726 + }, + { + "epoch": 1.4066624192227177, + "grad_norm": 12.1875, + "learning_rate": 6.423650294845376e-06, + "loss": 1.106182336807251, + "step": 7728 + }, + { + "epoch": 1.40702648584691, + "grad_norm": 19.75, + "learning_rate": 6.422126584932325e-06, + "loss": 1.5805258750915527, + "step": 7730 + }, + { + "epoch": 1.4073905524711021, + "grad_norm": 12.5, + "learning_rate": 6.420602813073909e-06, + "loss": 1.4265527725219727, + "step": 7732 + }, + { + "epoch": 1.4077546190952943, + "grad_norm": 53.25, + "learning_rate": 6.419078979493734e-06, + "loss": 1.388393759727478, + "step": 7734 + }, + { + "epoch": 1.4081186857194867, + "grad_norm": 15.4375, + "learning_rate": 6.4175550844154175e-06, + "loss": 1.3540140390396118, + "step": 7736 + }, + { + "epoch": 1.408482752343679, + "grad_norm": 12.0625, + "learning_rate": 6.416031128062585e-06, + "loss": 1.3345158100128174, + "step": 7738 + }, + { + "epoch": 1.4088468189678711, + "grad_norm": 39.25, + "learning_rate": 6.4145071106588696e-06, + "loss": 1.5851914882659912, + "step": 7740 + }, + { + "epoch": 1.4092108855920633, + "grad_norm": 10.875, + "learning_rate": 6.412983032427915e-06, + "loss": 1.581083059310913, + "step": 7742 + }, + { + "epoch": 1.4095749522162555, + "grad_norm": 3.390625, + "learning_rate": 6.4114588935933764e-06, + "loss": 1.36771559715271, + "step": 7744 + }, + { + "epoch": 1.409939018840448, + "grad_norm": 10.0625, + "learning_rate": 6.409934694378912e-06, + "loss": 1.0774198770523071, + "step": 7746 + }, + { + "epoch": 1.4103030854646401, + "grad_norm": 110.0, + "learning_rate": 6.408410435008191e-06, + "loss": 1.6050190925598145, + "step": 7748 + }, + { + "epoch": 1.4106671520888323, + "grad_norm": 12.25, + "learning_rate": 6.406886115704896e-06, + "loss": 1.5230917930603027, + "step": 7750 + }, + { + "epoch": 1.4110312187130245, + "grad_norm": 13.5625, + "learning_rate": 6.40536173669271e-06, + "loss": 1.3671441078186035, + "step": 7752 + }, + { + "epoch": 1.4113952853372167, + "grad_norm": 11.5, + "learning_rate": 6.403837298195333e-06, + "loss": 1.7444974184036255, + "step": 7754 + }, + { + "epoch": 1.411759351961409, + "grad_norm": 8.0, + "learning_rate": 6.40231280043647e-06, + "loss": 1.3938043117523193, + "step": 7756 + }, + { + "epoch": 1.412123418585601, + "grad_norm": 13.875, + "learning_rate": 6.400788243639833e-06, + "loss": 1.1789401769638062, + "step": 7758 + }, + { + "epoch": 1.4124874852097933, + "grad_norm": 11.8125, + "learning_rate": 6.399263628029145e-06, + "loss": 1.7180346250534058, + "step": 7760 + }, + { + "epoch": 1.4128515518339857, + "grad_norm": 14.3125, + "learning_rate": 6.397738953828139e-06, + "loss": 1.3375813961029053, + "step": 7762 + }, + { + "epoch": 1.413215618458178, + "grad_norm": 6.6875, + "learning_rate": 6.396214221260553e-06, + "loss": 1.2305355072021484, + "step": 7764 + }, + { + "epoch": 1.41357968508237, + "grad_norm": 16.375, + "learning_rate": 6.394689430550136e-06, + "loss": 1.4293663501739502, + "step": 7766 + }, + { + "epoch": 1.4139437517065623, + "grad_norm": 19.0, + "learning_rate": 6.393164581920644e-06, + "loss": 1.6117398738861084, + "step": 7768 + }, + { + "epoch": 1.4143078183307545, + "grad_norm": 6.96875, + "learning_rate": 6.391639675595842e-06, + "loss": 1.616674780845642, + "step": 7770 + }, + { + "epoch": 1.414671884954947, + "grad_norm": 22.25, + "learning_rate": 6.3901147117995065e-06, + "loss": 1.1837644577026367, + "step": 7772 + }, + { + "epoch": 1.415035951579139, + "grad_norm": 14.0, + "learning_rate": 6.388589690755418e-06, + "loss": 1.2595970630645752, + "step": 7774 + }, + { + "epoch": 1.4154000182033313, + "grad_norm": 14.3125, + "learning_rate": 6.387064612687366e-06, + "loss": 1.4133833646774292, + "step": 7776 + }, + { + "epoch": 1.4157640848275235, + "grad_norm": 7.5, + "learning_rate": 6.385539477819151e-06, + "loss": 1.6039048433303833, + "step": 7778 + }, + { + "epoch": 1.4161281514517157, + "grad_norm": 10.125, + "learning_rate": 6.3840142863745806e-06, + "loss": 1.2344555854797363, + "step": 7780 + }, + { + "epoch": 1.4164922180759079, + "grad_norm": 14.875, + "learning_rate": 6.3824890385774695e-06, + "loss": 1.3861414194107056, + "step": 7782 + }, + { + "epoch": 1.4168562847001, + "grad_norm": 6.1875, + "learning_rate": 6.380963734651643e-06, + "loss": 1.3523224592208862, + "step": 7784 + }, + { + "epoch": 1.4172203513242922, + "grad_norm": 8.3125, + "learning_rate": 6.379438374820932e-06, + "loss": 1.0533229112625122, + "step": 7786 + }, + { + "epoch": 1.4175844179484844, + "grad_norm": 16.125, + "learning_rate": 6.377912959309176e-06, + "loss": 1.3463441133499146, + "step": 7788 + }, + { + "epoch": 1.4179484845726769, + "grad_norm": 6.625, + "learning_rate": 6.376387488340225e-06, + "loss": 1.2822659015655518, + "step": 7790 + }, + { + "epoch": 1.418312551196869, + "grad_norm": 8.4375, + "learning_rate": 6.374861962137937e-06, + "loss": 1.2499428987503052, + "step": 7792 + }, + { + "epoch": 1.4186766178210612, + "grad_norm": 8.375, + "learning_rate": 6.373336380926175e-06, + "loss": 1.30571448802948, + "step": 7794 + }, + { + "epoch": 1.4190406844452534, + "grad_norm": 15.8125, + "learning_rate": 6.37181074492881e-06, + "loss": 1.4697508811950684, + "step": 7796 + }, + { + "epoch": 1.4194047510694456, + "grad_norm": 70.0, + "learning_rate": 6.370285054369728e-06, + "loss": 1.6303260326385498, + "step": 7798 + }, + { + "epoch": 1.419768817693638, + "grad_norm": 12.6875, + "learning_rate": 6.368759309472814e-06, + "loss": 1.372489333152771, + "step": 7800 + }, + { + "epoch": 1.4201328843178302, + "grad_norm": 12.375, + "learning_rate": 6.3672335104619654e-06, + "loss": 1.3742971420288086, + "step": 7802 + }, + { + "epoch": 1.4204969509420224, + "grad_norm": 9.8125, + "learning_rate": 6.3657076575610865e-06, + "loss": 1.2103904485702515, + "step": 7804 + }, + { + "epoch": 1.4208610175662146, + "grad_norm": 4.59375, + "learning_rate": 6.3641817509940915e-06, + "loss": 0.9063478708267212, + "step": 7806 + }, + { + "epoch": 1.4212250841904068, + "grad_norm": 4.09375, + "learning_rate": 6.362655790984901e-06, + "loss": 1.069659948348999, + "step": 7808 + }, + { + "epoch": 1.421589150814599, + "grad_norm": 28.125, + "learning_rate": 6.361129777757442e-06, + "loss": 1.3138788938522339, + "step": 7810 + }, + { + "epoch": 1.4219532174387912, + "grad_norm": 15.0625, + "learning_rate": 6.3596037115356514e-06, + "loss": 0.5693075656890869, + "step": 7812 + }, + { + "epoch": 1.4223172840629834, + "grad_norm": 19.75, + "learning_rate": 6.358077592543474e-06, + "loss": 1.3636194467544556, + "step": 7814 + }, + { + "epoch": 1.4226813506871758, + "grad_norm": 7.1875, + "learning_rate": 6.356551421004862e-06, + "loss": 1.2531205415725708, + "step": 7816 + }, + { + "epoch": 1.423045417311368, + "grad_norm": 5.09375, + "learning_rate": 6.355025197143773e-06, + "loss": 0.9501316547393799, + "step": 7818 + }, + { + "epoch": 1.4234094839355602, + "grad_norm": 18.125, + "learning_rate": 6.3534989211841755e-06, + "loss": 1.4885741472244263, + "step": 7820 + }, + { + "epoch": 1.4237735505597524, + "grad_norm": 8.5625, + "learning_rate": 6.351972593350044e-06, + "loss": 1.7057766914367676, + "step": 7822 + }, + { + "epoch": 1.4241376171839446, + "grad_norm": 9.0, + "learning_rate": 6.35044621386536e-06, + "loss": 1.0497283935546875, + "step": 7824 + }, + { + "epoch": 1.424501683808137, + "grad_norm": 8.75, + "learning_rate": 6.348919782954115e-06, + "loss": 1.0681543350219727, + "step": 7826 + }, + { + "epoch": 1.4248657504323292, + "grad_norm": 9.0, + "learning_rate": 6.347393300840305e-06, + "loss": 1.7147891521453857, + "step": 7828 + }, + { + "epoch": 1.4252298170565214, + "grad_norm": 17.75, + "learning_rate": 6.345866767747938e-06, + "loss": 1.4910051822662354, + "step": 7830 + }, + { + "epoch": 1.4255938836807136, + "grad_norm": 10.1875, + "learning_rate": 6.344340183901023e-06, + "loss": 1.5224055051803589, + "step": 7832 + }, + { + "epoch": 1.4259579503049058, + "grad_norm": 33.25, + "learning_rate": 6.342813549523581e-06, + "loss": 0.6270468235015869, + "step": 7834 + }, + { + "epoch": 1.426322016929098, + "grad_norm": 21.75, + "learning_rate": 6.341286864839642e-06, + "loss": 0.4331286549568176, + "step": 7836 + }, + { + "epoch": 1.4266860835532902, + "grad_norm": 14.0, + "learning_rate": 6.339760130073236e-06, + "loss": 0.8436590433120728, + "step": 7838 + }, + { + "epoch": 1.4270501501774824, + "grad_norm": 13.0625, + "learning_rate": 6.33823334544841e-06, + "loss": 1.3369219303131104, + "step": 7840 + }, + { + "epoch": 1.4274142168016746, + "grad_norm": 14.0, + "learning_rate": 6.3367065111892105e-06, + "loss": 1.5874592065811157, + "step": 7842 + }, + { + "epoch": 1.427778283425867, + "grad_norm": 10.4375, + "learning_rate": 6.335179627519694e-06, + "loss": 1.8331938982009888, + "step": 7844 + }, + { + "epoch": 1.4281423500500592, + "grad_norm": 18.125, + "learning_rate": 6.333652694663928e-06, + "loss": 1.39878511428833, + "step": 7846 + }, + { + "epoch": 1.4285064166742514, + "grad_norm": 15.625, + "learning_rate": 6.33212571284598e-06, + "loss": 1.4116853475570679, + "step": 7848 + }, + { + "epoch": 1.4288704832984436, + "grad_norm": 6.5, + "learning_rate": 6.330598682289928e-06, + "loss": 1.092564582824707, + "step": 7850 + }, + { + "epoch": 1.4292345499226358, + "grad_norm": 4.65625, + "learning_rate": 6.329071603219861e-06, + "loss": 0.9261064529418945, + "step": 7852 + }, + { + "epoch": 1.4295986165468282, + "grad_norm": 7.65625, + "learning_rate": 6.32754447585987e-06, + "loss": 1.3433537483215332, + "step": 7854 + }, + { + "epoch": 1.4299626831710204, + "grad_norm": 8.0625, + "learning_rate": 6.326017300434053e-06, + "loss": 1.3853176832199097, + "step": 7856 + }, + { + "epoch": 1.4303267497952126, + "grad_norm": 6.5, + "learning_rate": 6.32449007716652e-06, + "loss": 1.2959727048873901, + "step": 7858 + }, + { + "epoch": 1.4306908164194048, + "grad_norm": 13.3125, + "learning_rate": 6.322962806281383e-06, + "loss": 1.3257619142532349, + "step": 7860 + }, + { + "epoch": 1.431054883043597, + "grad_norm": 10.8125, + "learning_rate": 6.321435488002763e-06, + "loss": 1.3447543382644653, + "step": 7862 + }, + { + "epoch": 1.4314189496677892, + "grad_norm": 5.34375, + "learning_rate": 6.319908122554788e-06, + "loss": 1.2987921237945557, + "step": 7864 + }, + { + "epoch": 1.4317830162919813, + "grad_norm": 31.5, + "learning_rate": 6.318380710161591e-06, + "loss": 1.4172015190124512, + "step": 7866 + }, + { + "epoch": 1.4321470829161735, + "grad_norm": 8.25, + "learning_rate": 6.3168532510473165e-06, + "loss": 1.053973913192749, + "step": 7868 + }, + { + "epoch": 1.432511149540366, + "grad_norm": 10.8125, + "learning_rate": 6.31532574543611e-06, + "loss": 0.394927054643631, + "step": 7870 + }, + { + "epoch": 1.4328752161645582, + "grad_norm": 17.25, + "learning_rate": 6.31379819355213e-06, + "loss": 1.2818745374679565, + "step": 7872 + }, + { + "epoch": 1.4332392827887503, + "grad_norm": 3.625, + "learning_rate": 6.312270595619536e-06, + "loss": 1.3657610416412354, + "step": 7874 + }, + { + "epoch": 1.4336033494129425, + "grad_norm": 43.5, + "learning_rate": 6.310742951862498e-06, + "loss": 1.3637645244598389, + "step": 7876 + }, + { + "epoch": 1.4339674160371347, + "grad_norm": 16.25, + "learning_rate": 6.309215262505191e-06, + "loss": 1.5117475986480713, + "step": 7878 + }, + { + "epoch": 1.4343314826613272, + "grad_norm": 10.3125, + "learning_rate": 6.307687527771798e-06, + "loss": 1.3734525442123413, + "step": 7880 + }, + { + "epoch": 1.4346955492855193, + "grad_norm": 16.0, + "learning_rate": 6.306159747886505e-06, + "loss": 1.3569352626800537, + "step": 7882 + }, + { + "epoch": 1.4350596159097115, + "grad_norm": 12.0625, + "learning_rate": 6.304631923073512e-06, + "loss": 1.6434990167617798, + "step": 7884 + }, + { + "epoch": 1.4354236825339037, + "grad_norm": 21.0, + "learning_rate": 6.303104053557019e-06, + "loss": 1.9740383625030518, + "step": 7886 + }, + { + "epoch": 1.435787749158096, + "grad_norm": 10.1875, + "learning_rate": 6.301576139561232e-06, + "loss": 1.5621310472488403, + "step": 7888 + }, + { + "epoch": 1.4361518157822881, + "grad_norm": 15.4375, + "learning_rate": 6.300048181310372e-06, + "loss": 1.3611433506011963, + "step": 7890 + }, + { + "epoch": 1.4365158824064803, + "grad_norm": 10.5625, + "learning_rate": 6.2985201790286555e-06, + "loss": 1.4110527038574219, + "step": 7892 + }, + { + "epoch": 1.4368799490306725, + "grad_norm": 18.125, + "learning_rate": 6.2969921329403115e-06, + "loss": 1.5412523746490479, + "step": 7894 + }, + { + "epoch": 1.4372440156548647, + "grad_norm": 15.5, + "learning_rate": 6.295464043269577e-06, + "loss": 1.3198649883270264, + "step": 7896 + }, + { + "epoch": 1.4376080822790571, + "grad_norm": 29.25, + "learning_rate": 6.293935910240691e-06, + "loss": 1.452551007270813, + "step": 7898 + }, + { + "epoch": 1.4379721489032493, + "grad_norm": 17.375, + "learning_rate": 6.292407734077902e-06, + "loss": 1.649959683418274, + "step": 7900 + }, + { + "epoch": 1.4383362155274415, + "grad_norm": 6.28125, + "learning_rate": 6.290879515005464e-06, + "loss": 1.087284803390503, + "step": 7902 + }, + { + "epoch": 1.4387002821516337, + "grad_norm": 11.9375, + "learning_rate": 6.289351253247634e-06, + "loss": 1.2501131296157837, + "step": 7904 + }, + { + "epoch": 1.439064348775826, + "grad_norm": 20.625, + "learning_rate": 6.287822949028682e-06, + "loss": 1.6574360132217407, + "step": 7906 + }, + { + "epoch": 1.4394284154000183, + "grad_norm": 11.6875, + "learning_rate": 6.286294602572876e-06, + "loss": 1.2489105463027954, + "step": 7908 + }, + { + "epoch": 1.4397924820242105, + "grad_norm": 11.1875, + "learning_rate": 6.284766214104499e-06, + "loss": 1.785330057144165, + "step": 7910 + }, + { + "epoch": 1.4401565486484027, + "grad_norm": 8.125, + "learning_rate": 6.283237783847836e-06, + "loss": 1.2349014282226562, + "step": 7912 + }, + { + "epoch": 1.440520615272595, + "grad_norm": 8.375, + "learning_rate": 6.281709312027173e-06, + "loss": 0.9802764058113098, + "step": 7914 + }, + { + "epoch": 1.440884681896787, + "grad_norm": 27.875, + "learning_rate": 6.280180798866811e-06, + "loss": 2.0262279510498047, + "step": 7916 + }, + { + "epoch": 1.4412487485209793, + "grad_norm": 7.8125, + "learning_rate": 6.2786522445910525e-06, + "loss": 1.2292743921279907, + "step": 7918 + }, + { + "epoch": 1.4416128151451715, + "grad_norm": 9.0, + "learning_rate": 6.277123649424207e-06, + "loss": 1.2832567691802979, + "step": 7920 + }, + { + "epoch": 1.4419768817693637, + "grad_norm": 6.3125, + "learning_rate": 6.27559501359059e-06, + "loss": 1.0234166383743286, + "step": 7922 + }, + { + "epoch": 1.442340948393556, + "grad_norm": 15.625, + "learning_rate": 6.274066337314519e-06, + "loss": 1.7633954286575317, + "step": 7924 + }, + { + "epoch": 1.4427050150177483, + "grad_norm": 6.84375, + "learning_rate": 6.2725376208203245e-06, + "loss": 1.1534686088562012, + "step": 7926 + }, + { + "epoch": 1.4430690816419405, + "grad_norm": 24.125, + "learning_rate": 6.27100886433234e-06, + "loss": 1.1233798265457153, + "step": 7928 + }, + { + "epoch": 1.4434331482661327, + "grad_norm": 10.8125, + "learning_rate": 6.269480068074902e-06, + "loss": 1.4721591472625732, + "step": 7930 + }, + { + "epoch": 1.4437972148903249, + "grad_norm": 25.625, + "learning_rate": 6.267951232272356e-06, + "loss": 1.6715428829193115, + "step": 7932 + }, + { + "epoch": 1.4441612815145173, + "grad_norm": 13.0625, + "learning_rate": 6.266422357149051e-06, + "loss": 1.6308305263519287, + "step": 7934 + }, + { + "epoch": 1.4445253481387095, + "grad_norm": 5.78125, + "learning_rate": 6.264893442929347e-06, + "loss": 1.2765209674835205, + "step": 7936 + }, + { + "epoch": 1.4448894147629017, + "grad_norm": 6.28125, + "learning_rate": 6.263364489837604e-06, + "loss": 1.2898404598236084, + "step": 7938 + }, + { + "epoch": 1.4452534813870939, + "grad_norm": 11.25, + "learning_rate": 6.261835498098189e-06, + "loss": 1.2175884246826172, + "step": 7940 + }, + { + "epoch": 1.445617548011286, + "grad_norm": 6.53125, + "learning_rate": 6.260306467935475e-06, + "loss": 1.2522536516189575, + "step": 7942 + }, + { + "epoch": 1.4459816146354783, + "grad_norm": 8.625, + "learning_rate": 6.258777399573844e-06, + "loss": 1.265086054801941, + "step": 7944 + }, + { + "epoch": 1.4463456812596704, + "grad_norm": 11.875, + "learning_rate": 6.2572482932376755e-06, + "loss": 2.0765380859375, + "step": 7946 + }, + { + "epoch": 1.4467097478838626, + "grad_norm": 8.375, + "learning_rate": 6.255719149151362e-06, + "loss": 1.2645471096038818, + "step": 7948 + }, + { + "epoch": 1.447073814508055, + "grad_norm": 8.875, + "learning_rate": 6.2541899675393015e-06, + "loss": 1.412146806716919, + "step": 7950 + }, + { + "epoch": 1.4474378811322473, + "grad_norm": 11.5, + "learning_rate": 6.252660748625894e-06, + "loss": 1.317734956741333, + "step": 7952 + }, + { + "epoch": 1.4478019477564394, + "grad_norm": 8.1875, + "learning_rate": 6.2511314926355445e-06, + "loss": 1.4104976654052734, + "step": 7954 + }, + { + "epoch": 1.4481660143806316, + "grad_norm": 12.5625, + "learning_rate": 6.249602199792667e-06, + "loss": 1.13209068775177, + "step": 7956 + }, + { + "epoch": 1.4485300810048238, + "grad_norm": 14.0, + "learning_rate": 6.248072870321677e-06, + "loss": 1.178959608078003, + "step": 7958 + }, + { + "epoch": 1.448894147629016, + "grad_norm": 61.25, + "learning_rate": 6.246543504447e-06, + "loss": 1.770942211151123, + "step": 7960 + }, + { + "epoch": 1.4492582142532084, + "grad_norm": 5.78125, + "learning_rate": 6.245014102393062e-06, + "loss": 1.1800146102905273, + "step": 7962 + }, + { + "epoch": 1.4496222808774006, + "grad_norm": 7.65625, + "learning_rate": 6.243484664384299e-06, + "loss": 1.2909984588623047, + "step": 7964 + }, + { + "epoch": 1.4499863475015928, + "grad_norm": 12.4375, + "learning_rate": 6.241955190645146e-06, + "loss": 1.340165138244629, + "step": 7966 + }, + { + "epoch": 1.450350414125785, + "grad_norm": 11.125, + "learning_rate": 6.240425681400051e-06, + "loss": 1.3229106664657593, + "step": 7968 + }, + { + "epoch": 1.4507144807499772, + "grad_norm": 17.875, + "learning_rate": 6.2388961368734604e-06, + "loss": 1.4175243377685547, + "step": 7970 + }, + { + "epoch": 1.4510785473741694, + "grad_norm": 13.4375, + "learning_rate": 6.2373665572898304e-06, + "loss": 1.316265344619751, + "step": 7972 + }, + { + "epoch": 1.4514426139983616, + "grad_norm": 7.28125, + "learning_rate": 6.235836942873619e-06, + "loss": 1.2253687381744385, + "step": 7974 + }, + { + "epoch": 1.4518066806225538, + "grad_norm": 13.9375, + "learning_rate": 6.2343072938492935e-06, + "loss": 1.4171786308288574, + "step": 7976 + }, + { + "epoch": 1.4521707472467462, + "grad_norm": 10.0625, + "learning_rate": 6.232777610441322e-06, + "loss": 1.399134635925293, + "step": 7978 + }, + { + "epoch": 1.4525348138709384, + "grad_norm": 7.6875, + "learning_rate": 6.231247892874179e-06, + "loss": 1.1127716302871704, + "step": 7980 + }, + { + "epoch": 1.4528988804951306, + "grad_norm": 12.0, + "learning_rate": 6.229718141372345e-06, + "loss": 1.4004936218261719, + "step": 7982 + }, + { + "epoch": 1.4532629471193228, + "grad_norm": 4.59375, + "learning_rate": 6.228188356160305e-06, + "loss": 1.0497952699661255, + "step": 7984 + }, + { + "epoch": 1.453627013743515, + "grad_norm": 11.125, + "learning_rate": 6.226658537462548e-06, + "loss": 1.4026964902877808, + "step": 7986 + }, + { + "epoch": 1.4539910803677074, + "grad_norm": 8.5, + "learning_rate": 6.22512868550357e-06, + "loss": 0.9647002220153809, + "step": 7988 + }, + { + "epoch": 1.4543551469918996, + "grad_norm": 33.5, + "learning_rate": 6.223598800507868e-06, + "loss": 0.4743680953979492, + "step": 7990 + }, + { + "epoch": 1.4547192136160918, + "grad_norm": 12.0, + "learning_rate": 6.22206888269995e-06, + "loss": 1.285109281539917, + "step": 7992 + }, + { + "epoch": 1.455083280240284, + "grad_norm": 17.625, + "learning_rate": 6.220538932304323e-06, + "loss": 1.426268458366394, + "step": 7994 + }, + { + "epoch": 1.4554473468644762, + "grad_norm": 49.5, + "learning_rate": 6.219008949545502e-06, + "loss": 1.6398017406463623, + "step": 7996 + }, + { + "epoch": 1.4558114134886684, + "grad_norm": 7.15625, + "learning_rate": 6.217478934648005e-06, + "loss": 1.1265734434127808, + "step": 7998 + }, + { + "epoch": 1.4561754801128606, + "grad_norm": 11.0625, + "learning_rate": 6.215948887836354e-06, + "loss": 1.5812140703201294, + "step": 8000 + }, + { + "epoch": 1.4565395467370528, + "grad_norm": 11.0, + "learning_rate": 6.214418809335081e-06, + "loss": 2.0047152042388916, + "step": 8002 + }, + { + "epoch": 1.4569036133612452, + "grad_norm": 47.75, + "learning_rate": 6.212888699368715e-06, + "loss": 1.3158432245254517, + "step": 8004 + }, + { + "epoch": 1.4572676799854374, + "grad_norm": 21.375, + "learning_rate": 6.211358558161796e-06, + "loss": 1.5836806297302246, + "step": 8006 + }, + { + "epoch": 1.4576317466096296, + "grad_norm": 16.875, + "learning_rate": 6.209828385938865e-06, + "loss": 1.3055001497268677, + "step": 8008 + }, + { + "epoch": 1.4579958132338218, + "grad_norm": 17.125, + "learning_rate": 6.2082981829244694e-06, + "loss": 1.2016761302947998, + "step": 8010 + }, + { + "epoch": 1.458359879858014, + "grad_norm": 10.375, + "learning_rate": 6.2067679493431586e-06, + "loss": 1.1596183776855469, + "step": 8012 + }, + { + "epoch": 1.4587239464822064, + "grad_norm": 6.5625, + "learning_rate": 6.205237685419489e-06, + "loss": 1.5556707382202148, + "step": 8014 + }, + { + "epoch": 1.4590880131063986, + "grad_norm": 15.625, + "learning_rate": 6.203707391378022e-06, + "loss": 1.308623194694519, + "step": 8016 + }, + { + "epoch": 1.4594520797305908, + "grad_norm": 10.9375, + "learning_rate": 6.2021770674433205e-06, + "loss": 1.185746192932129, + "step": 8018 + }, + { + "epoch": 1.459816146354783, + "grad_norm": 16.625, + "learning_rate": 6.200646713839954e-06, + "loss": 1.3245813846588135, + "step": 8020 + }, + { + "epoch": 1.4601802129789752, + "grad_norm": 16.25, + "learning_rate": 6.199116330792496e-06, + "loss": 1.0862973928451538, + "step": 8022 + }, + { + "epoch": 1.4605442796031674, + "grad_norm": 11.5, + "learning_rate": 6.197585918525522e-06, + "loss": 1.0174875259399414, + "step": 8024 + }, + { + "epoch": 1.4609083462273595, + "grad_norm": 19.125, + "learning_rate": 6.196055477263616e-06, + "loss": 1.508137822151184, + "step": 8026 + }, + { + "epoch": 1.4612724128515517, + "grad_norm": 12.6875, + "learning_rate": 6.194525007231362e-06, + "loss": 1.5162885189056396, + "step": 8028 + }, + { + "epoch": 1.461636479475744, + "grad_norm": 8.5625, + "learning_rate": 6.192994508653352e-06, + "loss": 1.4450267553329468, + "step": 8030 + }, + { + "epoch": 1.4620005460999363, + "grad_norm": 4.90625, + "learning_rate": 6.19146398175418e-06, + "loss": 1.4267585277557373, + "step": 8032 + }, + { + "epoch": 1.4623646127241285, + "grad_norm": 8.0625, + "learning_rate": 6.189933426758446e-06, + "loss": 1.2552216053009033, + "step": 8034 + }, + { + "epoch": 1.4627286793483207, + "grad_norm": 13.1875, + "learning_rate": 6.188402843890751e-06, + "loss": 1.4184731245040894, + "step": 8036 + }, + { + "epoch": 1.463092745972513, + "grad_norm": 8.4375, + "learning_rate": 6.1868722333757e-06, + "loss": 1.3659604787826538, + "step": 8038 + }, + { + "epoch": 1.4634568125967051, + "grad_norm": 13.5625, + "learning_rate": 6.1853415954379085e-06, + "loss": 1.362208366394043, + "step": 8040 + }, + { + "epoch": 1.4638208792208975, + "grad_norm": 17.375, + "learning_rate": 6.183810930301988e-06, + "loss": 1.3494126796722412, + "step": 8042 + }, + { + "epoch": 1.4641849458450897, + "grad_norm": 13.0, + "learning_rate": 6.182280238192558e-06, + "loss": 1.3620030879974365, + "step": 8044 + }, + { + "epoch": 1.464549012469282, + "grad_norm": 5.21875, + "learning_rate": 6.180749519334242e-06, + "loss": 1.3053460121154785, + "step": 8046 + }, + { + "epoch": 1.4649130790934741, + "grad_norm": 9.875, + "learning_rate": 6.179218773951667e-06, + "loss": 1.2864794731140137, + "step": 8048 + }, + { + "epoch": 1.4652771457176663, + "grad_norm": 8.9375, + "learning_rate": 6.177688002269464e-06, + "loss": 1.2111214399337769, + "step": 8050 + }, + { + "epoch": 1.4656412123418585, + "grad_norm": 21.5, + "learning_rate": 6.176157204512266e-06, + "loss": 2.0363869667053223, + "step": 8052 + }, + { + "epoch": 1.4660052789660507, + "grad_norm": 7.1875, + "learning_rate": 6.174626380904711e-06, + "loss": 1.0489566326141357, + "step": 8054 + }, + { + "epoch": 1.466369345590243, + "grad_norm": 9.25, + "learning_rate": 6.173095531671442e-06, + "loss": 1.372649073600769, + "step": 8056 + }, + { + "epoch": 1.4667334122144353, + "grad_norm": 7.1875, + "learning_rate": 6.171564657037108e-06, + "loss": 1.2606165409088135, + "step": 8058 + }, + { + "epoch": 1.4670974788386275, + "grad_norm": 8.9375, + "learning_rate": 6.170033757226353e-06, + "loss": 1.0931357145309448, + "step": 8060 + }, + { + "epoch": 1.4674615454628197, + "grad_norm": 15.8125, + "learning_rate": 6.1685028324638365e-06, + "loss": 1.344740867614746, + "step": 8062 + }, + { + "epoch": 1.467825612087012, + "grad_norm": 6.5, + "learning_rate": 6.166971882974209e-06, + "loss": 1.2903242111206055, + "step": 8064 + }, + { + "epoch": 1.468189678711204, + "grad_norm": 10.4375, + "learning_rate": 6.1654409089821364e-06, + "loss": 0.9969532489776611, + "step": 8066 + }, + { + "epoch": 1.4685537453353965, + "grad_norm": 12.625, + "learning_rate": 6.1639099107122795e-06, + "loss": 1.8316736221313477, + "step": 8068 + }, + { + "epoch": 1.4689178119595887, + "grad_norm": 26.0, + "learning_rate": 6.162378888389308e-06, + "loss": 1.5513944625854492, + "step": 8070 + }, + { + "epoch": 1.469281878583781, + "grad_norm": 6.71875, + "learning_rate": 6.1608478422378935e-06, + "loss": 1.1155644655227661, + "step": 8072 + }, + { + "epoch": 1.469645945207973, + "grad_norm": 6.75, + "learning_rate": 6.159316772482709e-06, + "loss": 1.168056607246399, + "step": 8074 + }, + { + "epoch": 1.4700100118321653, + "grad_norm": 14.4375, + "learning_rate": 6.157785679348434e-06, + "loss": 1.2045965194702148, + "step": 8076 + }, + { + "epoch": 1.4703740784563575, + "grad_norm": 17.75, + "learning_rate": 6.156254563059749e-06, + "loss": 1.296846628189087, + "step": 8078 + }, + { + "epoch": 1.4707381450805497, + "grad_norm": 25.0, + "learning_rate": 6.154723423841342e-06, + "loss": 1.0326735973358154, + "step": 8080 + }, + { + "epoch": 1.4711022117047419, + "grad_norm": 12.5625, + "learning_rate": 6.153192261917899e-06, + "loss": 1.5015560388565063, + "step": 8082 + }, + { + "epoch": 1.471466278328934, + "grad_norm": 7.03125, + "learning_rate": 6.151661077514113e-06, + "loss": 1.3092989921569824, + "step": 8084 + }, + { + "epoch": 1.4718303449531265, + "grad_norm": 29.375, + "learning_rate": 6.150129870854677e-06, + "loss": 1.1972593069076538, + "step": 8086 + }, + { + "epoch": 1.4721944115773187, + "grad_norm": 6.5, + "learning_rate": 6.148598642164292e-06, + "loss": 1.0859806537628174, + "step": 8088 + }, + { + "epoch": 1.4725584782015109, + "grad_norm": 7.4375, + "learning_rate": 6.147067391667657e-06, + "loss": 1.3672444820404053, + "step": 8090 + }, + { + "epoch": 1.472922544825703, + "grad_norm": 12.5625, + "learning_rate": 6.145536119589479e-06, + "loss": 1.3686683177947998, + "step": 8092 + }, + { + "epoch": 1.4732866114498953, + "grad_norm": 8.25, + "learning_rate": 6.144004826154466e-06, + "loss": 1.1499896049499512, + "step": 8094 + }, + { + "epoch": 1.4736506780740877, + "grad_norm": 4.25, + "learning_rate": 6.142473511587328e-06, + "loss": 1.1893773078918457, + "step": 8096 + }, + { + "epoch": 1.4740147446982799, + "grad_norm": 2.5625, + "learning_rate": 6.140942176112779e-06, + "loss": 0.8536308407783508, + "step": 8098 + }, + { + "epoch": 1.474378811322472, + "grad_norm": 13.5625, + "learning_rate": 6.139410819955538e-06, + "loss": 0.30789417028427124, + "step": 8100 + }, + { + "epoch": 1.4747428779466643, + "grad_norm": 13.9375, + "learning_rate": 6.137879443340324e-06, + "loss": 0.8554958701133728, + "step": 8102 + }, + { + "epoch": 1.4751069445708564, + "grad_norm": 7.65625, + "learning_rate": 6.136348046491859e-06, + "loss": 1.2366546392440796, + "step": 8104 + }, + { + "epoch": 1.4754710111950486, + "grad_norm": 31.625, + "learning_rate": 6.134816629634872e-06, + "loss": 1.4379687309265137, + "step": 8106 + }, + { + "epoch": 1.4758350778192408, + "grad_norm": 10.3125, + "learning_rate": 6.1332851929940904e-06, + "loss": 1.5495352745056152, + "step": 8108 + }, + { + "epoch": 1.476199144443433, + "grad_norm": 5.96875, + "learning_rate": 6.131753736794248e-06, + "loss": 1.2944271564483643, + "step": 8110 + }, + { + "epoch": 1.4765632110676254, + "grad_norm": 4.1875, + "learning_rate": 6.130222261260077e-06, + "loss": 0.8558687567710876, + "step": 8112 + }, + { + "epoch": 1.4769272776918176, + "grad_norm": 8.8125, + "learning_rate": 6.128690766616317e-06, + "loss": 1.3518931865692139, + "step": 8114 + }, + { + "epoch": 1.4772913443160098, + "grad_norm": 12.8125, + "learning_rate": 6.127159253087711e-06, + "loss": 1.358198881149292, + "step": 8116 + }, + { + "epoch": 1.477655410940202, + "grad_norm": 18.75, + "learning_rate": 6.125627720898998e-06, + "loss": 1.378767728805542, + "step": 8118 + }, + { + "epoch": 1.4780194775643942, + "grad_norm": 20.625, + "learning_rate": 6.124096170274925e-06, + "loss": 1.7746188640594482, + "step": 8120 + }, + { + "epoch": 1.4783835441885866, + "grad_norm": 10.625, + "learning_rate": 6.122564601440244e-06, + "loss": 1.2736667394638062, + "step": 8122 + }, + { + "epoch": 1.4787476108127788, + "grad_norm": 58.75, + "learning_rate": 6.121033014619704e-06, + "loss": 1.10037100315094, + "step": 8124 + }, + { + "epoch": 1.479111677436971, + "grad_norm": 9.8125, + "learning_rate": 6.119501410038059e-06, + "loss": 1.2981046438217163, + "step": 8126 + }, + { + "epoch": 1.4794757440611632, + "grad_norm": 42.0, + "learning_rate": 6.117969787920066e-06, + "loss": 1.4311425685882568, + "step": 8128 + }, + { + "epoch": 1.4798398106853554, + "grad_norm": 4.78125, + "learning_rate": 6.116438148490487e-06, + "loss": 0.9892849922180176, + "step": 8130 + }, + { + "epoch": 1.4802038773095476, + "grad_norm": 4.59375, + "learning_rate": 6.114906491974078e-06, + "loss": 0.9240947961807251, + "step": 8132 + }, + { + "epoch": 1.4805679439337398, + "grad_norm": 9.125, + "learning_rate": 6.1133748185956095e-06, + "loss": 1.1972553730010986, + "step": 8134 + }, + { + "epoch": 1.480932010557932, + "grad_norm": 8.3125, + "learning_rate": 6.111843128579846e-06, + "loss": 1.4405138492584229, + "step": 8136 + }, + { + "epoch": 1.4812960771821242, + "grad_norm": 17.125, + "learning_rate": 6.110311422151556e-06, + "loss": 1.4274497032165527, + "step": 8138 + }, + { + "epoch": 1.4816601438063166, + "grad_norm": 9.75, + "learning_rate": 6.108779699535512e-06, + "loss": 1.306797742843628, + "step": 8140 + }, + { + "epoch": 1.4820242104305088, + "grad_norm": 6.90625, + "learning_rate": 6.107247960956486e-06, + "loss": 1.5522130727767944, + "step": 8142 + }, + { + "epoch": 1.482388277054701, + "grad_norm": 39.25, + "learning_rate": 6.1057162066392595e-06, + "loss": 1.1375278234481812, + "step": 8144 + }, + { + "epoch": 1.4827523436788932, + "grad_norm": 8.125, + "learning_rate": 6.104184436808607e-06, + "loss": 1.2078139781951904, + "step": 8146 + }, + { + "epoch": 1.4831164103030854, + "grad_norm": 14.9375, + "learning_rate": 6.102652651689309e-06, + "loss": 1.3695788383483887, + "step": 8148 + }, + { + "epoch": 1.4834804769272778, + "grad_norm": 10.6875, + "learning_rate": 6.1011208515061524e-06, + "loss": 1.3216034173965454, + "step": 8150 + }, + { + "epoch": 1.48384454355147, + "grad_norm": 21.375, + "learning_rate": 6.09958903648392e-06, + "loss": 1.2552473545074463, + "step": 8152 + }, + { + "epoch": 1.4842086101756622, + "grad_norm": 17.375, + "learning_rate": 6.098057206847401e-06, + "loss": 1.4203650951385498, + "step": 8154 + }, + { + "epoch": 1.4845726767998544, + "grad_norm": 16.125, + "learning_rate": 6.096525362821382e-06, + "loss": 1.2568223476409912, + "step": 8156 + }, + { + "epoch": 1.4849367434240466, + "grad_norm": 14.9375, + "learning_rate": 6.094993504630659e-06, + "loss": 1.511087417602539, + "step": 8158 + }, + { + "epoch": 1.4853008100482388, + "grad_norm": 51.5, + "learning_rate": 6.093461632500024e-06, + "loss": 1.5030088424682617, + "step": 8160 + }, + { + "epoch": 1.485664876672431, + "grad_norm": 19.25, + "learning_rate": 6.091929746654273e-06, + "loss": 1.551119327545166, + "step": 8162 + }, + { + "epoch": 1.4860289432966232, + "grad_norm": 14.9375, + "learning_rate": 6.090397847318204e-06, + "loss": 1.4594221115112305, + "step": 8164 + }, + { + "epoch": 1.4863930099208156, + "grad_norm": 14.6875, + "learning_rate": 6.088865934716617e-06, + "loss": 1.4273381233215332, + "step": 8166 + }, + { + "epoch": 1.4867570765450078, + "grad_norm": 11.0625, + "learning_rate": 6.087334009074315e-06, + "loss": 1.3513221740722656, + "step": 8168 + }, + { + "epoch": 1.4871211431692, + "grad_norm": 9.375, + "learning_rate": 6.0858020706161e-06, + "loss": 1.3581629991531372, + "step": 8170 + }, + { + "epoch": 1.4874852097933922, + "grad_norm": 12.625, + "learning_rate": 6.0842701195667794e-06, + "loss": 1.2398608922958374, + "step": 8172 + }, + { + "epoch": 1.4878492764175844, + "grad_norm": 12.5, + "learning_rate": 6.082738156151161e-06, + "loss": 1.2210054397583008, + "step": 8174 + }, + { + "epoch": 1.4882133430417768, + "grad_norm": 22.375, + "learning_rate": 6.0812061805940525e-06, + "loss": 0.874728798866272, + "step": 8176 + }, + { + "epoch": 1.488577409665969, + "grad_norm": 7.40625, + "learning_rate": 6.079674193120266e-06, + "loss": 1.320002555847168, + "step": 8178 + }, + { + "epoch": 1.4889414762901612, + "grad_norm": 13.875, + "learning_rate": 6.078142193954615e-06, + "loss": 1.4195284843444824, + "step": 8180 + }, + { + "epoch": 1.4893055429143534, + "grad_norm": 33.25, + "learning_rate": 6.076610183321914e-06, + "loss": 1.6974737644195557, + "step": 8182 + }, + { + "epoch": 1.4896696095385455, + "grad_norm": 14.5625, + "learning_rate": 6.075078161446979e-06, + "loss": 1.4675984382629395, + "step": 8184 + }, + { + "epoch": 1.4900336761627377, + "grad_norm": 20.25, + "learning_rate": 6.073546128554628e-06, + "loss": 1.2003103494644165, + "step": 8186 + }, + { + "epoch": 1.49039774278693, + "grad_norm": 19.875, + "learning_rate": 6.072014084869682e-06, + "loss": 0.804355263710022, + "step": 8188 + }, + { + "epoch": 1.4907618094111221, + "grad_norm": 16.625, + "learning_rate": 6.0704820306169584e-06, + "loss": 1.2440123558044434, + "step": 8190 + }, + { + "epoch": 1.4911258760353143, + "grad_norm": 9.9375, + "learning_rate": 6.068949966021285e-06, + "loss": 1.0877405405044556, + "step": 8192 + }, + { + "epoch": 1.4914899426595067, + "grad_norm": 7.46875, + "learning_rate": 6.067417891307481e-06, + "loss": 1.4059909582138062, + "step": 8194 + }, + { + "epoch": 1.491854009283699, + "grad_norm": 8.375, + "learning_rate": 6.065885806700375e-06, + "loss": 1.3072524070739746, + "step": 8196 + }, + { + "epoch": 1.4922180759078911, + "grad_norm": 9.6875, + "learning_rate": 6.064353712424795e-06, + "loss": 1.1673481464385986, + "step": 8198 + }, + { + "epoch": 1.4925821425320833, + "grad_norm": 10.625, + "learning_rate": 6.062821608705568e-06, + "loss": 1.4568266868591309, + "step": 8200 + }, + { + "epoch": 1.4929462091562755, + "grad_norm": 42.5, + "learning_rate": 6.061289495767525e-06, + "loss": 1.3883720636367798, + "step": 8202 + }, + { + "epoch": 1.493310275780468, + "grad_norm": 6.78125, + "learning_rate": 6.0597573738354975e-06, + "loss": 1.1405024528503418, + "step": 8204 + }, + { + "epoch": 1.4936743424046601, + "grad_norm": 9.8125, + "learning_rate": 6.058225243134315e-06, + "loss": 1.4551286697387695, + "step": 8206 + }, + { + "epoch": 1.4940384090288523, + "grad_norm": 20.0, + "learning_rate": 6.056693103888816e-06, + "loss": 1.2114101648330688, + "step": 8208 + }, + { + "epoch": 1.4944024756530445, + "grad_norm": 10.375, + "learning_rate": 6.055160956323834e-06, + "loss": 0.6524553894996643, + "step": 8210 + }, + { + "epoch": 1.4947665422772367, + "grad_norm": 22.0, + "learning_rate": 6.0536288006642045e-06, + "loss": 1.457849144935608, + "step": 8212 + }, + { + "epoch": 1.495130608901429, + "grad_norm": 15.4375, + "learning_rate": 6.052096637134766e-06, + "loss": 1.4925663471221924, + "step": 8214 + }, + { + "epoch": 1.495494675525621, + "grad_norm": 7.125, + "learning_rate": 6.050564465960357e-06, + "loss": 1.1473684310913086, + "step": 8216 + }, + { + "epoch": 1.4958587421498133, + "grad_norm": 16.75, + "learning_rate": 6.0490322873658165e-06, + "loss": 1.2548805475234985, + "step": 8218 + }, + { + "epoch": 1.4962228087740057, + "grad_norm": 10.0625, + "learning_rate": 6.0475001015759896e-06, + "loss": 1.2143735885620117, + "step": 8220 + }, + { + "epoch": 1.496586875398198, + "grad_norm": 15.375, + "learning_rate": 6.045967908815713e-06, + "loss": 1.7000795602798462, + "step": 8222 + }, + { + "epoch": 1.49695094202239, + "grad_norm": 15.9375, + "learning_rate": 6.044435709309833e-06, + "loss": 1.3217474222183228, + "step": 8224 + }, + { + "epoch": 1.4973150086465823, + "grad_norm": 8.9375, + "learning_rate": 6.042903503283191e-06, + "loss": 1.2151466608047485, + "step": 8226 + }, + { + "epoch": 1.4976790752707745, + "grad_norm": 8.75, + "learning_rate": 6.041371290960635e-06, + "loss": 1.0743054151535034, + "step": 8228 + }, + { + "epoch": 1.498043141894967, + "grad_norm": 8.125, + "learning_rate": 6.039839072567009e-06, + "loss": 1.4411975145339966, + "step": 8230 + }, + { + "epoch": 1.498407208519159, + "grad_norm": 8.8125, + "learning_rate": 6.038306848327162e-06, + "loss": 1.1911547183990479, + "step": 8232 + }, + { + "epoch": 1.4987712751433513, + "grad_norm": 14.5, + "learning_rate": 6.036774618465939e-06, + "loss": 1.4692813158035278, + "step": 8234 + }, + { + "epoch": 1.4991353417675435, + "grad_norm": 24.625, + "learning_rate": 6.035242383208191e-06, + "loss": 1.646880865097046, + "step": 8236 + }, + { + "epoch": 1.4994994083917357, + "grad_norm": 12.5625, + "learning_rate": 6.033710142778765e-06, + "loss": 1.198011040687561, + "step": 8238 + }, + { + "epoch": 1.4998634750159279, + "grad_norm": 16.25, + "learning_rate": 6.032177897402511e-06, + "loss": 1.742721676826477, + "step": 8240 + }, + { + "epoch": 1.50022754164012, + "grad_norm": 19.0, + "learning_rate": 6.030645647304283e-06, + "loss": 1.8102439641952515, + "step": 8242 + }, + { + "epoch": 1.5005916082643123, + "grad_norm": 11.5, + "learning_rate": 6.029113392708931e-06, + "loss": 1.396047592163086, + "step": 8244 + }, + { + "epoch": 1.5009556748885045, + "grad_norm": 10.0625, + "learning_rate": 6.027581133841305e-06, + "loss": 1.1110010147094727, + "step": 8246 + }, + { + "epoch": 1.5013197415126969, + "grad_norm": 29.375, + "learning_rate": 6.02604887092626e-06, + "loss": 1.4422245025634766, + "step": 8248 + }, + { + "epoch": 1.501683808136889, + "grad_norm": 18.75, + "learning_rate": 6.024516604188648e-06, + "loss": 1.5606341361999512, + "step": 8250 + }, + { + "epoch": 1.5020478747610813, + "grad_norm": 16.0, + "learning_rate": 6.022984333853324e-06, + "loss": 1.6321194171905518, + "step": 8252 + }, + { + "epoch": 1.5024119413852735, + "grad_norm": 13.6875, + "learning_rate": 6.021452060145143e-06, + "loss": 1.39895761013031, + "step": 8254 + }, + { + "epoch": 1.5027760080094659, + "grad_norm": 8.6875, + "learning_rate": 6.0199197832889585e-06, + "loss": 1.056370735168457, + "step": 8256 + }, + { + "epoch": 1.503140074633658, + "grad_norm": 12.8125, + "learning_rate": 6.0183875035096275e-06, + "loss": 1.200910210609436, + "step": 8258 + }, + { + "epoch": 1.5035041412578503, + "grad_norm": 7.59375, + "learning_rate": 6.016855221032003e-06, + "loss": 1.2097914218902588, + "step": 8260 + }, + { + "epoch": 1.5038682078820425, + "grad_norm": 16.875, + "learning_rate": 6.015322936080945e-06, + "loss": 1.2440482378005981, + "step": 8262 + }, + { + "epoch": 1.5042322745062346, + "grad_norm": 18.125, + "learning_rate": 6.013790648881307e-06, + "loss": 1.0889358520507812, + "step": 8264 + }, + { + "epoch": 1.5045963411304268, + "grad_norm": 16.75, + "learning_rate": 6.0122583596579475e-06, + "loss": 1.4096873998641968, + "step": 8266 + }, + { + "epoch": 1.504960407754619, + "grad_norm": 12.9375, + "learning_rate": 6.010726068635724e-06, + "loss": 1.7448532581329346, + "step": 8268 + }, + { + "epoch": 1.5053244743788112, + "grad_norm": 13.8125, + "learning_rate": 6.009193776039492e-06, + "loss": 1.354616641998291, + "step": 8270 + }, + { + "epoch": 1.5056885410030034, + "grad_norm": 9.9375, + "learning_rate": 6.007661482094111e-06, + "loss": 0.9735084176063538, + "step": 8272 + }, + { + "epoch": 1.5060526076271956, + "grad_norm": 32.75, + "learning_rate": 6.0061291870244395e-06, + "loss": 1.3060473203659058, + "step": 8274 + }, + { + "epoch": 1.506416674251388, + "grad_norm": 24.125, + "learning_rate": 6.004596891055334e-06, + "loss": 1.4042834043502808, + "step": 8276 + }, + { + "epoch": 1.5067807408755802, + "grad_norm": 9.1875, + "learning_rate": 6.003064594411654e-06, + "loss": 1.3706773519515991, + "step": 8278 + }, + { + "epoch": 1.5071448074997724, + "grad_norm": 7.09375, + "learning_rate": 6.001532297318258e-06, + "loss": 1.087712287902832, + "step": 8280 + }, + { + "epoch": 1.5075088741239648, + "grad_norm": 8.625, + "learning_rate": 6.000000000000001e-06, + "loss": 1.2260565757751465, + "step": 8282 + }, + { + "epoch": 1.507872940748157, + "grad_norm": 16.875, + "learning_rate": 5.998467702681745e-06, + "loss": 1.477408528327942, + "step": 8284 + }, + { + "epoch": 1.5082370073723492, + "grad_norm": 9.3125, + "learning_rate": 5.996935405588348e-06, + "loss": 1.5100369453430176, + "step": 8286 + }, + { + "epoch": 1.5086010739965414, + "grad_norm": 5.96875, + "learning_rate": 5.995403108944667e-06, + "loss": 1.24753999710083, + "step": 8288 + }, + { + "epoch": 1.5089651406207336, + "grad_norm": 9.5625, + "learning_rate": 5.993870812975563e-06, + "loss": 1.4215635061264038, + "step": 8290 + }, + { + "epoch": 1.5093292072449258, + "grad_norm": 8.9375, + "learning_rate": 5.99233851790589e-06, + "loss": 1.2854349613189697, + "step": 8292 + }, + { + "epoch": 1.509693273869118, + "grad_norm": 7.34375, + "learning_rate": 5.990806223960508e-06, + "loss": 1.1699168682098389, + "step": 8294 + }, + { + "epoch": 1.5100573404933102, + "grad_norm": 21.875, + "learning_rate": 5.989273931364279e-06, + "loss": 0.8238331079483032, + "step": 8296 + }, + { + "epoch": 1.5104214071175024, + "grad_norm": 11.125, + "learning_rate": 5.9877416403420545e-06, + "loss": 1.2738925218582153, + "step": 8298 + }, + { + "epoch": 1.5107854737416946, + "grad_norm": 14.4375, + "learning_rate": 5.986209351118696e-06, + "loss": 1.4510915279388428, + "step": 8300 + }, + { + "epoch": 1.511149540365887, + "grad_norm": 25.0, + "learning_rate": 5.984677063919058e-06, + "loss": 1.7367358207702637, + "step": 8302 + }, + { + "epoch": 1.5115136069900792, + "grad_norm": 9.0, + "learning_rate": 5.983144778967998e-06, + "loss": 1.017667531967163, + "step": 8304 + }, + { + "epoch": 1.5118776736142714, + "grad_norm": 8.1875, + "learning_rate": 5.9816124964903745e-06, + "loss": 1.3267130851745605, + "step": 8306 + }, + { + "epoch": 1.5122417402384638, + "grad_norm": 8.6875, + "learning_rate": 5.980080216711043e-06, + "loss": 0.8771546483039856, + "step": 8308 + }, + { + "epoch": 1.512605806862656, + "grad_norm": 15.5, + "learning_rate": 5.9785479398548595e-06, + "loss": 0.9681273698806763, + "step": 8310 + }, + { + "epoch": 1.5129698734868482, + "grad_norm": 15.6875, + "learning_rate": 5.977015666146677e-06, + "loss": 1.7618441581726074, + "step": 8312 + }, + { + "epoch": 1.5133339401110404, + "grad_norm": 20.25, + "learning_rate": 5.975483395811352e-06, + "loss": 1.6499402523040771, + "step": 8314 + }, + { + "epoch": 1.5136980067352326, + "grad_norm": 17.5, + "learning_rate": 5.973951129073743e-06, + "loss": 2.0083208084106445, + "step": 8316 + }, + { + "epoch": 1.5140620733594248, + "grad_norm": 15.0, + "learning_rate": 5.972418866158697e-06, + "loss": 1.960012435913086, + "step": 8318 + }, + { + "epoch": 1.514426139983617, + "grad_norm": 20.25, + "learning_rate": 5.970886607291073e-06, + "loss": 1.7332077026367188, + "step": 8320 + }, + { + "epoch": 1.5147902066078092, + "grad_norm": 25.625, + "learning_rate": 5.969354352695718e-06, + "loss": 1.5385401248931885, + "step": 8322 + }, + { + "epoch": 1.5151542732320014, + "grad_norm": 34.5, + "learning_rate": 5.967822102597489e-06, + "loss": 0.5154561400413513, + "step": 8324 + }, + { + "epoch": 1.5155183398561936, + "grad_norm": 23.125, + "learning_rate": 5.966289857221237e-06, + "loss": 1.3367457389831543, + "step": 8326 + }, + { + "epoch": 1.5158824064803857, + "grad_norm": 11.0, + "learning_rate": 5.964757616791812e-06, + "loss": 1.3716245889663696, + "step": 8328 + }, + { + "epoch": 1.5162464731045782, + "grad_norm": 7.5, + "learning_rate": 5.963225381534063e-06, + "loss": 1.4495644569396973, + "step": 8330 + }, + { + "epoch": 1.5166105397287704, + "grad_norm": 16.0, + "learning_rate": 5.96169315167284e-06, + "loss": 1.4499701261520386, + "step": 8332 + }, + { + "epoch": 1.5169746063529626, + "grad_norm": 31.625, + "learning_rate": 5.960160927432992e-06, + "loss": 0.8963940143585205, + "step": 8334 + }, + { + "epoch": 1.517338672977155, + "grad_norm": 12.4375, + "learning_rate": 5.9586287090393666e-06, + "loss": 1.534785509109497, + "step": 8336 + }, + { + "epoch": 1.5177027396013472, + "grad_norm": 6.03125, + "learning_rate": 5.957096496716811e-06, + "loss": 1.3165816068649292, + "step": 8338 + }, + { + "epoch": 1.5180668062255394, + "grad_norm": 7.375, + "learning_rate": 5.955564290690171e-06, + "loss": 1.3977460861206055, + "step": 8340 + }, + { + "epoch": 1.5184308728497315, + "grad_norm": 11.1875, + "learning_rate": 5.9540320911842895e-06, + "loss": 0.9736803770065308, + "step": 8342 + }, + { + "epoch": 1.5187949394739237, + "grad_norm": 20.125, + "learning_rate": 5.9524998984240124e-06, + "loss": 1.3769147396087646, + "step": 8344 + }, + { + "epoch": 1.519159006098116, + "grad_norm": 41.0, + "learning_rate": 5.950967712634185e-06, + "loss": 1.4024406671524048, + "step": 8346 + }, + { + "epoch": 1.5195230727223081, + "grad_norm": 5.15625, + "learning_rate": 5.949435534039645e-06, + "loss": 0.9985767602920532, + "step": 8348 + }, + { + "epoch": 1.5198871393465003, + "grad_norm": 15.5625, + "learning_rate": 5.947903362865237e-06, + "loss": 1.3010436296463013, + "step": 8350 + }, + { + "epoch": 1.5202512059706925, + "grad_norm": 21.75, + "learning_rate": 5.9463711993357975e-06, + "loss": 1.0750222206115723, + "step": 8352 + }, + { + "epoch": 1.5206152725948847, + "grad_norm": 22.625, + "learning_rate": 5.9448390436761674e-06, + "loss": 1.5535142421722412, + "step": 8354 + }, + { + "epoch": 1.5209793392190771, + "grad_norm": 14.8125, + "learning_rate": 5.943306896111185e-06, + "loss": 1.4182324409484863, + "step": 8356 + }, + { + "epoch": 1.5213434058432693, + "grad_norm": 26.25, + "learning_rate": 5.941774756865686e-06, + "loss": 1.4964677095413208, + "step": 8358 + }, + { + "epoch": 1.5217074724674615, + "grad_norm": 12.625, + "learning_rate": 5.940242626164506e-06, + "loss": 1.5679097175598145, + "step": 8360 + }, + { + "epoch": 1.522071539091654, + "grad_norm": 13.625, + "learning_rate": 5.938710504232476e-06, + "loss": 1.6520633697509766, + "step": 8362 + }, + { + "epoch": 1.5224356057158461, + "grad_norm": 22.625, + "learning_rate": 5.937178391294433e-06, + "loss": 1.4216349124908447, + "step": 8364 + }, + { + "epoch": 1.5227996723400383, + "grad_norm": 17.25, + "learning_rate": 5.935646287575208e-06, + "loss": 1.4336011409759521, + "step": 8366 + }, + { + "epoch": 1.5231637389642305, + "grad_norm": 8.3125, + "learning_rate": 5.9341141932996275e-06, + "loss": 1.4558292627334595, + "step": 8368 + }, + { + "epoch": 1.5235278055884227, + "grad_norm": 36.25, + "learning_rate": 5.932582108692522e-06, + "loss": 1.4104833602905273, + "step": 8370 + }, + { + "epoch": 1.523891872212615, + "grad_norm": 17.25, + "learning_rate": 5.931050033978718e-06, + "loss": 1.018526315689087, + "step": 8372 + }, + { + "epoch": 1.524255938836807, + "grad_norm": 12.0625, + "learning_rate": 5.929517969383043e-06, + "loss": 1.8626515865325928, + "step": 8374 + }, + { + "epoch": 1.5246200054609993, + "grad_norm": 7.75, + "learning_rate": 5.927985915130321e-06, + "loss": 1.7673389911651611, + "step": 8376 + }, + { + "epoch": 1.5249840720851915, + "grad_norm": 14.375, + "learning_rate": 5.926453871445373e-06, + "loss": 1.1028656959533691, + "step": 8378 + }, + { + "epoch": 1.5253481387093837, + "grad_norm": 29.625, + "learning_rate": 5.924921838553024e-06, + "loss": 1.423374056816101, + "step": 8380 + }, + { + "epoch": 1.525712205333576, + "grad_norm": 22.25, + "learning_rate": 5.9233898166780865e-06, + "loss": 1.524986743927002, + "step": 8382 + }, + { + "epoch": 1.5260762719577683, + "grad_norm": 28.625, + "learning_rate": 5.921857806045385e-06, + "loss": 1.250152826309204, + "step": 8384 + }, + { + "epoch": 1.5264403385819605, + "grad_norm": 44.0, + "learning_rate": 5.920325806879736e-06, + "loss": 0.9956077933311462, + "step": 8386 + }, + { + "epoch": 1.5268044052061527, + "grad_norm": 161.0, + "learning_rate": 5.918793819405949e-06, + "loss": 0.8774659037590027, + "step": 8388 + }, + { + "epoch": 1.527168471830345, + "grad_norm": 5.6875, + "learning_rate": 5.917261843848843e-06, + "loss": 1.2939871549606323, + "step": 8390 + }, + { + "epoch": 1.5275325384545373, + "grad_norm": 4.59375, + "learning_rate": 5.915729880433223e-06, + "loss": 1.26493501663208, + "step": 8392 + }, + { + "epoch": 1.5278966050787295, + "grad_norm": 17.0, + "learning_rate": 5.9141979293839e-06, + "loss": 1.4402207136154175, + "step": 8394 + }, + { + "epoch": 1.5282606717029217, + "grad_norm": 22.625, + "learning_rate": 5.912665990925688e-06, + "loss": 1.4345242977142334, + "step": 8396 + }, + { + "epoch": 1.5286247383271139, + "grad_norm": 21.25, + "learning_rate": 5.9111340652833844e-06, + "loss": 1.6565146446228027, + "step": 8398 + }, + { + "epoch": 1.528988804951306, + "grad_norm": 32.25, + "learning_rate": 5.909602152681799e-06, + "loss": 1.5919088125228882, + "step": 8400 + }, + { + "epoch": 1.5293528715754983, + "grad_norm": 20.75, + "learning_rate": 5.90807025334573e-06, + "loss": 1.845825433731079, + "step": 8402 + }, + { + "epoch": 1.5297169381996905, + "grad_norm": 10.9375, + "learning_rate": 5.906538367499977e-06, + "loss": 1.4942909479141235, + "step": 8404 + }, + { + "epoch": 1.5300810048238827, + "grad_norm": 12.8125, + "learning_rate": 5.905006495369343e-06, + "loss": 1.358984351158142, + "step": 8406 + }, + { + "epoch": 1.5304450714480748, + "grad_norm": 7.5625, + "learning_rate": 5.903474637178619e-06, + "loss": 1.354823350906372, + "step": 8408 + }, + { + "epoch": 1.5308091380722673, + "grad_norm": 15.6875, + "learning_rate": 5.901942793152603e-06, + "loss": 1.3463715314865112, + "step": 8410 + }, + { + "epoch": 1.5311732046964595, + "grad_norm": 14.3125, + "learning_rate": 5.9004109635160814e-06, + "loss": 1.397836446762085, + "step": 8412 + }, + { + "epoch": 1.5315372713206516, + "grad_norm": 7.46875, + "learning_rate": 5.898879148493848e-06, + "loss": 1.4281895160675049, + "step": 8414 + }, + { + "epoch": 1.531901337944844, + "grad_norm": 14.625, + "learning_rate": 5.897347348310691e-06, + "loss": 1.494858741760254, + "step": 8416 + }, + { + "epoch": 1.5322654045690363, + "grad_norm": 13.125, + "learning_rate": 5.895815563191396e-06, + "loss": 1.4905179738998413, + "step": 8418 + }, + { + "epoch": 1.5326294711932285, + "grad_norm": 2.828125, + "learning_rate": 5.894283793360744e-06, + "loss": 0.8018645644187927, + "step": 8420 + }, + { + "epoch": 1.5329935378174206, + "grad_norm": 19.75, + "learning_rate": 5.8927520390435145e-06, + "loss": 0.3094245493412018, + "step": 8422 + }, + { + "epoch": 1.5333576044416128, + "grad_norm": 49.0, + "learning_rate": 5.89122030046449e-06, + "loss": 0.5090566277503967, + "step": 8424 + }, + { + "epoch": 1.533721671065805, + "grad_norm": 20.5, + "learning_rate": 5.889688577848447e-06, + "loss": 0.6525697708129883, + "step": 8426 + }, + { + "epoch": 1.5340857376899972, + "grad_norm": 9.125, + "learning_rate": 5.888156871420157e-06, + "loss": 1.0942453145980835, + "step": 8428 + }, + { + "epoch": 1.5344498043141894, + "grad_norm": 11.6875, + "learning_rate": 5.886625181404393e-06, + "loss": 1.3017857074737549, + "step": 8430 + }, + { + "epoch": 1.5348138709383816, + "grad_norm": 18.375, + "learning_rate": 5.885093508025923e-06, + "loss": 1.3903698921203613, + "step": 8432 + }, + { + "epoch": 1.5351779375625738, + "grad_norm": 11.6875, + "learning_rate": 5.883561851509515e-06, + "loss": 1.4669889211654663, + "step": 8434 + }, + { + "epoch": 1.5355420041867662, + "grad_norm": 15.0625, + "learning_rate": 5.882030212079936e-06, + "loss": 1.559173583984375, + "step": 8436 + }, + { + "epoch": 1.5359060708109584, + "grad_norm": 8.75, + "learning_rate": 5.880498589961943e-06, + "loss": 1.368179202079773, + "step": 8438 + }, + { + "epoch": 1.5362701374351506, + "grad_norm": 9.375, + "learning_rate": 5.878966985380299e-06, + "loss": 1.5964009761810303, + "step": 8440 + }, + { + "epoch": 1.5366342040593428, + "grad_norm": 8.5, + "learning_rate": 5.877435398559759e-06, + "loss": 1.2057523727416992, + "step": 8442 + }, + { + "epoch": 1.5369982706835352, + "grad_norm": 14.5, + "learning_rate": 5.875903829725076e-06, + "loss": 1.5160874128341675, + "step": 8444 + }, + { + "epoch": 1.5373623373077274, + "grad_norm": 8.5625, + "learning_rate": 5.8743722791010036e-06, + "loss": 1.3408002853393555, + "step": 8446 + }, + { + "epoch": 1.5377264039319196, + "grad_norm": 40.25, + "learning_rate": 5.872840746912292e-06, + "loss": 1.550868034362793, + "step": 8448 + }, + { + "epoch": 1.5380904705561118, + "grad_norm": 22.125, + "learning_rate": 5.871309233383684e-06, + "loss": 1.6083693504333496, + "step": 8450 + }, + { + "epoch": 1.538454537180304, + "grad_norm": 6.71875, + "learning_rate": 5.869777738739924e-06, + "loss": 1.4470646381378174, + "step": 8452 + }, + { + "epoch": 1.5388186038044962, + "grad_norm": 6.21875, + "learning_rate": 5.868246263205753e-06, + "loss": 1.283869743347168, + "step": 8454 + }, + { + "epoch": 1.5391826704286884, + "grad_norm": 11.0, + "learning_rate": 5.866714807005911e-06, + "loss": 1.3350982666015625, + "step": 8456 + }, + { + "epoch": 1.5395467370528806, + "grad_norm": 8.8125, + "learning_rate": 5.86518337036513e-06, + "loss": 1.3710887432098389, + "step": 8458 + }, + { + "epoch": 1.5399108036770728, + "grad_norm": 13.3125, + "learning_rate": 5.8636519535081435e-06, + "loss": 1.0875675678253174, + "step": 8460 + }, + { + "epoch": 1.540274870301265, + "grad_norm": 20.375, + "learning_rate": 5.862120556659678e-06, + "loss": 1.2588577270507812, + "step": 8462 + }, + { + "epoch": 1.5406389369254574, + "grad_norm": 27.5, + "learning_rate": 5.860589180044463e-06, + "loss": 1.671203851699829, + "step": 8464 + }, + { + "epoch": 1.5410030035496496, + "grad_norm": 13.6875, + "learning_rate": 5.859057823887222e-06, + "loss": 1.5473215579986572, + "step": 8466 + }, + { + "epoch": 1.5413670701738418, + "grad_norm": 23.125, + "learning_rate": 5.857526488412675e-06, + "loss": 1.7330286502838135, + "step": 8468 + }, + { + "epoch": 1.5417311367980342, + "grad_norm": 6.21875, + "learning_rate": 5.855995173845537e-06, + "loss": 1.4392908811569214, + "step": 8470 + }, + { + "epoch": 1.5420952034222264, + "grad_norm": 12.5, + "learning_rate": 5.854463880410523e-06, + "loss": 1.416183590888977, + "step": 8472 + }, + { + "epoch": 1.5424592700464186, + "grad_norm": 13.25, + "learning_rate": 5.852932608332344e-06, + "loss": 1.4640228748321533, + "step": 8474 + }, + { + "epoch": 1.5428233366706108, + "grad_norm": 18.375, + "learning_rate": 5.851401357835711e-06, + "loss": 1.5139390230178833, + "step": 8476 + }, + { + "epoch": 1.543187403294803, + "grad_norm": 30.125, + "learning_rate": 5.8498701291453255e-06, + "loss": 1.9712140560150146, + "step": 8478 + }, + { + "epoch": 1.5435514699189952, + "grad_norm": 300.0, + "learning_rate": 5.848338922485891e-06, + "loss": 1.7092909812927246, + "step": 8480 + }, + { + "epoch": 1.5439155365431874, + "grad_norm": 9.25, + "learning_rate": 5.846807738082103e-06, + "loss": 1.1048921346664429, + "step": 8482 + }, + { + "epoch": 1.5442796031673796, + "grad_norm": 8.1875, + "learning_rate": 5.84527657615866e-06, + "loss": 1.4587063789367676, + "step": 8484 + }, + { + "epoch": 1.5446436697915717, + "grad_norm": 8.4375, + "learning_rate": 5.843745436940252e-06, + "loss": 1.3519057035446167, + "step": 8486 + }, + { + "epoch": 1.545007736415764, + "grad_norm": 11.875, + "learning_rate": 5.842214320651569e-06, + "loss": 1.1182470321655273, + "step": 8488 + }, + { + "epoch": 1.5453718030399564, + "grad_norm": 11.125, + "learning_rate": 5.840683227517294e-06, + "loss": 0.3271283507347107, + "step": 8490 + }, + { + "epoch": 1.5457358696641486, + "grad_norm": 9.6875, + "learning_rate": 5.839152157762109e-06, + "loss": 1.103482961654663, + "step": 8492 + }, + { + "epoch": 1.5460999362883407, + "grad_norm": 16.0, + "learning_rate": 5.837621111610693e-06, + "loss": 1.3002212047576904, + "step": 8494 + }, + { + "epoch": 1.546464002912533, + "grad_norm": 7.3125, + "learning_rate": 5.8360900892877225e-06, + "loss": 1.3260561227798462, + "step": 8496 + }, + { + "epoch": 1.5468280695367254, + "grad_norm": 9.375, + "learning_rate": 5.834559091017866e-06, + "loss": 1.5359067916870117, + "step": 8498 + }, + { + "epoch": 1.5471921361609176, + "grad_norm": 16.5, + "learning_rate": 5.833028117025794e-06, + "loss": 1.2190123796463013, + "step": 8500 + }, + { + "epoch": 1.5475562027851097, + "grad_norm": 14.625, + "learning_rate": 5.831497167536166e-06, + "loss": 0.8445422649383545, + "step": 8502 + }, + { + "epoch": 1.547920269409302, + "grad_norm": 6.46875, + "learning_rate": 5.829966242773647e-06, + "loss": 1.2735350131988525, + "step": 8504 + }, + { + "epoch": 1.5482843360334941, + "grad_norm": 10.625, + "learning_rate": 5.828435342962895e-06, + "loss": 1.376702070236206, + "step": 8506 + }, + { + "epoch": 1.5486484026576863, + "grad_norm": 6.0, + "learning_rate": 5.826904468328558e-06, + "loss": 1.2295076847076416, + "step": 8508 + }, + { + "epoch": 1.5490124692818785, + "grad_norm": 20.5, + "learning_rate": 5.8253736190952915e-06, + "loss": 1.2841095924377441, + "step": 8510 + }, + { + "epoch": 1.5493765359060707, + "grad_norm": 10.875, + "learning_rate": 5.823842795487737e-06, + "loss": 1.907101035118103, + "step": 8512 + }, + { + "epoch": 1.549740602530263, + "grad_norm": 7.5625, + "learning_rate": 5.822311997730538e-06, + "loss": 1.339274525642395, + "step": 8514 + }, + { + "epoch": 1.550104669154455, + "grad_norm": 7.71875, + "learning_rate": 5.820781226048336e-06, + "loss": 1.4191474914550781, + "step": 8516 + }, + { + "epoch": 1.5504687357786475, + "grad_norm": 10.3125, + "learning_rate": 5.819250480665759e-06, + "loss": 1.308241844177246, + "step": 8518 + }, + { + "epoch": 1.5508328024028397, + "grad_norm": 14.4375, + "learning_rate": 5.817719761807445e-06, + "loss": 1.6076023578643799, + "step": 8520 + }, + { + "epoch": 1.551196869027032, + "grad_norm": 6.53125, + "learning_rate": 5.816189069698015e-06, + "loss": 1.2995057106018066, + "step": 8522 + }, + { + "epoch": 1.5515609356512243, + "grad_norm": 8.25, + "learning_rate": 5.814658404562093e-06, + "loss": 1.04618501663208, + "step": 8524 + }, + { + "epoch": 1.5519250022754165, + "grad_norm": 9.0625, + "learning_rate": 5.813127766624301e-06, + "loss": 1.06477952003479, + "step": 8526 + }, + { + "epoch": 1.5522890688996087, + "grad_norm": 14.8125, + "learning_rate": 5.811597156109252e-06, + "loss": 1.6031912565231323, + "step": 8528 + }, + { + "epoch": 1.552653135523801, + "grad_norm": 31.375, + "learning_rate": 5.810066573241557e-06, + "loss": 1.568178653717041, + "step": 8530 + }, + { + "epoch": 1.553017202147993, + "grad_norm": 10.9375, + "learning_rate": 5.80853601824582e-06, + "loss": 1.361877679824829, + "step": 8532 + }, + { + "epoch": 1.5533812687721853, + "grad_norm": 7.65625, + "learning_rate": 5.807005491346649e-06, + "loss": 1.37688410282135, + "step": 8534 + }, + { + "epoch": 1.5537453353963775, + "grad_norm": 13.5625, + "learning_rate": 5.8054749927686405e-06, + "loss": 1.1899266242980957, + "step": 8536 + }, + { + "epoch": 1.5541094020205697, + "grad_norm": 15.4375, + "learning_rate": 5.803944522736387e-06, + "loss": 0.8405632376670837, + "step": 8538 + }, + { + "epoch": 1.5544734686447619, + "grad_norm": 14.625, + "learning_rate": 5.802414081474481e-06, + "loss": 1.0263118743896484, + "step": 8540 + }, + { + "epoch": 1.554837535268954, + "grad_norm": 13.625, + "learning_rate": 5.800883669207507e-06, + "loss": 1.4692708253860474, + "step": 8542 + }, + { + "epoch": 1.5552016018931465, + "grad_norm": 13.25, + "learning_rate": 5.799353286160048e-06, + "loss": 1.644458293914795, + "step": 8544 + }, + { + "epoch": 1.5555656685173387, + "grad_norm": 34.75, + "learning_rate": 5.797822932556681e-06, + "loss": 1.3747681379318237, + "step": 8546 + }, + { + "epoch": 1.5559297351415309, + "grad_norm": 11.5, + "learning_rate": 5.796292608621978e-06, + "loss": 1.0696145296096802, + "step": 8548 + }, + { + "epoch": 1.556293801765723, + "grad_norm": 15.75, + "learning_rate": 5.7947623145805135e-06, + "loss": 0.958878755569458, + "step": 8550 + }, + { + "epoch": 1.5566578683899155, + "grad_norm": 135.0, + "learning_rate": 5.793232050656843e-06, + "loss": 0.7434136867523193, + "step": 8552 + }, + { + "epoch": 1.5570219350141077, + "grad_norm": 7.375, + "learning_rate": 5.7917018170755326e-06, + "loss": 0.9431828856468201, + "step": 8554 + }, + { + "epoch": 1.5573860016382999, + "grad_norm": 27.0, + "learning_rate": 5.790171614061136e-06, + "loss": 0.9860231876373291, + "step": 8556 + }, + { + "epoch": 1.557750068262492, + "grad_norm": 17.75, + "learning_rate": 5.788641441838205e-06, + "loss": 1.4978352785110474, + "step": 8558 + }, + { + "epoch": 1.5581141348866843, + "grad_norm": 119.5, + "learning_rate": 5.787111300631288e-06, + "loss": 1.4909592866897583, + "step": 8560 + }, + { + "epoch": 1.5584782015108765, + "grad_norm": 15.4375, + "learning_rate": 5.785581190664922e-06, + "loss": 1.4978018999099731, + "step": 8562 + }, + { + "epoch": 1.5588422681350687, + "grad_norm": 10.375, + "learning_rate": 5.784051112163647e-06, + "loss": 1.31137216091156, + "step": 8564 + }, + { + "epoch": 1.5592063347592608, + "grad_norm": 19.625, + "learning_rate": 5.782521065351998e-06, + "loss": 1.5296207666397095, + "step": 8566 + }, + { + "epoch": 1.559570401383453, + "grad_norm": 11.8125, + "learning_rate": 5.780991050454501e-06, + "loss": 1.3518354892730713, + "step": 8568 + }, + { + "epoch": 1.5599344680076452, + "grad_norm": 34.75, + "learning_rate": 5.77946106769568e-06, + "loss": 2.0326223373413086, + "step": 8570 + }, + { + "epoch": 1.5602985346318377, + "grad_norm": 18.875, + "learning_rate": 5.7779311173000516e-06, + "loss": 1.4092392921447754, + "step": 8572 + }, + { + "epoch": 1.5606626012560298, + "grad_norm": 17.5, + "learning_rate": 5.776401199492132e-06, + "loss": 1.399479627609253, + "step": 8574 + }, + { + "epoch": 1.561026667880222, + "grad_norm": 12.125, + "learning_rate": 5.774871314496433e-06, + "loss": 1.3655784130096436, + "step": 8576 + }, + { + "epoch": 1.5613907345044145, + "grad_norm": 9.5625, + "learning_rate": 5.773341462537454e-06, + "loss": 1.3360631465911865, + "step": 8578 + }, + { + "epoch": 1.5617548011286067, + "grad_norm": 12.5625, + "learning_rate": 5.771811643839698e-06, + "loss": 0.7992429733276367, + "step": 8580 + }, + { + "epoch": 1.5621188677527988, + "grad_norm": 6.75, + "learning_rate": 5.770281858627658e-06, + "loss": 0.42511245608329773, + "step": 8582 + }, + { + "epoch": 1.562482934376991, + "grad_norm": 22.125, + "learning_rate": 5.768752107125822e-06, + "loss": 1.5596882104873657, + "step": 8584 + }, + { + "epoch": 1.5628470010011832, + "grad_norm": 112.5, + "learning_rate": 5.7672223895586795e-06, + "loss": 1.231987476348877, + "step": 8586 + }, + { + "epoch": 1.5632110676253754, + "grad_norm": 14.5625, + "learning_rate": 5.765692706150709e-06, + "loss": 1.411910057067871, + "step": 8588 + }, + { + "epoch": 1.5635751342495676, + "grad_norm": 10.625, + "learning_rate": 5.764163057126384e-06, + "loss": 1.0222399234771729, + "step": 8590 + }, + { + "epoch": 1.5639392008737598, + "grad_norm": 8.3125, + "learning_rate": 5.762633442710172e-06, + "loss": 1.4847803115844727, + "step": 8592 + }, + { + "epoch": 1.564303267497952, + "grad_norm": 5.125, + "learning_rate": 5.7611038631265416e-06, + "loss": 1.3583101034164429, + "step": 8594 + }, + { + "epoch": 1.5646673341221442, + "grad_norm": 7.125, + "learning_rate": 5.759574318599952e-06, + "loss": 1.162548542022705, + "step": 8596 + }, + { + "epoch": 1.5650314007463366, + "grad_norm": 9.0, + "learning_rate": 5.758044809354857e-06, + "loss": 1.2633427381515503, + "step": 8598 + }, + { + "epoch": 1.5653954673705288, + "grad_norm": 13.0, + "learning_rate": 5.756515335615704e-06, + "loss": 1.4692398309707642, + "step": 8600 + }, + { + "epoch": 1.565759533994721, + "grad_norm": 11.625, + "learning_rate": 5.75498589760694e-06, + "loss": 1.229184865951538, + "step": 8602 + }, + { + "epoch": 1.5661236006189134, + "grad_norm": 18.25, + "learning_rate": 5.753456495553e-06, + "loss": 1.597200870513916, + "step": 8604 + }, + { + "epoch": 1.5664876672431056, + "grad_norm": 31.375, + "learning_rate": 5.7519271296783256e-06, + "loss": 1.7171881198883057, + "step": 8606 + }, + { + "epoch": 1.5668517338672978, + "grad_norm": 11.375, + "learning_rate": 5.750397800207335e-06, + "loss": 1.0852664709091187, + "step": 8608 + }, + { + "epoch": 1.56721580049149, + "grad_norm": 8.375, + "learning_rate": 5.748868507364458e-06, + "loss": 1.3956425189971924, + "step": 8610 + }, + { + "epoch": 1.5675798671156822, + "grad_norm": 7.5, + "learning_rate": 5.7473392513741075e-06, + "loss": 1.3740757703781128, + "step": 8612 + }, + { + "epoch": 1.5679439337398744, + "grad_norm": 9.125, + "learning_rate": 5.745810032460699e-06, + "loss": 0.9973607063293457, + "step": 8614 + }, + { + "epoch": 1.5683080003640666, + "grad_norm": 5.0625, + "learning_rate": 5.744280850848638e-06, + "loss": 1.200483798980713, + "step": 8616 + }, + { + "epoch": 1.5686720669882588, + "grad_norm": 8.0625, + "learning_rate": 5.7427517067623265e-06, + "loss": 1.2188284397125244, + "step": 8618 + }, + { + "epoch": 1.569036133612451, + "grad_norm": 6.625, + "learning_rate": 5.741222600426159e-06, + "loss": 1.246840476989746, + "step": 8620 + }, + { + "epoch": 1.5694002002366432, + "grad_norm": 7.96875, + "learning_rate": 5.739693532064527e-06, + "loss": 1.0373082160949707, + "step": 8622 + }, + { + "epoch": 1.5697642668608354, + "grad_norm": 11.8125, + "learning_rate": 5.7381645019018125e-06, + "loss": 1.562772512435913, + "step": 8624 + }, + { + "epoch": 1.5701283334850278, + "grad_norm": 21.0, + "learning_rate": 5.736635510162398e-06, + "loss": 0.974455714225769, + "step": 8626 + }, + { + "epoch": 1.57049240010922, + "grad_norm": 9.875, + "learning_rate": 5.735106557070655e-06, + "loss": 1.198800802230835, + "step": 8628 + }, + { + "epoch": 1.5708564667334122, + "grad_norm": 9.8125, + "learning_rate": 5.73357764285095e-06, + "loss": 1.1682682037353516, + "step": 8630 + }, + { + "epoch": 1.5712205333576046, + "grad_norm": 21.0, + "learning_rate": 5.732048767727647e-06, + "loss": 1.9496302604675293, + "step": 8632 + }, + { + "epoch": 1.5715845999817968, + "grad_norm": 13.375, + "learning_rate": 5.730519931925101e-06, + "loss": 1.9157482385635376, + "step": 8634 + }, + { + "epoch": 1.571948666605989, + "grad_norm": 4.15625, + "learning_rate": 5.728991135667663e-06, + "loss": 1.082655906677246, + "step": 8636 + }, + { + "epoch": 1.5723127332301812, + "grad_norm": 5.34375, + "learning_rate": 5.727462379179677e-06, + "loss": 1.136138677597046, + "step": 8638 + }, + { + "epoch": 1.5726767998543734, + "grad_norm": 12.5625, + "learning_rate": 5.7259336626854835e-06, + "loss": 1.2212103605270386, + "step": 8640 + }, + { + "epoch": 1.5730408664785656, + "grad_norm": 53.25, + "learning_rate": 5.724404986409413e-06, + "loss": 1.5884782075881958, + "step": 8642 + }, + { + "epoch": 1.5734049331027578, + "grad_norm": 7.875, + "learning_rate": 5.722876350575794e-06, + "loss": 1.4193819761276245, + "step": 8644 + }, + { + "epoch": 1.57376899972695, + "grad_norm": 7.125, + "learning_rate": 5.721347755408948e-06, + "loss": 1.177306890487671, + "step": 8646 + }, + { + "epoch": 1.5741330663511421, + "grad_norm": 16.5, + "learning_rate": 5.7198192011331895e-06, + "loss": 0.8277719020843506, + "step": 8648 + }, + { + "epoch": 1.5744971329753343, + "grad_norm": 23.125, + "learning_rate": 5.71829068797283e-06, + "loss": 0.9947350025177002, + "step": 8650 + }, + { + "epoch": 1.5748611995995268, + "grad_norm": 16.125, + "learning_rate": 5.716762216152167e-06, + "loss": 1.2280057668685913, + "step": 8652 + }, + { + "epoch": 1.575225266223719, + "grad_norm": 13.5625, + "learning_rate": 5.715233785895502e-06, + "loss": 1.5317602157592773, + "step": 8654 + }, + { + "epoch": 1.5755893328479111, + "grad_norm": 47.0, + "learning_rate": 5.713705397427125e-06, + "loss": 1.4001306295394897, + "step": 8656 + }, + { + "epoch": 1.5759533994721036, + "grad_norm": 52.75, + "learning_rate": 5.712177050971321e-06, + "loss": 0.5568578243255615, + "step": 8658 + }, + { + "epoch": 1.5763174660962957, + "grad_norm": 9.125, + "learning_rate": 5.71064874675237e-06, + "loss": 1.4954227209091187, + "step": 8660 + }, + { + "epoch": 1.576681532720488, + "grad_norm": 8.25, + "learning_rate": 5.709120484994539e-06, + "loss": 1.3732085227966309, + "step": 8662 + }, + { + "epoch": 1.5770455993446801, + "grad_norm": 4.40625, + "learning_rate": 5.707592265922099e-06, + "loss": 1.2604023218154907, + "step": 8664 + }, + { + "epoch": 1.5774096659688723, + "grad_norm": 8.0, + "learning_rate": 5.706064089759311e-06, + "loss": 1.3800157308578491, + "step": 8666 + }, + { + "epoch": 1.5777737325930645, + "grad_norm": 13.5, + "learning_rate": 5.704535956730425e-06, + "loss": 1.3275694847106934, + "step": 8668 + }, + { + "epoch": 1.5781377992172567, + "grad_norm": 15.4375, + "learning_rate": 5.703007867059691e-06, + "loss": 1.7590510845184326, + "step": 8670 + }, + { + "epoch": 1.578501865841449, + "grad_norm": 17.125, + "learning_rate": 5.701479820971347e-06, + "loss": 1.1481068134307861, + "step": 8672 + }, + { + "epoch": 1.578865932465641, + "grad_norm": 13.875, + "learning_rate": 5.6999518186896305e-06, + "loss": 1.5052275657653809, + "step": 8674 + }, + { + "epoch": 1.5792299990898333, + "grad_norm": 6.5625, + "learning_rate": 5.698423860438769e-06, + "loss": 1.2088903188705444, + "step": 8676 + }, + { + "epoch": 1.5795940657140257, + "grad_norm": 4.09375, + "learning_rate": 5.696895946442984e-06, + "loss": 1.2658741474151611, + "step": 8678 + }, + { + "epoch": 1.579958132338218, + "grad_norm": 3.984375, + "learning_rate": 5.695368076926491e-06, + "loss": 1.0872126817703247, + "step": 8680 + }, + { + "epoch": 1.58032219896241, + "grad_norm": 36.25, + "learning_rate": 5.693840252113496e-06, + "loss": 1.4419200420379639, + "step": 8682 + }, + { + "epoch": 1.5806862655866023, + "grad_norm": 5.1875, + "learning_rate": 5.6923124722282034e-06, + "loss": 1.0466389656066895, + "step": 8684 + }, + { + "epoch": 1.5810503322107947, + "grad_norm": 19.25, + "learning_rate": 5.690784737494811e-06, + "loss": 1.0109425783157349, + "step": 8686 + }, + { + "epoch": 1.581414398834987, + "grad_norm": 14.125, + "learning_rate": 5.689257048137504e-06, + "loss": 0.5340254902839661, + "step": 8688 + }, + { + "epoch": 1.581778465459179, + "grad_norm": 14.6875, + "learning_rate": 5.687729404380466e-06, + "loss": 1.5743764638900757, + "step": 8690 + }, + { + "epoch": 1.5821425320833713, + "grad_norm": 6.78125, + "learning_rate": 5.686201806447872e-06, + "loss": 1.295348882675171, + "step": 8692 + }, + { + "epoch": 1.5825065987075635, + "grad_norm": 16.125, + "learning_rate": 5.68467425456389e-06, + "loss": 1.126971960067749, + "step": 8694 + }, + { + "epoch": 1.5828706653317557, + "grad_norm": 12.875, + "learning_rate": 5.6831467489526855e-06, + "loss": 1.5214215517044067, + "step": 8696 + }, + { + "epoch": 1.5832347319559479, + "grad_norm": 8.8125, + "learning_rate": 5.681619289838412e-06, + "loss": 1.789697289466858, + "step": 8698 + }, + { + "epoch": 1.58359879858014, + "grad_norm": 6.40625, + "learning_rate": 5.680091877445215e-06, + "loss": 1.026115894317627, + "step": 8700 + }, + { + "epoch": 1.5839628652043323, + "grad_norm": 10.125, + "learning_rate": 5.67856451199724e-06, + "loss": 1.6779255867004395, + "step": 8702 + }, + { + "epoch": 1.5843269318285245, + "grad_norm": 12.5, + "learning_rate": 5.677037193718617e-06, + "loss": 1.417406678199768, + "step": 8704 + }, + { + "epoch": 1.5846909984527169, + "grad_norm": 14.4375, + "learning_rate": 5.675509922833482e-06, + "loss": 1.6378275156021118, + "step": 8706 + }, + { + "epoch": 1.585055065076909, + "grad_norm": 9.375, + "learning_rate": 5.673982699565948e-06, + "loss": 1.3963499069213867, + "step": 8708 + }, + { + "epoch": 1.5854191317011013, + "grad_norm": 7.3125, + "learning_rate": 5.672455524140133e-06, + "loss": 1.308735966682434, + "step": 8710 + }, + { + "epoch": 1.5857831983252937, + "grad_norm": 12.75, + "learning_rate": 5.67092839678014e-06, + "loss": 1.4711189270019531, + "step": 8712 + }, + { + "epoch": 1.5861472649494859, + "grad_norm": 11.0625, + "learning_rate": 5.669401317710073e-06, + "loss": 1.260784387588501, + "step": 8714 + }, + { + "epoch": 1.586511331573678, + "grad_norm": 13.25, + "learning_rate": 5.667874287154023e-06, + "loss": 1.3194794654846191, + "step": 8716 + }, + { + "epoch": 1.5868753981978703, + "grad_norm": 12.5, + "learning_rate": 5.666347305336075e-06, + "loss": 1.4339884519577026, + "step": 8718 + }, + { + "epoch": 1.5872394648220625, + "grad_norm": 9.125, + "learning_rate": 5.664820372480306e-06, + "loss": 1.392461895942688, + "step": 8720 + }, + { + "epoch": 1.5876035314462547, + "grad_norm": 15.75, + "learning_rate": 5.6632934888107915e-06, + "loss": 1.5660393238067627, + "step": 8722 + }, + { + "epoch": 1.5879675980704469, + "grad_norm": 85.0, + "learning_rate": 5.6617666545515905e-06, + "loss": 1.7725472450256348, + "step": 8724 + }, + { + "epoch": 1.588331664694639, + "grad_norm": 17.0, + "learning_rate": 5.660239869926764e-06, + "loss": 1.525301456451416, + "step": 8726 + }, + { + "epoch": 1.5886957313188312, + "grad_norm": 9.5625, + "learning_rate": 5.658713135160361e-06, + "loss": 1.3020918369293213, + "step": 8728 + }, + { + "epoch": 1.5890597979430234, + "grad_norm": 7.15625, + "learning_rate": 5.657186450476419e-06, + "loss": 1.112493872642517, + "step": 8730 + }, + { + "epoch": 1.5894238645672158, + "grad_norm": 6.5, + "learning_rate": 5.65565981609898e-06, + "loss": 1.3058762550354004, + "step": 8732 + }, + { + "epoch": 1.589787931191408, + "grad_norm": 6.34375, + "learning_rate": 5.6541332322520635e-06, + "loss": 1.2176767587661743, + "step": 8734 + }, + { + "epoch": 1.5901519978156002, + "grad_norm": 9.9375, + "learning_rate": 5.652606699159696e-06, + "loss": 1.2101428508758545, + "step": 8736 + }, + { + "epoch": 1.5905160644397924, + "grad_norm": 10.3125, + "learning_rate": 5.651080217045887e-06, + "loss": 0.7322897911071777, + "step": 8738 + }, + { + "epoch": 1.5908801310639848, + "grad_norm": 6.09375, + "learning_rate": 5.649553786134642e-06, + "loss": 1.0011916160583496, + "step": 8740 + }, + { + "epoch": 1.591244197688177, + "grad_norm": 12.125, + "learning_rate": 5.6480274066499585e-06, + "loss": 1.5173137187957764, + "step": 8742 + }, + { + "epoch": 1.5916082643123692, + "grad_norm": 14.625, + "learning_rate": 5.646501078815826e-06, + "loss": 1.4697926044464111, + "step": 8744 + }, + { + "epoch": 1.5919723309365614, + "grad_norm": 20.625, + "learning_rate": 5.644974802856229e-06, + "loss": 1.6070770025253296, + "step": 8746 + }, + { + "epoch": 1.5923363975607536, + "grad_norm": 23.5, + "learning_rate": 5.64344857899514e-06, + "loss": 1.5925300121307373, + "step": 8748 + }, + { + "epoch": 1.5927004641849458, + "grad_norm": 14.1875, + "learning_rate": 5.641922407456527e-06, + "loss": 1.7517449855804443, + "step": 8750 + }, + { + "epoch": 1.593064530809138, + "grad_norm": 33.0, + "learning_rate": 5.640396288464349e-06, + "loss": 1.6902092695236206, + "step": 8752 + }, + { + "epoch": 1.5934285974333302, + "grad_norm": 10.25, + "learning_rate": 5.638870222242558e-06, + "loss": 1.4073898792266846, + "step": 8754 + }, + { + "epoch": 1.5937926640575224, + "grad_norm": 7.125, + "learning_rate": 5.637344209015101e-06, + "loss": 1.3730113506317139, + "step": 8756 + }, + { + "epoch": 1.5941567306817146, + "grad_norm": 9.0, + "learning_rate": 5.635818249005911e-06, + "loss": 1.0621342658996582, + "step": 8758 + }, + { + "epoch": 1.594520797305907, + "grad_norm": 17.25, + "learning_rate": 5.634292342438916e-06, + "loss": 0.5995445847511292, + "step": 8760 + }, + { + "epoch": 1.5948848639300992, + "grad_norm": 7.1875, + "learning_rate": 5.632766489538037e-06, + "loss": 1.5728943347930908, + "step": 8762 + }, + { + "epoch": 1.5952489305542914, + "grad_norm": 11.0, + "learning_rate": 5.631240690527189e-06, + "loss": 1.1080061197280884, + "step": 8764 + }, + { + "epoch": 1.5956129971784838, + "grad_norm": 18.0, + "learning_rate": 5.629714945630274e-06, + "loss": 1.4351915121078491, + "step": 8766 + }, + { + "epoch": 1.595977063802676, + "grad_norm": 14.3125, + "learning_rate": 5.62818925507119e-06, + "loss": 1.8252661228179932, + "step": 8768 + }, + { + "epoch": 1.5963411304268682, + "grad_norm": 9.1875, + "learning_rate": 5.626663619073827e-06, + "loss": 0.8909593224525452, + "step": 8770 + }, + { + "epoch": 1.5967051970510604, + "grad_norm": 28.125, + "learning_rate": 5.625138037862065e-06, + "loss": 1.1606981754302979, + "step": 8772 + }, + { + "epoch": 1.5970692636752526, + "grad_norm": 12.9375, + "learning_rate": 5.623612511659775e-06, + "loss": 1.5173490047454834, + "step": 8774 + }, + { + "epoch": 1.5974333302994448, + "grad_norm": 31.375, + "learning_rate": 5.622087040690824e-06, + "loss": 1.4508112668991089, + "step": 8776 + }, + { + "epoch": 1.597797396923637, + "grad_norm": 20.25, + "learning_rate": 5.6205616251790704e-06, + "loss": 0.8631260991096497, + "step": 8778 + }, + { + "epoch": 1.5981614635478292, + "grad_norm": 15.0625, + "learning_rate": 5.619036265348359e-06, + "loss": 1.7663922309875488, + "step": 8780 + }, + { + "epoch": 1.5985255301720214, + "grad_norm": 10.0, + "learning_rate": 5.617510961422532e-06, + "loss": 1.3401504755020142, + "step": 8782 + }, + { + "epoch": 1.5988895967962136, + "grad_norm": 40.5, + "learning_rate": 5.615985713625421e-06, + "loss": 1.2212750911712646, + "step": 8784 + }, + { + "epoch": 1.599253663420406, + "grad_norm": 15.1875, + "learning_rate": 5.614460522180852e-06, + "loss": 1.282362937927246, + "step": 8786 + }, + { + "epoch": 1.5996177300445982, + "grad_norm": 9.375, + "learning_rate": 5.6129353873126354e-06, + "loss": 1.5193630456924438, + "step": 8788 + }, + { + "epoch": 1.5999817966687904, + "grad_norm": 7.25, + "learning_rate": 5.611410309244585e-06, + "loss": 1.2703156471252441, + "step": 8790 + }, + { + "epoch": 1.6003458632929826, + "grad_norm": 23.125, + "learning_rate": 5.609885288200496e-06, + "loss": 1.7064123153686523, + "step": 8792 + }, + { + "epoch": 1.600709929917175, + "grad_norm": 12.375, + "learning_rate": 5.608360324404158e-06, + "loss": 1.427061676979065, + "step": 8794 + }, + { + "epoch": 1.6010739965413672, + "grad_norm": 9.5625, + "learning_rate": 5.606835418079358e-06, + "loss": 1.0867226123809814, + "step": 8796 + }, + { + "epoch": 1.6014380631655594, + "grad_norm": 9.4375, + "learning_rate": 5.605310569449867e-06, + "loss": 1.4145636558532715, + "step": 8798 + }, + { + "epoch": 1.6018021297897516, + "grad_norm": 9.4375, + "learning_rate": 5.603785778739449e-06, + "loss": 1.1019208431243896, + "step": 8800 + }, + { + "epoch": 1.6021661964139438, + "grad_norm": 17.125, + "learning_rate": 5.602261046171863e-06, + "loss": 1.4459761381149292, + "step": 8802 + }, + { + "epoch": 1.602530263038136, + "grad_norm": 10.5, + "learning_rate": 5.600736371970855e-06, + "loss": 1.4090943336486816, + "step": 8804 + }, + { + "epoch": 1.6028943296623281, + "grad_norm": 25.25, + "learning_rate": 5.599211756360169e-06, + "loss": 1.6040135622024536, + "step": 8806 + }, + { + "epoch": 1.6032583962865203, + "grad_norm": 8.875, + "learning_rate": 5.597687199563533e-06, + "loss": 1.7145119905471802, + "step": 8808 + }, + { + "epoch": 1.6036224629107125, + "grad_norm": 3.109375, + "learning_rate": 5.596162701804669e-06, + "loss": 0.9624874591827393, + "step": 8810 + }, + { + "epoch": 1.6039865295349047, + "grad_norm": 7.8125, + "learning_rate": 5.594638263307292e-06, + "loss": 0.9814913272857666, + "step": 8812 + }, + { + "epoch": 1.6043505961590971, + "grad_norm": 25.625, + "learning_rate": 5.593113884295106e-06, + "loss": 1.1439611911773682, + "step": 8814 + }, + { + "epoch": 1.6047146627832893, + "grad_norm": 13.0625, + "learning_rate": 5.591589564991811e-06, + "loss": 1.449749231338501, + "step": 8816 + }, + { + "epoch": 1.6050787294074815, + "grad_norm": 18.75, + "learning_rate": 5.590065305621091e-06, + "loss": 1.4518253803253174, + "step": 8818 + }, + { + "epoch": 1.605442796031674, + "grad_norm": 14.3125, + "learning_rate": 5.5885411064066256e-06, + "loss": 1.2628220319747925, + "step": 8820 + }, + { + "epoch": 1.6058068626558661, + "grad_norm": 15.5, + "learning_rate": 5.5870169675720855e-06, + "loss": 1.7324600219726562, + "step": 8822 + }, + { + "epoch": 1.6061709292800583, + "grad_norm": 17.375, + "learning_rate": 5.585492889341131e-06, + "loss": 1.5263853073120117, + "step": 8824 + }, + { + "epoch": 1.6065349959042505, + "grad_norm": 23.75, + "learning_rate": 5.583968871937418e-06, + "loss": 1.8368717432022095, + "step": 8826 + }, + { + "epoch": 1.6068990625284427, + "grad_norm": 13.3125, + "learning_rate": 5.582444915584584e-06, + "loss": 1.5513668060302734, + "step": 8828 + }, + { + "epoch": 1.607263129152635, + "grad_norm": 10.625, + "learning_rate": 5.580921020506268e-06, + "loss": 1.3387057781219482, + "step": 8830 + }, + { + "epoch": 1.6076271957768271, + "grad_norm": 26.125, + "learning_rate": 5.579397186926093e-06, + "loss": 1.421623945236206, + "step": 8832 + }, + { + "epoch": 1.6079912624010193, + "grad_norm": 23.125, + "learning_rate": 5.5778734150676765e-06, + "loss": 1.663716435432434, + "step": 8834 + }, + { + "epoch": 1.6083553290252115, + "grad_norm": 41.25, + "learning_rate": 5.576349705154626e-06, + "loss": 1.5509027242660522, + "step": 8836 + }, + { + "epoch": 1.6087193956494037, + "grad_norm": 8.3125, + "learning_rate": 5.5748260574105394e-06, + "loss": 0.9681510925292969, + "step": 8838 + }, + { + "epoch": 1.609083462273596, + "grad_norm": 11.1875, + "learning_rate": 5.573302472059005e-06, + "loss": 0.9092290997505188, + "step": 8840 + }, + { + "epoch": 1.6094475288977883, + "grad_norm": 8.25, + "learning_rate": 5.571778949323605e-06, + "loss": 0.5999207496643066, + "step": 8842 + }, + { + "epoch": 1.6098115955219805, + "grad_norm": 9.9375, + "learning_rate": 5.570255489427907e-06, + "loss": 1.4470223188400269, + "step": 8844 + }, + { + "epoch": 1.610175662146173, + "grad_norm": 13.125, + "learning_rate": 5.568732092595476e-06, + "loss": 1.619198203086853, + "step": 8846 + }, + { + "epoch": 1.610539728770365, + "grad_norm": 11.0, + "learning_rate": 5.567208759049862e-06, + "loss": 1.3654470443725586, + "step": 8848 + }, + { + "epoch": 1.6109037953945573, + "grad_norm": 10.8125, + "learning_rate": 5.565685489014608e-06, + "loss": 1.2100518941879272, + "step": 8850 + }, + { + "epoch": 1.6112678620187495, + "grad_norm": 12.0625, + "learning_rate": 5.564162282713249e-06, + "loss": 1.229964017868042, + "step": 8852 + }, + { + "epoch": 1.6116319286429417, + "grad_norm": 15.375, + "learning_rate": 5.562639140369308e-06, + "loss": 1.3368597030639648, + "step": 8854 + }, + { + "epoch": 1.6119959952671339, + "grad_norm": 10.25, + "learning_rate": 5.561116062206302e-06, + "loss": 1.3706207275390625, + "step": 8856 + }, + { + "epoch": 1.612360061891326, + "grad_norm": 21.5, + "learning_rate": 5.5595930484477334e-06, + "loss": 1.4467570781707764, + "step": 8858 + }, + { + "epoch": 1.6127241285155183, + "grad_norm": 7.0625, + "learning_rate": 5.558070099317103e-06, + "loss": 1.3247259855270386, + "step": 8860 + }, + { + "epoch": 1.6130881951397105, + "grad_norm": 5.78125, + "learning_rate": 5.556547215037893e-06, + "loss": 1.3324511051177979, + "step": 8862 + }, + { + "epoch": 1.6134522617639027, + "grad_norm": 14.375, + "learning_rate": 5.55502439583358e-06, + "loss": 1.5234766006469727, + "step": 8864 + }, + { + "epoch": 1.6138163283880949, + "grad_norm": 11.0, + "learning_rate": 5.553501641927636e-06, + "loss": 1.4986146688461304, + "step": 8866 + }, + { + "epoch": 1.6141803950122873, + "grad_norm": 17.25, + "learning_rate": 5.551978953543515e-06, + "loss": 1.4198793172836304, + "step": 8868 + }, + { + "epoch": 1.6145444616364795, + "grad_norm": 85.0, + "learning_rate": 5.5504563309046695e-06, + "loss": 1.4635794162750244, + "step": 8870 + }, + { + "epoch": 1.6149085282606717, + "grad_norm": 18.375, + "learning_rate": 5.548933774234533e-06, + "loss": 0.9434354305267334, + "step": 8872 + }, + { + "epoch": 1.615272594884864, + "grad_norm": 7.75, + "learning_rate": 5.5474112837565385e-06, + "loss": 1.0740314722061157, + "step": 8874 + }, + { + "epoch": 1.6156366615090563, + "grad_norm": 22.0, + "learning_rate": 5.545888859694104e-06, + "loss": 1.0221545696258545, + "step": 8876 + }, + { + "epoch": 1.6160007281332485, + "grad_norm": 16.25, + "learning_rate": 5.544366502270637e-06, + "loss": 1.4753661155700684, + "step": 8878 + }, + { + "epoch": 1.6163647947574407, + "grad_norm": 9.5625, + "learning_rate": 5.542844211709542e-06, + "loss": 1.646446943283081, + "step": 8880 + }, + { + "epoch": 1.6167288613816329, + "grad_norm": 8.3125, + "learning_rate": 5.541321988234205e-06, + "loss": 1.248424768447876, + "step": 8882 + }, + { + "epoch": 1.617092928005825, + "grad_norm": 11.875, + "learning_rate": 5.539799832068007e-06, + "loss": 1.565284252166748, + "step": 8884 + }, + { + "epoch": 1.6174569946300172, + "grad_norm": 28.875, + "learning_rate": 5.538277743434319e-06, + "loss": 1.7744112014770508, + "step": 8886 + }, + { + "epoch": 1.6178210612542094, + "grad_norm": 11.3125, + "learning_rate": 5.536755722556499e-06, + "loss": 1.1932414770126343, + "step": 8888 + }, + { + "epoch": 1.6181851278784016, + "grad_norm": 8.375, + "learning_rate": 5.535233769657904e-06, + "loss": 0.6376725435256958, + "step": 8890 + }, + { + "epoch": 1.6185491945025938, + "grad_norm": 15.25, + "learning_rate": 5.5337118849618664e-06, + "loss": 1.260134220123291, + "step": 8892 + }, + { + "epoch": 1.6189132611267862, + "grad_norm": 10.0625, + "learning_rate": 5.532190068691719e-06, + "loss": 1.6569552421569824, + "step": 8894 + }, + { + "epoch": 1.6192773277509784, + "grad_norm": 4.625, + "learning_rate": 5.5306683210707866e-06, + "loss": 0.8247285485267639, + "step": 8896 + }, + { + "epoch": 1.6196413943751706, + "grad_norm": 3.984375, + "learning_rate": 5.529146642322374e-06, + "loss": 1.441962718963623, + "step": 8898 + }, + { + "epoch": 1.620005460999363, + "grad_norm": 3.34375, + "learning_rate": 5.527625032669783e-06, + "loss": 0.9892392158508301, + "step": 8900 + }, + { + "epoch": 1.6203695276235552, + "grad_norm": 12.25, + "learning_rate": 5.526103492336306e-06, + "loss": 1.0255471467971802, + "step": 8902 + }, + { + "epoch": 1.6207335942477474, + "grad_norm": 6.59375, + "learning_rate": 5.5245820215452185e-06, + "loss": 1.160487174987793, + "step": 8904 + }, + { + "epoch": 1.6210976608719396, + "grad_norm": 6.09375, + "learning_rate": 5.523060620519795e-06, + "loss": 1.2020061016082764, + "step": 8906 + }, + { + "epoch": 1.6214617274961318, + "grad_norm": 16.875, + "learning_rate": 5.521539289483291e-06, + "loss": 1.1621804237365723, + "step": 8908 + }, + { + "epoch": 1.621825794120324, + "grad_norm": 9.5625, + "learning_rate": 5.520018028658958e-06, + "loss": 1.438092589378357, + "step": 8910 + }, + { + "epoch": 1.6221898607445162, + "grad_norm": 20.125, + "learning_rate": 5.518496838270034e-06, + "loss": 1.1545531749725342, + "step": 8912 + }, + { + "epoch": 1.6225539273687084, + "grad_norm": 1.859375, + "learning_rate": 5.5169757185397456e-06, + "loss": 0.8762395977973938, + "step": 8914 + }, + { + "epoch": 1.6229179939929006, + "grad_norm": 16.125, + "learning_rate": 5.515454669691316e-06, + "loss": 1.1666585206985474, + "step": 8916 + }, + { + "epoch": 1.6232820606170928, + "grad_norm": 10.5625, + "learning_rate": 5.513933691947948e-06, + "loss": 1.4057941436767578, + "step": 8918 + }, + { + "epoch": 1.6236461272412852, + "grad_norm": 6.8125, + "learning_rate": 5.512412785532841e-06, + "loss": 1.4671525955200195, + "step": 8920 + }, + { + "epoch": 1.6240101938654774, + "grad_norm": 12.9375, + "learning_rate": 5.510891950669184e-06, + "loss": 1.3448476791381836, + "step": 8922 + }, + { + "epoch": 1.6243742604896696, + "grad_norm": 56.0, + "learning_rate": 5.509371187580148e-06, + "loss": 1.086970329284668, + "step": 8924 + }, + { + "epoch": 1.6247383271138618, + "grad_norm": 8.625, + "learning_rate": 5.507850496488904e-06, + "loss": 0.7681834101676941, + "step": 8926 + }, + { + "epoch": 1.6251023937380542, + "grad_norm": 12.25, + "learning_rate": 5.506329877618603e-06, + "loss": 1.516718864440918, + "step": 8928 + }, + { + "epoch": 1.6254664603622464, + "grad_norm": 21.25, + "learning_rate": 5.504809331192394e-06, + "loss": 1.3979264497756958, + "step": 8930 + }, + { + "epoch": 1.6258305269864386, + "grad_norm": 10.25, + "learning_rate": 5.503288857433409e-06, + "loss": 1.8341989517211914, + "step": 8932 + }, + { + "epoch": 1.6261945936106308, + "grad_norm": 17.375, + "learning_rate": 5.501768456564769e-06, + "loss": 1.3230714797973633, + "step": 8934 + }, + { + "epoch": 1.626558660234823, + "grad_norm": 26.875, + "learning_rate": 5.500248128809591e-06, + "loss": 1.917144775390625, + "step": 8936 + }, + { + "epoch": 1.6269227268590152, + "grad_norm": 8.1875, + "learning_rate": 5.498727874390972e-06, + "loss": 1.2894903421401978, + "step": 8938 + }, + { + "epoch": 1.6272867934832074, + "grad_norm": 10.875, + "learning_rate": 5.497207693532008e-06, + "loss": 1.4657738208770752, + "step": 8940 + }, + { + "epoch": 1.6276508601073996, + "grad_norm": 16.5, + "learning_rate": 5.495687586455778e-06, + "loss": 1.2809555530548096, + "step": 8942 + }, + { + "epoch": 1.6280149267315918, + "grad_norm": 7.65625, + "learning_rate": 5.4941675533853485e-06, + "loss": 0.8499621152877808, + "step": 8944 + }, + { + "epoch": 1.628378993355784, + "grad_norm": 54.5, + "learning_rate": 5.492647594543783e-06, + "loss": 1.1398581266403198, + "step": 8946 + }, + { + "epoch": 1.6287430599799764, + "grad_norm": 7.71875, + "learning_rate": 5.491127710154125e-06, + "loss": 1.3814674615859985, + "step": 8948 + }, + { + "epoch": 1.6291071266041686, + "grad_norm": 13.5, + "learning_rate": 5.4896079004394155e-06, + "loss": 1.2310429811477661, + "step": 8950 + }, + { + "epoch": 1.6294711932283608, + "grad_norm": 32.25, + "learning_rate": 5.488088165622676e-06, + "loss": 1.471536636352539, + "step": 8952 + }, + { + "epoch": 1.6298352598525532, + "grad_norm": 10.0, + "learning_rate": 5.486568505926924e-06, + "loss": 1.4782040119171143, + "step": 8954 + }, + { + "epoch": 1.6301993264767454, + "grad_norm": 10.625, + "learning_rate": 5.485048921575165e-06, + "loss": 1.1005005836486816, + "step": 8956 + }, + { + "epoch": 1.6305633931009376, + "grad_norm": 11.875, + "learning_rate": 5.483529412790387e-06, + "loss": 1.4505761861801147, + "step": 8958 + }, + { + "epoch": 1.6309274597251298, + "grad_norm": 6.1875, + "learning_rate": 5.482009979795576e-06, + "loss": 1.2646241188049316, + "step": 8960 + }, + { + "epoch": 1.631291526349322, + "grad_norm": 4.625, + "learning_rate": 5.480490622813701e-06, + "loss": 1.01548171043396, + "step": 8962 + }, + { + "epoch": 1.6316555929735141, + "grad_norm": 8.1875, + "learning_rate": 5.47897134206772e-06, + "loss": 1.4203006029129028, + "step": 8964 + }, + { + "epoch": 1.6320196595977063, + "grad_norm": 14.125, + "learning_rate": 5.477452137780587e-06, + "loss": 1.7511482238769531, + "step": 8966 + }, + { + "epoch": 1.6323837262218985, + "grad_norm": 16.625, + "learning_rate": 5.475933010175232e-06, + "loss": 1.571290135383606, + "step": 8968 + }, + { + "epoch": 1.6327477928460907, + "grad_norm": 9.9375, + "learning_rate": 5.474413959474585e-06, + "loss": 1.7354551553726196, + "step": 8970 + }, + { + "epoch": 1.633111859470283, + "grad_norm": 29.25, + "learning_rate": 5.47289498590156e-06, + "loss": 1.4419825077056885, + "step": 8972 + }, + { + "epoch": 1.6334759260944753, + "grad_norm": 15.8125, + "learning_rate": 5.471376089679057e-06, + "loss": 1.440701961517334, + "step": 8974 + }, + { + "epoch": 1.6338399927186675, + "grad_norm": 13.625, + "learning_rate": 5.4698572710299736e-06, + "loss": 1.2962970733642578, + "step": 8976 + }, + { + "epoch": 1.6342040593428597, + "grad_norm": 5.03125, + "learning_rate": 5.468338530177185e-06, + "loss": 1.3956835269927979, + "step": 8978 + }, + { + "epoch": 1.634568125967052, + "grad_norm": 3.46875, + "learning_rate": 5.466819867343564e-06, + "loss": 0.9811263084411621, + "step": 8980 + }, + { + "epoch": 1.6349321925912443, + "grad_norm": 5216.0, + "learning_rate": 5.465301282751967e-06, + "loss": 1.2953565120697021, + "step": 8982 + }, + { + "epoch": 1.6352962592154365, + "grad_norm": 9.1875, + "learning_rate": 5.463782776625237e-06, + "loss": 0.8272882699966431, + "step": 8984 + }, + { + "epoch": 1.6356603258396287, + "grad_norm": 10.5625, + "learning_rate": 5.462264349186215e-06, + "loss": 1.404762625694275, + "step": 8986 + }, + { + "epoch": 1.636024392463821, + "grad_norm": 3.03125, + "learning_rate": 5.460746000657717e-06, + "loss": 1.0570034980773926, + "step": 8988 + }, + { + "epoch": 1.6363884590880131, + "grad_norm": 13.0625, + "learning_rate": 5.459227731262562e-06, + "loss": 1.164574146270752, + "step": 8990 + }, + { + "epoch": 1.6367525257122053, + "grad_norm": 8.25, + "learning_rate": 5.457709541223544e-06, + "loss": 1.4518458843231201, + "step": 8992 + }, + { + "epoch": 1.6371165923363975, + "grad_norm": 124.5, + "learning_rate": 5.456191430763451e-06, + "loss": 1.3307973146438599, + "step": 8994 + }, + { + "epoch": 1.6374806589605897, + "grad_norm": 13.75, + "learning_rate": 5.454673400105065e-06, + "loss": 1.1192717552185059, + "step": 8996 + }, + { + "epoch": 1.637844725584782, + "grad_norm": 6.6875, + "learning_rate": 5.4531554494711445e-06, + "loss": 1.2777091264724731, + "step": 8998 + }, + { + "epoch": 1.638208792208974, + "grad_norm": 10.125, + "learning_rate": 5.451637579084448e-06, + "loss": 1.3846396207809448, + "step": 9000 + }, + { + "epoch": 1.6385728588331665, + "grad_norm": 13.8125, + "learning_rate": 5.450119789167714e-06, + "loss": 1.405811071395874, + "step": 9002 + }, + { + "epoch": 1.6389369254573587, + "grad_norm": 7.03125, + "learning_rate": 5.448602079943671e-06, + "loss": 1.1963441371917725, + "step": 9004 + }, + { + "epoch": 1.639300992081551, + "grad_norm": 11.1875, + "learning_rate": 5.4470844516350404e-06, + "loss": 1.3561813831329346, + "step": 9006 + }, + { + "epoch": 1.6396650587057433, + "grad_norm": 12.0625, + "learning_rate": 5.445566904464523e-06, + "loss": 1.2439383268356323, + "step": 9008 + }, + { + "epoch": 1.6400291253299355, + "grad_norm": 15.625, + "learning_rate": 5.444049438654819e-06, + "loss": 1.3630499839782715, + "step": 9010 + }, + { + "epoch": 1.6403931919541277, + "grad_norm": 69.5, + "learning_rate": 5.442532054428604e-06, + "loss": 1.6038141250610352, + "step": 9012 + }, + { + "epoch": 1.6407572585783199, + "grad_norm": 27.25, + "learning_rate": 5.441014752008551e-06, + "loss": 1.2607192993164062, + "step": 9014 + }, + { + "epoch": 1.641121325202512, + "grad_norm": 21.75, + "learning_rate": 5.439497531617319e-06, + "loss": 1.7777246236801147, + "step": 9016 + }, + { + "epoch": 1.6414853918267043, + "grad_norm": 8.9375, + "learning_rate": 5.437980393477551e-06, + "loss": 1.5362765789031982, + "step": 9018 + }, + { + "epoch": 1.6418494584508965, + "grad_norm": 7.9375, + "learning_rate": 5.436463337811886e-06, + "loss": 1.2003294229507446, + "step": 9020 + }, + { + "epoch": 1.6422135250750887, + "grad_norm": 6.28125, + "learning_rate": 5.434946364842939e-06, + "loss": 1.3044151067733765, + "step": 9022 + }, + { + "epoch": 1.6425775916992809, + "grad_norm": 10.75, + "learning_rate": 5.433429474793324e-06, + "loss": 1.3526506423950195, + "step": 9024 + }, + { + "epoch": 1.642941658323473, + "grad_norm": 7.96875, + "learning_rate": 5.431912667885637e-06, + "loss": 1.2049086093902588, + "step": 9026 + }, + { + "epoch": 1.6433057249476655, + "grad_norm": 10.6875, + "learning_rate": 5.430395944342463e-06, + "loss": 1.1403303146362305, + "step": 9028 + }, + { + "epoch": 1.6436697915718577, + "grad_norm": 11.0, + "learning_rate": 5.4288793043863784e-06, + "loss": 1.4107240438461304, + "step": 9030 + }, + { + "epoch": 1.6440338581960499, + "grad_norm": 17.375, + "learning_rate": 5.427362748239941e-06, + "loss": 1.4551666975021362, + "step": 9032 + }, + { + "epoch": 1.644397924820242, + "grad_norm": 6.3125, + "learning_rate": 5.425846276125697e-06, + "loss": 1.3451907634735107, + "step": 9034 + }, + { + "epoch": 1.6447619914444345, + "grad_norm": 7.9375, + "learning_rate": 5.424329888266188e-06, + "loss": 1.4933338165283203, + "step": 9036 + }, + { + "epoch": 1.6451260580686267, + "grad_norm": 9.3125, + "learning_rate": 5.422813584883932e-06, + "loss": 1.297314167022705, + "step": 9038 + }, + { + "epoch": 1.6454901246928189, + "grad_norm": 22.875, + "learning_rate": 5.421297366201446e-06, + "loss": 1.2356611490249634, + "step": 9040 + }, + { + "epoch": 1.645854191317011, + "grad_norm": 10.375, + "learning_rate": 5.419781232441226e-06, + "loss": 1.3946754932403564, + "step": 9042 + }, + { + "epoch": 1.6462182579412032, + "grad_norm": 247.0, + "learning_rate": 5.418265183825757e-06, + "loss": 1.477485179901123, + "step": 9044 + }, + { + "epoch": 1.6465823245653954, + "grad_norm": 8.8125, + "learning_rate": 5.416749220577515e-06, + "loss": 1.5828111171722412, + "step": 9046 + }, + { + "epoch": 1.6469463911895876, + "grad_norm": 13.5, + "learning_rate": 5.415233342918962e-06, + "loss": 1.320845127105713, + "step": 9048 + }, + { + "epoch": 1.6473104578137798, + "grad_norm": 5.9375, + "learning_rate": 5.413717551072546e-06, + "loss": 1.3015503883361816, + "step": 9050 + }, + { + "epoch": 1.647674524437972, + "grad_norm": 29.625, + "learning_rate": 5.412201845260703e-06, + "loss": 1.0998867750167847, + "step": 9052 + }, + { + "epoch": 1.6480385910621642, + "grad_norm": 24.625, + "learning_rate": 5.410686225705857e-06, + "loss": 1.0409722328186035, + "step": 9054 + }, + { + "epoch": 1.6484026576863566, + "grad_norm": 12.8125, + "learning_rate": 5.40917069263042e-06, + "loss": 0.7813601493835449, + "step": 9056 + }, + { + "epoch": 1.6487667243105488, + "grad_norm": 8.4375, + "learning_rate": 5.407655246256789e-06, + "loss": 1.0301543474197388, + "step": 9058 + }, + { + "epoch": 1.649130790934741, + "grad_norm": 38.5, + "learning_rate": 5.406139886807349e-06, + "loss": 1.2839436531066895, + "step": 9060 + }, + { + "epoch": 1.6494948575589334, + "grad_norm": 19.0, + "learning_rate": 5.4046246145044755e-06, + "loss": 1.2923681735992432, + "step": 9062 + }, + { + "epoch": 1.6498589241831256, + "grad_norm": 10.1875, + "learning_rate": 5.403109429570525e-06, + "loss": 1.2729697227478027, + "step": 9064 + }, + { + "epoch": 1.6502229908073178, + "grad_norm": 2.9375, + "learning_rate": 5.401594332227849e-06, + "loss": 1.033754587173462, + "step": 9066 + }, + { + "epoch": 1.65058705743151, + "grad_norm": 13.25, + "learning_rate": 5.400079322698777e-06, + "loss": 0.9543336629867554, + "step": 9068 + }, + { + "epoch": 1.6509511240557022, + "grad_norm": 16.25, + "learning_rate": 5.398564401205637e-06, + "loss": 1.4318230152130127, + "step": 9070 + }, + { + "epoch": 1.6513151906798944, + "grad_norm": 8.0625, + "learning_rate": 5.397049567970731e-06, + "loss": 1.3613770008087158, + "step": 9072 + }, + { + "epoch": 1.6516792573040866, + "grad_norm": 11.1875, + "learning_rate": 5.395534823216358e-06, + "loss": 1.47922682762146, + "step": 9074 + }, + { + "epoch": 1.6520433239282788, + "grad_norm": 13.5625, + "learning_rate": 5.3940201671648e-06, + "loss": 1.669333577156067, + "step": 9076 + }, + { + "epoch": 1.652407390552471, + "grad_norm": 16.75, + "learning_rate": 5.3925056000383245e-06, + "loss": 1.2753726243972778, + "step": 9078 + }, + { + "epoch": 1.6527714571766632, + "grad_norm": 8.375, + "learning_rate": 5.390991122059193e-06, + "loss": 1.3380441665649414, + "step": 9080 + }, + { + "epoch": 1.6531355238008556, + "grad_norm": 13.875, + "learning_rate": 5.389476733449646e-06, + "loss": 1.5330275297164917, + "step": 9082 + }, + { + "epoch": 1.6534995904250478, + "grad_norm": 12.8125, + "learning_rate": 5.387962434431913e-06, + "loss": 1.524970293045044, + "step": 9084 + }, + { + "epoch": 1.65386365704924, + "grad_norm": 9.375, + "learning_rate": 5.386448225228213e-06, + "loss": 1.6511247158050537, + "step": 9086 + }, + { + "epoch": 1.6542277236734324, + "grad_norm": 15.6875, + "learning_rate": 5.384934106060748e-06, + "loss": 1.4060384035110474, + "step": 9088 + }, + { + "epoch": 1.6545917902976246, + "grad_norm": 5.9375, + "learning_rate": 5.383420077151713e-06, + "loss": 0.9899530410766602, + "step": 9090 + }, + { + "epoch": 1.6549558569218168, + "grad_norm": 13.0625, + "learning_rate": 5.3819061387232816e-06, + "loss": 1.388381004333496, + "step": 9092 + }, + { + "epoch": 1.655319923546009, + "grad_norm": 6.6875, + "learning_rate": 5.3803922909976184e-06, + "loss": 1.1267688274383545, + "step": 9094 + }, + { + "epoch": 1.6556839901702012, + "grad_norm": 12.0, + "learning_rate": 5.378878534196877e-06, + "loss": 1.3740040063858032, + "step": 9096 + }, + { + "epoch": 1.6560480567943934, + "grad_norm": 18.625, + "learning_rate": 5.377364868543193e-06, + "loss": 1.334236741065979, + "step": 9098 + }, + { + "epoch": 1.6564121234185856, + "grad_norm": 23.75, + "learning_rate": 5.375851294258692e-06, + "loss": 1.568598747253418, + "step": 9100 + }, + { + "epoch": 1.6567761900427778, + "grad_norm": 20.625, + "learning_rate": 5.374337811565482e-06, + "loss": 1.7845431566238403, + "step": 9102 + }, + { + "epoch": 1.65714025666697, + "grad_norm": 8.5625, + "learning_rate": 5.372824420685663e-06, + "loss": 1.3297598361968994, + "step": 9104 + }, + { + "epoch": 1.6575043232911622, + "grad_norm": 4.25, + "learning_rate": 5.37131112184132e-06, + "loss": 1.066178321838379, + "step": 9106 + }, + { + "epoch": 1.6578683899153543, + "grad_norm": 6.75, + "learning_rate": 5.369797915254522e-06, + "loss": 0.8099057674407959, + "step": 9108 + }, + { + "epoch": 1.6582324565395468, + "grad_norm": 8.6875, + "learning_rate": 5.3682848011473254e-06, + "loss": 1.3451005220413208, + "step": 9110 + }, + { + "epoch": 1.658596523163739, + "grad_norm": 19.0, + "learning_rate": 5.366771779741775e-06, + "loss": 1.580949068069458, + "step": 9112 + }, + { + "epoch": 1.6589605897879312, + "grad_norm": 29.5, + "learning_rate": 5.365258851259898e-06, + "loss": 1.4729431867599487, + "step": 9114 + }, + { + "epoch": 1.6593246564121236, + "grad_norm": 5.4375, + "learning_rate": 5.363746015923713e-06, + "loss": 0.19191277027130127, + "step": 9116 + }, + { + "epoch": 1.6596887230363158, + "grad_norm": 35.25, + "learning_rate": 5.362233273955221e-06, + "loss": 1.3085311651229858, + "step": 9118 + }, + { + "epoch": 1.660052789660508, + "grad_norm": 10.125, + "learning_rate": 5.360720625576412e-06, + "loss": 1.7102875709533691, + "step": 9120 + }, + { + "epoch": 1.6604168562847001, + "grad_norm": 17.125, + "learning_rate": 5.359208071009261e-06, + "loss": 1.4008978605270386, + "step": 9122 + }, + { + "epoch": 1.6607809229088923, + "grad_norm": 13.4375, + "learning_rate": 5.357695610475727e-06, + "loss": 1.5351812839508057, + "step": 9124 + }, + { + "epoch": 1.6611449895330845, + "grad_norm": 32.25, + "learning_rate": 5.356183244197761e-06, + "loss": 1.5203293561935425, + "step": 9126 + }, + { + "epoch": 1.6615090561572767, + "grad_norm": 13.5625, + "learning_rate": 5.354670972397293e-06, + "loss": 1.6168484687805176, + "step": 9128 + }, + { + "epoch": 1.661873122781469, + "grad_norm": 17.25, + "learning_rate": 5.353158795296247e-06, + "loss": 1.8464418649673462, + "step": 9130 + }, + { + "epoch": 1.6622371894056611, + "grad_norm": 16.625, + "learning_rate": 5.351646713116525e-06, + "loss": 1.4194422960281372, + "step": 9132 + }, + { + "epoch": 1.6626012560298533, + "grad_norm": 12.0, + "learning_rate": 5.350134726080018e-06, + "loss": 1.9352693557739258, + "step": 9134 + }, + { + "epoch": 1.6629653226540457, + "grad_norm": 24.375, + "learning_rate": 5.34862283440861e-06, + "loss": 1.3733594417572021, + "step": 9136 + }, + { + "epoch": 1.663329389278238, + "grad_norm": 8.9375, + "learning_rate": 5.347111038324158e-06, + "loss": 1.1910452842712402, + "step": 9138 + }, + { + "epoch": 1.6636934559024301, + "grad_norm": 8.3125, + "learning_rate": 5.34559933804852e-06, + "loss": 0.8649009466171265, + "step": 9140 + }, + { + "epoch": 1.6640575225266225, + "grad_norm": 20.625, + "learning_rate": 5.344087733803522e-06, + "loss": 1.5014541149139404, + "step": 9142 + }, + { + "epoch": 1.6644215891508147, + "grad_norm": 17.625, + "learning_rate": 5.3425762258109925e-06, + "loss": 1.9736855030059814, + "step": 9144 + }, + { + "epoch": 1.664785655775007, + "grad_norm": 11.1875, + "learning_rate": 5.341064814292739e-06, + "loss": 1.4210644960403442, + "step": 9146 + }, + { + "epoch": 1.6651497223991991, + "grad_norm": 10.125, + "learning_rate": 5.339553499470551e-06, + "loss": 1.3818964958190918, + "step": 9148 + }, + { + "epoch": 1.6655137890233913, + "grad_norm": 10.625, + "learning_rate": 5.3380422815662135e-06, + "loss": 1.3692893981933594, + "step": 9150 + }, + { + "epoch": 1.6658778556475835, + "grad_norm": 12.0, + "learning_rate": 5.3365311608014855e-06, + "loss": 1.957620620727539, + "step": 9152 + }, + { + "epoch": 1.6662419222717757, + "grad_norm": 23.25, + "learning_rate": 5.335020137398121e-06, + "loss": 1.610182762145996, + "step": 9154 + }, + { + "epoch": 1.666605988895968, + "grad_norm": 4.09375, + "learning_rate": 5.33350921157786e-06, + "loss": 0.9809678792953491, + "step": 9156 + }, + { + "epoch": 1.66697005552016, + "grad_norm": 11.6875, + "learning_rate": 5.331998383562418e-06, + "loss": 1.2095112800598145, + "step": 9158 + }, + { + "epoch": 1.6673341221443523, + "grad_norm": 14.0625, + "learning_rate": 5.330487653573507e-06, + "loss": 1.2562638521194458, + "step": 9160 + }, + { + "epoch": 1.6676981887685447, + "grad_norm": 24.125, + "learning_rate": 5.328977021832819e-06, + "loss": 1.8170607089996338, + "step": 9162 + }, + { + "epoch": 1.668062255392737, + "grad_norm": 16.625, + "learning_rate": 5.327466488562034e-06, + "loss": 1.8475511074066162, + "step": 9164 + }, + { + "epoch": 1.668426322016929, + "grad_norm": 18.875, + "learning_rate": 5.325956053982817e-06, + "loss": 1.1698594093322754, + "step": 9166 + }, + { + "epoch": 1.6687903886411213, + "grad_norm": 19.75, + "learning_rate": 5.324445718316815e-06, + "loss": 1.3467518091201782, + "step": 9168 + }, + { + "epoch": 1.6691544552653137, + "grad_norm": 8.6875, + "learning_rate": 5.322935481785669e-06, + "loss": 1.2459766864776611, + "step": 9170 + }, + { + "epoch": 1.669518521889506, + "grad_norm": 26.75, + "learning_rate": 5.321425344610995e-06, + "loss": 1.4066283702850342, + "step": 9172 + }, + { + "epoch": 1.669882588513698, + "grad_norm": 10.5, + "learning_rate": 5.319915307014402e-06, + "loss": 1.3927595615386963, + "step": 9174 + }, + { + "epoch": 1.6702466551378903, + "grad_norm": 8.0625, + "learning_rate": 5.318405369217483e-06, + "loss": 1.5326435565948486, + "step": 9176 + }, + { + "epoch": 1.6706107217620825, + "grad_norm": 38.25, + "learning_rate": 5.316895531441812e-06, + "loss": 1.6073873043060303, + "step": 9178 + }, + { + "epoch": 1.6709747883862747, + "grad_norm": 24.0, + "learning_rate": 5.315385793908956e-06, + "loss": 2.1608691215515137, + "step": 9180 + }, + { + "epoch": 1.6713388550104669, + "grad_norm": 12.0, + "learning_rate": 5.313876156840459e-06, + "loss": 1.6498310565948486, + "step": 9182 + }, + { + "epoch": 1.671702921634659, + "grad_norm": 42.5, + "learning_rate": 5.312366620457854e-06, + "loss": 1.7011055946350098, + "step": 9184 + }, + { + "epoch": 1.6720669882588513, + "grad_norm": 10.0625, + "learning_rate": 5.3108571849826615e-06, + "loss": 0.814986526966095, + "step": 9186 + }, + { + "epoch": 1.6724310548830434, + "grad_norm": 24.25, + "learning_rate": 5.309347850636384e-06, + "loss": 1.368643045425415, + "step": 9188 + }, + { + "epoch": 1.6727951215072359, + "grad_norm": 22.75, + "learning_rate": 5.307838617640512e-06, + "loss": 0.6063592433929443, + "step": 9190 + }, + { + "epoch": 1.673159188131428, + "grad_norm": 12.6875, + "learning_rate": 5.306329486216516e-06, + "loss": 1.4647926092147827, + "step": 9192 + }, + { + "epoch": 1.6735232547556202, + "grad_norm": 10.9375, + "learning_rate": 5.304820456585856e-06, + "loss": 1.5418574810028076, + "step": 9194 + }, + { + "epoch": 1.6738873213798127, + "grad_norm": 20.625, + "learning_rate": 5.303311528969979e-06, + "loss": 1.108280897140503, + "step": 9196 + }, + { + "epoch": 1.6742513880040049, + "grad_norm": 24.5, + "learning_rate": 5.301802703590311e-06, + "loss": 0.5406228303909302, + "step": 9198 + }, + { + "epoch": 1.674615454628197, + "grad_norm": 35.0, + "learning_rate": 5.300293980668266e-06, + "loss": 1.4860377311706543, + "step": 9200 + }, + { + "epoch": 1.6749795212523892, + "grad_norm": 12.375, + "learning_rate": 5.298785360425245e-06, + "loss": 1.735909104347229, + "step": 9202 + }, + { + "epoch": 1.6753435878765814, + "grad_norm": 5.78125, + "learning_rate": 5.297276843082628e-06, + "loss": 1.5042681694030762, + "step": 9204 + }, + { + "epoch": 1.6757076545007736, + "grad_norm": 12.375, + "learning_rate": 5.2957684288617895e-06, + "loss": 1.05649995803833, + "step": 9206 + }, + { + "epoch": 1.6760717211249658, + "grad_norm": 22.0, + "learning_rate": 5.294260117984077e-06, + "loss": 1.1301065683364868, + "step": 9208 + }, + { + "epoch": 1.676435787749158, + "grad_norm": 18.75, + "learning_rate": 5.292751910670835e-06, + "loss": 0.9675709009170532, + "step": 9210 + }, + { + "epoch": 1.6767998543733502, + "grad_norm": 10.625, + "learning_rate": 5.291243807143383e-06, + "loss": 1.2153215408325195, + "step": 9212 + }, + { + "epoch": 1.6771639209975424, + "grad_norm": 12.125, + "learning_rate": 5.289735807623028e-06, + "loss": 1.546874761581421, + "step": 9214 + }, + { + "epoch": 1.6775279876217348, + "grad_norm": 16.875, + "learning_rate": 5.288227912331068e-06, + "loss": 2.194091558456421, + "step": 9216 + }, + { + "epoch": 1.677892054245927, + "grad_norm": 3.75, + "learning_rate": 5.286720121488773e-06, + "loss": 1.0411674976348877, + "step": 9218 + }, + { + "epoch": 1.6782561208701192, + "grad_norm": 7.40625, + "learning_rate": 5.285212435317413e-06, + "loss": 1.4277840852737427, + "step": 9220 + }, + { + "epoch": 1.6786201874943114, + "grad_norm": 10.9375, + "learning_rate": 5.283704854038231e-06, + "loss": 1.3571604490280151, + "step": 9222 + }, + { + "epoch": 1.6789842541185038, + "grad_norm": 13.875, + "learning_rate": 5.282197377872458e-06, + "loss": 1.5189456939697266, + "step": 9224 + }, + { + "epoch": 1.679348320742696, + "grad_norm": 11.125, + "learning_rate": 5.2806900070413115e-06, + "loss": 1.382796287536621, + "step": 9226 + }, + { + "epoch": 1.6797123873668882, + "grad_norm": 6.625, + "learning_rate": 5.279182741765989e-06, + "loss": 1.0974713563919067, + "step": 9228 + }, + { + "epoch": 1.6800764539910804, + "grad_norm": 8.6875, + "learning_rate": 5.27767558226768e-06, + "loss": 1.148158311843872, + "step": 9230 + }, + { + "epoch": 1.6804405206152726, + "grad_norm": 3.75, + "learning_rate": 5.276168528767554e-06, + "loss": 0.6674026250839233, + "step": 9232 + }, + { + "epoch": 1.6808045872394648, + "grad_norm": 25.375, + "learning_rate": 5.2746615814867584e-06, + "loss": 1.2732802629470825, + "step": 9234 + }, + { + "epoch": 1.681168653863657, + "grad_norm": 9.3125, + "learning_rate": 5.27315474064644e-06, + "loss": 0.7327215075492859, + "step": 9236 + }, + { + "epoch": 1.6815327204878492, + "grad_norm": 15.3125, + "learning_rate": 5.271648006467716e-06, + "loss": 1.437905192375183, + "step": 9238 + }, + { + "epoch": 1.6818967871120414, + "grad_norm": 6.25, + "learning_rate": 5.270141379171696e-06, + "loss": 1.0680959224700928, + "step": 9240 + }, + { + "epoch": 1.6822608537362336, + "grad_norm": 23.0, + "learning_rate": 5.268634858979469e-06, + "loss": 1.5189906358718872, + "step": 9242 + }, + { + "epoch": 1.682624920360426, + "grad_norm": 10.375, + "learning_rate": 5.267128446112113e-06, + "loss": 0.9952925443649292, + "step": 9244 + }, + { + "epoch": 1.6829889869846182, + "grad_norm": 8.375, + "learning_rate": 5.2656221407906895e-06, + "loss": 1.3214839696884155, + "step": 9246 + }, + { + "epoch": 1.6833530536088104, + "grad_norm": 13.8125, + "learning_rate": 5.264115943236238e-06, + "loss": 1.367179274559021, + "step": 9248 + }, + { + "epoch": 1.6837171202330028, + "grad_norm": 18.25, + "learning_rate": 5.262609853669793e-06, + "loss": 1.3042641878128052, + "step": 9250 + }, + { + "epoch": 1.684081186857195, + "grad_norm": 12.25, + "learning_rate": 5.261103872312361e-06, + "loss": 1.419074535369873, + "step": 9252 + }, + { + "epoch": 1.6844452534813872, + "grad_norm": 31.75, + "learning_rate": 5.259597999384941e-06, + "loss": 1.2369577884674072, + "step": 9254 + }, + { + "epoch": 1.6848093201055794, + "grad_norm": 9.5625, + "learning_rate": 5.258092235108516e-06, + "loss": 1.074294924736023, + "step": 9256 + }, + { + "epoch": 1.6851733867297716, + "grad_norm": 6.84375, + "learning_rate": 5.256586579704046e-06, + "loss": 1.322725772857666, + "step": 9258 + }, + { + "epoch": 1.6855374533539638, + "grad_norm": 5.28125, + "learning_rate": 5.255081033392486e-06, + "loss": 1.184746503829956, + "step": 9260 + }, + { + "epoch": 1.685901519978156, + "grad_norm": 8.5625, + "learning_rate": 5.253575596394763e-06, + "loss": 1.1378341913223267, + "step": 9262 + }, + { + "epoch": 1.6862655866023482, + "grad_norm": 10.875, + "learning_rate": 5.2520702689317975e-06, + "loss": 1.686555027961731, + "step": 9264 + }, + { + "epoch": 1.6866296532265403, + "grad_norm": 7.625, + "learning_rate": 5.250565051224488e-06, + "loss": 1.6420520544052124, + "step": 9266 + }, + { + "epoch": 1.6869937198507325, + "grad_norm": 5.5, + "learning_rate": 5.24905994349372e-06, + "loss": 1.1952345371246338, + "step": 9268 + }, + { + "epoch": 1.687357786474925, + "grad_norm": 7.65625, + "learning_rate": 5.2475549459603625e-06, + "loss": 0.9739567041397095, + "step": 9270 + }, + { + "epoch": 1.6877218530991172, + "grad_norm": 29.75, + "learning_rate": 5.246050058845266e-06, + "loss": 1.4336153268814087, + "step": 9272 + }, + { + "epoch": 1.6880859197233093, + "grad_norm": 15.0, + "learning_rate": 5.24454528236927e-06, + "loss": 1.7244280576705933, + "step": 9274 + }, + { + "epoch": 1.6884499863475015, + "grad_norm": 9.375, + "learning_rate": 5.243040616753192e-06, + "loss": 1.275808334350586, + "step": 9276 + }, + { + "epoch": 1.688814052971694, + "grad_norm": 6.75, + "learning_rate": 5.2415360622178334e-06, + "loss": 1.4948639869689941, + "step": 9278 + }, + { + "epoch": 1.6891781195958862, + "grad_norm": 9.125, + "learning_rate": 5.240031618983987e-06, + "loss": 1.428116798400879, + "step": 9280 + }, + { + "epoch": 1.6895421862200783, + "grad_norm": 16.125, + "learning_rate": 5.238527287272419e-06, + "loss": 1.5895593166351318, + "step": 9282 + }, + { + "epoch": 1.6899062528442705, + "grad_norm": 12.25, + "learning_rate": 5.237023067303886e-06, + "loss": 1.3284916877746582, + "step": 9284 + }, + { + "epoch": 1.6902703194684627, + "grad_norm": 5.03125, + "learning_rate": 5.235518959299126e-06, + "loss": 1.3085006475448608, + "step": 9286 + }, + { + "epoch": 1.690634386092655, + "grad_norm": 7.0625, + "learning_rate": 5.2340149634788594e-06, + "loss": 1.1924309730529785, + "step": 9288 + }, + { + "epoch": 1.6909984527168471, + "grad_norm": 13.0, + "learning_rate": 5.232511080063793e-06, + "loss": 1.3618104457855225, + "step": 9290 + }, + { + "epoch": 1.6913625193410393, + "grad_norm": 3.9375, + "learning_rate": 5.231007309274616e-06, + "loss": 1.1392757892608643, + "step": 9292 + }, + { + "epoch": 1.6917265859652315, + "grad_norm": 18.0, + "learning_rate": 5.229503651332e-06, + "loss": 1.2429265975952148, + "step": 9294 + }, + { + "epoch": 1.6920906525894237, + "grad_norm": 111.5, + "learning_rate": 5.228000106456601e-06, + "loss": 0.9867711067199707, + "step": 9296 + }, + { + "epoch": 1.6924547192136161, + "grad_norm": 23.25, + "learning_rate": 5.226496674869055e-06, + "loss": 1.2803643941879272, + "step": 9298 + }, + { + "epoch": 1.6928187858378083, + "grad_norm": 8.75, + "learning_rate": 5.224993356789991e-06, + "loss": 1.7405083179473877, + "step": 9300 + }, + { + "epoch": 1.6931828524620005, + "grad_norm": 29.625, + "learning_rate": 5.22349015244001e-06, + "loss": 1.165168046951294, + "step": 9302 + }, + { + "epoch": 1.693546919086193, + "grad_norm": 15.3125, + "learning_rate": 5.2219870620397e-06, + "loss": 0.8452050089836121, + "step": 9304 + }, + { + "epoch": 1.6939109857103851, + "grad_norm": 12.875, + "learning_rate": 5.220484085809637e-06, + "loss": 1.5507532358169556, + "step": 9306 + }, + { + "epoch": 1.6942750523345773, + "grad_norm": 4.46875, + "learning_rate": 5.218981223970374e-06, + "loss": 1.3271949291229248, + "step": 9308 + }, + { + "epoch": 1.6946391189587695, + "grad_norm": 7.34375, + "learning_rate": 5.217478476742455e-06, + "loss": 1.477545976638794, + "step": 9310 + }, + { + "epoch": 1.6950031855829617, + "grad_norm": 8.8125, + "learning_rate": 5.215975844346395e-06, + "loss": 1.3406667709350586, + "step": 9312 + }, + { + "epoch": 1.695367252207154, + "grad_norm": 12.3125, + "learning_rate": 5.214473327002703e-06, + "loss": 1.5259292125701904, + "step": 9314 + }, + { + "epoch": 1.695731318831346, + "grad_norm": 33.5, + "learning_rate": 5.212970924931868e-06, + "loss": 1.3747682571411133, + "step": 9316 + }, + { + "epoch": 1.6960953854555383, + "grad_norm": 22.25, + "learning_rate": 5.211468638354358e-06, + "loss": 1.6125006675720215, + "step": 9318 + }, + { + "epoch": 1.6964594520797305, + "grad_norm": 9.5, + "learning_rate": 5.2099664674906325e-06, + "loss": 1.3490309715270996, + "step": 9320 + }, + { + "epoch": 1.6968235187039227, + "grad_norm": 37.25, + "learning_rate": 5.208464412561124e-06, + "loss": 1.115883469581604, + "step": 9322 + }, + { + "epoch": 1.697187585328115, + "grad_norm": 25.375, + "learning_rate": 5.206962473786254e-06, + "loss": 0.8189382553100586, + "step": 9324 + }, + { + "epoch": 1.6975516519523073, + "grad_norm": 27.0, + "learning_rate": 5.205460651386432e-06, + "loss": 1.2329597473144531, + "step": 9326 + }, + { + "epoch": 1.6979157185764995, + "grad_norm": 19.125, + "learning_rate": 5.2039589455820346e-06, + "loss": 1.3114888668060303, + "step": 9328 + }, + { + "epoch": 1.6982797852006917, + "grad_norm": 6.71875, + "learning_rate": 5.202457356593438e-06, + "loss": 1.1543605327606201, + "step": 9330 + }, + { + "epoch": 1.698643851824884, + "grad_norm": 13.75, + "learning_rate": 5.2009558846409925e-06, + "loss": 0.8102421760559082, + "step": 9332 + }, + { + "epoch": 1.6990079184490763, + "grad_norm": 54.0, + "learning_rate": 5.199454529945031e-06, + "loss": 0.7338652014732361, + "step": 9334 + }, + { + "epoch": 1.6993719850732685, + "grad_norm": 5.5, + "learning_rate": 5.197953292725875e-06, + "loss": 1.1436655521392822, + "step": 9336 + }, + { + "epoch": 1.6997360516974607, + "grad_norm": 23.75, + "learning_rate": 5.19645217320382e-06, + "loss": 1.6214869022369385, + "step": 9338 + }, + { + "epoch": 1.7001001183216529, + "grad_norm": 7.71875, + "learning_rate": 5.194951171599154e-06, + "loss": 1.4811253547668457, + "step": 9340 + }, + { + "epoch": 1.700464184945845, + "grad_norm": 13.3125, + "learning_rate": 5.193450288132141e-06, + "loss": 1.2000652551651, + "step": 9342 + }, + { + "epoch": 1.7008282515700373, + "grad_norm": 12.0625, + "learning_rate": 5.191949523023029e-06, + "loss": 1.1856586933135986, + "step": 9344 + }, + { + "epoch": 1.7011923181942294, + "grad_norm": 10.6875, + "learning_rate": 5.190448876492051e-06, + "loss": 1.1105928421020508, + "step": 9346 + }, + { + "epoch": 1.7015563848184216, + "grad_norm": 7.9375, + "learning_rate": 5.188948348759418e-06, + "loss": 1.2888965606689453, + "step": 9348 + }, + { + "epoch": 1.7019204514426138, + "grad_norm": 17.25, + "learning_rate": 5.187447940045329e-06, + "loss": 1.5396370887756348, + "step": 9350 + }, + { + "epoch": 1.7022845180668063, + "grad_norm": 24.125, + "learning_rate": 5.185947650569961e-06, + "loss": 1.5175641775131226, + "step": 9352 + }, + { + "epoch": 1.7026485846909984, + "grad_norm": 16.25, + "learning_rate": 5.184447480553476e-06, + "loss": 1.5482220649719238, + "step": 9354 + }, + { + "epoch": 1.7030126513151906, + "grad_norm": 11.1875, + "learning_rate": 5.182947430216019e-06, + "loss": 1.1901549100875854, + "step": 9356 + }, + { + "epoch": 1.703376717939383, + "grad_norm": 8.0625, + "learning_rate": 5.181447499777714e-06, + "loss": 1.4369940757751465, + "step": 9358 + }, + { + "epoch": 1.7037407845635753, + "grad_norm": 23.0, + "learning_rate": 5.179947689458673e-06, + "loss": 1.5072280168533325, + "step": 9360 + }, + { + "epoch": 1.7041048511877674, + "grad_norm": 11.375, + "learning_rate": 5.178447999478985e-06, + "loss": 1.5146267414093018, + "step": 9362 + }, + { + "epoch": 1.7044689178119596, + "grad_norm": 18.875, + "learning_rate": 5.1769484300587215e-06, + "loss": 1.6368751525878906, + "step": 9364 + }, + { + "epoch": 1.7048329844361518, + "grad_norm": 5.78125, + "learning_rate": 5.175448981417943e-06, + "loss": 0.9928371906280518, + "step": 9366 + }, + { + "epoch": 1.705197051060344, + "grad_norm": 6.65625, + "learning_rate": 5.173949653776683e-06, + "loss": 1.5561423301696777, + "step": 9368 + }, + { + "epoch": 1.7055611176845362, + "grad_norm": 11.6875, + "learning_rate": 5.172450447354966e-06, + "loss": 1.412014365196228, + "step": 9370 + }, + { + "epoch": 1.7059251843087284, + "grad_norm": 7.8125, + "learning_rate": 5.170951362372789e-06, + "loss": 1.345872402191162, + "step": 9372 + }, + { + "epoch": 1.7062892509329206, + "grad_norm": 7.125, + "learning_rate": 5.169452399050141e-06, + "loss": 1.2993645668029785, + "step": 9374 + }, + { + "epoch": 1.7066533175571128, + "grad_norm": 6.09375, + "learning_rate": 5.167953557606988e-06, + "loss": 1.3439033031463623, + "step": 9376 + }, + { + "epoch": 1.7070173841813052, + "grad_norm": 15.0625, + "learning_rate": 5.166454838263278e-06, + "loss": 1.2377959489822388, + "step": 9378 + }, + { + "epoch": 1.7073814508054974, + "grad_norm": 7.90625, + "learning_rate": 5.164956241238943e-06, + "loss": 1.341631293296814, + "step": 9380 + }, + { + "epoch": 1.7077455174296896, + "grad_norm": 14.8125, + "learning_rate": 5.163457766753894e-06, + "loss": 1.4095263481140137, + "step": 9382 + }, + { + "epoch": 1.708109584053882, + "grad_norm": 29.875, + "learning_rate": 5.161959415028028e-06, + "loss": 1.945432186126709, + "step": 9384 + }, + { + "epoch": 1.7084736506780742, + "grad_norm": 17.125, + "learning_rate": 5.160461186281224e-06, + "loss": 1.323392629623413, + "step": 9386 + }, + { + "epoch": 1.7088377173022664, + "grad_norm": 6.34375, + "learning_rate": 5.158963080733335e-06, + "loss": 1.144814372062683, + "step": 9388 + }, + { + "epoch": 1.7092017839264586, + "grad_norm": 6.8125, + "learning_rate": 5.1574650986042085e-06, + "loss": 1.1991299390792847, + "step": 9390 + }, + { + "epoch": 1.7095658505506508, + "grad_norm": 7.90625, + "learning_rate": 5.155967240113662e-06, + "loss": 1.3379712104797363, + "step": 9392 + }, + { + "epoch": 1.709929917174843, + "grad_norm": 19.625, + "learning_rate": 5.154469505481503e-06, + "loss": 1.4230942726135254, + "step": 9394 + }, + { + "epoch": 1.7102939837990352, + "grad_norm": 7.84375, + "learning_rate": 5.152971894927518e-06, + "loss": 1.030795931816101, + "step": 9396 + }, + { + "epoch": 1.7106580504232274, + "grad_norm": 12.0625, + "learning_rate": 5.151474408671475e-06, + "loss": 0.7345353364944458, + "step": 9398 + }, + { + "epoch": 1.7110221170474196, + "grad_norm": 13.6875, + "learning_rate": 5.1499770469331235e-06, + "loss": 1.1238645315170288, + "step": 9400 + }, + { + "epoch": 1.7113861836716118, + "grad_norm": 13.0, + "learning_rate": 5.148479809932195e-06, + "loss": 1.5702509880065918, + "step": 9402 + }, + { + "epoch": 1.711750250295804, + "grad_norm": 16.125, + "learning_rate": 5.146982697888403e-06, + "loss": 1.2398544549942017, + "step": 9404 + }, + { + "epoch": 1.7121143169199964, + "grad_norm": 20.875, + "learning_rate": 5.145485711021445e-06, + "loss": 1.4667586088180542, + "step": 9406 + }, + { + "epoch": 1.7124783835441886, + "grad_norm": 13.625, + "learning_rate": 5.143988849550994e-06, + "loss": 1.22396981716156, + "step": 9408 + }, + { + "epoch": 1.7128424501683808, + "grad_norm": 13.8125, + "learning_rate": 5.142492113696711e-06, + "loss": 1.5100560188293457, + "step": 9410 + }, + { + "epoch": 1.7132065167925732, + "grad_norm": 10.4375, + "learning_rate": 5.140995503678235e-06, + "loss": 1.011368989944458, + "step": 9412 + }, + { + "epoch": 1.7135705834167654, + "grad_norm": 11.9375, + "learning_rate": 5.139499019715188e-06, + "loss": 1.2850549221038818, + "step": 9414 + }, + { + "epoch": 1.7139346500409576, + "grad_norm": 28.625, + "learning_rate": 5.138002662027173e-06, + "loss": 0.9053092002868652, + "step": 9416 + }, + { + "epoch": 1.7142987166651498, + "grad_norm": 12.75, + "learning_rate": 5.136506430833772e-06, + "loss": 1.3963650465011597, + "step": 9418 + }, + { + "epoch": 1.714662783289342, + "grad_norm": 8.3125, + "learning_rate": 5.135010326354555e-06, + "loss": 1.2343649864196777, + "step": 9420 + }, + { + "epoch": 1.7150268499135342, + "grad_norm": 10.625, + "learning_rate": 5.1335143488090656e-06, + "loss": 1.386676549911499, + "step": 9422 + }, + { + "epoch": 1.7153909165377264, + "grad_norm": 7.625, + "learning_rate": 5.132018498416833e-06, + "loss": 1.2937157154083252, + "step": 9424 + }, + { + "epoch": 1.7157549831619185, + "grad_norm": 14.5625, + "learning_rate": 5.130522775397371e-06, + "loss": 0.9588303565979004, + "step": 9426 + }, + { + "epoch": 1.7161190497861107, + "grad_norm": 16.0, + "learning_rate": 5.129027179970165e-06, + "loss": 1.663581132888794, + "step": 9428 + }, + { + "epoch": 1.716483116410303, + "grad_norm": 13.875, + "learning_rate": 5.1275317123546945e-06, + "loss": 1.381098747253418, + "step": 9430 + }, + { + "epoch": 1.7168471830344953, + "grad_norm": 10.75, + "learning_rate": 5.126036372770407e-06, + "loss": 1.3656188249588013, + "step": 9432 + }, + { + "epoch": 1.7172112496586875, + "grad_norm": 5.53125, + "learning_rate": 5.124541161436738e-06, + "loss": 0.9090326428413391, + "step": 9434 + }, + { + "epoch": 1.7175753162828797, + "grad_norm": 13.75, + "learning_rate": 5.123046078573109e-06, + "loss": 0.985514760017395, + "step": 9436 + }, + { + "epoch": 1.7179393829070722, + "grad_norm": 15.9375, + "learning_rate": 5.121551124398912e-06, + "loss": 0.6309006214141846, + "step": 9438 + }, + { + "epoch": 1.7183034495312643, + "grad_norm": 16.5, + "learning_rate": 5.120056299133529e-06, + "loss": 1.5923762321472168, + "step": 9440 + }, + { + "epoch": 1.7186675161554565, + "grad_norm": 20.625, + "learning_rate": 5.118561602996317e-06, + "loss": 1.670379400253296, + "step": 9442 + }, + { + "epoch": 1.7190315827796487, + "grad_norm": 11.25, + "learning_rate": 5.1170670362066175e-06, + "loss": 1.2797925472259521, + "step": 9444 + }, + { + "epoch": 1.719395649403841, + "grad_norm": 6.8125, + "learning_rate": 5.115572598983753e-06, + "loss": 1.2918821573257446, + "step": 9446 + }, + { + "epoch": 1.7197597160280331, + "grad_norm": 10.25, + "learning_rate": 5.114078291547024e-06, + "loss": 1.371572732925415, + "step": 9448 + }, + { + "epoch": 1.7201237826522253, + "grad_norm": 6.75, + "learning_rate": 5.112584114115717e-06, + "loss": 1.7317709922790527, + "step": 9450 + }, + { + "epoch": 1.7204878492764175, + "grad_norm": 4.34375, + "learning_rate": 5.1110900669090945e-06, + "loss": 1.1518534421920776, + "step": 9452 + }, + { + "epoch": 1.7208519159006097, + "grad_norm": 9.25, + "learning_rate": 5.109596150146401e-06, + "loss": 1.1583870649337769, + "step": 9454 + }, + { + "epoch": 1.721215982524802, + "grad_norm": 30.0, + "learning_rate": 5.1081023640468654e-06, + "loss": 1.1486270427703857, + "step": 9456 + }, + { + "epoch": 1.7215800491489943, + "grad_norm": 6.1875, + "learning_rate": 5.106608708829693e-06, + "loss": 1.5493502616882324, + "step": 9458 + }, + { + "epoch": 1.7219441157731865, + "grad_norm": 23.625, + "learning_rate": 5.105115184714071e-06, + "loss": 1.1435967683792114, + "step": 9460 + }, + { + "epoch": 1.7223081823973787, + "grad_norm": 27.625, + "learning_rate": 5.10362179191917e-06, + "loss": 1.833583950996399, + "step": 9462 + }, + { + "epoch": 1.722672249021571, + "grad_norm": 9.125, + "learning_rate": 5.102128530664136e-06, + "loss": 1.5355030298233032, + "step": 9464 + }, + { + "epoch": 1.7230363156457633, + "grad_norm": 27.75, + "learning_rate": 5.1006354011681055e-06, + "loss": 1.1746819019317627, + "step": 9466 + }, + { + "epoch": 1.7234003822699555, + "grad_norm": 8.9375, + "learning_rate": 5.0991424036501814e-06, + "loss": 0.8762860894203186, + "step": 9468 + }, + { + "epoch": 1.7237644488941477, + "grad_norm": 7.78125, + "learning_rate": 5.097649538329461e-06, + "loss": 1.2093828916549683, + "step": 9470 + }, + { + "epoch": 1.72412851551834, + "grad_norm": 6.53125, + "learning_rate": 5.096156805425014e-06, + "loss": 1.2469215393066406, + "step": 9472 + }, + { + "epoch": 1.724492582142532, + "grad_norm": 16.125, + "learning_rate": 5.094664205155891e-06, + "loss": 1.2745331525802612, + "step": 9474 + }, + { + "epoch": 1.7248566487667243, + "grad_norm": 8.9375, + "learning_rate": 5.093171737741129e-06, + "loss": 1.4616479873657227, + "step": 9476 + }, + { + "epoch": 1.7252207153909165, + "grad_norm": 7.3125, + "learning_rate": 5.0916794033997375e-06, + "loss": 1.0668138265609741, + "step": 9478 + }, + { + "epoch": 1.7255847820151087, + "grad_norm": 49.0, + "learning_rate": 5.090187202350714e-06, + "loss": 1.4169104099273682, + "step": 9480 + }, + { + "epoch": 1.7259488486393009, + "grad_norm": 5.84375, + "learning_rate": 5.088695134813031e-06, + "loss": 1.3103148937225342, + "step": 9482 + }, + { + "epoch": 1.726312915263493, + "grad_norm": 14.4375, + "learning_rate": 5.087203201005642e-06, + "loss": 1.83831787109375, + "step": 9484 + }, + { + "epoch": 1.7266769818876855, + "grad_norm": 11.5, + "learning_rate": 5.085711401147486e-06, + "loss": 1.5398776531219482, + "step": 9486 + }, + { + "epoch": 1.7270410485118777, + "grad_norm": 61.5, + "learning_rate": 5.084219735457476e-06, + "loss": 1.3492802381515503, + "step": 9488 + }, + { + "epoch": 1.7274051151360699, + "grad_norm": 13.8125, + "learning_rate": 5.08272820415451e-06, + "loss": 1.3110461235046387, + "step": 9490 + }, + { + "epoch": 1.7277691817602623, + "grad_norm": 22.5, + "learning_rate": 5.081236807457461e-06, + "loss": 1.5548452138900757, + "step": 9492 + }, + { + "epoch": 1.7281332483844545, + "grad_norm": 21.875, + "learning_rate": 5.0797455455851865e-06, + "loss": 1.5569431781768799, + "step": 9494 + }, + { + "epoch": 1.7284973150086467, + "grad_norm": 16.5, + "learning_rate": 5.0782544187565255e-06, + "loss": 1.5821748971939087, + "step": 9496 + }, + { + "epoch": 1.7288613816328389, + "grad_norm": 14.375, + "learning_rate": 5.076763427190291e-06, + "loss": 1.7407346963882446, + "step": 9498 + }, + { + "epoch": 1.729225448257031, + "grad_norm": 7.53125, + "learning_rate": 5.0752725711052846e-06, + "loss": 1.4714140892028809, + "step": 9500 + }, + { + "epoch": 1.7295895148812233, + "grad_norm": 7.5, + "learning_rate": 5.073781850720278e-06, + "loss": 1.117492437362671, + "step": 9502 + }, + { + "epoch": 1.7299535815054154, + "grad_norm": 6.3125, + "learning_rate": 5.072291266254033e-06, + "loss": 1.2635581493377686, + "step": 9504 + }, + { + "epoch": 1.7303176481296076, + "grad_norm": 350.0, + "learning_rate": 5.070800817925286e-06, + "loss": 1.3402974605560303, + "step": 9506 + }, + { + "epoch": 1.7306817147537998, + "grad_norm": 12.125, + "learning_rate": 5.069310505952749e-06, + "loss": 1.6758350133895874, + "step": 9508 + }, + { + "epoch": 1.731045781377992, + "grad_norm": 19.75, + "learning_rate": 5.0678203305551274e-06, + "loss": 1.5974210500717163, + "step": 9510 + }, + { + "epoch": 1.7314098480021844, + "grad_norm": 27.75, + "learning_rate": 5.0663302919510935e-06, + "loss": 0.6250400543212891, + "step": 9512 + }, + { + "epoch": 1.7317739146263766, + "grad_norm": 30.125, + "learning_rate": 5.064840390359305e-06, + "loss": 0.9814783334732056, + "step": 9514 + }, + { + "epoch": 1.7321379812505688, + "grad_norm": 21.875, + "learning_rate": 5.0633506259984e-06, + "loss": 1.5342494249343872, + "step": 9516 + }, + { + "epoch": 1.732502047874761, + "grad_norm": 16.375, + "learning_rate": 5.061860999086994e-06, + "loss": 1.9756970405578613, + "step": 9518 + }, + { + "epoch": 1.7328661144989534, + "grad_norm": 22.875, + "learning_rate": 5.060371509843684e-06, + "loss": 1.25270414352417, + "step": 9520 + }, + { + "epoch": 1.7332301811231456, + "grad_norm": 15.3125, + "learning_rate": 5.058882158487049e-06, + "loss": 1.0654053688049316, + "step": 9522 + }, + { + "epoch": 1.7335942477473378, + "grad_norm": 7.75, + "learning_rate": 5.057392945235642e-06, + "loss": 1.390311598777771, + "step": 9524 + }, + { + "epoch": 1.73395831437153, + "grad_norm": 7.75, + "learning_rate": 5.055903870308001e-06, + "loss": 0.8419263362884521, + "step": 9526 + }, + { + "epoch": 1.7343223809957222, + "grad_norm": 8.125, + "learning_rate": 5.0544149339226375e-06, + "loss": 1.2089424133300781, + "step": 9528 + }, + { + "epoch": 1.7346864476199144, + "grad_norm": 8.9375, + "learning_rate": 5.052926136298055e-06, + "loss": 0.4906526803970337, + "step": 9530 + }, + { + "epoch": 1.7350505142441066, + "grad_norm": 9.9375, + "learning_rate": 5.051437477652721e-06, + "loss": 1.4194902181625366, + "step": 9532 + }, + { + "epoch": 1.7354145808682988, + "grad_norm": 15.875, + "learning_rate": 5.049948958205093e-06, + "loss": 1.4018341302871704, + "step": 9534 + }, + { + "epoch": 1.735778647492491, + "grad_norm": 9.875, + "learning_rate": 5.048460578173607e-06, + "loss": 1.3147599697113037, + "step": 9536 + }, + { + "epoch": 1.7361427141166832, + "grad_norm": 7.96875, + "learning_rate": 5.046972337776673e-06, + "loss": 1.4243731498718262, + "step": 9538 + }, + { + "epoch": 1.7365067807408756, + "grad_norm": 10.25, + "learning_rate": 5.045484237232687e-06, + "loss": 1.4286456108093262, + "step": 9540 + }, + { + "epoch": 1.7368708473650678, + "grad_norm": 9.5625, + "learning_rate": 5.0439962767600214e-06, + "loss": 1.455309271812439, + "step": 9542 + }, + { + "epoch": 1.73723491398926, + "grad_norm": 17.375, + "learning_rate": 5.0425084565770266e-06, + "loss": 1.3779274225234985, + "step": 9544 + }, + { + "epoch": 1.7375989806134524, + "grad_norm": 22.25, + "learning_rate": 5.041020776902037e-06, + "loss": 1.2114263772964478, + "step": 9546 + }, + { + "epoch": 1.7379630472376446, + "grad_norm": 16.625, + "learning_rate": 5.03953323795336e-06, + "loss": 1.4143348932266235, + "step": 9548 + }, + { + "epoch": 1.7383271138618368, + "grad_norm": 35.0, + "learning_rate": 5.038045839949291e-06, + "loss": 1.9574635028839111, + "step": 9550 + }, + { + "epoch": 1.738691180486029, + "grad_norm": 8.125, + "learning_rate": 5.036558583108093e-06, + "loss": 1.0166016817092896, + "step": 9552 + }, + { + "epoch": 1.7390552471102212, + "grad_norm": 14.625, + "learning_rate": 5.03507146764802e-06, + "loss": 1.1967015266418457, + "step": 9554 + }, + { + "epoch": 1.7394193137344134, + "grad_norm": 14.6875, + "learning_rate": 5.0335844937873e-06, + "loss": 1.814778447151184, + "step": 9556 + }, + { + "epoch": 1.7397833803586056, + "grad_norm": 13.8125, + "learning_rate": 5.032097661744135e-06, + "loss": 1.4534716606140137, + "step": 9558 + }, + { + "epoch": 1.7401474469827978, + "grad_norm": 50.5, + "learning_rate": 5.03061097173672e-06, + "loss": 1.4580074548721313, + "step": 9560 + }, + { + "epoch": 1.74051151360699, + "grad_norm": 10.75, + "learning_rate": 5.029124423983215e-06, + "loss": 1.4458574056625366, + "step": 9562 + }, + { + "epoch": 1.7408755802311822, + "grad_norm": 6.75, + "learning_rate": 5.027638018701764e-06, + "loss": 1.387359857559204, + "step": 9564 + }, + { + "epoch": 1.7412396468553746, + "grad_norm": 37.5, + "learning_rate": 5.026151756110496e-06, + "loss": 1.1520726680755615, + "step": 9566 + }, + { + "epoch": 1.7416037134795668, + "grad_norm": 14.9375, + "learning_rate": 5.024665636427509e-06, + "loss": 1.3024561405181885, + "step": 9568 + }, + { + "epoch": 1.741967780103759, + "grad_norm": 12.875, + "learning_rate": 5.023179659870889e-06, + "loss": 1.454302191734314, + "step": 9570 + }, + { + "epoch": 1.7423318467279512, + "grad_norm": 21.125, + "learning_rate": 5.0216938266586934e-06, + "loss": 1.5647664070129395, + "step": 9572 + }, + { + "epoch": 1.7426959133521436, + "grad_norm": 8.75, + "learning_rate": 5.020208137008965e-06, + "loss": 1.5302326679229736, + "step": 9574 + }, + { + "epoch": 1.7430599799763358, + "grad_norm": 15.3125, + "learning_rate": 5.018722591139722e-06, + "loss": 1.4510908126831055, + "step": 9576 + }, + { + "epoch": 1.743424046600528, + "grad_norm": 11.3125, + "learning_rate": 5.017237189268961e-06, + "loss": 0.5014241933822632, + "step": 9578 + }, + { + "epoch": 1.7437881132247202, + "grad_norm": 13.4375, + "learning_rate": 5.015751931614661e-06, + "loss": 1.3752236366271973, + "step": 9580 + }, + { + "epoch": 1.7441521798489124, + "grad_norm": 40.25, + "learning_rate": 5.014266818394775e-06, + "loss": 1.66645085811615, + "step": 9582 + }, + { + "epoch": 1.7445162464731045, + "grad_norm": 6.03125, + "learning_rate": 5.01278184982724e-06, + "loss": 1.3075840473175049, + "step": 9584 + }, + { + "epoch": 1.7448803130972967, + "grad_norm": 15.375, + "learning_rate": 5.011297026129967e-06, + "loss": 1.4868955612182617, + "step": 9586 + }, + { + "epoch": 1.745244379721489, + "grad_norm": 13.375, + "learning_rate": 5.009812347520846e-06, + "loss": 1.9733105897903442, + "step": 9588 + }, + { + "epoch": 1.7456084463456811, + "grad_norm": 26.625, + "learning_rate": 5.008327814217755e-06, + "loss": 1.2591856718063354, + "step": 9590 + }, + { + "epoch": 1.7459725129698733, + "grad_norm": 9.25, + "learning_rate": 5.006843426438534e-06, + "loss": 1.5363489389419556, + "step": 9592 + }, + { + "epoch": 1.7463365795940657, + "grad_norm": 10.3125, + "learning_rate": 5.005359184401017e-06, + "loss": 1.3313227891921997, + "step": 9594 + }, + { + "epoch": 1.746700646218258, + "grad_norm": 18.625, + "learning_rate": 5.003875088323009e-06, + "loss": 1.4109573364257812, + "step": 9596 + }, + { + "epoch": 1.7470647128424501, + "grad_norm": 14.25, + "learning_rate": 5.0023911384222955e-06, + "loss": 1.4049272537231445, + "step": 9598 + }, + { + "epoch": 1.7474287794666425, + "grad_norm": 10.375, + "learning_rate": 5.000907334916638e-06, + "loss": 1.351060390472412, + "step": 9600 + }, + { + "epoch": 1.7477928460908347, + "grad_norm": 6.25, + "learning_rate": 4.999423678023782e-06, + "loss": 1.3122403621673584, + "step": 9602 + }, + { + "epoch": 1.748156912715027, + "grad_norm": 3.84375, + "learning_rate": 4.997940167961444e-06, + "loss": 0.9075431227684021, + "step": 9604 + }, + { + "epoch": 1.7485209793392191, + "grad_norm": 23.625, + "learning_rate": 4.9964568049473275e-06, + "loss": 1.1615616083145142, + "step": 9606 + }, + { + "epoch": 1.7488850459634113, + "grad_norm": 20.625, + "learning_rate": 4.9949735891991055e-06, + "loss": 0.8063016533851624, + "step": 9608 + }, + { + "epoch": 1.7492491125876035, + "grad_norm": 10.9375, + "learning_rate": 4.993490520934438e-06, + "loss": 1.422484278678894, + "step": 9610 + }, + { + "epoch": 1.7496131792117957, + "grad_norm": 35.25, + "learning_rate": 4.9920076003709575e-06, + "loss": 1.580983281135559, + "step": 9612 + }, + { + "epoch": 1.749977245835988, + "grad_norm": 10.4375, + "learning_rate": 4.990524827726275e-06, + "loss": 1.4442027807235718, + "step": 9614 + }, + { + "epoch": 1.75034131246018, + "grad_norm": 29.375, + "learning_rate": 4.9890422032179855e-06, + "loss": 1.4181630611419678, + "step": 9616 + }, + { + "epoch": 1.7507053790843723, + "grad_norm": 9.5, + "learning_rate": 4.987559727063653e-06, + "loss": 1.2040762901306152, + "step": 9618 + }, + { + "epoch": 1.7510694457085647, + "grad_norm": 13.0, + "learning_rate": 4.9860773994808295e-06, + "loss": 1.7428909540176392, + "step": 9620 + }, + { + "epoch": 1.751433512332757, + "grad_norm": 16.375, + "learning_rate": 4.984595220687038e-06, + "loss": 1.8364250659942627, + "step": 9622 + }, + { + "epoch": 1.751797578956949, + "grad_norm": 6.96875, + "learning_rate": 4.983113190899782e-06, + "loss": 1.2444138526916504, + "step": 9624 + }, + { + "epoch": 1.7521616455811415, + "grad_norm": 7.34375, + "learning_rate": 4.981631310336546e-06, + "loss": 0.8715922832489014, + "step": 9626 + }, + { + "epoch": 1.7525257122053337, + "grad_norm": 6.03125, + "learning_rate": 4.980149579214786e-06, + "loss": 1.0471537113189697, + "step": 9628 + }, + { + "epoch": 1.752889778829526, + "grad_norm": 15.375, + "learning_rate": 4.9786679977519435e-06, + "loss": 1.3787966966629028, + "step": 9630 + }, + { + "epoch": 1.753253845453718, + "grad_norm": 9.125, + "learning_rate": 4.9771865661654325e-06, + "loss": 1.387390375137329, + "step": 9632 + }, + { + "epoch": 1.7536179120779103, + "grad_norm": 4.625, + "learning_rate": 4.975705284672647e-06, + "loss": 1.1727123260498047, + "step": 9634 + }, + { + "epoch": 1.7539819787021025, + "grad_norm": 12.5, + "learning_rate": 4.974224153490963e-06, + "loss": 1.7207012176513672, + "step": 9636 + }, + { + "epoch": 1.7543460453262947, + "grad_norm": 10.6875, + "learning_rate": 4.972743172837724e-06, + "loss": 1.40591299533844, + "step": 9638 + }, + { + "epoch": 1.7547101119504869, + "grad_norm": 12.5625, + "learning_rate": 4.971262342930263e-06, + "loss": 1.2562315464019775, + "step": 9640 + }, + { + "epoch": 1.755074178574679, + "grad_norm": 13.375, + "learning_rate": 4.969781663985884e-06, + "loss": 1.6212990283966064, + "step": 9642 + }, + { + "epoch": 1.7554382451988713, + "grad_norm": 11.0, + "learning_rate": 4.9683011362218695e-06, + "loss": 0.8833602666854858, + "step": 9644 + }, + { + "epoch": 1.7558023118230635, + "grad_norm": 10.625, + "learning_rate": 4.966820759855484e-06, + "loss": 1.2150349617004395, + "step": 9646 + }, + { + "epoch": 1.7561663784472559, + "grad_norm": 26.375, + "learning_rate": 4.965340535103964e-06, + "loss": 1.778388500213623, + "step": 9648 + }, + { + "epoch": 1.756530445071448, + "grad_norm": 6.3125, + "learning_rate": 4.963860462184527e-06, + "loss": 1.2072433233261108, + "step": 9650 + }, + { + "epoch": 1.7568945116956403, + "grad_norm": 8.5625, + "learning_rate": 4.962380541314369e-06, + "loss": 1.3482211828231812, + "step": 9652 + }, + { + "epoch": 1.7572585783198327, + "grad_norm": 6.875, + "learning_rate": 4.96090077271066e-06, + "loss": 1.3736768960952759, + "step": 9654 + }, + { + "epoch": 1.7576226449440249, + "grad_norm": 13.0625, + "learning_rate": 4.9594211565905535e-06, + "loss": 1.7501568794250488, + "step": 9656 + }, + { + "epoch": 1.757986711568217, + "grad_norm": 16.25, + "learning_rate": 4.957941693171173e-06, + "loss": 1.4537973403930664, + "step": 9658 + }, + { + "epoch": 1.7583507781924093, + "grad_norm": 13.625, + "learning_rate": 4.956462382669627e-06, + "loss": 1.5427837371826172, + "step": 9660 + }, + { + "epoch": 1.7587148448166015, + "grad_norm": 45.0, + "learning_rate": 4.954983225302998e-06, + "loss": 1.3104393482208252, + "step": 9662 + }, + { + "epoch": 1.7590789114407936, + "grad_norm": 6.0, + "learning_rate": 4.953504221288344e-06, + "loss": 1.290662407875061, + "step": 9664 + }, + { + "epoch": 1.7594429780649858, + "grad_norm": 18.375, + "learning_rate": 4.952025370842706e-06, + "loss": 1.1948498487472534, + "step": 9666 + }, + { + "epoch": 1.759807044689178, + "grad_norm": 17.75, + "learning_rate": 4.950546674183096e-06, + "loss": 1.5208001136779785, + "step": 9668 + }, + { + "epoch": 1.7601711113133702, + "grad_norm": 16.5, + "learning_rate": 4.94906813152651e-06, + "loss": 1.3499042987823486, + "step": 9670 + }, + { + "epoch": 1.7605351779375624, + "grad_norm": 9.5625, + "learning_rate": 4.947589743089916e-06, + "loss": 1.2112205028533936, + "step": 9672 + }, + { + "epoch": 1.7608992445617548, + "grad_norm": 11.0625, + "learning_rate": 4.946111509090262e-06, + "loss": 1.1928448677062988, + "step": 9674 + }, + { + "epoch": 1.761263311185947, + "grad_norm": 14.125, + "learning_rate": 4.944633429744474e-06, + "loss": 1.924996256828308, + "step": 9676 + }, + { + "epoch": 1.7616273778101392, + "grad_norm": 22.875, + "learning_rate": 4.9431555052694516e-06, + "loss": 1.5360736846923828, + "step": 9678 + }, + { + "epoch": 1.7619914444343316, + "grad_norm": 15.625, + "learning_rate": 4.941677735882078e-06, + "loss": 1.4673950672149658, + "step": 9680 + }, + { + "epoch": 1.7623555110585238, + "grad_norm": 7.46875, + "learning_rate": 4.940200121799206e-06, + "loss": 1.2095304727554321, + "step": 9682 + }, + { + "epoch": 1.762719577682716, + "grad_norm": 8.375, + "learning_rate": 4.93872266323767e-06, + "loss": 1.2931064367294312, + "step": 9684 + }, + { + "epoch": 1.7630836443069082, + "grad_norm": 7.4375, + "learning_rate": 4.937245360414285e-06, + "loss": 1.0370876789093018, + "step": 9686 + }, + { + "epoch": 1.7634477109311004, + "grad_norm": 15.375, + "learning_rate": 4.935768213545834e-06, + "loss": 1.4221354722976685, + "step": 9688 + }, + { + "epoch": 1.7638117775552926, + "grad_norm": 8.375, + "learning_rate": 4.934291222849086e-06, + "loss": 1.0306072235107422, + "step": 9690 + }, + { + "epoch": 1.7641758441794848, + "grad_norm": 8.9375, + "learning_rate": 4.932814388540783e-06, + "loss": 1.6003035306930542, + "step": 9692 + }, + { + "epoch": 1.764539910803677, + "grad_norm": 10.1875, + "learning_rate": 4.9313377108376405e-06, + "loss": 1.466977834701538, + "step": 9694 + }, + { + "epoch": 1.7649039774278692, + "grad_norm": 15.6875, + "learning_rate": 4.929861189956362e-06, + "loss": 1.6034886837005615, + "step": 9696 + }, + { + "epoch": 1.7652680440520614, + "grad_norm": 18.125, + "learning_rate": 4.928384826113613e-06, + "loss": 1.4333608150482178, + "step": 9698 + }, + { + "epoch": 1.7656321106762538, + "grad_norm": 15.4375, + "learning_rate": 4.926908619526051e-06, + "loss": 1.4016716480255127, + "step": 9700 + }, + { + "epoch": 1.765996177300446, + "grad_norm": 6.5, + "learning_rate": 4.925432570410299e-06, + "loss": 1.491098403930664, + "step": 9702 + }, + { + "epoch": 1.7663602439246382, + "grad_norm": 11.1875, + "learning_rate": 4.923956678982962e-06, + "loss": 1.1578155755996704, + "step": 9704 + }, + { + "epoch": 1.7667243105488304, + "grad_norm": 8.5625, + "learning_rate": 4.922480945460623e-06, + "loss": 1.3627545833587646, + "step": 9706 + }, + { + "epoch": 1.7670883771730228, + "grad_norm": 8.9375, + "learning_rate": 4.921005370059836e-06, + "loss": 1.1596611738204956, + "step": 9708 + }, + { + "epoch": 1.767452443797215, + "grad_norm": 15.25, + "learning_rate": 4.91952995299714e-06, + "loss": 1.0563263893127441, + "step": 9710 + }, + { + "epoch": 1.7678165104214072, + "grad_norm": 7.125, + "learning_rate": 4.918054694489045e-06, + "loss": 1.2106642723083496, + "step": 9712 + }, + { + "epoch": 1.7681805770455994, + "grad_norm": 6.25, + "learning_rate": 4.916579594752037e-06, + "loss": 1.1421802043914795, + "step": 9714 + }, + { + "epoch": 1.7685446436697916, + "grad_norm": 10.1875, + "learning_rate": 4.915104654002585e-06, + "loss": 1.6308321952819824, + "step": 9716 + }, + { + "epoch": 1.7689087102939838, + "grad_norm": 27.5, + "learning_rate": 4.9136298724571265e-06, + "loss": 1.9257757663726807, + "step": 9718 + }, + { + "epoch": 1.769272776918176, + "grad_norm": 7.375, + "learning_rate": 4.912155250332082e-06, + "loss": 1.3204705715179443, + "step": 9720 + }, + { + "epoch": 1.7696368435423682, + "grad_norm": 17.125, + "learning_rate": 4.9106807878438465e-06, + "loss": 1.5168133974075317, + "step": 9722 + }, + { + "epoch": 1.7700009101665604, + "grad_norm": 22.875, + "learning_rate": 4.9092064852087895e-06, + "loss": 1.393210768699646, + "step": 9724 + }, + { + "epoch": 1.7703649767907526, + "grad_norm": 10.5, + "learning_rate": 4.9077323426432625e-06, + "loss": 1.0466089248657227, + "step": 9726 + }, + { + "epoch": 1.770729043414945, + "grad_norm": 13.0, + "learning_rate": 4.906258360363585e-06, + "loss": 0.4767671823501587, + "step": 9728 + }, + { + "epoch": 1.7710931100391372, + "grad_norm": 27.625, + "learning_rate": 4.904784538586063e-06, + "loss": 1.5078160762786865, + "step": 9730 + }, + { + "epoch": 1.7714571766633294, + "grad_norm": 10.0625, + "learning_rate": 4.90331087752697e-06, + "loss": 1.4488803148269653, + "step": 9732 + }, + { + "epoch": 1.7718212432875218, + "grad_norm": 18.0, + "learning_rate": 4.90183737740256e-06, + "loss": 1.5841262340545654, + "step": 9734 + }, + { + "epoch": 1.772185309911714, + "grad_norm": 5.5, + "learning_rate": 4.900364038429067e-06, + "loss": 1.2088572978973389, + "step": 9736 + }, + { + "epoch": 1.7725493765359062, + "grad_norm": 18.875, + "learning_rate": 4.898890860822693e-06, + "loss": 1.2688897848129272, + "step": 9738 + }, + { + "epoch": 1.7729134431600984, + "grad_norm": 10.1875, + "learning_rate": 4.897417844799624e-06, + "loss": 1.2133854627609253, + "step": 9740 + }, + { + "epoch": 1.7732775097842906, + "grad_norm": 19.5, + "learning_rate": 4.895944990576018e-06, + "loss": 1.4112749099731445, + "step": 9742 + }, + { + "epoch": 1.7736415764084827, + "grad_norm": 24.75, + "learning_rate": 4.894472298368009e-06, + "loss": 1.697887897491455, + "step": 9744 + }, + { + "epoch": 1.774005643032675, + "grad_norm": 8.3125, + "learning_rate": 4.892999768391711e-06, + "loss": 1.4441301822662354, + "step": 9746 + }, + { + "epoch": 1.7743697096568671, + "grad_norm": 14.3125, + "learning_rate": 4.8915274008632095e-06, + "loss": 1.5689034461975098, + "step": 9748 + }, + { + "epoch": 1.7747337762810593, + "grad_norm": 11.75, + "learning_rate": 4.890055195998571e-06, + "loss": 1.5739667415618896, + "step": 9750 + }, + { + "epoch": 1.7750978429052515, + "grad_norm": 7.375, + "learning_rate": 4.888583154013834e-06, + "loss": 1.1880637407302856, + "step": 9752 + }, + { + "epoch": 1.775461909529444, + "grad_norm": 9.625, + "learning_rate": 4.887111275125014e-06, + "loss": 1.3103233575820923, + "step": 9754 + }, + { + "epoch": 1.7758259761536361, + "grad_norm": 22.625, + "learning_rate": 4.885639559548105e-06, + "loss": 1.568418264389038, + "step": 9756 + }, + { + "epoch": 1.7761900427778283, + "grad_norm": 17.5, + "learning_rate": 4.884168007499075e-06, + "loss": 1.8974366188049316, + "step": 9758 + }, + { + "epoch": 1.7765541094020205, + "grad_norm": 6.8125, + "learning_rate": 4.882696619193868e-06, + "loss": 1.309972882270813, + "step": 9760 + }, + { + "epoch": 1.776918176026213, + "grad_norm": 86.5, + "learning_rate": 4.881225394848404e-06, + "loss": 1.1577789783477783, + "step": 9762 + }, + { + "epoch": 1.7772822426504051, + "grad_norm": 7.78125, + "learning_rate": 4.879754334678577e-06, + "loss": 1.3325051069259644, + "step": 9764 + }, + { + "epoch": 1.7776463092745973, + "grad_norm": 12.3125, + "learning_rate": 4.8782834389002645e-06, + "loss": 1.4475016593933105, + "step": 9766 + }, + { + "epoch": 1.7780103758987895, + "grad_norm": 16.5, + "learning_rate": 4.876812707729309e-06, + "loss": 1.1812524795532227, + "step": 9768 + }, + { + "epoch": 1.7783744425229817, + "grad_norm": 6.375, + "learning_rate": 4.875342141381538e-06, + "loss": 0.9161214828491211, + "step": 9770 + }, + { + "epoch": 1.778738509147174, + "grad_norm": 7.96875, + "learning_rate": 4.87387174007275e-06, + "loss": 1.3399105072021484, + "step": 9772 + }, + { + "epoch": 1.779102575771366, + "grad_norm": 8.5, + "learning_rate": 4.872401504018719e-06, + "loss": 1.2644909620285034, + "step": 9774 + }, + { + "epoch": 1.7794666423955583, + "grad_norm": 16.875, + "learning_rate": 4.8709314334352e-06, + "loss": 1.1939635276794434, + "step": 9776 + }, + { + "epoch": 1.7798307090197505, + "grad_norm": 16.375, + "learning_rate": 4.869461528537916e-06, + "loss": 1.680476427078247, + "step": 9778 + }, + { + "epoch": 1.7801947756439427, + "grad_norm": 6.375, + "learning_rate": 4.867991789542571e-06, + "loss": 1.2742371559143066, + "step": 9780 + }, + { + "epoch": 1.780558842268135, + "grad_norm": 7.0, + "learning_rate": 4.866522216664844e-06, + "loss": 1.0444464683532715, + "step": 9782 + }, + { + "epoch": 1.7809229088923273, + "grad_norm": 7.6875, + "learning_rate": 4.865052810120386e-06, + "loss": 1.1968847513198853, + "step": 9784 + }, + { + "epoch": 1.7812869755165195, + "grad_norm": 14.8125, + "learning_rate": 4.8635835701248304e-06, + "loss": 1.2499253749847412, + "step": 9786 + }, + { + "epoch": 1.781651042140712, + "grad_norm": 15.1875, + "learning_rate": 4.8621144968937795e-06, + "loss": 1.7331328392028809, + "step": 9788 + }, + { + "epoch": 1.782015108764904, + "grad_norm": 5.125, + "learning_rate": 4.860645590642816e-06, + "loss": 1.2850385904312134, + "step": 9790 + }, + { + "epoch": 1.7823791753890963, + "grad_norm": 11.5, + "learning_rate": 4.859176851587494e-06, + "loss": 1.3192908763885498, + "step": 9792 + }, + { + "epoch": 1.7827432420132885, + "grad_norm": 9.5, + "learning_rate": 4.857708279943345e-06, + "loss": 1.550360918045044, + "step": 9794 + }, + { + "epoch": 1.7831073086374807, + "grad_norm": 9.6875, + "learning_rate": 4.856239875925878e-06, + "loss": 1.3419959545135498, + "step": 9796 + }, + { + "epoch": 1.7834713752616729, + "grad_norm": 21.5, + "learning_rate": 4.854771639750573e-06, + "loss": 1.4597018957138062, + "step": 9798 + }, + { + "epoch": 1.783835441885865, + "grad_norm": 19.75, + "learning_rate": 4.853303571632889e-06, + "loss": 1.4212193489074707, + "step": 9800 + }, + { + "epoch": 1.7841995085100573, + "grad_norm": 11.3125, + "learning_rate": 4.851835671788258e-06, + "loss": 1.2252379655838013, + "step": 9802 + }, + { + "epoch": 1.7845635751342495, + "grad_norm": 108.0, + "learning_rate": 4.85036794043209e-06, + "loss": 1.308140516281128, + "step": 9804 + }, + { + "epoch": 1.7849276417584417, + "grad_norm": 12.875, + "learning_rate": 4.848900377779768e-06, + "loss": 1.1591253280639648, + "step": 9806 + }, + { + "epoch": 1.785291708382634, + "grad_norm": 17.875, + "learning_rate": 4.847432984046649e-06, + "loss": 1.3936431407928467, + "step": 9808 + }, + { + "epoch": 1.7856557750068263, + "grad_norm": 4.78125, + "learning_rate": 4.84596575944807e-06, + "loss": 1.1082038879394531, + "step": 9810 + }, + { + "epoch": 1.7860198416310185, + "grad_norm": 19.25, + "learning_rate": 4.844498704199338e-06, + "loss": 1.299839735031128, + "step": 9812 + }, + { + "epoch": 1.7863839082552107, + "grad_norm": 14.625, + "learning_rate": 4.843031818515738e-06, + "loss": 1.3926259279251099, + "step": 9814 + }, + { + "epoch": 1.786747974879403, + "grad_norm": 17.25, + "learning_rate": 4.841565102612531e-06, + "loss": 1.6030869483947754, + "step": 9816 + }, + { + "epoch": 1.7871120415035953, + "grad_norm": 21.375, + "learning_rate": 4.84009855670495e-06, + "loss": 1.7322543859481812, + "step": 9818 + }, + { + "epoch": 1.7874761081277875, + "grad_norm": 4.25, + "learning_rate": 4.838632181008206e-06, + "loss": 0.9729434251785278, + "step": 9820 + }, + { + "epoch": 1.7878401747519796, + "grad_norm": 5.875, + "learning_rate": 4.837165975737481e-06, + "loss": 1.184375286102295, + "step": 9822 + }, + { + "epoch": 1.7882042413761718, + "grad_norm": 16.25, + "learning_rate": 4.835699941107938e-06, + "loss": 1.1851658821105957, + "step": 9824 + }, + { + "epoch": 1.788568308000364, + "grad_norm": 26.125, + "learning_rate": 4.83423407733471e-06, + "loss": 0.8616297245025635, + "step": 9826 + }, + { + "epoch": 1.7889323746245562, + "grad_norm": 20.625, + "learning_rate": 4.832768384632906e-06, + "loss": 0.9894980192184448, + "step": 9828 + }, + { + "epoch": 1.7892964412487484, + "grad_norm": 15.8125, + "learning_rate": 4.831302863217613e-06, + "loss": 1.2195518016815186, + "step": 9830 + }, + { + "epoch": 1.7896605078729406, + "grad_norm": 10.5625, + "learning_rate": 4.829837513303886e-06, + "loss": 1.5083246231079102, + "step": 9832 + }, + { + "epoch": 1.7900245744971328, + "grad_norm": 11.375, + "learning_rate": 4.828372335106762e-06, + "loss": 0.9479171633720398, + "step": 9834 + }, + { + "epoch": 1.7903886411213252, + "grad_norm": 11.0, + "learning_rate": 4.826907328841251e-06, + "loss": 1.514286756515503, + "step": 9836 + }, + { + "epoch": 1.7907527077455174, + "grad_norm": 16.875, + "learning_rate": 4.825442494722334e-06, + "loss": 1.240665316581726, + "step": 9838 + }, + { + "epoch": 1.7911167743697096, + "grad_norm": 11.8125, + "learning_rate": 4.823977832964972e-06, + "loss": 0.9979418516159058, + "step": 9840 + }, + { + "epoch": 1.791480840993902, + "grad_norm": 15.4375, + "learning_rate": 4.8225133437840965e-06, + "loss": 1.7140535116195679, + "step": 9842 + }, + { + "epoch": 1.7918449076180942, + "grad_norm": 3.046875, + "learning_rate": 4.821049027394615e-06, + "loss": 0.9580082893371582, + "step": 9844 + }, + { + "epoch": 1.7922089742422864, + "grad_norm": 10.6875, + "learning_rate": 4.819584884011413e-06, + "loss": 1.4786930084228516, + "step": 9846 + }, + { + "epoch": 1.7925730408664786, + "grad_norm": 8.3125, + "learning_rate": 4.818120913849344e-06, + "loss": 1.4283615350723267, + "step": 9848 + }, + { + "epoch": 1.7929371074906708, + "grad_norm": 7.15625, + "learning_rate": 4.816657117123243e-06, + "loss": 1.3162473440170288, + "step": 9850 + }, + { + "epoch": 1.793301174114863, + "grad_norm": 8.8125, + "learning_rate": 4.815193494047911e-06, + "loss": 1.4530799388885498, + "step": 9852 + }, + { + "epoch": 1.7936652407390552, + "grad_norm": 6.6875, + "learning_rate": 4.813730044838134e-06, + "loss": 1.4405879974365234, + "step": 9854 + }, + { + "epoch": 1.7940293073632474, + "grad_norm": 12.125, + "learning_rate": 4.8122667697086664e-06, + "loss": 1.2693815231323242, + "step": 9856 + }, + { + "epoch": 1.7943933739874396, + "grad_norm": 12.5, + "learning_rate": 4.8108036688742345e-06, + "loss": 1.287493109703064, + "step": 9858 + }, + { + "epoch": 1.7947574406116318, + "grad_norm": 62.5, + "learning_rate": 4.809340742549548e-06, + "loss": 1.3426483869552612, + "step": 9860 + }, + { + "epoch": 1.7951215072358242, + "grad_norm": 5.75, + "learning_rate": 4.807877990949279e-06, + "loss": 1.3801523447036743, + "step": 9862 + }, + { + "epoch": 1.7954855738600164, + "grad_norm": 11.25, + "learning_rate": 4.806415414288085e-06, + "loss": 1.3038803339004517, + "step": 9864 + }, + { + "epoch": 1.7958496404842086, + "grad_norm": 7.125, + "learning_rate": 4.8049530127805925e-06, + "loss": 1.341579794883728, + "step": 9866 + }, + { + "epoch": 1.796213707108401, + "grad_norm": 5.3125, + "learning_rate": 4.8034907866414005e-06, + "loss": 1.26124107837677, + "step": 9868 + }, + { + "epoch": 1.7965777737325932, + "grad_norm": 21.0, + "learning_rate": 4.802028736085089e-06, + "loss": 1.4446187019348145, + "step": 9870 + }, + { + "epoch": 1.7969418403567854, + "grad_norm": 15.375, + "learning_rate": 4.800566861326203e-06, + "loss": 1.2436654567718506, + "step": 9872 + }, + { + "epoch": 1.7973059069809776, + "grad_norm": 18.0, + "learning_rate": 4.799105162579269e-06, + "loss": 1.4584167003631592, + "step": 9874 + }, + { + "epoch": 1.7976699736051698, + "grad_norm": 10.3125, + "learning_rate": 4.797643640058789e-06, + "loss": 1.368586778640747, + "step": 9876 + }, + { + "epoch": 1.798034040229362, + "grad_norm": 13.25, + "learning_rate": 4.7961822939792285e-06, + "loss": 1.2624218463897705, + "step": 9878 + }, + { + "epoch": 1.7983981068535542, + "grad_norm": 24.5, + "learning_rate": 4.79472112455504e-06, + "loss": 1.330539584159851, + "step": 9880 + }, + { + "epoch": 1.7987621734777464, + "grad_norm": 15.375, + "learning_rate": 4.7932601320006405e-06, + "loss": 1.5191045999526978, + "step": 9882 + }, + { + "epoch": 1.7991262401019386, + "grad_norm": 9.8125, + "learning_rate": 4.7917993165304265e-06, + "loss": 1.120861530303955, + "step": 9884 + }, + { + "epoch": 1.7994903067261308, + "grad_norm": 12.4375, + "learning_rate": 4.790338678358767e-06, + "loss": 0.6580009460449219, + "step": 9886 + }, + { + "epoch": 1.799854373350323, + "grad_norm": 17.625, + "learning_rate": 4.788878217700003e-06, + "loss": 1.2356891632080078, + "step": 9888 + }, + { + "epoch": 1.8002184399745154, + "grad_norm": 10.25, + "learning_rate": 4.787417934768455e-06, + "loss": 1.4245901107788086, + "step": 9890 + }, + { + "epoch": 1.8005825065987076, + "grad_norm": 6.0, + "learning_rate": 4.785957829778407e-06, + "loss": 1.2386894226074219, + "step": 9892 + }, + { + "epoch": 1.8009465732228997, + "grad_norm": 8.25, + "learning_rate": 4.78449790294413e-06, + "loss": 1.229024887084961, + "step": 9894 + }, + { + "epoch": 1.8013106398470922, + "grad_norm": 14.125, + "learning_rate": 4.78303815447986e-06, + "loss": 1.4715592861175537, + "step": 9896 + }, + { + "epoch": 1.8016747064712844, + "grad_norm": 14.5, + "learning_rate": 4.781578584599807e-06, + "loss": 1.2632544040679932, + "step": 9898 + }, + { + "epoch": 1.8020387730954766, + "grad_norm": 14.25, + "learning_rate": 4.78011919351816e-06, + "loss": 1.2378745079040527, + "step": 9900 + }, + { + "epoch": 1.8024028397196687, + "grad_norm": 15.125, + "learning_rate": 4.778659981449077e-06, + "loss": 0.8463464975357056, + "step": 9902 + }, + { + "epoch": 1.802766906343861, + "grad_norm": 10.1875, + "learning_rate": 4.777200948606693e-06, + "loss": 1.6600208282470703, + "step": 9904 + }, + { + "epoch": 1.8031309729680531, + "grad_norm": 8.9375, + "learning_rate": 4.775742095205114e-06, + "loss": 1.0083280801773071, + "step": 9906 + }, + { + "epoch": 1.8034950395922453, + "grad_norm": 2.640625, + "learning_rate": 4.77428342145842e-06, + "loss": 0.6644299030303955, + "step": 9908 + }, + { + "epoch": 1.8038591062164375, + "grad_norm": 6.3125, + "learning_rate": 4.772824927580668e-06, + "loss": 1.184407353401184, + "step": 9910 + }, + { + "epoch": 1.8042231728406297, + "grad_norm": 25.125, + "learning_rate": 4.771366613785884e-06, + "loss": 1.1789664030075073, + "step": 9912 + }, + { + "epoch": 1.804587239464822, + "grad_norm": 3.6875, + "learning_rate": 4.769908480288069e-06, + "loss": 1.1635903120040894, + "step": 9914 + }, + { + "epoch": 1.8049513060890143, + "grad_norm": 9.5, + "learning_rate": 4.768450527301202e-06, + "loss": 1.1157206296920776, + "step": 9916 + }, + { + "epoch": 1.8053153727132065, + "grad_norm": 7.78125, + "learning_rate": 4.7669927550392264e-06, + "loss": 1.4344539642333984, + "step": 9918 + }, + { + "epoch": 1.8056794393373987, + "grad_norm": 13.375, + "learning_rate": 4.76553516371607e-06, + "loss": 1.2899246215820312, + "step": 9920 + }, + { + "epoch": 1.8060435059615911, + "grad_norm": 6.0, + "learning_rate": 4.764077753545622e-06, + "loss": 1.222097635269165, + "step": 9922 + }, + { + "epoch": 1.8064075725857833, + "grad_norm": 14.125, + "learning_rate": 4.762620524741756e-06, + "loss": 1.786362886428833, + "step": 9924 + }, + { + "epoch": 1.8067716392099755, + "grad_norm": 7.40625, + "learning_rate": 4.761163477518315e-06, + "loss": 1.2622867822647095, + "step": 9926 + }, + { + "epoch": 1.8071357058341677, + "grad_norm": 15.75, + "learning_rate": 4.759706612089112e-06, + "loss": 1.1997376680374146, + "step": 9928 + }, + { + "epoch": 1.80749977245836, + "grad_norm": 9.375, + "learning_rate": 4.758249928667938e-06, + "loss": 1.3268487453460693, + "step": 9930 + }, + { + "epoch": 1.807863839082552, + "grad_norm": 13.625, + "learning_rate": 4.756793427468553e-06, + "loss": 1.278355360031128, + "step": 9932 + }, + { + "epoch": 1.8082279057067443, + "grad_norm": 3.25, + "learning_rate": 4.755337108704695e-06, + "loss": 1.045915126800537, + "step": 9934 + }, + { + "epoch": 1.8085919723309365, + "grad_norm": 17.25, + "learning_rate": 4.753880972590073e-06, + "loss": 1.578112244606018, + "step": 9936 + }, + { + "epoch": 1.8089560389551287, + "grad_norm": 21.0, + "learning_rate": 4.752425019338367e-06, + "loss": 1.659615397453308, + "step": 9938 + }, + { + "epoch": 1.8093201055793209, + "grad_norm": 9.0625, + "learning_rate": 4.750969249163234e-06, + "loss": 1.2165932655334473, + "step": 9940 + }, + { + "epoch": 1.8096841722035133, + "grad_norm": 9.1875, + "learning_rate": 4.749513662278301e-06, + "loss": 1.5094040632247925, + "step": 9942 + }, + { + "epoch": 1.8100482388277055, + "grad_norm": 6.125, + "learning_rate": 4.748058258897172e-06, + "loss": 0.9273290634155273, + "step": 9944 + }, + { + "epoch": 1.8104123054518977, + "grad_norm": 12.0625, + "learning_rate": 4.746603039233419e-06, + "loss": 1.0045676231384277, + "step": 9946 + }, + { + "epoch": 1.8107763720760899, + "grad_norm": 27.25, + "learning_rate": 4.745148003500589e-06, + "loss": 0.7226732969284058, + "step": 9948 + }, + { + "epoch": 1.8111404387002823, + "grad_norm": 10.625, + "learning_rate": 4.7436931519122065e-06, + "loss": 0.9850039482116699, + "step": 9950 + }, + { + "epoch": 1.8115045053244745, + "grad_norm": 22.125, + "learning_rate": 4.74223848468176e-06, + "loss": 1.3006930351257324, + "step": 9952 + }, + { + "epoch": 1.8118685719486667, + "grad_norm": 19.0, + "learning_rate": 4.740784002022721e-06, + "loss": 1.9448378086090088, + "step": 9954 + }, + { + "epoch": 1.8122326385728589, + "grad_norm": 21.625, + "learning_rate": 4.739329704148525e-06, + "loss": 1.9606869220733643, + "step": 9956 + }, + { + "epoch": 1.812596705197051, + "grad_norm": 13.6875, + "learning_rate": 4.737875591272586e-06, + "loss": 1.9865272045135498, + "step": 9958 + }, + { + "epoch": 1.8129607718212433, + "grad_norm": 9.3125, + "learning_rate": 4.7364216636082895e-06, + "loss": 1.4596893787384033, + "step": 9960 + }, + { + "epoch": 1.8133248384454355, + "grad_norm": 16.25, + "learning_rate": 4.7349679213689925e-06, + "loss": 1.4548094272613525, + "step": 9962 + }, + { + "epoch": 1.8136889050696277, + "grad_norm": 25.125, + "learning_rate": 4.7335143647680265e-06, + "loss": 1.3416130542755127, + "step": 9964 + }, + { + "epoch": 1.8140529716938198, + "grad_norm": 11.125, + "learning_rate": 4.732060994018696e-06, + "loss": 1.4447228908538818, + "step": 9966 + }, + { + "epoch": 1.814417038318012, + "grad_norm": 11.5625, + "learning_rate": 4.730607809334275e-06, + "loss": 1.4781229496002197, + "step": 9968 + }, + { + "epoch": 1.8147811049422045, + "grad_norm": 44.5, + "learning_rate": 4.729154810928014e-06, + "loss": 1.0811924934387207, + "step": 9970 + }, + { + "epoch": 1.8151451715663967, + "grad_norm": 11.3125, + "learning_rate": 4.727701999013133e-06, + "loss": 1.3054535388946533, + "step": 9972 + }, + { + "epoch": 1.8155092381905888, + "grad_norm": 7.3125, + "learning_rate": 4.726249373802829e-06, + "loss": 1.839009404182434, + "step": 9974 + }, + { + "epoch": 1.8158733048147813, + "grad_norm": 5.40625, + "learning_rate": 4.7247969355102675e-06, + "loss": 1.3314069509506226, + "step": 9976 + }, + { + "epoch": 1.8162373714389735, + "grad_norm": 9.9375, + "learning_rate": 4.7233446843485854e-06, + "loss": 1.6068639755249023, + "step": 9978 + }, + { + "epoch": 1.8166014380631657, + "grad_norm": 8.375, + "learning_rate": 4.7218926205309e-06, + "loss": 1.2942819595336914, + "step": 9980 + }, + { + "epoch": 1.8169655046873578, + "grad_norm": 15.0625, + "learning_rate": 4.720440744270291e-06, + "loss": 1.0803054571151733, + "step": 9982 + }, + { + "epoch": 1.81732957131155, + "grad_norm": 23.875, + "learning_rate": 4.718989055779817e-06, + "loss": 0.6113383769989014, + "step": 9984 + }, + { + "epoch": 1.8176936379357422, + "grad_norm": 25.875, + "learning_rate": 4.717537555272509e-06, + "loss": 1.4741604328155518, + "step": 9986 + }, + { + "epoch": 1.8180577045599344, + "grad_norm": 17.75, + "learning_rate": 4.716086242961367e-06, + "loss": 1.5401732921600342, + "step": 9988 + }, + { + "epoch": 1.8184217711841266, + "grad_norm": 41.5, + "learning_rate": 4.714635119059366e-06, + "loss": 1.5616130828857422, + "step": 9990 + }, + { + "epoch": 1.8187858378083188, + "grad_norm": 12.3125, + "learning_rate": 4.71318418377945e-06, + "loss": 1.656862497329712, + "step": 9992 + }, + { + "epoch": 1.819149904432511, + "grad_norm": 15.8125, + "learning_rate": 4.711733437334541e-06, + "loss": 1.5075008869171143, + "step": 9994 + }, + { + "epoch": 1.8195139710567034, + "grad_norm": 9.8125, + "learning_rate": 4.7102828799375315e-06, + "loss": 1.3021337985992432, + "step": 9996 + }, + { + "epoch": 1.8198780376808956, + "grad_norm": 10.5, + "learning_rate": 4.708832511801279e-06, + "loss": 1.0407510995864868, + "step": 9998 + }, + { + "epoch": 1.8202421043050878, + "grad_norm": 14.5, + "learning_rate": 4.707382333138626e-06, + "loss": 1.476670265197754, + "step": 10000 + }, + { + "epoch": 1.82060617092928, + "grad_norm": 15.25, + "learning_rate": 4.705932344162374e-06, + "loss": 1.5759352445602417, + "step": 10002 + }, + { + "epoch": 1.8209702375534724, + "grad_norm": 33.5, + "learning_rate": 4.704482545085307e-06, + "loss": 0.4886060357093811, + "step": 10004 + }, + { + "epoch": 1.8213343041776646, + "grad_norm": 45.5, + "learning_rate": 4.7030329361201785e-06, + "loss": 1.4418752193450928, + "step": 10006 + }, + { + "epoch": 1.8216983708018568, + "grad_norm": 11.25, + "learning_rate": 4.701583517479708e-06, + "loss": 1.2543349266052246, + "step": 10008 + }, + { + "epoch": 1.822062437426049, + "grad_norm": 40.25, + "learning_rate": 4.700134289376597e-06, + "loss": 1.3194719552993774, + "step": 10010 + }, + { + "epoch": 1.8224265040502412, + "grad_norm": 12.9375, + "learning_rate": 4.698685252023508e-06, + "loss": 1.386330485343933, + "step": 10012 + }, + { + "epoch": 1.8227905706744334, + "grad_norm": 26.5, + "learning_rate": 4.6972364056330855e-06, + "loss": 1.2390315532684326, + "step": 10014 + }, + { + "epoch": 1.8231546372986256, + "grad_norm": 9.1875, + "learning_rate": 4.695787750417942e-06, + "loss": 0.41816461086273193, + "step": 10016 + }, + { + "epoch": 1.8235187039228178, + "grad_norm": 8.5, + "learning_rate": 4.694339286590659e-06, + "loss": 1.2061243057250977, + "step": 10018 + }, + { + "epoch": 1.82388277054701, + "grad_norm": 12.0625, + "learning_rate": 4.692891014363796e-06, + "loss": 1.2117953300476074, + "step": 10020 + }, + { + "epoch": 1.8242468371712022, + "grad_norm": 10.0, + "learning_rate": 4.6914429339498774e-06, + "loss": 1.5956456661224365, + "step": 10022 + }, + { + "epoch": 1.8246109037953946, + "grad_norm": 12.3125, + "learning_rate": 4.689995045561406e-06, + "loss": 1.4610309600830078, + "step": 10024 + }, + { + "epoch": 1.8249749704195868, + "grad_norm": 8.8125, + "learning_rate": 4.688547349410854e-06, + "loss": 1.142910361289978, + "step": 10026 + }, + { + "epoch": 1.825339037043779, + "grad_norm": 8.5, + "learning_rate": 4.687099845710661e-06, + "loss": 1.2656497955322266, + "step": 10028 + }, + { + "epoch": 1.8257031036679714, + "grad_norm": 11.6875, + "learning_rate": 4.685652534673248e-06, + "loss": 1.1357722282409668, + "step": 10030 + }, + { + "epoch": 1.8260671702921636, + "grad_norm": 20.625, + "learning_rate": 4.6842054165109965e-06, + "loss": 1.333534598350525, + "step": 10032 + }, + { + "epoch": 1.8264312369163558, + "grad_norm": 14.3125, + "learning_rate": 4.6827584914362675e-06, + "loss": 1.4362547397613525, + "step": 10034 + }, + { + "epoch": 1.826795303540548, + "grad_norm": 7.78125, + "learning_rate": 4.681311759661394e-06, + "loss": 1.5378859043121338, + "step": 10036 + }, + { + "epoch": 1.8271593701647402, + "grad_norm": 6.5625, + "learning_rate": 4.679865221398674e-06, + "loss": 1.2298191785812378, + "step": 10038 + }, + { + "epoch": 1.8275234367889324, + "grad_norm": 10.875, + "learning_rate": 4.678418876860383e-06, + "loss": 1.3100981712341309, + "step": 10040 + }, + { + "epoch": 1.8278875034131246, + "grad_norm": 9.3125, + "learning_rate": 4.676972726258766e-06, + "loss": 1.1052302122116089, + "step": 10042 + }, + { + "epoch": 1.8282515700373168, + "grad_norm": 5.15625, + "learning_rate": 4.675526769806039e-06, + "loss": 0.9375574588775635, + "step": 10044 + }, + { + "epoch": 1.828615636661509, + "grad_norm": 16.625, + "learning_rate": 4.674081007714392e-06, + "loss": 1.1996800899505615, + "step": 10046 + }, + { + "epoch": 1.8289797032857011, + "grad_norm": 18.125, + "learning_rate": 4.672635440195982e-06, + "loss": 1.739487648010254, + "step": 10048 + }, + { + "epoch": 1.8293437699098936, + "grad_norm": 12.5, + "learning_rate": 4.671190067462944e-06, + "loss": 1.3939199447631836, + "step": 10050 + }, + { + "epoch": 1.8297078365340858, + "grad_norm": 34.0, + "learning_rate": 4.669744889727377e-06, + "loss": 1.3368499279022217, + "step": 10052 + }, + { + "epoch": 1.830071903158278, + "grad_norm": 11.25, + "learning_rate": 4.6682999072013554e-06, + "loss": 1.3886959552764893, + "step": 10054 + }, + { + "epoch": 1.8304359697824701, + "grad_norm": 10.0625, + "learning_rate": 4.6668551200969285e-06, + "loss": 1.3839491605758667, + "step": 10056 + }, + { + "epoch": 1.8308000364066626, + "grad_norm": 6.5625, + "learning_rate": 4.665410528626107e-06, + "loss": 0.9638664722442627, + "step": 10058 + }, + { + "epoch": 1.8311641030308548, + "grad_norm": 43.25, + "learning_rate": 4.663966133000884e-06, + "loss": 1.2880927324295044, + "step": 10060 + }, + { + "epoch": 1.831528169655047, + "grad_norm": 13.5625, + "learning_rate": 4.662521933433215e-06, + "loss": 1.6466302871704102, + "step": 10062 + }, + { + "epoch": 1.8318922362792391, + "grad_norm": 8.125, + "learning_rate": 4.661077930135033e-06, + "loss": 1.3043192625045776, + "step": 10064 + }, + { + "epoch": 1.8322563029034313, + "grad_norm": 8.75, + "learning_rate": 4.659634123318238e-06, + "loss": 1.303576946258545, + "step": 10066 + }, + { + "epoch": 1.8326203695276235, + "grad_norm": 11.375, + "learning_rate": 4.658190513194703e-06, + "loss": 1.4081577062606812, + "step": 10068 + }, + { + "epoch": 1.8329844361518157, + "grad_norm": 11.75, + "learning_rate": 4.656747099976273e-06, + "loss": 1.198384404182434, + "step": 10070 + }, + { + "epoch": 1.833348502776008, + "grad_norm": 18.0, + "learning_rate": 4.655303883874761e-06, + "loss": 1.3355695009231567, + "step": 10072 + }, + { + "epoch": 1.8337125694002, + "grad_norm": 13.875, + "learning_rate": 4.653860865101956e-06, + "loss": 1.8721139430999756, + "step": 10074 + }, + { + "epoch": 1.8340766360243923, + "grad_norm": 22.625, + "learning_rate": 4.652418043869614e-06, + "loss": 1.429003119468689, + "step": 10076 + }, + { + "epoch": 1.8344407026485847, + "grad_norm": 30.0, + "learning_rate": 4.650975420389461e-06, + "loss": 1.5270614624023438, + "step": 10078 + }, + { + "epoch": 1.834804769272777, + "grad_norm": 12.9375, + "learning_rate": 4.6495329948732e-06, + "loss": 1.2187944650650024, + "step": 10080 + }, + { + "epoch": 1.835168835896969, + "grad_norm": 53.75, + "learning_rate": 4.648090767532496e-06, + "loss": 1.854748010635376, + "step": 10082 + }, + { + "epoch": 1.8355329025211615, + "grad_norm": 19.125, + "learning_rate": 4.646648738578996e-06, + "loss": 1.834840178489685, + "step": 10084 + }, + { + "epoch": 1.8358969691453537, + "grad_norm": 4.3125, + "learning_rate": 4.645206908224309e-06, + "loss": 1.055801510810852, + "step": 10086 + }, + { + "epoch": 1.836261035769546, + "grad_norm": 7.84375, + "learning_rate": 4.643765276680016e-06, + "loss": 1.058341145515442, + "step": 10088 + }, + { + "epoch": 1.836625102393738, + "grad_norm": 10.25, + "learning_rate": 4.642323844157674e-06, + "loss": 0.9838607311248779, + "step": 10090 + }, + { + "epoch": 1.8369891690179303, + "grad_norm": 10.625, + "learning_rate": 4.6408826108688035e-06, + "loss": 1.3639460802078247, + "step": 10092 + }, + { + "epoch": 1.8373532356421225, + "grad_norm": 20.875, + "learning_rate": 4.639441577024903e-06, + "loss": 1.926166296005249, + "step": 10094 + }, + { + "epoch": 1.8377173022663147, + "grad_norm": 20.375, + "learning_rate": 4.638000742837438e-06, + "loss": 1.8123565912246704, + "step": 10096 + }, + { + "epoch": 1.8380813688905069, + "grad_norm": 34.75, + "learning_rate": 4.636560108517842e-06, + "loss": 1.4493234157562256, + "step": 10098 + }, + { + "epoch": 1.838445435514699, + "grad_norm": 20.125, + "learning_rate": 4.635119674277528e-06, + "loss": 1.2812522649765015, + "step": 10100 + }, + { + "epoch": 1.8388095021388913, + "grad_norm": 10.0, + "learning_rate": 4.633679440327867e-06, + "loss": 1.3218059539794922, + "step": 10102 + }, + { + "epoch": 1.8391735687630837, + "grad_norm": 22.875, + "learning_rate": 4.632239406880212e-06, + "loss": 1.4651618003845215, + "step": 10104 + }, + { + "epoch": 1.8395376353872759, + "grad_norm": 22.0, + "learning_rate": 4.630799574145883e-06, + "loss": 1.6552772521972656, + "step": 10106 + }, + { + "epoch": 1.839901702011468, + "grad_norm": 4.875, + "learning_rate": 4.629359942336164e-06, + "loss": 1.0609335899353027, + "step": 10108 + }, + { + "epoch": 1.8402657686356603, + "grad_norm": 1.9375, + "learning_rate": 4.627920511662323e-06, + "loss": 1.0964921712875366, + "step": 10110 + }, + { + "epoch": 1.8406298352598527, + "grad_norm": 26.75, + "learning_rate": 4.626481282335582e-06, + "loss": 1.1380648612976074, + "step": 10112 + }, + { + "epoch": 1.8409939018840449, + "grad_norm": 17.625, + "learning_rate": 4.6250422545671495e-06, + "loss": 1.6771175861358643, + "step": 10114 + }, + { + "epoch": 1.841357968508237, + "grad_norm": 8.6875, + "learning_rate": 4.623603428568193e-06, + "loss": 1.4338878393173218, + "step": 10116 + }, + { + "epoch": 1.8417220351324293, + "grad_norm": 6.71875, + "learning_rate": 4.622164804549855e-06, + "loss": 1.366791844367981, + "step": 10118 + }, + { + "epoch": 1.8420861017566215, + "grad_norm": 18.75, + "learning_rate": 4.620726382723248e-06, + "loss": 1.72682523727417, + "step": 10120 + }, + { + "epoch": 1.8424501683808137, + "grad_norm": 13.5625, + "learning_rate": 4.619288163299455e-06, + "loss": 1.548140287399292, + "step": 10122 + }, + { + "epoch": 1.8428142350050059, + "grad_norm": 39.75, + "learning_rate": 4.617850146489529e-06, + "loss": 1.6242822408676147, + "step": 10124 + }, + { + "epoch": 1.843178301629198, + "grad_norm": 15.0625, + "learning_rate": 4.616412332504493e-06, + "loss": 1.9507834911346436, + "step": 10126 + }, + { + "epoch": 1.8435423682533902, + "grad_norm": 68.0, + "learning_rate": 4.6149747215553385e-06, + "loss": 1.2795758247375488, + "step": 10128 + }, + { + "epoch": 1.8439064348775824, + "grad_norm": 77.5, + "learning_rate": 4.613537313853032e-06, + "loss": 0.890384316444397, + "step": 10130 + }, + { + "epoch": 1.8442705015017749, + "grad_norm": 8.125, + "learning_rate": 4.612100109608503e-06, + "loss": 1.4209778308868408, + "step": 10132 + }, + { + "epoch": 1.844634568125967, + "grad_norm": 9.5, + "learning_rate": 4.61066310903266e-06, + "loss": 1.2322556972503662, + "step": 10134 + }, + { + "epoch": 1.8449986347501592, + "grad_norm": 14.5625, + "learning_rate": 4.6092263123363775e-06, + "loss": 1.2881290912628174, + "step": 10136 + }, + { + "epoch": 1.8453627013743517, + "grad_norm": 6.0625, + "learning_rate": 4.607789719730494e-06, + "loss": 1.3625489473342896, + "step": 10138 + }, + { + "epoch": 1.8457267679985438, + "grad_norm": 12.3125, + "learning_rate": 4.60635333142583e-06, + "loss": 1.4665266275405884, + "step": 10140 + }, + { + "epoch": 1.846090834622736, + "grad_norm": 9.625, + "learning_rate": 4.604917147633163e-06, + "loss": 1.5087851285934448, + "step": 10142 + }, + { + "epoch": 1.8464549012469282, + "grad_norm": 205.0, + "learning_rate": 4.603481168563253e-06, + "loss": 0.543316125869751, + "step": 10144 + }, + { + "epoch": 1.8468189678711204, + "grad_norm": 15.25, + "learning_rate": 4.602045394426823e-06, + "loss": 1.400770902633667, + "step": 10146 + }, + { + "epoch": 1.8471830344953126, + "grad_norm": 11.4375, + "learning_rate": 4.600609825434564e-06, + "loss": 1.1860394477844238, + "step": 10148 + }, + { + "epoch": 1.8475471011195048, + "grad_norm": 7.03125, + "learning_rate": 4.599174461797143e-06, + "loss": 1.3179806470870972, + "step": 10150 + }, + { + "epoch": 1.847911167743697, + "grad_norm": 6.96875, + "learning_rate": 4.597739303725192e-06, + "loss": 1.2298710346221924, + "step": 10152 + }, + { + "epoch": 1.8482752343678892, + "grad_norm": 16.375, + "learning_rate": 4.596304351429315e-06, + "loss": 1.504056453704834, + "step": 10154 + }, + { + "epoch": 1.8486393009920814, + "grad_norm": 12.875, + "learning_rate": 4.594869605120088e-06, + "loss": 1.791691541671753, + "step": 10156 + }, + { + "epoch": 1.8490033676162738, + "grad_norm": 3.9375, + "learning_rate": 4.59343506500805e-06, + "loss": 0.8267419338226318, + "step": 10158 + }, + { + "epoch": 1.849367434240466, + "grad_norm": 6.03125, + "learning_rate": 4.592000731303716e-06, + "loss": 0.9686307311058044, + "step": 10160 + }, + { + "epoch": 1.8497315008646582, + "grad_norm": 23.0, + "learning_rate": 4.590566604217568e-06, + "loss": 1.5720093250274658, + "step": 10162 + }, + { + "epoch": 1.8500955674888506, + "grad_norm": 22.125, + "learning_rate": 4.58913268396006e-06, + "loss": 1.6447392702102661, + "step": 10164 + }, + { + "epoch": 1.8504596341130428, + "grad_norm": 14.5, + "learning_rate": 4.587698970741613e-06, + "loss": 1.420624852180481, + "step": 10166 + }, + { + "epoch": 1.850823700737235, + "grad_norm": 11.125, + "learning_rate": 4.586265464772617e-06, + "loss": 1.4557628631591797, + "step": 10168 + }, + { + "epoch": 1.8511877673614272, + "grad_norm": 21.625, + "learning_rate": 4.584832166263437e-06, + "loss": 1.7179882526397705, + "step": 10170 + }, + { + "epoch": 1.8515518339856194, + "grad_norm": 34.0, + "learning_rate": 4.583399075424399e-06, + "loss": 1.2712702751159668, + "step": 10172 + }, + { + "epoch": 1.8519159006098116, + "grad_norm": 13.1875, + "learning_rate": 4.581966192465807e-06, + "loss": 1.2924363613128662, + "step": 10174 + }, + { + "epoch": 1.8522799672340038, + "grad_norm": 16.25, + "learning_rate": 4.580533517597931e-06, + "loss": 1.3899078369140625, + "step": 10176 + }, + { + "epoch": 1.852644033858196, + "grad_norm": 9.75, + "learning_rate": 4.579101051031005e-06, + "loss": 1.3126336336135864, + "step": 10178 + }, + { + "epoch": 1.8530081004823882, + "grad_norm": 12.625, + "learning_rate": 4.577668792975245e-06, + "loss": 1.108689546585083, + "step": 10180 + }, + { + "epoch": 1.8533721671065804, + "grad_norm": 27.25, + "learning_rate": 4.576236743640823e-06, + "loss": 1.0283482074737549, + "step": 10182 + }, + { + "epoch": 1.8537362337307728, + "grad_norm": 15.1875, + "learning_rate": 4.5748049032378895e-06, + "loss": 1.379903793334961, + "step": 10184 + }, + { + "epoch": 1.854100300354965, + "grad_norm": 8.375, + "learning_rate": 4.5733732719765615e-06, + "loss": 1.6895334720611572, + "step": 10186 + }, + { + "epoch": 1.8544643669791572, + "grad_norm": 19.875, + "learning_rate": 4.5719418500669234e-06, + "loss": 1.9689158201217651, + "step": 10188 + }, + { + "epoch": 1.8548284336033494, + "grad_norm": 12.25, + "learning_rate": 4.570510637719032e-06, + "loss": 1.6806762218475342, + "step": 10190 + }, + { + "epoch": 1.8551925002275418, + "grad_norm": 8.4375, + "learning_rate": 4.5690796351429105e-06, + "loss": 1.3381773233413696, + "step": 10192 + }, + { + "epoch": 1.855556566851734, + "grad_norm": 12.875, + "learning_rate": 4.567648842548553e-06, + "loss": 1.3012750148773193, + "step": 10194 + }, + { + "epoch": 1.8559206334759262, + "grad_norm": 11.5625, + "learning_rate": 4.5662182601459245e-06, + "loss": 1.4648109674453735, + "step": 10196 + }, + { + "epoch": 1.8562847001001184, + "grad_norm": 13.3125, + "learning_rate": 4.5647878881449545e-06, + "loss": 1.315039038658142, + "step": 10198 + }, + { + "epoch": 1.8566487667243106, + "grad_norm": 10.3125, + "learning_rate": 4.563357726755547e-06, + "loss": 1.0652984380722046, + "step": 10200 + }, + { + "epoch": 1.8570128333485028, + "grad_norm": 8.0625, + "learning_rate": 4.561927776187569e-06, + "loss": 1.421647310256958, + "step": 10202 + }, + { + "epoch": 1.857376899972695, + "grad_norm": 18.625, + "learning_rate": 4.560498036650863e-06, + "loss": 1.300749659538269, + "step": 10204 + }, + { + "epoch": 1.8577409665968871, + "grad_norm": 50.25, + "learning_rate": 4.559068508355237e-06, + "loss": 0.8592878580093384, + "step": 10206 + }, + { + "epoch": 1.8581050332210793, + "grad_norm": 6.53125, + "learning_rate": 4.557639191510466e-06, + "loss": 0.3649294674396515, + "step": 10208 + }, + { + "epoch": 1.8584690998452715, + "grad_norm": 11.5625, + "learning_rate": 4.5562100863263e-06, + "loss": 1.651382327079773, + "step": 10210 + }, + { + "epoch": 1.858833166469464, + "grad_norm": 6.6875, + "learning_rate": 4.554781193012451e-06, + "loss": 1.2910923957824707, + "step": 10212 + }, + { + "epoch": 1.8591972330936561, + "grad_norm": 15.75, + "learning_rate": 4.553352511778606e-06, + "loss": 1.9338804483413696, + "step": 10214 + }, + { + "epoch": 1.8595612997178483, + "grad_norm": 15.25, + "learning_rate": 4.551924042834418e-06, + "loss": 1.8648362159729004, + "step": 10216 + }, + { + "epoch": 1.8599253663420408, + "grad_norm": 14.1875, + "learning_rate": 4.550495786389507e-06, + "loss": 1.2972800731658936, + "step": 10218 + }, + { + "epoch": 1.860289432966233, + "grad_norm": 55.5, + "learning_rate": 4.549067742653466e-06, + "loss": 1.4796545505523682, + "step": 10220 + }, + { + "epoch": 1.8606534995904251, + "grad_norm": 24.375, + "learning_rate": 4.547639911835852e-06, + "loss": 1.3768882751464844, + "step": 10222 + }, + { + "epoch": 1.8610175662146173, + "grad_norm": 7.28125, + "learning_rate": 4.546212294146196e-06, + "loss": 1.211484670639038, + "step": 10224 + }, + { + "epoch": 1.8613816328388095, + "grad_norm": 10.75, + "learning_rate": 4.544784889793994e-06, + "loss": 1.3258265256881714, + "step": 10226 + }, + { + "epoch": 1.8617456994630017, + "grad_norm": 12.5, + "learning_rate": 4.543357698988712e-06, + "loss": 1.4483202695846558, + "step": 10228 + }, + { + "epoch": 1.862109766087194, + "grad_norm": 15.6875, + "learning_rate": 4.541930721939785e-06, + "loss": 1.4156239032745361, + "step": 10230 + }, + { + "epoch": 1.8624738327113861, + "grad_norm": 24.875, + "learning_rate": 4.540503958856615e-06, + "loss": 1.218030333518982, + "step": 10232 + }, + { + "epoch": 1.8628378993355783, + "grad_norm": 10.8125, + "learning_rate": 4.5390774099485735e-06, + "loss": 1.024248480796814, + "step": 10234 + }, + { + "epoch": 1.8632019659597705, + "grad_norm": 12.0625, + "learning_rate": 4.537651075425003e-06, + "loss": 0.5905171632766724, + "step": 10236 + }, + { + "epoch": 1.863566032583963, + "grad_norm": 9.6875, + "learning_rate": 4.536224955495209e-06, + "loss": 1.526942253112793, + "step": 10238 + }, + { + "epoch": 1.8639300992081551, + "grad_norm": 9.5625, + "learning_rate": 4.534799050368473e-06, + "loss": 1.4773681163787842, + "step": 10240 + }, + { + "epoch": 1.8642941658323473, + "grad_norm": 12.1875, + "learning_rate": 4.533373360254036e-06, + "loss": 1.3674228191375732, + "step": 10242 + }, + { + "epoch": 1.8646582324565395, + "grad_norm": 52.75, + "learning_rate": 4.531947885361115e-06, + "loss": 1.23373544216156, + "step": 10244 + }, + { + "epoch": 1.865022299080732, + "grad_norm": 21.375, + "learning_rate": 4.5305226258988945e-06, + "loss": 1.6065036058425903, + "step": 10246 + }, + { + "epoch": 1.865386365704924, + "grad_norm": 15.6875, + "learning_rate": 4.529097582076521e-06, + "loss": 1.4302940368652344, + "step": 10248 + }, + { + "epoch": 1.8657504323291163, + "grad_norm": 9.125, + "learning_rate": 4.527672754103118e-06, + "loss": 1.3910472393035889, + "step": 10250 + }, + { + "epoch": 1.8661144989533085, + "grad_norm": 12.5, + "learning_rate": 4.52624814218777e-06, + "loss": 1.1082059144973755, + "step": 10252 + }, + { + "epoch": 1.8664785655775007, + "grad_norm": 5.34375, + "learning_rate": 4.524823746539535e-06, + "loss": 1.2091387510299683, + "step": 10254 + }, + { + "epoch": 1.8668426322016929, + "grad_norm": 8.8125, + "learning_rate": 4.523399567367437e-06, + "loss": 1.4354215860366821, + "step": 10256 + }, + { + "epoch": 1.867206698825885, + "grad_norm": 12.625, + "learning_rate": 4.521975604880469e-06, + "loss": 1.4698941707611084, + "step": 10258 + }, + { + "epoch": 1.8675707654500773, + "grad_norm": 6.03125, + "learning_rate": 4.520551859287591e-06, + "loss": 1.2633163928985596, + "step": 10260 + }, + { + "epoch": 1.8679348320742695, + "grad_norm": 11.9375, + "learning_rate": 4.519128330797731e-06, + "loss": 1.2887117862701416, + "step": 10262 + }, + { + "epoch": 1.8682988986984617, + "grad_norm": 16.125, + "learning_rate": 4.517705019619787e-06, + "loss": 1.693401575088501, + "step": 10264 + }, + { + "epoch": 1.868662965322654, + "grad_norm": 7.125, + "learning_rate": 4.516281925962626e-06, + "loss": 1.133830189704895, + "step": 10266 + }, + { + "epoch": 1.8690270319468463, + "grad_norm": 9.875, + "learning_rate": 4.5148590500350766e-06, + "loss": 1.4101113080978394, + "step": 10268 + }, + { + "epoch": 1.8693910985710385, + "grad_norm": 7.125, + "learning_rate": 4.513436392045945e-06, + "loss": 1.1699773073196411, + "step": 10270 + }, + { + "epoch": 1.8697551651952309, + "grad_norm": 24.25, + "learning_rate": 4.512013952203997e-06, + "loss": 1.2959704399108887, + "step": 10272 + }, + { + "epoch": 1.870119231819423, + "grad_norm": 9.875, + "learning_rate": 4.510591730717972e-06, + "loss": 1.1705671548843384, + "step": 10274 + }, + { + "epoch": 1.8704832984436153, + "grad_norm": 13.375, + "learning_rate": 4.509169727796574e-06, + "loss": 1.3706467151641846, + "step": 10276 + }, + { + "epoch": 1.8708473650678075, + "grad_norm": 18.875, + "learning_rate": 4.507747943648477e-06, + "loss": 1.4365633726119995, + "step": 10278 + }, + { + "epoch": 1.8712114316919997, + "grad_norm": 23.625, + "learning_rate": 4.506326378482322e-06, + "loss": 1.3904542922973633, + "step": 10280 + }, + { + "epoch": 1.8715754983161919, + "grad_norm": 9.875, + "learning_rate": 4.504905032506717e-06, + "loss": 1.3144638538360596, + "step": 10282 + }, + { + "epoch": 1.871939564940384, + "grad_norm": 9.3125, + "learning_rate": 4.503483905930239e-06, + "loss": 1.3469189405441284, + "step": 10284 + }, + { + "epoch": 1.8723036315645762, + "grad_norm": 19.125, + "learning_rate": 4.502062998961434e-06, + "loss": 1.4580113887786865, + "step": 10286 + }, + { + "epoch": 1.8726676981887684, + "grad_norm": 16.0, + "learning_rate": 4.5006423118088136e-06, + "loss": 1.3660454750061035, + "step": 10288 + }, + { + "epoch": 1.8730317648129606, + "grad_norm": 34.5, + "learning_rate": 4.499221844680857e-06, + "loss": 2.1086161136627197, + "step": 10290 + }, + { + "epoch": 1.873395831437153, + "grad_norm": 18.75, + "learning_rate": 4.497801597786011e-06, + "loss": 1.1748324632644653, + "step": 10292 + }, + { + "epoch": 1.8737598980613452, + "grad_norm": 19.0, + "learning_rate": 4.496381571332695e-06, + "loss": 1.843911051750183, + "step": 10294 + }, + { + "epoch": 1.8741239646855374, + "grad_norm": 11.5625, + "learning_rate": 4.494961765529289e-06, + "loss": 1.594116449356079, + "step": 10296 + }, + { + "epoch": 1.8744880313097296, + "grad_norm": 5.15625, + "learning_rate": 4.493542180584145e-06, + "loss": 1.3023351430892944, + "step": 10298 + }, + { + "epoch": 1.874852097933922, + "grad_norm": 7.09375, + "learning_rate": 4.4921228167055805e-06, + "loss": 1.3033276796340942, + "step": 10300 + }, + { + "epoch": 1.8752161645581142, + "grad_norm": 18.125, + "learning_rate": 4.490703674101881e-06, + "loss": 1.3037970066070557, + "step": 10302 + }, + { + "epoch": 1.8755802311823064, + "grad_norm": 12.375, + "learning_rate": 4.4892847529813005e-06, + "loss": 0.8734592199325562, + "step": 10304 + }, + { + "epoch": 1.8759442978064986, + "grad_norm": 8.125, + "learning_rate": 4.487866053552062e-06, + "loss": 1.3332806825637817, + "step": 10306 + }, + { + "epoch": 1.8763083644306908, + "grad_norm": 35.5, + "learning_rate": 4.4864475760223495e-06, + "loss": 1.343780279159546, + "step": 10308 + }, + { + "epoch": 1.876672431054883, + "grad_norm": 18.75, + "learning_rate": 4.4850293206003235e-06, + "loss": 1.638200044631958, + "step": 10310 + }, + { + "epoch": 1.8770364976790752, + "grad_norm": 16.125, + "learning_rate": 4.483611287494104e-06, + "loss": 1.4613993167877197, + "step": 10312 + }, + { + "epoch": 1.8774005643032674, + "grad_norm": 22.875, + "learning_rate": 4.482193476911782e-06, + "loss": 1.8559073209762573, + "step": 10314 + }, + { + "epoch": 1.8777646309274596, + "grad_norm": 41.75, + "learning_rate": 4.480775889061418e-06, + "loss": 1.727916955947876, + "step": 10316 + }, + { + "epoch": 1.8781286975516518, + "grad_norm": 10.0625, + "learning_rate": 4.479358524151034e-06, + "loss": 1.1891539096832275, + "step": 10318 + }, + { + "epoch": 1.8784927641758442, + "grad_norm": 270.0, + "learning_rate": 4.477941382388625e-06, + "loss": 1.0184335708618164, + "step": 10320 + }, + { + "epoch": 1.8788568308000364, + "grad_norm": 10.1875, + "learning_rate": 4.476524463982149e-06, + "loss": 1.256813645362854, + "step": 10322 + }, + { + "epoch": 1.8792208974242286, + "grad_norm": 16.5, + "learning_rate": 4.475107769139534e-06, + "loss": 1.7416784763336182, + "step": 10324 + }, + { + "epoch": 1.879584964048421, + "grad_norm": 9.875, + "learning_rate": 4.4736912980686745e-06, + "loss": 0.8409368991851807, + "step": 10326 + }, + { + "epoch": 1.8799490306726132, + "grad_norm": 20.625, + "learning_rate": 4.47227505097743e-06, + "loss": 1.396700143814087, + "step": 10328 + }, + { + "epoch": 1.8803130972968054, + "grad_norm": 4.84375, + "learning_rate": 4.470859028073632e-06, + "loss": 0.9189120531082153, + "step": 10330 + }, + { + "epoch": 1.8806771639209976, + "grad_norm": 7.96875, + "learning_rate": 4.469443229565073e-06, + "loss": 1.0982768535614014, + "step": 10332 + }, + { + "epoch": 1.8810412305451898, + "grad_norm": 6.53125, + "learning_rate": 4.468027655659518e-06, + "loss": 1.2586933374404907, + "step": 10334 + }, + { + "epoch": 1.881405297169382, + "grad_norm": 20.25, + "learning_rate": 4.4666123065646975e-06, + "loss": 1.3678619861602783, + "step": 10336 + }, + { + "epoch": 1.8817693637935742, + "grad_norm": 22.5, + "learning_rate": 4.465197182488304e-06, + "loss": 0.8838649988174438, + "step": 10338 + }, + { + "epoch": 1.8821334304177664, + "grad_norm": 45.75, + "learning_rate": 4.463782283638006e-06, + "loss": 1.314103364944458, + "step": 10340 + }, + { + "epoch": 1.8824974970419586, + "grad_norm": 11.25, + "learning_rate": 4.462367610221431e-06, + "loss": 0.8045669794082642, + "step": 10342 + }, + { + "epoch": 1.8828615636661508, + "grad_norm": 15.6875, + "learning_rate": 4.460953162446178e-06, + "loss": 1.2336615324020386, + "step": 10344 + }, + { + "epoch": 1.8832256302903432, + "grad_norm": 35.0, + "learning_rate": 4.459538940519813e-06, + "loss": 1.5921138525009155, + "step": 10346 + }, + { + "epoch": 1.8835896969145354, + "grad_norm": 18.875, + "learning_rate": 4.458124944649863e-06, + "loss": 0.7174089550971985, + "step": 10348 + }, + { + "epoch": 1.8839537635387276, + "grad_norm": 16.0, + "learning_rate": 4.45671117504383e-06, + "loss": 1.6809015274047852, + "step": 10350 + }, + { + "epoch": 1.8843178301629198, + "grad_norm": 9.8125, + "learning_rate": 4.455297631909177e-06, + "loss": 1.4973671436309814, + "step": 10352 + }, + { + "epoch": 1.8846818967871122, + "grad_norm": 24.875, + "learning_rate": 4.453884315453336e-06, + "loss": 1.8407665491104126, + "step": 10354 + }, + { + "epoch": 1.8850459634113044, + "grad_norm": 10.125, + "learning_rate": 4.452471225883708e-06, + "loss": 1.7408779859542847, + "step": 10356 + }, + { + "epoch": 1.8854100300354966, + "grad_norm": 5.1875, + "learning_rate": 4.4510583634076535e-06, + "loss": 1.2544187307357788, + "step": 10358 + }, + { + "epoch": 1.8857740966596888, + "grad_norm": 8.75, + "learning_rate": 4.4496457282325084e-06, + "loss": 0.773290753364563, + "step": 10360 + }, + { + "epoch": 1.886138163283881, + "grad_norm": 11.375, + "learning_rate": 4.448233320565569e-06, + "loss": 0.9481028914451599, + "step": 10362 + }, + { + "epoch": 1.8865022299080731, + "grad_norm": 11.5625, + "learning_rate": 4.4468211406141e-06, + "loss": 0.9567828178405762, + "step": 10364 + }, + { + "epoch": 1.8868662965322653, + "grad_norm": 10.1875, + "learning_rate": 4.445409188585337e-06, + "loss": 1.2877835035324097, + "step": 10366 + }, + { + "epoch": 1.8872303631564575, + "grad_norm": 13.3125, + "learning_rate": 4.443997464686472e-06, + "loss": 1.4767482280731201, + "step": 10368 + }, + { + "epoch": 1.8875944297806497, + "grad_norm": 20.625, + "learning_rate": 4.442585969124676e-06, + "loss": 1.4532097578048706, + "step": 10370 + }, + { + "epoch": 1.887958496404842, + "grad_norm": 21.75, + "learning_rate": 4.441174702107076e-06, + "loss": 1.56870436668396, + "step": 10372 + }, + { + "epoch": 1.8883225630290343, + "grad_norm": 4.5, + "learning_rate": 4.439763663840771e-06, + "loss": 1.191565990447998, + "step": 10374 + }, + { + "epoch": 1.8886866296532265, + "grad_norm": 6.5625, + "learning_rate": 4.438352854532826e-06, + "loss": 1.1797826290130615, + "step": 10376 + }, + { + "epoch": 1.8890506962774187, + "grad_norm": 14.1875, + "learning_rate": 4.43694227439027e-06, + "loss": 1.0686910152435303, + "step": 10378 + }, + { + "epoch": 1.8894147629016111, + "grad_norm": 4.59375, + "learning_rate": 4.435531923620102e-06, + "loss": 1.491809368133545, + "step": 10380 + }, + { + "epoch": 1.8897788295258033, + "grad_norm": 8.8125, + "learning_rate": 4.434121802429282e-06, + "loss": 1.065504789352417, + "step": 10382 + }, + { + "epoch": 1.8901428961499955, + "grad_norm": 15.9375, + "learning_rate": 4.432711911024743e-06, + "loss": 1.457078456878662, + "step": 10384 + }, + { + "epoch": 1.8905069627741877, + "grad_norm": 25.0, + "learning_rate": 4.431302249613379e-06, + "loss": 1.5185307264328003, + "step": 10386 + }, + { + "epoch": 1.89087102939838, + "grad_norm": 6.8125, + "learning_rate": 4.429892818402052e-06, + "loss": 1.2498611211776733, + "step": 10388 + }, + { + "epoch": 1.8912350960225721, + "grad_norm": 12.0625, + "learning_rate": 4.428483617597592e-06, + "loss": 0.7743430137634277, + "step": 10390 + }, + { + "epoch": 1.8915991626467643, + "grad_norm": 24.375, + "learning_rate": 4.427074647406791e-06, + "loss": 0.6134580373764038, + "step": 10392 + }, + { + "epoch": 1.8919632292709565, + "grad_norm": 3.625, + "learning_rate": 4.425665908036412e-06, + "loss": 0.9048395156860352, + "step": 10394 + }, + { + "epoch": 1.8923272958951487, + "grad_norm": 8.25, + "learning_rate": 4.424257399693181e-06, + "loss": 1.2680261135101318, + "step": 10396 + }, + { + "epoch": 1.892691362519341, + "grad_norm": 24.875, + "learning_rate": 4.422849122583789e-06, + "loss": 1.261030673980713, + "step": 10398 + }, + { + "epoch": 1.8930554291435333, + "grad_norm": 21.375, + "learning_rate": 4.421441076914898e-06, + "loss": 1.0411328077316284, + "step": 10400 + }, + { + "epoch": 1.8934194957677255, + "grad_norm": 20.25, + "learning_rate": 4.420033262893131e-06, + "loss": 1.1487151384353638, + "step": 10402 + }, + { + "epoch": 1.8937835623919177, + "grad_norm": 22.125, + "learning_rate": 4.41862568072508e-06, + "loss": 1.7431784868240356, + "step": 10404 + }, + { + "epoch": 1.8941476290161101, + "grad_norm": 25.875, + "learning_rate": 4.417218330617302e-06, + "loss": 1.1477004289627075, + "step": 10406 + }, + { + "epoch": 1.8945116956403023, + "grad_norm": 8.75, + "learning_rate": 4.415811212776319e-06, + "loss": 1.4733740091323853, + "step": 10408 + }, + { + "epoch": 1.8948757622644945, + "grad_norm": 5.9375, + "learning_rate": 4.41440432740862e-06, + "loss": 1.3657646179199219, + "step": 10410 + }, + { + "epoch": 1.8952398288886867, + "grad_norm": 69.5, + "learning_rate": 4.4129976747206605e-06, + "loss": 1.1342673301696777, + "step": 10412 + }, + { + "epoch": 1.895603895512879, + "grad_norm": 29.5, + "learning_rate": 4.4115912549188606e-06, + "loss": 1.9397865533828735, + "step": 10414 + }, + { + "epoch": 1.895967962137071, + "grad_norm": 18.0, + "learning_rate": 4.410185068209608e-06, + "loss": 1.4048717021942139, + "step": 10416 + }, + { + "epoch": 1.8963320287612633, + "grad_norm": 17.0, + "learning_rate": 4.408779114799251e-06, + "loss": 1.068558692932129, + "step": 10418 + }, + { + "epoch": 1.8966960953854555, + "grad_norm": 10.0, + "learning_rate": 4.407373394894113e-06, + "loss": 1.4947537183761597, + "step": 10420 + }, + { + "epoch": 1.8970601620096477, + "grad_norm": 7.28125, + "learning_rate": 4.405967908700472e-06, + "loss": 1.3134336471557617, + "step": 10422 + }, + { + "epoch": 1.8974242286338399, + "grad_norm": 42.0, + "learning_rate": 4.404562656424583e-06, + "loss": 1.6720588207244873, + "step": 10424 + }, + { + "epoch": 1.897788295258032, + "grad_norm": 11.5625, + "learning_rate": 4.4031576382726585e-06, + "loss": 1.2858198881149292, + "step": 10426 + }, + { + "epoch": 1.8981523618822245, + "grad_norm": 19.5, + "learning_rate": 4.401752854450877e-06, + "loss": 1.5044560432434082, + "step": 10428 + }, + { + "epoch": 1.8985164285064167, + "grad_norm": 6.09375, + "learning_rate": 4.4003483051653885e-06, + "loss": 1.1447221040725708, + "step": 10430 + }, + { + "epoch": 1.8988804951306089, + "grad_norm": 6.53125, + "learning_rate": 4.398943990622303e-06, + "loss": 1.2118511199951172, + "step": 10432 + }, + { + "epoch": 1.8992445617548013, + "grad_norm": 17.625, + "learning_rate": 4.397539911027698e-06, + "loss": 1.5545129776000977, + "step": 10434 + }, + { + "epoch": 1.8996086283789935, + "grad_norm": 14.0, + "learning_rate": 4.3961360665876176e-06, + "loss": 1.7559641599655151, + "step": 10436 + }, + { + "epoch": 1.8999726950031857, + "grad_norm": 5.5625, + "learning_rate": 4.394732457508069e-06, + "loss": 1.1165286302566528, + "step": 10438 + }, + { + "epoch": 1.9003367616273779, + "grad_norm": 8.3125, + "learning_rate": 4.393329083995028e-06, + "loss": 1.2374608516693115, + "step": 10440 + }, + { + "epoch": 1.90070082825157, + "grad_norm": 9.0, + "learning_rate": 4.39192594625443e-06, + "loss": 1.300781488418579, + "step": 10442 + }, + { + "epoch": 1.9010648948757622, + "grad_norm": 8.8125, + "learning_rate": 4.3905230444921845e-06, + "loss": 1.2841315269470215, + "step": 10444 + }, + { + "epoch": 1.9014289614999544, + "grad_norm": 30.5, + "learning_rate": 4.38912037891416e-06, + "loss": 1.8599058389663696, + "step": 10446 + }, + { + "epoch": 1.9017930281241466, + "grad_norm": 10.0, + "learning_rate": 4.3877179497261894e-06, + "loss": 0.9666900634765625, + "step": 10448 + }, + { + "epoch": 1.9021570947483388, + "grad_norm": 20.125, + "learning_rate": 4.386315757134078e-06, + "loss": 1.015418529510498, + "step": 10450 + }, + { + "epoch": 1.902521161372531, + "grad_norm": 8.9375, + "learning_rate": 4.384913801343588e-06, + "loss": 1.3500049114227295, + "step": 10452 + }, + { + "epoch": 1.9028852279967234, + "grad_norm": 33.0, + "learning_rate": 4.383512082560452e-06, + "loss": 1.4793367385864258, + "step": 10454 + }, + { + "epoch": 1.9032492946209156, + "grad_norm": 5.9375, + "learning_rate": 4.382110600990368e-06, + "loss": 1.116987705230713, + "step": 10456 + }, + { + "epoch": 1.9036133612451078, + "grad_norm": 20.875, + "learning_rate": 4.380709356838996e-06, + "loss": 1.3126630783081055, + "step": 10458 + }, + { + "epoch": 1.9039774278693002, + "grad_norm": 15.0625, + "learning_rate": 4.3793083503119645e-06, + "loss": 1.2656311988830566, + "step": 10460 + }, + { + "epoch": 1.9043414944934924, + "grad_norm": 15.4375, + "learning_rate": 4.377907581614862e-06, + "loss": 0.9602458477020264, + "step": 10462 + }, + { + "epoch": 1.9047055611176846, + "grad_norm": 13.625, + "learning_rate": 4.37650705095325e-06, + "loss": 1.130002498626709, + "step": 10464 + }, + { + "epoch": 1.9050696277418768, + "grad_norm": 16.75, + "learning_rate": 4.37510675853265e-06, + "loss": 1.4291210174560547, + "step": 10466 + }, + { + "epoch": 1.905433694366069, + "grad_norm": 50.5, + "learning_rate": 4.373706704558546e-06, + "loss": 1.5221493244171143, + "step": 10468 + }, + { + "epoch": 1.9057977609902612, + "grad_norm": 9.8125, + "learning_rate": 4.372306889236394e-06, + "loss": 1.388022541999817, + "step": 10470 + }, + { + "epoch": 1.9061618276144534, + "grad_norm": 9.3125, + "learning_rate": 4.370907312771607e-06, + "loss": 1.3597133159637451, + "step": 10472 + }, + { + "epoch": 1.9065258942386456, + "grad_norm": 13.0625, + "learning_rate": 4.3695079753695716e-06, + "loss": 1.316415548324585, + "step": 10474 + }, + { + "epoch": 1.9068899608628378, + "grad_norm": 15.8125, + "learning_rate": 4.368108877235633e-06, + "loss": 1.2514973878860474, + "step": 10476 + }, + { + "epoch": 1.90725402748703, + "grad_norm": 63.5, + "learning_rate": 4.366710018575102e-06, + "loss": 1.9783823490142822, + "step": 10478 + }, + { + "epoch": 1.9076180941112224, + "grad_norm": 8.375, + "learning_rate": 4.365311399593258e-06, + "loss": 1.1456583738327026, + "step": 10480 + }, + { + "epoch": 1.9079821607354146, + "grad_norm": 24.375, + "learning_rate": 4.363913020495341e-06, + "loss": 1.7451246976852417, + "step": 10482 + }, + { + "epoch": 1.9083462273596068, + "grad_norm": 19.625, + "learning_rate": 4.362514881486557e-06, + "loss": 1.3901368379592896, + "step": 10484 + }, + { + "epoch": 1.908710293983799, + "grad_norm": 59.25, + "learning_rate": 4.3611169827720795e-06, + "loss": 1.9135520458221436, + "step": 10486 + }, + { + "epoch": 1.9090743606079914, + "grad_norm": 37.25, + "learning_rate": 4.35971932455704e-06, + "loss": 1.4027525186538696, + "step": 10488 + }, + { + "epoch": 1.9094384272321836, + "grad_norm": 9.875, + "learning_rate": 4.3583219070465455e-06, + "loss": 1.3585864305496216, + "step": 10490 + }, + { + "epoch": 1.9098024938563758, + "grad_norm": 14.6875, + "learning_rate": 4.356924730445656e-06, + "loss": 0.8971824645996094, + "step": 10492 + }, + { + "epoch": 1.910166560480568, + "grad_norm": 8.5625, + "learning_rate": 4.3555277949594044e-06, + "loss": 0.9596958756446838, + "step": 10494 + }, + { + "epoch": 1.9105306271047602, + "grad_norm": 12.6875, + "learning_rate": 4.354131100792785e-06, + "loss": 0.4878948926925659, + "step": 10496 + }, + { + "epoch": 1.9108946937289524, + "grad_norm": 9.1875, + "learning_rate": 4.352734648150754e-06, + "loss": 1.3880257606506348, + "step": 10498 + }, + { + "epoch": 1.9112587603531446, + "grad_norm": 34.25, + "learning_rate": 4.35133843723824e-06, + "loss": 1.8474534749984741, + "step": 10500 + }, + { + "epoch": 1.9116228269773368, + "grad_norm": 9.0, + "learning_rate": 4.349942468260128e-06, + "loss": 1.1689791679382324, + "step": 10502 + }, + { + "epoch": 1.911986893601529, + "grad_norm": 9.25, + "learning_rate": 4.348546741421271e-06, + "loss": 1.4555976390838623, + "step": 10504 + }, + { + "epoch": 1.9123509602257212, + "grad_norm": 6.59375, + "learning_rate": 4.347151256926489e-06, + "loss": 1.0344799757003784, + "step": 10506 + }, + { + "epoch": 1.9127150268499136, + "grad_norm": 9.5, + "learning_rate": 4.345756014980559e-06, + "loss": 1.4202744960784912, + "step": 10508 + }, + { + "epoch": 1.9130790934741058, + "grad_norm": 10.6875, + "learning_rate": 4.344361015788232e-06, + "loss": 1.1562318801879883, + "step": 10510 + }, + { + "epoch": 1.913443160098298, + "grad_norm": 6.6875, + "learning_rate": 4.342966259554215e-06, + "loss": 1.3088953495025635, + "step": 10512 + }, + { + "epoch": 1.9138072267224904, + "grad_norm": 14.0625, + "learning_rate": 4.341571746483184e-06, + "loss": 1.3426487445831299, + "step": 10514 + }, + { + "epoch": 1.9141712933466826, + "grad_norm": 10.8125, + "learning_rate": 4.34017747677978e-06, + "loss": 1.414621114730835, + "step": 10516 + }, + { + "epoch": 1.9145353599708748, + "grad_norm": 14.1875, + "learning_rate": 4.338783450648602e-06, + "loss": 1.2499535083770752, + "step": 10518 + }, + { + "epoch": 1.914899426595067, + "grad_norm": 5.8125, + "learning_rate": 4.3373896682942215e-06, + "loss": 1.1770846843719482, + "step": 10520 + }, + { + "epoch": 1.9152634932192591, + "grad_norm": 24.25, + "learning_rate": 4.335996129921168e-06, + "loss": 1.4655331373214722, + "step": 10522 + }, + { + "epoch": 1.9156275598434513, + "grad_norm": 25.125, + "learning_rate": 4.33460283573394e-06, + "loss": 1.5368444919586182, + "step": 10524 + }, + { + "epoch": 1.9159916264676435, + "grad_norm": 24.25, + "learning_rate": 4.3332097859369985e-06, + "loss": 1.507416009902954, + "step": 10526 + }, + { + "epoch": 1.9163556930918357, + "grad_norm": 16.125, + "learning_rate": 4.331816980734762e-06, + "loss": 1.5046542882919312, + "step": 10528 + }, + { + "epoch": 1.916719759716028, + "grad_norm": 10.5625, + "learning_rate": 4.330424420331626e-06, + "loss": 1.3257375955581665, + "step": 10530 + }, + { + "epoch": 1.9170838263402201, + "grad_norm": 15.75, + "learning_rate": 4.32903210493194e-06, + "loss": 1.4912198781967163, + "step": 10532 + }, + { + "epoch": 1.9174478929644125, + "grad_norm": 10.75, + "learning_rate": 4.327640034740018e-06, + "loss": 1.3608543872833252, + "step": 10534 + }, + { + "epoch": 1.9178119595886047, + "grad_norm": 7.46875, + "learning_rate": 4.326248209960147e-06, + "loss": 1.2356034517288208, + "step": 10536 + }, + { + "epoch": 1.918176026212797, + "grad_norm": 6.25, + "learning_rate": 4.3248566307965645e-06, + "loss": 0.938212513923645, + "step": 10538 + }, + { + "epoch": 1.9185400928369891, + "grad_norm": 14.875, + "learning_rate": 4.323465297453486e-06, + "loss": 1.8588355779647827, + "step": 10540 + }, + { + "epoch": 1.9189041594611815, + "grad_norm": 15.0, + "learning_rate": 4.32207421013508e-06, + "loss": 1.1650190353393555, + "step": 10542 + }, + { + "epoch": 1.9192682260853737, + "grad_norm": 3.25, + "learning_rate": 4.320683369045481e-06, + "loss": 0.8501220941543579, + "step": 10544 + }, + { + "epoch": 1.919632292709566, + "grad_norm": 9.125, + "learning_rate": 4.3192927743887955e-06, + "loss": 1.5645121335983276, + "step": 10546 + }, + { + "epoch": 1.9199963593337581, + "grad_norm": 7.8125, + "learning_rate": 4.317902426369081e-06, + "loss": 0.9423190355300903, + "step": 10548 + }, + { + "epoch": 1.9203604259579503, + "grad_norm": 10.5625, + "learning_rate": 4.316512325190371e-06, + "loss": 1.2525100708007812, + "step": 10550 + }, + { + "epoch": 1.9207244925821425, + "grad_norm": 8.5, + "learning_rate": 4.315122471056653e-06, + "loss": 1.223229169845581, + "step": 10552 + }, + { + "epoch": 1.9210885592063347, + "grad_norm": 8.4375, + "learning_rate": 4.313732864171884e-06, + "loss": 1.190844178199768, + "step": 10554 + }, + { + "epoch": 1.921452625830527, + "grad_norm": 4.1875, + "learning_rate": 4.312343504739985e-06, + "loss": 1.204777717590332, + "step": 10556 + }, + { + "epoch": 1.921816692454719, + "grad_norm": 6.375, + "learning_rate": 4.310954392964835e-06, + "loss": 1.1608490943908691, + "step": 10558 + }, + { + "epoch": 1.9221807590789113, + "grad_norm": 13.0625, + "learning_rate": 4.309565529050286e-06, + "loss": 1.3439178466796875, + "step": 10560 + }, + { + "epoch": 1.9225448257031037, + "grad_norm": 22.75, + "learning_rate": 4.308176913200142e-06, + "loss": 1.4133667945861816, + "step": 10562 + }, + { + "epoch": 1.922908892327296, + "grad_norm": 11.0625, + "learning_rate": 4.306788545618179e-06, + "loss": 1.536535620689392, + "step": 10564 + }, + { + "epoch": 1.923272958951488, + "grad_norm": 10.0, + "learning_rate": 4.305400426508138e-06, + "loss": 0.4578596353530884, + "step": 10566 + }, + { + "epoch": 1.9236370255756805, + "grad_norm": 16.25, + "learning_rate": 4.304012556073714e-06, + "loss": 1.3371623754501343, + "step": 10568 + }, + { + "epoch": 1.9240010921998727, + "grad_norm": 14.3125, + "learning_rate": 4.302624934518577e-06, + "loss": 1.0038235187530518, + "step": 10570 + }, + { + "epoch": 1.924365158824065, + "grad_norm": 4.9375, + "learning_rate": 4.301237562046351e-06, + "loss": 0.9368493556976318, + "step": 10572 + }, + { + "epoch": 1.924729225448257, + "grad_norm": 11.3125, + "learning_rate": 4.299850438860625e-06, + "loss": 1.2630239725112915, + "step": 10574 + }, + { + "epoch": 1.9250932920724493, + "grad_norm": 11.1875, + "learning_rate": 4.298463565164962e-06, + "loss": 1.2804149389266968, + "step": 10576 + }, + { + "epoch": 1.9254573586966415, + "grad_norm": 19.875, + "learning_rate": 4.297076941162871e-06, + "loss": 1.0377178192138672, + "step": 10578 + }, + { + "epoch": 1.9258214253208337, + "grad_norm": 14.5, + "learning_rate": 4.295690567057841e-06, + "loss": 1.1005542278289795, + "step": 10580 + }, + { + "epoch": 1.9261854919450259, + "grad_norm": 9.125, + "learning_rate": 4.294304443053311e-06, + "loss": 1.353513240814209, + "step": 10582 + }, + { + "epoch": 1.926549558569218, + "grad_norm": 13.375, + "learning_rate": 4.292918569352691e-06, + "loss": 1.2248767614364624, + "step": 10584 + }, + { + "epoch": 1.9269136251934103, + "grad_norm": 11.6875, + "learning_rate": 4.291532946159355e-06, + "loss": 1.732736349105835, + "step": 10586 + }, + { + "epoch": 1.9272776918176027, + "grad_norm": 14.3125, + "learning_rate": 4.290147573676633e-06, + "loss": 1.9308345317840576, + "step": 10588 + }, + { + "epoch": 1.9276417584417949, + "grad_norm": 6.125, + "learning_rate": 4.288762452107827e-06, + "loss": 1.313896656036377, + "step": 10590 + }, + { + "epoch": 1.928005825065987, + "grad_norm": 15.875, + "learning_rate": 4.287377581656196e-06, + "loss": 1.16761314868927, + "step": 10592 + }, + { + "epoch": 1.9283698916901792, + "grad_norm": 4.375, + "learning_rate": 4.285992962524962e-06, + "loss": 1.0834652185440063, + "step": 10594 + }, + { + "epoch": 1.9287339583143717, + "grad_norm": 14.4375, + "learning_rate": 4.284608594917318e-06, + "loss": 1.215872049331665, + "step": 10596 + }, + { + "epoch": 1.9290980249385639, + "grad_norm": 10.375, + "learning_rate": 4.2832244790364075e-06, + "loss": 1.291481614112854, + "step": 10598 + }, + { + "epoch": 1.929462091562756, + "grad_norm": 19.5, + "learning_rate": 4.28184061508535e-06, + "loss": 1.6211323738098145, + "step": 10600 + }, + { + "epoch": 1.9298261581869482, + "grad_norm": 6.125, + "learning_rate": 4.280457003267218e-06, + "loss": 1.3089375495910645, + "step": 10602 + }, + { + "epoch": 1.9301902248111404, + "grad_norm": 5.375, + "learning_rate": 4.27907364378505e-06, + "loss": 1.319156289100647, + "step": 10604 + }, + { + "epoch": 1.9305542914353326, + "grad_norm": 22.25, + "learning_rate": 4.277690536841854e-06, + "loss": 1.1688460111618042, + "step": 10606 + }, + { + "epoch": 1.9309183580595248, + "grad_norm": 12.3125, + "learning_rate": 4.276307682640588e-06, + "loss": 0.8768727779388428, + "step": 10608 + }, + { + "epoch": 1.931282424683717, + "grad_norm": 25.75, + "learning_rate": 4.274925081384189e-06, + "loss": 1.332898497581482, + "step": 10610 + }, + { + "epoch": 1.9316464913079092, + "grad_norm": 12.3125, + "learning_rate": 4.2735427332755395e-06, + "loss": 1.9641807079315186, + "step": 10612 + }, + { + "epoch": 1.9320105579321014, + "grad_norm": 6.78125, + "learning_rate": 4.2721606385174966e-06, + "loss": 1.1488454341888428, + "step": 10614 + }, + { + "epoch": 1.9323746245562938, + "grad_norm": 10.5, + "learning_rate": 4.27077879731288e-06, + "loss": 1.191027283668518, + "step": 10616 + }, + { + "epoch": 1.932738691180486, + "grad_norm": 16.625, + "learning_rate": 4.269397209864465e-06, + "loss": 1.5805479288101196, + "step": 10618 + }, + { + "epoch": 1.9331027578046782, + "grad_norm": 6.90625, + "learning_rate": 4.268015876374999e-06, + "loss": 1.1137583255767822, + "step": 10620 + }, + { + "epoch": 1.9334668244288706, + "grad_norm": 3.078125, + "learning_rate": 4.266634797047182e-06, + "loss": 1.2779500484466553, + "step": 10622 + }, + { + "epoch": 1.9338308910530628, + "grad_norm": 7.5625, + "learning_rate": 4.2652539720836826e-06, + "loss": 0.9298569560050964, + "step": 10624 + }, + { + "epoch": 1.934194957677255, + "grad_norm": 11.0625, + "learning_rate": 4.2638734016871355e-06, + "loss": 1.3740111589431763, + "step": 10626 + }, + { + "epoch": 1.9345590243014472, + "grad_norm": 11.1875, + "learning_rate": 4.262493086060127e-06, + "loss": 1.0150814056396484, + "step": 10628 + }, + { + "epoch": 1.9349230909256394, + "grad_norm": 10.4375, + "learning_rate": 4.2611130254052204e-06, + "loss": 1.4758589267730713, + "step": 10630 + }, + { + "epoch": 1.9352871575498316, + "grad_norm": 33.75, + "learning_rate": 4.259733219924929e-06, + "loss": 1.858879804611206, + "step": 10632 + }, + { + "epoch": 1.9356512241740238, + "grad_norm": 13.8125, + "learning_rate": 4.258353669821732e-06, + "loss": 1.7634761333465576, + "step": 10634 + }, + { + "epoch": 1.936015290798216, + "grad_norm": 8.4375, + "learning_rate": 4.256974375298079e-06, + "loss": 1.4033633470535278, + "step": 10636 + }, + { + "epoch": 1.9363793574224082, + "grad_norm": 21.625, + "learning_rate": 4.255595336556371e-06, + "loss": 1.578845739364624, + "step": 10638 + }, + { + "epoch": 1.9367434240466004, + "grad_norm": 7.21875, + "learning_rate": 4.25421655379898e-06, + "loss": 1.3658075332641602, + "step": 10640 + }, + { + "epoch": 1.9371074906707928, + "grad_norm": 4.71875, + "learning_rate": 4.252838027228233e-06, + "loss": 0.9435325860977173, + "step": 10642 + }, + { + "epoch": 1.937471557294985, + "grad_norm": 2.75, + "learning_rate": 4.251459757046424e-06, + "loss": 0.9834281206130981, + "step": 10644 + }, + { + "epoch": 1.9378356239191772, + "grad_norm": 8.6875, + "learning_rate": 4.250081743455812e-06, + "loss": 1.2774618864059448, + "step": 10646 + }, + { + "epoch": 1.9381996905433696, + "grad_norm": 8.6875, + "learning_rate": 4.248703986658609e-06, + "loss": 1.2626127004623413, + "step": 10648 + }, + { + "epoch": 1.9385637571675618, + "grad_norm": 15.3125, + "learning_rate": 4.247326486857002e-06, + "loss": 1.7468152046203613, + "step": 10650 + }, + { + "epoch": 1.938927823791754, + "grad_norm": 7.375, + "learning_rate": 4.245949244253129e-06, + "loss": 1.2844215631484985, + "step": 10652 + }, + { + "epoch": 1.9392918904159462, + "grad_norm": 14.375, + "learning_rate": 4.2445722590490934e-06, + "loss": 1.3195265531539917, + "step": 10654 + }, + { + "epoch": 1.9396559570401384, + "grad_norm": 28.875, + "learning_rate": 4.2431955314469686e-06, + "loss": 1.322016954421997, + "step": 10656 + }, + { + "epoch": 1.9400200236643306, + "grad_norm": 9.25, + "learning_rate": 4.241819061648777e-06, + "loss": 1.160323977470398, + "step": 10658 + }, + { + "epoch": 1.9403840902885228, + "grad_norm": 9.75, + "learning_rate": 4.240442849856515e-06, + "loss": 0.9990017414093018, + "step": 10660 + }, + { + "epoch": 1.940748156912715, + "grad_norm": 13.625, + "learning_rate": 4.239066896272133e-06, + "loss": 1.4734833240509033, + "step": 10662 + }, + { + "epoch": 1.9411122235369072, + "grad_norm": 30.125, + "learning_rate": 4.237691201097545e-06, + "loss": 1.6726047992706299, + "step": 10664 + }, + { + "epoch": 1.9414762901610993, + "grad_norm": 26.25, + "learning_rate": 4.236315764534635e-06, + "loss": 1.4053781032562256, + "step": 10666 + }, + { + "epoch": 1.9418403567852915, + "grad_norm": 8.9375, + "learning_rate": 4.234940586785236e-06, + "loss": 1.4766428470611572, + "step": 10668 + }, + { + "epoch": 1.942204423409484, + "grad_norm": 12.5625, + "learning_rate": 4.233565668051156e-06, + "loss": 1.1533327102661133, + "step": 10670 + }, + { + "epoch": 1.9425684900336762, + "grad_norm": 12.875, + "learning_rate": 4.232191008534154e-06, + "loss": 1.349991798400879, + "step": 10672 + }, + { + "epoch": 1.9429325566578683, + "grad_norm": 21.0, + "learning_rate": 4.230816608435955e-06, + "loss": 1.355294108390808, + "step": 10674 + }, + { + "epoch": 1.9432966232820608, + "grad_norm": 14.9375, + "learning_rate": 4.2294424679582514e-06, + "loss": 1.3827952146530151, + "step": 10676 + }, + { + "epoch": 1.943660689906253, + "grad_norm": 11.25, + "learning_rate": 4.228068587302688e-06, + "loss": 1.0594600439071655, + "step": 10678 + }, + { + "epoch": 1.9440247565304452, + "grad_norm": 23.75, + "learning_rate": 4.226694966670882e-06, + "loss": 0.8736613988876343, + "step": 10680 + }, + { + "epoch": 1.9443888231546373, + "grad_norm": 11.8125, + "learning_rate": 4.225321606264401e-06, + "loss": 1.3247336149215698, + "step": 10682 + }, + { + "epoch": 1.9447528897788295, + "grad_norm": 8.0, + "learning_rate": 4.22394850628478e-06, + "loss": 1.281076192855835, + "step": 10684 + }, + { + "epoch": 1.9451169564030217, + "grad_norm": 6.21875, + "learning_rate": 4.222575666933521e-06, + "loss": 1.1978704929351807, + "step": 10686 + }, + { + "epoch": 1.945481023027214, + "grad_norm": 18.375, + "learning_rate": 4.221203088412078e-06, + "loss": 1.4343655109405518, + "step": 10688 + }, + { + "epoch": 1.9458450896514061, + "grad_norm": 10.625, + "learning_rate": 4.219830770921875e-06, + "loss": 1.3603440523147583, + "step": 10690 + }, + { + "epoch": 1.9462091562755983, + "grad_norm": 10.9375, + "learning_rate": 4.218458714664291e-06, + "loss": 1.562471628189087, + "step": 10692 + }, + { + "epoch": 1.9465732228997905, + "grad_norm": 10.6875, + "learning_rate": 4.217086919840669e-06, + "loss": 1.9436063766479492, + "step": 10694 + }, + { + "epoch": 1.946937289523983, + "grad_norm": 12.5625, + "learning_rate": 4.2157153866523185e-06, + "loss": 1.183310866355896, + "step": 10696 + }, + { + "epoch": 1.9473013561481751, + "grad_norm": 14.0625, + "learning_rate": 4.214344115300501e-06, + "loss": 1.3032796382904053, + "step": 10698 + }, + { + "epoch": 1.9476654227723673, + "grad_norm": 15.1875, + "learning_rate": 4.212973105986451e-06, + "loss": 1.7914981842041016, + "step": 10700 + }, + { + "epoch": 1.9480294893965597, + "grad_norm": 7.5, + "learning_rate": 4.211602358911354e-06, + "loss": 1.3102962970733643, + "step": 10702 + }, + { + "epoch": 1.948393556020752, + "grad_norm": 9.625, + "learning_rate": 4.210231874276362e-06, + "loss": 0.9511297941207886, + "step": 10704 + }, + { + "epoch": 1.9487576226449441, + "grad_norm": 11.8125, + "learning_rate": 4.208861652282592e-06, + "loss": 1.601237416267395, + "step": 10706 + }, + { + "epoch": 1.9491216892691363, + "grad_norm": 13.4375, + "learning_rate": 4.2074916931311124e-06, + "loss": 1.498426079750061, + "step": 10708 + }, + { + "epoch": 1.9494857558933285, + "grad_norm": 27.375, + "learning_rate": 4.206121997022966e-06, + "loss": 1.3698067665100098, + "step": 10710 + }, + { + "epoch": 1.9498498225175207, + "grad_norm": 17.625, + "learning_rate": 4.204752564159144e-06, + "loss": 1.1138907670974731, + "step": 10712 + }, + { + "epoch": 1.950213889141713, + "grad_norm": 41.5, + "learning_rate": 4.203383394740607e-06, + "loss": 0.8284420371055603, + "step": 10714 + }, + { + "epoch": 1.950577955765905, + "grad_norm": 8.9375, + "learning_rate": 4.202014488968279e-06, + "loss": 1.0195693969726562, + "step": 10716 + }, + { + "epoch": 1.9509420223900973, + "grad_norm": 5.3125, + "learning_rate": 4.200645847043034e-06, + "loss": 0.914522647857666, + "step": 10718 + }, + { + "epoch": 1.9513060890142895, + "grad_norm": 7.84375, + "learning_rate": 4.199277469165724e-06, + "loss": 1.1466375589370728, + "step": 10720 + }, + { + "epoch": 1.951670155638482, + "grad_norm": 7.59375, + "learning_rate": 4.197909355537144e-06, + "loss": 1.3353852033615112, + "step": 10722 + }, + { + "epoch": 1.952034222262674, + "grad_norm": 10.8125, + "learning_rate": 4.196541506358062e-06, + "loss": 1.3710817098617554, + "step": 10724 + }, + { + "epoch": 1.9523982888868663, + "grad_norm": 6.71875, + "learning_rate": 4.195173921829208e-06, + "loss": 1.2640271186828613, + "step": 10726 + }, + { + "epoch": 1.9527623555110585, + "grad_norm": 19.625, + "learning_rate": 4.193806602151264e-06, + "loss": 1.4582655429840088, + "step": 10728 + }, + { + "epoch": 1.953126422135251, + "grad_norm": 28.25, + "learning_rate": 4.192439547524885e-06, + "loss": 2.082249402999878, + "step": 10730 + }, + { + "epoch": 1.953490488759443, + "grad_norm": 12.5, + "learning_rate": 4.191072758150674e-06, + "loss": 1.3775582313537598, + "step": 10732 + }, + { + "epoch": 1.9538545553836353, + "grad_norm": 14.0, + "learning_rate": 4.189706234229204e-06, + "loss": 1.4993131160736084, + "step": 10734 + }, + { + "epoch": 1.9542186220078275, + "grad_norm": 22.125, + "learning_rate": 4.1883399759610114e-06, + "loss": 1.3735322952270508, + "step": 10736 + }, + { + "epoch": 1.9545826886320197, + "grad_norm": 13.9375, + "learning_rate": 4.18697398354658e-06, + "loss": 1.4608542919158936, + "step": 10738 + }, + { + "epoch": 1.9549467552562119, + "grad_norm": 20.0, + "learning_rate": 4.185608257186374e-06, + "loss": 1.4929122924804688, + "step": 10740 + }, + { + "epoch": 1.955310821880404, + "grad_norm": 10.125, + "learning_rate": 4.184242797080802e-06, + "loss": 1.2406938076019287, + "step": 10742 + }, + { + "epoch": 1.9556748885045963, + "grad_norm": 9.5625, + "learning_rate": 4.182877603430238e-06, + "loss": 0.9509090185165405, + "step": 10744 + }, + { + "epoch": 1.9560389551287884, + "grad_norm": 9.8125, + "learning_rate": 4.1815126764350255e-06, + "loss": 1.6715571880340576, + "step": 10746 + }, + { + "epoch": 1.9564030217529806, + "grad_norm": 10.8125, + "learning_rate": 4.180148016295454e-06, + "loss": 1.6843599081039429, + "step": 10748 + }, + { + "epoch": 1.956767088377173, + "grad_norm": 17.25, + "learning_rate": 4.1787836232117905e-06, + "loss": 1.501615285873413, + "step": 10750 + }, + { + "epoch": 1.9571311550013653, + "grad_norm": 20.5, + "learning_rate": 4.177419497384247e-06, + "loss": 1.7046353816986084, + "step": 10752 + }, + { + "epoch": 1.9574952216255574, + "grad_norm": 15.75, + "learning_rate": 4.176055639013005e-06, + "loss": 1.4324826002120972, + "step": 10754 + }, + { + "epoch": 1.9578592882497499, + "grad_norm": 20.625, + "learning_rate": 4.174692048298208e-06, + "loss": 1.0059279203414917, + "step": 10756 + }, + { + "epoch": 1.958223354873942, + "grad_norm": 8.75, + "learning_rate": 4.173328725439953e-06, + "loss": 1.5212697982788086, + "step": 10758 + }, + { + "epoch": 1.9585874214981343, + "grad_norm": 14.1875, + "learning_rate": 4.171965670638309e-06, + "loss": 1.4247984886169434, + "step": 10760 + }, + { + "epoch": 1.9589514881223264, + "grad_norm": 8.8125, + "learning_rate": 4.1706028840932924e-06, + "loss": 1.3881982564926147, + "step": 10762 + }, + { + "epoch": 1.9593155547465186, + "grad_norm": 4.125, + "learning_rate": 4.169240366004887e-06, + "loss": 1.0480122566223145, + "step": 10764 + }, + { + "epoch": 1.9596796213707108, + "grad_norm": 12.6875, + "learning_rate": 4.167878116573041e-06, + "loss": 1.0187500715255737, + "step": 10766 + }, + { + "epoch": 1.960043687994903, + "grad_norm": 16.5, + "learning_rate": 4.166516135997654e-06, + "loss": 1.4734399318695068, + "step": 10768 + }, + { + "epoch": 1.9604077546190952, + "grad_norm": 13.4375, + "learning_rate": 4.165154424478597e-06, + "loss": 1.3496389389038086, + "step": 10770 + }, + { + "epoch": 1.9607718212432874, + "grad_norm": 7.6875, + "learning_rate": 4.1637929822156885e-06, + "loss": 1.3059468269348145, + "step": 10772 + }, + { + "epoch": 1.9611358878674796, + "grad_norm": 8.25, + "learning_rate": 4.162431809408719e-06, + "loss": 1.4281243085861206, + "step": 10774 + }, + { + "epoch": 1.961499954491672, + "grad_norm": 12.8125, + "learning_rate": 4.161070906257437e-06, + "loss": 1.4421210289001465, + "step": 10776 + }, + { + "epoch": 1.9618640211158642, + "grad_norm": 7.6875, + "learning_rate": 4.1597102729615435e-06, + "loss": 1.0801186561584473, + "step": 10778 + }, + { + "epoch": 1.9622280877400564, + "grad_norm": 13.375, + "learning_rate": 4.158349909720713e-06, + "loss": 1.1573163270950317, + "step": 10780 + }, + { + "epoch": 1.9625921543642486, + "grad_norm": 5.46875, + "learning_rate": 4.156989816734568e-06, + "loss": 1.343355655670166, + "step": 10782 + }, + { + "epoch": 1.962956220988441, + "grad_norm": 9.0, + "learning_rate": 4.1556299942026965e-06, + "loss": 0.8314065337181091, + "step": 10784 + }, + { + "epoch": 1.9633202876126332, + "grad_norm": 9.0, + "learning_rate": 4.154270442324653e-06, + "loss": 0.9527825117111206, + "step": 10786 + }, + { + "epoch": 1.9636843542368254, + "grad_norm": 22.0, + "learning_rate": 4.1529111612999376e-06, + "loss": 1.5368661880493164, + "step": 10788 + }, + { + "epoch": 1.9640484208610176, + "grad_norm": 31.5, + "learning_rate": 4.151552151328026e-06, + "loss": 1.8109310865402222, + "step": 10790 + }, + { + "epoch": 1.9644124874852098, + "grad_norm": 9.0, + "learning_rate": 4.150193412608346e-06, + "loss": 0.9711395502090454, + "step": 10792 + }, + { + "epoch": 1.964776554109402, + "grad_norm": 23.125, + "learning_rate": 4.148834945340283e-06, + "loss": 0.9717168211936951, + "step": 10794 + }, + { + "epoch": 1.9651406207335942, + "grad_norm": 32.0, + "learning_rate": 4.1474767497231924e-06, + "loss": 1.4291647672653198, + "step": 10796 + }, + { + "epoch": 1.9655046873577864, + "grad_norm": 4.9375, + "learning_rate": 4.146118825956379e-06, + "loss": 1.1666853427886963, + "step": 10798 + }, + { + "epoch": 1.9658687539819786, + "grad_norm": 22.125, + "learning_rate": 4.144761174239118e-06, + "loss": 1.2857627868652344, + "step": 10800 + }, + { + "epoch": 1.9662328206061708, + "grad_norm": 55.25, + "learning_rate": 4.143403794770633e-06, + "loss": 1.5387756824493408, + "step": 10802 + }, + { + "epoch": 1.9665968872303632, + "grad_norm": 10.125, + "learning_rate": 4.142046687750117e-06, + "loss": 1.6857328414916992, + "step": 10804 + }, + { + "epoch": 1.9669609538545554, + "grad_norm": 8.5625, + "learning_rate": 4.1406898533767225e-06, + "loss": 1.089185357093811, + "step": 10806 + }, + { + "epoch": 1.9673250204787476, + "grad_norm": 6.59375, + "learning_rate": 4.139333291849555e-06, + "loss": 1.3702129125595093, + "step": 10808 + }, + { + "epoch": 1.96768908710294, + "grad_norm": 13.625, + "learning_rate": 4.137977003367687e-06, + "loss": 1.2946057319641113, + "step": 10810 + }, + { + "epoch": 1.9680531537271322, + "grad_norm": 13.875, + "learning_rate": 4.136620988130148e-06, + "loss": 1.5037271976470947, + "step": 10812 + }, + { + "epoch": 1.9684172203513244, + "grad_norm": 11.1875, + "learning_rate": 4.135265246335924e-06, + "loss": 1.3822699785232544, + "step": 10814 + }, + { + "epoch": 1.9687812869755166, + "grad_norm": 62.0, + "learning_rate": 4.133909778183973e-06, + "loss": 1.1629102230072021, + "step": 10816 + }, + { + "epoch": 1.9691453535997088, + "grad_norm": 11.8125, + "learning_rate": 4.132554583873195e-06, + "loss": 1.7002830505371094, + "step": 10818 + }, + { + "epoch": 1.969509420223901, + "grad_norm": 12.3125, + "learning_rate": 4.131199663602468e-06, + "loss": 1.3595935106277466, + "step": 10820 + }, + { + "epoch": 1.9698734868480932, + "grad_norm": 9.5625, + "learning_rate": 4.129845017570615e-06, + "loss": 1.2798871994018555, + "step": 10822 + }, + { + "epoch": 1.9702375534722854, + "grad_norm": 15.25, + "learning_rate": 4.128490645976424e-06, + "loss": 1.2766427993774414, + "step": 10824 + }, + { + "epoch": 1.9706016200964775, + "grad_norm": 21.875, + "learning_rate": 4.12713654901865e-06, + "loss": 1.4783878326416016, + "step": 10826 + }, + { + "epoch": 1.9709656867206697, + "grad_norm": 18.875, + "learning_rate": 4.125782726895995e-06, + "loss": 1.8505593538284302, + "step": 10828 + }, + { + "epoch": 1.9713297533448622, + "grad_norm": 17.0, + "learning_rate": 4.124429179807131e-06, + "loss": 1.2525584697723389, + "step": 10830 + }, + { + "epoch": 1.9716938199690544, + "grad_norm": 16.375, + "learning_rate": 4.123075907950683e-06, + "loss": 1.4278440475463867, + "step": 10832 + }, + { + "epoch": 1.9720578865932465, + "grad_norm": 17.5, + "learning_rate": 4.121722911525237e-06, + "loss": 1.7229260206222534, + "step": 10834 + }, + { + "epoch": 1.9724219532174387, + "grad_norm": 20.625, + "learning_rate": 4.120370190729345e-06, + "loss": 1.595231294631958, + "step": 10836 + }, + { + "epoch": 1.9727860198416312, + "grad_norm": 18.875, + "learning_rate": 4.119017745761507e-06, + "loss": 0.9405343532562256, + "step": 10838 + }, + { + "epoch": 1.9731500864658233, + "grad_norm": 26.875, + "learning_rate": 4.117665576820196e-06, + "loss": 1.1189336776733398, + "step": 10840 + }, + { + "epoch": 1.9735141530900155, + "grad_norm": 8.3125, + "learning_rate": 4.11631368410383e-06, + "loss": 1.4718209505081177, + "step": 10842 + }, + { + "epoch": 1.9738782197142077, + "grad_norm": 5.3125, + "learning_rate": 4.114962067810796e-06, + "loss": 1.3799368143081665, + "step": 10844 + }, + { + "epoch": 1.9742422863384, + "grad_norm": 7.34375, + "learning_rate": 4.113610728139443e-06, + "loss": 1.3353067636489868, + "step": 10846 + }, + { + "epoch": 1.9746063529625921, + "grad_norm": 6.90625, + "learning_rate": 4.112259665288067e-06, + "loss": 1.2081918716430664, + "step": 10848 + }, + { + "epoch": 1.9749704195867843, + "grad_norm": 5.15625, + "learning_rate": 4.110908879454938e-06, + "loss": 1.038845419883728, + "step": 10850 + }, + { + "epoch": 1.9753344862109765, + "grad_norm": 6.0, + "learning_rate": 4.109558370838273e-06, + "loss": 1.1737656593322754, + "step": 10852 + }, + { + "epoch": 1.9756985528351687, + "grad_norm": 14.8125, + "learning_rate": 4.108208139636255e-06, + "loss": 1.2012486457824707, + "step": 10854 + }, + { + "epoch": 1.976062619459361, + "grad_norm": 12.125, + "learning_rate": 4.106858186047028e-06, + "loss": 1.0143401622772217, + "step": 10856 + }, + { + "epoch": 1.9764266860835533, + "grad_norm": 19.25, + "learning_rate": 4.105508510268688e-06, + "loss": 1.3670973777770996, + "step": 10858 + }, + { + "epoch": 1.9767907527077455, + "grad_norm": 12.5625, + "learning_rate": 4.104159112499298e-06, + "loss": 1.4914178848266602, + "step": 10860 + }, + { + "epoch": 1.9771548193319377, + "grad_norm": 19.375, + "learning_rate": 4.102809992936875e-06, + "loss": 1.5278525352478027, + "step": 10862 + }, + { + "epoch": 1.9775188859561301, + "grad_norm": 6.28125, + "learning_rate": 4.101461151779395e-06, + "loss": 1.3893541097640991, + "step": 10864 + }, + { + "epoch": 1.9778829525803223, + "grad_norm": 20.375, + "learning_rate": 4.1001125892248e-06, + "loss": 1.114429235458374, + "step": 10866 + }, + { + "epoch": 1.9782470192045145, + "grad_norm": 24.125, + "learning_rate": 4.098764305470979e-06, + "loss": 2.0433480739593506, + "step": 10868 + }, + { + "epoch": 1.9786110858287067, + "grad_norm": 21.125, + "learning_rate": 4.0974163007157956e-06, + "loss": 1.8635485172271729, + "step": 10870 + }, + { + "epoch": 1.978975152452899, + "grad_norm": 45.5, + "learning_rate": 4.0960685751570565e-06, + "loss": 1.1606404781341553, + "step": 10872 + }, + { + "epoch": 1.979339219077091, + "grad_norm": 48.75, + "learning_rate": 4.0947211289925375e-06, + "loss": 0.9447245597839355, + "step": 10874 + }, + { + "epoch": 1.9797032857012833, + "grad_norm": 18.375, + "learning_rate": 4.093373962419974e-06, + "loss": 1.3852027654647827, + "step": 10876 + }, + { + "epoch": 1.9800673523254755, + "grad_norm": 16.625, + "learning_rate": 4.092027075637053e-06, + "loss": 1.3616926670074463, + "step": 10878 + }, + { + "epoch": 1.9804314189496677, + "grad_norm": 11.25, + "learning_rate": 4.090680468841428e-06, + "loss": 1.2145402431488037, + "step": 10880 + }, + { + "epoch": 1.9807954855738599, + "grad_norm": 30.0, + "learning_rate": 4.089334142230704e-06, + "loss": 0.5566257238388062, + "step": 10882 + }, + { + "epoch": 1.9811595521980523, + "grad_norm": 8.75, + "learning_rate": 4.087988096002451e-06, + "loss": 0.8723641633987427, + "step": 10884 + }, + { + "epoch": 1.9815236188222445, + "grad_norm": 8.0625, + "learning_rate": 4.0866423303541995e-06, + "loss": 1.459789514541626, + "step": 10886 + }, + { + "epoch": 1.9818876854464367, + "grad_norm": 8.1875, + "learning_rate": 4.085296845483429e-06, + "loss": 0.9855976700782776, + "step": 10888 + }, + { + "epoch": 1.9822517520706289, + "grad_norm": 7.375, + "learning_rate": 4.083951641587589e-06, + "loss": 1.1203888654708862, + "step": 10890 + }, + { + "epoch": 1.9826158186948213, + "grad_norm": 8.625, + "learning_rate": 4.082606718864079e-06, + "loss": 1.3982939720153809, + "step": 10892 + }, + { + "epoch": 1.9829798853190135, + "grad_norm": 9.375, + "learning_rate": 4.081262077510262e-06, + "loss": 1.5338878631591797, + "step": 10894 + }, + { + "epoch": 1.9833439519432057, + "grad_norm": 6.375, + "learning_rate": 4.079917717723461e-06, + "loss": 1.0103753805160522, + "step": 10896 + }, + { + "epoch": 1.9837080185673979, + "grad_norm": 7.0625, + "learning_rate": 4.078573639700951e-06, + "loss": 1.4745938777923584, + "step": 10898 + }, + { + "epoch": 1.98407208519159, + "grad_norm": 12.125, + "learning_rate": 4.077229843639976e-06, + "loss": 1.472595453262329, + "step": 10900 + }, + { + "epoch": 1.9844361518157823, + "grad_norm": 23.0, + "learning_rate": 4.075886329737727e-06, + "loss": 1.6663594245910645, + "step": 10902 + }, + { + "epoch": 1.9848002184399745, + "grad_norm": 10.25, + "learning_rate": 4.07454309819136e-06, + "loss": 1.2754945755004883, + "step": 10904 + }, + { + "epoch": 1.9851642850641666, + "grad_norm": 12.1875, + "learning_rate": 4.073200149197993e-06, + "loss": 1.1397364139556885, + "step": 10906 + }, + { + "epoch": 1.9855283516883588, + "grad_norm": 10.125, + "learning_rate": 4.071857482954692e-06, + "loss": 1.3538984060287476, + "step": 10908 + }, + { + "epoch": 1.985892418312551, + "grad_norm": 5.84375, + "learning_rate": 4.070515099658496e-06, + "loss": 1.2290761470794678, + "step": 10910 + }, + { + "epoch": 1.9862564849367434, + "grad_norm": 7.28125, + "learning_rate": 4.069172999506387e-06, + "loss": 0.6596604585647583, + "step": 10912 + }, + { + "epoch": 1.9866205515609356, + "grad_norm": 35.25, + "learning_rate": 4.067831182695313e-06, + "loss": 0.29634204506874084, + "step": 10914 + }, + { + "epoch": 1.9869846181851278, + "grad_norm": 12.9375, + "learning_rate": 4.0664896494221875e-06, + "loss": 1.6917842626571655, + "step": 10916 + }, + { + "epoch": 1.9873486848093203, + "grad_norm": 8.875, + "learning_rate": 4.0651483998838655e-06, + "loss": 1.1050643920898438, + "step": 10918 + }, + { + "epoch": 1.9877127514335124, + "grad_norm": 8.375, + "learning_rate": 4.0638074342771784e-06, + "loss": 1.35196852684021, + "step": 10920 + }, + { + "epoch": 1.9880768180577046, + "grad_norm": 7.0, + "learning_rate": 4.062466752798901e-06, + "loss": 1.3063468933105469, + "step": 10922 + }, + { + "epoch": 1.9884408846818968, + "grad_norm": 16.625, + "learning_rate": 4.061126355645775e-06, + "loss": 1.1754240989685059, + "step": 10924 + }, + { + "epoch": 1.988804951306089, + "grad_norm": 18.125, + "learning_rate": 4.059786243014503e-06, + "loss": 1.528351068496704, + "step": 10926 + }, + { + "epoch": 1.9891690179302812, + "grad_norm": 9.9375, + "learning_rate": 4.0584464151017315e-06, + "loss": 1.0657646656036377, + "step": 10928 + }, + { + "epoch": 1.9895330845544734, + "grad_norm": 10.0625, + "learning_rate": 4.057106872104084e-06, + "loss": 1.4041496515274048, + "step": 10930 + }, + { + "epoch": 1.9898971511786656, + "grad_norm": 14.9375, + "learning_rate": 4.055767614218128e-06, + "loss": 1.2570862770080566, + "step": 10932 + }, + { + "epoch": 1.9902612178028578, + "grad_norm": 16.25, + "learning_rate": 4.054428641640393e-06, + "loss": 1.221881628036499, + "step": 10934 + }, + { + "epoch": 1.99062528442705, + "grad_norm": 28.625, + "learning_rate": 4.053089954567374e-06, + "loss": 0.8489706516265869, + "step": 10936 + }, + { + "epoch": 1.9909893510512424, + "grad_norm": 11.3125, + "learning_rate": 4.051751553195511e-06, + "loss": 1.3841204643249512, + "step": 10938 + }, + { + "epoch": 1.9913534176754346, + "grad_norm": 5.3125, + "learning_rate": 4.050413437721214e-06, + "loss": 1.3610183000564575, + "step": 10940 + }, + { + "epoch": 1.9917174842996268, + "grad_norm": 207.0, + "learning_rate": 4.049075608340845e-06, + "loss": 1.3409671783447266, + "step": 10942 + }, + { + "epoch": 1.9920815509238192, + "grad_norm": 8.8125, + "learning_rate": 4.04773806525072e-06, + "loss": 1.311365008354187, + "step": 10944 + }, + { + "epoch": 1.9924456175480114, + "grad_norm": 9.6875, + "learning_rate": 4.046400808647126e-06, + "loss": 1.4495058059692383, + "step": 10946 + }, + { + "epoch": 1.9928096841722036, + "grad_norm": 22.375, + "learning_rate": 4.045063838726293e-06, + "loss": 1.3023087978363037, + "step": 10948 + }, + { + "epoch": 1.9931737507963958, + "grad_norm": 28.25, + "learning_rate": 4.043727155684422e-06, + "loss": 1.3837381601333618, + "step": 10950 + }, + { + "epoch": 1.993537817420588, + "grad_norm": 20.875, + "learning_rate": 4.042390759717661e-06, + "loss": 1.3718791007995605, + "step": 10952 + }, + { + "epoch": 1.9939018840447802, + "grad_norm": 22.125, + "learning_rate": 4.041054651022121e-06, + "loss": 1.923041582107544, + "step": 10954 + }, + { + "epoch": 1.9942659506689724, + "grad_norm": 14.25, + "learning_rate": 4.039718829793876e-06, + "loss": 1.4297645092010498, + "step": 10956 + }, + { + "epoch": 1.9946300172931646, + "grad_norm": 9.25, + "learning_rate": 4.038383296228945e-06, + "loss": 1.2713063955307007, + "step": 10958 + }, + { + "epoch": 1.9949940839173568, + "grad_norm": 228.0, + "learning_rate": 4.037048050523318e-06, + "loss": 1.1857223510742188, + "step": 10960 + }, + { + "epoch": 1.995358150541549, + "grad_norm": 4.5625, + "learning_rate": 4.035713092872933e-06, + "loss": 1.20783269405365, + "step": 10962 + }, + { + "epoch": 1.9957222171657414, + "grad_norm": 12.875, + "learning_rate": 4.0343784234736905e-06, + "loss": 1.4405242204666138, + "step": 10964 + }, + { + "epoch": 1.9960862837899336, + "grad_norm": 21.625, + "learning_rate": 4.03304404252145e-06, + "loss": 1.31243097782135, + "step": 10966 + }, + { + "epoch": 1.9964503504141258, + "grad_norm": 96.0, + "learning_rate": 4.031709950212023e-06, + "loss": 0.7867608070373535, + "step": 10968 + }, + { + "epoch": 1.996814417038318, + "grad_norm": 6.28125, + "learning_rate": 4.030376146741187e-06, + "loss": 1.3857314586639404, + "step": 10970 + }, + { + "epoch": 1.9971784836625104, + "grad_norm": 6.09375, + "learning_rate": 4.029042632304667e-06, + "loss": 1.2450498342514038, + "step": 10972 + }, + { + "epoch": 1.9975425502867026, + "grad_norm": 36.0, + "learning_rate": 4.027709407098152e-06, + "loss": 1.1632217168807983, + "step": 10974 + }, + { + "epoch": 1.9979066169108948, + "grad_norm": 7.8125, + "learning_rate": 4.026376471317292e-06, + "loss": 1.3292152881622314, + "step": 10976 + }, + { + "epoch": 1.998270683535087, + "grad_norm": 27.0, + "learning_rate": 4.025043825157683e-06, + "loss": 1.2493515014648438, + "step": 10978 + }, + { + "epoch": 1.9986347501592792, + "grad_norm": 9.0, + "learning_rate": 4.023711468814892e-06, + "loss": 1.3530218601226807, + "step": 10980 + }, + { + "epoch": 1.9989988167834714, + "grad_norm": 9.1875, + "learning_rate": 4.022379402484432e-06, + "loss": 1.179415225982666, + "step": 10982 + }, + { + "epoch": 1.9993628834076635, + "grad_norm": 11.0625, + "learning_rate": 4.021047626361778e-06, + "loss": 1.8628883361816406, + "step": 10984 + }, + { + "epoch": 1.9997269500318557, + "grad_norm": 20.625, + "learning_rate": 4.019716140642369e-06, + "loss": 1.5577448606491089, + "step": 10986 + }, + { + "epoch": 2.0, + "grad_norm": 26.125, + "learning_rate": 4.018384945521587e-06, + "loss": 1.4067811965942383, + "step": 10988 + }, + { + "epoch": 2.000364066624192, + "grad_norm": 4.5, + "learning_rate": 4.017054041194788e-06, + "loss": 1.392914891242981, + "step": 10990 + }, + { + "epoch": 2.0007281332483844, + "grad_norm": 26.375, + "learning_rate": 4.015723427857269e-06, + "loss": 0.8794772624969482, + "step": 10992 + }, + { + "epoch": 2.0010921998725766, + "grad_norm": 10.125, + "learning_rate": 4.014393105704295e-06, + "loss": 1.4271111488342285, + "step": 10994 + }, + { + "epoch": 2.0014562664967688, + "grad_norm": 6.28125, + "learning_rate": 4.0130630749310885e-06, + "loss": 1.029604196548462, + "step": 10996 + }, + { + "epoch": 2.001820333120961, + "grad_norm": 24.5, + "learning_rate": 4.01173333573282e-06, + "loss": 1.4767204523086548, + "step": 10998 + }, + { + "epoch": 2.002184399745153, + "grad_norm": 30.5, + "learning_rate": 4.010403888304631e-06, + "loss": 0.2808459997177124, + "step": 11000 + }, + { + "epoch": 2.002548466369346, + "grad_norm": 7.28125, + "learning_rate": 4.009074732841605e-06, + "loss": 1.2969317436218262, + "step": 11002 + }, + { + "epoch": 2.002912532993538, + "grad_norm": 3.171875, + "learning_rate": 4.007745869538793e-06, + "loss": 1.2992312908172607, + "step": 11004 + }, + { + "epoch": 2.00327659961773, + "grad_norm": 15.0, + "learning_rate": 4.006417298591203e-06, + "loss": 1.2189079523086548, + "step": 11006 + }, + { + "epoch": 2.0036406662419224, + "grad_norm": 23.375, + "learning_rate": 4.005089020193793e-06, + "loss": 1.8249293565750122, + "step": 11008 + }, + { + "epoch": 2.0040047328661146, + "grad_norm": 65.0, + "learning_rate": 4.003761034541487e-06, + "loss": 1.1246477365493774, + "step": 11010 + }, + { + "epoch": 2.0043687994903068, + "grad_norm": 17.125, + "learning_rate": 4.0024333418291586e-06, + "loss": 1.397111177444458, + "step": 11012 + }, + { + "epoch": 2.004732866114499, + "grad_norm": 6.90625, + "learning_rate": 4.001105942251639e-06, + "loss": 1.3780319690704346, + "step": 11014 + }, + { + "epoch": 2.005096932738691, + "grad_norm": 19.625, + "learning_rate": 3.999778836003725e-06, + "loss": 1.4079086780548096, + "step": 11016 + }, + { + "epoch": 2.0054609993628834, + "grad_norm": 9.4375, + "learning_rate": 3.998452023280158e-06, + "loss": 1.456685185432434, + "step": 11018 + }, + { + "epoch": 2.0058250659870756, + "grad_norm": 13.5625, + "learning_rate": 3.997125504275649e-06, + "loss": 1.3323144912719727, + "step": 11020 + }, + { + "epoch": 2.0061891326112677, + "grad_norm": 35.75, + "learning_rate": 3.995799279184852e-06, + "loss": 1.7964556217193604, + "step": 11022 + }, + { + "epoch": 2.00655319923546, + "grad_norm": 10.875, + "learning_rate": 3.9944733482023874e-06, + "loss": 1.0823254585266113, + "step": 11024 + }, + { + "epoch": 2.006917265859652, + "grad_norm": 15.625, + "learning_rate": 3.993147711522835e-06, + "loss": 1.273159384727478, + "step": 11026 + }, + { + "epoch": 2.0072813324838443, + "grad_norm": 20.875, + "learning_rate": 3.991822369340719e-06, + "loss": 1.3612158298492432, + "step": 11028 + }, + { + "epoch": 2.007645399108037, + "grad_norm": 11.0, + "learning_rate": 3.9904973218505355e-06, + "loss": 1.4647706747055054, + "step": 11030 + }, + { + "epoch": 2.008009465732229, + "grad_norm": 9.625, + "learning_rate": 3.989172569246724e-06, + "loss": 1.4063844680786133, + "step": 11032 + }, + { + "epoch": 2.0083735323564214, + "grad_norm": 18.0, + "learning_rate": 3.987848111723689e-06, + "loss": 1.2605879306793213, + "step": 11034 + }, + { + "epoch": 2.0087375989806135, + "grad_norm": 8.625, + "learning_rate": 3.9865239494757905e-06, + "loss": 1.2857003211975098, + "step": 11036 + }, + { + "epoch": 2.0091016656048057, + "grad_norm": 25.0, + "learning_rate": 3.98520008269734e-06, + "loss": 1.5507296323776245, + "step": 11038 + }, + { + "epoch": 2.009465732228998, + "grad_norm": 13.3125, + "learning_rate": 3.983876511582615e-06, + "loss": 1.1744754314422607, + "step": 11040 + }, + { + "epoch": 2.00982979885319, + "grad_norm": 1.796875, + "learning_rate": 3.982553236325839e-06, + "loss": 1.1620770692825317, + "step": 11042 + }, + { + "epoch": 2.0101938654773823, + "grad_norm": 7.46875, + "learning_rate": 3.981230257121199e-06, + "loss": 0.9784259796142578, + "step": 11044 + }, + { + "epoch": 2.0105579321015745, + "grad_norm": 26.0, + "learning_rate": 3.97990757416284e-06, + "loss": 1.6096962690353394, + "step": 11046 + }, + { + "epoch": 2.0109219987257667, + "grad_norm": 4.59375, + "learning_rate": 3.978585187644855e-06, + "loss": 1.377596378326416, + "step": 11048 + }, + { + "epoch": 2.011286065349959, + "grad_norm": 13.375, + "learning_rate": 3.977263097761305e-06, + "loss": 1.538359522819519, + "step": 11050 + }, + { + "epoch": 2.011650131974151, + "grad_norm": 7.09375, + "learning_rate": 3.975941304706195e-06, + "loss": 1.344555139541626, + "step": 11052 + }, + { + "epoch": 2.0120141985983433, + "grad_norm": 12.1875, + "learning_rate": 3.974619808673496e-06, + "loss": 1.3899462223052979, + "step": 11054 + }, + { + "epoch": 2.012378265222536, + "grad_norm": 12.5625, + "learning_rate": 3.973298609857135e-06, + "loss": 1.104241967201233, + "step": 11056 + }, + { + "epoch": 2.012742331846728, + "grad_norm": 5.65625, + "learning_rate": 3.971977708450984e-06, + "loss": 0.7514855861663818, + "step": 11058 + }, + { + "epoch": 2.0131063984709203, + "grad_norm": 10.0625, + "learning_rate": 3.970657104648892e-06, + "loss": 1.4637733697891235, + "step": 11060 + }, + { + "epoch": 2.0134704650951125, + "grad_norm": 13.625, + "learning_rate": 3.969336798644642e-06, + "loss": 1.3208070993423462, + "step": 11062 + }, + { + "epoch": 2.0138345317193047, + "grad_norm": 9.4375, + "learning_rate": 3.968016790631986e-06, + "loss": 1.2899292707443237, + "step": 11064 + }, + { + "epoch": 2.014198598343497, + "grad_norm": 17.125, + "learning_rate": 3.966697080804634e-06, + "loss": 1.3354780673980713, + "step": 11066 + }, + { + "epoch": 2.014562664967689, + "grad_norm": 5.15625, + "learning_rate": 3.965377669356242e-06, + "loss": 1.1566604375839233, + "step": 11068 + }, + { + "epoch": 2.0149267315918813, + "grad_norm": 15.375, + "learning_rate": 3.964058556480435e-06, + "loss": 1.470249891281128, + "step": 11070 + }, + { + "epoch": 2.0152907982160735, + "grad_norm": 4.125, + "learning_rate": 3.9627397423707825e-06, + "loss": 1.1017526388168335, + "step": 11072 + }, + { + "epoch": 2.0156548648402657, + "grad_norm": 6.59375, + "learning_rate": 3.961421227220814e-06, + "loss": 1.3219066858291626, + "step": 11074 + }, + { + "epoch": 2.016018931464458, + "grad_norm": 13.9375, + "learning_rate": 3.960103011224023e-06, + "loss": 1.4901299476623535, + "step": 11076 + }, + { + "epoch": 2.01638299808865, + "grad_norm": 5.4375, + "learning_rate": 3.958785094573844e-06, + "loss": 1.5103120803833008, + "step": 11078 + }, + { + "epoch": 2.0167470647128423, + "grad_norm": 12.125, + "learning_rate": 3.957467477463684e-06, + "loss": 1.3990949392318726, + "step": 11080 + }, + { + "epoch": 2.0171111313370345, + "grad_norm": 21.25, + "learning_rate": 3.956150160086892e-06, + "loss": 1.569573998451233, + "step": 11082 + }, + { + "epoch": 2.017475197961227, + "grad_norm": 4.5, + "learning_rate": 3.95483314263678e-06, + "loss": 0.9206288456916809, + "step": 11084 + }, + { + "epoch": 2.0178392645854193, + "grad_norm": 21.0, + "learning_rate": 3.953516425306618e-06, + "loss": 2.0021207332611084, + "step": 11086 + }, + { + "epoch": 2.0182033312096115, + "grad_norm": 15.875, + "learning_rate": 3.952200008289624e-06, + "loss": 1.2481396198272705, + "step": 11088 + }, + { + "epoch": 2.0185673978338037, + "grad_norm": 13.6875, + "learning_rate": 3.950883891778985e-06, + "loss": 1.3283307552337646, + "step": 11090 + }, + { + "epoch": 2.018931464457996, + "grad_norm": 30.625, + "learning_rate": 3.949568075967826e-06, + "loss": 1.3686829805374146, + "step": 11092 + }, + { + "epoch": 2.019295531082188, + "grad_norm": 30.875, + "learning_rate": 3.948252561049242e-06, + "loss": 1.3300321102142334, + "step": 11094 + }, + { + "epoch": 2.0196595977063803, + "grad_norm": 51.5, + "learning_rate": 3.946937347216283e-06, + "loss": 1.2486698627471924, + "step": 11096 + }, + { + "epoch": 2.0200236643305725, + "grad_norm": 17.0, + "learning_rate": 3.945622434661944e-06, + "loss": 1.2822213172912598, + "step": 11098 + }, + { + "epoch": 2.0203877309547646, + "grad_norm": 8.25, + "learning_rate": 3.944307823579192e-06, + "loss": 1.1264550685882568, + "step": 11100 + }, + { + "epoch": 2.020751797578957, + "grad_norm": 32.0, + "learning_rate": 3.942993514160933e-06, + "loss": 1.4714895486831665, + "step": 11102 + }, + { + "epoch": 2.021115864203149, + "grad_norm": 31.5, + "learning_rate": 3.941679506600037e-06, + "loss": 1.3934190273284912, + "step": 11104 + }, + { + "epoch": 2.0214799308273412, + "grad_norm": 17.875, + "learning_rate": 3.940365801089336e-06, + "loss": 1.7528350353240967, + "step": 11106 + }, + { + "epoch": 2.0218439974515334, + "grad_norm": 12.0625, + "learning_rate": 3.9390523978216034e-06, + "loss": 1.0156023502349854, + "step": 11108 + }, + { + "epoch": 2.022208064075726, + "grad_norm": 5.875, + "learning_rate": 3.9377392969895825e-06, + "loss": 1.0663440227508545, + "step": 11110 + }, + { + "epoch": 2.0225721306999183, + "grad_norm": 19.125, + "learning_rate": 3.9364264987859605e-06, + "loss": 1.446930170059204, + "step": 11112 + }, + { + "epoch": 2.0229361973241105, + "grad_norm": 7.34375, + "learning_rate": 3.935114003403385e-06, + "loss": 1.0982558727264404, + "step": 11114 + }, + { + "epoch": 2.0233002639483026, + "grad_norm": 12.75, + "learning_rate": 3.933801811034465e-06, + "loss": 1.4989956617355347, + "step": 11116 + }, + { + "epoch": 2.023664330572495, + "grad_norm": 15.4375, + "learning_rate": 3.932489921871752e-06, + "loss": 1.4031810760498047, + "step": 11118 + }, + { + "epoch": 2.024028397196687, + "grad_norm": 18.625, + "learning_rate": 3.931178336107768e-06, + "loss": 1.4167940616607666, + "step": 11120 + }, + { + "epoch": 2.0243924638208792, + "grad_norm": 17.875, + "learning_rate": 3.929867053934976e-06, + "loss": 1.3779305219650269, + "step": 11122 + }, + { + "epoch": 2.0247565304450714, + "grad_norm": 15.9375, + "learning_rate": 3.928556075545804e-06, + "loss": 1.3135592937469482, + "step": 11124 + }, + { + "epoch": 2.0251205970692636, + "grad_norm": 59.5, + "learning_rate": 3.927245401132635e-06, + "loss": 1.4298725128173828, + "step": 11126 + }, + { + "epoch": 2.025484663693456, + "grad_norm": 91.5, + "learning_rate": 3.925935030887802e-06, + "loss": 1.334614634513855, + "step": 11128 + }, + { + "epoch": 2.025848730317648, + "grad_norm": 3.265625, + "learning_rate": 3.9246249650035985e-06, + "loss": 0.9276155829429626, + "step": 11130 + }, + { + "epoch": 2.02621279694184, + "grad_norm": 9.8125, + "learning_rate": 3.923315203672271e-06, + "loss": 1.5382261276245117, + "step": 11132 + }, + { + "epoch": 2.0265768635660324, + "grad_norm": 18.25, + "learning_rate": 3.922005747086019e-06, + "loss": 0.7835226058959961, + "step": 11134 + }, + { + "epoch": 2.026940930190225, + "grad_norm": 5.40625, + "learning_rate": 3.9206965954370055e-06, + "loss": 0.8796530365943909, + "step": 11136 + }, + { + "epoch": 2.0273049968144172, + "grad_norm": 24.25, + "learning_rate": 3.919387748917337e-06, + "loss": 0.8988465666770935, + "step": 11138 + }, + { + "epoch": 2.0276690634386094, + "grad_norm": 22.5, + "learning_rate": 3.918079207719086e-06, + "loss": 1.7242884635925293, + "step": 11140 + }, + { + "epoch": 2.0280331300628016, + "grad_norm": 13.25, + "learning_rate": 3.916770972034274e-06, + "loss": 1.3035143613815308, + "step": 11142 + }, + { + "epoch": 2.028397196686994, + "grad_norm": 7.375, + "learning_rate": 3.915463042054878e-06, + "loss": 1.4893219470977783, + "step": 11144 + }, + { + "epoch": 2.028761263311186, + "grad_norm": 11.8125, + "learning_rate": 3.914155417972834e-06, + "loss": 1.3972722291946411, + "step": 11146 + }, + { + "epoch": 2.029125329935378, + "grad_norm": 7.40625, + "learning_rate": 3.912848099980028e-06, + "loss": 1.5186147689819336, + "step": 11148 + }, + { + "epoch": 2.0294893965595704, + "grad_norm": 12.25, + "learning_rate": 3.911541088268307e-06, + "loss": 1.3541737794876099, + "step": 11150 + }, + { + "epoch": 2.0298534631837626, + "grad_norm": 4.6875, + "learning_rate": 3.910234383029467e-06, + "loss": 1.1067891120910645, + "step": 11152 + }, + { + "epoch": 2.030217529807955, + "grad_norm": 47.75, + "learning_rate": 3.908927984455259e-06, + "loss": 1.448687195777893, + "step": 11154 + }, + { + "epoch": 2.030581596432147, + "grad_norm": 8.75, + "learning_rate": 3.907621892737399e-06, + "loss": 0.577587902545929, + "step": 11156 + }, + { + "epoch": 2.030945663056339, + "grad_norm": 7.21875, + "learning_rate": 3.906316108067543e-06, + "loss": 1.4380801916122437, + "step": 11158 + }, + { + "epoch": 2.0313097296805314, + "grad_norm": 8.125, + "learning_rate": 3.905010630637317e-06, + "loss": 1.3181122541427612, + "step": 11160 + }, + { + "epoch": 2.0316737963047236, + "grad_norm": 9.625, + "learning_rate": 3.903705460638289e-06, + "loss": 1.0836912393569946, + "step": 11162 + }, + { + "epoch": 2.032037862928916, + "grad_norm": 17.125, + "learning_rate": 3.902400598261986e-06, + "loss": 1.470362901687622, + "step": 11164 + }, + { + "epoch": 2.0324019295531084, + "grad_norm": 13.6875, + "learning_rate": 3.9010960436999e-06, + "loss": 1.3512824773788452, + "step": 11166 + }, + { + "epoch": 2.0327659961773006, + "grad_norm": 10.1875, + "learning_rate": 3.899791797143459e-06, + "loss": 1.1623103618621826, + "step": 11168 + }, + { + "epoch": 2.0331300628014928, + "grad_norm": 33.75, + "learning_rate": 3.8984878587840635e-06, + "loss": 1.8825910091400146, + "step": 11170 + }, + { + "epoch": 2.033494129425685, + "grad_norm": 3.640625, + "learning_rate": 3.8971842288130564e-06, + "loss": 1.0461876392364502, + "step": 11172 + }, + { + "epoch": 2.033858196049877, + "grad_norm": 9.3125, + "learning_rate": 3.895880907421741e-06, + "loss": 1.3835265636444092, + "step": 11174 + }, + { + "epoch": 2.0342222626740694, + "grad_norm": 2.375, + "learning_rate": 3.894577894801377e-06, + "loss": 1.050366997718811, + "step": 11176 + }, + { + "epoch": 2.0345863292982616, + "grad_norm": 10.125, + "learning_rate": 3.893275191143172e-06, + "loss": 1.6249148845672607, + "step": 11178 + }, + { + "epoch": 2.0349503959224537, + "grad_norm": 7.84375, + "learning_rate": 3.891972796638298e-06, + "loss": 1.4690312147140503, + "step": 11180 + }, + { + "epoch": 2.035314462546646, + "grad_norm": 15.0, + "learning_rate": 3.8906707114778715e-06, + "loss": 0.18220192193984985, + "step": 11182 + }, + { + "epoch": 2.035678529170838, + "grad_norm": 10.375, + "learning_rate": 3.889368935852968e-06, + "loss": 1.3766498565673828, + "step": 11184 + }, + { + "epoch": 2.0360425957950303, + "grad_norm": 5.34375, + "learning_rate": 3.888067469954624e-06, + "loss": 1.3434513807296753, + "step": 11186 + }, + { + "epoch": 2.0364066624192225, + "grad_norm": 18.75, + "learning_rate": 3.886766313973815e-06, + "loss": 1.4428796768188477, + "step": 11188 + }, + { + "epoch": 2.036770729043415, + "grad_norm": 3.359375, + "learning_rate": 3.885465468101488e-06, + "loss": 0.7841074466705322, + "step": 11190 + }, + { + "epoch": 2.0371347956676074, + "grad_norm": 8.75, + "learning_rate": 3.884164932528534e-06, + "loss": 0.930863082408905, + "step": 11192 + }, + { + "epoch": 2.0374988622917996, + "grad_norm": 9.25, + "learning_rate": 3.882864707445799e-06, + "loss": 1.3061288595199585, + "step": 11194 + }, + { + "epoch": 2.0378629289159917, + "grad_norm": 9.0, + "learning_rate": 3.881564793044092e-06, + "loss": 1.2532784938812256, + "step": 11196 + }, + { + "epoch": 2.038226995540184, + "grad_norm": 2.78125, + "learning_rate": 3.880265189514163e-06, + "loss": 1.035947561264038, + "step": 11198 + }, + { + "epoch": 2.038591062164376, + "grad_norm": 13.0625, + "learning_rate": 3.878965897046729e-06, + "loss": 1.5455666780471802, + "step": 11200 + }, + { + "epoch": 2.0389551287885683, + "grad_norm": 7.84375, + "learning_rate": 3.8776669158324535e-06, + "loss": 0.8870929479598999, + "step": 11202 + }, + { + "epoch": 2.0393191954127605, + "grad_norm": 18.625, + "learning_rate": 3.8763682460619554e-06, + "loss": 1.224616527557373, + "step": 11204 + }, + { + "epoch": 2.0396832620369527, + "grad_norm": 35.0, + "learning_rate": 3.875069887925813e-06, + "loss": 1.652757167816162, + "step": 11206 + }, + { + "epoch": 2.040047328661145, + "grad_norm": 8.0625, + "learning_rate": 3.873771841614551e-06, + "loss": 1.435332179069519, + "step": 11208 + }, + { + "epoch": 2.040411395285337, + "grad_norm": 3.359375, + "learning_rate": 3.872474107318656e-06, + "loss": 1.0747565031051636, + "step": 11210 + }, + { + "epoch": 2.0407754619095293, + "grad_norm": 63.5, + "learning_rate": 3.871176685228564e-06, + "loss": 1.3681832551956177, + "step": 11212 + }, + { + "epoch": 2.0411395285337215, + "grad_norm": 27.75, + "learning_rate": 3.8698795755346626e-06, + "loss": 1.3852930068969727, + "step": 11214 + }, + { + "epoch": 2.0415035951579137, + "grad_norm": 8.8125, + "learning_rate": 3.868582778427306e-06, + "loss": 1.4589409828186035, + "step": 11216 + }, + { + "epoch": 2.0418676617821063, + "grad_norm": 6.5, + "learning_rate": 3.867286294096784e-06, + "loss": 1.1996346712112427, + "step": 11218 + }, + { + "epoch": 2.0422317284062985, + "grad_norm": 10.5625, + "learning_rate": 3.865990122733359e-06, + "loss": 1.3251181840896606, + "step": 11220 + }, + { + "epoch": 2.0425957950304907, + "grad_norm": 9.8125, + "learning_rate": 3.864694264527234e-06, + "loss": 1.1638710498809814, + "step": 11222 + }, + { + "epoch": 2.042959861654683, + "grad_norm": 31.625, + "learning_rate": 3.863398719668569e-06, + "loss": 1.9275388717651367, + "step": 11224 + }, + { + "epoch": 2.043323928278875, + "grad_norm": 2.875, + "learning_rate": 3.862103488347488e-06, + "loss": 0.816726803779602, + "step": 11226 + }, + { + "epoch": 2.0436879949030673, + "grad_norm": 10.8125, + "learning_rate": 3.860808570754052e-06, + "loss": 1.4633066654205322, + "step": 11228 + }, + { + "epoch": 2.0440520615272595, + "grad_norm": 8.4375, + "learning_rate": 3.8595139670782925e-06, + "loss": 0.8681273460388184, + "step": 11230 + }, + { + "epoch": 2.0444161281514517, + "grad_norm": 25.0, + "learning_rate": 3.858219677510181e-06, + "loss": 1.2102720737457275, + "step": 11232 + }, + { + "epoch": 2.044780194775644, + "grad_norm": 13.1875, + "learning_rate": 3.856925702239651e-06, + "loss": 1.6891427040100098, + "step": 11234 + }, + { + "epoch": 2.045144261399836, + "grad_norm": 13.625, + "learning_rate": 3.855632041456592e-06, + "loss": 1.4816524982452393, + "step": 11236 + }, + { + "epoch": 2.0455083280240283, + "grad_norm": 4.625, + "learning_rate": 3.854338695350838e-06, + "loss": 1.0426243543624878, + "step": 11238 + }, + { + "epoch": 2.0458723946482205, + "grad_norm": 3.84375, + "learning_rate": 3.853045664112187e-06, + "loss": 1.1335670948028564, + "step": 11240 + }, + { + "epoch": 2.0462364612724127, + "grad_norm": 20.125, + "learning_rate": 3.8517529479303825e-06, + "loss": 2.009348154067993, + "step": 11242 + }, + { + "epoch": 2.0466005278966053, + "grad_norm": 39.25, + "learning_rate": 3.850460546995126e-06, + "loss": 1.5380167961120605, + "step": 11244 + }, + { + "epoch": 2.0469645945207975, + "grad_norm": 19.0, + "learning_rate": 3.849168461496076e-06, + "loss": 0.7257711291313171, + "step": 11246 + }, + { + "epoch": 2.0473286611449897, + "grad_norm": 16.125, + "learning_rate": 3.847876691622835e-06, + "loss": 1.3549449443817139, + "step": 11248 + }, + { + "epoch": 2.047692727769182, + "grad_norm": 10.4375, + "learning_rate": 3.8465852375649696e-06, + "loss": 2.0038070678710938, + "step": 11250 + }, + { + "epoch": 2.048056794393374, + "grad_norm": 7.84375, + "learning_rate": 3.845294099511993e-06, + "loss": 1.1168270111083984, + "step": 11252 + }, + { + "epoch": 2.0484208610175663, + "grad_norm": 14.75, + "learning_rate": 3.844003277653372e-06, + "loss": 0.11435945332050323, + "step": 11254 + }, + { + "epoch": 2.0487849276417585, + "grad_norm": 13.125, + "learning_rate": 3.842712772178537e-06, + "loss": 1.4028353691101074, + "step": 11256 + }, + { + "epoch": 2.0491489942659507, + "grad_norm": 15.5, + "learning_rate": 3.8414225832768574e-06, + "loss": 0.9009632468223572, + "step": 11258 + }, + { + "epoch": 2.049513060890143, + "grad_norm": 9.3125, + "learning_rate": 3.840132711137667e-06, + "loss": 1.3315761089324951, + "step": 11260 + }, + { + "epoch": 2.049877127514335, + "grad_norm": 5.1875, + "learning_rate": 3.838843155950248e-06, + "loss": 0.6985101103782654, + "step": 11262 + }, + { + "epoch": 2.0502411941385272, + "grad_norm": 2.203125, + "learning_rate": 3.837553917903835e-06, + "loss": 0.7500526309013367, + "step": 11264 + }, + { + "epoch": 2.0506052607627194, + "grad_norm": 24.625, + "learning_rate": 3.8362649971876246e-06, + "loss": 1.187584638595581, + "step": 11266 + }, + { + "epoch": 2.0509693273869116, + "grad_norm": 17.375, + "learning_rate": 3.834976393990753e-06, + "loss": 0.5070589780807495, + "step": 11268 + }, + { + "epoch": 2.051333394011104, + "grad_norm": 30.0, + "learning_rate": 3.833688108502326e-06, + "loss": 1.5097854137420654, + "step": 11270 + }, + { + "epoch": 2.0516974606352965, + "grad_norm": 12.5625, + "learning_rate": 3.832400140911388e-06, + "loss": 1.521182894706726, + "step": 11272 + }, + { + "epoch": 2.0520615272594886, + "grad_norm": 22.75, + "learning_rate": 3.831112491406943e-06, + "loss": 1.826069951057434, + "step": 11274 + }, + { + "epoch": 2.052425593883681, + "grad_norm": 2.46875, + "learning_rate": 3.829825160177952e-06, + "loss": 1.0081204175949097, + "step": 11276 + }, + { + "epoch": 2.052789660507873, + "grad_norm": 80.0, + "learning_rate": 3.828538147413322e-06, + "loss": 2.2084999084472656, + "step": 11278 + }, + { + "epoch": 2.0531537271320652, + "grad_norm": 27.0, + "learning_rate": 3.827251453301922e-06, + "loss": 1.9426029920578003, + "step": 11280 + }, + { + "epoch": 2.0535177937562574, + "grad_norm": 17.875, + "learning_rate": 3.825965078032563e-06, + "loss": 1.8204866647720337, + "step": 11282 + }, + { + "epoch": 2.0538818603804496, + "grad_norm": 9.8125, + "learning_rate": 3.824679021794018e-06, + "loss": 1.1048431396484375, + "step": 11284 + }, + { + "epoch": 2.054245927004642, + "grad_norm": 11.0, + "learning_rate": 3.8233932847750134e-06, + "loss": 1.450287103652954, + "step": 11286 + }, + { + "epoch": 2.054609993628834, + "grad_norm": 15.6875, + "learning_rate": 3.82210786716422e-06, + "loss": 1.7937439680099487, + "step": 11288 + }, + { + "epoch": 2.054974060253026, + "grad_norm": 3.515625, + "learning_rate": 3.8208227691502745e-06, + "loss": 1.0372117757797241, + "step": 11290 + }, + { + "epoch": 2.0553381268772184, + "grad_norm": 6.09375, + "learning_rate": 3.8195379909217535e-06, + "loss": 1.0177583694458008, + "step": 11292 + }, + { + "epoch": 2.0557021935014106, + "grad_norm": 11.8125, + "learning_rate": 3.818253532667195e-06, + "loss": 1.0975708961486816, + "step": 11294 + }, + { + "epoch": 2.056066260125603, + "grad_norm": 14.9375, + "learning_rate": 3.816969394575092e-06, + "loss": 1.6444745063781738, + "step": 11296 + }, + { + "epoch": 2.0564303267497954, + "grad_norm": 9.1875, + "learning_rate": 3.81568557683388e-06, + "loss": 1.3613479137420654, + "step": 11298 + }, + { + "epoch": 2.0567943933739876, + "grad_norm": 34.0, + "learning_rate": 3.81440207963196e-06, + "loss": 0.6512670516967773, + "step": 11300 + }, + { + "epoch": 2.05715845999818, + "grad_norm": 18.625, + "learning_rate": 3.8131189031576765e-06, + "loss": 1.3207588195800781, + "step": 11302 + }, + { + "epoch": 2.057522526622372, + "grad_norm": 31.125, + "learning_rate": 3.8118360475993293e-06, + "loss": 1.1373512744903564, + "step": 11304 + }, + { + "epoch": 2.057886593246564, + "grad_norm": 27.5, + "learning_rate": 3.8105535131451774e-06, + "loss": 2.149296283721924, + "step": 11306 + }, + { + "epoch": 2.0582506598707564, + "grad_norm": 15.6875, + "learning_rate": 3.809271299983421e-06, + "loss": 1.4325075149536133, + "step": 11308 + }, + { + "epoch": 2.0586147264949486, + "grad_norm": 9.9375, + "learning_rate": 3.807989408302227e-06, + "loss": 0.20745335519313812, + "step": 11310 + }, + { + "epoch": 2.058978793119141, + "grad_norm": 16.875, + "learning_rate": 3.806707838289702e-06, + "loss": 1.7461762428283691, + "step": 11312 + }, + { + "epoch": 2.059342859743333, + "grad_norm": 16.375, + "learning_rate": 3.8054265901339114e-06, + "loss": 1.3343355655670166, + "step": 11314 + }, + { + "epoch": 2.059706926367525, + "grad_norm": 19.0, + "learning_rate": 3.804145664022878e-06, + "loss": 1.6899375915527344, + "step": 11316 + }, + { + "epoch": 2.0600709929917174, + "grad_norm": 16.5, + "learning_rate": 3.8028650601445664e-06, + "loss": 1.9408360719680786, + "step": 11318 + }, + { + "epoch": 2.0604350596159096, + "grad_norm": 14.1875, + "learning_rate": 3.8015847786869067e-06, + "loss": 1.6124509572982788, + "step": 11320 + }, + { + "epoch": 2.0607991262401018, + "grad_norm": 9.9375, + "learning_rate": 3.8003048198377686e-06, + "loss": 1.2918286323547363, + "step": 11322 + }, + { + "epoch": 2.061163192864294, + "grad_norm": 18.125, + "learning_rate": 3.799025183784983e-06, + "loss": 1.3394672870635986, + "step": 11324 + }, + { + "epoch": 2.0615272594884866, + "grad_norm": 15.75, + "learning_rate": 3.797745870716334e-06, + "loss": 1.372384786605835, + "step": 11326 + }, + { + "epoch": 2.061891326112679, + "grad_norm": 9.3125, + "learning_rate": 3.7964668808195515e-06, + "loss": 1.3626676797866821, + "step": 11328 + }, + { + "epoch": 2.062255392736871, + "grad_norm": 7.125, + "learning_rate": 3.795188214282326e-06, + "loss": 1.2860833406448364, + "step": 11330 + }, + { + "epoch": 2.062619459361063, + "grad_norm": 11.0625, + "learning_rate": 3.793909871292294e-06, + "loss": 1.3646643161773682, + "step": 11332 + }, + { + "epoch": 2.0629835259852554, + "grad_norm": 16.5, + "learning_rate": 3.7926318520370465e-06, + "loss": 1.3925278186798096, + "step": 11334 + }, + { + "epoch": 2.0633475926094476, + "grad_norm": 20.625, + "learning_rate": 3.7913541567041317e-06, + "loss": 1.871671438217163, + "step": 11336 + }, + { + "epoch": 2.0637116592336398, + "grad_norm": 12.5625, + "learning_rate": 3.7900767854810405e-06, + "loss": 1.4626822471618652, + "step": 11338 + }, + { + "epoch": 2.064075725857832, + "grad_norm": 9.1875, + "learning_rate": 3.788799738555228e-06, + "loss": 1.534839391708374, + "step": 11340 + }, + { + "epoch": 2.064439792482024, + "grad_norm": 24.875, + "learning_rate": 3.7875230161140896e-06, + "loss": 1.0633103847503662, + "step": 11342 + }, + { + "epoch": 2.0648038591062163, + "grad_norm": 7.5625, + "learning_rate": 3.786246618344984e-06, + "loss": 1.4124540090560913, + "step": 11344 + }, + { + "epoch": 2.0651679257304085, + "grad_norm": 13.75, + "learning_rate": 3.784970545435217e-06, + "loss": 1.5882494449615479, + "step": 11346 + }, + { + "epoch": 2.0655319923546007, + "grad_norm": 10.9375, + "learning_rate": 3.783694797572043e-06, + "loss": 1.3247013092041016, + "step": 11348 + }, + { + "epoch": 2.065896058978793, + "grad_norm": 20.25, + "learning_rate": 3.7824193749426784e-06, + "loss": 1.5170966386795044, + "step": 11350 + }, + { + "epoch": 2.0662601256029856, + "grad_norm": 10.1875, + "learning_rate": 3.78114427773428e-06, + "loss": 1.1787314414978027, + "step": 11352 + }, + { + "epoch": 2.0666241922271777, + "grad_norm": 7.71875, + "learning_rate": 3.7798695061339686e-06, + "loss": 1.1644823551177979, + "step": 11354 + }, + { + "epoch": 2.06698825885137, + "grad_norm": 3.453125, + "learning_rate": 3.7785950603288114e-06, + "loss": 1.359602689743042, + "step": 11356 + }, + { + "epoch": 2.067352325475562, + "grad_norm": 4.96875, + "learning_rate": 3.7773209405058226e-06, + "loss": 0.8447359800338745, + "step": 11358 + }, + { + "epoch": 2.0677163920997543, + "grad_norm": 15.4375, + "learning_rate": 3.7760471468519822e-06, + "loss": 1.4743925333023071, + "step": 11360 + }, + { + "epoch": 2.0680804587239465, + "grad_norm": 26.375, + "learning_rate": 3.7747736795542062e-06, + "loss": 1.3202470541000366, + "step": 11362 + }, + { + "epoch": 2.0684445253481387, + "grad_norm": 8.9375, + "learning_rate": 3.7735005387993768e-06, + "loss": 1.404314398765564, + "step": 11364 + }, + { + "epoch": 2.068808591972331, + "grad_norm": 10.625, + "learning_rate": 3.772227724774321e-06, + "loss": 1.4834040403366089, + "step": 11366 + }, + { + "epoch": 2.069172658596523, + "grad_norm": 15.125, + "learning_rate": 3.770955237665816e-06, + "loss": 1.7776987552642822, + "step": 11368 + }, + { + "epoch": 2.0695367252207153, + "grad_norm": 11.25, + "learning_rate": 3.7696830776605985e-06, + "loss": 1.4459638595581055, + "step": 11370 + }, + { + "epoch": 2.0699007918449075, + "grad_norm": 20.5, + "learning_rate": 3.7684112449453484e-06, + "loss": 1.6306496858596802, + "step": 11372 + }, + { + "epoch": 2.0702648584690997, + "grad_norm": 5.34375, + "learning_rate": 3.7671397397067056e-06, + "loss": 1.1137031316757202, + "step": 11374 + }, + { + "epoch": 2.070628925093292, + "grad_norm": 41.5, + "learning_rate": 3.7658685621312587e-06, + "loss": 0.5532140731811523, + "step": 11376 + }, + { + "epoch": 2.0709929917174845, + "grad_norm": 9.4375, + "learning_rate": 3.764597712405542e-06, + "loss": 1.3796281814575195, + "step": 11378 + }, + { + "epoch": 2.0713570583416767, + "grad_norm": 8.625, + "learning_rate": 3.7633271907160556e-06, + "loss": 1.3515645265579224, + "step": 11380 + }, + { + "epoch": 2.071721124965869, + "grad_norm": 13.125, + "learning_rate": 3.762056997249237e-06, + "loss": 1.1346735954284668, + "step": 11382 + }, + { + "epoch": 2.072085191590061, + "grad_norm": 12.5, + "learning_rate": 3.7607871321914853e-06, + "loss": 1.5071709156036377, + "step": 11384 + }, + { + "epoch": 2.0724492582142533, + "grad_norm": 20.375, + "learning_rate": 3.7595175957291486e-06, + "loss": 1.382782220840454, + "step": 11386 + }, + { + "epoch": 2.0728133248384455, + "grad_norm": 4.0625, + "learning_rate": 3.758248388048522e-06, + "loss": 0.8292750716209412, + "step": 11388 + }, + { + "epoch": 2.0731773914626377, + "grad_norm": 29.875, + "learning_rate": 3.756979509335863e-06, + "loss": 1.8969202041625977, + "step": 11390 + }, + { + "epoch": 2.07354145808683, + "grad_norm": 11.1875, + "learning_rate": 3.7557109597773674e-06, + "loss": 1.4979089498519897, + "step": 11392 + }, + { + "epoch": 2.073905524711022, + "grad_norm": 19.375, + "learning_rate": 3.7544427395591943e-06, + "loss": 1.3307009935379028, + "step": 11394 + }, + { + "epoch": 2.0742695913352143, + "grad_norm": 9.25, + "learning_rate": 3.753174848867451e-06, + "loss": 1.5596909523010254, + "step": 11396 + }, + { + "epoch": 2.0746336579594065, + "grad_norm": 17.0, + "learning_rate": 3.751907287888189e-06, + "loss": 1.345757007598877, + "step": 11398 + }, + { + "epoch": 2.0749977245835987, + "grad_norm": 13.5, + "learning_rate": 3.750640056807426e-06, + "loss": 1.5483698844909668, + "step": 11400 + }, + { + "epoch": 2.075361791207791, + "grad_norm": 7.40625, + "learning_rate": 3.7493731558111147e-06, + "loss": 1.0907464027404785, + "step": 11402 + }, + { + "epoch": 2.075725857831983, + "grad_norm": 33.25, + "learning_rate": 3.748106585085174e-06, + "loss": 1.285001277923584, + "step": 11404 + }, + { + "epoch": 2.0760899244561757, + "grad_norm": 30.5, + "learning_rate": 3.7468403448154667e-06, + "loss": 2.0893640518188477, + "step": 11406 + }, + { + "epoch": 2.076453991080368, + "grad_norm": 11.875, + "learning_rate": 3.745574435187805e-06, + "loss": 1.3165132999420166, + "step": 11408 + }, + { + "epoch": 2.07681805770456, + "grad_norm": 7.28125, + "learning_rate": 3.7443088563879617e-06, + "loss": 1.2641518115997314, + "step": 11410 + }, + { + "epoch": 2.0771821243287523, + "grad_norm": 17.75, + "learning_rate": 3.7430436086016486e-06, + "loss": 1.3984546661376953, + "step": 11412 + }, + { + "epoch": 2.0775461909529445, + "grad_norm": 10.9375, + "learning_rate": 3.7417786920145415e-06, + "loss": 1.1035141944885254, + "step": 11414 + }, + { + "epoch": 2.0779102575771367, + "grad_norm": 12.0, + "learning_rate": 3.7405141068122607e-06, + "loss": 1.3482505083084106, + "step": 11416 + }, + { + "epoch": 2.078274324201329, + "grad_norm": 22.875, + "learning_rate": 3.7392498531803744e-06, + "loss": 1.8221603631973267, + "step": 11418 + }, + { + "epoch": 2.078638390825521, + "grad_norm": 10.9375, + "learning_rate": 3.737985931304414e-06, + "loss": 1.325646162033081, + "step": 11420 + }, + { + "epoch": 2.0790024574497132, + "grad_norm": 98.0, + "learning_rate": 3.7367223413698474e-06, + "loss": 1.4047844409942627, + "step": 11422 + }, + { + "epoch": 2.0793665240739054, + "grad_norm": 80.0, + "learning_rate": 3.7354590835621067e-06, + "loss": 1.4732098579406738, + "step": 11424 + }, + { + "epoch": 2.0797305906980976, + "grad_norm": 9.6875, + "learning_rate": 3.7341961580665696e-06, + "loss": 1.3628209829330444, + "step": 11426 + }, + { + "epoch": 2.08009465732229, + "grad_norm": 11.5, + "learning_rate": 3.732933565068561e-06, + "loss": 1.5524787902832031, + "step": 11428 + }, + { + "epoch": 2.080458723946482, + "grad_norm": 4.875, + "learning_rate": 3.7316713047533666e-06, + "loss": 1.2368056774139404, + "step": 11430 + }, + { + "epoch": 2.080822790570674, + "grad_norm": 16.5, + "learning_rate": 3.730409377306213e-06, + "loss": 2.109367847442627, + "step": 11432 + }, + { + "epoch": 2.081186857194867, + "grad_norm": 9.1875, + "learning_rate": 3.729147782912287e-06, + "loss": 1.4427129030227661, + "step": 11434 + }, + { + "epoch": 2.081550923819059, + "grad_norm": 3.015625, + "learning_rate": 3.727886521756722e-06, + "loss": 0.949647843837738, + "step": 11436 + }, + { + "epoch": 2.0819149904432512, + "grad_norm": 16.375, + "learning_rate": 3.7266255940245986e-06, + "loss": 0.9821884036064148, + "step": 11438 + }, + { + "epoch": 2.0822790570674434, + "grad_norm": 9.4375, + "learning_rate": 3.72536499990096e-06, + "loss": 1.3183737993240356, + "step": 11440 + }, + { + "epoch": 2.0826431236916356, + "grad_norm": 87.0, + "learning_rate": 3.724104739570786e-06, + "loss": 1.8024661540985107, + "step": 11442 + }, + { + "epoch": 2.083007190315828, + "grad_norm": 21.125, + "learning_rate": 3.7228448132190186e-06, + "loss": 1.5356311798095703, + "step": 11444 + }, + { + "epoch": 2.08337125694002, + "grad_norm": 10.25, + "learning_rate": 3.721585221030549e-06, + "loss": 0.6040663123130798, + "step": 11446 + }, + { + "epoch": 2.083735323564212, + "grad_norm": 11.75, + "learning_rate": 3.720325963190211e-06, + "loss": 1.276921272277832, + "step": 11448 + }, + { + "epoch": 2.0840993901884044, + "grad_norm": 22.5, + "learning_rate": 3.719067039882803e-06, + "loss": 2.054856777191162, + "step": 11450 + }, + { + "epoch": 2.0844634568125966, + "grad_norm": 9.125, + "learning_rate": 3.717808451293059e-06, + "loss": 1.3334741592407227, + "step": 11452 + }, + { + "epoch": 2.084827523436789, + "grad_norm": 20.0, + "learning_rate": 3.7165501976056783e-06, + "loss": 1.9373116493225098, + "step": 11454 + }, + { + "epoch": 2.085191590060981, + "grad_norm": 10.25, + "learning_rate": 3.715292279005303e-06, + "loss": 1.428931713104248, + "step": 11456 + }, + { + "epoch": 2.085555656685173, + "grad_norm": 13.375, + "learning_rate": 3.714034695676524e-06, + "loss": 1.243409276008606, + "step": 11458 + }, + { + "epoch": 2.085919723309366, + "grad_norm": 14.75, + "learning_rate": 3.7127774478038917e-06, + "loss": 0.7346692681312561, + "step": 11460 + }, + { + "epoch": 2.086283789933558, + "grad_norm": 10.375, + "learning_rate": 3.7115205355718976e-06, + "loss": 1.1530519723892212, + "step": 11462 + }, + { + "epoch": 2.08664785655775, + "grad_norm": 10.4375, + "learning_rate": 3.7102639591649907e-06, + "loss": 0.7723678350448608, + "step": 11464 + }, + { + "epoch": 2.0870119231819424, + "grad_norm": 9.5, + "learning_rate": 3.709007718767571e-06, + "loss": 1.0693626403808594, + "step": 11466 + }, + { + "epoch": 2.0873759898061346, + "grad_norm": 16.375, + "learning_rate": 3.70775181456398e-06, + "loss": 1.3240866661071777, + "step": 11468 + }, + { + "epoch": 2.087740056430327, + "grad_norm": 10.625, + "learning_rate": 3.706496246738525e-06, + "loss": 1.3641058206558228, + "step": 11470 + }, + { + "epoch": 2.088104123054519, + "grad_norm": 110.5, + "learning_rate": 3.7052410154754463e-06, + "loss": 1.300267219543457, + "step": 11472 + }, + { + "epoch": 2.088468189678711, + "grad_norm": 27.25, + "learning_rate": 3.703986120958951e-06, + "loss": 1.1412022113800049, + "step": 11474 + }, + { + "epoch": 2.0888322563029034, + "grad_norm": 12.9375, + "learning_rate": 3.702731563373189e-06, + "loss": 1.4686508178710938, + "step": 11476 + }, + { + "epoch": 2.0891963229270956, + "grad_norm": 8.75, + "learning_rate": 3.7014773429022576e-06, + "loss": 1.3583439588546753, + "step": 11478 + }, + { + "epoch": 2.0895603895512878, + "grad_norm": 18.5, + "learning_rate": 3.7002234597302134e-06, + "loss": 2.0067975521087646, + "step": 11480 + }, + { + "epoch": 2.08992445617548, + "grad_norm": 199.0, + "learning_rate": 3.6989699140410527e-06, + "loss": 1.7385756969451904, + "step": 11482 + }, + { + "epoch": 2.090288522799672, + "grad_norm": 14.25, + "learning_rate": 3.6977167060187347e-06, + "loss": 1.9352350234985352, + "step": 11484 + }, + { + "epoch": 2.090652589423865, + "grad_norm": 20.875, + "learning_rate": 3.6964638358471603e-06, + "loss": 0.7607420086860657, + "step": 11486 + }, + { + "epoch": 2.091016656048057, + "grad_norm": 5.78125, + "learning_rate": 3.69521130371018e-06, + "loss": 1.0673882961273193, + "step": 11488 + }, + { + "epoch": 2.091380722672249, + "grad_norm": 15.75, + "learning_rate": 3.6939591097916035e-06, + "loss": 1.2340679168701172, + "step": 11490 + }, + { + "epoch": 2.0917447892964414, + "grad_norm": 3.9375, + "learning_rate": 3.6927072542751786e-06, + "loss": 0.9380825757980347, + "step": 11492 + }, + { + "epoch": 2.0921088559206336, + "grad_norm": 15.875, + "learning_rate": 3.6914557373446158e-06, + "loss": 1.5175800323486328, + "step": 11494 + }, + { + "epoch": 2.0924729225448258, + "grad_norm": 127.5, + "learning_rate": 3.6902045591835685e-06, + "loss": 0.7016688585281372, + "step": 11496 + }, + { + "epoch": 2.092836989169018, + "grad_norm": 18.125, + "learning_rate": 3.6889537199756385e-06, + "loss": 1.7808164358139038, + "step": 11498 + }, + { + "epoch": 2.09320105579321, + "grad_norm": 13.375, + "learning_rate": 3.687703219904388e-06, + "loss": 1.443461298942566, + "step": 11500 + }, + { + "epoch": 2.0935651224174023, + "grad_norm": 23.0, + "learning_rate": 3.686453059153315e-06, + "loss": 1.4294456243515015, + "step": 11502 + }, + { + "epoch": 2.0939291890415945, + "grad_norm": 15.8125, + "learning_rate": 3.6852032379058815e-06, + "loss": 0.9319020509719849, + "step": 11504 + }, + { + "epoch": 2.0942932556657867, + "grad_norm": 53.0, + "learning_rate": 3.683953756345493e-06, + "loss": 1.5301628112792969, + "step": 11506 + }, + { + "epoch": 2.094657322289979, + "grad_norm": 25.375, + "learning_rate": 3.682704614655502e-06, + "loss": 1.4465856552124023, + "step": 11508 + }, + { + "epoch": 2.095021388914171, + "grad_norm": 212.0, + "learning_rate": 3.6814558130192212e-06, + "loss": 1.5864955186843872, + "step": 11510 + }, + { + "epoch": 2.0953854555383633, + "grad_norm": 18.75, + "learning_rate": 3.6802073516199e-06, + "loss": 2.0906662940979004, + "step": 11512 + }, + { + "epoch": 2.095749522162556, + "grad_norm": 25.375, + "learning_rate": 3.67895923064075e-06, + "loss": 1.0467870235443115, + "step": 11514 + }, + { + "epoch": 2.096113588786748, + "grad_norm": 9.0, + "learning_rate": 3.677711450264928e-06, + "loss": 1.5268776416778564, + "step": 11516 + }, + { + "epoch": 2.0964776554109403, + "grad_norm": 12.625, + "learning_rate": 3.6764640106755363e-06, + "loss": 1.3490040302276611, + "step": 11518 + }, + { + "epoch": 2.0968417220351325, + "grad_norm": 6.3125, + "learning_rate": 3.675216912055638e-06, + "loss": 1.0428179502487183, + "step": 11520 + }, + { + "epoch": 2.0972057886593247, + "grad_norm": 4.46875, + "learning_rate": 3.6739701545882333e-06, + "loss": 1.1493134498596191, + "step": 11522 + }, + { + "epoch": 2.097569855283517, + "grad_norm": 29.25, + "learning_rate": 3.672723738456283e-06, + "loss": 1.2034779787063599, + "step": 11524 + }, + { + "epoch": 2.097933921907709, + "grad_norm": 7.46875, + "learning_rate": 3.6714776638426935e-06, + "loss": 1.3758158683776855, + "step": 11526 + }, + { + "epoch": 2.0982979885319013, + "grad_norm": 7.5, + "learning_rate": 3.670231930930318e-06, + "loss": 1.081048846244812, + "step": 11528 + }, + { + "epoch": 2.0986620551560935, + "grad_norm": 3.390625, + "learning_rate": 3.6689865399019676e-06, + "loss": 1.2714335918426514, + "step": 11530 + }, + { + "epoch": 2.0990261217802857, + "grad_norm": 11.0, + "learning_rate": 3.667741490940393e-06, + "loss": 1.5025876760482788, + "step": 11532 + }, + { + "epoch": 2.099390188404478, + "grad_norm": 3.53125, + "learning_rate": 3.666496784228304e-06, + "loss": 1.409659504890442, + "step": 11534 + }, + { + "epoch": 2.09975425502867, + "grad_norm": 2.8125, + "learning_rate": 3.6652524199483565e-06, + "loss": 1.1324552297592163, + "step": 11536 + }, + { + "epoch": 2.1001183216528623, + "grad_norm": 19.875, + "learning_rate": 3.6640083982831514e-06, + "loss": 1.2281301021575928, + "step": 11538 + }, + { + "epoch": 2.100482388277055, + "grad_norm": 63.25, + "learning_rate": 3.6627647194152498e-06, + "loss": 0.4560912847518921, + "step": 11540 + }, + { + "epoch": 2.100846454901247, + "grad_norm": 16.875, + "learning_rate": 3.661521383527151e-06, + "loss": 1.259648323059082, + "step": 11542 + }, + { + "epoch": 2.1012105215254393, + "grad_norm": 11.875, + "learning_rate": 3.6602783908013136e-06, + "loss": 1.769161581993103, + "step": 11544 + }, + { + "epoch": 2.1015745881496315, + "grad_norm": 10.3125, + "learning_rate": 3.6590357414201415e-06, + "loss": 1.790071964263916, + "step": 11546 + }, + { + "epoch": 2.1019386547738237, + "grad_norm": 12.75, + "learning_rate": 3.6577934355659844e-06, + "loss": 1.2681019306182861, + "step": 11548 + }, + { + "epoch": 2.102302721398016, + "grad_norm": 25.5, + "learning_rate": 3.6565514734211515e-06, + "loss": 1.7064169645309448, + "step": 11550 + }, + { + "epoch": 2.102666788022208, + "grad_norm": 28.5, + "learning_rate": 3.6553098551678902e-06, + "loss": 0.9402908682823181, + "step": 11552 + }, + { + "epoch": 2.1030308546464003, + "grad_norm": 11.75, + "learning_rate": 3.654068580988406e-06, + "loss": 1.2075145244598389, + "step": 11554 + }, + { + "epoch": 2.1033949212705925, + "grad_norm": 12.625, + "learning_rate": 3.6528276510648527e-06, + "loss": 1.2668508291244507, + "step": 11556 + }, + { + "epoch": 2.1037589878947847, + "grad_norm": 14.6875, + "learning_rate": 3.6515870655793255e-06, + "loss": 0.8139492869377136, + "step": 11558 + }, + { + "epoch": 2.104123054518977, + "grad_norm": 15.5, + "learning_rate": 3.650346824713883e-06, + "loss": 1.3122189044952393, + "step": 11560 + }, + { + "epoch": 2.104487121143169, + "grad_norm": 10.3125, + "learning_rate": 3.649106928650518e-06, + "loss": 1.3857519626617432, + "step": 11562 + }, + { + "epoch": 2.1048511877673612, + "grad_norm": 14.5625, + "learning_rate": 3.647867377571186e-06, + "loss": 1.3880122900009155, + "step": 11564 + }, + { + "epoch": 2.1052152543915534, + "grad_norm": 11.0625, + "learning_rate": 3.6466281716577844e-06, + "loss": 1.6191080808639526, + "step": 11566 + }, + { + "epoch": 2.105579321015746, + "grad_norm": 26.125, + "learning_rate": 3.6453893110921595e-06, + "loss": 2.107329845428467, + "step": 11568 + }, + { + "epoch": 2.1059433876399383, + "grad_norm": 12.6875, + "learning_rate": 3.6441507960561134e-06, + "loss": 1.3498234748840332, + "step": 11570 + }, + { + "epoch": 2.1063074542641305, + "grad_norm": 10.0625, + "learning_rate": 3.6429126267313873e-06, + "loss": 1.5198556184768677, + "step": 11572 + }, + { + "epoch": 2.1066715208883227, + "grad_norm": 21.125, + "learning_rate": 3.6416748032996824e-06, + "loss": 1.3618147373199463, + "step": 11574 + }, + { + "epoch": 2.107035587512515, + "grad_norm": 27.375, + "learning_rate": 3.640437325942644e-06, + "loss": 1.2114366292953491, + "step": 11576 + }, + { + "epoch": 2.107399654136707, + "grad_norm": 38.75, + "learning_rate": 3.639200194841863e-06, + "loss": 1.0727523565292358, + "step": 11578 + }, + { + "epoch": 2.1077637207608992, + "grad_norm": 9.3125, + "learning_rate": 3.6379634101788885e-06, + "loss": 1.446696162223816, + "step": 11580 + }, + { + "epoch": 2.1081277873850914, + "grad_norm": 9.4375, + "learning_rate": 3.6367269721352083e-06, + "loss": 1.2902332544326782, + "step": 11582 + }, + { + "epoch": 2.1084918540092836, + "grad_norm": 15.125, + "learning_rate": 3.635490880892269e-06, + "loss": 1.6324158906936646, + "step": 11584 + }, + { + "epoch": 2.108855920633476, + "grad_norm": 24.75, + "learning_rate": 3.6342551366314618e-06, + "loss": 1.7397615909576416, + "step": 11586 + }, + { + "epoch": 2.109219987257668, + "grad_norm": 14.4375, + "learning_rate": 3.633019739534123e-06, + "loss": 1.2016204595565796, + "step": 11588 + }, + { + "epoch": 2.10958405388186, + "grad_norm": 7.15625, + "learning_rate": 3.6317846897815467e-06, + "loss": 1.2775236368179321, + "step": 11590 + }, + { + "epoch": 2.1099481205060524, + "grad_norm": 9.9375, + "learning_rate": 3.6305499875549675e-06, + "loss": 1.326016902923584, + "step": 11592 + }, + { + "epoch": 2.110312187130245, + "grad_norm": 9.4375, + "learning_rate": 3.629315633035577e-06, + "loss": 1.3310823440551758, + "step": 11594 + }, + { + "epoch": 2.1106762537544372, + "grad_norm": 11.1875, + "learning_rate": 3.628081626404511e-06, + "loss": 1.4421488046646118, + "step": 11596 + }, + { + "epoch": 2.1110403203786294, + "grad_norm": 8.625, + "learning_rate": 3.6268479678428505e-06, + "loss": 1.3825407028198242, + "step": 11598 + }, + { + "epoch": 2.1114043870028216, + "grad_norm": 7.5625, + "learning_rate": 3.6256146575316366e-06, + "loss": 1.1559851169586182, + "step": 11600 + }, + { + "epoch": 2.111768453627014, + "grad_norm": 7.3125, + "learning_rate": 3.624381695651846e-06, + "loss": 1.1735668182373047, + "step": 11602 + }, + { + "epoch": 2.112132520251206, + "grad_norm": 11.375, + "learning_rate": 3.623149082384415e-06, + "loss": 1.358276605606079, + "step": 11604 + }, + { + "epoch": 2.112496586875398, + "grad_norm": 6.34375, + "learning_rate": 3.6219168179102253e-06, + "loss": 1.1880605220794678, + "step": 11606 + }, + { + "epoch": 2.1128606534995904, + "grad_norm": 14.625, + "learning_rate": 3.6206849024101033e-06, + "loss": 1.2851117849349976, + "step": 11608 + }, + { + "epoch": 2.1132247201237826, + "grad_norm": 20.5, + "learning_rate": 3.619453336064831e-06, + "loss": 1.5145275592803955, + "step": 11610 + }, + { + "epoch": 2.113588786747975, + "grad_norm": 19.75, + "learning_rate": 3.6182221190551315e-06, + "loss": 1.297667384147644, + "step": 11612 + }, + { + "epoch": 2.113952853372167, + "grad_norm": 16.25, + "learning_rate": 3.616991251561685e-06, + "loss": 1.5837275981903076, + "step": 11614 + }, + { + "epoch": 2.114316919996359, + "grad_norm": 30.875, + "learning_rate": 3.6157607337651163e-06, + "loss": 1.446892261505127, + "step": 11616 + }, + { + "epoch": 2.1146809866205514, + "grad_norm": 7.84375, + "learning_rate": 3.6145305658459946e-06, + "loss": 1.5234527587890625, + "step": 11618 + }, + { + "epoch": 2.115045053244744, + "grad_norm": 9.1875, + "learning_rate": 3.6133007479848474e-06, + "loss": 1.3599328994750977, + "step": 11620 + }, + { + "epoch": 2.115409119868936, + "grad_norm": 52.25, + "learning_rate": 3.612071280362141e-06, + "loss": 1.804826259613037, + "step": 11622 + }, + { + "epoch": 2.1157731864931284, + "grad_norm": 8.125, + "learning_rate": 3.6108421631582973e-06, + "loss": 1.3728814125061035, + "step": 11624 + }, + { + "epoch": 2.1161372531173206, + "grad_norm": 16.125, + "learning_rate": 3.609613396553686e-06, + "loss": 1.414371132850647, + "step": 11626 + }, + { + "epoch": 2.116501319741513, + "grad_norm": 12.1875, + "learning_rate": 3.608384980728618e-06, + "loss": 1.5621743202209473, + "step": 11628 + }, + { + "epoch": 2.116865386365705, + "grad_norm": 12.6875, + "learning_rate": 3.6071569158633646e-06, + "loss": 1.4366450309753418, + "step": 11630 + }, + { + "epoch": 2.117229452989897, + "grad_norm": 11.1875, + "learning_rate": 3.6059292021381336e-06, + "loss": 1.2083649635314941, + "step": 11632 + }, + { + "epoch": 2.1175935196140894, + "grad_norm": 7.59375, + "learning_rate": 3.6047018397330913e-06, + "loss": 1.3041483163833618, + "step": 11634 + }, + { + "epoch": 2.1179575862382816, + "grad_norm": 19.125, + "learning_rate": 3.6034748288283483e-06, + "loss": 1.3927923440933228, + "step": 11636 + }, + { + "epoch": 2.1183216528624738, + "grad_norm": 5.71875, + "learning_rate": 3.602248169603959e-06, + "loss": 0.922240674495697, + "step": 11638 + }, + { + "epoch": 2.118685719486666, + "grad_norm": 14.1875, + "learning_rate": 3.6010218622399363e-06, + "loss": 1.390246033668518, + "step": 11640 + }, + { + "epoch": 2.119049786110858, + "grad_norm": 15.875, + "learning_rate": 3.5997959069162305e-06, + "loss": 2.0122480392456055, + "step": 11642 + }, + { + "epoch": 2.1194138527350503, + "grad_norm": 22.875, + "learning_rate": 3.5985703038127494e-06, + "loss": 0.9682165384292603, + "step": 11644 + }, + { + "epoch": 2.1197779193592425, + "grad_norm": 10.3125, + "learning_rate": 3.5973450531093453e-06, + "loss": 0.9166086912155151, + "step": 11646 + }, + { + "epoch": 2.120141985983435, + "grad_norm": 14.125, + "learning_rate": 3.596120154985815e-06, + "loss": 0.8686960339546204, + "step": 11648 + }, + { + "epoch": 2.1205060526076274, + "grad_norm": 6.46875, + "learning_rate": 3.5948956096219125e-06, + "loss": 0.9494041204452515, + "step": 11650 + }, + { + "epoch": 2.1208701192318196, + "grad_norm": 11.625, + "learning_rate": 3.59367141719733e-06, + "loss": 1.3254351615905762, + "step": 11652 + }, + { + "epoch": 2.1212341858560118, + "grad_norm": 13.3125, + "learning_rate": 3.5924475778917154e-06, + "loss": 1.3369855880737305, + "step": 11654 + }, + { + "epoch": 2.121598252480204, + "grad_norm": 13.0625, + "learning_rate": 3.5912240918846644e-06, + "loss": 1.413733720779419, + "step": 11656 + }, + { + "epoch": 2.121962319104396, + "grad_norm": 8.1875, + "learning_rate": 3.590000959355712e-06, + "loss": 1.4298150539398193, + "step": 11658 + }, + { + "epoch": 2.1223263857285883, + "grad_norm": 30.125, + "learning_rate": 3.5887781804843558e-06, + "loss": 2.0864064693450928, + "step": 11660 + }, + { + "epoch": 2.1226904523527805, + "grad_norm": 21.875, + "learning_rate": 3.5875557554500264e-06, + "loss": 1.1395196914672852, + "step": 11662 + }, + { + "epoch": 2.1230545189769727, + "grad_norm": 22.5, + "learning_rate": 3.5863336844321144e-06, + "loss": 1.1446491479873657, + "step": 11664 + }, + { + "epoch": 2.123418585601165, + "grad_norm": 24.75, + "learning_rate": 3.585111967609954e-06, + "loss": 1.2273386716842651, + "step": 11666 + }, + { + "epoch": 2.123782652225357, + "grad_norm": 9.875, + "learning_rate": 3.5838906051628233e-06, + "loss": 1.3664453029632568, + "step": 11668 + }, + { + "epoch": 2.1241467188495493, + "grad_norm": 14.3125, + "learning_rate": 3.5826695972699568e-06, + "loss": 1.2502048015594482, + "step": 11670 + }, + { + "epoch": 2.1245107854737415, + "grad_norm": 2.921875, + "learning_rate": 3.5814489441105283e-06, + "loss": 0.9418635368347168, + "step": 11672 + }, + { + "epoch": 2.1248748520979337, + "grad_norm": 14.1875, + "learning_rate": 3.5802286458636666e-06, + "loss": 1.3807302713394165, + "step": 11674 + }, + { + "epoch": 2.1252389187221263, + "grad_norm": 12.625, + "learning_rate": 3.5790087027084456e-06, + "loss": 1.4288439750671387, + "step": 11676 + }, + { + "epoch": 2.1256029853463185, + "grad_norm": 10.0, + "learning_rate": 3.577789114823884e-06, + "loss": 1.3997688293457031, + "step": 11678 + }, + { + "epoch": 2.1259670519705107, + "grad_norm": 4.0, + "learning_rate": 3.5765698823889563e-06, + "loss": 1.1656535863876343, + "step": 11680 + }, + { + "epoch": 2.126331118594703, + "grad_norm": 8.4375, + "learning_rate": 3.5753510055825737e-06, + "loss": 1.1855342388153076, + "step": 11682 + }, + { + "epoch": 2.126695185218895, + "grad_norm": 15.8125, + "learning_rate": 3.574132484583606e-06, + "loss": 1.4016551971435547, + "step": 11684 + }, + { + "epoch": 2.1270592518430873, + "grad_norm": 6.03125, + "learning_rate": 3.5729143195708677e-06, + "loss": 1.3941712379455566, + "step": 11686 + }, + { + "epoch": 2.1274233184672795, + "grad_norm": 36.0, + "learning_rate": 3.5716965107231127e-06, + "loss": 1.9570362567901611, + "step": 11688 + }, + { + "epoch": 2.1277873850914717, + "grad_norm": 16.5, + "learning_rate": 3.5704790582190575e-06, + "loss": 1.331141471862793, + "step": 11690 + }, + { + "epoch": 2.128151451715664, + "grad_norm": 6.625, + "learning_rate": 3.5692619622373515e-06, + "loss": 1.0926845073699951, + "step": 11692 + }, + { + "epoch": 2.128515518339856, + "grad_norm": 8.4375, + "learning_rate": 3.568045222956602e-06, + "loss": 1.224328875541687, + "step": 11694 + }, + { + "epoch": 2.1288795849640483, + "grad_norm": 13.125, + "learning_rate": 3.566828840555363e-06, + "loss": 1.718050479888916, + "step": 11696 + }, + { + "epoch": 2.1292436515882405, + "grad_norm": 2.46875, + "learning_rate": 3.5656128152121273e-06, + "loss": 0.9652382135391235, + "step": 11698 + }, + { + "epoch": 2.1296077182124327, + "grad_norm": 8.125, + "learning_rate": 3.564397147105348e-06, + "loss": 1.2943415641784668, + "step": 11700 + }, + { + "epoch": 2.1299717848366253, + "grad_norm": 13.9375, + "learning_rate": 3.563181836413414e-06, + "loss": 0.5209476351737976, + "step": 11702 + }, + { + "epoch": 2.1303358514608175, + "grad_norm": 10.9375, + "learning_rate": 3.5619668833146712e-06, + "loss": 1.2705987691879272, + "step": 11704 + }, + { + "epoch": 2.1306999180850097, + "grad_norm": 18.5, + "learning_rate": 3.56075228798741e-06, + "loss": 0.9336791634559631, + "step": 11706 + }, + { + "epoch": 2.131063984709202, + "grad_norm": 12.5625, + "learning_rate": 3.5595380506098613e-06, + "loss": 1.8572324514389038, + "step": 11708 + }, + { + "epoch": 2.131428051333394, + "grad_norm": 22.5, + "learning_rate": 3.558324171360217e-06, + "loss": 1.4580127000808716, + "step": 11710 + }, + { + "epoch": 2.1317921179575863, + "grad_norm": 237.0, + "learning_rate": 3.557110650416602e-06, + "loss": 1.339434266090393, + "step": 11712 + }, + { + "epoch": 2.1321561845817785, + "grad_norm": 12.75, + "learning_rate": 3.5558974879571007e-06, + "loss": 1.6232883930206299, + "step": 11714 + }, + { + "epoch": 2.1325202512059707, + "grad_norm": 22.0, + "learning_rate": 3.5546846841597394e-06, + "loss": 1.7452282905578613, + "step": 11716 + }, + { + "epoch": 2.132884317830163, + "grad_norm": 11.9375, + "learning_rate": 3.5534722392024868e-06, + "loss": 0.8611134886741638, + "step": 11718 + }, + { + "epoch": 2.133248384454355, + "grad_norm": 6.84375, + "learning_rate": 3.552260153263272e-06, + "loss": 1.0348193645477295, + "step": 11720 + }, + { + "epoch": 2.1336124510785472, + "grad_norm": 13.375, + "learning_rate": 3.5510484265199568e-06, + "loss": 1.2420247793197632, + "step": 11722 + }, + { + "epoch": 2.1339765177027394, + "grad_norm": 14.25, + "learning_rate": 3.5498370591503616e-06, + "loss": 1.4249308109283447, + "step": 11724 + }, + { + "epoch": 2.1343405843269316, + "grad_norm": 8.75, + "learning_rate": 3.5486260513322503e-06, + "loss": 1.3123869895935059, + "step": 11726 + }, + { + "epoch": 2.1347046509511243, + "grad_norm": 16.25, + "learning_rate": 3.547415403243328e-06, + "loss": 1.9426798820495605, + "step": 11728 + }, + { + "epoch": 2.1350687175753165, + "grad_norm": 11.1875, + "learning_rate": 3.5462051150612587e-06, + "loss": 1.3553457260131836, + "step": 11730 + }, + { + "epoch": 2.1354327841995087, + "grad_norm": 7.125, + "learning_rate": 3.544995186963642e-06, + "loss": 1.3344693183898926, + "step": 11732 + }, + { + "epoch": 2.135796850823701, + "grad_norm": 22.125, + "learning_rate": 3.5437856191280327e-06, + "loss": 0.5752114653587341, + "step": 11734 + }, + { + "epoch": 2.136160917447893, + "grad_norm": 16.25, + "learning_rate": 3.542576411731933e-06, + "loss": 1.3905531167984009, + "step": 11736 + }, + { + "epoch": 2.1365249840720852, + "grad_norm": 4.78125, + "learning_rate": 3.5413675649527814e-06, + "loss": 1.1650147438049316, + "step": 11738 + }, + { + "epoch": 2.1368890506962774, + "grad_norm": 14.3125, + "learning_rate": 3.5401590789679787e-06, + "loss": 1.4482753276824951, + "step": 11740 + }, + { + "epoch": 2.1372531173204696, + "grad_norm": 11.25, + "learning_rate": 3.5389509539548596e-06, + "loss": 1.3894133567810059, + "step": 11742 + }, + { + "epoch": 2.137617183944662, + "grad_norm": 9.8125, + "learning_rate": 3.5377431900907157e-06, + "loss": 1.3424606323242188, + "step": 11744 + }, + { + "epoch": 2.137981250568854, + "grad_norm": 6.25, + "learning_rate": 3.5365357875527816e-06, + "loss": 1.211737871170044, + "step": 11746 + }, + { + "epoch": 2.138345317193046, + "grad_norm": 6.875, + "learning_rate": 3.535328746518234e-06, + "loss": 1.3907968997955322, + "step": 11748 + }, + { + "epoch": 2.1387093838172384, + "grad_norm": 14.5, + "learning_rate": 3.5341220671642074e-06, + "loss": 1.0780372619628906, + "step": 11750 + }, + { + "epoch": 2.1390734504414306, + "grad_norm": 6.90625, + "learning_rate": 3.5329157496677715e-06, + "loss": 1.3604546785354614, + "step": 11752 + }, + { + "epoch": 2.1394375170656232, + "grad_norm": 11.0625, + "learning_rate": 3.531709794205952e-06, + "loss": 1.4022518396377563, + "step": 11754 + }, + { + "epoch": 2.1398015836898154, + "grad_norm": 9.75, + "learning_rate": 3.530504200955719e-06, + "loss": 1.324486494064331, + "step": 11756 + }, + { + "epoch": 2.1401656503140076, + "grad_norm": 13.8125, + "learning_rate": 3.5292989700939835e-06, + "loss": 1.3497986793518066, + "step": 11758 + }, + { + "epoch": 2.1405297169382, + "grad_norm": 11.6875, + "learning_rate": 3.528094101797614e-06, + "loss": 1.386899471282959, + "step": 11760 + }, + { + "epoch": 2.140893783562392, + "grad_norm": 11.0625, + "learning_rate": 3.526889596243415e-06, + "loss": 1.3024482727050781, + "step": 11762 + }, + { + "epoch": 2.141257850186584, + "grad_norm": 18.125, + "learning_rate": 3.525685453608145e-06, + "loss": 1.3635355234146118, + "step": 11764 + }, + { + "epoch": 2.1416219168107764, + "grad_norm": 59.25, + "learning_rate": 3.5244816740685103e-06, + "loss": 1.3361120223999023, + "step": 11766 + }, + { + "epoch": 2.1419859834349686, + "grad_norm": 17.875, + "learning_rate": 3.523278257801154e-06, + "loss": 1.527967095375061, + "step": 11768 + }, + { + "epoch": 2.142350050059161, + "grad_norm": 26.5, + "learning_rate": 3.522075204982679e-06, + "loss": 1.9064382314682007, + "step": 11770 + }, + { + "epoch": 2.142714116683353, + "grad_norm": 24.125, + "learning_rate": 3.5208725157896223e-06, + "loss": 1.6158825159072876, + "step": 11772 + }, + { + "epoch": 2.143078183307545, + "grad_norm": 7.1875, + "learning_rate": 3.5196701903984786e-06, + "loss": 1.1356252431869507, + "step": 11774 + }, + { + "epoch": 2.1434422499317374, + "grad_norm": 10.75, + "learning_rate": 3.518468228985684e-06, + "loss": 1.0046056509017944, + "step": 11776 + }, + { + "epoch": 2.1438063165559296, + "grad_norm": 10.1875, + "learning_rate": 3.5172666317276178e-06, + "loss": 1.4122179746627808, + "step": 11778 + }, + { + "epoch": 2.1441703831801218, + "grad_norm": 18.625, + "learning_rate": 3.516065398800614e-06, + "loss": 1.3657464981079102, + "step": 11780 + }, + { + "epoch": 2.144534449804314, + "grad_norm": 12.75, + "learning_rate": 3.5148645303809436e-06, + "loss": 1.243217945098877, + "step": 11782 + }, + { + "epoch": 2.1448985164285066, + "grad_norm": 4.375, + "learning_rate": 3.5136640266448336e-06, + "loss": 1.1562645435333252, + "step": 11784 + }, + { + "epoch": 2.145262583052699, + "grad_norm": 10.625, + "learning_rate": 3.5124638877684527e-06, + "loss": 2.1999475955963135, + "step": 11786 + }, + { + "epoch": 2.145626649676891, + "grad_norm": 6.15625, + "learning_rate": 3.5112641139279126e-06, + "loss": 1.5270907878875732, + "step": 11788 + }, + { + "epoch": 2.145990716301083, + "grad_norm": 2.90625, + "learning_rate": 3.51006470529928e-06, + "loss": 1.1781255006790161, + "step": 11790 + }, + { + "epoch": 2.1463547829252754, + "grad_norm": 6.84375, + "learning_rate": 3.508865662058558e-06, + "loss": 1.3630359172821045, + "step": 11792 + }, + { + "epoch": 2.1467188495494676, + "grad_norm": 72.5, + "learning_rate": 3.507666984381707e-06, + "loss": 0.9165368676185608, + "step": 11794 + }, + { + "epoch": 2.1470829161736598, + "grad_norm": 33.5, + "learning_rate": 3.5064686724446263e-06, + "loss": 1.914183259010315, + "step": 11796 + }, + { + "epoch": 2.147446982797852, + "grad_norm": 13.0625, + "learning_rate": 3.50527072642316e-06, + "loss": 1.9940767288208008, + "step": 11798 + }, + { + "epoch": 2.147811049422044, + "grad_norm": 15.25, + "learning_rate": 3.5040731464931064e-06, + "loss": 0.7465848326683044, + "step": 11800 + }, + { + "epoch": 2.1481751160462363, + "grad_norm": 25.25, + "learning_rate": 3.502875932830201e-06, + "loss": 1.5110869407653809, + "step": 11802 + }, + { + "epoch": 2.1485391826704285, + "grad_norm": 13.4375, + "learning_rate": 3.5016790856101345e-06, + "loss": 1.272931456565857, + "step": 11804 + }, + { + "epoch": 2.1489032492946207, + "grad_norm": 14.9375, + "learning_rate": 3.500482605008538e-06, + "loss": 1.331356406211853, + "step": 11806 + }, + { + "epoch": 2.149267315918813, + "grad_norm": 18.5, + "learning_rate": 3.4992864912009873e-06, + "loss": 1.2928212881088257, + "step": 11808 + }, + { + "epoch": 2.1496313825430056, + "grad_norm": 13.6875, + "learning_rate": 3.498090744363012e-06, + "loss": 1.4406646490097046, + "step": 11810 + }, + { + "epoch": 2.1499954491671978, + "grad_norm": 11.4375, + "learning_rate": 3.4968953646700777e-06, + "loss": 1.5231435298919678, + "step": 11812 + }, + { + "epoch": 2.15035951579139, + "grad_norm": 10.0625, + "learning_rate": 3.495700352297606e-06, + "loss": 1.1035821437835693, + "step": 11814 + }, + { + "epoch": 2.150723582415582, + "grad_norm": 23.0, + "learning_rate": 3.49450570742096e-06, + "loss": 0.7823556661605835, + "step": 11816 + }, + { + "epoch": 2.1510876490397743, + "grad_norm": 32.25, + "learning_rate": 3.4933114302154448e-06, + "loss": 1.5058984756469727, + "step": 11818 + }, + { + "epoch": 2.1514517156639665, + "grad_norm": 3.890625, + "learning_rate": 3.492117520856322e-06, + "loss": 1.0921863317489624, + "step": 11820 + }, + { + "epoch": 2.1518157822881587, + "grad_norm": 47.25, + "learning_rate": 3.4909239795187854e-06, + "loss": 1.3037811517715454, + "step": 11822 + }, + { + "epoch": 2.152179848912351, + "grad_norm": 5.53125, + "learning_rate": 3.489730806377988e-06, + "loss": 0.9490963816642761, + "step": 11824 + }, + { + "epoch": 2.152543915536543, + "grad_norm": 16.875, + "learning_rate": 3.4885380016090237e-06, + "loss": 1.9181374311447144, + "step": 11826 + }, + { + "epoch": 2.1529079821607353, + "grad_norm": 72.0, + "learning_rate": 3.487345565386926e-06, + "loss": 1.4276347160339355, + "step": 11828 + }, + { + "epoch": 2.1532720487849275, + "grad_norm": 11.4375, + "learning_rate": 3.486153497886687e-06, + "loss": 1.294939637184143, + "step": 11830 + }, + { + "epoch": 2.1536361154091197, + "grad_norm": 8.625, + "learning_rate": 3.484961799283232e-06, + "loss": 1.2862040996551514, + "step": 11832 + }, + { + "epoch": 2.154000182033312, + "grad_norm": 12.0, + "learning_rate": 3.4837704697514405e-06, + "loss": 1.4528831243515015, + "step": 11834 + }, + { + "epoch": 2.1543642486575045, + "grad_norm": 17.5, + "learning_rate": 3.4825795094661375e-06, + "loss": 1.5572094917297363, + "step": 11836 + }, + { + "epoch": 2.1547283152816967, + "grad_norm": 16.875, + "learning_rate": 3.4813889186020868e-06, + "loss": 2.0158350467681885, + "step": 11838 + }, + { + "epoch": 2.155092381905889, + "grad_norm": 7.25, + "learning_rate": 3.4801986973340075e-06, + "loss": 1.38032066822052, + "step": 11840 + }, + { + "epoch": 2.155456448530081, + "grad_norm": 16.0, + "learning_rate": 3.4790088458365546e-06, + "loss": 1.3108739852905273, + "step": 11842 + }, + { + "epoch": 2.1558205151542733, + "grad_norm": 17.375, + "learning_rate": 3.4778193642843383e-06, + "loss": 1.3640022277832031, + "step": 11844 + }, + { + "epoch": 2.1561845817784655, + "grad_norm": 14.375, + "learning_rate": 3.47663025285191e-06, + "loss": 1.3018871545791626, + "step": 11846 + }, + { + "epoch": 2.1565486484026577, + "grad_norm": 13.125, + "learning_rate": 3.4754415117137643e-06, + "loss": 1.208632230758667, + "step": 11848 + }, + { + "epoch": 2.15691271502685, + "grad_norm": 15.9375, + "learning_rate": 3.474253141044347e-06, + "loss": 1.2062726020812988, + "step": 11850 + }, + { + "epoch": 2.157276781651042, + "grad_norm": 15.3125, + "learning_rate": 3.4730651410180426e-06, + "loss": 0.43019193410873413, + "step": 11852 + }, + { + "epoch": 2.1576408482752343, + "grad_norm": 7.0, + "learning_rate": 3.47187751180919e-06, + "loss": 1.3824466466903687, + "step": 11854 + }, + { + "epoch": 2.1580049148994265, + "grad_norm": 5.8125, + "learning_rate": 3.470690253592068e-06, + "loss": 1.3821743726730347, + "step": 11856 + }, + { + "epoch": 2.1583689815236187, + "grad_norm": 7.15625, + "learning_rate": 3.4695033665408984e-06, + "loss": 1.4098341464996338, + "step": 11858 + }, + { + "epoch": 2.158733048147811, + "grad_norm": 7.5625, + "learning_rate": 3.468316850829857e-06, + "loss": 1.1751147508621216, + "step": 11860 + }, + { + "epoch": 2.1590971147720035, + "grad_norm": 11.5625, + "learning_rate": 3.467130706633055e-06, + "loss": 1.5015373229980469, + "step": 11862 + }, + { + "epoch": 2.1594611813961957, + "grad_norm": 8.625, + "learning_rate": 3.4659449341245586e-06, + "loss": 1.2517118453979492, + "step": 11864 + }, + { + "epoch": 2.159825248020388, + "grad_norm": 15.875, + "learning_rate": 3.4647595334783753e-06, + "loss": 1.3400590419769287, + "step": 11866 + }, + { + "epoch": 2.16018931464458, + "grad_norm": 14.3125, + "learning_rate": 3.4635745048684523e-06, + "loss": 1.5122504234313965, + "step": 11868 + }, + { + "epoch": 2.1605533812687723, + "grad_norm": 27.875, + "learning_rate": 3.4623898484686948e-06, + "loss": 1.7282097339630127, + "step": 11870 + }, + { + "epoch": 2.1609174478929645, + "grad_norm": 16.375, + "learning_rate": 3.4612055644529397e-06, + "loss": 1.5225062370300293, + "step": 11872 + }, + { + "epoch": 2.1612815145171567, + "grad_norm": 16.5, + "learning_rate": 3.4600216529949813e-06, + "loss": 1.9339747428894043, + "step": 11874 + }, + { + "epoch": 2.161645581141349, + "grad_norm": 8.375, + "learning_rate": 3.4588381142685524e-06, + "loss": 1.2551170587539673, + "step": 11876 + }, + { + "epoch": 2.162009647765541, + "grad_norm": 2.84375, + "learning_rate": 3.45765494844733e-06, + "loss": 1.0581399202346802, + "step": 11878 + }, + { + "epoch": 2.1623737143897332, + "grad_norm": 5.21875, + "learning_rate": 3.4564721557049425e-06, + "loss": 1.1101672649383545, + "step": 11880 + }, + { + "epoch": 2.1627377810139254, + "grad_norm": 5.8125, + "learning_rate": 3.4552897362149556e-06, + "loss": 0.9227743148803711, + "step": 11882 + }, + { + "epoch": 2.1631018476381176, + "grad_norm": 8.9375, + "learning_rate": 3.4541076901508886e-06, + "loss": 1.0068249702453613, + "step": 11884 + }, + { + "epoch": 2.16346591426231, + "grad_norm": 8.8125, + "learning_rate": 3.452926017686201e-06, + "loss": 1.3254737854003906, + "step": 11886 + }, + { + "epoch": 2.163829980886502, + "grad_norm": 9.25, + "learning_rate": 3.4517447189942965e-06, + "loss": 1.3896158933639526, + "step": 11888 + }, + { + "epoch": 2.1641940475106947, + "grad_norm": 14.0, + "learning_rate": 3.45056379424853e-06, + "loss": 1.4383244514465332, + "step": 11890 + }, + { + "epoch": 2.164558114134887, + "grad_norm": 10.875, + "learning_rate": 3.4493832436221913e-06, + "loss": 1.3949906826019287, + "step": 11892 + }, + { + "epoch": 2.164922180759079, + "grad_norm": 9.9375, + "learning_rate": 3.4482030672885258e-06, + "loss": 1.3575925827026367, + "step": 11894 + }, + { + "epoch": 2.1652862473832712, + "grad_norm": 10.125, + "learning_rate": 3.4470232654207215e-06, + "loss": 1.101464867591858, + "step": 11896 + }, + { + "epoch": 2.1656503140074634, + "grad_norm": 7.1875, + "learning_rate": 3.445843838191903e-06, + "loss": 1.0390342473983765, + "step": 11898 + }, + { + "epoch": 2.1660143806316556, + "grad_norm": 14.375, + "learning_rate": 3.4446647857751523e-06, + "loss": 0.9165394306182861, + "step": 11900 + }, + { + "epoch": 2.166378447255848, + "grad_norm": 7.96875, + "learning_rate": 3.443486108343487e-06, + "loss": 1.411670446395874, + "step": 11902 + }, + { + "epoch": 2.16674251388004, + "grad_norm": 20.875, + "learning_rate": 3.4423078060698758e-06, + "loss": 1.4992218017578125, + "step": 11904 + }, + { + "epoch": 2.167106580504232, + "grad_norm": 32.75, + "learning_rate": 3.4411298791272295e-06, + "loss": 1.2359474897384644, + "step": 11906 + }, + { + "epoch": 2.1674706471284244, + "grad_norm": 7.3125, + "learning_rate": 3.4399523276884007e-06, + "loss": 1.5338691473007202, + "step": 11908 + }, + { + "epoch": 2.1678347137526166, + "grad_norm": 15.0, + "learning_rate": 3.438775151926195e-06, + "loss": 1.4390363693237305, + "step": 11910 + }, + { + "epoch": 2.168198780376809, + "grad_norm": 21.25, + "learning_rate": 3.4375983520133538e-06, + "loss": 1.083348035812378, + "step": 11912 + }, + { + "epoch": 2.168562847001001, + "grad_norm": 9.0, + "learning_rate": 3.4364219281225713e-06, + "loss": 1.3756667375564575, + "step": 11914 + }, + { + "epoch": 2.168926913625193, + "grad_norm": 11.625, + "learning_rate": 3.4352458804264834e-06, + "loss": 1.4592182636260986, + "step": 11916 + }, + { + "epoch": 2.169290980249386, + "grad_norm": 23.5, + "learning_rate": 3.434070209097665e-06, + "loss": 1.615593433380127, + "step": 11918 + }, + { + "epoch": 2.169655046873578, + "grad_norm": 32.0, + "learning_rate": 3.4328949143086475e-06, + "loss": 1.8358324766159058, + "step": 11920 + }, + { + "epoch": 2.17001911349777, + "grad_norm": 13.5, + "learning_rate": 3.4317199962318954e-06, + "loss": 1.3876359462738037, + "step": 11922 + }, + { + "epoch": 2.1703831801219624, + "grad_norm": 7.46875, + "learning_rate": 3.4305454550398265e-06, + "loss": 1.1813454627990723, + "step": 11924 + }, + { + "epoch": 2.1707472467461546, + "grad_norm": 35.75, + "learning_rate": 3.4293712909048006e-06, + "loss": 1.3358076810836792, + "step": 11926 + }, + { + "epoch": 2.171111313370347, + "grad_norm": 13.5625, + "learning_rate": 3.428197503999117e-06, + "loss": 1.2706727981567383, + "step": 11928 + }, + { + "epoch": 2.171475379994539, + "grad_norm": 13.375, + "learning_rate": 3.4270240944950297e-06, + "loss": 1.5030291080474854, + "step": 11930 + }, + { + "epoch": 2.171839446618731, + "grad_norm": 15.25, + "learning_rate": 3.425851062564727e-06, + "loss": 1.4681525230407715, + "step": 11932 + }, + { + "epoch": 2.1722035132429234, + "grad_norm": 25.5, + "learning_rate": 3.4246784083803496e-06, + "loss": 1.3972972631454468, + "step": 11934 + }, + { + "epoch": 2.1725675798671156, + "grad_norm": 5.125, + "learning_rate": 3.4235061321139807e-06, + "loss": 1.0363470315933228, + "step": 11936 + }, + { + "epoch": 2.1729316464913078, + "grad_norm": 9.9375, + "learning_rate": 3.422334233937642e-06, + "loss": 1.3285410404205322, + "step": 11938 + }, + { + "epoch": 2.1732957131155, + "grad_norm": 18.375, + "learning_rate": 3.4211627140233116e-06, + "loss": 1.4808104038238525, + "step": 11940 + }, + { + "epoch": 2.173659779739692, + "grad_norm": 10.3125, + "learning_rate": 3.4199915725428984e-06, + "loss": 1.3016339540481567, + "step": 11942 + }, + { + "epoch": 2.174023846363885, + "grad_norm": 22.625, + "learning_rate": 3.4188208096682673e-06, + "loss": 1.4737329483032227, + "step": 11944 + }, + { + "epoch": 2.174387912988077, + "grad_norm": 13.4375, + "learning_rate": 3.417650425571224e-06, + "loss": 1.5741899013519287, + "step": 11946 + }, + { + "epoch": 2.174751979612269, + "grad_norm": 12.9375, + "learning_rate": 3.4164804204235118e-06, + "loss": 1.8713114261627197, + "step": 11948 + }, + { + "epoch": 2.1751160462364614, + "grad_norm": 8.25, + "learning_rate": 3.4153107943968313e-06, + "loss": 1.374143362045288, + "step": 11950 + }, + { + "epoch": 2.1754801128606536, + "grad_norm": 14.4375, + "learning_rate": 3.4141415476628135e-06, + "loss": 1.372848629951477, + "step": 11952 + }, + { + "epoch": 2.1758441794848458, + "grad_norm": 39.75, + "learning_rate": 3.412972680393046e-06, + "loss": 1.4966922998428345, + "step": 11954 + }, + { + "epoch": 2.176208246109038, + "grad_norm": 31.25, + "learning_rate": 3.411804192759054e-06, + "loss": 2.1731247901916504, + "step": 11956 + }, + { + "epoch": 2.17657231273323, + "grad_norm": 6.5, + "learning_rate": 3.410636084932305e-06, + "loss": 1.2354437112808228, + "step": 11958 + }, + { + "epoch": 2.1769363793574223, + "grad_norm": 10.1875, + "learning_rate": 3.40946835708422e-06, + "loss": 1.1443841457366943, + "step": 11960 + }, + { + "epoch": 2.1773004459816145, + "grad_norm": 9.625, + "learning_rate": 3.4083010093861524e-06, + "loss": 1.3149747848510742, + "step": 11962 + }, + { + "epoch": 2.1776645126058067, + "grad_norm": 12.5, + "learning_rate": 3.407134042009409e-06, + "loss": 1.5538043975830078, + "step": 11964 + }, + { + "epoch": 2.178028579229999, + "grad_norm": 31.125, + "learning_rate": 3.4059674551252396e-06, + "loss": 1.489409327507019, + "step": 11966 + }, + { + "epoch": 2.178392645854191, + "grad_norm": 11.125, + "learning_rate": 3.4048012489048297e-06, + "loss": 1.757889986038208, + "step": 11968 + }, + { + "epoch": 2.1787567124783838, + "grad_norm": 7.53125, + "learning_rate": 3.4036354235193224e-06, + "loss": 1.307370901107788, + "step": 11970 + }, + { + "epoch": 2.179120779102576, + "grad_norm": 8.125, + "learning_rate": 3.4024699791397914e-06, + "loss": 1.2587099075317383, + "step": 11972 + }, + { + "epoch": 2.179484845726768, + "grad_norm": 11.875, + "learning_rate": 3.4013049159372658e-06, + "loss": 1.131037712097168, + "step": 11974 + }, + { + "epoch": 2.1798489123509603, + "grad_norm": 96.0, + "learning_rate": 3.400140234082714e-06, + "loss": 1.0939170122146606, + "step": 11976 + }, + { + "epoch": 2.1802129789751525, + "grad_norm": 35.25, + "learning_rate": 3.3989759337470422e-06, + "loss": 1.1858646869659424, + "step": 11978 + }, + { + "epoch": 2.1805770455993447, + "grad_norm": 10.625, + "learning_rate": 3.397812015101115e-06, + "loss": 1.44175386428833, + "step": 11980 + }, + { + "epoch": 2.180941112223537, + "grad_norm": 39.75, + "learning_rate": 3.396648478315726e-06, + "loss": 1.3288320302963257, + "step": 11982 + }, + { + "epoch": 2.181305178847729, + "grad_norm": 14.25, + "learning_rate": 3.3954853235616237e-06, + "loss": 1.5406124591827393, + "step": 11984 + }, + { + "epoch": 2.1816692454719213, + "grad_norm": 15.625, + "learning_rate": 3.394322551009497e-06, + "loss": 2.010634183883667, + "step": 11986 + }, + { + "epoch": 2.1820333120961135, + "grad_norm": 8.625, + "learning_rate": 3.393160160829972e-06, + "loss": 1.1759456396102905, + "step": 11988 + }, + { + "epoch": 2.1823973787203057, + "grad_norm": 9.5, + "learning_rate": 3.3919981531936324e-06, + "loss": 1.1139094829559326, + "step": 11990 + }, + { + "epoch": 2.182761445344498, + "grad_norm": 11.125, + "learning_rate": 3.3908365282709914e-06, + "loss": 0.8248730301856995, + "step": 11992 + }, + { + "epoch": 2.18312551196869, + "grad_norm": 10.0, + "learning_rate": 3.3896752862325176e-06, + "loss": 0.9815011620521545, + "step": 11994 + }, + { + "epoch": 2.1834895785928827, + "grad_norm": 5.46875, + "learning_rate": 3.3885144272486175e-06, + "loss": 0.06556104123592377, + "step": 11996 + }, + { + "epoch": 2.183853645217075, + "grad_norm": 12.75, + "learning_rate": 3.3873539514896404e-06, + "loss": 0.4528769552707672, + "step": 11998 + }, + { + "epoch": 2.184217711841267, + "grad_norm": 17.375, + "learning_rate": 3.386193859125884e-06, + "loss": 1.5093042850494385, + "step": 12000 + }, + { + "epoch": 2.1845817784654593, + "grad_norm": 19.75, + "learning_rate": 3.3850341503275843e-06, + "loss": 1.704003930091858, + "step": 12002 + }, + { + "epoch": 2.1849458450896515, + "grad_norm": 8.1875, + "learning_rate": 3.383874825264926e-06, + "loss": 1.5869312286376953, + "step": 12004 + }, + { + "epoch": 2.1853099117138437, + "grad_norm": 22.125, + "learning_rate": 3.3827158841080363e-06, + "loss": 2.0539438724517822, + "step": 12006 + }, + { + "epoch": 2.185673978338036, + "grad_norm": 9.75, + "learning_rate": 3.381557327026982e-06, + "loss": 1.730271577835083, + "step": 12008 + }, + { + "epoch": 2.186038044962228, + "grad_norm": 15.8125, + "learning_rate": 3.38039915419178e-06, + "loss": 1.1404756307601929, + "step": 12010 + }, + { + "epoch": 2.1864021115864203, + "grad_norm": 14.0625, + "learning_rate": 3.3792413657723833e-06, + "loss": 1.7714409828186035, + "step": 12012 + }, + { + "epoch": 2.1867661782106125, + "grad_norm": 10.0625, + "learning_rate": 3.3780839619386968e-06, + "loss": 1.3931514024734497, + "step": 12014 + }, + { + "epoch": 2.1871302448348047, + "grad_norm": 15.1875, + "learning_rate": 3.3769269428605646e-06, + "loss": 1.1737124919891357, + "step": 12016 + }, + { + "epoch": 2.187494311458997, + "grad_norm": 9.6875, + "learning_rate": 3.37577030870777e-06, + "loss": 1.6138839721679688, + "step": 12018 + }, + { + "epoch": 2.187858378083189, + "grad_norm": 32.5, + "learning_rate": 3.374614059650051e-06, + "loss": 1.4430218935012817, + "step": 12020 + }, + { + "epoch": 2.1882224447073813, + "grad_norm": 9.0, + "learning_rate": 3.3734581958570754e-06, + "loss": 1.4185245037078857, + "step": 12022 + }, + { + "epoch": 2.1885865113315734, + "grad_norm": 10.25, + "learning_rate": 3.372302717498467e-06, + "loss": 1.3078869581222534, + "step": 12024 + }, + { + "epoch": 2.188950577955766, + "grad_norm": 14.9375, + "learning_rate": 3.371147624743787e-06, + "loss": 1.4749256372451782, + "step": 12026 + }, + { + "epoch": 2.1893146445799583, + "grad_norm": 17.25, + "learning_rate": 3.369992917762536e-06, + "loss": 1.4176750183105469, + "step": 12028 + }, + { + "epoch": 2.1896787112041505, + "grad_norm": 13.6875, + "learning_rate": 3.368838596724169e-06, + "loss": 1.318245530128479, + "step": 12030 + }, + { + "epoch": 2.1900427778283427, + "grad_norm": 9.6875, + "learning_rate": 3.3676846617980724e-06, + "loss": 1.1626064777374268, + "step": 12032 + }, + { + "epoch": 2.190406844452535, + "grad_norm": 10.0625, + "learning_rate": 3.366531113153585e-06, + "loss": 1.1749370098114014, + "step": 12034 + }, + { + "epoch": 2.190770911076727, + "grad_norm": 20.25, + "learning_rate": 3.365377950959985e-06, + "loss": 1.3727855682373047, + "step": 12036 + }, + { + "epoch": 2.1911349777009193, + "grad_norm": 10.0625, + "learning_rate": 3.3642251753864913e-06, + "loss": 1.436610460281372, + "step": 12038 + }, + { + "epoch": 2.1914990443251114, + "grad_norm": 13.0625, + "learning_rate": 3.3630727866022737e-06, + "loss": 1.3140615224838257, + "step": 12040 + }, + { + "epoch": 2.1918631109493036, + "grad_norm": 12.125, + "learning_rate": 3.3619207847764357e-06, + "loss": 1.3537803888320923, + "step": 12042 + }, + { + "epoch": 2.192227177573496, + "grad_norm": 13.5, + "learning_rate": 3.360769170078033e-06, + "loss": 1.4226624965667725, + "step": 12044 + }, + { + "epoch": 2.192591244197688, + "grad_norm": 27.5, + "learning_rate": 3.3596179426760594e-06, + "loss": 1.5557861328125, + "step": 12046 + }, + { + "epoch": 2.19295531082188, + "grad_norm": 9.5, + "learning_rate": 3.3584671027394496e-06, + "loss": 1.7801411151885986, + "step": 12048 + }, + { + "epoch": 2.1933193774460724, + "grad_norm": 12.3125, + "learning_rate": 3.3573166504370902e-06, + "loss": 1.2176892757415771, + "step": 12050 + }, + { + "epoch": 2.193683444070265, + "grad_norm": 3.734375, + "learning_rate": 3.3561665859377997e-06, + "loss": 0.8376621007919312, + "step": 12052 + }, + { + "epoch": 2.1940475106944572, + "grad_norm": 9.3125, + "learning_rate": 3.3550169094103497e-06, + "loss": 1.181688666343689, + "step": 12054 + }, + { + "epoch": 2.1944115773186494, + "grad_norm": 17.375, + "learning_rate": 3.35386762102345e-06, + "loss": 1.4156429767608643, + "step": 12056 + }, + { + "epoch": 2.1947756439428416, + "grad_norm": 23.5, + "learning_rate": 3.352718720945751e-06, + "loss": 1.3488410711288452, + "step": 12058 + }, + { + "epoch": 2.195139710567034, + "grad_norm": 14.5625, + "learning_rate": 3.3515702093458534e-06, + "loss": 1.056420922279358, + "step": 12060 + }, + { + "epoch": 2.195503777191226, + "grad_norm": 12.75, + "learning_rate": 3.350422086392292e-06, + "loss": 1.1180254220962524, + "step": 12062 + }, + { + "epoch": 2.195867843815418, + "grad_norm": 10.6875, + "learning_rate": 3.349274352253553e-06, + "loss": 1.0712347030639648, + "step": 12064 + }, + { + "epoch": 2.1962319104396104, + "grad_norm": 9.6875, + "learning_rate": 3.348127007098061e-06, + "loss": 1.569072961807251, + "step": 12066 + }, + { + "epoch": 2.1965959770638026, + "grad_norm": 12.3125, + "learning_rate": 3.346980051094182e-06, + "loss": 1.359500765800476, + "step": 12068 + }, + { + "epoch": 2.196960043687995, + "grad_norm": 9.6875, + "learning_rate": 3.34583348441023e-06, + "loss": 1.3142786026000977, + "step": 12070 + }, + { + "epoch": 2.197324110312187, + "grad_norm": 11.875, + "learning_rate": 3.3446873072144566e-06, + "loss": 1.2809697389602661, + "step": 12072 + }, + { + "epoch": 2.197688176936379, + "grad_norm": 9.125, + "learning_rate": 3.3435415196750605e-06, + "loss": 1.0553083419799805, + "step": 12074 + }, + { + "epoch": 2.1980522435605714, + "grad_norm": 5.1875, + "learning_rate": 3.342396121960182e-06, + "loss": 1.3698594570159912, + "step": 12076 + }, + { + "epoch": 2.198416310184764, + "grad_norm": 3.21875, + "learning_rate": 3.3412511142379002e-06, + "loss": 0.8591696619987488, + "step": 12078 + }, + { + "epoch": 2.198780376808956, + "grad_norm": 10.5625, + "learning_rate": 3.3401064966762443e-06, + "loss": 1.2664319276809692, + "step": 12080 + }, + { + "epoch": 2.1991444434331484, + "grad_norm": 15.6875, + "learning_rate": 3.3389622694431778e-06, + "loss": 1.5193262100219727, + "step": 12082 + }, + { + "epoch": 2.1995085100573406, + "grad_norm": 12.25, + "learning_rate": 3.3378184327066156e-06, + "loss": 1.5679194927215576, + "step": 12084 + }, + { + "epoch": 2.199872576681533, + "grad_norm": 10.0625, + "learning_rate": 3.3366749866344106e-06, + "loss": 0.9765119552612305, + "step": 12086 + }, + { + "epoch": 2.200236643305725, + "grad_norm": 7.5625, + "learning_rate": 3.3355319313943556e-06, + "loss": 1.3023643493652344, + "step": 12088 + }, + { + "epoch": 2.200600709929917, + "grad_norm": 14.5625, + "learning_rate": 3.3343892671541942e-06, + "loss": 1.3609817028045654, + "step": 12090 + }, + { + "epoch": 2.2009647765541094, + "grad_norm": 10.3125, + "learning_rate": 3.3332469940816024e-06, + "loss": 1.764069676399231, + "step": 12092 + }, + { + "epoch": 2.2013288431783016, + "grad_norm": 8.125, + "learning_rate": 3.3321051123442072e-06, + "loss": 1.5629851818084717, + "step": 12094 + }, + { + "epoch": 2.2016929098024938, + "grad_norm": 9.1875, + "learning_rate": 3.3309636221095776e-06, + "loss": 1.4875407218933105, + "step": 12096 + }, + { + "epoch": 2.202056976426686, + "grad_norm": 8.3125, + "learning_rate": 3.3298225235452164e-06, + "loss": 1.309502124786377, + "step": 12098 + }, + { + "epoch": 2.202421043050878, + "grad_norm": 15.5, + "learning_rate": 3.328681816818581e-06, + "loss": 1.3239184617996216, + "step": 12100 + }, + { + "epoch": 2.2027851096750704, + "grad_norm": 2.9375, + "learning_rate": 3.32754150209706e-06, + "loss": 1.223656415939331, + "step": 12102 + }, + { + "epoch": 2.203149176299263, + "grad_norm": 13.1875, + "learning_rate": 3.3264015795479955e-06, + "loss": 1.2184572219848633, + "step": 12104 + }, + { + "epoch": 2.203513242923455, + "grad_norm": 14.1875, + "learning_rate": 3.3252620493386646e-06, + "loss": 1.6346687078475952, + "step": 12106 + }, + { + "epoch": 2.2038773095476474, + "grad_norm": 12.5625, + "learning_rate": 3.3241229116362855e-06, + "loss": 1.3852317333221436, + "step": 12108 + }, + { + "epoch": 2.2042413761718396, + "grad_norm": 53.0, + "learning_rate": 3.3229841666080275e-06, + "loss": 1.761644721031189, + "step": 12110 + }, + { + "epoch": 2.2046054427960318, + "grad_norm": 11.9375, + "learning_rate": 3.321845814420992e-06, + "loss": 1.3510289192199707, + "step": 12112 + }, + { + "epoch": 2.204969509420224, + "grad_norm": 8.6875, + "learning_rate": 3.32070785524223e-06, + "loss": 1.3094415664672852, + "step": 12114 + }, + { + "epoch": 2.205333576044416, + "grad_norm": 28.5, + "learning_rate": 3.319570289238734e-06, + "loss": 0.8440742492675781, + "step": 12116 + }, + { + "epoch": 2.2056976426686083, + "grad_norm": 4.0, + "learning_rate": 3.318433116577433e-06, + "loss": 1.1359409093856812, + "step": 12118 + }, + { + "epoch": 2.2060617092928005, + "grad_norm": 10.3125, + "learning_rate": 3.3172963374252064e-06, + "loss": 0.9074627161026001, + "step": 12120 + }, + { + "epoch": 2.2064257759169927, + "grad_norm": 10.9375, + "learning_rate": 3.316159951948868e-06, + "loss": 1.136393666267395, + "step": 12122 + }, + { + "epoch": 2.206789842541185, + "grad_norm": 17.75, + "learning_rate": 3.315023960315181e-06, + "loss": 1.3155993223190308, + "step": 12124 + }, + { + "epoch": 2.207153909165377, + "grad_norm": 11.4375, + "learning_rate": 3.313888362690848e-06, + "loss": 1.5051074028015137, + "step": 12126 + }, + { + "epoch": 2.2075179757895693, + "grad_norm": 67.0, + "learning_rate": 3.31275315924251e-06, + "loss": 1.341226577758789, + "step": 12128 + }, + { + "epoch": 2.2078820424137615, + "grad_norm": 19.25, + "learning_rate": 3.3116183501367573e-06, + "loss": 1.3257554769515991, + "step": 12130 + }, + { + "epoch": 2.208246109037954, + "grad_norm": 20.75, + "learning_rate": 3.310483935540114e-06, + "loss": 1.3258298635482788, + "step": 12132 + }, + { + "epoch": 2.2086101756621463, + "grad_norm": 9.5, + "learning_rate": 3.3093499156190554e-06, + "loss": 1.315000295639038, + "step": 12134 + }, + { + "epoch": 2.2089742422863385, + "grad_norm": 5.9375, + "learning_rate": 3.3082162905399928e-06, + "loss": 1.1560317277908325, + "step": 12136 + }, + { + "epoch": 2.2093383089105307, + "grad_norm": 5.1875, + "learning_rate": 3.3070830604692796e-06, + "loss": 0.8162409067153931, + "step": 12138 + }, + { + "epoch": 2.209702375534723, + "grad_norm": 6.25, + "learning_rate": 3.3059502255732155e-06, + "loss": 0.9415969252586365, + "step": 12140 + }, + { + "epoch": 2.210066442158915, + "grad_norm": 5.25, + "learning_rate": 3.304817786018035e-06, + "loss": 0.37937045097351074, + "step": 12142 + }, + { + "epoch": 2.2104305087831073, + "grad_norm": 6.46875, + "learning_rate": 3.303685741969923e-06, + "loss": 1.390806794166565, + "step": 12144 + }, + { + "epoch": 2.2107945754072995, + "grad_norm": 16.125, + "learning_rate": 3.3025540935950027e-06, + "loss": 0.951095461845398, + "step": 12146 + }, + { + "epoch": 2.2111586420314917, + "grad_norm": 8.0625, + "learning_rate": 3.301422841059335e-06, + "loss": 1.3348033428192139, + "step": 12148 + }, + { + "epoch": 2.211522708655684, + "grad_norm": 12.5, + "learning_rate": 3.3002919845289315e-06, + "loss": 1.309812307357788, + "step": 12150 + }, + { + "epoch": 2.211886775279876, + "grad_norm": 126.0, + "learning_rate": 3.299161524169736e-06, + "loss": 1.5212230682373047, + "step": 12152 + }, + { + "epoch": 2.2122508419040683, + "grad_norm": 52.25, + "learning_rate": 3.2980314601476423e-06, + "loss": 0.5571275949478149, + "step": 12154 + }, + { + "epoch": 2.2126149085282605, + "grad_norm": 5.78125, + "learning_rate": 3.2969017926284822e-06, + "loss": 1.0546280145645142, + "step": 12156 + }, + { + "epoch": 2.2129789751524527, + "grad_norm": 18.875, + "learning_rate": 3.2957725217780277e-06, + "loss": 1.5623667240142822, + "step": 12158 + }, + { + "epoch": 2.2133430417766453, + "grad_norm": 13.625, + "learning_rate": 3.294643647761999e-06, + "loss": 1.4025273323059082, + "step": 12160 + }, + { + "epoch": 2.2137071084008375, + "grad_norm": 10.5625, + "learning_rate": 3.2935151707460476e-06, + "loss": 1.3843116760253906, + "step": 12162 + }, + { + "epoch": 2.2140711750250297, + "grad_norm": 6.09375, + "learning_rate": 3.292387090895779e-06, + "loss": 1.3283593654632568, + "step": 12164 + }, + { + "epoch": 2.214435241649222, + "grad_norm": 6.90625, + "learning_rate": 3.2912594083767326e-06, + "loss": 1.0081628561019897, + "step": 12166 + }, + { + "epoch": 2.214799308273414, + "grad_norm": 12.6875, + "learning_rate": 3.2901321233543882e-06, + "loss": 1.3083016872406006, + "step": 12168 + }, + { + "epoch": 2.2151633748976063, + "grad_norm": 51.25, + "learning_rate": 3.2890052359941742e-06, + "loss": 1.4367625713348389, + "step": 12170 + }, + { + "epoch": 2.2155274415217985, + "grad_norm": 14.875, + "learning_rate": 3.287878746461453e-06, + "loss": 1.7600932121276855, + "step": 12172 + }, + { + "epoch": 2.2158915081459907, + "grad_norm": 8.0625, + "learning_rate": 3.2867526549215354e-06, + "loss": 1.391483187675476, + "step": 12174 + }, + { + "epoch": 2.216255574770183, + "grad_norm": 10.8125, + "learning_rate": 3.285626961539672e-06, + "loss": 1.356466293334961, + "step": 12176 + }, + { + "epoch": 2.216619641394375, + "grad_norm": 9.8125, + "learning_rate": 3.2845016664810486e-06, + "loss": 1.3807181119918823, + "step": 12178 + }, + { + "epoch": 2.2169837080185673, + "grad_norm": 11.25, + "learning_rate": 3.283376769910803e-06, + "loss": 1.5090417861938477, + "step": 12180 + }, + { + "epoch": 2.2173477746427595, + "grad_norm": 16.25, + "learning_rate": 3.282252271994005e-06, + "loss": 1.2962932586669922, + "step": 12182 + }, + { + "epoch": 2.2177118412669516, + "grad_norm": 8.3125, + "learning_rate": 3.2811281728956733e-06, + "loss": 1.3331317901611328, + "step": 12184 + }, + { + "epoch": 2.2180759078911443, + "grad_norm": 156.0, + "learning_rate": 3.2800044727807645e-06, + "loss": 1.4396144151687622, + "step": 12186 + }, + { + "epoch": 2.2184399745153365, + "grad_norm": 13.75, + "learning_rate": 3.278881171814173e-06, + "loss": 1.1396660804748535, + "step": 12188 + }, + { + "epoch": 2.2188040411395287, + "grad_norm": 9.9375, + "learning_rate": 3.277758270160745e-06, + "loss": 0.6598251461982727, + "step": 12190 + }, + { + "epoch": 2.219168107763721, + "grad_norm": 15.3125, + "learning_rate": 3.276635767985257e-06, + "loss": 0.5037153959274292, + "step": 12192 + }, + { + "epoch": 2.219532174387913, + "grad_norm": 9.5625, + "learning_rate": 3.2755136654524326e-06, + "loss": 1.090395450592041, + "step": 12194 + }, + { + "epoch": 2.2198962410121053, + "grad_norm": 14.625, + "learning_rate": 3.2743919627269396e-06, + "loss": 0.9242769479751587, + "step": 12196 + }, + { + "epoch": 2.2202603076362974, + "grad_norm": 28.75, + "learning_rate": 3.273270659973376e-06, + "loss": 1.5078730583190918, + "step": 12198 + }, + { + "epoch": 2.2206243742604896, + "grad_norm": 12.0625, + "learning_rate": 3.2721497573562955e-06, + "loss": 1.3433661460876465, + "step": 12200 + }, + { + "epoch": 2.220988440884682, + "grad_norm": 14.125, + "learning_rate": 3.271029255040181e-06, + "loss": 1.2620413303375244, + "step": 12202 + }, + { + "epoch": 2.221352507508874, + "grad_norm": 8.0625, + "learning_rate": 3.2699091531894646e-06, + "loss": 1.3057994842529297, + "step": 12204 + }, + { + "epoch": 2.2217165741330662, + "grad_norm": 4.1875, + "learning_rate": 3.268789451968517e-06, + "loss": 1.1626518964767456, + "step": 12206 + }, + { + "epoch": 2.2220806407572584, + "grad_norm": 6.09375, + "learning_rate": 3.267670151541647e-06, + "loss": 1.0821871757507324, + "step": 12208 + }, + { + "epoch": 2.2224447073814506, + "grad_norm": 10.0, + "learning_rate": 3.266551252073111e-06, + "loss": 1.0745364427566528, + "step": 12210 + }, + { + "epoch": 2.2228087740056433, + "grad_norm": 91.5, + "learning_rate": 3.2654327537270984e-06, + "loss": 0.7835872173309326, + "step": 12212 + }, + { + "epoch": 2.2231728406298354, + "grad_norm": 9.3125, + "learning_rate": 3.2643146566677487e-06, + "loss": 1.2483313083648682, + "step": 12214 + }, + { + "epoch": 2.2235369072540276, + "grad_norm": 9.75, + "learning_rate": 3.2631969610591375e-06, + "loss": 1.2677079439163208, + "step": 12216 + }, + { + "epoch": 2.22390097387822, + "grad_norm": 7.40625, + "learning_rate": 3.262079667065279e-06, + "loss": 1.2716532945632935, + "step": 12218 + }, + { + "epoch": 2.224265040502412, + "grad_norm": 28.375, + "learning_rate": 3.260962774850135e-06, + "loss": 1.4709292650222778, + "step": 12220 + }, + { + "epoch": 2.224629107126604, + "grad_norm": 14.3125, + "learning_rate": 3.2598462845776014e-06, + "loss": 1.2574224472045898, + "step": 12222 + }, + { + "epoch": 2.2249931737507964, + "grad_norm": 5.8125, + "learning_rate": 3.2587301964115213e-06, + "loss": 1.220444917678833, + "step": 12224 + }, + { + "epoch": 2.2253572403749886, + "grad_norm": 18.875, + "learning_rate": 3.2576145105156777e-06, + "loss": 0.8707526922225952, + "step": 12226 + }, + { + "epoch": 2.225721306999181, + "grad_norm": 199.0, + "learning_rate": 3.2564992270537878e-06, + "loss": 0.9179031848907471, + "step": 12228 + }, + { + "epoch": 2.226085373623373, + "grad_norm": 82.0, + "learning_rate": 3.2553843461895206e-06, + "loss": 1.598044514656067, + "step": 12230 + }, + { + "epoch": 2.226449440247565, + "grad_norm": 14.25, + "learning_rate": 3.2542698680864747e-06, + "loss": 1.1532329320907593, + "step": 12232 + }, + { + "epoch": 2.2268135068717574, + "grad_norm": 11.4375, + "learning_rate": 3.2531557929082003e-06, + "loss": 1.0540663003921509, + "step": 12234 + }, + { + "epoch": 2.2271775734959496, + "grad_norm": 10.25, + "learning_rate": 3.252042120818182e-06, + "loss": 1.313855767250061, + "step": 12236 + }, + { + "epoch": 2.227541640120142, + "grad_norm": 14.375, + "learning_rate": 3.2509288519798433e-06, + "loss": 1.1382684707641602, + "step": 12238 + }, + { + "epoch": 2.2279057067443344, + "grad_norm": 19.75, + "learning_rate": 3.249815986556557e-06, + "loss": 0.746185302734375, + "step": 12240 + }, + { + "epoch": 2.2282697733685266, + "grad_norm": 11.0, + "learning_rate": 3.248703524711627e-06, + "loss": 0.8475677371025085, + "step": 12242 + }, + { + "epoch": 2.228633839992719, + "grad_norm": 12.5, + "learning_rate": 3.247591466608306e-06, + "loss": 1.4046236276626587, + "step": 12244 + }, + { + "epoch": 2.228997906616911, + "grad_norm": 11.375, + "learning_rate": 3.246479812409784e-06, + "loss": 1.6883646249771118, + "step": 12246 + }, + { + "epoch": 2.229361973241103, + "grad_norm": 60.25, + "learning_rate": 3.2453685622791875e-06, + "loss": 1.3166425228118896, + "step": 12248 + }, + { + "epoch": 2.2297260398652954, + "grad_norm": 8.25, + "learning_rate": 3.2442577163795935e-06, + "loss": 1.2409021854400635, + "step": 12250 + }, + { + "epoch": 2.2300901064894876, + "grad_norm": 13.6875, + "learning_rate": 3.24314727487401e-06, + "loss": 1.3663290739059448, + "step": 12252 + }, + { + "epoch": 2.2304541731136798, + "grad_norm": 19.125, + "learning_rate": 3.242037237925392e-06, + "loss": 1.4763394594192505, + "step": 12254 + }, + { + "epoch": 2.230818239737872, + "grad_norm": 7.375, + "learning_rate": 3.240927605696633e-06, + "loss": 1.2091948986053467, + "step": 12256 + }, + { + "epoch": 2.231182306362064, + "grad_norm": 14.5, + "learning_rate": 3.2398183783505643e-06, + "loss": 1.0823699235916138, + "step": 12258 + }, + { + "epoch": 2.2315463729862564, + "grad_norm": 13.875, + "learning_rate": 3.2387095560499656e-06, + "loss": 1.3707095384597778, + "step": 12260 + }, + { + "epoch": 2.2319104396104485, + "grad_norm": 7.59375, + "learning_rate": 3.2376011389575456e-06, + "loss": 0.7047957181930542, + "step": 12262 + }, + { + "epoch": 2.2322745062346407, + "grad_norm": 14.1875, + "learning_rate": 3.236493127235965e-06, + "loss": 1.3123741149902344, + "step": 12264 + }, + { + "epoch": 2.232638572858833, + "grad_norm": 16.375, + "learning_rate": 3.2353855210478204e-06, + "loss": 0.8632066249847412, + "step": 12266 + }, + { + "epoch": 2.2330026394830256, + "grad_norm": 12.8125, + "learning_rate": 3.234278320555643e-06, + "loss": 1.5546512603759766, + "step": 12268 + }, + { + "epoch": 2.2333667061072178, + "grad_norm": 10.1875, + "learning_rate": 3.2331715259219163e-06, + "loss": 1.3444409370422363, + "step": 12270 + }, + { + "epoch": 2.23373077273141, + "grad_norm": 8.0, + "learning_rate": 3.2320651373090527e-06, + "loss": 1.3481435775756836, + "step": 12272 + }, + { + "epoch": 2.234094839355602, + "grad_norm": 8.0625, + "learning_rate": 3.2309591548794145e-06, + "loss": 1.3942453861236572, + "step": 12274 + }, + { + "epoch": 2.2344589059797944, + "grad_norm": 410.0, + "learning_rate": 3.2298535787952993e-06, + "loss": 0.8943444490432739, + "step": 12276 + }, + { + "epoch": 2.2348229726039865, + "grad_norm": 13.8125, + "learning_rate": 3.2287484092189426e-06, + "loss": 0.44548726081848145, + "step": 12278 + }, + { + "epoch": 2.2351870392281787, + "grad_norm": 6.625, + "learning_rate": 3.2276436463125284e-06, + "loss": 1.2603511810302734, + "step": 12280 + }, + { + "epoch": 2.235551105852371, + "grad_norm": 6.21875, + "learning_rate": 3.226539290238171e-06, + "loss": 0.9752300977706909, + "step": 12282 + }, + { + "epoch": 2.235915172476563, + "grad_norm": 22.875, + "learning_rate": 3.2254353411579346e-06, + "loss": 0.966333270072937, + "step": 12284 + }, + { + "epoch": 2.2362792391007553, + "grad_norm": 23.125, + "learning_rate": 3.2243317992338184e-06, + "loss": 0.6371181607246399, + "step": 12286 + }, + { + "epoch": 2.2366433057249475, + "grad_norm": 55.25, + "learning_rate": 3.22322866462776e-06, + "loss": 1.4175171852111816, + "step": 12288 + }, + { + "epoch": 2.2370073723491397, + "grad_norm": 6.15625, + "learning_rate": 3.2221259375016434e-06, + "loss": 1.317901611328125, + "step": 12290 + }, + { + "epoch": 2.237371438973332, + "grad_norm": 7.46875, + "learning_rate": 3.221023618017286e-06, + "loss": 1.4108023643493652, + "step": 12292 + }, + { + "epoch": 2.2377355055975245, + "grad_norm": 13.4375, + "learning_rate": 3.2199217063364513e-06, + "loss": 1.5835630893707275, + "step": 12294 + }, + { + "epoch": 2.2380995722217167, + "grad_norm": 9.5, + "learning_rate": 3.21882020262084e-06, + "loss": 1.737712025642395, + "step": 12296 + }, + { + "epoch": 2.238463638845909, + "grad_norm": 4.125, + "learning_rate": 3.2177191070320916e-06, + "loss": 0.8339627981185913, + "step": 12298 + }, + { + "epoch": 2.238827705470101, + "grad_norm": 9.875, + "learning_rate": 3.2166184197317914e-06, + "loss": 1.1847567558288574, + "step": 12300 + }, + { + "epoch": 2.2391917720942933, + "grad_norm": 9.4375, + "learning_rate": 3.2155181408814544e-06, + "loss": 1.3000099658966064, + "step": 12302 + }, + { + "epoch": 2.2395558387184855, + "grad_norm": 9.8125, + "learning_rate": 3.2144182706425474e-06, + "loss": 1.4241355657577515, + "step": 12304 + }, + { + "epoch": 2.2399199053426777, + "grad_norm": 4.8125, + "learning_rate": 3.2133188091764706e-06, + "loss": 1.340789556503296, + "step": 12306 + }, + { + "epoch": 2.24028397196687, + "grad_norm": 8.9375, + "learning_rate": 3.2122197566445646e-06, + "loss": 0.9283239841461182, + "step": 12308 + }, + { + "epoch": 2.240648038591062, + "grad_norm": 13.5625, + "learning_rate": 3.211121113208113e-06, + "loss": 1.868080735206604, + "step": 12310 + }, + { + "epoch": 2.2410121052152543, + "grad_norm": 32.5, + "learning_rate": 3.2100228790283327e-06, + "loss": 1.7148187160491943, + "step": 12312 + }, + { + "epoch": 2.2413761718394465, + "grad_norm": 9.625, + "learning_rate": 3.2089250542663897e-06, + "loss": 1.4423878192901611, + "step": 12314 + }, + { + "epoch": 2.2417402384636387, + "grad_norm": 9.6875, + "learning_rate": 3.207827639083384e-06, + "loss": 1.3118674755096436, + "step": 12316 + }, + { + "epoch": 2.242104305087831, + "grad_norm": 27.75, + "learning_rate": 3.206730633640356e-06, + "loss": 1.5447744131088257, + "step": 12318 + }, + { + "epoch": 2.2424683717120235, + "grad_norm": 10.875, + "learning_rate": 3.205634038098289e-06, + "loss": 1.6219093799591064, + "step": 12320 + }, + { + "epoch": 2.2428324383362157, + "grad_norm": 13.3125, + "learning_rate": 3.2045378526181e-06, + "loss": 1.3868744373321533, + "step": 12322 + }, + { + "epoch": 2.243196504960408, + "grad_norm": 11.375, + "learning_rate": 3.203442077360653e-06, + "loss": 1.3873951435089111, + "step": 12324 + }, + { + "epoch": 2.2435605715846, + "grad_norm": 27.625, + "learning_rate": 3.202346712486748e-06, + "loss": 1.4910807609558105, + "step": 12326 + }, + { + "epoch": 2.2439246382087923, + "grad_norm": 19.0, + "learning_rate": 3.2012517581571245e-06, + "loss": 1.3788236379623413, + "step": 12328 + }, + { + "epoch": 2.2442887048329845, + "grad_norm": 24.0, + "learning_rate": 3.200157214532465e-06, + "loss": 1.4303592443466187, + "step": 12330 + }, + { + "epoch": 2.2446527714571767, + "grad_norm": 34.75, + "learning_rate": 3.199063081773385e-06, + "loss": 1.3633836507797241, + "step": 12332 + }, + { + "epoch": 2.245016838081369, + "grad_norm": 16.5, + "learning_rate": 3.197969360040447e-06, + "loss": 1.4099583625793457, + "step": 12334 + }, + { + "epoch": 2.245380904705561, + "grad_norm": 14.5625, + "learning_rate": 3.196876049494151e-06, + "loss": 1.6726154088974, + "step": 12336 + }, + { + "epoch": 2.2457449713297533, + "grad_norm": 16.25, + "learning_rate": 3.195783150294934e-06, + "loss": 1.4227968454360962, + "step": 12338 + }, + { + "epoch": 2.2461090379539455, + "grad_norm": 15.625, + "learning_rate": 3.1946906626031767e-06, + "loss": 1.5397989749908447, + "step": 12340 + }, + { + "epoch": 2.2464731045781376, + "grad_norm": 14.3125, + "learning_rate": 3.1935985865791926e-06, + "loss": 1.360124111175537, + "step": 12342 + }, + { + "epoch": 2.24683717120233, + "grad_norm": 6.1875, + "learning_rate": 3.1925069223832446e-06, + "loss": 1.1941728591918945, + "step": 12344 + }, + { + "epoch": 2.2472012378265225, + "grad_norm": 22.125, + "learning_rate": 3.191415670175527e-06, + "loss": 1.358879566192627, + "step": 12346 + }, + { + "epoch": 2.2475653044507147, + "grad_norm": 5.78125, + "learning_rate": 3.190324830116178e-06, + "loss": 1.4363832473754883, + "step": 12348 + }, + { + "epoch": 2.247929371074907, + "grad_norm": 11.9375, + "learning_rate": 3.1892344023652753e-06, + "loss": 0.8540009260177612, + "step": 12350 + }, + { + "epoch": 2.248293437699099, + "grad_norm": 19.375, + "learning_rate": 3.18814438708283e-06, + "loss": 1.1413938999176025, + "step": 12352 + }, + { + "epoch": 2.2486575043232913, + "grad_norm": 17.625, + "learning_rate": 3.1870547844288026e-06, + "loss": 1.535794734954834, + "step": 12354 + }, + { + "epoch": 2.2490215709474835, + "grad_norm": 9.1875, + "learning_rate": 3.1859655945630846e-06, + "loss": 1.399773359298706, + "step": 12356 + }, + { + "epoch": 2.2493856375716756, + "grad_norm": 8.9375, + "learning_rate": 3.184876817645512e-06, + "loss": 1.3213396072387695, + "step": 12358 + }, + { + "epoch": 2.249749704195868, + "grad_norm": 16.625, + "learning_rate": 3.1837884538358587e-06, + "loss": 1.392475962638855, + "step": 12360 + }, + { + "epoch": 2.25011377082006, + "grad_norm": 10.6875, + "learning_rate": 3.182700503293834e-06, + "loss": 1.2594785690307617, + "step": 12362 + }, + { + "epoch": 2.2504778374442522, + "grad_norm": 13.0625, + "learning_rate": 3.1816129661790945e-06, + "loss": 1.3794629573822021, + "step": 12364 + }, + { + "epoch": 2.2508419040684444, + "grad_norm": 33.0, + "learning_rate": 3.1805258426512297e-06, + "loss": 1.4756104946136475, + "step": 12366 + }, + { + "epoch": 2.2512059706926366, + "grad_norm": 38.5, + "learning_rate": 3.179439132869772e-06, + "loss": 1.2784026861190796, + "step": 12368 + }, + { + "epoch": 2.251570037316829, + "grad_norm": 7.34375, + "learning_rate": 3.1783528369941917e-06, + "loss": 1.567686915397644, + "step": 12370 + }, + { + "epoch": 2.2519341039410214, + "grad_norm": 20.125, + "learning_rate": 3.1772669551838952e-06, + "loss": 1.3300023078918457, + "step": 12372 + }, + { + "epoch": 2.252298170565213, + "grad_norm": 15.9375, + "learning_rate": 3.1761814875982344e-06, + "loss": 1.584209680557251, + "step": 12374 + }, + { + "epoch": 2.252662237189406, + "grad_norm": 7.28125, + "learning_rate": 3.1750964343964964e-06, + "loss": 1.3606492280960083, + "step": 12376 + }, + { + "epoch": 2.253026303813598, + "grad_norm": 10.25, + "learning_rate": 3.174011795737908e-06, + "loss": 1.4643335342407227, + "step": 12378 + }, + { + "epoch": 2.2533903704377902, + "grad_norm": 16.5, + "learning_rate": 3.1729275717816377e-06, + "loss": 1.5258057117462158, + "step": 12380 + }, + { + "epoch": 2.2537544370619824, + "grad_norm": 8.75, + "learning_rate": 3.171843762686786e-06, + "loss": 1.6648608446121216, + "step": 12382 + }, + { + "epoch": 2.2541185036861746, + "grad_norm": 12.0625, + "learning_rate": 3.1707603686124023e-06, + "loss": 1.6886719465255737, + "step": 12384 + }, + { + "epoch": 2.254482570310367, + "grad_norm": 13.9375, + "learning_rate": 3.1696773897174686e-06, + "loss": 1.4433162212371826, + "step": 12386 + }, + { + "epoch": 2.254846636934559, + "grad_norm": 8.625, + "learning_rate": 3.168594826160908e-06, + "loss": 1.3029026985168457, + "step": 12388 + }, + { + "epoch": 2.255210703558751, + "grad_norm": 26.125, + "learning_rate": 3.1675126781015815e-06, + "loss": 1.481706142425537, + "step": 12390 + }, + { + "epoch": 2.2555747701829434, + "grad_norm": 16.25, + "learning_rate": 3.1664309456982904e-06, + "loss": 1.5893282890319824, + "step": 12392 + }, + { + "epoch": 2.2559388368071356, + "grad_norm": 7.53125, + "learning_rate": 3.1653496291097746e-06, + "loss": 1.158312201499939, + "step": 12394 + }, + { + "epoch": 2.2563029034313278, + "grad_norm": 10.1875, + "learning_rate": 3.1642687284947125e-06, + "loss": 1.3891575336456299, + "step": 12396 + }, + { + "epoch": 2.25666697005552, + "grad_norm": 7.5625, + "learning_rate": 3.1631882440117235e-06, + "loss": 1.2034833431243896, + "step": 12398 + }, + { + "epoch": 2.257031036679712, + "grad_norm": 9.3125, + "learning_rate": 3.1621081758193624e-06, + "loss": 1.373058795928955, + "step": 12400 + }, + { + "epoch": 2.257395103303905, + "grad_norm": 12.5, + "learning_rate": 3.161028524076125e-06, + "loss": 1.2438926696777344, + "step": 12402 + }, + { + "epoch": 2.257759169928097, + "grad_norm": 26.25, + "learning_rate": 3.1599492889404472e-06, + "loss": 1.2456939220428467, + "step": 12404 + }, + { + "epoch": 2.258123236552289, + "grad_norm": 12.8125, + "learning_rate": 3.158870470570701e-06, + "loss": 1.1077613830566406, + "step": 12406 + }, + { + "epoch": 2.2584873031764814, + "grad_norm": 7.96875, + "learning_rate": 3.157792069125199e-06, + "loss": 1.30093514919281, + "step": 12408 + }, + { + "epoch": 2.2588513698006736, + "grad_norm": 9.5625, + "learning_rate": 3.156714084762193e-06, + "loss": 1.6580302715301514, + "step": 12410 + }, + { + "epoch": 2.2592154364248658, + "grad_norm": 26.625, + "learning_rate": 3.1556365176398717e-06, + "loss": 1.93913996219635, + "step": 12412 + }, + { + "epoch": 2.259579503049058, + "grad_norm": 14.75, + "learning_rate": 3.1545593679163642e-06, + "loss": 1.3676456212997437, + "step": 12414 + }, + { + "epoch": 2.25994356967325, + "grad_norm": 10.4375, + "learning_rate": 3.1534826357497383e-06, + "loss": 1.4838814735412598, + "step": 12416 + }, + { + "epoch": 2.2603076362974424, + "grad_norm": 7.40625, + "learning_rate": 3.1524063212979998e-06, + "loss": 1.357383131980896, + "step": 12418 + }, + { + "epoch": 2.2606717029216346, + "grad_norm": 5.28125, + "learning_rate": 3.151330424719093e-06, + "loss": 0.8611851930618286, + "step": 12420 + }, + { + "epoch": 2.2610357695458267, + "grad_norm": 25.5, + "learning_rate": 3.150254946170902e-06, + "loss": 0.5798748731613159, + "step": 12422 + }, + { + "epoch": 2.261399836170019, + "grad_norm": 9.875, + "learning_rate": 3.1491798858112484e-06, + "loss": 1.6534793376922607, + "step": 12424 + }, + { + "epoch": 2.261763902794211, + "grad_norm": 9.875, + "learning_rate": 3.1481052437978932e-06, + "loss": 1.1673952341079712, + "step": 12426 + }, + { + "epoch": 2.2621279694184038, + "grad_norm": 12.6875, + "learning_rate": 3.147031020288536e-06, + "loss": 1.3034383058547974, + "step": 12428 + }, + { + "epoch": 2.262492036042596, + "grad_norm": 9.125, + "learning_rate": 3.145957215440814e-06, + "loss": 1.521657943725586, + "step": 12430 + }, + { + "epoch": 2.262856102666788, + "grad_norm": 8.625, + "learning_rate": 3.1448838294123046e-06, + "loss": 1.0947458744049072, + "step": 12432 + }, + { + "epoch": 2.2632201692909804, + "grad_norm": 10.25, + "learning_rate": 3.1438108623605223e-06, + "loss": 1.2592262029647827, + "step": 12434 + }, + { + "epoch": 2.2635842359151725, + "grad_norm": 8.6875, + "learning_rate": 3.1427383144429214e-06, + "loss": 1.372435212135315, + "step": 12436 + }, + { + "epoch": 2.2639483025393647, + "grad_norm": 9.3125, + "learning_rate": 3.1416661858168925e-06, + "loss": 1.3548641204833984, + "step": 12438 + }, + { + "epoch": 2.264312369163557, + "grad_norm": 10.8125, + "learning_rate": 3.1405944766397673e-06, + "loss": 1.4571447372436523, + "step": 12440 + }, + { + "epoch": 2.264676435787749, + "grad_norm": 6.3125, + "learning_rate": 3.1395231870688148e-06, + "loss": 1.2718209028244019, + "step": 12442 + }, + { + "epoch": 2.2650405024119413, + "grad_norm": 7.3125, + "learning_rate": 3.1384523172612417e-06, + "loss": 1.0654772520065308, + "step": 12444 + }, + { + "epoch": 2.2654045690361335, + "grad_norm": 15.3125, + "learning_rate": 3.137381867374195e-06, + "loss": 1.7056589126586914, + "step": 12446 + }, + { + "epoch": 2.2657686356603257, + "grad_norm": 36.75, + "learning_rate": 3.136311837564757e-06, + "loss": 1.3483936786651611, + "step": 12448 + }, + { + "epoch": 2.266132702284518, + "grad_norm": 11.75, + "learning_rate": 3.135242227989952e-06, + "loss": 1.4078763723373413, + "step": 12450 + }, + { + "epoch": 2.26649676890871, + "grad_norm": 5.84375, + "learning_rate": 3.13417303880674e-06, + "loss": 1.3722052574157715, + "step": 12452 + }, + { + "epoch": 2.2668608355329027, + "grad_norm": 12.0625, + "learning_rate": 3.1331042701720203e-06, + "loss": 1.3576308488845825, + "step": 12454 + }, + { + "epoch": 2.267224902157095, + "grad_norm": 32.25, + "learning_rate": 3.1320359222426304e-06, + "loss": 1.8384394645690918, + "step": 12456 + }, + { + "epoch": 2.267588968781287, + "grad_norm": 18.0, + "learning_rate": 3.1309679951753462e-06, + "loss": 1.1462087631225586, + "step": 12458 + }, + { + "epoch": 2.2679530354054793, + "grad_norm": 19.25, + "learning_rate": 3.129900489126882e-06, + "loss": 1.3432759046554565, + "step": 12460 + }, + { + "epoch": 2.2683171020296715, + "grad_norm": 10.3125, + "learning_rate": 3.128833404253889e-06, + "loss": 1.364044189453125, + "step": 12462 + }, + { + "epoch": 2.2686811686538637, + "grad_norm": 10.9375, + "learning_rate": 3.127766740712958e-06, + "loss": 1.4586470127105713, + "step": 12464 + }, + { + "epoch": 2.269045235278056, + "grad_norm": 10.75, + "learning_rate": 3.1267004986606175e-06, + "loss": 1.2342406511306763, + "step": 12466 + }, + { + "epoch": 2.269409301902248, + "grad_norm": 10.625, + "learning_rate": 3.125634678253335e-06, + "loss": 1.669901967048645, + "step": 12468 + }, + { + "epoch": 2.2697733685264403, + "grad_norm": 15.125, + "learning_rate": 3.124569279647514e-06, + "loss": 1.1866904497146606, + "step": 12470 + }, + { + "epoch": 2.2701374351506325, + "grad_norm": 13.5625, + "learning_rate": 3.123504302999499e-06, + "loss": 1.430922269821167, + "step": 12472 + }, + { + "epoch": 2.2705015017748247, + "grad_norm": 12.4375, + "learning_rate": 3.1224397484655693e-06, + "loss": 1.2870221138000488, + "step": 12474 + }, + { + "epoch": 2.270865568399017, + "grad_norm": 15.5, + "learning_rate": 3.121375616201945e-06, + "loss": 1.7282233238220215, + "step": 12476 + }, + { + "epoch": 2.271229635023209, + "grad_norm": 14.125, + "learning_rate": 3.1203119063647834e-06, + "loss": 1.7285444736480713, + "step": 12478 + }, + { + "epoch": 2.2715937016474017, + "grad_norm": 16.75, + "learning_rate": 3.1192486191101782e-06, + "loss": 1.7052456140518188, + "step": 12480 + }, + { + "epoch": 2.2719577682715935, + "grad_norm": 13.625, + "learning_rate": 3.1181857545941647e-06, + "loss": 1.547616720199585, + "step": 12482 + }, + { + "epoch": 2.272321834895786, + "grad_norm": 10.6875, + "learning_rate": 3.117123312972712e-06, + "loss": 0.990469753742218, + "step": 12484 + }, + { + "epoch": 2.2726859015199783, + "grad_norm": 35.5, + "learning_rate": 3.11606129440173e-06, + "loss": 0.9018906354904175, + "step": 12486 + }, + { + "epoch": 2.2730499681441705, + "grad_norm": 9.3125, + "learning_rate": 3.114999699037065e-06, + "loss": 1.4034992456436157, + "step": 12488 + }, + { + "epoch": 2.2734140347683627, + "grad_norm": 7.65625, + "learning_rate": 3.1139385270345035e-06, + "loss": 0.928857684135437, + "step": 12490 + }, + { + "epoch": 2.273778101392555, + "grad_norm": 11.8125, + "learning_rate": 3.1128777785497654e-06, + "loss": 1.351243257522583, + "step": 12492 + }, + { + "epoch": 2.274142168016747, + "grad_norm": 12.375, + "learning_rate": 3.111817453738514e-06, + "loss": 1.8022680282592773, + "step": 12494 + }, + { + "epoch": 2.2745062346409393, + "grad_norm": 13.0625, + "learning_rate": 3.110757552756346e-06, + "loss": 1.4590038061141968, + "step": 12496 + }, + { + "epoch": 2.2748703012651315, + "grad_norm": 15.5625, + "learning_rate": 3.109698075758798e-06, + "loss": 1.4637423753738403, + "step": 12498 + }, + { + "epoch": 2.2752343678893237, + "grad_norm": 30.75, + "learning_rate": 3.1086390229013448e-06, + "loss": 1.5078284740447998, + "step": 12500 + }, + { + "epoch": 2.275598434513516, + "grad_norm": 9.9375, + "learning_rate": 3.1075803943393967e-06, + "loss": 1.9286373853683472, + "step": 12502 + }, + { + "epoch": 2.275962501137708, + "grad_norm": 9.25, + "learning_rate": 3.106522190228304e-06, + "loss": 1.584265947341919, + "step": 12504 + }, + { + "epoch": 2.2763265677619007, + "grad_norm": 9.6875, + "learning_rate": 3.1054644107233535e-06, + "loss": 1.2462928295135498, + "step": 12506 + }, + { + "epoch": 2.2766906343860924, + "grad_norm": 5.40625, + "learning_rate": 3.10440705597977e-06, + "loss": 1.471488356590271, + "step": 12508 + }, + { + "epoch": 2.277054701010285, + "grad_norm": 6.125, + "learning_rate": 3.103350126152716e-06, + "loss": 1.0790464878082275, + "step": 12510 + }, + { + "epoch": 2.2774187676344773, + "grad_norm": 16.625, + "learning_rate": 3.102293621397292e-06, + "loss": 1.2261512279510498, + "step": 12512 + }, + { + "epoch": 2.2777828342586695, + "grad_norm": 9.375, + "learning_rate": 3.101237541868536e-06, + "loss": 1.3318239450454712, + "step": 12514 + }, + { + "epoch": 2.2781469008828616, + "grad_norm": 12.4375, + "learning_rate": 3.100181887721423e-06, + "loss": 1.4968301057815552, + "step": 12516 + }, + { + "epoch": 2.278510967507054, + "grad_norm": 29.0, + "learning_rate": 3.0991266591108664e-06, + "loss": 1.558044195175171, + "step": 12518 + }, + { + "epoch": 2.278875034131246, + "grad_norm": 19.375, + "learning_rate": 3.0980718561917158e-06, + "loss": 1.632189154624939, + "step": 12520 + }, + { + "epoch": 2.2792391007554382, + "grad_norm": 6.9375, + "learning_rate": 3.09701747911876e-06, + "loss": 1.0974186658859253, + "step": 12522 + }, + { + "epoch": 2.2796031673796304, + "grad_norm": 8.5, + "learning_rate": 3.095963528046725e-06, + "loss": 1.3167983293533325, + "step": 12524 + }, + { + "epoch": 2.2799672340038226, + "grad_norm": 7.0, + "learning_rate": 3.094910003130272e-06, + "loss": 1.2831642627716064, + "step": 12526 + }, + { + "epoch": 2.280331300628015, + "grad_norm": 8.1875, + "learning_rate": 3.0938569045240043e-06, + "loss": 0.8829589486122131, + "step": 12528 + }, + { + "epoch": 2.280695367252207, + "grad_norm": 10.6875, + "learning_rate": 3.092804232382457e-06, + "loss": 1.383821964263916, + "step": 12530 + }, + { + "epoch": 2.281059433876399, + "grad_norm": 9.375, + "learning_rate": 3.091751986860107e-06, + "loss": 1.297087550163269, + "step": 12532 + }, + { + "epoch": 2.2814235005005914, + "grad_norm": 15.1875, + "learning_rate": 3.0907001681113667e-06, + "loss": 0.8560866713523865, + "step": 12534 + }, + { + "epoch": 2.281787567124784, + "grad_norm": 9.5, + "learning_rate": 3.089648776290587e-06, + "loss": 1.5389878749847412, + "step": 12536 + }, + { + "epoch": 2.2821516337489762, + "grad_norm": 12.5625, + "learning_rate": 3.0885978115520543e-06, + "loss": 1.2532103061676025, + "step": 12538 + }, + { + "epoch": 2.2825157003731684, + "grad_norm": 9.625, + "learning_rate": 3.087547274049994e-06, + "loss": 1.2819963693618774, + "step": 12540 + }, + { + "epoch": 2.2828797669973606, + "grad_norm": 11.5, + "learning_rate": 3.0864971639385684e-06, + "loss": 1.4069114923477173, + "step": 12542 + }, + { + "epoch": 2.283243833621553, + "grad_norm": 9.8125, + "learning_rate": 3.085447481371876e-06, + "loss": 1.3645291328430176, + "step": 12544 + }, + { + "epoch": 2.283607900245745, + "grad_norm": 10.0625, + "learning_rate": 3.084398226503954e-06, + "loss": 1.4025938510894775, + "step": 12546 + }, + { + "epoch": 2.283971966869937, + "grad_norm": 15.75, + "learning_rate": 3.083349399488777e-06, + "loss": 1.1785614490509033, + "step": 12548 + }, + { + "epoch": 2.2843360334941294, + "grad_norm": 23.25, + "learning_rate": 3.082301000480255e-06, + "loss": 1.0558698177337646, + "step": 12550 + }, + { + "epoch": 2.2847001001183216, + "grad_norm": 5.375, + "learning_rate": 3.0812530296322366e-06, + "loss": 1.221854329109192, + "step": 12552 + }, + { + "epoch": 2.285064166742514, + "grad_norm": 12.4375, + "learning_rate": 3.080205487098508e-06, + "loss": 1.270862102508545, + "step": 12554 + }, + { + "epoch": 2.285428233366706, + "grad_norm": 7.8125, + "learning_rate": 3.0791583730327914e-06, + "loss": 1.4056766033172607, + "step": 12556 + }, + { + "epoch": 2.285792299990898, + "grad_norm": 8.375, + "learning_rate": 3.078111687588746e-06, + "loss": 1.4692795276641846, + "step": 12558 + }, + { + "epoch": 2.2861563666150904, + "grad_norm": 16.375, + "learning_rate": 3.077065430919969e-06, + "loss": 1.06184720993042, + "step": 12560 + }, + { + "epoch": 2.286520433239283, + "grad_norm": 15.8125, + "learning_rate": 3.0760196031799944e-06, + "loss": 1.2931253910064697, + "step": 12562 + }, + { + "epoch": 2.286884499863475, + "grad_norm": 9.75, + "learning_rate": 3.0749742045222934e-06, + "loss": 1.2624245882034302, + "step": 12564 + }, + { + "epoch": 2.2872485664876674, + "grad_norm": 8.125, + "learning_rate": 3.073929235100274e-06, + "loss": 1.5429012775421143, + "step": 12566 + }, + { + "epoch": 2.2876126331118596, + "grad_norm": 20.375, + "learning_rate": 3.072884695067281e-06, + "loss": 1.2527109384536743, + "step": 12568 + }, + { + "epoch": 2.2879766997360518, + "grad_norm": 18.0, + "learning_rate": 3.071840584576596e-06, + "loss": 1.5365409851074219, + "step": 12570 + }, + { + "epoch": 2.288340766360244, + "grad_norm": 7.6875, + "learning_rate": 3.070796903781439e-06, + "loss": 1.2739225625991821, + "step": 12572 + }, + { + "epoch": 2.288704832984436, + "grad_norm": 18.875, + "learning_rate": 3.0697536528349642e-06, + "loss": 0.9951099157333374, + "step": 12574 + }, + { + "epoch": 2.2890688996086284, + "grad_norm": 7.84375, + "learning_rate": 3.068710831890267e-06, + "loss": 1.1309419870376587, + "step": 12576 + }, + { + "epoch": 2.2894329662328206, + "grad_norm": 4.28125, + "learning_rate": 3.0676684411003747e-06, + "loss": 1.3102229833602905, + "step": 12578 + }, + { + "epoch": 2.2897970328570127, + "grad_norm": 10.1875, + "learning_rate": 3.0666264806182556e-06, + "loss": 1.0711464881896973, + "step": 12580 + }, + { + "epoch": 2.290161099481205, + "grad_norm": 9.5, + "learning_rate": 3.065584950596812e-06, + "loss": 1.3585255146026611, + "step": 12582 + }, + { + "epoch": 2.290525166105397, + "grad_norm": 17.75, + "learning_rate": 3.0645438511888854e-06, + "loss": 1.3575489521026611, + "step": 12584 + }, + { + "epoch": 2.2908892327295893, + "grad_norm": 7.9375, + "learning_rate": 3.0635031825472518e-06, + "loss": 1.4524719715118408, + "step": 12586 + }, + { + "epoch": 2.291253299353782, + "grad_norm": 12.0, + "learning_rate": 3.0624629448246257e-06, + "loss": 1.2567143440246582, + "step": 12588 + }, + { + "epoch": 2.291617365977974, + "grad_norm": 13.8125, + "learning_rate": 3.0614231381736577e-06, + "loss": 1.0736030340194702, + "step": 12590 + }, + { + "epoch": 2.2919814326021664, + "grad_norm": 17.0, + "learning_rate": 3.0603837627469345e-06, + "loss": 0.522739589214325, + "step": 12592 + }, + { + "epoch": 2.2923454992263586, + "grad_norm": 7.6875, + "learning_rate": 3.059344818696982e-06, + "loss": 1.1999160051345825, + "step": 12594 + }, + { + "epoch": 2.2927095658505507, + "grad_norm": 10.8125, + "learning_rate": 3.0583063061762597e-06, + "loss": 1.4812519550323486, + "step": 12596 + }, + { + "epoch": 2.293073632474743, + "grad_norm": 13.3125, + "learning_rate": 3.057268225337165e-06, + "loss": 1.3010470867156982, + "step": 12598 + }, + { + "epoch": 2.293437699098935, + "grad_norm": 7.4375, + "learning_rate": 3.0562305763320327e-06, + "loss": 1.275359869003296, + "step": 12600 + }, + { + "epoch": 2.2938017657231273, + "grad_norm": 9.5625, + "learning_rate": 3.055193359313133e-06, + "loss": 1.4190359115600586, + "step": 12602 + }, + { + "epoch": 2.2941658323473195, + "grad_norm": 25.125, + "learning_rate": 3.0541565744326735e-06, + "loss": 1.3066437244415283, + "step": 12604 + }, + { + "epoch": 2.2945298989715117, + "grad_norm": 25.625, + "learning_rate": 3.053120221842798e-06, + "loss": 1.190384864807129, + "step": 12606 + }, + { + "epoch": 2.294893965595704, + "grad_norm": 10.0, + "learning_rate": 3.052084301695588e-06, + "loss": 1.2210527658462524, + "step": 12608 + }, + { + "epoch": 2.295258032219896, + "grad_norm": 7.21875, + "learning_rate": 3.0510488141430596e-06, + "loss": 1.2423650026321411, + "step": 12610 + }, + { + "epoch": 2.2956220988440883, + "grad_norm": 18.625, + "learning_rate": 3.0500137593371666e-06, + "loss": 1.2772867679595947, + "step": 12612 + }, + { + "epoch": 2.295986165468281, + "grad_norm": 7.96875, + "learning_rate": 3.048979137429799e-06, + "loss": 1.1831212043762207, + "step": 12614 + }, + { + "epoch": 2.2963502320924727, + "grad_norm": 11.0, + "learning_rate": 3.047944948572783e-06, + "loss": 1.4668102264404297, + "step": 12616 + }, + { + "epoch": 2.2967142987166653, + "grad_norm": 10.0625, + "learning_rate": 3.046911192917883e-06, + "loss": 1.7448139190673828, + "step": 12618 + }, + { + "epoch": 2.2970783653408575, + "grad_norm": 8.375, + "learning_rate": 3.045877870616798e-06, + "loss": 1.0120562314987183, + "step": 12620 + }, + { + "epoch": 2.2974424319650497, + "grad_norm": 15.4375, + "learning_rate": 3.044844981821162e-06, + "loss": 1.180618166923523, + "step": 12622 + }, + { + "epoch": 2.297806498589242, + "grad_norm": 14.125, + "learning_rate": 3.04381252668255e-06, + "loss": 0.08765455335378647, + "step": 12624 + }, + { + "epoch": 2.298170565213434, + "grad_norm": 34.25, + "learning_rate": 3.0427805053524697e-06, + "loss": 0.3928592801094055, + "step": 12626 + }, + { + "epoch": 2.2985346318376263, + "grad_norm": 15.1875, + "learning_rate": 3.0417489179823646e-06, + "loss": 1.3172202110290527, + "step": 12628 + }, + { + "epoch": 2.2988986984618185, + "grad_norm": 24.75, + "learning_rate": 3.0407177647236173e-06, + "loss": 0.9428372979164124, + "step": 12630 + }, + { + "epoch": 2.2992627650860107, + "grad_norm": 8.3125, + "learning_rate": 3.039687045727545e-06, + "loss": 1.5005693435668945, + "step": 12632 + }, + { + "epoch": 2.299626831710203, + "grad_norm": 6.875, + "learning_rate": 3.038656761145402e-06, + "loss": 1.377175211906433, + "step": 12634 + }, + { + "epoch": 2.299990898334395, + "grad_norm": 9.875, + "learning_rate": 3.037626911128378e-06, + "loss": 0.8938100934028625, + "step": 12636 + }, + { + "epoch": 2.3003549649585873, + "grad_norm": 9.9375, + "learning_rate": 3.036597495827599e-06, + "loss": 1.5012824535369873, + "step": 12638 + }, + { + "epoch": 2.3007190315827795, + "grad_norm": 11.375, + "learning_rate": 3.0355685153941283e-06, + "loss": 1.2269644737243652, + "step": 12640 + }, + { + "epoch": 2.3010830982069717, + "grad_norm": 13.9375, + "learning_rate": 3.0345399699789642e-06, + "loss": 0.9781918525695801, + "step": 12642 + }, + { + "epoch": 2.3014471648311643, + "grad_norm": 14.0625, + "learning_rate": 3.033511859733041e-06, + "loss": 1.3739736080169678, + "step": 12644 + }, + { + "epoch": 2.3018112314553565, + "grad_norm": 10.625, + "learning_rate": 3.0324841848072304e-06, + "loss": 1.2351527214050293, + "step": 12646 + }, + { + "epoch": 2.3021752980795487, + "grad_norm": 54.0, + "learning_rate": 3.03145694535234e-06, + "loss": 0.5533649325370789, + "step": 12648 + }, + { + "epoch": 2.302539364703741, + "grad_norm": 22.125, + "learning_rate": 3.0304301415191115e-06, + "loss": 1.7622896432876587, + "step": 12650 + }, + { + "epoch": 2.302903431327933, + "grad_norm": 148.0, + "learning_rate": 3.029403773458226e-06, + "loss": 1.3319618701934814, + "step": 12652 + }, + { + "epoch": 2.3032674979521253, + "grad_norm": 21.125, + "learning_rate": 3.0283778413202975e-06, + "loss": 1.4163382053375244, + "step": 12654 + }, + { + "epoch": 2.3036315645763175, + "grad_norm": 12.625, + "learning_rate": 3.027352345255878e-06, + "loss": 1.3314707279205322, + "step": 12656 + }, + { + "epoch": 2.3039956312005097, + "grad_norm": 15.875, + "learning_rate": 3.0263272854154548e-06, + "loss": 1.3248398303985596, + "step": 12658 + }, + { + "epoch": 2.304359697824702, + "grad_norm": 9.5, + "learning_rate": 3.025302661949451e-06, + "loss": 0.838480532169342, + "step": 12660 + }, + { + "epoch": 2.304723764448894, + "grad_norm": 18.375, + "learning_rate": 3.024278475008226e-06, + "loss": 1.0200480222702026, + "step": 12662 + }, + { + "epoch": 2.3050878310730862, + "grad_norm": 5.75, + "learning_rate": 3.023254724742075e-06, + "loss": 0.44165003299713135, + "step": 12664 + }, + { + "epoch": 2.3054518976972784, + "grad_norm": 7.1875, + "learning_rate": 3.022231411301231e-06, + "loss": 1.2833613157272339, + "step": 12666 + }, + { + "epoch": 2.3058159643214706, + "grad_norm": 14.75, + "learning_rate": 3.021208534835858e-06, + "loss": 1.2190918922424316, + "step": 12668 + }, + { + "epoch": 2.3061800309456633, + "grad_norm": 8.125, + "learning_rate": 3.020186095496061e-06, + "loss": 1.5317522287368774, + "step": 12670 + }, + { + "epoch": 2.3065440975698555, + "grad_norm": 5.09375, + "learning_rate": 3.0191640934318783e-06, + "loss": 0.9419819116592407, + "step": 12672 + }, + { + "epoch": 2.3069081641940477, + "grad_norm": 9.9375, + "learning_rate": 3.0181425287932845e-06, + "loss": 1.2479746341705322, + "step": 12674 + }, + { + "epoch": 2.30727223081824, + "grad_norm": 9.375, + "learning_rate": 3.017121401730191e-06, + "loss": 1.3125934600830078, + "step": 12676 + }, + { + "epoch": 2.307636297442432, + "grad_norm": 12.9375, + "learning_rate": 3.0161007123924425e-06, + "loss": 1.241598129272461, + "step": 12678 + }, + { + "epoch": 2.3080003640666242, + "grad_norm": 12.0, + "learning_rate": 3.015080460929822e-06, + "loss": 1.119701862335205, + "step": 12680 + }, + { + "epoch": 2.3083644306908164, + "grad_norm": 77.5, + "learning_rate": 3.014060647492047e-06, + "loss": 1.2777540683746338, + "step": 12682 + }, + { + "epoch": 2.3087284973150086, + "grad_norm": 40.5, + "learning_rate": 3.0130412722287705e-06, + "loss": 1.179136872291565, + "step": 12684 + }, + { + "epoch": 2.309092563939201, + "grad_norm": 22.875, + "learning_rate": 3.0120223352895827e-06, + "loss": 0.6506235003471375, + "step": 12686 + }, + { + "epoch": 2.309456630563393, + "grad_norm": 8.375, + "learning_rate": 3.011003836824008e-06, + "loss": 1.0332425832748413, + "step": 12688 + }, + { + "epoch": 2.309820697187585, + "grad_norm": 9.4375, + "learning_rate": 3.009985776981507e-06, + "loss": 1.3676140308380127, + "step": 12690 + }, + { + "epoch": 2.3101847638117774, + "grad_norm": 18.125, + "learning_rate": 3.0089681559114758e-06, + "loss": 1.4909124374389648, + "step": 12692 + }, + { + "epoch": 2.3105488304359696, + "grad_norm": 14.5625, + "learning_rate": 3.0079509737632455e-06, + "loss": 1.3306161165237427, + "step": 12694 + }, + { + "epoch": 2.3109128970601622, + "grad_norm": 6.96875, + "learning_rate": 3.006934230686085e-06, + "loss": 1.2349447011947632, + "step": 12696 + }, + { + "epoch": 2.3112769636843544, + "grad_norm": 23.875, + "learning_rate": 3.005917926829196e-06, + "loss": 1.1286929845809937, + "step": 12698 + }, + { + "epoch": 2.3116410303085466, + "grad_norm": 15.5, + "learning_rate": 3.0049020623417167e-06, + "loss": 1.0020360946655273, + "step": 12700 + }, + { + "epoch": 2.312005096932739, + "grad_norm": 6.78125, + "learning_rate": 3.0038866373727223e-06, + "loss": 1.217538595199585, + "step": 12702 + }, + { + "epoch": 2.312369163556931, + "grad_norm": 8.75, + "learning_rate": 3.0028716520712212e-06, + "loss": 1.3113577365875244, + "step": 12704 + }, + { + "epoch": 2.312733230181123, + "grad_norm": 9.6875, + "learning_rate": 3.001857106586159e-06, + "loss": 1.1445388793945312, + "step": 12706 + }, + { + "epoch": 2.3130972968053154, + "grad_norm": 7.125, + "learning_rate": 3.0008430010664164e-06, + "loss": 1.1376641988754272, + "step": 12708 + }, + { + "epoch": 2.3134613634295076, + "grad_norm": 14.0, + "learning_rate": 2.999829335660809e-06, + "loss": 1.436424970626831, + "step": 12710 + }, + { + "epoch": 2.3138254300537, + "grad_norm": 19.375, + "learning_rate": 2.9988161105180875e-06, + "loss": 1.5562865734100342, + "step": 12712 + }, + { + "epoch": 2.314189496677892, + "grad_norm": 19.75, + "learning_rate": 2.997803325786939e-06, + "loss": 1.3501567840576172, + "step": 12714 + }, + { + "epoch": 2.314553563302084, + "grad_norm": 9.375, + "learning_rate": 2.9967909816159857e-06, + "loss": 1.3954524993896484, + "step": 12716 + }, + { + "epoch": 2.3149176299262764, + "grad_norm": 6.4375, + "learning_rate": 2.995779078153785e-06, + "loss": 1.108249545097351, + "step": 12718 + }, + { + "epoch": 2.3152816965504686, + "grad_norm": 13.25, + "learning_rate": 2.99476761554883e-06, + "loss": 1.3637020587921143, + "step": 12720 + }, + { + "epoch": 2.315645763174661, + "grad_norm": 8.375, + "learning_rate": 2.9937565939495472e-06, + "loss": 1.3312394618988037, + "step": 12722 + }, + { + "epoch": 2.316009829798853, + "grad_norm": 8.5, + "learning_rate": 2.992746013504302e-06, + "loss": 1.0356110334396362, + "step": 12724 + }, + { + "epoch": 2.3163738964230456, + "grad_norm": 7.90625, + "learning_rate": 2.9917358743613913e-06, + "loss": 1.2071000337600708, + "step": 12726 + }, + { + "epoch": 2.316737963047238, + "grad_norm": 10.5, + "learning_rate": 2.99072617666905e-06, + "loss": 1.4758578538894653, + "step": 12728 + }, + { + "epoch": 2.31710202967143, + "grad_norm": 71.5, + "learning_rate": 2.9897169205754466e-06, + "loss": 1.6389625072479248, + "step": 12730 + }, + { + "epoch": 2.317466096295622, + "grad_norm": 20.75, + "learning_rate": 2.9887081062286856e-06, + "loss": 1.6999410390853882, + "step": 12732 + }, + { + "epoch": 2.3178301629198144, + "grad_norm": 53.5, + "learning_rate": 2.987699733776806e-06, + "loss": 1.8977272510528564, + "step": 12734 + }, + { + "epoch": 2.3181942295440066, + "grad_norm": 5.25, + "learning_rate": 2.9866918033677827e-06, + "loss": 1.2279573678970337, + "step": 12736 + }, + { + "epoch": 2.3185582961681988, + "grad_norm": 8.5, + "learning_rate": 2.985684315149526e-06, + "loss": 1.0158472061157227, + "step": 12738 + }, + { + "epoch": 2.318922362792391, + "grad_norm": 9.1875, + "learning_rate": 2.9846772692698795e-06, + "loss": 1.4386669397354126, + "step": 12740 + }, + { + "epoch": 2.319286429416583, + "grad_norm": 17.25, + "learning_rate": 2.9836706658766233e-06, + "loss": 1.2104482650756836, + "step": 12742 + }, + { + "epoch": 2.3196504960407753, + "grad_norm": 10.375, + "learning_rate": 2.982664505117474e-06, + "loss": 1.4086564779281616, + "step": 12744 + }, + { + "epoch": 2.3200145626649675, + "grad_norm": 15.8125, + "learning_rate": 2.9816587871400796e-06, + "loss": 1.1924282312393188, + "step": 12746 + }, + { + "epoch": 2.3203786292891597, + "grad_norm": 11.375, + "learning_rate": 2.9806535120920268e-06, + "loss": 1.2032430171966553, + "step": 12748 + }, + { + "epoch": 2.320742695913352, + "grad_norm": 51.5, + "learning_rate": 2.9796486801208337e-06, + "loss": 1.0400818586349487, + "step": 12750 + }, + { + "epoch": 2.3211067625375446, + "grad_norm": 12.25, + "learning_rate": 2.9786442913739566e-06, + "loss": 1.5251060724258423, + "step": 12752 + }, + { + "epoch": 2.3214708291617367, + "grad_norm": 9.0625, + "learning_rate": 2.977640345998785e-06, + "loss": 1.3160400390625, + "step": 12754 + }, + { + "epoch": 2.321834895785929, + "grad_norm": 3.609375, + "learning_rate": 2.976636844142645e-06, + "loss": 1.206324577331543, + "step": 12756 + }, + { + "epoch": 2.322198962410121, + "grad_norm": 17.375, + "learning_rate": 2.9756337859527943e-06, + "loss": 1.3376787900924683, + "step": 12758 + }, + { + "epoch": 2.3225630290343133, + "grad_norm": 12.375, + "learning_rate": 2.9746311715764296e-06, + "loss": 1.088538408279419, + "step": 12760 + }, + { + "epoch": 2.3229270956585055, + "grad_norm": 16.75, + "learning_rate": 2.97362900116068e-06, + "loss": 0.3987538814544678, + "step": 12762 + }, + { + "epoch": 2.3232911622826977, + "grad_norm": 9.0625, + "learning_rate": 2.9726272748526087e-06, + "loss": 1.2419790029525757, + "step": 12764 + }, + { + "epoch": 2.32365522890689, + "grad_norm": 9.0625, + "learning_rate": 2.9716259927992166e-06, + "loss": 1.496699571609497, + "step": 12766 + }, + { + "epoch": 2.324019295531082, + "grad_norm": 27.125, + "learning_rate": 2.9706251551474374e-06, + "loss": 1.4162895679473877, + "step": 12768 + }, + { + "epoch": 2.3243833621552743, + "grad_norm": 13.0625, + "learning_rate": 2.9696247620441386e-06, + "loss": 1.6553939580917358, + "step": 12770 + }, + { + "epoch": 2.3247474287794665, + "grad_norm": 9.125, + "learning_rate": 2.9686248136361264e-06, + "loss": 1.430681586265564, + "step": 12772 + }, + { + "epoch": 2.3251114954036587, + "grad_norm": 12.0, + "learning_rate": 2.9676253100701367e-06, + "loss": 1.151663064956665, + "step": 12774 + }, + { + "epoch": 2.325475562027851, + "grad_norm": 16.125, + "learning_rate": 2.966626251492844e-06, + "loss": 1.6114152669906616, + "step": 12776 + }, + { + "epoch": 2.3258396286520435, + "grad_norm": 20.625, + "learning_rate": 2.965627638050855e-06, + "loss": 1.8489385843276978, + "step": 12778 + }, + { + "epoch": 2.3262036952762357, + "grad_norm": 14.8125, + "learning_rate": 2.964629469890714e-06, + "loss": 1.5465797185897827, + "step": 12780 + }, + { + "epoch": 2.326567761900428, + "grad_norm": 12.5, + "learning_rate": 2.9636317471588966e-06, + "loss": 1.2140628099441528, + "step": 12782 + }, + { + "epoch": 2.32693182852462, + "grad_norm": 17.125, + "learning_rate": 2.962634470001815e-06, + "loss": 0.848121166229248, + "step": 12784 + }, + { + "epoch": 2.3272958951488123, + "grad_norm": 19.75, + "learning_rate": 2.961637638565815e-06, + "loss": 0.6174255609512329, + "step": 12786 + }, + { + "epoch": 2.3276599617730045, + "grad_norm": 58.25, + "learning_rate": 2.9606412529971782e-06, + "loss": 1.7457401752471924, + "step": 12788 + }, + { + "epoch": 2.3280240283971967, + "grad_norm": 10.75, + "learning_rate": 2.959645313442121e-06, + "loss": 1.3340728282928467, + "step": 12790 + }, + { + "epoch": 2.328388095021389, + "grad_norm": 11.625, + "learning_rate": 2.9586498200467925e-06, + "loss": 1.3564493656158447, + "step": 12792 + }, + { + "epoch": 2.328752161645581, + "grad_norm": 8.3125, + "learning_rate": 2.9576547729572763e-06, + "loss": 1.2549247741699219, + "step": 12794 + }, + { + "epoch": 2.3291162282697733, + "grad_norm": 21.0, + "learning_rate": 2.956660172319593e-06, + "loss": 1.0571786165237427, + "step": 12796 + }, + { + "epoch": 2.3294802948939655, + "grad_norm": 51.25, + "learning_rate": 2.9556660182796963e-06, + "loss": 0.8457317352294922, + "step": 12798 + }, + { + "epoch": 2.3298443615181577, + "grad_norm": 27.875, + "learning_rate": 2.9546723109834734e-06, + "loss": 1.0769550800323486, + "step": 12800 + }, + { + "epoch": 2.33020842814235, + "grad_norm": 26.375, + "learning_rate": 2.9536790505767475e-06, + "loss": 1.6301898956298828, + "step": 12802 + }, + { + "epoch": 2.3305724947665425, + "grad_norm": 23.375, + "learning_rate": 2.9526862372052755e-06, + "loss": 1.5576831102371216, + "step": 12804 + }, + { + "epoch": 2.3309365613907347, + "grad_norm": 14.1875, + "learning_rate": 2.951693871014748e-06, + "loss": 1.3612929582595825, + "step": 12806 + }, + { + "epoch": 2.331300628014927, + "grad_norm": 16.375, + "learning_rate": 2.950701952150791e-06, + "loss": 1.789056658744812, + "step": 12808 + }, + { + "epoch": 2.331664694639119, + "grad_norm": 9.8125, + "learning_rate": 2.9497104807589655e-06, + "loss": 1.3118585348129272, + "step": 12810 + }, + { + "epoch": 2.3320287612633113, + "grad_norm": 19.25, + "learning_rate": 2.948719456984765e-06, + "loss": 1.1820694208145142, + "step": 12812 + }, + { + "epoch": 2.3323928278875035, + "grad_norm": 126.0, + "learning_rate": 2.947728880973618e-06, + "loss": 0.7172834873199463, + "step": 12814 + }, + { + "epoch": 2.3327568945116957, + "grad_norm": 2.4375, + "learning_rate": 2.9467387528708884e-06, + "loss": 0.8870652914047241, + "step": 12816 + }, + { + "epoch": 2.333120961135888, + "grad_norm": 8.25, + "learning_rate": 2.945749072821873e-06, + "loss": 1.0431147813796997, + "step": 12818 + }, + { + "epoch": 2.33348502776008, + "grad_norm": 8.375, + "learning_rate": 2.944759840971803e-06, + "loss": 1.4970178604125977, + "step": 12820 + }, + { + "epoch": 2.3338490943842722, + "grad_norm": 20.25, + "learning_rate": 2.9437710574658453e-06, + "loss": 1.2170723676681519, + "step": 12822 + }, + { + "epoch": 2.3342131610084644, + "grad_norm": 16.75, + "learning_rate": 2.9427827224490984e-06, + "loss": 1.045131802558899, + "step": 12824 + }, + { + "epoch": 2.3345772276326566, + "grad_norm": 14.5, + "learning_rate": 2.941794836066598e-06, + "loss": 1.7131435871124268, + "step": 12826 + }, + { + "epoch": 2.334941294256849, + "grad_norm": 3.328125, + "learning_rate": 2.94080739846331e-06, + "loss": 0.9052638411521912, + "step": 12828 + }, + { + "epoch": 2.3353053608810415, + "grad_norm": 15.5, + "learning_rate": 2.93982040978414e-06, + "loss": 1.2864320278167725, + "step": 12830 + }, + { + "epoch": 2.335669427505233, + "grad_norm": 8.8125, + "learning_rate": 2.938833870173922e-06, + "loss": 1.7033592462539673, + "step": 12832 + }, + { + "epoch": 2.336033494129426, + "grad_norm": 8.5, + "learning_rate": 2.9378477797774287e-06, + "loss": 1.1344698667526245, + "step": 12834 + }, + { + "epoch": 2.336397560753618, + "grad_norm": 12.875, + "learning_rate": 2.936862138739363e-06, + "loss": 1.5222827196121216, + "step": 12836 + }, + { + "epoch": 2.3367616273778102, + "grad_norm": 15.1875, + "learning_rate": 2.9358769472043654e-06, + "loss": 1.5892690420150757, + "step": 12838 + }, + { + "epoch": 2.3371256940020024, + "grad_norm": 12.9375, + "learning_rate": 2.9348922053170076e-06, + "loss": 1.3002058267593384, + "step": 12840 + }, + { + "epoch": 2.3374897606261946, + "grad_norm": 40.5, + "learning_rate": 2.933907913221796e-06, + "loss": 1.4518928527832031, + "step": 12842 + }, + { + "epoch": 2.337853827250387, + "grad_norm": 16.875, + "learning_rate": 2.932924071063174e-06, + "loss": 1.7592427730560303, + "step": 12844 + }, + { + "epoch": 2.338217893874579, + "grad_norm": 15.0625, + "learning_rate": 2.931940678985514e-06, + "loss": 1.122901439666748, + "step": 12846 + }, + { + "epoch": 2.338581960498771, + "grad_norm": 63.5, + "learning_rate": 2.9309577371331255e-06, + "loss": 0.7378765344619751, + "step": 12848 + }, + { + "epoch": 2.3389460271229634, + "grad_norm": 18.75, + "learning_rate": 2.9299752456502517e-06, + "loss": 1.4323711395263672, + "step": 12850 + }, + { + "epoch": 2.3393100937471556, + "grad_norm": 40.25, + "learning_rate": 2.928993204681068e-06, + "loss": 1.6694300174713135, + "step": 12852 + }, + { + "epoch": 2.339674160371348, + "grad_norm": 6.75, + "learning_rate": 2.928011614369687e-06, + "loss": 1.1885361671447754, + "step": 12854 + }, + { + "epoch": 2.3400382269955404, + "grad_norm": 16.875, + "learning_rate": 2.927030474860151e-06, + "loss": 1.1138637065887451, + "step": 12856 + }, + { + "epoch": 2.340402293619732, + "grad_norm": 19.5, + "learning_rate": 2.92604978629644e-06, + "loss": 1.1558057069778442, + "step": 12858 + }, + { + "epoch": 2.340766360243925, + "grad_norm": 10.75, + "learning_rate": 2.9250695488224646e-06, + "loss": 1.2405270338058472, + "step": 12860 + }, + { + "epoch": 2.341130426868117, + "grad_norm": 21.625, + "learning_rate": 2.9240897625820713e-06, + "loss": 1.6242928504943848, + "step": 12862 + }, + { + "epoch": 2.341494493492309, + "grad_norm": 21.625, + "learning_rate": 2.92311042771904e-06, + "loss": 1.9920815229415894, + "step": 12864 + }, + { + "epoch": 2.3418585601165014, + "grad_norm": 7.75, + "learning_rate": 2.922131544377084e-06, + "loss": 1.305228352546692, + "step": 12866 + }, + { + "epoch": 2.3422226267406936, + "grad_norm": 13.5, + "learning_rate": 2.92115311269985e-06, + "loss": 1.2778526544570923, + "step": 12868 + }, + { + "epoch": 2.342586693364886, + "grad_norm": 22.5, + "learning_rate": 2.92017513283092e-06, + "loss": 2.2170212268829346, + "step": 12870 + }, + { + "epoch": 2.342950759989078, + "grad_norm": 31.625, + "learning_rate": 2.9191976049138064e-06, + "loss": 1.1426204442977905, + "step": 12872 + }, + { + "epoch": 2.34331482661327, + "grad_norm": 26.625, + "learning_rate": 2.9182205290919595e-06, + "loss": 0.8540529608726501, + "step": 12874 + }, + { + "epoch": 2.3436788932374624, + "grad_norm": 15.75, + "learning_rate": 2.9172439055087616e-06, + "loss": 1.4425444602966309, + "step": 12876 + }, + { + "epoch": 2.3440429598616546, + "grad_norm": 23.25, + "learning_rate": 2.916267734307526e-06, + "loss": 1.4816980361938477, + "step": 12878 + }, + { + "epoch": 2.3444070264858468, + "grad_norm": 20.25, + "learning_rate": 2.9152920156315035e-06, + "loss": 1.3841744661331177, + "step": 12880 + }, + { + "epoch": 2.344771093110039, + "grad_norm": 21.75, + "learning_rate": 2.9143167496238765e-06, + "loss": 1.5072520971298218, + "step": 12882 + }, + { + "epoch": 2.345135159734231, + "grad_norm": 7.03125, + "learning_rate": 2.9133419364277616e-06, + "loss": 1.3918832540512085, + "step": 12884 + }, + { + "epoch": 2.345499226358424, + "grad_norm": 10.875, + "learning_rate": 2.912367576186208e-06, + "loss": 0.9455580711364746, + "step": 12886 + }, + { + "epoch": 2.345863292982616, + "grad_norm": 22.125, + "learning_rate": 2.9113936690421996e-06, + "loss": 0.9071639180183411, + "step": 12888 + }, + { + "epoch": 2.346227359606808, + "grad_norm": 5.4375, + "learning_rate": 2.9104202151386537e-06, + "loss": 0.741948127746582, + "step": 12890 + }, + { + "epoch": 2.3465914262310004, + "grad_norm": 12.75, + "learning_rate": 2.909447214618419e-06, + "loss": 1.376150131225586, + "step": 12892 + }, + { + "epoch": 2.3469554928551926, + "grad_norm": 21.25, + "learning_rate": 2.908474667624282e-06, + "loss": 1.4099239110946655, + "step": 12894 + }, + { + "epoch": 2.3473195594793848, + "grad_norm": 16.75, + "learning_rate": 2.907502574298958e-06, + "loss": 1.299912691116333, + "step": 12896 + }, + { + "epoch": 2.347683626103577, + "grad_norm": 29.75, + "learning_rate": 2.906530934785099e-06, + "loss": 1.2498507499694824, + "step": 12898 + }, + { + "epoch": 2.348047692727769, + "grad_norm": 7.96875, + "learning_rate": 2.9055597492252885e-06, + "loss": 1.166631817817688, + "step": 12900 + }, + { + "epoch": 2.3484117593519613, + "grad_norm": 3.046875, + "learning_rate": 2.9045890177620433e-06, + "loss": 0.6954485177993774, + "step": 12902 + }, + { + "epoch": 2.3487758259761535, + "grad_norm": 6.5625, + "learning_rate": 2.9036187405378158e-06, + "loss": 1.3160232305526733, + "step": 12904 + }, + { + "epoch": 2.3491398926003457, + "grad_norm": 4.90625, + "learning_rate": 2.9026489176949895e-06, + "loss": 1.0704166889190674, + "step": 12906 + }, + { + "epoch": 2.349503959224538, + "grad_norm": 23.5, + "learning_rate": 2.9016795493758822e-06, + "loss": 1.2405421733856201, + "step": 12908 + }, + { + "epoch": 2.34986802584873, + "grad_norm": 7.625, + "learning_rate": 2.900710635722744e-06, + "loss": 1.1511577367782593, + "step": 12910 + }, + { + "epoch": 2.3502320924729228, + "grad_norm": 10.3125, + "learning_rate": 2.89974217687776e-06, + "loss": 0.9296000003814697, + "step": 12912 + }, + { + "epoch": 2.350596159097115, + "grad_norm": 19.0, + "learning_rate": 2.898774172983048e-06, + "loss": 1.4404160976409912, + "step": 12914 + }, + { + "epoch": 2.350960225721307, + "grad_norm": 7.96875, + "learning_rate": 2.897806624180657e-06, + "loss": 1.2417014837265015, + "step": 12916 + }, + { + "epoch": 2.3513242923454993, + "grad_norm": 9.25, + "learning_rate": 2.8968395306125725e-06, + "loss": 1.2106091976165771, + "step": 12918 + }, + { + "epoch": 2.3516883589696915, + "grad_norm": 10.875, + "learning_rate": 2.8958728924207103e-06, + "loss": 1.2392441034317017, + "step": 12920 + }, + { + "epoch": 2.3520524255938837, + "grad_norm": 14.125, + "learning_rate": 2.8949067097469214e-06, + "loss": 1.4720377922058105, + "step": 12922 + }, + { + "epoch": 2.352416492218076, + "grad_norm": 18.5, + "learning_rate": 2.8939409827329894e-06, + "loss": 1.6523391008377075, + "step": 12924 + }, + { + "epoch": 2.352780558842268, + "grad_norm": 20.875, + "learning_rate": 2.89297571152063e-06, + "loss": 1.635360598564148, + "step": 12926 + }, + { + "epoch": 2.3531446254664603, + "grad_norm": 27.75, + "learning_rate": 2.8920108962514935e-06, + "loss": 0.9188194870948792, + "step": 12928 + }, + { + "epoch": 2.3535086920906525, + "grad_norm": 39.25, + "learning_rate": 2.891046537067162e-06, + "loss": 1.56831955909729, + "step": 12930 + }, + { + "epoch": 2.3538727587148447, + "grad_norm": 15.1875, + "learning_rate": 2.890082634109152e-06, + "loss": 1.983557105064392, + "step": 12932 + }, + { + "epoch": 2.354236825339037, + "grad_norm": 17.75, + "learning_rate": 2.8891191875189117e-06, + "loss": 1.4402353763580322, + "step": 12934 + }, + { + "epoch": 2.354600891963229, + "grad_norm": 9.1875, + "learning_rate": 2.8881561974378237e-06, + "loss": 1.432793140411377, + "step": 12936 + }, + { + "epoch": 2.3549649585874217, + "grad_norm": 12.75, + "learning_rate": 2.8871936640072027e-06, + "loss": 1.5808932781219482, + "step": 12938 + }, + { + "epoch": 2.355329025211614, + "grad_norm": 14.6875, + "learning_rate": 2.886231587368296e-06, + "loss": 1.6627857685089111, + "step": 12940 + }, + { + "epoch": 2.355693091835806, + "grad_norm": 9.5625, + "learning_rate": 2.8852699676622855e-06, + "loss": 1.4841033220291138, + "step": 12942 + }, + { + "epoch": 2.3560571584599983, + "grad_norm": 16.75, + "learning_rate": 2.8843088050302837e-06, + "loss": 1.154237151145935, + "step": 12944 + }, + { + "epoch": 2.3564212250841905, + "grad_norm": 22.375, + "learning_rate": 2.8833480996133383e-06, + "loss": 1.4320063591003418, + "step": 12946 + }, + { + "epoch": 2.3567852917083827, + "grad_norm": 19.0, + "learning_rate": 2.8823878515524283e-06, + "loss": 1.3051279783248901, + "step": 12948 + }, + { + "epoch": 2.357149358332575, + "grad_norm": 15.0625, + "learning_rate": 2.8814280609884665e-06, + "loss": 1.4349632263183594, + "step": 12950 + }, + { + "epoch": 2.357513424956767, + "grad_norm": 11.9375, + "learning_rate": 2.8804687280622983e-06, + "loss": 1.6198092699050903, + "step": 12952 + }, + { + "epoch": 2.3578774915809593, + "grad_norm": 10.0, + "learning_rate": 2.879509852914702e-06, + "loss": 1.2842026948928833, + "step": 12954 + }, + { + "epoch": 2.3582415582051515, + "grad_norm": 9.8125, + "learning_rate": 2.8785514356863893e-06, + "loss": 1.5202780961990356, + "step": 12956 + }, + { + "epoch": 2.3586056248293437, + "grad_norm": 6.5625, + "learning_rate": 2.877593476518002e-06, + "loss": 1.0736358165740967, + "step": 12958 + }, + { + "epoch": 2.358969691453536, + "grad_norm": 10.75, + "learning_rate": 2.876635975550119e-06, + "loss": 0.4618733525276184, + "step": 12960 + }, + { + "epoch": 2.359333758077728, + "grad_norm": 12.125, + "learning_rate": 2.875678932923248e-06, + "loss": 1.1793478727340698, + "step": 12962 + }, + { + "epoch": 2.3596978247019207, + "grad_norm": 19.0, + "learning_rate": 2.874722348777832e-06, + "loss": 1.57537841796875, + "step": 12964 + }, + { + "epoch": 2.3600618913261124, + "grad_norm": 62.0, + "learning_rate": 2.873766223254246e-06, + "loss": 1.095525860786438, + "step": 12966 + }, + { + "epoch": 2.360425957950305, + "grad_norm": 14.9375, + "learning_rate": 2.872810556492797e-06, + "loss": 1.3439770936965942, + "step": 12968 + }, + { + "epoch": 2.3607900245744973, + "grad_norm": 16.25, + "learning_rate": 2.8718553486337253e-06, + "loss": 1.428471326828003, + "step": 12970 + }, + { + "epoch": 2.3611540911986895, + "grad_norm": 10.25, + "learning_rate": 2.870900599817204e-06, + "loss": 1.6733887195587158, + "step": 12972 + }, + { + "epoch": 2.3615181578228817, + "grad_norm": 19.125, + "learning_rate": 2.8699463101833385e-06, + "loss": 1.5539997816085815, + "step": 12974 + }, + { + "epoch": 2.361882224447074, + "grad_norm": 11.8125, + "learning_rate": 2.8689924798721673e-06, + "loss": 0.9679824113845825, + "step": 12976 + }, + { + "epoch": 2.362246291071266, + "grad_norm": 23.625, + "learning_rate": 2.868039109023661e-06, + "loss": 1.359969139099121, + "step": 12978 + }, + { + "epoch": 2.3626103576954582, + "grad_norm": 9.375, + "learning_rate": 2.867086197777722e-06, + "loss": 0.5674327611923218, + "step": 12980 + }, + { + "epoch": 2.3629744243196504, + "grad_norm": 12.375, + "learning_rate": 2.8661337462741873e-06, + "loss": 1.3166778087615967, + "step": 12982 + }, + { + "epoch": 2.3633384909438426, + "grad_norm": 58.25, + "learning_rate": 2.865181754652825e-06, + "loss": 1.7650721073150635, + "step": 12984 + }, + { + "epoch": 2.363702557568035, + "grad_norm": 11.6875, + "learning_rate": 2.864230223053335e-06, + "loss": 1.4578721523284912, + "step": 12986 + }, + { + "epoch": 2.364066624192227, + "grad_norm": 19.875, + "learning_rate": 2.863279151615353e-06, + "loss": 1.0696241855621338, + "step": 12988 + }, + { + "epoch": 2.364430690816419, + "grad_norm": 18.0, + "learning_rate": 2.862328540478443e-06, + "loss": 0.8878389596939087, + "step": 12990 + }, + { + "epoch": 2.3647947574406114, + "grad_norm": 19.125, + "learning_rate": 2.8613783897821033e-06, + "loss": 1.2449302673339844, + "step": 12992 + }, + { + "epoch": 2.365158824064804, + "grad_norm": 3.03125, + "learning_rate": 2.8604286996657656e-06, + "loss": 1.0127277374267578, + "step": 12994 + }, + { + "epoch": 2.3655228906889962, + "grad_norm": 9.4375, + "learning_rate": 2.859479470268793e-06, + "loss": 1.1745678186416626, + "step": 12996 + }, + { + "epoch": 2.3658869573131884, + "grad_norm": 9.0, + "learning_rate": 2.8585307017304796e-06, + "loss": 1.353560447692871, + "step": 12998 + }, + { + "epoch": 2.3662510239373806, + "grad_norm": 13.9375, + "learning_rate": 2.857582394190055e-06, + "loss": 1.301735520362854, + "step": 13000 + }, + { + "epoch": 2.366615090561573, + "grad_norm": 8.625, + "learning_rate": 2.8566345477866793e-06, + "loss": 1.3826541900634766, + "step": 13002 + }, + { + "epoch": 2.366979157185765, + "grad_norm": 10.375, + "learning_rate": 2.8556871626594446e-06, + "loss": 1.1823984384536743, + "step": 13004 + }, + { + "epoch": 2.367343223809957, + "grad_norm": 21.625, + "learning_rate": 2.854740238947376e-06, + "loss": 1.398445725440979, + "step": 13006 + }, + { + "epoch": 2.3677072904341494, + "grad_norm": 16.875, + "learning_rate": 2.853793776789431e-06, + "loss": 1.334977388381958, + "step": 13008 + }, + { + "epoch": 2.3680713570583416, + "grad_norm": 16.5, + "learning_rate": 2.8528477763244984e-06, + "loss": 0.957625150680542, + "step": 13010 + }, + { + "epoch": 2.368435423682534, + "grad_norm": 8.0625, + "learning_rate": 2.8519022376913997e-06, + "loss": 1.3189506530761719, + "step": 13012 + }, + { + "epoch": 2.368799490306726, + "grad_norm": 8.75, + "learning_rate": 2.8509571610288904e-06, + "loss": 1.247552514076233, + "step": 13014 + }, + { + "epoch": 2.369163556930918, + "grad_norm": 9.4375, + "learning_rate": 2.850012546475656e-06, + "loss": 1.2783420085906982, + "step": 13016 + }, + { + "epoch": 2.3695276235551104, + "grad_norm": 13.5, + "learning_rate": 2.8490683941703136e-06, + "loss": 1.3923428058624268, + "step": 13018 + }, + { + "epoch": 2.369891690179303, + "grad_norm": 19.875, + "learning_rate": 2.848124704251416e-06, + "loss": 1.3748270273208618, + "step": 13020 + }, + { + "epoch": 2.370255756803495, + "grad_norm": 10.5625, + "learning_rate": 2.8471814768574436e-06, + "loss": 1.2959493398666382, + "step": 13022 + }, + { + "epoch": 2.3706198234276874, + "grad_norm": 7.6875, + "learning_rate": 2.846238712126812e-06, + "loss": 1.1435142755508423, + "step": 13024 + }, + { + "epoch": 2.3709838900518796, + "grad_norm": 8.0625, + "learning_rate": 2.845296410197869e-06, + "loss": 1.2150239944458008, + "step": 13026 + }, + { + "epoch": 2.371347956676072, + "grad_norm": 8.0625, + "learning_rate": 2.8443545712088935e-06, + "loss": 1.36673104763031, + "step": 13028 + }, + { + "epoch": 2.371712023300264, + "grad_norm": 14.9375, + "learning_rate": 2.843413195298095e-06, + "loss": 1.423221230506897, + "step": 13030 + }, + { + "epoch": 2.372076089924456, + "grad_norm": 18.875, + "learning_rate": 2.8424722826036176e-06, + "loss": 1.5557280778884888, + "step": 13032 + }, + { + "epoch": 2.3724401565486484, + "grad_norm": 19.625, + "learning_rate": 2.8415318332635365e-06, + "loss": 1.251732587814331, + "step": 13034 + }, + { + "epoch": 2.3728042231728406, + "grad_norm": 12.875, + "learning_rate": 2.840591847415859e-06, + "loss": 1.5633962154388428, + "step": 13036 + }, + { + "epoch": 2.3731682897970328, + "grad_norm": 5.71875, + "learning_rate": 2.8396523251985236e-06, + "loss": 1.179494857788086, + "step": 13038 + }, + { + "epoch": 2.373532356421225, + "grad_norm": 11.125, + "learning_rate": 2.8387132667494023e-06, + "loss": 0.9931835532188416, + "step": 13040 + }, + { + "epoch": 2.373896423045417, + "grad_norm": 9.75, + "learning_rate": 2.8377746722062963e-06, + "loss": 1.1571617126464844, + "step": 13042 + }, + { + "epoch": 2.3742604896696093, + "grad_norm": 9.375, + "learning_rate": 2.8368365417069426e-06, + "loss": 1.4481487274169922, + "step": 13044 + }, + { + "epoch": 2.374624556293802, + "grad_norm": 13.5625, + "learning_rate": 2.835898875389007e-06, + "loss": 1.3988542556762695, + "step": 13046 + }, + { + "epoch": 2.374988622917994, + "grad_norm": 18.0, + "learning_rate": 2.8349616733900885e-06, + "loss": 1.7332851886749268, + "step": 13048 + }, + { + "epoch": 2.3753526895421864, + "grad_norm": 13.8125, + "learning_rate": 2.8340249358477184e-06, + "loss": 1.6849112510681152, + "step": 13050 + }, + { + "epoch": 2.3757167561663786, + "grad_norm": 24.625, + "learning_rate": 2.8330886628993578e-06, + "loss": 1.6017203330993652, + "step": 13052 + }, + { + "epoch": 2.3760808227905708, + "grad_norm": 12.1875, + "learning_rate": 2.8321528546824015e-06, + "loss": 1.2707186937332153, + "step": 13054 + }, + { + "epoch": 2.376444889414763, + "grad_norm": 34.5, + "learning_rate": 2.8312175113341754e-06, + "loss": 1.5900399684906006, + "step": 13056 + }, + { + "epoch": 2.376808956038955, + "grad_norm": 10.5625, + "learning_rate": 2.830282632991938e-06, + "loss": 1.698850154876709, + "step": 13058 + }, + { + "epoch": 2.3771730226631473, + "grad_norm": 9.6875, + "learning_rate": 2.8293482197928777e-06, + "loss": 1.4708998203277588, + "step": 13060 + }, + { + "epoch": 2.3775370892873395, + "grad_norm": 15.75, + "learning_rate": 2.8284142718741173e-06, + "loss": 1.6085891723632812, + "step": 13062 + }, + { + "epoch": 2.3779011559115317, + "grad_norm": 5.28125, + "learning_rate": 2.8274807893727094e-06, + "loss": 0.8812276124954224, + "step": 13064 + }, + { + "epoch": 2.378265222535724, + "grad_norm": 9.875, + "learning_rate": 2.8265477724256383e-06, + "loss": 0.5617726445198059, + "step": 13066 + }, + { + "epoch": 2.378629289159916, + "grad_norm": 10.375, + "learning_rate": 2.8256152211698205e-06, + "loss": 1.0298148393630981, + "step": 13068 + }, + { + "epoch": 2.3789933557841083, + "grad_norm": 16.0, + "learning_rate": 2.8246831357421044e-06, + "loss": 1.0253523588180542, + "step": 13070 + }, + { + "epoch": 2.379357422408301, + "grad_norm": 25.75, + "learning_rate": 2.82375151627927e-06, + "loss": 1.3209866285324097, + "step": 13072 + }, + { + "epoch": 2.3797214890324927, + "grad_norm": 19.75, + "learning_rate": 2.8228203629180286e-06, + "loss": 1.3811129331588745, + "step": 13074 + }, + { + "epoch": 2.3800855556566853, + "grad_norm": 36.75, + "learning_rate": 2.821889675795022e-06, + "loss": 1.3401545286178589, + "step": 13076 + }, + { + "epoch": 2.3804496222808775, + "grad_norm": 13.5, + "learning_rate": 2.8209594550468263e-06, + "loss": 0.8718788623809814, + "step": 13078 + }, + { + "epoch": 2.3808136889050697, + "grad_norm": 10.5, + "learning_rate": 2.820029700809947e-06, + "loss": 1.2428971529006958, + "step": 13080 + }, + { + "epoch": 2.381177755529262, + "grad_norm": 11.0625, + "learning_rate": 2.8191004132208214e-06, + "loss": 1.7124323844909668, + "step": 13082 + }, + { + "epoch": 2.381541822153454, + "grad_norm": 11.75, + "learning_rate": 2.8181715924158197e-06, + "loss": 1.5032646656036377, + "step": 13084 + }, + { + "epoch": 2.3819058887776463, + "grad_norm": 6.03125, + "learning_rate": 2.817243238531242e-06, + "loss": 1.5273163318634033, + "step": 13086 + }, + { + "epoch": 2.3822699554018385, + "grad_norm": 11.3125, + "learning_rate": 2.81631535170332e-06, + "loss": 1.294839859008789, + "step": 13088 + }, + { + "epoch": 2.3826340220260307, + "grad_norm": 11.4375, + "learning_rate": 2.815387932068218e-06, + "loss": 1.3597090244293213, + "step": 13090 + }, + { + "epoch": 2.382998088650223, + "grad_norm": 7.5, + "learning_rate": 2.814460979762031e-06, + "loss": 1.2180516719818115, + "step": 13092 + }, + { + "epoch": 2.383362155274415, + "grad_norm": 8.6875, + "learning_rate": 2.8135344949207856e-06, + "loss": 0.9353842735290527, + "step": 13094 + }, + { + "epoch": 2.3837262218986073, + "grad_norm": 14.875, + "learning_rate": 2.8126084776804386e-06, + "loss": 1.3736299276351929, + "step": 13096 + }, + { + "epoch": 2.3840902885228, + "grad_norm": 10.75, + "learning_rate": 2.811682928176881e-06, + "loss": 1.4455559253692627, + "step": 13098 + }, + { + "epoch": 2.3844543551469917, + "grad_norm": 6.46875, + "learning_rate": 2.8107578465459322e-06, + "loss": 1.0201411247253418, + "step": 13100 + }, + { + "epoch": 2.3848184217711843, + "grad_norm": 6.46875, + "learning_rate": 2.8098332329233447e-06, + "loss": 1.0659406185150146, + "step": 13102 + }, + { + "epoch": 2.3851824883953765, + "grad_norm": 17.125, + "learning_rate": 2.8089090874448015e-06, + "loss": 1.624879240989685, + "step": 13104 + }, + { + "epoch": 2.3855465550195687, + "grad_norm": 9.4375, + "learning_rate": 2.807985410245917e-06, + "loss": 1.5225954055786133, + "step": 13106 + }, + { + "epoch": 2.385910621643761, + "grad_norm": 14.5625, + "learning_rate": 2.8070622014622384e-06, + "loss": 1.5918283462524414, + "step": 13108 + }, + { + "epoch": 2.386274688267953, + "grad_norm": 23.375, + "learning_rate": 2.806139461229241e-06, + "loss": 1.7529596090316772, + "step": 13110 + }, + { + "epoch": 2.3866387548921453, + "grad_norm": 10.6875, + "learning_rate": 2.8052171896823344e-06, + "loss": 1.4478387832641602, + "step": 13112 + }, + { + "epoch": 2.3870028215163375, + "grad_norm": 11.5, + "learning_rate": 2.804295386956858e-06, + "loss": 1.3970688581466675, + "step": 13114 + }, + { + "epoch": 2.3873668881405297, + "grad_norm": 13.0, + "learning_rate": 2.803374053188082e-06, + "loss": 1.3684873580932617, + "step": 13116 + }, + { + "epoch": 2.387730954764722, + "grad_norm": 4.21875, + "learning_rate": 2.8024531885112092e-06, + "loss": 1.0634028911590576, + "step": 13118 + }, + { + "epoch": 2.388095021388914, + "grad_norm": 17.125, + "learning_rate": 2.8015327930613727e-06, + "loss": 1.5346989631652832, + "step": 13120 + }, + { + "epoch": 2.3884590880131062, + "grad_norm": 18.125, + "learning_rate": 2.8006128669736366e-06, + "loss": 1.447345495223999, + "step": 13122 + }, + { + "epoch": 2.3888231546372984, + "grad_norm": 10.8125, + "learning_rate": 2.7996934103829966e-06, + "loss": 1.4378901720046997, + "step": 13124 + }, + { + "epoch": 2.3891872212614906, + "grad_norm": 20.625, + "learning_rate": 2.798774423424378e-06, + "loss": 1.7141553163528442, + "step": 13126 + }, + { + "epoch": 2.3895512878856833, + "grad_norm": 12.0625, + "learning_rate": 2.79785590623264e-06, + "loss": 1.0675203800201416, + "step": 13128 + }, + { + "epoch": 2.3899153545098755, + "grad_norm": 14.0, + "learning_rate": 2.79693785894257e-06, + "loss": 1.1264009475708008, + "step": 13130 + }, + { + "epoch": 2.3902794211340677, + "grad_norm": 12.0, + "learning_rate": 2.796020281688889e-06, + "loss": 1.332797884941101, + "step": 13132 + }, + { + "epoch": 2.39064348775826, + "grad_norm": 15.625, + "learning_rate": 2.795103174606246e-06, + "loss": 1.8389105796813965, + "step": 13134 + }, + { + "epoch": 2.391007554382452, + "grad_norm": 9.25, + "learning_rate": 2.7941865378292254e-06, + "loss": 1.405497431755066, + "step": 13136 + }, + { + "epoch": 2.3913716210066442, + "grad_norm": 8.125, + "learning_rate": 2.7932703714923377e-06, + "loss": 1.0936295986175537, + "step": 13138 + }, + { + "epoch": 2.3917356876308364, + "grad_norm": 10.5, + "learning_rate": 2.792354675730027e-06, + "loss": 1.4086532592773438, + "step": 13140 + }, + { + "epoch": 2.3920997542550286, + "grad_norm": 8.25, + "learning_rate": 2.7914394506766678e-06, + "loss": 1.3387991189956665, + "step": 13142 + }, + { + "epoch": 2.392463820879221, + "grad_norm": 13.375, + "learning_rate": 2.7905246964665665e-06, + "loss": 1.2444496154785156, + "step": 13144 + }, + { + "epoch": 2.392827887503413, + "grad_norm": 12.25, + "learning_rate": 2.789610413233959e-06, + "loss": 1.138127088546753, + "step": 13146 + }, + { + "epoch": 2.393191954127605, + "grad_norm": 38.25, + "learning_rate": 2.7886966011130136e-06, + "loss": 1.9173109531402588, + "step": 13148 + }, + { + "epoch": 2.3935560207517974, + "grad_norm": 25.0, + "learning_rate": 2.787783260237826e-06, + "loss": 1.6392672061920166, + "step": 13150 + }, + { + "epoch": 2.3939200873759896, + "grad_norm": 15.625, + "learning_rate": 2.786870390742429e-06, + "loss": 1.1793363094329834, + "step": 13152 + }, + { + "epoch": 2.3942841540001822, + "grad_norm": 18.5, + "learning_rate": 2.7859579927607793e-06, + "loss": 1.8120503425598145, + "step": 13154 + }, + { + "epoch": 2.3946482206243744, + "grad_norm": 5.875, + "learning_rate": 2.7850460664267687e-06, + "loss": 1.0406832695007324, + "step": 13156 + }, + { + "epoch": 2.3950122872485666, + "grad_norm": 15.125, + "learning_rate": 2.784134611874219e-06, + "loss": 1.3404755592346191, + "step": 13158 + }, + { + "epoch": 2.395376353872759, + "grad_norm": 14.4375, + "learning_rate": 2.7832236292368824e-06, + "loss": 1.4649168252944946, + "step": 13160 + }, + { + "epoch": 2.395740420496951, + "grad_norm": 9.8125, + "learning_rate": 2.782313118648442e-06, + "loss": 0.7475913166999817, + "step": 13162 + }, + { + "epoch": 2.396104487121143, + "grad_norm": 70.0, + "learning_rate": 2.7814030802425105e-06, + "loss": 0.7028264999389648, + "step": 13164 + }, + { + "epoch": 2.3964685537453354, + "grad_norm": 9.0625, + "learning_rate": 2.780493514152634e-06, + "loss": 1.376814603805542, + "step": 13166 + }, + { + "epoch": 2.3968326203695276, + "grad_norm": 15.875, + "learning_rate": 2.7795844205122866e-06, + "loss": 1.488100528717041, + "step": 13168 + }, + { + "epoch": 2.39719668699372, + "grad_norm": 13.75, + "learning_rate": 2.7786757994548742e-06, + "loss": 1.8152649402618408, + "step": 13170 + }, + { + "epoch": 2.397560753617912, + "grad_norm": 25.625, + "learning_rate": 2.777767651113733e-06, + "loss": 1.6000306606292725, + "step": 13172 + }, + { + "epoch": 2.397924820242104, + "grad_norm": 17.25, + "learning_rate": 2.7768599756221303e-06, + "loss": 0.8882613778114319, + "step": 13174 + }, + { + "epoch": 2.3982888868662964, + "grad_norm": 11.3125, + "learning_rate": 2.7759527731132647e-06, + "loss": 1.4401156902313232, + "step": 13176 + }, + { + "epoch": 2.3986529534904886, + "grad_norm": 26.125, + "learning_rate": 2.775046043720263e-06, + "loss": 1.213514804840088, + "step": 13178 + }, + { + "epoch": 2.399017020114681, + "grad_norm": 4.625, + "learning_rate": 2.7741397875761855e-06, + "loss": 1.1056969165802002, + "step": 13180 + }, + { + "epoch": 2.3993810867388734, + "grad_norm": 40.75, + "learning_rate": 2.7732340048140203e-06, + "loss": 1.2660188674926758, + "step": 13182 + }, + { + "epoch": 2.3997451533630656, + "grad_norm": 47.75, + "learning_rate": 2.7723286955666885e-06, + "loss": 1.5601726770401, + "step": 13184 + }, + { + "epoch": 2.400109219987258, + "grad_norm": 10.125, + "learning_rate": 2.7714238599670394e-06, + "loss": 0.6036195158958435, + "step": 13186 + }, + { + "epoch": 2.40047328661145, + "grad_norm": 12.125, + "learning_rate": 2.7705194981478545e-06, + "loss": 1.4248721599578857, + "step": 13188 + }, + { + "epoch": 2.400837353235642, + "grad_norm": 16.25, + "learning_rate": 2.769615610241846e-06, + "loss": 1.3486409187316895, + "step": 13190 + }, + { + "epoch": 2.4012014198598344, + "grad_norm": 15.375, + "learning_rate": 2.768712196381655e-06, + "loss": 1.1833536624908447, + "step": 13192 + }, + { + "epoch": 2.4015654864840266, + "grad_norm": 3.453125, + "learning_rate": 2.767809256699854e-06, + "loss": 1.285017490386963, + "step": 13194 + }, + { + "epoch": 2.4019295531082188, + "grad_norm": 9.125, + "learning_rate": 2.7669067913289447e-06, + "loss": 1.1078941822052002, + "step": 13196 + }, + { + "epoch": 2.402293619732411, + "grad_norm": 10.0625, + "learning_rate": 2.766004800401362e-06, + "loss": 1.3018831014633179, + "step": 13198 + }, + { + "epoch": 2.402657686356603, + "grad_norm": 11.625, + "learning_rate": 2.7651032840494685e-06, + "loss": 1.4734562635421753, + "step": 13200 + }, + { + "epoch": 2.4030217529807953, + "grad_norm": 16.875, + "learning_rate": 2.764202242405558e-06, + "loss": 1.6767512559890747, + "step": 13202 + }, + { + "epoch": 2.4033858196049875, + "grad_norm": 14.1875, + "learning_rate": 2.7633016756018547e-06, + "loss": 1.8229765892028809, + "step": 13204 + }, + { + "epoch": 2.40374988622918, + "grad_norm": 22.625, + "learning_rate": 2.7624015837705136e-06, + "loss": 1.9370911121368408, + "step": 13206 + }, + { + "epoch": 2.404113952853372, + "grad_norm": 14.8125, + "learning_rate": 2.7615019670436194e-06, + "loss": 1.4044655561447144, + "step": 13208 + }, + { + "epoch": 2.4044780194775646, + "grad_norm": 14.0, + "learning_rate": 2.760602825553187e-06, + "loss": 1.4396326541900635, + "step": 13210 + }, + { + "epoch": 2.4048420861017568, + "grad_norm": 6.46875, + "learning_rate": 2.7597041594311618e-06, + "loss": 1.2416374683380127, + "step": 13212 + }, + { + "epoch": 2.405206152725949, + "grad_norm": 10.75, + "learning_rate": 2.7588059688094194e-06, + "loss": 1.2069329023361206, + "step": 13214 + }, + { + "epoch": 2.405570219350141, + "grad_norm": 87.0, + "learning_rate": 2.7579082538197653e-06, + "loss": 1.2373838424682617, + "step": 13216 + }, + { + "epoch": 2.4059342859743333, + "grad_norm": 4.9375, + "learning_rate": 2.7570110145939365e-06, + "loss": 1.113019347190857, + "step": 13218 + }, + { + "epoch": 2.4062983525985255, + "grad_norm": 9.9375, + "learning_rate": 2.756114251263598e-06, + "loss": 1.4458307027816772, + "step": 13220 + }, + { + "epoch": 2.4066624192227177, + "grad_norm": 24.875, + "learning_rate": 2.7552179639603477e-06, + "loss": 1.103837251663208, + "step": 13222 + }, + { + "epoch": 2.40702648584691, + "grad_norm": 12.1875, + "learning_rate": 2.7543221528157104e-06, + "loss": 1.5726428031921387, + "step": 13224 + }, + { + "epoch": 2.407390552471102, + "grad_norm": 8.4375, + "learning_rate": 2.753426817961144e-06, + "loss": 1.418402910232544, + "step": 13226 + }, + { + "epoch": 2.4077546190952943, + "grad_norm": 11.75, + "learning_rate": 2.7525319595280347e-06, + "loss": 1.3836729526519775, + "step": 13228 + }, + { + "epoch": 2.4081186857194865, + "grad_norm": 12.6875, + "learning_rate": 2.7516375776476993e-06, + "loss": 1.3468658924102783, + "step": 13230 + }, + { + "epoch": 2.4084827523436787, + "grad_norm": 10.375, + "learning_rate": 2.7507436724513853e-06, + "loss": 1.330183982849121, + "step": 13232 + }, + { + "epoch": 2.408846818967871, + "grad_norm": 16.375, + "learning_rate": 2.749850244070269e-06, + "loss": 1.5821893215179443, + "step": 13234 + }, + { + "epoch": 2.4092108855920635, + "grad_norm": 64.5, + "learning_rate": 2.748957292635458e-06, + "loss": 1.578415870666504, + "step": 13236 + }, + { + "epoch": 2.4095749522162557, + "grad_norm": 17.75, + "learning_rate": 2.748064818277989e-06, + "loss": 1.363551139831543, + "step": 13238 + }, + { + "epoch": 2.409939018840448, + "grad_norm": 10.875, + "learning_rate": 2.7471728211288283e-06, + "loss": 1.07505464553833, + "step": 13240 + }, + { + "epoch": 2.41030308546464, + "grad_norm": 9.625, + "learning_rate": 2.7462813013188746e-06, + "loss": 1.5999397039413452, + "step": 13242 + }, + { + "epoch": 2.4106671520888323, + "grad_norm": 20.25, + "learning_rate": 2.745390258978953e-06, + "loss": 1.5222870111465454, + "step": 13244 + }, + { + "epoch": 2.4110312187130245, + "grad_norm": 14.25, + "learning_rate": 2.744499694239821e-06, + "loss": 1.3627160787582397, + "step": 13246 + }, + { + "epoch": 2.4113952853372167, + "grad_norm": 19.75, + "learning_rate": 2.743609607232166e-06, + "loss": 1.7409682273864746, + "step": 13248 + }, + { + "epoch": 2.411759351961409, + "grad_norm": 14.3125, + "learning_rate": 2.7427199980866035e-06, + "loss": 1.3861503601074219, + "step": 13250 + }, + { + "epoch": 2.412123418585601, + "grad_norm": 10.5, + "learning_rate": 2.741830866933681e-06, + "loss": 1.1757136583328247, + "step": 13252 + }, + { + "epoch": 2.4124874852097933, + "grad_norm": 13.0625, + "learning_rate": 2.740942213903875e-06, + "loss": 1.7118579149246216, + "step": 13254 + }, + { + "epoch": 2.4128515518339855, + "grad_norm": 6.6875, + "learning_rate": 2.7400540391275908e-06, + "loss": 1.3312727212905884, + "step": 13256 + }, + { + "epoch": 2.4132156184581777, + "grad_norm": 30.375, + "learning_rate": 2.7391663427351658e-06, + "loss": 1.226712703704834, + "step": 13258 + }, + { + "epoch": 2.41357968508237, + "grad_norm": 13.1875, + "learning_rate": 2.7382791248568642e-06, + "loss": 1.4244717359542847, + "step": 13260 + }, + { + "epoch": 2.4139437517065625, + "grad_norm": 12.4375, + "learning_rate": 2.7373923856228822e-06, + "loss": 1.6039049625396729, + "step": 13262 + }, + { + "epoch": 2.4143078183307547, + "grad_norm": 14.625, + "learning_rate": 2.7365061251633457e-06, + "loss": 1.6134241819381714, + "step": 13264 + }, + { + "epoch": 2.414671884954947, + "grad_norm": 9.3125, + "learning_rate": 2.7356203436083093e-06, + "loss": 1.1808816194534302, + "step": 13266 + }, + { + "epoch": 2.415035951579139, + "grad_norm": 42.25, + "learning_rate": 2.734735041087759e-06, + "loss": 1.2498751878738403, + "step": 13268 + }, + { + "epoch": 2.4154000182033313, + "grad_norm": 11.3125, + "learning_rate": 2.7338502177316077e-06, + "loss": 1.4044361114501953, + "step": 13270 + }, + { + "epoch": 2.4157640848275235, + "grad_norm": 30.75, + "learning_rate": 2.7329658736697008e-06, + "loss": 1.59952712059021, + "step": 13272 + }, + { + "epoch": 2.4161281514517157, + "grad_norm": 21.875, + "learning_rate": 2.732082009031812e-06, + "loss": 1.2319375276565552, + "step": 13274 + }, + { + "epoch": 2.416492218075908, + "grad_norm": 14.3125, + "learning_rate": 2.731198623947644e-06, + "loss": 1.3763971328735352, + "step": 13276 + }, + { + "epoch": 2.4168562847001, + "grad_norm": 5.21875, + "learning_rate": 2.730315718546831e-06, + "loss": 1.3468014001846313, + "step": 13278 + }, + { + "epoch": 2.4172203513242922, + "grad_norm": 7.28125, + "learning_rate": 2.729433292958935e-06, + "loss": 1.0505539178848267, + "step": 13280 + }, + { + "epoch": 2.4175844179484844, + "grad_norm": 9.1875, + "learning_rate": 2.7285513473134494e-06, + "loss": 1.338356375694275, + "step": 13282 + }, + { + "epoch": 2.4179484845726766, + "grad_norm": 6.15625, + "learning_rate": 2.7276698817397953e-06, + "loss": 1.280744194984436, + "step": 13284 + }, + { + "epoch": 2.418312551196869, + "grad_norm": 3.8125, + "learning_rate": 2.7267888963673246e-06, + "loss": 1.2489186525344849, + "step": 13286 + }, + { + "epoch": 2.4186766178210615, + "grad_norm": 12.0625, + "learning_rate": 2.7259083913253183e-06, + "loss": 1.3006317615509033, + "step": 13288 + }, + { + "epoch": 2.4190406844452537, + "grad_norm": 16.0, + "learning_rate": 2.725028366742986e-06, + "loss": 1.4647231101989746, + "step": 13290 + }, + { + "epoch": 2.419404751069446, + "grad_norm": 23.25, + "learning_rate": 2.7241488227494693e-06, + "loss": 1.6237925291061401, + "step": 13292 + }, + { + "epoch": 2.419768817693638, + "grad_norm": 12.9375, + "learning_rate": 2.7232697594738365e-06, + "loss": 1.3615989685058594, + "step": 13294 + }, + { + "epoch": 2.4201328843178302, + "grad_norm": 15.625, + "learning_rate": 2.7223911770450876e-06, + "loss": 1.3674650192260742, + "step": 13296 + }, + { + "epoch": 2.4204969509420224, + "grad_norm": 6.96875, + "learning_rate": 2.7215130755921504e-06, + "loss": 1.2055695056915283, + "step": 13298 + }, + { + "epoch": 2.4208610175662146, + "grad_norm": 11.8125, + "learning_rate": 2.720635455243883e-06, + "loss": 0.9048939347267151, + "step": 13300 + }, + { + "epoch": 2.421225084190407, + "grad_norm": 9.4375, + "learning_rate": 2.719758316129072e-06, + "loss": 1.0693939924240112, + "step": 13302 + }, + { + "epoch": 2.421589150814599, + "grad_norm": 35.75, + "learning_rate": 2.718881658376435e-06, + "loss": 1.3081045150756836, + "step": 13304 + }, + { + "epoch": 2.421953217438791, + "grad_norm": 25.625, + "learning_rate": 2.718005482114617e-06, + "loss": 0.558391273021698, + "step": 13306 + }, + { + "epoch": 2.4223172840629834, + "grad_norm": 17.25, + "learning_rate": 2.7171297874721937e-06, + "loss": 1.3600088357925415, + "step": 13308 + }, + { + "epoch": 2.4226813506871756, + "grad_norm": 6.53125, + "learning_rate": 2.7162545745776696e-06, + "loss": 1.2515337467193604, + "step": 13310 + }, + { + "epoch": 2.423045417311368, + "grad_norm": 22.25, + "learning_rate": 2.715379843559479e-06, + "loss": 0.9471852779388428, + "step": 13312 + }, + { + "epoch": 2.4234094839355604, + "grad_norm": 118.0, + "learning_rate": 2.7145055945459853e-06, + "loss": 1.4826947450637817, + "step": 13314 + }, + { + "epoch": 2.423773550559752, + "grad_norm": 11.5, + "learning_rate": 2.7136318276654804e-06, + "loss": 1.686980128288269, + "step": 13316 + }, + { + "epoch": 2.424137617183945, + "grad_norm": 13.375, + "learning_rate": 2.7127585430461863e-06, + "loss": 1.0486658811569214, + "step": 13318 + }, + { + "epoch": 2.424501683808137, + "grad_norm": 7.90625, + "learning_rate": 2.711885740816254e-06, + "loss": 1.0649824142456055, + "step": 13320 + }, + { + "epoch": 2.424865750432329, + "grad_norm": 14.9375, + "learning_rate": 2.7110134211037635e-06, + "loss": 1.705924391746521, + "step": 13322 + }, + { + "epoch": 2.4252298170565214, + "grad_norm": 13.5625, + "learning_rate": 2.710141584036725e-06, + "loss": 1.4834460020065308, + "step": 13324 + }, + { + "epoch": 2.4255938836807136, + "grad_norm": 19.25, + "learning_rate": 2.7092702297430757e-06, + "loss": 1.5113946199417114, + "step": 13326 + }, + { + "epoch": 2.425957950304906, + "grad_norm": 24.0, + "learning_rate": 2.708399358350684e-06, + "loss": 0.6128663420677185, + "step": 13328 + }, + { + "epoch": 2.426322016929098, + "grad_norm": 33.25, + "learning_rate": 2.7075289699873476e-06, + "loss": 0.42841148376464844, + "step": 13330 + }, + { + "epoch": 2.42668608355329, + "grad_norm": 36.0, + "learning_rate": 2.7066590647807907e-06, + "loss": 0.8327840566635132, + "step": 13332 + }, + { + "epoch": 2.4270501501774824, + "grad_norm": 10.625, + "learning_rate": 2.7057896428586694e-06, + "loss": 1.3326199054718018, + "step": 13334 + }, + { + "epoch": 2.4274142168016746, + "grad_norm": 10.5625, + "learning_rate": 2.7049207043485683e-06, + "loss": 1.5825438499450684, + "step": 13336 + }, + { + "epoch": 2.4277782834258668, + "grad_norm": 9.125, + "learning_rate": 2.704052249378e-06, + "loss": 1.827541470527649, + "step": 13338 + }, + { + "epoch": 2.4281423500500594, + "grad_norm": 14.0, + "learning_rate": 2.7031842780744065e-06, + "loss": 1.390425682067871, + "step": 13340 + }, + { + "epoch": 2.428506416674251, + "grad_norm": 19.625, + "learning_rate": 2.702316790565159e-06, + "loss": 1.4069594144821167, + "step": 13342 + }, + { + "epoch": 2.428870483298444, + "grad_norm": 11.3125, + "learning_rate": 2.701449786977559e-06, + "loss": 1.091294765472412, + "step": 13344 + }, + { + "epoch": 2.429234549922636, + "grad_norm": 5.59375, + "learning_rate": 2.7005832674388342e-06, + "loss": 0.9225186109542847, + "step": 13346 + }, + { + "epoch": 2.429598616546828, + "grad_norm": 8.125, + "learning_rate": 2.6997172320761445e-06, + "loss": 1.338148593902588, + "step": 13348 + }, + { + "epoch": 2.4299626831710204, + "grad_norm": 8.0625, + "learning_rate": 2.698851681016575e-06, + "loss": 1.3794116973876953, + "step": 13350 + }, + { + "epoch": 2.4303267497952126, + "grad_norm": 6.59375, + "learning_rate": 2.697986614387143e-06, + "loss": 1.2935271263122559, + "step": 13352 + }, + { + "epoch": 2.4306908164194048, + "grad_norm": 7.09375, + "learning_rate": 2.697122032314794e-06, + "loss": 1.3203860521316528, + "step": 13354 + }, + { + "epoch": 2.431054883043597, + "grad_norm": 8.875, + "learning_rate": 2.696257934926401e-06, + "loss": 1.3398065567016602, + "step": 13356 + }, + { + "epoch": 2.431418949667789, + "grad_norm": 6.15625, + "learning_rate": 2.695394322348768e-06, + "loss": 1.2970964908599854, + "step": 13358 + }, + { + "epoch": 2.4317830162919813, + "grad_norm": 8.5, + "learning_rate": 2.694531194708625e-06, + "loss": 1.4082047939300537, + "step": 13360 + }, + { + "epoch": 2.4321470829161735, + "grad_norm": 8.5625, + "learning_rate": 2.6936685521326335e-06, + "loss": 1.0450191497802734, + "step": 13362 + }, + { + "epoch": 2.4325111495403657, + "grad_norm": 8.875, + "learning_rate": 2.6928063947473825e-06, + "loss": 0.38913848996162415, + "step": 13364 + }, + { + "epoch": 2.432875216164558, + "grad_norm": 24.125, + "learning_rate": 2.6919447226793897e-06, + "loss": 1.2759171724319458, + "step": 13366 + }, + { + "epoch": 2.43323928278875, + "grad_norm": 4.40625, + "learning_rate": 2.6910835360551037e-06, + "loss": 1.3626065254211426, + "step": 13368 + }, + { + "epoch": 2.4336033494129428, + "grad_norm": 13.8125, + "learning_rate": 2.6902228350008975e-06, + "loss": 1.3550106287002563, + "step": 13370 + }, + { + "epoch": 2.433967416037135, + "grad_norm": 11.25, + "learning_rate": 2.6893626196430776e-06, + "loss": 1.5025393962860107, + "step": 13372 + }, + { + "epoch": 2.434331482661327, + "grad_norm": 7.375, + "learning_rate": 2.6885028901078763e-06, + "loss": 1.3633432388305664, + "step": 13374 + }, + { + "epoch": 2.4346955492855193, + "grad_norm": 14.625, + "learning_rate": 2.687643646521456e-06, + "loss": 1.3471213579177856, + "step": 13376 + }, + { + "epoch": 2.4350596159097115, + "grad_norm": 26.25, + "learning_rate": 2.6867848890099064e-06, + "loss": 1.6405763626098633, + "step": 13378 + }, + { + "epoch": 2.4354236825339037, + "grad_norm": 10.5, + "learning_rate": 2.685926617699247e-06, + "loss": 1.96724271774292, + "step": 13380 + }, + { + "epoch": 2.435787749158096, + "grad_norm": 21.0, + "learning_rate": 2.6850688327154256e-06, + "loss": 1.5573025941848755, + "step": 13382 + }, + { + "epoch": 2.436151815782288, + "grad_norm": 32.25, + "learning_rate": 2.684211534184319e-06, + "loss": 1.3523973226547241, + "step": 13384 + }, + { + "epoch": 2.4365158824064803, + "grad_norm": 22.875, + "learning_rate": 2.683354722231732e-06, + "loss": 1.4019713401794434, + "step": 13386 + }, + { + "epoch": 2.4368799490306725, + "grad_norm": 25.5, + "learning_rate": 2.6824983969833983e-06, + "loss": 1.538862943649292, + "step": 13388 + }, + { + "epoch": 2.4372440156548647, + "grad_norm": 19.5, + "learning_rate": 2.68164255856498e-06, + "loss": 1.3116371631622314, + "step": 13390 + }, + { + "epoch": 2.437608082279057, + "grad_norm": 20.25, + "learning_rate": 2.6807872071020684e-06, + "loss": 1.4484238624572754, + "step": 13392 + }, + { + "epoch": 2.437972148903249, + "grad_norm": 24.875, + "learning_rate": 2.6799323427201823e-06, + "loss": 1.6456549167633057, + "step": 13394 + }, + { + "epoch": 2.4383362155274417, + "grad_norm": 3.5625, + "learning_rate": 2.67907796554477e-06, + "loss": 1.0868059396743774, + "step": 13396 + }, + { + "epoch": 2.438700282151634, + "grad_norm": 14.5625, + "learning_rate": 2.678224075701208e-06, + "loss": 1.248133897781372, + "step": 13398 + }, + { + "epoch": 2.439064348775826, + "grad_norm": 16.0, + "learning_rate": 2.677370673314801e-06, + "loss": 1.6528799533843994, + "step": 13400 + }, + { + "epoch": 2.4394284154000183, + "grad_norm": 12.75, + "learning_rate": 2.6765177585107816e-06, + "loss": 1.24089777469635, + "step": 13402 + }, + { + "epoch": 2.4397924820242105, + "grad_norm": 18.0, + "learning_rate": 2.6756653314143124e-06, + "loss": 1.7661129236221313, + "step": 13404 + }, + { + "epoch": 2.4401565486484027, + "grad_norm": 19.625, + "learning_rate": 2.674813392150484e-06, + "loss": 1.2290040254592896, + "step": 13406 + }, + { + "epoch": 2.440520615272595, + "grad_norm": 4.8125, + "learning_rate": 2.673961940844314e-06, + "loss": 0.9780008792877197, + "step": 13408 + }, + { + "epoch": 2.440884681896787, + "grad_norm": 17.75, + "learning_rate": 2.67311097762075e-06, + "loss": 2.018935203552246, + "step": 13410 + }, + { + "epoch": 2.4412487485209793, + "grad_norm": 11.6875, + "learning_rate": 2.672260502604667e-06, + "loss": 1.2240355014801025, + "step": 13412 + }, + { + "epoch": 2.4416128151451715, + "grad_norm": 7.9375, + "learning_rate": 2.6714105159208693e-06, + "loss": 1.2782444953918457, + "step": 13414 + }, + { + "epoch": 2.4419768817693637, + "grad_norm": 8.875, + "learning_rate": 2.6705610176940887e-06, + "loss": 1.0177124738693237, + "step": 13416 + }, + { + "epoch": 2.442340948393556, + "grad_norm": 10.8125, + "learning_rate": 2.6697120080489864e-06, + "loss": 1.7586997747421265, + "step": 13418 + }, + { + "epoch": 2.442705015017748, + "grad_norm": 10.5625, + "learning_rate": 2.66886348711015e-06, + "loss": 1.1449360847473145, + "step": 13420 + }, + { + "epoch": 2.4430690816419407, + "grad_norm": 6.90625, + "learning_rate": 2.6680154550020972e-06, + "loss": 1.1197320222854614, + "step": 13422 + }, + { + "epoch": 2.443433148266133, + "grad_norm": 14.5, + "learning_rate": 2.6671679118492727e-06, + "loss": 1.4658730030059814, + "step": 13424 + }, + { + "epoch": 2.443797214890325, + "grad_norm": 73.0, + "learning_rate": 2.6663208577760503e-06, + "loss": 1.6675937175750732, + "step": 13426 + }, + { + "epoch": 2.4441612815145173, + "grad_norm": 18.5, + "learning_rate": 2.665474292906732e-06, + "loss": 1.6271227598190308, + "step": 13428 + }, + { + "epoch": 2.4445253481387095, + "grad_norm": 10.625, + "learning_rate": 2.6646282173655473e-06, + "loss": 1.274809718132019, + "step": 13430 + }, + { + "epoch": 2.4448894147629017, + "grad_norm": 65.0, + "learning_rate": 2.6637826312766545e-06, + "loss": 1.2878271341323853, + "step": 13432 + }, + { + "epoch": 2.445253481387094, + "grad_norm": 6.40625, + "learning_rate": 2.6629375347641406e-06, + "loss": 1.2144771814346313, + "step": 13434 + }, + { + "epoch": 2.445617548011286, + "grad_norm": 5.71875, + "learning_rate": 2.66209292795202e-06, + "loss": 1.2489968538284302, + "step": 13436 + }, + { + "epoch": 2.4459816146354783, + "grad_norm": 13.375, + "learning_rate": 2.6612488109642338e-06, + "loss": 1.2636427879333496, + "step": 13438 + }, + { + "epoch": 2.4463456812596704, + "grad_norm": 21.875, + "learning_rate": 2.660405183924654e-06, + "loss": 2.0725436210632324, + "step": 13440 + }, + { + "epoch": 2.4467097478838626, + "grad_norm": 22.75, + "learning_rate": 2.65956204695708e-06, + "loss": 1.2559139728546143, + "step": 13442 + }, + { + "epoch": 2.447073814508055, + "grad_norm": 8.4375, + "learning_rate": 2.658719400185237e-06, + "loss": 1.4041410684585571, + "step": 13444 + }, + { + "epoch": 2.447437881132247, + "grad_norm": 63.75, + "learning_rate": 2.6578772437327815e-06, + "loss": 1.3121445178985596, + "step": 13446 + }, + { + "epoch": 2.4478019477564397, + "grad_norm": 11.4375, + "learning_rate": 2.6570355777232966e-06, + "loss": 1.407128095626831, + "step": 13448 + }, + { + "epoch": 2.4481660143806314, + "grad_norm": 12.25, + "learning_rate": 2.656194402280292e-06, + "loss": 1.1246237754821777, + "step": 13450 + }, + { + "epoch": 2.448530081004824, + "grad_norm": 29.0, + "learning_rate": 2.6553537175272074e-06, + "loss": 1.1758387088775635, + "step": 13452 + }, + { + "epoch": 2.4488941476290162, + "grad_norm": 12.8125, + "learning_rate": 2.6545135235874108e-06, + "loss": 1.7648210525512695, + "step": 13454 + }, + { + "epoch": 2.4492582142532084, + "grad_norm": 5.34375, + "learning_rate": 2.653673820584196e-06, + "loss": 1.1780683994293213, + "step": 13456 + }, + { + "epoch": 2.4496222808774006, + "grad_norm": 7.1875, + "learning_rate": 2.6528346086407868e-06, + "loss": 1.2844581604003906, + "step": 13458 + }, + { + "epoch": 2.449986347501593, + "grad_norm": 14.75, + "learning_rate": 2.6519958878803342e-06, + "loss": 1.333465576171875, + "step": 13460 + }, + { + "epoch": 2.450350414125785, + "grad_norm": 26.75, + "learning_rate": 2.651157658425916e-06, + "loss": 1.3125548362731934, + "step": 13462 + }, + { + "epoch": 2.450714480749977, + "grad_norm": 30.75, + "learning_rate": 2.650319920400541e-06, + "loss": 1.4113140106201172, + "step": 13464 + }, + { + "epoch": 2.4510785473741694, + "grad_norm": 15.125, + "learning_rate": 2.649482673927142e-06, + "loss": 1.3063167333602905, + "step": 13466 + }, + { + "epoch": 2.4514426139983616, + "grad_norm": 10.3125, + "learning_rate": 2.6486459191285815e-06, + "loss": 1.2190064191818237, + "step": 13468 + }, + { + "epoch": 2.451806680622554, + "grad_norm": 17.0, + "learning_rate": 2.647809656127651e-06, + "loss": 1.4076989889144897, + "step": 13470 + }, + { + "epoch": 2.452170747246746, + "grad_norm": 12.625, + "learning_rate": 2.646973885047068e-06, + "loss": 1.3949708938598633, + "step": 13472 + }, + { + "epoch": 2.452534813870938, + "grad_norm": 9.25, + "learning_rate": 2.6461386060094796e-06, + "loss": 1.10963773727417, + "step": 13474 + }, + { + "epoch": 2.4528988804951304, + "grad_norm": 5.875, + "learning_rate": 2.645303819137458e-06, + "loss": 1.3966336250305176, + "step": 13476 + }, + { + "epoch": 2.453262947119323, + "grad_norm": 24.0, + "learning_rate": 2.6444695245535058e-06, + "loss": 1.0470508337020874, + "step": 13478 + }, + { + "epoch": 2.453627013743515, + "grad_norm": 10.0625, + "learning_rate": 2.643635722380052e-06, + "loss": 1.3997278213500977, + "step": 13480 + }, + { + "epoch": 2.4539910803677074, + "grad_norm": 12.4375, + "learning_rate": 2.6428024127394536e-06, + "loss": 0.9530977010726929, + "step": 13482 + }, + { + "epoch": 2.4543551469918996, + "grad_norm": 28.5, + "learning_rate": 2.641969595753996e-06, + "loss": 0.47040119767189026, + "step": 13484 + }, + { + "epoch": 2.454719213616092, + "grad_norm": 12.6875, + "learning_rate": 2.6411372715458905e-06, + "loss": 1.276247501373291, + "step": 13486 + }, + { + "epoch": 2.455083280240284, + "grad_norm": 19.5, + "learning_rate": 2.640305440237279e-06, + "loss": 1.4224942922592163, + "step": 13488 + }, + { + "epoch": 2.455447346864476, + "grad_norm": 9.125, + "learning_rate": 2.6394741019502285e-06, + "loss": 1.6374298334121704, + "step": 13490 + }, + { + "epoch": 2.4558114134886684, + "grad_norm": 5.4375, + "learning_rate": 2.6386432568067343e-06, + "loss": 1.1236530542373657, + "step": 13492 + }, + { + "epoch": 2.4561754801128606, + "grad_norm": 18.0, + "learning_rate": 2.6378129049287193e-06, + "loss": 1.5757429599761963, + "step": 13494 + }, + { + "epoch": 2.4565395467370528, + "grad_norm": 10.1875, + "learning_rate": 2.636983046438035e-06, + "loss": 1.9966856241226196, + "step": 13496 + }, + { + "epoch": 2.456903613361245, + "grad_norm": 47.25, + "learning_rate": 2.63615368145646e-06, + "loss": 1.3129832744598389, + "step": 13498 + }, + { + "epoch": 2.457267679985437, + "grad_norm": 33.25, + "learning_rate": 2.6353248101056995e-06, + "loss": 1.5786428451538086, + "step": 13500 + }, + { + "epoch": 2.4576317466096294, + "grad_norm": 21.375, + "learning_rate": 2.6344964325073873e-06, + "loss": 1.2999707460403442, + "step": 13502 + }, + { + "epoch": 2.457995813233822, + "grad_norm": 6.625, + "learning_rate": 2.633668548783084e-06, + "loss": 1.1972272396087646, + "step": 13504 + }, + { + "epoch": 2.458359879858014, + "grad_norm": 5.3125, + "learning_rate": 2.6328411590542795e-06, + "loss": 1.1587053537368774, + "step": 13506 + }, + { + "epoch": 2.4587239464822064, + "grad_norm": 7.46875, + "learning_rate": 2.6320142634423885e-06, + "loss": 1.5507616996765137, + "step": 13508 + }, + { + "epoch": 2.4590880131063986, + "grad_norm": 11.4375, + "learning_rate": 2.6311878620687546e-06, + "loss": 1.3037623167037964, + "step": 13510 + }, + { + "epoch": 2.4594520797305908, + "grad_norm": 5.84375, + "learning_rate": 2.63036195505465e-06, + "loss": 1.182337760925293, + "step": 13512 + }, + { + "epoch": 2.459816146354783, + "grad_norm": 17.625, + "learning_rate": 2.6295365425212727e-06, + "loss": 1.3224929571151733, + "step": 13514 + }, + { + "epoch": 2.460180212978975, + "grad_norm": 13.1875, + "learning_rate": 2.628711624589748e-06, + "loss": 1.0804643630981445, + "step": 13516 + }, + { + "epoch": 2.4605442796031674, + "grad_norm": 13.5625, + "learning_rate": 2.6278872013811296e-06, + "loss": 1.0033141374588013, + "step": 13518 + }, + { + "epoch": 2.4609083462273595, + "grad_norm": 15.625, + "learning_rate": 2.6270632730163993e-06, + "loss": 1.5021710395812988, + "step": 13520 + }, + { + "epoch": 2.4612724128515517, + "grad_norm": 12.3125, + "learning_rate": 2.6262398396164635e-06, + "loss": 1.5142216682434082, + "step": 13522 + }, + { + "epoch": 2.461636479475744, + "grad_norm": 9.875, + "learning_rate": 2.6254169013021584e-06, + "loss": 1.4393287897109985, + "step": 13524 + }, + { + "epoch": 2.462000546099936, + "grad_norm": 5.59375, + "learning_rate": 2.6245944581942478e-06, + "loss": 1.422282099723816, + "step": 13526 + }, + { + "epoch": 2.4623646127241283, + "grad_norm": 6.15625, + "learning_rate": 2.623772510413421e-06, + "loss": 1.2538256645202637, + "step": 13528 + }, + { + "epoch": 2.462728679348321, + "grad_norm": 17.125, + "learning_rate": 2.622951058080296e-06, + "loss": 1.4100571870803833, + "step": 13530 + }, + { + "epoch": 2.463092745972513, + "grad_norm": 10.125, + "learning_rate": 2.6221301013154165e-06, + "loss": 1.3595296144485474, + "step": 13532 + }, + { + "epoch": 2.4634568125967053, + "grad_norm": 12.9375, + "learning_rate": 2.621309640239256e-06, + "loss": 1.3561580181121826, + "step": 13534 + }, + { + "epoch": 2.4638208792208975, + "grad_norm": 12.0, + "learning_rate": 2.620489674972212e-06, + "loss": 1.3407347202301025, + "step": 13536 + }, + { + "epoch": 2.4641849458450897, + "grad_norm": 19.875, + "learning_rate": 2.619670205634613e-06, + "loss": 1.353811264038086, + "step": 13538 + }, + { + "epoch": 2.464549012469282, + "grad_norm": 5.25, + "learning_rate": 2.618851232346712e-06, + "loss": 1.2983731031417847, + "step": 13540 + }, + { + "epoch": 2.464913079093474, + "grad_norm": 5.78125, + "learning_rate": 2.61803275522869e-06, + "loss": 1.2819554805755615, + "step": 13542 + }, + { + "epoch": 2.4652771457176663, + "grad_norm": 18.125, + "learning_rate": 2.617214774400656e-06, + "loss": 1.2022639513015747, + "step": 13544 + }, + { + "epoch": 2.4656412123418585, + "grad_norm": 18.125, + "learning_rate": 2.6163972899826436e-06, + "loss": 2.0331835746765137, + "step": 13546 + }, + { + "epoch": 2.4660052789660507, + "grad_norm": 8.9375, + "learning_rate": 2.6155803020946164e-06, + "loss": 1.044741153717041, + "step": 13548 + }, + { + "epoch": 2.466369345590243, + "grad_norm": 10.4375, + "learning_rate": 2.6147638108564644e-06, + "loss": 1.3638688325881958, + "step": 13550 + }, + { + "epoch": 2.466733412214435, + "grad_norm": 5.34375, + "learning_rate": 2.613947816388004e-06, + "loss": 1.2540897130966187, + "step": 13552 + }, + { + "epoch": 2.4670974788386273, + "grad_norm": 8.0625, + "learning_rate": 2.6131323188089793e-06, + "loss": 1.0862540006637573, + "step": 13554 + }, + { + "epoch": 2.46746154546282, + "grad_norm": 17.0, + "learning_rate": 2.6123173182390605e-06, + "loss": 1.3320529460906982, + "step": 13556 + }, + { + "epoch": 2.4678256120870117, + "grad_norm": 6.28125, + "learning_rate": 2.611502814797846e-06, + "loss": 1.2859331369400024, + "step": 13558 + }, + { + "epoch": 2.4681896787112043, + "grad_norm": 12.125, + "learning_rate": 2.610688808604862e-06, + "loss": 0.9958328008651733, + "step": 13560 + }, + { + "epoch": 2.4685537453353965, + "grad_norm": 14.875, + "learning_rate": 2.609875299779559e-06, + "loss": 1.829673171043396, + "step": 13562 + }, + { + "epoch": 2.4689178119595887, + "grad_norm": 15.75, + "learning_rate": 2.609062288441317e-06, + "loss": 1.5439484119415283, + "step": 13564 + }, + { + "epoch": 2.469281878583781, + "grad_norm": 56.0, + "learning_rate": 2.6082497747094416e-06, + "loss": 1.1055101156234741, + "step": 13566 + }, + { + "epoch": 2.469645945207973, + "grad_norm": 6.875, + "learning_rate": 2.6074377587031663e-06, + "loss": 1.164146900177002, + "step": 13568 + }, + { + "epoch": 2.4700100118321653, + "grad_norm": 18.125, + "learning_rate": 2.6066262405416514e-06, + "loss": 1.1970126628875732, + "step": 13570 + }, + { + "epoch": 2.4703740784563575, + "grad_norm": 34.75, + "learning_rate": 2.6058152203439833e-06, + "loss": 1.2932348251342773, + "step": 13572 + }, + { + "epoch": 2.4707381450805497, + "grad_norm": 34.75, + "learning_rate": 2.6050046982291766e-06, + "loss": 1.0211294889450073, + "step": 13574 + }, + { + "epoch": 2.471102211704742, + "grad_norm": 38.75, + "learning_rate": 2.604194674316171e-06, + "loss": 1.4936089515686035, + "step": 13576 + }, + { + "epoch": 2.471466278328934, + "grad_norm": 15.375, + "learning_rate": 2.6033851487238352e-06, + "loss": 1.3037607669830322, + "step": 13578 + }, + { + "epoch": 2.4718303449531263, + "grad_norm": 4.4375, + "learning_rate": 2.6025761215709633e-06, + "loss": 1.1959121227264404, + "step": 13580 + }, + { + "epoch": 2.472194411577319, + "grad_norm": 3.59375, + "learning_rate": 2.6017675929762775e-06, + "loss": 1.0831255912780762, + "step": 13582 + }, + { + "epoch": 2.4725584782015106, + "grad_norm": 4.40625, + "learning_rate": 2.6009595630584255e-06, + "loss": 1.3629963397979736, + "step": 13584 + }, + { + "epoch": 2.4729225448257033, + "grad_norm": 18.75, + "learning_rate": 2.6001520319359823e-06, + "loss": 1.3672566413879395, + "step": 13586 + }, + { + "epoch": 2.4732866114498955, + "grad_norm": 12.6875, + "learning_rate": 2.5993449997274506e-06, + "loss": 1.143486499786377, + "step": 13588 + }, + { + "epoch": 2.4736506780740877, + "grad_norm": 4.71875, + "learning_rate": 2.598538466551258e-06, + "loss": 1.1877418756484985, + "step": 13590 + }, + { + "epoch": 2.47401474469828, + "grad_norm": 2.265625, + "learning_rate": 2.5977324325257606e-06, + "loss": 0.8492780923843384, + "step": 13592 + }, + { + "epoch": 2.474378811322472, + "grad_norm": 10.375, + "learning_rate": 2.5969268977692407e-06, + "loss": 0.29919368028640747, + "step": 13594 + }, + { + "epoch": 2.4747428779466643, + "grad_norm": 8.0, + "learning_rate": 2.596121862399907e-06, + "loss": 0.8483954071998596, + "step": 13596 + }, + { + "epoch": 2.4751069445708564, + "grad_norm": 7.28125, + "learning_rate": 2.5953173265358956e-06, + "loss": 1.230735182762146, + "step": 13598 + }, + { + "epoch": 2.4754710111950486, + "grad_norm": 12.1875, + "learning_rate": 2.594513290295268e-06, + "loss": 1.4282658100128174, + "step": 13600 + }, + { + "epoch": 2.475835077819241, + "grad_norm": 26.5, + "learning_rate": 2.5937097537960136e-06, + "loss": 1.5406758785247803, + "step": 13602 + }, + { + "epoch": 2.476199144443433, + "grad_norm": 10.3125, + "learning_rate": 2.592906717156049e-06, + "loss": 1.286152720451355, + "step": 13604 + }, + { + "epoch": 2.4765632110676252, + "grad_norm": 4.28125, + "learning_rate": 2.5921041804932155e-06, + "loss": 0.8500762581825256, + "step": 13606 + }, + { + "epoch": 2.4769272776918174, + "grad_norm": 9.0, + "learning_rate": 2.5913021439252826e-06, + "loss": 1.3473135232925415, + "step": 13608 + }, + { + "epoch": 2.4772913443160096, + "grad_norm": 62.75, + "learning_rate": 2.5905006075699462e-06, + "loss": 1.3492763042449951, + "step": 13610 + }, + { + "epoch": 2.4776554109402023, + "grad_norm": 25.375, + "learning_rate": 2.589699571544828e-06, + "loss": 1.372023105621338, + "step": 13612 + }, + { + "epoch": 2.4780194775643944, + "grad_norm": 25.125, + "learning_rate": 2.5888990359674767e-06, + "loss": 1.7664756774902344, + "step": 13614 + }, + { + "epoch": 2.4783835441885866, + "grad_norm": 74.0, + "learning_rate": 2.588099000955368e-06, + "loss": 1.2682101726531982, + "step": 13616 + }, + { + "epoch": 2.478747610812779, + "grad_norm": 7.28125, + "learning_rate": 2.5872994666259037e-06, + "loss": 1.0988736152648926, + "step": 13618 + }, + { + "epoch": 2.479111677436971, + "grad_norm": 10.5625, + "learning_rate": 2.5865004330964126e-06, + "loss": 1.2914401292800903, + "step": 13620 + }, + { + "epoch": 2.4794757440611632, + "grad_norm": 9.4375, + "learning_rate": 2.585701900484149e-06, + "loss": 1.4311060905456543, + "step": 13622 + }, + { + "epoch": 2.4798398106853554, + "grad_norm": 5.34375, + "learning_rate": 2.5849038689062944e-06, + "loss": 0.9893988370895386, + "step": 13624 + }, + { + "epoch": 2.4802038773095476, + "grad_norm": 19.375, + "learning_rate": 2.5841063384799563e-06, + "loss": 0.9233954548835754, + "step": 13626 + }, + { + "epoch": 2.48056794393374, + "grad_norm": 8.625, + "learning_rate": 2.5833093093221708e-06, + "loss": 1.1942216157913208, + "step": 13628 + }, + { + "epoch": 2.480932010557932, + "grad_norm": 5.9375, + "learning_rate": 2.5825127815498967e-06, + "loss": 1.435471773147583, + "step": 13630 + }, + { + "epoch": 2.481296077182124, + "grad_norm": 6.46875, + "learning_rate": 2.581716755280022e-06, + "loss": 1.4254781007766724, + "step": 13632 + }, + { + "epoch": 2.4816601438063164, + "grad_norm": 8.25, + "learning_rate": 2.5809212306293606e-06, + "loss": 1.3009958267211914, + "step": 13634 + }, + { + "epoch": 2.4820242104305086, + "grad_norm": 12.5, + "learning_rate": 2.5801262077146527e-06, + "loss": 1.5462100505828857, + "step": 13636 + }, + { + "epoch": 2.482388277054701, + "grad_norm": 18.375, + "learning_rate": 2.5793316866525635e-06, + "loss": 1.1356589794158936, + "step": 13638 + }, + { + "epoch": 2.4827523436788934, + "grad_norm": 22.625, + "learning_rate": 2.5785376675596875e-06, + "loss": 1.202012062072754, + "step": 13640 + }, + { + "epoch": 2.4831164103030856, + "grad_norm": 16.625, + "learning_rate": 2.577744150552542e-06, + "loss": 1.3646906614303589, + "step": 13642 + }, + { + "epoch": 2.483480476927278, + "grad_norm": 9.0, + "learning_rate": 2.5769511357475734e-06, + "loss": 1.3165546655654907, + "step": 13644 + }, + { + "epoch": 2.48384454355147, + "grad_norm": 21.0, + "learning_rate": 2.5761586232611533e-06, + "loss": 1.2491192817687988, + "step": 13646 + }, + { + "epoch": 2.484208610175662, + "grad_norm": 10.0625, + "learning_rate": 2.57536661320958e-06, + "loss": 1.4154661893844604, + "step": 13648 + }, + { + "epoch": 2.4845726767998544, + "grad_norm": 11.25, + "learning_rate": 2.5745751057090764e-06, + "loss": 1.2536170482635498, + "step": 13650 + }, + { + "epoch": 2.4849367434240466, + "grad_norm": 14.625, + "learning_rate": 2.5737841008757945e-06, + "loss": 1.5065913200378418, + "step": 13652 + }, + { + "epoch": 2.4853008100482388, + "grad_norm": 22.125, + "learning_rate": 2.572993598825811e-06, + "loss": 1.4974873065948486, + "step": 13654 + }, + { + "epoch": 2.485664876672431, + "grad_norm": 15.4375, + "learning_rate": 2.572203599675128e-06, + "loss": 1.5454620122909546, + "step": 13656 + }, + { + "epoch": 2.486028943296623, + "grad_norm": 19.75, + "learning_rate": 2.5714141035396757e-06, + "loss": 1.4531348943710327, + "step": 13658 + }, + { + "epoch": 2.4863930099208154, + "grad_norm": 12.875, + "learning_rate": 2.570625110535308e-06, + "loss": 1.4217097759246826, + "step": 13660 + }, + { + "epoch": 2.4867570765450075, + "grad_norm": 21.125, + "learning_rate": 2.5698366207778073e-06, + "loss": 1.3445327281951904, + "step": 13662 + }, + { + "epoch": 2.4871211431692, + "grad_norm": 20.0, + "learning_rate": 2.5690486343828817e-06, + "loss": 1.351833701133728, + "step": 13664 + }, + { + "epoch": 2.4874852097933924, + "grad_norm": 15.1875, + "learning_rate": 2.568261151466165e-06, + "loss": 1.2386531829833984, + "step": 13666 + }, + { + "epoch": 2.4878492764175846, + "grad_norm": 14.125, + "learning_rate": 2.5674741721432162e-06, + "loss": 1.2128888368606567, + "step": 13668 + }, + { + "epoch": 2.4882133430417768, + "grad_norm": 16.25, + "learning_rate": 2.5666876965295216e-06, + "loss": 0.8626888990402222, + "step": 13670 + }, + { + "epoch": 2.488577409665969, + "grad_norm": 6.875, + "learning_rate": 2.5659017247404938e-06, + "loss": 1.3152241706848145, + "step": 13672 + }, + { + "epoch": 2.488941476290161, + "grad_norm": 48.0, + "learning_rate": 2.5651162568914707e-06, + "loss": 1.4157578945159912, + "step": 13674 + }, + { + "epoch": 2.4893055429143534, + "grad_norm": 21.125, + "learning_rate": 2.5643312930977168e-06, + "loss": 1.6922650337219238, + "step": 13676 + }, + { + "epoch": 2.4896696095385455, + "grad_norm": 21.375, + "learning_rate": 2.563546833474421e-06, + "loss": 1.460087537765503, + "step": 13678 + }, + { + "epoch": 2.4900336761627377, + "grad_norm": 150.0, + "learning_rate": 2.562762878136702e-06, + "loss": 1.1895815134048462, + "step": 13680 + }, + { + "epoch": 2.49039774278693, + "grad_norm": 30.125, + "learning_rate": 2.5619794271996e-06, + "loss": 0.7901835441589355, + "step": 13682 + }, + { + "epoch": 2.490761809411122, + "grad_norm": 8.4375, + "learning_rate": 2.561196480778084e-06, + "loss": 1.2408534288406372, + "step": 13684 + }, + { + "epoch": 2.4911258760353143, + "grad_norm": 8.8125, + "learning_rate": 2.5604140389870487e-06, + "loss": 1.0823071002960205, + "step": 13686 + }, + { + "epoch": 2.4914899426595065, + "grad_norm": 11.875, + "learning_rate": 2.5596321019413135e-06, + "loss": 1.4040659666061401, + "step": 13688 + }, + { + "epoch": 2.491854009283699, + "grad_norm": 15.75, + "learning_rate": 2.5588506697556244e-06, + "loss": 1.3021875619888306, + "step": 13690 + }, + { + "epoch": 2.492218075907891, + "grad_norm": 6.8125, + "learning_rate": 2.558069742544654e-06, + "loss": 1.1599243879318237, + "step": 13692 + }, + { + "epoch": 2.4925821425320835, + "grad_norm": 9.5625, + "learning_rate": 2.5572893204229996e-06, + "loss": 1.4490139484405518, + "step": 13694 + }, + { + "epoch": 2.4929462091562757, + "grad_norm": 8.375, + "learning_rate": 2.5565094035051856e-06, + "loss": 1.3817355632781982, + "step": 13696 + }, + { + "epoch": 2.493310275780468, + "grad_norm": 12.8125, + "learning_rate": 2.555729991905662e-06, + "loss": 1.1381415128707886, + "step": 13698 + }, + { + "epoch": 2.49367434240466, + "grad_norm": 11.5625, + "learning_rate": 2.5549510857388033e-06, + "loss": 1.4469672441482544, + "step": 13700 + }, + { + "epoch": 2.4940384090288523, + "grad_norm": 15.375, + "learning_rate": 2.5541726851189107e-06, + "loss": 1.2025855779647827, + "step": 13702 + }, + { + "epoch": 2.4944024756530445, + "grad_norm": 12.1875, + "learning_rate": 2.5533947901602124e-06, + "loss": 0.6385471820831299, + "step": 13704 + }, + { + "epoch": 2.4947665422772367, + "grad_norm": 19.25, + "learning_rate": 2.5526174009768606e-06, + "loss": 1.451906681060791, + "step": 13706 + }, + { + "epoch": 2.495130608901429, + "grad_norm": 8.9375, + "learning_rate": 2.551840517682934e-06, + "loss": 1.4848006963729858, + "step": 13708 + }, + { + "epoch": 2.495494675525621, + "grad_norm": 7.40625, + "learning_rate": 2.551064140392438e-06, + "loss": 1.1459434032440186, + "step": 13710 + }, + { + "epoch": 2.4958587421498133, + "grad_norm": 10.0625, + "learning_rate": 2.550288269219301e-06, + "loss": 1.2502771615982056, + "step": 13712 + }, + { + "epoch": 2.4962228087740055, + "grad_norm": 9.25, + "learning_rate": 2.549512904277381e-06, + "loss": 1.2116224765777588, + "step": 13714 + }, + { + "epoch": 2.4965868753981977, + "grad_norm": 10.9375, + "learning_rate": 2.5487380456804585e-06, + "loss": 1.691530466079712, + "step": 13716 + }, + { + "epoch": 2.49695094202239, + "grad_norm": 15.3125, + "learning_rate": 2.5479636935422403e-06, + "loss": 1.3164284229278564, + "step": 13718 + }, + { + "epoch": 2.4973150086465825, + "grad_norm": 10.375, + "learning_rate": 2.5471898479763614e-06, + "loss": 1.2102293968200684, + "step": 13720 + }, + { + "epoch": 2.4976790752707747, + "grad_norm": 4.40625, + "learning_rate": 2.5464165090963783e-06, + "loss": 1.0726637840270996, + "step": 13722 + }, + { + "epoch": 2.498043141894967, + "grad_norm": 7.53125, + "learning_rate": 2.5456436770157766e-06, + "loss": 1.4328835010528564, + "step": 13724 + }, + { + "epoch": 2.498407208519159, + "grad_norm": 11.3125, + "learning_rate": 2.5448713518479663e-06, + "loss": 1.1862092018127441, + "step": 13726 + }, + { + "epoch": 2.4987712751433513, + "grad_norm": 14.0, + "learning_rate": 2.5440995337062817e-06, + "loss": 1.467098593711853, + "step": 13728 + }, + { + "epoch": 2.4991353417675435, + "grad_norm": 9.3125, + "learning_rate": 2.5433282227039864e-06, + "loss": 1.6445116996765137, + "step": 13730 + }, + { + "epoch": 2.4994994083917357, + "grad_norm": 15.75, + "learning_rate": 2.542557418954265e-06, + "loss": 1.1949834823608398, + "step": 13732 + }, + { + "epoch": 2.499863475015928, + "grad_norm": 28.0, + "learning_rate": 2.5417871225702307e-06, + "loss": 1.7359752655029297, + "step": 13734 + }, + { + "epoch": 2.50022754164012, + "grad_norm": 17.75, + "learning_rate": 2.5410173336649213e-06, + "loss": 1.80339777469635, + "step": 13736 + }, + { + "epoch": 2.5005916082643123, + "grad_norm": 6.9375, + "learning_rate": 2.5402480523512994e-06, + "loss": 1.3936741352081299, + "step": 13738 + }, + { + "epoch": 2.5009556748885045, + "grad_norm": 16.125, + "learning_rate": 2.5394792787422562e-06, + "loss": 1.1109888553619385, + "step": 13740 + }, + { + "epoch": 2.501319741512697, + "grad_norm": 26.125, + "learning_rate": 2.538711012950603e-06, + "loss": 1.4367766380310059, + "step": 13742 + }, + { + "epoch": 2.501683808136889, + "grad_norm": 16.375, + "learning_rate": 2.5379432550890826e-06, + "loss": 1.556336760520935, + "step": 13744 + }, + { + "epoch": 2.5020478747610815, + "grad_norm": 17.25, + "learning_rate": 2.5371760052703586e-06, + "loss": 1.6233019828796387, + "step": 13746 + }, + { + "epoch": 2.5024119413852732, + "grad_norm": 11.25, + "learning_rate": 2.536409263607022e-06, + "loss": 1.3923618793487549, + "step": 13748 + }, + { + "epoch": 2.502776008009466, + "grad_norm": 21.375, + "learning_rate": 2.535643030211589e-06, + "loss": 1.0545963048934937, + "step": 13750 + }, + { + "epoch": 2.503140074633658, + "grad_norm": 15.3125, + "learning_rate": 2.534877305196502e-06, + "loss": 1.2005292177200317, + "step": 13752 + }, + { + "epoch": 2.5035041412578503, + "grad_norm": 64.5, + "learning_rate": 2.534112088674128e-06, + "loss": 1.2051119804382324, + "step": 13754 + }, + { + "epoch": 2.5038682078820425, + "grad_norm": 17.25, + "learning_rate": 2.5333473807567577e-06, + "loss": 1.2427839040756226, + "step": 13756 + }, + { + "epoch": 2.5042322745062346, + "grad_norm": 4.5625, + "learning_rate": 2.532583181556611e-06, + "loss": 1.0875390768051147, + "step": 13758 + }, + { + "epoch": 2.504596341130427, + "grad_norm": 23.375, + "learning_rate": 2.5318194911858294e-06, + "loss": 1.4024766683578491, + "step": 13760 + }, + { + "epoch": 2.504960407754619, + "grad_norm": 10.75, + "learning_rate": 2.5310563097564834e-06, + "loss": 1.7428451776504517, + "step": 13762 + }, + { + "epoch": 2.5053244743788112, + "grad_norm": 4.25, + "learning_rate": 2.530293637380565e-06, + "loss": 1.353786587715149, + "step": 13764 + }, + { + "epoch": 2.5056885410030034, + "grad_norm": 4.25, + "learning_rate": 2.5295314741699933e-06, + "loss": 0.9711217880249023, + "step": 13766 + }, + { + "epoch": 2.5060526076271956, + "grad_norm": 18.75, + "learning_rate": 2.528769820236614e-06, + "loss": 1.3038735389709473, + "step": 13768 + }, + { + "epoch": 2.506416674251388, + "grad_norm": 25.5, + "learning_rate": 2.528008675692195e-06, + "loss": 1.3974357843399048, + "step": 13770 + }, + { + "epoch": 2.5067807408755804, + "grad_norm": 13.4375, + "learning_rate": 2.527248040648433e-06, + "loss": 1.3673768043518066, + "step": 13772 + }, + { + "epoch": 2.507144807499772, + "grad_norm": 5.1875, + "learning_rate": 2.526487915216947e-06, + "loss": 1.086998701095581, + "step": 13774 + }, + { + "epoch": 2.507508874123965, + "grad_norm": 13.875, + "learning_rate": 2.5257282995092824e-06, + "loss": 1.2242774963378906, + "step": 13776 + }, + { + "epoch": 2.507872940748157, + "grad_norm": 7.71875, + "learning_rate": 2.52496919363691e-06, + "loss": 1.471954584121704, + "step": 13778 + }, + { + "epoch": 2.5082370073723492, + "grad_norm": 10.0625, + "learning_rate": 2.5242105977112253e-06, + "loss": 1.5057036876678467, + "step": 13780 + }, + { + "epoch": 2.5086010739965414, + "grad_norm": 9.375, + "learning_rate": 2.523452511843549e-06, + "loss": 1.245390772819519, + "step": 13782 + }, + { + "epoch": 2.5089651406207336, + "grad_norm": 10.4375, + "learning_rate": 2.522694936145127e-06, + "loss": 1.4163994789123535, + "step": 13784 + }, + { + "epoch": 2.509329207244926, + "grad_norm": 9.8125, + "learning_rate": 2.5219378707271315e-06, + "loss": 1.2796659469604492, + "step": 13786 + }, + { + "epoch": 2.509693273869118, + "grad_norm": 7.25, + "learning_rate": 2.5211813157006582e-06, + "loss": 1.1654201745986938, + "step": 13788 + }, + { + "epoch": 2.51005734049331, + "grad_norm": 20.625, + "learning_rate": 2.520425271176728e-06, + "loss": 0.8108351230621338, + "step": 13790 + }, + { + "epoch": 2.5104214071175024, + "grad_norm": 11.8125, + "learning_rate": 2.519669737266288e-06, + "loss": 1.2694549560546875, + "step": 13792 + }, + { + "epoch": 2.5107854737416946, + "grad_norm": 18.25, + "learning_rate": 2.5189147140802093e-06, + "loss": 1.4437546730041504, + "step": 13794 + }, + { + "epoch": 2.511149540365887, + "grad_norm": 18.875, + "learning_rate": 2.518160201729289e-06, + "loss": 1.7303444147109985, + "step": 13796 + }, + { + "epoch": 2.5115136069900794, + "grad_norm": 8.1875, + "learning_rate": 2.5174062003242483e-06, + "loss": 1.0157277584075928, + "step": 13798 + }, + { + "epoch": 2.511877673614271, + "grad_norm": 16.125, + "learning_rate": 2.516652709975734e-06, + "loss": 1.3200199604034424, + "step": 13800 + }, + { + "epoch": 2.512241740238464, + "grad_norm": 11.75, + "learning_rate": 2.515899730794318e-06, + "loss": 0.8717083930969238, + "step": 13802 + }, + { + "epoch": 2.512605806862656, + "grad_norm": 31.625, + "learning_rate": 2.515147262890496e-06, + "loss": 0.9612295627593994, + "step": 13804 + }, + { + "epoch": 2.512969873486848, + "grad_norm": 10.1875, + "learning_rate": 2.5143953063746907e-06, + "loss": 1.7548043727874756, + "step": 13806 + }, + { + "epoch": 2.5133339401110404, + "grad_norm": 47.0, + "learning_rate": 2.5136438613572486e-06, + "loss": 1.640061378479004, + "step": 13808 + }, + { + "epoch": 2.5136980067352326, + "grad_norm": 19.25, + "learning_rate": 2.5128929279484406e-06, + "loss": 1.9909260272979736, + "step": 13810 + }, + { + "epoch": 2.5140620733594248, + "grad_norm": 29.0, + "learning_rate": 2.512142506258463e-06, + "loss": 1.9370267391204834, + "step": 13812 + }, + { + "epoch": 2.514426139983617, + "grad_norm": 13.5625, + "learning_rate": 2.5113925963974376e-06, + "loss": 1.7296578884124756, + "step": 13814 + }, + { + "epoch": 2.514790206607809, + "grad_norm": 37.75, + "learning_rate": 2.5106431984754107e-06, + "loss": 1.53657066822052, + "step": 13816 + }, + { + "epoch": 2.5151542732320014, + "grad_norm": 19.25, + "learning_rate": 2.509894312602354e-06, + "loss": 0.5026533007621765, + "step": 13818 + }, + { + "epoch": 2.5155183398561936, + "grad_norm": 30.75, + "learning_rate": 2.509145938888162e-06, + "loss": 1.3250442743301392, + "step": 13820 + }, + { + "epoch": 2.5158824064803857, + "grad_norm": 16.375, + "learning_rate": 2.508398077442657e-06, + "loss": 1.3676886558532715, + "step": 13822 + }, + { + "epoch": 2.5162464731045784, + "grad_norm": 11.375, + "learning_rate": 2.507650728375583e-06, + "loss": 1.4402050971984863, + "step": 13824 + }, + { + "epoch": 2.51661053972877, + "grad_norm": 23.125, + "learning_rate": 2.506903891796612e-06, + "loss": 1.4431074857711792, + "step": 13826 + }, + { + "epoch": 2.5169746063529628, + "grad_norm": 47.5, + "learning_rate": 2.5061575678153384e-06, + "loss": 0.8885664939880371, + "step": 13828 + }, + { + "epoch": 2.517338672977155, + "grad_norm": 14.5, + "learning_rate": 2.505411756541282e-06, + "loss": 1.5238444805145264, + "step": 13830 + }, + { + "epoch": 2.517702739601347, + "grad_norm": 4.59375, + "learning_rate": 2.5046664580838885e-06, + "loss": 1.3137186765670776, + "step": 13832 + }, + { + "epoch": 2.5180668062255394, + "grad_norm": 6.40625, + "learning_rate": 2.5039216725525273e-06, + "loss": 1.3961814641952515, + "step": 13834 + }, + { + "epoch": 2.5184308728497315, + "grad_norm": 10.0, + "learning_rate": 2.5031774000564914e-06, + "loss": 0.9666022062301636, + "step": 13836 + }, + { + "epoch": 2.5187949394739237, + "grad_norm": 19.5, + "learning_rate": 2.5024336407050016e-06, + "loss": 1.367753267288208, + "step": 13838 + }, + { + "epoch": 2.519159006098116, + "grad_norm": 13.375, + "learning_rate": 2.5016903946071996e-06, + "loss": 1.3994977474212646, + "step": 13840 + }, + { + "epoch": 2.519523072722308, + "grad_norm": 9.1875, + "learning_rate": 2.500947661872155e-06, + "loss": 0.9966473579406738, + "step": 13842 + }, + { + "epoch": 2.5198871393465003, + "grad_norm": 17.125, + "learning_rate": 2.500205442608861e-06, + "loss": 1.293057918548584, + "step": 13844 + }, + { + "epoch": 2.5202512059706925, + "grad_norm": 21.375, + "learning_rate": 2.499463736926235e-06, + "loss": 1.063389539718628, + "step": 13846 + }, + { + "epoch": 2.5206152725948847, + "grad_norm": 35.25, + "learning_rate": 2.4987225449331185e-06, + "loss": 1.5489585399627686, + "step": 13848 + }, + { + "epoch": 2.5209793392190774, + "grad_norm": 18.125, + "learning_rate": 2.49798186673828e-06, + "loss": 1.410095453262329, + "step": 13850 + }, + { + "epoch": 2.521343405843269, + "grad_norm": 12.1875, + "learning_rate": 2.4972417024504096e-06, + "loss": 1.4894192218780518, + "step": 13852 + }, + { + "epoch": 2.5217074724674617, + "grad_norm": 15.125, + "learning_rate": 2.496502052178124e-06, + "loss": 1.558536171913147, + "step": 13854 + }, + { + "epoch": 2.522071539091654, + "grad_norm": 11.0625, + "learning_rate": 2.495762916029964e-06, + "loss": 1.6361110210418701, + "step": 13856 + }, + { + "epoch": 2.522435605715846, + "grad_norm": 16.5, + "learning_rate": 2.4950242941143944e-06, + "loss": 1.4143916368484497, + "step": 13858 + }, + { + "epoch": 2.5227996723400383, + "grad_norm": 19.75, + "learning_rate": 2.494286186539805e-06, + "loss": 1.42330002784729, + "step": 13860 + }, + { + "epoch": 2.5231637389642305, + "grad_norm": 16.625, + "learning_rate": 2.493548593414511e-06, + "loss": 1.4504057168960571, + "step": 13862 + }, + { + "epoch": 2.5235278055884227, + "grad_norm": 14.8125, + "learning_rate": 2.4928115148467498e-06, + "loss": 1.4021843671798706, + "step": 13864 + }, + { + "epoch": 2.523891872212615, + "grad_norm": 11.625, + "learning_rate": 2.4920749509446855e-06, + "loss": 1.0065877437591553, + "step": 13866 + }, + { + "epoch": 2.524255938836807, + "grad_norm": 22.75, + "learning_rate": 2.491338901816406e-06, + "loss": 1.8538141250610352, + "step": 13868 + }, + { + "epoch": 2.5246200054609993, + "grad_norm": 70.5, + "learning_rate": 2.4906033675699235e-06, + "loss": 1.7632840871810913, + "step": 13870 + }, + { + "epoch": 2.5249840720851915, + "grad_norm": 15.9375, + "learning_rate": 2.489868348313174e-06, + "loss": 1.097691535949707, + "step": 13872 + }, + { + "epoch": 2.5253481387093837, + "grad_norm": 104.5, + "learning_rate": 2.4891338441540194e-06, + "loss": 1.4180630445480347, + "step": 13874 + }, + { + "epoch": 2.5257122053335763, + "grad_norm": 21.875, + "learning_rate": 2.488399855200245e-06, + "loss": 1.52070152759552, + "step": 13876 + }, + { + "epoch": 2.526076271957768, + "grad_norm": 6.78125, + "learning_rate": 2.4876663815595604e-06, + "loss": 1.246095895767212, + "step": 13878 + }, + { + "epoch": 2.5264403385819607, + "grad_norm": 16.625, + "learning_rate": 2.4869334233395997e-06, + "loss": 0.9933915138244629, + "step": 13880 + }, + { + "epoch": 2.5268044052061525, + "grad_norm": 108.5, + "learning_rate": 2.486200980647922e-06, + "loss": 0.8729953765869141, + "step": 13882 + }, + { + "epoch": 2.527168471830345, + "grad_norm": 6.5, + "learning_rate": 2.485469053592011e-06, + "loss": 1.2897939682006836, + "step": 13884 + }, + { + "epoch": 2.5275325384545373, + "grad_norm": 7.28125, + "learning_rate": 2.4847376422792723e-06, + "loss": 1.262458086013794, + "step": 13886 + }, + { + "epoch": 2.5278966050787295, + "grad_norm": 34.5, + "learning_rate": 2.4840067468170386e-06, + "loss": 1.4337372779846191, + "step": 13888 + }, + { + "epoch": 2.5282606717029217, + "grad_norm": 12.25, + "learning_rate": 2.483276367312566e-06, + "loss": 1.4279046058654785, + "step": 13890 + }, + { + "epoch": 2.528624738327114, + "grad_norm": 18.5, + "learning_rate": 2.4825465038730345e-06, + "loss": 1.6506714820861816, + "step": 13892 + }, + { + "epoch": 2.528988804951306, + "grad_norm": 23.375, + "learning_rate": 2.4818171566055486e-06, + "loss": 1.5873976945877075, + "step": 13894 + }, + { + "epoch": 2.5293528715754983, + "grad_norm": 13.875, + "learning_rate": 2.481088325617137e-06, + "loss": 1.840796709060669, + "step": 13896 + }, + { + "epoch": 2.5297169381996905, + "grad_norm": 12.25, + "learning_rate": 2.4803600110147527e-06, + "loss": 1.4852625131607056, + "step": 13898 + }, + { + "epoch": 2.5300810048238827, + "grad_norm": 9.75, + "learning_rate": 2.479632212905273e-06, + "loss": 1.3497776985168457, + "step": 13900 + }, + { + "epoch": 2.530445071448075, + "grad_norm": 10.4375, + "learning_rate": 2.4789049313954986e-06, + "loss": 1.345975637435913, + "step": 13902 + }, + { + "epoch": 2.530809138072267, + "grad_norm": 14.0625, + "learning_rate": 2.4781781665921565e-06, + "loss": 1.3405169248580933, + "step": 13904 + }, + { + "epoch": 2.5311732046964597, + "grad_norm": 10.5625, + "learning_rate": 2.4774519186018955e-06, + "loss": 1.3927009105682373, + "step": 13906 + }, + { + "epoch": 2.5315372713206514, + "grad_norm": 14.6875, + "learning_rate": 2.4767261875312897e-06, + "loss": 1.4216716289520264, + "step": 13908 + }, + { + "epoch": 2.531901337944844, + "grad_norm": 20.125, + "learning_rate": 2.4760009734868374e-06, + "loss": 1.4923580884933472, + "step": 13910 + }, + { + "epoch": 2.5322654045690363, + "grad_norm": 9.625, + "learning_rate": 2.475276276574961e-06, + "loss": 1.4852474927902222, + "step": 13912 + }, + { + "epoch": 2.5326294711932285, + "grad_norm": 5.21875, + "learning_rate": 2.4745520969020065e-06, + "loss": 0.798814058303833, + "step": 13914 + }, + { + "epoch": 2.5329935378174206, + "grad_norm": 17.625, + "learning_rate": 2.4738284345742442e-06, + "loss": 0.3050364851951599, + "step": 13916 + }, + { + "epoch": 2.533357604441613, + "grad_norm": 18.0, + "learning_rate": 2.4731052896978684e-06, + "loss": 0.5007586479187012, + "step": 13918 + }, + { + "epoch": 2.533721671065805, + "grad_norm": 22.125, + "learning_rate": 2.4723826623789988e-06, + "loss": 0.6491674184799194, + "step": 13920 + }, + { + "epoch": 2.5340857376899972, + "grad_norm": 5.0625, + "learning_rate": 2.471660552723677e-06, + "loss": 1.091306447982788, + "step": 13922 + }, + { + "epoch": 2.5344498043141894, + "grad_norm": 10.1875, + "learning_rate": 2.47093896083787e-06, + "loss": 1.2955987453460693, + "step": 13924 + }, + { + "epoch": 2.5348138709383816, + "grad_norm": 16.75, + "learning_rate": 2.4702178868274686e-06, + "loss": 1.3831698894500732, + "step": 13926 + }, + { + "epoch": 2.535177937562574, + "grad_norm": 12.0, + "learning_rate": 2.469497330798287e-06, + "loss": 1.4622896909713745, + "step": 13928 + }, + { + "epoch": 2.535542004186766, + "grad_norm": 27.375, + "learning_rate": 2.468777292856064e-06, + "loss": 1.553640365600586, + "step": 13930 + }, + { + "epoch": 2.5359060708109586, + "grad_norm": 22.875, + "learning_rate": 2.4680577731064637e-06, + "loss": 1.3646631240844727, + "step": 13932 + }, + { + "epoch": 2.5362701374351504, + "grad_norm": 9.3125, + "learning_rate": 2.46733877165507e-06, + "loss": 1.591979742050171, + "step": 13934 + }, + { + "epoch": 2.536634204059343, + "grad_norm": 11.6875, + "learning_rate": 2.466620288607396e-06, + "loss": 1.199021816253662, + "step": 13936 + }, + { + "epoch": 2.5369982706835352, + "grad_norm": 24.625, + "learning_rate": 2.4659023240688747e-06, + "loss": 1.5112149715423584, + "step": 13938 + }, + { + "epoch": 2.5373623373077274, + "grad_norm": 6.8125, + "learning_rate": 2.465184878144865e-06, + "loss": 1.3368182182312012, + "step": 13940 + }, + { + "epoch": 2.5377264039319196, + "grad_norm": 30.0, + "learning_rate": 2.464467950940649e-06, + "loss": 1.5485413074493408, + "step": 13942 + }, + { + "epoch": 2.538090470556112, + "grad_norm": 40.0, + "learning_rate": 2.4637515425614327e-06, + "loss": 1.6032359600067139, + "step": 13944 + }, + { + "epoch": 2.538454537180304, + "grad_norm": 9.0625, + "learning_rate": 2.4630356531123467e-06, + "loss": 1.4440189599990845, + "step": 13946 + }, + { + "epoch": 2.538818603804496, + "grad_norm": 9.875, + "learning_rate": 2.4623202826984445e-06, + "loss": 1.2808880805969238, + "step": 13948 + }, + { + "epoch": 2.5391826704286884, + "grad_norm": 17.625, + "learning_rate": 2.4616054314247038e-06, + "loss": 1.329640507698059, + "step": 13950 + }, + { + "epoch": 2.5395467370528806, + "grad_norm": 24.125, + "learning_rate": 2.4608910993960265e-06, + "loss": 1.365883708000183, + "step": 13952 + }, + { + "epoch": 2.539910803677073, + "grad_norm": 6.84375, + "learning_rate": 2.460177286717237e-06, + "loss": 1.0846781730651855, + "step": 13954 + }, + { + "epoch": 2.540274870301265, + "grad_norm": 17.875, + "learning_rate": 2.4594639934930855e-06, + "loss": 1.2535054683685303, + "step": 13956 + }, + { + "epoch": 2.5406389369254576, + "grad_norm": 20.875, + "learning_rate": 2.4587512198282443e-06, + "loss": 1.6685168743133545, + "step": 13958 + }, + { + "epoch": 2.5410030035496494, + "grad_norm": 53.5, + "learning_rate": 2.45803896582731e-06, + "loss": 1.5431749820709229, + "step": 13960 + }, + { + "epoch": 2.541367070173842, + "grad_norm": 45.75, + "learning_rate": 2.4573272315948034e-06, + "loss": 1.725101113319397, + "step": 13962 + }, + { + "epoch": 2.541731136798034, + "grad_norm": 12.0, + "learning_rate": 2.4566160172351684e-06, + "loss": 1.432145595550537, + "step": 13964 + }, + { + "epoch": 2.5420952034222264, + "grad_norm": 11.3125, + "learning_rate": 2.4559053228527725e-06, + "loss": 1.4086716175079346, + "step": 13966 + }, + { + "epoch": 2.5424592700464186, + "grad_norm": 11.9375, + "learning_rate": 2.4551951485519076e-06, + "loss": 1.4589173793792725, + "step": 13968 + }, + { + "epoch": 2.542823336670611, + "grad_norm": 16.75, + "learning_rate": 2.4544854944367887e-06, + "loss": 1.5004044771194458, + "step": 13970 + }, + { + "epoch": 2.543187403294803, + "grad_norm": 23.0, + "learning_rate": 2.453776360611555e-06, + "loss": 1.9516510963439941, + "step": 13972 + }, + { + "epoch": 2.543551469918995, + "grad_norm": 24.875, + "learning_rate": 2.453067747180269e-06, + "loss": 1.7078602313995361, + "step": 13974 + }, + { + "epoch": 2.5439155365431874, + "grad_norm": 9.8125, + "learning_rate": 2.4523596542469164e-06, + "loss": 1.1005921363830566, + "step": 13976 + }, + { + "epoch": 2.5442796031673796, + "grad_norm": 17.0, + "learning_rate": 2.451652081915407e-06, + "loss": 1.4509215354919434, + "step": 13978 + }, + { + "epoch": 2.5446436697915717, + "grad_norm": 8.4375, + "learning_rate": 2.4509450302895745e-06, + "loss": 1.3434514999389648, + "step": 13980 + }, + { + "epoch": 2.545007736415764, + "grad_norm": 20.375, + "learning_rate": 2.4502384994731757e-06, + "loss": 1.1130987405776978, + "step": 13982 + }, + { + "epoch": 2.5453718030399566, + "grad_norm": 7.34375, + "learning_rate": 2.4495324895698914e-06, + "loss": 0.3280452489852905, + "step": 13984 + }, + { + "epoch": 2.5457358696641483, + "grad_norm": 21.5, + "learning_rate": 2.4488270006833255e-06, + "loss": 1.099259853363037, + "step": 13986 + }, + { + "epoch": 2.546099936288341, + "grad_norm": 9.8125, + "learning_rate": 2.4481220329170057e-06, + "loss": 1.2947473526000977, + "step": 13988 + }, + { + "epoch": 2.5464640029125327, + "grad_norm": 10.6875, + "learning_rate": 2.4474175863743823e-06, + "loss": 1.319547176361084, + "step": 13990 + }, + { + "epoch": 2.5468280695367254, + "grad_norm": 9.5625, + "learning_rate": 2.4467136611588315e-06, + "loss": 1.5339314937591553, + "step": 13992 + }, + { + "epoch": 2.5471921361609176, + "grad_norm": 18.0, + "learning_rate": 2.4460102573736506e-06, + "loss": 1.2120921611785889, + "step": 13994 + }, + { + "epoch": 2.5475562027851097, + "grad_norm": 9.3125, + "learning_rate": 2.445307375122061e-06, + "loss": 0.8374010324478149, + "step": 13996 + }, + { + "epoch": 2.547920269409302, + "grad_norm": 9.5, + "learning_rate": 2.4446050145072085e-06, + "loss": 1.2683812379837036, + "step": 13998 + }, + { + "epoch": 2.548284336033494, + "grad_norm": 7.5, + "learning_rate": 2.4439031756321612e-06, + "loss": 1.372105360031128, + "step": 14000 + }, + { + "epoch": 2.5486484026576863, + "grad_norm": 5.9375, + "learning_rate": 2.443201858599912e-06, + "loss": 1.2243845462799072, + "step": 14002 + }, + { + "epoch": 2.5490124692818785, + "grad_norm": 25.625, + "learning_rate": 2.4425010635133744e-06, + "loss": 1.2761088609695435, + "step": 14004 + }, + { + "epoch": 2.5493765359060707, + "grad_norm": 16.25, + "learning_rate": 2.4418007904753894e-06, + "loss": 1.9006787538528442, + "step": 14006 + }, + { + "epoch": 2.549740602530263, + "grad_norm": 8.6875, + "learning_rate": 2.441101039588718e-06, + "loss": 1.3298379182815552, + "step": 14008 + }, + { + "epoch": 2.550104669154455, + "grad_norm": 11.5625, + "learning_rate": 2.4404018109560456e-06, + "loss": 1.4144887924194336, + "step": 14010 + }, + { + "epoch": 2.5504687357786473, + "grad_norm": 5.9375, + "learning_rate": 2.4397031046799823e-06, + "loss": 1.3036308288574219, + "step": 14012 + }, + { + "epoch": 2.55083280240284, + "grad_norm": 11.4375, + "learning_rate": 2.4390049208630596e-06, + "loss": 1.601028323173523, + "step": 14014 + }, + { + "epoch": 2.5511968690270317, + "grad_norm": 93.5, + "learning_rate": 2.4383072596077328e-06, + "loss": 1.2932220697402954, + "step": 14016 + }, + { + "epoch": 2.5515609356512243, + "grad_norm": 6.34375, + "learning_rate": 2.437610121016382e-06, + "loss": 1.045025110244751, + "step": 14018 + }, + { + "epoch": 2.5519250022754165, + "grad_norm": 7.5, + "learning_rate": 2.436913505191309e-06, + "loss": 1.05776047706604, + "step": 14020 + }, + { + "epoch": 2.5522890688996087, + "grad_norm": 13.875, + "learning_rate": 2.436217412234739e-06, + "loss": 1.5952341556549072, + "step": 14022 + }, + { + "epoch": 2.552653135523801, + "grad_norm": 24.375, + "learning_rate": 2.4355218422488202e-06, + "loss": 1.5637341737747192, + "step": 14024 + }, + { + "epoch": 2.553017202147993, + "grad_norm": 16.0, + "learning_rate": 2.4348267953356265e-06, + "loss": 1.3550633192062378, + "step": 14026 + }, + { + "epoch": 2.5533812687721853, + "grad_norm": 8.375, + "learning_rate": 2.4341322715971514e-06, + "loss": 1.3722586631774902, + "step": 14028 + }, + { + "epoch": 2.5537453353963775, + "grad_norm": 13.1875, + "learning_rate": 2.4334382711353147e-06, + "loss": 1.1840168237686157, + "step": 14030 + }, + { + "epoch": 2.5541094020205697, + "grad_norm": 11.5, + "learning_rate": 2.432744794051958e-06, + "loss": 0.8268307447433472, + "step": 14032 + }, + { + "epoch": 2.554473468644762, + "grad_norm": 7.0, + "learning_rate": 2.4320518404488455e-06, + "loss": 1.0226351022720337, + "step": 14034 + }, + { + "epoch": 2.554837535268954, + "grad_norm": 16.25, + "learning_rate": 2.431359410427666e-06, + "loss": 1.4597655534744263, + "step": 14036 + }, + { + "epoch": 2.5552016018931463, + "grad_norm": 13.875, + "learning_rate": 2.430667504090031e-06, + "loss": 1.644943118095398, + "step": 14038 + }, + { + "epoch": 2.555565668517339, + "grad_norm": 17.125, + "learning_rate": 2.429976121537474e-06, + "loss": 1.3708240985870361, + "step": 14040 + }, + { + "epoch": 2.5559297351415307, + "grad_norm": 8.5, + "learning_rate": 2.4292852628714524e-06, + "loss": 1.067210078239441, + "step": 14042 + }, + { + "epoch": 2.5562938017657233, + "grad_norm": 9.125, + "learning_rate": 2.4285949281933486e-06, + "loss": 0.957159161567688, + "step": 14044 + }, + { + "epoch": 2.5566578683899155, + "grad_norm": 58.25, + "learning_rate": 2.427905117604465e-06, + "loss": 0.7425721883773804, + "step": 14046 + }, + { + "epoch": 2.5570219350141077, + "grad_norm": 6.59375, + "learning_rate": 2.4272158312060295e-06, + "loss": 0.9386372566223145, + "step": 14048 + }, + { + "epoch": 2.5573860016383, + "grad_norm": 26.625, + "learning_rate": 2.426527069099191e-06, + "loss": 0.9818754196166992, + "step": 14050 + }, + { + "epoch": 2.557750068262492, + "grad_norm": 27.875, + "learning_rate": 2.4258388313850236e-06, + "loss": 1.4963124990463257, + "step": 14052 + }, + { + "epoch": 2.5581141348866843, + "grad_norm": 13.3125, + "learning_rate": 2.4251511181645226e-06, + "loss": 1.4882919788360596, + "step": 14054 + }, + { + "epoch": 2.5584782015108765, + "grad_norm": 15.25, + "learning_rate": 2.4244639295386072e-06, + "loss": 1.495132327079773, + "step": 14056 + }, + { + "epoch": 2.5588422681350687, + "grad_norm": 21.625, + "learning_rate": 2.42377726560812e-06, + "loss": 1.30695641040802, + "step": 14058 + }, + { + "epoch": 2.559206334759261, + "grad_norm": 14.875, + "learning_rate": 2.423091126473826e-06, + "loss": 1.5260798931121826, + "step": 14060 + }, + { + "epoch": 2.559570401383453, + "grad_norm": 8.4375, + "learning_rate": 2.4224055122364132e-06, + "loss": 1.3462334871292114, + "step": 14062 + }, + { + "epoch": 2.5599344680076452, + "grad_norm": 42.5, + "learning_rate": 2.4217204229964926e-06, + "loss": 2.029184579849243, + "step": 14064 + }, + { + "epoch": 2.560298534631838, + "grad_norm": 8.875, + "learning_rate": 2.4210358588545987e-06, + "loss": 1.4047307968139648, + "step": 14066 + }, + { + "epoch": 2.5606626012560296, + "grad_norm": 31.0, + "learning_rate": 2.4203518199111876e-06, + "loss": 1.3947217464447021, + "step": 14068 + }, + { + "epoch": 2.5610266678802223, + "grad_norm": 10.1875, + "learning_rate": 2.4196683062666404e-06, + "loss": 1.3592097759246826, + "step": 14070 + }, + { + "epoch": 2.5613907345044145, + "grad_norm": 12.3125, + "learning_rate": 2.4189853180212596e-06, + "loss": 1.3318440914154053, + "step": 14072 + }, + { + "epoch": 2.5617548011286067, + "grad_norm": 17.5, + "learning_rate": 2.418302855275271e-06, + "loss": 0.7932029962539673, + "step": 14074 + }, + { + "epoch": 2.562118867752799, + "grad_norm": 14.4375, + "learning_rate": 2.417620918128822e-06, + "loss": 0.417538046836853, + "step": 14076 + }, + { + "epoch": 2.562482934376991, + "grad_norm": 16.5, + "learning_rate": 2.4169395066819857e-06, + "loss": 1.5427486896514893, + "step": 14078 + }, + { + "epoch": 2.5628470010011832, + "grad_norm": 165.0, + "learning_rate": 2.4162586210347565e-06, + "loss": 1.2161442041397095, + "step": 14080 + }, + { + "epoch": 2.5632110676253754, + "grad_norm": 3.375, + "learning_rate": 2.41557826128705e-06, + "loss": 1.4107484817504883, + "step": 14082 + }, + { + "epoch": 2.5635751342495676, + "grad_norm": 10.0625, + "learning_rate": 2.4148984275387077e-06, + "loss": 1.0203148126602173, + "step": 14084 + }, + { + "epoch": 2.56393920087376, + "grad_norm": 9.0, + "learning_rate": 2.4142191198894927e-06, + "loss": 1.4793447256088257, + "step": 14086 + }, + { + "epoch": 2.564303267497952, + "grad_norm": 6.28125, + "learning_rate": 2.4135403384390886e-06, + "loss": 1.3543294668197632, + "step": 14088 + }, + { + "epoch": 2.564667334122144, + "grad_norm": 5.5, + "learning_rate": 2.4128620832871065e-06, + "loss": 1.1586647033691406, + "step": 14090 + }, + { + "epoch": 2.565031400746337, + "grad_norm": 6.40625, + "learning_rate": 2.4121843545330757e-06, + "loss": 1.2547489404678345, + "step": 14092 + }, + { + "epoch": 2.5653954673705286, + "grad_norm": 21.375, + "learning_rate": 2.4115071522764506e-06, + "loss": 1.4677958488464355, + "step": 14094 + }, + { + "epoch": 2.5657595339947212, + "grad_norm": 10.125, + "learning_rate": 2.410830476616608e-06, + "loss": 1.2225617170333862, + "step": 14096 + }, + { + "epoch": 2.5661236006189134, + "grad_norm": 36.25, + "learning_rate": 2.410154327652848e-06, + "loss": 1.591138482093811, + "step": 14098 + }, + { + "epoch": 2.5664876672431056, + "grad_norm": 37.5, + "learning_rate": 2.409478705484391e-06, + "loss": 1.7120736837387085, + "step": 14100 + }, + { + "epoch": 2.566851733867298, + "grad_norm": 6.0625, + "learning_rate": 2.408803610210384e-06, + "loss": 1.0847187042236328, + "step": 14102 + }, + { + "epoch": 2.56721580049149, + "grad_norm": 10.4375, + "learning_rate": 2.4081290419298923e-06, + "loss": 1.3927597999572754, + "step": 14104 + }, + { + "epoch": 2.567579867115682, + "grad_norm": 9.0625, + "learning_rate": 2.4074550007419077e-06, + "loss": 1.3700611591339111, + "step": 14106 + }, + { + "epoch": 2.5679439337398744, + "grad_norm": 3.84375, + "learning_rate": 2.406781486745342e-06, + "loss": 0.9951527118682861, + "step": 14108 + }, + { + "epoch": 2.5683080003640666, + "grad_norm": 4.3125, + "learning_rate": 2.4061085000390318e-06, + "loss": 1.1989988088607788, + "step": 14110 + }, + { + "epoch": 2.568672066988259, + "grad_norm": 17.125, + "learning_rate": 2.4054360407217336e-06, + "loss": 1.2150812149047852, + "step": 14112 + }, + { + "epoch": 2.569036133612451, + "grad_norm": 4.59375, + "learning_rate": 2.4047641088921295e-06, + "loss": 1.2451732158660889, + "step": 14114 + }, + { + "epoch": 2.569400200236643, + "grad_norm": 14.1875, + "learning_rate": 2.4040927046488224e-06, + "loss": 1.0359206199645996, + "step": 14116 + }, + { + "epoch": 2.5697642668608354, + "grad_norm": 14.0625, + "learning_rate": 2.4034218280903375e-06, + "loss": 1.558552622795105, + "step": 14118 + }, + { + "epoch": 2.5701283334850276, + "grad_norm": 8.4375, + "learning_rate": 2.4027514793151237e-06, + "loss": 0.9711041450500488, + "step": 14120 + }, + { + "epoch": 2.57049240010922, + "grad_norm": 7.65625, + "learning_rate": 2.402081658421552e-06, + "loss": 1.195290207862854, + "step": 14122 + }, + { + "epoch": 2.570856466733412, + "grad_norm": 8.8125, + "learning_rate": 2.401412365507916e-06, + "loss": 1.167384386062622, + "step": 14124 + }, + { + "epoch": 2.5712205333576046, + "grad_norm": 21.0, + "learning_rate": 2.400743600672431e-06, + "loss": 1.945046067237854, + "step": 14126 + }, + { + "epoch": 2.571584599981797, + "grad_norm": 15.4375, + "learning_rate": 2.4000753640132367e-06, + "loss": 1.9064695835113525, + "step": 14128 + }, + { + "epoch": 2.571948666605989, + "grad_norm": 14.125, + "learning_rate": 2.399407655628393e-06, + "loss": 1.0812196731567383, + "step": 14130 + }, + { + "epoch": 2.572312733230181, + "grad_norm": 100.0, + "learning_rate": 2.3987404756158844e-06, + "loss": 1.1352931261062622, + "step": 14132 + }, + { + "epoch": 2.5726767998543734, + "grad_norm": 16.75, + "learning_rate": 2.3980738240736164e-06, + "loss": 1.2162847518920898, + "step": 14134 + }, + { + "epoch": 2.5730408664785656, + "grad_norm": 15.125, + "learning_rate": 2.3974077010994175e-06, + "loss": 1.5836262702941895, + "step": 14136 + }, + { + "epoch": 2.5734049331027578, + "grad_norm": 7.78125, + "learning_rate": 2.396742106791038e-06, + "loss": 1.4147813320159912, + "step": 14138 + }, + { + "epoch": 2.57376899972695, + "grad_norm": 3.421875, + "learning_rate": 2.396077041246152e-06, + "loss": 1.1749075651168823, + "step": 14140 + }, + { + "epoch": 2.574133066351142, + "grad_norm": 25.5, + "learning_rate": 2.3954125045623537e-06, + "loss": 0.823738157749176, + "step": 14142 + }, + { + "epoch": 2.5744971329753343, + "grad_norm": 27.125, + "learning_rate": 2.3947484968371636e-06, + "loss": 0.9977008104324341, + "step": 14144 + }, + { + "epoch": 2.5748611995995265, + "grad_norm": 8.9375, + "learning_rate": 2.3940850181680197e-06, + "loss": 1.2254635095596313, + "step": 14146 + }, + { + "epoch": 2.575225266223719, + "grad_norm": 31.0, + "learning_rate": 2.3934220686522868e-06, + "loss": 1.5278208255767822, + "step": 14148 + }, + { + "epoch": 2.575589332847911, + "grad_norm": 17.0, + "learning_rate": 2.392759648387249e-06, + "loss": 1.3942779302597046, + "step": 14150 + }, + { + "epoch": 2.5759533994721036, + "grad_norm": 32.25, + "learning_rate": 2.392097757470113e-06, + "loss": 0.5594972968101501, + "step": 14152 + }, + { + "epoch": 2.5763174660962957, + "grad_norm": 11.375, + "learning_rate": 2.3914363959980107e-06, + "loss": 1.4911736249923706, + "step": 14154 + }, + { + "epoch": 2.576681532720488, + "grad_norm": 7.8125, + "learning_rate": 2.390775564067993e-06, + "loss": 1.3684496879577637, + "step": 14156 + }, + { + "epoch": 2.57704559934468, + "grad_norm": 2.796875, + "learning_rate": 2.3901152617770333e-06, + "loss": 1.2593903541564941, + "step": 14158 + }, + { + "epoch": 2.5774096659688723, + "grad_norm": 9.125, + "learning_rate": 2.38945548922203e-06, + "loss": 1.3786157369613647, + "step": 14160 + }, + { + "epoch": 2.5777737325930645, + "grad_norm": 21.75, + "learning_rate": 2.3887962464998016e-06, + "loss": 1.3256924152374268, + "step": 14162 + }, + { + "epoch": 2.5781377992172567, + "grad_norm": 16.625, + "learning_rate": 2.388137533707089e-06, + "loss": 1.7599164247512817, + "step": 14164 + }, + { + "epoch": 2.578501865841449, + "grad_norm": 33.5, + "learning_rate": 2.3874793509405554e-06, + "loss": 1.1446453332901, + "step": 14166 + }, + { + "epoch": 2.578865932465641, + "grad_norm": 11.5625, + "learning_rate": 2.3868216982967875e-06, + "loss": 1.5041176080703735, + "step": 14168 + }, + { + "epoch": 2.5792299990898333, + "grad_norm": 6.5625, + "learning_rate": 2.3861645758722915e-06, + "loss": 1.199200987815857, + "step": 14170 + }, + { + "epoch": 2.5795940657140255, + "grad_norm": 8.5625, + "learning_rate": 2.385507983763499e-06, + "loss": 1.264521598815918, + "step": 14172 + }, + { + "epoch": 2.579958132338218, + "grad_norm": 5.84375, + "learning_rate": 2.384851922066761e-06, + "loss": 1.0853348970413208, + "step": 14174 + }, + { + "epoch": 2.58032219896241, + "grad_norm": 14.0, + "learning_rate": 2.384196390878354e-06, + "loss": 1.4397194385528564, + "step": 14176 + }, + { + "epoch": 2.5806862655866025, + "grad_norm": 3.734375, + "learning_rate": 2.3835413902944716e-06, + "loss": 1.045330286026001, + "step": 14178 + }, + { + "epoch": 2.5810503322107947, + "grad_norm": 20.0, + "learning_rate": 2.382886920411234e-06, + "loss": 1.0101420879364014, + "step": 14180 + }, + { + "epoch": 2.581414398834987, + "grad_norm": 14.625, + "learning_rate": 2.382232981324683e-06, + "loss": 0.5374385714530945, + "step": 14182 + }, + { + "epoch": 2.581778465459179, + "grad_norm": 13.375, + "learning_rate": 2.3815795731307795e-06, + "loss": 1.5702306032180786, + "step": 14184 + }, + { + "epoch": 2.5821425320833713, + "grad_norm": 6.65625, + "learning_rate": 2.38092669592541e-06, + "loss": 1.2941797971725464, + "step": 14186 + }, + { + "epoch": 2.5825065987075635, + "grad_norm": 28.75, + "learning_rate": 2.380274349804381e-06, + "loss": 1.126169204711914, + "step": 14188 + }, + { + "epoch": 2.5828706653317557, + "grad_norm": 11.4375, + "learning_rate": 2.379622534863421e-06, + "loss": 1.514399766921997, + "step": 14190 + }, + { + "epoch": 2.583234731955948, + "grad_norm": 7.71875, + "learning_rate": 2.378971251198183e-06, + "loss": 1.7874150276184082, + "step": 14192 + }, + { + "epoch": 2.58359879858014, + "grad_norm": 3.875, + "learning_rate": 2.3783204989042384e-06, + "loss": 1.0225834846496582, + "step": 14194 + }, + { + "epoch": 2.5839628652043323, + "grad_norm": 15.5625, + "learning_rate": 2.3776702780770835e-06, + "loss": 1.6737661361694336, + "step": 14196 + }, + { + "epoch": 2.5843269318285245, + "grad_norm": 10.375, + "learning_rate": 2.377020588812135e-06, + "loss": 1.4150168895721436, + "step": 14198 + }, + { + "epoch": 2.584690998452717, + "grad_norm": 13.6875, + "learning_rate": 2.376371431204733e-06, + "loss": 1.61765456199646, + "step": 14200 + }, + { + "epoch": 2.585055065076909, + "grad_norm": 21.25, + "learning_rate": 2.3757228053501376e-06, + "loss": 1.3891081809997559, + "step": 14202 + }, + { + "epoch": 2.5854191317011015, + "grad_norm": 9.1875, + "learning_rate": 2.375074711343533e-06, + "loss": 1.3046094179153442, + "step": 14204 + }, + { + "epoch": 2.5857831983252937, + "grad_norm": 9.75, + "learning_rate": 2.374427149280024e-06, + "loss": 1.4639339447021484, + "step": 14206 + }, + { + "epoch": 2.586147264949486, + "grad_norm": 25.0, + "learning_rate": 2.373780119254637e-06, + "loss": 1.2532970905303955, + "step": 14208 + }, + { + "epoch": 2.586511331573678, + "grad_norm": 10.875, + "learning_rate": 2.3731336213623222e-06, + "loss": 1.312552571296692, + "step": 14210 + }, + { + "epoch": 2.5868753981978703, + "grad_norm": 21.625, + "learning_rate": 2.37248765569795e-06, + "loss": 1.4278985261917114, + "step": 14212 + }, + { + "epoch": 2.5872394648220625, + "grad_norm": 20.625, + "learning_rate": 2.3718422223563137e-06, + "loss": 1.3898577690124512, + "step": 14214 + }, + { + "epoch": 2.5876035314462547, + "grad_norm": 18.0, + "learning_rate": 2.371197321432127e-06, + "loss": 1.5639362335205078, + "step": 14216 + }, + { + "epoch": 2.587967598070447, + "grad_norm": 15.75, + "learning_rate": 2.370552953020028e-06, + "loss": 1.7671689987182617, + "step": 14218 + }, + { + "epoch": 2.588331664694639, + "grad_norm": 17.125, + "learning_rate": 2.3699091172145732e-06, + "loss": 1.5203633308410645, + "step": 14220 + }, + { + "epoch": 2.5886957313188312, + "grad_norm": 9.0, + "learning_rate": 2.369265814110244e-06, + "loss": 1.2983410358428955, + "step": 14222 + }, + { + "epoch": 2.5890597979430234, + "grad_norm": 6.53125, + "learning_rate": 2.3686230438014434e-06, + "loss": 1.10750150680542, + "step": 14224 + }, + { + "epoch": 2.589423864567216, + "grad_norm": 8.5625, + "learning_rate": 2.3679808063824943e-06, + "loss": 1.3014956712722778, + "step": 14226 + }, + { + "epoch": 2.589787931191408, + "grad_norm": 64.5, + "learning_rate": 2.3673391019476423e-06, + "loss": 1.2168763875961304, + "step": 14228 + }, + { + "epoch": 2.5901519978156005, + "grad_norm": 32.0, + "learning_rate": 2.366697930591055e-06, + "loss": 1.2090435028076172, + "step": 14230 + }, + { + "epoch": 2.590516064439792, + "grad_norm": 13.25, + "learning_rate": 2.3660572924068225e-06, + "loss": 0.7218501567840576, + "step": 14232 + }, + { + "epoch": 2.590880131063985, + "grad_norm": 12.9375, + "learning_rate": 2.365417187488954e-06, + "loss": 0.9979938268661499, + "step": 14234 + }, + { + "epoch": 2.591244197688177, + "grad_norm": 9.75, + "learning_rate": 2.364777615931385e-06, + "loss": 1.5098721981048584, + "step": 14236 + }, + { + "epoch": 2.5916082643123692, + "grad_norm": 12.25, + "learning_rate": 2.3641385778279675e-06, + "loss": 1.4654102325439453, + "step": 14238 + }, + { + "epoch": 2.5919723309365614, + "grad_norm": 20.375, + "learning_rate": 2.3635000732724795e-06, + "loss": 1.600963830947876, + "step": 14240 + }, + { + "epoch": 2.5923363975607536, + "grad_norm": 11.875, + "learning_rate": 2.3628621023586183e-06, + "loss": 1.5925166606903076, + "step": 14242 + }, + { + "epoch": 2.592700464184946, + "grad_norm": 18.625, + "learning_rate": 2.3622246651800034e-06, + "loss": 1.7492650747299194, + "step": 14244 + }, + { + "epoch": 2.593064530809138, + "grad_norm": 11.5, + "learning_rate": 2.3615877618301765e-06, + "loss": 1.6882426738739014, + "step": 14246 + }, + { + "epoch": 2.59342859743333, + "grad_norm": 10.0625, + "learning_rate": 2.3609513924026e-06, + "loss": 1.4019925594329834, + "step": 14248 + }, + { + "epoch": 2.5937926640575224, + "grad_norm": 10.1875, + "learning_rate": 2.360315556990659e-06, + "loss": 1.3696213960647583, + "step": 14250 + }, + { + "epoch": 2.5941567306817146, + "grad_norm": 10.5625, + "learning_rate": 2.3596802556876596e-06, + "loss": 1.0603593587875366, + "step": 14252 + }, + { + "epoch": 2.594520797305907, + "grad_norm": 16.75, + "learning_rate": 2.35904548858683e-06, + "loss": 0.5949654579162598, + "step": 14254 + }, + { + "epoch": 2.5948848639300994, + "grad_norm": 6.59375, + "learning_rate": 2.358411255781319e-06, + "loss": 1.5676074028015137, + "step": 14256 + }, + { + "epoch": 2.595248930554291, + "grad_norm": 7.96875, + "learning_rate": 2.3577775573641987e-06, + "loss": 1.106121301651001, + "step": 14258 + }, + { + "epoch": 2.595612997178484, + "grad_norm": 15.0, + "learning_rate": 2.357144393428461e-06, + "loss": 1.4317102432250977, + "step": 14260 + }, + { + "epoch": 2.595977063802676, + "grad_norm": 12.875, + "learning_rate": 2.35651176406702e-06, + "loss": 1.816224455833435, + "step": 14262 + }, + { + "epoch": 2.596341130426868, + "grad_norm": 6.34375, + "learning_rate": 2.355879669372712e-06, + "loss": 0.8903669714927673, + "step": 14264 + }, + { + "epoch": 2.5967051970510604, + "grad_norm": 6.25, + "learning_rate": 2.355248109438295e-06, + "loss": 1.159472942352295, + "step": 14266 + }, + { + "epoch": 2.5970692636752526, + "grad_norm": 12.8125, + "learning_rate": 2.354617084356446e-06, + "loss": 1.5132120847702026, + "step": 14268 + }, + { + "epoch": 2.597433330299445, + "grad_norm": 15.25, + "learning_rate": 2.353986594219767e-06, + "loss": 1.4413330554962158, + "step": 14270 + }, + { + "epoch": 2.597797396923637, + "grad_norm": 26.875, + "learning_rate": 2.353356639120779e-06, + "loss": 0.8578499555587769, + "step": 14272 + }, + { + "epoch": 2.598161463547829, + "grad_norm": 16.375, + "learning_rate": 2.3527272191519256e-06, + "loss": 1.7673108577728271, + "step": 14274 + }, + { + "epoch": 2.5985255301720214, + "grad_norm": 11.8125, + "learning_rate": 2.352098334405572e-06, + "loss": 1.3355708122253418, + "step": 14276 + }, + { + "epoch": 2.5988895967962136, + "grad_norm": 26.75, + "learning_rate": 2.3514699849740043e-06, + "loss": 1.2164437770843506, + "step": 14278 + }, + { + "epoch": 2.5992536634204058, + "grad_norm": 52.25, + "learning_rate": 2.35084217094943e-06, + "loss": 1.2727967500686646, + "step": 14280 + }, + { + "epoch": 2.5996177300445984, + "grad_norm": 10.9375, + "learning_rate": 2.350214892423978e-06, + "loss": 1.511594295501709, + "step": 14282 + }, + { + "epoch": 2.59998179666879, + "grad_norm": 28.75, + "learning_rate": 2.3495881494896994e-06, + "loss": 1.2684283256530762, + "step": 14284 + }, + { + "epoch": 2.600345863292983, + "grad_norm": 59.0, + "learning_rate": 2.348961942238566e-06, + "loss": 1.7031049728393555, + "step": 14286 + }, + { + "epoch": 2.600709929917175, + "grad_norm": 10.0625, + "learning_rate": 2.3483362707624716e-06, + "loss": 1.4205446243286133, + "step": 14288 + }, + { + "epoch": 2.601073996541367, + "grad_norm": 7.71875, + "learning_rate": 2.34771113515323e-06, + "loss": 1.084086298942566, + "step": 14290 + }, + { + "epoch": 2.6014380631655594, + "grad_norm": 7.6875, + "learning_rate": 2.347086535502578e-06, + "loss": 1.4121547937393188, + "step": 14292 + }, + { + "epoch": 2.6018021297897516, + "grad_norm": 9.3125, + "learning_rate": 2.3464624719021733e-06, + "loss": 1.0999360084533691, + "step": 14294 + }, + { + "epoch": 2.6021661964139438, + "grad_norm": 13.5, + "learning_rate": 2.3458389444435944e-06, + "loss": 1.4395822286605835, + "step": 14296 + }, + { + "epoch": 2.602530263038136, + "grad_norm": 12.6875, + "learning_rate": 2.345215953218341e-06, + "loss": 1.4034985303878784, + "step": 14298 + }, + { + "epoch": 2.602894329662328, + "grad_norm": 19.125, + "learning_rate": 2.344593498317835e-06, + "loss": 1.598644733428955, + "step": 14300 + }, + { + "epoch": 2.6032583962865203, + "grad_norm": 13.4375, + "learning_rate": 2.3439715798334193e-06, + "loss": 1.7083077430725098, + "step": 14302 + }, + { + "epoch": 2.6036224629107125, + "grad_norm": 2.734375, + "learning_rate": 2.3433501978563575e-06, + "loss": 0.9617128968238831, + "step": 14304 + }, + { + "epoch": 2.6039865295349047, + "grad_norm": 14.5625, + "learning_rate": 2.3427293524778348e-06, + "loss": 0.9819358587265015, + "step": 14306 + }, + { + "epoch": 2.6043505961590974, + "grad_norm": 7.625, + "learning_rate": 2.342109043788959e-06, + "loss": 1.1405773162841797, + "step": 14308 + }, + { + "epoch": 2.604714662783289, + "grad_norm": 23.75, + "learning_rate": 2.341489271880756e-06, + "loss": 1.4422760009765625, + "step": 14310 + }, + { + "epoch": 2.6050787294074818, + "grad_norm": 26.0, + "learning_rate": 2.340870036844176e-06, + "loss": 1.445676565170288, + "step": 14312 + }, + { + "epoch": 2.605442796031674, + "grad_norm": 27.625, + "learning_rate": 2.3402513387700886e-06, + "loss": 1.2540534734725952, + "step": 14314 + }, + { + "epoch": 2.605806862655866, + "grad_norm": 17.5, + "learning_rate": 2.3396331777492853e-06, + "loss": 1.7215461730957031, + "step": 14316 + }, + { + "epoch": 2.6061709292800583, + "grad_norm": 14.75, + "learning_rate": 2.3390155538724795e-06, + "loss": 1.525050401687622, + "step": 14318 + }, + { + "epoch": 2.6065349959042505, + "grad_norm": 167.0, + "learning_rate": 2.338398467230305e-06, + "loss": 1.8181389570236206, + "step": 14320 + }, + { + "epoch": 2.6068990625284427, + "grad_norm": 10.3125, + "learning_rate": 2.3377819179133156e-06, + "loss": 1.5395398139953613, + "step": 14322 + }, + { + "epoch": 2.607263129152635, + "grad_norm": 21.375, + "learning_rate": 2.337165906011988e-06, + "loss": 1.333439588546753, + "step": 14324 + }, + { + "epoch": 2.607627195776827, + "grad_norm": 11.5, + "learning_rate": 2.3365504316167197e-06, + "loss": 1.4189414978027344, + "step": 14326 + }, + { + "epoch": 2.6079912624010193, + "grad_norm": 23.25, + "learning_rate": 2.335935494817829e-06, + "loss": 1.6621311902999878, + "step": 14328 + }, + { + "epoch": 2.6083553290252115, + "grad_norm": 33.25, + "learning_rate": 2.3353210957055554e-06, + "loss": 1.5448209047317505, + "step": 14330 + }, + { + "epoch": 2.6087193956494037, + "grad_norm": 18.25, + "learning_rate": 2.334707234370059e-06, + "loss": 0.9667041301727295, + "step": 14332 + }, + { + "epoch": 2.6090834622735963, + "grad_norm": 9.3125, + "learning_rate": 2.3340939109014217e-06, + "loss": 0.906665563583374, + "step": 14334 + }, + { + "epoch": 2.609447528897788, + "grad_norm": 9.0, + "learning_rate": 2.333481125389647e-06, + "loss": 0.5902478694915771, + "step": 14336 + }, + { + "epoch": 2.6098115955219807, + "grad_norm": 34.5, + "learning_rate": 2.332868877924658e-06, + "loss": 1.4425134658813477, + "step": 14338 + }, + { + "epoch": 2.610175662146173, + "grad_norm": 12.375, + "learning_rate": 2.332257168596299e-06, + "loss": 1.6120284795761108, + "step": 14340 + }, + { + "epoch": 2.610539728770365, + "grad_norm": 12.5625, + "learning_rate": 2.3316459974943366e-06, + "loss": 1.3588635921478271, + "step": 14342 + }, + { + "epoch": 2.6109037953945573, + "grad_norm": 7.84375, + "learning_rate": 2.331035364708458e-06, + "loss": 1.2055678367614746, + "step": 14344 + }, + { + "epoch": 2.6112678620187495, + "grad_norm": 16.875, + "learning_rate": 2.33042527032827e-06, + "loss": 1.2236871719360352, + "step": 14346 + }, + { + "epoch": 2.6116319286429417, + "grad_norm": 22.75, + "learning_rate": 2.3298157144433025e-06, + "loss": 1.3296234607696533, + "step": 14348 + }, + { + "epoch": 2.611995995267134, + "grad_norm": 11.6875, + "learning_rate": 2.3292066971430047e-06, + "loss": 1.363838791847229, + "step": 14350 + }, + { + "epoch": 2.612360061891326, + "grad_norm": 12.6875, + "learning_rate": 2.328598218516748e-06, + "loss": 1.4381417036056519, + "step": 14352 + }, + { + "epoch": 2.6127241285155183, + "grad_norm": 16.75, + "learning_rate": 2.3279902786538235e-06, + "loss": 1.3195042610168457, + "step": 14354 + }, + { + "epoch": 2.6130881951397105, + "grad_norm": 5.71875, + "learning_rate": 2.3273828776434447e-06, + "loss": 1.3307335376739502, + "step": 14356 + }, + { + "epoch": 2.6134522617639027, + "grad_norm": 13.25, + "learning_rate": 2.3267760155747443e-06, + "loss": 1.5191245079040527, + "step": 14358 + }, + { + "epoch": 2.613816328388095, + "grad_norm": 15.8125, + "learning_rate": 2.326169692536777e-06, + "loss": 1.4936727285385132, + "step": 14360 + }, + { + "epoch": 2.614180395012287, + "grad_norm": 17.25, + "learning_rate": 2.3255639086185193e-06, + "loss": 1.4101324081420898, + "step": 14362 + }, + { + "epoch": 2.6145444616364797, + "grad_norm": 17.0, + "learning_rate": 2.324958663908867e-06, + "loss": 1.464810848236084, + "step": 14364 + }, + { + "epoch": 2.6149085282606714, + "grad_norm": 5.9375, + "learning_rate": 2.3243539584966364e-06, + "loss": 0.9426772594451904, + "step": 14366 + }, + { + "epoch": 2.615272594884864, + "grad_norm": 13.5625, + "learning_rate": 2.3237497924705667e-06, + "loss": 1.0723059177398682, + "step": 14368 + }, + { + "epoch": 2.6156366615090563, + "grad_norm": 21.625, + "learning_rate": 2.3231461659193165e-06, + "loss": 1.020393967628479, + "step": 14370 + }, + { + "epoch": 2.6160007281332485, + "grad_norm": 16.75, + "learning_rate": 2.322543078931465e-06, + "loss": 1.4742343425750732, + "step": 14372 + }, + { + "epoch": 2.6163647947574407, + "grad_norm": 34.0, + "learning_rate": 2.3219405315955136e-06, + "loss": 1.6413697004318237, + "step": 14374 + }, + { + "epoch": 2.616728861381633, + "grad_norm": 11.125, + "learning_rate": 2.3213385239998836e-06, + "loss": 1.2438894510269165, + "step": 14376 + }, + { + "epoch": 2.617092928005825, + "grad_norm": 10.625, + "learning_rate": 2.320737056232917e-06, + "loss": 1.5618762969970703, + "step": 14378 + }, + { + "epoch": 2.6174569946300172, + "grad_norm": 31.5, + "learning_rate": 2.320136128382876e-06, + "loss": 1.7725112438201904, + "step": 14380 + }, + { + "epoch": 2.6178210612542094, + "grad_norm": 21.375, + "learning_rate": 2.3195357405379447e-06, + "loss": 1.1901895999908447, + "step": 14382 + }, + { + "epoch": 2.6181851278784016, + "grad_norm": 10.1875, + "learning_rate": 2.3189358927862284e-06, + "loss": 0.6279973983764648, + "step": 14384 + }, + { + "epoch": 2.618549194502594, + "grad_norm": 12.4375, + "learning_rate": 2.3183365852157524e-06, + "loss": 1.2559270858764648, + "step": 14386 + }, + { + "epoch": 2.618913261126786, + "grad_norm": 11.4375, + "learning_rate": 2.317737817914461e-06, + "loss": 1.6523911952972412, + "step": 14388 + }, + { + "epoch": 2.6192773277509787, + "grad_norm": 18.75, + "learning_rate": 2.3171395909702225e-06, + "loss": 0.824800968170166, + "step": 14390 + }, + { + "epoch": 2.6196413943751704, + "grad_norm": 7.78125, + "learning_rate": 2.3165419044708234e-06, + "loss": 1.4402625560760498, + "step": 14392 + }, + { + "epoch": 2.620005460999363, + "grad_norm": 3.53125, + "learning_rate": 2.315944758503972e-06, + "loss": 0.9888216257095337, + "step": 14394 + }, + { + "epoch": 2.6203695276235552, + "grad_norm": 4.375, + "learning_rate": 2.3153481531572976e-06, + "loss": 1.024646282196045, + "step": 14396 + }, + { + "epoch": 2.6207335942477474, + "grad_norm": 8.1875, + "learning_rate": 2.3147520885183483e-06, + "loss": 1.1572277545928955, + "step": 14398 + }, + { + "epoch": 2.6210976608719396, + "grad_norm": 8.1875, + "learning_rate": 2.314156564674596e-06, + "loss": 1.1977852582931519, + "step": 14400 + }, + { + "epoch": 2.621461727496132, + "grad_norm": 7.09375, + "learning_rate": 2.3135615817134296e-06, + "loss": 1.1601250171661377, + "step": 14402 + }, + { + "epoch": 2.621825794120324, + "grad_norm": 8.5625, + "learning_rate": 2.3129671397221617e-06, + "loss": 1.4369096755981445, + "step": 14404 + }, + { + "epoch": 2.622189860744516, + "grad_norm": 3.484375, + "learning_rate": 2.3123732387880238e-06, + "loss": 1.1548892259597778, + "step": 14406 + }, + { + "epoch": 2.6225539273687084, + "grad_norm": 3.34375, + "learning_rate": 2.3117798789981683e-06, + "loss": 0.8749016523361206, + "step": 14408 + }, + { + "epoch": 2.6229179939929006, + "grad_norm": 12.0, + "learning_rate": 2.3111870604396686e-06, + "loss": 1.1647199392318726, + "step": 14410 + }, + { + "epoch": 2.623282060617093, + "grad_norm": 16.25, + "learning_rate": 2.3105947831995184e-06, + "loss": 1.4007911682128906, + "step": 14412 + }, + { + "epoch": 2.623646127241285, + "grad_norm": 7.8125, + "learning_rate": 2.3100030473646316e-06, + "loss": 1.462831735610962, + "step": 14414 + }, + { + "epoch": 2.6240101938654776, + "grad_norm": 8.5625, + "learning_rate": 2.309411853021844e-06, + "loss": 1.3439569473266602, + "step": 14416 + }, + { + "epoch": 2.6243742604896694, + "grad_norm": 7.46875, + "learning_rate": 2.3088212002579097e-06, + "loss": 1.0835633277893066, + "step": 14418 + }, + { + "epoch": 2.624738327113862, + "grad_norm": 9.5, + "learning_rate": 2.3082310891595054e-06, + "loss": 0.754274845123291, + "step": 14420 + }, + { + "epoch": 2.625102393738054, + "grad_norm": 13.25, + "learning_rate": 2.3076415198132275e-06, + "loss": 1.5140894651412964, + "step": 14422 + }, + { + "epoch": 2.6254664603622464, + "grad_norm": 28.5, + "learning_rate": 2.3070524923055925e-06, + "loss": 1.3967156410217285, + "step": 14424 + }, + { + "epoch": 2.6258305269864386, + "grad_norm": 12.9375, + "learning_rate": 2.3064640067230383e-06, + "loss": 1.8292180299758911, + "step": 14426 + }, + { + "epoch": 2.626194593610631, + "grad_norm": 13.6875, + "learning_rate": 2.305876063151922e-06, + "loss": 1.3166353702545166, + "step": 14428 + }, + { + "epoch": 2.626558660234823, + "grad_norm": 38.0, + "learning_rate": 2.305288661678523e-06, + "loss": 1.9137250185012817, + "step": 14430 + }, + { + "epoch": 2.626922726859015, + "grad_norm": 17.75, + "learning_rate": 2.30470180238904e-06, + "loss": 1.2852834463119507, + "step": 14432 + }, + { + "epoch": 2.6272867934832074, + "grad_norm": 21.5, + "learning_rate": 2.3041154853695904e-06, + "loss": 1.461565613746643, + "step": 14434 + }, + { + "epoch": 2.6276508601073996, + "grad_norm": 19.75, + "learning_rate": 2.303529710706216e-06, + "loss": 1.2691551446914673, + "step": 14436 + }, + { + "epoch": 2.6280149267315918, + "grad_norm": 7.21875, + "learning_rate": 2.302944478484876e-06, + "loss": 0.8443024754524231, + "step": 14438 + }, + { + "epoch": 2.628378993355784, + "grad_norm": 8.6875, + "learning_rate": 2.302359788791451e-06, + "loss": 1.1387990713119507, + "step": 14440 + }, + { + "epoch": 2.6287430599799766, + "grad_norm": 7.0, + "learning_rate": 2.301775641711742e-06, + "loss": 1.3775629997253418, + "step": 14442 + }, + { + "epoch": 2.6291071266041683, + "grad_norm": 14.5625, + "learning_rate": 2.3011920373314697e-06, + "loss": 1.2274903059005737, + "step": 14444 + }, + { + "epoch": 2.629471193228361, + "grad_norm": 7.8125, + "learning_rate": 2.300608975736276e-06, + "loss": 1.4638116359710693, + "step": 14446 + }, + { + "epoch": 2.629835259852553, + "grad_norm": 6.90625, + "learning_rate": 2.3000264570117227e-06, + "loss": 1.4709136486053467, + "step": 14448 + }, + { + "epoch": 2.6301993264767454, + "grad_norm": 14.1875, + "learning_rate": 2.2994444812432927e-06, + "loss": 1.0979920625686646, + "step": 14450 + }, + { + "epoch": 2.6305633931009376, + "grad_norm": 15.0625, + "learning_rate": 2.298863048516387e-06, + "loss": 1.4460787773132324, + "step": 14452 + }, + { + "epoch": 2.6309274597251298, + "grad_norm": 7.25, + "learning_rate": 2.29828215891633e-06, + "loss": 1.2614566087722778, + "step": 14454 + }, + { + "epoch": 2.631291526349322, + "grad_norm": 3.84375, + "learning_rate": 2.297701812528365e-06, + "loss": 1.0139660835266113, + "step": 14456 + }, + { + "epoch": 2.631655592973514, + "grad_norm": 7.5625, + "learning_rate": 2.297122009437654e-06, + "loss": 1.4179061651229858, + "step": 14458 + }, + { + "epoch": 2.6320196595977063, + "grad_norm": 15.0, + "learning_rate": 2.296542749729282e-06, + "loss": 1.7426719665527344, + "step": 14460 + }, + { + "epoch": 2.6323837262218985, + "grad_norm": 13.875, + "learning_rate": 2.295964033488253e-06, + "loss": 1.5685440301895142, + "step": 14462 + }, + { + "epoch": 2.6327477928460907, + "grad_norm": 12.125, + "learning_rate": 2.2953858607994907e-06, + "loss": 1.7342426776885986, + "step": 14464 + }, + { + "epoch": 2.633111859470283, + "grad_norm": 190.0, + "learning_rate": 2.2948082317478402e-06, + "loss": 1.4392192363739014, + "step": 14466 + }, + { + "epoch": 2.6334759260944756, + "grad_norm": 25.625, + "learning_rate": 2.294231146418065e-06, + "loss": 1.4335014820098877, + "step": 14468 + }, + { + "epoch": 2.6338399927186673, + "grad_norm": 17.5, + "learning_rate": 2.2936546048948516e-06, + "loss": 1.2903966903686523, + "step": 14470 + }, + { + "epoch": 2.63420405934286, + "grad_norm": 4.9375, + "learning_rate": 2.2930786072628044e-06, + "loss": 1.3931260108947754, + "step": 14472 + }, + { + "epoch": 2.6345681259670517, + "grad_norm": 4.4375, + "learning_rate": 2.292503153606448e-06, + "loss": 0.9813063144683838, + "step": 14474 + }, + { + "epoch": 2.6349321925912443, + "grad_norm": 92.5, + "learning_rate": 2.2919282440102296e-06, + "loss": 1.2947978973388672, + "step": 14476 + }, + { + "epoch": 2.6352962592154365, + "grad_norm": 11.5, + "learning_rate": 2.2913538785585136e-06, + "loss": 0.8180490732192993, + "step": 14478 + }, + { + "epoch": 2.6356603258396287, + "grad_norm": 15.0, + "learning_rate": 2.290780057335586e-06, + "loss": 1.3983210325241089, + "step": 14480 + }, + { + "epoch": 2.636024392463821, + "grad_norm": 7.5, + "learning_rate": 2.290206780425653e-06, + "loss": 1.0530699491500854, + "step": 14482 + }, + { + "epoch": 2.636388459088013, + "grad_norm": 11.8125, + "learning_rate": 2.2896340479128402e-06, + "loss": 1.1618951559066772, + "step": 14484 + }, + { + "epoch": 2.6367525257122053, + "grad_norm": 12.125, + "learning_rate": 2.2890618598811943e-06, + "loss": 1.4503509998321533, + "step": 14486 + }, + { + "epoch": 2.6371165923363975, + "grad_norm": 8.0, + "learning_rate": 2.288490216414681e-06, + "loss": 1.3243434429168701, + "step": 14488 + }, + { + "epoch": 2.6374806589605897, + "grad_norm": 14.25, + "learning_rate": 2.2879191175971874e-06, + "loss": 1.1167292594909668, + "step": 14490 + }, + { + "epoch": 2.637844725584782, + "grad_norm": 28.5, + "learning_rate": 2.287348563512519e-06, + "loss": 1.273335576057434, + "step": 14492 + }, + { + "epoch": 2.638208792208974, + "grad_norm": 12.375, + "learning_rate": 2.2867785542444035e-06, + "loss": 1.3807586431503296, + "step": 14494 + }, + { + "epoch": 2.6385728588331663, + "grad_norm": 10.6875, + "learning_rate": 2.2862090898764865e-06, + "loss": 1.3990294933319092, + "step": 14496 + }, + { + "epoch": 2.638936925457359, + "grad_norm": 9.75, + "learning_rate": 2.285640170492335e-06, + "loss": 1.1912405490875244, + "step": 14498 + }, + { + "epoch": 2.6393009920815507, + "grad_norm": 7.25, + "learning_rate": 2.2850717961754355e-06, + "loss": 1.3534977436065674, + "step": 14500 + }, + { + "epoch": 2.6396650587057433, + "grad_norm": 9.3125, + "learning_rate": 2.284503967009194e-06, + "loss": 1.2397384643554688, + "step": 14502 + }, + { + "epoch": 2.6400291253299355, + "grad_norm": 11.5, + "learning_rate": 2.2839366830769386e-06, + "loss": 1.3599773645401, + "step": 14504 + }, + { + "epoch": 2.6403931919541277, + "grad_norm": 10.6875, + "learning_rate": 2.283369944461915e-06, + "loss": 1.5993993282318115, + "step": 14506 + }, + { + "epoch": 2.64075725857832, + "grad_norm": 16.25, + "learning_rate": 2.2828037512472893e-06, + "loss": 1.2562651634216309, + "step": 14508 + }, + { + "epoch": 2.641121325202512, + "grad_norm": 52.0, + "learning_rate": 2.282238103516149e-06, + "loss": 1.7715880870819092, + "step": 14510 + }, + { + "epoch": 2.6414853918267043, + "grad_norm": 11.25, + "learning_rate": 2.2816730013515008e-06, + "loss": 1.5312063694000244, + "step": 14512 + }, + { + "epoch": 2.6418494584508965, + "grad_norm": 7.53125, + "learning_rate": 2.28110844483627e-06, + "loss": 1.1991188526153564, + "step": 14514 + }, + { + "epoch": 2.6422135250750887, + "grad_norm": 7.875, + "learning_rate": 2.2805444340533034e-06, + "loss": 1.2983596324920654, + "step": 14516 + }, + { + "epoch": 2.642577591699281, + "grad_norm": 8.9375, + "learning_rate": 2.2799809690853675e-06, + "loss": 1.3489129543304443, + "step": 14518 + }, + { + "epoch": 2.642941658323473, + "grad_norm": 22.375, + "learning_rate": 2.2794180500151485e-06, + "loss": 1.1990947723388672, + "step": 14520 + }, + { + "epoch": 2.6433057249476652, + "grad_norm": 11.6875, + "learning_rate": 2.2788556769252527e-06, + "loss": 1.138641595840454, + "step": 14522 + }, + { + "epoch": 2.643669791571858, + "grad_norm": 12.25, + "learning_rate": 2.2782938498982055e-06, + "loss": 1.4047293663024902, + "step": 14524 + }, + { + "epoch": 2.6440338581960496, + "grad_norm": 14.1875, + "learning_rate": 2.2777325690164533e-06, + "loss": 1.4530627727508545, + "step": 14526 + }, + { + "epoch": 2.6443979248202423, + "grad_norm": 10.125, + "learning_rate": 2.277171834362361e-06, + "loss": 1.341822624206543, + "step": 14528 + }, + { + "epoch": 2.6447619914444345, + "grad_norm": 8.625, + "learning_rate": 2.2766116460182155e-06, + "loss": 1.4875551462173462, + "step": 14530 + }, + { + "epoch": 2.6451260580686267, + "grad_norm": 7.28125, + "learning_rate": 2.2760520040662215e-06, + "loss": 1.2909774780273438, + "step": 14532 + }, + { + "epoch": 2.645490124692819, + "grad_norm": 10.0625, + "learning_rate": 2.2754929085885034e-06, + "loss": 1.2349538803100586, + "step": 14534 + }, + { + "epoch": 2.645854191317011, + "grad_norm": 12.5, + "learning_rate": 2.274934359667107e-06, + "loss": 1.3877525329589844, + "step": 14536 + }, + { + "epoch": 2.6462182579412032, + "grad_norm": 50.25, + "learning_rate": 2.274376357383997e-06, + "loss": 1.4723800420761108, + "step": 14538 + }, + { + "epoch": 2.6465823245653954, + "grad_norm": 8.875, + "learning_rate": 2.2738189018210587e-06, + "loss": 1.580697774887085, + "step": 14540 + }, + { + "epoch": 2.6469463911895876, + "grad_norm": 7.25, + "learning_rate": 2.273261993060095e-06, + "loss": 1.3183248043060303, + "step": 14542 + }, + { + "epoch": 2.64731045781378, + "grad_norm": 7.1875, + "learning_rate": 2.27270563118283e-06, + "loss": 1.2986781597137451, + "step": 14544 + }, + { + "epoch": 2.647674524437972, + "grad_norm": 4.28125, + "learning_rate": 2.272149816270909e-06, + "loss": 1.0991648435592651, + "step": 14546 + }, + { + "epoch": 2.648038591062164, + "grad_norm": 6.21875, + "learning_rate": 2.2715945484058945e-06, + "loss": 1.0388941764831543, + "step": 14548 + }, + { + "epoch": 2.648402657686357, + "grad_norm": 10.125, + "learning_rate": 2.27103982766927e-06, + "loss": 0.7706663608551025, + "step": 14550 + }, + { + "epoch": 2.6487667243105486, + "grad_norm": 13.25, + "learning_rate": 2.2704856541424396e-06, + "loss": 1.0283136367797852, + "step": 14552 + }, + { + "epoch": 2.6491307909347412, + "grad_norm": 20.75, + "learning_rate": 2.269932027906724e-06, + "loss": 1.2795251607894897, + "step": 14554 + }, + { + "epoch": 2.6494948575589334, + "grad_norm": 20.0, + "learning_rate": 2.2693789490433672e-06, + "loss": 1.2816487550735474, + "step": 14556 + }, + { + "epoch": 2.6498589241831256, + "grad_norm": 13.9375, + "learning_rate": 2.2688264176335305e-06, + "loss": 1.2676740884780884, + "step": 14558 + }, + { + "epoch": 2.650222990807318, + "grad_norm": 3.59375, + "learning_rate": 2.2682744337582964e-06, + "loss": 1.0327903032302856, + "step": 14560 + }, + { + "epoch": 2.65058705743151, + "grad_norm": 12.6875, + "learning_rate": 2.2677229974986646e-06, + "loss": 0.953514039516449, + "step": 14562 + }, + { + "epoch": 2.650951124055702, + "grad_norm": 10.125, + "learning_rate": 2.267172108935558e-06, + "loss": 1.427741527557373, + "step": 14564 + }, + { + "epoch": 2.6513151906798944, + "grad_norm": 9.125, + "learning_rate": 2.266621768149817e-06, + "loss": 1.3594467639923096, + "step": 14566 + }, + { + "epoch": 2.6516792573040866, + "grad_norm": 37.0, + "learning_rate": 2.266071975222201e-06, + "loss": 1.4734126329421997, + "step": 14568 + }, + { + "epoch": 2.652043323928279, + "grad_norm": 16.0, + "learning_rate": 2.26552273023339e-06, + "loss": 1.6638107299804688, + "step": 14570 + }, + { + "epoch": 2.652407390552471, + "grad_norm": 12.0625, + "learning_rate": 2.2649740332639847e-06, + "loss": 1.2713983058929443, + "step": 14572 + }, + { + "epoch": 2.652771457176663, + "grad_norm": 27.5, + "learning_rate": 2.264425884394503e-06, + "loss": 1.333370566368103, + "step": 14574 + }, + { + "epoch": 2.653135523800856, + "grad_norm": 32.5, + "learning_rate": 2.2638782837053833e-06, + "loss": 1.5306910276412964, + "step": 14576 + }, + { + "epoch": 2.6534995904250476, + "grad_norm": 12.0625, + "learning_rate": 2.263331231276985e-06, + "loss": 1.5224599838256836, + "step": 14578 + }, + { + "epoch": 2.65386365704924, + "grad_norm": 8.1875, + "learning_rate": 2.262784727189584e-06, + "loss": 1.6478886604309082, + "step": 14580 + }, + { + "epoch": 2.6542277236734324, + "grad_norm": 15.1875, + "learning_rate": 2.2622387715233802e-06, + "loss": 1.4007929563522339, + "step": 14582 + }, + { + "epoch": 2.6545917902976246, + "grad_norm": 7.71875, + "learning_rate": 2.261693364358488e-06, + "loss": 0.9857863187789917, + "step": 14584 + }, + { + "epoch": 2.654955856921817, + "grad_norm": 35.75, + "learning_rate": 2.261148505774945e-06, + "loss": 1.383042812347412, + "step": 14586 + }, + { + "epoch": 2.655319923546009, + "grad_norm": 5.53125, + "learning_rate": 2.260604195852706e-06, + "loss": 1.1225652694702148, + "step": 14588 + }, + { + "epoch": 2.655683990170201, + "grad_norm": 11.625, + "learning_rate": 2.2600604346716463e-06, + "loss": 1.3695437908172607, + "step": 14590 + }, + { + "epoch": 2.6560480567943934, + "grad_norm": 26.875, + "learning_rate": 2.2595172223115626e-06, + "loss": 1.3275632858276367, + "step": 14592 + }, + { + "epoch": 2.6564121234185856, + "grad_norm": 72.5, + "learning_rate": 2.258974558852167e-06, + "loss": 1.5609420537948608, + "step": 14594 + }, + { + "epoch": 2.6567761900427778, + "grad_norm": 22.25, + "learning_rate": 2.2584324443730937e-06, + "loss": 1.7811380624771118, + "step": 14596 + }, + { + "epoch": 2.65714025666697, + "grad_norm": 9.125, + "learning_rate": 2.257890878953896e-06, + "loss": 1.324152946472168, + "step": 14598 + }, + { + "epoch": 2.657504323291162, + "grad_norm": 3.3125, + "learning_rate": 2.2573498626740457e-06, + "loss": 1.0647938251495361, + "step": 14600 + }, + { + "epoch": 2.6578683899153543, + "grad_norm": 5.34375, + "learning_rate": 2.2568093956129368e-06, + "loss": 0.8082306385040283, + "step": 14602 + }, + { + "epoch": 2.6582324565395465, + "grad_norm": 10.1875, + "learning_rate": 2.2562694778498786e-06, + "loss": 1.3415851593017578, + "step": 14604 + }, + { + "epoch": 2.658596523163739, + "grad_norm": 23.0, + "learning_rate": 2.2557301094641026e-06, + "loss": 1.5772604942321777, + "step": 14606 + }, + { + "epoch": 2.658960589787931, + "grad_norm": 20.375, + "learning_rate": 2.2551912905347586e-06, + "loss": 1.470680832862854, + "step": 14608 + }, + { + "epoch": 2.6593246564121236, + "grad_norm": 4.21875, + "learning_rate": 2.2546530211409157e-06, + "loss": 0.18548749387264252, + "step": 14610 + }, + { + "epoch": 2.6596887230363158, + "grad_norm": 21.5, + "learning_rate": 2.254115301361565e-06, + "loss": 1.3066803216934204, + "step": 14612 + }, + { + "epoch": 2.660052789660508, + "grad_norm": 13.3125, + "learning_rate": 2.253578131275612e-06, + "loss": 1.705047607421875, + "step": 14614 + }, + { + "epoch": 2.6604168562847, + "grad_norm": 13.625, + "learning_rate": 2.2530415109618863e-06, + "loss": 1.3961238861083984, + "step": 14616 + }, + { + "epoch": 2.6607809229088923, + "grad_norm": 10.8125, + "learning_rate": 2.2525054404991327e-06, + "loss": 1.5302186012268066, + "step": 14618 + }, + { + "epoch": 2.6611449895330845, + "grad_norm": 12.3125, + "learning_rate": 2.2519699199660182e-06, + "loss": 1.515799641609192, + "step": 14620 + }, + { + "epoch": 2.6615090561572767, + "grad_norm": 17.125, + "learning_rate": 2.251434949441129e-06, + "loss": 1.6144614219665527, + "step": 14622 + }, + { + "epoch": 2.661873122781469, + "grad_norm": 19.0, + "learning_rate": 2.2509005290029697e-06, + "loss": 1.8386269807815552, + "step": 14624 + }, + { + "epoch": 2.662237189405661, + "grad_norm": 12.625, + "learning_rate": 2.2503666587299637e-06, + "loss": 1.4178409576416016, + "step": 14626 + }, + { + "epoch": 2.6626012560298533, + "grad_norm": 11.0625, + "learning_rate": 2.249833338700455e-06, + "loss": 1.9339309930801392, + "step": 14628 + }, + { + "epoch": 2.6629653226540455, + "grad_norm": 36.75, + "learning_rate": 2.2493005689927046e-06, + "loss": 1.367071270942688, + "step": 14630 + }, + { + "epoch": 2.663329389278238, + "grad_norm": 9.6875, + "learning_rate": 2.248768349684897e-06, + "loss": 1.1875, + "step": 14632 + }, + { + "epoch": 2.66369345590243, + "grad_norm": 28.5, + "learning_rate": 2.2482366808551306e-06, + "loss": 0.8653788566589355, + "step": 14634 + }, + { + "epoch": 2.6640575225266225, + "grad_norm": 30.625, + "learning_rate": 2.2477055625814273e-06, + "loss": 1.4966187477111816, + "step": 14636 + }, + { + "epoch": 2.6644215891508147, + "grad_norm": 18.0, + "learning_rate": 2.2471749949417253e-06, + "loss": 1.96842360496521, + "step": 14638 + }, + { + "epoch": 2.664785655775007, + "grad_norm": 32.5, + "learning_rate": 2.246644978013884e-06, + "loss": 1.4189033508300781, + "step": 14640 + }, + { + "epoch": 2.665149722399199, + "grad_norm": 11.8125, + "learning_rate": 2.246115511875682e-06, + "loss": 1.377279281616211, + "step": 14642 + }, + { + "epoch": 2.6655137890233913, + "grad_norm": 13.375, + "learning_rate": 2.2455865966048152e-06, + "loss": 1.3651355504989624, + "step": 14644 + }, + { + "epoch": 2.6658778556475835, + "grad_norm": 29.0, + "learning_rate": 2.2450582322788996e-06, + "loss": 1.9583160877227783, + "step": 14646 + }, + { + "epoch": 2.6662419222717757, + "grad_norm": 13.4375, + "learning_rate": 2.2445304189754714e-06, + "loss": 1.6047117710113525, + "step": 14648 + }, + { + "epoch": 2.666605988895968, + "grad_norm": 3.578125, + "learning_rate": 2.2440031567719833e-06, + "loss": 0.9793223738670349, + "step": 14650 + }, + { + "epoch": 2.66697005552016, + "grad_norm": 27.625, + "learning_rate": 2.243476445745812e-06, + "loss": 1.208122730255127, + "step": 14652 + }, + { + "epoch": 2.6673341221443523, + "grad_norm": 7.84375, + "learning_rate": 2.242950285974248e-06, + "loss": 1.2500379085540771, + "step": 14654 + }, + { + "epoch": 2.6676981887685445, + "grad_norm": 14.5, + "learning_rate": 2.242424677534503e-06, + "loss": 1.81507408618927, + "step": 14656 + }, + { + "epoch": 2.668062255392737, + "grad_norm": 19.75, + "learning_rate": 2.241899620503709e-06, + "loss": 1.8425989151000977, + "step": 14658 + }, + { + "epoch": 2.668426322016929, + "grad_norm": 13.125, + "learning_rate": 2.2413751149589145e-06, + "loss": 1.1693949699401855, + "step": 14660 + }, + { + "epoch": 2.6687903886411215, + "grad_norm": 12.4375, + "learning_rate": 2.240851160977091e-06, + "loss": 1.3427404165267944, + "step": 14662 + }, + { + "epoch": 2.6691544552653137, + "grad_norm": 8.625, + "learning_rate": 2.2403277586351236e-06, + "loss": 1.2425905466079712, + "step": 14664 + }, + { + "epoch": 2.669518521889506, + "grad_norm": 17.625, + "learning_rate": 2.2398049080098215e-06, + "loss": 1.4040961265563965, + "step": 14666 + }, + { + "epoch": 2.669882588513698, + "grad_norm": 10.6875, + "learning_rate": 2.2392826091779106e-06, + "loss": 1.386849045753479, + "step": 14668 + }, + { + "epoch": 2.6702466551378903, + "grad_norm": 10.125, + "learning_rate": 2.238760862216036e-06, + "loss": 1.525034785270691, + "step": 14670 + }, + { + "epoch": 2.6706107217620825, + "grad_norm": 21.875, + "learning_rate": 2.238239667200762e-06, + "loss": 1.605167031288147, + "step": 14672 + }, + { + "epoch": 2.6709747883862747, + "grad_norm": 22.625, + "learning_rate": 2.237719024208572e-06, + "loss": 2.1560471057891846, + "step": 14674 + }, + { + "epoch": 2.671338855010467, + "grad_norm": 19.25, + "learning_rate": 2.2371989333158673e-06, + "loss": 1.6465058326721191, + "step": 14676 + }, + { + "epoch": 2.671702921634659, + "grad_norm": 42.75, + "learning_rate": 2.2366793945989706e-06, + "loss": 1.6958762407302856, + "step": 14678 + }, + { + "epoch": 2.6720669882588513, + "grad_norm": 14.625, + "learning_rate": 2.2361604081341203e-06, + "loss": 0.8031466007232666, + "step": 14680 + }, + { + "epoch": 2.6724310548830434, + "grad_norm": 9.875, + "learning_rate": 2.2356419739974774e-06, + "loss": 1.3648557662963867, + "step": 14682 + }, + { + "epoch": 2.672795121507236, + "grad_norm": 8.625, + "learning_rate": 2.235124092265119e-06, + "loss": 0.6042052507400513, + "step": 14684 + }, + { + "epoch": 2.673159188131428, + "grad_norm": 21.25, + "learning_rate": 2.234606763013042e-06, + "loss": 1.4619104862213135, + "step": 14686 + }, + { + "epoch": 2.6735232547556205, + "grad_norm": 13.0625, + "learning_rate": 2.2340899863171632e-06, + "loss": 1.5344159603118896, + "step": 14688 + }, + { + "epoch": 2.6738873213798127, + "grad_norm": 14.75, + "learning_rate": 2.2335737622533166e-06, + "loss": 1.0969370603561401, + "step": 14690 + }, + { + "epoch": 2.674251388004005, + "grad_norm": 34.0, + "learning_rate": 2.2330580908972574e-06, + "loss": 0.5326094627380371, + "step": 14692 + }, + { + "epoch": 2.674615454628197, + "grad_norm": 20.25, + "learning_rate": 2.232542972324656e-06, + "loss": 1.4828294515609741, + "step": 14694 + }, + { + "epoch": 2.6749795212523892, + "grad_norm": 22.5, + "learning_rate": 2.232028406611106e-06, + "loss": 1.7330148220062256, + "step": 14696 + }, + { + "epoch": 2.6753435878765814, + "grad_norm": 7.09375, + "learning_rate": 2.2315143938321173e-06, + "loss": 1.4993782043457031, + "step": 14698 + }, + { + "epoch": 2.6757076545007736, + "grad_norm": 11.4375, + "learning_rate": 2.2310009340631176e-06, + "loss": 1.0553703308105469, + "step": 14700 + }, + { + "epoch": 2.676071721124966, + "grad_norm": 17.0, + "learning_rate": 2.230488027379458e-06, + "loss": 1.1226357221603394, + "step": 14702 + }, + { + "epoch": 2.676435787749158, + "grad_norm": 27.25, + "learning_rate": 2.2299756738564037e-06, + "loss": 0.956444263458252, + "step": 14704 + }, + { + "epoch": 2.67679985437335, + "grad_norm": 13.75, + "learning_rate": 2.2294638735691398e-06, + "loss": 1.2135436534881592, + "step": 14706 + }, + { + "epoch": 2.6771639209975424, + "grad_norm": 13.1875, + "learning_rate": 2.2289526265927724e-06, + "loss": 1.5399620532989502, + "step": 14708 + }, + { + "epoch": 2.677527987621735, + "grad_norm": 27.25, + "learning_rate": 2.228441933002323e-06, + "loss": 2.1901984214782715, + "step": 14710 + }, + { + "epoch": 2.677892054245927, + "grad_norm": 4.25, + "learning_rate": 2.2279317928727374e-06, + "loss": 1.040196418762207, + "step": 14712 + }, + { + "epoch": 2.6782561208701194, + "grad_norm": 6.09375, + "learning_rate": 2.2274222062788732e-06, + "loss": 1.4260969161987305, + "step": 14714 + }, + { + "epoch": 2.678620187494311, + "grad_norm": 10.375, + "learning_rate": 2.226913173295511e-06, + "loss": 1.3539091348648071, + "step": 14716 + }, + { + "epoch": 2.678984254118504, + "grad_norm": 16.125, + "learning_rate": 2.2264046939973503e-06, + "loss": 1.5118415355682373, + "step": 14718 + }, + { + "epoch": 2.679348320742696, + "grad_norm": 14.875, + "learning_rate": 2.225896768459007e-06, + "loss": 1.3771758079528809, + "step": 14720 + }, + { + "epoch": 2.679712387366888, + "grad_norm": 7.15625, + "learning_rate": 2.225389396755019e-06, + "loss": 1.0954978466033936, + "step": 14722 + }, + { + "epoch": 2.6800764539910804, + "grad_norm": 15.875, + "learning_rate": 2.2248825789598384e-06, + "loss": 1.1442478895187378, + "step": 14724 + }, + { + "epoch": 2.6804405206152726, + "grad_norm": 10.0, + "learning_rate": 2.2243763151478415e-06, + "loss": 0.6619093418121338, + "step": 14726 + }, + { + "epoch": 2.680804587239465, + "grad_norm": 17.0, + "learning_rate": 2.223870605393318e-06, + "loss": 1.2649319171905518, + "step": 14728 + }, + { + "epoch": 2.681168653863657, + "grad_norm": 9.8125, + "learning_rate": 2.2233654497704795e-06, + "loss": 0.7245772480964661, + "step": 14730 + }, + { + "epoch": 2.681532720487849, + "grad_norm": 14.9375, + "learning_rate": 2.2228608483534573e-06, + "loss": 1.4337693452835083, + "step": 14732 + }, + { + "epoch": 2.6818967871120414, + "grad_norm": 6.28125, + "learning_rate": 2.222356801216298e-06, + "loss": 1.0649837255477905, + "step": 14734 + }, + { + "epoch": 2.6822608537362336, + "grad_norm": 19.375, + "learning_rate": 2.2218533084329676e-06, + "loss": 1.519266963005066, + "step": 14736 + }, + { + "epoch": 2.6826249203604258, + "grad_norm": 8.6875, + "learning_rate": 2.221350370077354e-06, + "loss": 0.9926698207855225, + "step": 14738 + }, + { + "epoch": 2.6829889869846184, + "grad_norm": 7.09375, + "learning_rate": 2.2208479862232586e-06, + "loss": 1.3184170722961426, + "step": 14740 + }, + { + "epoch": 2.68335305360881, + "grad_norm": 12.875, + "learning_rate": 2.220346156944407e-06, + "loss": 1.365871787071228, + "step": 14742 + }, + { + "epoch": 2.683717120233003, + "grad_norm": 14.3125, + "learning_rate": 2.2198448823144384e-06, + "loss": 1.2982304096221924, + "step": 14744 + }, + { + "epoch": 2.684081186857195, + "grad_norm": 14.3125, + "learning_rate": 2.219344162406914e-06, + "loss": 1.4157578945159912, + "step": 14746 + }, + { + "epoch": 2.684445253481387, + "grad_norm": 17.375, + "learning_rate": 2.218843997295312e-06, + "loss": 1.2325361967086792, + "step": 14748 + }, + { + "epoch": 2.6848093201055794, + "grad_norm": 6.46875, + "learning_rate": 2.2183443870530295e-06, + "loss": 1.0730057954788208, + "step": 14750 + }, + { + "epoch": 2.6851733867297716, + "grad_norm": 7.21875, + "learning_rate": 2.2178453317533833e-06, + "loss": 1.3177622556686401, + "step": 14752 + }, + { + "epoch": 2.6855374533539638, + "grad_norm": 10.8125, + "learning_rate": 2.2173468314696066e-06, + "loss": 1.1811798810958862, + "step": 14754 + }, + { + "epoch": 2.685901519978156, + "grad_norm": 8.1875, + "learning_rate": 2.2168488862748522e-06, + "loss": 1.1338566541671753, + "step": 14756 + }, + { + "epoch": 2.686265586602348, + "grad_norm": 14.8125, + "learning_rate": 2.2163514962421924e-06, + "loss": 1.6747138500213623, + "step": 14758 + }, + { + "epoch": 2.6866296532265403, + "grad_norm": 11.75, + "learning_rate": 2.215854661444616e-06, + "loss": 1.6357932090759277, + "step": 14760 + }, + { + "epoch": 2.6869937198507325, + "grad_norm": 5.0625, + "learning_rate": 2.2153583819550336e-06, + "loss": 1.193169355392456, + "step": 14762 + }, + { + "epoch": 2.6873577864749247, + "grad_norm": 8.25, + "learning_rate": 2.21486265784627e-06, + "loss": 0.9699419736862183, + "step": 14764 + }, + { + "epoch": 2.6877218530991174, + "grad_norm": 10.25, + "learning_rate": 2.214367489191071e-06, + "loss": 1.4303457736968994, + "step": 14766 + }, + { + "epoch": 2.688085919723309, + "grad_norm": 19.0, + "learning_rate": 2.213872876062102e-06, + "loss": 1.7218208312988281, + "step": 14768 + }, + { + "epoch": 2.6884499863475018, + "grad_norm": 12.375, + "learning_rate": 2.2133788185319438e-06, + "loss": 1.265617847442627, + "step": 14770 + }, + { + "epoch": 2.688814052971694, + "grad_norm": 6.125, + "learning_rate": 2.2128853166730995e-06, + "loss": 1.4886524677276611, + "step": 14772 + }, + { + "epoch": 2.689178119595886, + "grad_norm": 8.875, + "learning_rate": 2.2123923705579866e-06, + "loss": 1.4232155084609985, + "step": 14774 + }, + { + "epoch": 2.6895421862200783, + "grad_norm": 8.625, + "learning_rate": 2.2118999802589425e-06, + "loss": 1.5835509300231934, + "step": 14776 + }, + { + "epoch": 2.6899062528442705, + "grad_norm": 12.4375, + "learning_rate": 2.2114081458482255e-06, + "loss": 1.3214442729949951, + "step": 14778 + }, + { + "epoch": 2.6902703194684627, + "grad_norm": 11.3125, + "learning_rate": 2.2109168673980087e-06, + "loss": 1.3034852743148804, + "step": 14780 + }, + { + "epoch": 2.690634386092655, + "grad_norm": 11.375, + "learning_rate": 2.2104261449803864e-06, + "loss": 1.1885371208190918, + "step": 14782 + }, + { + "epoch": 2.690998452716847, + "grad_norm": 28.625, + "learning_rate": 2.2099359786673693e-06, + "loss": 1.3588523864746094, + "step": 14784 + }, + { + "epoch": 2.6913625193410393, + "grad_norm": 4.78125, + "learning_rate": 2.2094463685308873e-06, + "loss": 1.1375269889831543, + "step": 14786 + }, + { + "epoch": 2.6917265859652315, + "grad_norm": 16.5, + "learning_rate": 2.208957314642789e-06, + "loss": 1.2421104907989502, + "step": 14788 + }, + { + "epoch": 2.6920906525894237, + "grad_norm": 8.625, + "learning_rate": 2.2084688170748404e-06, + "loss": 0.9773150682449341, + "step": 14790 + }, + { + "epoch": 2.6924547192136163, + "grad_norm": 15.3125, + "learning_rate": 2.2079808758987287e-06, + "loss": 1.2801337242126465, + "step": 14792 + }, + { + "epoch": 2.692818785837808, + "grad_norm": 10.4375, + "learning_rate": 2.2074934911860544e-06, + "loss": 1.7359005212783813, + "step": 14794 + }, + { + "epoch": 2.6931828524620007, + "grad_norm": 19.25, + "learning_rate": 2.207006663008341e-06, + "loss": 1.1615086793899536, + "step": 14796 + }, + { + "epoch": 2.693546919086193, + "grad_norm": 21.25, + "learning_rate": 2.2065203914370287e-06, + "loss": 0.834223747253418, + "step": 14798 + }, + { + "epoch": 2.693910985710385, + "grad_norm": 9.5, + "learning_rate": 2.2060346765434743e-06, + "loss": 1.5428526401519775, + "step": 14800 + }, + { + "epoch": 2.6942750523345773, + "grad_norm": 4.90625, + "learning_rate": 2.2055495183989565e-06, + "loss": 1.3263686895370483, + "step": 14802 + }, + { + "epoch": 2.6946391189587695, + "grad_norm": 10.1875, + "learning_rate": 2.205064917074669e-06, + "loss": 1.4751248359680176, + "step": 14804 + }, + { + "epoch": 2.6950031855829617, + "grad_norm": 10.9375, + "learning_rate": 2.2045808726417254e-06, + "loss": 1.3308625221252441, + "step": 14806 + }, + { + "epoch": 2.695367252207154, + "grad_norm": 13.25, + "learning_rate": 2.204097385171157e-06, + "loss": 1.5199995040893555, + "step": 14808 + }, + { + "epoch": 2.695731318831346, + "grad_norm": 27.25, + "learning_rate": 2.2036144547339135e-06, + "loss": 1.3691648244857788, + "step": 14810 + }, + { + "epoch": 2.6960953854555383, + "grad_norm": 22.25, + "learning_rate": 2.2031320814008646e-06, + "loss": 1.6099021434783936, + "step": 14812 + }, + { + "epoch": 2.6964594520797305, + "grad_norm": 11.25, + "learning_rate": 2.2026502652427944e-06, + "loss": 1.3485583066940308, + "step": 14814 + }, + { + "epoch": 2.6968235187039227, + "grad_norm": 13.25, + "learning_rate": 2.202169006330409e-06, + "loss": 1.1106489896774292, + "step": 14816 + }, + { + "epoch": 2.6971875853281153, + "grad_norm": 44.25, + "learning_rate": 2.2016883047343305e-06, + "loss": 0.8135997653007507, + "step": 14818 + }, + { + "epoch": 2.697551651952307, + "grad_norm": 23.5, + "learning_rate": 2.201208160525099e-06, + "loss": 1.2263531684875488, + "step": 14820 + }, + { + "epoch": 2.6979157185764997, + "grad_norm": 6.21875, + "learning_rate": 2.2007285737731765e-06, + "loss": 1.3085579872131348, + "step": 14822 + }, + { + "epoch": 2.6982797852006914, + "grad_norm": 4.21875, + "learning_rate": 2.200249544548938e-06, + "loss": 1.15388023853302, + "step": 14824 + }, + { + "epoch": 2.698643851824884, + "grad_norm": 52.75, + "learning_rate": 2.19977107292268e-06, + "loss": 0.8058637380599976, + "step": 14826 + }, + { + "epoch": 2.6990079184490763, + "grad_norm": 56.25, + "learning_rate": 2.1992931589646153e-06, + "loss": 0.7274093627929688, + "step": 14828 + }, + { + "epoch": 2.6993719850732685, + "grad_norm": 5.09375, + "learning_rate": 2.198815802744877e-06, + "loss": 1.1417689323425293, + "step": 14830 + }, + { + "epoch": 2.6997360516974607, + "grad_norm": 9.4375, + "learning_rate": 2.1983390043335152e-06, + "loss": 1.6141223907470703, + "step": 14832 + }, + { + "epoch": 2.700100118321653, + "grad_norm": 9.25, + "learning_rate": 2.1978627638004977e-06, + "loss": 1.4747462272644043, + "step": 14834 + }, + { + "epoch": 2.700464184945845, + "grad_norm": 8.3125, + "learning_rate": 2.1973870812157105e-06, + "loss": 1.1974679231643677, + "step": 14836 + }, + { + "epoch": 2.7008282515700373, + "grad_norm": 16.25, + "learning_rate": 2.1969119566489584e-06, + "loss": 1.1827150583267212, + "step": 14838 + }, + { + "epoch": 2.7011923181942294, + "grad_norm": 7.4375, + "learning_rate": 2.196437390169964e-06, + "loss": 1.1084064245224, + "step": 14840 + }, + { + "epoch": 2.7015563848184216, + "grad_norm": 7.78125, + "learning_rate": 2.195963381848369e-06, + "loss": 1.2836636304855347, + "step": 14842 + }, + { + "epoch": 2.701920451442614, + "grad_norm": 11.3125, + "learning_rate": 2.1954899317537306e-06, + "loss": 1.534792184829712, + "step": 14844 + }, + { + "epoch": 2.702284518066806, + "grad_norm": 23.75, + "learning_rate": 2.1950170399555267e-06, + "loss": 1.5105608701705933, + "step": 14846 + }, + { + "epoch": 2.7026485846909987, + "grad_norm": 17.75, + "learning_rate": 2.1945447065231518e-06, + "loss": 1.5451600551605225, + "step": 14848 + }, + { + "epoch": 2.7030126513151904, + "grad_norm": 8.125, + "learning_rate": 2.194072931525918e-06, + "loss": 1.1832053661346436, + "step": 14850 + }, + { + "epoch": 2.703376717939383, + "grad_norm": 7.53125, + "learning_rate": 2.1936017150330593e-06, + "loss": 1.4322609901428223, + "step": 14852 + }, + { + "epoch": 2.7037407845635753, + "grad_norm": 22.5, + "learning_rate": 2.193131057113722e-06, + "loss": 1.5028266906738281, + "step": 14854 + }, + { + "epoch": 2.7041048511877674, + "grad_norm": 11.875, + "learning_rate": 2.1926609578369744e-06, + "loss": 1.5083743333816528, + "step": 14856 + }, + { + "epoch": 2.7044689178119596, + "grad_norm": 19.75, + "learning_rate": 2.192191417271801e-06, + "loss": 1.633476972579956, + "step": 14858 + }, + { + "epoch": 2.704832984436152, + "grad_norm": 10.0, + "learning_rate": 2.1917224354871052e-06, + "loss": 0.9893850684165955, + "step": 14860 + }, + { + "epoch": 2.705197051060344, + "grad_norm": 12.25, + "learning_rate": 2.1912540125517095e-06, + "loss": 1.5509364604949951, + "step": 14862 + }, + { + "epoch": 2.705561117684536, + "grad_norm": 43.5, + "learning_rate": 2.1907861485343516e-06, + "loss": 1.4076491594314575, + "step": 14864 + }, + { + "epoch": 2.7059251843087284, + "grad_norm": 11.4375, + "learning_rate": 2.1903188435036884e-06, + "loss": 1.3426060676574707, + "step": 14866 + }, + { + "epoch": 2.7062892509329206, + "grad_norm": 14.8125, + "learning_rate": 2.189852097528296e-06, + "loss": 1.2959814071655273, + "step": 14868 + }, + { + "epoch": 2.706653317557113, + "grad_norm": 5.0625, + "learning_rate": 2.1893859106766668e-06, + "loss": 1.3426402807235718, + "step": 14870 + }, + { + "epoch": 2.707017384181305, + "grad_norm": 9.0, + "learning_rate": 2.188920283017213e-06, + "loss": 1.2335952520370483, + "step": 14872 + }, + { + "epoch": 2.7073814508054976, + "grad_norm": 11.5625, + "learning_rate": 2.1884552146182623e-06, + "loss": 1.3386529684066772, + "step": 14874 + }, + { + "epoch": 2.7077455174296894, + "grad_norm": 18.0, + "learning_rate": 2.1879907055480618e-06, + "loss": 1.4073715209960938, + "step": 14876 + }, + { + "epoch": 2.708109584053882, + "grad_norm": 15.4375, + "learning_rate": 2.1875267558747766e-06, + "loss": 1.9418431520462036, + "step": 14878 + }, + { + "epoch": 2.708473650678074, + "grad_norm": 8.8125, + "learning_rate": 2.1870633656664885e-06, + "loss": 1.32002854347229, + "step": 14880 + }, + { + "epoch": 2.7088377173022664, + "grad_norm": 8.5625, + "learning_rate": 2.186600534991201e-06, + "loss": 1.1419707536697388, + "step": 14882 + }, + { + "epoch": 2.7092017839264586, + "grad_norm": 13.375, + "learning_rate": 2.1861382639168294e-06, + "loss": 1.194664478302002, + "step": 14884 + }, + { + "epoch": 2.709565850550651, + "grad_norm": 20.75, + "learning_rate": 2.1856765525112112e-06, + "loss": 1.3358380794525146, + "step": 14886 + }, + { + "epoch": 2.709929917174843, + "grad_norm": 16.75, + "learning_rate": 2.1852154008421013e-06, + "loss": 1.4162893295288086, + "step": 14888 + }, + { + "epoch": 2.710293983799035, + "grad_norm": 20.0, + "learning_rate": 2.184754808977171e-06, + "loss": 1.0267837047576904, + "step": 14890 + }, + { + "epoch": 2.7106580504232274, + "grad_norm": 8.25, + "learning_rate": 2.1842947769840106e-06, + "loss": 0.7245466113090515, + "step": 14892 + }, + { + "epoch": 2.7110221170474196, + "grad_norm": 16.375, + "learning_rate": 2.1838353049301285e-06, + "loss": 1.1227068901062012, + "step": 14894 + }, + { + "epoch": 2.7113861836716118, + "grad_norm": 15.3125, + "learning_rate": 2.1833763928829497e-06, + "loss": 1.5669455528259277, + "step": 14896 + }, + { + "epoch": 2.711750250295804, + "grad_norm": 8.375, + "learning_rate": 2.1829180409098173e-06, + "loss": 1.2360657453536987, + "step": 14898 + }, + { + "epoch": 2.7121143169199966, + "grad_norm": 9.5625, + "learning_rate": 2.182460249077993e-06, + "loss": 1.4627820253372192, + "step": 14900 + }, + { + "epoch": 2.7124783835441884, + "grad_norm": 14.6875, + "learning_rate": 2.182003017454657e-06, + "loss": 1.2225757837295532, + "step": 14902 + }, + { + "epoch": 2.712842450168381, + "grad_norm": 10.3125, + "learning_rate": 2.181546346106905e-06, + "loss": 1.5045427083969116, + "step": 14904 + }, + { + "epoch": 2.713206516792573, + "grad_norm": 29.625, + "learning_rate": 2.181090235101751e-06, + "loss": 1.0104668140411377, + "step": 14906 + }, + { + "epoch": 2.7135705834167654, + "grad_norm": 14.0, + "learning_rate": 2.180634684506129e-06, + "loss": 1.2800325155258179, + "step": 14908 + }, + { + "epoch": 2.7139346500409576, + "grad_norm": 37.0, + "learning_rate": 2.180179694386888e-06, + "loss": 0.8998664021492004, + "step": 14910 + }, + { + "epoch": 2.7142987166651498, + "grad_norm": 14.75, + "learning_rate": 2.179725264810797e-06, + "loss": 1.3930096626281738, + "step": 14912 + }, + { + "epoch": 2.714662783289342, + "grad_norm": 8.9375, + "learning_rate": 2.179271395844541e-06, + "loss": 1.2303917407989502, + "step": 14914 + }, + { + "epoch": 2.715026849913534, + "grad_norm": 8.625, + "learning_rate": 2.178818087554724e-06, + "loss": 1.38310706615448, + "step": 14916 + }, + { + "epoch": 2.7153909165377264, + "grad_norm": 7.9375, + "learning_rate": 2.178365340007866e-06, + "loss": 1.2885059118270874, + "step": 14918 + }, + { + "epoch": 2.7157549831619185, + "grad_norm": 11.375, + "learning_rate": 2.177913153270407e-06, + "loss": 0.9565805196762085, + "step": 14920 + }, + { + "epoch": 2.7161190497861107, + "grad_norm": 11.125, + "learning_rate": 2.1774615274087033e-06, + "loss": 1.658626675605774, + "step": 14922 + }, + { + "epoch": 2.716483116410303, + "grad_norm": 11.0625, + "learning_rate": 2.1770104624890287e-06, + "loss": 1.3765509128570557, + "step": 14924 + }, + { + "epoch": 2.7168471830344956, + "grad_norm": 13.0625, + "learning_rate": 2.176559958577576e-06, + "loss": 1.358661413192749, + "step": 14926 + }, + { + "epoch": 2.7172112496586873, + "grad_norm": 4.0625, + "learning_rate": 2.176110015740454e-06, + "loss": 0.9088307619094849, + "step": 14928 + }, + { + "epoch": 2.71757531628288, + "grad_norm": 18.5, + "learning_rate": 2.17566063404369e-06, + "loss": 0.9821543097496033, + "step": 14930 + }, + { + "epoch": 2.717939382907072, + "grad_norm": 17.75, + "learning_rate": 2.1752118135532297e-06, + "loss": 0.6211255192756653, + "step": 14932 + }, + { + "epoch": 2.7183034495312643, + "grad_norm": 12.125, + "learning_rate": 2.1747635543349355e-06, + "loss": 1.5896955728530884, + "step": 14934 + }, + { + "epoch": 2.7186675161554565, + "grad_norm": 15.625, + "learning_rate": 2.1743158564545873e-06, + "loss": 1.669480800628662, + "step": 14936 + }, + { + "epoch": 2.7190315827796487, + "grad_norm": 7.90625, + "learning_rate": 2.173868719977883e-06, + "loss": 1.278374195098877, + "step": 14938 + }, + { + "epoch": 2.719395649403841, + "grad_norm": 6.4375, + "learning_rate": 2.173422144970437e-06, + "loss": 1.2889635562896729, + "step": 14940 + }, + { + "epoch": 2.719759716028033, + "grad_norm": 11.3125, + "learning_rate": 2.172976131497785e-06, + "loss": 1.3702778816223145, + "step": 14942 + }, + { + "epoch": 2.7201237826522253, + "grad_norm": 8.0, + "learning_rate": 2.1725306796253754e-06, + "loss": 1.725634217262268, + "step": 14944 + }, + { + "epoch": 2.7204878492764175, + "grad_norm": 6.875, + "learning_rate": 2.172085789418577e-06, + "loss": 1.1504065990447998, + "step": 14946 + }, + { + "epoch": 2.7208519159006097, + "grad_norm": 7.46875, + "learning_rate": 2.1716414609426762e-06, + "loss": 1.1584641933441162, + "step": 14948 + }, + { + "epoch": 2.721215982524802, + "grad_norm": 17.5, + "learning_rate": 2.1711976942628754e-06, + "loss": 1.146274209022522, + "step": 14950 + }, + { + "epoch": 2.7215800491489945, + "grad_norm": 7.03125, + "learning_rate": 2.1707544894442967e-06, + "loss": 1.547098994255066, + "step": 14952 + }, + { + "epoch": 2.7219441157731863, + "grad_norm": 9.9375, + "learning_rate": 2.1703118465519785e-06, + "loss": 1.1408462524414062, + "step": 14954 + }, + { + "epoch": 2.722308182397379, + "grad_norm": 18.125, + "learning_rate": 2.169869765650876e-06, + "loss": 1.8340272903442383, + "step": 14956 + }, + { + "epoch": 2.7226722490215707, + "grad_norm": 7.0625, + "learning_rate": 2.1694282468058634e-06, + "loss": 1.5290647745132446, + "step": 14958 + }, + { + "epoch": 2.7230363156457633, + "grad_norm": 86.5, + "learning_rate": 2.1689872900817312e-06, + "loss": 1.17153799533844, + "step": 14960 + }, + { + "epoch": 2.7234003822699555, + "grad_norm": 8.8125, + "learning_rate": 2.168546895543189e-06, + "loss": 0.8713878393173218, + "step": 14962 + }, + { + "epoch": 2.7237644488941477, + "grad_norm": 6.03125, + "learning_rate": 2.168107063254862e-06, + "loss": 1.2060374021530151, + "step": 14964 + }, + { + "epoch": 2.72412851551834, + "grad_norm": 18.125, + "learning_rate": 2.1676677932812945e-06, + "loss": 1.2441211938858032, + "step": 14966 + }, + { + "epoch": 2.724492582142532, + "grad_norm": 18.625, + "learning_rate": 2.1672290856869472e-06, + "loss": 1.272569179534912, + "step": 14968 + }, + { + "epoch": 2.7248566487667243, + "grad_norm": 9.1875, + "learning_rate": 2.166790940536198e-06, + "loss": 1.4582860469818115, + "step": 14970 + }, + { + "epoch": 2.7252207153909165, + "grad_norm": 25.625, + "learning_rate": 2.1663533578933447e-06, + "loss": 1.0654188394546509, + "step": 14972 + }, + { + "epoch": 2.7255847820151087, + "grad_norm": 16.0, + "learning_rate": 2.165916337822599e-06, + "loss": 1.414611577987671, + "step": 14974 + }, + { + "epoch": 2.725948848639301, + "grad_norm": 17.875, + "learning_rate": 2.165479880388093e-06, + "loss": 1.3049029111862183, + "step": 14976 + }, + { + "epoch": 2.726312915263493, + "grad_norm": 16.125, + "learning_rate": 2.165043985653874e-06, + "loss": 1.8345887660980225, + "step": 14978 + }, + { + "epoch": 2.7266769818876853, + "grad_norm": 15.6875, + "learning_rate": 2.1646086536839083e-06, + "loss": 1.536637544631958, + "step": 14980 + }, + { + "epoch": 2.727041048511878, + "grad_norm": 13.0625, + "learning_rate": 2.16417388454208e-06, + "loss": 1.3506499528884888, + "step": 14982 + }, + { + "epoch": 2.7274051151360696, + "grad_norm": 14.1875, + "learning_rate": 2.1637396782921885e-06, + "loss": 1.3078205585479736, + "step": 14984 + }, + { + "epoch": 2.7277691817602623, + "grad_norm": 20.375, + "learning_rate": 2.1633060349979524e-06, + "loss": 1.5477087497711182, + "step": 14986 + }, + { + "epoch": 2.7281332483844545, + "grad_norm": 12.75, + "learning_rate": 2.1628729547230066e-06, + "loss": 1.5537257194519043, + "step": 14988 + }, + { + "epoch": 2.7284973150086467, + "grad_norm": 15.6875, + "learning_rate": 2.162440437530904e-06, + "loss": 1.5752886533737183, + "step": 14990 + }, + { + "epoch": 2.728861381632839, + "grad_norm": 9.5625, + "learning_rate": 2.162008483485116e-06, + "loss": 1.7336336374282837, + "step": 14992 + }, + { + "epoch": 2.729225448257031, + "grad_norm": 8.875, + "learning_rate": 2.161577092649028e-06, + "loss": 1.469306230545044, + "step": 14994 + }, + { + "epoch": 2.7295895148812233, + "grad_norm": 7.53125, + "learning_rate": 2.1611462650859463e-06, + "loss": 1.1119840145111084, + "step": 14996 + }, + { + "epoch": 2.7299535815054154, + "grad_norm": 11.0625, + "learning_rate": 2.1607160008590925e-06, + "loss": 1.259823203086853, + "step": 14998 + }, + { + "epoch": 2.7303176481296076, + "grad_norm": 14.75, + "learning_rate": 2.160286300031606e-06, + "loss": 1.3366429805755615, + "step": 15000 + }, + { + "epoch": 2.7306817147538, + "grad_norm": 11.25, + "learning_rate": 2.1598571626665447e-06, + "loss": 1.6711106300354004, + "step": 15002 + }, + { + "epoch": 2.731045781377992, + "grad_norm": 20.25, + "learning_rate": 2.1594285888268816e-06, + "loss": 1.5893425941467285, + "step": 15004 + }, + { + "epoch": 2.7314098480021842, + "grad_norm": 21.875, + "learning_rate": 2.1590005785755087e-06, + "loss": 0.6128767728805542, + "step": 15006 + }, + { + "epoch": 2.731773914626377, + "grad_norm": 11.8125, + "learning_rate": 2.1585731319752344e-06, + "loss": 0.9804597496986389, + "step": 15008 + }, + { + "epoch": 2.7321379812505686, + "grad_norm": 27.25, + "learning_rate": 2.158146249088785e-06, + "loss": 1.5301352739334106, + "step": 15010 + }, + { + "epoch": 2.7325020478747613, + "grad_norm": 10.625, + "learning_rate": 2.1577199299788045e-06, + "loss": 1.9731130599975586, + "step": 15012 + }, + { + "epoch": 2.7328661144989534, + "grad_norm": 10.9375, + "learning_rate": 2.1572941747078526e-06, + "loss": 1.2514795064926147, + "step": 15014 + }, + { + "epoch": 2.7332301811231456, + "grad_norm": 14.125, + "learning_rate": 2.1568689833384077e-06, + "loss": 1.0623831748962402, + "step": 15016 + }, + { + "epoch": 2.733594247747338, + "grad_norm": 15.3125, + "learning_rate": 2.1564443559328644e-06, + "loss": 1.384691596031189, + "step": 15018 + }, + { + "epoch": 2.73395831437153, + "grad_norm": 6.15625, + "learning_rate": 2.1560202925535344e-06, + "loss": 0.8401814699172974, + "step": 15020 + }, + { + "epoch": 2.7343223809957222, + "grad_norm": 7.65625, + "learning_rate": 2.15559679326265e-06, + "loss": 1.2028660774230957, + "step": 15022 + }, + { + "epoch": 2.7346864476199144, + "grad_norm": 18.75, + "learning_rate": 2.1551738581223547e-06, + "loss": 0.48643553256988525, + "step": 15024 + }, + { + "epoch": 2.7350505142441066, + "grad_norm": 9.5, + "learning_rate": 2.1547514871947147e-06, + "loss": 1.4175217151641846, + "step": 15026 + }, + { + "epoch": 2.735414580868299, + "grad_norm": 21.125, + "learning_rate": 2.15432968054171e-06, + "loss": 1.3999290466308594, + "step": 15028 + }, + { + "epoch": 2.735778647492491, + "grad_norm": 13.6875, + "learning_rate": 2.1539084382252398e-06, + "loss": 1.3097772598266602, + "step": 15030 + }, + { + "epoch": 2.736142714116683, + "grad_norm": 7.03125, + "learning_rate": 2.15348776030712e-06, + "loss": 1.4213893413543701, + "step": 15032 + }, + { + "epoch": 2.736506780740876, + "grad_norm": 18.0, + "learning_rate": 2.1530676468490823e-06, + "loss": 1.4243406057357788, + "step": 15034 + }, + { + "epoch": 2.7368708473650676, + "grad_norm": 21.125, + "learning_rate": 2.152648097912777e-06, + "loss": 1.4513845443725586, + "step": 15036 + }, + { + "epoch": 2.73723491398926, + "grad_norm": 100.0, + "learning_rate": 2.152229113559772e-06, + "loss": 1.3756598234176636, + "step": 15038 + }, + { + "epoch": 2.7375989806134524, + "grad_norm": 11.3125, + "learning_rate": 2.1518106938515493e-06, + "loss": 1.209734320640564, + "step": 15040 + }, + { + "epoch": 2.7379630472376446, + "grad_norm": 28.0, + "learning_rate": 2.1513928388495142e-06, + "loss": 1.4093501567840576, + "step": 15042 + }, + { + "epoch": 2.738327113861837, + "grad_norm": 60.5, + "learning_rate": 2.150975548614982e-06, + "loss": 1.954169511795044, + "step": 15044 + }, + { + "epoch": 2.738691180486029, + "grad_norm": 23.375, + "learning_rate": 2.150558823209189e-06, + "loss": 1.0156046152114868, + "step": 15046 + }, + { + "epoch": 2.739055247110221, + "grad_norm": 9.25, + "learning_rate": 2.1501426626932888e-06, + "loss": 1.1951603889465332, + "step": 15048 + }, + { + "epoch": 2.7394193137344134, + "grad_norm": 11.125, + "learning_rate": 2.14972706712835e-06, + "loss": 1.8005973100662231, + "step": 15050 + }, + { + "epoch": 2.7397833803586056, + "grad_norm": 15.25, + "learning_rate": 2.149312036575361e-06, + "loss": 1.4487876892089844, + "step": 15052 + }, + { + "epoch": 2.7401474469827978, + "grad_norm": 19.25, + "learning_rate": 2.148897571095225e-06, + "loss": 1.4537475109100342, + "step": 15054 + }, + { + "epoch": 2.74051151360699, + "grad_norm": 24.25, + "learning_rate": 2.1484836707487633e-06, + "loss": 1.4401055574417114, + "step": 15056 + }, + { + "epoch": 2.740875580231182, + "grad_norm": 11.375, + "learning_rate": 2.1480703355967134e-06, + "loss": 1.3841688632965088, + "step": 15058 + }, + { + "epoch": 2.741239646855375, + "grad_norm": 11.4375, + "learning_rate": 2.1476575656997313e-06, + "loss": 1.1530070304870605, + "step": 15060 + }, + { + "epoch": 2.7416037134795666, + "grad_norm": 16.375, + "learning_rate": 2.1472453611183903e-06, + "loss": 1.2965025901794434, + "step": 15062 + }, + { + "epoch": 2.741967780103759, + "grad_norm": 16.125, + "learning_rate": 2.1468337219131783e-06, + "loss": 1.4502832889556885, + "step": 15064 + }, + { + "epoch": 2.742331846727951, + "grad_norm": 8.9375, + "learning_rate": 2.146422648144502e-06, + "loss": 1.5623334646224976, + "step": 15066 + }, + { + "epoch": 2.7426959133521436, + "grad_norm": 16.375, + "learning_rate": 2.1460121398726853e-06, + "loss": 1.527327537536621, + "step": 15068 + }, + { + "epoch": 2.7430599799763358, + "grad_norm": 25.125, + "learning_rate": 2.145602197157967e-06, + "loss": 1.4480284452438354, + "step": 15070 + }, + { + "epoch": 2.743424046600528, + "grad_norm": 5.1875, + "learning_rate": 2.145192820060507e-06, + "loss": 0.4914882779121399, + "step": 15072 + }, + { + "epoch": 2.74378811322472, + "grad_norm": 18.5, + "learning_rate": 2.1447840086403783e-06, + "loss": 1.370298147201538, + "step": 15074 + }, + { + "epoch": 2.7441521798489124, + "grad_norm": 15.9375, + "learning_rate": 2.144375762957572e-06, + "loss": 1.6641881465911865, + "step": 15076 + }, + { + "epoch": 2.7445162464731045, + "grad_norm": 7.09375, + "learning_rate": 2.143968083071998e-06, + "loss": 1.306210994720459, + "step": 15078 + }, + { + "epoch": 2.7448803130972967, + "grad_norm": 17.875, + "learning_rate": 2.143560969043479e-06, + "loss": 1.4841179847717285, + "step": 15080 + }, + { + "epoch": 2.745244379721489, + "grad_norm": 12.0, + "learning_rate": 2.1431544209317603e-06, + "loss": 1.9717035293579102, + "step": 15082 + }, + { + "epoch": 2.745608446345681, + "grad_norm": 21.625, + "learning_rate": 2.1427484387964994e-06, + "loss": 1.2555760145187378, + "step": 15084 + }, + { + "epoch": 2.7459725129698733, + "grad_norm": 8.4375, + "learning_rate": 2.1423430226972735e-06, + "loss": 1.5314890146255493, + "step": 15086 + }, + { + "epoch": 2.7463365795940655, + "grad_norm": 8.625, + "learning_rate": 2.141938172693575e-06, + "loss": 1.3276894092559814, + "step": 15088 + }, + { + "epoch": 2.746700646218258, + "grad_norm": 21.75, + "learning_rate": 2.141533888844814e-06, + "loss": 1.4084265232086182, + "step": 15090 + }, + { + "epoch": 2.74706471284245, + "grad_norm": 16.0, + "learning_rate": 2.1411301712103183e-06, + "loss": 1.401522159576416, + "step": 15092 + }, + { + "epoch": 2.7474287794666425, + "grad_norm": 13.75, + "learning_rate": 2.1407270198493313e-06, + "loss": 1.3489482402801514, + "step": 15094 + }, + { + "epoch": 2.7477928460908347, + "grad_norm": 6.78125, + "learning_rate": 2.1403244348210138e-06, + "loss": 1.3110331296920776, + "step": 15096 + }, + { + "epoch": 2.748156912715027, + "grad_norm": 4.375, + "learning_rate": 2.1399224161844436e-06, + "loss": 0.9076242446899414, + "step": 15098 + }, + { + "epoch": 2.748520979339219, + "grad_norm": 26.5, + "learning_rate": 2.139520963998615e-06, + "loss": 1.1531896591186523, + "step": 15100 + }, + { + "epoch": 2.7488850459634113, + "grad_norm": 6.03125, + "learning_rate": 2.1391200783224402e-06, + "loss": 0.8014776706695557, + "step": 15102 + }, + { + "epoch": 2.7492491125876035, + "grad_norm": 12.0, + "learning_rate": 2.1387197592147467e-06, + "loss": 1.4162440299987793, + "step": 15104 + }, + { + "epoch": 2.7496131792117957, + "grad_norm": 116.5, + "learning_rate": 2.13832000673428e-06, + "loss": 1.578439474105835, + "step": 15106 + }, + { + "epoch": 2.749977245835988, + "grad_norm": 11.625, + "learning_rate": 2.137920820939703e-06, + "loss": 1.4433608055114746, + "step": 15108 + }, + { + "epoch": 2.75034131246018, + "grad_norm": 10.0625, + "learning_rate": 2.1375222018895932e-06, + "loss": 1.4136310815811157, + "step": 15110 + }, + { + "epoch": 2.7507053790843723, + "grad_norm": 15.0625, + "learning_rate": 2.137124149642448e-06, + "loss": 1.2016401290893555, + "step": 15112 + }, + { + "epoch": 2.7510694457085645, + "grad_norm": 33.25, + "learning_rate": 2.1367266642566785e-06, + "loss": 1.7396986484527588, + "step": 15114 + }, + { + "epoch": 2.751433512332757, + "grad_norm": 24.125, + "learning_rate": 2.136329745790614e-06, + "loss": 1.8357279300689697, + "step": 15116 + }, + { + "epoch": 2.751797578956949, + "grad_norm": 8.0625, + "learning_rate": 2.1359333943025017e-06, + "loss": 1.2419112920761108, + "step": 15118 + }, + { + "epoch": 2.7521616455811415, + "grad_norm": 7.5625, + "learning_rate": 2.1355376098505033e-06, + "loss": 0.870661735534668, + "step": 15120 + }, + { + "epoch": 2.7525257122053337, + "grad_norm": 21.0, + "learning_rate": 2.1351423924927006e-06, + "loss": 1.0458451509475708, + "step": 15122 + }, + { + "epoch": 2.752889778829526, + "grad_norm": 18.25, + "learning_rate": 2.1347477422870885e-06, + "loss": 1.3755351305007935, + "step": 15124 + }, + { + "epoch": 2.753253845453718, + "grad_norm": 12.4375, + "learning_rate": 2.1343536592915805e-06, + "loss": 1.3855971097946167, + "step": 15126 + }, + { + "epoch": 2.7536179120779103, + "grad_norm": 3.921875, + "learning_rate": 2.133960143564007e-06, + "loss": 1.1703510284423828, + "step": 15128 + }, + { + "epoch": 2.7539819787021025, + "grad_norm": 14.625, + "learning_rate": 2.133567195162114e-06, + "loss": 1.7142263650894165, + "step": 15130 + }, + { + "epoch": 2.7543460453262947, + "grad_norm": 10.75, + "learning_rate": 2.1331748141435675e-06, + "loss": 1.4029278755187988, + "step": 15132 + }, + { + "epoch": 2.754710111950487, + "grad_norm": 11.625, + "learning_rate": 2.1327830005659454e-06, + "loss": 1.249886393547058, + "step": 15134 + }, + { + "epoch": 2.755074178574679, + "grad_norm": 18.0, + "learning_rate": 2.132391754486745e-06, + "loss": 1.6176830530166626, + "step": 15136 + }, + { + "epoch": 2.7554382451988713, + "grad_norm": 7.78125, + "learning_rate": 2.1320010759633812e-06, + "loss": 0.8771883845329285, + "step": 15138 + }, + { + "epoch": 2.7558023118230635, + "grad_norm": 23.125, + "learning_rate": 2.1316109650531826e-06, + "loss": 1.2134993076324463, + "step": 15140 + }, + { + "epoch": 2.756166378447256, + "grad_norm": 7.53125, + "learning_rate": 2.131221421813399e-06, + "loss": 1.7754442691802979, + "step": 15142 + }, + { + "epoch": 2.756530445071448, + "grad_norm": 9.4375, + "learning_rate": 2.1308324463011932e-06, + "loss": 1.2062547206878662, + "step": 15144 + }, + { + "epoch": 2.7568945116956405, + "grad_norm": 5.9375, + "learning_rate": 2.130444038573645e-06, + "loss": 1.3467555046081543, + "step": 15146 + }, + { + "epoch": 2.7572585783198327, + "grad_norm": 10.9375, + "learning_rate": 2.1300561986877517e-06, + "loss": 1.3722015619277954, + "step": 15148 + }, + { + "epoch": 2.757622644944025, + "grad_norm": 12.5625, + "learning_rate": 2.129668926700428e-06, + "loss": 1.748478651046753, + "step": 15150 + }, + { + "epoch": 2.757986711568217, + "grad_norm": 11.25, + "learning_rate": 2.129282222668505e-06, + "loss": 1.450814962387085, + "step": 15152 + }, + { + "epoch": 2.7583507781924093, + "grad_norm": 16.625, + "learning_rate": 2.128896086648728e-06, + "loss": 1.5398958921432495, + "step": 15154 + }, + { + "epoch": 2.7587148448166015, + "grad_norm": 14.75, + "learning_rate": 2.1285105186977627e-06, + "loss": 1.3081355094909668, + "step": 15156 + }, + { + "epoch": 2.7590789114407936, + "grad_norm": 7.90625, + "learning_rate": 2.1281255188721894e-06, + "loss": 1.2844856977462769, + "step": 15158 + }, + { + "epoch": 2.759442978064986, + "grad_norm": 9.0, + "learning_rate": 2.1277410872285037e-06, + "loss": 1.192986011505127, + "step": 15160 + }, + { + "epoch": 2.759807044689178, + "grad_norm": 10.625, + "learning_rate": 2.1273572238231217e-06, + "loss": 1.519608736038208, + "step": 15162 + }, + { + "epoch": 2.7601711113133702, + "grad_norm": 10.6875, + "learning_rate": 2.126973928712372e-06, + "loss": 1.3468029499053955, + "step": 15164 + }, + { + "epoch": 2.7605351779375624, + "grad_norm": 11.125, + "learning_rate": 2.126591201952503e-06, + "loss": 1.2082629203796387, + "step": 15166 + }, + { + "epoch": 2.760899244561755, + "grad_norm": 13.125, + "learning_rate": 2.126209043599677e-06, + "loss": 1.1842734813690186, + "step": 15168 + }, + { + "epoch": 2.761263311185947, + "grad_norm": 13.125, + "learning_rate": 2.125827453709974e-06, + "loss": 1.920096755027771, + "step": 15170 + }, + { + "epoch": 2.7616273778101394, + "grad_norm": 23.375, + "learning_rate": 2.125446432339393e-06, + "loss": 1.5306665897369385, + "step": 15172 + }, + { + "epoch": 2.7619914444343316, + "grad_norm": 14.75, + "learning_rate": 2.1250659795438453e-06, + "loss": 1.4640711545944214, + "step": 15174 + }, + { + "epoch": 2.762355511058524, + "grad_norm": 20.5, + "learning_rate": 2.124686095379161e-06, + "loss": 1.20827317237854, + "step": 15176 + }, + { + "epoch": 2.762719577682716, + "grad_norm": 7.40625, + "learning_rate": 2.1243067799010875e-06, + "loss": 1.2888175249099731, + "step": 15178 + }, + { + "epoch": 2.7630836443069082, + "grad_norm": 8.0, + "learning_rate": 2.1239280331652862e-06, + "loss": 1.035913348197937, + "step": 15180 + }, + { + "epoch": 2.7634477109311004, + "grad_norm": 12.8125, + "learning_rate": 2.123549855227339e-06, + "loss": 1.4175491333007812, + "step": 15182 + }, + { + "epoch": 2.7638117775552926, + "grad_norm": 12.25, + "learning_rate": 2.12317224614274e-06, + "loss": 1.025011658668518, + "step": 15184 + }, + { + "epoch": 2.764175844179485, + "grad_norm": 18.75, + "learning_rate": 2.1227952059669026e-06, + "loss": 1.5971038341522217, + "step": 15186 + }, + { + "epoch": 2.764539910803677, + "grad_norm": 13.0, + "learning_rate": 2.122418734755156e-06, + "loss": 1.4588137865066528, + "step": 15188 + }, + { + "epoch": 2.764903977427869, + "grad_norm": 20.125, + "learning_rate": 2.1220428325627447e-06, + "loss": 1.587701439857483, + "step": 15190 + }, + { + "epoch": 2.7652680440520614, + "grad_norm": 12.75, + "learning_rate": 2.121667499444833e-06, + "loss": 1.4294147491455078, + "step": 15192 + }, + { + "epoch": 2.765632110676254, + "grad_norm": 10.8125, + "learning_rate": 2.1212927354564973e-06, + "loss": 1.398268699645996, + "step": 15194 + }, + { + "epoch": 2.765996177300446, + "grad_norm": 11.0, + "learning_rate": 2.120918540652734e-06, + "loss": 1.4866583347320557, + "step": 15196 + }, + { + "epoch": 2.7663602439246384, + "grad_norm": 19.875, + "learning_rate": 2.1205449150884542e-06, + "loss": 1.1561260223388672, + "step": 15198 + }, + { + "epoch": 2.76672431054883, + "grad_norm": 12.125, + "learning_rate": 2.120171858818486e-06, + "loss": 1.3582969903945923, + "step": 15200 + }, + { + "epoch": 2.767088377173023, + "grad_norm": 11.875, + "learning_rate": 2.119799371897574e-06, + "loss": 1.1560012102127075, + "step": 15202 + }, + { + "epoch": 2.767452443797215, + "grad_norm": 8.9375, + "learning_rate": 2.11942745438038e-06, + "loss": 1.0556871891021729, + "step": 15204 + }, + { + "epoch": 2.767816510421407, + "grad_norm": 6.53125, + "learning_rate": 2.1190561063214795e-06, + "loss": 1.2058480978012085, + "step": 15206 + }, + { + "epoch": 2.7681805770455994, + "grad_norm": 12.4375, + "learning_rate": 2.118685327775367e-06, + "loss": 1.141218900680542, + "step": 15208 + }, + { + "epoch": 2.7685446436697916, + "grad_norm": 9.875, + "learning_rate": 2.1183151187964533e-06, + "loss": 1.6277103424072266, + "step": 15210 + }, + { + "epoch": 2.7689087102939838, + "grad_norm": 27.875, + "learning_rate": 2.117945479439066e-06, + "loss": 1.9199076890945435, + "step": 15212 + }, + { + "epoch": 2.769272776918176, + "grad_norm": 17.0, + "learning_rate": 2.117576409757446e-06, + "loss": 1.3194208145141602, + "step": 15214 + }, + { + "epoch": 2.769636843542368, + "grad_norm": 17.75, + "learning_rate": 2.1172079098057537e-06, + "loss": 1.5167080163955688, + "step": 15216 + }, + { + "epoch": 2.7700009101665604, + "grad_norm": 16.5, + "learning_rate": 2.116839979638065e-06, + "loss": 1.3913594484329224, + "step": 15218 + }, + { + "epoch": 2.7703649767907526, + "grad_norm": 11.0, + "learning_rate": 2.116472619308372e-06, + "loss": 1.0446672439575195, + "step": 15220 + }, + { + "epoch": 2.7707290434149447, + "grad_norm": 10.1875, + "learning_rate": 2.1161058288705846e-06, + "loss": 0.47014302015304565, + "step": 15222 + }, + { + "epoch": 2.7710931100391374, + "grad_norm": 27.125, + "learning_rate": 2.1157396083785263e-06, + "loss": 1.5051355361938477, + "step": 15224 + }, + { + "epoch": 2.771457176663329, + "grad_norm": 11.3125, + "learning_rate": 2.1153739578859384e-06, + "loss": 1.4453892707824707, + "step": 15226 + }, + { + "epoch": 2.7718212432875218, + "grad_norm": 5.90625, + "learning_rate": 2.1150088774464795e-06, + "loss": 1.5794672966003418, + "step": 15228 + }, + { + "epoch": 2.772185309911714, + "grad_norm": 7.9375, + "learning_rate": 2.114644367113723e-06, + "loss": 1.2086212635040283, + "step": 15230 + }, + { + "epoch": 2.772549376535906, + "grad_norm": 14.625, + "learning_rate": 2.114280426941161e-06, + "loss": 1.2649345397949219, + "step": 15232 + }, + { + "epoch": 2.7729134431600984, + "grad_norm": 7.96875, + "learning_rate": 2.1139170569821976e-06, + "loss": 1.2124547958374023, + "step": 15234 + }, + { + "epoch": 2.7732775097842906, + "grad_norm": 15.0625, + "learning_rate": 2.113554257290158e-06, + "loss": 1.4067473411560059, + "step": 15236 + }, + { + "epoch": 2.7736415764084827, + "grad_norm": 13.1875, + "learning_rate": 2.1131920279182798e-06, + "loss": 1.6940354108810425, + "step": 15238 + }, + { + "epoch": 2.774005643032675, + "grad_norm": 11.0, + "learning_rate": 2.1128303689197198e-06, + "loss": 1.4393317699432373, + "step": 15240 + }, + { + "epoch": 2.774369709656867, + "grad_norm": 23.0, + "learning_rate": 2.112469280347551e-06, + "loss": 1.5619231462478638, + "step": 15242 + }, + { + "epoch": 2.7747337762810593, + "grad_norm": 8.5, + "learning_rate": 2.1121087622547594e-06, + "loss": 1.5718462467193604, + "step": 15244 + }, + { + "epoch": 2.7750978429052515, + "grad_norm": 18.125, + "learning_rate": 2.111748814694251e-06, + "loss": 1.1862127780914307, + "step": 15246 + }, + { + "epoch": 2.7754619095294437, + "grad_norm": 12.875, + "learning_rate": 2.1113894377188463e-06, + "loss": 1.3076564073562622, + "step": 15248 + }, + { + "epoch": 2.7758259761536364, + "grad_norm": 15.125, + "learning_rate": 2.111030631381282e-06, + "loss": 1.5666275024414062, + "step": 15250 + }, + { + "epoch": 2.776190042777828, + "grad_norm": 39.75, + "learning_rate": 2.1106723957342127e-06, + "loss": 1.8964344263076782, + "step": 15252 + }, + { + "epoch": 2.7765541094020207, + "grad_norm": 10.0625, + "learning_rate": 2.1103147308302073e-06, + "loss": 1.3087196350097656, + "step": 15254 + }, + { + "epoch": 2.776918176026213, + "grad_norm": 9.5625, + "learning_rate": 2.109957636721751e-06, + "loss": 1.155623435974121, + "step": 15256 + }, + { + "epoch": 2.777282242650405, + "grad_norm": 7.4375, + "learning_rate": 2.109601113461247e-06, + "loss": 1.3293956518173218, + "step": 15258 + }, + { + "epoch": 2.7776463092745973, + "grad_norm": 14.75, + "learning_rate": 2.1092451611010124e-06, + "loss": 1.4465820789337158, + "step": 15260 + }, + { + "epoch": 2.7780103758987895, + "grad_norm": 18.75, + "learning_rate": 2.108889779693284e-06, + "loss": 1.1767781972885132, + "step": 15262 + }, + { + "epoch": 2.7783744425229817, + "grad_norm": 18.25, + "learning_rate": 2.1085349692902103e-06, + "loss": 0.9152565002441406, + "step": 15264 + }, + { + "epoch": 2.778738509147174, + "grad_norm": 9.625, + "learning_rate": 2.108180729943859e-06, + "loss": 1.3390154838562012, + "step": 15266 + }, + { + "epoch": 2.779102575771366, + "grad_norm": 5.90625, + "learning_rate": 2.1078270617062135e-06, + "loss": 1.2630285024642944, + "step": 15268 + }, + { + "epoch": 2.7794666423955583, + "grad_norm": 14.1875, + "learning_rate": 2.1074739646291733e-06, + "loss": 1.1905521154403687, + "step": 15270 + }, + { + "epoch": 2.7798307090197505, + "grad_norm": 18.5, + "learning_rate": 2.1071214387645537e-06, + "loss": 1.6768046617507935, + "step": 15272 + }, + { + "epoch": 2.7801947756439427, + "grad_norm": 7.5625, + "learning_rate": 2.106769484164086e-06, + "loss": 1.272176742553711, + "step": 15274 + }, + { + "epoch": 2.7805588422681353, + "grad_norm": 5.625, + "learning_rate": 2.1064181008794195e-06, + "loss": 1.0415253639221191, + "step": 15276 + }, + { + "epoch": 2.780922908892327, + "grad_norm": 5.6875, + "learning_rate": 2.106067288962117e-06, + "loss": 1.1963510513305664, + "step": 15278 + }, + { + "epoch": 2.7812869755165197, + "grad_norm": 9.5, + "learning_rate": 2.1057170484636587e-06, + "loss": 1.2466492652893066, + "step": 15280 + }, + { + "epoch": 2.781651042140712, + "grad_norm": 64.5, + "learning_rate": 2.1053673794354424e-06, + "loss": 1.7249022722244263, + "step": 15282 + }, + { + "epoch": 2.782015108764904, + "grad_norm": 6.21875, + "learning_rate": 2.1050182819287787e-06, + "loss": 1.2817299365997314, + "step": 15284 + }, + { + "epoch": 2.7823791753890963, + "grad_norm": 48.5, + "learning_rate": 2.1046697559948974e-06, + "loss": 1.3179740905761719, + "step": 15286 + }, + { + "epoch": 2.7827432420132885, + "grad_norm": 11.1875, + "learning_rate": 2.104321801684943e-06, + "loss": 1.5478861331939697, + "step": 15288 + }, + { + "epoch": 2.7831073086374807, + "grad_norm": 10.6875, + "learning_rate": 2.103974419049976e-06, + "loss": 1.339264988899231, + "step": 15290 + }, + { + "epoch": 2.783471375261673, + "grad_norm": 14.4375, + "learning_rate": 2.1036276081409745e-06, + "loss": 1.4556256532669067, + "step": 15292 + }, + { + "epoch": 2.783835441885865, + "grad_norm": 25.125, + "learning_rate": 2.1032813690088307e-06, + "loss": 1.416735053062439, + "step": 15294 + }, + { + "epoch": 2.7841995085100573, + "grad_norm": 110.0, + "learning_rate": 2.102935701704354e-06, + "loss": 1.2232414484024048, + "step": 15296 + }, + { + "epoch": 2.7845635751342495, + "grad_norm": 23.125, + "learning_rate": 2.1025906062782694e-06, + "loss": 1.3057293891906738, + "step": 15298 + }, + { + "epoch": 2.7849276417584417, + "grad_norm": 8.125, + "learning_rate": 2.1022460827812185e-06, + "loss": 1.1561237573623657, + "step": 15300 + }, + { + "epoch": 2.7852917083826343, + "grad_norm": 27.75, + "learning_rate": 2.1019021312637592e-06, + "loss": 1.3912616968154907, + "step": 15302 + }, + { + "epoch": 2.785655775006826, + "grad_norm": 3.59375, + "learning_rate": 2.1015587517763645e-06, + "loss": 1.1076183319091797, + "step": 15304 + }, + { + "epoch": 2.7860198416310187, + "grad_norm": 12.5625, + "learning_rate": 2.1012159443694234e-06, + "loss": 1.2979843616485596, + "step": 15306 + }, + { + "epoch": 2.7863839082552104, + "grad_norm": 12.5, + "learning_rate": 2.1008737090932426e-06, + "loss": 1.3886433839797974, + "step": 15308 + }, + { + "epoch": 2.786747974879403, + "grad_norm": 14.5625, + "learning_rate": 2.1005320459980425e-06, + "loss": 1.5989450216293335, + "step": 15310 + }, + { + "epoch": 2.7871120415035953, + "grad_norm": 13.5625, + "learning_rate": 2.1001909551339626e-06, + "loss": 1.7310841083526611, + "step": 15312 + }, + { + "epoch": 2.7874761081277875, + "grad_norm": 5.15625, + "learning_rate": 2.099850436551055e-06, + "loss": 0.9716083407402039, + "step": 15314 + }, + { + "epoch": 2.7878401747519796, + "grad_norm": 25.25, + "learning_rate": 2.0995104902992895e-06, + "loss": 1.184140920639038, + "step": 15316 + }, + { + "epoch": 2.788204241376172, + "grad_norm": 11.5625, + "learning_rate": 2.0991711164285525e-06, + "loss": 1.1790357828140259, + "step": 15318 + }, + { + "epoch": 2.788568308000364, + "grad_norm": 33.5, + "learning_rate": 2.098832314988645e-06, + "loss": 0.8578373193740845, + "step": 15320 + }, + { + "epoch": 2.7889323746245562, + "grad_norm": 5.90625, + "learning_rate": 2.0984940860292864e-06, + "loss": 0.9887957572937012, + "step": 15322 + }, + { + "epoch": 2.7892964412487484, + "grad_norm": 9.75, + "learning_rate": 2.098156429600108e-06, + "loss": 1.2182203531265259, + "step": 15324 + }, + { + "epoch": 2.7896605078729406, + "grad_norm": 6.8125, + "learning_rate": 2.0978193457506616e-06, + "loss": 1.5072941780090332, + "step": 15326 + }, + { + "epoch": 2.790024574497133, + "grad_norm": 7.375, + "learning_rate": 2.097482834530412e-06, + "loss": 0.9449323415756226, + "step": 15328 + }, + { + "epoch": 2.790388641121325, + "grad_norm": 26.0, + "learning_rate": 2.0971468959887405e-06, + "loss": 1.5134354829788208, + "step": 15330 + }, + { + "epoch": 2.7907527077455176, + "grad_norm": 110.5, + "learning_rate": 2.0968115301749454e-06, + "loss": 1.2343155145645142, + "step": 15332 + }, + { + "epoch": 2.7911167743697094, + "grad_norm": 20.0, + "learning_rate": 2.09647673713824e-06, + "loss": 0.9876255989074707, + "step": 15334 + }, + { + "epoch": 2.791480840993902, + "grad_norm": 24.25, + "learning_rate": 2.0961425169277537e-06, + "loss": 1.7146257162094116, + "step": 15336 + }, + { + "epoch": 2.7918449076180942, + "grad_norm": 5.09375, + "learning_rate": 2.0958088695925324e-06, + "loss": 0.9566456079483032, + "step": 15338 + }, + { + "epoch": 2.7922089742422864, + "grad_norm": 14.375, + "learning_rate": 2.095475795181536e-06, + "loss": 1.476578712463379, + "step": 15340 + }, + { + "epoch": 2.7925730408664786, + "grad_norm": 6.625, + "learning_rate": 2.095143293743645e-06, + "loss": 1.4261420965194702, + "step": 15342 + }, + { + "epoch": 2.792937107490671, + "grad_norm": 8.4375, + "learning_rate": 2.0948113653276496e-06, + "loss": 1.3135368824005127, + "step": 15344 + }, + { + "epoch": 2.793301174114863, + "grad_norm": 7.25, + "learning_rate": 2.0944800099822603e-06, + "loss": 1.449081540107727, + "step": 15346 + }, + { + "epoch": 2.793665240739055, + "grad_norm": 8.0625, + "learning_rate": 2.0941492277561014e-06, + "loss": 1.4387869834899902, + "step": 15348 + }, + { + "epoch": 2.7940293073632474, + "grad_norm": 7.34375, + "learning_rate": 2.0938190186977137e-06, + "loss": 1.2676608562469482, + "step": 15350 + }, + { + "epoch": 2.7943933739874396, + "grad_norm": 10.625, + "learning_rate": 2.093489382855556e-06, + "loss": 1.2819623947143555, + "step": 15352 + }, + { + "epoch": 2.794757440611632, + "grad_norm": 19.875, + "learning_rate": 2.0931603202779994e-06, + "loss": 1.3368175029754639, + "step": 15354 + }, + { + "epoch": 2.795121507235824, + "grad_norm": 5.71875, + "learning_rate": 2.0928318310133324e-06, + "loss": 1.376208782196045, + "step": 15356 + }, + { + "epoch": 2.7954855738600166, + "grad_norm": 15.125, + "learning_rate": 2.0925039151097596e-06, + "loss": 1.300868272781372, + "step": 15358 + }, + { + "epoch": 2.7958496404842084, + "grad_norm": 7.5, + "learning_rate": 2.0921765726154014e-06, + "loss": 1.3364508152008057, + "step": 15360 + }, + { + "epoch": 2.796213707108401, + "grad_norm": 4.65625, + "learning_rate": 2.0918498035782948e-06, + "loss": 1.2598626613616943, + "step": 15362 + }, + { + "epoch": 2.796577773732593, + "grad_norm": 9.1875, + "learning_rate": 2.0915236080463906e-06, + "loss": 1.4418838024139404, + "step": 15364 + }, + { + "epoch": 2.7969418403567854, + "grad_norm": 8.8125, + "learning_rate": 2.091197986067558e-06, + "loss": 1.2414124011993408, + "step": 15366 + }, + { + "epoch": 2.7973059069809776, + "grad_norm": 9.875, + "learning_rate": 2.09087293768958e-06, + "loss": 1.4556466341018677, + "step": 15368 + }, + { + "epoch": 2.79766997360517, + "grad_norm": 10.5, + "learning_rate": 2.090548462960155e-06, + "loss": 1.3624932765960693, + "step": 15370 + }, + { + "epoch": 2.798034040229362, + "grad_norm": 10.5, + "learning_rate": 2.0902245619269005e-06, + "loss": 1.255264163017273, + "step": 15372 + }, + { + "epoch": 2.798398106853554, + "grad_norm": 11.375, + "learning_rate": 2.089901234637346e-06, + "loss": 1.3311280012130737, + "step": 15374 + }, + { + "epoch": 2.7987621734777464, + "grad_norm": 96.5, + "learning_rate": 2.0895784811389393e-06, + "loss": 1.5165164470672607, + "step": 15376 + }, + { + "epoch": 2.7991262401019386, + "grad_norm": 9.5625, + "learning_rate": 2.0892563014790427e-06, + "loss": 1.1123021841049194, + "step": 15378 + }, + { + "epoch": 2.7994903067261308, + "grad_norm": 14.5, + "learning_rate": 2.088934695704935e-06, + "loss": 0.6490347385406494, + "step": 15380 + }, + { + "epoch": 2.799854373350323, + "grad_norm": 29.875, + "learning_rate": 2.088613663863811e-06, + "loss": 1.233338475227356, + "step": 15382 + }, + { + "epoch": 2.8002184399745156, + "grad_norm": 7.96875, + "learning_rate": 2.08829320600278e-06, + "loss": 1.4194599390029907, + "step": 15384 + }, + { + "epoch": 2.8005825065987073, + "grad_norm": 6.78125, + "learning_rate": 2.0879733221688685e-06, + "loss": 1.238454818725586, + "step": 15386 + }, + { + "epoch": 2.8009465732229, + "grad_norm": 18.5, + "learning_rate": 2.087654012409018e-06, + "loss": 1.2255334854125977, + "step": 15388 + }, + { + "epoch": 2.801310639847092, + "grad_norm": 17.0, + "learning_rate": 2.087335276770085e-06, + "loss": 1.4678912162780762, + "step": 15390 + }, + { + "epoch": 2.8016747064712844, + "grad_norm": 14.1875, + "learning_rate": 2.0870171152988443e-06, + "loss": 1.2598161697387695, + "step": 15392 + }, + { + "epoch": 2.8020387730954766, + "grad_norm": 112.0, + "learning_rate": 2.0866995280419843e-06, + "loss": 1.2336199283599854, + "step": 15394 + }, + { + "epoch": 2.8024028397196687, + "grad_norm": 14.1875, + "learning_rate": 2.086382515046108e-06, + "loss": 0.8416131734848022, + "step": 15396 + }, + { + "epoch": 2.802766906343861, + "grad_norm": 8.4375, + "learning_rate": 2.0860660763577384e-06, + "loss": 1.6554086208343506, + "step": 15398 + }, + { + "epoch": 2.803130972968053, + "grad_norm": 12.625, + "learning_rate": 2.0857502120233093e-06, + "loss": 1.0026733875274658, + "step": 15400 + }, + { + "epoch": 2.8034950395922453, + "grad_norm": 2.609375, + "learning_rate": 2.0854349220891746e-06, + "loss": 0.6647732257843018, + "step": 15402 + }, + { + "epoch": 2.8038591062164375, + "grad_norm": 6.5, + "learning_rate": 2.0851202066016e-06, + "loss": 1.1833223104476929, + "step": 15404 + }, + { + "epoch": 2.8042231728406297, + "grad_norm": 8.625, + "learning_rate": 2.08480606560677e-06, + "loss": 1.1770908832550049, + "step": 15406 + }, + { + "epoch": 2.804587239464822, + "grad_norm": 2.34375, + "learning_rate": 2.084492499150782e-06, + "loss": 1.1623116731643677, + "step": 15408 + }, + { + "epoch": 2.8049513060890146, + "grad_norm": 11.875, + "learning_rate": 2.0841795072796524e-06, + "loss": 1.1141105890274048, + "step": 15410 + }, + { + "epoch": 2.8053153727132063, + "grad_norm": 7.21875, + "learning_rate": 2.0838670900393107e-06, + "loss": 1.430346965789795, + "step": 15412 + }, + { + "epoch": 2.805679439337399, + "grad_norm": 14.0625, + "learning_rate": 2.083555247475603e-06, + "loss": 1.2860307693481445, + "step": 15414 + }, + { + "epoch": 2.806043505961591, + "grad_norm": 5.125, + "learning_rate": 2.0832439796342902e-06, + "loss": 1.220209002494812, + "step": 15416 + }, + { + "epoch": 2.8064075725857833, + "grad_norm": 26.75, + "learning_rate": 2.0829332865610503e-06, + "loss": 1.7848587036132812, + "step": 15418 + }, + { + "epoch": 2.8067716392099755, + "grad_norm": 16.375, + "learning_rate": 2.082623168301476e-06, + "loss": 1.2602730989456177, + "step": 15420 + }, + { + "epoch": 2.8071357058341677, + "grad_norm": 6.65625, + "learning_rate": 2.082313624901077e-06, + "loss": 1.1987271308898926, + "step": 15422 + }, + { + "epoch": 2.80749977245836, + "grad_norm": 9.625, + "learning_rate": 2.0820046564052753e-06, + "loss": 1.3231608867645264, + "step": 15424 + }, + { + "epoch": 2.807863839082552, + "grad_norm": 7.28125, + "learning_rate": 2.0816962628594124e-06, + "loss": 1.2748762369155884, + "step": 15426 + }, + { + "epoch": 2.8082279057067443, + "grad_norm": 3.828125, + "learning_rate": 2.0813884443087436e-06, + "loss": 1.0443068742752075, + "step": 15428 + }, + { + "epoch": 2.8085919723309365, + "grad_norm": 16.375, + "learning_rate": 2.0810812007984394e-06, + "loss": 1.5760655403137207, + "step": 15430 + }, + { + "epoch": 2.8089560389551287, + "grad_norm": 27.25, + "learning_rate": 2.0807745323735877e-06, + "loss": 1.6567323207855225, + "step": 15432 + }, + { + "epoch": 2.809320105579321, + "grad_norm": 27.625, + "learning_rate": 2.0804684390791897e-06, + "loss": 1.2157567739486694, + "step": 15434 + }, + { + "epoch": 2.8096841722035135, + "grad_norm": 6.25, + "learning_rate": 2.080162920960164e-06, + "loss": 1.5075218677520752, + "step": 15436 + }, + { + "epoch": 2.8100482388277053, + "grad_norm": 4.0625, + "learning_rate": 2.079857978061344e-06, + "loss": 0.9266378879547119, + "step": 15438 + }, + { + "epoch": 2.810412305451898, + "grad_norm": 40.0, + "learning_rate": 2.079553610427478e-06, + "loss": 1.0016363859176636, + "step": 15440 + }, + { + "epoch": 2.8107763720760897, + "grad_norm": 13.9375, + "learning_rate": 2.0792498181032326e-06, + "loss": 0.7211898565292358, + "step": 15442 + }, + { + "epoch": 2.8111404387002823, + "grad_norm": 12.5625, + "learning_rate": 2.0789466011331863e-06, + "loss": 0.9842705726623535, + "step": 15444 + }, + { + "epoch": 2.8115045053244745, + "grad_norm": 21.25, + "learning_rate": 2.078643959561836e-06, + "loss": 1.2994048595428467, + "step": 15446 + }, + { + "epoch": 2.8118685719486667, + "grad_norm": 19.25, + "learning_rate": 2.0783418934335922e-06, + "loss": 1.939651608467102, + "step": 15448 + }, + { + "epoch": 2.812232638572859, + "grad_norm": 19.25, + "learning_rate": 2.0780404027927827e-06, + "loss": 1.9610989093780518, + "step": 15450 + }, + { + "epoch": 2.812596705197051, + "grad_norm": 16.75, + "learning_rate": 2.0777394876836503e-06, + "loss": 1.9850043058395386, + "step": 15452 + }, + { + "epoch": 2.8129607718212433, + "grad_norm": 7.46875, + "learning_rate": 2.077439148150352e-06, + "loss": 1.4576671123504639, + "step": 15454 + }, + { + "epoch": 2.8133248384454355, + "grad_norm": 6.71875, + "learning_rate": 2.0771393842369627e-06, + "loss": 1.4522960186004639, + "step": 15456 + }, + { + "epoch": 2.8136889050696277, + "grad_norm": 9.1875, + "learning_rate": 2.0768401959874697e-06, + "loss": 1.3378958702087402, + "step": 15458 + }, + { + "epoch": 2.81405297169382, + "grad_norm": 11.5, + "learning_rate": 2.0765415834457787e-06, + "loss": 1.4418935775756836, + "step": 15460 + }, + { + "epoch": 2.814417038318012, + "grad_norm": 6.875, + "learning_rate": 2.076243546655711e-06, + "loss": 1.4748448133468628, + "step": 15462 + }, + { + "epoch": 2.8147811049422042, + "grad_norm": 14.3125, + "learning_rate": 2.075946085661001e-06, + "loss": 1.0807702541351318, + "step": 15464 + }, + { + "epoch": 2.815145171566397, + "grad_norm": 11.4375, + "learning_rate": 2.0756492005053e-06, + "loss": 1.3025349378585815, + "step": 15466 + }, + { + "epoch": 2.8155092381905886, + "grad_norm": 7.46875, + "learning_rate": 2.0753528912321747e-06, + "loss": 1.8363618850708008, + "step": 15468 + }, + { + "epoch": 2.8158733048147813, + "grad_norm": 10.5625, + "learning_rate": 2.075057157885107e-06, + "loss": 1.331172227859497, + "step": 15470 + }, + { + "epoch": 2.8162373714389735, + "grad_norm": 11.25, + "learning_rate": 2.074762000507496e-06, + "loss": 1.6024922132492065, + "step": 15472 + }, + { + "epoch": 2.8166014380631657, + "grad_norm": 10.25, + "learning_rate": 2.074467419142653e-06, + "loss": 1.291154384613037, + "step": 15474 + }, + { + "epoch": 2.816965504687358, + "grad_norm": 20.25, + "learning_rate": 2.074173413833808e-06, + "loss": 1.0730677843093872, + "step": 15476 + }, + { + "epoch": 2.81732957131155, + "grad_norm": 17.75, + "learning_rate": 2.0738799846241036e-06, + "loss": 0.6087172031402588, + "step": 15478 + }, + { + "epoch": 2.8176936379357422, + "grad_norm": 17.75, + "learning_rate": 2.073587131556601e-06, + "loss": 1.4700336456298828, + "step": 15480 + }, + { + "epoch": 2.8180577045599344, + "grad_norm": 165.0, + "learning_rate": 2.0732948546742745e-06, + "loss": 1.5363982915878296, + "step": 15482 + }, + { + "epoch": 2.8184217711841266, + "grad_norm": 12.8125, + "learning_rate": 2.0730031540200142e-06, + "loss": 1.5605051517486572, + "step": 15484 + }, + { + "epoch": 2.818785837808319, + "grad_norm": 10.8125, + "learning_rate": 2.072712029636627e-06, + "loss": 1.6522586345672607, + "step": 15486 + }, + { + "epoch": 2.819149904432511, + "grad_norm": 15.6875, + "learning_rate": 2.072421481566833e-06, + "loss": 1.505131721496582, + "step": 15488 + }, + { + "epoch": 2.819513971056703, + "grad_norm": 8.375, + "learning_rate": 2.072131509853269e-06, + "loss": 1.3004802465438843, + "step": 15490 + }, + { + "epoch": 2.819878037680896, + "grad_norm": 23.75, + "learning_rate": 2.0718421145384884e-06, + "loss": 1.040716290473938, + "step": 15492 + }, + { + "epoch": 2.8202421043050876, + "grad_norm": 11.0625, + "learning_rate": 2.0715532956649584e-06, + "loss": 1.477367877960205, + "step": 15494 + }, + { + "epoch": 2.8206061709292802, + "grad_norm": 19.125, + "learning_rate": 2.071265053275061e-06, + "loss": 1.567819595336914, + "step": 15496 + }, + { + "epoch": 2.8209702375534724, + "grad_norm": 14.1875, + "learning_rate": 2.0709773874110956e-06, + "loss": 0.4847180247306824, + "step": 15498 + }, + { + "epoch": 2.8213343041776646, + "grad_norm": 11.6875, + "learning_rate": 2.070690298115275e-06, + "loss": 1.4409312009811401, + "step": 15500 + }, + { + "epoch": 2.821698370801857, + "grad_norm": 81.5, + "learning_rate": 2.07040378542973e-06, + "loss": 1.2525432109832764, + "step": 15502 + }, + { + "epoch": 2.822062437426049, + "grad_norm": 14.6875, + "learning_rate": 2.070117849396504e-06, + "loss": 1.3137834072113037, + "step": 15504 + }, + { + "epoch": 2.822426504050241, + "grad_norm": 15.0625, + "learning_rate": 2.0698324900575563e-06, + "loss": 1.3841185569763184, + "step": 15506 + }, + { + "epoch": 2.8227905706744334, + "grad_norm": 19.875, + "learning_rate": 2.069547707454764e-06, + "loss": 1.2313940525054932, + "step": 15508 + }, + { + "epoch": 2.8231546372986256, + "grad_norm": 8.6875, + "learning_rate": 2.0692635016299163e-06, + "loss": 0.4161737263202667, + "step": 15510 + }, + { + "epoch": 2.823518703922818, + "grad_norm": 6.375, + "learning_rate": 2.0689798726247205e-06, + "loss": 1.2001042366027832, + "step": 15512 + }, + { + "epoch": 2.82388277054701, + "grad_norm": 41.5, + "learning_rate": 2.0686968204807968e-06, + "loss": 1.2105071544647217, + "step": 15514 + }, + { + "epoch": 2.824246837171202, + "grad_norm": 13.75, + "learning_rate": 2.068414345239683e-06, + "loss": 1.5903499126434326, + "step": 15516 + }, + { + "epoch": 2.824610903795395, + "grad_norm": 16.125, + "learning_rate": 2.068132446942831e-06, + "loss": 1.4579613208770752, + "step": 15518 + }, + { + "epoch": 2.8249749704195866, + "grad_norm": 8.125, + "learning_rate": 2.067851125631607e-06, + "loss": 1.1417038440704346, + "step": 15520 + }, + { + "epoch": 2.825339037043779, + "grad_norm": 8.8125, + "learning_rate": 2.0675703813472953e-06, + "loss": 1.263839602470398, + "step": 15522 + }, + { + "epoch": 2.8257031036679714, + "grad_norm": 8.6875, + "learning_rate": 2.067290214131093e-06, + "loss": 1.1335194110870361, + "step": 15524 + }, + { + "epoch": 2.8260671702921636, + "grad_norm": 12.4375, + "learning_rate": 2.067010624024114e-06, + "loss": 1.3315210342407227, + "step": 15526 + }, + { + "epoch": 2.826431236916356, + "grad_norm": 14.75, + "learning_rate": 2.0667316110673875e-06, + "loss": 1.434949278831482, + "step": 15528 + }, + { + "epoch": 2.826795303540548, + "grad_norm": 30.625, + "learning_rate": 2.066453175301856e-06, + "loss": 1.5310791730880737, + "step": 15530 + }, + { + "epoch": 2.82715937016474, + "grad_norm": 15.6875, + "learning_rate": 2.0661753167683805e-06, + "loss": 1.2255624532699585, + "step": 15532 + }, + { + "epoch": 2.8275234367889324, + "grad_norm": 16.875, + "learning_rate": 2.0658980355077346e-06, + "loss": 1.3062666654586792, + "step": 15534 + }, + { + "epoch": 2.8278875034131246, + "grad_norm": 11.1875, + "learning_rate": 2.065621331560609e-06, + "loss": 1.1019104719161987, + "step": 15536 + }, + { + "epoch": 2.8282515700373168, + "grad_norm": 4.40625, + "learning_rate": 2.0653452049676073e-06, + "loss": 0.9320621490478516, + "step": 15538 + }, + { + "epoch": 2.828615636661509, + "grad_norm": 26.375, + "learning_rate": 2.0650696557692517e-06, + "loss": 1.1965681314468384, + "step": 15540 + }, + { + "epoch": 2.828979703285701, + "grad_norm": 19.75, + "learning_rate": 2.064794684005977e-06, + "loss": 1.7365366220474243, + "step": 15542 + }, + { + "epoch": 2.829343769909894, + "grad_norm": 13.8125, + "learning_rate": 2.0645202897181345e-06, + "loss": 1.3912160396575928, + "step": 15544 + }, + { + "epoch": 2.8297078365340855, + "grad_norm": 10.6875, + "learning_rate": 2.0642464729459906e-06, + "loss": 1.3329311609268188, + "step": 15546 + }, + { + "epoch": 2.830071903158278, + "grad_norm": 11.75, + "learning_rate": 2.0639732337297263e-06, + "loss": 1.3877556324005127, + "step": 15548 + }, + { + "epoch": 2.83043596978247, + "grad_norm": 11.5, + "learning_rate": 2.0637005721094386e-06, + "loss": 1.3779581785202026, + "step": 15550 + }, + { + "epoch": 2.8308000364066626, + "grad_norm": 13.1875, + "learning_rate": 2.06342848812514e-06, + "loss": 0.9625064134597778, + "step": 15552 + }, + { + "epoch": 2.8311641030308548, + "grad_norm": 14.0, + "learning_rate": 2.0631569818167563e-06, + "loss": 1.2871441841125488, + "step": 15554 + }, + { + "epoch": 2.831528169655047, + "grad_norm": 19.625, + "learning_rate": 2.062886053224132e-06, + "loss": 1.640420913696289, + "step": 15556 + }, + { + "epoch": 2.831892236279239, + "grad_norm": 14.6875, + "learning_rate": 2.062615702387023e-06, + "loss": 1.3000365495681763, + "step": 15558 + }, + { + "epoch": 2.8322563029034313, + "grad_norm": 9.0, + "learning_rate": 2.0623459293451026e-06, + "loss": 1.299499273300171, + "step": 15560 + }, + { + "epoch": 2.8326203695276235, + "grad_norm": 6.625, + "learning_rate": 2.06207673413796e-06, + "loss": 1.4046026468276978, + "step": 15562 + }, + { + "epoch": 2.8329844361518157, + "grad_norm": 3.609375, + "learning_rate": 2.0618081168050965e-06, + "loss": 1.197812557220459, + "step": 15564 + }, + { + "epoch": 2.833348502776008, + "grad_norm": 11.8125, + "learning_rate": 2.061540077385933e-06, + "loss": 1.3336005210876465, + "step": 15566 + }, + { + "epoch": 2.8337125694002, + "grad_norm": 14.6875, + "learning_rate": 2.0612726159198015e-06, + "loss": 1.8679522275924683, + "step": 15568 + }, + { + "epoch": 2.8340766360243923, + "grad_norm": 20.375, + "learning_rate": 2.0610057324459504e-06, + "loss": 1.4219310283660889, + "step": 15570 + }, + { + "epoch": 2.8344407026485845, + "grad_norm": 24.75, + "learning_rate": 2.0607394270035465e-06, + "loss": 1.5261874198913574, + "step": 15572 + }, + { + "epoch": 2.834804769272777, + "grad_norm": 10.4375, + "learning_rate": 2.060473699631666e-06, + "loss": 1.2180685997009277, + "step": 15574 + }, + { + "epoch": 2.835168835896969, + "grad_norm": 19.0, + "learning_rate": 2.0602085503693048e-06, + "loss": 1.8519091606140137, + "step": 15576 + }, + { + "epoch": 2.8355329025211615, + "grad_norm": 14.75, + "learning_rate": 2.0599439792553727e-06, + "loss": 1.835161805152893, + "step": 15578 + }, + { + "epoch": 2.8358969691453537, + "grad_norm": 2.578125, + "learning_rate": 2.0596799863286932e-06, + "loss": 1.0542484521865845, + "step": 15580 + }, + { + "epoch": 2.836261035769546, + "grad_norm": 3.53125, + "learning_rate": 2.059416571628008e-06, + "loss": 1.0575069189071655, + "step": 15582 + }, + { + "epoch": 2.836625102393738, + "grad_norm": 9.1875, + "learning_rate": 2.05915373519197e-06, + "loss": 0.9817519187927246, + "step": 15584 + }, + { + "epoch": 2.8369891690179303, + "grad_norm": 13.6875, + "learning_rate": 2.058891477059151e-06, + "loss": 1.3612810373306274, + "step": 15586 + }, + { + "epoch": 2.8373532356421225, + "grad_norm": 17.375, + "learning_rate": 2.058629797268036e-06, + "loss": 1.9211276769638062, + "step": 15588 + }, + { + "epoch": 2.8377173022663147, + "grad_norm": 19.0, + "learning_rate": 2.0583686958570247e-06, + "loss": 1.8029141426086426, + "step": 15590 + }, + { + "epoch": 2.838081368890507, + "grad_norm": 7.46875, + "learning_rate": 2.0581081728644346e-06, + "loss": 1.4475373029708862, + "step": 15592 + }, + { + "epoch": 2.838445435514699, + "grad_norm": 12.875, + "learning_rate": 2.057848228328494e-06, + "loss": 1.2792654037475586, + "step": 15594 + }, + { + "epoch": 2.8388095021388913, + "grad_norm": 10.75, + "learning_rate": 2.0575888622873496e-06, + "loss": 1.319126844406128, + "step": 15596 + }, + { + "epoch": 2.8391735687630835, + "grad_norm": 30.5, + "learning_rate": 2.057330074779063e-06, + "loss": 1.4657354354858398, + "step": 15598 + }, + { + "epoch": 2.839537635387276, + "grad_norm": 9.4375, + "learning_rate": 2.057071865841609e-06, + "loss": 1.6511945724487305, + "step": 15600 + }, + { + "epoch": 2.839901702011468, + "grad_norm": 2.796875, + "learning_rate": 2.05681423551288e-06, + "loss": 1.0609612464904785, + "step": 15602 + }, + { + "epoch": 2.8402657686356605, + "grad_norm": 2.34375, + "learning_rate": 2.0565571838306815e-06, + "loss": 1.0955684185028076, + "step": 15604 + }, + { + "epoch": 2.8406298352598527, + "grad_norm": 71.0, + "learning_rate": 2.056300710832735e-06, + "loss": 1.1401350498199463, + "step": 15606 + }, + { + "epoch": 2.840993901884045, + "grad_norm": 28.625, + "learning_rate": 2.0560448165566767e-06, + "loss": 1.673246145248413, + "step": 15608 + }, + { + "epoch": 2.841357968508237, + "grad_norm": 12.4375, + "learning_rate": 2.0557895010400577e-06, + "loss": 1.4311206340789795, + "step": 15610 + }, + { + "epoch": 2.8417220351324293, + "grad_norm": 7.84375, + "learning_rate": 2.0555347643203457e-06, + "loss": 1.3651249408721924, + "step": 15612 + }, + { + "epoch": 2.8420861017566215, + "grad_norm": 8.5625, + "learning_rate": 2.0552806064349207e-06, + "loss": 1.7251278162002563, + "step": 15614 + }, + { + "epoch": 2.8424501683808137, + "grad_norm": 17.5, + "learning_rate": 2.0550270274210805e-06, + "loss": 1.5464675426483154, + "step": 15616 + }, + { + "epoch": 2.842814235005006, + "grad_norm": 22.0, + "learning_rate": 2.054774027316037e-06, + "loss": 1.6187057495117188, + "step": 15618 + }, + { + "epoch": 2.843178301629198, + "grad_norm": 14.5625, + "learning_rate": 2.054521606156915e-06, + "loss": 1.9455593824386597, + "step": 15620 + }, + { + "epoch": 2.8435423682533902, + "grad_norm": 21.75, + "learning_rate": 2.0542697639807596e-06, + "loss": 1.276139497756958, + "step": 15622 + }, + { + "epoch": 2.8439064348775824, + "grad_norm": 21.0, + "learning_rate": 2.054018500824524e-06, + "loss": 0.8827497959136963, + "step": 15624 + }, + { + "epoch": 2.844270501501775, + "grad_norm": 10.9375, + "learning_rate": 2.0537678167250825e-06, + "loss": 1.4192324876785278, + "step": 15626 + }, + { + "epoch": 2.844634568125967, + "grad_norm": 8.0, + "learning_rate": 2.0535177117192215e-06, + "loss": 1.2304869890213013, + "step": 15628 + }, + { + "epoch": 2.8449986347501595, + "grad_norm": 10.9375, + "learning_rate": 2.053268185843642e-06, + "loss": 1.2841804027557373, + "step": 15630 + }, + { + "epoch": 2.8453627013743517, + "grad_norm": 6.875, + "learning_rate": 2.0530192391349617e-06, + "loss": 1.3602601289749146, + "step": 15632 + }, + { + "epoch": 2.845726767998544, + "grad_norm": 8.5, + "learning_rate": 2.052770871629712e-06, + "loss": 1.4628217220306396, + "step": 15634 + }, + { + "epoch": 2.846090834622736, + "grad_norm": 12.8125, + "learning_rate": 2.052523083364341e-06, + "loss": 1.5020369291305542, + "step": 15636 + }, + { + "epoch": 2.8464549012469282, + "grad_norm": 37.0, + "learning_rate": 2.052275874375209e-06, + "loss": 0.5445114374160767, + "step": 15638 + }, + { + "epoch": 2.8468189678711204, + "grad_norm": 15.1875, + "learning_rate": 2.0520292446985944e-06, + "loss": 1.3979467153549194, + "step": 15640 + }, + { + "epoch": 2.8471830344953126, + "grad_norm": 10.25, + "learning_rate": 2.051783194370688e-06, + "loss": 1.17988920211792, + "step": 15642 + }, + { + "epoch": 2.847547101119505, + "grad_norm": 9.1875, + "learning_rate": 2.051537723427597e-06, + "loss": 1.3148683309555054, + "step": 15644 + }, + { + "epoch": 2.847911167743697, + "grad_norm": 6.71875, + "learning_rate": 2.0512928319053436e-06, + "loss": 1.2289530038833618, + "step": 15646 + }, + { + "epoch": 2.848275234367889, + "grad_norm": 19.0, + "learning_rate": 2.0510485198398644e-06, + "loss": 1.502204179763794, + "step": 15648 + }, + { + "epoch": 2.8486393009920814, + "grad_norm": 12.25, + "learning_rate": 2.050804787267011e-06, + "loss": 1.7907421588897705, + "step": 15650 + }, + { + "epoch": 2.849003367616274, + "grad_norm": 3.390625, + "learning_rate": 2.050561634222551e-06, + "loss": 0.8259409666061401, + "step": 15652 + }, + { + "epoch": 2.849367434240466, + "grad_norm": 66.5, + "learning_rate": 2.0503190607421645e-06, + "loss": 0.9685641527175903, + "step": 15654 + }, + { + "epoch": 2.8497315008646584, + "grad_norm": 25.0, + "learning_rate": 2.05007706686145e-06, + "loss": 1.569624662399292, + "step": 15656 + }, + { + "epoch": 2.8500955674888506, + "grad_norm": 25.875, + "learning_rate": 2.049835652615918e-06, + "loss": 1.6426762342453003, + "step": 15658 + }, + { + "epoch": 2.850459634113043, + "grad_norm": 11.75, + "learning_rate": 2.0495948180409954e-06, + "loss": 1.4154115915298462, + "step": 15660 + }, + { + "epoch": 2.850823700737235, + "grad_norm": 19.75, + "learning_rate": 2.0493545631720233e-06, + "loss": 1.4515305757522583, + "step": 15662 + }, + { + "epoch": 2.851187767361427, + "grad_norm": 18.5, + "learning_rate": 2.049114888044259e-06, + "loss": 1.7178009748458862, + "step": 15664 + }, + { + "epoch": 2.8515518339856194, + "grad_norm": 38.75, + "learning_rate": 2.048875792692873e-06, + "loss": 1.2690167427062988, + "step": 15666 + }, + { + "epoch": 2.8519159006098116, + "grad_norm": 21.75, + "learning_rate": 2.0486372771529523e-06, + "loss": 1.2907917499542236, + "step": 15668 + }, + { + "epoch": 2.852279967234004, + "grad_norm": 8.25, + "learning_rate": 2.048399341459497e-06, + "loss": 1.3868441581726074, + "step": 15670 + }, + { + "epoch": 2.852644033858196, + "grad_norm": 9.0625, + "learning_rate": 2.048161985647425e-06, + "loss": 1.3096508979797363, + "step": 15672 + }, + { + "epoch": 2.853008100482388, + "grad_norm": 3.828125, + "learning_rate": 2.0479252097515657e-06, + "loss": 1.108668327331543, + "step": 15674 + }, + { + "epoch": 2.8533721671065804, + "grad_norm": 34.5, + "learning_rate": 2.0476890138066656e-06, + "loss": 1.0276817083358765, + "step": 15676 + }, + { + "epoch": 2.853736233730773, + "grad_norm": 12.6875, + "learning_rate": 2.047453397847385e-06, + "loss": 1.3779515027999878, + "step": 15678 + }, + { + "epoch": 2.8541003003549648, + "grad_norm": 60.25, + "learning_rate": 2.0472183619083e-06, + "loss": 1.6853160858154297, + "step": 15680 + }, + { + "epoch": 2.8544643669791574, + "grad_norm": 30.0, + "learning_rate": 2.0469839060239015e-06, + "loss": 1.9663193225860596, + "step": 15682 + }, + { + "epoch": 2.854828433603349, + "grad_norm": 88.5, + "learning_rate": 2.0467500302285945e-06, + "loss": 1.6754333972930908, + "step": 15684 + }, + { + "epoch": 2.855192500227542, + "grad_norm": 14.3125, + "learning_rate": 2.0465167345566994e-06, + "loss": 1.333894968032837, + "step": 15686 + }, + { + "epoch": 2.855556566851734, + "grad_norm": 13.0, + "learning_rate": 2.0462840190424515e-06, + "loss": 1.3003911972045898, + "step": 15688 + }, + { + "epoch": 2.855920633475926, + "grad_norm": 7.03125, + "learning_rate": 2.0460518837200007e-06, + "loss": 1.461794376373291, + "step": 15690 + }, + { + "epoch": 2.8562847001001184, + "grad_norm": 19.0, + "learning_rate": 2.0458203286234124e-06, + "loss": 1.3130040168762207, + "step": 15692 + }, + { + "epoch": 2.8566487667243106, + "grad_norm": 16.125, + "learning_rate": 2.045589353786665e-06, + "loss": 1.0638649463653564, + "step": 15694 + }, + { + "epoch": 2.8570128333485028, + "grad_norm": 7.5625, + "learning_rate": 2.045358959243655e-06, + "loss": 1.4187135696411133, + "step": 15696 + }, + { + "epoch": 2.857376899972695, + "grad_norm": 18.75, + "learning_rate": 2.045129145028191e-06, + "loss": 1.2999001741409302, + "step": 15698 + }, + { + "epoch": 2.857740966596887, + "grad_norm": 42.0, + "learning_rate": 2.044899911173997e-06, + "loss": 0.8540710210800171, + "step": 15700 + }, + { + "epoch": 2.8581050332210793, + "grad_norm": 22.625, + "learning_rate": 2.0446712577147128e-06, + "loss": 0.3583671748638153, + "step": 15702 + }, + { + "epoch": 2.8584690998452715, + "grad_norm": 16.625, + "learning_rate": 2.044443184683891e-06, + "loss": 1.6492469310760498, + "step": 15704 + }, + { + "epoch": 2.8588331664694637, + "grad_norm": 7.25, + "learning_rate": 2.0442156921150025e-06, + "loss": 1.2886207103729248, + "step": 15706 + }, + { + "epoch": 2.8591972330936564, + "grad_norm": 15.6875, + "learning_rate": 2.0439887800414294e-06, + "loss": 1.9317054748535156, + "step": 15708 + }, + { + "epoch": 2.859561299717848, + "grad_norm": 18.125, + "learning_rate": 2.04376244849647e-06, + "loss": 1.86313796043396, + "step": 15710 + }, + { + "epoch": 2.8599253663420408, + "grad_norm": 13.6875, + "learning_rate": 2.0435366975133384e-06, + "loss": 1.291890263557434, + "step": 15712 + }, + { + "epoch": 2.860289432966233, + "grad_norm": 22.75, + "learning_rate": 2.0433115271251626e-06, + "loss": 1.4764728546142578, + "step": 15714 + }, + { + "epoch": 2.860653499590425, + "grad_norm": 16.25, + "learning_rate": 2.0430869373649847e-06, + "loss": 1.3750933408737183, + "step": 15716 + }, + { + "epoch": 2.8610175662146173, + "grad_norm": 13.875, + "learning_rate": 2.042862928265763e-06, + "loss": 1.2089996337890625, + "step": 15718 + }, + { + "epoch": 2.8613816328388095, + "grad_norm": 8.375, + "learning_rate": 2.0426394998603694e-06, + "loss": 1.3231537342071533, + "step": 15720 + }, + { + "epoch": 2.8617456994630017, + "grad_norm": 9.125, + "learning_rate": 2.0424166521815924e-06, + "loss": 1.4454478025436401, + "step": 15722 + }, + { + "epoch": 2.862109766087194, + "grad_norm": 8.8125, + "learning_rate": 2.042194385262132e-06, + "loss": 1.411362648010254, + "step": 15724 + }, + { + "epoch": 2.862473832711386, + "grad_norm": 6.6875, + "learning_rate": 2.0419726991346065e-06, + "loss": 1.2141002416610718, + "step": 15726 + }, + { + "epoch": 2.8628378993355783, + "grad_norm": 11.8125, + "learning_rate": 2.0417515938315468e-06, + "loss": 1.022182822227478, + "step": 15728 + }, + { + "epoch": 2.8632019659597705, + "grad_norm": 15.4375, + "learning_rate": 2.041531069385399e-06, + "loss": 0.5809782147407532, + "step": 15730 + }, + { + "epoch": 2.8635660325839627, + "grad_norm": 8.5625, + "learning_rate": 2.0413111258285247e-06, + "loss": 1.5237994194030762, + "step": 15732 + }, + { + "epoch": 2.8639300992081553, + "grad_norm": 12.8125, + "learning_rate": 2.0410917631931994e-06, + "loss": 1.4772496223449707, + "step": 15734 + }, + { + "epoch": 2.864294165832347, + "grad_norm": 11.0, + "learning_rate": 2.040872981511614e-06, + "loss": 1.3659594058990479, + "step": 15736 + }, + { + "epoch": 2.8646582324565397, + "grad_norm": 12.125, + "learning_rate": 2.040654780815874e-06, + "loss": 1.2304561138153076, + "step": 15738 + }, + { + "epoch": 2.865022299080732, + "grad_norm": 24.0, + "learning_rate": 2.040437161137998e-06, + "loss": 1.602603554725647, + "step": 15740 + }, + { + "epoch": 2.865386365704924, + "grad_norm": 8.5, + "learning_rate": 2.040220122509923e-06, + "loss": 1.4287970066070557, + "step": 15742 + }, + { + "epoch": 2.8657504323291163, + "grad_norm": 12.625, + "learning_rate": 2.0400036649634967e-06, + "loss": 1.388832449913025, + "step": 15744 + }, + { + "epoch": 2.8661144989533085, + "grad_norm": 8.375, + "learning_rate": 2.039787788530485e-06, + "loss": 1.1057956218719482, + "step": 15746 + }, + { + "epoch": 2.8664785655775007, + "grad_norm": 6.59375, + "learning_rate": 2.0395724932425652e-06, + "loss": 1.2073942422866821, + "step": 15748 + }, + { + "epoch": 2.866842632201693, + "grad_norm": 9.0625, + "learning_rate": 2.0393577791313314e-06, + "loss": 1.4321943521499634, + "step": 15750 + }, + { + "epoch": 2.867206698825885, + "grad_norm": 16.125, + "learning_rate": 2.0391436462282934e-06, + "loss": 1.4703847169876099, + "step": 15752 + }, + { + "epoch": 2.8675707654500773, + "grad_norm": 36.75, + "learning_rate": 2.0389300945648733e-06, + "loss": 1.25998854637146, + "step": 15754 + }, + { + "epoch": 2.8679348320742695, + "grad_norm": 11.8125, + "learning_rate": 2.038717124172409e-06, + "loss": 1.2880170345306396, + "step": 15756 + }, + { + "epoch": 2.8682988986984617, + "grad_norm": 10.25, + "learning_rate": 2.0385047350821524e-06, + "loss": 1.6903637647628784, + "step": 15758 + }, + { + "epoch": 2.8686629653226543, + "grad_norm": 8.75, + "learning_rate": 2.0382929273252716e-06, + "loss": 1.130076289176941, + "step": 15760 + }, + { + "epoch": 2.869027031946846, + "grad_norm": 8.5625, + "learning_rate": 2.038081700932849e-06, + "loss": 1.4082366228103638, + "step": 15762 + }, + { + "epoch": 2.8693910985710387, + "grad_norm": 7.625, + "learning_rate": 2.0378710559358796e-06, + "loss": 1.1690301895141602, + "step": 15764 + }, + { + "epoch": 2.869755165195231, + "grad_norm": 26.875, + "learning_rate": 2.037660992365276e-06, + "loss": 1.290572166442871, + "step": 15766 + }, + { + "epoch": 2.870119231819423, + "grad_norm": 10.75, + "learning_rate": 2.037451510251864e-06, + "loss": 1.1646549701690674, + "step": 15768 + }, + { + "epoch": 2.8704832984436153, + "grad_norm": 103.5, + "learning_rate": 2.037242609626384e-06, + "loss": 1.3685389757156372, + "step": 15770 + }, + { + "epoch": 2.8708473650678075, + "grad_norm": 11.5, + "learning_rate": 2.037034290519492e-06, + "loss": 1.4348698854446411, + "step": 15772 + }, + { + "epoch": 2.8712114316919997, + "grad_norm": 56.0, + "learning_rate": 2.036826552961756e-06, + "loss": 1.3860540390014648, + "step": 15774 + }, + { + "epoch": 2.871575498316192, + "grad_norm": 24.5, + "learning_rate": 2.036619396983663e-06, + "loss": 1.311455488204956, + "step": 15776 + }, + { + "epoch": 2.871939564940384, + "grad_norm": 7.25, + "learning_rate": 2.036412822615611e-06, + "loss": 1.3445053100585938, + "step": 15778 + }, + { + "epoch": 2.8723036315645762, + "grad_norm": 11.1875, + "learning_rate": 2.0362068298879143e-06, + "loss": 1.4567331075668335, + "step": 15780 + }, + { + "epoch": 2.8726676981887684, + "grad_norm": 6.5625, + "learning_rate": 2.0360014188308016e-06, + "loss": 1.3662030696868896, + "step": 15782 + }, + { + "epoch": 2.8730317648129606, + "grad_norm": 9.0, + "learning_rate": 2.035796589474416e-06, + "loss": 2.10536527633667, + "step": 15784 + }, + { + "epoch": 2.8733958314371533, + "grad_norm": 8.3125, + "learning_rate": 2.035592341848815e-06, + "loss": 1.174559235572815, + "step": 15786 + }, + { + "epoch": 2.873759898061345, + "grad_norm": 15.0625, + "learning_rate": 2.035388675983972e-06, + "loss": 1.8436174392700195, + "step": 15788 + }, + { + "epoch": 2.8741239646855377, + "grad_norm": 13.3125, + "learning_rate": 2.035185591909773e-06, + "loss": 1.5882307291030884, + "step": 15790 + }, + { + "epoch": 2.8744880313097294, + "grad_norm": 5.0, + "learning_rate": 2.034983089656021e-06, + "loss": 1.302262306213379, + "step": 15792 + }, + { + "epoch": 2.874852097933922, + "grad_norm": 330.0, + "learning_rate": 2.0347811692524312e-06, + "loss": 1.3024449348449707, + "step": 15794 + }, + { + "epoch": 2.8752161645581142, + "grad_norm": 20.625, + "learning_rate": 2.034579830728636e-06, + "loss": 1.30109441280365, + "step": 15796 + }, + { + "epoch": 2.8755802311823064, + "grad_norm": 9.5, + "learning_rate": 2.03437907411418e-06, + "loss": 0.8696068525314331, + "step": 15798 + }, + { + "epoch": 2.8759442978064986, + "grad_norm": 5.53125, + "learning_rate": 2.0341788994385227e-06, + "loss": 1.3312145471572876, + "step": 15800 + }, + { + "epoch": 2.876308364430691, + "grad_norm": 12.3125, + "learning_rate": 2.033979306731041e-06, + "loss": 1.341722011566162, + "step": 15802 + }, + { + "epoch": 2.876672431054883, + "grad_norm": 16.5, + "learning_rate": 2.0337802960210225e-06, + "loss": 1.6390020847320557, + "step": 15804 + }, + { + "epoch": 2.877036497679075, + "grad_norm": 30.75, + "learning_rate": 2.033581867337672e-06, + "loss": 1.4610161781311035, + "step": 15806 + }, + { + "epoch": 2.8774005643032674, + "grad_norm": 18.0, + "learning_rate": 2.033384020710108e-06, + "loss": 1.8534249067306519, + "step": 15808 + }, + { + "epoch": 2.8777646309274596, + "grad_norm": 40.0, + "learning_rate": 2.0331867561673636e-06, + "loss": 1.7233022451400757, + "step": 15810 + }, + { + "epoch": 2.878128697551652, + "grad_norm": 9.6875, + "learning_rate": 2.032990073738387e-06, + "loss": 1.1840872764587402, + "step": 15812 + }, + { + "epoch": 2.878492764175844, + "grad_norm": 39.5, + "learning_rate": 2.0327939734520398e-06, + "loss": 1.0164161920547485, + "step": 15814 + }, + { + "epoch": 2.8788568308000366, + "grad_norm": 21.75, + "learning_rate": 2.0325984553370995e-06, + "loss": 1.2551268339157104, + "step": 15816 + }, + { + "epoch": 2.8792208974242284, + "grad_norm": 27.75, + "learning_rate": 2.0324035194222573e-06, + "loss": 1.7345207929611206, + "step": 15818 + }, + { + "epoch": 2.879584964048421, + "grad_norm": 9.75, + "learning_rate": 2.0322091657361194e-06, + "loss": 0.8374035358428955, + "step": 15820 + }, + { + "epoch": 2.879949030672613, + "grad_norm": 9.5625, + "learning_rate": 2.0320153943072065e-06, + "loss": 1.3922512531280518, + "step": 15822 + }, + { + "epoch": 2.8803130972968054, + "grad_norm": 3.46875, + "learning_rate": 2.0318222051639535e-06, + "loss": 0.9179459810256958, + "step": 15824 + }, + { + "epoch": 2.8806771639209976, + "grad_norm": 7.1875, + "learning_rate": 2.0316295983347107e-06, + "loss": 1.0979242324829102, + "step": 15826 + }, + { + "epoch": 2.88104123054519, + "grad_norm": 5.84375, + "learning_rate": 2.0314375738477415e-06, + "loss": 1.2580231428146362, + "step": 15828 + }, + { + "epoch": 2.881405297169382, + "grad_norm": 16.125, + "learning_rate": 2.0312461317312248e-06, + "loss": 1.3663318157196045, + "step": 15830 + }, + { + "epoch": 2.881769363793574, + "grad_norm": 15.4375, + "learning_rate": 2.031055272013255e-06, + "loss": 0.878201425075531, + "step": 15832 + }, + { + "epoch": 2.8821334304177664, + "grad_norm": 9.3125, + "learning_rate": 2.030864994721839e-06, + "loss": 1.3123582601547241, + "step": 15834 + }, + { + "epoch": 2.8824974970419586, + "grad_norm": 13.6875, + "learning_rate": 2.0306752998849e-06, + "loss": 0.7969541549682617, + "step": 15836 + }, + { + "epoch": 2.8828615636661508, + "grad_norm": 6.40625, + "learning_rate": 2.030486187530274e-06, + "loss": 1.2317688465118408, + "step": 15838 + }, + { + "epoch": 2.883225630290343, + "grad_norm": 13.25, + "learning_rate": 2.0302976576857127e-06, + "loss": 1.588843584060669, + "step": 15840 + }, + { + "epoch": 2.8835896969145356, + "grad_norm": 12.0625, + "learning_rate": 2.030109710378883e-06, + "loss": 0.7127867341041565, + "step": 15842 + }, + { + "epoch": 2.8839537635387273, + "grad_norm": 15.125, + "learning_rate": 2.029922345637364e-06, + "loss": 1.6774282455444336, + "step": 15844 + }, + { + "epoch": 2.88431783016292, + "grad_norm": 13.9375, + "learning_rate": 2.029735563488652e-06, + "loss": 1.494886875152588, + "step": 15846 + }, + { + "epoch": 2.884681896787112, + "grad_norm": 20.125, + "learning_rate": 2.029549363960156e-06, + "loss": 1.8388550281524658, + "step": 15848 + }, + { + "epoch": 2.8850459634113044, + "grad_norm": 10.4375, + "learning_rate": 2.0293637470791996e-06, + "loss": 1.740090250968933, + "step": 15850 + }, + { + "epoch": 2.8854100300354966, + "grad_norm": 3.671875, + "learning_rate": 2.0291787128730223e-06, + "loss": 1.2536745071411133, + "step": 15852 + }, + { + "epoch": 2.8857740966596888, + "grad_norm": 4.46875, + "learning_rate": 2.028994261368776e-06, + "loss": 0.7723858952522278, + "step": 15854 + }, + { + "epoch": 2.886138163283881, + "grad_norm": 16.375, + "learning_rate": 2.028810392593529e-06, + "loss": 0.9452016949653625, + "step": 15856 + }, + { + "epoch": 2.886502229908073, + "grad_norm": 7.96875, + "learning_rate": 2.028627106574263e-06, + "loss": 0.9506535530090332, + "step": 15858 + }, + { + "epoch": 2.8868662965322653, + "grad_norm": 7.25, + "learning_rate": 2.0284444033378744e-06, + "loss": 1.289560317993164, + "step": 15860 + }, + { + "epoch": 2.8872303631564575, + "grad_norm": 52.75, + "learning_rate": 2.0282622829111753e-06, + "loss": 1.4744064807891846, + "step": 15862 + }, + { + "epoch": 2.8875944297806497, + "grad_norm": 19.375, + "learning_rate": 2.0280807453208887e-06, + "loss": 1.450692892074585, + "step": 15864 + }, + { + "epoch": 2.887958496404842, + "grad_norm": 8.1875, + "learning_rate": 2.0278997905936566e-06, + "loss": 1.568651795387268, + "step": 15866 + }, + { + "epoch": 2.8883225630290346, + "grad_norm": 5.9375, + "learning_rate": 2.0277194187560332e-06, + "loss": 1.1895782947540283, + "step": 15868 + }, + { + "epoch": 2.8886866296532263, + "grad_norm": 8.6875, + "learning_rate": 2.0275396298344856e-06, + "loss": 1.176336407661438, + "step": 15870 + }, + { + "epoch": 2.889050696277419, + "grad_norm": 8.6875, + "learning_rate": 2.0273604238554e-06, + "loss": 1.067319393157959, + "step": 15872 + }, + { + "epoch": 2.889414762901611, + "grad_norm": 3.984375, + "learning_rate": 2.027181800845071e-06, + "loss": 1.488231897354126, + "step": 15874 + }, + { + "epoch": 2.8897788295258033, + "grad_norm": 11.4375, + "learning_rate": 2.027003760829713e-06, + "loss": 1.062765121459961, + "step": 15876 + }, + { + "epoch": 2.8901428961499955, + "grad_norm": 14.8125, + "learning_rate": 2.026826303835452e-06, + "loss": 1.4547913074493408, + "step": 15878 + }, + { + "epoch": 2.8905069627741877, + "grad_norm": 24.0, + "learning_rate": 2.0266494298883286e-06, + "loss": 1.5140360593795776, + "step": 15880 + }, + { + "epoch": 2.89087102939838, + "grad_norm": 5.21875, + "learning_rate": 2.0264731390142997e-06, + "loss": 1.241590976715088, + "step": 15882 + }, + { + "epoch": 2.891235096022572, + "grad_norm": 15.3125, + "learning_rate": 2.0262974312392335e-06, + "loss": 0.7726262211799622, + "step": 15884 + }, + { + "epoch": 2.8915991626467643, + "grad_norm": 32.5, + "learning_rate": 2.0261223065889155e-06, + "loss": 0.5962456464767456, + "step": 15886 + }, + { + "epoch": 2.8919632292709565, + "grad_norm": 5.78125, + "learning_rate": 2.0259477650890442e-06, + "loss": 0.9035093188285828, + "step": 15888 + }, + { + "epoch": 2.8923272958951487, + "grad_norm": 31.375, + "learning_rate": 2.025773806765233e-06, + "loss": 1.264603853225708, + "step": 15890 + }, + { + "epoch": 2.892691362519341, + "grad_norm": 43.25, + "learning_rate": 2.02560043164301e-06, + "loss": 1.2605006694793701, + "step": 15892 + }, + { + "epoch": 2.8930554291435335, + "grad_norm": 23.125, + "learning_rate": 2.025427639747816e-06, + "loss": 1.0313756465911865, + "step": 15894 + }, + { + "epoch": 2.8934194957677253, + "grad_norm": 14.9375, + "learning_rate": 2.025255431105009e-06, + "loss": 1.1444826126098633, + "step": 15896 + }, + { + "epoch": 2.893783562391918, + "grad_norm": 14.875, + "learning_rate": 2.0250838057398586e-06, + "loss": 1.7389105558395386, + "step": 15898 + }, + { + "epoch": 2.89414762901611, + "grad_norm": 6.0, + "learning_rate": 2.024912763677551e-06, + "loss": 1.1460657119750977, + "step": 15900 + }, + { + "epoch": 2.8945116956403023, + "grad_norm": 9.875, + "learning_rate": 2.0247423049431864e-06, + "loss": 1.4714908599853516, + "step": 15902 + }, + { + "epoch": 2.8948757622644945, + "grad_norm": 11.1875, + "learning_rate": 2.0245724295617776e-06, + "loss": 1.3630484342575073, + "step": 15904 + }, + { + "epoch": 2.8952398288886867, + "grad_norm": 16.875, + "learning_rate": 2.024403137558254e-06, + "loss": 1.1351715326309204, + "step": 15906 + }, + { + "epoch": 2.895603895512879, + "grad_norm": 24.875, + "learning_rate": 2.024234428957458e-06, + "loss": 1.93813157081604, + "step": 15908 + }, + { + "epoch": 2.895967962137071, + "grad_norm": 21.625, + "learning_rate": 2.024066303784147e-06, + "loss": 1.401777982711792, + "step": 15910 + }, + { + "epoch": 2.8963320287612633, + "grad_norm": 16.625, + "learning_rate": 2.0238987620629936e-06, + "loss": 1.0589489936828613, + "step": 15912 + }, + { + "epoch": 2.8966960953854555, + "grad_norm": 11.5625, + "learning_rate": 2.0237318038185824e-06, + "loss": 1.4927003383636475, + "step": 15914 + }, + { + "epoch": 2.8970601620096477, + "grad_norm": 10.875, + "learning_rate": 2.023565429075415e-06, + "loss": 1.311495065689087, + "step": 15916 + }, + { + "epoch": 2.89742422863384, + "grad_norm": 36.75, + "learning_rate": 2.0233996378579057e-06, + "loss": 1.671440601348877, + "step": 15918 + }, + { + "epoch": 2.897788295258032, + "grad_norm": 15.9375, + "learning_rate": 2.023234430190384e-06, + "loss": 1.2805222272872925, + "step": 15920 + }, + { + "epoch": 2.8981523618822242, + "grad_norm": 8.25, + "learning_rate": 2.0230698060970934e-06, + "loss": 1.5018774271011353, + "step": 15922 + }, + { + "epoch": 2.898516428506417, + "grad_norm": 9.625, + "learning_rate": 2.022905765602191e-06, + "loss": 1.1422563791275024, + "step": 15924 + }, + { + "epoch": 2.8988804951306086, + "grad_norm": 10.125, + "learning_rate": 2.02274230872975e-06, + "loss": 1.209306001663208, + "step": 15926 + }, + { + "epoch": 2.8992445617548013, + "grad_norm": 41.0, + "learning_rate": 2.022579435503757e-06, + "loss": 1.550145149230957, + "step": 15928 + }, + { + "epoch": 2.8996086283789935, + "grad_norm": 18.75, + "learning_rate": 2.0224171459481125e-06, + "loss": 1.749100685119629, + "step": 15930 + }, + { + "epoch": 2.8999726950031857, + "grad_norm": 6.03125, + "learning_rate": 2.0222554400866327e-06, + "loss": 1.1123734712600708, + "step": 15932 + }, + { + "epoch": 2.900336761627378, + "grad_norm": 7.15625, + "learning_rate": 2.0220943179430455e-06, + "loss": 1.2370554208755493, + "step": 15934 + }, + { + "epoch": 2.90070082825157, + "grad_norm": 13.375, + "learning_rate": 2.0219337795409973e-06, + "loss": 1.2984586954116821, + "step": 15936 + }, + { + "epoch": 2.9010648948757622, + "grad_norm": 9.5625, + "learning_rate": 2.021773824904045e-06, + "loss": 1.2779642343521118, + "step": 15938 + }, + { + "epoch": 2.9014289614999544, + "grad_norm": 20.125, + "learning_rate": 2.021614454055661e-06, + "loss": 1.8613547086715698, + "step": 15940 + }, + { + "epoch": 2.9017930281241466, + "grad_norm": 10.4375, + "learning_rate": 2.0214556670192334e-06, + "loss": 0.9604834318161011, + "step": 15942 + }, + { + "epoch": 2.902157094748339, + "grad_norm": 6.71875, + "learning_rate": 2.0212974638180626e-06, + "loss": 1.0142650604248047, + "step": 15944 + }, + { + "epoch": 2.902521161372531, + "grad_norm": 8.5, + "learning_rate": 2.021139844475365e-06, + "loss": 1.3438489437103271, + "step": 15946 + }, + { + "epoch": 2.902885227996723, + "grad_norm": 11.625, + "learning_rate": 2.0209828090142704e-06, + "loss": 1.478317379951477, + "step": 15948 + }, + { + "epoch": 2.903249294620916, + "grad_norm": 9.125, + "learning_rate": 2.0208263574578226e-06, + "loss": 1.1138684749603271, + "step": 15950 + }, + { + "epoch": 2.9036133612451076, + "grad_norm": 19.125, + "learning_rate": 2.020670489828981e-06, + "loss": 1.3077504634857178, + "step": 15952 + }, + { + "epoch": 2.9039774278693002, + "grad_norm": 7.90625, + "learning_rate": 2.0205152061506184e-06, + "loss": 1.2632169723510742, + "step": 15954 + }, + { + "epoch": 2.9043414944934924, + "grad_norm": 32.75, + "learning_rate": 2.0203605064455214e-06, + "loss": 0.9591492414474487, + "step": 15956 + }, + { + "epoch": 2.9047055611176846, + "grad_norm": 12.125, + "learning_rate": 2.020206390736392e-06, + "loss": 1.1280864477157593, + "step": 15958 + }, + { + "epoch": 2.905069627741877, + "grad_norm": 14.6875, + "learning_rate": 2.0200528590458466e-06, + "loss": 1.4261302947998047, + "step": 15960 + }, + { + "epoch": 2.905433694366069, + "grad_norm": 41.25, + "learning_rate": 2.0198999113964145e-06, + "loss": 1.5224285125732422, + "step": 15962 + }, + { + "epoch": 2.905797760990261, + "grad_norm": 10.1875, + "learning_rate": 2.0197475478105403e-06, + "loss": 1.3850963115692139, + "step": 15964 + }, + { + "epoch": 2.9061618276144534, + "grad_norm": 8.1875, + "learning_rate": 2.0195957683105833e-06, + "loss": 1.3561177253723145, + "step": 15966 + }, + { + "epoch": 2.9065258942386456, + "grad_norm": 36.5, + "learning_rate": 2.019444572918816e-06, + "loss": 1.3137898445129395, + "step": 15968 + }, + { + "epoch": 2.906889960862838, + "grad_norm": 13.0, + "learning_rate": 2.0192939616574258e-06, + "loss": 1.251298189163208, + "step": 15970 + }, + { + "epoch": 2.90725402748703, + "grad_norm": 61.5, + "learning_rate": 2.019143934548514e-06, + "loss": 1.9831271171569824, + "step": 15972 + }, + { + "epoch": 2.907618094111222, + "grad_norm": 6.46875, + "learning_rate": 2.018994491614097e-06, + "loss": 1.1440458297729492, + "step": 15974 + }, + { + "epoch": 2.907982160735415, + "grad_norm": 24.625, + "learning_rate": 2.0188456328761052e-06, + "loss": 1.7433085441589355, + "step": 15976 + }, + { + "epoch": 2.9083462273596066, + "grad_norm": 21.875, + "learning_rate": 2.018697358356382e-06, + "loss": 1.3896424770355225, + "step": 15978 + }, + { + "epoch": 2.908710293983799, + "grad_norm": 26.125, + "learning_rate": 2.018549668076687e-06, + "loss": 1.91237211227417, + "step": 15980 + }, + { + "epoch": 2.9090743606079914, + "grad_norm": 10.6875, + "learning_rate": 2.018402562058693e-06, + "loss": 1.3994228839874268, + "step": 15982 + }, + { + "epoch": 2.9094384272321836, + "grad_norm": 26.5, + "learning_rate": 2.0182560403239863e-06, + "loss": 1.3572702407836914, + "step": 15984 + }, + { + "epoch": 2.909802493856376, + "grad_norm": 10.75, + "learning_rate": 2.0181101028940698e-06, + "loss": 0.895589292049408, + "step": 15986 + }, + { + "epoch": 2.910166560480568, + "grad_norm": 13.4375, + "learning_rate": 2.0179647497903583e-06, + "loss": 0.9564281105995178, + "step": 15988 + }, + { + "epoch": 2.91053062710476, + "grad_norm": 10.75, + "learning_rate": 2.0178199810341815e-06, + "loss": 0.4834787845611572, + "step": 15990 + }, + { + "epoch": 2.9108946937289524, + "grad_norm": 8.1875, + "learning_rate": 2.0176757966467842e-06, + "loss": 1.3822435140609741, + "step": 15992 + }, + { + "epoch": 2.9112587603531446, + "grad_norm": 17.625, + "learning_rate": 2.0175321966493254e-06, + "loss": 1.8450682163238525, + "step": 15994 + }, + { + "epoch": 2.9116228269773368, + "grad_norm": 7.53125, + "learning_rate": 2.017389181062877e-06, + "loss": 1.1677730083465576, + "step": 15996 + }, + { + "epoch": 2.911986893601529, + "grad_norm": 10.5, + "learning_rate": 2.0172467499084263e-06, + "loss": 1.4569188356399536, + "step": 15998 + }, + { + "epoch": 2.912350960225721, + "grad_norm": 7.96875, + "learning_rate": 2.0171049032068736e-06, + "loss": 1.0316945314407349, + "step": 16000 + }, + { + "epoch": 2.912715026849914, + "grad_norm": 10.5, + "learning_rate": 2.016963640979036e-06, + "loss": 1.4173057079315186, + "step": 16002 + }, + { + "epoch": 2.9130790934741055, + "grad_norm": 7.03125, + "learning_rate": 2.0168229632456415e-06, + "loss": 1.1555143594741821, + "step": 16004 + }, + { + "epoch": 2.913443160098298, + "grad_norm": 27.0, + "learning_rate": 2.0166828700273355e-06, + "loss": 1.3070731163024902, + "step": 16006 + }, + { + "epoch": 2.9138072267224904, + "grad_norm": 10.75, + "learning_rate": 2.016543361344675e-06, + "loss": 1.3394155502319336, + "step": 16008 + }, + { + "epoch": 2.9141712933466826, + "grad_norm": 11.625, + "learning_rate": 2.0164044372181328e-06, + "loss": 1.4120908975601196, + "step": 16010 + }, + { + "epoch": 2.9145353599708748, + "grad_norm": 9.25, + "learning_rate": 2.016266097668095e-06, + "loss": 1.2500807046890259, + "step": 16012 + }, + { + "epoch": 2.914899426595067, + "grad_norm": 6.28125, + "learning_rate": 2.0161283427148625e-06, + "loss": 1.174961805343628, + "step": 16014 + }, + { + "epoch": 2.915263493219259, + "grad_norm": 24.875, + "learning_rate": 2.0159911723786513e-06, + "loss": 1.4608769416809082, + "step": 16016 + }, + { + "epoch": 2.9156275598434513, + "grad_norm": 17.25, + "learning_rate": 2.0158545866795896e-06, + "loss": 1.5372098684310913, + "step": 16018 + }, + { + "epoch": 2.9159916264676435, + "grad_norm": 12.9375, + "learning_rate": 2.0157185856377205e-06, + "loss": 1.503348469734192, + "step": 16020 + }, + { + "epoch": 2.9163556930918357, + "grad_norm": 14.0625, + "learning_rate": 2.0155831692730026e-06, + "loss": 1.502626895904541, + "step": 16022 + }, + { + "epoch": 2.916719759716028, + "grad_norm": 7.25, + "learning_rate": 2.015448337605307e-06, + "loss": 1.3237050771713257, + "step": 16024 + }, + { + "epoch": 2.91708382634022, + "grad_norm": 15.0625, + "learning_rate": 2.0153140906544194e-06, + "loss": 1.4869897365570068, + "step": 16026 + }, + { + "epoch": 2.9174478929644128, + "grad_norm": 13.0, + "learning_rate": 2.015180428440041e-06, + "loss": 1.3573061227798462, + "step": 16028 + }, + { + "epoch": 2.9178119595886045, + "grad_norm": 10.3125, + "learning_rate": 2.015047350981785e-06, + "loss": 1.2338680028915405, + "step": 16030 + }, + { + "epoch": 2.918176026212797, + "grad_norm": 7.375, + "learning_rate": 2.0149148582991816e-06, + "loss": 0.936947226524353, + "step": 16032 + }, + { + "epoch": 2.918540092836989, + "grad_norm": 18.375, + "learning_rate": 2.0147829504116724e-06, + "loss": 1.853521704673767, + "step": 16034 + }, + { + "epoch": 2.9189041594611815, + "grad_norm": 8.25, + "learning_rate": 2.0146516273386145e-06, + "loss": 1.1638528108596802, + "step": 16036 + }, + { + "epoch": 2.9192682260853737, + "grad_norm": 4.6875, + "learning_rate": 2.0145208890992784e-06, + "loss": 0.8490620255470276, + "step": 16038 + }, + { + "epoch": 2.919632292709566, + "grad_norm": 11.875, + "learning_rate": 2.0143907357128507e-06, + "loss": 1.5628931522369385, + "step": 16040 + }, + { + "epoch": 2.919996359333758, + "grad_norm": 9.375, + "learning_rate": 2.0142611671984304e-06, + "loss": 0.942184567451477, + "step": 16042 + }, + { + "epoch": 2.9203604259579503, + "grad_norm": 11.0, + "learning_rate": 2.0141321835750306e-06, + "loss": 1.249287486076355, + "step": 16044 + }, + { + "epoch": 2.9207244925821425, + "grad_norm": 5.9375, + "learning_rate": 2.0140037848615798e-06, + "loss": 1.2209815979003906, + "step": 16046 + }, + { + "epoch": 2.9210885592063347, + "grad_norm": 6.8125, + "learning_rate": 2.0138759710769196e-06, + "loss": 1.1884042024612427, + "step": 16048 + }, + { + "epoch": 2.921452625830527, + "grad_norm": 4.15625, + "learning_rate": 2.0137487422398063e-06, + "loss": 1.2047877311706543, + "step": 16050 + }, + { + "epoch": 2.921816692454719, + "grad_norm": 8.6875, + "learning_rate": 2.0136220983689104e-06, + "loss": 1.1593250036239624, + "step": 16052 + }, + { + "epoch": 2.9221807590789113, + "grad_norm": 15.0, + "learning_rate": 2.0134960394828164e-06, + "loss": 1.3413877487182617, + "step": 16054 + }, + { + "epoch": 2.9225448257031035, + "grad_norm": 25.625, + "learning_rate": 2.0133705656000224e-06, + "loss": 1.4110627174377441, + "step": 16056 + }, + { + "epoch": 2.922908892327296, + "grad_norm": 10.625, + "learning_rate": 2.0132456767389415e-06, + "loss": 1.5338093042373657, + "step": 16058 + }, + { + "epoch": 2.923272958951488, + "grad_norm": 26.0, + "learning_rate": 2.0131213729179002e-06, + "loss": 0.45551732182502747, + "step": 16060 + }, + { + "epoch": 2.9236370255756805, + "grad_norm": 12.5, + "learning_rate": 2.012997654155141e-06, + "loss": 1.3342375755310059, + "step": 16062 + }, + { + "epoch": 2.9240010921998727, + "grad_norm": 6.75, + "learning_rate": 2.012874520468817e-06, + "loss": 1.0010552406311035, + "step": 16064 + }, + { + "epoch": 2.924365158824065, + "grad_norm": 5.25, + "learning_rate": 2.0127519718769997e-06, + "loss": 0.9340542554855347, + "step": 16066 + }, + { + "epoch": 2.924729225448257, + "grad_norm": 12.25, + "learning_rate": 2.0126300083976714e-06, + "loss": 1.2615128755569458, + "step": 16068 + }, + { + "epoch": 2.9250932920724493, + "grad_norm": 11.0625, + "learning_rate": 2.0125086300487293e-06, + "loss": 1.278813123703003, + "step": 16070 + }, + { + "epoch": 2.9254573586966415, + "grad_norm": 12.8125, + "learning_rate": 2.0123878368479866e-06, + "loss": 1.0369194746017456, + "step": 16072 + }, + { + "epoch": 2.9258214253208337, + "grad_norm": 29.0, + "learning_rate": 2.0122676288131687e-06, + "loss": 1.0971245765686035, + "step": 16074 + }, + { + "epoch": 2.926185491945026, + "grad_norm": 11.3125, + "learning_rate": 2.012148005961915e-06, + "loss": 1.3509385585784912, + "step": 16076 + }, + { + "epoch": 2.926549558569218, + "grad_norm": 12.875, + "learning_rate": 2.01202896831178e-06, + "loss": 1.2240263223648071, + "step": 16078 + }, + { + "epoch": 2.9269136251934103, + "grad_norm": 27.25, + "learning_rate": 2.0119105158802314e-06, + "loss": 1.729132890701294, + "step": 16080 + }, + { + "epoch": 2.9272776918176024, + "grad_norm": 14.5, + "learning_rate": 2.0117926486846533e-06, + "loss": 1.9291480779647827, + "step": 16082 + }, + { + "epoch": 2.927641758441795, + "grad_norm": 6.0, + "learning_rate": 2.0116753667423405e-06, + "loss": 1.3116145133972168, + "step": 16084 + }, + { + "epoch": 2.928005825065987, + "grad_norm": 15.875, + "learning_rate": 2.011558670070505e-06, + "loss": 1.166590690612793, + "step": 16086 + }, + { + "epoch": 2.9283698916901795, + "grad_norm": 11.3125, + "learning_rate": 2.011442558686271e-06, + "loss": 1.081920862197876, + "step": 16088 + }, + { + "epoch": 2.9287339583143717, + "grad_norm": 23.75, + "learning_rate": 2.011327032606677e-06, + "loss": 1.2143311500549316, + "step": 16090 + }, + { + "epoch": 2.929098024938564, + "grad_norm": 31.25, + "learning_rate": 2.011212091848676e-06, + "loss": 1.2882063388824463, + "step": 16092 + }, + { + "epoch": 2.929462091562756, + "grad_norm": 10.1875, + "learning_rate": 2.0110977364291356e-06, + "loss": 1.613126516342163, + "step": 16094 + }, + { + "epoch": 2.9298261581869482, + "grad_norm": 5.90625, + "learning_rate": 2.0109839663648365e-06, + "loss": 1.3092408180236816, + "step": 16096 + }, + { + "epoch": 2.9301902248111404, + "grad_norm": 7.1875, + "learning_rate": 2.010870781672475e-06, + "loss": 1.3179875612258911, + "step": 16098 + }, + { + "epoch": 2.9305542914353326, + "grad_norm": 14.75, + "learning_rate": 2.0107581823686592e-06, + "loss": 1.1656293869018555, + "step": 16100 + }, + { + "epoch": 2.930918358059525, + "grad_norm": 13.1875, + "learning_rate": 2.010646168469913e-06, + "loss": 0.8745218515396118, + "step": 16102 + }, + { + "epoch": 2.931282424683717, + "grad_norm": 45.25, + "learning_rate": 2.0105347399926747e-06, + "loss": 1.3327651023864746, + "step": 16104 + }, + { + "epoch": 2.931646491307909, + "grad_norm": 13.125, + "learning_rate": 2.010423896953295e-06, + "loss": 1.9630465507507324, + "step": 16106 + }, + { + "epoch": 2.9320105579321014, + "grad_norm": 9.125, + "learning_rate": 2.0103136393680406e-06, + "loss": 1.1468077898025513, + "step": 16108 + }, + { + "epoch": 2.932374624556294, + "grad_norm": 12.875, + "learning_rate": 2.0102039672530904e-06, + "loss": 1.1899211406707764, + "step": 16110 + }, + { + "epoch": 2.932738691180486, + "grad_norm": 32.0, + "learning_rate": 2.010094880624539e-06, + "loss": 1.5794847011566162, + "step": 16112 + }, + { + "epoch": 2.9331027578046784, + "grad_norm": 27.75, + "learning_rate": 2.009986379498394e-06, + "loss": 1.1136564016342163, + "step": 16114 + }, + { + "epoch": 2.9334668244288706, + "grad_norm": 3.203125, + "learning_rate": 2.0098784638905776e-06, + "loss": 1.2769078016281128, + "step": 16116 + }, + { + "epoch": 2.933830891053063, + "grad_norm": 8.5, + "learning_rate": 2.0097711338169264e-06, + "loss": 0.9284353256225586, + "step": 16118 + }, + { + "epoch": 2.934194957677255, + "grad_norm": 8.125, + "learning_rate": 2.00966438929319e-06, + "loss": 1.3715157508850098, + "step": 16120 + }, + { + "epoch": 2.934559024301447, + "grad_norm": 12.5, + "learning_rate": 2.0095582303350334e-06, + "loss": 1.0132701396942139, + "step": 16122 + }, + { + "epoch": 2.9349230909256394, + "grad_norm": 14.1875, + "learning_rate": 2.0094526569580343e-06, + "loss": 1.4751384258270264, + "step": 16124 + }, + { + "epoch": 2.9352871575498316, + "grad_norm": 28.875, + "learning_rate": 2.009347669177686e-06, + "loss": 1.856689691543579, + "step": 16126 + }, + { + "epoch": 2.935651224174024, + "grad_norm": 19.625, + "learning_rate": 2.009243267009394e-06, + "loss": 1.7609423398971558, + "step": 16128 + }, + { + "epoch": 2.936015290798216, + "grad_norm": 14.5625, + "learning_rate": 2.0091394504684792e-06, + "loss": 1.3988747596740723, + "step": 16130 + }, + { + "epoch": 2.936379357422408, + "grad_norm": 14.625, + "learning_rate": 2.009036219570177e-06, + "loss": 1.5770587921142578, + "step": 16132 + }, + { + "epoch": 2.9367434240466004, + "grad_norm": 8.8125, + "learning_rate": 2.008933574329636e-06, + "loss": 1.36320960521698, + "step": 16134 + }, + { + "epoch": 2.937107490670793, + "grad_norm": 3.53125, + "learning_rate": 2.0088315147619187e-06, + "loss": 0.9429577589035034, + "step": 16136 + }, + { + "epoch": 2.9374715572949848, + "grad_norm": 7.9375, + "learning_rate": 2.008730040882001e-06, + "loss": 0.9821655750274658, + "step": 16138 + }, + { + "epoch": 2.9378356239191774, + "grad_norm": 10.875, + "learning_rate": 2.008629152704775e-06, + "loss": 1.2745946645736694, + "step": 16140 + }, + { + "epoch": 2.9381996905433696, + "grad_norm": 10.6875, + "learning_rate": 2.008528850245045e-06, + "loss": 1.2576141357421875, + "step": 16142 + }, + { + "epoch": 2.938563757167562, + "grad_norm": 12.0625, + "learning_rate": 2.00842913351753e-06, + "loss": 1.7421000003814697, + "step": 16144 + }, + { + "epoch": 2.938927823791754, + "grad_norm": 8.375, + "learning_rate": 2.008330002536864e-06, + "loss": 1.280937671661377, + "step": 16146 + }, + { + "epoch": 2.939291890415946, + "grad_norm": 14.125, + "learning_rate": 2.008231457317593e-06, + "loss": 1.3187261819839478, + "step": 16148 + }, + { + "epoch": 2.9396559570401384, + "grad_norm": 13.625, + "learning_rate": 2.008133497874178e-06, + "loss": 1.3234012126922607, + "step": 16150 + }, + { + "epoch": 2.9400200236643306, + "grad_norm": 8.4375, + "learning_rate": 2.0080361242209945e-06, + "loss": 1.1583294868469238, + "step": 16152 + }, + { + "epoch": 2.9403840902885228, + "grad_norm": 12.1875, + "learning_rate": 2.0079393363723322e-06, + "loss": 0.9946235418319702, + "step": 16154 + }, + { + "epoch": 2.940748156912715, + "grad_norm": 18.875, + "learning_rate": 2.0078431343423945e-06, + "loss": 1.4703614711761475, + "step": 16156 + }, + { + "epoch": 2.941112223536907, + "grad_norm": 30.375, + "learning_rate": 2.0077475181452967e-06, + "loss": 1.6725093126296997, + "step": 16158 + }, + { + "epoch": 2.9414762901610993, + "grad_norm": 12.0625, + "learning_rate": 2.007652487795072e-06, + "loss": 1.404555082321167, + "step": 16160 + }, + { + "epoch": 2.9418403567852915, + "grad_norm": 10.0625, + "learning_rate": 2.0075580433056654e-06, + "loss": 1.473804235458374, + "step": 16162 + }, + { + "epoch": 2.9422044234094837, + "grad_norm": 5.25, + "learning_rate": 2.007464184690936e-06, + "loss": 1.1531662940979004, + "step": 16164 + }, + { + "epoch": 2.9425684900336764, + "grad_norm": 9.125, + "learning_rate": 2.0073709119646567e-06, + "loss": 1.3482539653778076, + "step": 16166 + }, + { + "epoch": 2.942932556657868, + "grad_norm": 14.5, + "learning_rate": 2.0072782251405155e-06, + "loss": 1.3525104522705078, + "step": 16168 + }, + { + "epoch": 2.9432966232820608, + "grad_norm": 10.375, + "learning_rate": 2.0071861242321142e-06, + "loss": 1.3789052963256836, + "step": 16170 + }, + { + "epoch": 2.943660689906253, + "grad_norm": 136.0, + "learning_rate": 2.007094609252967e-06, + "loss": 1.0541635751724243, + "step": 16172 + }, + { + "epoch": 2.944024756530445, + "grad_norm": 12.4375, + "learning_rate": 2.0070036802165044e-06, + "loss": 0.8645976781845093, + "step": 16174 + }, + { + "epoch": 2.9443888231546373, + "grad_norm": 19.5, + "learning_rate": 2.0069133371360693e-06, + "loss": 1.3241825103759766, + "step": 16176 + }, + { + "epoch": 2.9447528897788295, + "grad_norm": 13.25, + "learning_rate": 2.0068235800249197e-06, + "loss": 1.2801040410995483, + "step": 16178 + }, + { + "epoch": 2.9451169564030217, + "grad_norm": 6.9375, + "learning_rate": 2.0067344088962266e-06, + "loss": 1.1969417333602905, + "step": 16180 + }, + { + "epoch": 2.945481023027214, + "grad_norm": 8.875, + "learning_rate": 2.0066458237630758e-06, + "loss": 1.4311037063598633, + "step": 16182 + }, + { + "epoch": 2.945845089651406, + "grad_norm": 12.125, + "learning_rate": 2.006557824638467e-06, + "loss": 1.3590706586837769, + "step": 16184 + }, + { + "epoch": 2.9462091562755983, + "grad_norm": 14.25, + "learning_rate": 2.0064704115353135e-06, + "loss": 1.559118628501892, + "step": 16186 + }, + { + "epoch": 2.9465732228997905, + "grad_norm": 18.25, + "learning_rate": 2.006383584466442e-06, + "loss": 1.9387481212615967, + "step": 16188 + }, + { + "epoch": 2.9469372895239827, + "grad_norm": 9.5625, + "learning_rate": 2.0062973434445953e-06, + "loss": 1.1815623044967651, + "step": 16190 + }, + { + "epoch": 2.9473013561481753, + "grad_norm": 13.8125, + "learning_rate": 2.006211688482428e-06, + "loss": 1.3023184537887573, + "step": 16192 + }, + { + "epoch": 2.947665422772367, + "grad_norm": 10.5625, + "learning_rate": 2.0061266195925104e-06, + "loss": 1.7914209365844727, + "step": 16194 + }, + { + "epoch": 2.9480294893965597, + "grad_norm": 5.53125, + "learning_rate": 2.0060421367873255e-06, + "loss": 1.307399868965149, + "step": 16196 + }, + { + "epoch": 2.948393556020752, + "grad_norm": 4.1875, + "learning_rate": 2.005958240079271e-06, + "loss": 0.950873851776123, + "step": 16198 + }, + { + "epoch": 2.948757622644944, + "grad_norm": 53.75, + "learning_rate": 2.005874929480658e-06, + "loss": 1.5999395847320557, + "step": 16200 + }, + { + "epoch": 2.9491216892691363, + "grad_norm": 8.6875, + "learning_rate": 2.005792205003713e-06, + "loss": 1.4964826107025146, + "step": 16202 + }, + { + "epoch": 2.9494857558933285, + "grad_norm": 17.375, + "learning_rate": 2.0057100666605743e-06, + "loss": 1.369545340538025, + "step": 16204 + }, + { + "epoch": 2.9498498225175207, + "grad_norm": 16.875, + "learning_rate": 2.005628514463296e-06, + "loss": 1.1087909936904907, + "step": 16206 + }, + { + "epoch": 2.950213889141713, + "grad_norm": 17.5, + "learning_rate": 2.0055475484238453e-06, + "loss": 0.8225274085998535, + "step": 16208 + }, + { + "epoch": 2.950577955765905, + "grad_norm": 9.0625, + "learning_rate": 2.005467168554104e-06, + "loss": 1.014095425605774, + "step": 16210 + }, + { + "epoch": 2.9509420223900973, + "grad_norm": 3.078125, + "learning_rate": 2.005387374865867e-06, + "loss": 0.9132812023162842, + "step": 16212 + }, + { + "epoch": 2.9513060890142895, + "grad_norm": 7.75, + "learning_rate": 2.005308167370844e-06, + "loss": 1.1453416347503662, + "step": 16214 + }, + { + "epoch": 2.9516701556384817, + "grad_norm": 9.6875, + "learning_rate": 2.005229546080659e-06, + "loss": 1.3302788734436035, + "step": 16216 + }, + { + "epoch": 2.9520342222626743, + "grad_norm": 12.125, + "learning_rate": 2.0051515110068477e-06, + "loss": 1.365898609161377, + "step": 16218 + }, + { + "epoch": 2.952398288886866, + "grad_norm": 8.625, + "learning_rate": 2.0050740621608632e-06, + "loss": 1.263388991355896, + "step": 16220 + }, + { + "epoch": 2.9527623555110587, + "grad_norm": 23.75, + "learning_rate": 2.00499719955407e-06, + "loss": 1.4513204097747803, + "step": 16222 + }, + { + "epoch": 2.953126422135251, + "grad_norm": 51.5, + "learning_rate": 2.004920923197747e-06, + "loss": 2.0801005363464355, + "step": 16224 + }, + { + "epoch": 2.953490488759443, + "grad_norm": 26.125, + "learning_rate": 2.004845233103088e-06, + "loss": 1.3742839097976685, + "step": 16226 + }, + { + "epoch": 2.9538545553836353, + "grad_norm": 9.875, + "learning_rate": 2.0047701292812003e-06, + "loss": 1.4993829727172852, + "step": 16228 + }, + { + "epoch": 2.9542186220078275, + "grad_norm": 11.3125, + "learning_rate": 2.004695611743105e-06, + "loss": 1.3701081275939941, + "step": 16230 + }, + { + "epoch": 2.9545826886320197, + "grad_norm": 19.125, + "learning_rate": 2.004621680499737e-06, + "loss": 1.457590103149414, + "step": 16232 + }, + { + "epoch": 2.954946755256212, + "grad_norm": 9.0, + "learning_rate": 2.0045483355619455e-06, + "loss": 1.4919146299362183, + "step": 16234 + }, + { + "epoch": 2.955310821880404, + "grad_norm": 8.875, + "learning_rate": 2.0044755769404937e-06, + "loss": 1.237777590751648, + "step": 16236 + }, + { + "epoch": 2.9556748885045963, + "grad_norm": 9.0625, + "learning_rate": 2.004403404646058e-06, + "loss": 0.9515402913093567, + "step": 16238 + }, + { + "epoch": 2.9560389551287884, + "grad_norm": 29.75, + "learning_rate": 2.0043318186892303e-06, + "loss": 1.6690784692764282, + "step": 16240 + }, + { + "epoch": 2.9564030217529806, + "grad_norm": 14.4375, + "learning_rate": 2.004260819080516e-06, + "loss": 1.6799991130828857, + "step": 16242 + }, + { + "epoch": 2.9567670883771733, + "grad_norm": 16.625, + "learning_rate": 2.004190405830332e-06, + "loss": 1.5002727508544922, + "step": 16244 + }, + { + "epoch": 2.957131155001365, + "grad_norm": 16.375, + "learning_rate": 2.0041205789490127e-06, + "loss": 1.704007625579834, + "step": 16246 + }, + { + "epoch": 2.9574952216255577, + "grad_norm": 17.25, + "learning_rate": 2.0040513384468047e-06, + "loss": 1.4280048608779907, + "step": 16248 + }, + { + "epoch": 2.95785928824975, + "grad_norm": 8.25, + "learning_rate": 2.0039826843338687e-06, + "loss": 1.0051014423370361, + "step": 16250 + }, + { + "epoch": 2.958223354873942, + "grad_norm": 8.875, + "learning_rate": 2.0039146166202793e-06, + "loss": 1.5191724300384521, + "step": 16252 + }, + { + "epoch": 2.9585874214981343, + "grad_norm": 11.375, + "learning_rate": 2.0038471353160248e-06, + "loss": 1.4216265678405762, + "step": 16254 + }, + { + "epoch": 2.9589514881223264, + "grad_norm": 10.125, + "learning_rate": 2.0037802404310086e-06, + "loss": 1.3860918283462524, + "step": 16256 + }, + { + "epoch": 2.9593155547465186, + "grad_norm": 3.234375, + "learning_rate": 2.0037139319750465e-06, + "loss": 1.0458552837371826, + "step": 16258 + }, + { + "epoch": 2.959679621370711, + "grad_norm": 10.5625, + "learning_rate": 2.00364820995787e-06, + "loss": 1.0160319805145264, + "step": 16260 + }, + { + "epoch": 2.960043687994903, + "grad_norm": 15.1875, + "learning_rate": 2.0035830743891223e-06, + "loss": 1.4707136154174805, + "step": 16262 + }, + { + "epoch": 2.960407754619095, + "grad_norm": 13.4375, + "learning_rate": 2.0035185252783627e-06, + "loss": 1.3480578660964966, + "step": 16264 + }, + { + "epoch": 2.9607718212432874, + "grad_norm": 18.875, + "learning_rate": 2.003454562635063e-06, + "loss": 1.3034281730651855, + "step": 16266 + }, + { + "epoch": 2.9611358878674796, + "grad_norm": 21.625, + "learning_rate": 2.0033911864686097e-06, + "loss": 1.4252829551696777, + "step": 16268 + }, + { + "epoch": 2.9614999544916722, + "grad_norm": 9.9375, + "learning_rate": 2.0033283967883027e-06, + "loss": 1.4396251440048218, + "step": 16270 + }, + { + "epoch": 2.961864021115864, + "grad_norm": 10.0, + "learning_rate": 2.003266193603357e-06, + "loss": 1.0804598331451416, + "step": 16272 + }, + { + "epoch": 2.9622280877400566, + "grad_norm": 21.875, + "learning_rate": 2.003204576922899e-06, + "loss": 1.15805983543396, + "step": 16274 + }, + { + "epoch": 2.9625921543642484, + "grad_norm": 42.25, + "learning_rate": 2.003143546755973e-06, + "loss": 1.342362403869629, + "step": 16276 + }, + { + "epoch": 2.962956220988441, + "grad_norm": 12.5, + "learning_rate": 2.0030831031115332e-06, + "loss": 0.8277813792228699, + "step": 16278 + }, + { + "epoch": 2.963320287612633, + "grad_norm": 9.5, + "learning_rate": 2.00302324599845e-06, + "loss": 0.9522460699081421, + "step": 16280 + }, + { + "epoch": 2.9636843542368254, + "grad_norm": 17.125, + "learning_rate": 2.002963975425506e-06, + "loss": 1.5351015329360962, + "step": 16282 + }, + { + "epoch": 2.9640484208610176, + "grad_norm": 18.125, + "learning_rate": 2.0029052914014014e-06, + "loss": 1.8087066411972046, + "step": 16284 + }, + { + "epoch": 2.96441248748521, + "grad_norm": 8.9375, + "learning_rate": 2.002847193934746e-06, + "loss": 0.9702337980270386, + "step": 16286 + }, + { + "epoch": 2.964776554109402, + "grad_norm": 11.6875, + "learning_rate": 2.002789683034066e-06, + "loss": 0.9694652557373047, + "step": 16288 + }, + { + "epoch": 2.965140620733594, + "grad_norm": 10.6875, + "learning_rate": 2.0027327587078006e-06, + "loss": 1.4253320693969727, + "step": 16290 + }, + { + "epoch": 2.9655046873577864, + "grad_norm": 6.09375, + "learning_rate": 2.0026764209643033e-06, + "loss": 1.1654529571533203, + "step": 16292 + }, + { + "epoch": 2.9658687539819786, + "grad_norm": 53.25, + "learning_rate": 2.0026206698118417e-06, + "loss": 1.2843589782714844, + "step": 16294 + }, + { + "epoch": 2.9662328206061708, + "grad_norm": 26.375, + "learning_rate": 2.002565505258597e-06, + "loss": 1.5341386795043945, + "step": 16296 + }, + { + "epoch": 2.966596887230363, + "grad_norm": 11.25, + "learning_rate": 2.0025109273126634e-06, + "loss": 1.6817675828933716, + "step": 16298 + }, + { + "epoch": 2.9669609538545556, + "grad_norm": 20.0, + "learning_rate": 2.0024569359820513e-06, + "loss": 1.088269591331482, + "step": 16300 + }, + { + "epoch": 2.9673250204787474, + "grad_norm": 8.4375, + "learning_rate": 2.0024035312746833e-06, + "loss": 1.3678847551345825, + "step": 16302 + }, + { + "epoch": 2.96768908710294, + "grad_norm": 20.5, + "learning_rate": 2.0023507131983966e-06, + "loss": 1.2941250801086426, + "step": 16304 + }, + { + "epoch": 2.968053153727132, + "grad_norm": 24.625, + "learning_rate": 2.0022984817609407e-06, + "loss": 1.5039629936218262, + "step": 16306 + }, + { + "epoch": 2.9684172203513244, + "grad_norm": 7.875, + "learning_rate": 2.0022468369699825e-06, + "loss": 1.3794997930526733, + "step": 16308 + }, + { + "epoch": 2.9687812869755166, + "grad_norm": 10.8125, + "learning_rate": 2.0021957788330986e-06, + "loss": 1.1620562076568604, + "step": 16310 + }, + { + "epoch": 2.9691453535997088, + "grad_norm": 18.625, + "learning_rate": 2.0021453073577825e-06, + "loss": 1.695955514907837, + "step": 16312 + }, + { + "epoch": 2.969509420223901, + "grad_norm": 14.25, + "learning_rate": 2.0020954225514413e-06, + "loss": 1.35657799243927, + "step": 16314 + }, + { + "epoch": 2.969873486848093, + "grad_norm": 8.6875, + "learning_rate": 2.0020461244213943e-06, + "loss": 1.2799112796783447, + "step": 16316 + }, + { + "epoch": 2.9702375534722854, + "grad_norm": 10.125, + "learning_rate": 2.0019974129748765e-06, + "loss": 1.2740790843963623, + "step": 16318 + }, + { + "epoch": 2.9706016200964775, + "grad_norm": 13.25, + "learning_rate": 2.001949288219036e-06, + "loss": 1.4777576923370361, + "step": 16320 + }, + { + "epoch": 2.9709656867206697, + "grad_norm": 22.875, + "learning_rate": 2.001901750160934e-06, + "loss": 1.8469033241271973, + "step": 16322 + }, + { + "epoch": 2.971329753344862, + "grad_norm": 18.375, + "learning_rate": 2.0018547988075476e-06, + "loss": 1.25077223777771, + "step": 16324 + }, + { + "epoch": 2.9716938199690546, + "grad_norm": 30.25, + "learning_rate": 2.001808434165767e-06, + "loss": 1.4238383769989014, + "step": 16326 + }, + { + "epoch": 2.9720578865932463, + "grad_norm": 25.875, + "learning_rate": 2.0017626562423947e-06, + "loss": 1.7187809944152832, + "step": 16328 + }, + { + "epoch": 2.972421953217439, + "grad_norm": 14.0625, + "learning_rate": 2.0017174650441494e-06, + "loss": 1.59052574634552, + "step": 16330 + }, + { + "epoch": 2.972786019841631, + "grad_norm": 25.125, + "learning_rate": 2.001672860577663e-06, + "loss": 0.9362858533859253, + "step": 16332 + }, + { + "epoch": 2.9731500864658233, + "grad_norm": 22.875, + "learning_rate": 2.0016288428494803e-06, + "loss": 1.1181987524032593, + "step": 16334 + }, + { + "epoch": 2.9735141530900155, + "grad_norm": 20.125, + "learning_rate": 2.00158541186606e-06, + "loss": 1.4670336246490479, + "step": 16336 + }, + { + "epoch": 2.9738782197142077, + "grad_norm": 19.25, + "learning_rate": 2.0015425676337773e-06, + "loss": 1.3774993419647217, + "step": 16338 + }, + { + "epoch": 2.9742422863384, + "grad_norm": 5.59375, + "learning_rate": 2.001500310158918e-06, + "loss": 1.333992838859558, + "step": 16340 + }, + { + "epoch": 2.974606352962592, + "grad_norm": 2.75, + "learning_rate": 2.001458639447684e-06, + "loss": 1.207629680633545, + "step": 16342 + }, + { + "epoch": 2.9749704195867843, + "grad_norm": 5.03125, + "learning_rate": 2.0014175555061897e-06, + "loss": 1.0353045463562012, + "step": 16344 + }, + { + "epoch": 2.9753344862109765, + "grad_norm": 4.875, + "learning_rate": 2.001377058340465e-06, + "loss": 1.170973777770996, + "step": 16346 + }, + { + "epoch": 2.9756985528351687, + "grad_norm": 24.25, + "learning_rate": 2.0013371479564514e-06, + "loss": 1.2009278535842896, + "step": 16348 + }, + { + "epoch": 2.976062619459361, + "grad_norm": 10.0, + "learning_rate": 2.001297824360006e-06, + "loss": 1.0104331970214844, + "step": 16350 + }, + { + "epoch": 2.9764266860835535, + "grad_norm": 17.25, + "learning_rate": 2.0012590875568997e-06, + "loss": 1.3624790906906128, + "step": 16352 + }, + { + "epoch": 2.9767907527077453, + "grad_norm": 12.5, + "learning_rate": 2.001220937552817e-06, + "loss": 1.492138147354126, + "step": 16354 + }, + { + "epoch": 2.977154819331938, + "grad_norm": 38.0, + "learning_rate": 2.001183374353356e-06, + "loss": 1.5225915908813477, + "step": 16356 + }, + { + "epoch": 2.97751888595613, + "grad_norm": 6.4375, + "learning_rate": 2.001146397964029e-06, + "loss": 1.3869601488113403, + "step": 16358 + }, + { + "epoch": 2.9778829525803223, + "grad_norm": 28.25, + "learning_rate": 2.0011100083902625e-06, + "loss": 1.1126316785812378, + "step": 16360 + }, + { + "epoch": 2.9782470192045145, + "grad_norm": 26.625, + "learning_rate": 2.0010742056373954e-06, + "loss": 2.039464235305786, + "step": 16362 + }, + { + "epoch": 2.9786110858287067, + "grad_norm": 15.8125, + "learning_rate": 2.001038989710683e-06, + "loss": 1.8592197895050049, + "step": 16364 + }, + { + "epoch": 2.978975152452899, + "grad_norm": 16.25, + "learning_rate": 2.0010043606152925e-06, + "loss": 1.1575515270233154, + "step": 16366 + }, + { + "epoch": 2.979339219077091, + "grad_norm": 19.0, + "learning_rate": 2.0009703183563054e-06, + "loss": 0.9458638429641724, + "step": 16368 + }, + { + "epoch": 2.9797032857012833, + "grad_norm": 16.0, + "learning_rate": 2.0009368629387174e-06, + "loss": 1.3846774101257324, + "step": 16370 + }, + { + "epoch": 2.9800673523254755, + "grad_norm": 11.1875, + "learning_rate": 2.000903994367438e-06, + "loss": 1.3616442680358887, + "step": 16372 + }, + { + "epoch": 2.9804314189496677, + "grad_norm": 11.25, + "learning_rate": 2.0008717126472904e-06, + "loss": 1.2130813598632812, + "step": 16374 + }, + { + "epoch": 2.98079548557386, + "grad_norm": 13.5625, + "learning_rate": 2.0008400177830123e-06, + "loss": 0.5574257373809814, + "step": 16376 + }, + { + "epoch": 2.9811595521980525, + "grad_norm": 10.375, + "learning_rate": 2.000808909779254e-06, + "loss": 0.8714935183525085, + "step": 16378 + }, + { + "epoch": 2.9815236188222443, + "grad_norm": 4.8125, + "learning_rate": 2.0007783886405813e-06, + "loss": 1.4558019638061523, + "step": 16380 + }, + { + "epoch": 2.981887685446437, + "grad_norm": 3.703125, + "learning_rate": 2.0007484543714718e-06, + "loss": 0.9858094453811646, + "step": 16382 + }, + { + "epoch": 2.9822517520706286, + "grad_norm": 8.875, + "learning_rate": 2.00071910697632e-06, + "loss": 1.1196403503417969, + "step": 16384 + }, + { + "epoch": 2.9826158186948213, + "grad_norm": 9.875, + "learning_rate": 2.000690346459431e-06, + "loss": 1.3953125476837158, + "step": 16386 + }, + { + "epoch": 2.9829798853190135, + "grad_norm": 6.8125, + "learning_rate": 2.0006621728250264e-06, + "loss": 1.5328149795532227, + "step": 16388 + }, + { + "epoch": 2.9833439519432057, + "grad_norm": 6.28125, + "learning_rate": 2.0006345860772395e-06, + "loss": 1.0095698833465576, + "step": 16390 + }, + { + "epoch": 2.983708018567398, + "grad_norm": 4.84375, + "learning_rate": 2.0006075862201195e-06, + "loss": 1.47171151638031, + "step": 16392 + }, + { + "epoch": 2.98407208519159, + "grad_norm": 8.8125, + "learning_rate": 2.000581173257628e-06, + "loss": 1.4703402519226074, + "step": 16394 + }, + { + "epoch": 2.9844361518157823, + "grad_norm": 16.5, + "learning_rate": 2.0005553471936413e-06, + "loss": 1.6649112701416016, + "step": 16396 + }, + { + "epoch": 2.9848002184399745, + "grad_norm": 13.875, + "learning_rate": 2.0005301080319485e-06, + "loss": 1.2723469734191895, + "step": 16398 + }, + { + "epoch": 2.9851642850641666, + "grad_norm": 9.5, + "learning_rate": 2.000505455776254e-06, + "loss": 1.138935923576355, + "step": 16400 + }, + { + "epoch": 2.985528351688359, + "grad_norm": 10.9375, + "learning_rate": 2.0004813904301756e-06, + "loss": 1.3505103588104248, + "step": 16402 + }, + { + "epoch": 2.985892418312551, + "grad_norm": 19.25, + "learning_rate": 2.0004579119972446e-06, + "loss": 1.2249739170074463, + "step": 16404 + }, + { + "epoch": 2.9862564849367432, + "grad_norm": 11.875, + "learning_rate": 2.0004350204809063e-06, + "loss": 0.6565002202987671, + "step": 16406 + }, + { + "epoch": 2.986620551560936, + "grad_norm": 16.75, + "learning_rate": 2.00041271588452e-06, + "loss": 0.2887282967567444, + "step": 16408 + }, + { + "epoch": 2.9869846181851276, + "grad_norm": 12.875, + "learning_rate": 2.000390998211358e-06, + "loss": 1.6883145570755005, + "step": 16410 + }, + { + "epoch": 2.9873486848093203, + "grad_norm": 22.25, + "learning_rate": 2.000369867464609e-06, + "loss": 1.104772686958313, + "step": 16412 + }, + { + "epoch": 2.9877127514335124, + "grad_norm": 9.0, + "learning_rate": 2.0003493236473725e-06, + "loss": 1.3531534671783447, + "step": 16414 + }, + { + "epoch": 2.9880768180577046, + "grad_norm": 7.46875, + "learning_rate": 2.000329366762663e-06, + "loss": 1.304721474647522, + "step": 16416 + }, + { + "epoch": 2.988440884681897, + "grad_norm": 23.875, + "learning_rate": 2.0003099968134104e-06, + "loss": 1.1761491298675537, + "step": 16418 + }, + { + "epoch": 2.988804951306089, + "grad_norm": 24.125, + "learning_rate": 2.0002912138024565e-06, + "loss": 1.5270304679870605, + "step": 16420 + }, + { + "epoch": 2.9891690179302812, + "grad_norm": 13.0625, + "learning_rate": 2.000273017732557e-06, + "loss": 1.0654093027114868, + "step": 16422 + }, + { + "epoch": 2.9895330845544734, + "grad_norm": 14.375, + "learning_rate": 2.000255408606383e-06, + "loss": 1.4028539657592773, + "step": 16424 + }, + { + "epoch": 2.9898971511786656, + "grad_norm": 16.375, + "learning_rate": 2.000238386426518e-06, + "loss": 1.2568185329437256, + "step": 16426 + }, + { + "epoch": 2.990261217802858, + "grad_norm": 14.0, + "learning_rate": 2.0002219511954605e-06, + "loss": 1.2188596725463867, + "step": 16428 + }, + { + "epoch": 2.99062528442705, + "grad_norm": 21.0, + "learning_rate": 2.000206102915622e-06, + "loss": 0.8418086767196655, + "step": 16430 + }, + { + "epoch": 2.990989351051242, + "grad_norm": 23.25, + "learning_rate": 2.000190841589328e-06, + "loss": 1.3837864398956299, + "step": 16432 + }, + { + "epoch": 2.991353417675435, + "grad_norm": 7.8125, + "learning_rate": 2.0001761672188182e-06, + "loss": 1.3582119941711426, + "step": 16434 + }, + { + "epoch": 2.9917174842996266, + "grad_norm": 5.0, + "learning_rate": 2.000162079806246e-06, + "loss": 1.3401474952697754, + "step": 16436 + }, + { + "epoch": 2.992081550923819, + "grad_norm": 15.375, + "learning_rate": 2.0001485793536785e-06, + "loss": 1.3087061643600464, + "step": 16438 + }, + { + "epoch": 2.9924456175480114, + "grad_norm": 11.5, + "learning_rate": 2.000135665863097e-06, + "loss": 1.4459304809570312, + "step": 16440 + }, + { + "epoch": 2.9928096841722036, + "grad_norm": 14.5625, + "learning_rate": 2.0001233393363968e-06, + "loss": 1.300643801689148, + "step": 16442 + }, + { + "epoch": 2.993173750796396, + "grad_norm": 11.9375, + "learning_rate": 2.0001115997753866e-06, + "loss": 1.3814854621887207, + "step": 16444 + }, + { + "epoch": 2.993537817420588, + "grad_norm": 25.125, + "learning_rate": 2.0001004471817887e-06, + "loss": 1.370299220085144, + "step": 16446 + }, + { + "epoch": 2.99390188404478, + "grad_norm": 24.5, + "learning_rate": 2.00008988155724e-06, + "loss": 1.9163622856140137, + "step": 16448 + }, + { + "epoch": 2.9942659506689724, + "grad_norm": 47.5, + "learning_rate": 2.0000799029032906e-06, + "loss": 1.4259893894195557, + "step": 16450 + }, + { + "epoch": 2.9946300172931646, + "grad_norm": 12.25, + "learning_rate": 2.0000705112214055e-06, + "loss": 1.2691893577575684, + "step": 16452 + }, + { + "epoch": 2.9949940839173568, + "grad_norm": 6.1875, + "learning_rate": 2.0000617065129626e-06, + "loss": 1.184010624885559, + "step": 16454 + }, + { + "epoch": 2.995358150541549, + "grad_norm": 6.4375, + "learning_rate": 2.000053488779254e-06, + "loss": 1.2077614068984985, + "step": 16456 + }, + { + "epoch": 2.995722217165741, + "grad_norm": 10.25, + "learning_rate": 2.000045858021486e-06, + "loss": 1.4405052661895752, + "step": 16458 + }, + { + "epoch": 2.996086283789934, + "grad_norm": 79.5, + "learning_rate": 2.0000388142407775e-06, + "loss": 1.3099960088729858, + "step": 16460 + }, + { + "epoch": 2.9964503504141256, + "grad_norm": 12.125, + "learning_rate": 2.0000323574381624e-06, + "loss": 0.7813707590103149, + "step": 16462 + }, + { + "epoch": 2.996814417038318, + "grad_norm": 5.8125, + "learning_rate": 2.0000264876145884e-06, + "loss": 1.3825950622558594, + "step": 16464 + }, + { + "epoch": 2.9971784836625104, + "grad_norm": 5.34375, + "learning_rate": 2.000021204770917e-06, + "loss": 1.2435548305511475, + "step": 16466 + }, + { + "epoch": 2.9975425502867026, + "grad_norm": 8.25, + "learning_rate": 2.0000165089079237e-06, + "loss": 1.1585209369659424, + "step": 16468 + }, + { + "epoch": 2.9979066169108948, + "grad_norm": 8.375, + "learning_rate": 2.0000124000262966e-06, + "loss": 1.3255114555358887, + "step": 16470 + }, + { + "epoch": 2.998270683535087, + "grad_norm": 12.875, + "learning_rate": 2.0000088781266396e-06, + "loss": 1.2500178813934326, + "step": 16472 + }, + { + "epoch": 2.998634750159279, + "grad_norm": 12.125, + "learning_rate": 2.000005943209469e-06, + "loss": 1.351958155632019, + "step": 16474 + }, + { + "epoch": 2.9989988167834714, + "grad_norm": 7.15625, + "learning_rate": 2.000003595275216e-06, + "loss": 1.1764051914215088, + "step": 16476 + }, + { + "epoch": 2.9993628834076635, + "grad_norm": 14.375, + "learning_rate": 2.0000018343242243e-06, + "loss": 1.8574588298797607, + "step": 16478 + }, + { + "epoch": 2.9997269500318557, + "grad_norm": 11.75, + "learning_rate": 2.000000660356753e-06, + "loss": 1.5543640851974487, + "step": 16480 + }, + { + "epoch": 3.0, + "grad_norm": 15.9375, + "learning_rate": 2.0000000733729745e-06, + "loss": 1.4083051681518555, + "step": 16482 + }, + { + "epoch": 3.0, + "step": 16482, + "total_flos": 3.229073396012679e+18, + "train_loss": 1.3400128969104237, + "train_runtime": 21051.3439, + "train_samples_per_second": 1.566, + "train_steps_per_second": 0.783 + } + ], + "logging_steps": 2, + "max_steps": 16482, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 9999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.229073396012679e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}