diff --git "a/checkpoint-17500/trainer_state.json" "b/checkpoint-17500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-17500/trainer_state.json" @@ -0,0 +1,122534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.870276367738297, + "eval_steps": 500, + "global_step": 17500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005640157924421884, + "grad_norm": 19.41988182067871, + "learning_rate": 5e-05, + "loss": 7.7198, + "step": 1 + }, + { + "epoch": 0.0011280315848843769, + "grad_norm": 18.848299026489258, + "learning_rate": 4.999717992103779e-05, + "loss": 7.366, + "step": 2 + }, + { + "epoch": 0.001692047377326565, + "grad_norm": 17.485559463500977, + "learning_rate": 4.999435984207558e-05, + "loss": 6.563, + "step": 3 + }, + { + "epoch": 0.0022560631697687537, + "grad_norm": 17.985149383544922, + "learning_rate": 4.999153976311337e-05, + "loss": 6.4823, + "step": 4 + }, + { + "epoch": 0.0028200789622109417, + "grad_norm": 13.97096061706543, + "learning_rate": 4.998871968415116e-05, + "loss": 6.2621, + "step": 5 + }, + { + "epoch": 0.00338409475465313, + "grad_norm": 10.492752075195312, + "learning_rate": 4.998589960518894e-05, + "loss": 5.9465, + "step": 6 + }, + { + "epoch": 0.003948110547095319, + "grad_norm": 9.915308952331543, + "learning_rate": 4.998307952622674e-05, + "loss": 5.4358, + "step": 7 + }, + { + "epoch": 0.0045121263395375075, + "grad_norm": 12.366442680358887, + "learning_rate": 4.998025944726453e-05, + "loss": 5.353, + "step": 8 + }, + { + "epoch": 0.005076142131979695, + "grad_norm": 11.515579223632812, + "learning_rate": 4.997743936830231e-05, + "loss": 5.4083, + "step": 9 + }, + { + "epoch": 0.0056401579244218835, + "grad_norm": 8.645106315612793, + "learning_rate": 4.9974619289340105e-05, + "loss": 5.0717, + "step": 10 + }, + { + "epoch": 0.006204173716864072, + "grad_norm": 7.880457878112793, + "learning_rate": 4.99717992103779e-05, + "loss": 5.4358, + "step": 11 + }, + { + "epoch": 0.00676818950930626, + "grad_norm": 8.207262992858887, + "learning_rate": 4.996897913141568e-05, + "loss": 4.6393, + "step": 12 + }, + { + "epoch": 0.007332205301748449, + "grad_norm": 8.118475914001465, + "learning_rate": 4.996615905245347e-05, + "loss": 5.0949, + "step": 13 + }, + { + "epoch": 0.007896221094190638, + "grad_norm": 8.418349266052246, + "learning_rate": 4.996333897349126e-05, + "loss": 5.2285, + "step": 14 + }, + { + "epoch": 0.008460236886632826, + "grad_norm": 8.138864517211914, + "learning_rate": 4.996051889452905e-05, + "loss": 4.7941, + "step": 15 + }, + { + "epoch": 0.009024252679075015, + "grad_norm": 7.882900238037109, + "learning_rate": 4.995769881556684e-05, + "loss": 4.8822, + "step": 16 + }, + { + "epoch": 0.009588268471517203, + "grad_norm": 9.206087112426758, + "learning_rate": 4.995487873660463e-05, + "loss": 5.0619, + "step": 17 + }, + { + "epoch": 0.01015228426395939, + "grad_norm": 11.654960632324219, + "learning_rate": 4.9952058657642415e-05, + "loss": 4.6107, + "step": 18 + }, + { + "epoch": 0.01071630005640158, + "grad_norm": 9.568180084228516, + "learning_rate": 4.994923857868021e-05, + "loss": 4.2661, + "step": 19 + }, + { + "epoch": 0.011280315848843767, + "grad_norm": 9.761383056640625, + "learning_rate": 4.994641849971799e-05, + "loss": 4.879, + "step": 20 + }, + { + "epoch": 0.011844331641285956, + "grad_norm": 7.187350273132324, + "learning_rate": 4.9943598420755785e-05, + "loss": 3.9344, + "step": 21 + }, + { + "epoch": 0.012408347433728144, + "grad_norm": 7.881505489349365, + "learning_rate": 4.994077834179357e-05, + "loss": 4.5831, + "step": 22 + }, + { + "epoch": 0.012972363226170333, + "grad_norm": 7.518311023712158, + "learning_rate": 4.993795826283136e-05, + "loss": 4.5082, + "step": 23 + }, + { + "epoch": 0.01353637901861252, + "grad_norm": 9.563621520996094, + "learning_rate": 4.993513818386915e-05, + "loss": 4.745, + "step": 24 + }, + { + "epoch": 0.01410039481105471, + "grad_norm": 8.299757957458496, + "learning_rate": 4.993231810490694e-05, + "loss": 4.6875, + "step": 25 + }, + { + "epoch": 0.014664410603496898, + "grad_norm": 7.765680313110352, + "learning_rate": 4.9929498025944726e-05, + "loss": 4.3203, + "step": 26 + }, + { + "epoch": 0.015228426395939087, + "grad_norm": 7.908237934112549, + "learning_rate": 4.992667794698252e-05, + "loss": 4.0889, + "step": 27 + }, + { + "epoch": 0.015792442188381276, + "grad_norm": 8.398841857910156, + "learning_rate": 4.992385786802031e-05, + "loss": 4.2154, + "step": 28 + }, + { + "epoch": 0.016356457980823462, + "grad_norm": 8.607576370239258, + "learning_rate": 4.9921037789058095e-05, + "loss": 4.0805, + "step": 29 + }, + { + "epoch": 0.01692047377326565, + "grad_norm": 7.623193740844727, + "learning_rate": 4.991821771009588e-05, + "loss": 4.1947, + "step": 30 + }, + { + "epoch": 0.01748448956570784, + "grad_norm": 8.176064491271973, + "learning_rate": 4.991539763113367e-05, + "loss": 4.494, + "step": 31 + }, + { + "epoch": 0.01804850535815003, + "grad_norm": 8.526585578918457, + "learning_rate": 4.9912577552171465e-05, + "loss": 4.1747, + "step": 32 + }, + { + "epoch": 0.018612521150592216, + "grad_norm": 7.092719078063965, + "learning_rate": 4.990975747320925e-05, + "loss": 4.3934, + "step": 33 + }, + { + "epoch": 0.019176536943034405, + "grad_norm": 7.451559066772461, + "learning_rate": 4.9906937394247036e-05, + "loss": 5.1707, + "step": 34 + }, + { + "epoch": 0.019740552735476594, + "grad_norm": 6.571346759796143, + "learning_rate": 4.9904117315284835e-05, + "loss": 3.7457, + "step": 35 + }, + { + "epoch": 0.02030456852791878, + "grad_norm": 6.961138725280762, + "learning_rate": 4.990129723632262e-05, + "loss": 4.1985, + "step": 36 + }, + { + "epoch": 0.02086858432036097, + "grad_norm": 8.064659118652344, + "learning_rate": 4.9898477157360406e-05, + "loss": 3.8052, + "step": 37 + }, + { + "epoch": 0.02143260011280316, + "grad_norm": 7.101634979248047, + "learning_rate": 4.98956570783982e-05, + "loss": 4.1799, + "step": 38 + }, + { + "epoch": 0.021996615905245348, + "grad_norm": 7.165801048278809, + "learning_rate": 4.989283699943599e-05, + "loss": 3.5669, + "step": 39 + }, + { + "epoch": 0.022560631697687534, + "grad_norm": 6.828774929046631, + "learning_rate": 4.9890016920473776e-05, + "loss": 4.4623, + "step": 40 + }, + { + "epoch": 0.023124647490129723, + "grad_norm": 7.77141809463501, + "learning_rate": 4.988719684151156e-05, + "loss": 3.9852, + "step": 41 + }, + { + "epoch": 0.023688663282571912, + "grad_norm": 6.432172775268555, + "learning_rate": 4.988437676254935e-05, + "loss": 3.8468, + "step": 42 + }, + { + "epoch": 0.024252679075014102, + "grad_norm": 8.539366722106934, + "learning_rate": 4.9881556683587145e-05, + "loss": 3.4945, + "step": 43 + }, + { + "epoch": 0.024816694867456288, + "grad_norm": 6.945478439331055, + "learning_rate": 4.987873660462493e-05, + "loss": 4.0193, + "step": 44 + }, + { + "epoch": 0.025380710659898477, + "grad_norm": 8.223091125488281, + "learning_rate": 4.9875916525662716e-05, + "loss": 4.0442, + "step": 45 + }, + { + "epoch": 0.025944726452340666, + "grad_norm": 6.831948280334473, + "learning_rate": 4.9873096446700515e-05, + "loss": 3.8699, + "step": 46 + }, + { + "epoch": 0.026508742244782856, + "grad_norm": 7.637156963348389, + "learning_rate": 4.98702763677383e-05, + "loss": 4.0018, + "step": 47 + }, + { + "epoch": 0.02707275803722504, + "grad_norm": 7.977843761444092, + "learning_rate": 4.9867456288776086e-05, + "loss": 3.542, + "step": 48 + }, + { + "epoch": 0.02763677382966723, + "grad_norm": 9.195748329162598, + "learning_rate": 4.986463620981388e-05, + "loss": 4.7686, + "step": 49 + }, + { + "epoch": 0.02820078962210942, + "grad_norm": 6.571633338928223, + "learning_rate": 4.986181613085167e-05, + "loss": 4.059, + "step": 50 + }, + { + "epoch": 0.028764805414551606, + "grad_norm": 7.068306922912598, + "learning_rate": 4.9858996051889456e-05, + "loss": 4.0327, + "step": 51 + }, + { + "epoch": 0.029328821206993795, + "grad_norm": 6.370657920837402, + "learning_rate": 4.985617597292724e-05, + "loss": 3.6922, + "step": 52 + }, + { + "epoch": 0.029892836999435984, + "grad_norm": 6.706864833831787, + "learning_rate": 4.985335589396503e-05, + "loss": 4.0669, + "step": 53 + }, + { + "epoch": 0.030456852791878174, + "grad_norm": 7.130467414855957, + "learning_rate": 4.9850535815002826e-05, + "loss": 4.2357, + "step": 54 + }, + { + "epoch": 0.03102086858432036, + "grad_norm": 8.897139549255371, + "learning_rate": 4.984771573604061e-05, + "loss": 3.8955, + "step": 55 + }, + { + "epoch": 0.03158488437676255, + "grad_norm": 6.400907039642334, + "learning_rate": 4.98448956570784e-05, + "loss": 3.6367, + "step": 56 + }, + { + "epoch": 0.032148900169204735, + "grad_norm": 6.772108554840088, + "learning_rate": 4.984207557811619e-05, + "loss": 3.7469, + "step": 57 + }, + { + "epoch": 0.032712915961646924, + "grad_norm": 6.177105903625488, + "learning_rate": 4.983925549915398e-05, + "loss": 3.5711, + "step": 58 + }, + { + "epoch": 0.03327693175408911, + "grad_norm": 6.6000237464904785, + "learning_rate": 4.9836435420191766e-05, + "loss": 4.3874, + "step": 59 + }, + { + "epoch": 0.0338409475465313, + "grad_norm": 7.6974029541015625, + "learning_rate": 4.983361534122956e-05, + "loss": 4.3898, + "step": 60 + }, + { + "epoch": 0.03440496333897349, + "grad_norm": 6.430440425872803, + "learning_rate": 4.9830795262267344e-05, + "loss": 3.4487, + "step": 61 + }, + { + "epoch": 0.03496897913141568, + "grad_norm": 6.948535442352295, + "learning_rate": 4.9827975183305136e-05, + "loss": 3.9136, + "step": 62 + }, + { + "epoch": 0.03553299492385787, + "grad_norm": 6.744254112243652, + "learning_rate": 4.982515510434292e-05, + "loss": 4.0194, + "step": 63 + }, + { + "epoch": 0.03609701071630006, + "grad_norm": 6.421425819396973, + "learning_rate": 4.9822335025380714e-05, + "loss": 3.7355, + "step": 64 + }, + { + "epoch": 0.03666102650874224, + "grad_norm": 8.99809455871582, + "learning_rate": 4.98195149464185e-05, + "loss": 3.8445, + "step": 65 + }, + { + "epoch": 0.03722504230118443, + "grad_norm": 8.989853858947754, + "learning_rate": 4.981669486745629e-05, + "loss": 4.0861, + "step": 66 + }, + { + "epoch": 0.03778905809362662, + "grad_norm": 5.8741021156311035, + "learning_rate": 4.981387478849408e-05, + "loss": 3.3569, + "step": 67 + }, + { + "epoch": 0.03835307388606881, + "grad_norm": 7.43571138381958, + "learning_rate": 4.981105470953187e-05, + "loss": 3.3106, + "step": 68 + }, + { + "epoch": 0.038917089678511, + "grad_norm": 7.607334136962891, + "learning_rate": 4.9808234630569654e-05, + "loss": 3.5154, + "step": 69 + }, + { + "epoch": 0.03948110547095319, + "grad_norm": 7.31130838394165, + "learning_rate": 4.9805414551607446e-05, + "loss": 3.9599, + "step": 70 + }, + { + "epoch": 0.04004512126339538, + "grad_norm": 7.973847389221191, + "learning_rate": 4.980259447264524e-05, + "loss": 3.5066, + "step": 71 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 6.666601181030273, + "learning_rate": 4.9799774393683024e-05, + "loss": 3.1606, + "step": 72 + }, + { + "epoch": 0.04117315284827975, + "grad_norm": 7.157909393310547, + "learning_rate": 4.9796954314720816e-05, + "loss": 3.6849, + "step": 73 + }, + { + "epoch": 0.04173716864072194, + "grad_norm": 6.387709140777588, + "learning_rate": 4.979413423575861e-05, + "loss": 3.0423, + "step": 74 + }, + { + "epoch": 0.04230118443316413, + "grad_norm": 7.730493068695068, + "learning_rate": 4.9791314156796394e-05, + "loss": 3.8421, + "step": 75 + }, + { + "epoch": 0.04286520022560632, + "grad_norm": 7.446045875549316, + "learning_rate": 4.978849407783418e-05, + "loss": 3.5639, + "step": 76 + }, + { + "epoch": 0.04342921601804851, + "grad_norm": 7.039213180541992, + "learning_rate": 4.978567399887197e-05, + "loss": 3.6286, + "step": 77 + }, + { + "epoch": 0.043993231810490696, + "grad_norm": 8.242051124572754, + "learning_rate": 4.9782853919909763e-05, + "loss": 3.9716, + "step": 78 + }, + { + "epoch": 0.044557247602932885, + "grad_norm": 6.042781352996826, + "learning_rate": 4.978003384094755e-05, + "loss": 3.326, + "step": 79 + }, + { + "epoch": 0.04512126339537507, + "grad_norm": 8.139811515808105, + "learning_rate": 4.9777213761985334e-05, + "loss": 3.5218, + "step": 80 + }, + { + "epoch": 0.04568527918781726, + "grad_norm": 7.764042854309082, + "learning_rate": 4.9774393683023126e-05, + "loss": 4.0708, + "step": 81 + }, + { + "epoch": 0.046249294980259446, + "grad_norm": 7.409045696258545, + "learning_rate": 4.977157360406092e-05, + "loss": 3.8121, + "step": 82 + }, + { + "epoch": 0.046813310772701636, + "grad_norm": 10.04455280303955, + "learning_rate": 4.9768753525098704e-05, + "loss": 3.5646, + "step": 83 + }, + { + "epoch": 0.047377326565143825, + "grad_norm": 7.190754413604736, + "learning_rate": 4.976593344613649e-05, + "loss": 3.654, + "step": 84 + }, + { + "epoch": 0.047941342357586014, + "grad_norm": 7.537837505340576, + "learning_rate": 4.976311336717429e-05, + "loss": 3.539, + "step": 85 + }, + { + "epoch": 0.048505358150028204, + "grad_norm": 6.887396812438965, + "learning_rate": 4.9760293288212074e-05, + "loss": 3.2265, + "step": 86 + }, + { + "epoch": 0.049069373942470386, + "grad_norm": 9.217448234558105, + "learning_rate": 4.975747320924986e-05, + "loss": 3.7489, + "step": 87 + }, + { + "epoch": 0.049633389734912575, + "grad_norm": 7.252974033355713, + "learning_rate": 4.975465313028765e-05, + "loss": 3.3808, + "step": 88 + }, + { + "epoch": 0.050197405527354765, + "grad_norm": 6.819150924682617, + "learning_rate": 4.9751833051325444e-05, + "loss": 3.1852, + "step": 89 + }, + { + "epoch": 0.050761421319796954, + "grad_norm": 8.281214714050293, + "learning_rate": 4.974901297236323e-05, + "loss": 3.6556, + "step": 90 + }, + { + "epoch": 0.05132543711223914, + "grad_norm": 7.091026782989502, + "learning_rate": 4.9746192893401014e-05, + "loss": 3.219, + "step": 91 + }, + { + "epoch": 0.05188945290468133, + "grad_norm": 6.970754146575928, + "learning_rate": 4.9743372814438807e-05, + "loss": 3.4041, + "step": 92 + }, + { + "epoch": 0.05245346869712352, + "grad_norm": 7.223953723907471, + "learning_rate": 4.97405527354766e-05, + "loss": 3.2998, + "step": 93 + }, + { + "epoch": 0.05301748448956571, + "grad_norm": 6.865255355834961, + "learning_rate": 4.9737732656514384e-05, + "loss": 3.2498, + "step": 94 + }, + { + "epoch": 0.05358150028200789, + "grad_norm": 8.02442455291748, + "learning_rate": 4.973491257755217e-05, + "loss": 3.5172, + "step": 95 + }, + { + "epoch": 0.05414551607445008, + "grad_norm": 7.133455276489258, + "learning_rate": 4.973209249858996e-05, + "loss": 3.0543, + "step": 96 + }, + { + "epoch": 0.05470953186689227, + "grad_norm": 7.497572898864746, + "learning_rate": 4.9729272419627754e-05, + "loss": 3.3935, + "step": 97 + }, + { + "epoch": 0.05527354765933446, + "grad_norm": 6.03790283203125, + "learning_rate": 4.972645234066554e-05, + "loss": 3.2787, + "step": 98 + }, + { + "epoch": 0.05583756345177665, + "grad_norm": 6.226919174194336, + "learning_rate": 4.972363226170333e-05, + "loss": 2.7267, + "step": 99 + }, + { + "epoch": 0.05640157924421884, + "grad_norm": 7.469327449798584, + "learning_rate": 4.972081218274112e-05, + "loss": 3.0223, + "step": 100 + }, + { + "epoch": 0.05696559503666103, + "grad_norm": 7.465392112731934, + "learning_rate": 4.971799210377891e-05, + "loss": 3.5359, + "step": 101 + }, + { + "epoch": 0.05752961082910321, + "grad_norm": 8.025592803955078, + "learning_rate": 4.9715172024816695e-05, + "loss": 3.0917, + "step": 102 + }, + { + "epoch": 0.0580936266215454, + "grad_norm": 6.866366863250732, + "learning_rate": 4.971235194585449e-05, + "loss": 3.0212, + "step": 103 + }, + { + "epoch": 0.05865764241398759, + "grad_norm": 7.218946933746338, + "learning_rate": 4.970953186689227e-05, + "loss": 3.1704, + "step": 104 + }, + { + "epoch": 0.05922165820642978, + "grad_norm": 6.850453853607178, + "learning_rate": 4.9706711787930064e-05, + "loss": 3.3614, + "step": 105 + }, + { + "epoch": 0.05978567399887197, + "grad_norm": 7.06443977355957, + "learning_rate": 4.9703891708967857e-05, + "loss": 2.9395, + "step": 106 + }, + { + "epoch": 0.06034968979131416, + "grad_norm": 7.439785957336426, + "learning_rate": 4.970107163000564e-05, + "loss": 3.5035, + "step": 107 + }, + { + "epoch": 0.06091370558375635, + "grad_norm": 8.692646980285645, + "learning_rate": 4.9698251551043434e-05, + "loss": 3.5351, + "step": 108 + }, + { + "epoch": 0.06147772137619854, + "grad_norm": 6.755487442016602, + "learning_rate": 4.969543147208122e-05, + "loss": 2.559, + "step": 109 + }, + { + "epoch": 0.06204173716864072, + "grad_norm": 6.973780155181885, + "learning_rate": 4.969261139311901e-05, + "loss": 3.4044, + "step": 110 + }, + { + "epoch": 0.06260575296108291, + "grad_norm": 7.211248874664307, + "learning_rate": 4.96897913141568e-05, + "loss": 2.5415, + "step": 111 + }, + { + "epoch": 0.0631697687535251, + "grad_norm": 6.568201065063477, + "learning_rate": 4.968697123519459e-05, + "loss": 3.06, + "step": 112 + }, + { + "epoch": 0.06373378454596729, + "grad_norm": 7.562126159667969, + "learning_rate": 4.9684151156232375e-05, + "loss": 3.2664, + "step": 113 + }, + { + "epoch": 0.06429780033840947, + "grad_norm": 8.446998596191406, + "learning_rate": 4.968133107727017e-05, + "loss": 3.3757, + "step": 114 + }, + { + "epoch": 0.06486181613085167, + "grad_norm": 7.208504676818848, + "learning_rate": 4.967851099830795e-05, + "loss": 3.0984, + "step": 115 + }, + { + "epoch": 0.06542583192329385, + "grad_norm": 7.120885372161865, + "learning_rate": 4.9675690919345744e-05, + "loss": 2.838, + "step": 116 + }, + { + "epoch": 0.06598984771573604, + "grad_norm": 7.01420783996582, + "learning_rate": 4.967287084038354e-05, + "loss": 3.1304, + "step": 117 + }, + { + "epoch": 0.06655386350817823, + "grad_norm": 6.882701873779297, + "learning_rate": 4.967005076142132e-05, + "loss": 3.151, + "step": 118 + }, + { + "epoch": 0.06711787930062042, + "grad_norm": 7.189032077789307, + "learning_rate": 4.966723068245911e-05, + "loss": 2.8668, + "step": 119 + }, + { + "epoch": 0.0676818950930626, + "grad_norm": 6.818087100982666, + "learning_rate": 4.96644106034969e-05, + "loss": 3.67, + "step": 120 + }, + { + "epoch": 0.06824591088550479, + "grad_norm": 7.581429481506348, + "learning_rate": 4.966159052453469e-05, + "loss": 2.9997, + "step": 121 + }, + { + "epoch": 0.06880992667794698, + "grad_norm": 6.835054397583008, + "learning_rate": 4.965877044557248e-05, + "loss": 2.818, + "step": 122 + }, + { + "epoch": 0.06937394247038917, + "grad_norm": 6.6179890632629395, + "learning_rate": 4.965595036661026e-05, + "loss": 2.7862, + "step": 123 + }, + { + "epoch": 0.06993795826283136, + "grad_norm": 6.652092456817627, + "learning_rate": 4.965313028764806e-05, + "loss": 2.9158, + "step": 124 + }, + { + "epoch": 0.07050197405527354, + "grad_norm": 6.6767096519470215, + "learning_rate": 4.965031020868585e-05, + "loss": 3.0779, + "step": 125 + }, + { + "epoch": 0.07106598984771574, + "grad_norm": 7.382199287414551, + "learning_rate": 4.964749012972363e-05, + "loss": 2.8173, + "step": 126 + }, + { + "epoch": 0.07163000564015792, + "grad_norm": 6.068853378295898, + "learning_rate": 4.9644670050761425e-05, + "loss": 2.4025, + "step": 127 + }, + { + "epoch": 0.07219402143260012, + "grad_norm": 7.232067584991455, + "learning_rate": 4.964184997179922e-05, + "loss": 3.1752, + "step": 128 + }, + { + "epoch": 0.0727580372250423, + "grad_norm": 7.2300543785095215, + "learning_rate": 4.9639029892837e-05, + "loss": 3.1188, + "step": 129 + }, + { + "epoch": 0.07332205301748448, + "grad_norm": 7.587802886962891, + "learning_rate": 4.963620981387479e-05, + "loss": 3.0744, + "step": 130 + }, + { + "epoch": 0.07388606880992668, + "grad_norm": 7.076419353485107, + "learning_rate": 4.963338973491258e-05, + "loss": 3.0406, + "step": 131 + }, + { + "epoch": 0.07445008460236886, + "grad_norm": 6.808330059051514, + "learning_rate": 4.963056965595037e-05, + "loss": 3.1644, + "step": 132 + }, + { + "epoch": 0.07501410039481106, + "grad_norm": 6.956964492797852, + "learning_rate": 4.962774957698816e-05, + "loss": 2.7242, + "step": 133 + }, + { + "epoch": 0.07557811618725324, + "grad_norm": 6.930006980895996, + "learning_rate": 4.962492949802594e-05, + "loss": 2.3826, + "step": 134 + }, + { + "epoch": 0.07614213197969544, + "grad_norm": 6.602227210998535, + "learning_rate": 4.9622109419063735e-05, + "loss": 2.8111, + "step": 135 + }, + { + "epoch": 0.07670614777213762, + "grad_norm": 6.013307571411133, + "learning_rate": 4.961928934010153e-05, + "loss": 2.4956, + "step": 136 + }, + { + "epoch": 0.0772701635645798, + "grad_norm": 7.099822521209717, + "learning_rate": 4.961646926113931e-05, + "loss": 3.2416, + "step": 137 + }, + { + "epoch": 0.077834179357022, + "grad_norm": 6.321027755737305, + "learning_rate": 4.9613649182177105e-05, + "loss": 2.1323, + "step": 138 + }, + { + "epoch": 0.07839819514946418, + "grad_norm": 6.30959939956665, + "learning_rate": 4.961082910321489e-05, + "loss": 2.3846, + "step": 139 + }, + { + "epoch": 0.07896221094190638, + "grad_norm": 6.858240604400635, + "learning_rate": 4.960800902425268e-05, + "loss": 3.2196, + "step": 140 + }, + { + "epoch": 0.07952622673434856, + "grad_norm": 8.24159049987793, + "learning_rate": 4.960518894529047e-05, + "loss": 3.0505, + "step": 141 + }, + { + "epoch": 0.08009024252679076, + "grad_norm": 7.103549480438232, + "learning_rate": 4.960236886632826e-05, + "loss": 2.5429, + "step": 142 + }, + { + "epoch": 0.08065425831923294, + "grad_norm": 6.565817832946777, + "learning_rate": 4.959954878736605e-05, + "loss": 3.2179, + "step": 143 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 7.656014919281006, + "learning_rate": 4.959672870840384e-05, + "loss": 3.2248, + "step": 144 + }, + { + "epoch": 0.08178228990411732, + "grad_norm": 6.323434352874756, + "learning_rate": 4.959390862944163e-05, + "loss": 2.8096, + "step": 145 + }, + { + "epoch": 0.0823463056965595, + "grad_norm": 6.862704753875732, + "learning_rate": 4.9591088550479415e-05, + "loss": 3.2072, + "step": 146 + }, + { + "epoch": 0.0829103214890017, + "grad_norm": 6.898926734924316, + "learning_rate": 4.958826847151721e-05, + "loss": 3.2767, + "step": 147 + }, + { + "epoch": 0.08347433728144388, + "grad_norm": 6.9346842765808105, + "learning_rate": 4.958544839255499e-05, + "loss": 2.5067, + "step": 148 + }, + { + "epoch": 0.08403835307388607, + "grad_norm": 7.560790538787842, + "learning_rate": 4.9582628313592785e-05, + "loss": 2.4656, + "step": 149 + }, + { + "epoch": 0.08460236886632826, + "grad_norm": 6.444140911102295, + "learning_rate": 4.957980823463057e-05, + "loss": 2.6816, + "step": 150 + }, + { + "epoch": 0.08516638465877044, + "grad_norm": 6.7115349769592285, + "learning_rate": 4.957698815566836e-05, + "loss": 3.2669, + "step": 151 + }, + { + "epoch": 0.08573040045121263, + "grad_norm": 5.732504844665527, + "learning_rate": 4.957416807670615e-05, + "loss": 2.0262, + "step": 152 + }, + { + "epoch": 0.08629441624365482, + "grad_norm": 6.591984748840332, + "learning_rate": 4.957134799774394e-05, + "loss": 2.5945, + "step": 153 + }, + { + "epoch": 0.08685843203609701, + "grad_norm": 6.478238105773926, + "learning_rate": 4.9568527918781726e-05, + "loss": 2.4902, + "step": 154 + }, + { + "epoch": 0.0874224478285392, + "grad_norm": 6.9311394691467285, + "learning_rate": 4.956570783981952e-05, + "loss": 3.1725, + "step": 155 + }, + { + "epoch": 0.08798646362098139, + "grad_norm": 6.635754108428955, + "learning_rate": 4.956288776085731e-05, + "loss": 2.7506, + "step": 156 + }, + { + "epoch": 0.08855047941342357, + "grad_norm": 6.895791053771973, + "learning_rate": 4.9560067681895095e-05, + "loss": 2.9627, + "step": 157 + }, + { + "epoch": 0.08911449520586577, + "grad_norm": 7.148859977722168, + "learning_rate": 4.955724760293288e-05, + "loss": 2.7193, + "step": 158 + }, + { + "epoch": 0.08967851099830795, + "grad_norm": 6.903415679931641, + "learning_rate": 4.955442752397067e-05, + "loss": 2.977, + "step": 159 + }, + { + "epoch": 0.09024252679075014, + "grad_norm": 5.954835891723633, + "learning_rate": 4.9551607445008465e-05, + "loss": 2.9334, + "step": 160 + }, + { + "epoch": 0.09080654258319233, + "grad_norm": 7.816580295562744, + "learning_rate": 4.954878736604625e-05, + "loss": 2.8872, + "step": 161 + }, + { + "epoch": 0.09137055837563451, + "grad_norm": 6.633319854736328, + "learning_rate": 4.9545967287084036e-05, + "loss": 2.9701, + "step": 162 + }, + { + "epoch": 0.09193457416807671, + "grad_norm": 11.131697654724121, + "learning_rate": 4.9543147208121835e-05, + "loss": 2.1847, + "step": 163 + }, + { + "epoch": 0.09249858996051889, + "grad_norm": 6.046445846557617, + "learning_rate": 4.954032712915962e-05, + "loss": 2.3769, + "step": 164 + }, + { + "epoch": 0.09306260575296109, + "grad_norm": 6.825990676879883, + "learning_rate": 4.9537507050197406e-05, + "loss": 2.4738, + "step": 165 + }, + { + "epoch": 0.09362662154540327, + "grad_norm": 6.133873462677002, + "learning_rate": 4.95346869712352e-05, + "loss": 2.5374, + "step": 166 + }, + { + "epoch": 0.09419063733784545, + "grad_norm": 6.055384159088135, + "learning_rate": 4.953186689227299e-05, + "loss": 1.8559, + "step": 167 + }, + { + "epoch": 0.09475465313028765, + "grad_norm": 7.923935413360596, + "learning_rate": 4.9529046813310775e-05, + "loss": 2.7421, + "step": 168 + }, + { + "epoch": 0.09531866892272983, + "grad_norm": 6.359164237976074, + "learning_rate": 4.952622673434856e-05, + "loss": 2.5317, + "step": 169 + }, + { + "epoch": 0.09588268471517203, + "grad_norm": 5.742668151855469, + "learning_rate": 4.952340665538635e-05, + "loss": 2.5628, + "step": 170 + }, + { + "epoch": 0.09644670050761421, + "grad_norm": 7.499442100524902, + "learning_rate": 4.9520586576424145e-05, + "loss": 2.5789, + "step": 171 + }, + { + "epoch": 0.09701071630005641, + "grad_norm": 5.675538063049316, + "learning_rate": 4.951776649746193e-05, + "loss": 2.9348, + "step": 172 + }, + { + "epoch": 0.09757473209249859, + "grad_norm": 6.202669620513916, + "learning_rate": 4.9514946418499716e-05, + "loss": 2.6999, + "step": 173 + }, + { + "epoch": 0.09813874788494077, + "grad_norm": 4.760848522186279, + "learning_rate": 4.951212633953751e-05, + "loss": 2.0329, + "step": 174 + }, + { + "epoch": 0.09870276367738297, + "grad_norm": 6.9251790046691895, + "learning_rate": 4.95093062605753e-05, + "loss": 2.5949, + "step": 175 + }, + { + "epoch": 0.09926677946982515, + "grad_norm": 6.4343180656433105, + "learning_rate": 4.9506486181613086e-05, + "loss": 2.9485, + "step": 176 + }, + { + "epoch": 0.09983079526226735, + "grad_norm": 7.334808826446533, + "learning_rate": 4.950366610265088e-05, + "loss": 2.4718, + "step": 177 + }, + { + "epoch": 0.10039481105470953, + "grad_norm": 6.532999515533447, + "learning_rate": 4.950084602368867e-05, + "loss": 2.7785, + "step": 178 + }, + { + "epoch": 0.10095882684715173, + "grad_norm": 5.271857261657715, + "learning_rate": 4.9498025944726456e-05, + "loss": 2.2051, + "step": 179 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 6.05438756942749, + "learning_rate": 4.949520586576424e-05, + "loss": 2.1704, + "step": 180 + }, + { + "epoch": 0.1020868584320361, + "grad_norm": 6.799954891204834, + "learning_rate": 4.949238578680203e-05, + "loss": 3.4036, + "step": 181 + }, + { + "epoch": 0.10265087422447829, + "grad_norm": 6.387793064117432, + "learning_rate": 4.9489565707839825e-05, + "loss": 2.5425, + "step": 182 + }, + { + "epoch": 0.10321489001692047, + "grad_norm": 6.152683734893799, + "learning_rate": 4.948674562887761e-05, + "loss": 2.465, + "step": 183 + }, + { + "epoch": 0.10377890580936266, + "grad_norm": 8.368980407714844, + "learning_rate": 4.94839255499154e-05, + "loss": 2.3773, + "step": 184 + }, + { + "epoch": 0.10434292160180485, + "grad_norm": 5.44979190826416, + "learning_rate": 4.948110547095319e-05, + "loss": 2.023, + "step": 185 + }, + { + "epoch": 0.10490693739424704, + "grad_norm": 5.823932647705078, + "learning_rate": 4.947828539199098e-05, + "loss": 2.6851, + "step": 186 + }, + { + "epoch": 0.10547095318668923, + "grad_norm": 8.459972381591797, + "learning_rate": 4.9475465313028766e-05, + "loss": 2.3618, + "step": 187 + }, + { + "epoch": 0.10603496897913142, + "grad_norm": 6.517510414123535, + "learning_rate": 4.947264523406656e-05, + "loss": 2.6396, + "step": 188 + }, + { + "epoch": 0.1065989847715736, + "grad_norm": 7.0371575355529785, + "learning_rate": 4.9469825155104344e-05, + "loss": 2.9449, + "step": 189 + }, + { + "epoch": 0.10716300056401579, + "grad_norm": 6.643599987030029, + "learning_rate": 4.9467005076142136e-05, + "loss": 2.1484, + "step": 190 + }, + { + "epoch": 0.10772701635645798, + "grad_norm": 5.943167209625244, + "learning_rate": 4.946418499717992e-05, + "loss": 2.6899, + "step": 191 + }, + { + "epoch": 0.10829103214890017, + "grad_norm": 6.968503475189209, + "learning_rate": 4.946136491821771e-05, + "loss": 2.539, + "step": 192 + }, + { + "epoch": 0.10885504794134236, + "grad_norm": 5.977431774139404, + "learning_rate": 4.94585448392555e-05, + "loss": 2.6134, + "step": 193 + }, + { + "epoch": 0.10941906373378454, + "grad_norm": 6.545126914978027, + "learning_rate": 4.945572476029329e-05, + "loss": 2.722, + "step": 194 + }, + { + "epoch": 0.10998307952622674, + "grad_norm": 6.582836151123047, + "learning_rate": 4.945290468133108e-05, + "loss": 2.775, + "step": 195 + }, + { + "epoch": 0.11054709531866892, + "grad_norm": 7.134071350097656, + "learning_rate": 4.945008460236887e-05, + "loss": 2.437, + "step": 196 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 6.161172389984131, + "learning_rate": 4.9447264523406654e-05, + "loss": 2.655, + "step": 197 + }, + { + "epoch": 0.1116751269035533, + "grad_norm": 7.216391086578369, + "learning_rate": 4.9444444444444446e-05, + "loss": 2.9699, + "step": 198 + }, + { + "epoch": 0.11223914269599548, + "grad_norm": 5.362102508544922, + "learning_rate": 4.944162436548224e-05, + "loss": 2.403, + "step": 199 + }, + { + "epoch": 0.11280315848843768, + "grad_norm": 6.173458576202393, + "learning_rate": 4.9438804286520024e-05, + "loss": 2.7469, + "step": 200 + }, + { + "epoch": 0.11336717428087986, + "grad_norm": 5.570171356201172, + "learning_rate": 4.943598420755781e-05, + "loss": 2.3885, + "step": 201 + }, + { + "epoch": 0.11393119007332206, + "grad_norm": 7.0714430809021, + "learning_rate": 4.943316412859561e-05, + "loss": 3.0837, + "step": 202 + }, + { + "epoch": 0.11449520586576424, + "grad_norm": 5.620211601257324, + "learning_rate": 4.9430344049633394e-05, + "loss": 2.6542, + "step": 203 + }, + { + "epoch": 0.11505922165820642, + "grad_norm": 6.805936336517334, + "learning_rate": 4.942752397067118e-05, + "loss": 2.5025, + "step": 204 + }, + { + "epoch": 0.11562323745064862, + "grad_norm": 5.7327351570129395, + "learning_rate": 4.942470389170897e-05, + "loss": 2.1123, + "step": 205 + }, + { + "epoch": 0.1161872532430908, + "grad_norm": 6.174817085266113, + "learning_rate": 4.942188381274676e-05, + "loss": 2.4319, + "step": 206 + }, + { + "epoch": 0.116751269035533, + "grad_norm": 7.045943260192871, + "learning_rate": 4.941906373378455e-05, + "loss": 2.8551, + "step": 207 + }, + { + "epoch": 0.11731528482797518, + "grad_norm": 6.091428756713867, + "learning_rate": 4.9416243654822334e-05, + "loss": 2.2801, + "step": 208 + }, + { + "epoch": 0.11787930062041738, + "grad_norm": 7.4420084953308105, + "learning_rate": 4.9413423575860126e-05, + "loss": 2.4778, + "step": 209 + }, + { + "epoch": 0.11844331641285956, + "grad_norm": 5.4205851554870605, + "learning_rate": 4.941060349689792e-05, + "loss": 2.8442, + "step": 210 + }, + { + "epoch": 0.11900733220530176, + "grad_norm": 5.828869819641113, + "learning_rate": 4.9407783417935704e-05, + "loss": 1.8828, + "step": 211 + }, + { + "epoch": 0.11957134799774394, + "grad_norm": 6.077840805053711, + "learning_rate": 4.940496333897349e-05, + "loss": 1.9694, + "step": 212 + }, + { + "epoch": 0.12013536379018612, + "grad_norm": 5.31550931930542, + "learning_rate": 4.940214326001129e-05, + "loss": 1.9734, + "step": 213 + }, + { + "epoch": 0.12069937958262832, + "grad_norm": 6.302183628082275, + "learning_rate": 4.9399323181049074e-05, + "loss": 2.7354, + "step": 214 + }, + { + "epoch": 0.1212633953750705, + "grad_norm": 5.8508381843566895, + "learning_rate": 4.939650310208686e-05, + "loss": 2.3738, + "step": 215 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 7.492915153503418, + "learning_rate": 4.939368302312465e-05, + "loss": 2.9625, + "step": 216 + }, + { + "epoch": 0.12239142695995488, + "grad_norm": 5.432050704956055, + "learning_rate": 4.9390862944162443e-05, + "loss": 2.1506, + "step": 217 + }, + { + "epoch": 0.12295544275239707, + "grad_norm": 7.9324750900268555, + "learning_rate": 4.938804286520023e-05, + "loss": 3.0948, + "step": 218 + }, + { + "epoch": 0.12351945854483926, + "grad_norm": 5.646500110626221, + "learning_rate": 4.9385222786238014e-05, + "loss": 1.9823, + "step": 219 + }, + { + "epoch": 0.12408347433728144, + "grad_norm": 6.131877422332764, + "learning_rate": 4.9382402707275806e-05, + "loss": 2.7335, + "step": 220 + }, + { + "epoch": 0.12464749012972363, + "grad_norm": 5.27899169921875, + "learning_rate": 4.93795826283136e-05, + "loss": 1.9472, + "step": 221 + }, + { + "epoch": 0.12521150592216582, + "grad_norm": 4.621928691864014, + "learning_rate": 4.9376762549351384e-05, + "loss": 1.8764, + "step": 222 + }, + { + "epoch": 0.125775521714608, + "grad_norm": 5.350039482116699, + "learning_rate": 4.9373942470389176e-05, + "loss": 2.1432, + "step": 223 + }, + { + "epoch": 0.1263395375070502, + "grad_norm": 7.557504177093506, + "learning_rate": 4.937112239142696e-05, + "loss": 2.6498, + "step": 224 + }, + { + "epoch": 0.12690355329949238, + "grad_norm": 6.323827743530273, + "learning_rate": 4.9368302312464754e-05, + "loss": 2.5711, + "step": 225 + }, + { + "epoch": 0.12746756909193457, + "grad_norm": 6.85659122467041, + "learning_rate": 4.936548223350254e-05, + "loss": 2.8828, + "step": 226 + }, + { + "epoch": 0.12803158488437677, + "grad_norm": 6.510000705718994, + "learning_rate": 4.936266215454033e-05, + "loss": 2.9148, + "step": 227 + }, + { + "epoch": 0.12859560067681894, + "grad_norm": 5.367193222045898, + "learning_rate": 4.935984207557812e-05, + "loss": 2.7445, + "step": 228 + }, + { + "epoch": 0.12915961646926113, + "grad_norm": 5.197378635406494, + "learning_rate": 4.935702199661591e-05, + "loss": 1.9463, + "step": 229 + }, + { + "epoch": 0.12972363226170333, + "grad_norm": 6.823288440704346, + "learning_rate": 4.9354201917653694e-05, + "loss": 2.3726, + "step": 230 + }, + { + "epoch": 0.13028764805414553, + "grad_norm": 6.158012866973877, + "learning_rate": 4.9351381838691487e-05, + "loss": 2.5113, + "step": 231 + }, + { + "epoch": 0.1308516638465877, + "grad_norm": 6.248989105224609, + "learning_rate": 4.934856175972927e-05, + "loss": 2.0928, + "step": 232 + }, + { + "epoch": 0.1314156796390299, + "grad_norm": 5.456482410430908, + "learning_rate": 4.9345741680767064e-05, + "loss": 2.5029, + "step": 233 + }, + { + "epoch": 0.1319796954314721, + "grad_norm": 5.32204532623291, + "learning_rate": 4.9342921601804856e-05, + "loss": 2.3078, + "step": 234 + }, + { + "epoch": 0.13254371122391426, + "grad_norm": 5.691492557525635, + "learning_rate": 4.934010152284264e-05, + "loss": 2.0716, + "step": 235 + }, + { + "epoch": 0.13310772701635645, + "grad_norm": 6.5251851081848145, + "learning_rate": 4.933728144388043e-05, + "loss": 2.3903, + "step": 236 + }, + { + "epoch": 0.13367174280879865, + "grad_norm": 4.88683557510376, + "learning_rate": 4.933446136491822e-05, + "loss": 2.0622, + "step": 237 + }, + { + "epoch": 0.13423575860124085, + "grad_norm": 6.960509300231934, + "learning_rate": 4.933164128595601e-05, + "loss": 3.1366, + "step": 238 + }, + { + "epoch": 0.13479977439368301, + "grad_norm": 5.3670783042907715, + "learning_rate": 4.93288212069938e-05, + "loss": 2.3891, + "step": 239 + }, + { + "epoch": 0.1353637901861252, + "grad_norm": 7.282630443572998, + "learning_rate": 4.932600112803158e-05, + "loss": 2.1792, + "step": 240 + }, + { + "epoch": 0.1359278059785674, + "grad_norm": 4.234830379486084, + "learning_rate": 4.932318104906938e-05, + "loss": 1.4517, + "step": 241 + }, + { + "epoch": 0.13649182177100957, + "grad_norm": 5.9051432609558105, + "learning_rate": 4.932036097010717e-05, + "loss": 2.3832, + "step": 242 + }, + { + "epoch": 0.13705583756345177, + "grad_norm": 7.357797622680664, + "learning_rate": 4.931754089114495e-05, + "loss": 2.2492, + "step": 243 + }, + { + "epoch": 0.13761985335589397, + "grad_norm": 6.030848979949951, + "learning_rate": 4.931472081218274e-05, + "loss": 2.4378, + "step": 244 + }, + { + "epoch": 0.13818386914833616, + "grad_norm": 4.643002510070801, + "learning_rate": 4.9311900733220537e-05, + "loss": 1.632, + "step": 245 + }, + { + "epoch": 0.13874788494077833, + "grad_norm": 6.144219398498535, + "learning_rate": 4.930908065425832e-05, + "loss": 2.477, + "step": 246 + }, + { + "epoch": 0.13931190073322053, + "grad_norm": 4.876280307769775, + "learning_rate": 4.930626057529611e-05, + "loss": 2.5585, + "step": 247 + }, + { + "epoch": 0.13987591652566272, + "grad_norm": 7.281432151794434, + "learning_rate": 4.93034404963339e-05, + "loss": 2.4867, + "step": 248 + }, + { + "epoch": 0.1404399323181049, + "grad_norm": 5.19519567489624, + "learning_rate": 4.930062041737169e-05, + "loss": 2.3945, + "step": 249 + }, + { + "epoch": 0.1410039481105471, + "grad_norm": 6.445837020874023, + "learning_rate": 4.929780033840948e-05, + "loss": 2.8155, + "step": 250 + }, + { + "epoch": 0.14156796390298929, + "grad_norm": 6.755675792694092, + "learning_rate": 4.929498025944726e-05, + "loss": 2.744, + "step": 251 + }, + { + "epoch": 0.14213197969543148, + "grad_norm": 4.770914077758789, + "learning_rate": 4.929216018048506e-05, + "loss": 2.2254, + "step": 252 + }, + { + "epoch": 0.14269599548787365, + "grad_norm": 5.253628253936768, + "learning_rate": 4.928934010152285e-05, + "loss": 2.6607, + "step": 253 + }, + { + "epoch": 0.14326001128031585, + "grad_norm": 7.7680816650390625, + "learning_rate": 4.928652002256063e-05, + "loss": 2.3766, + "step": 254 + }, + { + "epoch": 0.14382402707275804, + "grad_norm": 6.365548133850098, + "learning_rate": 4.9283699943598424e-05, + "loss": 2.5167, + "step": 255 + }, + { + "epoch": 0.14438804286520024, + "grad_norm": 5.346089839935303, + "learning_rate": 4.928087986463622e-05, + "loss": 2.2808, + "step": 256 + }, + { + "epoch": 0.1449520586576424, + "grad_norm": 6.50400447845459, + "learning_rate": 4.9278059785674e-05, + "loss": 2.8864, + "step": 257 + }, + { + "epoch": 0.1455160744500846, + "grad_norm": 5.220261096954346, + "learning_rate": 4.927523970671179e-05, + "loss": 2.0551, + "step": 258 + }, + { + "epoch": 0.1460800902425268, + "grad_norm": 6.653620719909668, + "learning_rate": 4.927241962774958e-05, + "loss": 1.8256, + "step": 259 + }, + { + "epoch": 0.14664410603496897, + "grad_norm": 5.515750885009766, + "learning_rate": 4.926959954878737e-05, + "loss": 2.0624, + "step": 260 + }, + { + "epoch": 0.14720812182741116, + "grad_norm": 6.943575382232666, + "learning_rate": 4.926677946982516e-05, + "loss": 2.4218, + "step": 261 + }, + { + "epoch": 0.14777213761985336, + "grad_norm": 6.190911293029785, + "learning_rate": 4.926395939086294e-05, + "loss": 2.077, + "step": 262 + }, + { + "epoch": 0.14833615341229556, + "grad_norm": 5.8481645584106445, + "learning_rate": 4.9261139311900735e-05, + "loss": 2.442, + "step": 263 + }, + { + "epoch": 0.14890016920473773, + "grad_norm": 6.02017879486084, + "learning_rate": 4.925831923293853e-05, + "loss": 2.5794, + "step": 264 + }, + { + "epoch": 0.14946418499717992, + "grad_norm": 6.383442401885986, + "learning_rate": 4.925549915397631e-05, + "loss": 3.133, + "step": 265 + }, + { + "epoch": 0.15002820078962212, + "grad_norm": 4.859953880310059, + "learning_rate": 4.9252679075014105e-05, + "loss": 2.0586, + "step": 266 + }, + { + "epoch": 0.1505922165820643, + "grad_norm": 4.46875, + "learning_rate": 4.924985899605189e-05, + "loss": 1.768, + "step": 267 + }, + { + "epoch": 0.15115623237450648, + "grad_norm": 5.985985279083252, + "learning_rate": 4.924703891708968e-05, + "loss": 2.5052, + "step": 268 + }, + { + "epoch": 0.15172024816694868, + "grad_norm": 6.997572422027588, + "learning_rate": 4.924421883812747e-05, + "loss": 2.8102, + "step": 269 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 5.873311996459961, + "learning_rate": 4.924139875916526e-05, + "loss": 2.8955, + "step": 270 + }, + { + "epoch": 0.15284827975183304, + "grad_norm": 8.265763282775879, + "learning_rate": 4.9238578680203045e-05, + "loss": 2.3288, + "step": 271 + }, + { + "epoch": 0.15341229554427524, + "grad_norm": 5.37157678604126, + "learning_rate": 4.923575860124084e-05, + "loss": 2.1383, + "step": 272 + }, + { + "epoch": 0.15397631133671744, + "grad_norm": 5.673976898193359, + "learning_rate": 4.923293852227863e-05, + "loss": 2.2288, + "step": 273 + }, + { + "epoch": 0.1545403271291596, + "grad_norm": 6.421933650970459, + "learning_rate": 4.9230118443316415e-05, + "loss": 2.3167, + "step": 274 + }, + { + "epoch": 0.1551043429216018, + "grad_norm": 5.403313636779785, + "learning_rate": 4.92272983643542e-05, + "loss": 2.1879, + "step": 275 + }, + { + "epoch": 0.155668358714044, + "grad_norm": 6.157882213592529, + "learning_rate": 4.922447828539199e-05, + "loss": 2.7597, + "step": 276 + }, + { + "epoch": 0.1562323745064862, + "grad_norm": 4.99591064453125, + "learning_rate": 4.9221658206429785e-05, + "loss": 2.2425, + "step": 277 + }, + { + "epoch": 0.15679639029892836, + "grad_norm": 5.418835163116455, + "learning_rate": 4.921883812746757e-05, + "loss": 2.2883, + "step": 278 + }, + { + "epoch": 0.15736040609137056, + "grad_norm": 5.366525650024414, + "learning_rate": 4.9216018048505356e-05, + "loss": 2.4422, + "step": 279 + }, + { + "epoch": 0.15792442188381275, + "grad_norm": 4.332320690155029, + "learning_rate": 4.921319796954315e-05, + "loss": 1.7421, + "step": 280 + }, + { + "epoch": 0.15848843767625492, + "grad_norm": 6.520541667938232, + "learning_rate": 4.921037789058094e-05, + "loss": 2.0802, + "step": 281 + }, + { + "epoch": 0.15905245346869712, + "grad_norm": 7.128161430358887, + "learning_rate": 4.9207557811618725e-05, + "loss": 2.6576, + "step": 282 + }, + { + "epoch": 0.15961646926113932, + "grad_norm": 5.522243499755859, + "learning_rate": 4.920473773265651e-05, + "loss": 2.3088, + "step": 283 + }, + { + "epoch": 0.1601804850535815, + "grad_norm": 5.734799861907959, + "learning_rate": 4.920191765369431e-05, + "loss": 1.9114, + "step": 284 + }, + { + "epoch": 0.16074450084602368, + "grad_norm": 5.000633716583252, + "learning_rate": 4.9199097574732095e-05, + "loss": 1.5885, + "step": 285 + }, + { + "epoch": 0.16130851663846588, + "grad_norm": 5.541326999664307, + "learning_rate": 4.919627749576988e-05, + "loss": 1.5913, + "step": 286 + }, + { + "epoch": 0.16187253243090807, + "grad_norm": 6.008398056030273, + "learning_rate": 4.919345741680767e-05, + "loss": 1.8194, + "step": 287 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 4.757733345031738, + "learning_rate": 4.9190637337845465e-05, + "loss": 2.537, + "step": 288 + }, + { + "epoch": 0.16300056401579244, + "grad_norm": 5.175214767456055, + "learning_rate": 4.918781725888325e-05, + "loss": 2.0472, + "step": 289 + }, + { + "epoch": 0.16356457980823463, + "grad_norm": 6.525706768035889, + "learning_rate": 4.9184997179921036e-05, + "loss": 2.8197, + "step": 290 + }, + { + "epoch": 0.16412859560067683, + "grad_norm": 6.03848123550415, + "learning_rate": 4.9182177100958835e-05, + "loss": 2.034, + "step": 291 + }, + { + "epoch": 0.164692611393119, + "grad_norm": 6.060798168182373, + "learning_rate": 4.917935702199662e-05, + "loss": 2.6529, + "step": 292 + }, + { + "epoch": 0.1652566271855612, + "grad_norm": 5.467141151428223, + "learning_rate": 4.9176536943034406e-05, + "loss": 2.0874, + "step": 293 + }, + { + "epoch": 0.1658206429780034, + "grad_norm": 5.8727030754089355, + "learning_rate": 4.91737168640722e-05, + "loss": 2.7057, + "step": 294 + }, + { + "epoch": 0.16638465877044556, + "grad_norm": 5.224161148071289, + "learning_rate": 4.917089678510999e-05, + "loss": 2.1851, + "step": 295 + }, + { + "epoch": 0.16694867456288776, + "grad_norm": 6.096386432647705, + "learning_rate": 4.9168076706147775e-05, + "loss": 2.0372, + "step": 296 + }, + { + "epoch": 0.16751269035532995, + "grad_norm": 6.222507476806641, + "learning_rate": 4.916525662718556e-05, + "loss": 2.4027, + "step": 297 + }, + { + "epoch": 0.16807670614777215, + "grad_norm": 5.883434772491455, + "learning_rate": 4.916243654822335e-05, + "loss": 2.1882, + "step": 298 + }, + { + "epoch": 0.16864072194021432, + "grad_norm": 5.597428798675537, + "learning_rate": 4.9159616469261145e-05, + "loss": 2.1965, + "step": 299 + }, + { + "epoch": 0.1692047377326565, + "grad_norm": 6.96850061416626, + "learning_rate": 4.915679639029893e-05, + "loss": 2.4843, + "step": 300 + }, + { + "epoch": 0.1697687535250987, + "grad_norm": 6.771056175231934, + "learning_rate": 4.9153976311336716e-05, + "loss": 2.556, + "step": 301 + }, + { + "epoch": 0.17033276931754088, + "grad_norm": 15.91870403289795, + "learning_rate": 4.915115623237451e-05, + "loss": 2.3157, + "step": 302 + }, + { + "epoch": 0.17089678510998307, + "grad_norm": 4.357451915740967, + "learning_rate": 4.91483361534123e-05, + "loss": 1.6829, + "step": 303 + }, + { + "epoch": 0.17146080090242527, + "grad_norm": 5.459149360656738, + "learning_rate": 4.9145516074450086e-05, + "loss": 2.4028, + "step": 304 + }, + { + "epoch": 0.17202481669486747, + "grad_norm": 5.928791046142578, + "learning_rate": 4.914269599548788e-05, + "loss": 2.1328, + "step": 305 + }, + { + "epoch": 0.17258883248730963, + "grad_norm": 6.9224629402160645, + "learning_rate": 4.913987591652566e-05, + "loss": 2.2894, + "step": 306 + }, + { + "epoch": 0.17315284827975183, + "grad_norm": 6.960050106048584, + "learning_rate": 4.9137055837563455e-05, + "loss": 2.6377, + "step": 307 + }, + { + "epoch": 0.17371686407219403, + "grad_norm": 6.141111850738525, + "learning_rate": 4.913423575860124e-05, + "loss": 2.5968, + "step": 308 + }, + { + "epoch": 0.17428087986463622, + "grad_norm": 6.182928085327148, + "learning_rate": 4.913141567963903e-05, + "loss": 2.7179, + "step": 309 + }, + { + "epoch": 0.1748448956570784, + "grad_norm": 7.77579927444458, + "learning_rate": 4.912859560067682e-05, + "loss": 2.1409, + "step": 310 + }, + { + "epoch": 0.1754089114495206, + "grad_norm": 5.044154644012451, + "learning_rate": 4.912577552171461e-05, + "loss": 2.1304, + "step": 311 + }, + { + "epoch": 0.17597292724196278, + "grad_norm": 5.037315845489502, + "learning_rate": 4.91229554427524e-05, + "loss": 1.9045, + "step": 312 + }, + { + "epoch": 0.17653694303440495, + "grad_norm": 4.889622211456299, + "learning_rate": 4.912013536379019e-05, + "loss": 2.2941, + "step": 313 + }, + { + "epoch": 0.17710095882684715, + "grad_norm": 6.313348293304443, + "learning_rate": 4.9117315284827974e-05, + "loss": 2.2574, + "step": 314 + }, + { + "epoch": 0.17766497461928935, + "grad_norm": 6.114498138427734, + "learning_rate": 4.9114495205865766e-05, + "loss": 2.7468, + "step": 315 + }, + { + "epoch": 0.17822899041173154, + "grad_norm": 6.436418056488037, + "learning_rate": 4.911167512690356e-05, + "loss": 2.2051, + "step": 316 + }, + { + "epoch": 0.1787930062041737, + "grad_norm": 7.242603778839111, + "learning_rate": 4.9108855047941343e-05, + "loss": 2.0257, + "step": 317 + }, + { + "epoch": 0.1793570219966159, + "grad_norm": 5.78814172744751, + "learning_rate": 4.910603496897913e-05, + "loss": 2.2034, + "step": 318 + }, + { + "epoch": 0.1799210377890581, + "grad_norm": 5.437631607055664, + "learning_rate": 4.910321489001692e-05, + "loss": 1.9275, + "step": 319 + }, + { + "epoch": 0.18048505358150027, + "grad_norm": 5.396062850952148, + "learning_rate": 4.910039481105471e-05, + "loss": 1.9401, + "step": 320 + }, + { + "epoch": 0.18104906937394247, + "grad_norm": 4.728531360626221, + "learning_rate": 4.90975747320925e-05, + "loss": 2.1036, + "step": 321 + }, + { + "epoch": 0.18161308516638466, + "grad_norm": 6.391375541687012, + "learning_rate": 4.909475465313029e-05, + "loss": 2.3828, + "step": 322 + }, + { + "epoch": 0.18217710095882686, + "grad_norm": 5.680989742279053, + "learning_rate": 4.909193457416808e-05, + "loss": 1.9629, + "step": 323 + }, + { + "epoch": 0.18274111675126903, + "grad_norm": 7.140889644622803, + "learning_rate": 4.908911449520587e-05, + "loss": 2.3429, + "step": 324 + }, + { + "epoch": 0.18330513254371122, + "grad_norm": 4.796309947967529, + "learning_rate": 4.9086294416243654e-05, + "loss": 2.3004, + "step": 325 + }, + { + "epoch": 0.18386914833615342, + "grad_norm": 4.512798309326172, + "learning_rate": 4.9083474337281446e-05, + "loss": 1.9388, + "step": 326 + }, + { + "epoch": 0.1844331641285956, + "grad_norm": 5.078066825866699, + "learning_rate": 4.908065425831924e-05, + "loss": 2.2116, + "step": 327 + }, + { + "epoch": 0.18499717992103779, + "grad_norm": 6.4919209480285645, + "learning_rate": 4.9077834179357024e-05, + "loss": 2.1509, + "step": 328 + }, + { + "epoch": 0.18556119571347998, + "grad_norm": 6.2677106857299805, + "learning_rate": 4.907501410039481e-05, + "loss": 1.8507, + "step": 329 + }, + { + "epoch": 0.18612521150592218, + "grad_norm": 6.710062503814697, + "learning_rate": 4.907219402143261e-05, + "loss": 2.2052, + "step": 330 + }, + { + "epoch": 0.18668922729836435, + "grad_norm": 4.868067264556885, + "learning_rate": 4.906937394247039e-05, + "loss": 2.1941, + "step": 331 + }, + { + "epoch": 0.18725324309080654, + "grad_norm": 5.802441596984863, + "learning_rate": 4.906655386350818e-05, + "loss": 1.8704, + "step": 332 + }, + { + "epoch": 0.18781725888324874, + "grad_norm": 5.787342548370361, + "learning_rate": 4.906373378454597e-05, + "loss": 2.2772, + "step": 333 + }, + { + "epoch": 0.1883812746756909, + "grad_norm": 4.379059314727783, + "learning_rate": 4.906091370558376e-05, + "loss": 1.733, + "step": 334 + }, + { + "epoch": 0.1889452904681331, + "grad_norm": 7.119242191314697, + "learning_rate": 4.905809362662155e-05, + "loss": 1.987, + "step": 335 + }, + { + "epoch": 0.1895093062605753, + "grad_norm": 5.345183849334717, + "learning_rate": 4.9055273547659334e-05, + "loss": 1.7995, + "step": 336 + }, + { + "epoch": 0.1900733220530175, + "grad_norm": 6.178272247314453, + "learning_rate": 4.9052453468697126e-05, + "loss": 2.2757, + "step": 337 + }, + { + "epoch": 0.19063733784545966, + "grad_norm": 4.967794418334961, + "learning_rate": 4.904963338973492e-05, + "loss": 2.0951, + "step": 338 + }, + { + "epoch": 0.19120135363790186, + "grad_norm": 5.234055519104004, + "learning_rate": 4.9046813310772704e-05, + "loss": 1.6713, + "step": 339 + }, + { + "epoch": 0.19176536943034406, + "grad_norm": 5.579483985900879, + "learning_rate": 4.904399323181049e-05, + "loss": 1.9808, + "step": 340 + }, + { + "epoch": 0.19232938522278623, + "grad_norm": 6.310529708862305, + "learning_rate": 4.904117315284828e-05, + "loss": 2.1677, + "step": 341 + }, + { + "epoch": 0.19289340101522842, + "grad_norm": 5.249783992767334, + "learning_rate": 4.9038353073886073e-05, + "loss": 2.3286, + "step": 342 + }, + { + "epoch": 0.19345741680767062, + "grad_norm": 4.562863349914551, + "learning_rate": 4.903553299492386e-05, + "loss": 2.3506, + "step": 343 + }, + { + "epoch": 0.19402143260011281, + "grad_norm": 7.859304904937744, + "learning_rate": 4.903271291596165e-05, + "loss": 2.0091, + "step": 344 + }, + { + "epoch": 0.19458544839255498, + "grad_norm": 5.595649242401123, + "learning_rate": 4.9029892836999436e-05, + "loss": 1.9108, + "step": 345 + }, + { + "epoch": 0.19514946418499718, + "grad_norm": 5.583905220031738, + "learning_rate": 4.902707275803723e-05, + "loss": 2.5468, + "step": 346 + }, + { + "epoch": 0.19571347997743938, + "grad_norm": 6.928435802459717, + "learning_rate": 4.9024252679075014e-05, + "loss": 2.1572, + "step": 347 + }, + { + "epoch": 0.19627749576988154, + "grad_norm": 6.570063591003418, + "learning_rate": 4.9021432600112806e-05, + "loss": 1.9121, + "step": 348 + }, + { + "epoch": 0.19684151156232374, + "grad_norm": 5.6633477210998535, + "learning_rate": 4.901861252115059e-05, + "loss": 2.1271, + "step": 349 + }, + { + "epoch": 0.19740552735476594, + "grad_norm": 5.963122844696045, + "learning_rate": 4.9015792442188384e-05, + "loss": 1.9492, + "step": 350 + }, + { + "epoch": 0.19796954314720813, + "grad_norm": 6.974600315093994, + "learning_rate": 4.9012972363226176e-05, + "loss": 2.7947, + "step": 351 + }, + { + "epoch": 0.1985335589396503, + "grad_norm": 4.597723960876465, + "learning_rate": 4.901015228426396e-05, + "loss": 2.0616, + "step": 352 + }, + { + "epoch": 0.1990975747320925, + "grad_norm": 6.427048206329346, + "learning_rate": 4.900733220530175e-05, + "loss": 2.7416, + "step": 353 + }, + { + "epoch": 0.1996615905245347, + "grad_norm": 8.616435050964355, + "learning_rate": 4.900451212633954e-05, + "loss": 2.1062, + "step": 354 + }, + { + "epoch": 0.20022560631697686, + "grad_norm": 6.058923721313477, + "learning_rate": 4.900169204737733e-05, + "loss": 2.5127, + "step": 355 + }, + { + "epoch": 0.20078962210941906, + "grad_norm": 5.6004638671875, + "learning_rate": 4.899887196841512e-05, + "loss": 2.0221, + "step": 356 + }, + { + "epoch": 0.20135363790186125, + "grad_norm": 5.602899551391602, + "learning_rate": 4.899605188945291e-05, + "loss": 2.4269, + "step": 357 + }, + { + "epoch": 0.20191765369430345, + "grad_norm": 6.8599348068237305, + "learning_rate": 4.8993231810490694e-05, + "loss": 1.9987, + "step": 358 + }, + { + "epoch": 0.20248166948674562, + "grad_norm": 7.590954780578613, + "learning_rate": 4.8990411731528486e-05, + "loss": 2.7928, + "step": 359 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 8.128922462463379, + "learning_rate": 4.898759165256627e-05, + "loss": 2.798, + "step": 360 + }, + { + "epoch": 0.20360970107163, + "grad_norm": 5.295723915100098, + "learning_rate": 4.8984771573604064e-05, + "loss": 1.8838, + "step": 361 + }, + { + "epoch": 0.2041737168640722, + "grad_norm": 6.323337554931641, + "learning_rate": 4.8981951494641856e-05, + "loss": 2.6237, + "step": 362 + }, + { + "epoch": 0.20473773265651438, + "grad_norm": 5.918339252471924, + "learning_rate": 4.897913141567964e-05, + "loss": 2.2158, + "step": 363 + }, + { + "epoch": 0.20530174844895657, + "grad_norm": 4.799668312072754, + "learning_rate": 4.897631133671743e-05, + "loss": 1.7438, + "step": 364 + }, + { + "epoch": 0.20586576424139877, + "grad_norm": 5.375868320465088, + "learning_rate": 4.897349125775522e-05, + "loss": 2.0182, + "step": 365 + }, + { + "epoch": 0.20642978003384094, + "grad_norm": 4.036234378814697, + "learning_rate": 4.897067117879301e-05, + "loss": 1.6224, + "step": 366 + }, + { + "epoch": 0.20699379582628313, + "grad_norm": 6.226414203643799, + "learning_rate": 4.89678510998308e-05, + "loss": 2.288, + "step": 367 + }, + { + "epoch": 0.20755781161872533, + "grad_norm": 4.895843029022217, + "learning_rate": 4.896503102086858e-05, + "loss": 2.2558, + "step": 368 + }, + { + "epoch": 0.20812182741116753, + "grad_norm": 5.61391019821167, + "learning_rate": 4.896221094190638e-05, + "loss": 1.8241, + "step": 369 + }, + { + "epoch": 0.2086858432036097, + "grad_norm": 5.889829158782959, + "learning_rate": 4.8959390862944167e-05, + "loss": 2.1989, + "step": 370 + }, + { + "epoch": 0.2092498589960519, + "grad_norm": 4.582970142364502, + "learning_rate": 4.895657078398195e-05, + "loss": 1.5699, + "step": 371 + }, + { + "epoch": 0.2098138747884941, + "grad_norm": 5.1368489265441895, + "learning_rate": 4.8953750705019744e-05, + "loss": 1.6437, + "step": 372 + }, + { + "epoch": 0.21037789058093626, + "grad_norm": 7.242769241333008, + "learning_rate": 4.8950930626057536e-05, + "loss": 2.9114, + "step": 373 + }, + { + "epoch": 0.21094190637337845, + "grad_norm": 7.3376617431640625, + "learning_rate": 4.894811054709532e-05, + "loss": 2.2813, + "step": 374 + }, + { + "epoch": 0.21150592216582065, + "grad_norm": 5.106856346130371, + "learning_rate": 4.894529046813311e-05, + "loss": 2.1852, + "step": 375 + }, + { + "epoch": 0.21206993795826284, + "grad_norm": 7.545553684234619, + "learning_rate": 4.89424703891709e-05, + "loss": 2.8516, + "step": 376 + }, + { + "epoch": 0.212633953750705, + "grad_norm": 6.738352298736572, + "learning_rate": 4.893965031020869e-05, + "loss": 2.3598, + "step": 377 + }, + { + "epoch": 0.2131979695431472, + "grad_norm": 4.881913661956787, + "learning_rate": 4.893683023124648e-05, + "loss": 1.7258, + "step": 378 + }, + { + "epoch": 0.2137619853355894, + "grad_norm": 5.605672836303711, + "learning_rate": 4.893401015228426e-05, + "loss": 1.8516, + "step": 379 + }, + { + "epoch": 0.21432600112803157, + "grad_norm": 6.27271842956543, + "learning_rate": 4.8931190073322055e-05, + "loss": 1.9198, + "step": 380 + }, + { + "epoch": 0.21489001692047377, + "grad_norm": 7.0708723068237305, + "learning_rate": 4.892836999435985e-05, + "loss": 2.5845, + "step": 381 + }, + { + "epoch": 0.21545403271291597, + "grad_norm": 5.651270866394043, + "learning_rate": 4.892554991539763e-05, + "loss": 1.911, + "step": 382 + }, + { + "epoch": 0.21601804850535816, + "grad_norm": 5.8619160652160645, + "learning_rate": 4.8922729836435424e-05, + "loss": 2.1351, + "step": 383 + }, + { + "epoch": 0.21658206429780033, + "grad_norm": 5.5989861488342285, + "learning_rate": 4.891990975747321e-05, + "loss": 1.8394, + "step": 384 + }, + { + "epoch": 0.21714608009024253, + "grad_norm": 8.06535530090332, + "learning_rate": 4.8917089678511e-05, + "loss": 2.5395, + "step": 385 + }, + { + "epoch": 0.21771009588268472, + "grad_norm": 6.409775733947754, + "learning_rate": 4.891426959954879e-05, + "loss": 2.0447, + "step": 386 + }, + { + "epoch": 0.2182741116751269, + "grad_norm": 6.291415214538574, + "learning_rate": 4.891144952058658e-05, + "loss": 2.1694, + "step": 387 + }, + { + "epoch": 0.2188381274675691, + "grad_norm": 6.066865921020508, + "learning_rate": 4.8908629441624365e-05, + "loss": 1.9861, + "step": 388 + }, + { + "epoch": 0.21940214326001128, + "grad_norm": 6.179241180419922, + "learning_rate": 4.890580936266216e-05, + "loss": 2.2746, + "step": 389 + }, + { + "epoch": 0.21996615905245348, + "grad_norm": 6.308311939239502, + "learning_rate": 4.890298928369995e-05, + "loss": 2.0508, + "step": 390 + }, + { + "epoch": 0.22053017484489565, + "grad_norm": 4.496337413787842, + "learning_rate": 4.8900169204737735e-05, + "loss": 1.9686, + "step": 391 + }, + { + "epoch": 0.22109419063733785, + "grad_norm": 5.2776689529418945, + "learning_rate": 4.889734912577553e-05, + "loss": 1.8349, + "step": 392 + }, + { + "epoch": 0.22165820642978004, + "grad_norm": 5.398794651031494, + "learning_rate": 4.889452904681331e-05, + "loss": 2.1648, + "step": 393 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 6.467186450958252, + "learning_rate": 4.8891708967851104e-05, + "loss": 2.1327, + "step": 394 + }, + { + "epoch": 0.2227862380146644, + "grad_norm": 5.570446491241455, + "learning_rate": 4.888888888888889e-05, + "loss": 1.8935, + "step": 395 + }, + { + "epoch": 0.2233502538071066, + "grad_norm": 5.678336143493652, + "learning_rate": 4.888606880992668e-05, + "loss": 2.0613, + "step": 396 + }, + { + "epoch": 0.2239142695995488, + "grad_norm": 5.097002983093262, + "learning_rate": 4.888324873096447e-05, + "loss": 1.8947, + "step": 397 + }, + { + "epoch": 0.22447828539199097, + "grad_norm": 6.178385257720947, + "learning_rate": 4.888042865200226e-05, + "loss": 2.6134, + "step": 398 + }, + { + "epoch": 0.22504230118443316, + "grad_norm": 6.843069553375244, + "learning_rate": 4.8877608573040045e-05, + "loss": 2.5408, + "step": 399 + }, + { + "epoch": 0.22560631697687536, + "grad_norm": 6.204075813293457, + "learning_rate": 4.887478849407784e-05, + "loss": 2.2456, + "step": 400 + }, + { + "epoch": 0.22617033276931753, + "grad_norm": 5.276905059814453, + "learning_rate": 4.887196841511563e-05, + "loss": 1.9284, + "step": 401 + }, + { + "epoch": 0.22673434856175972, + "grad_norm": 4.517727851867676, + "learning_rate": 4.8869148336153415e-05, + "loss": 1.9845, + "step": 402 + }, + { + "epoch": 0.22729836435420192, + "grad_norm": 4.350528240203857, + "learning_rate": 4.88663282571912e-05, + "loss": 1.9742, + "step": 403 + }, + { + "epoch": 0.22786238014664412, + "grad_norm": 4.463402271270752, + "learning_rate": 4.886350817822899e-05, + "loss": 1.7715, + "step": 404 + }, + { + "epoch": 0.22842639593908629, + "grad_norm": 6.2965593338012695, + "learning_rate": 4.8860688099266785e-05, + "loss": 1.8575, + "step": 405 + }, + { + "epoch": 0.22899041173152848, + "grad_norm": 5.528021812438965, + "learning_rate": 4.885786802030457e-05, + "loss": 2.2136, + "step": 406 + }, + { + "epoch": 0.22955442752397068, + "grad_norm": 6.7709574699401855, + "learning_rate": 4.8855047941342355e-05, + "loss": 2.076, + "step": 407 + }, + { + "epoch": 0.23011844331641285, + "grad_norm": 3.614089250564575, + "learning_rate": 4.8852227862380154e-05, + "loss": 1.5754, + "step": 408 + }, + { + "epoch": 0.23068245910885504, + "grad_norm": 9.53490161895752, + "learning_rate": 4.884940778341794e-05, + "loss": 1.9355, + "step": 409 + }, + { + "epoch": 0.23124647490129724, + "grad_norm": 6.318592548370361, + "learning_rate": 4.8846587704455725e-05, + "loss": 2.8424, + "step": 410 + }, + { + "epoch": 0.23181049069373943, + "grad_norm": 4.9906134605407715, + "learning_rate": 4.884376762549351e-05, + "loss": 2.1274, + "step": 411 + }, + { + "epoch": 0.2323745064861816, + "grad_norm": 4.670078277587891, + "learning_rate": 4.884094754653131e-05, + "loss": 1.7799, + "step": 412 + }, + { + "epoch": 0.2329385222786238, + "grad_norm": 4.803771495819092, + "learning_rate": 4.8838127467569095e-05, + "loss": 2.1948, + "step": 413 + }, + { + "epoch": 0.233502538071066, + "grad_norm": 5.618857383728027, + "learning_rate": 4.883530738860688e-05, + "loss": 2.0307, + "step": 414 + }, + { + "epoch": 0.23406655386350816, + "grad_norm": 4.966344833374023, + "learning_rate": 4.883248730964467e-05, + "loss": 1.8022, + "step": 415 + }, + { + "epoch": 0.23463056965595036, + "grad_norm": 4.428290367126465, + "learning_rate": 4.8829667230682465e-05, + "loss": 1.7136, + "step": 416 + }, + { + "epoch": 0.23519458544839256, + "grad_norm": 5.158620357513428, + "learning_rate": 4.882684715172025e-05, + "loss": 1.6828, + "step": 417 + }, + { + "epoch": 0.23575860124083475, + "grad_norm": 5.717961311340332, + "learning_rate": 4.8824027072758036e-05, + "loss": 2.285, + "step": 418 + }, + { + "epoch": 0.23632261703327692, + "grad_norm": 7.1577935218811035, + "learning_rate": 4.882120699379583e-05, + "loss": 2.5848, + "step": 419 + }, + { + "epoch": 0.23688663282571912, + "grad_norm": 4.747495651245117, + "learning_rate": 4.881838691483362e-05, + "loss": 1.9103, + "step": 420 + }, + { + "epoch": 0.23745064861816131, + "grad_norm": 5.462038516998291, + "learning_rate": 4.8815566835871405e-05, + "loss": 2.6962, + "step": 421 + }, + { + "epoch": 0.2380146644106035, + "grad_norm": 5.973637104034424, + "learning_rate": 4.88127467569092e-05, + "loss": 2.3631, + "step": 422 + }, + { + "epoch": 0.23857868020304568, + "grad_norm": 11.724513053894043, + "learning_rate": 4.880992667794698e-05, + "loss": 1.9909, + "step": 423 + }, + { + "epoch": 0.23914269599548788, + "grad_norm": 7.057259559631348, + "learning_rate": 4.8807106598984775e-05, + "loss": 2.3016, + "step": 424 + }, + { + "epoch": 0.23970671178793007, + "grad_norm": 5.356383800506592, + "learning_rate": 4.880428652002256e-05, + "loss": 1.78, + "step": 425 + }, + { + "epoch": 0.24027072758037224, + "grad_norm": 6.5428900718688965, + "learning_rate": 4.880146644106035e-05, + "loss": 1.6618, + "step": 426 + }, + { + "epoch": 0.24083474337281444, + "grad_norm": 4.670787811279297, + "learning_rate": 4.8798646362098145e-05, + "loss": 1.9298, + "step": 427 + }, + { + "epoch": 0.24139875916525663, + "grad_norm": 6.232101917266846, + "learning_rate": 4.879582628313593e-05, + "loss": 2.1015, + "step": 428 + }, + { + "epoch": 0.24196277495769883, + "grad_norm": 4.610374450683594, + "learning_rate": 4.8793006204173716e-05, + "loss": 1.5354, + "step": 429 + }, + { + "epoch": 0.242526790750141, + "grad_norm": 5.099283695220947, + "learning_rate": 4.879018612521151e-05, + "loss": 2.2891, + "step": 430 + }, + { + "epoch": 0.2430908065425832, + "grad_norm": 6.759159564971924, + "learning_rate": 4.87873660462493e-05, + "loss": 1.7562, + "step": 431 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 4.784867286682129, + "learning_rate": 4.8784545967287086e-05, + "loss": 1.7483, + "step": 432 + }, + { + "epoch": 0.24421883812746756, + "grad_norm": 7.355743885040283, + "learning_rate": 4.878172588832488e-05, + "loss": 2.0108, + "step": 433 + }, + { + "epoch": 0.24478285391990975, + "grad_norm": 3.56080961227417, + "learning_rate": 4.877890580936266e-05, + "loss": 1.5189, + "step": 434 + }, + { + "epoch": 0.24534686971235195, + "grad_norm": 5.851147174835205, + "learning_rate": 4.8776085730400455e-05, + "loss": 1.7453, + "step": 435 + }, + { + "epoch": 0.24591088550479415, + "grad_norm": 5.766703128814697, + "learning_rate": 4.877326565143824e-05, + "loss": 1.9764, + "step": 436 + }, + { + "epoch": 0.24647490129723632, + "grad_norm": 7.396605014801025, + "learning_rate": 4.877044557247603e-05, + "loss": 2.3237, + "step": 437 + }, + { + "epoch": 0.2470389170896785, + "grad_norm": 5.122827529907227, + "learning_rate": 4.876762549351382e-05, + "loss": 1.6395, + "step": 438 + }, + { + "epoch": 0.2476029328821207, + "grad_norm": 5.4599432945251465, + "learning_rate": 4.876480541455161e-05, + "loss": 1.8071, + "step": 439 + }, + { + "epoch": 0.24816694867456288, + "grad_norm": 5.575592041015625, + "learning_rate": 4.87619853355894e-05, + "loss": 1.9738, + "step": 440 + }, + { + "epoch": 0.24873096446700507, + "grad_norm": 5.983273506164551, + "learning_rate": 4.875916525662719e-05, + "loss": 1.6155, + "step": 441 + }, + { + "epoch": 0.24929498025944727, + "grad_norm": 5.393769264221191, + "learning_rate": 4.8756345177664973e-05, + "loss": 1.9002, + "step": 442 + }, + { + "epoch": 0.24985899605188946, + "grad_norm": 5.205740928649902, + "learning_rate": 4.8753525098702766e-05, + "loss": 1.7911, + "step": 443 + }, + { + "epoch": 0.25042301184433163, + "grad_norm": 6.182704925537109, + "learning_rate": 4.875070501974056e-05, + "loss": 1.9344, + "step": 444 + }, + { + "epoch": 0.25098702763677383, + "grad_norm": 5.068206310272217, + "learning_rate": 4.874788494077834e-05, + "loss": 1.7438, + "step": 445 + }, + { + "epoch": 0.251551043429216, + "grad_norm": 4.996670722961426, + "learning_rate": 4.874506486181613e-05, + "loss": 2.0422, + "step": 446 + }, + { + "epoch": 0.2521150592216582, + "grad_norm": 4.984970569610596, + "learning_rate": 4.874224478285392e-05, + "loss": 1.8576, + "step": 447 + }, + { + "epoch": 0.2526790750141004, + "grad_norm": 6.379758358001709, + "learning_rate": 4.873942470389171e-05, + "loss": 1.6493, + "step": 448 + }, + { + "epoch": 0.25324309080654256, + "grad_norm": 6.254807949066162, + "learning_rate": 4.87366046249295e-05, + "loss": 2.3975, + "step": 449 + }, + { + "epoch": 0.25380710659898476, + "grad_norm": 6.005599498748779, + "learning_rate": 4.8733784545967284e-05, + "loss": 1.9327, + "step": 450 + }, + { + "epoch": 0.25437112239142695, + "grad_norm": 24.45233726501465, + "learning_rate": 4.873096446700508e-05, + "loss": 2.1039, + "step": 451 + }, + { + "epoch": 0.25493513818386915, + "grad_norm": 8.071932792663574, + "learning_rate": 4.872814438804287e-05, + "loss": 2.5927, + "step": 452 + }, + { + "epoch": 0.25549915397631134, + "grad_norm": 5.998068809509277, + "learning_rate": 4.8725324309080654e-05, + "loss": 2.0129, + "step": 453 + }, + { + "epoch": 0.25606316976875354, + "grad_norm": 6.439023494720459, + "learning_rate": 4.8722504230118446e-05, + "loss": 1.9671, + "step": 454 + }, + { + "epoch": 0.25662718556119574, + "grad_norm": 5.47766637802124, + "learning_rate": 4.871968415115624e-05, + "loss": 1.7135, + "step": 455 + }, + { + "epoch": 0.2571912013536379, + "grad_norm": 6.295760154724121, + "learning_rate": 4.8716864072194023e-05, + "loss": 1.9728, + "step": 456 + }, + { + "epoch": 0.2577552171460801, + "grad_norm": 6.526418685913086, + "learning_rate": 4.871404399323181e-05, + "loss": 2.1801, + "step": 457 + }, + { + "epoch": 0.25831923293852227, + "grad_norm": 6.622709274291992, + "learning_rate": 4.87112239142696e-05, + "loss": 2.2185, + "step": 458 + }, + { + "epoch": 0.25888324873096447, + "grad_norm": 5.966617107391357, + "learning_rate": 4.870840383530739e-05, + "loss": 2.2824, + "step": 459 + }, + { + "epoch": 0.25944726452340666, + "grad_norm": 5.472635746002197, + "learning_rate": 4.870558375634518e-05, + "loss": 1.6674, + "step": 460 + }, + { + "epoch": 0.26001128031584886, + "grad_norm": 7.572980880737305, + "learning_rate": 4.870276367738297e-05, + "loss": 2.5559, + "step": 461 + }, + { + "epoch": 0.26057529610829105, + "grad_norm": 5.037454128265381, + "learning_rate": 4.869994359842076e-05, + "loss": 1.6998, + "step": 462 + }, + { + "epoch": 0.2611393119007332, + "grad_norm": 5.205267429351807, + "learning_rate": 4.869712351945855e-05, + "loss": 1.774, + "step": 463 + }, + { + "epoch": 0.2617033276931754, + "grad_norm": 6.927367210388184, + "learning_rate": 4.8694303440496334e-05, + "loss": 2.0984, + "step": 464 + }, + { + "epoch": 0.2622673434856176, + "grad_norm": 5.729448318481445, + "learning_rate": 4.8691483361534126e-05, + "loss": 2.0036, + "step": 465 + }, + { + "epoch": 0.2628313592780598, + "grad_norm": 9.230050086975098, + "learning_rate": 4.868866328257192e-05, + "loss": 2.0406, + "step": 466 + }, + { + "epoch": 0.263395375070502, + "grad_norm": 6.209466457366943, + "learning_rate": 4.8685843203609704e-05, + "loss": 1.7354, + "step": 467 + }, + { + "epoch": 0.2639593908629442, + "grad_norm": 5.682797908782959, + "learning_rate": 4.868302312464749e-05, + "loss": 1.8862, + "step": 468 + }, + { + "epoch": 0.2645234066553864, + "grad_norm": 5.4770002365112305, + "learning_rate": 4.868020304568528e-05, + "loss": 1.5274, + "step": 469 + }, + { + "epoch": 0.2650874224478285, + "grad_norm": 8.200128555297852, + "learning_rate": 4.867738296672307e-05, + "loss": 2.2434, + "step": 470 + }, + { + "epoch": 0.2656514382402707, + "grad_norm": 6.0913896560668945, + "learning_rate": 4.867456288776086e-05, + "loss": 2.1489, + "step": 471 + }, + { + "epoch": 0.2662154540327129, + "grad_norm": 4.7407121658325195, + "learning_rate": 4.867174280879865e-05, + "loss": 1.9704, + "step": 472 + }, + { + "epoch": 0.2667794698251551, + "grad_norm": 5.6418280601501465, + "learning_rate": 4.8668922729836436e-05, + "loss": 1.76, + "step": 473 + }, + { + "epoch": 0.2673434856175973, + "grad_norm": 4.18148946762085, + "learning_rate": 4.866610265087423e-05, + "loss": 1.6357, + "step": 474 + }, + { + "epoch": 0.2679075014100395, + "grad_norm": 5.315045356750488, + "learning_rate": 4.8663282571912014e-05, + "loss": 2.2257, + "step": 475 + }, + { + "epoch": 0.2684715172024817, + "grad_norm": 6.17592716217041, + "learning_rate": 4.8660462492949806e-05, + "loss": 2.0084, + "step": 476 + }, + { + "epoch": 0.26903553299492383, + "grad_norm": 7.171428680419922, + "learning_rate": 4.865764241398759e-05, + "loss": 2.4326, + "step": 477 + }, + { + "epoch": 0.26959954878736603, + "grad_norm": 7.780812740325928, + "learning_rate": 4.8654822335025384e-05, + "loss": 2.1771, + "step": 478 + }, + { + "epoch": 0.2701635645798082, + "grad_norm": 5.824283123016357, + "learning_rate": 4.8652002256063176e-05, + "loss": 2.1159, + "step": 479 + }, + { + "epoch": 0.2707275803722504, + "grad_norm": 5.152700424194336, + "learning_rate": 4.864918217710096e-05, + "loss": 1.9999, + "step": 480 + }, + { + "epoch": 0.2712915961646926, + "grad_norm": 6.358910083770752, + "learning_rate": 4.864636209813875e-05, + "loss": 2.0107, + "step": 481 + }, + { + "epoch": 0.2718556119571348, + "grad_norm": 6.32039213180542, + "learning_rate": 4.864354201917654e-05, + "loss": 2.0684, + "step": 482 + }, + { + "epoch": 0.272419627749577, + "grad_norm": 8.314254760742188, + "learning_rate": 4.864072194021433e-05, + "loss": 1.4954, + "step": 483 + }, + { + "epoch": 0.27298364354201915, + "grad_norm": 5.0951972007751465, + "learning_rate": 4.8637901861252116e-05, + "loss": 1.8612, + "step": 484 + }, + { + "epoch": 0.27354765933446135, + "grad_norm": 8.508191108703613, + "learning_rate": 4.86350817822899e-05, + "loss": 2.6429, + "step": 485 + }, + { + "epoch": 0.27411167512690354, + "grad_norm": 4.7058491706848145, + "learning_rate": 4.8632261703327694e-05, + "loss": 1.8296, + "step": 486 + }, + { + "epoch": 0.27467569091934574, + "grad_norm": 4.896066188812256, + "learning_rate": 4.8629441624365486e-05, + "loss": 1.5622, + "step": 487 + }, + { + "epoch": 0.27523970671178793, + "grad_norm": 7.203108787536621, + "learning_rate": 4.862662154540327e-05, + "loss": 2.3517, + "step": 488 + }, + { + "epoch": 0.27580372250423013, + "grad_norm": 6.21107816696167, + "learning_rate": 4.862380146644106e-05, + "loss": 1.9816, + "step": 489 + }, + { + "epoch": 0.2763677382966723, + "grad_norm": 5.479990005493164, + "learning_rate": 4.8620981387478856e-05, + "loss": 1.9942, + "step": 490 + }, + { + "epoch": 0.27693175408911447, + "grad_norm": 5.464663982391357, + "learning_rate": 4.861816130851664e-05, + "loss": 2.1927, + "step": 491 + }, + { + "epoch": 0.27749576988155666, + "grad_norm": 6.298027515411377, + "learning_rate": 4.861534122955443e-05, + "loss": 1.999, + "step": 492 + }, + { + "epoch": 0.27805978567399886, + "grad_norm": 5.389096736907959, + "learning_rate": 4.861252115059222e-05, + "loss": 1.8053, + "step": 493 + }, + { + "epoch": 0.27862380146644106, + "grad_norm": 4.494071960449219, + "learning_rate": 4.860970107163001e-05, + "loss": 1.6075, + "step": 494 + }, + { + "epoch": 0.27918781725888325, + "grad_norm": 5.047133445739746, + "learning_rate": 4.86068809926678e-05, + "loss": 1.8319, + "step": 495 + }, + { + "epoch": 0.27975183305132545, + "grad_norm": 5.272826194763184, + "learning_rate": 4.860406091370558e-05, + "loss": 1.9765, + "step": 496 + }, + { + "epoch": 0.28031584884376765, + "grad_norm": 8.581808090209961, + "learning_rate": 4.860124083474338e-05, + "loss": 2.4241, + "step": 497 + }, + { + "epoch": 0.2808798646362098, + "grad_norm": 5.464324474334717, + "learning_rate": 4.8598420755781166e-05, + "loss": 1.5232, + "step": 498 + }, + { + "epoch": 0.281443880428652, + "grad_norm": 5.455721855163574, + "learning_rate": 4.859560067681895e-05, + "loss": 2.2421, + "step": 499 + }, + { + "epoch": 0.2820078962210942, + "grad_norm": 6.143496036529541, + "learning_rate": 4.8592780597856744e-05, + "loss": 1.7312, + "step": 500 + }, + { + "epoch": 0.2825719120135364, + "grad_norm": 6.741121292114258, + "learning_rate": 4.8589960518894536e-05, + "loss": 1.8496, + "step": 501 + }, + { + "epoch": 0.28313592780597857, + "grad_norm": 7.46879243850708, + "learning_rate": 4.858714043993232e-05, + "loss": 1.7644, + "step": 502 + }, + { + "epoch": 0.28369994359842077, + "grad_norm": 5.301628112792969, + "learning_rate": 4.858432036097011e-05, + "loss": 2.0668, + "step": 503 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 5.383737564086914, + "learning_rate": 4.85815002820079e-05, + "loss": 1.9122, + "step": 504 + }, + { + "epoch": 0.28482797518330516, + "grad_norm": 3.4668638706207275, + "learning_rate": 4.857868020304569e-05, + "loss": 1.258, + "step": 505 + }, + { + "epoch": 0.2853919909757473, + "grad_norm": 5.0729804039001465, + "learning_rate": 4.857586012408348e-05, + "loss": 1.7589, + "step": 506 + }, + { + "epoch": 0.2859560067681895, + "grad_norm": 4.300259113311768, + "learning_rate": 4.857304004512126e-05, + "loss": 1.296, + "step": 507 + }, + { + "epoch": 0.2865200225606317, + "grad_norm": 5.187671661376953, + "learning_rate": 4.8570219966159054e-05, + "loss": 1.6992, + "step": 508 + }, + { + "epoch": 0.2870840383530739, + "grad_norm": 6.518775463104248, + "learning_rate": 4.8567399887196847e-05, + "loss": 2.0268, + "step": 509 + }, + { + "epoch": 0.2876480541455161, + "grad_norm": 7.092610836029053, + "learning_rate": 4.856457980823463e-05, + "loss": 2.2368, + "step": 510 + }, + { + "epoch": 0.2882120699379583, + "grad_norm": 6.042579650878906, + "learning_rate": 4.8561759729272424e-05, + "loss": 1.8199, + "step": 511 + }, + { + "epoch": 0.2887760857304005, + "grad_norm": 4.997673034667969, + "learning_rate": 4.855893965031021e-05, + "loss": 1.7945, + "step": 512 + }, + { + "epoch": 0.2893401015228426, + "grad_norm": 4.588385581970215, + "learning_rate": 4.8556119571348e-05, + "loss": 1.4465, + "step": 513 + }, + { + "epoch": 0.2899041173152848, + "grad_norm": 6.265031337738037, + "learning_rate": 4.855329949238579e-05, + "loss": 2.2631, + "step": 514 + }, + { + "epoch": 0.290468133107727, + "grad_norm": 4.793542861938477, + "learning_rate": 4.855047941342358e-05, + "loss": 2.0342, + "step": 515 + }, + { + "epoch": 0.2910321489001692, + "grad_norm": 7.559356212615967, + "learning_rate": 4.8547659334461365e-05, + "loss": 2.4467, + "step": 516 + }, + { + "epoch": 0.2915961646926114, + "grad_norm": 5.5803632736206055, + "learning_rate": 4.854483925549916e-05, + "loss": 2.0369, + "step": 517 + }, + { + "epoch": 0.2921601804850536, + "grad_norm": 5.048254013061523, + "learning_rate": 4.854201917653695e-05, + "loss": 1.5551, + "step": 518 + }, + { + "epoch": 0.2927241962774958, + "grad_norm": 3.53938627243042, + "learning_rate": 4.8539199097574735e-05, + "loss": 1.4022, + "step": 519 + }, + { + "epoch": 0.29328821206993794, + "grad_norm": 7.338493347167969, + "learning_rate": 4.853637901861252e-05, + "loss": 2.3026, + "step": 520 + }, + { + "epoch": 0.29385222786238013, + "grad_norm": 4.177496433258057, + "learning_rate": 4.853355893965031e-05, + "loss": 1.5624, + "step": 521 + }, + { + "epoch": 0.29441624365482233, + "grad_norm": 6.141679286956787, + "learning_rate": 4.8530738860688104e-05, + "loss": 2.0098, + "step": 522 + }, + { + "epoch": 0.2949802594472645, + "grad_norm": 6.201845645904541, + "learning_rate": 4.852791878172589e-05, + "loss": 1.9496, + "step": 523 + }, + { + "epoch": 0.2955442752397067, + "grad_norm": 7.839127540588379, + "learning_rate": 4.8525098702763675e-05, + "loss": 2.7439, + "step": 524 + }, + { + "epoch": 0.2961082910321489, + "grad_norm": 5.007416725158691, + "learning_rate": 4.852227862380147e-05, + "loss": 1.8813, + "step": 525 + }, + { + "epoch": 0.2966723068245911, + "grad_norm": 6.5371832847595215, + "learning_rate": 4.851945854483926e-05, + "loss": 2.2877, + "step": 526 + }, + { + "epoch": 0.29723632261703326, + "grad_norm": 4.440764904022217, + "learning_rate": 4.8516638465877045e-05, + "loss": 1.7618, + "step": 527 + }, + { + "epoch": 0.29780033840947545, + "grad_norm": 6.371473789215088, + "learning_rate": 4.851381838691483e-05, + "loss": 2.1943, + "step": 528 + }, + { + "epoch": 0.29836435420191765, + "grad_norm": 6.388687610626221, + "learning_rate": 4.851099830795263e-05, + "loss": 2.0439, + "step": 529 + }, + { + "epoch": 0.29892836999435984, + "grad_norm": 6.308291435241699, + "learning_rate": 4.8508178228990415e-05, + "loss": 1.7113, + "step": 530 + }, + { + "epoch": 0.29949238578680204, + "grad_norm": 6.272515773773193, + "learning_rate": 4.85053581500282e-05, + "loss": 2.3247, + "step": 531 + }, + { + "epoch": 0.30005640157924424, + "grad_norm": 3.8825125694274902, + "learning_rate": 4.850253807106599e-05, + "loss": 1.6048, + "step": 532 + }, + { + "epoch": 0.30062041737168643, + "grad_norm": 5.165565013885498, + "learning_rate": 4.8499717992103784e-05, + "loss": 1.8456, + "step": 533 + }, + { + "epoch": 0.3011844331641286, + "grad_norm": 5.607080936431885, + "learning_rate": 4.849689791314157e-05, + "loss": 1.6364, + "step": 534 + }, + { + "epoch": 0.30174844895657077, + "grad_norm": 5.356417655944824, + "learning_rate": 4.8494077834179355e-05, + "loss": 1.6806, + "step": 535 + }, + { + "epoch": 0.30231246474901297, + "grad_norm": 5.958977222442627, + "learning_rate": 4.8491257755217154e-05, + "loss": 1.4257, + "step": 536 + }, + { + "epoch": 0.30287648054145516, + "grad_norm": 5.6222710609436035, + "learning_rate": 4.848843767625494e-05, + "loss": 2.5489, + "step": 537 + }, + { + "epoch": 0.30344049633389736, + "grad_norm": 4.4213080406188965, + "learning_rate": 4.8485617597292725e-05, + "loss": 1.6609, + "step": 538 + }, + { + "epoch": 0.30400451212633955, + "grad_norm": 4.861043930053711, + "learning_rate": 4.848279751833052e-05, + "loss": 1.6302, + "step": 539 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 5.806236267089844, + "learning_rate": 4.847997743936831e-05, + "loss": 1.7722, + "step": 540 + }, + { + "epoch": 0.3051325437112239, + "grad_norm": 5.790777683258057, + "learning_rate": 4.8477157360406095e-05, + "loss": 1.6093, + "step": 541 + }, + { + "epoch": 0.3056965595036661, + "grad_norm": 3.4421603679656982, + "learning_rate": 4.847433728144388e-05, + "loss": 1.7615, + "step": 542 + }, + { + "epoch": 0.3062605752961083, + "grad_norm": 5.866281032562256, + "learning_rate": 4.847151720248167e-05, + "loss": 2.0244, + "step": 543 + }, + { + "epoch": 0.3068245910885505, + "grad_norm": 4.113983154296875, + "learning_rate": 4.8468697123519465e-05, + "loss": 1.5708, + "step": 544 + }, + { + "epoch": 0.3073886068809927, + "grad_norm": 6.14336633682251, + "learning_rate": 4.846587704455725e-05, + "loss": 1.6626, + "step": 545 + }, + { + "epoch": 0.3079526226734349, + "grad_norm": 4.85399055480957, + "learning_rate": 4.8463056965595035e-05, + "loss": 1.7122, + "step": 546 + }, + { + "epoch": 0.30851663846587707, + "grad_norm": 5.660431861877441, + "learning_rate": 4.846023688663283e-05, + "loss": 2.6628, + "step": 547 + }, + { + "epoch": 0.3090806542583192, + "grad_norm": 5.642589092254639, + "learning_rate": 4.845741680767062e-05, + "loss": 1.6459, + "step": 548 + }, + { + "epoch": 0.3096446700507614, + "grad_norm": 9.846410751342773, + "learning_rate": 4.8454596728708405e-05, + "loss": 2.2064, + "step": 549 + }, + { + "epoch": 0.3102086858432036, + "grad_norm": 6.601630687713623, + "learning_rate": 4.84517766497462e-05, + "loss": 2.2288, + "step": 550 + }, + { + "epoch": 0.3107727016356458, + "grad_norm": 6.590675354003906, + "learning_rate": 4.844895657078398e-05, + "loss": 2.0478, + "step": 551 + }, + { + "epoch": 0.311336717428088, + "grad_norm": 5.431647777557373, + "learning_rate": 4.8446136491821775e-05, + "loss": 1.6659, + "step": 552 + }, + { + "epoch": 0.3119007332205302, + "grad_norm": 5.406395435333252, + "learning_rate": 4.844331641285956e-05, + "loss": 1.5811, + "step": 553 + }, + { + "epoch": 0.3124647490129724, + "grad_norm": 3.5734663009643555, + "learning_rate": 4.844049633389735e-05, + "loss": 1.3302, + "step": 554 + }, + { + "epoch": 0.3130287648054145, + "grad_norm": 5.707324504852295, + "learning_rate": 4.843767625493514e-05, + "loss": 1.6948, + "step": 555 + }, + { + "epoch": 0.3135927805978567, + "grad_norm": 5.7278547286987305, + "learning_rate": 4.843485617597293e-05, + "loss": 1.8611, + "step": 556 + }, + { + "epoch": 0.3141567963902989, + "grad_norm": 6.306571960449219, + "learning_rate": 4.843203609701072e-05, + "loss": 1.9592, + "step": 557 + }, + { + "epoch": 0.3147208121827411, + "grad_norm": 4.866029262542725, + "learning_rate": 4.842921601804851e-05, + "loss": 2.2924, + "step": 558 + }, + { + "epoch": 0.3152848279751833, + "grad_norm": 4.112710952758789, + "learning_rate": 4.842639593908629e-05, + "loss": 1.5837, + "step": 559 + }, + { + "epoch": 0.3158488437676255, + "grad_norm": 5.617038249969482, + "learning_rate": 4.8423575860124085e-05, + "loss": 1.4423, + "step": 560 + }, + { + "epoch": 0.3164128595600677, + "grad_norm": 4.527608394622803, + "learning_rate": 4.842075578116188e-05, + "loss": 1.7584, + "step": 561 + }, + { + "epoch": 0.31697687535250985, + "grad_norm": 6.515766620635986, + "learning_rate": 4.841793570219966e-05, + "loss": 1.9519, + "step": 562 + }, + { + "epoch": 0.31754089114495204, + "grad_norm": 7.482338905334473, + "learning_rate": 4.841511562323745e-05, + "loss": 1.7407, + "step": 563 + }, + { + "epoch": 0.31810490693739424, + "grad_norm": 5.233174800872803, + "learning_rate": 4.841229554427524e-05, + "loss": 1.8086, + "step": 564 + }, + { + "epoch": 0.31866892272983643, + "grad_norm": 9.869227409362793, + "learning_rate": 4.840947546531303e-05, + "loss": 2.3591, + "step": 565 + }, + { + "epoch": 0.31923293852227863, + "grad_norm": 5.697141647338867, + "learning_rate": 4.840665538635082e-05, + "loss": 1.814, + "step": 566 + }, + { + "epoch": 0.3197969543147208, + "grad_norm": 4.783058166503906, + "learning_rate": 4.840383530738861e-05, + "loss": 1.6114, + "step": 567 + }, + { + "epoch": 0.320360970107163, + "grad_norm": 5.755559921264648, + "learning_rate": 4.84010152284264e-05, + "loss": 1.5644, + "step": 568 + }, + { + "epoch": 0.32092498589960516, + "grad_norm": 4.377317905426025, + "learning_rate": 4.839819514946419e-05, + "loss": 1.797, + "step": 569 + }, + { + "epoch": 0.32148900169204736, + "grad_norm": 4.936992645263672, + "learning_rate": 4.839537507050197e-05, + "loss": 2.0262, + "step": 570 + }, + { + "epoch": 0.32205301748448956, + "grad_norm": 3.0696635246276855, + "learning_rate": 4.8392554991539766e-05, + "loss": 1.4391, + "step": 571 + }, + { + "epoch": 0.32261703327693175, + "grad_norm": 3.835374355316162, + "learning_rate": 4.838973491257756e-05, + "loss": 1.5725, + "step": 572 + }, + { + "epoch": 0.32318104906937395, + "grad_norm": 6.203639507293701, + "learning_rate": 4.838691483361534e-05, + "loss": 1.8624, + "step": 573 + }, + { + "epoch": 0.32374506486181615, + "grad_norm": 5.817049980163574, + "learning_rate": 4.838409475465313e-05, + "loss": 1.9629, + "step": 574 + }, + { + "epoch": 0.32430908065425834, + "grad_norm": 5.890774250030518, + "learning_rate": 4.838127467569093e-05, + "loss": 1.8678, + "step": 575 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 8.69396686553955, + "learning_rate": 4.837845459672871e-05, + "loss": 2.0897, + "step": 576 + }, + { + "epoch": 0.3254371122391427, + "grad_norm": 7.370789527893066, + "learning_rate": 4.83756345177665e-05, + "loss": 2.4568, + "step": 577 + }, + { + "epoch": 0.3260011280315849, + "grad_norm": 5.129816055297852, + "learning_rate": 4.8372814438804284e-05, + "loss": 1.6356, + "step": 578 + }, + { + "epoch": 0.32656514382402707, + "grad_norm": 8.271219253540039, + "learning_rate": 4.836999435984208e-05, + "loss": 2.5829, + "step": 579 + }, + { + "epoch": 0.32712915961646927, + "grad_norm": 6.176612854003906, + "learning_rate": 4.836717428087987e-05, + "loss": 1.6552, + "step": 580 + }, + { + "epoch": 0.32769317540891146, + "grad_norm": 5.693054676055908, + "learning_rate": 4.8364354201917653e-05, + "loss": 1.7681, + "step": 581 + }, + { + "epoch": 0.32825719120135366, + "grad_norm": 4.740327835083008, + "learning_rate": 4.8361534122955446e-05, + "loss": 1.3492, + "step": 582 + }, + { + "epoch": 0.3288212069937958, + "grad_norm": 5.100081443786621, + "learning_rate": 4.835871404399324e-05, + "loss": 2.0423, + "step": 583 + }, + { + "epoch": 0.329385222786238, + "grad_norm": 27.60358428955078, + "learning_rate": 4.835589396503102e-05, + "loss": 1.851, + "step": 584 + }, + { + "epoch": 0.3299492385786802, + "grad_norm": 4.529689788818359, + "learning_rate": 4.835307388606881e-05, + "loss": 1.396, + "step": 585 + }, + { + "epoch": 0.3305132543711224, + "grad_norm": 4.543384552001953, + "learning_rate": 4.83502538071066e-05, + "loss": 1.281, + "step": 586 + }, + { + "epoch": 0.3310772701635646, + "grad_norm": 6.491982936859131, + "learning_rate": 4.834743372814439e-05, + "loss": 2.0122, + "step": 587 + }, + { + "epoch": 0.3316412859560068, + "grad_norm": 4.314792633056641, + "learning_rate": 4.834461364918218e-05, + "loss": 1.5038, + "step": 588 + }, + { + "epoch": 0.332205301748449, + "grad_norm": 4.917687892913818, + "learning_rate": 4.834179357021997e-05, + "loss": 2.0075, + "step": 589 + }, + { + "epoch": 0.3327693175408911, + "grad_norm": 4.805911064147949, + "learning_rate": 4.8338973491257756e-05, + "loss": 1.6593, + "step": 590 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 4.928843021392822, + "learning_rate": 4.833615341229555e-05, + "loss": 1.8202, + "step": 591 + }, + { + "epoch": 0.3338973491257755, + "grad_norm": 4.624151706695557, + "learning_rate": 4.8333333333333334e-05, + "loss": 1.5879, + "step": 592 + }, + { + "epoch": 0.3344613649182177, + "grad_norm": 5.283743858337402, + "learning_rate": 4.8330513254371126e-05, + "loss": 1.8303, + "step": 593 + }, + { + "epoch": 0.3350253807106599, + "grad_norm": 6.156787872314453, + "learning_rate": 4.832769317540891e-05, + "loss": 1.8164, + "step": 594 + }, + { + "epoch": 0.3355893965031021, + "grad_norm": 5.177414417266846, + "learning_rate": 4.8324873096446703e-05, + "loss": 1.8637, + "step": 595 + }, + { + "epoch": 0.3361534122955443, + "grad_norm": 6.746670722961426, + "learning_rate": 4.832205301748449e-05, + "loss": 1.8738, + "step": 596 + }, + { + "epoch": 0.33671742808798644, + "grad_norm": 5.661622524261475, + "learning_rate": 4.831923293852228e-05, + "loss": 1.8041, + "step": 597 + }, + { + "epoch": 0.33728144388042863, + "grad_norm": 3.5031256675720215, + "learning_rate": 4.8316412859560066e-05, + "loss": 1.8789, + "step": 598 + }, + { + "epoch": 0.33784545967287083, + "grad_norm": 3.392223358154297, + "learning_rate": 4.831359278059786e-05, + "loss": 1.2667, + "step": 599 + }, + { + "epoch": 0.338409475465313, + "grad_norm": 4.951959133148193, + "learning_rate": 4.831077270163565e-05, + "loss": 2.3997, + "step": 600 + }, + { + "epoch": 0.3389734912577552, + "grad_norm": 6.924822807312012, + "learning_rate": 4.8307952622673436e-05, + "loss": 1.7995, + "step": 601 + }, + { + "epoch": 0.3395375070501974, + "grad_norm": 6.9308576583862305, + "learning_rate": 4.830513254371123e-05, + "loss": 2.0852, + "step": 602 + }, + { + "epoch": 0.3401015228426396, + "grad_norm": 3.851527690887451, + "learning_rate": 4.8302312464749014e-05, + "loss": 1.3729, + "step": 603 + }, + { + "epoch": 0.34066553863508175, + "grad_norm": 8.59643840789795, + "learning_rate": 4.8299492385786806e-05, + "loss": 1.679, + "step": 604 + }, + { + "epoch": 0.34122955442752395, + "grad_norm": 5.019068717956543, + "learning_rate": 4.829667230682459e-05, + "loss": 1.5816, + "step": 605 + }, + { + "epoch": 0.34179357021996615, + "grad_norm": 5.376804351806641, + "learning_rate": 4.8293852227862384e-05, + "loss": 1.5724, + "step": 606 + }, + { + "epoch": 0.34235758601240834, + "grad_norm": 4.086066246032715, + "learning_rate": 4.8291032148900176e-05, + "loss": 1.6386, + "step": 607 + }, + { + "epoch": 0.34292160180485054, + "grad_norm": 5.187455177307129, + "learning_rate": 4.828821206993796e-05, + "loss": 1.7846, + "step": 608 + }, + { + "epoch": 0.34348561759729274, + "grad_norm": 5.4488420486450195, + "learning_rate": 4.8285391990975747e-05, + "loss": 1.7313, + "step": 609 + }, + { + "epoch": 0.34404963338973493, + "grad_norm": 6.752497673034668, + "learning_rate": 4.828257191201354e-05, + "loss": 1.8437, + "step": 610 + }, + { + "epoch": 0.34461364918217713, + "grad_norm": 6.695676326751709, + "learning_rate": 4.827975183305133e-05, + "loss": 2.199, + "step": 611 + }, + { + "epoch": 0.34517766497461927, + "grad_norm": 4.684739589691162, + "learning_rate": 4.8276931754089116e-05, + "loss": 1.4374, + "step": 612 + }, + { + "epoch": 0.34574168076706147, + "grad_norm": 4.924534797668457, + "learning_rate": 4.82741116751269e-05, + "loss": 1.7559, + "step": 613 + }, + { + "epoch": 0.34630569655950366, + "grad_norm": 4.72761344909668, + "learning_rate": 4.8271291596164694e-05, + "loss": 1.8962, + "step": 614 + }, + { + "epoch": 0.34686971235194586, + "grad_norm": 4.646229267120361, + "learning_rate": 4.8268471517202486e-05, + "loss": 1.571, + "step": 615 + }, + { + "epoch": 0.34743372814438805, + "grad_norm": 4.538805961608887, + "learning_rate": 4.826565143824027e-05, + "loss": 1.3419, + "step": 616 + }, + { + "epoch": 0.34799774393683025, + "grad_norm": 5.283565521240234, + "learning_rate": 4.826283135927806e-05, + "loss": 1.9909, + "step": 617 + }, + { + "epoch": 0.34856175972927245, + "grad_norm": 3.931777238845825, + "learning_rate": 4.8260011280315856e-05, + "loss": 1.447, + "step": 618 + }, + { + "epoch": 0.3491257755217146, + "grad_norm": 6.033408164978027, + "learning_rate": 4.825719120135364e-05, + "loss": 1.5491, + "step": 619 + }, + { + "epoch": 0.3496897913141568, + "grad_norm": 5.071351528167725, + "learning_rate": 4.825437112239143e-05, + "loss": 1.6925, + "step": 620 + }, + { + "epoch": 0.350253807106599, + "grad_norm": 6.540615558624268, + "learning_rate": 4.825155104342922e-05, + "loss": 1.552, + "step": 621 + }, + { + "epoch": 0.3508178228990412, + "grad_norm": 5.434493541717529, + "learning_rate": 4.824873096446701e-05, + "loss": 1.4024, + "step": 622 + }, + { + "epoch": 0.3513818386914834, + "grad_norm": 6.8106536865234375, + "learning_rate": 4.8245910885504796e-05, + "loss": 2.304, + "step": 623 + }, + { + "epoch": 0.35194585448392557, + "grad_norm": 5.464517593383789, + "learning_rate": 4.824309080654258e-05, + "loss": 1.5266, + "step": 624 + }, + { + "epoch": 0.35250987027636776, + "grad_norm": 3.5451643466949463, + "learning_rate": 4.8240270727580374e-05, + "loss": 1.5595, + "step": 625 + }, + { + "epoch": 0.3530738860688099, + "grad_norm": 4.735382080078125, + "learning_rate": 4.8237450648618166e-05, + "loss": 1.3958, + "step": 626 + }, + { + "epoch": 0.3536379018612521, + "grad_norm": 5.163818836212158, + "learning_rate": 4.823463056965595e-05, + "loss": 1.41, + "step": 627 + }, + { + "epoch": 0.3542019176536943, + "grad_norm": 6.018257141113281, + "learning_rate": 4.8231810490693744e-05, + "loss": 1.9547, + "step": 628 + }, + { + "epoch": 0.3547659334461365, + "grad_norm": 5.130728244781494, + "learning_rate": 4.822899041173153e-05, + "loss": 1.6667, + "step": 629 + }, + { + "epoch": 0.3553299492385787, + "grad_norm": 8.379697799682617, + "learning_rate": 4.822617033276932e-05, + "loss": 1.4958, + "step": 630 + }, + { + "epoch": 0.3558939650310209, + "grad_norm": 4.465334415435791, + "learning_rate": 4.822335025380711e-05, + "loss": 1.5231, + "step": 631 + }, + { + "epoch": 0.3564579808234631, + "grad_norm": 6.271803855895996, + "learning_rate": 4.82205301748449e-05, + "loss": 1.8883, + "step": 632 + }, + { + "epoch": 0.3570219966159052, + "grad_norm": 4.53215217590332, + "learning_rate": 4.8217710095882684e-05, + "loss": 1.7003, + "step": 633 + }, + { + "epoch": 0.3575860124083474, + "grad_norm": 6.542550563812256, + "learning_rate": 4.821489001692048e-05, + "loss": 1.7913, + "step": 634 + }, + { + "epoch": 0.3581500282007896, + "grad_norm": 6.982221603393555, + "learning_rate": 4.821206993795826e-05, + "loss": 2.2752, + "step": 635 + }, + { + "epoch": 0.3587140439932318, + "grad_norm": 5.334516525268555, + "learning_rate": 4.8209249858996054e-05, + "loss": 1.6898, + "step": 636 + }, + { + "epoch": 0.359278059785674, + "grad_norm": 7.43574857711792, + "learning_rate": 4.8206429780033846e-05, + "loss": 1.9431, + "step": 637 + }, + { + "epoch": 0.3598420755781162, + "grad_norm": 4.133084774017334, + "learning_rate": 4.820360970107163e-05, + "loss": 1.6802, + "step": 638 + }, + { + "epoch": 0.3604060913705584, + "grad_norm": 3.4233031272888184, + "learning_rate": 4.8200789622109424e-05, + "loss": 1.3057, + "step": 639 + }, + { + "epoch": 0.36097010716300054, + "grad_norm": 4.908380031585693, + "learning_rate": 4.819796954314721e-05, + "loss": 1.735, + "step": 640 + }, + { + "epoch": 0.36153412295544274, + "grad_norm": 6.421344757080078, + "learning_rate": 4.8195149464185e-05, + "loss": 2.035, + "step": 641 + }, + { + "epoch": 0.36209813874788493, + "grad_norm": 5.438602924346924, + "learning_rate": 4.819232938522279e-05, + "loss": 1.7864, + "step": 642 + }, + { + "epoch": 0.36266215454032713, + "grad_norm": 5.178281307220459, + "learning_rate": 4.818950930626058e-05, + "loss": 1.5567, + "step": 643 + }, + { + "epoch": 0.3632261703327693, + "grad_norm": 5.437366962432861, + "learning_rate": 4.8186689227298365e-05, + "loss": 1.563, + "step": 644 + }, + { + "epoch": 0.3637901861252115, + "grad_norm": 5.520788192749023, + "learning_rate": 4.818386914833616e-05, + "loss": 1.954, + "step": 645 + }, + { + "epoch": 0.3643542019176537, + "grad_norm": 3.3858766555786133, + "learning_rate": 4.818104906937395e-05, + "loss": 1.4374, + "step": 646 + }, + { + "epoch": 0.36491821771009586, + "grad_norm": 4.695160388946533, + "learning_rate": 4.8178228990411734e-05, + "loss": 1.4438, + "step": 647 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 4.566708087921143, + "learning_rate": 4.817540891144952e-05, + "loss": 1.6031, + "step": 648 + }, + { + "epoch": 0.36604624929498025, + "grad_norm": 4.352097511291504, + "learning_rate": 4.817258883248731e-05, + "loss": 1.4116, + "step": 649 + }, + { + "epoch": 0.36661026508742245, + "grad_norm": 4.989455223083496, + "learning_rate": 4.8169768753525104e-05, + "loss": 1.5832, + "step": 650 + }, + { + "epoch": 0.36717428087986465, + "grad_norm": 4.242460250854492, + "learning_rate": 4.816694867456289e-05, + "loss": 1.6338, + "step": 651 + }, + { + "epoch": 0.36773829667230684, + "grad_norm": 6.1508636474609375, + "learning_rate": 4.8164128595600675e-05, + "loss": 2.0134, + "step": 652 + }, + { + "epoch": 0.36830231246474904, + "grad_norm": 5.148771286010742, + "learning_rate": 4.816130851663847e-05, + "loss": 1.6446, + "step": 653 + }, + { + "epoch": 0.3688663282571912, + "grad_norm": 5.2455854415893555, + "learning_rate": 4.815848843767626e-05, + "loss": 1.6457, + "step": 654 + }, + { + "epoch": 0.3694303440496334, + "grad_norm": 3.4809508323669434, + "learning_rate": 4.8155668358714045e-05, + "loss": 1.3179, + "step": 655 + }, + { + "epoch": 0.36999435984207557, + "grad_norm": 7.121181011199951, + "learning_rate": 4.815284827975183e-05, + "loss": 1.7756, + "step": 656 + }, + { + "epoch": 0.37055837563451777, + "grad_norm": 4.632933139801025, + "learning_rate": 4.815002820078963e-05, + "loss": 1.8241, + "step": 657 + }, + { + "epoch": 0.37112239142695996, + "grad_norm": 9.443252563476562, + "learning_rate": 4.8147208121827415e-05, + "loss": 1.9422, + "step": 658 + }, + { + "epoch": 0.37168640721940216, + "grad_norm": 3.9334068298339844, + "learning_rate": 4.81443880428652e-05, + "loss": 1.3858, + "step": 659 + }, + { + "epoch": 0.37225042301184436, + "grad_norm": 7.9685378074646, + "learning_rate": 4.814156796390299e-05, + "loss": 2.2366, + "step": 660 + }, + { + "epoch": 0.3728144388042865, + "grad_norm": 6.410548686981201, + "learning_rate": 4.8138747884940784e-05, + "loss": 2.2615, + "step": 661 + }, + { + "epoch": 0.3733784545967287, + "grad_norm": 5.898066997528076, + "learning_rate": 4.813592780597857e-05, + "loss": 1.7433, + "step": 662 + }, + { + "epoch": 0.3739424703891709, + "grad_norm": 4.903052806854248, + "learning_rate": 4.8133107727016355e-05, + "loss": 1.6691, + "step": 663 + }, + { + "epoch": 0.3745064861816131, + "grad_norm": 4.625649452209473, + "learning_rate": 4.813028764805415e-05, + "loss": 1.752, + "step": 664 + }, + { + "epoch": 0.3750705019740553, + "grad_norm": 3.9270308017730713, + "learning_rate": 4.812746756909194e-05, + "loss": 1.7121, + "step": 665 + }, + { + "epoch": 0.3756345177664975, + "grad_norm": 5.840092182159424, + "learning_rate": 4.8124647490129725e-05, + "loss": 1.8341, + "step": 666 + }, + { + "epoch": 0.3761985335589397, + "grad_norm": 4.544562816619873, + "learning_rate": 4.812182741116752e-05, + "loss": 1.5548, + "step": 667 + }, + { + "epoch": 0.3767625493513818, + "grad_norm": 5.013934135437012, + "learning_rate": 4.81190073322053e-05, + "loss": 1.5487, + "step": 668 + }, + { + "epoch": 0.377326565143824, + "grad_norm": 3.7605910301208496, + "learning_rate": 4.8116187253243095e-05, + "loss": 1.462, + "step": 669 + }, + { + "epoch": 0.3778905809362662, + "grad_norm": 5.726848125457764, + "learning_rate": 4.811336717428088e-05, + "loss": 1.5579, + "step": 670 + }, + { + "epoch": 0.3784545967287084, + "grad_norm": 4.184903144836426, + "learning_rate": 4.811054709531867e-05, + "loss": 1.4205, + "step": 671 + }, + { + "epoch": 0.3790186125211506, + "grad_norm": 4.063729286193848, + "learning_rate": 4.810772701635646e-05, + "loss": 1.4437, + "step": 672 + }, + { + "epoch": 0.3795826283135928, + "grad_norm": 5.009604454040527, + "learning_rate": 4.810490693739425e-05, + "loss": 1.8427, + "step": 673 + }, + { + "epoch": 0.380146644106035, + "grad_norm": 6.359590530395508, + "learning_rate": 4.8102086858432035e-05, + "loss": 2.1179, + "step": 674 + }, + { + "epoch": 0.38071065989847713, + "grad_norm": 6.458817005157471, + "learning_rate": 4.809926677946983e-05, + "loss": 1.8501, + "step": 675 + }, + { + "epoch": 0.38127467569091933, + "grad_norm": 3.876912832260132, + "learning_rate": 4.809644670050762e-05, + "loss": 1.4618, + "step": 676 + }, + { + "epoch": 0.3818386914833615, + "grad_norm": 5.980722427368164, + "learning_rate": 4.8093626621545405e-05, + "loss": 1.5771, + "step": 677 + }, + { + "epoch": 0.3824027072758037, + "grad_norm": 5.717752456665039, + "learning_rate": 4.80908065425832e-05, + "loss": 1.4419, + "step": 678 + }, + { + "epoch": 0.3829667230682459, + "grad_norm": 6.589305877685547, + "learning_rate": 4.808798646362098e-05, + "loss": 1.8554, + "step": 679 + }, + { + "epoch": 0.3835307388606881, + "grad_norm": 4.387290954589844, + "learning_rate": 4.8085166384658775e-05, + "loss": 1.8585, + "step": 680 + }, + { + "epoch": 0.3840947546531303, + "grad_norm": 5.602443695068359, + "learning_rate": 4.808234630569656e-05, + "loss": 1.5524, + "step": 681 + }, + { + "epoch": 0.38465877044557245, + "grad_norm": 4.366858959197998, + "learning_rate": 4.807952622673435e-05, + "loss": 1.45, + "step": 682 + }, + { + "epoch": 0.38522278623801465, + "grad_norm": 4.116496562957764, + "learning_rate": 4.807670614777214e-05, + "loss": 1.711, + "step": 683 + }, + { + "epoch": 0.38578680203045684, + "grad_norm": 4.092191219329834, + "learning_rate": 4.807388606880993e-05, + "loss": 2.0551, + "step": 684 + }, + { + "epoch": 0.38635081782289904, + "grad_norm": 7.792598247528076, + "learning_rate": 4.807106598984772e-05, + "loss": 1.4864, + "step": 685 + }, + { + "epoch": 0.38691483361534124, + "grad_norm": 4.917025566101074, + "learning_rate": 4.806824591088551e-05, + "loss": 1.4545, + "step": 686 + }, + { + "epoch": 0.38747884940778343, + "grad_norm": 4.864682674407959, + "learning_rate": 4.806542583192329e-05, + "loss": 1.8185, + "step": 687 + }, + { + "epoch": 0.38804286520022563, + "grad_norm": 5.807408809661865, + "learning_rate": 4.8062605752961085e-05, + "loss": 1.7296, + "step": 688 + }, + { + "epoch": 0.38860688099266777, + "grad_norm": 5.002364158630371, + "learning_rate": 4.805978567399888e-05, + "loss": 1.607, + "step": 689 + }, + { + "epoch": 0.38917089678510997, + "grad_norm": 5.169330596923828, + "learning_rate": 4.805696559503666e-05, + "loss": 1.4286, + "step": 690 + }, + { + "epoch": 0.38973491257755216, + "grad_norm": 4.84013557434082, + "learning_rate": 4.805414551607445e-05, + "loss": 1.4559, + "step": 691 + }, + { + "epoch": 0.39029892836999436, + "grad_norm": 6.072085380554199, + "learning_rate": 4.805132543711224e-05, + "loss": 1.8596, + "step": 692 + }, + { + "epoch": 0.39086294416243655, + "grad_norm": 7.027515888214111, + "learning_rate": 4.804850535815003e-05, + "loss": 1.8542, + "step": 693 + }, + { + "epoch": 0.39142695995487875, + "grad_norm": 5.156787395477295, + "learning_rate": 4.804568527918782e-05, + "loss": 1.4895, + "step": 694 + }, + { + "epoch": 0.39199097574732095, + "grad_norm": 4.624627590179443, + "learning_rate": 4.80428652002256e-05, + "loss": 1.757, + "step": 695 + }, + { + "epoch": 0.3925549915397631, + "grad_norm": 7.199217796325684, + "learning_rate": 4.80400451212634e-05, + "loss": 1.7138, + "step": 696 + }, + { + "epoch": 0.3931190073322053, + "grad_norm": 5.74238395690918, + "learning_rate": 4.803722504230119e-05, + "loss": 2.2512, + "step": 697 + }, + { + "epoch": 0.3936830231246475, + "grad_norm": 5.761086940765381, + "learning_rate": 4.803440496333897e-05, + "loss": 1.1714, + "step": 698 + }, + { + "epoch": 0.3942470389170897, + "grad_norm": 4.163524627685547, + "learning_rate": 4.8031584884376765e-05, + "loss": 1.6682, + "step": 699 + }, + { + "epoch": 0.3948110547095319, + "grad_norm": 4.202450752258301, + "learning_rate": 4.802876480541456e-05, + "loss": 1.2225, + "step": 700 + }, + { + "epoch": 0.39537507050197407, + "grad_norm": 7.286525249481201, + "learning_rate": 4.802594472645234e-05, + "loss": 2.0321, + "step": 701 + }, + { + "epoch": 0.39593908629441626, + "grad_norm": 7.129409313201904, + "learning_rate": 4.802312464749013e-05, + "loss": 1.8907, + "step": 702 + }, + { + "epoch": 0.3965031020868584, + "grad_norm": 3.5672855377197266, + "learning_rate": 4.802030456852792e-05, + "loss": 1.5021, + "step": 703 + }, + { + "epoch": 0.3970671178793006, + "grad_norm": 4.986635208129883, + "learning_rate": 4.801748448956571e-05, + "loss": 1.3472, + "step": 704 + }, + { + "epoch": 0.3976311336717428, + "grad_norm": 5.1451592445373535, + "learning_rate": 4.80146644106035e-05, + "loss": 1.5405, + "step": 705 + }, + { + "epoch": 0.398195149464185, + "grad_norm": 4.0332746505737305, + "learning_rate": 4.801184433164129e-05, + "loss": 1.3263, + "step": 706 + }, + { + "epoch": 0.3987591652566272, + "grad_norm": 6.908361911773682, + "learning_rate": 4.8009024252679076e-05, + "loss": 1.4708, + "step": 707 + }, + { + "epoch": 0.3993231810490694, + "grad_norm": 6.627707481384277, + "learning_rate": 4.800620417371687e-05, + "loss": 1.7572, + "step": 708 + }, + { + "epoch": 0.3998871968415116, + "grad_norm": 5.247193813323975, + "learning_rate": 4.800338409475465e-05, + "loss": 1.1656, + "step": 709 + }, + { + "epoch": 0.4004512126339537, + "grad_norm": 4.9000701904296875, + "learning_rate": 4.8000564015792445e-05, + "loss": 1.4231, + "step": 710 + }, + { + "epoch": 0.4010152284263959, + "grad_norm": 5.301180839538574, + "learning_rate": 4.799774393683024e-05, + "loss": 1.6861, + "step": 711 + }, + { + "epoch": 0.4015792442188381, + "grad_norm": 6.078762054443359, + "learning_rate": 4.799492385786802e-05, + "loss": 1.6196, + "step": 712 + }, + { + "epoch": 0.4021432600112803, + "grad_norm": 4.6258440017700195, + "learning_rate": 4.799210377890581e-05, + "loss": 1.3953, + "step": 713 + }, + { + "epoch": 0.4027072758037225, + "grad_norm": 4.39781379699707, + "learning_rate": 4.79892836999436e-05, + "loss": 1.8792, + "step": 714 + }, + { + "epoch": 0.4032712915961647, + "grad_norm": 3.6993212699890137, + "learning_rate": 4.798646362098139e-05, + "loss": 1.2314, + "step": 715 + }, + { + "epoch": 0.4038353073886069, + "grad_norm": 7.457652568817139, + "learning_rate": 4.798364354201918e-05, + "loss": 1.7322, + "step": 716 + }, + { + "epoch": 0.40439932318104904, + "grad_norm": 4.502429962158203, + "learning_rate": 4.798082346305697e-05, + "loss": 1.4232, + "step": 717 + }, + { + "epoch": 0.40496333897349124, + "grad_norm": 4.772620677947998, + "learning_rate": 4.7978003384094756e-05, + "loss": 1.6012, + "step": 718 + }, + { + "epoch": 0.40552735476593343, + "grad_norm": 3.804670810699463, + "learning_rate": 4.797518330513255e-05, + "loss": 1.3167, + "step": 719 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 4.739498615264893, + "learning_rate": 4.7972363226170333e-05, + "loss": 1.482, + "step": 720 + }, + { + "epoch": 0.4066553863508178, + "grad_norm": 6.62346887588501, + "learning_rate": 4.7969543147208126e-05, + "loss": 2.2499, + "step": 721 + }, + { + "epoch": 0.40721940214326, + "grad_norm": 4.494105339050293, + "learning_rate": 4.796672306824591e-05, + "loss": 1.6917, + "step": 722 + }, + { + "epoch": 0.4077834179357022, + "grad_norm": 3.672499179840088, + "learning_rate": 4.79639029892837e-05, + "loss": 1.4002, + "step": 723 + }, + { + "epoch": 0.4083474337281444, + "grad_norm": 6.008072376251221, + "learning_rate": 4.7961082910321495e-05, + "loss": 1.6709, + "step": 724 + }, + { + "epoch": 0.40891144952058656, + "grad_norm": 6.885530471801758, + "learning_rate": 4.795826283135928e-05, + "loss": 1.8886, + "step": 725 + }, + { + "epoch": 0.40947546531302875, + "grad_norm": 6.350656986236572, + "learning_rate": 4.7955442752397066e-05, + "loss": 1.6692, + "step": 726 + }, + { + "epoch": 0.41003948110547095, + "grad_norm": 3.938678741455078, + "learning_rate": 4.795262267343486e-05, + "loss": 1.4544, + "step": 727 + }, + { + "epoch": 0.41060349689791314, + "grad_norm": 3.9103355407714844, + "learning_rate": 4.794980259447265e-05, + "loss": 1.2516, + "step": 728 + }, + { + "epoch": 0.41116751269035534, + "grad_norm": 3.498885154724121, + "learning_rate": 4.7946982515510436e-05, + "loss": 1.4938, + "step": 729 + }, + { + "epoch": 0.41173152848279754, + "grad_norm": 4.206144332885742, + "learning_rate": 4.794416243654822e-05, + "loss": 1.739, + "step": 730 + }, + { + "epoch": 0.41229554427523973, + "grad_norm": 3.503406047821045, + "learning_rate": 4.7941342357586014e-05, + "loss": 1.3534, + "step": 731 + }, + { + "epoch": 0.4128595600676819, + "grad_norm": 4.370572090148926, + "learning_rate": 4.7938522278623806e-05, + "loss": 1.2779, + "step": 732 + }, + { + "epoch": 0.41342357586012407, + "grad_norm": 5.922369480133057, + "learning_rate": 4.793570219966159e-05, + "loss": 1.3747, + "step": 733 + }, + { + "epoch": 0.41398759165256627, + "grad_norm": 4.7200798988342285, + "learning_rate": 4.7932882120699377e-05, + "loss": 1.5566, + "step": 734 + }, + { + "epoch": 0.41455160744500846, + "grad_norm": 4.265335559844971, + "learning_rate": 4.7930062041737176e-05, + "loss": 1.5938, + "step": 735 + }, + { + "epoch": 0.41511562323745066, + "grad_norm": 6.244687557220459, + "learning_rate": 4.792724196277496e-05, + "loss": 1.7697, + "step": 736 + }, + { + "epoch": 0.41567963902989286, + "grad_norm": 3.7206320762634277, + "learning_rate": 4.7924421883812746e-05, + "loss": 1.429, + "step": 737 + }, + { + "epoch": 0.41624365482233505, + "grad_norm": 5.571257591247559, + "learning_rate": 4.792160180485054e-05, + "loss": 1.6286, + "step": 738 + }, + { + "epoch": 0.4168076706147772, + "grad_norm": 5.447929859161377, + "learning_rate": 4.791878172588833e-05, + "loss": 1.3975, + "step": 739 + }, + { + "epoch": 0.4173716864072194, + "grad_norm": 7.689380645751953, + "learning_rate": 4.7915961646926116e-05, + "loss": 1.7162, + "step": 740 + }, + { + "epoch": 0.4179357021996616, + "grad_norm": 5.2262749671936035, + "learning_rate": 4.79131415679639e-05, + "loss": 1.4383, + "step": 741 + }, + { + "epoch": 0.4184997179921038, + "grad_norm": 5.1292724609375, + "learning_rate": 4.7910321489001694e-05, + "loss": 1.915, + "step": 742 + }, + { + "epoch": 0.419063733784546, + "grad_norm": 4.560790061950684, + "learning_rate": 4.7907501410039486e-05, + "loss": 1.3462, + "step": 743 + }, + { + "epoch": 0.4196277495769882, + "grad_norm": 4.293846130371094, + "learning_rate": 4.790468133107727e-05, + "loss": 1.5358, + "step": 744 + }, + { + "epoch": 0.42019176536943037, + "grad_norm": 5.441418647766113, + "learning_rate": 4.790186125211506e-05, + "loss": 1.666, + "step": 745 + }, + { + "epoch": 0.4207557811618725, + "grad_norm": 6.321639060974121, + "learning_rate": 4.7899041173152856e-05, + "loss": 1.7921, + "step": 746 + }, + { + "epoch": 0.4213197969543147, + "grad_norm": 4.5405731201171875, + "learning_rate": 4.789622109419064e-05, + "loss": 1.3451, + "step": 747 + }, + { + "epoch": 0.4218838127467569, + "grad_norm": 4.499442100524902, + "learning_rate": 4.7893401015228427e-05, + "loss": 1.8243, + "step": 748 + }, + { + "epoch": 0.4224478285391991, + "grad_norm": 5.120096206665039, + "learning_rate": 4.789058093626622e-05, + "loss": 1.3463, + "step": 749 + }, + { + "epoch": 0.4230118443316413, + "grad_norm": 5.558868408203125, + "learning_rate": 4.788776085730401e-05, + "loss": 1.8468, + "step": 750 + }, + { + "epoch": 0.4235758601240835, + "grad_norm": 3.769968032836914, + "learning_rate": 4.7884940778341796e-05, + "loss": 1.641, + "step": 751 + }, + { + "epoch": 0.4241398759165257, + "grad_norm": 5.449748516082764, + "learning_rate": 4.788212069937958e-05, + "loss": 1.6218, + "step": 752 + }, + { + "epoch": 0.42470389170896783, + "grad_norm": 5.050848007202148, + "learning_rate": 4.7879300620417374e-05, + "loss": 1.5783, + "step": 753 + }, + { + "epoch": 0.42526790750141, + "grad_norm": 4.183392524719238, + "learning_rate": 4.7876480541455166e-05, + "loss": 1.6246, + "step": 754 + }, + { + "epoch": 0.4258319232938522, + "grad_norm": 4.515164852142334, + "learning_rate": 4.787366046249295e-05, + "loss": 1.73, + "step": 755 + }, + { + "epoch": 0.4263959390862944, + "grad_norm": 5.233850479125977, + "learning_rate": 4.7870840383530744e-05, + "loss": 1.4208, + "step": 756 + }, + { + "epoch": 0.4269599548787366, + "grad_norm": 6.387871742248535, + "learning_rate": 4.786802030456853e-05, + "loss": 1.5098, + "step": 757 + }, + { + "epoch": 0.4275239706711788, + "grad_norm": 7.004554748535156, + "learning_rate": 4.786520022560632e-05, + "loss": 2.233, + "step": 758 + }, + { + "epoch": 0.428087986463621, + "grad_norm": 5.064212322235107, + "learning_rate": 4.786238014664411e-05, + "loss": 1.5617, + "step": 759 + }, + { + "epoch": 0.42865200225606315, + "grad_norm": 5.972962856292725, + "learning_rate": 4.78595600676819e-05, + "loss": 1.5475, + "step": 760 + }, + { + "epoch": 0.42921601804850534, + "grad_norm": 5.623086452484131, + "learning_rate": 4.7856739988719684e-05, + "loss": 2.0142, + "step": 761 + }, + { + "epoch": 0.42978003384094754, + "grad_norm": 5.080687522888184, + "learning_rate": 4.7853919909757476e-05, + "loss": 1.3863, + "step": 762 + }, + { + "epoch": 0.43034404963338974, + "grad_norm": 5.093840599060059, + "learning_rate": 4.785109983079526e-05, + "loss": 1.5306, + "step": 763 + }, + { + "epoch": 0.43090806542583193, + "grad_norm": 6.585203647613525, + "learning_rate": 4.7848279751833054e-05, + "loss": 1.9317, + "step": 764 + }, + { + "epoch": 0.43147208121827413, + "grad_norm": 4.596655368804932, + "learning_rate": 4.784545967287084e-05, + "loss": 1.9859, + "step": 765 + }, + { + "epoch": 0.4320360970107163, + "grad_norm": 3.598259687423706, + "learning_rate": 4.784263959390863e-05, + "loss": 1.3134, + "step": 766 + }, + { + "epoch": 0.43260011280315847, + "grad_norm": 4.596447467803955, + "learning_rate": 4.7839819514946424e-05, + "loss": 1.3446, + "step": 767 + }, + { + "epoch": 0.43316412859560066, + "grad_norm": 3.583453893661499, + "learning_rate": 4.783699943598421e-05, + "loss": 1.2823, + "step": 768 + }, + { + "epoch": 0.43372814438804286, + "grad_norm": 4.7837443351745605, + "learning_rate": 4.7834179357021995e-05, + "loss": 1.4983, + "step": 769 + }, + { + "epoch": 0.43429216018048505, + "grad_norm": 3.5563433170318604, + "learning_rate": 4.783135927805979e-05, + "loss": 1.1966, + "step": 770 + }, + { + "epoch": 0.43485617597292725, + "grad_norm": 4.639252662658691, + "learning_rate": 4.782853919909758e-05, + "loss": 1.5083, + "step": 771 + }, + { + "epoch": 0.43542019176536945, + "grad_norm": 4.54072380065918, + "learning_rate": 4.7825719120135364e-05, + "loss": 1.264, + "step": 772 + }, + { + "epoch": 0.43598420755781164, + "grad_norm": 5.5049333572387695, + "learning_rate": 4.782289904117315e-05, + "loss": 1.4853, + "step": 773 + }, + { + "epoch": 0.4365482233502538, + "grad_norm": 3.721602439880371, + "learning_rate": 4.782007896221095e-05, + "loss": 1.0473, + "step": 774 + }, + { + "epoch": 0.437112239142696, + "grad_norm": 5.669522762298584, + "learning_rate": 4.7817258883248734e-05, + "loss": 1.5778, + "step": 775 + }, + { + "epoch": 0.4376762549351382, + "grad_norm": 4.561578750610352, + "learning_rate": 4.781443880428652e-05, + "loss": 1.5905, + "step": 776 + }, + { + "epoch": 0.43824027072758037, + "grad_norm": 4.463866710662842, + "learning_rate": 4.781161872532431e-05, + "loss": 1.5064, + "step": 777 + }, + { + "epoch": 0.43880428652002257, + "grad_norm": 6.890807151794434, + "learning_rate": 4.7808798646362104e-05, + "loss": 1.9157, + "step": 778 + }, + { + "epoch": 0.43936830231246476, + "grad_norm": 7.229199409484863, + "learning_rate": 4.780597856739989e-05, + "loss": 2.0981, + "step": 779 + }, + { + "epoch": 0.43993231810490696, + "grad_norm": 8.785821914672852, + "learning_rate": 4.7803158488437675e-05, + "loss": 2.0926, + "step": 780 + }, + { + "epoch": 0.4404963338973491, + "grad_norm": 3.6694109439849854, + "learning_rate": 4.780033840947547e-05, + "loss": 1.297, + "step": 781 + }, + { + "epoch": 0.4410603496897913, + "grad_norm": 4.669529438018799, + "learning_rate": 4.779751833051326e-05, + "loss": 1.523, + "step": 782 + }, + { + "epoch": 0.4416243654822335, + "grad_norm": 5.214766025543213, + "learning_rate": 4.7794698251551045e-05, + "loss": 1.6815, + "step": 783 + }, + { + "epoch": 0.4421883812746757, + "grad_norm": 4.402360916137695, + "learning_rate": 4.779187817258883e-05, + "loss": 1.5948, + "step": 784 + }, + { + "epoch": 0.4427523970671179, + "grad_norm": 4.432865619659424, + "learning_rate": 4.778905809362663e-05, + "loss": 1.5232, + "step": 785 + }, + { + "epoch": 0.4433164128595601, + "grad_norm": 6.022193908691406, + "learning_rate": 4.7786238014664414e-05, + "loss": 1.9337, + "step": 786 + }, + { + "epoch": 0.4438804286520023, + "grad_norm": 5.746842384338379, + "learning_rate": 4.77834179357022e-05, + "loss": 2.0964, + "step": 787 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 4.699117660522461, + "learning_rate": 4.778059785673999e-05, + "loss": 1.4296, + "step": 788 + }, + { + "epoch": 0.4450084602368866, + "grad_norm": 5.434800148010254, + "learning_rate": 4.7777777777777784e-05, + "loss": 1.7995, + "step": 789 + }, + { + "epoch": 0.4455724760293288, + "grad_norm": 5.22377347946167, + "learning_rate": 4.777495769881557e-05, + "loss": 1.5467, + "step": 790 + }, + { + "epoch": 0.446136491821771, + "grad_norm": 5.332780838012695, + "learning_rate": 4.7772137619853355e-05, + "loss": 1.5601, + "step": 791 + }, + { + "epoch": 0.4467005076142132, + "grad_norm": 5.72987174987793, + "learning_rate": 4.776931754089115e-05, + "loss": 1.4731, + "step": 792 + }, + { + "epoch": 0.4472645234066554, + "grad_norm": 4.9892449378967285, + "learning_rate": 4.776649746192894e-05, + "loss": 1.3818, + "step": 793 + }, + { + "epoch": 0.4478285391990976, + "grad_norm": 6.095922946929932, + "learning_rate": 4.7763677382966725e-05, + "loss": 1.4111, + "step": 794 + }, + { + "epoch": 0.44839255499153974, + "grad_norm": 5.566129684448242, + "learning_rate": 4.776085730400452e-05, + "loss": 1.9047, + "step": 795 + }, + { + "epoch": 0.44895657078398193, + "grad_norm": 6.35567569732666, + "learning_rate": 4.77580372250423e-05, + "loss": 1.6796, + "step": 796 + }, + { + "epoch": 0.44952058657642413, + "grad_norm": 5.765298366546631, + "learning_rate": 4.7755217146080095e-05, + "loss": 1.5821, + "step": 797 + }, + { + "epoch": 0.4500846023688663, + "grad_norm": 4.962862968444824, + "learning_rate": 4.775239706711788e-05, + "loss": 1.8606, + "step": 798 + }, + { + "epoch": 0.4506486181613085, + "grad_norm": 3.4611527919769287, + "learning_rate": 4.774957698815567e-05, + "loss": 1.158, + "step": 799 + }, + { + "epoch": 0.4512126339537507, + "grad_norm": 5.233653545379639, + "learning_rate": 4.774675690919346e-05, + "loss": 1.9988, + "step": 800 + }, + { + "epoch": 0.4517766497461929, + "grad_norm": 5.115118980407715, + "learning_rate": 4.774393683023125e-05, + "loss": 1.3694, + "step": 801 + }, + { + "epoch": 0.45234066553863506, + "grad_norm": 5.990991592407227, + "learning_rate": 4.7741116751269035e-05, + "loss": 1.4975, + "step": 802 + }, + { + "epoch": 0.45290468133107725, + "grad_norm": 4.717063903808594, + "learning_rate": 4.773829667230683e-05, + "loss": 1.6726, + "step": 803 + }, + { + "epoch": 0.45346869712351945, + "grad_norm": 5.268399715423584, + "learning_rate": 4.773547659334461e-05, + "loss": 1.7507, + "step": 804 + }, + { + "epoch": 0.45403271291596164, + "grad_norm": 6.096558094024658, + "learning_rate": 4.7732656514382405e-05, + "loss": 1.7875, + "step": 805 + }, + { + "epoch": 0.45459672870840384, + "grad_norm": 3.6054141521453857, + "learning_rate": 4.77298364354202e-05, + "loss": 1.3721, + "step": 806 + }, + { + "epoch": 0.45516074450084604, + "grad_norm": 6.381369590759277, + "learning_rate": 4.772701635645798e-05, + "loss": 1.8327, + "step": 807 + }, + { + "epoch": 0.45572476029328823, + "grad_norm": 4.978155612945557, + "learning_rate": 4.772419627749577e-05, + "loss": 1.3754, + "step": 808 + }, + { + "epoch": 0.4562887760857304, + "grad_norm": 4.621859073638916, + "learning_rate": 4.772137619853356e-05, + "loss": 1.471, + "step": 809 + }, + { + "epoch": 0.45685279187817257, + "grad_norm": 5.185132026672363, + "learning_rate": 4.771855611957135e-05, + "loss": 1.523, + "step": 810 + }, + { + "epoch": 0.45741680767061477, + "grad_norm": 4.730311393737793, + "learning_rate": 4.771573604060914e-05, + "loss": 1.5808, + "step": 811 + }, + { + "epoch": 0.45798082346305696, + "grad_norm": 5.031077861785889, + "learning_rate": 4.771291596164692e-05, + "loss": 1.9932, + "step": 812 + }, + { + "epoch": 0.45854483925549916, + "grad_norm": 5.05968713760376, + "learning_rate": 4.771009588268472e-05, + "loss": 1.8144, + "step": 813 + }, + { + "epoch": 0.45910885504794136, + "grad_norm": 7.4864726066589355, + "learning_rate": 4.770727580372251e-05, + "loss": 1.6338, + "step": 814 + }, + { + "epoch": 0.45967287084038355, + "grad_norm": 5.4765448570251465, + "learning_rate": 4.770445572476029e-05, + "loss": 1.6989, + "step": 815 + }, + { + "epoch": 0.4602368866328257, + "grad_norm": 6.020376205444336, + "learning_rate": 4.7701635645798085e-05, + "loss": 1.6365, + "step": 816 + }, + { + "epoch": 0.4608009024252679, + "grad_norm": 4.481546401977539, + "learning_rate": 4.769881556683588e-05, + "loss": 1.251, + "step": 817 + }, + { + "epoch": 0.4613649182177101, + "grad_norm": 6.599130630493164, + "learning_rate": 4.769599548787366e-05, + "loss": 1.8048, + "step": 818 + }, + { + "epoch": 0.4619289340101523, + "grad_norm": 6.169004440307617, + "learning_rate": 4.769317540891145e-05, + "loss": 1.4166, + "step": 819 + }, + { + "epoch": 0.4624929498025945, + "grad_norm": 13.822855949401855, + "learning_rate": 4.769035532994924e-05, + "loss": 1.5824, + "step": 820 + }, + { + "epoch": 0.4630569655950367, + "grad_norm": 6.219581604003906, + "learning_rate": 4.768753525098703e-05, + "loss": 1.6948, + "step": 821 + }, + { + "epoch": 0.46362098138747887, + "grad_norm": 3.521306276321411, + "learning_rate": 4.768471517202482e-05, + "loss": 1.5382, + "step": 822 + }, + { + "epoch": 0.464184997179921, + "grad_norm": 4.865744590759277, + "learning_rate": 4.76818950930626e-05, + "loss": 1.5762, + "step": 823 + }, + { + "epoch": 0.4647490129723632, + "grad_norm": 5.07082986831665, + "learning_rate": 4.76790750141004e-05, + "loss": 1.4356, + "step": 824 + }, + { + "epoch": 0.4653130287648054, + "grad_norm": 6.141546726226807, + "learning_rate": 4.767625493513819e-05, + "loss": 1.9316, + "step": 825 + }, + { + "epoch": 0.4658770445572476, + "grad_norm": 4.523364067077637, + "learning_rate": 4.767343485617597e-05, + "loss": 1.38, + "step": 826 + }, + { + "epoch": 0.4664410603496898, + "grad_norm": 4.716594696044922, + "learning_rate": 4.7670614777213765e-05, + "loss": 1.439, + "step": 827 + }, + { + "epoch": 0.467005076142132, + "grad_norm": 3.777954578399658, + "learning_rate": 4.766779469825156e-05, + "loss": 1.1427, + "step": 828 + }, + { + "epoch": 0.4675690919345742, + "grad_norm": 4.496201992034912, + "learning_rate": 4.766497461928934e-05, + "loss": 1.6929, + "step": 829 + }, + { + "epoch": 0.46813310772701633, + "grad_norm": 6.150945663452148, + "learning_rate": 4.766215454032713e-05, + "loss": 1.6808, + "step": 830 + }, + { + "epoch": 0.4686971235194585, + "grad_norm": 3.585253953933716, + "learning_rate": 4.765933446136492e-05, + "loss": 1.4936, + "step": 831 + }, + { + "epoch": 0.4692611393119007, + "grad_norm": 6.16508150100708, + "learning_rate": 4.765651438240271e-05, + "loss": 1.5498, + "step": 832 + }, + { + "epoch": 0.4698251551043429, + "grad_norm": 3.0787978172302246, + "learning_rate": 4.76536943034405e-05, + "loss": 1.2031, + "step": 833 + }, + { + "epoch": 0.4703891708967851, + "grad_norm": 16.18996238708496, + "learning_rate": 4.765087422447829e-05, + "loss": 2.2054, + "step": 834 + }, + { + "epoch": 0.4709531866892273, + "grad_norm": 5.382989883422852, + "learning_rate": 4.7648054145516076e-05, + "loss": 1.4005, + "step": 835 + }, + { + "epoch": 0.4715172024816695, + "grad_norm": 4.832571029663086, + "learning_rate": 4.764523406655387e-05, + "loss": 1.2832, + "step": 836 + }, + { + "epoch": 0.4720812182741117, + "grad_norm": 3.9150424003601074, + "learning_rate": 4.764241398759165e-05, + "loss": 1.2502, + "step": 837 + }, + { + "epoch": 0.47264523406655384, + "grad_norm": 4.660031795501709, + "learning_rate": 4.7639593908629445e-05, + "loss": 1.6151, + "step": 838 + }, + { + "epoch": 0.47320924985899604, + "grad_norm": 6.7347846031188965, + "learning_rate": 4.763677382966723e-05, + "loss": 2.2231, + "step": 839 + }, + { + "epoch": 0.47377326565143824, + "grad_norm": 5.890491485595703, + "learning_rate": 4.763395375070502e-05, + "loss": 1.6036, + "step": 840 + }, + { + "epoch": 0.47433728144388043, + "grad_norm": 4.828084945678711, + "learning_rate": 4.763113367174281e-05, + "loss": 1.6229, + "step": 841 + }, + { + "epoch": 0.47490129723632263, + "grad_norm": 5.685123920440674, + "learning_rate": 4.76283135927806e-05, + "loss": 1.6329, + "step": 842 + }, + { + "epoch": 0.4754653130287648, + "grad_norm": 6.433988571166992, + "learning_rate": 4.7625493513818386e-05, + "loss": 1.6685, + "step": 843 + }, + { + "epoch": 0.476029328821207, + "grad_norm": 4.614927291870117, + "learning_rate": 4.762267343485618e-05, + "loss": 1.4287, + "step": 844 + }, + { + "epoch": 0.47659334461364916, + "grad_norm": 5.270617961883545, + "learning_rate": 4.761985335589397e-05, + "loss": 1.5033, + "step": 845 + }, + { + "epoch": 0.47715736040609136, + "grad_norm": 4.784862518310547, + "learning_rate": 4.7617033276931756e-05, + "loss": 1.2859, + "step": 846 + }, + { + "epoch": 0.47772137619853355, + "grad_norm": 5.0521464347839355, + "learning_rate": 4.761421319796954e-05, + "loss": 1.5614, + "step": 847 + }, + { + "epoch": 0.47828539199097575, + "grad_norm": 6.971360683441162, + "learning_rate": 4.761139311900733e-05, + "loss": 1.6656, + "step": 848 + }, + { + "epoch": 0.47884940778341795, + "grad_norm": 6.047747611999512, + "learning_rate": 4.7608573040045125e-05, + "loss": 1.7495, + "step": 849 + }, + { + "epoch": 0.47941342357586014, + "grad_norm": 5.108450412750244, + "learning_rate": 4.760575296108291e-05, + "loss": 1.4721, + "step": 850 + }, + { + "epoch": 0.47997743936830234, + "grad_norm": 4.792567729949951, + "learning_rate": 4.76029328821207e-05, + "loss": 1.3218, + "step": 851 + }, + { + "epoch": 0.4805414551607445, + "grad_norm": 4.252721309661865, + "learning_rate": 4.7600112803158495e-05, + "loss": 1.4168, + "step": 852 + }, + { + "epoch": 0.4811054709531867, + "grad_norm": 4.247704029083252, + "learning_rate": 4.759729272419628e-05, + "loss": 1.5015, + "step": 853 + }, + { + "epoch": 0.48166948674562887, + "grad_norm": 4.214919567108154, + "learning_rate": 4.7594472645234066e-05, + "loss": 1.227, + "step": 854 + }, + { + "epoch": 0.48223350253807107, + "grad_norm": 4.828568935394287, + "learning_rate": 4.759165256627186e-05, + "loss": 1.5855, + "step": 855 + }, + { + "epoch": 0.48279751833051326, + "grad_norm": 4.8427300453186035, + "learning_rate": 4.758883248730965e-05, + "loss": 1.3818, + "step": 856 + }, + { + "epoch": 0.48336153412295546, + "grad_norm": 7.4816508293151855, + "learning_rate": 4.7586012408347436e-05, + "loss": 1.7711, + "step": 857 + }, + { + "epoch": 0.48392554991539766, + "grad_norm": 4.8959197998046875, + "learning_rate": 4.758319232938522e-05, + "loss": 1.3013, + "step": 858 + }, + { + "epoch": 0.4844895657078398, + "grad_norm": 2.958606719970703, + "learning_rate": 4.7580372250423013e-05, + "loss": 1.1763, + "step": 859 + }, + { + "epoch": 0.485053581500282, + "grad_norm": 5.566822528839111, + "learning_rate": 4.7577552171460806e-05, + "loss": 1.5608, + "step": 860 + }, + { + "epoch": 0.4856175972927242, + "grad_norm": 5.040197849273682, + "learning_rate": 4.757473209249859e-05, + "loss": 1.5962, + "step": 861 + }, + { + "epoch": 0.4861816130851664, + "grad_norm": 5.389475345611572, + "learning_rate": 4.7571912013536376e-05, + "loss": 1.5931, + "step": 862 + }, + { + "epoch": 0.4867456288776086, + "grad_norm": 4.0164570808410645, + "learning_rate": 4.7569091934574175e-05, + "loss": 1.467, + "step": 863 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 4.589940547943115, + "learning_rate": 4.756627185561196e-05, + "loss": 1.5394, + "step": 864 + }, + { + "epoch": 0.487873660462493, + "grad_norm": 4.398247718811035, + "learning_rate": 4.7563451776649746e-05, + "loss": 1.6913, + "step": 865 + }, + { + "epoch": 0.4884376762549351, + "grad_norm": 4.154451370239258, + "learning_rate": 4.756063169768754e-05, + "loss": 1.1263, + "step": 866 + }, + { + "epoch": 0.4890016920473773, + "grad_norm": 7.622505187988281, + "learning_rate": 4.755781161872533e-05, + "loss": 1.2621, + "step": 867 + }, + { + "epoch": 0.4895657078398195, + "grad_norm": 4.0905938148498535, + "learning_rate": 4.7554991539763116e-05, + "loss": 1.2517, + "step": 868 + }, + { + "epoch": 0.4901297236322617, + "grad_norm": 3.4048922061920166, + "learning_rate": 4.75521714608009e-05, + "loss": 1.7511, + "step": 869 + }, + { + "epoch": 0.4906937394247039, + "grad_norm": 3.9028964042663574, + "learning_rate": 4.7549351381838694e-05, + "loss": 1.5472, + "step": 870 + }, + { + "epoch": 0.4912577552171461, + "grad_norm": 4.3720574378967285, + "learning_rate": 4.7546531302876486e-05, + "loss": 1.487, + "step": 871 + }, + { + "epoch": 0.4918217710095883, + "grad_norm": 4.1365532875061035, + "learning_rate": 4.754371122391427e-05, + "loss": 1.3882, + "step": 872 + }, + { + "epoch": 0.49238578680203043, + "grad_norm": 5.727081298828125, + "learning_rate": 4.7540891144952057e-05, + "loss": 1.9381, + "step": 873 + }, + { + "epoch": 0.49294980259447263, + "grad_norm": 3.7722771167755127, + "learning_rate": 4.753807106598985e-05, + "loss": 1.5229, + "step": 874 + }, + { + "epoch": 0.4935138183869148, + "grad_norm": 3.5545053482055664, + "learning_rate": 4.753525098702764e-05, + "loss": 1.262, + "step": 875 + }, + { + "epoch": 0.494077834179357, + "grad_norm": 3.930879831314087, + "learning_rate": 4.7532430908065426e-05, + "loss": 1.4009, + "step": 876 + }, + { + "epoch": 0.4946418499717992, + "grad_norm": 4.266712188720703, + "learning_rate": 4.752961082910322e-05, + "loss": 1.2, + "step": 877 + }, + { + "epoch": 0.4952058657642414, + "grad_norm": 4.848991870880127, + "learning_rate": 4.7526790750141004e-05, + "loss": 1.6068, + "step": 878 + }, + { + "epoch": 0.4957698815566836, + "grad_norm": 7.084873199462891, + "learning_rate": 4.7523970671178796e-05, + "loss": 1.6705, + "step": 879 + }, + { + "epoch": 0.49633389734912575, + "grad_norm": 4.954356670379639, + "learning_rate": 4.752115059221658e-05, + "loss": 1.3036, + "step": 880 + }, + { + "epoch": 0.49689791314156795, + "grad_norm": 3.3608322143554688, + "learning_rate": 4.7518330513254374e-05, + "loss": 1.202, + "step": 881 + }, + { + "epoch": 0.49746192893401014, + "grad_norm": 4.396366596221924, + "learning_rate": 4.751551043429216e-05, + "loss": 1.4466, + "step": 882 + }, + { + "epoch": 0.49802594472645234, + "grad_norm": 5.651327610015869, + "learning_rate": 4.751269035532995e-05, + "loss": 1.4611, + "step": 883 + }, + { + "epoch": 0.49858996051889454, + "grad_norm": 4.6090569496154785, + "learning_rate": 4.7509870276367744e-05, + "loss": 1.2902, + "step": 884 + }, + { + "epoch": 0.49915397631133673, + "grad_norm": 4.125630855560303, + "learning_rate": 4.750705019740553e-05, + "loss": 1.1521, + "step": 885 + }, + { + "epoch": 0.49971799210377893, + "grad_norm": 3.369593381881714, + "learning_rate": 4.750423011844332e-05, + "loss": 1.1004, + "step": 886 + }, + { + "epoch": 0.5002820078962211, + "grad_norm": 4.558630466461182, + "learning_rate": 4.7501410039481107e-05, + "loss": 1.6133, + "step": 887 + }, + { + "epoch": 0.5008460236886633, + "grad_norm": 4.221480846405029, + "learning_rate": 4.74985899605189e-05, + "loss": 1.5455, + "step": 888 + }, + { + "epoch": 0.5014100394811055, + "grad_norm": 6.021843433380127, + "learning_rate": 4.7495769881556684e-05, + "loss": 1.7863, + "step": 889 + }, + { + "epoch": 0.5019740552735477, + "grad_norm": 4.371219158172607, + "learning_rate": 4.7492949802594476e-05, + "loss": 1.3943, + "step": 890 + }, + { + "epoch": 0.5025380710659898, + "grad_norm": 5.980149269104004, + "learning_rate": 4.749012972363226e-05, + "loss": 1.7475, + "step": 891 + }, + { + "epoch": 0.503102086858432, + "grad_norm": 5.3358869552612305, + "learning_rate": 4.7487309644670054e-05, + "loss": 1.3633, + "step": 892 + }, + { + "epoch": 0.5036661026508742, + "grad_norm": 4.372663974761963, + "learning_rate": 4.748448956570784e-05, + "loss": 1.3356, + "step": 893 + }, + { + "epoch": 0.5042301184433164, + "grad_norm": 5.865492343902588, + "learning_rate": 4.748166948674563e-05, + "loss": 1.3771, + "step": 894 + }, + { + "epoch": 0.5047941342357586, + "grad_norm": 6.365415573120117, + "learning_rate": 4.7478849407783424e-05, + "loss": 2.0435, + "step": 895 + }, + { + "epoch": 0.5053581500282008, + "grad_norm": 4.093962669372559, + "learning_rate": 4.747602932882121e-05, + "loss": 1.2564, + "step": 896 + }, + { + "epoch": 0.505922165820643, + "grad_norm": 4.786006450653076, + "learning_rate": 4.7473209249858994e-05, + "loss": 1.6207, + "step": 897 + }, + { + "epoch": 0.5064861816130851, + "grad_norm": 5.884952545166016, + "learning_rate": 4.747038917089679e-05, + "loss": 1.5804, + "step": 898 + }, + { + "epoch": 0.5070501974055274, + "grad_norm": 3.5150692462921143, + "learning_rate": 4.746756909193458e-05, + "loss": 1.3337, + "step": 899 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 4.533715724945068, + "learning_rate": 4.7464749012972364e-05, + "loss": 1.3547, + "step": 900 + }, + { + "epoch": 0.5081782289904118, + "grad_norm": 3.3490161895751953, + "learning_rate": 4.746192893401015e-05, + "loss": 1.2021, + "step": 901 + }, + { + "epoch": 0.5087422447828539, + "grad_norm": 3.3067619800567627, + "learning_rate": 4.745910885504795e-05, + "loss": 1.2757, + "step": 902 + }, + { + "epoch": 0.5093062605752962, + "grad_norm": 5.169573783874512, + "learning_rate": 4.7456288776085734e-05, + "loss": 1.5756, + "step": 903 + }, + { + "epoch": 0.5098702763677383, + "grad_norm": 4.788201332092285, + "learning_rate": 4.745346869712352e-05, + "loss": 1.5154, + "step": 904 + }, + { + "epoch": 0.5104342921601804, + "grad_norm": 3.732741594314575, + "learning_rate": 4.745064861816131e-05, + "loss": 1.1296, + "step": 905 + }, + { + "epoch": 0.5109983079526227, + "grad_norm": 5.7042059898376465, + "learning_rate": 4.7447828539199104e-05, + "loss": 1.8871, + "step": 906 + }, + { + "epoch": 0.5115623237450648, + "grad_norm": 6.19961404800415, + "learning_rate": 4.744500846023689e-05, + "loss": 1.5049, + "step": 907 + }, + { + "epoch": 0.5121263395375071, + "grad_norm": 5.345874786376953, + "learning_rate": 4.7442188381274675e-05, + "loss": 1.4233, + "step": 908 + }, + { + "epoch": 0.5126903553299492, + "grad_norm": 7.190402507781982, + "learning_rate": 4.743936830231247e-05, + "loss": 1.4327, + "step": 909 + }, + { + "epoch": 0.5132543711223915, + "grad_norm": 4.683623790740967, + "learning_rate": 4.743654822335026e-05, + "loss": 1.2834, + "step": 910 + }, + { + "epoch": 0.5138183869148336, + "grad_norm": 4.282026290893555, + "learning_rate": 4.7433728144388044e-05, + "loss": 1.1772, + "step": 911 + }, + { + "epoch": 0.5143824027072758, + "grad_norm": 4.766520023345947, + "learning_rate": 4.743090806542583e-05, + "loss": 1.4546, + "step": 912 + }, + { + "epoch": 0.514946418499718, + "grad_norm": 3.8123104572296143, + "learning_rate": 4.742808798646362e-05, + "loss": 1.1405, + "step": 913 + }, + { + "epoch": 0.5155104342921601, + "grad_norm": 4.433116436004639, + "learning_rate": 4.7425267907501414e-05, + "loss": 1.2994, + "step": 914 + }, + { + "epoch": 0.5160744500846024, + "grad_norm": 5.990032196044922, + "learning_rate": 4.74224478285392e-05, + "loss": 1.2777, + "step": 915 + }, + { + "epoch": 0.5166384658770445, + "grad_norm": 6.053679943084717, + "learning_rate": 4.741962774957699e-05, + "loss": 1.7565, + "step": 916 + }, + { + "epoch": 0.5172024816694868, + "grad_norm": 5.9099225997924805, + "learning_rate": 4.741680767061478e-05, + "loss": 1.5347, + "step": 917 + }, + { + "epoch": 0.5177664974619289, + "grad_norm": 4.726917266845703, + "learning_rate": 4.741398759165257e-05, + "loss": 1.6742, + "step": 918 + }, + { + "epoch": 0.5183305132543711, + "grad_norm": 4.63238525390625, + "learning_rate": 4.7411167512690355e-05, + "loss": 1.3593, + "step": 919 + }, + { + "epoch": 0.5188945290468133, + "grad_norm": 7.75998592376709, + "learning_rate": 4.740834743372815e-05, + "loss": 2.2917, + "step": 920 + }, + { + "epoch": 0.5194585448392555, + "grad_norm": 5.377588272094727, + "learning_rate": 4.740552735476594e-05, + "loss": 1.4179, + "step": 921 + }, + { + "epoch": 0.5200225606316977, + "grad_norm": 4.484057903289795, + "learning_rate": 4.7402707275803725e-05, + "loss": 1.4044, + "step": 922 + }, + { + "epoch": 0.5205865764241399, + "grad_norm": 7.017827987670898, + "learning_rate": 4.739988719684152e-05, + "loss": 1.6137, + "step": 923 + }, + { + "epoch": 0.5211505922165821, + "grad_norm": 8.925349235534668, + "learning_rate": 4.73970671178793e-05, + "loss": 1.3964, + "step": 924 + }, + { + "epoch": 0.5217146080090242, + "grad_norm": 6.732776165008545, + "learning_rate": 4.7394247038917094e-05, + "loss": 1.5037, + "step": 925 + }, + { + "epoch": 0.5222786238014664, + "grad_norm": 4.71398401260376, + "learning_rate": 4.739142695995488e-05, + "loss": 1.6152, + "step": 926 + }, + { + "epoch": 0.5228426395939086, + "grad_norm": 4.208507537841797, + "learning_rate": 4.738860688099267e-05, + "loss": 1.4456, + "step": 927 + }, + { + "epoch": 0.5234066553863508, + "grad_norm": 4.447089672088623, + "learning_rate": 4.738578680203046e-05, + "loss": 1.4132, + "step": 928 + }, + { + "epoch": 0.523970671178793, + "grad_norm": 4.618978023529053, + "learning_rate": 4.738296672306825e-05, + "loss": 1.3094, + "step": 929 + }, + { + "epoch": 0.5245346869712352, + "grad_norm": 7.274431228637695, + "learning_rate": 4.7380146644106035e-05, + "loss": 1.7548, + "step": 930 + }, + { + "epoch": 0.5250987027636774, + "grad_norm": 12.718948364257812, + "learning_rate": 4.737732656514383e-05, + "loss": 2.0359, + "step": 931 + }, + { + "epoch": 0.5256627185561196, + "grad_norm": 7.835977077484131, + "learning_rate": 4.737450648618161e-05, + "loss": 1.4348, + "step": 932 + }, + { + "epoch": 0.5262267343485617, + "grad_norm": 3.9860105514526367, + "learning_rate": 4.7371686407219405e-05, + "loss": 1.3108, + "step": 933 + }, + { + "epoch": 0.526790750141004, + "grad_norm": 5.122458457946777, + "learning_rate": 4.73688663282572e-05, + "loss": 1.5414, + "step": 934 + }, + { + "epoch": 0.5273547659334461, + "grad_norm": 4.282352924346924, + "learning_rate": 4.736604624929498e-05, + "loss": 1.1398, + "step": 935 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 6.446314334869385, + "learning_rate": 4.736322617033277e-05, + "loss": 1.6087, + "step": 936 + }, + { + "epoch": 0.5284827975183305, + "grad_norm": 4.96248197555542, + "learning_rate": 4.736040609137056e-05, + "loss": 1.3586, + "step": 937 + }, + { + "epoch": 0.5290468133107727, + "grad_norm": 4.520605564117432, + "learning_rate": 4.735758601240835e-05, + "loss": 1.381, + "step": 938 + }, + { + "epoch": 0.5296108291032149, + "grad_norm": 4.763948440551758, + "learning_rate": 4.735476593344614e-05, + "loss": 1.3554, + "step": 939 + }, + { + "epoch": 0.530174844895657, + "grad_norm": 4.022312164306641, + "learning_rate": 4.735194585448392e-05, + "loss": 1.1966, + "step": 940 + }, + { + "epoch": 0.5307388606880993, + "grad_norm": 4.31895637512207, + "learning_rate": 4.734912577552172e-05, + "loss": 1.5305, + "step": 941 + }, + { + "epoch": 0.5313028764805414, + "grad_norm": 5.3822431564331055, + "learning_rate": 4.734630569655951e-05, + "loss": 1.5174, + "step": 942 + }, + { + "epoch": 0.5318668922729837, + "grad_norm": 5.183292388916016, + "learning_rate": 4.734348561759729e-05, + "loss": 1.6546, + "step": 943 + }, + { + "epoch": 0.5324309080654258, + "grad_norm": 4.620934963226318, + "learning_rate": 4.7340665538635085e-05, + "loss": 1.5235, + "step": 944 + }, + { + "epoch": 0.5329949238578681, + "grad_norm": 4.343638896942139, + "learning_rate": 4.733784545967288e-05, + "loss": 1.4348, + "step": 945 + }, + { + "epoch": 0.5335589396503102, + "grad_norm": 6.408099174499512, + "learning_rate": 4.733502538071066e-05, + "loss": 1.4399, + "step": 946 + }, + { + "epoch": 0.5341229554427523, + "grad_norm": 5.784123420715332, + "learning_rate": 4.733220530174845e-05, + "loss": 1.5893, + "step": 947 + }, + { + "epoch": 0.5346869712351946, + "grad_norm": 6.168650150299072, + "learning_rate": 4.732938522278624e-05, + "loss": 2.0608, + "step": 948 + }, + { + "epoch": 0.5352509870276367, + "grad_norm": 4.084285736083984, + "learning_rate": 4.732656514382403e-05, + "loss": 1.3042, + "step": 949 + }, + { + "epoch": 0.535815002820079, + "grad_norm": 4.674496650695801, + "learning_rate": 4.732374506486182e-05, + "loss": 1.5589, + "step": 950 + }, + { + "epoch": 0.5363790186125211, + "grad_norm": 4.456788063049316, + "learning_rate": 4.73209249858996e-05, + "loss": 1.1799, + "step": 951 + }, + { + "epoch": 0.5369430344049634, + "grad_norm": 5.662277698516846, + "learning_rate": 4.7318104906937395e-05, + "loss": 1.6188, + "step": 952 + }, + { + "epoch": 0.5375070501974055, + "grad_norm": 6.241349697113037, + "learning_rate": 4.731528482797519e-05, + "loss": 1.3505, + "step": 953 + }, + { + "epoch": 0.5380710659898477, + "grad_norm": 5.405340671539307, + "learning_rate": 4.731246474901297e-05, + "loss": 1.4613, + "step": 954 + }, + { + "epoch": 0.5386350817822899, + "grad_norm": 5.641672134399414, + "learning_rate": 4.7309644670050765e-05, + "loss": 1.5226, + "step": 955 + }, + { + "epoch": 0.5391990975747321, + "grad_norm": 4.359735012054443, + "learning_rate": 4.730682459108856e-05, + "loss": 1.3432, + "step": 956 + }, + { + "epoch": 0.5397631133671743, + "grad_norm": 8.038213729858398, + "learning_rate": 4.730400451212634e-05, + "loss": 1.5942, + "step": 957 + }, + { + "epoch": 0.5403271291596164, + "grad_norm": 4.172324180603027, + "learning_rate": 4.730118443316413e-05, + "loss": 1.4227, + "step": 958 + }, + { + "epoch": 0.5408911449520587, + "grad_norm": 3.9340879917144775, + "learning_rate": 4.729836435420192e-05, + "loss": 1.398, + "step": 959 + }, + { + "epoch": 0.5414551607445008, + "grad_norm": 5.880470275878906, + "learning_rate": 4.729554427523971e-05, + "loss": 1.2695, + "step": 960 + }, + { + "epoch": 0.542019176536943, + "grad_norm": 5.079125881195068, + "learning_rate": 4.72927241962775e-05, + "loss": 1.5883, + "step": 961 + }, + { + "epoch": 0.5425831923293852, + "grad_norm": 7.554305076599121, + "learning_rate": 4.728990411731529e-05, + "loss": 1.6501, + "step": 962 + }, + { + "epoch": 0.5431472081218274, + "grad_norm": 6.508305549621582, + "learning_rate": 4.7287084038353075e-05, + "loss": 1.8115, + "step": 963 + }, + { + "epoch": 0.5437112239142696, + "grad_norm": 3.5275094509124756, + "learning_rate": 4.728426395939087e-05, + "loss": 1.1642, + "step": 964 + }, + { + "epoch": 0.5442752397067118, + "grad_norm": 4.028336524963379, + "learning_rate": 4.728144388042865e-05, + "loss": 1.5341, + "step": 965 + }, + { + "epoch": 0.544839255499154, + "grad_norm": 7.433475494384766, + "learning_rate": 4.7278623801466445e-05, + "loss": 1.4029, + "step": 966 + }, + { + "epoch": 0.5454032712915962, + "grad_norm": 6.400893211364746, + "learning_rate": 4.727580372250423e-05, + "loss": 1.9512, + "step": 967 + }, + { + "epoch": 0.5459672870840383, + "grad_norm": 4.490955352783203, + "learning_rate": 4.727298364354202e-05, + "loss": 1.4525, + "step": 968 + }, + { + "epoch": 0.5465313028764806, + "grad_norm": 4.518901348114014, + "learning_rate": 4.727016356457981e-05, + "loss": 1.7548, + "step": 969 + }, + { + "epoch": 0.5470953186689227, + "grad_norm": 6.96211576461792, + "learning_rate": 4.72673434856176e-05, + "loss": 1.8177, + "step": 970 + }, + { + "epoch": 0.5476593344613649, + "grad_norm": 7.2445549964904785, + "learning_rate": 4.7264523406655386e-05, + "loss": 1.9235, + "step": 971 + }, + { + "epoch": 0.5482233502538071, + "grad_norm": 3.515080451965332, + "learning_rate": 4.726170332769318e-05, + "loss": 1.145, + "step": 972 + }, + { + "epoch": 0.5487873660462493, + "grad_norm": 4.191176414489746, + "learning_rate": 4.725888324873097e-05, + "loss": 1.3666, + "step": 973 + }, + { + "epoch": 0.5493513818386915, + "grad_norm": 6.881161212921143, + "learning_rate": 4.7256063169768756e-05, + "loss": 1.9299, + "step": 974 + }, + { + "epoch": 0.5499153976311336, + "grad_norm": 5.375434398651123, + "learning_rate": 4.725324309080654e-05, + "loss": 1.6969, + "step": 975 + }, + { + "epoch": 0.5504794134235759, + "grad_norm": 5.391693115234375, + "learning_rate": 4.725042301184433e-05, + "loss": 1.5798, + "step": 976 + }, + { + "epoch": 0.551043429216018, + "grad_norm": 4.531365871429443, + "learning_rate": 4.7247602932882125e-05, + "loss": 1.4833, + "step": 977 + }, + { + "epoch": 0.5516074450084603, + "grad_norm": 4.380483627319336, + "learning_rate": 4.724478285391991e-05, + "loss": 1.6147, + "step": 978 + }, + { + "epoch": 0.5521714608009024, + "grad_norm": 6.106522560119629, + "learning_rate": 4.7241962774957696e-05, + "loss": 1.5462, + "step": 979 + }, + { + "epoch": 0.5527354765933447, + "grad_norm": 5.226863861083984, + "learning_rate": 4.7239142695995495e-05, + "loss": 1.5574, + "step": 980 + }, + { + "epoch": 0.5532994923857868, + "grad_norm": 4.208264350891113, + "learning_rate": 4.723632261703328e-05, + "loss": 1.0957, + "step": 981 + }, + { + "epoch": 0.5538635081782289, + "grad_norm": 3.1974427700042725, + "learning_rate": 4.7233502538071066e-05, + "loss": 1.2425, + "step": 982 + }, + { + "epoch": 0.5544275239706712, + "grad_norm": 3.6732070446014404, + "learning_rate": 4.723068245910886e-05, + "loss": 1.2944, + "step": 983 + }, + { + "epoch": 0.5549915397631133, + "grad_norm": 4.081805229187012, + "learning_rate": 4.722786238014665e-05, + "loss": 1.3236, + "step": 984 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 6.868940830230713, + "learning_rate": 4.7225042301184436e-05, + "loss": 1.5071, + "step": 985 + }, + { + "epoch": 0.5561195713479977, + "grad_norm": 4.509050369262695, + "learning_rate": 4.722222222222222e-05, + "loss": 1.3728, + "step": 986 + }, + { + "epoch": 0.55668358714044, + "grad_norm": 5.545350551605225, + "learning_rate": 4.721940214326001e-05, + "loss": 1.2762, + "step": 987 + }, + { + "epoch": 0.5572476029328821, + "grad_norm": 5.145623683929443, + "learning_rate": 4.7216582064297805e-05, + "loss": 1.3774, + "step": 988 + }, + { + "epoch": 0.5578116187253243, + "grad_norm": 5.648937225341797, + "learning_rate": 4.721376198533559e-05, + "loss": 1.3956, + "step": 989 + }, + { + "epoch": 0.5583756345177665, + "grad_norm": 5.251291275024414, + "learning_rate": 4.7210941906373376e-05, + "loss": 1.3934, + "step": 990 + }, + { + "epoch": 0.5589396503102086, + "grad_norm": 4.712246894836426, + "learning_rate": 4.7208121827411175e-05, + "loss": 1.4796, + "step": 991 + }, + { + "epoch": 0.5595036661026509, + "grad_norm": 4.796618461608887, + "learning_rate": 4.720530174844896e-05, + "loss": 1.4587, + "step": 992 + }, + { + "epoch": 0.560067681895093, + "grad_norm": 4.608601093292236, + "learning_rate": 4.7202481669486746e-05, + "loss": 1.5362, + "step": 993 + }, + { + "epoch": 0.5606316976875353, + "grad_norm": 3.825537919998169, + "learning_rate": 4.719966159052454e-05, + "loss": 1.1891, + "step": 994 + }, + { + "epoch": 0.5611957134799774, + "grad_norm": 4.159191131591797, + "learning_rate": 4.719684151156233e-05, + "loss": 1.729, + "step": 995 + }, + { + "epoch": 0.5617597292724196, + "grad_norm": 3.789294958114624, + "learning_rate": 4.7194021432600116e-05, + "loss": 1.0594, + "step": 996 + }, + { + "epoch": 0.5623237450648618, + "grad_norm": 5.194594383239746, + "learning_rate": 4.71912013536379e-05, + "loss": 1.6597, + "step": 997 + }, + { + "epoch": 0.562887760857304, + "grad_norm": 6.584299087524414, + "learning_rate": 4.7188381274675693e-05, + "loss": 1.8227, + "step": 998 + }, + { + "epoch": 0.5634517766497462, + "grad_norm": 4.753972053527832, + "learning_rate": 4.7185561195713486e-05, + "loss": 1.4077, + "step": 999 + }, + { + "epoch": 0.5640157924421884, + "grad_norm": 4.590651035308838, + "learning_rate": 4.718274111675127e-05, + "loss": 1.6161, + "step": 1000 + }, + { + "epoch": 0.5645798082346306, + "grad_norm": 3.6335256099700928, + "learning_rate": 4.717992103778906e-05, + "loss": 1.2916, + "step": 1001 + }, + { + "epoch": 0.5651438240270727, + "grad_norm": 4.254095554351807, + "learning_rate": 4.717710095882685e-05, + "loss": 1.3131, + "step": 1002 + }, + { + "epoch": 0.565707839819515, + "grad_norm": 6.68216609954834, + "learning_rate": 4.717428087986464e-05, + "loss": 1.4095, + "step": 1003 + }, + { + "epoch": 0.5662718556119571, + "grad_norm": 5.092117786407471, + "learning_rate": 4.7171460800902426e-05, + "loss": 1.4447, + "step": 1004 + }, + { + "epoch": 0.5668358714043993, + "grad_norm": 4.2928290367126465, + "learning_rate": 4.716864072194022e-05, + "loss": 1.1985, + "step": 1005 + }, + { + "epoch": 0.5673998871968415, + "grad_norm": 4.5257134437561035, + "learning_rate": 4.7165820642978004e-05, + "loss": 1.4575, + "step": 1006 + }, + { + "epoch": 0.5679639029892837, + "grad_norm": 6.190221309661865, + "learning_rate": 4.7163000564015796e-05, + "loss": 1.6871, + "step": 1007 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 5.199507236480713, + "learning_rate": 4.716018048505358e-05, + "loss": 1.452, + "step": 1008 + }, + { + "epoch": 0.5690919345741681, + "grad_norm": 4.901209354400635, + "learning_rate": 4.7157360406091374e-05, + "loss": 1.4872, + "step": 1009 + }, + { + "epoch": 0.5696559503666103, + "grad_norm": 16.918209075927734, + "learning_rate": 4.715454032712916e-05, + "loss": 1.6574, + "step": 1010 + }, + { + "epoch": 0.5702199661590525, + "grad_norm": 5.069075107574463, + "learning_rate": 4.715172024816695e-05, + "loss": 1.2585, + "step": 1011 + }, + { + "epoch": 0.5707839819514946, + "grad_norm": 4.456029415130615, + "learning_rate": 4.714890016920474e-05, + "loss": 1.3532, + "step": 1012 + }, + { + "epoch": 0.5713479977439369, + "grad_norm": 3.3560380935668945, + "learning_rate": 4.714608009024253e-05, + "loss": 1.1522, + "step": 1013 + }, + { + "epoch": 0.571912013536379, + "grad_norm": 5.678893566131592, + "learning_rate": 4.7143260011280314e-05, + "loss": 1.4785, + "step": 1014 + }, + { + "epoch": 0.5724760293288212, + "grad_norm": 5.488903045654297, + "learning_rate": 4.7140439932318106e-05, + "loss": 1.5242, + "step": 1015 + }, + { + "epoch": 0.5730400451212634, + "grad_norm": 4.371301651000977, + "learning_rate": 4.71376198533559e-05, + "loss": 1.7102, + "step": 1016 + }, + { + "epoch": 0.5736040609137056, + "grad_norm": 3.937553882598877, + "learning_rate": 4.7134799774393684e-05, + "loss": 1.4478, + "step": 1017 + }, + { + "epoch": 0.5741680767061478, + "grad_norm": 6.811775207519531, + "learning_rate": 4.713197969543147e-05, + "loss": 1.3043, + "step": 1018 + }, + { + "epoch": 0.5747320924985899, + "grad_norm": 5.312098503112793, + "learning_rate": 4.712915961646927e-05, + "loss": 1.1984, + "step": 1019 + }, + { + "epoch": 0.5752961082910322, + "grad_norm": 4.635124683380127, + "learning_rate": 4.7126339537507054e-05, + "loss": 1.206, + "step": 1020 + }, + { + "epoch": 0.5758601240834743, + "grad_norm": 5.623350143432617, + "learning_rate": 4.712351945854484e-05, + "loss": 1.474, + "step": 1021 + }, + { + "epoch": 0.5764241398759166, + "grad_norm": 3.448275327682495, + "learning_rate": 4.7120699379582625e-05, + "loss": 1.4183, + "step": 1022 + }, + { + "epoch": 0.5769881556683587, + "grad_norm": 3.9150991439819336, + "learning_rate": 4.7117879300620424e-05, + "loss": 1.334, + "step": 1023 + }, + { + "epoch": 0.577552171460801, + "grad_norm": 3.695573568344116, + "learning_rate": 4.711505922165821e-05, + "loss": 1.3809, + "step": 1024 + }, + { + "epoch": 0.5781161872532431, + "grad_norm": 5.144802093505859, + "learning_rate": 4.7112239142695994e-05, + "loss": 1.3596, + "step": 1025 + }, + { + "epoch": 0.5786802030456852, + "grad_norm": 5.942796230316162, + "learning_rate": 4.7109419063733787e-05, + "loss": 1.2634, + "step": 1026 + }, + { + "epoch": 0.5792442188381275, + "grad_norm": 5.043041229248047, + "learning_rate": 4.710659898477158e-05, + "loss": 1.2163, + "step": 1027 + }, + { + "epoch": 0.5798082346305696, + "grad_norm": 5.329798698425293, + "learning_rate": 4.7103778905809364e-05, + "loss": 1.5402, + "step": 1028 + }, + { + "epoch": 0.5803722504230119, + "grad_norm": 4.469917297363281, + "learning_rate": 4.710095882684715e-05, + "loss": 1.3338, + "step": 1029 + }, + { + "epoch": 0.580936266215454, + "grad_norm": 5.052639484405518, + "learning_rate": 4.709813874788495e-05, + "loss": 1.2879, + "step": 1030 + }, + { + "epoch": 0.5815002820078963, + "grad_norm": 4.784060955047607, + "learning_rate": 4.7095318668922734e-05, + "loss": 1.5122, + "step": 1031 + }, + { + "epoch": 0.5820642978003384, + "grad_norm": 3.4285976886749268, + "learning_rate": 4.709249858996052e-05, + "loss": 1.146, + "step": 1032 + }, + { + "epoch": 0.5826283135927806, + "grad_norm": 4.9975714683532715, + "learning_rate": 4.708967851099831e-05, + "loss": 1.3285, + "step": 1033 + }, + { + "epoch": 0.5831923293852228, + "grad_norm": 4.850039482116699, + "learning_rate": 4.7086858432036104e-05, + "loss": 1.3414, + "step": 1034 + }, + { + "epoch": 0.583756345177665, + "grad_norm": 6.464447975158691, + "learning_rate": 4.708403835307389e-05, + "loss": 2.1672, + "step": 1035 + }, + { + "epoch": 0.5843203609701072, + "grad_norm": 5.019586086273193, + "learning_rate": 4.7081218274111674e-05, + "loss": 1.5321, + "step": 1036 + }, + { + "epoch": 0.5848843767625493, + "grad_norm": 6.505669593811035, + "learning_rate": 4.707839819514947e-05, + "loss": 1.7181, + "step": 1037 + }, + { + "epoch": 0.5854483925549916, + "grad_norm": 5.703197479248047, + "learning_rate": 4.707557811618726e-05, + "loss": 1.5422, + "step": 1038 + }, + { + "epoch": 0.5860124083474337, + "grad_norm": 3.7636520862579346, + "learning_rate": 4.7072758037225044e-05, + "loss": 1.0493, + "step": 1039 + }, + { + "epoch": 0.5865764241398759, + "grad_norm": 5.7188286781311035, + "learning_rate": 4.706993795826283e-05, + "loss": 1.5416, + "step": 1040 + }, + { + "epoch": 0.5871404399323181, + "grad_norm": 4.347475051879883, + "learning_rate": 4.706711787930062e-05, + "loss": 1.3211, + "step": 1041 + }, + { + "epoch": 0.5877044557247603, + "grad_norm": 3.7370059490203857, + "learning_rate": 4.7064297800338414e-05, + "loss": 1.0893, + "step": 1042 + }, + { + "epoch": 0.5882684715172025, + "grad_norm": 4.905318260192871, + "learning_rate": 4.70614777213762e-05, + "loss": 1.3018, + "step": 1043 + }, + { + "epoch": 0.5888324873096447, + "grad_norm": 5.694608688354492, + "learning_rate": 4.705865764241399e-05, + "loss": 1.2191, + "step": 1044 + }, + { + "epoch": 0.5893965031020869, + "grad_norm": 5.125327110290527, + "learning_rate": 4.705583756345178e-05, + "loss": 1.3628, + "step": 1045 + }, + { + "epoch": 0.589960518894529, + "grad_norm": 4.872847557067871, + "learning_rate": 4.705301748448957e-05, + "loss": 1.4268, + "step": 1046 + }, + { + "epoch": 0.5905245346869712, + "grad_norm": 4.197105884552002, + "learning_rate": 4.7050197405527355e-05, + "loss": 1.1332, + "step": 1047 + }, + { + "epoch": 0.5910885504794134, + "grad_norm": 5.706971645355225, + "learning_rate": 4.704737732656515e-05, + "loss": 1.7608, + "step": 1048 + }, + { + "epoch": 0.5916525662718556, + "grad_norm": 4.329052448272705, + "learning_rate": 4.704455724760293e-05, + "loss": 1.3825, + "step": 1049 + }, + { + "epoch": 0.5922165820642978, + "grad_norm": 3.0934951305389404, + "learning_rate": 4.7041737168640724e-05, + "loss": 1.0695, + "step": 1050 + }, + { + "epoch": 0.59278059785674, + "grad_norm": 3.5555922985076904, + "learning_rate": 4.7038917089678517e-05, + "loss": 1.4081, + "step": 1051 + }, + { + "epoch": 0.5933446136491822, + "grad_norm": 4.603205680847168, + "learning_rate": 4.70360970107163e-05, + "loss": 1.1571, + "step": 1052 + }, + { + "epoch": 0.5939086294416244, + "grad_norm": 6.754583358764648, + "learning_rate": 4.703327693175409e-05, + "loss": 1.4483, + "step": 1053 + }, + { + "epoch": 0.5944726452340665, + "grad_norm": 5.298036575317383, + "learning_rate": 4.703045685279188e-05, + "loss": 1.4777, + "step": 1054 + }, + { + "epoch": 0.5950366610265088, + "grad_norm": 5.365951061248779, + "learning_rate": 4.702763677382967e-05, + "loss": 1.4004, + "step": 1055 + }, + { + "epoch": 0.5956006768189509, + "grad_norm": 3.769099473953247, + "learning_rate": 4.702481669486746e-05, + "loss": 1.2415, + "step": 1056 + }, + { + "epoch": 0.5961646926113932, + "grad_norm": 4.179627418518066, + "learning_rate": 4.702199661590524e-05, + "loss": 1.487, + "step": 1057 + }, + { + "epoch": 0.5967287084038353, + "grad_norm": 5.870244026184082, + "learning_rate": 4.7019176536943035e-05, + "loss": 1.4756, + "step": 1058 + }, + { + "epoch": 0.5972927241962775, + "grad_norm": 4.838006496429443, + "learning_rate": 4.701635645798083e-05, + "loss": 1.5291, + "step": 1059 + }, + { + "epoch": 0.5978567399887197, + "grad_norm": 4.393826007843018, + "learning_rate": 4.701353637901861e-05, + "loss": 1.364, + "step": 1060 + }, + { + "epoch": 0.5984207557811618, + "grad_norm": 5.573141574859619, + "learning_rate": 4.7010716300056405e-05, + "loss": 1.5174, + "step": 1061 + }, + { + "epoch": 0.5989847715736041, + "grad_norm": 4.820369243621826, + "learning_rate": 4.70078962210942e-05, + "loss": 1.2102, + "step": 1062 + }, + { + "epoch": 0.5995487873660462, + "grad_norm": 3.2018182277679443, + "learning_rate": 4.700507614213198e-05, + "loss": 1.0755, + "step": 1063 + }, + { + "epoch": 0.6001128031584885, + "grad_norm": 6.888023376464844, + "learning_rate": 4.700225606316977e-05, + "loss": 2.0819, + "step": 1064 + }, + { + "epoch": 0.6006768189509306, + "grad_norm": 3.0820305347442627, + "learning_rate": 4.699943598420756e-05, + "loss": 1.1287, + "step": 1065 + }, + { + "epoch": 0.6012408347433729, + "grad_norm": 4.0998430252075195, + "learning_rate": 4.699661590524535e-05, + "loss": 1.1594, + "step": 1066 + }, + { + "epoch": 0.601804850535815, + "grad_norm": 4.818719387054443, + "learning_rate": 4.699379582628314e-05, + "loss": 1.3246, + "step": 1067 + }, + { + "epoch": 0.6023688663282571, + "grad_norm": 4.393697261810303, + "learning_rate": 4.699097574732092e-05, + "loss": 1.1424, + "step": 1068 + }, + { + "epoch": 0.6029328821206994, + "grad_norm": 4.304764747619629, + "learning_rate": 4.698815566835872e-05, + "loss": 1.4859, + "step": 1069 + }, + { + "epoch": 0.6034968979131415, + "grad_norm": 6.398843288421631, + "learning_rate": 4.698533558939651e-05, + "loss": 1.6635, + "step": 1070 + }, + { + "epoch": 0.6040609137055838, + "grad_norm": 4.801956653594971, + "learning_rate": 4.698251551043429e-05, + "loss": 1.3703, + "step": 1071 + }, + { + "epoch": 0.6046249294980259, + "grad_norm": 6.089813232421875, + "learning_rate": 4.6979695431472085e-05, + "loss": 1.4315, + "step": 1072 + }, + { + "epoch": 0.6051889452904682, + "grad_norm": 4.9900031089782715, + "learning_rate": 4.697687535250988e-05, + "loss": 1.1915, + "step": 1073 + }, + { + "epoch": 0.6057529610829103, + "grad_norm": 4.183412075042725, + "learning_rate": 4.697405527354766e-05, + "loss": 1.1747, + "step": 1074 + }, + { + "epoch": 0.6063169768753525, + "grad_norm": 3.7212274074554443, + "learning_rate": 4.697123519458545e-05, + "loss": 1.1324, + "step": 1075 + }, + { + "epoch": 0.6068809926677947, + "grad_norm": 4.123123645782471, + "learning_rate": 4.696841511562324e-05, + "loss": 1.4551, + "step": 1076 + }, + { + "epoch": 0.6074450084602369, + "grad_norm": 3.900921106338501, + "learning_rate": 4.696559503666103e-05, + "loss": 1.0263, + "step": 1077 + }, + { + "epoch": 0.6080090242526791, + "grad_norm": 4.004858493804932, + "learning_rate": 4.696277495769882e-05, + "loss": 1.2244, + "step": 1078 + }, + { + "epoch": 0.6085730400451212, + "grad_norm": 4.2640533447265625, + "learning_rate": 4.69599548787366e-05, + "loss": 0.9947, + "step": 1079 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 5.407710075378418, + "learning_rate": 4.6957134799774395e-05, + "loss": 1.2679, + "step": 1080 + }, + { + "epoch": 0.6097010716300056, + "grad_norm": 4.379478454589844, + "learning_rate": 4.695431472081219e-05, + "loss": 1.4516, + "step": 1081 + }, + { + "epoch": 0.6102650874224478, + "grad_norm": 5.061854362487793, + "learning_rate": 4.695149464184997e-05, + "loss": 1.1557, + "step": 1082 + }, + { + "epoch": 0.61082910321489, + "grad_norm": 4.660594463348389, + "learning_rate": 4.6948674562887765e-05, + "loss": 1.1946, + "step": 1083 + }, + { + "epoch": 0.6113931190073322, + "grad_norm": 5.681849002838135, + "learning_rate": 4.694585448392555e-05, + "loss": 1.3375, + "step": 1084 + }, + { + "epoch": 0.6119571347997744, + "grad_norm": 7.050771713256836, + "learning_rate": 4.694303440496334e-05, + "loss": 1.7231, + "step": 1085 + }, + { + "epoch": 0.6125211505922166, + "grad_norm": 6.657812595367432, + "learning_rate": 4.694021432600113e-05, + "loss": 1.7029, + "step": 1086 + }, + { + "epoch": 0.6130851663846588, + "grad_norm": 3.8815479278564453, + "learning_rate": 4.693739424703892e-05, + "loss": 1.247, + "step": 1087 + }, + { + "epoch": 0.613649182177101, + "grad_norm": 4.542172431945801, + "learning_rate": 4.6934574168076705e-05, + "loss": 1.1388, + "step": 1088 + }, + { + "epoch": 0.6142131979695431, + "grad_norm": 5.484024524688721, + "learning_rate": 4.69317540891145e-05, + "loss": 1.3212, + "step": 1089 + }, + { + "epoch": 0.6147772137619854, + "grad_norm": 5.070075511932373, + "learning_rate": 4.692893401015229e-05, + "loss": 1.4495, + "step": 1090 + }, + { + "epoch": 0.6153412295544275, + "grad_norm": 6.700381278991699, + "learning_rate": 4.6926113931190075e-05, + "loss": 1.6048, + "step": 1091 + }, + { + "epoch": 0.6159052453468697, + "grad_norm": 7.588993072509766, + "learning_rate": 4.692329385222786e-05, + "loss": 1.6993, + "step": 1092 + }, + { + "epoch": 0.6164692611393119, + "grad_norm": 4.589969158172607, + "learning_rate": 4.692047377326565e-05, + "loss": 1.4273, + "step": 1093 + }, + { + "epoch": 0.6170332769317541, + "grad_norm": 4.674349784851074, + "learning_rate": 4.6917653694303445e-05, + "loss": 1.3435, + "step": 1094 + }, + { + "epoch": 0.6175972927241963, + "grad_norm": 5.322865009307861, + "learning_rate": 4.691483361534123e-05, + "loss": 1.2104, + "step": 1095 + }, + { + "epoch": 0.6181613085166384, + "grad_norm": 5.394670009613037, + "learning_rate": 4.691201353637902e-05, + "loss": 1.3973, + "step": 1096 + }, + { + "epoch": 0.6187253243090807, + "grad_norm": 6.097675800323486, + "learning_rate": 4.690919345741681e-05, + "loss": 1.3834, + "step": 1097 + }, + { + "epoch": 0.6192893401015228, + "grad_norm": 5.909371852874756, + "learning_rate": 4.69063733784546e-05, + "loss": 1.5712, + "step": 1098 + }, + { + "epoch": 0.6198533558939651, + "grad_norm": 5.563446044921875, + "learning_rate": 4.6903553299492386e-05, + "loss": 1.5441, + "step": 1099 + }, + { + "epoch": 0.6204173716864072, + "grad_norm": 4.850680351257324, + "learning_rate": 4.690073322053018e-05, + "loss": 1.6204, + "step": 1100 + }, + { + "epoch": 0.6209813874788495, + "grad_norm": 5.0594940185546875, + "learning_rate": 4.689791314156797e-05, + "loss": 1.4599, + "step": 1101 + }, + { + "epoch": 0.6215454032712916, + "grad_norm": 4.910492897033691, + "learning_rate": 4.6895093062605755e-05, + "loss": 1.172, + "step": 1102 + }, + { + "epoch": 0.6221094190637337, + "grad_norm": 5.68325662612915, + "learning_rate": 4.689227298364354e-05, + "loss": 1.1708, + "step": 1103 + }, + { + "epoch": 0.622673434856176, + "grad_norm": 4.344282150268555, + "learning_rate": 4.688945290468133e-05, + "loss": 1.3463, + "step": 1104 + }, + { + "epoch": 0.6232374506486181, + "grad_norm": 3.718914270401001, + "learning_rate": 4.6886632825719125e-05, + "loss": 0.946, + "step": 1105 + }, + { + "epoch": 0.6238014664410604, + "grad_norm": 3.657344341278076, + "learning_rate": 4.688381274675691e-05, + "loss": 1.1399, + "step": 1106 + }, + { + "epoch": 0.6243654822335025, + "grad_norm": 4.025956153869629, + "learning_rate": 4.6880992667794696e-05, + "loss": 1.0986, + "step": 1107 + }, + { + "epoch": 0.6249294980259448, + "grad_norm": 4.855621814727783, + "learning_rate": 4.6878172588832495e-05, + "loss": 1.4509, + "step": 1108 + }, + { + "epoch": 0.6254935138183869, + "grad_norm": 5.117452144622803, + "learning_rate": 4.687535250987028e-05, + "loss": 1.4817, + "step": 1109 + }, + { + "epoch": 0.626057529610829, + "grad_norm": 5.656859874725342, + "learning_rate": 4.6872532430908066e-05, + "loss": 1.4341, + "step": 1110 + }, + { + "epoch": 0.6266215454032713, + "grad_norm": 4.45808744430542, + "learning_rate": 4.686971235194586e-05, + "loss": 1.1492, + "step": 1111 + }, + { + "epoch": 0.6271855611957134, + "grad_norm": 6.089437007904053, + "learning_rate": 4.686689227298365e-05, + "loss": 1.543, + "step": 1112 + }, + { + "epoch": 0.6277495769881557, + "grad_norm": 4.424623966217041, + "learning_rate": 4.6864072194021436e-05, + "loss": 1.2055, + "step": 1113 + }, + { + "epoch": 0.6283135927805978, + "grad_norm": 6.095499515533447, + "learning_rate": 4.686125211505922e-05, + "loss": 1.7325, + "step": 1114 + }, + { + "epoch": 0.6288776085730401, + "grad_norm": 4.936422348022461, + "learning_rate": 4.685843203609701e-05, + "loss": 1.1532, + "step": 1115 + }, + { + "epoch": 0.6294416243654822, + "grad_norm": 4.363779067993164, + "learning_rate": 4.6855611957134805e-05, + "loss": 1.3617, + "step": 1116 + }, + { + "epoch": 0.6300056401579244, + "grad_norm": 7.264186382293701, + "learning_rate": 4.685279187817259e-05, + "loss": 1.5817, + "step": 1117 + }, + { + "epoch": 0.6305696559503666, + "grad_norm": 6.13068151473999, + "learning_rate": 4.6849971799210376e-05, + "loss": 1.4343, + "step": 1118 + }, + { + "epoch": 0.6311336717428088, + "grad_norm": 4.716847896575928, + "learning_rate": 4.684715172024817e-05, + "loss": 1.2594, + "step": 1119 + }, + { + "epoch": 0.631697687535251, + "grad_norm": 6.611894130706787, + "learning_rate": 4.684433164128596e-05, + "loss": 1.7263, + "step": 1120 + }, + { + "epoch": 0.6322617033276932, + "grad_norm": 5.6364827156066895, + "learning_rate": 4.6841511562323746e-05, + "loss": 1.5404, + "step": 1121 + }, + { + "epoch": 0.6328257191201354, + "grad_norm": 5.011992931365967, + "learning_rate": 4.683869148336154e-05, + "loss": 1.3006, + "step": 1122 + }, + { + "epoch": 0.6333897349125776, + "grad_norm": 4.574671745300293, + "learning_rate": 4.6835871404399323e-05, + "loss": 1.0969, + "step": 1123 + }, + { + "epoch": 0.6339537507050197, + "grad_norm": 5.342245578765869, + "learning_rate": 4.6833051325437116e-05, + "loss": 1.4756, + "step": 1124 + }, + { + "epoch": 0.6345177664974619, + "grad_norm": 4.633478164672852, + "learning_rate": 4.68302312464749e-05, + "loss": 1.499, + "step": 1125 + }, + { + "epoch": 0.6350817822899041, + "grad_norm": 3.8555595874786377, + "learning_rate": 4.682741116751269e-05, + "loss": 1.041, + "step": 1126 + }, + { + "epoch": 0.6356457980823463, + "grad_norm": 7.146008014678955, + "learning_rate": 4.682459108855048e-05, + "loss": 2.141, + "step": 1127 + }, + { + "epoch": 0.6362098138747885, + "grad_norm": 4.9898576736450195, + "learning_rate": 4.682177100958827e-05, + "loss": 1.2881, + "step": 1128 + }, + { + "epoch": 0.6367738296672307, + "grad_norm": 5.422835350036621, + "learning_rate": 4.681895093062606e-05, + "loss": 1.5523, + "step": 1129 + }, + { + "epoch": 0.6373378454596729, + "grad_norm": 3.6664302349090576, + "learning_rate": 4.681613085166385e-05, + "loss": 1.0823, + "step": 1130 + }, + { + "epoch": 0.637901861252115, + "grad_norm": 4.2581706047058105, + "learning_rate": 4.6813310772701634e-05, + "loss": 1.2739, + "step": 1131 + }, + { + "epoch": 0.6384658770445573, + "grad_norm": 4.135062217712402, + "learning_rate": 4.6810490693739426e-05, + "loss": 1.3855, + "step": 1132 + }, + { + "epoch": 0.6390298928369994, + "grad_norm": 3.6931657791137695, + "learning_rate": 4.680767061477722e-05, + "loss": 1.0726, + "step": 1133 + }, + { + "epoch": 0.6395939086294417, + "grad_norm": 7.2882513999938965, + "learning_rate": 4.6804850535815004e-05, + "loss": 1.536, + "step": 1134 + }, + { + "epoch": 0.6401579244218838, + "grad_norm": 6.164764404296875, + "learning_rate": 4.6802030456852796e-05, + "loss": 1.4164, + "step": 1135 + }, + { + "epoch": 0.640721940214326, + "grad_norm": 4.606715679168701, + "learning_rate": 4.679921037789058e-05, + "loss": 1.4148, + "step": 1136 + }, + { + "epoch": 0.6412859560067682, + "grad_norm": 3.877195119857788, + "learning_rate": 4.6796390298928373e-05, + "loss": 1.0662, + "step": 1137 + }, + { + "epoch": 0.6418499717992103, + "grad_norm": 5.336626052856445, + "learning_rate": 4.679357021996616e-05, + "loss": 1.4226, + "step": 1138 + }, + { + "epoch": 0.6424139875916526, + "grad_norm": 5.173532009124756, + "learning_rate": 4.679075014100395e-05, + "loss": 1.2361, + "step": 1139 + }, + { + "epoch": 0.6429780033840947, + "grad_norm": 4.383628845214844, + "learning_rate": 4.678793006204174e-05, + "loss": 1.1616, + "step": 1140 + }, + { + "epoch": 0.643542019176537, + "grad_norm": 7.018625736236572, + "learning_rate": 4.678510998307953e-05, + "loss": 1.3707, + "step": 1141 + }, + { + "epoch": 0.6441060349689791, + "grad_norm": 4.940186977386475, + "learning_rate": 4.6782289904117314e-05, + "loss": 1.1478, + "step": 1142 + }, + { + "epoch": 0.6446700507614214, + "grad_norm": 5.378572940826416, + "learning_rate": 4.6779469825155106e-05, + "loss": 1.4974, + "step": 1143 + }, + { + "epoch": 0.6452340665538635, + "grad_norm": 7.391772270202637, + "learning_rate": 4.67766497461929e-05, + "loss": 1.6239, + "step": 1144 + }, + { + "epoch": 0.6457980823463056, + "grad_norm": 4.445626258850098, + "learning_rate": 4.6773829667230684e-05, + "loss": 1.213, + "step": 1145 + }, + { + "epoch": 0.6463620981387479, + "grad_norm": 3.1323697566986084, + "learning_rate": 4.677100958826847e-05, + "loss": 1.0304, + "step": 1146 + }, + { + "epoch": 0.64692611393119, + "grad_norm": 5.94108247756958, + "learning_rate": 4.676818950930627e-05, + "loss": 1.6976, + "step": 1147 + }, + { + "epoch": 0.6474901297236323, + "grad_norm": 5.036246299743652, + "learning_rate": 4.6765369430344054e-05, + "loss": 1.4689, + "step": 1148 + }, + { + "epoch": 0.6480541455160744, + "grad_norm": 4.874112129211426, + "learning_rate": 4.676254935138184e-05, + "loss": 1.185, + "step": 1149 + }, + { + "epoch": 0.6486181613085167, + "grad_norm": 4.834783554077148, + "learning_rate": 4.675972927241963e-05, + "loss": 1.3062, + "step": 1150 + }, + { + "epoch": 0.6491821771009588, + "grad_norm": 5.3878960609436035, + "learning_rate": 4.675690919345742e-05, + "loss": 1.2708, + "step": 1151 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 5.1879730224609375, + "learning_rate": 4.675408911449521e-05, + "loss": 0.9897, + "step": 1152 + }, + { + "epoch": 0.6503102086858432, + "grad_norm": 5.8195576667785645, + "learning_rate": 4.6751269035532994e-05, + "loss": 1.5418, + "step": 1153 + }, + { + "epoch": 0.6508742244782854, + "grad_norm": 5.663949489593506, + "learning_rate": 4.6748448956570786e-05, + "loss": 1.4568, + "step": 1154 + }, + { + "epoch": 0.6514382402707276, + "grad_norm": 4.6978535652160645, + "learning_rate": 4.674562887760858e-05, + "loss": 1.2448, + "step": 1155 + }, + { + "epoch": 0.6520022560631697, + "grad_norm": 4.2803215980529785, + "learning_rate": 4.6742808798646364e-05, + "loss": 1.5815, + "step": 1156 + }, + { + "epoch": 0.652566271855612, + "grad_norm": 4.641829967498779, + "learning_rate": 4.673998871968415e-05, + "loss": 1.3879, + "step": 1157 + }, + { + "epoch": 0.6531302876480541, + "grad_norm": 4.772002220153809, + "learning_rate": 4.673716864072194e-05, + "loss": 1.3339, + "step": 1158 + }, + { + "epoch": 0.6536943034404963, + "grad_norm": 5.195184707641602, + "learning_rate": 4.6734348561759734e-05, + "loss": 1.2728, + "step": 1159 + }, + { + "epoch": 0.6542583192329385, + "grad_norm": 4.007447242736816, + "learning_rate": 4.673152848279752e-05, + "loss": 1.2285, + "step": 1160 + }, + { + "epoch": 0.6548223350253807, + "grad_norm": 4.485952377319336, + "learning_rate": 4.672870840383531e-05, + "loss": 1.2207, + "step": 1161 + }, + { + "epoch": 0.6553863508178229, + "grad_norm": 3.149611234664917, + "learning_rate": 4.67258883248731e-05, + "loss": 1.2359, + "step": 1162 + }, + { + "epoch": 0.6559503666102651, + "grad_norm": 4.983494758605957, + "learning_rate": 4.672306824591089e-05, + "loss": 1.5795, + "step": 1163 + }, + { + "epoch": 0.6565143824027073, + "grad_norm": 5.7057061195373535, + "learning_rate": 4.6720248166948674e-05, + "loss": 1.3806, + "step": 1164 + }, + { + "epoch": 0.6570783981951495, + "grad_norm": 5.864782810211182, + "learning_rate": 4.6717428087986466e-05, + "loss": 1.0625, + "step": 1165 + }, + { + "epoch": 0.6576424139875916, + "grad_norm": 5.941567897796631, + "learning_rate": 4.671460800902425e-05, + "loss": 1.4563, + "step": 1166 + }, + { + "epoch": 0.6582064297800339, + "grad_norm": 4.069194793701172, + "learning_rate": 4.6711787930062044e-05, + "loss": 1.1089, + "step": 1167 + }, + { + "epoch": 0.658770445572476, + "grad_norm": 3.901177406311035, + "learning_rate": 4.6708967851099836e-05, + "loss": 1.2213, + "step": 1168 + }, + { + "epoch": 0.6593344613649182, + "grad_norm": 4.068305015563965, + "learning_rate": 4.670614777213762e-05, + "loss": 1.0855, + "step": 1169 + }, + { + "epoch": 0.6598984771573604, + "grad_norm": 3.9526312351226807, + "learning_rate": 4.6703327693175414e-05, + "loss": 1.2104, + "step": 1170 + }, + { + "epoch": 0.6604624929498026, + "grad_norm": 4.526355266571045, + "learning_rate": 4.67005076142132e-05, + "loss": 1.1048, + "step": 1171 + }, + { + "epoch": 0.6610265087422448, + "grad_norm": 7.304831504821777, + "learning_rate": 4.669768753525099e-05, + "loss": 1.1853, + "step": 1172 + }, + { + "epoch": 0.6615905245346869, + "grad_norm": 6.707693099975586, + "learning_rate": 4.669486745628878e-05, + "loss": 1.4564, + "step": 1173 + }, + { + "epoch": 0.6621545403271292, + "grad_norm": 6.813276767730713, + "learning_rate": 4.669204737732657e-05, + "loss": 1.4371, + "step": 1174 + }, + { + "epoch": 0.6627185561195713, + "grad_norm": 6.285336971282959, + "learning_rate": 4.6689227298364354e-05, + "loss": 1.5005, + "step": 1175 + }, + { + "epoch": 0.6632825719120136, + "grad_norm": 4.505369186401367, + "learning_rate": 4.668640721940215e-05, + "loss": 1.3795, + "step": 1176 + }, + { + "epoch": 0.6638465877044557, + "grad_norm": 5.161136150360107, + "learning_rate": 4.668358714043993e-05, + "loss": 1.2443, + "step": 1177 + }, + { + "epoch": 0.664410603496898, + "grad_norm": 5.303092002868652, + "learning_rate": 4.6680767061477724e-05, + "loss": 1.4264, + "step": 1178 + }, + { + "epoch": 0.6649746192893401, + "grad_norm": 5.399478912353516, + "learning_rate": 4.6677946982515516e-05, + "loss": 1.8334, + "step": 1179 + }, + { + "epoch": 0.6655386350817822, + "grad_norm": 5.930105209350586, + "learning_rate": 4.66751269035533e-05, + "loss": 1.7986, + "step": 1180 + }, + { + "epoch": 0.6661026508742245, + "grad_norm": 5.740060806274414, + "learning_rate": 4.667230682459109e-05, + "loss": 1.5075, + "step": 1181 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 5.291078567504883, + "learning_rate": 4.666948674562888e-05, + "loss": 1.3768, + "step": 1182 + }, + { + "epoch": 0.6672306824591089, + "grad_norm": 4.050992488861084, + "learning_rate": 4.666666666666667e-05, + "loss": 1.2998, + "step": 1183 + }, + { + "epoch": 0.667794698251551, + "grad_norm": 5.020971298217773, + "learning_rate": 4.666384658770446e-05, + "loss": 1.3137, + "step": 1184 + }, + { + "epoch": 0.6683587140439933, + "grad_norm": 3.180283546447754, + "learning_rate": 4.666102650874224e-05, + "loss": 1.2665, + "step": 1185 + }, + { + "epoch": 0.6689227298364354, + "grad_norm": 5.309293270111084, + "learning_rate": 4.665820642978004e-05, + "loss": 1.2502, + "step": 1186 + }, + { + "epoch": 0.6694867456288776, + "grad_norm": 6.301662445068359, + "learning_rate": 4.665538635081783e-05, + "loss": 1.6905, + "step": 1187 + }, + { + "epoch": 0.6700507614213198, + "grad_norm": 4.0718488693237305, + "learning_rate": 4.665256627185561e-05, + "loss": 1.2581, + "step": 1188 + }, + { + "epoch": 0.670614777213762, + "grad_norm": 4.891712665557861, + "learning_rate": 4.66497461928934e-05, + "loss": 1.1209, + "step": 1189 + }, + { + "epoch": 0.6711787930062042, + "grad_norm": 5.02025032043457, + "learning_rate": 4.6646926113931197e-05, + "loss": 1.2784, + "step": 1190 + }, + { + "epoch": 0.6717428087986463, + "grad_norm": 4.729605674743652, + "learning_rate": 4.664410603496898e-05, + "loss": 1.305, + "step": 1191 + }, + { + "epoch": 0.6723068245910886, + "grad_norm": 3.899299383163452, + "learning_rate": 4.664128595600677e-05, + "loss": 1.0526, + "step": 1192 + }, + { + "epoch": 0.6728708403835307, + "grad_norm": 5.642962455749512, + "learning_rate": 4.663846587704456e-05, + "loss": 1.4296, + "step": 1193 + }, + { + "epoch": 0.6734348561759729, + "grad_norm": 3.665529727935791, + "learning_rate": 4.663564579808235e-05, + "loss": 1.0351, + "step": 1194 + }, + { + "epoch": 0.6739988719684151, + "grad_norm": 3.3918087482452393, + "learning_rate": 4.663282571912014e-05, + "loss": 0.9686, + "step": 1195 + }, + { + "epoch": 0.6745628877608573, + "grad_norm": 5.674928188323975, + "learning_rate": 4.663000564015792e-05, + "loss": 1.226, + "step": 1196 + }, + { + "epoch": 0.6751269035532995, + "grad_norm": 5.587332248687744, + "learning_rate": 4.6627185561195715e-05, + "loss": 1.5741, + "step": 1197 + }, + { + "epoch": 0.6756909193457417, + "grad_norm": 4.4313178062438965, + "learning_rate": 4.662436548223351e-05, + "loss": 1.2676, + "step": 1198 + }, + { + "epoch": 0.6762549351381839, + "grad_norm": 3.3719983100891113, + "learning_rate": 4.662154540327129e-05, + "loss": 1.1753, + "step": 1199 + }, + { + "epoch": 0.676818950930626, + "grad_norm": 3.42828106880188, + "learning_rate": 4.6618725324309085e-05, + "loss": 1.2201, + "step": 1200 + }, + { + "epoch": 0.6773829667230682, + "grad_norm": 3.8969738483428955, + "learning_rate": 4.661590524534687e-05, + "loss": 1.1726, + "step": 1201 + }, + { + "epoch": 0.6779469825155104, + "grad_norm": 6.05566930770874, + "learning_rate": 4.661308516638466e-05, + "loss": 1.7026, + "step": 1202 + }, + { + "epoch": 0.6785109983079526, + "grad_norm": 5.004970550537109, + "learning_rate": 4.661026508742245e-05, + "loss": 1.208, + "step": 1203 + }, + { + "epoch": 0.6790750141003948, + "grad_norm": 3.6862499713897705, + "learning_rate": 4.660744500846024e-05, + "loss": 1.1622, + "step": 1204 + }, + { + "epoch": 0.679639029892837, + "grad_norm": 4.404603958129883, + "learning_rate": 4.660462492949803e-05, + "loss": 1.3341, + "step": 1205 + }, + { + "epoch": 0.6802030456852792, + "grad_norm": 3.480797529220581, + "learning_rate": 4.660180485053582e-05, + "loss": 1.1682, + "step": 1206 + }, + { + "epoch": 0.6807670614777214, + "grad_norm": 4.883697509765625, + "learning_rate": 4.65989847715736e-05, + "loss": 1.1422, + "step": 1207 + }, + { + "epoch": 0.6813310772701635, + "grad_norm": 4.343221187591553, + "learning_rate": 4.6596164692611395e-05, + "loss": 0.8021, + "step": 1208 + }, + { + "epoch": 0.6818950930626058, + "grad_norm": 5.646923542022705, + "learning_rate": 4.659334461364919e-05, + "loss": 1.4106, + "step": 1209 + }, + { + "epoch": 0.6824591088550479, + "grad_norm": 3.271510362625122, + "learning_rate": 4.659052453468697e-05, + "loss": 0.9382, + "step": 1210 + }, + { + "epoch": 0.6830231246474902, + "grad_norm": 3.8560843467712402, + "learning_rate": 4.6587704455724765e-05, + "loss": 1.2182, + "step": 1211 + }, + { + "epoch": 0.6835871404399323, + "grad_norm": 5.907066345214844, + "learning_rate": 4.658488437676255e-05, + "loss": 0.9887, + "step": 1212 + }, + { + "epoch": 0.6841511562323745, + "grad_norm": 5.422514915466309, + "learning_rate": 4.658206429780034e-05, + "loss": 1.6876, + "step": 1213 + }, + { + "epoch": 0.6847151720248167, + "grad_norm": 3.49554181098938, + "learning_rate": 4.657924421883813e-05, + "loss": 1.1126, + "step": 1214 + }, + { + "epoch": 0.6852791878172588, + "grad_norm": 5.019167423248291, + "learning_rate": 4.657642413987592e-05, + "loss": 1.1241, + "step": 1215 + }, + { + "epoch": 0.6858432036097011, + "grad_norm": 4.3689398765563965, + "learning_rate": 4.6573604060913705e-05, + "loss": 1.2203, + "step": 1216 + }, + { + "epoch": 0.6864072194021432, + "grad_norm": 4.046571731567383, + "learning_rate": 4.65707839819515e-05, + "loss": 1.0885, + "step": 1217 + }, + { + "epoch": 0.6869712351945855, + "grad_norm": 4.889891147613525, + "learning_rate": 4.656796390298929e-05, + "loss": 1.2092, + "step": 1218 + }, + { + "epoch": 0.6875352509870276, + "grad_norm": 6.692928314208984, + "learning_rate": 4.6565143824027075e-05, + "loss": 1.5903, + "step": 1219 + }, + { + "epoch": 0.6880992667794699, + "grad_norm": 7.251490116119385, + "learning_rate": 4.656232374506486e-05, + "loss": 1.4017, + "step": 1220 + }, + { + "epoch": 0.688663282571912, + "grad_norm": 3.564626932144165, + "learning_rate": 4.655950366610265e-05, + "loss": 1.0102, + "step": 1221 + }, + { + "epoch": 0.6892272983643543, + "grad_norm": 3.308093309402466, + "learning_rate": 4.6556683587140445e-05, + "loss": 0.9596, + "step": 1222 + }, + { + "epoch": 0.6897913141567964, + "grad_norm": 5.200413227081299, + "learning_rate": 4.655386350817823e-05, + "loss": 1.3602, + "step": 1223 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 5.622106552124023, + "learning_rate": 4.6551043429216016e-05, + "loss": 1.2903, + "step": 1224 + }, + { + "epoch": 0.6909193457416808, + "grad_norm": 5.3304009437561035, + "learning_rate": 4.654822335025381e-05, + "loss": 0.998, + "step": 1225 + }, + { + "epoch": 0.6914833615341229, + "grad_norm": 5.268664836883545, + "learning_rate": 4.65454032712916e-05, + "loss": 1.2702, + "step": 1226 + }, + { + "epoch": 0.6920473773265652, + "grad_norm": 51.98427963256836, + "learning_rate": 4.6542583192329385e-05, + "loss": 2.0539, + "step": 1227 + }, + { + "epoch": 0.6926113931190073, + "grad_norm": 4.7120795249938965, + "learning_rate": 4.653976311336717e-05, + "loss": 1.325, + "step": 1228 + }, + { + "epoch": 0.6931754089114496, + "grad_norm": 6.283467769622803, + "learning_rate": 4.653694303440497e-05, + "loss": 1.3503, + "step": 1229 + }, + { + "epoch": 0.6937394247038917, + "grad_norm": 5.246824264526367, + "learning_rate": 4.6534122955442755e-05, + "loss": 1.3786, + "step": 1230 + }, + { + "epoch": 0.6943034404963339, + "grad_norm": 5.047653675079346, + "learning_rate": 4.653130287648054e-05, + "loss": 1.3133, + "step": 1231 + }, + { + "epoch": 0.6948674562887761, + "grad_norm": 4.58451509475708, + "learning_rate": 4.652848279751833e-05, + "loss": 1.131, + "step": 1232 + }, + { + "epoch": 0.6954314720812182, + "grad_norm": 7.418198585510254, + "learning_rate": 4.6525662718556125e-05, + "loss": 1.3903, + "step": 1233 + }, + { + "epoch": 0.6959954878736605, + "grad_norm": 5.698420524597168, + "learning_rate": 4.652284263959391e-05, + "loss": 1.2917, + "step": 1234 + }, + { + "epoch": 0.6965595036661026, + "grad_norm": 4.483616828918457, + "learning_rate": 4.6520022560631696e-05, + "loss": 1.3039, + "step": 1235 + }, + { + "epoch": 0.6971235194585449, + "grad_norm": 4.862963676452637, + "learning_rate": 4.651720248166949e-05, + "loss": 1.4317, + "step": 1236 + }, + { + "epoch": 0.697687535250987, + "grad_norm": 5.387257099151611, + "learning_rate": 4.651438240270728e-05, + "loss": 1.4847, + "step": 1237 + }, + { + "epoch": 0.6982515510434292, + "grad_norm": 6.15781307220459, + "learning_rate": 4.6511562323745066e-05, + "loss": 1.3542, + "step": 1238 + }, + { + "epoch": 0.6988155668358714, + "grad_norm": 5.806750297546387, + "learning_rate": 4.650874224478286e-05, + "loss": 1.4448, + "step": 1239 + }, + { + "epoch": 0.6993795826283136, + "grad_norm": 3.6676154136657715, + "learning_rate": 4.650592216582065e-05, + "loss": 1.0858, + "step": 1240 + }, + { + "epoch": 0.6999435984207558, + "grad_norm": 3.741786003112793, + "learning_rate": 4.6503102086858435e-05, + "loss": 1.1165, + "step": 1241 + }, + { + "epoch": 0.700507614213198, + "grad_norm": 5.2298173904418945, + "learning_rate": 4.650028200789622e-05, + "loss": 1.5969, + "step": 1242 + }, + { + "epoch": 0.7010716300056402, + "grad_norm": 5.308511734008789, + "learning_rate": 4.649746192893401e-05, + "loss": 1.2655, + "step": 1243 + }, + { + "epoch": 0.7016356457980824, + "grad_norm": 5.477936267852783, + "learning_rate": 4.6494641849971805e-05, + "loss": 1.267, + "step": 1244 + }, + { + "epoch": 0.7021996615905245, + "grad_norm": 5.057722091674805, + "learning_rate": 4.649182177100959e-05, + "loss": 1.3384, + "step": 1245 + }, + { + "epoch": 0.7027636773829667, + "grad_norm": 6.680202960968018, + "learning_rate": 4.6489001692047376e-05, + "loss": 1.6019, + "step": 1246 + }, + { + "epoch": 0.7033276931754089, + "grad_norm": 4.862525463104248, + "learning_rate": 4.648618161308517e-05, + "loss": 1.2476, + "step": 1247 + }, + { + "epoch": 0.7038917089678511, + "grad_norm": 5.250217914581299, + "learning_rate": 4.648336153412296e-05, + "loss": 1.0971, + "step": 1248 + }, + { + "epoch": 0.7044557247602933, + "grad_norm": 4.439472198486328, + "learning_rate": 4.6480541455160746e-05, + "loss": 1.0394, + "step": 1249 + }, + { + "epoch": 0.7050197405527355, + "grad_norm": 6.069138050079346, + "learning_rate": 4.647772137619854e-05, + "loss": 1.4241, + "step": 1250 + }, + { + "epoch": 0.7055837563451777, + "grad_norm": 5.004185676574707, + "learning_rate": 4.647490129723632e-05, + "loss": 1.1926, + "step": 1251 + }, + { + "epoch": 0.7061477721376198, + "grad_norm": 5.002621650695801, + "learning_rate": 4.6472081218274116e-05, + "loss": 1.4616, + "step": 1252 + }, + { + "epoch": 0.7067117879300621, + "grad_norm": 3.482895851135254, + "learning_rate": 4.64692611393119e-05, + "loss": 0.9479, + "step": 1253 + }, + { + "epoch": 0.7072758037225042, + "grad_norm": 3.4346818923950195, + "learning_rate": 4.646644106034969e-05, + "loss": 1.0531, + "step": 1254 + }, + { + "epoch": 0.7078398195149465, + "grad_norm": 5.240386486053467, + "learning_rate": 4.646362098138748e-05, + "loss": 1.2969, + "step": 1255 + }, + { + "epoch": 0.7084038353073886, + "grad_norm": 4.6741132736206055, + "learning_rate": 4.646080090242527e-05, + "loss": 1.2089, + "step": 1256 + }, + { + "epoch": 0.7089678510998308, + "grad_norm": 4.125059127807617, + "learning_rate": 4.645798082346306e-05, + "loss": 1.2511, + "step": 1257 + }, + { + "epoch": 0.709531866892273, + "grad_norm": 4.796258926391602, + "learning_rate": 4.645516074450085e-05, + "loss": 1.3117, + "step": 1258 + }, + { + "epoch": 0.7100958826847151, + "grad_norm": 4.432982921600342, + "learning_rate": 4.6452340665538634e-05, + "loss": 0.9243, + "step": 1259 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 6.361678123474121, + "learning_rate": 4.6449520586576426e-05, + "loss": 1.4256, + "step": 1260 + }, + { + "epoch": 0.7112239142695995, + "grad_norm": 3.642392873764038, + "learning_rate": 4.644670050761422e-05, + "loss": 1.2871, + "step": 1261 + }, + { + "epoch": 0.7117879300620418, + "grad_norm": 5.613339900970459, + "learning_rate": 4.6443880428652003e-05, + "loss": 1.3321, + "step": 1262 + }, + { + "epoch": 0.7123519458544839, + "grad_norm": 5.108872413635254, + "learning_rate": 4.644106034968979e-05, + "loss": 1.2924, + "step": 1263 + }, + { + "epoch": 0.7129159616469262, + "grad_norm": 4.3528361320495605, + "learning_rate": 4.643824027072758e-05, + "loss": 1.2087, + "step": 1264 + }, + { + "epoch": 0.7134799774393683, + "grad_norm": 6.139060020446777, + "learning_rate": 4.643542019176537e-05, + "loss": 1.497, + "step": 1265 + }, + { + "epoch": 0.7140439932318104, + "grad_norm": 5.353428840637207, + "learning_rate": 4.643260011280316e-05, + "loss": 1.2808, + "step": 1266 + }, + { + "epoch": 0.7146080090242527, + "grad_norm": 6.466920852661133, + "learning_rate": 4.6429780033840944e-05, + "loss": 1.1128, + "step": 1267 + }, + { + "epoch": 0.7151720248166948, + "grad_norm": 5.043046474456787, + "learning_rate": 4.642695995487874e-05, + "loss": 1.4033, + "step": 1268 + }, + { + "epoch": 0.7157360406091371, + "grad_norm": 3.8946022987365723, + "learning_rate": 4.642413987591653e-05, + "loss": 1.1729, + "step": 1269 + }, + { + "epoch": 0.7163000564015792, + "grad_norm": 6.392773628234863, + "learning_rate": 4.6421319796954314e-05, + "loss": 1.3958, + "step": 1270 + }, + { + "epoch": 0.7168640721940215, + "grad_norm": 4.768704414367676, + "learning_rate": 4.6418499717992106e-05, + "loss": 1.0703, + "step": 1271 + }, + { + "epoch": 0.7174280879864636, + "grad_norm": 6.11190128326416, + "learning_rate": 4.64156796390299e-05, + "loss": 1.089, + "step": 1272 + }, + { + "epoch": 0.7179921037789058, + "grad_norm": 3.6770598888397217, + "learning_rate": 4.6412859560067684e-05, + "loss": 1.1206, + "step": 1273 + }, + { + "epoch": 0.718556119571348, + "grad_norm": 4.998250484466553, + "learning_rate": 4.641003948110547e-05, + "loss": 1.0838, + "step": 1274 + }, + { + "epoch": 0.7191201353637902, + "grad_norm": 11.97719669342041, + "learning_rate": 4.640721940214327e-05, + "loss": 1.6823, + "step": 1275 + }, + { + "epoch": 0.7196841511562324, + "grad_norm": 2.8924663066864014, + "learning_rate": 4.6404399323181053e-05, + "loss": 0.9828, + "step": 1276 + }, + { + "epoch": 0.7202481669486746, + "grad_norm": 5.3080010414123535, + "learning_rate": 4.640157924421884e-05, + "loss": 1.427, + "step": 1277 + }, + { + "epoch": 0.7208121827411168, + "grad_norm": 4.348835468292236, + "learning_rate": 4.639875916525663e-05, + "loss": 1.035, + "step": 1278 + }, + { + "epoch": 0.7213761985335589, + "grad_norm": 4.160242080688477, + "learning_rate": 4.639593908629442e-05, + "loss": 1.0971, + "step": 1279 + }, + { + "epoch": 0.7219402143260011, + "grad_norm": 4.515955448150635, + "learning_rate": 4.639311900733221e-05, + "loss": 1.1427, + "step": 1280 + }, + { + "epoch": 0.7225042301184433, + "grad_norm": 7.763788223266602, + "learning_rate": 4.6390298928369994e-05, + "loss": 1.5976, + "step": 1281 + }, + { + "epoch": 0.7230682459108855, + "grad_norm": 6.243752956390381, + "learning_rate": 4.6387478849407786e-05, + "loss": 1.2761, + "step": 1282 + }, + { + "epoch": 0.7236322617033277, + "grad_norm": 4.031036376953125, + "learning_rate": 4.638465877044558e-05, + "loss": 1.0863, + "step": 1283 + }, + { + "epoch": 0.7241962774957699, + "grad_norm": 4.293300628662109, + "learning_rate": 4.6381838691483364e-05, + "loss": 1.0488, + "step": 1284 + }, + { + "epoch": 0.7247602932882121, + "grad_norm": 5.533941745758057, + "learning_rate": 4.637901861252115e-05, + "loss": 1.1624, + "step": 1285 + }, + { + "epoch": 0.7253243090806543, + "grad_norm": 15.349486351013184, + "learning_rate": 4.637619853355894e-05, + "loss": 1.5161, + "step": 1286 + }, + { + "epoch": 0.7258883248730964, + "grad_norm": 4.765090465545654, + "learning_rate": 4.6373378454596734e-05, + "loss": 1.2742, + "step": 1287 + }, + { + "epoch": 0.7264523406655387, + "grad_norm": 7.067110061645508, + "learning_rate": 4.637055837563452e-05, + "loss": 1.6332, + "step": 1288 + }, + { + "epoch": 0.7270163564579808, + "grad_norm": 4.35655403137207, + "learning_rate": 4.636773829667231e-05, + "loss": 1.0923, + "step": 1289 + }, + { + "epoch": 0.727580372250423, + "grad_norm": 5.346774578094482, + "learning_rate": 4.6364918217710097e-05, + "loss": 1.375, + "step": 1290 + }, + { + "epoch": 0.7281443880428652, + "grad_norm": 5.141659259796143, + "learning_rate": 4.636209813874789e-05, + "loss": 1.1256, + "step": 1291 + }, + { + "epoch": 0.7287084038353074, + "grad_norm": 26.45516014099121, + "learning_rate": 4.6359278059785674e-05, + "loss": 0.9784, + "step": 1292 + }, + { + "epoch": 0.7292724196277496, + "grad_norm": 5.761019706726074, + "learning_rate": 4.6356457980823466e-05, + "loss": 1.3635, + "step": 1293 + }, + { + "epoch": 0.7298364354201917, + "grad_norm": 5.343832969665527, + "learning_rate": 4.635363790186125e-05, + "loss": 1.4283, + "step": 1294 + }, + { + "epoch": 0.730400451212634, + "grad_norm": 3.4639737606048584, + "learning_rate": 4.6350817822899044e-05, + "loss": 1.0921, + "step": 1295 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 3.6261470317840576, + "learning_rate": 4.6347997743936836e-05, + "loss": 0.9391, + "step": 1296 + }, + { + "epoch": 0.7315284827975184, + "grad_norm": 4.780145645141602, + "learning_rate": 4.634517766497462e-05, + "loss": 1.259, + "step": 1297 + }, + { + "epoch": 0.7320924985899605, + "grad_norm": 3.992382526397705, + "learning_rate": 4.634235758601241e-05, + "loss": 1.0094, + "step": 1298 + }, + { + "epoch": 0.7326565143824028, + "grad_norm": 4.688031196594238, + "learning_rate": 4.63395375070502e-05, + "loss": 1.194, + "step": 1299 + }, + { + "epoch": 0.7332205301748449, + "grad_norm": 4.504694938659668, + "learning_rate": 4.633671742808799e-05, + "loss": 1.2261, + "step": 1300 + }, + { + "epoch": 0.733784545967287, + "grad_norm": 4.7181572914123535, + "learning_rate": 4.633389734912578e-05, + "loss": 1.3821, + "step": 1301 + }, + { + "epoch": 0.7343485617597293, + "grad_norm": 4.652824401855469, + "learning_rate": 4.633107727016356e-05, + "loss": 1.1812, + "step": 1302 + }, + { + "epoch": 0.7349125775521714, + "grad_norm": 6.717075824737549, + "learning_rate": 4.6328257191201354e-05, + "loss": 1.3386, + "step": 1303 + }, + { + "epoch": 0.7354765933446137, + "grad_norm": 6.727533340454102, + "learning_rate": 4.6325437112239146e-05, + "loss": 1.2235, + "step": 1304 + }, + { + "epoch": 0.7360406091370558, + "grad_norm": 4.707339286804199, + "learning_rate": 4.632261703327693e-05, + "loss": 1.1236, + "step": 1305 + }, + { + "epoch": 0.7366046249294981, + "grad_norm": 5.065624713897705, + "learning_rate": 4.631979695431472e-05, + "loss": 1.2248, + "step": 1306 + }, + { + "epoch": 0.7371686407219402, + "grad_norm": 6.593886852264404, + "learning_rate": 4.6316976875352516e-05, + "loss": 1.2639, + "step": 1307 + }, + { + "epoch": 0.7377326565143824, + "grad_norm": 3.5967447757720947, + "learning_rate": 4.63141567963903e-05, + "loss": 1.1969, + "step": 1308 + }, + { + "epoch": 0.7382966723068246, + "grad_norm": 6.6553730964660645, + "learning_rate": 4.631133671742809e-05, + "loss": 1.5867, + "step": 1309 + }, + { + "epoch": 0.7388606880992667, + "grad_norm": 5.4524078369140625, + "learning_rate": 4.630851663846588e-05, + "loss": 1.56, + "step": 1310 + }, + { + "epoch": 0.739424703891709, + "grad_norm": 3.7582552433013916, + "learning_rate": 4.630569655950367e-05, + "loss": 0.9417, + "step": 1311 + }, + { + "epoch": 0.7399887196841511, + "grad_norm": 4.382091999053955, + "learning_rate": 4.630287648054146e-05, + "loss": 1.056, + "step": 1312 + }, + { + "epoch": 0.7405527354765934, + "grad_norm": 5.5746660232543945, + "learning_rate": 4.630005640157924e-05, + "loss": 1.2678, + "step": 1313 + }, + { + "epoch": 0.7411167512690355, + "grad_norm": 6.602484226226807, + "learning_rate": 4.629723632261704e-05, + "loss": 1.1895, + "step": 1314 + }, + { + "epoch": 0.7416807670614777, + "grad_norm": 3.4922544956207275, + "learning_rate": 4.629441624365483e-05, + "loss": 1.0784, + "step": 1315 + }, + { + "epoch": 0.7422447828539199, + "grad_norm": 3.9028944969177246, + "learning_rate": 4.629159616469261e-05, + "loss": 1.0081, + "step": 1316 + }, + { + "epoch": 0.7428087986463621, + "grad_norm": 3.881178855895996, + "learning_rate": 4.6288776085730404e-05, + "loss": 1.2869, + "step": 1317 + }, + { + "epoch": 0.7433728144388043, + "grad_norm": 4.8327436447143555, + "learning_rate": 4.6285956006768196e-05, + "loss": 1.3268, + "step": 1318 + }, + { + "epoch": 0.7439368302312465, + "grad_norm": 4.730016231536865, + "learning_rate": 4.628313592780598e-05, + "loss": 1.1771, + "step": 1319 + }, + { + "epoch": 0.7445008460236887, + "grad_norm": 4.469271659851074, + "learning_rate": 4.628031584884377e-05, + "loss": 1.2476, + "step": 1320 + }, + { + "epoch": 0.7450648618161309, + "grad_norm": 5.744274616241455, + "learning_rate": 4.627749576988156e-05, + "loss": 1.1561, + "step": 1321 + }, + { + "epoch": 0.745628877608573, + "grad_norm": 4.762886047363281, + "learning_rate": 4.627467569091935e-05, + "loss": 1.3029, + "step": 1322 + }, + { + "epoch": 0.7461928934010152, + "grad_norm": 4.259934425354004, + "learning_rate": 4.627185561195714e-05, + "loss": 1.1402, + "step": 1323 + }, + { + "epoch": 0.7467569091934574, + "grad_norm": 4.6229729652404785, + "learning_rate": 4.626903553299492e-05, + "loss": 1.2217, + "step": 1324 + }, + { + "epoch": 0.7473209249858996, + "grad_norm": 4.667693614959717, + "learning_rate": 4.6266215454032715e-05, + "loss": 1.2772, + "step": 1325 + }, + { + "epoch": 0.7478849407783418, + "grad_norm": 5.601677417755127, + "learning_rate": 4.626339537507051e-05, + "loss": 1.0548, + "step": 1326 + }, + { + "epoch": 0.748448956570784, + "grad_norm": 3.7440171241760254, + "learning_rate": 4.626057529610829e-05, + "loss": 1.152, + "step": 1327 + }, + { + "epoch": 0.7490129723632262, + "grad_norm": 4.846717357635498, + "learning_rate": 4.6257755217146084e-05, + "loss": 1.2383, + "step": 1328 + }, + { + "epoch": 0.7495769881556683, + "grad_norm": 7.935200214385986, + "learning_rate": 4.625493513818387e-05, + "loss": 2.0655, + "step": 1329 + }, + { + "epoch": 0.7501410039481106, + "grad_norm": 4.414703369140625, + "learning_rate": 4.625211505922166e-05, + "loss": 1.0868, + "step": 1330 + }, + { + "epoch": 0.7507050197405527, + "grad_norm": 4.3390092849731445, + "learning_rate": 4.624929498025945e-05, + "loss": 1.2667, + "step": 1331 + }, + { + "epoch": 0.751269035532995, + "grad_norm": 5.294330596923828, + "learning_rate": 4.624647490129724e-05, + "loss": 1.5106, + "step": 1332 + }, + { + "epoch": 0.7518330513254371, + "grad_norm": 6.00322151184082, + "learning_rate": 4.6243654822335025e-05, + "loss": 1.5528, + "step": 1333 + }, + { + "epoch": 0.7523970671178793, + "grad_norm": 5.83137321472168, + "learning_rate": 4.624083474337282e-05, + "loss": 1.4499, + "step": 1334 + }, + { + "epoch": 0.7529610829103215, + "grad_norm": 4.285374641418457, + "learning_rate": 4.623801466441061e-05, + "loss": 1.2589, + "step": 1335 + }, + { + "epoch": 0.7535250987027636, + "grad_norm": 6.205440998077393, + "learning_rate": 4.6235194585448395e-05, + "loss": 1.5297, + "step": 1336 + }, + { + "epoch": 0.7540891144952059, + "grad_norm": 4.836482524871826, + "learning_rate": 4.623237450648618e-05, + "loss": 1.1148, + "step": 1337 + }, + { + "epoch": 0.754653130287648, + "grad_norm": 8.850518226623535, + "learning_rate": 4.622955442752397e-05, + "loss": 1.3363, + "step": 1338 + }, + { + "epoch": 0.7552171460800903, + "grad_norm": 5.757955074310303, + "learning_rate": 4.6226734348561765e-05, + "loss": 1.1108, + "step": 1339 + }, + { + "epoch": 0.7557811618725324, + "grad_norm": 4.435173034667969, + "learning_rate": 4.622391426959955e-05, + "loss": 1.2057, + "step": 1340 + }, + { + "epoch": 0.7563451776649747, + "grad_norm": 3.972360372543335, + "learning_rate": 4.6221094190637335e-05, + "loss": 1.0112, + "step": 1341 + }, + { + "epoch": 0.7569091934574168, + "grad_norm": 4.717401504516602, + "learning_rate": 4.621827411167513e-05, + "loss": 1.3507, + "step": 1342 + }, + { + "epoch": 0.757473209249859, + "grad_norm": 3.7226743698120117, + "learning_rate": 4.621545403271292e-05, + "loss": 1.0458, + "step": 1343 + }, + { + "epoch": 0.7580372250423012, + "grad_norm": 6.94791841506958, + "learning_rate": 4.6212633953750705e-05, + "loss": 1.6339, + "step": 1344 + }, + { + "epoch": 0.7586012408347433, + "grad_norm": 3.605520248413086, + "learning_rate": 4.62098138747885e-05, + "loss": 0.8064, + "step": 1345 + }, + { + "epoch": 0.7591652566271856, + "grad_norm": 4.66180419921875, + "learning_rate": 4.620699379582629e-05, + "loss": 1.2703, + "step": 1346 + }, + { + "epoch": 0.7597292724196277, + "grad_norm": 4.087086200714111, + "learning_rate": 4.6204173716864075e-05, + "loss": 1.0134, + "step": 1347 + }, + { + "epoch": 0.76029328821207, + "grad_norm": 4.666589736938477, + "learning_rate": 4.620135363790186e-05, + "loss": 1.1166, + "step": 1348 + }, + { + "epoch": 0.7608573040045121, + "grad_norm": 4.827851295471191, + "learning_rate": 4.619853355893965e-05, + "loss": 1.1367, + "step": 1349 + }, + { + "epoch": 0.7614213197969543, + "grad_norm": 4.342495441436768, + "learning_rate": 4.6195713479977445e-05, + "loss": 1.1395, + "step": 1350 + }, + { + "epoch": 0.7619853355893965, + "grad_norm": 8.691017150878906, + "learning_rate": 4.619289340101523e-05, + "loss": 1.5567, + "step": 1351 + }, + { + "epoch": 0.7625493513818387, + "grad_norm": 5.497622013092041, + "learning_rate": 4.6190073322053015e-05, + "loss": 1.4862, + "step": 1352 + }, + { + "epoch": 0.7631133671742809, + "grad_norm": 4.201109886169434, + "learning_rate": 4.6187253243090814e-05, + "loss": 1.2507, + "step": 1353 + }, + { + "epoch": 0.763677382966723, + "grad_norm": 5.961581230163574, + "learning_rate": 4.61844331641286e-05, + "loss": 1.5087, + "step": 1354 + }, + { + "epoch": 0.7642413987591653, + "grad_norm": 4.14495849609375, + "learning_rate": 4.6181613085166385e-05, + "loss": 1.0924, + "step": 1355 + }, + { + "epoch": 0.7648054145516074, + "grad_norm": 4.348697185516357, + "learning_rate": 4.617879300620417e-05, + "loss": 1.0674, + "step": 1356 + }, + { + "epoch": 0.7653694303440496, + "grad_norm": 2.77095365524292, + "learning_rate": 4.617597292724197e-05, + "loss": 0.9745, + "step": 1357 + }, + { + "epoch": 0.7659334461364918, + "grad_norm": 4.3702921867370605, + "learning_rate": 4.6173152848279755e-05, + "loss": 0.9384, + "step": 1358 + }, + { + "epoch": 0.766497461928934, + "grad_norm": 6.754129886627197, + "learning_rate": 4.617033276931754e-05, + "loss": 1.2899, + "step": 1359 + }, + { + "epoch": 0.7670614777213762, + "grad_norm": 4.950490951538086, + "learning_rate": 4.616751269035533e-05, + "loss": 1.1622, + "step": 1360 + }, + { + "epoch": 0.7676254935138184, + "grad_norm": 5.607694149017334, + "learning_rate": 4.6164692611393125e-05, + "loss": 1.4068, + "step": 1361 + }, + { + "epoch": 0.7681895093062606, + "grad_norm": 4.770968437194824, + "learning_rate": 4.616187253243091e-05, + "loss": 1.1551, + "step": 1362 + }, + { + "epoch": 0.7687535250987028, + "grad_norm": 5.478562831878662, + "learning_rate": 4.6159052453468696e-05, + "loss": 1.2631, + "step": 1363 + }, + { + "epoch": 0.7693175408911449, + "grad_norm": 6.065536975860596, + "learning_rate": 4.615623237450649e-05, + "loss": 1.359, + "step": 1364 + }, + { + "epoch": 0.7698815566835872, + "grad_norm": 4.936931610107422, + "learning_rate": 4.615341229554428e-05, + "loss": 1.274, + "step": 1365 + }, + { + "epoch": 0.7704455724760293, + "grad_norm": 6.388620376586914, + "learning_rate": 4.6150592216582065e-05, + "loss": 1.1431, + "step": 1366 + }, + { + "epoch": 0.7710095882684715, + "grad_norm": 5.244349002838135, + "learning_rate": 4.614777213761986e-05, + "loss": 1.3011, + "step": 1367 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 4.451101303100586, + "learning_rate": 4.614495205865764e-05, + "loss": 1.2989, + "step": 1368 + }, + { + "epoch": 0.7721376198533559, + "grad_norm": 3.903989553451538, + "learning_rate": 4.6142131979695435e-05, + "loss": 1.0484, + "step": 1369 + }, + { + "epoch": 0.7727016356457981, + "grad_norm": 5.349419593811035, + "learning_rate": 4.613931190073322e-05, + "loss": 1.0938, + "step": 1370 + }, + { + "epoch": 0.7732656514382402, + "grad_norm": 7.113912105560303, + "learning_rate": 4.613649182177101e-05, + "loss": 1.6742, + "step": 1371 + }, + { + "epoch": 0.7738296672306825, + "grad_norm": 4.013664245605469, + "learning_rate": 4.61336717428088e-05, + "loss": 0.8215, + "step": 1372 + }, + { + "epoch": 0.7743936830231246, + "grad_norm": 6.0130438804626465, + "learning_rate": 4.613085166384659e-05, + "loss": 1.4246, + "step": 1373 + }, + { + "epoch": 0.7749576988155669, + "grad_norm": 5.752763271331787, + "learning_rate": 4.6128031584884376e-05, + "loss": 1.2577, + "step": 1374 + }, + { + "epoch": 0.775521714608009, + "grad_norm": 7.310444355010986, + "learning_rate": 4.612521150592217e-05, + "loss": 1.3726, + "step": 1375 + }, + { + "epoch": 0.7760857304004513, + "grad_norm": 3.708407163619995, + "learning_rate": 4.6122391426959953e-05, + "loss": 1.1695, + "step": 1376 + }, + { + "epoch": 0.7766497461928934, + "grad_norm": 5.541306018829346, + "learning_rate": 4.6119571347997746e-05, + "loss": 1.5612, + "step": 1377 + }, + { + "epoch": 0.7772137619853355, + "grad_norm": 5.670698165893555, + "learning_rate": 4.611675126903554e-05, + "loss": 1.2584, + "step": 1378 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 6.31307315826416, + "learning_rate": 4.611393119007332e-05, + "loss": 1.0352, + "step": 1379 + }, + { + "epoch": 0.7783417935702199, + "grad_norm": 3.214341163635254, + "learning_rate": 4.6111111111111115e-05, + "loss": 1.0661, + "step": 1380 + }, + { + "epoch": 0.7789058093626622, + "grad_norm": 5.052979469299316, + "learning_rate": 4.61082910321489e-05, + "loss": 1.097, + "step": 1381 + }, + { + "epoch": 0.7794698251551043, + "grad_norm": 6.013751983642578, + "learning_rate": 4.610547095318669e-05, + "loss": 1.3282, + "step": 1382 + }, + { + "epoch": 0.7800338409475466, + "grad_norm": 3.7293577194213867, + "learning_rate": 4.610265087422448e-05, + "loss": 1.116, + "step": 1383 + }, + { + "epoch": 0.7805978567399887, + "grad_norm": 6.475741386413574, + "learning_rate": 4.609983079526227e-05, + "loss": 1.435, + "step": 1384 + }, + { + "epoch": 0.7811618725324309, + "grad_norm": 5.144115924835205, + "learning_rate": 4.609701071630006e-05, + "loss": 1.429, + "step": 1385 + }, + { + "epoch": 0.7817258883248731, + "grad_norm": 4.952719688415527, + "learning_rate": 4.609419063733785e-05, + "loss": 1.2419, + "step": 1386 + }, + { + "epoch": 0.7822899041173152, + "grad_norm": 3.3172194957733154, + "learning_rate": 4.6091370558375634e-05, + "loss": 0.9921, + "step": 1387 + }, + { + "epoch": 0.7828539199097575, + "grad_norm": 3.874866008758545, + "learning_rate": 4.6088550479413426e-05, + "loss": 0.9616, + "step": 1388 + }, + { + "epoch": 0.7834179357021996, + "grad_norm": 5.3815717697143555, + "learning_rate": 4.608573040045122e-05, + "loss": 1.2951, + "step": 1389 + }, + { + "epoch": 0.7839819514946419, + "grad_norm": 5.515038967132568, + "learning_rate": 4.6082910321489e-05, + "loss": 1.1663, + "step": 1390 + }, + { + "epoch": 0.784545967287084, + "grad_norm": 4.689113140106201, + "learning_rate": 4.608009024252679e-05, + "loss": 1.185, + "step": 1391 + }, + { + "epoch": 0.7851099830795262, + "grad_norm": 5.176579475402832, + "learning_rate": 4.607727016356458e-05, + "loss": 1.2528, + "step": 1392 + }, + { + "epoch": 0.7856739988719684, + "grad_norm": 6.617683410644531, + "learning_rate": 4.607445008460237e-05, + "loss": 1.4477, + "step": 1393 + }, + { + "epoch": 0.7862380146644106, + "grad_norm": 4.128128528594971, + "learning_rate": 4.607163000564016e-05, + "loss": 1.2683, + "step": 1394 + }, + { + "epoch": 0.7868020304568528, + "grad_norm": 5.077265739440918, + "learning_rate": 4.6068809926677944e-05, + "loss": 1.3368, + "step": 1395 + }, + { + "epoch": 0.787366046249295, + "grad_norm": 7.556220054626465, + "learning_rate": 4.606598984771574e-05, + "loss": 1.4717, + "step": 1396 + }, + { + "epoch": 0.7879300620417372, + "grad_norm": 4.9235992431640625, + "learning_rate": 4.606316976875353e-05, + "loss": 1.0668, + "step": 1397 + }, + { + "epoch": 0.7884940778341794, + "grad_norm": 3.6266872882843018, + "learning_rate": 4.6060349689791314e-05, + "loss": 0.9019, + "step": 1398 + }, + { + "epoch": 0.7890580936266215, + "grad_norm": 4.747989177703857, + "learning_rate": 4.6057529610829106e-05, + "loss": 1.2453, + "step": 1399 + }, + { + "epoch": 0.7896221094190637, + "grad_norm": 4.033945560455322, + "learning_rate": 4.60547095318669e-05, + "loss": 1.1544, + "step": 1400 + }, + { + "epoch": 0.7901861252115059, + "grad_norm": 5.412357330322266, + "learning_rate": 4.6051889452904683e-05, + "loss": 1.3927, + "step": 1401 + }, + { + "epoch": 0.7907501410039481, + "grad_norm": 5.350828647613525, + "learning_rate": 4.604906937394247e-05, + "loss": 1.4453, + "step": 1402 + }, + { + "epoch": 0.7913141567963903, + "grad_norm": 5.168945789337158, + "learning_rate": 4.604624929498026e-05, + "loss": 1.0984, + "step": 1403 + }, + { + "epoch": 0.7918781725888325, + "grad_norm": 5.460413455963135, + "learning_rate": 4.604342921601805e-05, + "loss": 1.0874, + "step": 1404 + }, + { + "epoch": 0.7924421883812747, + "grad_norm": 4.536509037017822, + "learning_rate": 4.604060913705584e-05, + "loss": 0.9983, + "step": 1405 + }, + { + "epoch": 0.7930062041737168, + "grad_norm": 4.781903266906738, + "learning_rate": 4.603778905809363e-05, + "loss": 1.0554, + "step": 1406 + }, + { + "epoch": 0.7935702199661591, + "grad_norm": 5.236364841461182, + "learning_rate": 4.6034968979131416e-05, + "loss": 1.3632, + "step": 1407 + }, + { + "epoch": 0.7941342357586012, + "grad_norm": 3.505204916000366, + "learning_rate": 4.603214890016921e-05, + "loss": 1.0508, + "step": 1408 + }, + { + "epoch": 0.7946982515510435, + "grad_norm": 5.673813819885254, + "learning_rate": 4.6029328821206994e-05, + "loss": 1.1312, + "step": 1409 + }, + { + "epoch": 0.7952622673434856, + "grad_norm": 4.09086799621582, + "learning_rate": 4.6026508742244786e-05, + "loss": 1.0113, + "step": 1410 + }, + { + "epoch": 0.7958262831359278, + "grad_norm": 5.673789024353027, + "learning_rate": 4.602368866328257e-05, + "loss": 1.2544, + "step": 1411 + }, + { + "epoch": 0.79639029892837, + "grad_norm": 4.877456188201904, + "learning_rate": 4.6020868584320364e-05, + "loss": 1.2579, + "step": 1412 + }, + { + "epoch": 0.7969543147208121, + "grad_norm": 5.427639961242676, + "learning_rate": 4.601804850535815e-05, + "loss": 1.459, + "step": 1413 + }, + { + "epoch": 0.7975183305132544, + "grad_norm": 4.172876834869385, + "learning_rate": 4.601522842639594e-05, + "loss": 0.9782, + "step": 1414 + }, + { + "epoch": 0.7980823463056965, + "grad_norm": 4.267716407775879, + "learning_rate": 4.6012408347433733e-05, + "loss": 1.0739, + "step": 1415 + }, + { + "epoch": 0.7986463620981388, + "grad_norm": 4.436158180236816, + "learning_rate": 4.600958826847152e-05, + "loss": 1.1761, + "step": 1416 + }, + { + "epoch": 0.7992103778905809, + "grad_norm": 4.8914875984191895, + "learning_rate": 4.600676818950931e-05, + "loss": 1.1257, + "step": 1417 + }, + { + "epoch": 0.7997743936830232, + "grad_norm": 4.253779411315918, + "learning_rate": 4.6003948110547096e-05, + "loss": 1.0621, + "step": 1418 + }, + { + "epoch": 0.8003384094754653, + "grad_norm": 6.894676685333252, + "learning_rate": 4.600112803158489e-05, + "loss": 1.4117, + "step": 1419 + }, + { + "epoch": 0.8009024252679074, + "grad_norm": 4.56071662902832, + "learning_rate": 4.5998307952622674e-05, + "loss": 1.1702, + "step": 1420 + }, + { + "epoch": 0.8014664410603497, + "grad_norm": 4.7874555587768555, + "learning_rate": 4.5995487873660466e-05, + "loss": 1.1362, + "step": 1421 + }, + { + "epoch": 0.8020304568527918, + "grad_norm": 4.85036563873291, + "learning_rate": 4.599266779469825e-05, + "loss": 1.075, + "step": 1422 + }, + { + "epoch": 0.8025944726452341, + "grad_norm": 4.528674602508545, + "learning_rate": 4.5989847715736044e-05, + "loss": 1.0941, + "step": 1423 + }, + { + "epoch": 0.8031584884376762, + "grad_norm": 5.936415195465088, + "learning_rate": 4.5987027636773836e-05, + "loss": 1.3116, + "step": 1424 + }, + { + "epoch": 0.8037225042301185, + "grad_norm": 6.173067569732666, + "learning_rate": 4.598420755781162e-05, + "loss": 1.4368, + "step": 1425 + }, + { + "epoch": 0.8042865200225606, + "grad_norm": 3.903080940246582, + "learning_rate": 4.598138747884941e-05, + "loss": 1.3401, + "step": 1426 + }, + { + "epoch": 0.8048505358150028, + "grad_norm": 5.9880805015563965, + "learning_rate": 4.59785673998872e-05, + "loss": 1.2248, + "step": 1427 + }, + { + "epoch": 0.805414551607445, + "grad_norm": 4.17664909362793, + "learning_rate": 4.597574732092499e-05, + "loss": 1.14, + "step": 1428 + }, + { + "epoch": 0.8059785673998872, + "grad_norm": 4.551692008972168, + "learning_rate": 4.5972927241962777e-05, + "loss": 0.9384, + "step": 1429 + }, + { + "epoch": 0.8065425831923294, + "grad_norm": 2.9848532676696777, + "learning_rate": 4.597010716300056e-05, + "loss": 0.9075, + "step": 1430 + }, + { + "epoch": 0.8071065989847716, + "grad_norm": 5.661645412445068, + "learning_rate": 4.5967287084038354e-05, + "loss": 1.2406, + "step": 1431 + }, + { + "epoch": 0.8076706147772138, + "grad_norm": 3.602959156036377, + "learning_rate": 4.5964467005076146e-05, + "loss": 0.8976, + "step": 1432 + }, + { + "epoch": 0.8082346305696559, + "grad_norm": 6.131488800048828, + "learning_rate": 4.596164692611393e-05, + "loss": 1.3838, + "step": 1433 + }, + { + "epoch": 0.8087986463620981, + "grad_norm": 5.023265838623047, + "learning_rate": 4.595882684715172e-05, + "loss": 1.3562, + "step": 1434 + }, + { + "epoch": 0.8093626621545403, + "grad_norm": 4.502446174621582, + "learning_rate": 4.5956006768189516e-05, + "loss": 1.249, + "step": 1435 + }, + { + "epoch": 0.8099266779469825, + "grad_norm": 4.59259557723999, + "learning_rate": 4.59531866892273e-05, + "loss": 1.0929, + "step": 1436 + }, + { + "epoch": 0.8104906937394247, + "grad_norm": 4.441952228546143, + "learning_rate": 4.595036661026509e-05, + "loss": 1.0285, + "step": 1437 + }, + { + "epoch": 0.8110547095318669, + "grad_norm": 2.9452507495880127, + "learning_rate": 4.594754653130288e-05, + "loss": 0.9668, + "step": 1438 + }, + { + "epoch": 0.8116187253243091, + "grad_norm": 3.9475152492523193, + "learning_rate": 4.594472645234067e-05, + "loss": 1.0833, + "step": 1439 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 4.696314334869385, + "learning_rate": 4.594190637337846e-05, + "loss": 1.154, + "step": 1440 + }, + { + "epoch": 0.8127467569091935, + "grad_norm": 5.911883354187012, + "learning_rate": 4.593908629441624e-05, + "loss": 1.1766, + "step": 1441 + }, + { + "epoch": 0.8133107727016357, + "grad_norm": 4.853512287139893, + "learning_rate": 4.5936266215454034e-05, + "loss": 0.8527, + "step": 1442 + }, + { + "epoch": 0.8138747884940778, + "grad_norm": 4.640796184539795, + "learning_rate": 4.5933446136491826e-05, + "loss": 1.1054, + "step": 1443 + }, + { + "epoch": 0.81443880428652, + "grad_norm": 4.663665771484375, + "learning_rate": 4.593062605752961e-05, + "loss": 1.09, + "step": 1444 + }, + { + "epoch": 0.8150028200789622, + "grad_norm": 3.7449283599853516, + "learning_rate": 4.5927805978567404e-05, + "loss": 1.1285, + "step": 1445 + }, + { + "epoch": 0.8155668358714044, + "grad_norm": 4.306171894073486, + "learning_rate": 4.592498589960519e-05, + "loss": 1.2471, + "step": 1446 + }, + { + "epoch": 0.8161308516638466, + "grad_norm": 5.673454761505127, + "learning_rate": 4.592216582064298e-05, + "loss": 1.1784, + "step": 1447 + }, + { + "epoch": 0.8166948674562888, + "grad_norm": 6.245659351348877, + "learning_rate": 4.591934574168077e-05, + "loss": 1.29, + "step": 1448 + }, + { + "epoch": 0.817258883248731, + "grad_norm": 4.755579471588135, + "learning_rate": 4.591652566271856e-05, + "loss": 1.0925, + "step": 1449 + }, + { + "epoch": 0.8178228990411731, + "grad_norm": 5.522821426391602, + "learning_rate": 4.591370558375635e-05, + "loss": 1.1241, + "step": 1450 + }, + { + "epoch": 0.8183869148336154, + "grad_norm": 4.0832200050354, + "learning_rate": 4.591088550479414e-05, + "loss": 1.0229, + "step": 1451 + }, + { + "epoch": 0.8189509306260575, + "grad_norm": 3.736671209335327, + "learning_rate": 4.590806542583192e-05, + "loss": 1.1049, + "step": 1452 + }, + { + "epoch": 0.8195149464184998, + "grad_norm": 3.6435739994049072, + "learning_rate": 4.5905245346869714e-05, + "loss": 1.1495, + "step": 1453 + }, + { + "epoch": 0.8200789622109419, + "grad_norm": 5.558770656585693, + "learning_rate": 4.590242526790751e-05, + "loss": 1.1803, + "step": 1454 + }, + { + "epoch": 0.8206429780033841, + "grad_norm": 4.128566741943359, + "learning_rate": 4.589960518894529e-05, + "loss": 1.0248, + "step": 1455 + }, + { + "epoch": 0.8212069937958263, + "grad_norm": 4.250308036804199, + "learning_rate": 4.5896785109983084e-05, + "loss": 0.9929, + "step": 1456 + }, + { + "epoch": 0.8217710095882684, + "grad_norm": 6.121755599975586, + "learning_rate": 4.589396503102087e-05, + "loss": 1.2508, + "step": 1457 + }, + { + "epoch": 0.8223350253807107, + "grad_norm": 4.497706890106201, + "learning_rate": 4.589114495205866e-05, + "loss": 1.0474, + "step": 1458 + }, + { + "epoch": 0.8228990411731528, + "grad_norm": 4.753546237945557, + "learning_rate": 4.588832487309645e-05, + "loss": 1.0028, + "step": 1459 + }, + { + "epoch": 0.8234630569655951, + "grad_norm": 5.425995349884033, + "learning_rate": 4.588550479413424e-05, + "loss": 1.2169, + "step": 1460 + }, + { + "epoch": 0.8240270727580372, + "grad_norm": 3.2092244625091553, + "learning_rate": 4.5882684715172025e-05, + "loss": 0.805, + "step": 1461 + }, + { + "epoch": 0.8245910885504795, + "grad_norm": 4.531754970550537, + "learning_rate": 4.587986463620982e-05, + "loss": 1.0792, + "step": 1462 + }, + { + "epoch": 0.8251551043429216, + "grad_norm": 6.085624694824219, + "learning_rate": 4.587704455724761e-05, + "loss": 1.2484, + "step": 1463 + }, + { + "epoch": 0.8257191201353637, + "grad_norm": 5.228318214416504, + "learning_rate": 4.5874224478285395e-05, + "loss": 1.3603, + "step": 1464 + }, + { + "epoch": 0.826283135927806, + "grad_norm": 4.700536727905273, + "learning_rate": 4.587140439932318e-05, + "loss": 0.9741, + "step": 1465 + }, + { + "epoch": 0.8268471517202481, + "grad_norm": 4.078838348388672, + "learning_rate": 4.586858432036097e-05, + "loss": 1.0489, + "step": 1466 + }, + { + "epoch": 0.8274111675126904, + "grad_norm": 5.896365642547607, + "learning_rate": 4.5865764241398764e-05, + "loss": 1.3072, + "step": 1467 + }, + { + "epoch": 0.8279751833051325, + "grad_norm": 4.20445442199707, + "learning_rate": 4.586294416243655e-05, + "loss": 1.0698, + "step": 1468 + }, + { + "epoch": 0.8285391990975748, + "grad_norm": 4.571406841278076, + "learning_rate": 4.5860124083474335e-05, + "loss": 1.1127, + "step": 1469 + }, + { + "epoch": 0.8291032148900169, + "grad_norm": 5.496472358703613, + "learning_rate": 4.585730400451213e-05, + "loss": 1.1624, + "step": 1470 + }, + { + "epoch": 0.8296672306824591, + "grad_norm": 4.0838398933410645, + "learning_rate": 4.585448392554992e-05, + "loss": 0.9716, + "step": 1471 + }, + { + "epoch": 0.8302312464749013, + "grad_norm": 4.330972194671631, + "learning_rate": 4.5851663846587705e-05, + "loss": 1.107, + "step": 1472 + }, + { + "epoch": 0.8307952622673435, + "grad_norm": 4.077773094177246, + "learning_rate": 4.584884376762549e-05, + "loss": 1.0922, + "step": 1473 + }, + { + "epoch": 0.8313592780597857, + "grad_norm": 6.369618892669678, + "learning_rate": 4.584602368866329e-05, + "loss": 1.2778, + "step": 1474 + }, + { + "epoch": 0.8319232938522279, + "grad_norm": 5.41652250289917, + "learning_rate": 4.5843203609701075e-05, + "loss": 1.2767, + "step": 1475 + }, + { + "epoch": 0.8324873096446701, + "grad_norm": 5.413517951965332, + "learning_rate": 4.584038353073886e-05, + "loss": 0.9731, + "step": 1476 + }, + { + "epoch": 0.8330513254371122, + "grad_norm": 4.618905067443848, + "learning_rate": 4.583756345177665e-05, + "loss": 1.0746, + "step": 1477 + }, + { + "epoch": 0.8336153412295544, + "grad_norm": 5.4160356521606445, + "learning_rate": 4.5834743372814445e-05, + "loss": 1.1168, + "step": 1478 + }, + { + "epoch": 0.8341793570219966, + "grad_norm": 4.623969078063965, + "learning_rate": 4.583192329385223e-05, + "loss": 1.2515, + "step": 1479 + }, + { + "epoch": 0.8347433728144388, + "grad_norm": 4.783052444458008, + "learning_rate": 4.5829103214890015e-05, + "loss": 1.1537, + "step": 1480 + }, + { + "epoch": 0.835307388606881, + "grad_norm": 5.9046831130981445, + "learning_rate": 4.582628313592781e-05, + "loss": 1.3204, + "step": 1481 + }, + { + "epoch": 0.8358714043993232, + "grad_norm": 8.751752853393555, + "learning_rate": 4.58234630569656e-05, + "loss": 1.2496, + "step": 1482 + }, + { + "epoch": 0.8364354201917654, + "grad_norm": 4.2875566482543945, + "learning_rate": 4.5820642978003385e-05, + "loss": 1.2196, + "step": 1483 + }, + { + "epoch": 0.8369994359842076, + "grad_norm": 7.043860912322998, + "learning_rate": 4.581782289904118e-05, + "loss": 1.5004, + "step": 1484 + }, + { + "epoch": 0.8375634517766497, + "grad_norm": 4.448366641998291, + "learning_rate": 4.581500282007897e-05, + "loss": 1.2489, + "step": 1485 + }, + { + "epoch": 0.838127467569092, + "grad_norm": 4.928878307342529, + "learning_rate": 4.5812182741116755e-05, + "loss": 1.0945, + "step": 1486 + }, + { + "epoch": 0.8386914833615341, + "grad_norm": 2.9424033164978027, + "learning_rate": 4.580936266215454e-05, + "loss": 1.0428, + "step": 1487 + }, + { + "epoch": 0.8392554991539763, + "grad_norm": 3.6185407638549805, + "learning_rate": 4.580654258319233e-05, + "loss": 0.8959, + "step": 1488 + }, + { + "epoch": 0.8398195149464185, + "grad_norm": 4.76283597946167, + "learning_rate": 4.5803722504230125e-05, + "loss": 1.0965, + "step": 1489 + }, + { + "epoch": 0.8403835307388607, + "grad_norm": 5.434373378753662, + "learning_rate": 4.580090242526791e-05, + "loss": 1.2421, + "step": 1490 + }, + { + "epoch": 0.8409475465313029, + "grad_norm": 5.419107913970947, + "learning_rate": 4.5798082346305695e-05, + "loss": 1.3337, + "step": 1491 + }, + { + "epoch": 0.841511562323745, + "grad_norm": 4.714910984039307, + "learning_rate": 4.579526226734349e-05, + "loss": 1.1447, + "step": 1492 + }, + { + "epoch": 0.8420755781161873, + "grad_norm": 4.987724304199219, + "learning_rate": 4.579244218838128e-05, + "loss": 1.2567, + "step": 1493 + }, + { + "epoch": 0.8426395939086294, + "grad_norm": 3.855093002319336, + "learning_rate": 4.5789622109419065e-05, + "loss": 0.9207, + "step": 1494 + }, + { + "epoch": 0.8432036097010717, + "grad_norm": 5.485004901885986, + "learning_rate": 4.578680203045686e-05, + "loss": 1.1495, + "step": 1495 + }, + { + "epoch": 0.8437676254935138, + "grad_norm": 5.25796365737915, + "learning_rate": 4.578398195149464e-05, + "loss": 1.3094, + "step": 1496 + }, + { + "epoch": 0.8443316412859561, + "grad_norm": 5.20716667175293, + "learning_rate": 4.5781161872532435e-05, + "loss": 1.2321, + "step": 1497 + }, + { + "epoch": 0.8448956570783982, + "grad_norm": 5.869433879852295, + "learning_rate": 4.577834179357022e-05, + "loss": 1.0542, + "step": 1498 + }, + { + "epoch": 0.8454596728708403, + "grad_norm": 3.579408645629883, + "learning_rate": 4.577552171460801e-05, + "loss": 0.9965, + "step": 1499 + }, + { + "epoch": 0.8460236886632826, + "grad_norm": 3.0185601711273193, + "learning_rate": 4.57727016356458e-05, + "loss": 0.9088, + "step": 1500 + }, + { + "epoch": 0.8465877044557247, + "grad_norm": 6.642983436584473, + "learning_rate": 4.576988155668359e-05, + "loss": 1.2578, + "step": 1501 + }, + { + "epoch": 0.847151720248167, + "grad_norm": 5.529515743255615, + "learning_rate": 4.576706147772138e-05, + "loss": 1.2579, + "step": 1502 + }, + { + "epoch": 0.8477157360406091, + "grad_norm": 4.238567352294922, + "learning_rate": 4.576424139875917e-05, + "loss": 1.0096, + "step": 1503 + }, + { + "epoch": 0.8482797518330514, + "grad_norm": 5.6437530517578125, + "learning_rate": 4.576142131979695e-05, + "loss": 1.2076, + "step": 1504 + }, + { + "epoch": 0.8488437676254935, + "grad_norm": 3.0957305431365967, + "learning_rate": 4.5758601240834745e-05, + "loss": 0.9321, + "step": 1505 + }, + { + "epoch": 0.8494077834179357, + "grad_norm": 4.7582688331604, + "learning_rate": 4.575578116187254e-05, + "loss": 0.9412, + "step": 1506 + }, + { + "epoch": 0.8499717992103779, + "grad_norm": 5.82043981552124, + "learning_rate": 4.575296108291032e-05, + "loss": 1.3594, + "step": 1507 + }, + { + "epoch": 0.85053581500282, + "grad_norm": 5.750604152679443, + "learning_rate": 4.575014100394811e-05, + "loss": 1.3094, + "step": 1508 + }, + { + "epoch": 0.8510998307952623, + "grad_norm": 4.608838081359863, + "learning_rate": 4.57473209249859e-05, + "loss": 0.9646, + "step": 1509 + }, + { + "epoch": 0.8516638465877044, + "grad_norm": 3.8547887802124023, + "learning_rate": 4.574450084602369e-05, + "loss": 0.9768, + "step": 1510 + }, + { + "epoch": 0.8522278623801467, + "grad_norm": 6.250915050506592, + "learning_rate": 4.574168076706148e-05, + "loss": 1.4922, + "step": 1511 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 4.303508281707764, + "learning_rate": 4.5738860688099264e-05, + "loss": 1.1314, + "step": 1512 + }, + { + "epoch": 0.853355893965031, + "grad_norm": 5.713797569274902, + "learning_rate": 4.573604060913706e-05, + "loss": 1.0992, + "step": 1513 + }, + { + "epoch": 0.8539199097574732, + "grad_norm": 4.545214653015137, + "learning_rate": 4.573322053017485e-05, + "loss": 1.2388, + "step": 1514 + }, + { + "epoch": 0.8544839255499154, + "grad_norm": 5.5733642578125, + "learning_rate": 4.573040045121263e-05, + "loss": 1.1104, + "step": 1515 + }, + { + "epoch": 0.8550479413423576, + "grad_norm": 4.852461338043213, + "learning_rate": 4.5727580372250426e-05, + "loss": 0.999, + "step": 1516 + }, + { + "epoch": 0.8556119571347998, + "grad_norm": 4.700019359588623, + "learning_rate": 4.572476029328822e-05, + "loss": 1.0795, + "step": 1517 + }, + { + "epoch": 0.856175972927242, + "grad_norm": 4.862957954406738, + "learning_rate": 4.5721940214326e-05, + "loss": 1.3796, + "step": 1518 + }, + { + "epoch": 0.8567399887196842, + "grad_norm": 3.438724994659424, + "learning_rate": 4.571912013536379e-05, + "loss": 1.1318, + "step": 1519 + }, + { + "epoch": 0.8573040045121263, + "grad_norm": 3.0249338150024414, + "learning_rate": 4.571630005640159e-05, + "loss": 0.9478, + "step": 1520 + }, + { + "epoch": 0.8578680203045685, + "grad_norm": 4.249497413635254, + "learning_rate": 4.571347997743937e-05, + "loss": 1.0985, + "step": 1521 + }, + { + "epoch": 0.8584320360970107, + "grad_norm": 5.176840782165527, + "learning_rate": 4.571065989847716e-05, + "loss": 1.1181, + "step": 1522 + }, + { + "epoch": 0.8589960518894529, + "grad_norm": 6.70627498626709, + "learning_rate": 4.5707839819514944e-05, + "loss": 1.5654, + "step": 1523 + }, + { + "epoch": 0.8595600676818951, + "grad_norm": 3.9612317085266113, + "learning_rate": 4.570501974055274e-05, + "loss": 1.1098, + "step": 1524 + }, + { + "epoch": 0.8601240834743373, + "grad_norm": 5.028614044189453, + "learning_rate": 4.570219966159053e-05, + "loss": 0.9545, + "step": 1525 + }, + { + "epoch": 0.8606880992667795, + "grad_norm": 4.310511112213135, + "learning_rate": 4.5699379582628314e-05, + "loss": 1.1992, + "step": 1526 + }, + { + "epoch": 0.8612521150592216, + "grad_norm": 4.292834281921387, + "learning_rate": 4.5696559503666106e-05, + "loss": 1.2314, + "step": 1527 + }, + { + "epoch": 0.8618161308516639, + "grad_norm": 4.222016334533691, + "learning_rate": 4.56937394247039e-05, + "loss": 0.8649, + "step": 1528 + }, + { + "epoch": 0.862380146644106, + "grad_norm": 4.521690368652344, + "learning_rate": 4.569091934574168e-05, + "loss": 1.2662, + "step": 1529 + }, + { + "epoch": 0.8629441624365483, + "grad_norm": 7.2918219566345215, + "learning_rate": 4.568809926677947e-05, + "loss": 1.4771, + "step": 1530 + }, + { + "epoch": 0.8635081782289904, + "grad_norm": 4.462699890136719, + "learning_rate": 4.568527918781726e-05, + "loss": 0.9603, + "step": 1531 + }, + { + "epoch": 0.8640721940214326, + "grad_norm": 4.199741840362549, + "learning_rate": 4.568245910885505e-05, + "loss": 1.0344, + "step": 1532 + }, + { + "epoch": 0.8646362098138748, + "grad_norm": 4.844109058380127, + "learning_rate": 4.567963902989284e-05, + "loss": 1.0088, + "step": 1533 + }, + { + "epoch": 0.8652002256063169, + "grad_norm": 5.101992607116699, + "learning_rate": 4.567681895093063e-05, + "loss": 1.2703, + "step": 1534 + }, + { + "epoch": 0.8657642413987592, + "grad_norm": 4.657732009887695, + "learning_rate": 4.5673998871968416e-05, + "loss": 1.0155, + "step": 1535 + }, + { + "epoch": 0.8663282571912013, + "grad_norm": 4.278534889221191, + "learning_rate": 4.567117879300621e-05, + "loss": 1.1232, + "step": 1536 + }, + { + "epoch": 0.8668922729836436, + "grad_norm": 3.2044129371643066, + "learning_rate": 4.5668358714043994e-05, + "loss": 0.9698, + "step": 1537 + }, + { + "epoch": 0.8674562887760857, + "grad_norm": 3.5521140098571777, + "learning_rate": 4.5665538635081786e-05, + "loss": 0.873, + "step": 1538 + }, + { + "epoch": 0.868020304568528, + "grad_norm": 4.030319690704346, + "learning_rate": 4.566271855611957e-05, + "loss": 1.2047, + "step": 1539 + }, + { + "epoch": 0.8685843203609701, + "grad_norm": 5.287849426269531, + "learning_rate": 4.5659898477157363e-05, + "loss": 1.1881, + "step": 1540 + }, + { + "epoch": 0.8691483361534122, + "grad_norm": 5.137966632843018, + "learning_rate": 4.565707839819515e-05, + "loss": 1.3603, + "step": 1541 + }, + { + "epoch": 0.8697123519458545, + "grad_norm": 4.6329450607299805, + "learning_rate": 4.565425831923294e-05, + "loss": 1.0857, + "step": 1542 + }, + { + "epoch": 0.8702763677382966, + "grad_norm": 3.636190891265869, + "learning_rate": 4.5651438240270726e-05, + "loss": 1.0418, + "step": 1543 + }, + { + "epoch": 0.8708403835307389, + "grad_norm": 4.1060967445373535, + "learning_rate": 4.564861816130852e-05, + "loss": 1.0636, + "step": 1544 + }, + { + "epoch": 0.871404399323181, + "grad_norm": 5.834307670593262, + "learning_rate": 4.564579808234631e-05, + "loss": 1.0778, + "step": 1545 + }, + { + "epoch": 0.8719684151156233, + "grad_norm": 4.481376647949219, + "learning_rate": 4.5642978003384096e-05, + "loss": 1.3369, + "step": 1546 + }, + { + "epoch": 0.8725324309080654, + "grad_norm": 5.135843276977539, + "learning_rate": 4.564015792442188e-05, + "loss": 1.1252, + "step": 1547 + }, + { + "epoch": 0.8730964467005076, + "grad_norm": 2.7273409366607666, + "learning_rate": 4.5637337845459674e-05, + "loss": 0.8394, + "step": 1548 + }, + { + "epoch": 0.8736604624929498, + "grad_norm": 4.139870643615723, + "learning_rate": 4.5634517766497466e-05, + "loss": 1.2617, + "step": 1549 + }, + { + "epoch": 0.874224478285392, + "grad_norm": 6.971037864685059, + "learning_rate": 4.563169768753525e-05, + "loss": 1.2786, + "step": 1550 + }, + { + "epoch": 0.8747884940778342, + "grad_norm": 6.646045684814453, + "learning_rate": 4.562887760857304e-05, + "loss": 1.2246, + "step": 1551 + }, + { + "epoch": 0.8753525098702764, + "grad_norm": 7.146888256072998, + "learning_rate": 4.5626057529610836e-05, + "loss": 1.2706, + "step": 1552 + }, + { + "epoch": 0.8759165256627186, + "grad_norm": 3.957871198654175, + "learning_rate": 4.562323745064862e-05, + "loss": 1.066, + "step": 1553 + }, + { + "epoch": 0.8764805414551607, + "grad_norm": 4.875335693359375, + "learning_rate": 4.5620417371686407e-05, + "loss": 1.0257, + "step": 1554 + }, + { + "epoch": 0.8770445572476029, + "grad_norm": 6.228886604309082, + "learning_rate": 4.56175972927242e-05, + "loss": 1.1419, + "step": 1555 + }, + { + "epoch": 0.8776085730400451, + "grad_norm": 6.4083571434021, + "learning_rate": 4.561477721376199e-05, + "loss": 1.2931, + "step": 1556 + }, + { + "epoch": 0.8781725888324873, + "grad_norm": 3.9603004455566406, + "learning_rate": 4.5611957134799776e-05, + "loss": 1.4233, + "step": 1557 + }, + { + "epoch": 0.8787366046249295, + "grad_norm": 6.671829700469971, + "learning_rate": 4.560913705583756e-05, + "loss": 1.1438, + "step": 1558 + }, + { + "epoch": 0.8793006204173717, + "grad_norm": 4.784319877624512, + "learning_rate": 4.5606316976875354e-05, + "loss": 1.2276, + "step": 1559 + }, + { + "epoch": 0.8798646362098139, + "grad_norm": 8.520052909851074, + "learning_rate": 4.5603496897913146e-05, + "loss": 1.278, + "step": 1560 + }, + { + "epoch": 0.8804286520022561, + "grad_norm": 5.478610038757324, + "learning_rate": 4.560067681895093e-05, + "loss": 1.2033, + "step": 1561 + }, + { + "epoch": 0.8809926677946982, + "grad_norm": 3.8521640300750732, + "learning_rate": 4.559785673998872e-05, + "loss": 1.1756, + "step": 1562 + }, + { + "epoch": 0.8815566835871405, + "grad_norm": 3.2289931774139404, + "learning_rate": 4.5595036661026516e-05, + "loss": 0.9344, + "step": 1563 + }, + { + "epoch": 0.8821206993795826, + "grad_norm": 2.6700541973114014, + "learning_rate": 4.55922165820643e-05, + "loss": 0.782, + "step": 1564 + }, + { + "epoch": 0.8826847151720248, + "grad_norm": 4.994978904724121, + "learning_rate": 4.558939650310209e-05, + "loss": 1.1861, + "step": 1565 + }, + { + "epoch": 0.883248730964467, + "grad_norm": 4.055960655212402, + "learning_rate": 4.558657642413988e-05, + "loss": 0.9732, + "step": 1566 + }, + { + "epoch": 0.8838127467569092, + "grad_norm": 4.197771072387695, + "learning_rate": 4.558375634517767e-05, + "loss": 1.0809, + "step": 1567 + }, + { + "epoch": 0.8843767625493514, + "grad_norm": 5.771745204925537, + "learning_rate": 4.5580936266215457e-05, + "loss": 1.1225, + "step": 1568 + }, + { + "epoch": 0.8849407783417935, + "grad_norm": 5.204948902130127, + "learning_rate": 4.557811618725324e-05, + "loss": 1.0635, + "step": 1569 + }, + { + "epoch": 0.8855047941342358, + "grad_norm": 4.980511665344238, + "learning_rate": 4.5575296108291034e-05, + "loss": 1.0433, + "step": 1570 + }, + { + "epoch": 0.8860688099266779, + "grad_norm": 4.743584156036377, + "learning_rate": 4.5572476029328826e-05, + "loss": 1.263, + "step": 1571 + }, + { + "epoch": 0.8866328257191202, + "grad_norm": 5.382486343383789, + "learning_rate": 4.556965595036661e-05, + "loss": 1.0764, + "step": 1572 + }, + { + "epoch": 0.8871968415115623, + "grad_norm": 4.417191982269287, + "learning_rate": 4.5566835871404404e-05, + "loss": 1.1092, + "step": 1573 + }, + { + "epoch": 0.8877608573040046, + "grad_norm": 4.139051914215088, + "learning_rate": 4.556401579244219e-05, + "loss": 0.8739, + "step": 1574 + }, + { + "epoch": 0.8883248730964467, + "grad_norm": 4.062941551208496, + "learning_rate": 4.556119571347998e-05, + "loss": 0.9868, + "step": 1575 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 6.385035514831543, + "learning_rate": 4.555837563451777e-05, + "loss": 1.1747, + "step": 1576 + }, + { + "epoch": 0.8894529046813311, + "grad_norm": 5.495694160461426, + "learning_rate": 4.555555555555556e-05, + "loss": 1.1264, + "step": 1577 + }, + { + "epoch": 0.8900169204737732, + "grad_norm": 5.307939052581787, + "learning_rate": 4.5552735476593345e-05, + "loss": 0.9984, + "step": 1578 + }, + { + "epoch": 0.8905809362662155, + "grad_norm": 3.512604236602783, + "learning_rate": 4.554991539763114e-05, + "loss": 1.071, + "step": 1579 + }, + { + "epoch": 0.8911449520586576, + "grad_norm": 5.982153415679932, + "learning_rate": 4.554709531866892e-05, + "loss": 1.3683, + "step": 1580 + }, + { + "epoch": 0.8917089678510999, + "grad_norm": 5.469848155975342, + "learning_rate": 4.5544275239706714e-05, + "loss": 1.1814, + "step": 1581 + }, + { + "epoch": 0.892272983643542, + "grad_norm": 3.8971848487854004, + "learning_rate": 4.55414551607445e-05, + "loss": 1.0216, + "step": 1582 + }, + { + "epoch": 0.8928369994359842, + "grad_norm": 3.626577615737915, + "learning_rate": 4.553863508178229e-05, + "loss": 0.7912, + "step": 1583 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 4.844275951385498, + "learning_rate": 4.5535815002820084e-05, + "loss": 1.2683, + "step": 1584 + }, + { + "epoch": 0.8939650310208686, + "grad_norm": 4.300941467285156, + "learning_rate": 4.553299492385787e-05, + "loss": 1.0946, + "step": 1585 + }, + { + "epoch": 0.8945290468133108, + "grad_norm": 5.7074151039123535, + "learning_rate": 4.5530174844895655e-05, + "loss": 1.3177, + "step": 1586 + }, + { + "epoch": 0.8950930626057529, + "grad_norm": 4.1714253425598145, + "learning_rate": 4.552735476593345e-05, + "loss": 1.0207, + "step": 1587 + }, + { + "epoch": 0.8956570783981952, + "grad_norm": 3.950143814086914, + "learning_rate": 4.552453468697124e-05, + "loss": 0.9398, + "step": 1588 + }, + { + "epoch": 0.8962210941906373, + "grad_norm": 5.708602428436279, + "learning_rate": 4.5521714608009025e-05, + "loss": 1.1957, + "step": 1589 + }, + { + "epoch": 0.8967851099830795, + "grad_norm": 3.9559926986694336, + "learning_rate": 4.551889452904681e-05, + "loss": 1.0005, + "step": 1590 + }, + { + "epoch": 0.8973491257755217, + "grad_norm": 3.8733317852020264, + "learning_rate": 4.551607445008461e-05, + "loss": 1.1099, + "step": 1591 + }, + { + "epoch": 0.8979131415679639, + "grad_norm": 3.520401954650879, + "learning_rate": 4.5513254371122394e-05, + "loss": 0.8804, + "step": 1592 + }, + { + "epoch": 0.8984771573604061, + "grad_norm": 4.457393646240234, + "learning_rate": 4.551043429216018e-05, + "loss": 1.0029, + "step": 1593 + }, + { + "epoch": 0.8990411731528483, + "grad_norm": 3.959407091140747, + "learning_rate": 4.550761421319797e-05, + "loss": 0.9479, + "step": 1594 + }, + { + "epoch": 0.8996051889452905, + "grad_norm": 3.0859627723693848, + "learning_rate": 4.5504794134235764e-05, + "loss": 0.8539, + "step": 1595 + }, + { + "epoch": 0.9001692047377327, + "grad_norm": 3.5678465366363525, + "learning_rate": 4.550197405527355e-05, + "loss": 1.105, + "step": 1596 + }, + { + "epoch": 0.9007332205301748, + "grad_norm": 4.1950297355651855, + "learning_rate": 4.5499153976311335e-05, + "loss": 1.0218, + "step": 1597 + }, + { + "epoch": 0.901297236322617, + "grad_norm": 6.295259475708008, + "learning_rate": 4.549633389734913e-05, + "loss": 1.2292, + "step": 1598 + }, + { + "epoch": 0.9018612521150592, + "grad_norm": 5.289989471435547, + "learning_rate": 4.549351381838692e-05, + "loss": 1.1871, + "step": 1599 + }, + { + "epoch": 0.9024252679075014, + "grad_norm": 4.712060928344727, + "learning_rate": 4.5490693739424705e-05, + "loss": 1.0702, + "step": 1600 + }, + { + "epoch": 0.9029892836999436, + "grad_norm": 9.3281831741333, + "learning_rate": 4.548787366046249e-05, + "loss": 1.1449, + "step": 1601 + }, + { + "epoch": 0.9035532994923858, + "grad_norm": 5.781343460083008, + "learning_rate": 4.548505358150029e-05, + "loss": 1.3275, + "step": 1602 + }, + { + "epoch": 0.904117315284828, + "grad_norm": 3.888228416442871, + "learning_rate": 4.5482233502538075e-05, + "loss": 1.0873, + "step": 1603 + }, + { + "epoch": 0.9046813310772701, + "grad_norm": 3.8993923664093018, + "learning_rate": 4.547941342357586e-05, + "loss": 1.0654, + "step": 1604 + }, + { + "epoch": 0.9052453468697124, + "grad_norm": 4.703512191772461, + "learning_rate": 4.547659334461365e-05, + "loss": 0.9688, + "step": 1605 + }, + { + "epoch": 0.9058093626621545, + "grad_norm": 4.146636486053467, + "learning_rate": 4.5473773265651444e-05, + "loss": 0.9911, + "step": 1606 + }, + { + "epoch": 0.9063733784545968, + "grad_norm": 4.234034061431885, + "learning_rate": 4.547095318668923e-05, + "loss": 0.9642, + "step": 1607 + }, + { + "epoch": 0.9069373942470389, + "grad_norm": 3.8315956592559814, + "learning_rate": 4.5468133107727015e-05, + "loss": 1.1813, + "step": 1608 + }, + { + "epoch": 0.9075014100394811, + "grad_norm": 4.280881881713867, + "learning_rate": 4.546531302876481e-05, + "loss": 0.9055, + "step": 1609 + }, + { + "epoch": 0.9080654258319233, + "grad_norm": 5.494202136993408, + "learning_rate": 4.54624929498026e-05, + "loss": 1.0692, + "step": 1610 + }, + { + "epoch": 0.9086294416243654, + "grad_norm": 5.319856643676758, + "learning_rate": 4.5459672870840385e-05, + "loss": 0.8652, + "step": 1611 + }, + { + "epoch": 0.9091934574168077, + "grad_norm": 5.547131061553955, + "learning_rate": 4.545685279187818e-05, + "loss": 1.2424, + "step": 1612 + }, + { + "epoch": 0.9097574732092498, + "grad_norm": 7.955689907073975, + "learning_rate": 4.545403271291596e-05, + "loss": 1.2293, + "step": 1613 + }, + { + "epoch": 0.9103214890016921, + "grad_norm": 3.376760244369507, + "learning_rate": 4.5451212633953755e-05, + "loss": 0.9952, + "step": 1614 + }, + { + "epoch": 0.9108855047941342, + "grad_norm": 4.339445114135742, + "learning_rate": 4.544839255499154e-05, + "loss": 1.1966, + "step": 1615 + }, + { + "epoch": 0.9114495205865765, + "grad_norm": 4.462192535400391, + "learning_rate": 4.544557247602933e-05, + "loss": 1.013, + "step": 1616 + }, + { + "epoch": 0.9120135363790186, + "grad_norm": 3.195354461669922, + "learning_rate": 4.544275239706712e-05, + "loss": 0.6924, + "step": 1617 + }, + { + "epoch": 0.9125775521714607, + "grad_norm": 3.884880542755127, + "learning_rate": 4.543993231810491e-05, + "loss": 1.0315, + "step": 1618 + }, + { + "epoch": 0.913141567963903, + "grad_norm": 5.128406524658203, + "learning_rate": 4.5437112239142695e-05, + "loss": 1.1366, + "step": 1619 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 4.670577049255371, + "learning_rate": 4.543429216018049e-05, + "loss": 1.1666, + "step": 1620 + }, + { + "epoch": 0.9142695995487874, + "grad_norm": 3.579533576965332, + "learning_rate": 4.543147208121827e-05, + "loss": 1.0918, + "step": 1621 + }, + { + "epoch": 0.9148336153412295, + "grad_norm": 2.1410043239593506, + "learning_rate": 4.5428652002256065e-05, + "loss": 0.8924, + "step": 1622 + }, + { + "epoch": 0.9153976311336718, + "grad_norm": 3.6777968406677246, + "learning_rate": 4.542583192329386e-05, + "loss": 0.9576, + "step": 1623 + }, + { + "epoch": 0.9159616469261139, + "grad_norm": 4.825714588165283, + "learning_rate": 4.542301184433164e-05, + "loss": 0.9793, + "step": 1624 + }, + { + "epoch": 0.9165256627185561, + "grad_norm": 4.918032169342041, + "learning_rate": 4.542019176536943e-05, + "loss": 1.0646, + "step": 1625 + }, + { + "epoch": 0.9170896785109983, + "grad_norm": 3.4716012477874756, + "learning_rate": 4.541737168640722e-05, + "loss": 0.7971, + "step": 1626 + }, + { + "epoch": 0.9176536943034405, + "grad_norm": 4.649503707885742, + "learning_rate": 4.541455160744501e-05, + "loss": 0.8053, + "step": 1627 + }, + { + "epoch": 0.9182177100958827, + "grad_norm": 2.7109529972076416, + "learning_rate": 4.54117315284828e-05, + "loss": 0.8227, + "step": 1628 + }, + { + "epoch": 0.9187817258883249, + "grad_norm": 5.5054097175598145, + "learning_rate": 4.540891144952059e-05, + "loss": 1.2959, + "step": 1629 + }, + { + "epoch": 0.9193457416807671, + "grad_norm": 5.826297760009766, + "learning_rate": 4.540609137055838e-05, + "loss": 1.2373, + "step": 1630 + }, + { + "epoch": 0.9199097574732092, + "grad_norm": 4.176784038543701, + "learning_rate": 4.540327129159617e-05, + "loss": 0.9711, + "step": 1631 + }, + { + "epoch": 0.9204737732656514, + "grad_norm": 5.501131057739258, + "learning_rate": 4.540045121263395e-05, + "loss": 1.0901, + "step": 1632 + }, + { + "epoch": 0.9210377890580936, + "grad_norm": 4.278841018676758, + "learning_rate": 4.5397631133671745e-05, + "loss": 1.088, + "step": 1633 + }, + { + "epoch": 0.9216018048505358, + "grad_norm": 3.173179864883423, + "learning_rate": 4.539481105470954e-05, + "loss": 0.9403, + "step": 1634 + }, + { + "epoch": 0.922165820642978, + "grad_norm": 5.801565170288086, + "learning_rate": 4.539199097574732e-05, + "loss": 1.2434, + "step": 1635 + }, + { + "epoch": 0.9227298364354202, + "grad_norm": 4.2680559158325195, + "learning_rate": 4.538917089678511e-05, + "loss": 1.0564, + "step": 1636 + }, + { + "epoch": 0.9232938522278624, + "grad_norm": 4.932446002960205, + "learning_rate": 4.53863508178229e-05, + "loss": 1.0294, + "step": 1637 + }, + { + "epoch": 0.9238578680203046, + "grad_norm": 4.777100563049316, + "learning_rate": 4.538353073886069e-05, + "loss": 1.0212, + "step": 1638 + }, + { + "epoch": 0.9244218838127467, + "grad_norm": 3.7417681217193604, + "learning_rate": 4.538071065989848e-05, + "loss": 0.8569, + "step": 1639 + }, + { + "epoch": 0.924985899605189, + "grad_norm": 14.84492015838623, + "learning_rate": 4.5377890580936263e-05, + "loss": 1.1135, + "step": 1640 + }, + { + "epoch": 0.9255499153976311, + "grad_norm": 6.4316253662109375, + "learning_rate": 4.537507050197406e-05, + "loss": 1.2877, + "step": 1641 + }, + { + "epoch": 0.9261139311900733, + "grad_norm": 3.0407819747924805, + "learning_rate": 4.537225042301185e-05, + "loss": 0.856, + "step": 1642 + }, + { + "epoch": 0.9266779469825155, + "grad_norm": 5.057603359222412, + "learning_rate": 4.536943034404963e-05, + "loss": 1.1351, + "step": 1643 + }, + { + "epoch": 0.9272419627749577, + "grad_norm": 6.342163562774658, + "learning_rate": 4.5366610265087425e-05, + "loss": 1.1372, + "step": 1644 + }, + { + "epoch": 0.9278059785673999, + "grad_norm": 3.9481189250946045, + "learning_rate": 4.536379018612522e-05, + "loss": 0.9381, + "step": 1645 + }, + { + "epoch": 0.928369994359842, + "grad_norm": 5.7754011154174805, + "learning_rate": 4.5360970107163e-05, + "loss": 1.2085, + "step": 1646 + }, + { + "epoch": 0.9289340101522843, + "grad_norm": 5.4320197105407715, + "learning_rate": 4.535815002820079e-05, + "loss": 0.9729, + "step": 1647 + }, + { + "epoch": 0.9294980259447264, + "grad_norm": 3.4409332275390625, + "learning_rate": 4.535532994923858e-05, + "loss": 0.8648, + "step": 1648 + }, + { + "epoch": 0.9300620417371687, + "grad_norm": 4.842418670654297, + "learning_rate": 4.535250987027637e-05, + "loss": 0.8708, + "step": 1649 + }, + { + "epoch": 0.9306260575296108, + "grad_norm": 3.1905126571655273, + "learning_rate": 4.534968979131416e-05, + "loss": 0.8509, + "step": 1650 + }, + { + "epoch": 0.9311900733220531, + "grad_norm": 4.419463634490967, + "learning_rate": 4.534686971235195e-05, + "loss": 1.0804, + "step": 1651 + }, + { + "epoch": 0.9317540891144952, + "grad_norm": 4.904521465301514, + "learning_rate": 4.5344049633389736e-05, + "loss": 1.0944, + "step": 1652 + }, + { + "epoch": 0.9323181049069373, + "grad_norm": 5.426485061645508, + "learning_rate": 4.534122955442753e-05, + "loss": 1.1463, + "step": 1653 + }, + { + "epoch": 0.9328821206993796, + "grad_norm": 4.052720546722412, + "learning_rate": 4.533840947546531e-05, + "loss": 0.9539, + "step": 1654 + }, + { + "epoch": 0.9334461364918217, + "grad_norm": 5.198144912719727, + "learning_rate": 4.5335589396503106e-05, + "loss": 1.3814, + "step": 1655 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 4.832601547241211, + "learning_rate": 4.533276931754089e-05, + "loss": 0.946, + "step": 1656 + }, + { + "epoch": 0.9345741680767061, + "grad_norm": 6.418742656707764, + "learning_rate": 4.532994923857868e-05, + "loss": 1.0341, + "step": 1657 + }, + { + "epoch": 0.9351381838691484, + "grad_norm": 5.021646976470947, + "learning_rate": 4.532712915961647e-05, + "loss": 1.1042, + "step": 1658 + }, + { + "epoch": 0.9357021996615905, + "grad_norm": 6.12965202331543, + "learning_rate": 4.532430908065426e-05, + "loss": 1.3781, + "step": 1659 + }, + { + "epoch": 0.9362662154540327, + "grad_norm": 6.268660068511963, + "learning_rate": 4.5321489001692046e-05, + "loss": 1.3499, + "step": 1660 + }, + { + "epoch": 0.9368302312464749, + "grad_norm": 2.9468424320220947, + "learning_rate": 4.531866892272984e-05, + "loss": 0.8849, + "step": 1661 + }, + { + "epoch": 0.937394247038917, + "grad_norm": 4.91391134262085, + "learning_rate": 4.531584884376763e-05, + "loss": 1.0656, + "step": 1662 + }, + { + "epoch": 0.9379582628313593, + "grad_norm": 4.072263717651367, + "learning_rate": 4.5313028764805416e-05, + "loss": 0.9175, + "step": 1663 + }, + { + "epoch": 0.9385222786238014, + "grad_norm": 6.174554347991943, + "learning_rate": 4.531020868584321e-05, + "loss": 1.2241, + "step": 1664 + }, + { + "epoch": 0.9390862944162437, + "grad_norm": 4.245241641998291, + "learning_rate": 4.5307388606880994e-05, + "loss": 1.0137, + "step": 1665 + }, + { + "epoch": 0.9396503102086858, + "grad_norm": 3.23919415473938, + "learning_rate": 4.5304568527918786e-05, + "loss": 0.9294, + "step": 1666 + }, + { + "epoch": 0.9402143260011281, + "grad_norm": 5.790242671966553, + "learning_rate": 4.530174844895657e-05, + "loss": 1.1785, + "step": 1667 + }, + { + "epoch": 0.9407783417935702, + "grad_norm": 4.5235371589660645, + "learning_rate": 4.529892836999436e-05, + "loss": 0.9323, + "step": 1668 + }, + { + "epoch": 0.9413423575860124, + "grad_norm": 4.223364353179932, + "learning_rate": 4.5296108291032155e-05, + "loss": 1.2105, + "step": 1669 + }, + { + "epoch": 0.9419063733784546, + "grad_norm": 3.9270403385162354, + "learning_rate": 4.529328821206994e-05, + "loss": 0.8941, + "step": 1670 + }, + { + "epoch": 0.9424703891708968, + "grad_norm": 4.010498046875, + "learning_rate": 4.5290468133107726e-05, + "loss": 1.0131, + "step": 1671 + }, + { + "epoch": 0.943034404963339, + "grad_norm": 7.043329238891602, + "learning_rate": 4.528764805414552e-05, + "loss": 1.2853, + "step": 1672 + }, + { + "epoch": 0.9435984207557812, + "grad_norm": 4.521618843078613, + "learning_rate": 4.528482797518331e-05, + "loss": 1.0067, + "step": 1673 + }, + { + "epoch": 0.9441624365482234, + "grad_norm": 4.518204689025879, + "learning_rate": 4.5282007896221096e-05, + "loss": 1.1355, + "step": 1674 + }, + { + "epoch": 0.9447264523406655, + "grad_norm": 4.614575386047363, + "learning_rate": 4.527918781725888e-05, + "loss": 1.0271, + "step": 1675 + }, + { + "epoch": 0.9452904681331077, + "grad_norm": 3.0002574920654297, + "learning_rate": 4.5276367738296674e-05, + "loss": 0.9087, + "step": 1676 + }, + { + "epoch": 0.9458544839255499, + "grad_norm": 6.0918755531311035, + "learning_rate": 4.5273547659334466e-05, + "loss": 1.1374, + "step": 1677 + }, + { + "epoch": 0.9464184997179921, + "grad_norm": 3.99605655670166, + "learning_rate": 4.527072758037225e-05, + "loss": 1.1359, + "step": 1678 + }, + { + "epoch": 0.9469825155104343, + "grad_norm": 3.570462465286255, + "learning_rate": 4.526790750141004e-05, + "loss": 0.8249, + "step": 1679 + }, + { + "epoch": 0.9475465313028765, + "grad_norm": 4.941257476806641, + "learning_rate": 4.5265087422447836e-05, + "loss": 1.0971, + "step": 1680 + }, + { + "epoch": 0.9481105470953187, + "grad_norm": 3.325646162033081, + "learning_rate": 4.526226734348562e-05, + "loss": 1.1141, + "step": 1681 + }, + { + "epoch": 0.9486745628877609, + "grad_norm": 4.476725101470947, + "learning_rate": 4.5259447264523406e-05, + "loss": 1.1206, + "step": 1682 + }, + { + "epoch": 0.949238578680203, + "grad_norm": 4.893871307373047, + "learning_rate": 4.52566271855612e-05, + "loss": 1.0309, + "step": 1683 + }, + { + "epoch": 0.9498025944726453, + "grad_norm": 4.7422637939453125, + "learning_rate": 4.525380710659899e-05, + "loss": 0.9613, + "step": 1684 + }, + { + "epoch": 0.9503666102650874, + "grad_norm": 5.297237396240234, + "learning_rate": 4.5250987027636776e-05, + "loss": 1.0654, + "step": 1685 + }, + { + "epoch": 0.9509306260575296, + "grad_norm": 4.510578632354736, + "learning_rate": 4.524816694867456e-05, + "loss": 0.9234, + "step": 1686 + }, + { + "epoch": 0.9514946418499718, + "grad_norm": 5.816103458404541, + "learning_rate": 4.5245346869712354e-05, + "loss": 1.0044, + "step": 1687 + }, + { + "epoch": 0.952058657642414, + "grad_norm": 3.5632994174957275, + "learning_rate": 4.5242526790750146e-05, + "loss": 0.8664, + "step": 1688 + }, + { + "epoch": 0.9526226734348562, + "grad_norm": 3.32661771774292, + "learning_rate": 4.523970671178793e-05, + "loss": 1.2088, + "step": 1689 + }, + { + "epoch": 0.9531866892272983, + "grad_norm": 5.256080627441406, + "learning_rate": 4.523688663282572e-05, + "loss": 0.9398, + "step": 1690 + }, + { + "epoch": 0.9537507050197406, + "grad_norm": 4.75949764251709, + "learning_rate": 4.523406655386351e-05, + "loss": 1.041, + "step": 1691 + }, + { + "epoch": 0.9543147208121827, + "grad_norm": 2.1553962230682373, + "learning_rate": 4.52312464749013e-05, + "loss": 0.921, + "step": 1692 + }, + { + "epoch": 0.954878736604625, + "grad_norm": 11.133243560791016, + "learning_rate": 4.5228426395939087e-05, + "loss": 1.0181, + "step": 1693 + }, + { + "epoch": 0.9554427523970671, + "grad_norm": 5.160607814788818, + "learning_rate": 4.522560631697688e-05, + "loss": 1.2304, + "step": 1694 + }, + { + "epoch": 0.9560067681895094, + "grad_norm": 4.2246294021606445, + "learning_rate": 4.5222786238014664e-05, + "loss": 1.0057, + "step": 1695 + }, + { + "epoch": 0.9565707839819515, + "grad_norm": 4.263093948364258, + "learning_rate": 4.5219966159052456e-05, + "loss": 1.0823, + "step": 1696 + }, + { + "epoch": 0.9571347997743936, + "grad_norm": 3.5656795501708984, + "learning_rate": 4.521714608009024e-05, + "loss": 0.8233, + "step": 1697 + }, + { + "epoch": 0.9576988155668359, + "grad_norm": 5.350344657897949, + "learning_rate": 4.5214326001128034e-05, + "loss": 1.1429, + "step": 1698 + }, + { + "epoch": 0.958262831359278, + "grad_norm": 2.9744746685028076, + "learning_rate": 4.5211505922165826e-05, + "loss": 0.9573, + "step": 1699 + }, + { + "epoch": 0.9588268471517203, + "grad_norm": 4.260674476623535, + "learning_rate": 4.520868584320361e-05, + "loss": 1.0127, + "step": 1700 + }, + { + "epoch": 0.9593908629441624, + "grad_norm": 4.696333408355713, + "learning_rate": 4.5205865764241404e-05, + "loss": 0.983, + "step": 1701 + }, + { + "epoch": 0.9599548787366047, + "grad_norm": 5.458011627197266, + "learning_rate": 4.520304568527919e-05, + "loss": 1.2216, + "step": 1702 + }, + { + "epoch": 0.9605188945290468, + "grad_norm": 5.313178062438965, + "learning_rate": 4.520022560631698e-05, + "loss": 1.2142, + "step": 1703 + }, + { + "epoch": 0.961082910321489, + "grad_norm": 5.5854902267456055, + "learning_rate": 4.519740552735477e-05, + "loss": 0.9393, + "step": 1704 + }, + { + "epoch": 0.9616469261139312, + "grad_norm": 3.4164931774139404, + "learning_rate": 4.519458544839256e-05, + "loss": 0.8981, + "step": 1705 + }, + { + "epoch": 0.9622109419063734, + "grad_norm": 4.695150375366211, + "learning_rate": 4.5191765369430344e-05, + "loss": 1.0229, + "step": 1706 + }, + { + "epoch": 0.9627749576988156, + "grad_norm": 7.216432094573975, + "learning_rate": 4.5188945290468137e-05, + "loss": 1.1877, + "step": 1707 + }, + { + "epoch": 0.9633389734912577, + "grad_norm": 5.089524745941162, + "learning_rate": 4.518612521150592e-05, + "loss": 1.2435, + "step": 1708 + }, + { + "epoch": 0.9639029892837, + "grad_norm": 4.2332563400268555, + "learning_rate": 4.5183305132543714e-05, + "loss": 0.9618, + "step": 1709 + }, + { + "epoch": 0.9644670050761421, + "grad_norm": 3.2003471851348877, + "learning_rate": 4.51804850535815e-05, + "loss": 0.8405, + "step": 1710 + }, + { + "epoch": 0.9650310208685843, + "grad_norm": 6.969939231872559, + "learning_rate": 4.517766497461929e-05, + "loss": 1.2612, + "step": 1711 + }, + { + "epoch": 0.9655950366610265, + "grad_norm": 4.284248352050781, + "learning_rate": 4.5174844895657084e-05, + "loss": 1.1053, + "step": 1712 + }, + { + "epoch": 0.9661590524534687, + "grad_norm": 3.6411969661712646, + "learning_rate": 4.517202481669487e-05, + "loss": 0.9549, + "step": 1713 + }, + { + "epoch": 0.9667230682459109, + "grad_norm": 3.7788054943084717, + "learning_rate": 4.5169204737732655e-05, + "loss": 0.9166, + "step": 1714 + }, + { + "epoch": 0.9672870840383531, + "grad_norm": 5.385708808898926, + "learning_rate": 4.516638465877045e-05, + "loss": 1.0913, + "step": 1715 + }, + { + "epoch": 0.9678510998307953, + "grad_norm": 4.380808353424072, + "learning_rate": 4.516356457980824e-05, + "loss": 1.1227, + "step": 1716 + }, + { + "epoch": 0.9684151156232375, + "grad_norm": 2.951021432876587, + "learning_rate": 4.5160744500846024e-05, + "loss": 1.0498, + "step": 1717 + }, + { + "epoch": 0.9689791314156796, + "grad_norm": 3.2646286487579346, + "learning_rate": 4.515792442188381e-05, + "loss": 0.8368, + "step": 1718 + }, + { + "epoch": 0.9695431472081218, + "grad_norm": 9.961503028869629, + "learning_rate": 4.515510434292161e-05, + "loss": 1.1455, + "step": 1719 + }, + { + "epoch": 0.970107163000564, + "grad_norm": 5.036589622497559, + "learning_rate": 4.5152284263959394e-05, + "loss": 1.0541, + "step": 1720 + }, + { + "epoch": 0.9706711787930062, + "grad_norm": 4.313574314117432, + "learning_rate": 4.514946418499718e-05, + "loss": 1.1063, + "step": 1721 + }, + { + "epoch": 0.9712351945854484, + "grad_norm": 4.774763584136963, + "learning_rate": 4.514664410603497e-05, + "loss": 1.1985, + "step": 1722 + }, + { + "epoch": 0.9717992103778906, + "grad_norm": 4.052539825439453, + "learning_rate": 4.5143824027072764e-05, + "loss": 0.8461, + "step": 1723 + }, + { + "epoch": 0.9723632261703328, + "grad_norm": 2.9401068687438965, + "learning_rate": 4.514100394811055e-05, + "loss": 0.9612, + "step": 1724 + }, + { + "epoch": 0.9729272419627749, + "grad_norm": 3.4539906978607178, + "learning_rate": 4.5138183869148335e-05, + "loss": 1.0623, + "step": 1725 + }, + { + "epoch": 0.9734912577552172, + "grad_norm": 4.699190616607666, + "learning_rate": 4.513536379018613e-05, + "loss": 0.9909, + "step": 1726 + }, + { + "epoch": 0.9740552735476593, + "grad_norm": 5.713874340057373, + "learning_rate": 4.513254371122392e-05, + "loss": 1.0622, + "step": 1727 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 3.7533867359161377, + "learning_rate": 4.5129723632261705e-05, + "loss": 1.0584, + "step": 1728 + }, + { + "epoch": 0.9751833051325437, + "grad_norm": 3.2110755443573, + "learning_rate": 4.512690355329949e-05, + "loss": 0.8927, + "step": 1729 + }, + { + "epoch": 0.975747320924986, + "grad_norm": 4.549949645996094, + "learning_rate": 4.512408347433728e-05, + "loss": 1.0448, + "step": 1730 + }, + { + "epoch": 0.9763113367174281, + "grad_norm": 1.9480654001235962, + "learning_rate": 4.5121263395375074e-05, + "loss": 0.799, + "step": 1731 + }, + { + "epoch": 0.9768753525098702, + "grad_norm": 5.313620090484619, + "learning_rate": 4.511844331641286e-05, + "loss": 1.2057, + "step": 1732 + }, + { + "epoch": 0.9774393683023125, + "grad_norm": 8.405488014221191, + "learning_rate": 4.511562323745065e-05, + "loss": 1.2959, + "step": 1733 + }, + { + "epoch": 0.9780033840947546, + "grad_norm": 4.1315741539001465, + "learning_rate": 4.5112803158488444e-05, + "loss": 0.9533, + "step": 1734 + }, + { + "epoch": 0.9785673998871969, + "grad_norm": 4.292085647583008, + "learning_rate": 4.510998307952623e-05, + "loss": 1.1047, + "step": 1735 + }, + { + "epoch": 0.979131415679639, + "grad_norm": 2.408864974975586, + "learning_rate": 4.5107163000564015e-05, + "loss": 0.7532, + "step": 1736 + }, + { + "epoch": 0.9796954314720813, + "grad_norm": 3.3780126571655273, + "learning_rate": 4.510434292160181e-05, + "loss": 0.9082, + "step": 1737 + }, + { + "epoch": 0.9802594472645234, + "grad_norm": 3.3917717933654785, + "learning_rate": 4.51015228426396e-05, + "loss": 1.0976, + "step": 1738 + }, + { + "epoch": 0.9808234630569655, + "grad_norm": 3.49168062210083, + "learning_rate": 4.5098702763677385e-05, + "loss": 0.9053, + "step": 1739 + }, + { + "epoch": 0.9813874788494078, + "grad_norm": 3.2827236652374268, + "learning_rate": 4.509588268471518e-05, + "loss": 0.7675, + "step": 1740 + }, + { + "epoch": 0.9819514946418499, + "grad_norm": 3.5060882568359375, + "learning_rate": 4.509306260575296e-05, + "loss": 0.9848, + "step": 1741 + }, + { + "epoch": 0.9825155104342922, + "grad_norm": 5.1073503494262695, + "learning_rate": 4.5090242526790755e-05, + "loss": 1.067, + "step": 1742 + }, + { + "epoch": 0.9830795262267343, + "grad_norm": 4.0421857833862305, + "learning_rate": 4.508742244782854e-05, + "loss": 1.1863, + "step": 1743 + }, + { + "epoch": 0.9836435420191766, + "grad_norm": 4.06631326675415, + "learning_rate": 4.508460236886633e-05, + "loss": 1.0173, + "step": 1744 + }, + { + "epoch": 0.9842075578116187, + "grad_norm": 6.122565746307373, + "learning_rate": 4.508178228990412e-05, + "loss": 1.1708, + "step": 1745 + }, + { + "epoch": 0.9847715736040609, + "grad_norm": 4.341383934020996, + "learning_rate": 4.507896221094191e-05, + "loss": 1.0094, + "step": 1746 + }, + { + "epoch": 0.9853355893965031, + "grad_norm": 3.2483766078948975, + "learning_rate": 4.5076142131979695e-05, + "loss": 1.0, + "step": 1747 + }, + { + "epoch": 0.9858996051889453, + "grad_norm": 4.461364269256592, + "learning_rate": 4.507332205301749e-05, + "loss": 1.0153, + "step": 1748 + }, + { + "epoch": 0.9864636209813875, + "grad_norm": 3.771568775177002, + "learning_rate": 4.507050197405527e-05, + "loss": 0.9587, + "step": 1749 + }, + { + "epoch": 0.9870276367738297, + "grad_norm": 3.4897541999816895, + "learning_rate": 4.5067681895093065e-05, + "loss": 0.8535, + "step": 1750 + }, + { + "epoch": 0.9875916525662719, + "grad_norm": 3.3558709621429443, + "learning_rate": 4.506486181613086e-05, + "loss": 0.8951, + "step": 1751 + }, + { + "epoch": 0.988155668358714, + "grad_norm": 4.519599914550781, + "learning_rate": 4.506204173716864e-05, + "loss": 0.9751, + "step": 1752 + }, + { + "epoch": 0.9887196841511562, + "grad_norm": 4.508656978607178, + "learning_rate": 4.505922165820643e-05, + "loss": 1.1451, + "step": 1753 + }, + { + "epoch": 0.9892836999435984, + "grad_norm": 4.155647277832031, + "learning_rate": 4.505640157924422e-05, + "loss": 0.8985, + "step": 1754 + }, + { + "epoch": 0.9898477157360406, + "grad_norm": 3.0201761722564697, + "learning_rate": 4.505358150028201e-05, + "loss": 0.934, + "step": 1755 + }, + { + "epoch": 0.9904117315284828, + "grad_norm": 5.161604881286621, + "learning_rate": 4.50507614213198e-05, + "loss": 0.8998, + "step": 1756 + }, + { + "epoch": 0.990975747320925, + "grad_norm": 5.479249954223633, + "learning_rate": 4.504794134235758e-05, + "loss": 1.0838, + "step": 1757 + }, + { + "epoch": 0.9915397631133672, + "grad_norm": 4.463426113128662, + "learning_rate": 4.504512126339538e-05, + "loss": 1.026, + "step": 1758 + }, + { + "epoch": 0.9921037789058094, + "grad_norm": 3.606771230697632, + "learning_rate": 4.504230118443317e-05, + "loss": 0.8727, + "step": 1759 + }, + { + "epoch": 0.9926677946982515, + "grad_norm": 4.425990104675293, + "learning_rate": 4.503948110547095e-05, + "loss": 0.8916, + "step": 1760 + }, + { + "epoch": 0.9932318104906938, + "grad_norm": 4.444554805755615, + "learning_rate": 4.5036661026508745e-05, + "loss": 0.9708, + "step": 1761 + }, + { + "epoch": 0.9937958262831359, + "grad_norm": 3.9400794506073, + "learning_rate": 4.503384094754654e-05, + "loss": 1.0143, + "step": 1762 + }, + { + "epoch": 0.9943598420755781, + "grad_norm": 7.035092830657959, + "learning_rate": 4.503102086858432e-05, + "loss": 1.3453, + "step": 1763 + }, + { + "epoch": 0.9949238578680203, + "grad_norm": 3.8592028617858887, + "learning_rate": 4.502820078962211e-05, + "loss": 1.0818, + "step": 1764 + }, + { + "epoch": 0.9954878736604625, + "grad_norm": 3.6350369453430176, + "learning_rate": 4.50253807106599e-05, + "loss": 0.8416, + "step": 1765 + }, + { + "epoch": 0.9960518894529047, + "grad_norm": 2.2256298065185547, + "learning_rate": 4.502256063169769e-05, + "loss": 0.8455, + "step": 1766 + }, + { + "epoch": 0.9966159052453468, + "grad_norm": 5.692108154296875, + "learning_rate": 4.501974055273548e-05, + "loss": 1.0048, + "step": 1767 + }, + { + "epoch": 0.9971799210377891, + "grad_norm": 4.61898946762085, + "learning_rate": 4.501692047377326e-05, + "loss": 0.8505, + "step": 1768 + }, + { + "epoch": 0.9977439368302312, + "grad_norm": 4.258606433868408, + "learning_rate": 4.501410039481106e-05, + "loss": 1.0326, + "step": 1769 + }, + { + "epoch": 0.9983079526226735, + "grad_norm": 9.164579391479492, + "learning_rate": 4.501128031584885e-05, + "loss": 1.1397, + "step": 1770 + }, + { + "epoch": 0.9988719684151156, + "grad_norm": 4.945356845855713, + "learning_rate": 4.500846023688663e-05, + "loss": 0.9768, + "step": 1771 + }, + { + "epoch": 0.9994359842075579, + "grad_norm": 4.871583461761475, + "learning_rate": 4.5005640157924425e-05, + "loss": 1.052, + "step": 1772 + }, + { + "epoch": 1.0, + "grad_norm": 8.35786247253418, + "learning_rate": 4.500282007896222e-05, + "loss": 1.4219, + "step": 1773 + }, + { + "epoch": 1.0005640157924423, + "grad_norm": 5.476369857788086, + "learning_rate": 4.5e-05, + "loss": 1.1912, + "step": 1774 + }, + { + "epoch": 1.0011280315848843, + "grad_norm": 4.244253635406494, + "learning_rate": 4.499717992103779e-05, + "loss": 0.9146, + "step": 1775 + }, + { + "epoch": 1.0016920473773265, + "grad_norm": 2.941633939743042, + "learning_rate": 4.499435984207558e-05, + "loss": 0.822, + "step": 1776 + }, + { + "epoch": 1.0022560631697688, + "grad_norm": 4.382091045379639, + "learning_rate": 4.499153976311337e-05, + "loss": 1.1167, + "step": 1777 + }, + { + "epoch": 1.002820078962211, + "grad_norm": 5.223126411437988, + "learning_rate": 4.498871968415116e-05, + "loss": 1.1036, + "step": 1778 + }, + { + "epoch": 1.003384094754653, + "grad_norm": 5.20250940322876, + "learning_rate": 4.498589960518895e-05, + "loss": 1.0076, + "step": 1779 + }, + { + "epoch": 1.0039481105470953, + "grad_norm": 2.7812888622283936, + "learning_rate": 4.4983079526226736e-05, + "loss": 0.8928, + "step": 1780 + }, + { + "epoch": 1.0045121263395376, + "grad_norm": 2.926771879196167, + "learning_rate": 4.498025944726453e-05, + "loss": 0.7129, + "step": 1781 + }, + { + "epoch": 1.0050761421319796, + "grad_norm": 10.033702850341797, + "learning_rate": 4.497743936830231e-05, + "loss": 1.0322, + "step": 1782 + }, + { + "epoch": 1.0056401579244219, + "grad_norm": 4.251841068267822, + "learning_rate": 4.4974619289340105e-05, + "loss": 1.0362, + "step": 1783 + }, + { + "epoch": 1.006204173716864, + "grad_norm": 2.5143537521362305, + "learning_rate": 4.497179921037789e-05, + "loss": 0.8632, + "step": 1784 + }, + { + "epoch": 1.0067681895093064, + "grad_norm": 4.038387298583984, + "learning_rate": 4.496897913141568e-05, + "loss": 1.0188, + "step": 1785 + }, + { + "epoch": 1.0073322053017484, + "grad_norm": 5.273099899291992, + "learning_rate": 4.496615905245347e-05, + "loss": 0.901, + "step": 1786 + }, + { + "epoch": 1.0078962210941906, + "grad_norm": 3.500215768814087, + "learning_rate": 4.496333897349126e-05, + "loss": 0.7496, + "step": 1787 + }, + { + "epoch": 1.0084602368866329, + "grad_norm": 4.076648235321045, + "learning_rate": 4.4960518894529046e-05, + "loss": 1.0555, + "step": 1788 + }, + { + "epoch": 1.009024252679075, + "grad_norm": 5.527298927307129, + "learning_rate": 4.495769881556684e-05, + "loss": 1.0827, + "step": 1789 + }, + { + "epoch": 1.0095882684715172, + "grad_norm": 3.420997381210327, + "learning_rate": 4.495487873660463e-05, + "loss": 0.8796, + "step": 1790 + }, + { + "epoch": 1.0101522842639594, + "grad_norm": 3.196505546569824, + "learning_rate": 4.4952058657642416e-05, + "loss": 0.9927, + "step": 1791 + }, + { + "epoch": 1.0107163000564017, + "grad_norm": 4.068558692932129, + "learning_rate": 4.49492385786802e-05, + "loss": 0.9418, + "step": 1792 + }, + { + "epoch": 1.0112803158488437, + "grad_norm": 5.36614990234375, + "learning_rate": 4.494641849971799e-05, + "loss": 1.0504, + "step": 1793 + }, + { + "epoch": 1.011844331641286, + "grad_norm": 5.200252056121826, + "learning_rate": 4.4943598420755786e-05, + "loss": 1.0853, + "step": 1794 + }, + { + "epoch": 1.0124083474337282, + "grad_norm": 4.2829270362854, + "learning_rate": 4.494077834179357e-05, + "loss": 0.985, + "step": 1795 + }, + { + "epoch": 1.0129723632261702, + "grad_norm": 2.518584966659546, + "learning_rate": 4.4937958262831356e-05, + "loss": 0.8126, + "step": 1796 + }, + { + "epoch": 1.0135363790186125, + "grad_norm": 4.553541660308838, + "learning_rate": 4.4935138183869155e-05, + "loss": 1.0319, + "step": 1797 + }, + { + "epoch": 1.0141003948110547, + "grad_norm": 3.742863178253174, + "learning_rate": 4.493231810490694e-05, + "loss": 0.9795, + "step": 1798 + }, + { + "epoch": 1.014664410603497, + "grad_norm": 3.72177791595459, + "learning_rate": 4.4929498025944726e-05, + "loss": 0.9407, + "step": 1799 + }, + { + "epoch": 1.015228426395939, + "grad_norm": 7.937282562255859, + "learning_rate": 4.492667794698252e-05, + "loss": 1.0699, + "step": 1800 + }, + { + "epoch": 1.0157924421883813, + "grad_norm": 3.2706289291381836, + "learning_rate": 4.492385786802031e-05, + "loss": 0.8492, + "step": 1801 + }, + { + "epoch": 1.0163564579808235, + "grad_norm": 4.856723785400391, + "learning_rate": 4.4921037789058096e-05, + "loss": 0.9483, + "step": 1802 + }, + { + "epoch": 1.0169204737732656, + "grad_norm": 5.194263458251953, + "learning_rate": 4.491821771009588e-05, + "loss": 0.9589, + "step": 1803 + }, + { + "epoch": 1.0174844895657078, + "grad_norm": 5.668319225311279, + "learning_rate": 4.4915397631133674e-05, + "loss": 1.2185, + "step": 1804 + }, + { + "epoch": 1.01804850535815, + "grad_norm": 4.381819725036621, + "learning_rate": 4.4912577552171466e-05, + "loss": 0.9025, + "step": 1805 + }, + { + "epoch": 1.0186125211505923, + "grad_norm": 3.471737861633301, + "learning_rate": 4.490975747320925e-05, + "loss": 0.8429, + "step": 1806 + }, + { + "epoch": 1.0191765369430343, + "grad_norm": 2.840193748474121, + "learning_rate": 4.4906937394247037e-05, + "loss": 0.7771, + "step": 1807 + }, + { + "epoch": 1.0197405527354766, + "grad_norm": 3.87164044380188, + "learning_rate": 4.4904117315284835e-05, + "loss": 0.8249, + "step": 1808 + }, + { + "epoch": 1.0203045685279188, + "grad_norm": 3.9421017169952393, + "learning_rate": 4.490129723632262e-05, + "loss": 1.037, + "step": 1809 + }, + { + "epoch": 1.0208685843203609, + "grad_norm": 3.6265599727630615, + "learning_rate": 4.4898477157360406e-05, + "loss": 1.0086, + "step": 1810 + }, + { + "epoch": 1.0214326001128031, + "grad_norm": 2.2377710342407227, + "learning_rate": 4.48956570783982e-05, + "loss": 0.7632, + "step": 1811 + }, + { + "epoch": 1.0219966159052454, + "grad_norm": 2.7351083755493164, + "learning_rate": 4.489283699943599e-05, + "loss": 0.7691, + "step": 1812 + }, + { + "epoch": 1.0225606316976876, + "grad_norm": 5.309479236602783, + "learning_rate": 4.4890016920473776e-05, + "loss": 1.19, + "step": 1813 + }, + { + "epoch": 1.0231246474901297, + "grad_norm": 3.818305730819702, + "learning_rate": 4.488719684151156e-05, + "loss": 0.8561, + "step": 1814 + }, + { + "epoch": 1.023688663282572, + "grad_norm": 4.708005905151367, + "learning_rate": 4.4884376762549354e-05, + "loss": 1.0147, + "step": 1815 + }, + { + "epoch": 1.0242526790750142, + "grad_norm": 4.332998275756836, + "learning_rate": 4.4881556683587146e-05, + "loss": 0.879, + "step": 1816 + }, + { + "epoch": 1.0248166948674562, + "grad_norm": 2.0618433952331543, + "learning_rate": 4.487873660462493e-05, + "loss": 0.8496, + "step": 1817 + }, + { + "epoch": 1.0253807106598984, + "grad_norm": 5.384362697601318, + "learning_rate": 4.4875916525662723e-05, + "loss": 1.3019, + "step": 1818 + }, + { + "epoch": 1.0259447264523407, + "grad_norm": 4.397813320159912, + "learning_rate": 4.487309644670051e-05, + "loss": 0.955, + "step": 1819 + }, + { + "epoch": 1.026508742244783, + "grad_norm": 3.595842123031616, + "learning_rate": 4.48702763677383e-05, + "loss": 0.8829, + "step": 1820 + }, + { + "epoch": 1.027072758037225, + "grad_norm": 2.689359664916992, + "learning_rate": 4.4867456288776086e-05, + "loss": 0.9735, + "step": 1821 + }, + { + "epoch": 1.0276367738296672, + "grad_norm": 4.170784950256348, + "learning_rate": 4.486463620981388e-05, + "loss": 0.8343, + "step": 1822 + }, + { + "epoch": 1.0282007896221095, + "grad_norm": 3.2710177898406982, + "learning_rate": 4.4861816130851664e-05, + "loss": 1.0165, + "step": 1823 + }, + { + "epoch": 1.0287648054145515, + "grad_norm": 2.0707859992980957, + "learning_rate": 4.4858996051889456e-05, + "loss": 0.7751, + "step": 1824 + }, + { + "epoch": 1.0293288212069938, + "grad_norm": 3.7904577255249023, + "learning_rate": 4.485617597292724e-05, + "loss": 0.9002, + "step": 1825 + }, + { + "epoch": 1.029892836999436, + "grad_norm": 3.270793914794922, + "learning_rate": 4.4853355893965034e-05, + "loss": 0.7089, + "step": 1826 + }, + { + "epoch": 1.0304568527918783, + "grad_norm": 3.196826934814453, + "learning_rate": 4.485053581500282e-05, + "loss": 0.8201, + "step": 1827 + }, + { + "epoch": 1.0310208685843203, + "grad_norm": 3.9351041316986084, + "learning_rate": 4.484771573604061e-05, + "loss": 0.9178, + "step": 1828 + }, + { + "epoch": 1.0315848843767625, + "grad_norm": 3.599043846130371, + "learning_rate": 4.4844895657078404e-05, + "loss": 0.9007, + "step": 1829 + }, + { + "epoch": 1.0321489001692048, + "grad_norm": 4.90047025680542, + "learning_rate": 4.484207557811619e-05, + "loss": 0.9762, + "step": 1830 + }, + { + "epoch": 1.0327129159616468, + "grad_norm": 4.32556676864624, + "learning_rate": 4.4839255499153974e-05, + "loss": 0.947, + "step": 1831 + }, + { + "epoch": 1.033276931754089, + "grad_norm": 2.368957757949829, + "learning_rate": 4.4836435420191767e-05, + "loss": 0.7281, + "step": 1832 + }, + { + "epoch": 1.0338409475465313, + "grad_norm": 2.7168638706207275, + "learning_rate": 4.483361534122956e-05, + "loss": 0.8562, + "step": 1833 + }, + { + "epoch": 1.0344049633389736, + "grad_norm": 4.648215293884277, + "learning_rate": 4.4830795262267344e-05, + "loss": 1.0097, + "step": 1834 + }, + { + "epoch": 1.0349689791314156, + "grad_norm": 1.809063196182251, + "learning_rate": 4.482797518330513e-05, + "loss": 0.9567, + "step": 1835 + }, + { + "epoch": 1.0355329949238579, + "grad_norm": 4.609445095062256, + "learning_rate": 4.482515510434293e-05, + "loss": 0.9178, + "step": 1836 + }, + { + "epoch": 1.0360970107163001, + "grad_norm": 2.950679302215576, + "learning_rate": 4.4822335025380714e-05, + "loss": 0.8639, + "step": 1837 + }, + { + "epoch": 1.0366610265087421, + "grad_norm": 4.255027770996094, + "learning_rate": 4.48195149464185e-05, + "loss": 0.9774, + "step": 1838 + }, + { + "epoch": 1.0372250423011844, + "grad_norm": 3.3106558322906494, + "learning_rate": 4.481669486745629e-05, + "loss": 0.8298, + "step": 1839 + }, + { + "epoch": 1.0377890580936266, + "grad_norm": 3.8540618419647217, + "learning_rate": 4.4813874788494084e-05, + "loss": 0.8053, + "step": 1840 + }, + { + "epoch": 1.038353073886069, + "grad_norm": 3.68900203704834, + "learning_rate": 4.481105470953187e-05, + "loss": 1.0213, + "step": 1841 + }, + { + "epoch": 1.038917089678511, + "grad_norm": 3.4370245933532715, + "learning_rate": 4.4808234630569655e-05, + "loss": 0.9268, + "step": 1842 + }, + { + "epoch": 1.0394811054709532, + "grad_norm": 6.825281143188477, + "learning_rate": 4.480541455160745e-05, + "loss": 1.2629, + "step": 1843 + }, + { + "epoch": 1.0400451212633954, + "grad_norm": 3.069934368133545, + "learning_rate": 4.480259447264524e-05, + "loss": 0.9168, + "step": 1844 + }, + { + "epoch": 1.0406091370558375, + "grad_norm": 4.911325931549072, + "learning_rate": 4.4799774393683024e-05, + "loss": 1.2213, + "step": 1845 + }, + { + "epoch": 1.0411731528482797, + "grad_norm": 5.497409820556641, + "learning_rate": 4.479695431472081e-05, + "loss": 1.0908, + "step": 1846 + }, + { + "epoch": 1.041737168640722, + "grad_norm": 4.538361549377441, + "learning_rate": 4.479413423575861e-05, + "loss": 0.973, + "step": 1847 + }, + { + "epoch": 1.0423011844331642, + "grad_norm": 3.5846619606018066, + "learning_rate": 4.4791314156796394e-05, + "loss": 0.8213, + "step": 1848 + }, + { + "epoch": 1.0428652002256062, + "grad_norm": 1.6744142770767212, + "learning_rate": 4.478849407783418e-05, + "loss": 0.7545, + "step": 1849 + }, + { + "epoch": 1.0434292160180485, + "grad_norm": 2.808312177658081, + "learning_rate": 4.478567399887197e-05, + "loss": 0.9301, + "step": 1850 + }, + { + "epoch": 1.0439932318104908, + "grad_norm": 3.9027788639068604, + "learning_rate": 4.4782853919909764e-05, + "loss": 0.91, + "step": 1851 + }, + { + "epoch": 1.0445572476029328, + "grad_norm": 3.407555103302002, + "learning_rate": 4.478003384094755e-05, + "loss": 0.9712, + "step": 1852 + }, + { + "epoch": 1.045121263395375, + "grad_norm": 4.7431840896606445, + "learning_rate": 4.4777213761985335e-05, + "loss": 1.1509, + "step": 1853 + }, + { + "epoch": 1.0456852791878173, + "grad_norm": 2.9802186489105225, + "learning_rate": 4.477439368302313e-05, + "loss": 0.9167, + "step": 1854 + }, + { + "epoch": 1.0462492949802595, + "grad_norm": 2.6138715744018555, + "learning_rate": 4.477157360406092e-05, + "loss": 0.7622, + "step": 1855 + }, + { + "epoch": 1.0468133107727016, + "grad_norm": 5.143129825592041, + "learning_rate": 4.4768753525098704e-05, + "loss": 0.8519, + "step": 1856 + }, + { + "epoch": 1.0473773265651438, + "grad_norm": 2.4903030395507812, + "learning_rate": 4.476593344613649e-05, + "loss": 0.7454, + "step": 1857 + }, + { + "epoch": 1.047941342357586, + "grad_norm": 3.2487447261810303, + "learning_rate": 4.476311336717428e-05, + "loss": 0.8727, + "step": 1858 + }, + { + "epoch": 1.048505358150028, + "grad_norm": 1.4997626543045044, + "learning_rate": 4.4760293288212074e-05, + "loss": 0.7807, + "step": 1859 + }, + { + "epoch": 1.0490693739424704, + "grad_norm": 4.767689228057861, + "learning_rate": 4.475747320924986e-05, + "loss": 0.9545, + "step": 1860 + }, + { + "epoch": 1.0496333897349126, + "grad_norm": 3.7253096103668213, + "learning_rate": 4.475465313028765e-05, + "loss": 0.8377, + "step": 1861 + }, + { + "epoch": 1.0501974055273549, + "grad_norm": 3.2596912384033203, + "learning_rate": 4.475183305132544e-05, + "loss": 0.901, + "step": 1862 + }, + { + "epoch": 1.0507614213197969, + "grad_norm": 4.146965026855469, + "learning_rate": 4.474901297236323e-05, + "loss": 0.8625, + "step": 1863 + }, + { + "epoch": 1.0513254371122391, + "grad_norm": 3.2159574031829834, + "learning_rate": 4.4746192893401015e-05, + "loss": 0.8992, + "step": 1864 + }, + { + "epoch": 1.0518894529046814, + "grad_norm": 3.1794168949127197, + "learning_rate": 4.474337281443881e-05, + "loss": 0.825, + "step": 1865 + }, + { + "epoch": 1.0524534686971234, + "grad_norm": 2.9281136989593506, + "learning_rate": 4.474055273547659e-05, + "loss": 0.8246, + "step": 1866 + }, + { + "epoch": 1.0530174844895657, + "grad_norm": 6.9968132972717285, + "learning_rate": 4.4737732656514385e-05, + "loss": 1.267, + "step": 1867 + }, + { + "epoch": 1.053581500282008, + "grad_norm": 3.3109166622161865, + "learning_rate": 4.473491257755218e-05, + "loss": 0.8985, + "step": 1868 + }, + { + "epoch": 1.0541455160744502, + "grad_norm": 3.559021472930908, + "learning_rate": 4.473209249858996e-05, + "loss": 0.8057, + "step": 1869 + }, + { + "epoch": 1.0547095318668922, + "grad_norm": 4.0369486808776855, + "learning_rate": 4.472927241962775e-05, + "loss": 1.0914, + "step": 1870 + }, + { + "epoch": 1.0552735476593345, + "grad_norm": 3.1964240074157715, + "learning_rate": 4.472645234066554e-05, + "loss": 0.9098, + "step": 1871 + }, + { + "epoch": 1.0558375634517767, + "grad_norm": 4.419713020324707, + "learning_rate": 4.472363226170333e-05, + "loss": 1.0935, + "step": 1872 + }, + { + "epoch": 1.0564015792442187, + "grad_norm": 3.6065917015075684, + "learning_rate": 4.472081218274112e-05, + "loss": 0.9162, + "step": 1873 + }, + { + "epoch": 1.056965595036661, + "grad_norm": 2.7318227291107178, + "learning_rate": 4.471799210377891e-05, + "loss": 0.8443, + "step": 1874 + }, + { + "epoch": 1.0575296108291032, + "grad_norm": 1.8410216569900513, + "learning_rate": 4.4715172024816695e-05, + "loss": 0.8108, + "step": 1875 + }, + { + "epoch": 1.0580936266215455, + "grad_norm": 2.5516488552093506, + "learning_rate": 4.471235194585449e-05, + "loss": 0.8579, + "step": 1876 + }, + { + "epoch": 1.0586576424139875, + "grad_norm": 3.377153158187866, + "learning_rate": 4.470953186689227e-05, + "loss": 0.9086, + "step": 1877 + }, + { + "epoch": 1.0592216582064298, + "grad_norm": 2.185159921646118, + "learning_rate": 4.4706711787930065e-05, + "loss": 0.9286, + "step": 1878 + }, + { + "epoch": 1.059785673998872, + "grad_norm": 3.9465718269348145, + "learning_rate": 4.470389170896786e-05, + "loss": 0.9254, + "step": 1879 + }, + { + "epoch": 1.060349689791314, + "grad_norm": 2.4490818977355957, + "learning_rate": 4.470107163000564e-05, + "loss": 0.7326, + "step": 1880 + }, + { + "epoch": 1.0609137055837563, + "grad_norm": 2.0533788204193115, + "learning_rate": 4.469825155104343e-05, + "loss": 0.7397, + "step": 1881 + }, + { + "epoch": 1.0614777213761986, + "grad_norm": 3.742586851119995, + "learning_rate": 4.469543147208122e-05, + "loss": 0.9198, + "step": 1882 + }, + { + "epoch": 1.0620417371686408, + "grad_norm": 1.6038953065872192, + "learning_rate": 4.469261139311901e-05, + "loss": 0.7192, + "step": 1883 + }, + { + "epoch": 1.0626057529610828, + "grad_norm": 4.138272285461426, + "learning_rate": 4.46897913141568e-05, + "loss": 0.8638, + "step": 1884 + }, + { + "epoch": 1.063169768753525, + "grad_norm": 4.4620680809021, + "learning_rate": 4.468697123519458e-05, + "loss": 0.9045, + "step": 1885 + }, + { + "epoch": 1.0637337845459673, + "grad_norm": 5.168742656707764, + "learning_rate": 4.468415115623238e-05, + "loss": 0.8815, + "step": 1886 + }, + { + "epoch": 1.0642978003384094, + "grad_norm": 3.9797585010528564, + "learning_rate": 4.468133107727017e-05, + "loss": 1.0055, + "step": 1887 + }, + { + "epoch": 1.0648618161308516, + "grad_norm": 4.689877033233643, + "learning_rate": 4.467851099830795e-05, + "loss": 0.9565, + "step": 1888 + }, + { + "epoch": 1.0654258319232939, + "grad_norm": 3.3399713039398193, + "learning_rate": 4.4675690919345745e-05, + "loss": 0.9539, + "step": 1889 + }, + { + "epoch": 1.0659898477157361, + "grad_norm": 2.188206672668457, + "learning_rate": 4.467287084038354e-05, + "loss": 0.7857, + "step": 1890 + }, + { + "epoch": 1.0665538635081782, + "grad_norm": 3.625551223754883, + "learning_rate": 4.467005076142132e-05, + "loss": 0.9166, + "step": 1891 + }, + { + "epoch": 1.0671178793006204, + "grad_norm": 3.3088295459747314, + "learning_rate": 4.466723068245911e-05, + "loss": 0.8399, + "step": 1892 + }, + { + "epoch": 1.0676818950930627, + "grad_norm": 2.865393877029419, + "learning_rate": 4.46644106034969e-05, + "loss": 0.9651, + "step": 1893 + }, + { + "epoch": 1.0682459108855047, + "grad_norm": 4.463068962097168, + "learning_rate": 4.466159052453469e-05, + "loss": 0.9589, + "step": 1894 + }, + { + "epoch": 1.068809926677947, + "grad_norm": 3.4752984046936035, + "learning_rate": 4.465877044557248e-05, + "loss": 0.7722, + "step": 1895 + }, + { + "epoch": 1.0693739424703892, + "grad_norm": 5.454896450042725, + "learning_rate": 4.465595036661026e-05, + "loss": 1.095, + "step": 1896 + }, + { + "epoch": 1.0699379582628314, + "grad_norm": 3.1599056720733643, + "learning_rate": 4.4653130287648055e-05, + "loss": 0.8322, + "step": 1897 + }, + { + "epoch": 1.0705019740552735, + "grad_norm": 3.4906294345855713, + "learning_rate": 4.465031020868585e-05, + "loss": 0.9181, + "step": 1898 + }, + { + "epoch": 1.0710659898477157, + "grad_norm": 3.220459222793579, + "learning_rate": 4.464749012972363e-05, + "loss": 0.7596, + "step": 1899 + }, + { + "epoch": 1.071630005640158, + "grad_norm": 4.036924839019775, + "learning_rate": 4.4644670050761425e-05, + "loss": 0.9302, + "step": 1900 + }, + { + "epoch": 1.0721940214326002, + "grad_norm": 4.939606189727783, + "learning_rate": 4.464184997179921e-05, + "loss": 0.9564, + "step": 1901 + }, + { + "epoch": 1.0727580372250423, + "grad_norm": 4.338982105255127, + "learning_rate": 4.4639029892837e-05, + "loss": 0.8326, + "step": 1902 + }, + { + "epoch": 1.0733220530174845, + "grad_norm": 2.6111135482788086, + "learning_rate": 4.463620981387479e-05, + "loss": 0.88, + "step": 1903 + }, + { + "epoch": 1.0738860688099268, + "grad_norm": 1.9602066278457642, + "learning_rate": 4.463338973491258e-05, + "loss": 0.8367, + "step": 1904 + }, + { + "epoch": 1.0744500846023688, + "grad_norm": 10.060860633850098, + "learning_rate": 4.4630569655950366e-05, + "loss": 1.0402, + "step": 1905 + }, + { + "epoch": 1.075014100394811, + "grad_norm": 3.7827765941619873, + "learning_rate": 4.462774957698816e-05, + "loss": 0.8958, + "step": 1906 + }, + { + "epoch": 1.0755781161872533, + "grad_norm": 2.363044261932373, + "learning_rate": 4.462492949802595e-05, + "loss": 0.835, + "step": 1907 + }, + { + "epoch": 1.0761421319796955, + "grad_norm": 5.19504451751709, + "learning_rate": 4.4622109419063735e-05, + "loss": 0.9536, + "step": 1908 + }, + { + "epoch": 1.0767061477721376, + "grad_norm": 3.4505245685577393, + "learning_rate": 4.461928934010153e-05, + "loss": 0.8951, + "step": 1909 + }, + { + "epoch": 1.0772701635645798, + "grad_norm": 2.334623098373413, + "learning_rate": 4.461646926113931e-05, + "loss": 0.9674, + "step": 1910 + }, + { + "epoch": 1.077834179357022, + "grad_norm": 4.471513271331787, + "learning_rate": 4.4613649182177105e-05, + "loss": 1.0246, + "step": 1911 + }, + { + "epoch": 1.0783981951494641, + "grad_norm": 7.020613193511963, + "learning_rate": 4.461082910321489e-05, + "loss": 0.8956, + "step": 1912 + }, + { + "epoch": 1.0789622109419064, + "grad_norm": 4.808978080749512, + "learning_rate": 4.460800902425268e-05, + "loss": 0.999, + "step": 1913 + }, + { + "epoch": 1.0795262267343486, + "grad_norm": 4.778241157531738, + "learning_rate": 4.460518894529047e-05, + "loss": 0.8723, + "step": 1914 + }, + { + "epoch": 1.0800902425267909, + "grad_norm": 4.555966854095459, + "learning_rate": 4.460236886632826e-05, + "loss": 0.8999, + "step": 1915 + }, + { + "epoch": 1.080654258319233, + "grad_norm": 2.323153018951416, + "learning_rate": 4.4599548787366046e-05, + "loss": 0.9356, + "step": 1916 + }, + { + "epoch": 1.0812182741116751, + "grad_norm": 2.3146767616271973, + "learning_rate": 4.459672870840384e-05, + "loss": 0.7822, + "step": 1917 + }, + { + "epoch": 1.0817822899041174, + "grad_norm": 5.34544038772583, + "learning_rate": 4.459390862944163e-05, + "loss": 0.885, + "step": 1918 + }, + { + "epoch": 1.0823463056965594, + "grad_norm": 4.515475273132324, + "learning_rate": 4.4591088550479416e-05, + "loss": 0.9845, + "step": 1919 + }, + { + "epoch": 1.0829103214890017, + "grad_norm": 4.690851211547852, + "learning_rate": 4.45882684715172e-05, + "loss": 0.8885, + "step": 1920 + }, + { + "epoch": 1.083474337281444, + "grad_norm": 2.2037105560302734, + "learning_rate": 4.458544839255499e-05, + "loss": 0.7744, + "step": 1921 + }, + { + "epoch": 1.0840383530738862, + "grad_norm": 2.5983142852783203, + "learning_rate": 4.4582628313592785e-05, + "loss": 0.6592, + "step": 1922 + }, + { + "epoch": 1.0846023688663282, + "grad_norm": 5.646725654602051, + "learning_rate": 4.457980823463057e-05, + "loss": 1.2446, + "step": 1923 + }, + { + "epoch": 1.0851663846587705, + "grad_norm": 3.994508981704712, + "learning_rate": 4.4576988155668356e-05, + "loss": 0.8615, + "step": 1924 + }, + { + "epoch": 1.0857304004512127, + "grad_norm": 2.960393190383911, + "learning_rate": 4.4574168076706155e-05, + "loss": 0.9225, + "step": 1925 + }, + { + "epoch": 1.0862944162436547, + "grad_norm": 3.8797106742858887, + "learning_rate": 4.457134799774394e-05, + "loss": 0.8508, + "step": 1926 + }, + { + "epoch": 1.086858432036097, + "grad_norm": 2.249382257461548, + "learning_rate": 4.4568527918781726e-05, + "loss": 0.7123, + "step": 1927 + }, + { + "epoch": 1.0874224478285393, + "grad_norm": 2.167388439178467, + "learning_rate": 4.456570783981952e-05, + "loss": 0.8492, + "step": 1928 + }, + { + "epoch": 1.0879864636209815, + "grad_norm": 5.1758222579956055, + "learning_rate": 4.456288776085731e-05, + "loss": 0.9648, + "step": 1929 + }, + { + "epoch": 1.0885504794134235, + "grad_norm": 6.424041271209717, + "learning_rate": 4.4560067681895096e-05, + "loss": 1.239, + "step": 1930 + }, + { + "epoch": 1.0891144952058658, + "grad_norm": 4.670988082885742, + "learning_rate": 4.455724760293288e-05, + "loss": 1.1356, + "step": 1931 + }, + { + "epoch": 1.089678510998308, + "grad_norm": 4.555474281311035, + "learning_rate": 4.455442752397067e-05, + "loss": 0.9897, + "step": 1932 + }, + { + "epoch": 1.09024252679075, + "grad_norm": 3.4329986572265625, + "learning_rate": 4.4551607445008466e-05, + "loss": 0.8561, + "step": 1933 + }, + { + "epoch": 1.0908065425831923, + "grad_norm": 4.818148612976074, + "learning_rate": 4.454878736604625e-05, + "loss": 0.9382, + "step": 1934 + }, + { + "epoch": 1.0913705583756346, + "grad_norm": 2.5106966495513916, + "learning_rate": 4.4545967287084036e-05, + "loss": 0.7877, + "step": 1935 + }, + { + "epoch": 1.0919345741680768, + "grad_norm": 3.8270785808563232, + "learning_rate": 4.454314720812183e-05, + "loss": 0.8178, + "step": 1936 + }, + { + "epoch": 1.0924985899605189, + "grad_norm": 2.7750723361968994, + "learning_rate": 4.454032712915962e-05, + "loss": 0.9262, + "step": 1937 + }, + { + "epoch": 1.093062605752961, + "grad_norm": 4.53749418258667, + "learning_rate": 4.4537507050197406e-05, + "loss": 0.9877, + "step": 1938 + }, + { + "epoch": 1.0936266215454034, + "grad_norm": 3.6971256732940674, + "learning_rate": 4.45346869712352e-05, + "loss": 0.955, + "step": 1939 + }, + { + "epoch": 1.0941906373378454, + "grad_norm": 4.860289573669434, + "learning_rate": 4.4531866892272984e-05, + "loss": 1.0919, + "step": 1940 + }, + { + "epoch": 1.0947546531302876, + "grad_norm": 1.6883279085159302, + "learning_rate": 4.4529046813310776e-05, + "loss": 0.8608, + "step": 1941 + }, + { + "epoch": 1.0953186689227299, + "grad_norm": 2.789377450942993, + "learning_rate": 4.452622673434856e-05, + "loss": 0.8987, + "step": 1942 + }, + { + "epoch": 1.0958826847151721, + "grad_norm": 5.573638439178467, + "learning_rate": 4.4523406655386353e-05, + "loss": 1.2312, + "step": 1943 + }, + { + "epoch": 1.0964467005076142, + "grad_norm": 3.7058298587799072, + "learning_rate": 4.4520586576424146e-05, + "loss": 0.6786, + "step": 1944 + }, + { + "epoch": 1.0970107163000564, + "grad_norm": 4.192314147949219, + "learning_rate": 4.451776649746193e-05, + "loss": 1.0184, + "step": 1945 + }, + { + "epoch": 1.0975747320924987, + "grad_norm": 3.442593812942505, + "learning_rate": 4.451494641849972e-05, + "loss": 0.979, + "step": 1946 + }, + { + "epoch": 1.0981387478849407, + "grad_norm": 4.165545463562012, + "learning_rate": 4.451212633953751e-05, + "loss": 0.8817, + "step": 1947 + }, + { + "epoch": 1.098702763677383, + "grad_norm": 4.5408453941345215, + "learning_rate": 4.45093062605753e-05, + "loss": 0.7476, + "step": 1948 + }, + { + "epoch": 1.0992667794698252, + "grad_norm": 3.387550115585327, + "learning_rate": 4.4506486181613086e-05, + "loss": 0.7316, + "step": 1949 + }, + { + "epoch": 1.0998307952622675, + "grad_norm": 3.3775758743286133, + "learning_rate": 4.450366610265088e-05, + "loss": 0.6596, + "step": 1950 + }, + { + "epoch": 1.1003948110547095, + "grad_norm": 4.770999431610107, + "learning_rate": 4.4500846023688664e-05, + "loss": 1.0752, + "step": 1951 + }, + { + "epoch": 1.1009588268471517, + "grad_norm": 4.40075159072876, + "learning_rate": 4.4498025944726456e-05, + "loss": 0.9186, + "step": 1952 + }, + { + "epoch": 1.101522842639594, + "grad_norm": 3.458310127258301, + "learning_rate": 4.449520586576424e-05, + "loss": 0.885, + "step": 1953 + }, + { + "epoch": 1.102086858432036, + "grad_norm": 5.275932788848877, + "learning_rate": 4.4492385786802034e-05, + "loss": 1.0321, + "step": 1954 + }, + { + "epoch": 1.1026508742244783, + "grad_norm": 3.0411806106567383, + "learning_rate": 4.448956570783982e-05, + "loss": 0.9799, + "step": 1955 + }, + { + "epoch": 1.1032148900169205, + "grad_norm": 5.291767120361328, + "learning_rate": 4.448674562887761e-05, + "loss": 1.1234, + "step": 1956 + }, + { + "epoch": 1.1037789058093628, + "grad_norm": 3.437504529953003, + "learning_rate": 4.4483925549915403e-05, + "loss": 0.8881, + "step": 1957 + }, + { + "epoch": 1.1043429216018048, + "grad_norm": 4.221705913543701, + "learning_rate": 4.448110547095319e-05, + "loss": 1.0549, + "step": 1958 + }, + { + "epoch": 1.104906937394247, + "grad_norm": 2.5069785118103027, + "learning_rate": 4.4478285391990974e-05, + "loss": 0.8594, + "step": 1959 + }, + { + "epoch": 1.1054709531866893, + "grad_norm": 4.2401957511901855, + "learning_rate": 4.4475465313028766e-05, + "loss": 1.0032, + "step": 1960 + }, + { + "epoch": 1.1060349689791313, + "grad_norm": 4.061955451965332, + "learning_rate": 4.447264523406656e-05, + "loss": 0.894, + "step": 1961 + }, + { + "epoch": 1.1065989847715736, + "grad_norm": 3.1087002754211426, + "learning_rate": 4.4469825155104344e-05, + "loss": 0.824, + "step": 1962 + }, + { + "epoch": 1.1071630005640158, + "grad_norm": 6.087307453155518, + "learning_rate": 4.446700507614213e-05, + "loss": 1.4458, + "step": 1963 + }, + { + "epoch": 1.107727016356458, + "grad_norm": 2.015291213989258, + "learning_rate": 4.446418499717993e-05, + "loss": 0.8775, + "step": 1964 + }, + { + "epoch": 1.1082910321489001, + "grad_norm": 3.508071184158325, + "learning_rate": 4.4461364918217714e-05, + "loss": 0.9509, + "step": 1965 + }, + { + "epoch": 1.1088550479413424, + "grad_norm": 5.849016189575195, + "learning_rate": 4.44585448392555e-05, + "loss": 1.1215, + "step": 1966 + }, + { + "epoch": 1.1094190637337846, + "grad_norm": 3.509551525115967, + "learning_rate": 4.445572476029329e-05, + "loss": 0.795, + "step": 1967 + }, + { + "epoch": 1.1099830795262267, + "grad_norm": 3.0461225509643555, + "learning_rate": 4.4452904681331084e-05, + "loss": 0.9082, + "step": 1968 + }, + { + "epoch": 1.110547095318669, + "grad_norm": 3.5537476539611816, + "learning_rate": 4.445008460236887e-05, + "loss": 0.8429, + "step": 1969 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 3.5639610290527344, + "learning_rate": 4.4447264523406654e-05, + "loss": 0.8096, + "step": 1970 + }, + { + "epoch": 1.1116751269035534, + "grad_norm": 7.4759111404418945, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.2094, + "step": 1971 + }, + { + "epoch": 1.1122391426959954, + "grad_norm": 2.859276533126831, + "learning_rate": 4.444162436548224e-05, + "loss": 0.7657, + "step": 1972 + }, + { + "epoch": 1.1128031584884377, + "grad_norm": 4.561776638031006, + "learning_rate": 4.4438804286520024e-05, + "loss": 0.9902, + "step": 1973 + }, + { + "epoch": 1.11336717428088, + "grad_norm": 3.9854331016540527, + "learning_rate": 4.443598420755781e-05, + "loss": 0.968, + "step": 1974 + }, + { + "epoch": 1.113931190073322, + "grad_norm": 6.388465404510498, + "learning_rate": 4.44331641285956e-05, + "loss": 1.1741, + "step": 1975 + }, + { + "epoch": 1.1144952058657642, + "grad_norm": 4.5194091796875, + "learning_rate": 4.4430344049633394e-05, + "loss": 0.9344, + "step": 1976 + }, + { + "epoch": 1.1150592216582065, + "grad_norm": 2.543686628341675, + "learning_rate": 4.442752397067118e-05, + "loss": 0.8217, + "step": 1977 + }, + { + "epoch": 1.1156232374506487, + "grad_norm": 2.527564287185669, + "learning_rate": 4.442470389170897e-05, + "loss": 0.7794, + "step": 1978 + }, + { + "epoch": 1.1161872532430908, + "grad_norm": 3.3769567012786865, + "learning_rate": 4.4421883812746764e-05, + "loss": 0.8877, + "step": 1979 + }, + { + "epoch": 1.116751269035533, + "grad_norm": 2.062347888946533, + "learning_rate": 4.441906373378455e-05, + "loss": 0.722, + "step": 1980 + }, + { + "epoch": 1.1173152848279753, + "grad_norm": 3.2196600437164307, + "learning_rate": 4.4416243654822335e-05, + "loss": 0.8605, + "step": 1981 + }, + { + "epoch": 1.1178793006204173, + "grad_norm": 3.3534443378448486, + "learning_rate": 4.441342357586013e-05, + "loss": 0.8769, + "step": 1982 + }, + { + "epoch": 1.1184433164128595, + "grad_norm": 4.871739864349365, + "learning_rate": 4.441060349689792e-05, + "loss": 1.0945, + "step": 1983 + }, + { + "epoch": 1.1190073322053018, + "grad_norm": 3.2716240882873535, + "learning_rate": 4.4407783417935704e-05, + "loss": 1.1439, + "step": 1984 + }, + { + "epoch": 1.119571347997744, + "grad_norm": 3.4301493167877197, + "learning_rate": 4.4404963338973497e-05, + "loss": 0.9347, + "step": 1985 + }, + { + "epoch": 1.120135363790186, + "grad_norm": 3.025991439819336, + "learning_rate": 4.440214326001128e-05, + "loss": 0.8631, + "step": 1986 + }, + { + "epoch": 1.1206993795826283, + "grad_norm": 3.605684518814087, + "learning_rate": 4.4399323181049074e-05, + "loss": 1.0022, + "step": 1987 + }, + { + "epoch": 1.1212633953750706, + "grad_norm": 3.8420863151550293, + "learning_rate": 4.439650310208686e-05, + "loss": 0.814, + "step": 1988 + }, + { + "epoch": 1.1218274111675126, + "grad_norm": 3.9961326122283936, + "learning_rate": 4.439368302312465e-05, + "loss": 0.9618, + "step": 1989 + }, + { + "epoch": 1.1223914269599549, + "grad_norm": 3.4992945194244385, + "learning_rate": 4.439086294416244e-05, + "loss": 0.8025, + "step": 1990 + }, + { + "epoch": 1.1229554427523971, + "grad_norm": 5.0302414894104, + "learning_rate": 4.438804286520023e-05, + "loss": 0.865, + "step": 1991 + }, + { + "epoch": 1.1235194585448394, + "grad_norm": 4.3211164474487305, + "learning_rate": 4.4385222786238015e-05, + "loss": 1.0004, + "step": 1992 + }, + { + "epoch": 1.1240834743372814, + "grad_norm": 2.2145581245422363, + "learning_rate": 4.438240270727581e-05, + "loss": 0.9778, + "step": 1993 + }, + { + "epoch": 1.1246474901297236, + "grad_norm": 2.6655824184417725, + "learning_rate": 4.437958262831359e-05, + "loss": 0.813, + "step": 1994 + }, + { + "epoch": 1.125211505922166, + "grad_norm": 2.1757752895355225, + "learning_rate": 4.4376762549351384e-05, + "loss": 0.8321, + "step": 1995 + }, + { + "epoch": 1.125775521714608, + "grad_norm": 3.310335397720337, + "learning_rate": 4.437394247038918e-05, + "loss": 0.9125, + "step": 1996 + }, + { + "epoch": 1.1263395375070502, + "grad_norm": 3.416743755340576, + "learning_rate": 4.437112239142696e-05, + "loss": 0.7381, + "step": 1997 + }, + { + "epoch": 1.1269035532994924, + "grad_norm": 4.742486953735352, + "learning_rate": 4.436830231246475e-05, + "loss": 1.0215, + "step": 1998 + }, + { + "epoch": 1.1274675690919347, + "grad_norm": 3.504629611968994, + "learning_rate": 4.436548223350254e-05, + "loss": 0.9945, + "step": 1999 + }, + { + "epoch": 1.1280315848843767, + "grad_norm": 3.503263473510742, + "learning_rate": 4.436266215454033e-05, + "loss": 1.0067, + "step": 2000 + }, + { + "epoch": 1.128595600676819, + "grad_norm": 2.5812952518463135, + "learning_rate": 4.435984207557812e-05, + "loss": 0.7935, + "step": 2001 + }, + { + "epoch": 1.1291596164692612, + "grad_norm": 6.2291388511657715, + "learning_rate": 4.43570219966159e-05, + "loss": 0.8433, + "step": 2002 + }, + { + "epoch": 1.1297236322617032, + "grad_norm": 1.9581272602081299, + "learning_rate": 4.43542019176537e-05, + "loss": 0.8036, + "step": 2003 + }, + { + "epoch": 1.1302876480541455, + "grad_norm": 3.907684326171875, + "learning_rate": 4.435138183869149e-05, + "loss": 0.8954, + "step": 2004 + }, + { + "epoch": 1.1308516638465878, + "grad_norm": 1.766859769821167, + "learning_rate": 4.434856175972927e-05, + "loss": 0.7093, + "step": 2005 + }, + { + "epoch": 1.13141567963903, + "grad_norm": 4.32037353515625, + "learning_rate": 4.434574168076706e-05, + "loss": 0.8942, + "step": 2006 + }, + { + "epoch": 1.131979695431472, + "grad_norm": 5.245221138000488, + "learning_rate": 4.434292160180486e-05, + "loss": 1.1684, + "step": 2007 + }, + { + "epoch": 1.1325437112239143, + "grad_norm": 3.607696533203125, + "learning_rate": 4.434010152284264e-05, + "loss": 0.8941, + "step": 2008 + }, + { + "epoch": 1.1331077270163565, + "grad_norm": 3.736194610595703, + "learning_rate": 4.433728144388043e-05, + "loss": 1.0358, + "step": 2009 + }, + { + "epoch": 1.1336717428087986, + "grad_norm": 8.063179969787598, + "learning_rate": 4.433446136491822e-05, + "loss": 1.4845, + "step": 2010 + }, + { + "epoch": 1.1342357586012408, + "grad_norm": 3.0735459327697754, + "learning_rate": 4.433164128595601e-05, + "loss": 0.8998, + "step": 2011 + }, + { + "epoch": 1.134799774393683, + "grad_norm": 4.405383110046387, + "learning_rate": 4.43288212069938e-05, + "loss": 0.9778, + "step": 2012 + }, + { + "epoch": 1.1353637901861253, + "grad_norm": 3.911346673965454, + "learning_rate": 4.432600112803158e-05, + "loss": 0.8124, + "step": 2013 + }, + { + "epoch": 1.1359278059785674, + "grad_norm": 3.9228780269622803, + "learning_rate": 4.4323181049069375e-05, + "loss": 0.9668, + "step": 2014 + }, + { + "epoch": 1.1364918217710096, + "grad_norm": 12.538582801818848, + "learning_rate": 4.432036097010717e-05, + "loss": 1.1543, + "step": 2015 + }, + { + "epoch": 1.1370558375634519, + "grad_norm": 3.484938859939575, + "learning_rate": 4.431754089114495e-05, + "loss": 0.8554, + "step": 2016 + }, + { + "epoch": 1.1376198533558939, + "grad_norm": 4.198097229003906, + "learning_rate": 4.4314720812182745e-05, + "loss": 1.0827, + "step": 2017 + }, + { + "epoch": 1.1381838691483361, + "grad_norm": 1.9551703929901123, + "learning_rate": 4.431190073322054e-05, + "loss": 0.8734, + "step": 2018 + }, + { + "epoch": 1.1387478849407784, + "grad_norm": 2.2196075916290283, + "learning_rate": 4.430908065425832e-05, + "loss": 0.782, + "step": 2019 + }, + { + "epoch": 1.1393119007332206, + "grad_norm": 4.689267158508301, + "learning_rate": 4.430626057529611e-05, + "loss": 1.0102, + "step": 2020 + }, + { + "epoch": 1.1398759165256627, + "grad_norm": 7.1022820472717285, + "learning_rate": 4.43034404963339e-05, + "loss": 0.9122, + "step": 2021 + }, + { + "epoch": 1.140439932318105, + "grad_norm": 4.525922775268555, + "learning_rate": 4.430062041737169e-05, + "loss": 0.954, + "step": 2022 + }, + { + "epoch": 1.1410039481105472, + "grad_norm": 4.075003147125244, + "learning_rate": 4.429780033840948e-05, + "loss": 1.0374, + "step": 2023 + }, + { + "epoch": 1.1415679639029892, + "grad_norm": 5.382485866546631, + "learning_rate": 4.429498025944726e-05, + "loss": 1.139, + "step": 2024 + }, + { + "epoch": 1.1421319796954315, + "grad_norm": 2.8128201961517334, + "learning_rate": 4.4292160180485055e-05, + "loss": 0.9042, + "step": 2025 + }, + { + "epoch": 1.1426959954878737, + "grad_norm": 5.320822715759277, + "learning_rate": 4.428934010152285e-05, + "loss": 0.9687, + "step": 2026 + }, + { + "epoch": 1.143260011280316, + "grad_norm": 5.1553544998168945, + "learning_rate": 4.428652002256063e-05, + "loss": 1.0591, + "step": 2027 + }, + { + "epoch": 1.143824027072758, + "grad_norm": 3.006814479827881, + "learning_rate": 4.4283699943598425e-05, + "loss": 0.8733, + "step": 2028 + }, + { + "epoch": 1.1443880428652002, + "grad_norm": 5.1633758544921875, + "learning_rate": 4.428087986463621e-05, + "loss": 1.2066, + "step": 2029 + }, + { + "epoch": 1.1449520586576425, + "grad_norm": 2.6150197982788086, + "learning_rate": 4.4278059785674e-05, + "loss": 0.8426, + "step": 2030 + }, + { + "epoch": 1.1455160744500845, + "grad_norm": 3.802273750305176, + "learning_rate": 4.427523970671179e-05, + "loss": 0.8972, + "step": 2031 + }, + { + "epoch": 1.1460800902425268, + "grad_norm": 2.1529154777526855, + "learning_rate": 4.427241962774958e-05, + "loss": 0.7784, + "step": 2032 + }, + { + "epoch": 1.146644106034969, + "grad_norm": 3.4917171001434326, + "learning_rate": 4.4269599548787366e-05, + "loss": 1.036, + "step": 2033 + }, + { + "epoch": 1.1472081218274113, + "grad_norm": 4.0119757652282715, + "learning_rate": 4.426677946982516e-05, + "loss": 0.9958, + "step": 2034 + }, + { + "epoch": 1.1477721376198533, + "grad_norm": 2.9624717235565186, + "learning_rate": 4.426395939086295e-05, + "loss": 0.8181, + "step": 2035 + }, + { + "epoch": 1.1483361534122956, + "grad_norm": 1.9068695306777954, + "learning_rate": 4.4261139311900735e-05, + "loss": 0.8486, + "step": 2036 + }, + { + "epoch": 1.1489001692047378, + "grad_norm": 5.7797088623046875, + "learning_rate": 4.425831923293852e-05, + "loss": 1.0811, + "step": 2037 + }, + { + "epoch": 1.1494641849971798, + "grad_norm": 3.7954342365264893, + "learning_rate": 4.425549915397631e-05, + "loss": 0.9506, + "step": 2038 + }, + { + "epoch": 1.150028200789622, + "grad_norm": 2.4066739082336426, + "learning_rate": 4.4252679075014105e-05, + "loss": 0.8855, + "step": 2039 + }, + { + "epoch": 1.1505922165820643, + "grad_norm": 6.914726257324219, + "learning_rate": 4.424985899605189e-05, + "loss": 1.0111, + "step": 2040 + }, + { + "epoch": 1.1511562323745066, + "grad_norm": 3.132476806640625, + "learning_rate": 4.4247038917089676e-05, + "loss": 0.8133, + "step": 2041 + }, + { + "epoch": 1.1517202481669486, + "grad_norm": 4.636385440826416, + "learning_rate": 4.424421883812747e-05, + "loss": 0.8464, + "step": 2042 + }, + { + "epoch": 1.1522842639593909, + "grad_norm": 2.3458592891693115, + "learning_rate": 4.424139875916526e-05, + "loss": 0.8646, + "step": 2043 + }, + { + "epoch": 1.1528482797518331, + "grad_norm": 2.8831589221954346, + "learning_rate": 4.4238578680203046e-05, + "loss": 0.8922, + "step": 2044 + }, + { + "epoch": 1.1534122955442752, + "grad_norm": 3.36492919921875, + "learning_rate": 4.423575860124083e-05, + "loss": 0.84, + "step": 2045 + }, + { + "epoch": 1.1539763113367174, + "grad_norm": 5.195201396942139, + "learning_rate": 4.423293852227863e-05, + "loss": 1.0521, + "step": 2046 + }, + { + "epoch": 1.1545403271291597, + "grad_norm": 3.7358131408691406, + "learning_rate": 4.4230118443316415e-05, + "loss": 0.982, + "step": 2047 + }, + { + "epoch": 1.155104342921602, + "grad_norm": 2.065312147140503, + "learning_rate": 4.42272983643542e-05, + "loss": 0.7709, + "step": 2048 + }, + { + "epoch": 1.155668358714044, + "grad_norm": 3.5667457580566406, + "learning_rate": 4.422447828539199e-05, + "loss": 0.8931, + "step": 2049 + }, + { + "epoch": 1.1562323745064862, + "grad_norm": 2.471011161804199, + "learning_rate": 4.4221658206429785e-05, + "loss": 0.8339, + "step": 2050 + }, + { + "epoch": 1.1567963902989284, + "grad_norm": 3.662376880645752, + "learning_rate": 4.421883812746757e-05, + "loss": 1.0085, + "step": 2051 + }, + { + "epoch": 1.1573604060913705, + "grad_norm": 5.276257038116455, + "learning_rate": 4.4216018048505356e-05, + "loss": 1.1604, + "step": 2052 + }, + { + "epoch": 1.1579244218838127, + "grad_norm": 3.371835947036743, + "learning_rate": 4.4213197969543155e-05, + "loss": 0.9389, + "step": 2053 + }, + { + "epoch": 1.158488437676255, + "grad_norm": 1.83493173122406, + "learning_rate": 4.421037789058094e-05, + "loss": 0.8681, + "step": 2054 + }, + { + "epoch": 1.1590524534686972, + "grad_norm": 6.707424163818359, + "learning_rate": 4.4207557811618726e-05, + "loss": 1.0216, + "step": 2055 + }, + { + "epoch": 1.1596164692611393, + "grad_norm": 3.753272294998169, + "learning_rate": 4.420473773265652e-05, + "loss": 0.8876, + "step": 2056 + }, + { + "epoch": 1.1601804850535815, + "grad_norm": 3.1182150840759277, + "learning_rate": 4.420191765369431e-05, + "loss": 0.9092, + "step": 2057 + }, + { + "epoch": 1.1607445008460238, + "grad_norm": 2.738119125366211, + "learning_rate": 4.4199097574732096e-05, + "loss": 0.8082, + "step": 2058 + }, + { + "epoch": 1.1613085166384658, + "grad_norm": 4.042611122131348, + "learning_rate": 4.419627749576988e-05, + "loss": 0.9346, + "step": 2059 + }, + { + "epoch": 1.161872532430908, + "grad_norm": 3.0103485584259033, + "learning_rate": 4.419345741680767e-05, + "loss": 0.9312, + "step": 2060 + }, + { + "epoch": 1.1624365482233503, + "grad_norm": 1.7981040477752686, + "learning_rate": 4.4190637337845465e-05, + "loss": 0.7338, + "step": 2061 + }, + { + "epoch": 1.1630005640157925, + "grad_norm": 3.009744644165039, + "learning_rate": 4.418781725888325e-05, + "loss": 0.8419, + "step": 2062 + }, + { + "epoch": 1.1635645798082346, + "grad_norm": 4.083003044128418, + "learning_rate": 4.4184997179921036e-05, + "loss": 1.085, + "step": 2063 + }, + { + "epoch": 1.1641285956006768, + "grad_norm": 3.4164276123046875, + "learning_rate": 4.418217710095883e-05, + "loss": 0.9641, + "step": 2064 + }, + { + "epoch": 1.164692611393119, + "grad_norm": 4.083009719848633, + "learning_rate": 4.417935702199662e-05, + "loss": 0.9491, + "step": 2065 + }, + { + "epoch": 1.1652566271855611, + "grad_norm": 3.371964454650879, + "learning_rate": 4.4176536943034406e-05, + "loss": 0.8788, + "step": 2066 + }, + { + "epoch": 1.1658206429780034, + "grad_norm": 5.450358867645264, + "learning_rate": 4.41737168640722e-05, + "loss": 1.3482, + "step": 2067 + }, + { + "epoch": 1.1663846587704456, + "grad_norm": 4.008681774139404, + "learning_rate": 4.4170896785109984e-05, + "loss": 1.0093, + "step": 2068 + }, + { + "epoch": 1.1669486745628879, + "grad_norm": 2.332261323928833, + "learning_rate": 4.4168076706147776e-05, + "loss": 0.8487, + "step": 2069 + }, + { + "epoch": 1.16751269035533, + "grad_norm": 5.405789852142334, + "learning_rate": 4.416525662718556e-05, + "loss": 0.9688, + "step": 2070 + }, + { + "epoch": 1.1680767061477721, + "grad_norm": 5.752939701080322, + "learning_rate": 4.416243654822335e-05, + "loss": 1.0577, + "step": 2071 + }, + { + "epoch": 1.1686407219402144, + "grad_norm": 3.600985050201416, + "learning_rate": 4.415961646926114e-05, + "loss": 0.913, + "step": 2072 + }, + { + "epoch": 1.1692047377326564, + "grad_norm": 4.840665817260742, + "learning_rate": 4.415679639029893e-05, + "loss": 0.8971, + "step": 2073 + }, + { + "epoch": 1.1697687535250987, + "grad_norm": 3.5260937213897705, + "learning_rate": 4.415397631133672e-05, + "loss": 0.7856, + "step": 2074 + }, + { + "epoch": 1.170332769317541, + "grad_norm": 3.3531174659729004, + "learning_rate": 4.415115623237451e-05, + "loss": 0.8766, + "step": 2075 + }, + { + "epoch": 1.1708967851099832, + "grad_norm": 5.320346832275391, + "learning_rate": 4.4148336153412294e-05, + "loss": 1.1447, + "step": 2076 + }, + { + "epoch": 1.1714608009024252, + "grad_norm": 3.029419422149658, + "learning_rate": 4.4145516074450086e-05, + "loss": 0.9237, + "step": 2077 + }, + { + "epoch": 1.1720248166948675, + "grad_norm": 3.6201465129852295, + "learning_rate": 4.414269599548788e-05, + "loss": 0.896, + "step": 2078 + }, + { + "epoch": 1.1725888324873097, + "grad_norm": 3.09976863861084, + "learning_rate": 4.4139875916525664e-05, + "loss": 0.8202, + "step": 2079 + }, + { + "epoch": 1.1731528482797517, + "grad_norm": 2.1503100395202637, + "learning_rate": 4.413705583756345e-05, + "loss": 0.6625, + "step": 2080 + }, + { + "epoch": 1.173716864072194, + "grad_norm": 4.147995948791504, + "learning_rate": 4.413423575860124e-05, + "loss": 1.0018, + "step": 2081 + }, + { + "epoch": 1.1742808798646363, + "grad_norm": 2.6759698390960693, + "learning_rate": 4.4131415679639033e-05, + "loss": 0.7345, + "step": 2082 + }, + { + "epoch": 1.1748448956570785, + "grad_norm": 2.0098719596862793, + "learning_rate": 4.412859560067682e-05, + "loss": 0.8598, + "step": 2083 + }, + { + "epoch": 1.1754089114495205, + "grad_norm": 3.7898895740509033, + "learning_rate": 4.4125775521714604e-05, + "loss": 0.8514, + "step": 2084 + }, + { + "epoch": 1.1759729272419628, + "grad_norm": 3.5494391918182373, + "learning_rate": 4.41229554427524e-05, + "loss": 0.8373, + "step": 2085 + }, + { + "epoch": 1.176536943034405, + "grad_norm": 3.074028968811035, + "learning_rate": 4.412013536379019e-05, + "loss": 0.8978, + "step": 2086 + }, + { + "epoch": 1.177100958826847, + "grad_norm": 3.1520943641662598, + "learning_rate": 4.4117315284827974e-05, + "loss": 0.7895, + "step": 2087 + }, + { + "epoch": 1.1776649746192893, + "grad_norm": 2.420053005218506, + "learning_rate": 4.4114495205865766e-05, + "loss": 0.9667, + "step": 2088 + }, + { + "epoch": 1.1782289904117316, + "grad_norm": 2.5553181171417236, + "learning_rate": 4.411167512690356e-05, + "loss": 0.9186, + "step": 2089 + }, + { + "epoch": 1.1787930062041738, + "grad_norm": 4.9435577392578125, + "learning_rate": 4.4108855047941344e-05, + "loss": 1.1301, + "step": 2090 + }, + { + "epoch": 1.1793570219966159, + "grad_norm": 2.1874234676361084, + "learning_rate": 4.410603496897913e-05, + "loss": 0.9298, + "step": 2091 + }, + { + "epoch": 1.179921037789058, + "grad_norm": 2.3036062717437744, + "learning_rate": 4.410321489001693e-05, + "loss": 0.8533, + "step": 2092 + }, + { + "epoch": 1.1804850535815004, + "grad_norm": 3.6243104934692383, + "learning_rate": 4.4100394811054714e-05, + "loss": 0.9584, + "step": 2093 + }, + { + "epoch": 1.1810490693739424, + "grad_norm": 3.482959270477295, + "learning_rate": 4.40975747320925e-05, + "loss": 1.001, + "step": 2094 + }, + { + "epoch": 1.1816130851663846, + "grad_norm": 2.1455767154693604, + "learning_rate": 4.409475465313029e-05, + "loss": 0.819, + "step": 2095 + }, + { + "epoch": 1.1821771009588269, + "grad_norm": 3.673299551010132, + "learning_rate": 4.4091934574168083e-05, + "loss": 0.9717, + "step": 2096 + }, + { + "epoch": 1.1827411167512691, + "grad_norm": 4.577380180358887, + "learning_rate": 4.408911449520587e-05, + "loss": 1.0882, + "step": 2097 + }, + { + "epoch": 1.1833051325437112, + "grad_norm": 4.352725505828857, + "learning_rate": 4.4086294416243654e-05, + "loss": 0.8664, + "step": 2098 + }, + { + "epoch": 1.1838691483361534, + "grad_norm": 3.7551677227020264, + "learning_rate": 4.4083474337281446e-05, + "loss": 0.8446, + "step": 2099 + }, + { + "epoch": 1.1844331641285957, + "grad_norm": 4.145913124084473, + "learning_rate": 4.408065425831924e-05, + "loss": 0.8707, + "step": 2100 + }, + { + "epoch": 1.1849971799210377, + "grad_norm": 2.506878614425659, + "learning_rate": 4.4077834179357024e-05, + "loss": 0.7803, + "step": 2101 + }, + { + "epoch": 1.18556119571348, + "grad_norm": 4.820011138916016, + "learning_rate": 4.407501410039481e-05, + "loss": 1.0055, + "step": 2102 + }, + { + "epoch": 1.1861252115059222, + "grad_norm": 3.299269437789917, + "learning_rate": 4.40721940214326e-05, + "loss": 0.9465, + "step": 2103 + }, + { + "epoch": 1.1866892272983645, + "grad_norm": 4.233479022979736, + "learning_rate": 4.4069373942470394e-05, + "loss": 0.8511, + "step": 2104 + }, + { + "epoch": 1.1872532430908065, + "grad_norm": 2.889674663543701, + "learning_rate": 4.406655386350818e-05, + "loss": 0.8577, + "step": 2105 + }, + { + "epoch": 1.1878172588832487, + "grad_norm": 2.1606781482696533, + "learning_rate": 4.406373378454597e-05, + "loss": 0.7935, + "step": 2106 + }, + { + "epoch": 1.188381274675691, + "grad_norm": 1.603434443473816, + "learning_rate": 4.406091370558376e-05, + "loss": 0.6274, + "step": 2107 + }, + { + "epoch": 1.188945290468133, + "grad_norm": 3.2441158294677734, + "learning_rate": 4.405809362662155e-05, + "loss": 0.9335, + "step": 2108 + }, + { + "epoch": 1.1895093062605753, + "grad_norm": 5.982673645019531, + "learning_rate": 4.4055273547659334e-05, + "loss": 1.1203, + "step": 2109 + }, + { + "epoch": 1.1900733220530175, + "grad_norm": 3.234947919845581, + "learning_rate": 4.4052453468697127e-05, + "loss": 0.8735, + "step": 2110 + }, + { + "epoch": 1.1906373378454598, + "grad_norm": 4.648860454559326, + "learning_rate": 4.404963338973491e-05, + "loss": 0.9031, + "step": 2111 + }, + { + "epoch": 1.1912013536379018, + "grad_norm": 2.608424186706543, + "learning_rate": 4.4046813310772704e-05, + "loss": 0.8064, + "step": 2112 + }, + { + "epoch": 1.191765369430344, + "grad_norm": 4.394226551055908, + "learning_rate": 4.4043993231810496e-05, + "loss": 0.9614, + "step": 2113 + }, + { + "epoch": 1.1923293852227863, + "grad_norm": 3.0320637226104736, + "learning_rate": 4.404117315284828e-05, + "loss": 0.7271, + "step": 2114 + }, + { + "epoch": 1.1928934010152283, + "grad_norm": 5.641594409942627, + "learning_rate": 4.403835307388607e-05, + "loss": 1.0605, + "step": 2115 + }, + { + "epoch": 1.1934574168076706, + "grad_norm": 4.426129341125488, + "learning_rate": 4.403553299492386e-05, + "loss": 0.912, + "step": 2116 + }, + { + "epoch": 1.1940214326001128, + "grad_norm": 2.4522545337677, + "learning_rate": 4.403271291596165e-05, + "loss": 0.8405, + "step": 2117 + }, + { + "epoch": 1.194585448392555, + "grad_norm": 3.719707727432251, + "learning_rate": 4.402989283699944e-05, + "loss": 0.8597, + "step": 2118 + }, + { + "epoch": 1.1951494641849971, + "grad_norm": 5.922776222229004, + "learning_rate": 4.402707275803722e-05, + "loss": 1.0036, + "step": 2119 + }, + { + "epoch": 1.1957134799774394, + "grad_norm": 3.130011558532715, + "learning_rate": 4.4024252679075015e-05, + "loss": 1.0183, + "step": 2120 + }, + { + "epoch": 1.1962774957698816, + "grad_norm": 7.6943864822387695, + "learning_rate": 4.402143260011281e-05, + "loss": 1.1418, + "step": 2121 + }, + { + "epoch": 1.1968415115623237, + "grad_norm": 2.0514657497406006, + "learning_rate": 4.401861252115059e-05, + "loss": 0.6336, + "step": 2122 + }, + { + "epoch": 1.197405527354766, + "grad_norm": 4.518202304840088, + "learning_rate": 4.4015792442188384e-05, + "loss": 1.1295, + "step": 2123 + }, + { + "epoch": 1.1979695431472082, + "grad_norm": 1.5789276361465454, + "learning_rate": 4.4012972363226176e-05, + "loss": 0.7478, + "step": 2124 + }, + { + "epoch": 1.1985335589396504, + "grad_norm": 3.3062808513641357, + "learning_rate": 4.401015228426396e-05, + "loss": 0.8507, + "step": 2125 + }, + { + "epoch": 1.1990975747320924, + "grad_norm": 1.7649129629135132, + "learning_rate": 4.400733220530175e-05, + "loss": 0.7661, + "step": 2126 + }, + { + "epoch": 1.1996615905245347, + "grad_norm": 1.4735568761825562, + "learning_rate": 4.400451212633954e-05, + "loss": 0.7109, + "step": 2127 + }, + { + "epoch": 1.200225606316977, + "grad_norm": 3.446725845336914, + "learning_rate": 4.400169204737733e-05, + "loss": 0.9169, + "step": 2128 + }, + { + "epoch": 1.200789622109419, + "grad_norm": 3.8792786598205566, + "learning_rate": 4.399887196841512e-05, + "loss": 0.9521, + "step": 2129 + }, + { + "epoch": 1.2013536379018612, + "grad_norm": 3.4499948024749756, + "learning_rate": 4.39960518894529e-05, + "loss": 0.8766, + "step": 2130 + }, + { + "epoch": 1.2019176536943035, + "grad_norm": 2.5344247817993164, + "learning_rate": 4.39932318104907e-05, + "loss": 0.7519, + "step": 2131 + }, + { + "epoch": 1.2024816694867457, + "grad_norm": 2.795020818710327, + "learning_rate": 4.399041173152849e-05, + "loss": 0.7106, + "step": 2132 + }, + { + "epoch": 1.2030456852791878, + "grad_norm": 5.220913410186768, + "learning_rate": 4.398759165256627e-05, + "loss": 1.2139, + "step": 2133 + }, + { + "epoch": 1.20360970107163, + "grad_norm": 1.7342504262924194, + "learning_rate": 4.3984771573604064e-05, + "loss": 0.7849, + "step": 2134 + }, + { + "epoch": 1.2041737168640723, + "grad_norm": 4.920086860656738, + "learning_rate": 4.398195149464186e-05, + "loss": 1.056, + "step": 2135 + }, + { + "epoch": 1.2047377326565143, + "grad_norm": 2.749166250228882, + "learning_rate": 4.397913141567964e-05, + "loss": 0.8488, + "step": 2136 + }, + { + "epoch": 1.2053017484489565, + "grad_norm": 4.2160515785217285, + "learning_rate": 4.397631133671743e-05, + "loss": 0.9325, + "step": 2137 + }, + { + "epoch": 1.2058657642413988, + "grad_norm": 1.9281260967254639, + "learning_rate": 4.397349125775522e-05, + "loss": 0.7384, + "step": 2138 + }, + { + "epoch": 1.206429780033841, + "grad_norm": 2.6572492122650146, + "learning_rate": 4.397067117879301e-05, + "loss": 0.773, + "step": 2139 + }, + { + "epoch": 1.206993795826283, + "grad_norm": 3.099022388458252, + "learning_rate": 4.39678510998308e-05, + "loss": 0.9355, + "step": 2140 + }, + { + "epoch": 1.2075578116187253, + "grad_norm": 4.632925987243652, + "learning_rate": 4.396503102086858e-05, + "loss": 1.1168, + "step": 2141 + }, + { + "epoch": 1.2081218274111676, + "grad_norm": 3.141653299331665, + "learning_rate": 4.3962210941906375e-05, + "loss": 0.7856, + "step": 2142 + }, + { + "epoch": 1.2086858432036096, + "grad_norm": 3.6257805824279785, + "learning_rate": 4.395939086294417e-05, + "loss": 0.8467, + "step": 2143 + }, + { + "epoch": 1.2092498589960519, + "grad_norm": 3.751558542251587, + "learning_rate": 4.395657078398195e-05, + "loss": 0.8521, + "step": 2144 + }, + { + "epoch": 1.2098138747884941, + "grad_norm": 1.3676831722259521, + "learning_rate": 4.3953750705019745e-05, + "loss": 0.847, + "step": 2145 + }, + { + "epoch": 1.2103778905809364, + "grad_norm": 4.160841464996338, + "learning_rate": 4.395093062605753e-05, + "loss": 0.8184, + "step": 2146 + }, + { + "epoch": 1.2109419063733784, + "grad_norm": 2.667952060699463, + "learning_rate": 4.394811054709532e-05, + "loss": 0.9204, + "step": 2147 + }, + { + "epoch": 1.2115059221658206, + "grad_norm": 3.6264328956604004, + "learning_rate": 4.394529046813311e-05, + "loss": 0.7645, + "step": 2148 + }, + { + "epoch": 1.212069937958263, + "grad_norm": 3.381425380706787, + "learning_rate": 4.39424703891709e-05, + "loss": 1.104, + "step": 2149 + }, + { + "epoch": 1.212633953750705, + "grad_norm": 2.9956486225128174, + "learning_rate": 4.3939650310208685e-05, + "loss": 0.7329, + "step": 2150 + }, + { + "epoch": 1.2131979695431472, + "grad_norm": 2.8850655555725098, + "learning_rate": 4.393683023124648e-05, + "loss": 0.8189, + "step": 2151 + }, + { + "epoch": 1.2137619853355894, + "grad_norm": 4.11080265045166, + "learning_rate": 4.393401015228427e-05, + "loss": 0.7945, + "step": 2152 + }, + { + "epoch": 1.2143260011280317, + "grad_norm": 2.8749804496765137, + "learning_rate": 4.3931190073322055e-05, + "loss": 0.8512, + "step": 2153 + }, + { + "epoch": 1.2148900169204737, + "grad_norm": 2.9970033168792725, + "learning_rate": 4.392836999435984e-05, + "loss": 0.7719, + "step": 2154 + }, + { + "epoch": 1.215454032712916, + "grad_norm": 5.223832130432129, + "learning_rate": 4.392554991539763e-05, + "loss": 1.0056, + "step": 2155 + }, + { + "epoch": 1.2160180485053582, + "grad_norm": 3.668938636779785, + "learning_rate": 4.3922729836435425e-05, + "loss": 0.8538, + "step": 2156 + }, + { + "epoch": 1.2165820642978002, + "grad_norm": 3.095707654953003, + "learning_rate": 4.391990975747321e-05, + "loss": 0.7473, + "step": 2157 + }, + { + "epoch": 1.2171460800902425, + "grad_norm": 4.056222438812256, + "learning_rate": 4.3917089678511e-05, + "loss": 0.9777, + "step": 2158 + }, + { + "epoch": 1.2177100958826848, + "grad_norm": 2.224691152572632, + "learning_rate": 4.391426959954879e-05, + "loss": 0.8175, + "step": 2159 + }, + { + "epoch": 1.218274111675127, + "grad_norm": 2.711400032043457, + "learning_rate": 4.391144952058658e-05, + "loss": 0.7583, + "step": 2160 + }, + { + "epoch": 1.218838127467569, + "grad_norm": 2.8553009033203125, + "learning_rate": 4.3908629441624365e-05, + "loss": 0.7166, + "step": 2161 + }, + { + "epoch": 1.2194021432600113, + "grad_norm": 4.871114253997803, + "learning_rate": 4.390580936266216e-05, + "loss": 0.8441, + "step": 2162 + }, + { + "epoch": 1.2199661590524535, + "grad_norm": 3.3297269344329834, + "learning_rate": 4.390298928369995e-05, + "loss": 0.7537, + "step": 2163 + }, + { + "epoch": 1.2205301748448956, + "grad_norm": 1.7021045684814453, + "learning_rate": 4.3900169204737735e-05, + "loss": 0.8309, + "step": 2164 + }, + { + "epoch": 1.2210941906373378, + "grad_norm": 4.275166988372803, + "learning_rate": 4.389734912577552e-05, + "loss": 1.0154, + "step": 2165 + }, + { + "epoch": 1.22165820642978, + "grad_norm": 4.458328723907471, + "learning_rate": 4.389452904681331e-05, + "loss": 0.8359, + "step": 2166 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 4.373507976531982, + "learning_rate": 4.3891708967851105e-05, + "loss": 0.9095, + "step": 2167 + }, + { + "epoch": 1.2227862380146644, + "grad_norm": 2.662008762359619, + "learning_rate": 4.388888888888889e-05, + "loss": 0.8031, + "step": 2168 + }, + { + "epoch": 1.2233502538071066, + "grad_norm": 2.458322286605835, + "learning_rate": 4.3886068809926676e-05, + "loss": 0.9507, + "step": 2169 + }, + { + "epoch": 1.2239142695995489, + "grad_norm": 3.4950430393218994, + "learning_rate": 4.3883248730964475e-05, + "loss": 0.8468, + "step": 2170 + }, + { + "epoch": 1.2244782853919909, + "grad_norm": 2.24760103225708, + "learning_rate": 4.388042865200226e-05, + "loss": 0.8611, + "step": 2171 + }, + { + "epoch": 1.2250423011844331, + "grad_norm": 1.5492864847183228, + "learning_rate": 4.3877608573040046e-05, + "loss": 0.8931, + "step": 2172 + }, + { + "epoch": 1.2256063169768754, + "grad_norm": 10.666678428649902, + "learning_rate": 4.387478849407783e-05, + "loss": 0.9151, + "step": 2173 + }, + { + "epoch": 1.2261703327693176, + "grad_norm": 3.5939571857452393, + "learning_rate": 4.387196841511563e-05, + "loss": 0.8121, + "step": 2174 + }, + { + "epoch": 1.2267343485617597, + "grad_norm": 4.418440341949463, + "learning_rate": 4.3869148336153415e-05, + "loss": 1.0151, + "step": 2175 + }, + { + "epoch": 1.227298364354202, + "grad_norm": 2.1269428730010986, + "learning_rate": 4.38663282571912e-05, + "loss": 0.8059, + "step": 2176 + }, + { + "epoch": 1.2278623801466442, + "grad_norm": 2.9684677124023438, + "learning_rate": 4.386350817822899e-05, + "loss": 0.8032, + "step": 2177 + }, + { + "epoch": 1.2284263959390862, + "grad_norm": 3.9565517902374268, + "learning_rate": 4.3860688099266785e-05, + "loss": 1.0648, + "step": 2178 + }, + { + "epoch": 1.2289904117315285, + "grad_norm": 2.473252534866333, + "learning_rate": 4.385786802030457e-05, + "loss": 0.9249, + "step": 2179 + }, + { + "epoch": 1.2295544275239707, + "grad_norm": 4.647927761077881, + "learning_rate": 4.3855047941342356e-05, + "loss": 0.7883, + "step": 2180 + }, + { + "epoch": 1.230118443316413, + "grad_norm": 3.691237688064575, + "learning_rate": 4.385222786238015e-05, + "loss": 0.8565, + "step": 2181 + }, + { + "epoch": 1.230682459108855, + "grad_norm": 2.4910011291503906, + "learning_rate": 4.384940778341794e-05, + "loss": 0.6606, + "step": 2182 + }, + { + "epoch": 1.2312464749012972, + "grad_norm": 3.069905996322632, + "learning_rate": 4.3846587704455726e-05, + "loss": 0.9133, + "step": 2183 + }, + { + "epoch": 1.2318104906937395, + "grad_norm": 6.810321807861328, + "learning_rate": 4.384376762549352e-05, + "loss": 1.1149, + "step": 2184 + }, + { + "epoch": 1.2323745064861815, + "grad_norm": 2.7976372241973877, + "learning_rate": 4.38409475465313e-05, + "loss": 0.714, + "step": 2185 + }, + { + "epoch": 1.2329385222786238, + "grad_norm": 3.4659180641174316, + "learning_rate": 4.3838127467569095e-05, + "loss": 0.7675, + "step": 2186 + }, + { + "epoch": 1.233502538071066, + "grad_norm": 4.196119785308838, + "learning_rate": 4.383530738860688e-05, + "loss": 1.043, + "step": 2187 + }, + { + "epoch": 1.2340665538635083, + "grad_norm": 3.3885979652404785, + "learning_rate": 4.383248730964467e-05, + "loss": 0.9806, + "step": 2188 + }, + { + "epoch": 1.2346305696559503, + "grad_norm": 5.549649238586426, + "learning_rate": 4.382966723068246e-05, + "loss": 1.2138, + "step": 2189 + }, + { + "epoch": 1.2351945854483926, + "grad_norm": 5.189638614654541, + "learning_rate": 4.382684715172025e-05, + "loss": 0.9915, + "step": 2190 + }, + { + "epoch": 1.2357586012408348, + "grad_norm": 2.276278018951416, + "learning_rate": 4.3824027072758036e-05, + "loss": 0.818, + "step": 2191 + }, + { + "epoch": 1.2363226170332768, + "grad_norm": 3.5665442943573, + "learning_rate": 4.382120699379583e-05, + "loss": 0.8001, + "step": 2192 + }, + { + "epoch": 1.236886632825719, + "grad_norm": 2.5886240005493164, + "learning_rate": 4.381838691483362e-05, + "loss": 0.7627, + "step": 2193 + }, + { + "epoch": 1.2374506486181613, + "grad_norm": 3.5997018814086914, + "learning_rate": 4.3815566835871406e-05, + "loss": 0.888, + "step": 2194 + }, + { + "epoch": 1.2380146644106036, + "grad_norm": 2.5860979557037354, + "learning_rate": 4.38127467569092e-05, + "loss": 1.0042, + "step": 2195 + }, + { + "epoch": 1.2385786802030456, + "grad_norm": 5.284895420074463, + "learning_rate": 4.3809926677946983e-05, + "loss": 1.1679, + "step": 2196 + }, + { + "epoch": 1.2391426959954879, + "grad_norm": 2.997577428817749, + "learning_rate": 4.3807106598984776e-05, + "loss": 0.7187, + "step": 2197 + }, + { + "epoch": 1.2397067117879301, + "grad_norm": 2.517648458480835, + "learning_rate": 4.380428652002256e-05, + "loss": 0.7521, + "step": 2198 + }, + { + "epoch": 1.2402707275803722, + "grad_norm": 3.603433132171631, + "learning_rate": 4.380146644106035e-05, + "loss": 0.8667, + "step": 2199 + }, + { + "epoch": 1.2408347433728144, + "grad_norm": 3.7608907222747803, + "learning_rate": 4.379864636209814e-05, + "loss": 0.8736, + "step": 2200 + }, + { + "epoch": 1.2413987591652567, + "grad_norm": 2.167787551879883, + "learning_rate": 4.379582628313593e-05, + "loss": 0.8319, + "step": 2201 + }, + { + "epoch": 1.241962774957699, + "grad_norm": 3.305814027786255, + "learning_rate": 4.379300620417372e-05, + "loss": 0.802, + "step": 2202 + }, + { + "epoch": 1.242526790750141, + "grad_norm": 2.0148088932037354, + "learning_rate": 4.379018612521151e-05, + "loss": 0.8936, + "step": 2203 + }, + { + "epoch": 1.2430908065425832, + "grad_norm": 2.800147533416748, + "learning_rate": 4.3787366046249294e-05, + "loss": 0.8246, + "step": 2204 + }, + { + "epoch": 1.2436548223350254, + "grad_norm": 2.254072427749634, + "learning_rate": 4.3784545967287086e-05, + "loss": 0.7819, + "step": 2205 + }, + { + "epoch": 1.2442188381274675, + "grad_norm": 2.5732953548431396, + "learning_rate": 4.378172588832488e-05, + "loss": 0.8812, + "step": 2206 + }, + { + "epoch": 1.2447828539199097, + "grad_norm": 3.5920441150665283, + "learning_rate": 4.3778905809362664e-05, + "loss": 0.8106, + "step": 2207 + }, + { + "epoch": 1.245346869712352, + "grad_norm": 2.608541488647461, + "learning_rate": 4.377608573040045e-05, + "loss": 0.8945, + "step": 2208 + }, + { + "epoch": 1.2459108855047942, + "grad_norm": 3.847038507461548, + "learning_rate": 4.377326565143824e-05, + "loss": 0.9067, + "step": 2209 + }, + { + "epoch": 1.2464749012972363, + "grad_norm": 2.687857151031494, + "learning_rate": 4.377044557247603e-05, + "loss": 0.8102, + "step": 2210 + }, + { + "epoch": 1.2470389170896785, + "grad_norm": 3.7515454292297363, + "learning_rate": 4.376762549351382e-05, + "loss": 1.0185, + "step": 2211 + }, + { + "epoch": 1.2476029328821208, + "grad_norm": 3.2205872535705566, + "learning_rate": 4.3764805414551604e-05, + "loss": 0.8921, + "step": 2212 + }, + { + "epoch": 1.2481669486745628, + "grad_norm": 2.2612593173980713, + "learning_rate": 4.37619853355894e-05, + "loss": 0.691, + "step": 2213 + }, + { + "epoch": 1.248730964467005, + "grad_norm": 3.9423458576202393, + "learning_rate": 4.375916525662719e-05, + "loss": 0.9591, + "step": 2214 + }, + { + "epoch": 1.2492949802594473, + "grad_norm": 2.897458076477051, + "learning_rate": 4.3756345177664974e-05, + "loss": 0.8502, + "step": 2215 + }, + { + "epoch": 1.2498589960518895, + "grad_norm": 6.46681022644043, + "learning_rate": 4.3753525098702766e-05, + "loss": 1.1639, + "step": 2216 + }, + { + "epoch": 1.2504230118443316, + "grad_norm": 2.596604824066162, + "learning_rate": 4.375070501974056e-05, + "loss": 0.9746, + "step": 2217 + }, + { + "epoch": 1.2509870276367738, + "grad_norm": 1.705689787864685, + "learning_rate": 4.3747884940778344e-05, + "loss": 0.7291, + "step": 2218 + }, + { + "epoch": 1.251551043429216, + "grad_norm": 5.68400764465332, + "learning_rate": 4.374506486181613e-05, + "loss": 1.0748, + "step": 2219 + }, + { + "epoch": 1.252115059221658, + "grad_norm": 2.2420599460601807, + "learning_rate": 4.374224478285392e-05, + "loss": 0.7799, + "step": 2220 + }, + { + "epoch": 1.2526790750141004, + "grad_norm": 6.170358180999756, + "learning_rate": 4.3739424703891713e-05, + "loss": 1.0449, + "step": 2221 + }, + { + "epoch": 1.2532430908065426, + "grad_norm": 3.3455049991607666, + "learning_rate": 4.37366046249295e-05, + "loss": 0.7999, + "step": 2222 + }, + { + "epoch": 1.2538071065989849, + "grad_norm": 2.6959688663482666, + "learning_rate": 4.373378454596729e-05, + "loss": 0.8041, + "step": 2223 + }, + { + "epoch": 1.254371122391427, + "grad_norm": 4.242905139923096, + "learning_rate": 4.3730964467005076e-05, + "loss": 0.9986, + "step": 2224 + }, + { + "epoch": 1.2549351381838691, + "grad_norm": 4.141283988952637, + "learning_rate": 4.372814438804287e-05, + "loss": 0.8722, + "step": 2225 + }, + { + "epoch": 1.2554991539763114, + "grad_norm": 2.326759099960327, + "learning_rate": 4.3725324309080654e-05, + "loss": 0.6871, + "step": 2226 + }, + { + "epoch": 1.2560631697687534, + "grad_norm": 2.763699531555176, + "learning_rate": 4.3722504230118446e-05, + "loss": 0.8275, + "step": 2227 + }, + { + "epoch": 1.2566271855611957, + "grad_norm": 3.0653631687164307, + "learning_rate": 4.371968415115624e-05, + "loss": 0.7873, + "step": 2228 + }, + { + "epoch": 1.257191201353638, + "grad_norm": 3.2030398845672607, + "learning_rate": 4.3716864072194024e-05, + "loss": 0.924, + "step": 2229 + }, + { + "epoch": 1.2577552171460802, + "grad_norm": 2.41568660736084, + "learning_rate": 4.371404399323181e-05, + "loss": 0.8542, + "step": 2230 + }, + { + "epoch": 1.2583192329385222, + "grad_norm": 3.039867639541626, + "learning_rate": 4.37112239142696e-05, + "loss": 0.9208, + "step": 2231 + }, + { + "epoch": 1.2588832487309645, + "grad_norm": 3.779513120651245, + "learning_rate": 4.3708403835307394e-05, + "loss": 1.1205, + "step": 2232 + }, + { + "epoch": 1.2594472645234067, + "grad_norm": 4.1074018478393555, + "learning_rate": 4.370558375634518e-05, + "loss": 0.8761, + "step": 2233 + }, + { + "epoch": 1.2600112803158487, + "grad_norm": 3.3486924171447754, + "learning_rate": 4.370276367738297e-05, + "loss": 0.9323, + "step": 2234 + }, + { + "epoch": 1.260575296108291, + "grad_norm": 1.823339581489563, + "learning_rate": 4.369994359842076e-05, + "loss": 0.7809, + "step": 2235 + }, + { + "epoch": 1.2611393119007333, + "grad_norm": 2.2052228450775146, + "learning_rate": 4.369712351945855e-05, + "loss": 0.7751, + "step": 2236 + }, + { + "epoch": 1.2617033276931755, + "grad_norm": 3.216566801071167, + "learning_rate": 4.3694303440496334e-05, + "loss": 1.0295, + "step": 2237 + }, + { + "epoch": 1.2622673434856175, + "grad_norm": 9.8163480758667, + "learning_rate": 4.3691483361534126e-05, + "loss": 1.3902, + "step": 2238 + }, + { + "epoch": 1.2628313592780598, + "grad_norm": 3.6132984161376953, + "learning_rate": 4.368866328257191e-05, + "loss": 0.856, + "step": 2239 + }, + { + "epoch": 1.263395375070502, + "grad_norm": 3.1158978939056396, + "learning_rate": 4.3685843203609704e-05, + "loss": 0.6921, + "step": 2240 + }, + { + "epoch": 1.263959390862944, + "grad_norm": 2.722532033920288, + "learning_rate": 4.3683023124647496e-05, + "loss": 0.7759, + "step": 2241 + }, + { + "epoch": 1.2645234066553863, + "grad_norm": 3.2954368591308594, + "learning_rate": 4.368020304568528e-05, + "loss": 0.7646, + "step": 2242 + }, + { + "epoch": 1.2650874224478286, + "grad_norm": 3.97165584564209, + "learning_rate": 4.367738296672307e-05, + "loss": 0.9669, + "step": 2243 + }, + { + "epoch": 1.2656514382402708, + "grad_norm": 3.1469929218292236, + "learning_rate": 4.367456288776086e-05, + "loss": 0.8488, + "step": 2244 + }, + { + "epoch": 1.2662154540327129, + "grad_norm": 3.518319845199585, + "learning_rate": 4.367174280879865e-05, + "loss": 0.9015, + "step": 2245 + }, + { + "epoch": 1.266779469825155, + "grad_norm": 2.9194235801696777, + "learning_rate": 4.366892272983644e-05, + "loss": 0.8842, + "step": 2246 + }, + { + "epoch": 1.2673434856175974, + "grad_norm": 2.650631904602051, + "learning_rate": 4.366610265087422e-05, + "loss": 0.8709, + "step": 2247 + }, + { + "epoch": 1.2679075014100394, + "grad_norm": 4.111074924468994, + "learning_rate": 4.3663282571912014e-05, + "loss": 0.9377, + "step": 2248 + }, + { + "epoch": 1.2684715172024816, + "grad_norm": 2.0897693634033203, + "learning_rate": 4.3660462492949807e-05, + "loss": 0.7377, + "step": 2249 + }, + { + "epoch": 1.2690355329949239, + "grad_norm": 4.589169502258301, + "learning_rate": 4.365764241398759e-05, + "loss": 0.9038, + "step": 2250 + }, + { + "epoch": 1.2695995487873661, + "grad_norm": 3.9021494388580322, + "learning_rate": 4.365482233502538e-05, + "loss": 0.9982, + "step": 2251 + }, + { + "epoch": 1.2701635645798082, + "grad_norm": 6.634449005126953, + "learning_rate": 4.3652002256063176e-05, + "loss": 0.8719, + "step": 2252 + }, + { + "epoch": 1.2707275803722504, + "grad_norm": 4.881796360015869, + "learning_rate": 4.364918217710096e-05, + "loss": 1.2385, + "step": 2253 + }, + { + "epoch": 1.2712915961646927, + "grad_norm": 1.8891854286193848, + "learning_rate": 4.364636209813875e-05, + "loss": 0.6782, + "step": 2254 + }, + { + "epoch": 1.2718556119571347, + "grad_norm": 1.831498622894287, + "learning_rate": 4.364354201917654e-05, + "loss": 0.8888, + "step": 2255 + }, + { + "epoch": 1.272419627749577, + "grad_norm": 3.861952781677246, + "learning_rate": 4.364072194021433e-05, + "loss": 0.8389, + "step": 2256 + }, + { + "epoch": 1.2729836435420192, + "grad_norm": 4.487656593322754, + "learning_rate": 4.363790186125212e-05, + "loss": 0.8146, + "step": 2257 + }, + { + "epoch": 1.2735476593344615, + "grad_norm": 5.827868461608887, + "learning_rate": 4.36350817822899e-05, + "loss": 0.8116, + "step": 2258 + }, + { + "epoch": 1.2741116751269035, + "grad_norm": 5.032545566558838, + "learning_rate": 4.3632261703327695e-05, + "loss": 1.0158, + "step": 2259 + }, + { + "epoch": 1.2746756909193457, + "grad_norm": 3.274500846862793, + "learning_rate": 4.362944162436549e-05, + "loss": 1.0181, + "step": 2260 + }, + { + "epoch": 1.275239706711788, + "grad_norm": 2.818551540374756, + "learning_rate": 4.362662154540327e-05, + "loss": 0.9269, + "step": 2261 + }, + { + "epoch": 1.27580372250423, + "grad_norm": 5.41886568069458, + "learning_rate": 4.3623801466441064e-05, + "loss": 0.9682, + "step": 2262 + }, + { + "epoch": 1.2763677382966723, + "grad_norm": 3.4859793186187744, + "learning_rate": 4.3620981387478856e-05, + "loss": 0.7812, + "step": 2263 + }, + { + "epoch": 1.2769317540891145, + "grad_norm": 4.30391263961792, + "learning_rate": 4.361816130851664e-05, + "loss": 0.92, + "step": 2264 + }, + { + "epoch": 1.2774957698815568, + "grad_norm": 4.104955673217773, + "learning_rate": 4.361534122955443e-05, + "loss": 0.9733, + "step": 2265 + }, + { + "epoch": 1.2780597856739988, + "grad_norm": 2.3948259353637695, + "learning_rate": 4.361252115059222e-05, + "loss": 0.7567, + "step": 2266 + }, + { + "epoch": 1.278623801466441, + "grad_norm": 2.8198745250701904, + "learning_rate": 4.360970107163001e-05, + "loss": 0.8989, + "step": 2267 + }, + { + "epoch": 1.2791878172588833, + "grad_norm": 3.1658644676208496, + "learning_rate": 4.36068809926678e-05, + "loss": 0.8359, + "step": 2268 + }, + { + "epoch": 1.2797518330513253, + "grad_norm": 3.4356658458709717, + "learning_rate": 4.360406091370558e-05, + "loss": 0.9821, + "step": 2269 + }, + { + "epoch": 1.2803158488437676, + "grad_norm": 4.691191673278809, + "learning_rate": 4.3601240834743375e-05, + "loss": 1.0283, + "step": 2270 + }, + { + "epoch": 1.2808798646362098, + "grad_norm": 2.341146945953369, + "learning_rate": 4.359842075578117e-05, + "loss": 0.7308, + "step": 2271 + }, + { + "epoch": 1.281443880428652, + "grad_norm": 4.457566261291504, + "learning_rate": 4.359560067681895e-05, + "loss": 1.0215, + "step": 2272 + }, + { + "epoch": 1.2820078962210941, + "grad_norm": 4.936463832855225, + "learning_rate": 4.3592780597856744e-05, + "loss": 0.8452, + "step": 2273 + }, + { + "epoch": 1.2825719120135364, + "grad_norm": 2.8520610332489014, + "learning_rate": 4.358996051889453e-05, + "loss": 0.9325, + "step": 2274 + }, + { + "epoch": 1.2831359278059786, + "grad_norm": 3.7956063747406006, + "learning_rate": 4.358714043993232e-05, + "loss": 0.7899, + "step": 2275 + }, + { + "epoch": 1.2836999435984207, + "grad_norm": 5.576205730438232, + "learning_rate": 4.358432036097011e-05, + "loss": 1.1224, + "step": 2276 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 3.8631374835968018, + "learning_rate": 4.35815002820079e-05, + "loss": 0.8597, + "step": 2277 + }, + { + "epoch": 1.2848279751833052, + "grad_norm": 2.2099435329437256, + "learning_rate": 4.3578680203045685e-05, + "loss": 0.8076, + "step": 2278 + }, + { + "epoch": 1.2853919909757474, + "grad_norm": 2.3940844535827637, + "learning_rate": 4.357586012408348e-05, + "loss": 0.7891, + "step": 2279 + }, + { + "epoch": 1.2859560067681894, + "grad_norm": 3.21038818359375, + "learning_rate": 4.357304004512127e-05, + "loss": 0.8818, + "step": 2280 + }, + { + "epoch": 1.2865200225606317, + "grad_norm": 2.065847396850586, + "learning_rate": 4.3570219966159055e-05, + "loss": 0.75, + "step": 2281 + }, + { + "epoch": 1.287084038353074, + "grad_norm": 3.7226791381835938, + "learning_rate": 4.356739988719684e-05, + "loss": 0.8565, + "step": 2282 + }, + { + "epoch": 1.287648054145516, + "grad_norm": 4.709227561950684, + "learning_rate": 4.356457980823463e-05, + "loss": 0.9615, + "step": 2283 + }, + { + "epoch": 1.2882120699379582, + "grad_norm": 2.529489517211914, + "learning_rate": 4.3561759729272425e-05, + "loss": 1.0003, + "step": 2284 + }, + { + "epoch": 1.2887760857304005, + "grad_norm": 2.5278029441833496, + "learning_rate": 4.355893965031021e-05, + "loss": 0.7736, + "step": 2285 + }, + { + "epoch": 1.2893401015228427, + "grad_norm": 2.9218666553497314, + "learning_rate": 4.3556119571347995e-05, + "loss": 0.8929, + "step": 2286 + }, + { + "epoch": 1.2899041173152848, + "grad_norm": 2.6127769947052, + "learning_rate": 4.355329949238579e-05, + "loss": 0.9481, + "step": 2287 + }, + { + "epoch": 1.290468133107727, + "grad_norm": 3.6320297718048096, + "learning_rate": 4.355047941342358e-05, + "loss": 1.0387, + "step": 2288 + }, + { + "epoch": 1.2910321489001693, + "grad_norm": 2.5385990142822266, + "learning_rate": 4.3547659334461365e-05, + "loss": 0.8931, + "step": 2289 + }, + { + "epoch": 1.2915961646926113, + "grad_norm": 4.809693813323975, + "learning_rate": 4.354483925549915e-05, + "loss": 1.0242, + "step": 2290 + }, + { + "epoch": 1.2921601804850535, + "grad_norm": 2.5849130153656006, + "learning_rate": 4.354201917653695e-05, + "loss": 0.8434, + "step": 2291 + }, + { + "epoch": 1.2927241962774958, + "grad_norm": 3.170881986618042, + "learning_rate": 4.3539199097574735e-05, + "loss": 0.8223, + "step": 2292 + }, + { + "epoch": 1.293288212069938, + "grad_norm": 1.4191616773605347, + "learning_rate": 4.353637901861252e-05, + "loss": 0.8063, + "step": 2293 + }, + { + "epoch": 1.29385222786238, + "grad_norm": 4.702634811401367, + "learning_rate": 4.353355893965031e-05, + "loss": 0.9149, + "step": 2294 + }, + { + "epoch": 1.2944162436548223, + "grad_norm": 4.243208885192871, + "learning_rate": 4.3530738860688105e-05, + "loss": 0.9267, + "step": 2295 + }, + { + "epoch": 1.2949802594472646, + "grad_norm": 2.336121082305908, + "learning_rate": 4.352791878172589e-05, + "loss": 0.8107, + "step": 2296 + }, + { + "epoch": 1.2955442752397066, + "grad_norm": 2.1538398265838623, + "learning_rate": 4.3525098702763676e-05, + "loss": 0.7895, + "step": 2297 + }, + { + "epoch": 1.2961082910321489, + "grad_norm": 3.477349042892456, + "learning_rate": 4.3522278623801475e-05, + "loss": 0.9108, + "step": 2298 + }, + { + "epoch": 1.2966723068245911, + "grad_norm": 2.814889430999756, + "learning_rate": 4.351945854483926e-05, + "loss": 0.7863, + "step": 2299 + }, + { + "epoch": 1.2972363226170334, + "grad_norm": 3.077761650085449, + "learning_rate": 4.3516638465877045e-05, + "loss": 0.8944, + "step": 2300 + }, + { + "epoch": 1.2978003384094754, + "grad_norm": 25.90468978881836, + "learning_rate": 4.351381838691484e-05, + "loss": 0.9677, + "step": 2301 + }, + { + "epoch": 1.2983643542019176, + "grad_norm": 3.375361442565918, + "learning_rate": 4.351099830795263e-05, + "loss": 0.9601, + "step": 2302 + }, + { + "epoch": 1.29892836999436, + "grad_norm": 4.372297286987305, + "learning_rate": 4.3508178228990415e-05, + "loss": 0.8692, + "step": 2303 + }, + { + "epoch": 1.299492385786802, + "grad_norm": 2.8490960597991943, + "learning_rate": 4.35053581500282e-05, + "loss": 0.7994, + "step": 2304 + }, + { + "epoch": 1.3000564015792442, + "grad_norm": 2.977151870727539, + "learning_rate": 4.350253807106599e-05, + "loss": 0.8523, + "step": 2305 + }, + { + "epoch": 1.3006204173716864, + "grad_norm": 2.6469385623931885, + "learning_rate": 4.3499717992103785e-05, + "loss": 0.7723, + "step": 2306 + }, + { + "epoch": 1.3011844331641287, + "grad_norm": 2.4402058124542236, + "learning_rate": 4.349689791314157e-05, + "loss": 0.8745, + "step": 2307 + }, + { + "epoch": 1.3017484489565707, + "grad_norm": 3.8233954906463623, + "learning_rate": 4.3494077834179356e-05, + "loss": 0.8957, + "step": 2308 + }, + { + "epoch": 1.302312464749013, + "grad_norm": 1.553084135055542, + "learning_rate": 4.349125775521715e-05, + "loss": 0.7418, + "step": 2309 + }, + { + "epoch": 1.3028764805414552, + "grad_norm": 2.7869038581848145, + "learning_rate": 4.348843767625494e-05, + "loss": 0.9098, + "step": 2310 + }, + { + "epoch": 1.3034404963338972, + "grad_norm": 2.554579734802246, + "learning_rate": 4.3485617597292725e-05, + "loss": 0.9035, + "step": 2311 + }, + { + "epoch": 1.3040045121263395, + "grad_norm": 3.806363344192505, + "learning_rate": 4.348279751833052e-05, + "loss": 0.8553, + "step": 2312 + }, + { + "epoch": 1.3045685279187818, + "grad_norm": 2.6800284385681152, + "learning_rate": 4.34799774393683e-05, + "loss": 0.7859, + "step": 2313 + }, + { + "epoch": 1.305132543711224, + "grad_norm": 2.3737385272979736, + "learning_rate": 4.3477157360406095e-05, + "loss": 0.7326, + "step": 2314 + }, + { + "epoch": 1.305696559503666, + "grad_norm": 3.7374887466430664, + "learning_rate": 4.347433728144388e-05, + "loss": 0.9415, + "step": 2315 + }, + { + "epoch": 1.3062605752961083, + "grad_norm": 2.2226338386535645, + "learning_rate": 4.347151720248167e-05, + "loss": 0.8144, + "step": 2316 + }, + { + "epoch": 1.3068245910885505, + "grad_norm": 5.41491174697876, + "learning_rate": 4.346869712351946e-05, + "loss": 1.173, + "step": 2317 + }, + { + "epoch": 1.3073886068809926, + "grad_norm": 2.5569815635681152, + "learning_rate": 4.346587704455725e-05, + "loss": 0.7788, + "step": 2318 + }, + { + "epoch": 1.3079526226734348, + "grad_norm": 4.123485088348389, + "learning_rate": 4.346305696559504e-05, + "loss": 0.9311, + "step": 2319 + }, + { + "epoch": 1.308516638465877, + "grad_norm": 3.474733591079712, + "learning_rate": 4.346023688663283e-05, + "loss": 0.799, + "step": 2320 + }, + { + "epoch": 1.3090806542583193, + "grad_norm": 4.630918979644775, + "learning_rate": 4.3457416807670613e-05, + "loss": 1.0848, + "step": 2321 + }, + { + "epoch": 1.3096446700507614, + "grad_norm": 2.1592249870300293, + "learning_rate": 4.3454596728708406e-05, + "loss": 0.7564, + "step": 2322 + }, + { + "epoch": 1.3102086858432036, + "grad_norm": 2.9226701259613037, + "learning_rate": 4.34517766497462e-05, + "loss": 0.8899, + "step": 2323 + }, + { + "epoch": 1.3107727016356459, + "grad_norm": 6.808071613311768, + "learning_rate": 4.344895657078398e-05, + "loss": 1.1291, + "step": 2324 + }, + { + "epoch": 1.3113367174280879, + "grad_norm": 2.9479386806488037, + "learning_rate": 4.344613649182177e-05, + "loss": 0.7251, + "step": 2325 + }, + { + "epoch": 1.3119007332205301, + "grad_norm": 4.243619441986084, + "learning_rate": 4.344331641285956e-05, + "loss": 1.0591, + "step": 2326 + }, + { + "epoch": 1.3124647490129724, + "grad_norm": 4.420961380004883, + "learning_rate": 4.344049633389735e-05, + "loss": 1.112, + "step": 2327 + }, + { + "epoch": 1.3130287648054146, + "grad_norm": 2.9413740634918213, + "learning_rate": 4.343767625493514e-05, + "loss": 0.77, + "step": 2328 + }, + { + "epoch": 1.3135927805978567, + "grad_norm": 2.0313642024993896, + "learning_rate": 4.3434856175972924e-05, + "loss": 0.7297, + "step": 2329 + }, + { + "epoch": 1.314156796390299, + "grad_norm": 2.3640549182891846, + "learning_rate": 4.343203609701072e-05, + "loss": 0.8033, + "step": 2330 + }, + { + "epoch": 1.3147208121827412, + "grad_norm": 2.1214189529418945, + "learning_rate": 4.342921601804851e-05, + "loss": 0.9606, + "step": 2331 + }, + { + "epoch": 1.3152848279751832, + "grad_norm": 6.101818084716797, + "learning_rate": 4.3426395939086294e-05, + "loss": 0.9523, + "step": 2332 + }, + { + "epoch": 1.3158488437676255, + "grad_norm": 5.601726055145264, + "learning_rate": 4.3423575860124086e-05, + "loss": 1.101, + "step": 2333 + }, + { + "epoch": 1.3164128595600677, + "grad_norm": 1.4370383024215698, + "learning_rate": 4.342075578116188e-05, + "loss": 0.7088, + "step": 2334 + }, + { + "epoch": 1.31697687535251, + "grad_norm": 1.9492497444152832, + "learning_rate": 4.341793570219966e-05, + "loss": 0.8682, + "step": 2335 + }, + { + "epoch": 1.317540891144952, + "grad_norm": 2.419466257095337, + "learning_rate": 4.341511562323745e-05, + "loss": 0.8917, + "step": 2336 + }, + { + "epoch": 1.3181049069373942, + "grad_norm": 1.8325120210647583, + "learning_rate": 4.341229554427525e-05, + "loss": 0.7953, + "step": 2337 + }, + { + "epoch": 1.3186689227298365, + "grad_norm": 5.129246711730957, + "learning_rate": 4.340947546531303e-05, + "loss": 0.7709, + "step": 2338 + }, + { + "epoch": 1.3192329385222785, + "grad_norm": 4.29916524887085, + "learning_rate": 4.340665538635082e-05, + "loss": 1.0045, + "step": 2339 + }, + { + "epoch": 1.3197969543147208, + "grad_norm": 3.052858352661133, + "learning_rate": 4.3403835307388604e-05, + "loss": 0.9444, + "step": 2340 + }, + { + "epoch": 1.320360970107163, + "grad_norm": 2.9553537368774414, + "learning_rate": 4.34010152284264e-05, + "loss": 0.9752, + "step": 2341 + }, + { + "epoch": 1.3209249858996053, + "grad_norm": 3.1385443210601807, + "learning_rate": 4.339819514946419e-05, + "loss": 0.7746, + "step": 2342 + }, + { + "epoch": 1.3214890016920473, + "grad_norm": 5.728468418121338, + "learning_rate": 4.3395375070501974e-05, + "loss": 0.9378, + "step": 2343 + }, + { + "epoch": 1.3220530174844896, + "grad_norm": 4.114583969116211, + "learning_rate": 4.3392554991539766e-05, + "loss": 0.8373, + "step": 2344 + }, + { + "epoch": 1.3226170332769318, + "grad_norm": 1.7576569318771362, + "learning_rate": 4.338973491257756e-05, + "loss": 0.7973, + "step": 2345 + }, + { + "epoch": 1.3231810490693738, + "grad_norm": 3.527146816253662, + "learning_rate": 4.3386914833615344e-05, + "loss": 0.7911, + "step": 2346 + }, + { + "epoch": 1.323745064861816, + "grad_norm": 3.175243616104126, + "learning_rate": 4.338409475465313e-05, + "loss": 0.7982, + "step": 2347 + }, + { + "epoch": 1.3243090806542583, + "grad_norm": 4.0751824378967285, + "learning_rate": 4.338127467569092e-05, + "loss": 0.906, + "step": 2348 + }, + { + "epoch": 1.3248730964467006, + "grad_norm": 2.9161291122436523, + "learning_rate": 4.337845459672871e-05, + "loss": 0.8841, + "step": 2349 + }, + { + "epoch": 1.3254371122391426, + "grad_norm": 3.001725196838379, + "learning_rate": 4.33756345177665e-05, + "loss": 0.9732, + "step": 2350 + }, + { + "epoch": 1.3260011280315849, + "grad_norm": 4.356836795806885, + "learning_rate": 4.337281443880429e-05, + "loss": 0.9347, + "step": 2351 + }, + { + "epoch": 1.3265651438240271, + "grad_norm": 3.6001076698303223, + "learning_rate": 4.3369994359842076e-05, + "loss": 0.8007, + "step": 2352 + }, + { + "epoch": 1.3271291596164692, + "grad_norm": 2.6812539100646973, + "learning_rate": 4.336717428087987e-05, + "loss": 0.8937, + "step": 2353 + }, + { + "epoch": 1.3276931754089114, + "grad_norm": 1.8899224996566772, + "learning_rate": 4.3364354201917654e-05, + "loss": 0.7671, + "step": 2354 + }, + { + "epoch": 1.3282571912013537, + "grad_norm": 3.0196588039398193, + "learning_rate": 4.3361534122955446e-05, + "loss": 0.7823, + "step": 2355 + }, + { + "epoch": 1.328821206993796, + "grad_norm": 4.635570526123047, + "learning_rate": 4.335871404399323e-05, + "loss": 0.945, + "step": 2356 + }, + { + "epoch": 1.329385222786238, + "grad_norm": 1.9717237949371338, + "learning_rate": 4.3355893965031024e-05, + "loss": 0.8317, + "step": 2357 + }, + { + "epoch": 1.3299492385786802, + "grad_norm": 2.1274573802948, + "learning_rate": 4.335307388606881e-05, + "loss": 0.6838, + "step": 2358 + }, + { + "epoch": 1.3305132543711224, + "grad_norm": 3.792177677154541, + "learning_rate": 4.33502538071066e-05, + "loss": 0.8531, + "step": 2359 + }, + { + "epoch": 1.3310772701635645, + "grad_norm": 3.226926565170288, + "learning_rate": 4.334743372814439e-05, + "loss": 0.8329, + "step": 2360 + }, + { + "epoch": 1.3316412859560067, + "grad_norm": 1.963423490524292, + "learning_rate": 4.334461364918218e-05, + "loss": 0.6964, + "step": 2361 + }, + { + "epoch": 1.332205301748449, + "grad_norm": 3.466524839401245, + "learning_rate": 4.334179357021997e-05, + "loss": 0.8592, + "step": 2362 + }, + { + "epoch": 1.3327693175408912, + "grad_norm": 3.9662399291992188, + "learning_rate": 4.3338973491257756e-05, + "loss": 0.8593, + "step": 2363 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 3.792027235031128, + "learning_rate": 4.333615341229554e-05, + "loss": 0.813, + "step": 2364 + }, + { + "epoch": 1.3338973491257755, + "grad_norm": 5.34633207321167, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.9738, + "step": 2365 + }, + { + "epoch": 1.3344613649182178, + "grad_norm": 3.908031463623047, + "learning_rate": 4.3330513254371126e-05, + "loss": 0.9547, + "step": 2366 + }, + { + "epoch": 1.3350253807106598, + "grad_norm": 2.689704418182373, + "learning_rate": 4.332769317540891e-05, + "loss": 0.7912, + "step": 2367 + }, + { + "epoch": 1.335589396503102, + "grad_norm": 4.451303958892822, + "learning_rate": 4.3324873096446704e-05, + "loss": 0.9473, + "step": 2368 + }, + { + "epoch": 1.3361534122955443, + "grad_norm": 4.024834632873535, + "learning_rate": 4.3322053017484496e-05, + "loss": 1.0586, + "step": 2369 + }, + { + "epoch": 1.3367174280879865, + "grad_norm": 4.451879501342773, + "learning_rate": 4.331923293852228e-05, + "loss": 1.1025, + "step": 2370 + }, + { + "epoch": 1.3372814438804286, + "grad_norm": 3.625438690185547, + "learning_rate": 4.331641285956007e-05, + "loss": 0.8625, + "step": 2371 + }, + { + "epoch": 1.3378454596728708, + "grad_norm": 1.871866226196289, + "learning_rate": 4.331359278059786e-05, + "loss": 0.7423, + "step": 2372 + }, + { + "epoch": 1.338409475465313, + "grad_norm": 4.421525001525879, + "learning_rate": 4.331077270163565e-05, + "loss": 1.2046, + "step": 2373 + }, + { + "epoch": 1.338973491257755, + "grad_norm": 3.8372604846954346, + "learning_rate": 4.3307952622673437e-05, + "loss": 0.9317, + "step": 2374 + }, + { + "epoch": 1.3395375070501974, + "grad_norm": 2.3898255825042725, + "learning_rate": 4.330513254371122e-05, + "loss": 0.9984, + "step": 2375 + }, + { + "epoch": 1.3401015228426396, + "grad_norm": 3.7653634548187256, + "learning_rate": 4.3302312464749014e-05, + "loss": 0.6995, + "step": 2376 + }, + { + "epoch": 1.3406655386350819, + "grad_norm": 3.7549285888671875, + "learning_rate": 4.3299492385786806e-05, + "loss": 0.8791, + "step": 2377 + }, + { + "epoch": 1.341229554427524, + "grad_norm": 1.6765230894088745, + "learning_rate": 4.329667230682459e-05, + "loss": 0.7893, + "step": 2378 + }, + { + "epoch": 1.3417935702199661, + "grad_norm": 3.999744415283203, + "learning_rate": 4.329385222786238e-05, + "loss": 0.9771, + "step": 2379 + }, + { + "epoch": 1.3423575860124084, + "grad_norm": 1.7921088933944702, + "learning_rate": 4.3291032148900176e-05, + "loss": 0.7688, + "step": 2380 + }, + { + "epoch": 1.3429216018048504, + "grad_norm": 4.77140998840332, + "learning_rate": 4.328821206993796e-05, + "loss": 0.9233, + "step": 2381 + }, + { + "epoch": 1.3434856175972927, + "grad_norm": 4.265617847442627, + "learning_rate": 4.328539199097575e-05, + "loss": 0.8813, + "step": 2382 + }, + { + "epoch": 1.344049633389735, + "grad_norm": 2.7927944660186768, + "learning_rate": 4.328257191201354e-05, + "loss": 0.7858, + "step": 2383 + }, + { + "epoch": 1.3446136491821772, + "grad_norm": 3.9609217643737793, + "learning_rate": 4.327975183305133e-05, + "loss": 0.9327, + "step": 2384 + }, + { + "epoch": 1.3451776649746192, + "grad_norm": 2.5695133209228516, + "learning_rate": 4.327693175408912e-05, + "loss": 0.8184, + "step": 2385 + }, + { + "epoch": 1.3457416807670615, + "grad_norm": 3.2145020961761475, + "learning_rate": 4.32741116751269e-05, + "loss": 0.8453, + "step": 2386 + }, + { + "epoch": 1.3463056965595037, + "grad_norm": 2.811419725418091, + "learning_rate": 4.3271291596164694e-05, + "loss": 0.9127, + "step": 2387 + }, + { + "epoch": 1.3468697123519457, + "grad_norm": 2.293226480484009, + "learning_rate": 4.3268471517202487e-05, + "loss": 0.856, + "step": 2388 + }, + { + "epoch": 1.347433728144388, + "grad_norm": 2.5743095874786377, + "learning_rate": 4.326565143824027e-05, + "loss": 0.8681, + "step": 2389 + }, + { + "epoch": 1.3479977439368303, + "grad_norm": 4.385245323181152, + "learning_rate": 4.3262831359278064e-05, + "loss": 0.9496, + "step": 2390 + }, + { + "epoch": 1.3485617597292725, + "grad_norm": 2.527973175048828, + "learning_rate": 4.326001128031585e-05, + "loss": 0.896, + "step": 2391 + }, + { + "epoch": 1.3491257755217145, + "grad_norm": 1.8402235507965088, + "learning_rate": 4.325719120135364e-05, + "loss": 0.8761, + "step": 2392 + }, + { + "epoch": 1.3496897913141568, + "grad_norm": 1.924173355102539, + "learning_rate": 4.325437112239143e-05, + "loss": 0.8147, + "step": 2393 + }, + { + "epoch": 1.350253807106599, + "grad_norm": 1.9167754650115967, + "learning_rate": 4.325155104342922e-05, + "loss": 0.7476, + "step": 2394 + }, + { + "epoch": 1.350817822899041, + "grad_norm": 2.8154242038726807, + "learning_rate": 4.3248730964467005e-05, + "loss": 0.8955, + "step": 2395 + }, + { + "epoch": 1.3513818386914833, + "grad_norm": 2.111995220184326, + "learning_rate": 4.32459108855048e-05, + "loss": 0.8491, + "step": 2396 + }, + { + "epoch": 1.3519458544839256, + "grad_norm": 3.2789499759674072, + "learning_rate": 4.324309080654258e-05, + "loss": 0.9784, + "step": 2397 + }, + { + "epoch": 1.3525098702763678, + "grad_norm": 3.8975322246551514, + "learning_rate": 4.3240270727580375e-05, + "loss": 0.9012, + "step": 2398 + }, + { + "epoch": 1.3530738860688099, + "grad_norm": 3.8877367973327637, + "learning_rate": 4.323745064861816e-05, + "loss": 0.7643, + "step": 2399 + }, + { + "epoch": 1.353637901861252, + "grad_norm": 2.9582765102386475, + "learning_rate": 4.323463056965595e-05, + "loss": 0.8782, + "step": 2400 + }, + { + "epoch": 1.3542019176536944, + "grad_norm": 3.5888350009918213, + "learning_rate": 4.3231810490693744e-05, + "loss": 0.8734, + "step": 2401 + }, + { + "epoch": 1.3547659334461364, + "grad_norm": 1.5518797636032104, + "learning_rate": 4.322899041173153e-05, + "loss": 0.7462, + "step": 2402 + }, + { + "epoch": 1.3553299492385786, + "grad_norm": 1.7575122117996216, + "learning_rate": 4.322617033276932e-05, + "loss": 0.6177, + "step": 2403 + }, + { + "epoch": 1.3558939650310209, + "grad_norm": 3.843905448913574, + "learning_rate": 4.322335025380711e-05, + "loss": 0.876, + "step": 2404 + }, + { + "epoch": 1.3564579808234631, + "grad_norm": 5.090063095092773, + "learning_rate": 4.32205301748449e-05, + "loss": 0.8365, + "step": 2405 + }, + { + "epoch": 1.3570219966159052, + "grad_norm": 4.217265605926514, + "learning_rate": 4.3217710095882685e-05, + "loss": 0.9936, + "step": 2406 + }, + { + "epoch": 1.3575860124083474, + "grad_norm": 3.115016222000122, + "learning_rate": 4.321489001692048e-05, + "loss": 0.8045, + "step": 2407 + }, + { + "epoch": 1.3581500282007897, + "grad_norm": 3.7476308345794678, + "learning_rate": 4.321206993795827e-05, + "loss": 0.8732, + "step": 2408 + }, + { + "epoch": 1.3587140439932317, + "grad_norm": 4.757698059082031, + "learning_rate": 4.3209249858996055e-05, + "loss": 0.8449, + "step": 2409 + }, + { + "epoch": 1.359278059785674, + "grad_norm": 3.247448205947876, + "learning_rate": 4.320642978003384e-05, + "loss": 0.8187, + "step": 2410 + }, + { + "epoch": 1.3598420755781162, + "grad_norm": 1.6593856811523438, + "learning_rate": 4.320360970107163e-05, + "loss": 0.8311, + "step": 2411 + }, + { + "epoch": 1.3604060913705585, + "grad_norm": 3.2789723873138428, + "learning_rate": 4.3200789622109424e-05, + "loss": 0.8291, + "step": 2412 + }, + { + "epoch": 1.3609701071630005, + "grad_norm": 3.4748873710632324, + "learning_rate": 4.319796954314721e-05, + "loss": 0.8945, + "step": 2413 + }, + { + "epoch": 1.3615341229554427, + "grad_norm": 2.070493698120117, + "learning_rate": 4.3195149464184995e-05, + "loss": 0.8195, + "step": 2414 + }, + { + "epoch": 1.362098138747885, + "grad_norm": 2.382716178894043, + "learning_rate": 4.319232938522279e-05, + "loss": 0.9191, + "step": 2415 + }, + { + "epoch": 1.362662154540327, + "grad_norm": 2.946866989135742, + "learning_rate": 4.318950930626058e-05, + "loss": 0.8338, + "step": 2416 + }, + { + "epoch": 1.3632261703327693, + "grad_norm": 3.3594236373901367, + "learning_rate": 4.3186689227298365e-05, + "loss": 0.8558, + "step": 2417 + }, + { + "epoch": 1.3637901861252115, + "grad_norm": 3.13407826423645, + "learning_rate": 4.318386914833615e-05, + "loss": 0.8496, + "step": 2418 + }, + { + "epoch": 1.3643542019176538, + "grad_norm": 2.277627944946289, + "learning_rate": 4.318104906937395e-05, + "loss": 0.7712, + "step": 2419 + }, + { + "epoch": 1.3649182177100958, + "grad_norm": 2.997520685195923, + "learning_rate": 4.3178228990411735e-05, + "loss": 0.6944, + "step": 2420 + }, + { + "epoch": 1.365482233502538, + "grad_norm": 3.627260208129883, + "learning_rate": 4.317540891144952e-05, + "loss": 0.8279, + "step": 2421 + }, + { + "epoch": 1.3660462492949803, + "grad_norm": 2.393641233444214, + "learning_rate": 4.317258883248731e-05, + "loss": 0.8367, + "step": 2422 + }, + { + "epoch": 1.3666102650874223, + "grad_norm": 3.102307081222534, + "learning_rate": 4.3169768753525105e-05, + "loss": 0.809, + "step": 2423 + }, + { + "epoch": 1.3671742808798646, + "grad_norm": 5.308002948760986, + "learning_rate": 4.316694867456289e-05, + "loss": 1.1344, + "step": 2424 + }, + { + "epoch": 1.3677382966723068, + "grad_norm": 4.887362480163574, + "learning_rate": 4.3164128595600675e-05, + "loss": 0.9069, + "step": 2425 + }, + { + "epoch": 1.368302312464749, + "grad_norm": 3.1149330139160156, + "learning_rate": 4.316130851663847e-05, + "loss": 1.0009, + "step": 2426 + }, + { + "epoch": 1.3688663282571911, + "grad_norm": 2.0758249759674072, + "learning_rate": 4.315848843767626e-05, + "loss": 0.8009, + "step": 2427 + }, + { + "epoch": 1.3694303440496334, + "grad_norm": 2.180295944213867, + "learning_rate": 4.3155668358714045e-05, + "loss": 0.8525, + "step": 2428 + }, + { + "epoch": 1.3699943598420756, + "grad_norm": 1.699157953262329, + "learning_rate": 4.315284827975184e-05, + "loss": 0.9066, + "step": 2429 + }, + { + "epoch": 1.3705583756345177, + "grad_norm": 1.4857347011566162, + "learning_rate": 4.315002820078962e-05, + "loss": 0.761, + "step": 2430 + }, + { + "epoch": 1.37112239142696, + "grad_norm": 3.9570610523223877, + "learning_rate": 4.3147208121827415e-05, + "loss": 0.8658, + "step": 2431 + }, + { + "epoch": 1.3716864072194022, + "grad_norm": 4.590373992919922, + "learning_rate": 4.31443880428652e-05, + "loss": 0.8316, + "step": 2432 + }, + { + "epoch": 1.3722504230118444, + "grad_norm": 4.484099388122559, + "learning_rate": 4.314156796390299e-05, + "loss": 0.8709, + "step": 2433 + }, + { + "epoch": 1.3728144388042864, + "grad_norm": 2.019742727279663, + "learning_rate": 4.313874788494078e-05, + "loss": 0.7066, + "step": 2434 + }, + { + "epoch": 1.3733784545967287, + "grad_norm": 5.395395755767822, + "learning_rate": 4.313592780597857e-05, + "loss": 0.981, + "step": 2435 + }, + { + "epoch": 1.373942470389171, + "grad_norm": 2.7196059226989746, + "learning_rate": 4.3133107727016356e-05, + "loss": 0.6935, + "step": 2436 + }, + { + "epoch": 1.374506486181613, + "grad_norm": 2.404933214187622, + "learning_rate": 4.313028764805415e-05, + "loss": 0.8078, + "step": 2437 + }, + { + "epoch": 1.3750705019740552, + "grad_norm": 3.242422580718994, + "learning_rate": 4.312746756909194e-05, + "loss": 0.8923, + "step": 2438 + }, + { + "epoch": 1.3756345177664975, + "grad_norm": 2.210879325866699, + "learning_rate": 4.3124647490129725e-05, + "loss": 0.8509, + "step": 2439 + }, + { + "epoch": 1.3761985335589397, + "grad_norm": 3.846574544906616, + "learning_rate": 4.312182741116752e-05, + "loss": 1.0573, + "step": 2440 + }, + { + "epoch": 1.3767625493513818, + "grad_norm": 3.5045273303985596, + "learning_rate": 4.31190073322053e-05, + "loss": 0.871, + "step": 2441 + }, + { + "epoch": 1.377326565143824, + "grad_norm": 4.186235427856445, + "learning_rate": 4.3116187253243095e-05, + "loss": 1.1482, + "step": 2442 + }, + { + "epoch": 1.3778905809362663, + "grad_norm": 4.8551344871521, + "learning_rate": 4.311336717428088e-05, + "loss": 0.9868, + "step": 2443 + }, + { + "epoch": 1.3784545967287083, + "grad_norm": 3.6204442977905273, + "learning_rate": 4.311054709531867e-05, + "loss": 0.7964, + "step": 2444 + }, + { + "epoch": 1.3790186125211505, + "grad_norm": 4.194634914398193, + "learning_rate": 4.310772701635646e-05, + "loss": 1.0075, + "step": 2445 + }, + { + "epoch": 1.3795826283135928, + "grad_norm": 4.237339019775391, + "learning_rate": 4.310490693739425e-05, + "loss": 0.8978, + "step": 2446 + }, + { + "epoch": 1.380146644106035, + "grad_norm": 3.150205373764038, + "learning_rate": 4.310208685843204e-05, + "loss": 0.8099, + "step": 2447 + }, + { + "epoch": 1.380710659898477, + "grad_norm": 2.809218168258667, + "learning_rate": 4.309926677946983e-05, + "loss": 0.8447, + "step": 2448 + }, + { + "epoch": 1.3812746756909193, + "grad_norm": 1.8131065368652344, + "learning_rate": 4.309644670050761e-05, + "loss": 0.77, + "step": 2449 + }, + { + "epoch": 1.3818386914833616, + "grad_norm": 4.531746864318848, + "learning_rate": 4.3093626621545405e-05, + "loss": 1.0509, + "step": 2450 + }, + { + "epoch": 1.3824027072758036, + "grad_norm": 2.501941442489624, + "learning_rate": 4.30908065425832e-05, + "loss": 0.8873, + "step": 2451 + }, + { + "epoch": 1.3829667230682459, + "grad_norm": 5.253747940063477, + "learning_rate": 4.308798646362098e-05, + "loss": 1.0411, + "step": 2452 + }, + { + "epoch": 1.3835307388606881, + "grad_norm": 3.8068368434906006, + "learning_rate": 4.308516638465877e-05, + "loss": 0.9727, + "step": 2453 + }, + { + "epoch": 1.3840947546531304, + "grad_norm": 3.202730655670166, + "learning_rate": 4.308234630569656e-05, + "loss": 0.8424, + "step": 2454 + }, + { + "epoch": 1.3846587704455724, + "grad_norm": 1.4569939374923706, + "learning_rate": 4.307952622673435e-05, + "loss": 0.8201, + "step": 2455 + }, + { + "epoch": 1.3852227862380146, + "grad_norm": 3.6214511394500732, + "learning_rate": 4.307670614777214e-05, + "loss": 1.0107, + "step": 2456 + }, + { + "epoch": 1.385786802030457, + "grad_norm": 1.210475206375122, + "learning_rate": 4.3073886068809924e-05, + "loss": 0.6649, + "step": 2457 + }, + { + "epoch": 1.386350817822899, + "grad_norm": 2.647029161453247, + "learning_rate": 4.307106598984772e-05, + "loss": 0.7681, + "step": 2458 + }, + { + "epoch": 1.3869148336153412, + "grad_norm": 3.963759422302246, + "learning_rate": 4.306824591088551e-05, + "loss": 0.8834, + "step": 2459 + }, + { + "epoch": 1.3874788494077834, + "grad_norm": 3.4074456691741943, + "learning_rate": 4.3065425831923293e-05, + "loss": 0.9282, + "step": 2460 + }, + { + "epoch": 1.3880428652002257, + "grad_norm": 2.6341700553894043, + "learning_rate": 4.3062605752961086e-05, + "loss": 0.9583, + "step": 2461 + }, + { + "epoch": 1.3886068809926677, + "grad_norm": 4.5626678466796875, + "learning_rate": 4.305978567399888e-05, + "loss": 0.9062, + "step": 2462 + }, + { + "epoch": 1.38917089678511, + "grad_norm": 2.0079050064086914, + "learning_rate": 4.305696559503666e-05, + "loss": 0.7575, + "step": 2463 + }, + { + "epoch": 1.3897349125775522, + "grad_norm": 2.711350202560425, + "learning_rate": 4.305414551607445e-05, + "loss": 0.8534, + "step": 2464 + }, + { + "epoch": 1.3902989283699942, + "grad_norm": 2.845393419265747, + "learning_rate": 4.305132543711224e-05, + "loss": 0.8691, + "step": 2465 + }, + { + "epoch": 1.3908629441624365, + "grad_norm": 2.865546941757202, + "learning_rate": 4.304850535815003e-05, + "loss": 0.7958, + "step": 2466 + }, + { + "epoch": 1.3914269599548788, + "grad_norm": 2.725033760070801, + "learning_rate": 4.304568527918782e-05, + "loss": 0.9222, + "step": 2467 + }, + { + "epoch": 1.391990975747321, + "grad_norm": 2.602813720703125, + "learning_rate": 4.3042865200225604e-05, + "loss": 0.9777, + "step": 2468 + }, + { + "epoch": 1.392554991539763, + "grad_norm": 1.7716422080993652, + "learning_rate": 4.3040045121263396e-05, + "loss": 0.7173, + "step": 2469 + }, + { + "epoch": 1.3931190073322053, + "grad_norm": 3.603685140609741, + "learning_rate": 4.303722504230119e-05, + "loss": 1.0114, + "step": 2470 + }, + { + "epoch": 1.3936830231246475, + "grad_norm": 2.6056926250457764, + "learning_rate": 4.3034404963338974e-05, + "loss": 0.9, + "step": 2471 + }, + { + "epoch": 1.3942470389170896, + "grad_norm": 2.9058187007904053, + "learning_rate": 4.3031584884376766e-05, + "loss": 0.7956, + "step": 2472 + }, + { + "epoch": 1.3948110547095318, + "grad_norm": 1.6005337238311768, + "learning_rate": 4.302876480541455e-05, + "loss": 0.8175, + "step": 2473 + }, + { + "epoch": 1.395375070501974, + "grad_norm": 2.7210419178009033, + "learning_rate": 4.302594472645234e-05, + "loss": 0.7812, + "step": 2474 + }, + { + "epoch": 1.3959390862944163, + "grad_norm": 3.4352214336395264, + "learning_rate": 4.302312464749013e-05, + "loss": 0.9006, + "step": 2475 + }, + { + "epoch": 1.3965031020868584, + "grad_norm": 2.0293915271759033, + "learning_rate": 4.302030456852792e-05, + "loss": 0.9173, + "step": 2476 + }, + { + "epoch": 1.3970671178793006, + "grad_norm": 3.420081853866577, + "learning_rate": 4.301748448956571e-05, + "loss": 1.0419, + "step": 2477 + }, + { + "epoch": 1.3976311336717429, + "grad_norm": 5.030663013458252, + "learning_rate": 4.30146644106035e-05, + "loss": 0.8664, + "step": 2478 + }, + { + "epoch": 1.3981951494641849, + "grad_norm": 2.635416269302368, + "learning_rate": 4.301184433164129e-05, + "loss": 0.8283, + "step": 2479 + }, + { + "epoch": 1.3987591652566271, + "grad_norm": 4.598982334136963, + "learning_rate": 4.3009024252679076e-05, + "loss": 0.8308, + "step": 2480 + }, + { + "epoch": 1.3993231810490694, + "grad_norm": 2.8800065517425537, + "learning_rate": 4.300620417371687e-05, + "loss": 0.7679, + "step": 2481 + }, + { + "epoch": 1.3998871968415116, + "grad_norm": 2.19408917427063, + "learning_rate": 4.3003384094754654e-05, + "loss": 0.6705, + "step": 2482 + }, + { + "epoch": 1.4004512126339537, + "grad_norm": 4.2693023681640625, + "learning_rate": 4.3000564015792446e-05, + "loss": 0.9724, + "step": 2483 + }, + { + "epoch": 1.401015228426396, + "grad_norm": 3.04180645942688, + "learning_rate": 4.299774393683023e-05, + "loss": 1.0077, + "step": 2484 + }, + { + "epoch": 1.4015792442188382, + "grad_norm": 5.1695942878723145, + "learning_rate": 4.2994923857868024e-05, + "loss": 1.0902, + "step": 2485 + }, + { + "epoch": 1.4021432600112802, + "grad_norm": 3.655311346054077, + "learning_rate": 4.299210377890581e-05, + "loss": 0.9069, + "step": 2486 + }, + { + "epoch": 1.4027072758037225, + "grad_norm": 3.4962778091430664, + "learning_rate": 4.29892836999436e-05, + "loss": 0.8727, + "step": 2487 + }, + { + "epoch": 1.4032712915961647, + "grad_norm": 2.263958692550659, + "learning_rate": 4.2986463620981387e-05, + "loss": 0.7975, + "step": 2488 + }, + { + "epoch": 1.403835307388607, + "grad_norm": 2.8836357593536377, + "learning_rate": 4.298364354201918e-05, + "loss": 0.9262, + "step": 2489 + }, + { + "epoch": 1.404399323181049, + "grad_norm": 3.779473066329956, + "learning_rate": 4.298082346305697e-05, + "loss": 0.939, + "step": 2490 + }, + { + "epoch": 1.4049633389734912, + "grad_norm": 3.0260918140411377, + "learning_rate": 4.2978003384094756e-05, + "loss": 0.8289, + "step": 2491 + }, + { + "epoch": 1.4055273547659335, + "grad_norm": 3.742770195007324, + "learning_rate": 4.297518330513254e-05, + "loss": 0.7978, + "step": 2492 + }, + { + "epoch": 1.4060913705583755, + "grad_norm": 4.152154922485352, + "learning_rate": 4.2972363226170334e-05, + "loss": 0.8612, + "step": 2493 + }, + { + "epoch": 1.4066553863508178, + "grad_norm": 2.487340211868286, + "learning_rate": 4.2969543147208126e-05, + "loss": 0.7771, + "step": 2494 + }, + { + "epoch": 1.40721940214326, + "grad_norm": 2.4594290256500244, + "learning_rate": 4.296672306824591e-05, + "loss": 0.7779, + "step": 2495 + }, + { + "epoch": 1.4077834179357023, + "grad_norm": 3.9530904293060303, + "learning_rate": 4.29639029892837e-05, + "loss": 0.9567, + "step": 2496 + }, + { + "epoch": 1.4083474337281445, + "grad_norm": 4.351789951324463, + "learning_rate": 4.2961082910321496e-05, + "loss": 0.8658, + "step": 2497 + }, + { + "epoch": 1.4089114495205866, + "grad_norm": 5.19230318069458, + "learning_rate": 4.295826283135928e-05, + "loss": 1.0078, + "step": 2498 + }, + { + "epoch": 1.4094754653130288, + "grad_norm": 3.5382864475250244, + "learning_rate": 4.295544275239707e-05, + "loss": 0.9173, + "step": 2499 + }, + { + "epoch": 1.4100394811054708, + "grad_norm": 3.974696397781372, + "learning_rate": 4.295262267343486e-05, + "loss": 0.8414, + "step": 2500 + }, + { + "epoch": 1.410603496897913, + "grad_norm": 2.85090708732605, + "learning_rate": 4.294980259447265e-05, + "loss": 0.8165, + "step": 2501 + }, + { + "epoch": 1.4111675126903553, + "grad_norm": 1.5797343254089355, + "learning_rate": 4.2946982515510436e-05, + "loss": 0.7924, + "step": 2502 + }, + { + "epoch": 1.4117315284827976, + "grad_norm": 2.0748085975646973, + "learning_rate": 4.294416243654822e-05, + "loss": 0.7973, + "step": 2503 + }, + { + "epoch": 1.4122955442752398, + "grad_norm": 2.295903444290161, + "learning_rate": 4.2941342357586014e-05, + "loss": 0.857, + "step": 2504 + }, + { + "epoch": 1.4128595600676819, + "grad_norm": 3.7717013359069824, + "learning_rate": 4.2938522278623806e-05, + "loss": 1.0661, + "step": 2505 + }, + { + "epoch": 1.4134235758601241, + "grad_norm": 1.7560375928878784, + "learning_rate": 4.293570219966159e-05, + "loss": 0.8059, + "step": 2506 + }, + { + "epoch": 1.4139875916525662, + "grad_norm": 3.454378843307495, + "learning_rate": 4.293288212069938e-05, + "loss": 0.838, + "step": 2507 + }, + { + "epoch": 1.4145516074450084, + "grad_norm": 1.6939561367034912, + "learning_rate": 4.293006204173717e-05, + "loss": 0.7245, + "step": 2508 + }, + { + "epoch": 1.4151156232374507, + "grad_norm": 1.3726410865783691, + "learning_rate": 4.292724196277496e-05, + "loss": 0.7575, + "step": 2509 + }, + { + "epoch": 1.415679639029893, + "grad_norm": 3.7486586570739746, + "learning_rate": 4.292442188381275e-05, + "loss": 1.0404, + "step": 2510 + }, + { + "epoch": 1.4162436548223352, + "grad_norm": 2.7030887603759766, + "learning_rate": 4.292160180485054e-05, + "loss": 0.7139, + "step": 2511 + }, + { + "epoch": 1.4168076706147772, + "grad_norm": 4.12406587600708, + "learning_rate": 4.291878172588833e-05, + "loss": 0.8043, + "step": 2512 + }, + { + "epoch": 1.4173716864072194, + "grad_norm": 1.6882472038269043, + "learning_rate": 4.2915961646926117e-05, + "loss": 0.7203, + "step": 2513 + }, + { + "epoch": 1.4179357021996615, + "grad_norm": 4.685092449188232, + "learning_rate": 4.29131415679639e-05, + "loss": 1.1335, + "step": 2514 + }, + { + "epoch": 1.4184997179921037, + "grad_norm": 4.084644794464111, + "learning_rate": 4.2910321489001694e-05, + "loss": 0.902, + "step": 2515 + }, + { + "epoch": 1.419063733784546, + "grad_norm": 1.4260591268539429, + "learning_rate": 4.2907501410039486e-05, + "loss": 0.8344, + "step": 2516 + }, + { + "epoch": 1.4196277495769882, + "grad_norm": 3.6676971912384033, + "learning_rate": 4.290468133107727e-05, + "loss": 0.882, + "step": 2517 + }, + { + "epoch": 1.4201917653694305, + "grad_norm": 3.7101454734802246, + "learning_rate": 4.2901861252115064e-05, + "loss": 0.7627, + "step": 2518 + }, + { + "epoch": 1.4207557811618725, + "grad_norm": 5.335522651672363, + "learning_rate": 4.289904117315285e-05, + "loss": 0.9346, + "step": 2519 + }, + { + "epoch": 1.4213197969543148, + "grad_norm": 2.8395590782165527, + "learning_rate": 4.289622109419064e-05, + "loss": 0.9537, + "step": 2520 + }, + { + "epoch": 1.4218838127467568, + "grad_norm": 2.545769691467285, + "learning_rate": 4.289340101522843e-05, + "loss": 0.8193, + "step": 2521 + }, + { + "epoch": 1.422447828539199, + "grad_norm": 2.014802932739258, + "learning_rate": 4.289058093626622e-05, + "loss": 0.8331, + "step": 2522 + }, + { + "epoch": 1.4230118443316413, + "grad_norm": 2.6591484546661377, + "learning_rate": 4.2887760857304005e-05, + "loss": 0.8026, + "step": 2523 + }, + { + "epoch": 1.4235758601240835, + "grad_norm": 2.9979898929595947, + "learning_rate": 4.28849407783418e-05, + "loss": 0.8167, + "step": 2524 + }, + { + "epoch": 1.4241398759165258, + "grad_norm": 3.1576945781707764, + "learning_rate": 4.288212069937958e-05, + "loss": 0.8774, + "step": 2525 + }, + { + "epoch": 1.4247038917089678, + "grad_norm": 2.9864654541015625, + "learning_rate": 4.2879300620417374e-05, + "loss": 0.857, + "step": 2526 + }, + { + "epoch": 1.42526790750141, + "grad_norm": 2.1953344345092773, + "learning_rate": 4.287648054145516e-05, + "loss": 0.7923, + "step": 2527 + }, + { + "epoch": 1.425831923293852, + "grad_norm": 2.1861066818237305, + "learning_rate": 4.287366046249295e-05, + "loss": 0.8562, + "step": 2528 + }, + { + "epoch": 1.4263959390862944, + "grad_norm": 3.6974291801452637, + "learning_rate": 4.2870840383530744e-05, + "loss": 0.8549, + "step": 2529 + }, + { + "epoch": 1.4269599548787366, + "grad_norm": 2.5503287315368652, + "learning_rate": 4.286802030456853e-05, + "loss": 0.733, + "step": 2530 + }, + { + "epoch": 1.4275239706711789, + "grad_norm": 3.373342990875244, + "learning_rate": 4.2865200225606315e-05, + "loss": 0.8695, + "step": 2531 + }, + { + "epoch": 1.4280879864636211, + "grad_norm": 6.737033843994141, + "learning_rate": 4.286238014664411e-05, + "loss": 0.8881, + "step": 2532 + }, + { + "epoch": 1.4286520022560631, + "grad_norm": 3.2232296466827393, + "learning_rate": 4.28595600676819e-05, + "loss": 0.8235, + "step": 2533 + }, + { + "epoch": 1.4292160180485054, + "grad_norm": 1.4919273853302002, + "learning_rate": 4.2856739988719685e-05, + "loss": 0.8856, + "step": 2534 + }, + { + "epoch": 1.4297800338409474, + "grad_norm": 2.7853004932403564, + "learning_rate": 4.285391990975747e-05, + "loss": 1.0302, + "step": 2535 + }, + { + "epoch": 1.4303440496333897, + "grad_norm": 3.7132680416107178, + "learning_rate": 4.285109983079527e-05, + "loss": 0.8124, + "step": 2536 + }, + { + "epoch": 1.430908065425832, + "grad_norm": 4.866472244262695, + "learning_rate": 4.2848279751833054e-05, + "loss": 1.0945, + "step": 2537 + }, + { + "epoch": 1.4314720812182742, + "grad_norm": 2.143165111541748, + "learning_rate": 4.284545967287084e-05, + "loss": 0.9339, + "step": 2538 + }, + { + "epoch": 1.4320360970107164, + "grad_norm": 6.660165309906006, + "learning_rate": 4.284263959390863e-05, + "loss": 1.0924, + "step": 2539 + }, + { + "epoch": 1.4326001128031585, + "grad_norm": 3.2384917736053467, + "learning_rate": 4.2839819514946424e-05, + "loss": 0.9434, + "step": 2540 + }, + { + "epoch": 1.4331641285956007, + "grad_norm": 5.6547346115112305, + "learning_rate": 4.283699943598421e-05, + "loss": 0.9648, + "step": 2541 + }, + { + "epoch": 1.4337281443880427, + "grad_norm": 3.000035047531128, + "learning_rate": 4.2834179357021995e-05, + "loss": 0.8355, + "step": 2542 + }, + { + "epoch": 1.434292160180485, + "grad_norm": 2.239551544189453, + "learning_rate": 4.283135927805979e-05, + "loss": 0.8128, + "step": 2543 + }, + { + "epoch": 1.4348561759729273, + "grad_norm": 2.7170023918151855, + "learning_rate": 4.282853919909758e-05, + "loss": 0.7715, + "step": 2544 + }, + { + "epoch": 1.4354201917653695, + "grad_norm": 3.451430559158325, + "learning_rate": 4.2825719120135365e-05, + "loss": 0.8268, + "step": 2545 + }, + { + "epoch": 1.4359842075578118, + "grad_norm": 2.63214373588562, + "learning_rate": 4.282289904117315e-05, + "loss": 0.84, + "step": 2546 + }, + { + "epoch": 1.4365482233502538, + "grad_norm": 3.0433199405670166, + "learning_rate": 4.282007896221095e-05, + "loss": 0.8889, + "step": 2547 + }, + { + "epoch": 1.437112239142696, + "grad_norm": 2.7444040775299072, + "learning_rate": 4.2817258883248735e-05, + "loss": 0.7953, + "step": 2548 + }, + { + "epoch": 1.437676254935138, + "grad_norm": 1.6078157424926758, + "learning_rate": 4.281443880428652e-05, + "loss": 0.7153, + "step": 2549 + }, + { + "epoch": 1.4382402707275803, + "grad_norm": 1.2418228387832642, + "learning_rate": 4.281161872532431e-05, + "loss": 0.7122, + "step": 2550 + }, + { + "epoch": 1.4388042865200226, + "grad_norm": 3.424583673477173, + "learning_rate": 4.2808798646362104e-05, + "loss": 0.9084, + "step": 2551 + }, + { + "epoch": 1.4393683023124648, + "grad_norm": 3.327075719833374, + "learning_rate": 4.280597856739989e-05, + "loss": 1.0036, + "step": 2552 + }, + { + "epoch": 1.439932318104907, + "grad_norm": 2.8461809158325195, + "learning_rate": 4.2803158488437675e-05, + "loss": 0.7451, + "step": 2553 + }, + { + "epoch": 1.440496333897349, + "grad_norm": 2.3486266136169434, + "learning_rate": 4.280033840947547e-05, + "loss": 0.8235, + "step": 2554 + }, + { + "epoch": 1.4410603496897914, + "grad_norm": 2.782473087310791, + "learning_rate": 4.279751833051326e-05, + "loss": 0.9241, + "step": 2555 + }, + { + "epoch": 1.4416243654822334, + "grad_norm": 3.510802745819092, + "learning_rate": 4.2794698251551045e-05, + "loss": 1.052, + "step": 2556 + }, + { + "epoch": 1.4421883812746756, + "grad_norm": 1.9560116529464722, + "learning_rate": 4.279187817258884e-05, + "loss": 0.6716, + "step": 2557 + }, + { + "epoch": 1.4427523970671179, + "grad_norm": 3.0234243869781494, + "learning_rate": 4.278905809362662e-05, + "loss": 0.7616, + "step": 2558 + }, + { + "epoch": 1.4433164128595601, + "grad_norm": 2.210685968399048, + "learning_rate": 4.2786238014664415e-05, + "loss": 0.7539, + "step": 2559 + }, + { + "epoch": 1.4438804286520024, + "grad_norm": 1.303659439086914, + "learning_rate": 4.27834179357022e-05, + "loss": 0.6491, + "step": 2560 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 3.212557077407837, + "learning_rate": 4.278059785673999e-05, + "loss": 0.685, + "step": 2561 + }, + { + "epoch": 1.4450084602368867, + "grad_norm": 1.6378083229064941, + "learning_rate": 4.277777777777778e-05, + "loss": 0.7466, + "step": 2562 + }, + { + "epoch": 1.4455724760293287, + "grad_norm": 1.7202388048171997, + "learning_rate": 4.277495769881557e-05, + "loss": 0.8614, + "step": 2563 + }, + { + "epoch": 1.446136491821771, + "grad_norm": 2.4355595111846924, + "learning_rate": 4.2772137619853355e-05, + "loss": 0.8462, + "step": 2564 + }, + { + "epoch": 1.4467005076142132, + "grad_norm": 6.128222942352295, + "learning_rate": 4.276931754089115e-05, + "loss": 0.9375, + "step": 2565 + }, + { + "epoch": 1.4472645234066555, + "grad_norm": 3.0101318359375, + "learning_rate": 4.276649746192893e-05, + "loss": 0.8807, + "step": 2566 + }, + { + "epoch": 1.4478285391990977, + "grad_norm": 3.010840654373169, + "learning_rate": 4.2763677382966725e-05, + "loss": 0.9578, + "step": 2567 + }, + { + "epoch": 1.4483925549915397, + "grad_norm": 2.7269198894500732, + "learning_rate": 4.276085730400452e-05, + "loss": 0.7574, + "step": 2568 + }, + { + "epoch": 1.448956570783982, + "grad_norm": 1.990370273590088, + "learning_rate": 4.27580372250423e-05, + "loss": 0.8394, + "step": 2569 + }, + { + "epoch": 1.449520586576424, + "grad_norm": 4.904703617095947, + "learning_rate": 4.275521714608009e-05, + "loss": 0.8039, + "step": 2570 + }, + { + "epoch": 1.4500846023688663, + "grad_norm": 1.9849117994308472, + "learning_rate": 4.275239706711788e-05, + "loss": 0.8275, + "step": 2571 + }, + { + "epoch": 1.4506486181613085, + "grad_norm": 3.300210952758789, + "learning_rate": 4.274957698815567e-05, + "loss": 0.7452, + "step": 2572 + }, + { + "epoch": 1.4512126339537508, + "grad_norm": 7.863717079162598, + "learning_rate": 4.274675690919346e-05, + "loss": 0.9808, + "step": 2573 + }, + { + "epoch": 1.451776649746193, + "grad_norm": 3.994337797164917, + "learning_rate": 4.274393683023124e-05, + "loss": 1.0852, + "step": 2574 + }, + { + "epoch": 1.452340665538635, + "grad_norm": 4.2230072021484375, + "learning_rate": 4.274111675126904e-05, + "loss": 0.7909, + "step": 2575 + }, + { + "epoch": 1.4529046813310773, + "grad_norm": 4.379868030548096, + "learning_rate": 4.273829667230683e-05, + "loss": 0.8593, + "step": 2576 + }, + { + "epoch": 1.4534686971235193, + "grad_norm": 4.000130653381348, + "learning_rate": 4.273547659334461e-05, + "loss": 0.9298, + "step": 2577 + }, + { + "epoch": 1.4540327129159616, + "grad_norm": 1.4661203622817993, + "learning_rate": 4.2732656514382405e-05, + "loss": 0.8814, + "step": 2578 + }, + { + "epoch": 1.4545967287084038, + "grad_norm": 1.7053170204162598, + "learning_rate": 4.27298364354202e-05, + "loss": 0.6896, + "step": 2579 + }, + { + "epoch": 1.455160744500846, + "grad_norm": 1.7430565357208252, + "learning_rate": 4.272701635645798e-05, + "loss": 0.8168, + "step": 2580 + }, + { + "epoch": 1.4557247602932883, + "grad_norm": 2.294140100479126, + "learning_rate": 4.272419627749577e-05, + "loss": 0.7711, + "step": 2581 + }, + { + "epoch": 1.4562887760857304, + "grad_norm": 5.777412414550781, + "learning_rate": 4.272137619853356e-05, + "loss": 1.1045, + "step": 2582 + }, + { + "epoch": 1.4568527918781726, + "grad_norm": 2.0015110969543457, + "learning_rate": 4.271855611957135e-05, + "loss": 0.7749, + "step": 2583 + }, + { + "epoch": 1.4574168076706147, + "grad_norm": 2.829876661300659, + "learning_rate": 4.271573604060914e-05, + "loss": 0.97, + "step": 2584 + }, + { + "epoch": 1.457980823463057, + "grad_norm": 5.540492057800293, + "learning_rate": 4.2712915961646924e-05, + "loss": 0.9507, + "step": 2585 + }, + { + "epoch": 1.4585448392554992, + "grad_norm": 3.7699403762817383, + "learning_rate": 4.271009588268472e-05, + "loss": 0.8842, + "step": 2586 + }, + { + "epoch": 1.4591088550479414, + "grad_norm": 5.326240062713623, + "learning_rate": 4.270727580372251e-05, + "loss": 1.1002, + "step": 2587 + }, + { + "epoch": 1.4596728708403837, + "grad_norm": 1.264776587486267, + "learning_rate": 4.270445572476029e-05, + "loss": 0.6844, + "step": 2588 + }, + { + "epoch": 1.4602368866328257, + "grad_norm": 2.6232991218566895, + "learning_rate": 4.2701635645798085e-05, + "loss": 0.7858, + "step": 2589 + }, + { + "epoch": 1.460800902425268, + "grad_norm": 1.8531043529510498, + "learning_rate": 4.269881556683588e-05, + "loss": 0.7847, + "step": 2590 + }, + { + "epoch": 1.46136491821771, + "grad_norm": 2.5095667839050293, + "learning_rate": 4.269599548787366e-05, + "loss": 0.9262, + "step": 2591 + }, + { + "epoch": 1.4619289340101522, + "grad_norm": 3.2795088291168213, + "learning_rate": 4.269317540891145e-05, + "loss": 0.774, + "step": 2592 + }, + { + "epoch": 1.4624929498025945, + "grad_norm": 2.236332654953003, + "learning_rate": 4.269035532994924e-05, + "loss": 0.7716, + "step": 2593 + }, + { + "epoch": 1.4630569655950367, + "grad_norm": 2.840449571609497, + "learning_rate": 4.268753525098703e-05, + "loss": 0.8063, + "step": 2594 + }, + { + "epoch": 1.463620981387479, + "grad_norm": 2.8698084354400635, + "learning_rate": 4.268471517202482e-05, + "loss": 0.8956, + "step": 2595 + }, + { + "epoch": 1.464184997179921, + "grad_norm": 3.3021934032440186, + "learning_rate": 4.268189509306261e-05, + "loss": 0.9198, + "step": 2596 + }, + { + "epoch": 1.4647490129723633, + "grad_norm": 1.7213008403778076, + "learning_rate": 4.2679075014100396e-05, + "loss": 0.6794, + "step": 2597 + }, + { + "epoch": 1.4653130287648053, + "grad_norm": 2.1871869564056396, + "learning_rate": 4.267625493513819e-05, + "loss": 0.7151, + "step": 2598 + }, + { + "epoch": 1.4658770445572475, + "grad_norm": 1.4624241590499878, + "learning_rate": 4.2673434856175973e-05, + "loss": 0.7079, + "step": 2599 + }, + { + "epoch": 1.4664410603496898, + "grad_norm": 2.721787452697754, + "learning_rate": 4.2670614777213766e-05, + "loss": 0.8644, + "step": 2600 + }, + { + "epoch": 1.467005076142132, + "grad_norm": 2.54023814201355, + "learning_rate": 4.266779469825155e-05, + "loss": 0.7869, + "step": 2601 + }, + { + "epoch": 1.4675690919345743, + "grad_norm": 1.8611476421356201, + "learning_rate": 4.266497461928934e-05, + "loss": 0.805, + "step": 2602 + }, + { + "epoch": 1.4681331077270163, + "grad_norm": 2.8938584327697754, + "learning_rate": 4.266215454032713e-05, + "loss": 0.8857, + "step": 2603 + }, + { + "epoch": 1.4686971235194586, + "grad_norm": 5.3935065269470215, + "learning_rate": 4.265933446136492e-05, + "loss": 0.9622, + "step": 2604 + }, + { + "epoch": 1.4692611393119006, + "grad_norm": 4.337327003479004, + "learning_rate": 4.2656514382402706e-05, + "loss": 0.9524, + "step": 2605 + }, + { + "epoch": 1.4698251551043429, + "grad_norm": 2.526120185852051, + "learning_rate": 4.26536943034405e-05, + "loss": 0.7247, + "step": 2606 + }, + { + "epoch": 1.4703891708967851, + "grad_norm": 2.617525100708008, + "learning_rate": 4.265087422447829e-05, + "loss": 0.8621, + "step": 2607 + }, + { + "epoch": 1.4709531866892274, + "grad_norm": 2.129469871520996, + "learning_rate": 4.2648054145516076e-05, + "loss": 0.799, + "step": 2608 + }, + { + "epoch": 1.4715172024816696, + "grad_norm": 3.076185464859009, + "learning_rate": 4.264523406655386e-05, + "loss": 0.9721, + "step": 2609 + }, + { + "epoch": 1.4720812182741116, + "grad_norm": 1.4925055503845215, + "learning_rate": 4.2642413987591654e-05, + "loss": 0.6162, + "step": 2610 + }, + { + "epoch": 1.472645234066554, + "grad_norm": 1.4688940048217773, + "learning_rate": 4.2639593908629446e-05, + "loss": 0.7657, + "step": 2611 + }, + { + "epoch": 1.473209249858996, + "grad_norm": 2.1277143955230713, + "learning_rate": 4.263677382966723e-05, + "loss": 0.8298, + "step": 2612 + }, + { + "epoch": 1.4737732656514382, + "grad_norm": 3.4041857719421387, + "learning_rate": 4.2633953750705017e-05, + "loss": 0.7687, + "step": 2613 + }, + { + "epoch": 1.4743372814438804, + "grad_norm": 2.3807060718536377, + "learning_rate": 4.2631133671742816e-05, + "loss": 0.8245, + "step": 2614 + }, + { + "epoch": 1.4749012972363227, + "grad_norm": 3.6664535999298096, + "learning_rate": 4.26283135927806e-05, + "loss": 0.9112, + "step": 2615 + }, + { + "epoch": 1.475465313028765, + "grad_norm": 2.359027862548828, + "learning_rate": 4.2625493513818386e-05, + "loss": 0.7915, + "step": 2616 + }, + { + "epoch": 1.476029328821207, + "grad_norm": 4.551119327545166, + "learning_rate": 4.262267343485618e-05, + "loss": 0.9299, + "step": 2617 + }, + { + "epoch": 1.4765933446136492, + "grad_norm": 3.2481632232666016, + "learning_rate": 4.261985335589397e-05, + "loss": 0.9005, + "step": 2618 + }, + { + "epoch": 1.4771573604060912, + "grad_norm": 1.8841652870178223, + "learning_rate": 4.2617033276931756e-05, + "loss": 0.7366, + "step": 2619 + }, + { + "epoch": 1.4777213761985335, + "grad_norm": 3.7834768295288086, + "learning_rate": 4.261421319796954e-05, + "loss": 0.9792, + "step": 2620 + }, + { + "epoch": 1.4782853919909758, + "grad_norm": 3.504779577255249, + "learning_rate": 4.2611393119007334e-05, + "loss": 0.7845, + "step": 2621 + }, + { + "epoch": 1.478849407783418, + "grad_norm": 2.850964069366455, + "learning_rate": 4.2608573040045126e-05, + "loss": 0.8071, + "step": 2622 + }, + { + "epoch": 1.4794134235758603, + "grad_norm": 2.627444267272949, + "learning_rate": 4.260575296108291e-05, + "loss": 0.786, + "step": 2623 + }, + { + "epoch": 1.4799774393683023, + "grad_norm": 4.068089962005615, + "learning_rate": 4.26029328821207e-05, + "loss": 0.8988, + "step": 2624 + }, + { + "epoch": 1.4805414551607445, + "grad_norm": 4.2869110107421875, + "learning_rate": 4.2600112803158496e-05, + "loss": 0.8979, + "step": 2625 + }, + { + "epoch": 1.4811054709531866, + "grad_norm": 2.207350969314575, + "learning_rate": 4.259729272419628e-05, + "loss": 0.764, + "step": 2626 + }, + { + "epoch": 1.4816694867456288, + "grad_norm": 4.964110374450684, + "learning_rate": 4.2594472645234067e-05, + "loss": 1.2232, + "step": 2627 + }, + { + "epoch": 1.482233502538071, + "grad_norm": 2.7483420372009277, + "learning_rate": 4.259165256627186e-05, + "loss": 0.8679, + "step": 2628 + }, + { + "epoch": 1.4827975183305133, + "grad_norm": 1.6668223142623901, + "learning_rate": 4.258883248730965e-05, + "loss": 0.7719, + "step": 2629 + }, + { + "epoch": 1.4833615341229556, + "grad_norm": 2.4131672382354736, + "learning_rate": 4.2586012408347436e-05, + "loss": 0.8562, + "step": 2630 + }, + { + "epoch": 1.4839255499153976, + "grad_norm": 2.4759209156036377, + "learning_rate": 4.258319232938522e-05, + "loss": 0.8529, + "step": 2631 + }, + { + "epoch": 1.4844895657078399, + "grad_norm": 5.423181056976318, + "learning_rate": 4.2580372250423014e-05, + "loss": 1.0389, + "step": 2632 + }, + { + "epoch": 1.4850535815002819, + "grad_norm": 2.4461426734924316, + "learning_rate": 4.2577552171460806e-05, + "loss": 0.7184, + "step": 2633 + }, + { + "epoch": 1.4856175972927241, + "grad_norm": 1.9877524375915527, + "learning_rate": 4.257473209249859e-05, + "loss": 0.7917, + "step": 2634 + }, + { + "epoch": 1.4861816130851664, + "grad_norm": 2.003495454788208, + "learning_rate": 4.257191201353638e-05, + "loss": 0.8405, + "step": 2635 + }, + { + "epoch": 1.4867456288776086, + "grad_norm": 2.17317533493042, + "learning_rate": 4.256909193457417e-05, + "loss": 0.8199, + "step": 2636 + }, + { + "epoch": 1.487309644670051, + "grad_norm": 4.825963497161865, + "learning_rate": 4.256627185561196e-05, + "loss": 0.8515, + "step": 2637 + }, + { + "epoch": 1.487873660462493, + "grad_norm": 4.6183247566223145, + "learning_rate": 4.256345177664975e-05, + "loss": 0.8055, + "step": 2638 + }, + { + "epoch": 1.4884376762549352, + "grad_norm": 1.9362821578979492, + "learning_rate": 4.256063169768754e-05, + "loss": 0.696, + "step": 2639 + }, + { + "epoch": 1.4890016920473772, + "grad_norm": 2.776355504989624, + "learning_rate": 4.2557811618725324e-05, + "loss": 0.854, + "step": 2640 + }, + { + "epoch": 1.4895657078398195, + "grad_norm": 2.3447425365448, + "learning_rate": 4.2554991539763116e-05, + "loss": 0.7586, + "step": 2641 + }, + { + "epoch": 1.4901297236322617, + "grad_norm": 2.7182955741882324, + "learning_rate": 4.25521714608009e-05, + "loss": 0.7241, + "step": 2642 + }, + { + "epoch": 1.490693739424704, + "grad_norm": 2.274862766265869, + "learning_rate": 4.2549351381838694e-05, + "loss": 0.8133, + "step": 2643 + }, + { + "epoch": 1.4912577552171462, + "grad_norm": 1.787659764289856, + "learning_rate": 4.254653130287648e-05, + "loss": 0.8458, + "step": 2644 + }, + { + "epoch": 1.4918217710095882, + "grad_norm": 2.5138614177703857, + "learning_rate": 4.254371122391427e-05, + "loss": 0.7683, + "step": 2645 + }, + { + "epoch": 1.4923857868020305, + "grad_norm": 2.836385488510132, + "learning_rate": 4.2540891144952064e-05, + "loss": 0.9394, + "step": 2646 + }, + { + "epoch": 1.4929498025944725, + "grad_norm": 3.029505968093872, + "learning_rate": 4.253807106598985e-05, + "loss": 0.9283, + "step": 2647 + }, + { + "epoch": 1.4935138183869148, + "grad_norm": 4.378316879272461, + "learning_rate": 4.2535250987027635e-05, + "loss": 0.9949, + "step": 2648 + }, + { + "epoch": 1.494077834179357, + "grad_norm": 1.4714843034744263, + "learning_rate": 4.253243090806543e-05, + "loss": 0.6178, + "step": 2649 + }, + { + "epoch": 1.4946418499717993, + "grad_norm": 2.765233039855957, + "learning_rate": 4.252961082910322e-05, + "loss": 0.7705, + "step": 2650 + }, + { + "epoch": 1.4952058657642415, + "grad_norm": 3.102768898010254, + "learning_rate": 4.2526790750141004e-05, + "loss": 0.7885, + "step": 2651 + }, + { + "epoch": 1.4957698815566836, + "grad_norm": 3.2787089347839355, + "learning_rate": 4.2523970671178797e-05, + "loss": 0.6887, + "step": 2652 + }, + { + "epoch": 1.4963338973491258, + "grad_norm": 4.432823181152344, + "learning_rate": 4.252115059221658e-05, + "loss": 0.9405, + "step": 2653 + }, + { + "epoch": 1.4968979131415678, + "grad_norm": 48.19443893432617, + "learning_rate": 4.2518330513254374e-05, + "loss": 0.8924, + "step": 2654 + }, + { + "epoch": 1.49746192893401, + "grad_norm": 2.3875741958618164, + "learning_rate": 4.251551043429216e-05, + "loss": 0.7955, + "step": 2655 + }, + { + "epoch": 1.4980259447264523, + "grad_norm": 3.0107946395874023, + "learning_rate": 4.251269035532995e-05, + "loss": 0.9104, + "step": 2656 + }, + { + "epoch": 1.4985899605188946, + "grad_norm": 1.726840615272522, + "learning_rate": 4.2509870276367744e-05, + "loss": 0.8261, + "step": 2657 + }, + { + "epoch": 1.4991539763113368, + "grad_norm": 1.8669785261154175, + "learning_rate": 4.250705019740553e-05, + "loss": 0.6936, + "step": 2658 + }, + { + "epoch": 1.4997179921037789, + "grad_norm": 3.762974262237549, + "learning_rate": 4.2504230118443315e-05, + "loss": 0.6737, + "step": 2659 + }, + { + "epoch": 1.5002820078962211, + "grad_norm": 2.0115723609924316, + "learning_rate": 4.250141003948111e-05, + "loss": 0.651, + "step": 2660 + }, + { + "epoch": 1.5008460236886632, + "grad_norm": 2.5445783138275146, + "learning_rate": 4.24985899605189e-05, + "loss": 0.8592, + "step": 2661 + }, + { + "epoch": 1.5014100394811054, + "grad_norm": 2.017869710922241, + "learning_rate": 4.2495769881556685e-05, + "loss": 0.8889, + "step": 2662 + }, + { + "epoch": 1.5019740552735477, + "grad_norm": 8.46349811553955, + "learning_rate": 4.249294980259447e-05, + "loss": 0.8228, + "step": 2663 + }, + { + "epoch": 1.50253807106599, + "grad_norm": 2.3961756229400635, + "learning_rate": 4.249012972363227e-05, + "loss": 0.8754, + "step": 2664 + }, + { + "epoch": 1.5031020868584322, + "grad_norm": 3.2503247261047363, + "learning_rate": 4.2487309644670054e-05, + "loss": 0.8249, + "step": 2665 + }, + { + "epoch": 1.5036661026508742, + "grad_norm": 3.9624078273773193, + "learning_rate": 4.248448956570784e-05, + "loss": 0.7991, + "step": 2666 + }, + { + "epoch": 1.5042301184433164, + "grad_norm": 2.184555768966675, + "learning_rate": 4.248166948674563e-05, + "loss": 0.8065, + "step": 2667 + }, + { + "epoch": 1.5047941342357585, + "grad_norm": 1.4817662239074707, + "learning_rate": 4.2478849407783424e-05, + "loss": 0.7212, + "step": 2668 + }, + { + "epoch": 1.5053581500282007, + "grad_norm": 3.907804489135742, + "learning_rate": 4.247602932882121e-05, + "loss": 0.9143, + "step": 2669 + }, + { + "epoch": 1.505922165820643, + "grad_norm": 1.421438455581665, + "learning_rate": 4.2473209249858995e-05, + "loss": 0.7117, + "step": 2670 + }, + { + "epoch": 1.5064861816130852, + "grad_norm": 1.852001667022705, + "learning_rate": 4.247038917089679e-05, + "loss": 0.8275, + "step": 2671 + }, + { + "epoch": 1.5070501974055275, + "grad_norm": 2.6958823204040527, + "learning_rate": 4.246756909193458e-05, + "loss": 0.858, + "step": 2672 + }, + { + "epoch": 1.5076142131979695, + "grad_norm": 3.887665033340454, + "learning_rate": 4.2464749012972365e-05, + "loss": 0.9365, + "step": 2673 + }, + { + "epoch": 1.5081782289904118, + "grad_norm": 1.5038361549377441, + "learning_rate": 4.246192893401015e-05, + "loss": 0.7168, + "step": 2674 + }, + { + "epoch": 1.5087422447828538, + "grad_norm": 3.441394090652466, + "learning_rate": 4.245910885504794e-05, + "loss": 0.8596, + "step": 2675 + }, + { + "epoch": 1.509306260575296, + "grad_norm": 2.979671001434326, + "learning_rate": 4.2456288776085734e-05, + "loss": 0.7707, + "step": 2676 + }, + { + "epoch": 1.5098702763677383, + "grad_norm": 4.299332618713379, + "learning_rate": 4.245346869712352e-05, + "loss": 1.0044, + "step": 2677 + }, + { + "epoch": 1.5104342921601805, + "grad_norm": 1.2340545654296875, + "learning_rate": 4.245064861816131e-05, + "loss": 0.8399, + "step": 2678 + }, + { + "epoch": 1.5109983079526228, + "grad_norm": 3.2908215522766113, + "learning_rate": 4.24478285391991e-05, + "loss": 0.9526, + "step": 2679 + }, + { + "epoch": 1.5115623237450648, + "grad_norm": 3.2746615409851074, + "learning_rate": 4.244500846023689e-05, + "loss": 0.8892, + "step": 2680 + }, + { + "epoch": 1.512126339537507, + "grad_norm": 1.817735195159912, + "learning_rate": 4.2442188381274675e-05, + "loss": 0.8848, + "step": 2681 + }, + { + "epoch": 1.512690355329949, + "grad_norm": 1.7308292388916016, + "learning_rate": 4.243936830231247e-05, + "loss": 0.7187, + "step": 2682 + }, + { + "epoch": 1.5132543711223914, + "grad_norm": 1.616939663887024, + "learning_rate": 4.243654822335025e-05, + "loss": 0.8123, + "step": 2683 + }, + { + "epoch": 1.5138183869148336, + "grad_norm": 2.6301205158233643, + "learning_rate": 4.2433728144388045e-05, + "loss": 0.81, + "step": 2684 + }, + { + "epoch": 1.5143824027072759, + "grad_norm": 6.63601541519165, + "learning_rate": 4.243090806542584e-05, + "loss": 0.8758, + "step": 2685 + }, + { + "epoch": 1.5149464184997181, + "grad_norm": 2.360447406768799, + "learning_rate": 4.242808798646362e-05, + "loss": 0.8075, + "step": 2686 + }, + { + "epoch": 1.5155104342921601, + "grad_norm": 9.062603950500488, + "learning_rate": 4.2425267907501415e-05, + "loss": 0.7407, + "step": 2687 + }, + { + "epoch": 1.5160744500846024, + "grad_norm": 1.7908782958984375, + "learning_rate": 4.24224478285392e-05, + "loss": 0.8099, + "step": 2688 + }, + { + "epoch": 1.5166384658770444, + "grad_norm": 2.313129425048828, + "learning_rate": 4.241962774957699e-05, + "loss": 0.7446, + "step": 2689 + }, + { + "epoch": 1.5172024816694867, + "grad_norm": 2.4772324562072754, + "learning_rate": 4.241680767061478e-05, + "loss": 0.7457, + "step": 2690 + }, + { + "epoch": 1.517766497461929, + "grad_norm": 1.7204152345657349, + "learning_rate": 4.241398759165257e-05, + "loss": 0.8238, + "step": 2691 + }, + { + "epoch": 1.5183305132543712, + "grad_norm": 2.0046074390411377, + "learning_rate": 4.2411167512690355e-05, + "loss": 0.8151, + "step": 2692 + }, + { + "epoch": 1.5188945290468134, + "grad_norm": 2.8764259815216064, + "learning_rate": 4.240834743372815e-05, + "loss": 0.8917, + "step": 2693 + }, + { + "epoch": 1.5194585448392555, + "grad_norm": 1.3996583223342896, + "learning_rate": 4.240552735476593e-05, + "loss": 0.7461, + "step": 2694 + }, + { + "epoch": 1.5200225606316977, + "grad_norm": 2.9638140201568604, + "learning_rate": 4.2402707275803725e-05, + "loss": 0.8035, + "step": 2695 + }, + { + "epoch": 1.5205865764241397, + "grad_norm": 3.3350625038146973, + "learning_rate": 4.239988719684152e-05, + "loss": 0.7466, + "step": 2696 + }, + { + "epoch": 1.521150592216582, + "grad_norm": 4.069559097290039, + "learning_rate": 4.23970671178793e-05, + "loss": 0.8612, + "step": 2697 + }, + { + "epoch": 1.5217146080090242, + "grad_norm": 4.141295433044434, + "learning_rate": 4.239424703891709e-05, + "loss": 0.854, + "step": 2698 + }, + { + "epoch": 1.5222786238014665, + "grad_norm": 1.5748587846755981, + "learning_rate": 4.239142695995488e-05, + "loss": 0.8139, + "step": 2699 + }, + { + "epoch": 1.5228426395939088, + "grad_norm": 4.019679069519043, + "learning_rate": 4.238860688099267e-05, + "loss": 0.9148, + "step": 2700 + }, + { + "epoch": 1.5234066553863508, + "grad_norm": 2.000671625137329, + "learning_rate": 4.238578680203046e-05, + "loss": 0.7219, + "step": 2701 + }, + { + "epoch": 1.523970671178793, + "grad_norm": 1.9919549226760864, + "learning_rate": 4.238296672306824e-05, + "loss": 0.7417, + "step": 2702 + }, + { + "epoch": 1.524534686971235, + "grad_norm": 4.201306343078613, + "learning_rate": 4.238014664410604e-05, + "loss": 0.8485, + "step": 2703 + }, + { + "epoch": 1.5250987027636773, + "grad_norm": 2.009620428085327, + "learning_rate": 4.237732656514383e-05, + "loss": 0.8709, + "step": 2704 + }, + { + "epoch": 1.5256627185561196, + "grad_norm": 1.6593433618545532, + "learning_rate": 4.237450648618161e-05, + "loss": 0.8922, + "step": 2705 + }, + { + "epoch": 1.5262267343485618, + "grad_norm": 2.045191764831543, + "learning_rate": 4.2371686407219405e-05, + "loss": 0.8231, + "step": 2706 + }, + { + "epoch": 1.526790750141004, + "grad_norm": 5.467828273773193, + "learning_rate": 4.23688663282572e-05, + "loss": 1.0056, + "step": 2707 + }, + { + "epoch": 1.527354765933446, + "grad_norm": 1.634879469871521, + "learning_rate": 4.236604624929498e-05, + "loss": 0.835, + "step": 2708 + }, + { + "epoch": 1.5279187817258884, + "grad_norm": 4.184314727783203, + "learning_rate": 4.236322617033277e-05, + "loss": 0.9733, + "step": 2709 + }, + { + "epoch": 1.5284827975183304, + "grad_norm": 3.347111463546753, + "learning_rate": 4.236040609137056e-05, + "loss": 0.9137, + "step": 2710 + }, + { + "epoch": 1.5290468133107726, + "grad_norm": 2.909543752670288, + "learning_rate": 4.235758601240835e-05, + "loss": 0.8614, + "step": 2711 + }, + { + "epoch": 1.5296108291032149, + "grad_norm": 2.5370030403137207, + "learning_rate": 4.235476593344614e-05, + "loss": 0.7733, + "step": 2712 + }, + { + "epoch": 1.5301748448956571, + "grad_norm": 3.949234962463379, + "learning_rate": 4.235194585448392e-05, + "loss": 0.9384, + "step": 2713 + }, + { + "epoch": 1.5307388606880994, + "grad_norm": 14.473357200622559, + "learning_rate": 4.2349125775521716e-05, + "loss": 0.7796, + "step": 2714 + }, + { + "epoch": 1.5313028764805414, + "grad_norm": 1.4487836360931396, + "learning_rate": 4.234630569655951e-05, + "loss": 0.7957, + "step": 2715 + }, + { + "epoch": 1.5318668922729837, + "grad_norm": 1.98909592628479, + "learning_rate": 4.234348561759729e-05, + "loss": 0.8512, + "step": 2716 + }, + { + "epoch": 1.5324309080654257, + "grad_norm": 2.1672980785369873, + "learning_rate": 4.2340665538635085e-05, + "loss": 0.8826, + "step": 2717 + }, + { + "epoch": 1.532994923857868, + "grad_norm": 2.1108946800231934, + "learning_rate": 4.233784545967287e-05, + "loss": 0.9233, + "step": 2718 + }, + { + "epoch": 1.5335589396503102, + "grad_norm": 3.9756815433502197, + "learning_rate": 4.233502538071066e-05, + "loss": 0.8922, + "step": 2719 + }, + { + "epoch": 1.5341229554427525, + "grad_norm": 2.1749625205993652, + "learning_rate": 4.233220530174845e-05, + "loss": 0.8495, + "step": 2720 + }, + { + "epoch": 1.5346869712351947, + "grad_norm": 2.509934902191162, + "learning_rate": 4.232938522278624e-05, + "loss": 0.9053, + "step": 2721 + }, + { + "epoch": 1.5352509870276367, + "grad_norm": 2.9047441482543945, + "learning_rate": 4.232656514382403e-05, + "loss": 0.8758, + "step": 2722 + }, + { + "epoch": 1.535815002820079, + "grad_norm": 1.5288869142532349, + "learning_rate": 4.232374506486182e-05, + "loss": 0.7794, + "step": 2723 + }, + { + "epoch": 1.536379018612521, + "grad_norm": 2.336473226547241, + "learning_rate": 4.232092498589961e-05, + "loss": 0.9495, + "step": 2724 + }, + { + "epoch": 1.5369430344049633, + "grad_norm": 3.4635536670684814, + "learning_rate": 4.2318104906937396e-05, + "loss": 0.7793, + "step": 2725 + }, + { + "epoch": 1.5375070501974055, + "grad_norm": 2.5464370250701904, + "learning_rate": 4.231528482797519e-05, + "loss": 0.8365, + "step": 2726 + }, + { + "epoch": 1.5380710659898478, + "grad_norm": 2.1653666496276855, + "learning_rate": 4.231246474901297e-05, + "loss": 0.8077, + "step": 2727 + }, + { + "epoch": 1.53863508178229, + "grad_norm": 1.183105230331421, + "learning_rate": 4.2309644670050765e-05, + "loss": 0.7759, + "step": 2728 + }, + { + "epoch": 1.539199097574732, + "grad_norm": 2.3023505210876465, + "learning_rate": 4.230682459108855e-05, + "loss": 0.8115, + "step": 2729 + }, + { + "epoch": 1.5397631133671743, + "grad_norm": 4.797541618347168, + "learning_rate": 4.230400451212634e-05, + "loss": 0.8008, + "step": 2730 + }, + { + "epoch": 1.5403271291596163, + "grad_norm": 4.219842910766602, + "learning_rate": 4.230118443316413e-05, + "loss": 0.9388, + "step": 2731 + }, + { + "epoch": 1.5408911449520586, + "grad_norm": 2.2454192638397217, + "learning_rate": 4.229836435420192e-05, + "loss": 0.703, + "step": 2732 + }, + { + "epoch": 1.5414551607445008, + "grad_norm": 4.062036037445068, + "learning_rate": 4.2295544275239706e-05, + "loss": 0.9761, + "step": 2733 + }, + { + "epoch": 1.542019176536943, + "grad_norm": 3.8579282760620117, + "learning_rate": 4.22927241962775e-05, + "loss": 0.9533, + "step": 2734 + }, + { + "epoch": 1.5425831923293853, + "grad_norm": 1.3839318752288818, + "learning_rate": 4.228990411731529e-05, + "loss": 0.8783, + "step": 2735 + }, + { + "epoch": 1.5431472081218274, + "grad_norm": 4.125, + "learning_rate": 4.2287084038353076e-05, + "loss": 0.7656, + "step": 2736 + }, + { + "epoch": 1.5437112239142696, + "grad_norm": 4.473674297332764, + "learning_rate": 4.228426395939086e-05, + "loss": 1.1985, + "step": 2737 + }, + { + "epoch": 1.5442752397067117, + "grad_norm": 1.719804286956787, + "learning_rate": 4.2281443880428653e-05, + "loss": 0.7844, + "step": 2738 + }, + { + "epoch": 1.544839255499154, + "grad_norm": 4.013195037841797, + "learning_rate": 4.2278623801466446e-05, + "loss": 1.0499, + "step": 2739 + }, + { + "epoch": 1.5454032712915962, + "grad_norm": 3.0870964527130127, + "learning_rate": 4.227580372250423e-05, + "loss": 0.8735, + "step": 2740 + }, + { + "epoch": 1.5459672870840384, + "grad_norm": 3.503981351852417, + "learning_rate": 4.2272983643542016e-05, + "loss": 0.9583, + "step": 2741 + }, + { + "epoch": 1.5465313028764807, + "grad_norm": 1.9686938524246216, + "learning_rate": 4.2270163564579815e-05, + "loss": 0.7485, + "step": 2742 + }, + { + "epoch": 1.5470953186689227, + "grad_norm": 1.608712911605835, + "learning_rate": 4.22673434856176e-05, + "loss": 0.6771, + "step": 2743 + }, + { + "epoch": 1.547659334461365, + "grad_norm": 1.4640681743621826, + "learning_rate": 4.2264523406655386e-05, + "loss": 0.6527, + "step": 2744 + }, + { + "epoch": 1.548223350253807, + "grad_norm": 2.257899045944214, + "learning_rate": 4.226170332769318e-05, + "loss": 0.809, + "step": 2745 + }, + { + "epoch": 1.5487873660462492, + "grad_norm": 1.4433192014694214, + "learning_rate": 4.225888324873097e-05, + "loss": 0.8128, + "step": 2746 + }, + { + "epoch": 1.5493513818386915, + "grad_norm": 1.9526954889297485, + "learning_rate": 4.2256063169768756e-05, + "loss": 0.7338, + "step": 2747 + }, + { + "epoch": 1.5499153976311337, + "grad_norm": 2.2218170166015625, + "learning_rate": 4.225324309080654e-05, + "loss": 0.8003, + "step": 2748 + }, + { + "epoch": 1.550479413423576, + "grad_norm": 3.6623919010162354, + "learning_rate": 4.2250423011844334e-05, + "loss": 0.8849, + "step": 2749 + }, + { + "epoch": 1.551043429216018, + "grad_norm": 2.4564566612243652, + "learning_rate": 4.2247602932882126e-05, + "loss": 0.8483, + "step": 2750 + }, + { + "epoch": 1.5516074450084603, + "grad_norm": 3.192899227142334, + "learning_rate": 4.224478285391991e-05, + "loss": 0.8852, + "step": 2751 + }, + { + "epoch": 1.5521714608009023, + "grad_norm": 2.817737340927124, + "learning_rate": 4.2241962774957697e-05, + "loss": 0.9194, + "step": 2752 + }, + { + "epoch": 1.5527354765933445, + "grad_norm": 4.426657199859619, + "learning_rate": 4.223914269599549e-05, + "loss": 0.9515, + "step": 2753 + }, + { + "epoch": 1.5532994923857868, + "grad_norm": 4.27437162399292, + "learning_rate": 4.223632261703328e-05, + "loss": 0.905, + "step": 2754 + }, + { + "epoch": 1.553863508178229, + "grad_norm": 3.6916356086730957, + "learning_rate": 4.2233502538071066e-05, + "loss": 0.8882, + "step": 2755 + }, + { + "epoch": 1.5544275239706713, + "grad_norm": 2.0691795349121094, + "learning_rate": 4.223068245910886e-05, + "loss": 0.748, + "step": 2756 + }, + { + "epoch": 1.5549915397631133, + "grad_norm": 1.6999636888504028, + "learning_rate": 4.222786238014665e-05, + "loss": 0.7162, + "step": 2757 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 2.4251582622528076, + "learning_rate": 4.2225042301184436e-05, + "loss": 0.8449, + "step": 2758 + }, + { + "epoch": 1.5561195713479976, + "grad_norm": 2.175359010696411, + "learning_rate": 4.222222222222222e-05, + "loss": 0.7536, + "step": 2759 + }, + { + "epoch": 1.5566835871404399, + "grad_norm": 2.4466798305511475, + "learning_rate": 4.2219402143260014e-05, + "loss": 0.9323, + "step": 2760 + }, + { + "epoch": 1.5572476029328821, + "grad_norm": 4.106523036956787, + "learning_rate": 4.2216582064297806e-05, + "loss": 0.9119, + "step": 2761 + }, + { + "epoch": 1.5578116187253244, + "grad_norm": 2.6054604053497314, + "learning_rate": 4.221376198533559e-05, + "loss": 0.7995, + "step": 2762 + }, + { + "epoch": 1.5583756345177666, + "grad_norm": 5.784640312194824, + "learning_rate": 4.2210941906373383e-05, + "loss": 1.1884, + "step": 2763 + }, + { + "epoch": 1.5589396503102086, + "grad_norm": 4.437988758087158, + "learning_rate": 4.220812182741117e-05, + "loss": 0.9266, + "step": 2764 + }, + { + "epoch": 1.559503666102651, + "grad_norm": 1.3679732084274292, + "learning_rate": 4.220530174844896e-05, + "loss": 0.764, + "step": 2765 + }, + { + "epoch": 1.560067681895093, + "grad_norm": 2.233797311782837, + "learning_rate": 4.2202481669486746e-05, + "loss": 0.8073, + "step": 2766 + }, + { + "epoch": 1.5606316976875352, + "grad_norm": 1.1360054016113281, + "learning_rate": 4.219966159052454e-05, + "loss": 0.6274, + "step": 2767 + }, + { + "epoch": 1.5611957134799774, + "grad_norm": 6.164411544799805, + "learning_rate": 4.2196841511562324e-05, + "loss": 1.0645, + "step": 2768 + }, + { + "epoch": 1.5617597292724197, + "grad_norm": 2.085322856903076, + "learning_rate": 4.2194021432600116e-05, + "loss": 0.7253, + "step": 2769 + }, + { + "epoch": 1.562323745064862, + "grad_norm": 2.178006649017334, + "learning_rate": 4.21912013536379e-05, + "loss": 0.7525, + "step": 2770 + }, + { + "epoch": 1.562887760857304, + "grad_norm": 1.9820976257324219, + "learning_rate": 4.2188381274675694e-05, + "loss": 0.7792, + "step": 2771 + }, + { + "epoch": 1.5634517766497462, + "grad_norm": 2.3002521991729736, + "learning_rate": 4.218556119571348e-05, + "loss": 0.8627, + "step": 2772 + }, + { + "epoch": 1.5640157924421882, + "grad_norm": 3.944929838180542, + "learning_rate": 4.218274111675127e-05, + "loss": 0.8854, + "step": 2773 + }, + { + "epoch": 1.5645798082346305, + "grad_norm": 3.599632740020752, + "learning_rate": 4.2179921037789064e-05, + "loss": 0.8412, + "step": 2774 + }, + { + "epoch": 1.5651438240270727, + "grad_norm": 5.0384016036987305, + "learning_rate": 4.217710095882685e-05, + "loss": 1.1903, + "step": 2775 + }, + { + "epoch": 1.565707839819515, + "grad_norm": 2.614184617996216, + "learning_rate": 4.2174280879864634e-05, + "loss": 0.7943, + "step": 2776 + }, + { + "epoch": 1.5662718556119573, + "grad_norm": 2.8192076683044434, + "learning_rate": 4.217146080090243e-05, + "loss": 0.8186, + "step": 2777 + }, + { + "epoch": 1.5668358714043993, + "grad_norm": 2.7647767066955566, + "learning_rate": 4.216864072194022e-05, + "loss": 0.7558, + "step": 2778 + }, + { + "epoch": 1.5673998871968415, + "grad_norm": 1.3104674816131592, + "learning_rate": 4.2165820642978004e-05, + "loss": 0.6887, + "step": 2779 + }, + { + "epoch": 1.5679639029892836, + "grad_norm": 1.8005448579788208, + "learning_rate": 4.216300056401579e-05, + "loss": 0.8128, + "step": 2780 + }, + { + "epoch": 1.5685279187817258, + "grad_norm": 4.38253116607666, + "learning_rate": 4.216018048505359e-05, + "loss": 0.9815, + "step": 2781 + }, + { + "epoch": 1.569091934574168, + "grad_norm": 4.442195415496826, + "learning_rate": 4.2157360406091374e-05, + "loss": 0.9268, + "step": 2782 + }, + { + "epoch": 1.5696559503666103, + "grad_norm": 2.7637417316436768, + "learning_rate": 4.215454032712916e-05, + "loss": 0.8922, + "step": 2783 + }, + { + "epoch": 1.5702199661590526, + "grad_norm": 2.1290018558502197, + "learning_rate": 4.2151720248166945e-05, + "loss": 0.87, + "step": 2784 + }, + { + "epoch": 1.5707839819514946, + "grad_norm": 1.9181612730026245, + "learning_rate": 4.2148900169204744e-05, + "loss": 0.7226, + "step": 2785 + }, + { + "epoch": 1.5713479977439369, + "grad_norm": 3.9906177520751953, + "learning_rate": 4.214608009024253e-05, + "loss": 0.9372, + "step": 2786 + }, + { + "epoch": 1.5719120135363789, + "grad_norm": 3.3934836387634277, + "learning_rate": 4.2143260011280315e-05, + "loss": 0.8893, + "step": 2787 + }, + { + "epoch": 1.5724760293288211, + "grad_norm": 1.2777904272079468, + "learning_rate": 4.214043993231811e-05, + "loss": 0.7278, + "step": 2788 + }, + { + "epoch": 1.5730400451212634, + "grad_norm": 1.679260015487671, + "learning_rate": 4.21376198533559e-05, + "loss": 0.7604, + "step": 2789 + }, + { + "epoch": 1.5736040609137056, + "grad_norm": 2.222649574279785, + "learning_rate": 4.2134799774393684e-05, + "loss": 0.7223, + "step": 2790 + }, + { + "epoch": 1.574168076706148, + "grad_norm": 2.0859761238098145, + "learning_rate": 4.213197969543147e-05, + "loss": 0.7576, + "step": 2791 + }, + { + "epoch": 1.57473209249859, + "grad_norm": 3.0179450511932373, + "learning_rate": 4.212915961646927e-05, + "loss": 1.001, + "step": 2792 + }, + { + "epoch": 1.5752961082910322, + "grad_norm": 3.805957317352295, + "learning_rate": 4.2126339537507054e-05, + "loss": 0.8553, + "step": 2793 + }, + { + "epoch": 1.5758601240834742, + "grad_norm": 2.2135496139526367, + "learning_rate": 4.212351945854484e-05, + "loss": 0.7945, + "step": 2794 + }, + { + "epoch": 1.5764241398759165, + "grad_norm": 4.041080474853516, + "learning_rate": 4.212069937958263e-05, + "loss": 0.9153, + "step": 2795 + }, + { + "epoch": 1.5769881556683587, + "grad_norm": 2.449293375015259, + "learning_rate": 4.2117879300620424e-05, + "loss": 0.8341, + "step": 2796 + }, + { + "epoch": 1.577552171460801, + "grad_norm": 1.7004756927490234, + "learning_rate": 4.211505922165821e-05, + "loss": 0.8006, + "step": 2797 + }, + { + "epoch": 1.5781161872532432, + "grad_norm": 2.197509527206421, + "learning_rate": 4.2112239142695995e-05, + "loss": 0.816, + "step": 2798 + }, + { + "epoch": 1.5786802030456852, + "grad_norm": 2.9554097652435303, + "learning_rate": 4.210941906373379e-05, + "loss": 0.8136, + "step": 2799 + }, + { + "epoch": 1.5792442188381275, + "grad_norm": 3.0138213634490967, + "learning_rate": 4.210659898477158e-05, + "loss": 0.9492, + "step": 2800 + }, + { + "epoch": 1.5798082346305695, + "grad_norm": 4.096212863922119, + "learning_rate": 4.2103778905809365e-05, + "loss": 1.0503, + "step": 2801 + }, + { + "epoch": 1.5803722504230118, + "grad_norm": 1.4300830364227295, + "learning_rate": 4.210095882684715e-05, + "loss": 0.7217, + "step": 2802 + }, + { + "epoch": 1.580936266215454, + "grad_norm": 3.426555871963501, + "learning_rate": 4.209813874788494e-05, + "loss": 0.8301, + "step": 2803 + }, + { + "epoch": 1.5815002820078963, + "grad_norm": 2.9117586612701416, + "learning_rate": 4.2095318668922734e-05, + "loss": 0.7889, + "step": 2804 + }, + { + "epoch": 1.5820642978003385, + "grad_norm": 2.0105526447296143, + "learning_rate": 4.209249858996052e-05, + "loss": 0.9228, + "step": 2805 + }, + { + "epoch": 1.5826283135927806, + "grad_norm": 2.401289939880371, + "learning_rate": 4.208967851099831e-05, + "loss": 0.8294, + "step": 2806 + }, + { + "epoch": 1.5831923293852228, + "grad_norm": 1.923640251159668, + "learning_rate": 4.20868584320361e-05, + "loss": 0.888, + "step": 2807 + }, + { + "epoch": 1.5837563451776648, + "grad_norm": 2.379230260848999, + "learning_rate": 4.208403835307389e-05, + "loss": 0.7777, + "step": 2808 + }, + { + "epoch": 1.584320360970107, + "grad_norm": 1.4294377565383911, + "learning_rate": 4.2081218274111675e-05, + "loss": 0.6139, + "step": 2809 + }, + { + "epoch": 1.5848843767625493, + "grad_norm": 2.9221882820129395, + "learning_rate": 4.207839819514947e-05, + "loss": 0.7415, + "step": 2810 + }, + { + "epoch": 1.5854483925549916, + "grad_norm": 3.281014919281006, + "learning_rate": 4.207557811618725e-05, + "loss": 0.9236, + "step": 2811 + }, + { + "epoch": 1.5860124083474338, + "grad_norm": 2.7529332637786865, + "learning_rate": 4.2072758037225045e-05, + "loss": 0.795, + "step": 2812 + }, + { + "epoch": 1.5865764241398759, + "grad_norm": 1.6474709510803223, + "learning_rate": 4.206993795826284e-05, + "loss": 0.8043, + "step": 2813 + }, + { + "epoch": 1.5871404399323181, + "grad_norm": 2.1285595893859863, + "learning_rate": 4.206711787930062e-05, + "loss": 0.7897, + "step": 2814 + }, + { + "epoch": 1.5877044557247602, + "grad_norm": 2.35612416267395, + "learning_rate": 4.206429780033841e-05, + "loss": 0.8853, + "step": 2815 + }, + { + "epoch": 1.5882684715172024, + "grad_norm": 4.138184547424316, + "learning_rate": 4.20614777213762e-05, + "loss": 0.8371, + "step": 2816 + }, + { + "epoch": 1.5888324873096447, + "grad_norm": 2.9376437664031982, + "learning_rate": 4.205865764241399e-05, + "loss": 0.8908, + "step": 2817 + }, + { + "epoch": 1.589396503102087, + "grad_norm": 3.9222939014434814, + "learning_rate": 4.205583756345178e-05, + "loss": 0.9857, + "step": 2818 + }, + { + "epoch": 1.5899605188945292, + "grad_norm": 2.70725417137146, + "learning_rate": 4.205301748448956e-05, + "loss": 0.8423, + "step": 2819 + }, + { + "epoch": 1.5905245346869712, + "grad_norm": 1.5869165658950806, + "learning_rate": 4.2050197405527355e-05, + "loss": 0.9285, + "step": 2820 + }, + { + "epoch": 1.5910885504794134, + "grad_norm": 1.7470983266830444, + "learning_rate": 4.204737732656515e-05, + "loss": 0.751, + "step": 2821 + }, + { + "epoch": 1.5916525662718555, + "grad_norm": 2.760617733001709, + "learning_rate": 4.204455724760293e-05, + "loss": 0.8334, + "step": 2822 + }, + { + "epoch": 1.5922165820642977, + "grad_norm": 1.842158317565918, + "learning_rate": 4.204173716864072e-05, + "loss": 0.8338, + "step": 2823 + }, + { + "epoch": 1.59278059785674, + "grad_norm": 1.9199355840682983, + "learning_rate": 4.203891708967852e-05, + "loss": 0.7095, + "step": 2824 + }, + { + "epoch": 1.5933446136491822, + "grad_norm": 3.067873954772949, + "learning_rate": 4.20360970107163e-05, + "loss": 0.8752, + "step": 2825 + }, + { + "epoch": 1.5939086294416245, + "grad_norm": 2.1084232330322266, + "learning_rate": 4.203327693175409e-05, + "loss": 0.9092, + "step": 2826 + }, + { + "epoch": 1.5944726452340665, + "grad_norm": 1.5863184928894043, + "learning_rate": 4.203045685279188e-05, + "loss": 0.742, + "step": 2827 + }, + { + "epoch": 1.5950366610265088, + "grad_norm": 2.591705322265625, + "learning_rate": 4.202763677382967e-05, + "loss": 0.8165, + "step": 2828 + }, + { + "epoch": 1.5956006768189508, + "grad_norm": 3.6074612140655518, + "learning_rate": 4.202481669486746e-05, + "loss": 0.8406, + "step": 2829 + }, + { + "epoch": 1.596164692611393, + "grad_norm": 2.51784348487854, + "learning_rate": 4.202199661590524e-05, + "loss": 0.7345, + "step": 2830 + }, + { + "epoch": 1.5967287084038353, + "grad_norm": 2.9195895195007324, + "learning_rate": 4.201917653694304e-05, + "loss": 0.8781, + "step": 2831 + }, + { + "epoch": 1.5972927241962775, + "grad_norm": 2.4800658226013184, + "learning_rate": 4.201635645798083e-05, + "loss": 0.706, + "step": 2832 + }, + { + "epoch": 1.5978567399887198, + "grad_norm": 3.1692123413085938, + "learning_rate": 4.201353637901861e-05, + "loss": 0.843, + "step": 2833 + }, + { + "epoch": 1.5984207557811618, + "grad_norm": 3.336965560913086, + "learning_rate": 4.2010716300056405e-05, + "loss": 0.9478, + "step": 2834 + }, + { + "epoch": 1.598984771573604, + "grad_norm": 1.2831710577011108, + "learning_rate": 4.20078962210942e-05, + "loss": 0.7357, + "step": 2835 + }, + { + "epoch": 1.599548787366046, + "grad_norm": 1.4374072551727295, + "learning_rate": 4.200507614213198e-05, + "loss": 0.7809, + "step": 2836 + }, + { + "epoch": 1.6001128031584884, + "grad_norm": 1.3036569356918335, + "learning_rate": 4.200225606316977e-05, + "loss": 0.7666, + "step": 2837 + }, + { + "epoch": 1.6006768189509306, + "grad_norm": 3.5638339519500732, + "learning_rate": 4.199943598420756e-05, + "loss": 0.9231, + "step": 2838 + }, + { + "epoch": 1.6012408347433729, + "grad_norm": 1.7627538442611694, + "learning_rate": 4.199661590524535e-05, + "loss": 0.7089, + "step": 2839 + }, + { + "epoch": 1.6018048505358151, + "grad_norm": 1.6464890241622925, + "learning_rate": 4.199379582628314e-05, + "loss": 0.845, + "step": 2840 + }, + { + "epoch": 1.6023688663282571, + "grad_norm": 2.1253879070281982, + "learning_rate": 4.199097574732092e-05, + "loss": 0.8899, + "step": 2841 + }, + { + "epoch": 1.6029328821206994, + "grad_norm": 2.36291766166687, + "learning_rate": 4.1988155668358715e-05, + "loss": 0.8021, + "step": 2842 + }, + { + "epoch": 1.6034968979131414, + "grad_norm": 2.24023175239563, + "learning_rate": 4.198533558939651e-05, + "loss": 0.8072, + "step": 2843 + }, + { + "epoch": 1.6040609137055837, + "grad_norm": 3.014163017272949, + "learning_rate": 4.198251551043429e-05, + "loss": 0.822, + "step": 2844 + }, + { + "epoch": 1.604624929498026, + "grad_norm": 2.8069519996643066, + "learning_rate": 4.1979695431472085e-05, + "loss": 0.8118, + "step": 2845 + }, + { + "epoch": 1.6051889452904682, + "grad_norm": 2.9785995483398438, + "learning_rate": 4.197687535250987e-05, + "loss": 0.8109, + "step": 2846 + }, + { + "epoch": 1.6057529610829104, + "grad_norm": 2.533046245574951, + "learning_rate": 4.197405527354766e-05, + "loss": 0.7292, + "step": 2847 + }, + { + "epoch": 1.6063169768753525, + "grad_norm": 1.5347938537597656, + "learning_rate": 4.197123519458545e-05, + "loss": 0.8492, + "step": 2848 + }, + { + "epoch": 1.6068809926677947, + "grad_norm": 2.6885077953338623, + "learning_rate": 4.196841511562324e-05, + "loss": 0.8509, + "step": 2849 + }, + { + "epoch": 1.6074450084602367, + "grad_norm": 4.083089828491211, + "learning_rate": 4.1965595036661026e-05, + "loss": 0.9028, + "step": 2850 + }, + { + "epoch": 1.608009024252679, + "grad_norm": 3.099421262741089, + "learning_rate": 4.196277495769882e-05, + "loss": 0.9239, + "step": 2851 + }, + { + "epoch": 1.6085730400451212, + "grad_norm": 2.536635398864746, + "learning_rate": 4.195995487873661e-05, + "loss": 0.8099, + "step": 2852 + }, + { + "epoch": 1.6091370558375635, + "grad_norm": 1.6501357555389404, + "learning_rate": 4.1957134799774396e-05, + "loss": 0.7469, + "step": 2853 + }, + { + "epoch": 1.6097010716300058, + "grad_norm": 2.7854087352752686, + "learning_rate": 4.195431472081218e-05, + "loss": 0.7512, + "step": 2854 + }, + { + "epoch": 1.6102650874224478, + "grad_norm": 3.029670238494873, + "learning_rate": 4.195149464184997e-05, + "loss": 0.8939, + "step": 2855 + }, + { + "epoch": 1.61082910321489, + "grad_norm": 2.149416446685791, + "learning_rate": 4.1948674562887765e-05, + "loss": 0.9012, + "step": 2856 + }, + { + "epoch": 1.611393119007332, + "grad_norm": 2.5264294147491455, + "learning_rate": 4.194585448392555e-05, + "loss": 0.8235, + "step": 2857 + }, + { + "epoch": 1.6119571347997743, + "grad_norm": 4.050015449523926, + "learning_rate": 4.1943034404963336e-05, + "loss": 0.9628, + "step": 2858 + }, + { + "epoch": 1.6125211505922166, + "grad_norm": 2.16375994682312, + "learning_rate": 4.194021432600113e-05, + "loss": 0.7136, + "step": 2859 + }, + { + "epoch": 1.6130851663846588, + "grad_norm": 2.262840509414673, + "learning_rate": 4.193739424703892e-05, + "loss": 0.7668, + "step": 2860 + }, + { + "epoch": 1.613649182177101, + "grad_norm": 1.733307957649231, + "learning_rate": 4.1934574168076706e-05, + "loss": 0.7825, + "step": 2861 + }, + { + "epoch": 1.614213197969543, + "grad_norm": 2.8579041957855225, + "learning_rate": 4.19317540891145e-05, + "loss": 0.8703, + "step": 2862 + }, + { + "epoch": 1.6147772137619854, + "grad_norm": 2.5574469566345215, + "learning_rate": 4.192893401015229e-05, + "loss": 0.9318, + "step": 2863 + }, + { + "epoch": 1.6153412295544274, + "grad_norm": 1.7581019401550293, + "learning_rate": 4.1926113931190076e-05, + "loss": 0.8101, + "step": 2864 + }, + { + "epoch": 1.6159052453468696, + "grad_norm": 1.4658771753311157, + "learning_rate": 4.192329385222786e-05, + "loss": 0.7417, + "step": 2865 + }, + { + "epoch": 1.6164692611393119, + "grad_norm": 3.831343650817871, + "learning_rate": 4.192047377326565e-05, + "loss": 0.9297, + "step": 2866 + }, + { + "epoch": 1.6170332769317541, + "grad_norm": 2.361738443374634, + "learning_rate": 4.1917653694303445e-05, + "loss": 0.9739, + "step": 2867 + }, + { + "epoch": 1.6175972927241964, + "grad_norm": 1.4587968587875366, + "learning_rate": 4.191483361534123e-05, + "loss": 0.8047, + "step": 2868 + }, + { + "epoch": 1.6181613085166384, + "grad_norm": 2.3187520503997803, + "learning_rate": 4.1912013536379016e-05, + "loss": 0.9055, + "step": 2869 + }, + { + "epoch": 1.6187253243090807, + "grad_norm": 2.2035748958587646, + "learning_rate": 4.1909193457416815e-05, + "loss": 0.8181, + "step": 2870 + }, + { + "epoch": 1.6192893401015227, + "grad_norm": 1.466405987739563, + "learning_rate": 4.19063733784546e-05, + "loss": 0.7576, + "step": 2871 + }, + { + "epoch": 1.619853355893965, + "grad_norm": 2.2931313514709473, + "learning_rate": 4.1903553299492386e-05, + "loss": 0.6993, + "step": 2872 + }, + { + "epoch": 1.6204173716864072, + "grad_norm": 2.404047966003418, + "learning_rate": 4.190073322053018e-05, + "loss": 0.7908, + "step": 2873 + }, + { + "epoch": 1.6209813874788495, + "grad_norm": 2.7128849029541016, + "learning_rate": 4.189791314156797e-05, + "loss": 0.8269, + "step": 2874 + }, + { + "epoch": 1.6215454032712917, + "grad_norm": 2.08723783493042, + "learning_rate": 4.1895093062605756e-05, + "loss": 0.8351, + "step": 2875 + }, + { + "epoch": 1.6221094190637337, + "grad_norm": 2.475386381149292, + "learning_rate": 4.189227298364354e-05, + "loss": 0.7782, + "step": 2876 + }, + { + "epoch": 1.622673434856176, + "grad_norm": 1.4326997995376587, + "learning_rate": 4.1889452904681333e-05, + "loss": 0.6982, + "step": 2877 + }, + { + "epoch": 1.623237450648618, + "grad_norm": 3.8401243686676025, + "learning_rate": 4.1886632825719126e-05, + "loss": 0.8569, + "step": 2878 + }, + { + "epoch": 1.6238014664410603, + "grad_norm": 2.3274643421173096, + "learning_rate": 4.188381274675691e-05, + "loss": 0.792, + "step": 2879 + }, + { + "epoch": 1.6243654822335025, + "grad_norm": 1.6200215816497803, + "learning_rate": 4.1880992667794696e-05, + "loss": 0.7777, + "step": 2880 + }, + { + "epoch": 1.6249294980259448, + "grad_norm": 4.65693998336792, + "learning_rate": 4.187817258883249e-05, + "loss": 0.8479, + "step": 2881 + }, + { + "epoch": 1.625493513818387, + "grad_norm": 1.886505365371704, + "learning_rate": 4.187535250987028e-05, + "loss": 0.7484, + "step": 2882 + }, + { + "epoch": 1.626057529610829, + "grad_norm": 3.64811110496521, + "learning_rate": 4.1872532430908066e-05, + "loss": 0.874, + "step": 2883 + }, + { + "epoch": 1.6266215454032713, + "grad_norm": 2.6703083515167236, + "learning_rate": 4.186971235194586e-05, + "loss": 0.855, + "step": 2884 + }, + { + "epoch": 1.6271855611957133, + "grad_norm": 3.2324235439300537, + "learning_rate": 4.1866892272983644e-05, + "loss": 0.8904, + "step": 2885 + }, + { + "epoch": 1.6277495769881556, + "grad_norm": 3.5729684829711914, + "learning_rate": 4.1864072194021436e-05, + "loss": 0.8084, + "step": 2886 + }, + { + "epoch": 1.6283135927805978, + "grad_norm": 1.9552663564682007, + "learning_rate": 4.186125211505922e-05, + "loss": 0.8069, + "step": 2887 + }, + { + "epoch": 1.62887760857304, + "grad_norm": 1.5418587923049927, + "learning_rate": 4.1858432036097014e-05, + "loss": 0.7769, + "step": 2888 + }, + { + "epoch": 1.6294416243654823, + "grad_norm": 1.670638918876648, + "learning_rate": 4.18556119571348e-05, + "loss": 0.6992, + "step": 2889 + }, + { + "epoch": 1.6300056401579244, + "grad_norm": 1.608277678489685, + "learning_rate": 4.185279187817259e-05, + "loss": 0.7644, + "step": 2890 + }, + { + "epoch": 1.6305696559503666, + "grad_norm": 2.617133140563965, + "learning_rate": 4.184997179921038e-05, + "loss": 0.7938, + "step": 2891 + }, + { + "epoch": 1.6311336717428087, + "grad_norm": 1.6060054302215576, + "learning_rate": 4.184715172024817e-05, + "loss": 0.6953, + "step": 2892 + }, + { + "epoch": 1.631697687535251, + "grad_norm": 2.9722740650177, + "learning_rate": 4.1844331641285954e-05, + "loss": 0.8702, + "step": 2893 + }, + { + "epoch": 1.6322617033276932, + "grad_norm": 3.9911394119262695, + "learning_rate": 4.1841511562323746e-05, + "loss": 0.7693, + "step": 2894 + }, + { + "epoch": 1.6328257191201354, + "grad_norm": 4.655816078186035, + "learning_rate": 4.183869148336154e-05, + "loss": 0.9524, + "step": 2895 + }, + { + "epoch": 1.6333897349125777, + "grad_norm": 2.3189492225646973, + "learning_rate": 4.1835871404399324e-05, + "loss": 0.7665, + "step": 2896 + }, + { + "epoch": 1.6339537507050197, + "grad_norm": 2.256235122680664, + "learning_rate": 4.183305132543711e-05, + "loss": 0.7806, + "step": 2897 + }, + { + "epoch": 1.634517766497462, + "grad_norm": 3.2728774547576904, + "learning_rate": 4.18302312464749e-05, + "loss": 0.924, + "step": 2898 + }, + { + "epoch": 1.635081782289904, + "grad_norm": 1.7380565404891968, + "learning_rate": 4.1827411167512694e-05, + "loss": 0.7928, + "step": 2899 + }, + { + "epoch": 1.6356457980823462, + "grad_norm": 1.4523407220840454, + "learning_rate": 4.182459108855048e-05, + "loss": 0.8287, + "step": 2900 + }, + { + "epoch": 1.6362098138747885, + "grad_norm": 3.2949578762054443, + "learning_rate": 4.182177100958827e-05, + "loss": 0.8113, + "step": 2901 + }, + { + "epoch": 1.6367738296672307, + "grad_norm": 2.2125446796417236, + "learning_rate": 4.1818950930626063e-05, + "loss": 0.7153, + "step": 2902 + }, + { + "epoch": 1.637337845459673, + "grad_norm": 1.6325109004974365, + "learning_rate": 4.181613085166385e-05, + "loss": 0.8524, + "step": 2903 + }, + { + "epoch": 1.637901861252115, + "grad_norm": 2.384798526763916, + "learning_rate": 4.1813310772701634e-05, + "loss": 0.7081, + "step": 2904 + }, + { + "epoch": 1.6384658770445573, + "grad_norm": 1.8741815090179443, + "learning_rate": 4.1810490693739426e-05, + "loss": 0.7176, + "step": 2905 + }, + { + "epoch": 1.6390298928369993, + "grad_norm": 1.2085474729537964, + "learning_rate": 4.180767061477722e-05, + "loss": 0.7039, + "step": 2906 + }, + { + "epoch": 1.6395939086294415, + "grad_norm": 3.4673655033111572, + "learning_rate": 4.1804850535815004e-05, + "loss": 0.8442, + "step": 2907 + }, + { + "epoch": 1.6401579244218838, + "grad_norm": 2.483916759490967, + "learning_rate": 4.180203045685279e-05, + "loss": 0.7075, + "step": 2908 + }, + { + "epoch": 1.640721940214326, + "grad_norm": 4.167349338531494, + "learning_rate": 4.179921037789059e-05, + "loss": 0.9161, + "step": 2909 + }, + { + "epoch": 1.6412859560067683, + "grad_norm": 3.6509807109832764, + "learning_rate": 4.1796390298928374e-05, + "loss": 0.9724, + "step": 2910 + }, + { + "epoch": 1.6418499717992103, + "grad_norm": 2.174196720123291, + "learning_rate": 4.179357021996616e-05, + "loss": 0.8968, + "step": 2911 + }, + { + "epoch": 1.6424139875916526, + "grad_norm": 3.7271881103515625, + "learning_rate": 4.179075014100395e-05, + "loss": 0.7798, + "step": 2912 + }, + { + "epoch": 1.6429780033840946, + "grad_norm": 3.7013087272644043, + "learning_rate": 4.1787930062041744e-05, + "loss": 0.8873, + "step": 2913 + }, + { + "epoch": 1.6435420191765369, + "grad_norm": 3.7088725566864014, + "learning_rate": 4.178510998307953e-05, + "loss": 0.8543, + "step": 2914 + }, + { + "epoch": 1.6441060349689791, + "grad_norm": 3.670149803161621, + "learning_rate": 4.1782289904117314e-05, + "loss": 0.8397, + "step": 2915 + }, + { + "epoch": 1.6446700507614214, + "grad_norm": 1.360365629196167, + "learning_rate": 4.177946982515511e-05, + "loss": 0.8214, + "step": 2916 + }, + { + "epoch": 1.6452340665538636, + "grad_norm": 1.6321829557418823, + "learning_rate": 4.17766497461929e-05, + "loss": 0.8484, + "step": 2917 + }, + { + "epoch": 1.6457980823463056, + "grad_norm": 4.07252836227417, + "learning_rate": 4.1773829667230684e-05, + "loss": 0.8969, + "step": 2918 + }, + { + "epoch": 1.646362098138748, + "grad_norm": 1.7541847229003906, + "learning_rate": 4.177100958826847e-05, + "loss": 0.729, + "step": 2919 + }, + { + "epoch": 1.64692611393119, + "grad_norm": 1.713841438293457, + "learning_rate": 4.176818950930626e-05, + "loss": 0.865, + "step": 2920 + }, + { + "epoch": 1.6474901297236322, + "grad_norm": 2.2217705249786377, + "learning_rate": 4.1765369430344054e-05, + "loss": 0.8116, + "step": 2921 + }, + { + "epoch": 1.6480541455160744, + "grad_norm": 3.2224960327148438, + "learning_rate": 4.176254935138184e-05, + "loss": 0.9047, + "step": 2922 + }, + { + "epoch": 1.6486181613085167, + "grad_norm": 2.4128167629241943, + "learning_rate": 4.175972927241963e-05, + "loss": 0.8466, + "step": 2923 + }, + { + "epoch": 1.649182177100959, + "grad_norm": 2.2820703983306885, + "learning_rate": 4.175690919345742e-05, + "loss": 0.8768, + "step": 2924 + }, + { + "epoch": 1.649746192893401, + "grad_norm": 3.5262935161590576, + "learning_rate": 4.175408911449521e-05, + "loss": 0.7924, + "step": 2925 + }, + { + "epoch": 1.6503102086858432, + "grad_norm": 4.737199306488037, + "learning_rate": 4.1751269035532995e-05, + "loss": 1.0154, + "step": 2926 + }, + { + "epoch": 1.6508742244782852, + "grad_norm": 1.7731355428695679, + "learning_rate": 4.174844895657079e-05, + "loss": 0.8096, + "step": 2927 + }, + { + "epoch": 1.6514382402707275, + "grad_norm": 1.7755709886550903, + "learning_rate": 4.174562887760857e-05, + "loss": 0.7819, + "step": 2928 + }, + { + "epoch": 1.6520022560631697, + "grad_norm": 2.3065295219421387, + "learning_rate": 4.1742808798646364e-05, + "loss": 0.5856, + "step": 2929 + }, + { + "epoch": 1.652566271855612, + "grad_norm": 1.2893520593643188, + "learning_rate": 4.1739988719684157e-05, + "loss": 0.7255, + "step": 2930 + }, + { + "epoch": 1.6531302876480543, + "grad_norm": 3.3215014934539795, + "learning_rate": 4.173716864072194e-05, + "loss": 0.9809, + "step": 2931 + }, + { + "epoch": 1.6536943034404963, + "grad_norm": 3.0238037109375, + "learning_rate": 4.173434856175973e-05, + "loss": 0.8468, + "step": 2932 + }, + { + "epoch": 1.6542583192329385, + "grad_norm": 1.4829660654067993, + "learning_rate": 4.173152848279752e-05, + "loss": 0.7412, + "step": 2933 + }, + { + "epoch": 1.6548223350253806, + "grad_norm": 2.594024181365967, + "learning_rate": 4.172870840383531e-05, + "loss": 0.6968, + "step": 2934 + }, + { + "epoch": 1.6553863508178228, + "grad_norm": 2.111926555633545, + "learning_rate": 4.17258883248731e-05, + "loss": 0.744, + "step": 2935 + }, + { + "epoch": 1.655950366610265, + "grad_norm": 1.8945931196212769, + "learning_rate": 4.172306824591089e-05, + "loss": 0.6877, + "step": 2936 + }, + { + "epoch": 1.6565143824027073, + "grad_norm": 2.6121387481689453, + "learning_rate": 4.1720248166948675e-05, + "loss": 0.943, + "step": 2937 + }, + { + "epoch": 1.6570783981951496, + "grad_norm": 2.345804452896118, + "learning_rate": 4.171742808798647e-05, + "loss": 0.7961, + "step": 2938 + }, + { + "epoch": 1.6576424139875916, + "grad_norm": 2.1553258895874023, + "learning_rate": 4.171460800902425e-05, + "loss": 0.7948, + "step": 2939 + }, + { + "epoch": 1.6582064297800339, + "grad_norm": 2.4533777236938477, + "learning_rate": 4.1711787930062045e-05, + "loss": 0.85, + "step": 2940 + }, + { + "epoch": 1.6587704455724759, + "grad_norm": 4.059479713439941, + "learning_rate": 4.170896785109984e-05, + "loss": 0.879, + "step": 2941 + }, + { + "epoch": 1.6593344613649181, + "grad_norm": 3.4790897369384766, + "learning_rate": 4.170614777213762e-05, + "loss": 0.9234, + "step": 2942 + }, + { + "epoch": 1.6598984771573604, + "grad_norm": 3.134258985519409, + "learning_rate": 4.170332769317541e-05, + "loss": 0.879, + "step": 2943 + }, + { + "epoch": 1.6604624929498026, + "grad_norm": 1.65171217918396, + "learning_rate": 4.17005076142132e-05, + "loss": 0.6997, + "step": 2944 + }, + { + "epoch": 1.661026508742245, + "grad_norm": 1.7797211408615112, + "learning_rate": 4.169768753525099e-05, + "loss": 0.6698, + "step": 2945 + }, + { + "epoch": 1.661590524534687, + "grad_norm": 4.5997443199157715, + "learning_rate": 4.169486745628878e-05, + "loss": 1.0708, + "step": 2946 + }, + { + "epoch": 1.6621545403271292, + "grad_norm": 11.336875915527344, + "learning_rate": 4.169204737732656e-05, + "loss": 0.7613, + "step": 2947 + }, + { + "epoch": 1.6627185561195712, + "grad_norm": 2.3541088104248047, + "learning_rate": 4.168922729836436e-05, + "loss": 0.7838, + "step": 2948 + }, + { + "epoch": 1.6632825719120135, + "grad_norm": 2.8142173290252686, + "learning_rate": 4.168640721940215e-05, + "loss": 0.7726, + "step": 2949 + }, + { + "epoch": 1.6638465877044557, + "grad_norm": 2.197531223297119, + "learning_rate": 4.168358714043993e-05, + "loss": 0.7256, + "step": 2950 + }, + { + "epoch": 1.664410603496898, + "grad_norm": 2.3519537448883057, + "learning_rate": 4.168076706147772e-05, + "loss": 0.7192, + "step": 2951 + }, + { + "epoch": 1.6649746192893402, + "grad_norm": 3.4062447547912598, + "learning_rate": 4.167794698251552e-05, + "loss": 0.8943, + "step": 2952 + }, + { + "epoch": 1.6655386350817822, + "grad_norm": 2.7247488498687744, + "learning_rate": 4.16751269035533e-05, + "loss": 0.8852, + "step": 2953 + }, + { + "epoch": 1.6661026508742245, + "grad_norm": 1.613693118095398, + "learning_rate": 4.167230682459109e-05, + "loss": 0.8701, + "step": 2954 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.947088360786438, + "learning_rate": 4.166948674562888e-05, + "loss": 0.8384, + "step": 2955 + }, + { + "epoch": 1.6672306824591088, + "grad_norm": 2.573793411254883, + "learning_rate": 4.166666666666667e-05, + "loss": 0.8535, + "step": 2956 + }, + { + "epoch": 1.667794698251551, + "grad_norm": 2.4152674674987793, + "learning_rate": 4.166384658770446e-05, + "loss": 0.8637, + "step": 2957 + }, + { + "epoch": 1.6683587140439933, + "grad_norm": 2.55652117729187, + "learning_rate": 4.166102650874224e-05, + "loss": 0.8377, + "step": 2958 + }, + { + "epoch": 1.6689227298364355, + "grad_norm": 2.363001823425293, + "learning_rate": 4.1658206429780035e-05, + "loss": 0.8277, + "step": 2959 + }, + { + "epoch": 1.6694867456288776, + "grad_norm": 1.4905493259429932, + "learning_rate": 4.165538635081783e-05, + "loss": 0.8036, + "step": 2960 + }, + { + "epoch": 1.6700507614213198, + "grad_norm": 1.9259226322174072, + "learning_rate": 4.165256627185561e-05, + "loss": 0.9061, + "step": 2961 + }, + { + "epoch": 1.6706147772137618, + "grad_norm": 3.3065669536590576, + "learning_rate": 4.1649746192893405e-05, + "loss": 0.9139, + "step": 2962 + }, + { + "epoch": 1.671178793006204, + "grad_norm": 2.3231635093688965, + "learning_rate": 4.164692611393119e-05, + "loss": 0.8866, + "step": 2963 + }, + { + "epoch": 1.6717428087986463, + "grad_norm": 1.7002103328704834, + "learning_rate": 4.164410603496898e-05, + "loss": 0.7726, + "step": 2964 + }, + { + "epoch": 1.6723068245910886, + "grad_norm": 2.331843376159668, + "learning_rate": 4.164128595600677e-05, + "loss": 0.8152, + "step": 2965 + }, + { + "epoch": 1.6728708403835308, + "grad_norm": 4.645202159881592, + "learning_rate": 4.163846587704456e-05, + "loss": 0.8907, + "step": 2966 + }, + { + "epoch": 1.6734348561759729, + "grad_norm": 1.1695178747177124, + "learning_rate": 4.1635645798082345e-05, + "loss": 0.6787, + "step": 2967 + }, + { + "epoch": 1.6739988719684151, + "grad_norm": 1.9628112316131592, + "learning_rate": 4.163282571912014e-05, + "loss": 0.8027, + "step": 2968 + }, + { + "epoch": 1.6745628877608572, + "grad_norm": 2.933436632156372, + "learning_rate": 4.163000564015792e-05, + "loss": 0.8622, + "step": 2969 + }, + { + "epoch": 1.6751269035532994, + "grad_norm": 3.5164997577667236, + "learning_rate": 4.1627185561195715e-05, + "loss": 0.8911, + "step": 2970 + }, + { + "epoch": 1.6756909193457417, + "grad_norm": 2.2647716999053955, + "learning_rate": 4.162436548223351e-05, + "loss": 0.8076, + "step": 2971 + }, + { + "epoch": 1.676254935138184, + "grad_norm": 2.024862766265869, + "learning_rate": 4.162154540327129e-05, + "loss": 0.9322, + "step": 2972 + }, + { + "epoch": 1.6768189509306262, + "grad_norm": 1.906864881515503, + "learning_rate": 4.1618725324309085e-05, + "loss": 0.8042, + "step": 2973 + }, + { + "epoch": 1.6773829667230682, + "grad_norm": 2.7898895740509033, + "learning_rate": 4.161590524534687e-05, + "loss": 0.8657, + "step": 2974 + }, + { + "epoch": 1.6779469825155104, + "grad_norm": 2.2709667682647705, + "learning_rate": 4.161308516638466e-05, + "loss": 0.8777, + "step": 2975 + }, + { + "epoch": 1.6785109983079525, + "grad_norm": 2.0254621505737305, + "learning_rate": 4.161026508742245e-05, + "loss": 0.9325, + "step": 2976 + }, + { + "epoch": 1.6790750141003947, + "grad_norm": 1.971210241317749, + "learning_rate": 4.160744500846024e-05, + "loss": 0.8605, + "step": 2977 + }, + { + "epoch": 1.679639029892837, + "grad_norm": 3.7805771827697754, + "learning_rate": 4.1604624929498026e-05, + "loss": 0.8293, + "step": 2978 + }, + { + "epoch": 1.6802030456852792, + "grad_norm": 1.4243556261062622, + "learning_rate": 4.160180485053582e-05, + "loss": 0.7842, + "step": 2979 + }, + { + "epoch": 1.6807670614777215, + "grad_norm": 4.397488594055176, + "learning_rate": 4.159898477157361e-05, + "loss": 0.8255, + "step": 2980 + }, + { + "epoch": 1.6813310772701635, + "grad_norm": 3.1812918186187744, + "learning_rate": 4.1596164692611395e-05, + "loss": 0.8803, + "step": 2981 + }, + { + "epoch": 1.6818950930626058, + "grad_norm": 2.0280487537384033, + "learning_rate": 4.159334461364918e-05, + "loss": 0.7453, + "step": 2982 + }, + { + "epoch": 1.6824591088550478, + "grad_norm": 2.617924451828003, + "learning_rate": 4.159052453468697e-05, + "loss": 0.8595, + "step": 2983 + }, + { + "epoch": 1.68302312464749, + "grad_norm": 1.8802367448806763, + "learning_rate": 4.1587704455724765e-05, + "loss": 0.8058, + "step": 2984 + }, + { + "epoch": 1.6835871404399323, + "grad_norm": 2.843233823776245, + "learning_rate": 4.158488437676255e-05, + "loss": 0.7043, + "step": 2985 + }, + { + "epoch": 1.6841511562323745, + "grad_norm": 2.1422011852264404, + "learning_rate": 4.1582064297800336e-05, + "loss": 0.737, + "step": 2986 + }, + { + "epoch": 1.6847151720248168, + "grad_norm": 4.289732933044434, + "learning_rate": 4.157924421883813e-05, + "loss": 0.9032, + "step": 2987 + }, + { + "epoch": 1.6852791878172588, + "grad_norm": 5.603814125061035, + "learning_rate": 4.157642413987592e-05, + "loss": 1.0157, + "step": 2988 + }, + { + "epoch": 1.685843203609701, + "grad_norm": 4.027318954467773, + "learning_rate": 4.1573604060913706e-05, + "loss": 0.7248, + "step": 2989 + }, + { + "epoch": 1.686407219402143, + "grad_norm": 3.299344539642334, + "learning_rate": 4.157078398195149e-05, + "loss": 0.7742, + "step": 2990 + }, + { + "epoch": 1.6869712351945854, + "grad_norm": 2.313824415206909, + "learning_rate": 4.156796390298929e-05, + "loss": 0.8177, + "step": 2991 + }, + { + "epoch": 1.6875352509870276, + "grad_norm": 2.2055296897888184, + "learning_rate": 4.1565143824027076e-05, + "loss": 0.862, + "step": 2992 + }, + { + "epoch": 1.6880992667794699, + "grad_norm": 1.7588179111480713, + "learning_rate": 4.156232374506486e-05, + "loss": 0.8782, + "step": 2993 + }, + { + "epoch": 1.6886632825719121, + "grad_norm": 1.6290262937545776, + "learning_rate": 4.155950366610265e-05, + "loss": 0.6649, + "step": 2994 + }, + { + "epoch": 1.6892272983643544, + "grad_norm": 3.2924065589904785, + "learning_rate": 4.1556683587140445e-05, + "loss": 0.9212, + "step": 2995 + }, + { + "epoch": 1.6897913141567964, + "grad_norm": 2.2461349964141846, + "learning_rate": 4.155386350817823e-05, + "loss": 0.669, + "step": 2996 + }, + { + "epoch": 1.6903553299492384, + "grad_norm": 1.9811537265777588, + "learning_rate": 4.1551043429216016e-05, + "loss": 0.8127, + "step": 2997 + }, + { + "epoch": 1.6909193457416807, + "grad_norm": 3.397531270980835, + "learning_rate": 4.154822335025381e-05, + "loss": 0.9882, + "step": 2998 + }, + { + "epoch": 1.691483361534123, + "grad_norm": 1.9280287027359009, + "learning_rate": 4.15454032712916e-05, + "loss": 0.7561, + "step": 2999 + }, + { + "epoch": 1.6920473773265652, + "grad_norm": 1.7510141134262085, + "learning_rate": 4.1542583192329386e-05, + "loss": 0.7927, + "step": 3000 + }, + { + "epoch": 1.6926113931190074, + "grad_norm": 1.3378405570983887, + "learning_rate": 4.153976311336718e-05, + "loss": 0.7541, + "step": 3001 + }, + { + "epoch": 1.6931754089114497, + "grad_norm": 1.8737866878509521, + "learning_rate": 4.1536943034404963e-05, + "loss": 0.7752, + "step": 3002 + }, + { + "epoch": 1.6937394247038917, + "grad_norm": 3.4249107837677, + "learning_rate": 4.1534122955442756e-05, + "loss": 0.9187, + "step": 3003 + }, + { + "epoch": 1.6943034404963337, + "grad_norm": 2.9732470512390137, + "learning_rate": 4.153130287648054e-05, + "loss": 0.9499, + "step": 3004 + }, + { + "epoch": 1.694867456288776, + "grad_norm": 2.32336163520813, + "learning_rate": 4.152848279751833e-05, + "loss": 0.6719, + "step": 3005 + }, + { + "epoch": 1.6954314720812182, + "grad_norm": 2.585975408554077, + "learning_rate": 4.1525662718556125e-05, + "loss": 0.7128, + "step": 3006 + }, + { + "epoch": 1.6959954878736605, + "grad_norm": 3.436152696609497, + "learning_rate": 4.152284263959391e-05, + "loss": 0.8977, + "step": 3007 + }, + { + "epoch": 1.6965595036661028, + "grad_norm": 2.0925536155700684, + "learning_rate": 4.1520022560631696e-05, + "loss": 0.8243, + "step": 3008 + }, + { + "epoch": 1.697123519458545, + "grad_norm": 2.836088180541992, + "learning_rate": 4.151720248166949e-05, + "loss": 0.829, + "step": 3009 + }, + { + "epoch": 1.697687535250987, + "grad_norm": 3.3649837970733643, + "learning_rate": 4.151438240270728e-05, + "loss": 0.8654, + "step": 3010 + }, + { + "epoch": 1.698251551043429, + "grad_norm": 4.066084861755371, + "learning_rate": 4.1511562323745066e-05, + "loss": 0.965, + "step": 3011 + }, + { + "epoch": 1.6988155668358713, + "grad_norm": 3.6588711738586426, + "learning_rate": 4.150874224478286e-05, + "loss": 0.7875, + "step": 3012 + }, + { + "epoch": 1.6993795826283136, + "grad_norm": 2.1062004566192627, + "learning_rate": 4.1505922165820644e-05, + "loss": 0.7518, + "step": 3013 + }, + { + "epoch": 1.6999435984207558, + "grad_norm": 2.2102270126342773, + "learning_rate": 4.1503102086858436e-05, + "loss": 0.6318, + "step": 3014 + }, + { + "epoch": 1.700507614213198, + "grad_norm": 2.9170074462890625, + "learning_rate": 4.150028200789622e-05, + "loss": 0.9354, + "step": 3015 + }, + { + "epoch": 1.7010716300056403, + "grad_norm": 2.8377702236175537, + "learning_rate": 4.1497461928934013e-05, + "loss": 0.8198, + "step": 3016 + }, + { + "epoch": 1.7016356457980824, + "grad_norm": 2.1604537963867188, + "learning_rate": 4.14946418499718e-05, + "loss": 0.8092, + "step": 3017 + }, + { + "epoch": 1.7021996615905244, + "grad_norm": 3.0797176361083984, + "learning_rate": 4.149182177100959e-05, + "loss": 0.9045, + "step": 3018 + }, + { + "epoch": 1.7027636773829666, + "grad_norm": 2.543471574783325, + "learning_rate": 4.148900169204738e-05, + "loss": 0.8008, + "step": 3019 + }, + { + "epoch": 1.7033276931754089, + "grad_norm": 2.4784538745880127, + "learning_rate": 4.148618161308517e-05, + "loss": 0.8899, + "step": 3020 + }, + { + "epoch": 1.7038917089678511, + "grad_norm": 2.5054335594177246, + "learning_rate": 4.1483361534122954e-05, + "loss": 0.9849, + "step": 3021 + }, + { + "epoch": 1.7044557247602934, + "grad_norm": 2.6998982429504395, + "learning_rate": 4.1480541455160746e-05, + "loss": 0.8515, + "step": 3022 + }, + { + "epoch": 1.7050197405527356, + "grad_norm": 2.986229181289673, + "learning_rate": 4.147772137619854e-05, + "loss": 0.7922, + "step": 3023 + }, + { + "epoch": 1.7055837563451777, + "grad_norm": 2.739682674407959, + "learning_rate": 4.1474901297236324e-05, + "loss": 0.9435, + "step": 3024 + }, + { + "epoch": 1.7061477721376197, + "grad_norm": 2.3190970420837402, + "learning_rate": 4.147208121827411e-05, + "loss": 0.8194, + "step": 3025 + }, + { + "epoch": 1.706711787930062, + "grad_norm": 2.8331873416900635, + "learning_rate": 4.14692611393119e-05, + "loss": 0.7952, + "step": 3026 + }, + { + "epoch": 1.7072758037225042, + "grad_norm": 1.5075490474700928, + "learning_rate": 4.1466441060349694e-05, + "loss": 0.7715, + "step": 3027 + }, + { + "epoch": 1.7078398195149465, + "grad_norm": 1.318095326423645, + "learning_rate": 4.146362098138748e-05, + "loss": 0.8369, + "step": 3028 + }, + { + "epoch": 1.7084038353073887, + "grad_norm": 1.6113853454589844, + "learning_rate": 4.1460800902425264e-05, + "loss": 0.8915, + "step": 3029 + }, + { + "epoch": 1.708967851099831, + "grad_norm": 2.3277273178100586, + "learning_rate": 4.145798082346306e-05, + "loss": 0.7309, + "step": 3030 + }, + { + "epoch": 1.709531866892273, + "grad_norm": 3.107325553894043, + "learning_rate": 4.145516074450085e-05, + "loss": 0.7342, + "step": 3031 + }, + { + "epoch": 1.710095882684715, + "grad_norm": 3.3824448585510254, + "learning_rate": 4.1452340665538634e-05, + "loss": 0.7594, + "step": 3032 + }, + { + "epoch": 1.7106598984771573, + "grad_norm": 2.5531954765319824, + "learning_rate": 4.1449520586576426e-05, + "loss": 0.8366, + "step": 3033 + }, + { + "epoch": 1.7112239142695995, + "grad_norm": 2.5163674354553223, + "learning_rate": 4.144670050761422e-05, + "loss": 0.7538, + "step": 3034 + }, + { + "epoch": 1.7117879300620418, + "grad_norm": 2.198244333267212, + "learning_rate": 4.1443880428652004e-05, + "loss": 0.9089, + "step": 3035 + }, + { + "epoch": 1.712351945854484, + "grad_norm": 5.349717617034912, + "learning_rate": 4.144106034968979e-05, + "loss": 1.0561, + "step": 3036 + }, + { + "epoch": 1.7129159616469263, + "grad_norm": 3.680516481399536, + "learning_rate": 4.143824027072758e-05, + "loss": 0.8479, + "step": 3037 + }, + { + "epoch": 1.7134799774393683, + "grad_norm": 3.2384698390960693, + "learning_rate": 4.1435420191765374e-05, + "loss": 0.9474, + "step": 3038 + }, + { + "epoch": 1.7140439932318103, + "grad_norm": 1.5716303586959839, + "learning_rate": 4.143260011280316e-05, + "loss": 0.7118, + "step": 3039 + }, + { + "epoch": 1.7146080090242526, + "grad_norm": 2.1845507621765137, + "learning_rate": 4.142978003384095e-05, + "loss": 0.6683, + "step": 3040 + }, + { + "epoch": 1.7151720248166948, + "grad_norm": 2.1972439289093018, + "learning_rate": 4.1426959954878743e-05, + "loss": 0.7308, + "step": 3041 + }, + { + "epoch": 1.715736040609137, + "grad_norm": 1.676922082901001, + "learning_rate": 4.142413987591653e-05, + "loss": 0.6797, + "step": 3042 + }, + { + "epoch": 1.7163000564015793, + "grad_norm": 4.674180507659912, + "learning_rate": 4.1421319796954314e-05, + "loss": 0.8549, + "step": 3043 + }, + { + "epoch": 1.7168640721940216, + "grad_norm": 1.894490361213684, + "learning_rate": 4.1418499717992106e-05, + "loss": 0.8728, + "step": 3044 + }, + { + "epoch": 1.7174280879864636, + "grad_norm": 3.236490249633789, + "learning_rate": 4.14156796390299e-05, + "loss": 0.9498, + "step": 3045 + }, + { + "epoch": 1.7179921037789057, + "grad_norm": 1.7805498838424683, + "learning_rate": 4.1412859560067684e-05, + "loss": 0.7752, + "step": 3046 + }, + { + "epoch": 1.718556119571348, + "grad_norm": 2.0882887840270996, + "learning_rate": 4.141003948110547e-05, + "loss": 0.8146, + "step": 3047 + }, + { + "epoch": 1.7191201353637902, + "grad_norm": 1.9173027276992798, + "learning_rate": 4.140721940214326e-05, + "loss": 0.6601, + "step": 3048 + }, + { + "epoch": 1.7196841511562324, + "grad_norm": 1.3574891090393066, + "learning_rate": 4.1404399323181054e-05, + "loss": 0.7512, + "step": 3049 + }, + { + "epoch": 1.7202481669486747, + "grad_norm": 2.7460014820098877, + "learning_rate": 4.140157924421884e-05, + "loss": 0.812, + "step": 3050 + }, + { + "epoch": 1.720812182741117, + "grad_norm": 1.255441427230835, + "learning_rate": 4.139875916525663e-05, + "loss": 0.778, + "step": 3051 + }, + { + "epoch": 1.721376198533559, + "grad_norm": 1.2412501573562622, + "learning_rate": 4.139593908629442e-05, + "loss": 0.6314, + "step": 3052 + }, + { + "epoch": 1.721940214326001, + "grad_norm": 4.83478307723999, + "learning_rate": 4.139311900733221e-05, + "loss": 0.9407, + "step": 3053 + }, + { + "epoch": 1.7225042301184432, + "grad_norm": 2.42270565032959, + "learning_rate": 4.1390298928369994e-05, + "loss": 0.8302, + "step": 3054 + }, + { + "epoch": 1.7230682459108855, + "grad_norm": 1.265850305557251, + "learning_rate": 4.138747884940779e-05, + "loss": 0.692, + "step": 3055 + }, + { + "epoch": 1.7236322617033277, + "grad_norm": 1.3870960474014282, + "learning_rate": 4.138465877044557e-05, + "loss": 0.6876, + "step": 3056 + }, + { + "epoch": 1.72419627749577, + "grad_norm": 1.4120582342147827, + "learning_rate": 4.1381838691483364e-05, + "loss": 0.7756, + "step": 3057 + }, + { + "epoch": 1.7247602932882122, + "grad_norm": 2.9864304065704346, + "learning_rate": 4.1379018612521156e-05, + "loss": 0.8639, + "step": 3058 + }, + { + "epoch": 1.7253243090806543, + "grad_norm": 1.626069188117981, + "learning_rate": 4.137619853355894e-05, + "loss": 0.8017, + "step": 3059 + }, + { + "epoch": 1.7258883248730963, + "grad_norm": 1.9800244569778442, + "learning_rate": 4.137337845459673e-05, + "loss": 0.8212, + "step": 3060 + }, + { + "epoch": 1.7264523406655385, + "grad_norm": 2.548862934112549, + "learning_rate": 4.137055837563452e-05, + "loss": 0.7987, + "step": 3061 + }, + { + "epoch": 1.7270163564579808, + "grad_norm": 2.4719433784484863, + "learning_rate": 4.136773829667231e-05, + "loss": 0.9585, + "step": 3062 + }, + { + "epoch": 1.727580372250423, + "grad_norm": 1.8541110754013062, + "learning_rate": 4.13649182177101e-05, + "loss": 0.7701, + "step": 3063 + }, + { + "epoch": 1.7281443880428653, + "grad_norm": 4.992244720458984, + "learning_rate": 4.136209813874788e-05, + "loss": 0.9819, + "step": 3064 + }, + { + "epoch": 1.7287084038353075, + "grad_norm": 2.1662280559539795, + "learning_rate": 4.1359278059785675e-05, + "loss": 0.836, + "step": 3065 + }, + { + "epoch": 1.7292724196277496, + "grad_norm": 1.7808504104614258, + "learning_rate": 4.135645798082347e-05, + "loss": 0.6532, + "step": 3066 + }, + { + "epoch": 1.7298364354201916, + "grad_norm": 2.274484634399414, + "learning_rate": 4.135363790186125e-05, + "loss": 0.8479, + "step": 3067 + }, + { + "epoch": 1.7304004512126339, + "grad_norm": 2.837322235107422, + "learning_rate": 4.135081782289904e-05, + "loss": 0.7438, + "step": 3068 + }, + { + "epoch": 1.7309644670050761, + "grad_norm": 1.6503515243530273, + "learning_rate": 4.1347997743936837e-05, + "loss": 0.6575, + "step": 3069 + }, + { + "epoch": 1.7315284827975184, + "grad_norm": 1.5287209749221802, + "learning_rate": 4.134517766497462e-05, + "loss": 0.6836, + "step": 3070 + }, + { + "epoch": 1.7320924985899606, + "grad_norm": 1.5991030931472778, + "learning_rate": 4.134235758601241e-05, + "loss": 0.8148, + "step": 3071 + }, + { + "epoch": 1.7326565143824029, + "grad_norm": 1.9736886024475098, + "learning_rate": 4.13395375070502e-05, + "loss": 0.8693, + "step": 3072 + }, + { + "epoch": 1.733220530174845, + "grad_norm": 1.3485822677612305, + "learning_rate": 4.133671742808799e-05, + "loss": 0.8961, + "step": 3073 + }, + { + "epoch": 1.733784545967287, + "grad_norm": 4.372910499572754, + "learning_rate": 4.133389734912578e-05, + "loss": 1.0449, + "step": 3074 + }, + { + "epoch": 1.7343485617597292, + "grad_norm": 2.854471206665039, + "learning_rate": 4.133107727016356e-05, + "loss": 0.8562, + "step": 3075 + }, + { + "epoch": 1.7349125775521714, + "grad_norm": 1.7579759359359741, + "learning_rate": 4.132825719120136e-05, + "loss": 0.7327, + "step": 3076 + }, + { + "epoch": 1.7354765933446137, + "grad_norm": 1.632912278175354, + "learning_rate": 4.132543711223915e-05, + "loss": 0.9103, + "step": 3077 + }, + { + "epoch": 1.736040609137056, + "grad_norm": 1.9941216707229614, + "learning_rate": 4.132261703327693e-05, + "loss": 0.7967, + "step": 3078 + }, + { + "epoch": 1.7366046249294982, + "grad_norm": 4.394731521606445, + "learning_rate": 4.1319796954314725e-05, + "loss": 0.8232, + "step": 3079 + }, + { + "epoch": 1.7371686407219402, + "grad_norm": 3.131598949432373, + "learning_rate": 4.131697687535252e-05, + "loss": 0.8965, + "step": 3080 + }, + { + "epoch": 1.7377326565143822, + "grad_norm": 1.5996822118759155, + "learning_rate": 4.13141567963903e-05, + "loss": 0.6993, + "step": 3081 + }, + { + "epoch": 1.7382966723068245, + "grad_norm": 2.1623010635375977, + "learning_rate": 4.131133671742809e-05, + "loss": 0.861, + "step": 3082 + }, + { + "epoch": 1.7388606880992667, + "grad_norm": 2.0903868675231934, + "learning_rate": 4.130851663846588e-05, + "loss": 0.8381, + "step": 3083 + }, + { + "epoch": 1.739424703891709, + "grad_norm": 2.383403778076172, + "learning_rate": 4.130569655950367e-05, + "loss": 0.925, + "step": 3084 + }, + { + "epoch": 1.7399887196841513, + "grad_norm": 1.447457194328308, + "learning_rate": 4.130287648054146e-05, + "loss": 0.8791, + "step": 3085 + }, + { + "epoch": 1.7405527354765935, + "grad_norm": 1.2664058208465576, + "learning_rate": 4.130005640157924e-05, + "loss": 0.6644, + "step": 3086 + }, + { + "epoch": 1.7411167512690355, + "grad_norm": 4.784078121185303, + "learning_rate": 4.1297236322617035e-05, + "loss": 0.9437, + "step": 3087 + }, + { + "epoch": 1.7416807670614776, + "grad_norm": 4.86183500289917, + "learning_rate": 4.129441624365483e-05, + "loss": 0.9239, + "step": 3088 + }, + { + "epoch": 1.7422447828539198, + "grad_norm": 1.9482295513153076, + "learning_rate": 4.129159616469261e-05, + "loss": 0.8961, + "step": 3089 + }, + { + "epoch": 1.742808798646362, + "grad_norm": 5.036369800567627, + "learning_rate": 4.1288776085730405e-05, + "loss": 1.0141, + "step": 3090 + }, + { + "epoch": 1.7433728144388043, + "grad_norm": 2.202361822128296, + "learning_rate": 4.128595600676819e-05, + "loss": 0.7775, + "step": 3091 + }, + { + "epoch": 1.7439368302312466, + "grad_norm": 2.5952341556549072, + "learning_rate": 4.128313592780598e-05, + "loss": 0.8363, + "step": 3092 + }, + { + "epoch": 1.7445008460236888, + "grad_norm": 1.5691925287246704, + "learning_rate": 4.128031584884377e-05, + "loss": 0.7553, + "step": 3093 + }, + { + "epoch": 1.7450648618161309, + "grad_norm": 1.1868176460266113, + "learning_rate": 4.127749576988156e-05, + "loss": 0.7461, + "step": 3094 + }, + { + "epoch": 1.7456288776085729, + "grad_norm": 3.008150815963745, + "learning_rate": 4.1274675690919345e-05, + "loss": 0.7878, + "step": 3095 + }, + { + "epoch": 1.7461928934010151, + "grad_norm": 3.6084342002868652, + "learning_rate": 4.127185561195714e-05, + "loss": 0.7663, + "step": 3096 + }, + { + "epoch": 1.7467569091934574, + "grad_norm": 2.1838538646698, + "learning_rate": 4.126903553299493e-05, + "loss": 0.8958, + "step": 3097 + }, + { + "epoch": 1.7473209249858996, + "grad_norm": 2.287383794784546, + "learning_rate": 4.1266215454032715e-05, + "loss": 0.8735, + "step": 3098 + }, + { + "epoch": 1.747884940778342, + "grad_norm": 3.046431064605713, + "learning_rate": 4.12633953750705e-05, + "loss": 0.8894, + "step": 3099 + }, + { + "epoch": 1.7484489565707841, + "grad_norm": 2.15859317779541, + "learning_rate": 4.126057529610829e-05, + "loss": 0.8158, + "step": 3100 + }, + { + "epoch": 1.7490129723632262, + "grad_norm": 2.1601572036743164, + "learning_rate": 4.1257755217146085e-05, + "loss": 0.8356, + "step": 3101 + }, + { + "epoch": 1.7495769881556682, + "grad_norm": 2.8605895042419434, + "learning_rate": 4.125493513818387e-05, + "loss": 0.8446, + "step": 3102 + }, + { + "epoch": 1.7501410039481105, + "grad_norm": 2.197662591934204, + "learning_rate": 4.1252115059221656e-05, + "loss": 0.9162, + "step": 3103 + }, + { + "epoch": 1.7507050197405527, + "grad_norm": 3.6387135982513428, + "learning_rate": 4.124929498025945e-05, + "loss": 0.9558, + "step": 3104 + }, + { + "epoch": 1.751269035532995, + "grad_norm": 1.8954102993011475, + "learning_rate": 4.124647490129724e-05, + "loss": 0.893, + "step": 3105 + }, + { + "epoch": 1.7518330513254372, + "grad_norm": 2.1081643104553223, + "learning_rate": 4.1243654822335025e-05, + "loss": 0.8406, + "step": 3106 + }, + { + "epoch": 1.7523970671178795, + "grad_norm": 3.252133846282959, + "learning_rate": 4.124083474337281e-05, + "loss": 0.8141, + "step": 3107 + }, + { + "epoch": 1.7529610829103215, + "grad_norm": 2.845484972000122, + "learning_rate": 4.123801466441061e-05, + "loss": 0.8813, + "step": 3108 + }, + { + "epoch": 1.7535250987027635, + "grad_norm": 1.9696167707443237, + "learning_rate": 4.1235194585448395e-05, + "loss": 0.7574, + "step": 3109 + }, + { + "epoch": 1.7540891144952058, + "grad_norm": 1.8999603986740112, + "learning_rate": 4.123237450648618e-05, + "loss": 0.7506, + "step": 3110 + }, + { + "epoch": 1.754653130287648, + "grad_norm": 1.8022290468215942, + "learning_rate": 4.122955442752397e-05, + "loss": 0.8752, + "step": 3111 + }, + { + "epoch": 1.7552171460800903, + "grad_norm": 2.7717015743255615, + "learning_rate": 4.1226734348561765e-05, + "loss": 0.9792, + "step": 3112 + }, + { + "epoch": 1.7557811618725325, + "grad_norm": 1.4160819053649902, + "learning_rate": 4.122391426959955e-05, + "loss": 0.8505, + "step": 3113 + }, + { + "epoch": 1.7563451776649748, + "grad_norm": 2.1991353034973145, + "learning_rate": 4.1221094190637336e-05, + "loss": 1.0049, + "step": 3114 + }, + { + "epoch": 1.7569091934574168, + "grad_norm": 1.5880967378616333, + "learning_rate": 4.1218274111675135e-05, + "loss": 0.7319, + "step": 3115 + }, + { + "epoch": 1.7574732092498588, + "grad_norm": 1.926115870475769, + "learning_rate": 4.121545403271292e-05, + "loss": 0.7874, + "step": 3116 + }, + { + "epoch": 1.758037225042301, + "grad_norm": 1.7303142547607422, + "learning_rate": 4.1212633953750706e-05, + "loss": 0.7537, + "step": 3117 + }, + { + "epoch": 1.7586012408347433, + "grad_norm": 1.2391005754470825, + "learning_rate": 4.120981387478849e-05, + "loss": 0.8041, + "step": 3118 + }, + { + "epoch": 1.7591652566271856, + "grad_norm": 2.8359525203704834, + "learning_rate": 4.120699379582629e-05, + "loss": 0.8865, + "step": 3119 + }, + { + "epoch": 1.7597292724196278, + "grad_norm": 1.459359049797058, + "learning_rate": 4.1204173716864075e-05, + "loss": 0.6971, + "step": 3120 + }, + { + "epoch": 1.76029328821207, + "grad_norm": 3.200657367706299, + "learning_rate": 4.120135363790186e-05, + "loss": 0.6603, + "step": 3121 + }, + { + "epoch": 1.7608573040045121, + "grad_norm": 1.0955644845962524, + "learning_rate": 4.119853355893965e-05, + "loss": 0.7042, + "step": 3122 + }, + { + "epoch": 1.7614213197969542, + "grad_norm": 1.5295311212539673, + "learning_rate": 4.1195713479977445e-05, + "loss": 0.8053, + "step": 3123 + }, + { + "epoch": 1.7619853355893964, + "grad_norm": 1.8376463651657104, + "learning_rate": 4.119289340101523e-05, + "loss": 0.7786, + "step": 3124 + }, + { + "epoch": 1.7625493513818387, + "grad_norm": 21.24383544921875, + "learning_rate": 4.1190073322053016e-05, + "loss": 0.9401, + "step": 3125 + }, + { + "epoch": 1.763113367174281, + "grad_norm": 2.2774579524993896, + "learning_rate": 4.118725324309081e-05, + "loss": 0.7765, + "step": 3126 + }, + { + "epoch": 1.7636773829667232, + "grad_norm": 4.864006519317627, + "learning_rate": 4.11844331641286e-05, + "loss": 1.034, + "step": 3127 + }, + { + "epoch": 1.7642413987591654, + "grad_norm": 9.472341537475586, + "learning_rate": 4.1181613085166386e-05, + "loss": 0.7722, + "step": 3128 + }, + { + "epoch": 1.7648054145516074, + "grad_norm": 2.1735262870788574, + "learning_rate": 4.117879300620418e-05, + "loss": 0.7563, + "step": 3129 + }, + { + "epoch": 1.7653694303440495, + "grad_norm": 2.505156993865967, + "learning_rate": 4.117597292724196e-05, + "loss": 0.8971, + "step": 3130 + }, + { + "epoch": 1.7659334461364917, + "grad_norm": 1.326642394065857, + "learning_rate": 4.1173152848279755e-05, + "loss": 0.7531, + "step": 3131 + }, + { + "epoch": 1.766497461928934, + "grad_norm": 1.6430962085723877, + "learning_rate": 4.117033276931754e-05, + "loss": 0.8503, + "step": 3132 + }, + { + "epoch": 1.7670614777213762, + "grad_norm": 2.089827299118042, + "learning_rate": 4.116751269035533e-05, + "loss": 0.7729, + "step": 3133 + }, + { + "epoch": 1.7676254935138185, + "grad_norm": 1.697205901145935, + "learning_rate": 4.116469261139312e-05, + "loss": 0.9137, + "step": 3134 + }, + { + "epoch": 1.7681895093062607, + "grad_norm": 1.4785974025726318, + "learning_rate": 4.116187253243091e-05, + "loss": 0.8507, + "step": 3135 + }, + { + "epoch": 1.7687535250987028, + "grad_norm": 2.4191887378692627, + "learning_rate": 4.1159052453468696e-05, + "loss": 0.8437, + "step": 3136 + }, + { + "epoch": 1.7693175408911448, + "grad_norm": 1.8910331726074219, + "learning_rate": 4.115623237450649e-05, + "loss": 0.8268, + "step": 3137 + }, + { + "epoch": 1.769881556683587, + "grad_norm": 3.0376029014587402, + "learning_rate": 4.1153412295544274e-05, + "loss": 0.8556, + "step": 3138 + }, + { + "epoch": 1.7704455724760293, + "grad_norm": 1.8542014360427856, + "learning_rate": 4.1150592216582066e-05, + "loss": 0.8237, + "step": 3139 + }, + { + "epoch": 1.7710095882684715, + "grad_norm": 1.9454257488250732, + "learning_rate": 4.114777213761986e-05, + "loss": 0.7244, + "step": 3140 + }, + { + "epoch": 1.7715736040609138, + "grad_norm": 1.5043392181396484, + "learning_rate": 4.1144952058657643e-05, + "loss": 0.6747, + "step": 3141 + }, + { + "epoch": 1.772137619853356, + "grad_norm": 1.829546570777893, + "learning_rate": 4.114213197969543e-05, + "loss": 0.8639, + "step": 3142 + }, + { + "epoch": 1.772701635645798, + "grad_norm": 1.8431310653686523, + "learning_rate": 4.113931190073322e-05, + "loss": 0.7184, + "step": 3143 + }, + { + "epoch": 1.77326565143824, + "grad_norm": 1.4681593179702759, + "learning_rate": 4.113649182177101e-05, + "loss": 0.8421, + "step": 3144 + }, + { + "epoch": 1.7738296672306824, + "grad_norm": 2.287550687789917, + "learning_rate": 4.11336717428088e-05, + "loss": 0.8826, + "step": 3145 + }, + { + "epoch": 1.7743936830231246, + "grad_norm": 3.410320520401001, + "learning_rate": 4.113085166384659e-05, + "loss": 1.0117, + "step": 3146 + }, + { + "epoch": 1.7749576988155669, + "grad_norm": 2.1535377502441406, + "learning_rate": 4.112803158488438e-05, + "loss": 0.7873, + "step": 3147 + }, + { + "epoch": 1.7755217146080091, + "grad_norm": 1.6500375270843506, + "learning_rate": 4.112521150592217e-05, + "loss": 0.7735, + "step": 3148 + }, + { + "epoch": 1.7760857304004514, + "grad_norm": 1.6506152153015137, + "learning_rate": 4.1122391426959954e-05, + "loss": 0.7425, + "step": 3149 + }, + { + "epoch": 1.7766497461928934, + "grad_norm": 3.979466199874878, + "learning_rate": 4.1119571347997746e-05, + "loss": 0.8763, + "step": 3150 + }, + { + "epoch": 1.7772137619853354, + "grad_norm": 2.4260094165802, + "learning_rate": 4.111675126903554e-05, + "loss": 0.8298, + "step": 3151 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 1.6096669435501099, + "learning_rate": 4.1113931190073324e-05, + "loss": 0.8243, + "step": 3152 + }, + { + "epoch": 1.77834179357022, + "grad_norm": 1.855204701423645, + "learning_rate": 4.111111111111111e-05, + "loss": 0.783, + "step": 3153 + }, + { + "epoch": 1.7789058093626622, + "grad_norm": 2.8293616771698, + "learning_rate": 4.11082910321489e-05, + "loss": 0.8234, + "step": 3154 + }, + { + "epoch": 1.7794698251551044, + "grad_norm": 3.5458009243011475, + "learning_rate": 4.110547095318669e-05, + "loss": 0.8295, + "step": 3155 + }, + { + "epoch": 1.7800338409475467, + "grad_norm": 1.8038548231124878, + "learning_rate": 4.110265087422448e-05, + "loss": 0.8027, + "step": 3156 + }, + { + "epoch": 1.7805978567399887, + "grad_norm": 1.9583332538604736, + "learning_rate": 4.1099830795262264e-05, + "loss": 0.8967, + "step": 3157 + }, + { + "epoch": 1.7811618725324307, + "grad_norm": 2.4198360443115234, + "learning_rate": 4.109701071630006e-05, + "loss": 0.7185, + "step": 3158 + }, + { + "epoch": 1.781725888324873, + "grad_norm": 2.0572657585144043, + "learning_rate": 4.109419063733785e-05, + "loss": 0.8179, + "step": 3159 + }, + { + "epoch": 1.7822899041173152, + "grad_norm": 2.060086250305176, + "learning_rate": 4.1091370558375634e-05, + "loss": 0.7296, + "step": 3160 + }, + { + "epoch": 1.7828539199097575, + "grad_norm": 3.140936851501465, + "learning_rate": 4.1088550479413426e-05, + "loss": 0.8595, + "step": 3161 + }, + { + "epoch": 1.7834179357021998, + "grad_norm": 1.8764045238494873, + "learning_rate": 4.108573040045122e-05, + "loss": 0.7968, + "step": 3162 + }, + { + "epoch": 1.783981951494642, + "grad_norm": 3.3882923126220703, + "learning_rate": 4.1082910321489004e-05, + "loss": 0.8517, + "step": 3163 + }, + { + "epoch": 1.784545967287084, + "grad_norm": 1.5287784337997437, + "learning_rate": 4.108009024252679e-05, + "loss": 0.7721, + "step": 3164 + }, + { + "epoch": 1.785109983079526, + "grad_norm": 2.38171124458313, + "learning_rate": 4.107727016356458e-05, + "loss": 0.795, + "step": 3165 + }, + { + "epoch": 1.7856739988719683, + "grad_norm": 1.5185083150863647, + "learning_rate": 4.1074450084602374e-05, + "loss": 0.8119, + "step": 3166 + }, + { + "epoch": 1.7862380146644106, + "grad_norm": 1.2193293571472168, + "learning_rate": 4.107163000564016e-05, + "loss": 0.7051, + "step": 3167 + }, + { + "epoch": 1.7868020304568528, + "grad_norm": 1.3424886465072632, + "learning_rate": 4.106880992667795e-05, + "loss": 0.7914, + "step": 3168 + }, + { + "epoch": 1.787366046249295, + "grad_norm": 1.158014178276062, + "learning_rate": 4.1065989847715737e-05, + "loss": 0.705, + "step": 3169 + }, + { + "epoch": 1.7879300620417373, + "grad_norm": 3.207904577255249, + "learning_rate": 4.106316976875353e-05, + "loss": 0.8491, + "step": 3170 + }, + { + "epoch": 1.7884940778341794, + "grad_norm": 3.8591089248657227, + "learning_rate": 4.1060349689791314e-05, + "loss": 0.927, + "step": 3171 + }, + { + "epoch": 1.7890580936266214, + "grad_norm": 3.7613651752471924, + "learning_rate": 4.1057529610829106e-05, + "loss": 0.7238, + "step": 3172 + }, + { + "epoch": 1.7896221094190636, + "grad_norm": 8.204793930053711, + "learning_rate": 4.105470953186689e-05, + "loss": 0.8851, + "step": 3173 + }, + { + "epoch": 1.7901861252115059, + "grad_norm": 2.4562714099884033, + "learning_rate": 4.1051889452904684e-05, + "loss": 0.9188, + "step": 3174 + }, + { + "epoch": 1.7907501410039481, + "grad_norm": 1.6587718725204468, + "learning_rate": 4.104906937394247e-05, + "loss": 0.8511, + "step": 3175 + }, + { + "epoch": 1.7913141567963904, + "grad_norm": 1.5317027568817139, + "learning_rate": 4.104624929498026e-05, + "loss": 0.706, + "step": 3176 + }, + { + "epoch": 1.7918781725888326, + "grad_norm": 2.6039295196533203, + "learning_rate": 4.104342921601805e-05, + "loss": 0.9083, + "step": 3177 + }, + { + "epoch": 1.7924421883812747, + "grad_norm": 1.2635036706924438, + "learning_rate": 4.104060913705584e-05, + "loss": 0.8318, + "step": 3178 + }, + { + "epoch": 1.7930062041737167, + "grad_norm": 1.4922655820846558, + "learning_rate": 4.103778905809363e-05, + "loss": 0.7224, + "step": 3179 + }, + { + "epoch": 1.793570219966159, + "grad_norm": 2.9215047359466553, + "learning_rate": 4.103496897913142e-05, + "loss": 0.9151, + "step": 3180 + }, + { + "epoch": 1.7941342357586012, + "grad_norm": 2.2595067024230957, + "learning_rate": 4.103214890016921e-05, + "loss": 0.8499, + "step": 3181 + }, + { + "epoch": 1.7946982515510435, + "grad_norm": 2.349879264831543, + "learning_rate": 4.1029328821206994e-05, + "loss": 0.8663, + "step": 3182 + }, + { + "epoch": 1.7952622673434857, + "grad_norm": 1.520565390586853, + "learning_rate": 4.1026508742244786e-05, + "loss": 0.725, + "step": 3183 + }, + { + "epoch": 1.795826283135928, + "grad_norm": 1.719356656074524, + "learning_rate": 4.102368866328257e-05, + "loss": 0.8695, + "step": 3184 + }, + { + "epoch": 1.79639029892837, + "grad_norm": 2.550110340118408, + "learning_rate": 4.1020868584320364e-05, + "loss": 0.906, + "step": 3185 + }, + { + "epoch": 1.796954314720812, + "grad_norm": 1.3411948680877686, + "learning_rate": 4.1018048505358156e-05, + "loss": 0.7324, + "step": 3186 + }, + { + "epoch": 1.7975183305132543, + "grad_norm": 3.407243251800537, + "learning_rate": 4.101522842639594e-05, + "loss": 0.8258, + "step": 3187 + }, + { + "epoch": 1.7980823463056965, + "grad_norm": 3.547654628753662, + "learning_rate": 4.101240834743373e-05, + "loss": 0.9204, + "step": 3188 + }, + { + "epoch": 1.7986463620981388, + "grad_norm": 1.6772263050079346, + "learning_rate": 4.100958826847152e-05, + "loss": 0.8655, + "step": 3189 + }, + { + "epoch": 1.799210377890581, + "grad_norm": 2.260645866394043, + "learning_rate": 4.100676818950931e-05, + "loss": 0.7878, + "step": 3190 + }, + { + "epoch": 1.7997743936830233, + "grad_norm": 1.6291614770889282, + "learning_rate": 4.10039481105471e-05, + "loss": 0.7278, + "step": 3191 + }, + { + "epoch": 1.8003384094754653, + "grad_norm": 3.034799575805664, + "learning_rate": 4.100112803158488e-05, + "loss": 0.9191, + "step": 3192 + }, + { + "epoch": 1.8009024252679073, + "grad_norm": 2.6621367931365967, + "learning_rate": 4.0998307952622674e-05, + "loss": 0.8845, + "step": 3193 + }, + { + "epoch": 1.8014664410603496, + "grad_norm": 1.9297282695770264, + "learning_rate": 4.099548787366047e-05, + "loss": 0.7687, + "step": 3194 + }, + { + "epoch": 1.8020304568527918, + "grad_norm": 1.8473314046859741, + "learning_rate": 4.099266779469825e-05, + "loss": 0.7653, + "step": 3195 + }, + { + "epoch": 1.802594472645234, + "grad_norm": 1.893326759338379, + "learning_rate": 4.098984771573604e-05, + "loss": 0.8597, + "step": 3196 + }, + { + "epoch": 1.8031584884376763, + "grad_norm": 1.1514054536819458, + "learning_rate": 4.0987027636773836e-05, + "loss": 0.7249, + "step": 3197 + }, + { + "epoch": 1.8037225042301186, + "grad_norm": 3.311206579208374, + "learning_rate": 4.098420755781162e-05, + "loss": 0.7351, + "step": 3198 + }, + { + "epoch": 1.8042865200225606, + "grad_norm": 1.7013766765594482, + "learning_rate": 4.098138747884941e-05, + "loss": 0.7437, + "step": 3199 + }, + { + "epoch": 1.8048505358150027, + "grad_norm": 2.1392526626586914, + "learning_rate": 4.09785673998872e-05, + "loss": 0.8684, + "step": 3200 + }, + { + "epoch": 1.805414551607445, + "grad_norm": 1.990025520324707, + "learning_rate": 4.097574732092499e-05, + "loss": 0.7423, + "step": 3201 + }, + { + "epoch": 1.8059785673998872, + "grad_norm": 1.255761742591858, + "learning_rate": 4.097292724196278e-05, + "loss": 0.7553, + "step": 3202 + }, + { + "epoch": 1.8065425831923294, + "grad_norm": 1.253653883934021, + "learning_rate": 4.097010716300056e-05, + "loss": 0.828, + "step": 3203 + }, + { + "epoch": 1.8071065989847717, + "grad_norm": 1.3059580326080322, + "learning_rate": 4.0967287084038355e-05, + "loss": 0.8068, + "step": 3204 + }, + { + "epoch": 1.807670614777214, + "grad_norm": 2.425823211669922, + "learning_rate": 4.096446700507615e-05, + "loss": 0.8796, + "step": 3205 + }, + { + "epoch": 1.808234630569656, + "grad_norm": 2.3115742206573486, + "learning_rate": 4.096164692611393e-05, + "loss": 0.8026, + "step": 3206 + }, + { + "epoch": 1.808798646362098, + "grad_norm": 4.376946449279785, + "learning_rate": 4.0958826847151724e-05, + "loss": 1.0114, + "step": 3207 + }, + { + "epoch": 1.8093626621545402, + "grad_norm": 1.6905958652496338, + "learning_rate": 4.095600676818951e-05, + "loss": 0.7032, + "step": 3208 + }, + { + "epoch": 1.8099266779469825, + "grad_norm": 4.03289794921875, + "learning_rate": 4.09531866892273e-05, + "loss": 0.9465, + "step": 3209 + }, + { + "epoch": 1.8104906937394247, + "grad_norm": 2.0751266479492188, + "learning_rate": 4.095036661026509e-05, + "loss": 0.7677, + "step": 3210 + }, + { + "epoch": 1.811054709531867, + "grad_norm": 1.5918869972229004, + "learning_rate": 4.094754653130288e-05, + "loss": 0.816, + "step": 3211 + }, + { + "epoch": 1.8116187253243092, + "grad_norm": 1.2386542558670044, + "learning_rate": 4.0944726452340665e-05, + "loss": 0.8006, + "step": 3212 + }, + { + "epoch": 1.8121827411167513, + "grad_norm": 1.9141442775726318, + "learning_rate": 4.094190637337846e-05, + "loss": 0.782, + "step": 3213 + }, + { + "epoch": 1.8127467569091935, + "grad_norm": 3.3600711822509766, + "learning_rate": 4.093908629441624e-05, + "loss": 0.9272, + "step": 3214 + }, + { + "epoch": 1.8133107727016355, + "grad_norm": 2.982912302017212, + "learning_rate": 4.0936266215454035e-05, + "loss": 0.7872, + "step": 3215 + }, + { + "epoch": 1.8138747884940778, + "grad_norm": 1.2878444194793701, + "learning_rate": 4.093344613649183e-05, + "loss": 0.7826, + "step": 3216 + }, + { + "epoch": 1.81443880428652, + "grad_norm": 2.916567325592041, + "learning_rate": 4.093062605752961e-05, + "loss": 0.9548, + "step": 3217 + }, + { + "epoch": 1.8150028200789623, + "grad_norm": 1.9496873617172241, + "learning_rate": 4.0927805978567405e-05, + "loss": 0.9125, + "step": 3218 + }, + { + "epoch": 1.8155668358714045, + "grad_norm": 2.7197158336639404, + "learning_rate": 4.092498589960519e-05, + "loss": 0.8259, + "step": 3219 + }, + { + "epoch": 1.8161308516638466, + "grad_norm": 2.512779951095581, + "learning_rate": 4.092216582064298e-05, + "loss": 0.8369, + "step": 3220 + }, + { + "epoch": 1.8166948674562888, + "grad_norm": 4.038216590881348, + "learning_rate": 4.091934574168077e-05, + "loss": 0.8083, + "step": 3221 + }, + { + "epoch": 1.8172588832487309, + "grad_norm": 2.2822036743164062, + "learning_rate": 4.091652566271856e-05, + "loss": 0.6419, + "step": 3222 + }, + { + "epoch": 1.8178228990411731, + "grad_norm": 4.167807102203369, + "learning_rate": 4.0913705583756345e-05, + "loss": 0.9065, + "step": 3223 + }, + { + "epoch": 1.8183869148336154, + "grad_norm": 1.7840313911437988, + "learning_rate": 4.091088550479414e-05, + "loss": 0.9259, + "step": 3224 + }, + { + "epoch": 1.8189509306260576, + "grad_norm": 1.3328427076339722, + "learning_rate": 4.090806542583193e-05, + "loss": 0.8029, + "step": 3225 + }, + { + "epoch": 1.8195149464184999, + "grad_norm": 2.666677236557007, + "learning_rate": 4.0905245346869715e-05, + "loss": 0.9093, + "step": 3226 + }, + { + "epoch": 1.820078962210942, + "grad_norm": 1.9748270511627197, + "learning_rate": 4.09024252679075e-05, + "loss": 0.7293, + "step": 3227 + }, + { + "epoch": 1.8206429780033841, + "grad_norm": 2.0619959831237793, + "learning_rate": 4.089960518894529e-05, + "loss": 0.6654, + "step": 3228 + }, + { + "epoch": 1.8212069937958262, + "grad_norm": 2.4224302768707275, + "learning_rate": 4.0896785109983085e-05, + "loss": 0.895, + "step": 3229 + }, + { + "epoch": 1.8217710095882684, + "grad_norm": 1.7621593475341797, + "learning_rate": 4.089396503102087e-05, + "loss": 0.7631, + "step": 3230 + }, + { + "epoch": 1.8223350253807107, + "grad_norm": 1.4023857116699219, + "learning_rate": 4.0891144952058655e-05, + "loss": 0.8451, + "step": 3231 + }, + { + "epoch": 1.822899041173153, + "grad_norm": 1.2591564655303955, + "learning_rate": 4.088832487309645e-05, + "loss": 0.8022, + "step": 3232 + }, + { + "epoch": 1.8234630569655952, + "grad_norm": 2.2306346893310547, + "learning_rate": 4.088550479413424e-05, + "loss": 0.8673, + "step": 3233 + }, + { + "epoch": 1.8240270727580372, + "grad_norm": 1.9428761005401611, + "learning_rate": 4.0882684715172025e-05, + "loss": 0.9368, + "step": 3234 + }, + { + "epoch": 1.8245910885504795, + "grad_norm": 2.6955013275146484, + "learning_rate": 4.087986463620981e-05, + "loss": 0.7519, + "step": 3235 + }, + { + "epoch": 1.8251551043429215, + "grad_norm": 1.6095861196517944, + "learning_rate": 4.087704455724761e-05, + "loss": 0.8651, + "step": 3236 + }, + { + "epoch": 1.8257191201353637, + "grad_norm": 2.8912878036499023, + "learning_rate": 4.0874224478285395e-05, + "loss": 0.8057, + "step": 3237 + }, + { + "epoch": 1.826283135927806, + "grad_norm": 2.288722038269043, + "learning_rate": 4.087140439932318e-05, + "loss": 0.8315, + "step": 3238 + }, + { + "epoch": 1.8268471517202483, + "grad_norm": 3.2079122066497803, + "learning_rate": 4.086858432036097e-05, + "loss": 0.7344, + "step": 3239 + }, + { + "epoch": 1.8274111675126905, + "grad_norm": 2.8087408542633057, + "learning_rate": 4.0865764241398765e-05, + "loss": 0.8284, + "step": 3240 + }, + { + "epoch": 1.8279751833051325, + "grad_norm": 1.7155333757400513, + "learning_rate": 4.086294416243655e-05, + "loss": 0.7691, + "step": 3241 + }, + { + "epoch": 1.8285391990975748, + "grad_norm": 3.786471128463745, + "learning_rate": 4.0860124083474336e-05, + "loss": 0.8196, + "step": 3242 + }, + { + "epoch": 1.8291032148900168, + "grad_norm": 1.6215590238571167, + "learning_rate": 4.085730400451213e-05, + "loss": 0.7838, + "step": 3243 + }, + { + "epoch": 1.829667230682459, + "grad_norm": 3.160203695297241, + "learning_rate": 4.085448392554992e-05, + "loss": 0.8628, + "step": 3244 + }, + { + "epoch": 1.8302312464749013, + "grad_norm": 1.9435185194015503, + "learning_rate": 4.0851663846587705e-05, + "loss": 0.7778, + "step": 3245 + }, + { + "epoch": 1.8307952622673436, + "grad_norm": 1.589078664779663, + "learning_rate": 4.08488437676255e-05, + "loss": 0.6546, + "step": 3246 + }, + { + "epoch": 1.8313592780597858, + "grad_norm": 2.0218095779418945, + "learning_rate": 4.084602368866328e-05, + "loss": 0.7345, + "step": 3247 + }, + { + "epoch": 1.8319232938522279, + "grad_norm": 1.6828216314315796, + "learning_rate": 4.0843203609701075e-05, + "loss": 0.8084, + "step": 3248 + }, + { + "epoch": 1.83248730964467, + "grad_norm": 1.8956117630004883, + "learning_rate": 4.084038353073886e-05, + "loss": 0.7563, + "step": 3249 + }, + { + "epoch": 1.8330513254371121, + "grad_norm": 2.985668182373047, + "learning_rate": 4.083756345177665e-05, + "loss": 0.8856, + "step": 3250 + }, + { + "epoch": 1.8336153412295544, + "grad_norm": 2.178593397140503, + "learning_rate": 4.0834743372814445e-05, + "loss": 0.8542, + "step": 3251 + }, + { + "epoch": 1.8341793570219966, + "grad_norm": 3.592478036880493, + "learning_rate": 4.083192329385223e-05, + "loss": 0.945, + "step": 3252 + }, + { + "epoch": 1.8347433728144389, + "grad_norm": 2.403519630432129, + "learning_rate": 4.0829103214890016e-05, + "loss": 0.6949, + "step": 3253 + }, + { + "epoch": 1.8353073886068811, + "grad_norm": 2.079730749130249, + "learning_rate": 4.082628313592781e-05, + "loss": 0.7686, + "step": 3254 + }, + { + "epoch": 1.8358714043993232, + "grad_norm": 2.342268228530884, + "learning_rate": 4.08234630569656e-05, + "loss": 0.7928, + "step": 3255 + }, + { + "epoch": 1.8364354201917654, + "grad_norm": 1.0978593826293945, + "learning_rate": 4.0820642978003386e-05, + "loss": 0.7244, + "step": 3256 + }, + { + "epoch": 1.8369994359842075, + "grad_norm": 1.6494719982147217, + "learning_rate": 4.081782289904118e-05, + "loss": 0.861, + "step": 3257 + }, + { + "epoch": 1.8375634517766497, + "grad_norm": 1.8084887266159058, + "learning_rate": 4.081500282007896e-05, + "loss": 0.7879, + "step": 3258 + }, + { + "epoch": 1.838127467569092, + "grad_norm": 1.723220944404602, + "learning_rate": 4.0812182741116755e-05, + "loss": 0.7644, + "step": 3259 + }, + { + "epoch": 1.8386914833615342, + "grad_norm": 2.5824906826019287, + "learning_rate": 4.080936266215454e-05, + "loss": 0.8909, + "step": 3260 + }, + { + "epoch": 1.8392554991539765, + "grad_norm": 2.1215832233428955, + "learning_rate": 4.080654258319233e-05, + "loss": 0.8419, + "step": 3261 + }, + { + "epoch": 1.8398195149464185, + "grad_norm": 1.844132423400879, + "learning_rate": 4.080372250423012e-05, + "loss": 0.7478, + "step": 3262 + }, + { + "epoch": 1.8403835307388607, + "grad_norm": 1.8452636003494263, + "learning_rate": 4.080090242526791e-05, + "loss": 0.7418, + "step": 3263 + }, + { + "epoch": 1.8409475465313028, + "grad_norm": 1.0982048511505127, + "learning_rate": 4.07980823463057e-05, + "loss": 0.6613, + "step": 3264 + }, + { + "epoch": 1.841511562323745, + "grad_norm": 4.043704986572266, + "learning_rate": 4.079526226734349e-05, + "loss": 0.9901, + "step": 3265 + }, + { + "epoch": 1.8420755781161873, + "grad_norm": 1.494786024093628, + "learning_rate": 4.0792442188381274e-05, + "loss": 0.7662, + "step": 3266 + }, + { + "epoch": 1.8426395939086295, + "grad_norm": 1.3628233671188354, + "learning_rate": 4.0789622109419066e-05, + "loss": 0.7944, + "step": 3267 + }, + { + "epoch": 1.8432036097010718, + "grad_norm": 2.3801326751708984, + "learning_rate": 4.078680203045686e-05, + "loss": 0.9319, + "step": 3268 + }, + { + "epoch": 1.8437676254935138, + "grad_norm": 1.3578822612762451, + "learning_rate": 4.078398195149464e-05, + "loss": 0.6087, + "step": 3269 + }, + { + "epoch": 1.844331641285956, + "grad_norm": 2.458172559738159, + "learning_rate": 4.078116187253243e-05, + "loss": 0.8402, + "step": 3270 + }, + { + "epoch": 1.844895657078398, + "grad_norm": 1.2854464054107666, + "learning_rate": 4.077834179357022e-05, + "loss": 0.8266, + "step": 3271 + }, + { + "epoch": 1.8454596728708403, + "grad_norm": 3.64546275138855, + "learning_rate": 4.077552171460801e-05, + "loss": 0.7543, + "step": 3272 + }, + { + "epoch": 1.8460236886632826, + "grad_norm": 1.6334940195083618, + "learning_rate": 4.07727016356458e-05, + "loss": 0.7334, + "step": 3273 + }, + { + "epoch": 1.8465877044557248, + "grad_norm": 1.6651076078414917, + "learning_rate": 4.0769881556683584e-05, + "loss": 0.7042, + "step": 3274 + }, + { + "epoch": 1.847151720248167, + "grad_norm": 1.58551025390625, + "learning_rate": 4.076706147772138e-05, + "loss": 0.6678, + "step": 3275 + }, + { + "epoch": 1.8477157360406091, + "grad_norm": 2.79415225982666, + "learning_rate": 4.076424139875917e-05, + "loss": 0.7876, + "step": 3276 + }, + { + "epoch": 1.8482797518330514, + "grad_norm": 2.6671478748321533, + "learning_rate": 4.0761421319796954e-05, + "loss": 0.74, + "step": 3277 + }, + { + "epoch": 1.8488437676254934, + "grad_norm": 1.9759657382965088, + "learning_rate": 4.0758601240834746e-05, + "loss": 0.808, + "step": 3278 + }, + { + "epoch": 1.8494077834179357, + "grad_norm": 1.7251396179199219, + "learning_rate": 4.075578116187254e-05, + "loss": 0.7012, + "step": 3279 + }, + { + "epoch": 1.849971799210378, + "grad_norm": 1.1889172792434692, + "learning_rate": 4.0752961082910323e-05, + "loss": 0.7144, + "step": 3280 + }, + { + "epoch": 1.8505358150028202, + "grad_norm": 1.2177200317382812, + "learning_rate": 4.075014100394811e-05, + "loss": 0.7333, + "step": 3281 + }, + { + "epoch": 1.8510998307952624, + "grad_norm": 1.5643775463104248, + "learning_rate": 4.07473209249859e-05, + "loss": 0.7792, + "step": 3282 + }, + { + "epoch": 1.8516638465877044, + "grad_norm": 1.328162431716919, + "learning_rate": 4.074450084602369e-05, + "loss": 0.6807, + "step": 3283 + }, + { + "epoch": 1.8522278623801467, + "grad_norm": 1.963409423828125, + "learning_rate": 4.074168076706148e-05, + "loss": 0.7014, + "step": 3284 + }, + { + "epoch": 1.8527918781725887, + "grad_norm": 2.0833725929260254, + "learning_rate": 4.0738860688099264e-05, + "loss": 0.6977, + "step": 3285 + }, + { + "epoch": 1.853355893965031, + "grad_norm": 2.6104986667633057, + "learning_rate": 4.073604060913706e-05, + "loss": 0.9087, + "step": 3286 + }, + { + "epoch": 1.8539199097574732, + "grad_norm": 3.2824337482452393, + "learning_rate": 4.073322053017485e-05, + "loss": 0.8372, + "step": 3287 + }, + { + "epoch": 1.8544839255499155, + "grad_norm": 1.6011995077133179, + "learning_rate": 4.0730400451212634e-05, + "loss": 0.8343, + "step": 3288 + }, + { + "epoch": 1.8550479413423577, + "grad_norm": 1.7461649179458618, + "learning_rate": 4.0727580372250426e-05, + "loss": 0.7249, + "step": 3289 + }, + { + "epoch": 1.8556119571347998, + "grad_norm": 7.813154697418213, + "learning_rate": 4.072476029328822e-05, + "loss": 0.8973, + "step": 3290 + }, + { + "epoch": 1.856175972927242, + "grad_norm": 3.6964707374572754, + "learning_rate": 4.0721940214326004e-05, + "loss": 0.9095, + "step": 3291 + }, + { + "epoch": 1.856739988719684, + "grad_norm": 1.6404235363006592, + "learning_rate": 4.071912013536379e-05, + "loss": 0.8466, + "step": 3292 + }, + { + "epoch": 1.8573040045121263, + "grad_norm": 1.5138617753982544, + "learning_rate": 4.071630005640158e-05, + "loss": 0.6187, + "step": 3293 + }, + { + "epoch": 1.8578680203045685, + "grad_norm": 2.9432497024536133, + "learning_rate": 4.071347997743937e-05, + "loss": 0.999, + "step": 3294 + }, + { + "epoch": 1.8584320360970108, + "grad_norm": 2.821164608001709, + "learning_rate": 4.071065989847716e-05, + "loss": 0.7626, + "step": 3295 + }, + { + "epoch": 1.858996051889453, + "grad_norm": 1.7076634168624878, + "learning_rate": 4.070783981951495e-05, + "loss": 0.756, + "step": 3296 + }, + { + "epoch": 1.859560067681895, + "grad_norm": 2.5613720417022705, + "learning_rate": 4.0705019740552736e-05, + "loss": 0.8455, + "step": 3297 + }, + { + "epoch": 1.8601240834743373, + "grad_norm": 1.5378819704055786, + "learning_rate": 4.070219966159053e-05, + "loss": 0.7824, + "step": 3298 + }, + { + "epoch": 1.8606880992667794, + "grad_norm": 0.9117276072502136, + "learning_rate": 4.0699379582628314e-05, + "loss": 0.6478, + "step": 3299 + }, + { + "epoch": 1.8612521150592216, + "grad_norm": 1.7754473686218262, + "learning_rate": 4.0696559503666106e-05, + "loss": 0.7949, + "step": 3300 + }, + { + "epoch": 1.8618161308516639, + "grad_norm": 1.3751022815704346, + "learning_rate": 4.069373942470389e-05, + "loss": 0.7619, + "step": 3301 + }, + { + "epoch": 1.8623801466441061, + "grad_norm": 1.9230269193649292, + "learning_rate": 4.0690919345741684e-05, + "loss": 0.9083, + "step": 3302 + }, + { + "epoch": 1.8629441624365484, + "grad_norm": 2.6383867263793945, + "learning_rate": 4.068809926677947e-05, + "loss": 0.9183, + "step": 3303 + }, + { + "epoch": 1.8635081782289904, + "grad_norm": 1.2557002305984497, + "learning_rate": 4.068527918781726e-05, + "loss": 0.7458, + "step": 3304 + }, + { + "epoch": 1.8640721940214326, + "grad_norm": 1.4955666065216064, + "learning_rate": 4.068245910885505e-05, + "loss": 0.7009, + "step": 3305 + }, + { + "epoch": 1.8646362098138747, + "grad_norm": 1.8930654525756836, + "learning_rate": 4.067963902989284e-05, + "loss": 0.7988, + "step": 3306 + }, + { + "epoch": 1.865200225606317, + "grad_norm": 2.19285249710083, + "learning_rate": 4.067681895093063e-05, + "loss": 0.7698, + "step": 3307 + }, + { + "epoch": 1.8657642413987592, + "grad_norm": 6.184276103973389, + "learning_rate": 4.0673998871968417e-05, + "loss": 0.9056, + "step": 3308 + }, + { + "epoch": 1.8663282571912014, + "grad_norm": 4.306686878204346, + "learning_rate": 4.06711787930062e-05, + "loss": 1.0717, + "step": 3309 + }, + { + "epoch": 1.8668922729836437, + "grad_norm": 1.6148991584777832, + "learning_rate": 4.0668358714043994e-05, + "loss": 0.8473, + "step": 3310 + }, + { + "epoch": 1.8674562887760857, + "grad_norm": 1.2961595058441162, + "learning_rate": 4.0665538635081786e-05, + "loss": 0.7349, + "step": 3311 + }, + { + "epoch": 1.868020304568528, + "grad_norm": 5.479981422424316, + "learning_rate": 4.066271855611957e-05, + "loss": 0.8756, + "step": 3312 + }, + { + "epoch": 1.86858432036097, + "grad_norm": 3.0378363132476807, + "learning_rate": 4.065989847715736e-05, + "loss": 0.8222, + "step": 3313 + }, + { + "epoch": 1.8691483361534122, + "grad_norm": 2.8229212760925293, + "learning_rate": 4.0657078398195156e-05, + "loss": 0.8228, + "step": 3314 + }, + { + "epoch": 1.8697123519458545, + "grad_norm": 1.9976860284805298, + "learning_rate": 4.065425831923294e-05, + "loss": 0.8436, + "step": 3315 + }, + { + "epoch": 1.8702763677382968, + "grad_norm": 2.900825023651123, + "learning_rate": 4.065143824027073e-05, + "loss": 0.8766, + "step": 3316 + }, + { + "epoch": 1.870840383530739, + "grad_norm": 1.336310625076294, + "learning_rate": 4.064861816130852e-05, + "loss": 0.7643, + "step": 3317 + }, + { + "epoch": 1.871404399323181, + "grad_norm": 2.0934669971466064, + "learning_rate": 4.064579808234631e-05, + "loss": 0.7332, + "step": 3318 + }, + { + "epoch": 1.8719684151156233, + "grad_norm": 1.7700774669647217, + "learning_rate": 4.06429780033841e-05, + "loss": 0.781, + "step": 3319 + }, + { + "epoch": 1.8725324309080653, + "grad_norm": 2.0269203186035156, + "learning_rate": 4.064015792442188e-05, + "loss": 0.8875, + "step": 3320 + }, + { + "epoch": 1.8730964467005076, + "grad_norm": 3.2845911979675293, + "learning_rate": 4.0637337845459674e-05, + "loss": 1.0066, + "step": 3321 + }, + { + "epoch": 1.8736604624929498, + "grad_norm": 2.522397756576538, + "learning_rate": 4.0634517766497466e-05, + "loss": 0.7743, + "step": 3322 + }, + { + "epoch": 1.874224478285392, + "grad_norm": 2.622580051422119, + "learning_rate": 4.063169768753525e-05, + "loss": 0.9284, + "step": 3323 + }, + { + "epoch": 1.8747884940778343, + "grad_norm": 2.4293251037597656, + "learning_rate": 4.062887760857304e-05, + "loss": 0.713, + "step": 3324 + }, + { + "epoch": 1.8753525098702764, + "grad_norm": 2.4443442821502686, + "learning_rate": 4.0626057529610836e-05, + "loss": 0.8581, + "step": 3325 + }, + { + "epoch": 1.8759165256627186, + "grad_norm": 1.4897018671035767, + "learning_rate": 4.062323745064862e-05, + "loss": 0.8074, + "step": 3326 + }, + { + "epoch": 1.8764805414551606, + "grad_norm": 4.326353549957275, + "learning_rate": 4.062041737168641e-05, + "loss": 0.8556, + "step": 3327 + }, + { + "epoch": 1.8770445572476029, + "grad_norm": 1.5596036911010742, + "learning_rate": 4.06175972927242e-05, + "loss": 0.7306, + "step": 3328 + }, + { + "epoch": 1.8776085730400451, + "grad_norm": 2.4388415813446045, + "learning_rate": 4.061477721376199e-05, + "loss": 0.8358, + "step": 3329 + }, + { + "epoch": 1.8781725888324874, + "grad_norm": 1.8094754219055176, + "learning_rate": 4.061195713479978e-05, + "loss": 0.6748, + "step": 3330 + }, + { + "epoch": 1.8787366046249296, + "grad_norm": 1.429795265197754, + "learning_rate": 4.060913705583756e-05, + "loss": 0.8602, + "step": 3331 + }, + { + "epoch": 1.8793006204173717, + "grad_norm": 2.1265766620635986, + "learning_rate": 4.0606316976875354e-05, + "loss": 0.7846, + "step": 3332 + }, + { + "epoch": 1.879864636209814, + "grad_norm": 1.65263032913208, + "learning_rate": 4.0603496897913147e-05, + "loss": 0.7785, + "step": 3333 + }, + { + "epoch": 1.880428652002256, + "grad_norm": 1.4784835577011108, + "learning_rate": 4.060067681895093e-05, + "loss": 0.789, + "step": 3334 + }, + { + "epoch": 1.8809926677946982, + "grad_norm": 2.0654966831207275, + "learning_rate": 4.0597856739988724e-05, + "loss": 0.7965, + "step": 3335 + }, + { + "epoch": 1.8815566835871405, + "grad_norm": 8.83458137512207, + "learning_rate": 4.059503666102651e-05, + "loss": 0.9864, + "step": 3336 + }, + { + "epoch": 1.8821206993795827, + "grad_norm": 2.40848445892334, + "learning_rate": 4.05922165820643e-05, + "loss": 0.9629, + "step": 3337 + }, + { + "epoch": 1.882684715172025, + "grad_norm": 3.8830296993255615, + "learning_rate": 4.058939650310209e-05, + "loss": 0.9826, + "step": 3338 + }, + { + "epoch": 1.883248730964467, + "grad_norm": 1.6975505352020264, + "learning_rate": 4.058657642413988e-05, + "loss": 0.8107, + "step": 3339 + }, + { + "epoch": 1.8838127467569092, + "grad_norm": 2.4655985832214355, + "learning_rate": 4.0583756345177665e-05, + "loss": 0.7998, + "step": 3340 + }, + { + "epoch": 1.8843767625493513, + "grad_norm": 3.330012559890747, + "learning_rate": 4.058093626621546e-05, + "loss": 0.8382, + "step": 3341 + }, + { + "epoch": 1.8849407783417935, + "grad_norm": 1.2987618446350098, + "learning_rate": 4.057811618725324e-05, + "loss": 0.8258, + "step": 3342 + }, + { + "epoch": 1.8855047941342358, + "grad_norm": 4.308237075805664, + "learning_rate": 4.0575296108291035e-05, + "loss": 0.9172, + "step": 3343 + }, + { + "epoch": 1.886068809926678, + "grad_norm": 1.4190349578857422, + "learning_rate": 4.057247602932882e-05, + "loss": 0.766, + "step": 3344 + }, + { + "epoch": 1.8866328257191203, + "grad_norm": 1.6320186853408813, + "learning_rate": 4.056965595036661e-05, + "loss": 0.7027, + "step": 3345 + }, + { + "epoch": 1.8871968415115623, + "grad_norm": 1.6314115524291992, + "learning_rate": 4.0566835871404404e-05, + "loss": 0.7582, + "step": 3346 + }, + { + "epoch": 1.8877608573040046, + "grad_norm": 4.009334087371826, + "learning_rate": 4.056401579244219e-05, + "loss": 0.929, + "step": 3347 + }, + { + "epoch": 1.8883248730964466, + "grad_norm": 2.9454691410064697, + "learning_rate": 4.0561195713479975e-05, + "loss": 0.8374, + "step": 3348 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 1.7239240407943726, + "learning_rate": 4.055837563451777e-05, + "loss": 0.703, + "step": 3349 + }, + { + "epoch": 1.889452904681331, + "grad_norm": 1.8031156063079834, + "learning_rate": 4.055555555555556e-05, + "loss": 0.8085, + "step": 3350 + }, + { + "epoch": 1.8900169204737733, + "grad_norm": 3.2004599571228027, + "learning_rate": 4.0552735476593345e-05, + "loss": 0.8223, + "step": 3351 + }, + { + "epoch": 1.8905809362662156, + "grad_norm": 1.6112946271896362, + "learning_rate": 4.054991539763113e-05, + "loss": 0.8129, + "step": 3352 + }, + { + "epoch": 1.8911449520586576, + "grad_norm": 1.8480414152145386, + "learning_rate": 4.054709531866893e-05, + "loss": 0.6912, + "step": 3353 + }, + { + "epoch": 1.8917089678510999, + "grad_norm": 1.8995141983032227, + "learning_rate": 4.0544275239706715e-05, + "loss": 0.7662, + "step": 3354 + }, + { + "epoch": 1.892272983643542, + "grad_norm": 1.1142916679382324, + "learning_rate": 4.05414551607445e-05, + "loss": 0.7014, + "step": 3355 + }, + { + "epoch": 1.8928369994359842, + "grad_norm": 1.594075083732605, + "learning_rate": 4.053863508178229e-05, + "loss": 0.814, + "step": 3356 + }, + { + "epoch": 1.8934010152284264, + "grad_norm": 1.204177975654602, + "learning_rate": 4.0535815002820084e-05, + "loss": 0.713, + "step": 3357 + }, + { + "epoch": 1.8939650310208687, + "grad_norm": 1.6000514030456543, + "learning_rate": 4.053299492385787e-05, + "loss": 0.6928, + "step": 3358 + }, + { + "epoch": 1.894529046813311, + "grad_norm": 3.0570478439331055, + "learning_rate": 4.0530174844895655e-05, + "loss": 0.7875, + "step": 3359 + }, + { + "epoch": 1.895093062605753, + "grad_norm": 4.741988182067871, + "learning_rate": 4.052735476593345e-05, + "loss": 0.7052, + "step": 3360 + }, + { + "epoch": 1.8956570783981952, + "grad_norm": 1.3846384286880493, + "learning_rate": 4.052453468697124e-05, + "loss": 0.737, + "step": 3361 + }, + { + "epoch": 1.8962210941906372, + "grad_norm": 3.330906629562378, + "learning_rate": 4.0521714608009025e-05, + "loss": 0.8285, + "step": 3362 + }, + { + "epoch": 1.8967851099830795, + "grad_norm": 1.750741958618164, + "learning_rate": 4.051889452904681e-05, + "loss": 0.8805, + "step": 3363 + }, + { + "epoch": 1.8973491257755217, + "grad_norm": 3.2489748001098633, + "learning_rate": 4.051607445008461e-05, + "loss": 0.8447, + "step": 3364 + }, + { + "epoch": 1.897913141567964, + "grad_norm": 2.8967177867889404, + "learning_rate": 4.0513254371122395e-05, + "loss": 0.8439, + "step": 3365 + }, + { + "epoch": 1.8984771573604062, + "grad_norm": 1.7333556413650513, + "learning_rate": 4.051043429216018e-05, + "loss": 0.7268, + "step": 3366 + }, + { + "epoch": 1.8990411731528483, + "grad_norm": 2.289215564727783, + "learning_rate": 4.050761421319797e-05, + "loss": 0.8188, + "step": 3367 + }, + { + "epoch": 1.8996051889452905, + "grad_norm": 2.032994270324707, + "learning_rate": 4.0504794134235765e-05, + "loss": 0.8502, + "step": 3368 + }, + { + "epoch": 1.9001692047377325, + "grad_norm": 3.4490272998809814, + "learning_rate": 4.050197405527355e-05, + "loss": 0.833, + "step": 3369 + }, + { + "epoch": 1.9007332205301748, + "grad_norm": 3.1214537620544434, + "learning_rate": 4.0499153976311335e-05, + "loss": 0.8527, + "step": 3370 + }, + { + "epoch": 1.901297236322617, + "grad_norm": 1.7205449342727661, + "learning_rate": 4.049633389734913e-05, + "loss": 0.679, + "step": 3371 + }, + { + "epoch": 1.9018612521150593, + "grad_norm": 2.915752649307251, + "learning_rate": 4.049351381838692e-05, + "loss": 0.8544, + "step": 3372 + }, + { + "epoch": 1.9024252679075015, + "grad_norm": 2.547429084777832, + "learning_rate": 4.0490693739424705e-05, + "loss": 0.8488, + "step": 3373 + }, + { + "epoch": 1.9029892836999436, + "grad_norm": 2.4482080936431885, + "learning_rate": 4.04878736604625e-05, + "loss": 0.7397, + "step": 3374 + }, + { + "epoch": 1.9035532994923858, + "grad_norm": 1.6965566873550415, + "learning_rate": 4.048505358150028e-05, + "loss": 0.8676, + "step": 3375 + }, + { + "epoch": 1.9041173152848279, + "grad_norm": 1.5114836692810059, + "learning_rate": 4.0482233502538075e-05, + "loss": 0.7445, + "step": 3376 + }, + { + "epoch": 1.9046813310772701, + "grad_norm": 1.485374927520752, + "learning_rate": 4.047941342357586e-05, + "loss": 0.8241, + "step": 3377 + }, + { + "epoch": 1.9052453468697124, + "grad_norm": 2.6673429012298584, + "learning_rate": 4.047659334461365e-05, + "loss": 0.7981, + "step": 3378 + }, + { + "epoch": 1.9058093626621546, + "grad_norm": 2.986579179763794, + "learning_rate": 4.047377326565144e-05, + "loss": 1.0041, + "step": 3379 + }, + { + "epoch": 1.9063733784545969, + "grad_norm": 1.9235817193984985, + "learning_rate": 4.047095318668923e-05, + "loss": 0.6693, + "step": 3380 + }, + { + "epoch": 1.906937394247039, + "grad_norm": 2.1430258750915527, + "learning_rate": 4.0468133107727016e-05, + "loss": 0.8608, + "step": 3381 + }, + { + "epoch": 1.9075014100394811, + "grad_norm": 1.3360700607299805, + "learning_rate": 4.046531302876481e-05, + "loss": 0.7304, + "step": 3382 + }, + { + "epoch": 1.9080654258319232, + "grad_norm": 2.102100133895874, + "learning_rate": 4.046249294980259e-05, + "loss": 0.8671, + "step": 3383 + }, + { + "epoch": 1.9086294416243654, + "grad_norm": 2.519850969314575, + "learning_rate": 4.0459672870840385e-05, + "loss": 0.8081, + "step": 3384 + }, + { + "epoch": 1.9091934574168077, + "grad_norm": 1.796711802482605, + "learning_rate": 4.045685279187818e-05, + "loss": 0.8081, + "step": 3385 + }, + { + "epoch": 1.90975747320925, + "grad_norm": 5.679064750671387, + "learning_rate": 4.045403271291596e-05, + "loss": 1.3302, + "step": 3386 + }, + { + "epoch": 1.9103214890016922, + "grad_norm": 1.249380111694336, + "learning_rate": 4.045121263395375e-05, + "loss": 0.7261, + "step": 3387 + }, + { + "epoch": 1.9108855047941342, + "grad_norm": 4.015459060668945, + "learning_rate": 4.044839255499154e-05, + "loss": 0.8853, + "step": 3388 + }, + { + "epoch": 1.9114495205865765, + "grad_norm": 1.49779212474823, + "learning_rate": 4.044557247602933e-05, + "loss": 0.7666, + "step": 3389 + }, + { + "epoch": 1.9120135363790185, + "grad_norm": 1.7742440700531006, + "learning_rate": 4.044275239706712e-05, + "loss": 0.8079, + "step": 3390 + }, + { + "epoch": 1.9125775521714607, + "grad_norm": 1.4323402643203735, + "learning_rate": 4.0439932318104904e-05, + "loss": 0.7804, + "step": 3391 + }, + { + "epoch": 1.913141567963903, + "grad_norm": 1.1340481042861938, + "learning_rate": 4.04371122391427e-05, + "loss": 0.7462, + "step": 3392 + }, + { + "epoch": 1.9137055837563453, + "grad_norm": 1.5081483125686646, + "learning_rate": 4.043429216018049e-05, + "loss": 0.7155, + "step": 3393 + }, + { + "epoch": 1.9142695995487875, + "grad_norm": 1.8845775127410889, + "learning_rate": 4.043147208121827e-05, + "loss": 0.8008, + "step": 3394 + }, + { + "epoch": 1.9148336153412295, + "grad_norm": 2.0958104133605957, + "learning_rate": 4.0428652002256066e-05, + "loss": 0.7914, + "step": 3395 + }, + { + "epoch": 1.9153976311336718, + "grad_norm": 2.006145477294922, + "learning_rate": 4.042583192329386e-05, + "loss": 0.7629, + "step": 3396 + }, + { + "epoch": 1.9159616469261138, + "grad_norm": 2.013828754425049, + "learning_rate": 4.042301184433164e-05, + "loss": 0.8331, + "step": 3397 + }, + { + "epoch": 1.916525662718556, + "grad_norm": 1.2923566102981567, + "learning_rate": 4.042019176536943e-05, + "loss": 0.7283, + "step": 3398 + }, + { + "epoch": 1.9170896785109983, + "grad_norm": 1.9807772636413574, + "learning_rate": 4.041737168640722e-05, + "loss": 0.8545, + "step": 3399 + }, + { + "epoch": 1.9176536943034406, + "grad_norm": 1.9976294040679932, + "learning_rate": 4.041455160744501e-05, + "loss": 0.9147, + "step": 3400 + }, + { + "epoch": 1.9182177100958828, + "grad_norm": 1.7271000146865845, + "learning_rate": 4.04117315284828e-05, + "loss": 0.8136, + "step": 3401 + }, + { + "epoch": 1.9187817258883249, + "grad_norm": 1.4120911359786987, + "learning_rate": 4.0408911449520584e-05, + "loss": 0.7438, + "step": 3402 + }, + { + "epoch": 1.919345741680767, + "grad_norm": 1.9923840761184692, + "learning_rate": 4.040609137055838e-05, + "loss": 0.8097, + "step": 3403 + }, + { + "epoch": 1.9199097574732091, + "grad_norm": 1.901414155960083, + "learning_rate": 4.040327129159617e-05, + "loss": 0.8028, + "step": 3404 + }, + { + "epoch": 1.9204737732656514, + "grad_norm": 2.7410805225372314, + "learning_rate": 4.0400451212633954e-05, + "loss": 0.8511, + "step": 3405 + }, + { + "epoch": 1.9210377890580936, + "grad_norm": 4.024306774139404, + "learning_rate": 4.0397631133671746e-05, + "loss": 0.9423, + "step": 3406 + }, + { + "epoch": 1.9216018048505359, + "grad_norm": 1.762539029121399, + "learning_rate": 4.039481105470954e-05, + "loss": 0.7394, + "step": 3407 + }, + { + "epoch": 1.9221658206429781, + "grad_norm": 2.189176321029663, + "learning_rate": 4.039199097574732e-05, + "loss": 0.8291, + "step": 3408 + }, + { + "epoch": 1.9227298364354202, + "grad_norm": 3.1442713737487793, + "learning_rate": 4.038917089678511e-05, + "loss": 0.9647, + "step": 3409 + }, + { + "epoch": 1.9232938522278624, + "grad_norm": 1.2192997932434082, + "learning_rate": 4.03863508178229e-05, + "loss": 0.6803, + "step": 3410 + }, + { + "epoch": 1.9238578680203045, + "grad_norm": 1.9165370464324951, + "learning_rate": 4.038353073886069e-05, + "loss": 0.9107, + "step": 3411 + }, + { + "epoch": 1.9244218838127467, + "grad_norm": 1.3862348794937134, + "learning_rate": 4.038071065989848e-05, + "loss": 0.7371, + "step": 3412 + }, + { + "epoch": 1.924985899605189, + "grad_norm": 3.510591983795166, + "learning_rate": 4.037789058093627e-05, + "loss": 0.8189, + "step": 3413 + }, + { + "epoch": 1.9255499153976312, + "grad_norm": 2.256193161010742, + "learning_rate": 4.0375070501974056e-05, + "loss": 0.852, + "step": 3414 + }, + { + "epoch": 1.9261139311900735, + "grad_norm": 2.6149983406066895, + "learning_rate": 4.037225042301185e-05, + "loss": 0.8367, + "step": 3415 + }, + { + "epoch": 1.9266779469825155, + "grad_norm": 2.588548183441162, + "learning_rate": 4.0369430344049634e-05, + "loss": 0.8209, + "step": 3416 + }, + { + "epoch": 1.9272419627749577, + "grad_norm": 3.356111764907837, + "learning_rate": 4.0366610265087426e-05, + "loss": 0.7973, + "step": 3417 + }, + { + "epoch": 1.9278059785673998, + "grad_norm": 1.3797404766082764, + "learning_rate": 4.036379018612521e-05, + "loss": 0.7723, + "step": 3418 + }, + { + "epoch": 1.928369994359842, + "grad_norm": 1.7689093351364136, + "learning_rate": 4.0360970107163003e-05, + "loss": 0.6824, + "step": 3419 + }, + { + "epoch": 1.9289340101522843, + "grad_norm": 1.3380012512207031, + "learning_rate": 4.035815002820079e-05, + "loss": 0.8017, + "step": 3420 + }, + { + "epoch": 1.9294980259447265, + "grad_norm": 3.7304697036743164, + "learning_rate": 4.035532994923858e-05, + "loss": 0.9134, + "step": 3421 + }, + { + "epoch": 1.9300620417371688, + "grad_norm": 1.698175311088562, + "learning_rate": 4.0352509870276366e-05, + "loss": 0.8399, + "step": 3422 + }, + { + "epoch": 1.9306260575296108, + "grad_norm": 3.387363910675049, + "learning_rate": 4.034968979131416e-05, + "loss": 0.7873, + "step": 3423 + }, + { + "epoch": 1.931190073322053, + "grad_norm": 3.2706265449523926, + "learning_rate": 4.034686971235195e-05, + "loss": 0.8188, + "step": 3424 + }, + { + "epoch": 1.931754089114495, + "grad_norm": 1.4174206256866455, + "learning_rate": 4.0344049633389736e-05, + "loss": 0.8725, + "step": 3425 + }, + { + "epoch": 1.9323181049069373, + "grad_norm": 1.08367919921875, + "learning_rate": 4.034122955442752e-05, + "loss": 0.6539, + "step": 3426 + }, + { + "epoch": 1.9328821206993796, + "grad_norm": 1.6987131834030151, + "learning_rate": 4.0338409475465314e-05, + "loss": 0.8314, + "step": 3427 + }, + { + "epoch": 1.9334461364918218, + "grad_norm": 2.2506816387176514, + "learning_rate": 4.0335589396503106e-05, + "loss": 0.9736, + "step": 3428 + }, + { + "epoch": 1.934010152284264, + "grad_norm": 2.755584955215454, + "learning_rate": 4.033276931754089e-05, + "loss": 0.6849, + "step": 3429 + }, + { + "epoch": 1.9345741680767061, + "grad_norm": 1.853174090385437, + "learning_rate": 4.0329949238578684e-05, + "loss": 0.8143, + "step": 3430 + }, + { + "epoch": 1.9351381838691484, + "grad_norm": 1.7751407623291016, + "learning_rate": 4.0327129159616476e-05, + "loss": 0.8328, + "step": 3431 + }, + { + "epoch": 1.9357021996615904, + "grad_norm": 2.229370355606079, + "learning_rate": 4.032430908065426e-05, + "loss": 0.8546, + "step": 3432 + }, + { + "epoch": 1.9362662154540327, + "grad_norm": 1.957994818687439, + "learning_rate": 4.0321489001692047e-05, + "loss": 0.8394, + "step": 3433 + }, + { + "epoch": 1.936830231246475, + "grad_norm": 1.904015302658081, + "learning_rate": 4.031866892272984e-05, + "loss": 0.8622, + "step": 3434 + }, + { + "epoch": 1.9373942470389172, + "grad_norm": 1.4655451774597168, + "learning_rate": 4.031584884376763e-05, + "loss": 0.7024, + "step": 3435 + }, + { + "epoch": 1.9379582628313594, + "grad_norm": 2.077575445175171, + "learning_rate": 4.0313028764805416e-05, + "loss": 0.8322, + "step": 3436 + }, + { + "epoch": 1.9385222786238014, + "grad_norm": 1.1993887424468994, + "learning_rate": 4.03102086858432e-05, + "loss": 0.7347, + "step": 3437 + }, + { + "epoch": 1.9390862944162437, + "grad_norm": 1.9218544960021973, + "learning_rate": 4.0307388606880994e-05, + "loss": 0.821, + "step": 3438 + }, + { + "epoch": 1.9396503102086857, + "grad_norm": 2.254420518875122, + "learning_rate": 4.0304568527918786e-05, + "loss": 0.6616, + "step": 3439 + }, + { + "epoch": 1.940214326001128, + "grad_norm": 13.922019958496094, + "learning_rate": 4.030174844895657e-05, + "loss": 0.9689, + "step": 3440 + }, + { + "epoch": 1.9407783417935702, + "grad_norm": 1.7272778749465942, + "learning_rate": 4.029892836999436e-05, + "loss": 0.7513, + "step": 3441 + }, + { + "epoch": 1.9413423575860125, + "grad_norm": 1.4320170879364014, + "learning_rate": 4.0296108291032156e-05, + "loss": 0.7789, + "step": 3442 + }, + { + "epoch": 1.9419063733784547, + "grad_norm": 3.119124174118042, + "learning_rate": 4.029328821206994e-05, + "loss": 0.8381, + "step": 3443 + }, + { + "epoch": 1.9424703891708968, + "grad_norm": 1.152909755706787, + "learning_rate": 4.029046813310773e-05, + "loss": 0.8037, + "step": 3444 + }, + { + "epoch": 1.943034404963339, + "grad_norm": 2.2259910106658936, + "learning_rate": 4.028764805414552e-05, + "loss": 0.8201, + "step": 3445 + }, + { + "epoch": 1.943598420755781, + "grad_norm": 1.3690552711486816, + "learning_rate": 4.028482797518331e-05, + "loss": 0.738, + "step": 3446 + }, + { + "epoch": 1.9441624365482233, + "grad_norm": 2.4507298469543457, + "learning_rate": 4.0282007896221097e-05, + "loss": 0.8837, + "step": 3447 + }, + { + "epoch": 1.9447264523406655, + "grad_norm": 2.068267583847046, + "learning_rate": 4.027918781725888e-05, + "loss": 0.8492, + "step": 3448 + }, + { + "epoch": 1.9452904681331078, + "grad_norm": 4.326219081878662, + "learning_rate": 4.0276367738296674e-05, + "loss": 1.0385, + "step": 3449 + }, + { + "epoch": 1.94585448392555, + "grad_norm": 1.708756685256958, + "learning_rate": 4.0273547659334466e-05, + "loss": 0.7553, + "step": 3450 + }, + { + "epoch": 1.946418499717992, + "grad_norm": 1.6764044761657715, + "learning_rate": 4.027072758037225e-05, + "loss": 0.7004, + "step": 3451 + }, + { + "epoch": 1.9469825155104343, + "grad_norm": 1.3860423564910889, + "learning_rate": 4.026790750141004e-05, + "loss": 0.668, + "step": 3452 + }, + { + "epoch": 1.9475465313028764, + "grad_norm": 2.137451648712158, + "learning_rate": 4.026508742244783e-05, + "loss": 0.7534, + "step": 3453 + }, + { + "epoch": 1.9481105470953186, + "grad_norm": 1.5863484144210815, + "learning_rate": 4.026226734348562e-05, + "loss": 0.8404, + "step": 3454 + }, + { + "epoch": 1.9486745628877609, + "grad_norm": 2.417083978652954, + "learning_rate": 4.025944726452341e-05, + "loss": 0.9527, + "step": 3455 + }, + { + "epoch": 1.9492385786802031, + "grad_norm": 2.209169626235962, + "learning_rate": 4.02566271855612e-05, + "loss": 0.9251, + "step": 3456 + }, + { + "epoch": 1.9498025944726454, + "grad_norm": 2.087097644805908, + "learning_rate": 4.0253807106598984e-05, + "loss": 0.7506, + "step": 3457 + }, + { + "epoch": 1.9503666102650874, + "grad_norm": 2.330463171005249, + "learning_rate": 4.025098702763678e-05, + "loss": 0.9214, + "step": 3458 + }, + { + "epoch": 1.9509306260575296, + "grad_norm": 4.212392330169678, + "learning_rate": 4.024816694867456e-05, + "loss": 0.8966, + "step": 3459 + }, + { + "epoch": 1.9514946418499717, + "grad_norm": 1.331752896308899, + "learning_rate": 4.0245346869712354e-05, + "loss": 0.7173, + "step": 3460 + }, + { + "epoch": 1.952058657642414, + "grad_norm": 2.982306718826294, + "learning_rate": 4.024252679075014e-05, + "loss": 0.8087, + "step": 3461 + }, + { + "epoch": 1.9526226734348562, + "grad_norm": 1.3130854368209839, + "learning_rate": 4.023970671178793e-05, + "loss": 0.7928, + "step": 3462 + }, + { + "epoch": 1.9531866892272984, + "grad_norm": 1.2959840297698975, + "learning_rate": 4.0236886632825724e-05, + "loss": 0.7584, + "step": 3463 + }, + { + "epoch": 1.9537507050197407, + "grad_norm": 2.257167100906372, + "learning_rate": 4.023406655386351e-05, + "loss": 0.7806, + "step": 3464 + }, + { + "epoch": 1.9543147208121827, + "grad_norm": 1.3764022588729858, + "learning_rate": 4.02312464749013e-05, + "loss": 0.7934, + "step": 3465 + }, + { + "epoch": 1.954878736604625, + "grad_norm": 1.445243000984192, + "learning_rate": 4.022842639593909e-05, + "loss": 0.7374, + "step": 3466 + }, + { + "epoch": 1.955442752397067, + "grad_norm": 2.078223943710327, + "learning_rate": 4.022560631697688e-05, + "loss": 0.7592, + "step": 3467 + }, + { + "epoch": 1.9560067681895092, + "grad_norm": 1.3728395700454712, + "learning_rate": 4.0222786238014665e-05, + "loss": 0.762, + "step": 3468 + }, + { + "epoch": 1.9565707839819515, + "grad_norm": 1.7173494100570679, + "learning_rate": 4.021996615905246e-05, + "loss": 0.7356, + "step": 3469 + }, + { + "epoch": 1.9571347997743938, + "grad_norm": 1.9564645290374756, + "learning_rate": 4.021714608009024e-05, + "loss": 0.7945, + "step": 3470 + }, + { + "epoch": 1.957698815566836, + "grad_norm": 2.3380367755889893, + "learning_rate": 4.0214326001128034e-05, + "loss": 0.783, + "step": 3471 + }, + { + "epoch": 1.958262831359278, + "grad_norm": 1.3818378448486328, + "learning_rate": 4.021150592216582e-05, + "loss": 0.8537, + "step": 3472 + }, + { + "epoch": 1.9588268471517203, + "grad_norm": 2.9656589031219482, + "learning_rate": 4.020868584320361e-05, + "loss": 0.7469, + "step": 3473 + }, + { + "epoch": 1.9593908629441623, + "grad_norm": 2.422645330429077, + "learning_rate": 4.0205865764241404e-05, + "loss": 0.84, + "step": 3474 + }, + { + "epoch": 1.9599548787366046, + "grad_norm": 2.2553820610046387, + "learning_rate": 4.020304568527919e-05, + "loss": 0.8876, + "step": 3475 + }, + { + "epoch": 1.9605188945290468, + "grad_norm": 2.2709882259368896, + "learning_rate": 4.0200225606316975e-05, + "loss": 0.8178, + "step": 3476 + }, + { + "epoch": 1.961082910321489, + "grad_norm": 1.0462646484375, + "learning_rate": 4.019740552735477e-05, + "loss": 0.6355, + "step": 3477 + }, + { + "epoch": 1.9616469261139313, + "grad_norm": 1.7391232252120972, + "learning_rate": 4.019458544839256e-05, + "loss": 0.8382, + "step": 3478 + }, + { + "epoch": 1.9622109419063734, + "grad_norm": 2.886775493621826, + "learning_rate": 4.0191765369430345e-05, + "loss": 0.8232, + "step": 3479 + }, + { + "epoch": 1.9627749576988156, + "grad_norm": 2.0261995792388916, + "learning_rate": 4.018894529046813e-05, + "loss": 0.928, + "step": 3480 + }, + { + "epoch": 1.9633389734912576, + "grad_norm": 2.2987101078033447, + "learning_rate": 4.018612521150593e-05, + "loss": 0.8745, + "step": 3481 + }, + { + "epoch": 1.9639029892836999, + "grad_norm": 2.3157198429107666, + "learning_rate": 4.0183305132543715e-05, + "loss": 0.8689, + "step": 3482 + }, + { + "epoch": 1.9644670050761421, + "grad_norm": 3.551222324371338, + "learning_rate": 4.01804850535815e-05, + "loss": 0.8096, + "step": 3483 + }, + { + "epoch": 1.9650310208685844, + "grad_norm": 1.3898463249206543, + "learning_rate": 4.017766497461929e-05, + "loss": 0.7722, + "step": 3484 + }, + { + "epoch": 1.9655950366610266, + "grad_norm": 5.174654960632324, + "learning_rate": 4.0174844895657084e-05, + "loss": 0.7905, + "step": 3485 + }, + { + "epoch": 1.9661590524534687, + "grad_norm": 2.031618118286133, + "learning_rate": 4.017202481669487e-05, + "loss": 0.7662, + "step": 3486 + }, + { + "epoch": 1.966723068245911, + "grad_norm": 1.3007246255874634, + "learning_rate": 4.0169204737732655e-05, + "loss": 0.8134, + "step": 3487 + }, + { + "epoch": 1.967287084038353, + "grad_norm": 1.1632391214370728, + "learning_rate": 4.016638465877045e-05, + "loss": 0.7682, + "step": 3488 + }, + { + "epoch": 1.9678510998307952, + "grad_norm": 1.1739698648452759, + "learning_rate": 4.016356457980824e-05, + "loss": 0.731, + "step": 3489 + }, + { + "epoch": 1.9684151156232375, + "grad_norm": 1.398618459701538, + "learning_rate": 4.0160744500846025e-05, + "loss": 0.698, + "step": 3490 + }, + { + "epoch": 1.9689791314156797, + "grad_norm": 1.878736972808838, + "learning_rate": 4.015792442188381e-05, + "loss": 0.7659, + "step": 3491 + }, + { + "epoch": 1.969543147208122, + "grad_norm": 1.4513404369354248, + "learning_rate": 4.01551043429216e-05, + "loss": 0.7601, + "step": 3492 + }, + { + "epoch": 1.970107163000564, + "grad_norm": 2.06258487701416, + "learning_rate": 4.0152284263959395e-05, + "loss": 0.9157, + "step": 3493 + }, + { + "epoch": 1.9706711787930062, + "grad_norm": 1.9779441356658936, + "learning_rate": 4.014946418499718e-05, + "loss": 0.7151, + "step": 3494 + }, + { + "epoch": 1.9712351945854483, + "grad_norm": 2.53747820854187, + "learning_rate": 4.014664410603497e-05, + "loss": 0.7634, + "step": 3495 + }, + { + "epoch": 1.9717992103778905, + "grad_norm": 2.926347494125366, + "learning_rate": 4.014382402707276e-05, + "loss": 0.762, + "step": 3496 + }, + { + "epoch": 1.9723632261703328, + "grad_norm": 2.311352252960205, + "learning_rate": 4.014100394811055e-05, + "loss": 0.8012, + "step": 3497 + }, + { + "epoch": 1.972927241962775, + "grad_norm": 3.1551902294158936, + "learning_rate": 4.0138183869148335e-05, + "loss": 0.8401, + "step": 3498 + }, + { + "epoch": 1.9734912577552173, + "grad_norm": 3.745513916015625, + "learning_rate": 4.013536379018613e-05, + "loss": 0.9234, + "step": 3499 + }, + { + "epoch": 1.9740552735476593, + "grad_norm": 4.077144145965576, + "learning_rate": 4.013254371122392e-05, + "loss": 0.8108, + "step": 3500 + }, + { + "epoch": 1.9746192893401016, + "grad_norm": 1.2234158515930176, + "learning_rate": 4.0129723632261705e-05, + "loss": 0.5753, + "step": 3501 + }, + { + "epoch": 1.9751833051325436, + "grad_norm": 1.7824879884719849, + "learning_rate": 4.01269035532995e-05, + "loss": 0.8762, + "step": 3502 + }, + { + "epoch": 1.9757473209249858, + "grad_norm": 3.1122522354125977, + "learning_rate": 4.012408347433728e-05, + "loss": 0.8749, + "step": 3503 + }, + { + "epoch": 1.976311336717428, + "grad_norm": 3.1814422607421875, + "learning_rate": 4.0121263395375075e-05, + "loss": 0.8357, + "step": 3504 + }, + { + "epoch": 1.9768753525098703, + "grad_norm": 1.453106164932251, + "learning_rate": 4.011844331641286e-05, + "loss": 0.6943, + "step": 3505 + }, + { + "epoch": 1.9774393683023126, + "grad_norm": 1.4218007326126099, + "learning_rate": 4.011562323745065e-05, + "loss": 0.7088, + "step": 3506 + }, + { + "epoch": 1.9780033840947546, + "grad_norm": 2.4339239597320557, + "learning_rate": 4.011280315848844e-05, + "loss": 0.8263, + "step": 3507 + }, + { + "epoch": 1.9785673998871969, + "grad_norm": 2.3758373260498047, + "learning_rate": 4.010998307952623e-05, + "loss": 0.8882, + "step": 3508 + }, + { + "epoch": 1.979131415679639, + "grad_norm": 1.9226967096328735, + "learning_rate": 4.0107163000564015e-05, + "loss": 0.7003, + "step": 3509 + }, + { + "epoch": 1.9796954314720812, + "grad_norm": 2.3578176498413086, + "learning_rate": 4.010434292160181e-05, + "loss": 0.7645, + "step": 3510 + }, + { + "epoch": 1.9802594472645234, + "grad_norm": 2.812030076980591, + "learning_rate": 4.010152284263959e-05, + "loss": 0.7735, + "step": 3511 + }, + { + "epoch": 1.9808234630569657, + "grad_norm": 2.5020089149475098, + "learning_rate": 4.0098702763677385e-05, + "loss": 0.8302, + "step": 3512 + }, + { + "epoch": 1.981387478849408, + "grad_norm": 1.6159744262695312, + "learning_rate": 4.009588268471518e-05, + "loss": 0.7711, + "step": 3513 + }, + { + "epoch": 1.98195149464185, + "grad_norm": 2.216994285583496, + "learning_rate": 4.009306260575296e-05, + "loss": 0.785, + "step": 3514 + }, + { + "epoch": 1.9825155104342922, + "grad_norm": 1.8952438831329346, + "learning_rate": 4.009024252679075e-05, + "loss": 0.9497, + "step": 3515 + }, + { + "epoch": 1.9830795262267342, + "grad_norm": 1.4818246364593506, + "learning_rate": 4.008742244782854e-05, + "loss": 0.7693, + "step": 3516 + }, + { + "epoch": 1.9836435420191765, + "grad_norm": 1.850576639175415, + "learning_rate": 4.008460236886633e-05, + "loss": 0.6929, + "step": 3517 + }, + { + "epoch": 1.9842075578116187, + "grad_norm": 1.275443196296692, + "learning_rate": 4.008178228990412e-05, + "loss": 0.7263, + "step": 3518 + }, + { + "epoch": 1.984771573604061, + "grad_norm": 1.5988969802856445, + "learning_rate": 4.0078962210941903e-05, + "loss": 0.7869, + "step": 3519 + }, + { + "epoch": 1.9853355893965032, + "grad_norm": 1.2852592468261719, + "learning_rate": 4.00761421319797e-05, + "loss": 0.7913, + "step": 3520 + }, + { + "epoch": 1.9858996051889453, + "grad_norm": 2.338758945465088, + "learning_rate": 4.007332205301749e-05, + "loss": 0.7191, + "step": 3521 + }, + { + "epoch": 1.9864636209813875, + "grad_norm": 1.6846011877059937, + "learning_rate": 4.007050197405527e-05, + "loss": 0.7996, + "step": 3522 + }, + { + "epoch": 1.9870276367738295, + "grad_norm": 3.2649450302124023, + "learning_rate": 4.0067681895093065e-05, + "loss": 0.8658, + "step": 3523 + }, + { + "epoch": 1.9875916525662718, + "grad_norm": 2.073964834213257, + "learning_rate": 4.006486181613086e-05, + "loss": 0.8444, + "step": 3524 + }, + { + "epoch": 1.988155668358714, + "grad_norm": 1.7759982347488403, + "learning_rate": 4.006204173716864e-05, + "loss": 0.8504, + "step": 3525 + }, + { + "epoch": 1.9887196841511563, + "grad_norm": 2.055896759033203, + "learning_rate": 4.005922165820643e-05, + "loss": 0.844, + "step": 3526 + }, + { + "epoch": 1.9892836999435985, + "grad_norm": 3.131143093109131, + "learning_rate": 4.005640157924422e-05, + "loss": 1.036, + "step": 3527 + }, + { + "epoch": 1.9898477157360406, + "grad_norm": 1.3650240898132324, + "learning_rate": 4.005358150028201e-05, + "loss": 0.8486, + "step": 3528 + }, + { + "epoch": 1.9904117315284828, + "grad_norm": 2.9426395893096924, + "learning_rate": 4.00507614213198e-05, + "loss": 0.8473, + "step": 3529 + }, + { + "epoch": 1.9909757473209249, + "grad_norm": 1.2502472400665283, + "learning_rate": 4.0047941342357584e-05, + "loss": 0.7205, + "step": 3530 + }, + { + "epoch": 1.9915397631133671, + "grad_norm": 1.4542596340179443, + "learning_rate": 4.0045121263395376e-05, + "loss": 0.7063, + "step": 3531 + }, + { + "epoch": 1.9921037789058094, + "grad_norm": 1.658296823501587, + "learning_rate": 4.004230118443317e-05, + "loss": 0.831, + "step": 3532 + }, + { + "epoch": 1.9926677946982516, + "grad_norm": 3.7616350650787354, + "learning_rate": 4.003948110547095e-05, + "loss": 0.8227, + "step": 3533 + }, + { + "epoch": 1.9932318104906939, + "grad_norm": 2.251955032348633, + "learning_rate": 4.0036661026508746e-05, + "loss": 0.8705, + "step": 3534 + }, + { + "epoch": 1.993795826283136, + "grad_norm": 2.456282615661621, + "learning_rate": 4.003384094754654e-05, + "loss": 0.9115, + "step": 3535 + }, + { + "epoch": 1.9943598420755781, + "grad_norm": 1.3562908172607422, + "learning_rate": 4.003102086858432e-05, + "loss": 0.7776, + "step": 3536 + }, + { + "epoch": 1.9949238578680202, + "grad_norm": 2.038081169128418, + "learning_rate": 4.002820078962211e-05, + "loss": 0.8951, + "step": 3537 + }, + { + "epoch": 1.9954878736604624, + "grad_norm": 1.187845230102539, + "learning_rate": 4.00253807106599e-05, + "loss": 0.7047, + "step": 3538 + }, + { + "epoch": 1.9960518894529047, + "grad_norm": 3.6045565605163574, + "learning_rate": 4.002256063169769e-05, + "loss": 0.86, + "step": 3539 + }, + { + "epoch": 1.996615905245347, + "grad_norm": 1.2272425889968872, + "learning_rate": 4.001974055273548e-05, + "loss": 0.726, + "step": 3540 + }, + { + "epoch": 1.9971799210377892, + "grad_norm": 1.4309743642807007, + "learning_rate": 4.001692047377327e-05, + "loss": 0.7247, + "step": 3541 + }, + { + "epoch": 1.9977439368302312, + "grad_norm": 2.825084686279297, + "learning_rate": 4.0014100394811056e-05, + "loss": 0.8153, + "step": 3542 + }, + { + "epoch": 1.9983079526226735, + "grad_norm": 2.2670319080352783, + "learning_rate": 4.001128031584885e-05, + "loss": 0.8506, + "step": 3543 + }, + { + "epoch": 1.9988719684151155, + "grad_norm": 2.810230255126953, + "learning_rate": 4.0008460236886633e-05, + "loss": 0.7596, + "step": 3544 + }, + { + "epoch": 1.9994359842075577, + "grad_norm": 1.669033169746399, + "learning_rate": 4.0005640157924426e-05, + "loss": 0.8196, + "step": 3545 + }, + { + "epoch": 2.0, + "grad_norm": 1.8105210065841675, + "learning_rate": 4.000282007896221e-05, + "loss": 0.7644, + "step": 3546 + }, + { + "epoch": 2.0005640157924423, + "grad_norm": 3.3299903869628906, + "learning_rate": 4e-05, + "loss": 0.8851, + "step": 3547 + }, + { + "epoch": 2.0011280315848845, + "grad_norm": 1.833738088607788, + "learning_rate": 3.999717992103779e-05, + "loss": 0.7814, + "step": 3548 + }, + { + "epoch": 2.0016920473773268, + "grad_norm": 1.9114599227905273, + "learning_rate": 3.999435984207558e-05, + "loss": 0.7453, + "step": 3549 + }, + { + "epoch": 2.0022560631697686, + "grad_norm": 3.191767454147339, + "learning_rate": 3.9991539763113366e-05, + "loss": 0.9623, + "step": 3550 + }, + { + "epoch": 2.002820078962211, + "grad_norm": 2.25968337059021, + "learning_rate": 3.998871968415116e-05, + "loss": 0.7527, + "step": 3551 + }, + { + "epoch": 2.003384094754653, + "grad_norm": 1.3484433889389038, + "learning_rate": 3.998589960518895e-05, + "loss": 0.7861, + "step": 3552 + }, + { + "epoch": 2.0039481105470953, + "grad_norm": 2.0659608840942383, + "learning_rate": 3.9983079526226736e-05, + "loss": 0.9388, + "step": 3553 + }, + { + "epoch": 2.0045121263395376, + "grad_norm": 2.608358144760132, + "learning_rate": 3.998025944726452e-05, + "loss": 0.8769, + "step": 3554 + }, + { + "epoch": 2.00507614213198, + "grad_norm": 2.5001258850097656, + "learning_rate": 3.9977439368302314e-05, + "loss": 0.8768, + "step": 3555 + }, + { + "epoch": 2.005640157924422, + "grad_norm": 2.4919378757476807, + "learning_rate": 3.9974619289340106e-05, + "loss": 0.8272, + "step": 3556 + }, + { + "epoch": 2.006204173716864, + "grad_norm": 2.8946268558502197, + "learning_rate": 3.997179921037789e-05, + "loss": 0.7789, + "step": 3557 + }, + { + "epoch": 2.006768189509306, + "grad_norm": 2.728595733642578, + "learning_rate": 3.996897913141568e-05, + "loss": 0.8175, + "step": 3558 + }, + { + "epoch": 2.0073322053017484, + "grad_norm": 1.3151038885116577, + "learning_rate": 3.9966159052453476e-05, + "loss": 0.7001, + "step": 3559 + }, + { + "epoch": 2.0078962210941906, + "grad_norm": 1.060347557067871, + "learning_rate": 3.996333897349126e-05, + "loss": 0.7205, + "step": 3560 + }, + { + "epoch": 2.008460236886633, + "grad_norm": 4.728885173797607, + "learning_rate": 3.9960518894529046e-05, + "loss": 0.8517, + "step": 3561 + }, + { + "epoch": 2.009024252679075, + "grad_norm": 1.2403380870819092, + "learning_rate": 3.995769881556684e-05, + "loss": 0.7636, + "step": 3562 + }, + { + "epoch": 2.0095882684715174, + "grad_norm": 2.2114076614379883, + "learning_rate": 3.995487873660463e-05, + "loss": 0.7803, + "step": 3563 + }, + { + "epoch": 2.010152284263959, + "grad_norm": 1.9998472929000854, + "learning_rate": 3.9952058657642416e-05, + "loss": 0.7949, + "step": 3564 + }, + { + "epoch": 2.0107163000564015, + "grad_norm": 2.0877561569213867, + "learning_rate": 3.99492385786802e-05, + "loss": 0.7983, + "step": 3565 + }, + { + "epoch": 2.0112803158488437, + "grad_norm": 1.038670539855957, + "learning_rate": 3.9946418499717994e-05, + "loss": 0.6653, + "step": 3566 + }, + { + "epoch": 2.011844331641286, + "grad_norm": 1.650781512260437, + "learning_rate": 3.9943598420755786e-05, + "loss": 0.674, + "step": 3567 + }, + { + "epoch": 2.012408347433728, + "grad_norm": 2.5603628158569336, + "learning_rate": 3.994077834179357e-05, + "loss": 0.912, + "step": 3568 + }, + { + "epoch": 2.0129723632261705, + "grad_norm": 1.661156177520752, + "learning_rate": 3.993795826283136e-05, + "loss": 0.7514, + "step": 3569 + }, + { + "epoch": 2.0135363790186127, + "grad_norm": 1.3883435726165771, + "learning_rate": 3.9935138183869156e-05, + "loss": 0.8102, + "step": 3570 + }, + { + "epoch": 2.0141003948110545, + "grad_norm": 1.4365131855010986, + "learning_rate": 3.993231810490694e-05, + "loss": 0.7416, + "step": 3571 + }, + { + "epoch": 2.0146644106034968, + "grad_norm": 4.181033611297607, + "learning_rate": 3.9929498025944727e-05, + "loss": 0.805, + "step": 3572 + }, + { + "epoch": 2.015228426395939, + "grad_norm": 2.7897205352783203, + "learning_rate": 3.992667794698252e-05, + "loss": 0.8567, + "step": 3573 + }, + { + "epoch": 2.0157924421883813, + "grad_norm": 2.5677647590637207, + "learning_rate": 3.992385786802031e-05, + "loss": 0.8066, + "step": 3574 + }, + { + "epoch": 2.0163564579808235, + "grad_norm": 2.1469883918762207, + "learning_rate": 3.9921037789058096e-05, + "loss": 0.8807, + "step": 3575 + }, + { + "epoch": 2.0169204737732658, + "grad_norm": 2.6923739910125732, + "learning_rate": 3.991821771009588e-05, + "loss": 0.8506, + "step": 3576 + }, + { + "epoch": 2.017484489565708, + "grad_norm": 1.4120311737060547, + "learning_rate": 3.9915397631133674e-05, + "loss": 0.5848, + "step": 3577 + }, + { + "epoch": 2.01804850535815, + "grad_norm": 1.827809453010559, + "learning_rate": 3.9912577552171466e-05, + "loss": 0.7451, + "step": 3578 + }, + { + "epoch": 2.018612521150592, + "grad_norm": 2.1201696395874023, + "learning_rate": 3.990975747320925e-05, + "loss": 0.8437, + "step": 3579 + }, + { + "epoch": 2.0191765369430343, + "grad_norm": 1.6362180709838867, + "learning_rate": 3.9906937394247044e-05, + "loss": 0.6702, + "step": 3580 + }, + { + "epoch": 2.0197405527354766, + "grad_norm": 1.6276044845581055, + "learning_rate": 3.990411731528483e-05, + "loss": 0.7248, + "step": 3581 + }, + { + "epoch": 2.020304568527919, + "grad_norm": 1.211349606513977, + "learning_rate": 3.990129723632262e-05, + "loss": 0.5957, + "step": 3582 + }, + { + "epoch": 2.020868584320361, + "grad_norm": 1.4472531080245972, + "learning_rate": 3.989847715736041e-05, + "loss": 0.6703, + "step": 3583 + }, + { + "epoch": 2.0214326001128033, + "grad_norm": 3.4510204792022705, + "learning_rate": 3.98956570783982e-05, + "loss": 0.8276, + "step": 3584 + }, + { + "epoch": 2.021996615905245, + "grad_norm": 2.2878713607788086, + "learning_rate": 3.9892836999435984e-05, + "loss": 0.8406, + "step": 3585 + }, + { + "epoch": 2.0225606316976874, + "grad_norm": 1.4693952798843384, + "learning_rate": 3.9890016920473777e-05, + "loss": 0.735, + "step": 3586 + }, + { + "epoch": 2.0231246474901297, + "grad_norm": 2.7030181884765625, + "learning_rate": 3.988719684151156e-05, + "loss": 0.7871, + "step": 3587 + }, + { + "epoch": 2.023688663282572, + "grad_norm": 1.8682940006256104, + "learning_rate": 3.9884376762549354e-05, + "loss": 0.6862, + "step": 3588 + }, + { + "epoch": 2.024252679075014, + "grad_norm": 1.5727556943893433, + "learning_rate": 3.988155668358714e-05, + "loss": 0.7586, + "step": 3589 + }, + { + "epoch": 2.0248166948674564, + "grad_norm": 5.9480767250061035, + "learning_rate": 3.987873660462493e-05, + "loss": 0.7624, + "step": 3590 + }, + { + "epoch": 2.0253807106598987, + "grad_norm": 2.9083495140075684, + "learning_rate": 3.9875916525662724e-05, + "loss": 0.8454, + "step": 3591 + }, + { + "epoch": 2.0259447264523405, + "grad_norm": 3.4044766426086426, + "learning_rate": 3.987309644670051e-05, + "loss": 0.8121, + "step": 3592 + }, + { + "epoch": 2.0265087422447827, + "grad_norm": 1.1631195545196533, + "learning_rate": 3.9870276367738295e-05, + "loss": 0.6861, + "step": 3593 + }, + { + "epoch": 2.027072758037225, + "grad_norm": 3.9037203788757324, + "learning_rate": 3.986745628877609e-05, + "loss": 0.8706, + "step": 3594 + }, + { + "epoch": 2.0276367738296672, + "grad_norm": 1.4333879947662354, + "learning_rate": 3.986463620981388e-05, + "loss": 0.7239, + "step": 3595 + }, + { + "epoch": 2.0282007896221095, + "grad_norm": 3.745743989944458, + "learning_rate": 3.9861816130851664e-05, + "loss": 0.7714, + "step": 3596 + }, + { + "epoch": 2.0287648054145517, + "grad_norm": 2.4145543575286865, + "learning_rate": 3.985899605188945e-05, + "loss": 0.7283, + "step": 3597 + }, + { + "epoch": 2.029328821206994, + "grad_norm": 1.343718409538269, + "learning_rate": 3.985617597292725e-05, + "loss": 0.7301, + "step": 3598 + }, + { + "epoch": 2.029892836999436, + "grad_norm": 2.619105815887451, + "learning_rate": 3.9853355893965034e-05, + "loss": 0.9574, + "step": 3599 + }, + { + "epoch": 2.030456852791878, + "grad_norm": 2.0695390701293945, + "learning_rate": 3.985053581500282e-05, + "loss": 0.6242, + "step": 3600 + }, + { + "epoch": 2.0310208685843203, + "grad_norm": 1.4110182523727417, + "learning_rate": 3.9847715736040605e-05, + "loss": 0.6787, + "step": 3601 + }, + { + "epoch": 2.0315848843767625, + "grad_norm": 1.2259395122528076, + "learning_rate": 3.9844895657078404e-05, + "loss": 0.7041, + "step": 3602 + }, + { + "epoch": 2.032148900169205, + "grad_norm": 3.3780295848846436, + "learning_rate": 3.984207557811619e-05, + "loss": 0.9276, + "step": 3603 + }, + { + "epoch": 2.032712915961647, + "grad_norm": 2.073824405670166, + "learning_rate": 3.9839255499153975e-05, + "loss": 0.9405, + "step": 3604 + }, + { + "epoch": 2.0332769317540893, + "grad_norm": 1.8284931182861328, + "learning_rate": 3.983643542019177e-05, + "loss": 0.6569, + "step": 3605 + }, + { + "epoch": 2.033840947546531, + "grad_norm": 1.9767485857009888, + "learning_rate": 3.983361534122956e-05, + "loss": 0.6861, + "step": 3606 + }, + { + "epoch": 2.0344049633389734, + "grad_norm": 1.6361582279205322, + "learning_rate": 3.9830795262267345e-05, + "loss": 0.7159, + "step": 3607 + }, + { + "epoch": 2.0349689791314156, + "grad_norm": 2.315877914428711, + "learning_rate": 3.982797518330513e-05, + "loss": 0.9102, + "step": 3608 + }, + { + "epoch": 2.035532994923858, + "grad_norm": 1.6598420143127441, + "learning_rate": 3.982515510434293e-05, + "loss": 0.8859, + "step": 3609 + }, + { + "epoch": 2.0360970107163, + "grad_norm": 1.6496645212173462, + "learning_rate": 3.9822335025380714e-05, + "loss": 0.8062, + "step": 3610 + }, + { + "epoch": 2.0366610265087424, + "grad_norm": 2.2620248794555664, + "learning_rate": 3.98195149464185e-05, + "loss": 0.727, + "step": 3611 + }, + { + "epoch": 2.0372250423011846, + "grad_norm": 2.0426907539367676, + "learning_rate": 3.981669486745629e-05, + "loss": 0.8694, + "step": 3612 + }, + { + "epoch": 2.0377890580936264, + "grad_norm": 1.590157389640808, + "learning_rate": 3.9813874788494084e-05, + "loss": 0.8935, + "step": 3613 + }, + { + "epoch": 2.0383530738860687, + "grad_norm": 1.7812305688858032, + "learning_rate": 3.981105470953187e-05, + "loss": 0.6968, + "step": 3614 + }, + { + "epoch": 2.038917089678511, + "grad_norm": 7.676169395446777, + "learning_rate": 3.9808234630569655e-05, + "loss": 0.8559, + "step": 3615 + }, + { + "epoch": 2.039481105470953, + "grad_norm": 2.1001548767089844, + "learning_rate": 3.980541455160745e-05, + "loss": 0.8278, + "step": 3616 + }, + { + "epoch": 2.0400451212633954, + "grad_norm": 1.3871103525161743, + "learning_rate": 3.980259447264524e-05, + "loss": 0.6879, + "step": 3617 + }, + { + "epoch": 2.0406091370558377, + "grad_norm": 2.1403090953826904, + "learning_rate": 3.9799774393683025e-05, + "loss": 0.8142, + "step": 3618 + }, + { + "epoch": 2.04117315284828, + "grad_norm": 1.999006748199463, + "learning_rate": 3.979695431472081e-05, + "loss": 0.7137, + "step": 3619 + }, + { + "epoch": 2.0417371686407217, + "grad_norm": 2.438877820968628, + "learning_rate": 3.97941342357586e-05, + "loss": 0.836, + "step": 3620 + }, + { + "epoch": 2.042301184433164, + "grad_norm": 1.9676750898361206, + "learning_rate": 3.9791314156796395e-05, + "loss": 0.8387, + "step": 3621 + }, + { + "epoch": 2.0428652002256062, + "grad_norm": 2.1744544506073, + "learning_rate": 3.978849407783418e-05, + "loss": 0.8942, + "step": 3622 + }, + { + "epoch": 2.0434292160180485, + "grad_norm": 1.3896976709365845, + "learning_rate": 3.978567399887197e-05, + "loss": 0.6459, + "step": 3623 + }, + { + "epoch": 2.0439932318104908, + "grad_norm": 3.1739282608032227, + "learning_rate": 3.978285391990976e-05, + "loss": 0.9388, + "step": 3624 + }, + { + "epoch": 2.044557247602933, + "grad_norm": 1.9637633562088013, + "learning_rate": 3.978003384094755e-05, + "loss": 0.7601, + "step": 3625 + }, + { + "epoch": 2.0451212633953753, + "grad_norm": 1.7174383401870728, + "learning_rate": 3.9777213761985335e-05, + "loss": 0.8317, + "step": 3626 + }, + { + "epoch": 2.045685279187817, + "grad_norm": 1.2802621126174927, + "learning_rate": 3.977439368302313e-05, + "loss": 0.8555, + "step": 3627 + }, + { + "epoch": 2.0462492949802593, + "grad_norm": 1.4057395458221436, + "learning_rate": 3.977157360406091e-05, + "loss": 0.761, + "step": 3628 + }, + { + "epoch": 2.0468133107727016, + "grad_norm": 3.197722911834717, + "learning_rate": 3.9768753525098705e-05, + "loss": 0.9038, + "step": 3629 + }, + { + "epoch": 2.047377326565144, + "grad_norm": 1.5794564485549927, + "learning_rate": 3.97659334461365e-05, + "loss": 0.7519, + "step": 3630 + }, + { + "epoch": 2.047941342357586, + "grad_norm": 1.7437596321105957, + "learning_rate": 3.976311336717428e-05, + "loss": 0.7718, + "step": 3631 + }, + { + "epoch": 2.0485053581500283, + "grad_norm": 2.098238468170166, + "learning_rate": 3.976029328821207e-05, + "loss": 0.8192, + "step": 3632 + }, + { + "epoch": 2.0490693739424706, + "grad_norm": 1.8117560148239136, + "learning_rate": 3.975747320924986e-05, + "loss": 0.7456, + "step": 3633 + }, + { + "epoch": 2.0496333897349124, + "grad_norm": 1.17897629737854, + "learning_rate": 3.975465313028765e-05, + "loss": 0.6912, + "step": 3634 + }, + { + "epoch": 2.0501974055273546, + "grad_norm": 1.673658013343811, + "learning_rate": 3.975183305132544e-05, + "loss": 0.6995, + "step": 3635 + }, + { + "epoch": 2.050761421319797, + "grad_norm": 1.9008489847183228, + "learning_rate": 3.974901297236322e-05, + "loss": 0.8953, + "step": 3636 + }, + { + "epoch": 2.051325437112239, + "grad_norm": 1.376089096069336, + "learning_rate": 3.9746192893401015e-05, + "loss": 0.7552, + "step": 3637 + }, + { + "epoch": 2.0518894529046814, + "grad_norm": 1.2616697549819946, + "learning_rate": 3.974337281443881e-05, + "loss": 0.6944, + "step": 3638 + }, + { + "epoch": 2.0524534686971236, + "grad_norm": 1.6600075960159302, + "learning_rate": 3.974055273547659e-05, + "loss": 0.8264, + "step": 3639 + }, + { + "epoch": 2.053017484489566, + "grad_norm": 1.281896948814392, + "learning_rate": 3.9737732656514385e-05, + "loss": 0.801, + "step": 3640 + }, + { + "epoch": 2.0535815002820077, + "grad_norm": 2.1614840030670166, + "learning_rate": 3.973491257755218e-05, + "loss": 0.8336, + "step": 3641 + }, + { + "epoch": 2.05414551607445, + "grad_norm": 1.5779178142547607, + "learning_rate": 3.973209249858996e-05, + "loss": 0.8644, + "step": 3642 + }, + { + "epoch": 2.054709531866892, + "grad_norm": 1.8298513889312744, + "learning_rate": 3.972927241962775e-05, + "loss": 0.7794, + "step": 3643 + }, + { + "epoch": 2.0552735476593345, + "grad_norm": 1.2376729249954224, + "learning_rate": 3.972645234066554e-05, + "loss": 0.8449, + "step": 3644 + }, + { + "epoch": 2.0558375634517767, + "grad_norm": 1.2834450006484985, + "learning_rate": 3.972363226170333e-05, + "loss": 0.7331, + "step": 3645 + }, + { + "epoch": 2.056401579244219, + "grad_norm": 2.8452961444854736, + "learning_rate": 3.972081218274112e-05, + "loss": 0.9174, + "step": 3646 + }, + { + "epoch": 2.056965595036661, + "grad_norm": 2.190554618835449, + "learning_rate": 3.97179921037789e-05, + "loss": 0.7988, + "step": 3647 + }, + { + "epoch": 2.057529610829103, + "grad_norm": 1.9589675664901733, + "learning_rate": 3.97151720248167e-05, + "loss": 0.8741, + "step": 3648 + }, + { + "epoch": 2.0580936266215453, + "grad_norm": 2.516519784927368, + "learning_rate": 3.971235194585449e-05, + "loss": 0.8125, + "step": 3649 + }, + { + "epoch": 2.0586576424139875, + "grad_norm": 1.5823651552200317, + "learning_rate": 3.970953186689227e-05, + "loss": 0.7389, + "step": 3650 + }, + { + "epoch": 2.0592216582064298, + "grad_norm": 1.4526094198226929, + "learning_rate": 3.9706711787930065e-05, + "loss": 0.7358, + "step": 3651 + }, + { + "epoch": 2.059785673998872, + "grad_norm": 1.8524175882339478, + "learning_rate": 3.970389170896786e-05, + "loss": 0.7688, + "step": 3652 + }, + { + "epoch": 2.0603496897913143, + "grad_norm": 3.1463708877563477, + "learning_rate": 3.970107163000564e-05, + "loss": 0.8691, + "step": 3653 + }, + { + "epoch": 2.0609137055837565, + "grad_norm": 1.430720567703247, + "learning_rate": 3.969825155104343e-05, + "loss": 0.7815, + "step": 3654 + }, + { + "epoch": 2.0614777213761983, + "grad_norm": 2.804628610610962, + "learning_rate": 3.969543147208122e-05, + "loss": 0.8034, + "step": 3655 + }, + { + "epoch": 2.0620417371686406, + "grad_norm": 2.4021289348602295, + "learning_rate": 3.969261139311901e-05, + "loss": 0.79, + "step": 3656 + }, + { + "epoch": 2.062605752961083, + "grad_norm": 2.038268566131592, + "learning_rate": 3.96897913141568e-05, + "loss": 0.7697, + "step": 3657 + }, + { + "epoch": 2.063169768753525, + "grad_norm": 2.7645771503448486, + "learning_rate": 3.9686971235194583e-05, + "loss": 0.7965, + "step": 3658 + }, + { + "epoch": 2.0637337845459673, + "grad_norm": 1.2045317888259888, + "learning_rate": 3.9684151156232376e-05, + "loss": 0.7182, + "step": 3659 + }, + { + "epoch": 2.0642978003384096, + "grad_norm": 2.3507111072540283, + "learning_rate": 3.968133107727017e-05, + "loss": 0.9083, + "step": 3660 + }, + { + "epoch": 2.064861816130852, + "grad_norm": 1.7573319673538208, + "learning_rate": 3.967851099830795e-05, + "loss": 0.7748, + "step": 3661 + }, + { + "epoch": 2.0654258319232937, + "grad_norm": 3.2913501262664795, + "learning_rate": 3.9675690919345745e-05, + "loss": 0.8868, + "step": 3662 + }, + { + "epoch": 2.065989847715736, + "grad_norm": 0.9998308420181274, + "learning_rate": 3.967287084038353e-05, + "loss": 0.7295, + "step": 3663 + }, + { + "epoch": 2.066553863508178, + "grad_norm": 1.4097988605499268, + "learning_rate": 3.967005076142132e-05, + "loss": 0.8362, + "step": 3664 + }, + { + "epoch": 2.0671178793006204, + "grad_norm": 1.2150397300720215, + "learning_rate": 3.966723068245911e-05, + "loss": 0.7161, + "step": 3665 + }, + { + "epoch": 2.0676818950930627, + "grad_norm": 1.5793814659118652, + "learning_rate": 3.96644106034969e-05, + "loss": 0.7216, + "step": 3666 + }, + { + "epoch": 2.068245910885505, + "grad_norm": 3.8021912574768066, + "learning_rate": 3.9661590524534686e-05, + "loss": 0.8854, + "step": 3667 + }, + { + "epoch": 2.068809926677947, + "grad_norm": 2.0058605670928955, + "learning_rate": 3.965877044557248e-05, + "loss": 0.8254, + "step": 3668 + }, + { + "epoch": 2.069373942470389, + "grad_norm": 2.162184000015259, + "learning_rate": 3.965595036661027e-05, + "loss": 0.8112, + "step": 3669 + }, + { + "epoch": 2.0699379582628312, + "grad_norm": 1.3567067384719849, + "learning_rate": 3.9653130287648056e-05, + "loss": 0.6674, + "step": 3670 + }, + { + "epoch": 2.0705019740552735, + "grad_norm": 1.7449777126312256, + "learning_rate": 3.965031020868584e-05, + "loss": 0.8231, + "step": 3671 + }, + { + "epoch": 2.0710659898477157, + "grad_norm": 1.068384051322937, + "learning_rate": 3.964749012972363e-05, + "loss": 0.6875, + "step": 3672 + }, + { + "epoch": 2.071630005640158, + "grad_norm": 3.7373762130737305, + "learning_rate": 3.9644670050761426e-05, + "loss": 0.8886, + "step": 3673 + }, + { + "epoch": 2.0721940214326002, + "grad_norm": 1.902698278427124, + "learning_rate": 3.964184997179921e-05, + "loss": 0.737, + "step": 3674 + }, + { + "epoch": 2.0727580372250425, + "grad_norm": 1.3495796918869019, + "learning_rate": 3.9639029892837e-05, + "loss": 0.7745, + "step": 3675 + }, + { + "epoch": 2.0733220530174843, + "grad_norm": 2.156341552734375, + "learning_rate": 3.963620981387479e-05, + "loss": 0.7266, + "step": 3676 + }, + { + "epoch": 2.0738860688099265, + "grad_norm": 1.3105710744857788, + "learning_rate": 3.963338973491258e-05, + "loss": 0.6592, + "step": 3677 + }, + { + "epoch": 2.074450084602369, + "grad_norm": 2.750330686569214, + "learning_rate": 3.9630569655950366e-05, + "loss": 0.8706, + "step": 3678 + }, + { + "epoch": 2.075014100394811, + "grad_norm": 1.6303809881210327, + "learning_rate": 3.962774957698816e-05, + "loss": 0.7637, + "step": 3679 + }, + { + "epoch": 2.0755781161872533, + "grad_norm": 1.6863425970077515, + "learning_rate": 3.962492949802595e-05, + "loss": 0.8265, + "step": 3680 + }, + { + "epoch": 2.0761421319796955, + "grad_norm": 1.3146826028823853, + "learning_rate": 3.9622109419063736e-05, + "loss": 0.7022, + "step": 3681 + }, + { + "epoch": 2.076706147772138, + "grad_norm": 3.3805394172668457, + "learning_rate": 3.961928934010152e-05, + "loss": 0.8936, + "step": 3682 + }, + { + "epoch": 2.0772701635645796, + "grad_norm": 2.6790266036987305, + "learning_rate": 3.9616469261139313e-05, + "loss": 0.6712, + "step": 3683 + }, + { + "epoch": 2.077834179357022, + "grad_norm": 1.8918555974960327, + "learning_rate": 3.9613649182177106e-05, + "loss": 0.7685, + "step": 3684 + }, + { + "epoch": 2.078398195149464, + "grad_norm": 2.9566845893859863, + "learning_rate": 3.961082910321489e-05, + "loss": 0.8226, + "step": 3685 + }, + { + "epoch": 2.0789622109419064, + "grad_norm": 1.4644434452056885, + "learning_rate": 3.9608009024252676e-05, + "loss": 0.79, + "step": 3686 + }, + { + "epoch": 2.0795262267343486, + "grad_norm": 1.357036828994751, + "learning_rate": 3.9605188945290475e-05, + "loss": 0.6505, + "step": 3687 + }, + { + "epoch": 2.080090242526791, + "grad_norm": 4.165358543395996, + "learning_rate": 3.960236886632826e-05, + "loss": 0.9386, + "step": 3688 + }, + { + "epoch": 2.080654258319233, + "grad_norm": 2.6948790550231934, + "learning_rate": 3.9599548787366046e-05, + "loss": 0.7688, + "step": 3689 + }, + { + "epoch": 2.081218274111675, + "grad_norm": 1.8289178609848022, + "learning_rate": 3.959672870840384e-05, + "loss": 0.6563, + "step": 3690 + }, + { + "epoch": 2.081782289904117, + "grad_norm": 1.5108940601348877, + "learning_rate": 3.959390862944163e-05, + "loss": 0.678, + "step": 3691 + }, + { + "epoch": 2.0823463056965594, + "grad_norm": 1.991288185119629, + "learning_rate": 3.9591088550479416e-05, + "loss": 0.718, + "step": 3692 + }, + { + "epoch": 2.0829103214890017, + "grad_norm": 1.789311170578003, + "learning_rate": 3.95882684715172e-05, + "loss": 0.6944, + "step": 3693 + }, + { + "epoch": 2.083474337281444, + "grad_norm": 2.076942205429077, + "learning_rate": 3.9585448392554994e-05, + "loss": 0.7271, + "step": 3694 + }, + { + "epoch": 2.084038353073886, + "grad_norm": 1.0392398834228516, + "learning_rate": 3.9582628313592786e-05, + "loss": 0.6459, + "step": 3695 + }, + { + "epoch": 2.0846023688663284, + "grad_norm": 2.0714306831359863, + "learning_rate": 3.957980823463057e-05, + "loss": 0.8124, + "step": 3696 + }, + { + "epoch": 2.0851663846587702, + "grad_norm": 2.6943037509918213, + "learning_rate": 3.957698815566836e-05, + "loss": 0.8357, + "step": 3697 + }, + { + "epoch": 2.0857304004512125, + "grad_norm": 1.727220058441162, + "learning_rate": 3.957416807670615e-05, + "loss": 0.7881, + "step": 3698 + }, + { + "epoch": 2.0862944162436547, + "grad_norm": 1.338938593864441, + "learning_rate": 3.957134799774394e-05, + "loss": 0.758, + "step": 3699 + }, + { + "epoch": 2.086858432036097, + "grad_norm": 1.3195066452026367, + "learning_rate": 3.9568527918781726e-05, + "loss": 0.6232, + "step": 3700 + }, + { + "epoch": 2.0874224478285393, + "grad_norm": 1.9606873989105225, + "learning_rate": 3.956570783981952e-05, + "loss": 0.7925, + "step": 3701 + }, + { + "epoch": 2.0879864636209815, + "grad_norm": 2.01094388961792, + "learning_rate": 3.9562887760857304e-05, + "loss": 0.8795, + "step": 3702 + }, + { + "epoch": 2.0885504794134238, + "grad_norm": 1.5895894765853882, + "learning_rate": 3.9560067681895096e-05, + "loss": 0.7329, + "step": 3703 + }, + { + "epoch": 2.0891144952058656, + "grad_norm": 1.2303848266601562, + "learning_rate": 3.955724760293288e-05, + "loss": 0.7832, + "step": 3704 + }, + { + "epoch": 2.089678510998308, + "grad_norm": 3.2350966930389404, + "learning_rate": 3.9554427523970674e-05, + "loss": 0.9626, + "step": 3705 + }, + { + "epoch": 2.09024252679075, + "grad_norm": 1.5138168334960938, + "learning_rate": 3.955160744500846e-05, + "loss": 0.8845, + "step": 3706 + }, + { + "epoch": 2.0908065425831923, + "grad_norm": 1.781734824180603, + "learning_rate": 3.954878736604625e-05, + "loss": 0.9267, + "step": 3707 + }, + { + "epoch": 2.0913705583756346, + "grad_norm": 1.363195776939392, + "learning_rate": 3.9545967287084044e-05, + "loss": 0.7508, + "step": 3708 + }, + { + "epoch": 2.091934574168077, + "grad_norm": 1.886474370956421, + "learning_rate": 3.954314720812183e-05, + "loss": 0.7772, + "step": 3709 + }, + { + "epoch": 2.092498589960519, + "grad_norm": 1.41608464717865, + "learning_rate": 3.954032712915962e-05, + "loss": 0.6635, + "step": 3710 + }, + { + "epoch": 2.093062605752961, + "grad_norm": 3.357473850250244, + "learning_rate": 3.9537507050197407e-05, + "loss": 0.8077, + "step": 3711 + }, + { + "epoch": 2.093626621545403, + "grad_norm": 2.2264392375946045, + "learning_rate": 3.95346869712352e-05, + "loss": 0.7291, + "step": 3712 + }, + { + "epoch": 2.0941906373378454, + "grad_norm": 1.607460618019104, + "learning_rate": 3.9531866892272984e-05, + "loss": 0.9131, + "step": 3713 + }, + { + "epoch": 2.0947546531302876, + "grad_norm": 1.8213706016540527, + "learning_rate": 3.9529046813310776e-05, + "loss": 0.8037, + "step": 3714 + }, + { + "epoch": 2.09531866892273, + "grad_norm": 2.8017935752868652, + "learning_rate": 3.952622673434856e-05, + "loss": 0.8011, + "step": 3715 + }, + { + "epoch": 2.095882684715172, + "grad_norm": 2.207747459411621, + "learning_rate": 3.9523406655386354e-05, + "loss": 0.821, + "step": 3716 + }, + { + "epoch": 2.0964467005076144, + "grad_norm": 1.5043319463729858, + "learning_rate": 3.952058657642414e-05, + "loss": 0.9178, + "step": 3717 + }, + { + "epoch": 2.097010716300056, + "grad_norm": 2.896436929702759, + "learning_rate": 3.951776649746193e-05, + "loss": 0.8143, + "step": 3718 + }, + { + "epoch": 2.0975747320924985, + "grad_norm": 2.81775164604187, + "learning_rate": 3.9514946418499724e-05, + "loss": 0.9714, + "step": 3719 + }, + { + "epoch": 2.0981387478849407, + "grad_norm": 1.5979435443878174, + "learning_rate": 3.951212633953751e-05, + "loss": 0.7786, + "step": 3720 + }, + { + "epoch": 2.098702763677383, + "grad_norm": 3.9748659133911133, + "learning_rate": 3.9509306260575295e-05, + "loss": 0.8279, + "step": 3721 + }, + { + "epoch": 2.099266779469825, + "grad_norm": 1.3522056341171265, + "learning_rate": 3.950648618161309e-05, + "loss": 0.7289, + "step": 3722 + }, + { + "epoch": 2.0998307952622675, + "grad_norm": 2.049743413925171, + "learning_rate": 3.950366610265088e-05, + "loss": 0.8274, + "step": 3723 + }, + { + "epoch": 2.1003948110547097, + "grad_norm": 1.5781779289245605, + "learning_rate": 3.9500846023688664e-05, + "loss": 0.7285, + "step": 3724 + }, + { + "epoch": 2.1009588268471515, + "grad_norm": 2.6880533695220947, + "learning_rate": 3.949802594472645e-05, + "loss": 0.7723, + "step": 3725 + }, + { + "epoch": 2.1015228426395938, + "grad_norm": 1.6690917015075684, + "learning_rate": 3.949520586576425e-05, + "loss": 0.7436, + "step": 3726 + }, + { + "epoch": 2.102086858432036, + "grad_norm": 2.2862179279327393, + "learning_rate": 3.9492385786802034e-05, + "loss": 0.9728, + "step": 3727 + }, + { + "epoch": 2.1026508742244783, + "grad_norm": 1.144933819770813, + "learning_rate": 3.948956570783982e-05, + "loss": 0.7238, + "step": 3728 + }, + { + "epoch": 2.1032148900169205, + "grad_norm": 2.623936653137207, + "learning_rate": 3.948674562887761e-05, + "loss": 0.8588, + "step": 3729 + }, + { + "epoch": 2.1037789058093628, + "grad_norm": 1.759713888168335, + "learning_rate": 3.9483925549915404e-05, + "loss": 0.8072, + "step": 3730 + }, + { + "epoch": 2.104342921601805, + "grad_norm": 1.9964879751205444, + "learning_rate": 3.948110547095319e-05, + "loss": 0.8336, + "step": 3731 + }, + { + "epoch": 2.104906937394247, + "grad_norm": 1.787194848060608, + "learning_rate": 3.9478285391990975e-05, + "loss": 0.8846, + "step": 3732 + }, + { + "epoch": 2.105470953186689, + "grad_norm": 2.0829193592071533, + "learning_rate": 3.947546531302877e-05, + "loss": 0.7877, + "step": 3733 + }, + { + "epoch": 2.1060349689791313, + "grad_norm": 2.819120168685913, + "learning_rate": 3.947264523406656e-05, + "loss": 0.7758, + "step": 3734 + }, + { + "epoch": 2.1065989847715736, + "grad_norm": 1.587852120399475, + "learning_rate": 3.9469825155104344e-05, + "loss": 0.7223, + "step": 3735 + }, + { + "epoch": 2.107163000564016, + "grad_norm": 1.9994829893112183, + "learning_rate": 3.946700507614213e-05, + "loss": 0.8802, + "step": 3736 + }, + { + "epoch": 2.107727016356458, + "grad_norm": 1.836421251296997, + "learning_rate": 3.946418499717992e-05, + "loss": 0.8483, + "step": 3737 + }, + { + "epoch": 2.1082910321489003, + "grad_norm": 3.2108025550842285, + "learning_rate": 3.9461364918217714e-05, + "loss": 0.8085, + "step": 3738 + }, + { + "epoch": 2.108855047941342, + "grad_norm": 1.5157166719436646, + "learning_rate": 3.94585448392555e-05, + "loss": 0.6919, + "step": 3739 + }, + { + "epoch": 2.1094190637337844, + "grad_norm": 2.5077767372131348, + "learning_rate": 3.945572476029329e-05, + "loss": 0.8131, + "step": 3740 + }, + { + "epoch": 2.1099830795262267, + "grad_norm": 1.3501975536346436, + "learning_rate": 3.945290468133108e-05, + "loss": 0.7926, + "step": 3741 + }, + { + "epoch": 2.110547095318669, + "grad_norm": 1.886006236076355, + "learning_rate": 3.945008460236887e-05, + "loss": 0.8275, + "step": 3742 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 1.1816704273223877, + "learning_rate": 3.9447264523406655e-05, + "loss": 0.7802, + "step": 3743 + }, + { + "epoch": 2.1116751269035534, + "grad_norm": 1.935807228088379, + "learning_rate": 3.944444444444445e-05, + "loss": 0.7987, + "step": 3744 + }, + { + "epoch": 2.1122391426959957, + "grad_norm": 1.3921314477920532, + "learning_rate": 3.944162436548224e-05, + "loss": 0.8161, + "step": 3745 + }, + { + "epoch": 2.1128031584884375, + "grad_norm": 1.7858648300170898, + "learning_rate": 3.9438804286520025e-05, + "loss": 0.9193, + "step": 3746 + }, + { + "epoch": 2.1133671742808797, + "grad_norm": 1.2408922910690308, + "learning_rate": 3.943598420755782e-05, + "loss": 0.7296, + "step": 3747 + }, + { + "epoch": 2.113931190073322, + "grad_norm": 2.5140886306762695, + "learning_rate": 3.94331641285956e-05, + "loss": 0.805, + "step": 3748 + }, + { + "epoch": 2.1144952058657642, + "grad_norm": 4.387124538421631, + "learning_rate": 3.9430344049633394e-05, + "loss": 0.9081, + "step": 3749 + }, + { + "epoch": 2.1150592216582065, + "grad_norm": 2.456474781036377, + "learning_rate": 3.942752397067118e-05, + "loss": 0.7849, + "step": 3750 + }, + { + "epoch": 2.1156232374506487, + "grad_norm": 1.3625067472457886, + "learning_rate": 3.942470389170897e-05, + "loss": 0.8093, + "step": 3751 + }, + { + "epoch": 2.116187253243091, + "grad_norm": 1.1835359334945679, + "learning_rate": 3.942188381274676e-05, + "loss": 0.6522, + "step": 3752 + }, + { + "epoch": 2.116751269035533, + "grad_norm": 1.1850286722183228, + "learning_rate": 3.941906373378455e-05, + "loss": 0.7626, + "step": 3753 + }, + { + "epoch": 2.117315284827975, + "grad_norm": 1.661045789718628, + "learning_rate": 3.9416243654822335e-05, + "loss": 0.8156, + "step": 3754 + }, + { + "epoch": 2.1178793006204173, + "grad_norm": 3.6824772357940674, + "learning_rate": 3.941342357586013e-05, + "loss": 0.9539, + "step": 3755 + }, + { + "epoch": 2.1184433164128595, + "grad_norm": 1.1071062088012695, + "learning_rate": 3.941060349689791e-05, + "loss": 0.7111, + "step": 3756 + }, + { + "epoch": 2.119007332205302, + "grad_norm": 1.1340047121047974, + "learning_rate": 3.9407783417935705e-05, + "loss": 0.6752, + "step": 3757 + }, + { + "epoch": 2.119571347997744, + "grad_norm": 1.5054785013198853, + "learning_rate": 3.94049633389735e-05, + "loss": 0.8255, + "step": 3758 + }, + { + "epoch": 2.1201353637901863, + "grad_norm": 3.1668715476989746, + "learning_rate": 3.940214326001128e-05, + "loss": 0.844, + "step": 3759 + }, + { + "epoch": 2.120699379582628, + "grad_norm": 1.3224042654037476, + "learning_rate": 3.939932318104907e-05, + "loss": 0.8256, + "step": 3760 + }, + { + "epoch": 2.1212633953750704, + "grad_norm": 2.173356294631958, + "learning_rate": 3.939650310208686e-05, + "loss": 0.8531, + "step": 3761 + }, + { + "epoch": 2.1218274111675126, + "grad_norm": 1.4271645545959473, + "learning_rate": 3.939368302312465e-05, + "loss": 0.7259, + "step": 3762 + }, + { + "epoch": 2.122391426959955, + "grad_norm": 2.345630407333374, + "learning_rate": 3.939086294416244e-05, + "loss": 0.7986, + "step": 3763 + }, + { + "epoch": 2.122955442752397, + "grad_norm": 1.4260175228118896, + "learning_rate": 3.938804286520022e-05, + "loss": 0.797, + "step": 3764 + }, + { + "epoch": 2.1235194585448394, + "grad_norm": 1.2505234479904175, + "learning_rate": 3.938522278623802e-05, + "loss": 0.6114, + "step": 3765 + }, + { + "epoch": 2.1240834743372816, + "grad_norm": 2.607543468475342, + "learning_rate": 3.938240270727581e-05, + "loss": 0.7373, + "step": 3766 + }, + { + "epoch": 2.1246474901297234, + "grad_norm": 1.456709384918213, + "learning_rate": 3.937958262831359e-05, + "loss": 0.7617, + "step": 3767 + }, + { + "epoch": 2.1252115059221657, + "grad_norm": 1.488771677017212, + "learning_rate": 3.937676254935138e-05, + "loss": 0.7698, + "step": 3768 + }, + { + "epoch": 2.125775521714608, + "grad_norm": 2.4300761222839355, + "learning_rate": 3.937394247038918e-05, + "loss": 0.8052, + "step": 3769 + }, + { + "epoch": 2.12633953750705, + "grad_norm": 1.1652787923812866, + "learning_rate": 3.937112239142696e-05, + "loss": 0.7149, + "step": 3770 + }, + { + "epoch": 2.1269035532994924, + "grad_norm": 1.3924622535705566, + "learning_rate": 3.936830231246475e-05, + "loss": 0.8764, + "step": 3771 + }, + { + "epoch": 2.1274675690919347, + "grad_norm": 1.457457184791565, + "learning_rate": 3.936548223350254e-05, + "loss": 0.6738, + "step": 3772 + }, + { + "epoch": 2.128031584884377, + "grad_norm": 2.6080358028411865, + "learning_rate": 3.936266215454033e-05, + "loss": 0.8563, + "step": 3773 + }, + { + "epoch": 2.1285956006768187, + "grad_norm": 3.0277111530303955, + "learning_rate": 3.935984207557812e-05, + "loss": 0.8571, + "step": 3774 + }, + { + "epoch": 2.129159616469261, + "grad_norm": 1.212388515472412, + "learning_rate": 3.93570219966159e-05, + "loss": 0.6781, + "step": 3775 + }, + { + "epoch": 2.1297236322617032, + "grad_norm": 1.547115445137024, + "learning_rate": 3.9354201917653695e-05, + "loss": 0.7608, + "step": 3776 + }, + { + "epoch": 2.1302876480541455, + "grad_norm": 1.3172249794006348, + "learning_rate": 3.935138183869149e-05, + "loss": 0.6288, + "step": 3777 + }, + { + "epoch": 2.1308516638465878, + "grad_norm": 2.415748119354248, + "learning_rate": 3.934856175972927e-05, + "loss": 0.9112, + "step": 3778 + }, + { + "epoch": 2.13141567963903, + "grad_norm": 1.0663427114486694, + "learning_rate": 3.9345741680767065e-05, + "loss": 0.7172, + "step": 3779 + }, + { + "epoch": 2.1319796954314723, + "grad_norm": 1.4293807744979858, + "learning_rate": 3.934292160180485e-05, + "loss": 0.8516, + "step": 3780 + }, + { + "epoch": 2.132543711223914, + "grad_norm": 1.5343133211135864, + "learning_rate": 3.934010152284264e-05, + "loss": 0.7604, + "step": 3781 + }, + { + "epoch": 2.1331077270163563, + "grad_norm": 1.7249259948730469, + "learning_rate": 3.933728144388043e-05, + "loss": 0.701, + "step": 3782 + }, + { + "epoch": 2.1336717428087986, + "grad_norm": 1.5453412532806396, + "learning_rate": 3.933446136491822e-05, + "loss": 0.7906, + "step": 3783 + }, + { + "epoch": 2.134235758601241, + "grad_norm": 2.883197784423828, + "learning_rate": 3.933164128595601e-05, + "loss": 0.7696, + "step": 3784 + }, + { + "epoch": 2.134799774393683, + "grad_norm": 1.9328190088272095, + "learning_rate": 3.93288212069938e-05, + "loss": 0.8195, + "step": 3785 + }, + { + "epoch": 2.1353637901861253, + "grad_norm": 2.512768507003784, + "learning_rate": 3.932600112803158e-05, + "loss": 0.8602, + "step": 3786 + }, + { + "epoch": 2.1359278059785676, + "grad_norm": 1.7009369134902954, + "learning_rate": 3.9323181049069375e-05, + "loss": 0.851, + "step": 3787 + }, + { + "epoch": 2.1364918217710094, + "grad_norm": 1.0229841470718384, + "learning_rate": 3.932036097010717e-05, + "loss": 0.5874, + "step": 3788 + }, + { + "epoch": 2.1370558375634516, + "grad_norm": 1.5662009716033936, + "learning_rate": 3.931754089114495e-05, + "loss": 0.7763, + "step": 3789 + }, + { + "epoch": 2.137619853355894, + "grad_norm": 2.801104784011841, + "learning_rate": 3.9314720812182745e-05, + "loss": 0.899, + "step": 3790 + }, + { + "epoch": 2.138183869148336, + "grad_norm": 2.0286216735839844, + "learning_rate": 3.931190073322053e-05, + "loss": 0.7089, + "step": 3791 + }, + { + "epoch": 2.1387478849407784, + "grad_norm": 2.822303056716919, + "learning_rate": 3.930908065425832e-05, + "loss": 0.8208, + "step": 3792 + }, + { + "epoch": 2.1393119007332206, + "grad_norm": 1.1916093826293945, + "learning_rate": 3.930626057529611e-05, + "loss": 0.8142, + "step": 3793 + }, + { + "epoch": 2.139875916525663, + "grad_norm": 1.4547048807144165, + "learning_rate": 3.93034404963339e-05, + "loss": 0.6761, + "step": 3794 + }, + { + "epoch": 2.1404399323181047, + "grad_norm": 1.371898889541626, + "learning_rate": 3.9300620417371686e-05, + "loss": 0.7147, + "step": 3795 + }, + { + "epoch": 2.141003948110547, + "grad_norm": 1.7376782894134521, + "learning_rate": 3.929780033840948e-05, + "loss": 0.8114, + "step": 3796 + }, + { + "epoch": 2.141567963902989, + "grad_norm": 1.9860639572143555, + "learning_rate": 3.929498025944727e-05, + "loss": 0.7923, + "step": 3797 + }, + { + "epoch": 2.1421319796954315, + "grad_norm": 2.004047393798828, + "learning_rate": 3.9292160180485056e-05, + "loss": 0.8763, + "step": 3798 + }, + { + "epoch": 2.1426959954878737, + "grad_norm": 1.5499753952026367, + "learning_rate": 3.928934010152284e-05, + "loss": 0.7484, + "step": 3799 + }, + { + "epoch": 2.143260011280316, + "grad_norm": 1.5375735759735107, + "learning_rate": 3.928652002256063e-05, + "loss": 0.8866, + "step": 3800 + }, + { + "epoch": 2.143824027072758, + "grad_norm": 1.4973137378692627, + "learning_rate": 3.9283699943598425e-05, + "loss": 0.8053, + "step": 3801 + }, + { + "epoch": 2.1443880428652005, + "grad_norm": 2.6665329933166504, + "learning_rate": 3.928087986463621e-05, + "loss": 0.8937, + "step": 3802 + }, + { + "epoch": 2.1449520586576423, + "grad_norm": 2.3648622035980225, + "learning_rate": 3.9278059785673996e-05, + "loss": 0.7845, + "step": 3803 + }, + { + "epoch": 2.1455160744500845, + "grad_norm": 1.6913079023361206, + "learning_rate": 3.927523970671179e-05, + "loss": 0.7167, + "step": 3804 + }, + { + "epoch": 2.1460800902425268, + "grad_norm": 1.4515029191970825, + "learning_rate": 3.927241962774958e-05, + "loss": 0.7935, + "step": 3805 + }, + { + "epoch": 2.146644106034969, + "grad_norm": 4.829561710357666, + "learning_rate": 3.9269599548787366e-05, + "loss": 0.8684, + "step": 3806 + }, + { + "epoch": 2.1472081218274113, + "grad_norm": 2.2822983264923096, + "learning_rate": 3.926677946982515e-05, + "loss": 0.7693, + "step": 3807 + }, + { + "epoch": 2.1477721376198535, + "grad_norm": 1.2155094146728516, + "learning_rate": 3.926395939086295e-05, + "loss": 0.7676, + "step": 3808 + }, + { + "epoch": 2.148336153412296, + "grad_norm": 1.7874735593795776, + "learning_rate": 3.9261139311900736e-05, + "loss": 0.7863, + "step": 3809 + }, + { + "epoch": 2.1489001692047376, + "grad_norm": 1.0810208320617676, + "learning_rate": 3.925831923293852e-05, + "loss": 0.771, + "step": 3810 + }, + { + "epoch": 2.14946418499718, + "grad_norm": 1.2135868072509766, + "learning_rate": 3.925549915397631e-05, + "loss": 0.7761, + "step": 3811 + }, + { + "epoch": 2.150028200789622, + "grad_norm": 1.4493751525878906, + "learning_rate": 3.9252679075014106e-05, + "loss": 0.7321, + "step": 3812 + }, + { + "epoch": 2.1505922165820643, + "grad_norm": 1.956496238708496, + "learning_rate": 3.924985899605189e-05, + "loss": 0.8134, + "step": 3813 + }, + { + "epoch": 2.1511562323745066, + "grad_norm": 1.3853485584259033, + "learning_rate": 3.9247038917089676e-05, + "loss": 0.7749, + "step": 3814 + }, + { + "epoch": 2.151720248166949, + "grad_norm": 2.9400794506073, + "learning_rate": 3.924421883812747e-05, + "loss": 0.7472, + "step": 3815 + }, + { + "epoch": 2.152284263959391, + "grad_norm": 1.3331818580627441, + "learning_rate": 3.924139875916526e-05, + "loss": 0.7546, + "step": 3816 + }, + { + "epoch": 2.152848279751833, + "grad_norm": 2.5953333377838135, + "learning_rate": 3.9238578680203046e-05, + "loss": 0.9078, + "step": 3817 + }, + { + "epoch": 2.153412295544275, + "grad_norm": 3.0689759254455566, + "learning_rate": 3.923575860124084e-05, + "loss": 0.8387, + "step": 3818 + }, + { + "epoch": 2.1539763113367174, + "grad_norm": 1.4926649332046509, + "learning_rate": 3.923293852227863e-05, + "loss": 0.7485, + "step": 3819 + }, + { + "epoch": 2.1545403271291597, + "grad_norm": 1.3141194581985474, + "learning_rate": 3.9230118443316416e-05, + "loss": 0.6974, + "step": 3820 + }, + { + "epoch": 2.155104342921602, + "grad_norm": 1.4535726308822632, + "learning_rate": 3.92272983643542e-05, + "loss": 0.8137, + "step": 3821 + }, + { + "epoch": 2.155668358714044, + "grad_norm": 1.9947644472122192, + "learning_rate": 3.9224478285391993e-05, + "loss": 0.8493, + "step": 3822 + }, + { + "epoch": 2.1562323745064864, + "grad_norm": 1.1144062280654907, + "learning_rate": 3.9221658206429786e-05, + "loss": 0.6553, + "step": 3823 + }, + { + "epoch": 2.1567963902989282, + "grad_norm": 1.259919285774231, + "learning_rate": 3.921883812746757e-05, + "loss": 0.7506, + "step": 3824 + }, + { + "epoch": 2.1573604060913705, + "grad_norm": 1.6495811939239502, + "learning_rate": 3.9216018048505356e-05, + "loss": 0.84, + "step": 3825 + }, + { + "epoch": 2.1579244218838127, + "grad_norm": 1.1216963529586792, + "learning_rate": 3.921319796954315e-05, + "loss": 0.632, + "step": 3826 + }, + { + "epoch": 2.158488437676255, + "grad_norm": 1.970537781715393, + "learning_rate": 3.921037789058094e-05, + "loss": 0.8183, + "step": 3827 + }, + { + "epoch": 2.1590524534686972, + "grad_norm": 1.1913857460021973, + "learning_rate": 3.9207557811618726e-05, + "loss": 0.7505, + "step": 3828 + }, + { + "epoch": 2.1596164692611395, + "grad_norm": 3.4341812133789062, + "learning_rate": 3.920473773265652e-05, + "loss": 0.8124, + "step": 3829 + }, + { + "epoch": 2.1601804850535817, + "grad_norm": 1.2949501276016235, + "learning_rate": 3.9201917653694304e-05, + "loss": 0.7018, + "step": 3830 + }, + { + "epoch": 2.1607445008460235, + "grad_norm": 1.737329125404358, + "learning_rate": 3.9199097574732096e-05, + "loss": 0.7288, + "step": 3831 + }, + { + "epoch": 2.161308516638466, + "grad_norm": 4.236886024475098, + "learning_rate": 3.919627749576988e-05, + "loss": 1.0112, + "step": 3832 + }, + { + "epoch": 2.161872532430908, + "grad_norm": 1.6590732336044312, + "learning_rate": 3.9193457416807674e-05, + "loss": 0.788, + "step": 3833 + }, + { + "epoch": 2.1624365482233503, + "grad_norm": 1.194907307624817, + "learning_rate": 3.919063733784546e-05, + "loss": 0.7293, + "step": 3834 + }, + { + "epoch": 2.1630005640157925, + "grad_norm": 1.6447813510894775, + "learning_rate": 3.918781725888325e-05, + "loss": 0.6564, + "step": 3835 + }, + { + "epoch": 2.163564579808235, + "grad_norm": 2.3711726665496826, + "learning_rate": 3.9184997179921043e-05, + "loss": 0.8711, + "step": 3836 + }, + { + "epoch": 2.164128595600677, + "grad_norm": 1.3628106117248535, + "learning_rate": 3.918217710095883e-05, + "loss": 0.6555, + "step": 3837 + }, + { + "epoch": 2.164692611393119, + "grad_norm": 1.7733960151672363, + "learning_rate": 3.9179357021996614e-05, + "loss": 0.8765, + "step": 3838 + }, + { + "epoch": 2.165256627185561, + "grad_norm": 2.101377487182617, + "learning_rate": 3.9176536943034406e-05, + "loss": 0.8602, + "step": 3839 + }, + { + "epoch": 2.1658206429780034, + "grad_norm": 1.5266300439834595, + "learning_rate": 3.91737168640722e-05, + "loss": 0.783, + "step": 3840 + }, + { + "epoch": 2.1663846587704456, + "grad_norm": 1.6844371557235718, + "learning_rate": 3.9170896785109984e-05, + "loss": 0.7477, + "step": 3841 + }, + { + "epoch": 2.166948674562888, + "grad_norm": 2.0783066749572754, + "learning_rate": 3.916807670614777e-05, + "loss": 0.8414, + "step": 3842 + }, + { + "epoch": 2.16751269035533, + "grad_norm": 2.749215841293335, + "learning_rate": 3.916525662718556e-05, + "loss": 0.8454, + "step": 3843 + }, + { + "epoch": 2.1680767061477724, + "grad_norm": 3.7522566318511963, + "learning_rate": 3.9162436548223354e-05, + "loss": 0.8328, + "step": 3844 + }, + { + "epoch": 2.168640721940214, + "grad_norm": 1.5237418413162231, + "learning_rate": 3.915961646926114e-05, + "loss": 0.8487, + "step": 3845 + }, + { + "epoch": 2.1692047377326564, + "grad_norm": 1.291420340538025, + "learning_rate": 3.9156796390298925e-05, + "loss": 0.7178, + "step": 3846 + }, + { + "epoch": 2.1697687535250987, + "grad_norm": 1.2036504745483398, + "learning_rate": 3.9153976311336724e-05, + "loss": 0.7223, + "step": 3847 + }, + { + "epoch": 2.170332769317541, + "grad_norm": 2.0554006099700928, + "learning_rate": 3.915115623237451e-05, + "loss": 0.6825, + "step": 3848 + }, + { + "epoch": 2.170896785109983, + "grad_norm": 1.2853769063949585, + "learning_rate": 3.9148336153412294e-05, + "loss": 0.743, + "step": 3849 + }, + { + "epoch": 2.1714608009024254, + "grad_norm": 1.206496238708496, + "learning_rate": 3.9145516074450087e-05, + "loss": 0.774, + "step": 3850 + }, + { + "epoch": 2.1720248166948677, + "grad_norm": 1.9885810613632202, + "learning_rate": 3.914269599548788e-05, + "loss": 0.8736, + "step": 3851 + }, + { + "epoch": 2.1725888324873095, + "grad_norm": 1.2244726419448853, + "learning_rate": 3.9139875916525664e-05, + "loss": 0.7696, + "step": 3852 + }, + { + "epoch": 2.1731528482797517, + "grad_norm": 1.331743836402893, + "learning_rate": 3.913705583756345e-05, + "loss": 0.8236, + "step": 3853 + }, + { + "epoch": 2.173716864072194, + "grad_norm": 3.0819432735443115, + "learning_rate": 3.913423575860125e-05, + "loss": 0.8195, + "step": 3854 + }, + { + "epoch": 2.1742808798646363, + "grad_norm": 1.5252470970153809, + "learning_rate": 3.9131415679639034e-05, + "loss": 0.778, + "step": 3855 + }, + { + "epoch": 2.1748448956570785, + "grad_norm": 1.9476978778839111, + "learning_rate": 3.912859560067682e-05, + "loss": 0.7415, + "step": 3856 + }, + { + "epoch": 2.1754089114495208, + "grad_norm": 2.4645907878875732, + "learning_rate": 3.912577552171461e-05, + "loss": 0.8028, + "step": 3857 + }, + { + "epoch": 2.175972927241963, + "grad_norm": 1.7249761819839478, + "learning_rate": 3.9122955442752404e-05, + "loss": 0.8053, + "step": 3858 + }, + { + "epoch": 2.176536943034405, + "grad_norm": 1.4543395042419434, + "learning_rate": 3.912013536379019e-05, + "loss": 0.7634, + "step": 3859 + }, + { + "epoch": 2.177100958826847, + "grad_norm": 1.578486442565918, + "learning_rate": 3.9117315284827975e-05, + "loss": 0.7625, + "step": 3860 + }, + { + "epoch": 2.1776649746192893, + "grad_norm": 1.328908920288086, + "learning_rate": 3.911449520586577e-05, + "loss": 0.7625, + "step": 3861 + }, + { + "epoch": 2.1782289904117316, + "grad_norm": 2.875828266143799, + "learning_rate": 3.911167512690356e-05, + "loss": 0.8763, + "step": 3862 + }, + { + "epoch": 2.178793006204174, + "grad_norm": 1.9191197156906128, + "learning_rate": 3.9108855047941344e-05, + "loss": 0.7899, + "step": 3863 + }, + { + "epoch": 2.179357021996616, + "grad_norm": 1.4037634134292603, + "learning_rate": 3.910603496897913e-05, + "loss": 0.7765, + "step": 3864 + }, + { + "epoch": 2.1799210377890583, + "grad_norm": 1.7999975681304932, + "learning_rate": 3.910321489001692e-05, + "loss": 0.8079, + "step": 3865 + }, + { + "epoch": 2.1804850535815, + "grad_norm": 2.269479274749756, + "learning_rate": 3.9100394811054714e-05, + "loss": 0.7467, + "step": 3866 + }, + { + "epoch": 2.1810490693739424, + "grad_norm": 1.9073723554611206, + "learning_rate": 3.90975747320925e-05, + "loss": 0.8018, + "step": 3867 + }, + { + "epoch": 2.1816130851663846, + "grad_norm": 1.9666842222213745, + "learning_rate": 3.909475465313029e-05, + "loss": 0.7511, + "step": 3868 + }, + { + "epoch": 2.182177100958827, + "grad_norm": 1.237981915473938, + "learning_rate": 3.909193457416808e-05, + "loss": 0.7197, + "step": 3869 + }, + { + "epoch": 2.182741116751269, + "grad_norm": 2.0354647636413574, + "learning_rate": 3.908911449520587e-05, + "loss": 0.7194, + "step": 3870 + }, + { + "epoch": 2.1833051325437114, + "grad_norm": 1.828442931175232, + "learning_rate": 3.9086294416243655e-05, + "loss": 0.7219, + "step": 3871 + }, + { + "epoch": 2.1838691483361536, + "grad_norm": 2.3037467002868652, + "learning_rate": 3.908347433728145e-05, + "loss": 0.9236, + "step": 3872 + }, + { + "epoch": 2.1844331641285955, + "grad_norm": 2.4469587802886963, + "learning_rate": 3.908065425831923e-05, + "loss": 0.8704, + "step": 3873 + }, + { + "epoch": 2.1849971799210377, + "grad_norm": 2.2597665786743164, + "learning_rate": 3.9077834179357024e-05, + "loss": 0.8975, + "step": 3874 + }, + { + "epoch": 2.18556119571348, + "grad_norm": 1.9735116958618164, + "learning_rate": 3.907501410039482e-05, + "loss": 0.7439, + "step": 3875 + }, + { + "epoch": 2.186125211505922, + "grad_norm": 1.6007543802261353, + "learning_rate": 3.90721940214326e-05, + "loss": 0.799, + "step": 3876 + }, + { + "epoch": 2.1866892272983645, + "grad_norm": 2.414811611175537, + "learning_rate": 3.906937394247039e-05, + "loss": 0.9039, + "step": 3877 + }, + { + "epoch": 2.1872532430908067, + "grad_norm": 1.6950150728225708, + "learning_rate": 3.906655386350818e-05, + "loss": 0.8368, + "step": 3878 + }, + { + "epoch": 2.187817258883249, + "grad_norm": 2.929328203201294, + "learning_rate": 3.906373378454597e-05, + "loss": 0.879, + "step": 3879 + }, + { + "epoch": 2.1883812746756908, + "grad_norm": 1.4056769609451294, + "learning_rate": 3.906091370558376e-05, + "loss": 0.7766, + "step": 3880 + }, + { + "epoch": 2.188945290468133, + "grad_norm": 2.3624815940856934, + "learning_rate": 3.905809362662154e-05, + "loss": 0.8713, + "step": 3881 + }, + { + "epoch": 2.1895093062605753, + "grad_norm": 1.7956892251968384, + "learning_rate": 3.9055273547659335e-05, + "loss": 0.7928, + "step": 3882 + }, + { + "epoch": 2.1900733220530175, + "grad_norm": 1.442191243171692, + "learning_rate": 3.905245346869713e-05, + "loss": 0.8246, + "step": 3883 + }, + { + "epoch": 2.1906373378454598, + "grad_norm": 2.1373867988586426, + "learning_rate": 3.904963338973491e-05, + "loss": 0.8273, + "step": 3884 + }, + { + "epoch": 2.191201353637902, + "grad_norm": 1.6620570421218872, + "learning_rate": 3.90468133107727e-05, + "loss": 0.7178, + "step": 3885 + }, + { + "epoch": 2.1917653694303443, + "grad_norm": 1.2003977298736572, + "learning_rate": 3.90439932318105e-05, + "loss": 0.6526, + "step": 3886 + }, + { + "epoch": 2.192329385222786, + "grad_norm": 1.6689754724502563, + "learning_rate": 3.904117315284828e-05, + "loss": 0.7909, + "step": 3887 + }, + { + "epoch": 2.1928934010152283, + "grad_norm": 1.3337476253509521, + "learning_rate": 3.903835307388607e-05, + "loss": 0.7509, + "step": 3888 + }, + { + "epoch": 2.1934574168076706, + "grad_norm": 2.038641929626465, + "learning_rate": 3.903553299492386e-05, + "loss": 0.7261, + "step": 3889 + }, + { + "epoch": 2.194021432600113, + "grad_norm": 1.9399901628494263, + "learning_rate": 3.903271291596165e-05, + "loss": 0.8181, + "step": 3890 + }, + { + "epoch": 2.194585448392555, + "grad_norm": 1.7363814115524292, + "learning_rate": 3.902989283699944e-05, + "loss": 0.7866, + "step": 3891 + }, + { + "epoch": 2.1951494641849973, + "grad_norm": 1.5917153358459473, + "learning_rate": 3.902707275803722e-05, + "loss": 0.7343, + "step": 3892 + }, + { + "epoch": 2.1957134799774396, + "grad_norm": 1.1863781213760376, + "learning_rate": 3.902425267907502e-05, + "loss": 0.7057, + "step": 3893 + }, + { + "epoch": 2.1962774957698814, + "grad_norm": 1.751075029373169, + "learning_rate": 3.902143260011281e-05, + "loss": 0.7411, + "step": 3894 + }, + { + "epoch": 2.1968415115623237, + "grad_norm": 1.4898427724838257, + "learning_rate": 3.901861252115059e-05, + "loss": 0.7603, + "step": 3895 + }, + { + "epoch": 2.197405527354766, + "grad_norm": 2.3679697513580322, + "learning_rate": 3.9015792442188385e-05, + "loss": 0.763, + "step": 3896 + }, + { + "epoch": 2.197969543147208, + "grad_norm": 1.9499919414520264, + "learning_rate": 3.901297236322618e-05, + "loss": 0.6917, + "step": 3897 + }, + { + "epoch": 2.1985335589396504, + "grad_norm": 1.1835017204284668, + "learning_rate": 3.901015228426396e-05, + "loss": 0.7414, + "step": 3898 + }, + { + "epoch": 2.1990975747320927, + "grad_norm": 3.7407264709472656, + "learning_rate": 3.900733220530175e-05, + "loss": 0.8543, + "step": 3899 + }, + { + "epoch": 2.199661590524535, + "grad_norm": 1.8577462434768677, + "learning_rate": 3.900451212633954e-05, + "loss": 0.8454, + "step": 3900 + }, + { + "epoch": 2.2002256063169767, + "grad_norm": 1.8182154893875122, + "learning_rate": 3.900169204737733e-05, + "loss": 0.8082, + "step": 3901 + }, + { + "epoch": 2.200789622109419, + "grad_norm": 2.001178503036499, + "learning_rate": 3.899887196841512e-05, + "loss": 0.743, + "step": 3902 + }, + { + "epoch": 2.2013536379018612, + "grad_norm": 1.2951277494430542, + "learning_rate": 3.89960518894529e-05, + "loss": 0.7044, + "step": 3903 + }, + { + "epoch": 2.2019176536943035, + "grad_norm": 1.4737571477890015, + "learning_rate": 3.8993231810490695e-05, + "loss": 0.6831, + "step": 3904 + }, + { + "epoch": 2.2024816694867457, + "grad_norm": 2.629331350326538, + "learning_rate": 3.899041173152849e-05, + "loss": 0.8651, + "step": 3905 + }, + { + "epoch": 2.203045685279188, + "grad_norm": 1.6107895374298096, + "learning_rate": 3.898759165256627e-05, + "loss": 0.8318, + "step": 3906 + }, + { + "epoch": 2.2036097010716302, + "grad_norm": 1.8809313774108887, + "learning_rate": 3.8984771573604065e-05, + "loss": 0.7698, + "step": 3907 + }, + { + "epoch": 2.204173716864072, + "grad_norm": 1.1919597387313843, + "learning_rate": 3.898195149464185e-05, + "loss": 0.7347, + "step": 3908 + }, + { + "epoch": 2.2047377326565143, + "grad_norm": 2.0065979957580566, + "learning_rate": 3.897913141567964e-05, + "loss": 0.753, + "step": 3909 + }, + { + "epoch": 2.2053017484489565, + "grad_norm": 1.7257779836654663, + "learning_rate": 3.897631133671743e-05, + "loss": 0.7122, + "step": 3910 + }, + { + "epoch": 2.205865764241399, + "grad_norm": 1.261401891708374, + "learning_rate": 3.897349125775522e-05, + "loss": 0.7485, + "step": 3911 + }, + { + "epoch": 2.206429780033841, + "grad_norm": 2.9354114532470703, + "learning_rate": 3.8970671178793005e-05, + "loss": 0.9348, + "step": 3912 + }, + { + "epoch": 2.2069937958262833, + "grad_norm": 1.4206286668777466, + "learning_rate": 3.89678510998308e-05, + "loss": 0.763, + "step": 3913 + }, + { + "epoch": 2.2075578116187256, + "grad_norm": 2.698512315750122, + "learning_rate": 3.896503102086859e-05, + "loss": 0.9488, + "step": 3914 + }, + { + "epoch": 2.2081218274111674, + "grad_norm": 1.2424598932266235, + "learning_rate": 3.8962210941906375e-05, + "loss": 0.7956, + "step": 3915 + }, + { + "epoch": 2.2086858432036096, + "grad_norm": 1.6442453861236572, + "learning_rate": 3.895939086294416e-05, + "loss": 0.7758, + "step": 3916 + }, + { + "epoch": 2.209249858996052, + "grad_norm": 2.0305378437042236, + "learning_rate": 3.895657078398195e-05, + "loss": 0.8993, + "step": 3917 + }, + { + "epoch": 2.209813874788494, + "grad_norm": 1.6094634532928467, + "learning_rate": 3.8953750705019745e-05, + "loss": 0.7861, + "step": 3918 + }, + { + "epoch": 2.2103778905809364, + "grad_norm": 1.626115322113037, + "learning_rate": 3.895093062605753e-05, + "loss": 0.6342, + "step": 3919 + }, + { + "epoch": 2.2109419063733786, + "grad_norm": 1.1527103185653687, + "learning_rate": 3.8948110547095316e-05, + "loss": 0.7285, + "step": 3920 + }, + { + "epoch": 2.211505922165821, + "grad_norm": 3.8516452312469482, + "learning_rate": 3.894529046813311e-05, + "loss": 0.7067, + "step": 3921 + }, + { + "epoch": 2.2120699379582627, + "grad_norm": 1.747627854347229, + "learning_rate": 3.89424703891709e-05, + "loss": 0.8233, + "step": 3922 + }, + { + "epoch": 2.212633953750705, + "grad_norm": 1.967646837234497, + "learning_rate": 3.8939650310208686e-05, + "loss": 0.6651, + "step": 3923 + }, + { + "epoch": 2.213197969543147, + "grad_norm": 1.312267780303955, + "learning_rate": 3.893683023124648e-05, + "loss": 0.8542, + "step": 3924 + }, + { + "epoch": 2.2137619853355894, + "grad_norm": 2.1715810298919678, + "learning_rate": 3.893401015228427e-05, + "loss": 0.7836, + "step": 3925 + }, + { + "epoch": 2.2143260011280317, + "grad_norm": 2.075305223464966, + "learning_rate": 3.8931190073322055e-05, + "loss": 0.7251, + "step": 3926 + }, + { + "epoch": 2.214890016920474, + "grad_norm": 1.8825993537902832, + "learning_rate": 3.892836999435984e-05, + "loss": 0.8387, + "step": 3927 + }, + { + "epoch": 2.215454032712916, + "grad_norm": 2.589550018310547, + "learning_rate": 3.892554991539763e-05, + "loss": 0.8258, + "step": 3928 + }, + { + "epoch": 2.216018048505358, + "grad_norm": 3.1851418018341064, + "learning_rate": 3.8922729836435425e-05, + "loss": 0.822, + "step": 3929 + }, + { + "epoch": 2.2165820642978002, + "grad_norm": 4.177192211151123, + "learning_rate": 3.891990975747321e-05, + "loss": 0.9374, + "step": 3930 + }, + { + "epoch": 2.2171460800902425, + "grad_norm": 1.452883243560791, + "learning_rate": 3.8917089678510996e-05, + "loss": 0.7255, + "step": 3931 + }, + { + "epoch": 2.2177100958826848, + "grad_norm": 2.266556978225708, + "learning_rate": 3.8914269599548795e-05, + "loss": 0.7684, + "step": 3932 + }, + { + "epoch": 2.218274111675127, + "grad_norm": 1.8090804815292358, + "learning_rate": 3.891144952058658e-05, + "loss": 0.8608, + "step": 3933 + }, + { + "epoch": 2.2188381274675693, + "grad_norm": 1.45242440700531, + "learning_rate": 3.8908629441624366e-05, + "loss": 0.7696, + "step": 3934 + }, + { + "epoch": 2.2194021432600115, + "grad_norm": 2.8245761394500732, + "learning_rate": 3.890580936266215e-05, + "loss": 0.7627, + "step": 3935 + }, + { + "epoch": 2.2199661590524533, + "grad_norm": 1.0946996212005615, + "learning_rate": 3.890298928369995e-05, + "loss": 0.7417, + "step": 3936 + }, + { + "epoch": 2.2205301748448956, + "grad_norm": 1.2821999788284302, + "learning_rate": 3.8900169204737736e-05, + "loss": 0.6606, + "step": 3937 + }, + { + "epoch": 2.221094190637338, + "grad_norm": 1.2787022590637207, + "learning_rate": 3.889734912577552e-05, + "loss": 0.8441, + "step": 3938 + }, + { + "epoch": 2.22165820642978, + "grad_norm": 1.599760890007019, + "learning_rate": 3.889452904681331e-05, + "loss": 0.8209, + "step": 3939 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.4416366815567017, + "learning_rate": 3.8891708967851105e-05, + "loss": 0.7374, + "step": 3940 + }, + { + "epoch": 2.2227862380146646, + "grad_norm": 2.1926040649414062, + "learning_rate": 3.888888888888889e-05, + "loss": 0.7941, + "step": 3941 + }, + { + "epoch": 2.223350253807107, + "grad_norm": 2.116206407546997, + "learning_rate": 3.8886068809926676e-05, + "loss": 0.7377, + "step": 3942 + }, + { + "epoch": 2.2239142695995486, + "grad_norm": 2.2638978958129883, + "learning_rate": 3.888324873096447e-05, + "loss": 0.8135, + "step": 3943 + }, + { + "epoch": 2.224478285391991, + "grad_norm": 1.5115762948989868, + "learning_rate": 3.888042865200226e-05, + "loss": 0.6692, + "step": 3944 + }, + { + "epoch": 2.225042301184433, + "grad_norm": 1.310941219329834, + "learning_rate": 3.8877608573040046e-05, + "loss": 0.7348, + "step": 3945 + }, + { + "epoch": 2.2256063169768754, + "grad_norm": 2.171990156173706, + "learning_rate": 3.887478849407784e-05, + "loss": 0.8512, + "step": 3946 + }, + { + "epoch": 2.2261703327693176, + "grad_norm": 1.0878716707229614, + "learning_rate": 3.8871968415115624e-05, + "loss": 0.598, + "step": 3947 + }, + { + "epoch": 2.22673434856176, + "grad_norm": 1.6563215255737305, + "learning_rate": 3.8869148336153416e-05, + "loss": 0.7475, + "step": 3948 + }, + { + "epoch": 2.227298364354202, + "grad_norm": 2.8093504905700684, + "learning_rate": 3.88663282571912e-05, + "loss": 0.7462, + "step": 3949 + }, + { + "epoch": 2.227862380146644, + "grad_norm": 1.2333428859710693, + "learning_rate": 3.886350817822899e-05, + "loss": 0.7663, + "step": 3950 + }, + { + "epoch": 2.228426395939086, + "grad_norm": 1.7772718667984009, + "learning_rate": 3.886068809926678e-05, + "loss": 0.8346, + "step": 3951 + }, + { + "epoch": 2.2289904117315285, + "grad_norm": 2.809354543685913, + "learning_rate": 3.885786802030457e-05, + "loss": 0.9518, + "step": 3952 + }, + { + "epoch": 2.2295544275239707, + "grad_norm": 1.9352201223373413, + "learning_rate": 3.8855047941342356e-05, + "loss": 0.7948, + "step": 3953 + }, + { + "epoch": 2.230118443316413, + "grad_norm": 1.9418679475784302, + "learning_rate": 3.885222786238015e-05, + "loss": 0.811, + "step": 3954 + }, + { + "epoch": 2.230682459108855, + "grad_norm": 2.0799858570098877, + "learning_rate": 3.8849407783417934e-05, + "loss": 0.8047, + "step": 3955 + }, + { + "epoch": 2.2312464749012975, + "grad_norm": 1.8081786632537842, + "learning_rate": 3.8846587704455726e-05, + "loss": 0.8261, + "step": 3956 + }, + { + "epoch": 2.2318104906937393, + "grad_norm": 2.071054458618164, + "learning_rate": 3.884376762549352e-05, + "loss": 0.8239, + "step": 3957 + }, + { + "epoch": 2.2323745064861815, + "grad_norm": 2.9074201583862305, + "learning_rate": 3.8840947546531304e-05, + "loss": 0.8257, + "step": 3958 + }, + { + "epoch": 2.2329385222786238, + "grad_norm": 1.5436829328536987, + "learning_rate": 3.8838127467569096e-05, + "loss": 0.7086, + "step": 3959 + }, + { + "epoch": 2.233502538071066, + "grad_norm": 1.8283177614212036, + "learning_rate": 3.883530738860688e-05, + "loss": 0.7496, + "step": 3960 + }, + { + "epoch": 2.2340665538635083, + "grad_norm": 1.5381239652633667, + "learning_rate": 3.8832487309644673e-05, + "loss": 0.8688, + "step": 3961 + }, + { + "epoch": 2.2346305696559505, + "grad_norm": 1.3500629663467407, + "learning_rate": 3.882966723068246e-05, + "loss": 0.7413, + "step": 3962 + }, + { + "epoch": 2.235194585448393, + "grad_norm": 1.1634490489959717, + "learning_rate": 3.882684715172025e-05, + "loss": 0.7622, + "step": 3963 + }, + { + "epoch": 2.2357586012408346, + "grad_norm": 1.8598711490631104, + "learning_rate": 3.882402707275804e-05, + "loss": 0.8878, + "step": 3964 + }, + { + "epoch": 2.236322617033277, + "grad_norm": 1.2377870082855225, + "learning_rate": 3.882120699379583e-05, + "loss": 0.7498, + "step": 3965 + }, + { + "epoch": 2.236886632825719, + "grad_norm": 1.559192180633545, + "learning_rate": 3.8818386914833614e-05, + "loss": 0.8011, + "step": 3966 + }, + { + "epoch": 2.2374506486181613, + "grad_norm": 2.1747446060180664, + "learning_rate": 3.8815566835871406e-05, + "loss": 0.7199, + "step": 3967 + }, + { + "epoch": 2.2380146644106036, + "grad_norm": 1.7702934741973877, + "learning_rate": 3.88127467569092e-05, + "loss": 0.9333, + "step": 3968 + }, + { + "epoch": 2.238578680203046, + "grad_norm": 1.1748850345611572, + "learning_rate": 3.8809926677946984e-05, + "loss": 0.7096, + "step": 3969 + }, + { + "epoch": 2.239142695995488, + "grad_norm": 1.305999755859375, + "learning_rate": 3.880710659898477e-05, + "loss": 0.8274, + "step": 3970 + }, + { + "epoch": 2.23970671178793, + "grad_norm": 1.3693749904632568, + "learning_rate": 3.880428652002256e-05, + "loss": 0.6916, + "step": 3971 + }, + { + "epoch": 2.240270727580372, + "grad_norm": 2.3061375617980957, + "learning_rate": 3.8801466441060354e-05, + "loss": 0.8348, + "step": 3972 + }, + { + "epoch": 2.2408347433728144, + "grad_norm": 2.0589189529418945, + "learning_rate": 3.879864636209814e-05, + "loss": 0.9103, + "step": 3973 + }, + { + "epoch": 2.2413987591652567, + "grad_norm": 1.5592139959335327, + "learning_rate": 3.8795826283135924e-05, + "loss": 0.7376, + "step": 3974 + }, + { + "epoch": 2.241962774957699, + "grad_norm": 4.326584815979004, + "learning_rate": 3.879300620417372e-05, + "loss": 0.9351, + "step": 3975 + }, + { + "epoch": 2.242526790750141, + "grad_norm": 1.966194987297058, + "learning_rate": 3.879018612521151e-05, + "loss": 0.7281, + "step": 3976 + }, + { + "epoch": 2.2430908065425834, + "grad_norm": 2.7689177989959717, + "learning_rate": 3.8787366046249294e-05, + "loss": 0.8243, + "step": 3977 + }, + { + "epoch": 2.2436548223350252, + "grad_norm": 2.0784854888916016, + "learning_rate": 3.8784545967287086e-05, + "loss": 0.8725, + "step": 3978 + }, + { + "epoch": 2.2442188381274675, + "grad_norm": 1.1708837747573853, + "learning_rate": 3.878172588832488e-05, + "loss": 0.714, + "step": 3979 + }, + { + "epoch": 2.2447828539199097, + "grad_norm": 1.3237435817718506, + "learning_rate": 3.8778905809362664e-05, + "loss": 0.7951, + "step": 3980 + }, + { + "epoch": 2.245346869712352, + "grad_norm": 1.4241880178451538, + "learning_rate": 3.877608573040045e-05, + "loss": 0.7052, + "step": 3981 + }, + { + "epoch": 2.2459108855047942, + "grad_norm": 1.5657984018325806, + "learning_rate": 3.877326565143824e-05, + "loss": 0.8403, + "step": 3982 + }, + { + "epoch": 2.2464749012972365, + "grad_norm": 1.4317257404327393, + "learning_rate": 3.8770445572476034e-05, + "loss": 0.8084, + "step": 3983 + }, + { + "epoch": 2.2470389170896787, + "grad_norm": 1.5749549865722656, + "learning_rate": 3.876762549351382e-05, + "loss": 0.7689, + "step": 3984 + }, + { + "epoch": 2.2476029328821205, + "grad_norm": 1.3214375972747803, + "learning_rate": 3.876480541455161e-05, + "loss": 0.6819, + "step": 3985 + }, + { + "epoch": 2.248166948674563, + "grad_norm": 1.5291162729263306, + "learning_rate": 3.87619853355894e-05, + "loss": 0.7771, + "step": 3986 + }, + { + "epoch": 2.248730964467005, + "grad_norm": 2.677705764770508, + "learning_rate": 3.875916525662719e-05, + "loss": 0.715, + "step": 3987 + }, + { + "epoch": 2.2492949802594473, + "grad_norm": 2.0494203567504883, + "learning_rate": 3.8756345177664974e-05, + "loss": 0.8026, + "step": 3988 + }, + { + "epoch": 2.2498589960518895, + "grad_norm": 1.6427336931228638, + "learning_rate": 3.8753525098702767e-05, + "loss": 0.7431, + "step": 3989 + }, + { + "epoch": 2.250423011844332, + "grad_norm": 2.9450483322143555, + "learning_rate": 3.875070501974055e-05, + "loss": 0.8627, + "step": 3990 + }, + { + "epoch": 2.250987027636774, + "grad_norm": 1.2564533948898315, + "learning_rate": 3.8747884940778344e-05, + "loss": 0.6276, + "step": 3991 + }, + { + "epoch": 2.251551043429216, + "grad_norm": 3.5762948989868164, + "learning_rate": 3.874506486181613e-05, + "loss": 0.9318, + "step": 3992 + }, + { + "epoch": 2.252115059221658, + "grad_norm": 1.1487168073654175, + "learning_rate": 3.874224478285392e-05, + "loss": 0.7946, + "step": 3993 + }, + { + "epoch": 2.2526790750141004, + "grad_norm": 1.8571832180023193, + "learning_rate": 3.8739424703891714e-05, + "loss": 0.7283, + "step": 3994 + }, + { + "epoch": 2.2532430908065426, + "grad_norm": 1.8468379974365234, + "learning_rate": 3.87366046249295e-05, + "loss": 0.6716, + "step": 3995 + }, + { + "epoch": 2.253807106598985, + "grad_norm": 1.6698120832443237, + "learning_rate": 3.873378454596729e-05, + "loss": 0.8731, + "step": 3996 + }, + { + "epoch": 2.254371122391427, + "grad_norm": 1.1647769212722778, + "learning_rate": 3.873096446700508e-05, + "loss": 0.7283, + "step": 3997 + }, + { + "epoch": 2.2549351381838694, + "grad_norm": 1.3889483213424683, + "learning_rate": 3.872814438804287e-05, + "loss": 0.7734, + "step": 3998 + }, + { + "epoch": 2.255499153976311, + "grad_norm": 2.285991907119751, + "learning_rate": 3.8725324309080655e-05, + "loss": 0.9399, + "step": 3999 + }, + { + "epoch": 2.2560631697687534, + "grad_norm": 2.175468921661377, + "learning_rate": 3.872250423011845e-05, + "loss": 0.8137, + "step": 4000 + }, + { + "epoch": 2.2566271855611957, + "grad_norm": 1.3109468221664429, + "learning_rate": 3.871968415115623e-05, + "loss": 0.6982, + "step": 4001 + }, + { + "epoch": 2.257191201353638, + "grad_norm": 7.803416728973389, + "learning_rate": 3.8716864072194024e-05, + "loss": 0.866, + "step": 4002 + }, + { + "epoch": 2.25775521714608, + "grad_norm": 1.8103234767913818, + "learning_rate": 3.8714043993231816e-05, + "loss": 0.7711, + "step": 4003 + }, + { + "epoch": 2.2583192329385224, + "grad_norm": 2.6887733936309814, + "learning_rate": 3.87112239142696e-05, + "loss": 0.7806, + "step": 4004 + }, + { + "epoch": 2.2588832487309647, + "grad_norm": 1.9710361957550049, + "learning_rate": 3.870840383530739e-05, + "loss": 0.7916, + "step": 4005 + }, + { + "epoch": 2.2594472645234065, + "grad_norm": 1.8416461944580078, + "learning_rate": 3.870558375634518e-05, + "loss": 0.8196, + "step": 4006 + }, + { + "epoch": 2.2600112803158487, + "grad_norm": 1.5923237800598145, + "learning_rate": 3.870276367738297e-05, + "loss": 0.876, + "step": 4007 + }, + { + "epoch": 2.260575296108291, + "grad_norm": 1.2487369775772095, + "learning_rate": 3.869994359842076e-05, + "loss": 0.7288, + "step": 4008 + }, + { + "epoch": 2.2611393119007333, + "grad_norm": 1.1744362115859985, + "learning_rate": 3.869712351945854e-05, + "loss": 0.7851, + "step": 4009 + }, + { + "epoch": 2.2617033276931755, + "grad_norm": 1.8034676313400269, + "learning_rate": 3.8694303440496335e-05, + "loss": 0.8957, + "step": 4010 + }, + { + "epoch": 2.2622673434856178, + "grad_norm": 1.6834394931793213, + "learning_rate": 3.869148336153413e-05, + "loss": 0.6536, + "step": 4011 + }, + { + "epoch": 2.26283135927806, + "grad_norm": 1.2887769937515259, + "learning_rate": 3.868866328257191e-05, + "loss": 0.7721, + "step": 4012 + }, + { + "epoch": 2.263395375070502, + "grad_norm": 3.0353314876556396, + "learning_rate": 3.86858432036097e-05, + "loss": 0.8895, + "step": 4013 + }, + { + "epoch": 2.263959390862944, + "grad_norm": 1.7687782049179077, + "learning_rate": 3.86830231246475e-05, + "loss": 0.7776, + "step": 4014 + }, + { + "epoch": 2.2645234066553863, + "grad_norm": 1.1144036054611206, + "learning_rate": 3.868020304568528e-05, + "loss": 0.6682, + "step": 4015 + }, + { + "epoch": 2.2650874224478286, + "grad_norm": 3.4064598083496094, + "learning_rate": 3.867738296672307e-05, + "loss": 0.8523, + "step": 4016 + }, + { + "epoch": 2.265651438240271, + "grad_norm": 1.964539885520935, + "learning_rate": 3.867456288776086e-05, + "loss": 0.6741, + "step": 4017 + }, + { + "epoch": 2.266215454032713, + "grad_norm": 2.3273866176605225, + "learning_rate": 3.867174280879865e-05, + "loss": 0.8415, + "step": 4018 + }, + { + "epoch": 2.2667794698251553, + "grad_norm": 1.6989202499389648, + "learning_rate": 3.866892272983644e-05, + "loss": 0.8656, + "step": 4019 + }, + { + "epoch": 2.267343485617597, + "grad_norm": 1.8661891222000122, + "learning_rate": 3.866610265087422e-05, + "loss": 0.7267, + "step": 4020 + }, + { + "epoch": 2.2679075014100394, + "grad_norm": 97.00116729736328, + "learning_rate": 3.8663282571912015e-05, + "loss": 0.6958, + "step": 4021 + }, + { + "epoch": 2.2684715172024816, + "grad_norm": 2.4183387756347656, + "learning_rate": 3.866046249294981e-05, + "loss": 0.765, + "step": 4022 + }, + { + "epoch": 2.269035532994924, + "grad_norm": 1.4862492084503174, + "learning_rate": 3.865764241398759e-05, + "loss": 0.6879, + "step": 4023 + }, + { + "epoch": 2.269599548787366, + "grad_norm": 1.672105312347412, + "learning_rate": 3.8654822335025385e-05, + "loss": 0.8496, + "step": 4024 + }, + { + "epoch": 2.2701635645798084, + "grad_norm": 6.780030727386475, + "learning_rate": 3.865200225606317e-05, + "loss": 0.7858, + "step": 4025 + }, + { + "epoch": 2.2707275803722506, + "grad_norm": 1.2245993614196777, + "learning_rate": 3.864918217710096e-05, + "loss": 0.7283, + "step": 4026 + }, + { + "epoch": 2.2712915961646925, + "grad_norm": 1.9637348651885986, + "learning_rate": 3.864636209813875e-05, + "loss": 0.761, + "step": 4027 + }, + { + "epoch": 2.2718556119571347, + "grad_norm": 1.0572561025619507, + "learning_rate": 3.864354201917654e-05, + "loss": 0.7215, + "step": 4028 + }, + { + "epoch": 2.272419627749577, + "grad_norm": 1.590684413909912, + "learning_rate": 3.864072194021433e-05, + "loss": 0.7518, + "step": 4029 + }, + { + "epoch": 2.272983643542019, + "grad_norm": 1.182720422744751, + "learning_rate": 3.863790186125212e-05, + "loss": 0.5736, + "step": 4030 + }, + { + "epoch": 2.2735476593344615, + "grad_norm": 1.4429899454116821, + "learning_rate": 3.86350817822899e-05, + "loss": 0.7675, + "step": 4031 + }, + { + "epoch": 2.2741116751269037, + "grad_norm": 1.461295247077942, + "learning_rate": 3.8632261703327695e-05, + "loss": 0.7977, + "step": 4032 + }, + { + "epoch": 2.274675690919346, + "grad_norm": 1.3879796266555786, + "learning_rate": 3.862944162436549e-05, + "loss": 0.719, + "step": 4033 + }, + { + "epoch": 2.2752397067117878, + "grad_norm": 1.7434064149856567, + "learning_rate": 3.862662154540327e-05, + "loss": 0.8413, + "step": 4034 + }, + { + "epoch": 2.27580372250423, + "grad_norm": 1.269261121749878, + "learning_rate": 3.8623801466441065e-05, + "loss": 0.6977, + "step": 4035 + }, + { + "epoch": 2.2763677382966723, + "grad_norm": 1.6035057306289673, + "learning_rate": 3.862098138747885e-05, + "loss": 0.7779, + "step": 4036 + }, + { + "epoch": 2.2769317540891145, + "grad_norm": 1.463683009147644, + "learning_rate": 3.861816130851664e-05, + "loss": 0.6709, + "step": 4037 + }, + { + "epoch": 2.2774957698815568, + "grad_norm": 1.3990254402160645, + "learning_rate": 3.861534122955443e-05, + "loss": 0.7674, + "step": 4038 + }, + { + "epoch": 2.278059785673999, + "grad_norm": 1.2680842876434326, + "learning_rate": 3.861252115059222e-05, + "loss": 0.6991, + "step": 4039 + }, + { + "epoch": 2.2786238014664413, + "grad_norm": 1.3670426607131958, + "learning_rate": 3.8609701071630005e-05, + "loss": 0.7244, + "step": 4040 + }, + { + "epoch": 2.279187817258883, + "grad_norm": 2.2467591762542725, + "learning_rate": 3.86068809926678e-05, + "loss": 0.7809, + "step": 4041 + }, + { + "epoch": 2.2797518330513253, + "grad_norm": 2.8808088302612305, + "learning_rate": 3.860406091370559e-05, + "loss": 0.8811, + "step": 4042 + }, + { + "epoch": 2.2803158488437676, + "grad_norm": 1.0861347913742065, + "learning_rate": 3.8601240834743375e-05, + "loss": 0.7717, + "step": 4043 + }, + { + "epoch": 2.28087986463621, + "grad_norm": 1.7372784614562988, + "learning_rate": 3.859842075578116e-05, + "loss": 0.793, + "step": 4044 + }, + { + "epoch": 2.281443880428652, + "grad_norm": 1.910177230834961, + "learning_rate": 3.859560067681895e-05, + "loss": 0.7432, + "step": 4045 + }, + { + "epoch": 2.2820078962210943, + "grad_norm": 1.8332091569900513, + "learning_rate": 3.8592780597856745e-05, + "loss": 0.7968, + "step": 4046 + }, + { + "epoch": 2.2825719120135366, + "grad_norm": 2.0879335403442383, + "learning_rate": 3.858996051889453e-05, + "loss": 0.7514, + "step": 4047 + }, + { + "epoch": 2.2831359278059784, + "grad_norm": 1.2339175939559937, + "learning_rate": 3.8587140439932316e-05, + "loss": 0.6979, + "step": 4048 + }, + { + "epoch": 2.2836999435984207, + "grad_norm": 1.0898432731628418, + "learning_rate": 3.858432036097011e-05, + "loss": 0.7171, + "step": 4049 + }, + { + "epoch": 2.284263959390863, + "grad_norm": 1.0442339181900024, + "learning_rate": 3.85815002820079e-05, + "loss": 0.7092, + "step": 4050 + }, + { + "epoch": 2.284827975183305, + "grad_norm": 1.2061305046081543, + "learning_rate": 3.8578680203045685e-05, + "loss": 0.696, + "step": 4051 + }, + { + "epoch": 2.2853919909757474, + "grad_norm": 2.506788730621338, + "learning_rate": 3.857586012408347e-05, + "loss": 1.007, + "step": 4052 + }, + { + "epoch": 2.2859560067681897, + "grad_norm": 1.2963688373565674, + "learning_rate": 3.857304004512127e-05, + "loss": 0.8097, + "step": 4053 + }, + { + "epoch": 2.286520022560632, + "grad_norm": 1.2979875802993774, + "learning_rate": 3.8570219966159055e-05, + "loss": 0.6589, + "step": 4054 + }, + { + "epoch": 2.2870840383530737, + "grad_norm": 1.0396299362182617, + "learning_rate": 3.856739988719684e-05, + "loss": 0.7317, + "step": 4055 + }, + { + "epoch": 2.287648054145516, + "grad_norm": 1.5705839395523071, + "learning_rate": 3.856457980823463e-05, + "loss": 0.7201, + "step": 4056 + }, + { + "epoch": 2.2882120699379582, + "grad_norm": 1.2809966802597046, + "learning_rate": 3.8561759729272425e-05, + "loss": 0.7392, + "step": 4057 + }, + { + "epoch": 2.2887760857304005, + "grad_norm": 1.7689188718795776, + "learning_rate": 3.855893965031021e-05, + "loss": 0.765, + "step": 4058 + }, + { + "epoch": 2.2893401015228427, + "grad_norm": 1.2323263883590698, + "learning_rate": 3.8556119571347996e-05, + "loss": 0.7811, + "step": 4059 + }, + { + "epoch": 2.289904117315285, + "grad_norm": 2.0383846759796143, + "learning_rate": 3.855329949238579e-05, + "loss": 0.8156, + "step": 4060 + }, + { + "epoch": 2.2904681331077272, + "grad_norm": 3.012679100036621, + "learning_rate": 3.855047941342358e-05, + "loss": 0.8534, + "step": 4061 + }, + { + "epoch": 2.291032148900169, + "grad_norm": 1.8505666255950928, + "learning_rate": 3.8547659334461366e-05, + "loss": 0.8425, + "step": 4062 + }, + { + "epoch": 2.2915961646926113, + "grad_norm": 1.4292774200439453, + "learning_rate": 3.854483925549915e-05, + "loss": 0.8251, + "step": 4063 + }, + { + "epoch": 2.2921601804850535, + "grad_norm": 2.0700747966766357, + "learning_rate": 3.854201917653695e-05, + "loss": 0.8202, + "step": 4064 + }, + { + "epoch": 2.292724196277496, + "grad_norm": 1.0661382675170898, + "learning_rate": 3.8539199097574735e-05, + "loss": 0.7418, + "step": 4065 + }, + { + "epoch": 2.293288212069938, + "grad_norm": 2.427427291870117, + "learning_rate": 3.853637901861252e-05, + "loss": 0.9016, + "step": 4066 + }, + { + "epoch": 2.2938522278623803, + "grad_norm": 1.740465760231018, + "learning_rate": 3.853355893965031e-05, + "loss": 0.6973, + "step": 4067 + }, + { + "epoch": 2.2944162436548226, + "grad_norm": 1.4973444938659668, + "learning_rate": 3.8530738860688105e-05, + "loss": 0.7982, + "step": 4068 + }, + { + "epoch": 2.2949802594472644, + "grad_norm": 1.500680923461914, + "learning_rate": 3.852791878172589e-05, + "loss": 0.7863, + "step": 4069 + }, + { + "epoch": 2.2955442752397066, + "grad_norm": 2.354074239730835, + "learning_rate": 3.8525098702763676e-05, + "loss": 0.7869, + "step": 4070 + }, + { + "epoch": 2.296108291032149, + "grad_norm": 0.9945712685585022, + "learning_rate": 3.852227862380147e-05, + "loss": 0.6441, + "step": 4071 + }, + { + "epoch": 2.296672306824591, + "grad_norm": 1.6093013286590576, + "learning_rate": 3.851945854483926e-05, + "loss": 0.7288, + "step": 4072 + }, + { + "epoch": 2.2972363226170334, + "grad_norm": 1.7214329242706299, + "learning_rate": 3.8516638465877046e-05, + "loss": 0.8612, + "step": 4073 + }, + { + "epoch": 2.2978003384094756, + "grad_norm": 2.297863721847534, + "learning_rate": 3.851381838691484e-05, + "loss": 0.8996, + "step": 4074 + }, + { + "epoch": 2.298364354201918, + "grad_norm": 2.875098943710327, + "learning_rate": 3.851099830795262e-05, + "loss": 0.805, + "step": 4075 + }, + { + "epoch": 2.2989283699943597, + "grad_norm": 1.288629412651062, + "learning_rate": 3.8508178228990416e-05, + "loss": 0.7084, + "step": 4076 + }, + { + "epoch": 2.299492385786802, + "grad_norm": 1.3625049591064453, + "learning_rate": 3.85053581500282e-05, + "loss": 0.8126, + "step": 4077 + }, + { + "epoch": 2.300056401579244, + "grad_norm": 1.2405056953430176, + "learning_rate": 3.850253807106599e-05, + "loss": 0.7689, + "step": 4078 + }, + { + "epoch": 2.3006204173716864, + "grad_norm": 1.2180018424987793, + "learning_rate": 3.849971799210378e-05, + "loss": 0.75, + "step": 4079 + }, + { + "epoch": 2.3011844331641287, + "grad_norm": 3.086463451385498, + "learning_rate": 3.849689791314157e-05, + "loss": 0.7887, + "step": 4080 + }, + { + "epoch": 2.301748448956571, + "grad_norm": 1.5996423959732056, + "learning_rate": 3.8494077834179356e-05, + "loss": 0.7708, + "step": 4081 + }, + { + "epoch": 2.302312464749013, + "grad_norm": 1.9750428199768066, + "learning_rate": 3.849125775521715e-05, + "loss": 0.9056, + "step": 4082 + }, + { + "epoch": 2.302876480541455, + "grad_norm": 3.4349772930145264, + "learning_rate": 3.8488437676254934e-05, + "loss": 0.8122, + "step": 4083 + }, + { + "epoch": 2.3034404963338972, + "grad_norm": 1.4146740436553955, + "learning_rate": 3.8485617597292726e-05, + "loss": 0.7506, + "step": 4084 + }, + { + "epoch": 2.3040045121263395, + "grad_norm": 1.47848641872406, + "learning_rate": 3.848279751833052e-05, + "loss": 0.8165, + "step": 4085 + }, + { + "epoch": 2.3045685279187818, + "grad_norm": 3.8510754108428955, + "learning_rate": 3.8479977439368304e-05, + "loss": 0.8706, + "step": 4086 + }, + { + "epoch": 2.305132543711224, + "grad_norm": 1.2915242910385132, + "learning_rate": 3.847715736040609e-05, + "loss": 0.7697, + "step": 4087 + }, + { + "epoch": 2.3056965595036663, + "grad_norm": 1.408416986465454, + "learning_rate": 3.847433728144388e-05, + "loss": 0.8775, + "step": 4088 + }, + { + "epoch": 2.3062605752961085, + "grad_norm": 1.2127480506896973, + "learning_rate": 3.847151720248167e-05, + "loss": 0.7748, + "step": 4089 + }, + { + "epoch": 2.3068245910885503, + "grad_norm": 1.2611298561096191, + "learning_rate": 3.846869712351946e-05, + "loss": 0.8027, + "step": 4090 + }, + { + "epoch": 2.3073886068809926, + "grad_norm": 2.4741721153259277, + "learning_rate": 3.8465877044557244e-05, + "loss": 0.9523, + "step": 4091 + }, + { + "epoch": 2.307952622673435, + "grad_norm": 1.0165144205093384, + "learning_rate": 3.846305696559504e-05, + "loss": 0.6544, + "step": 4092 + }, + { + "epoch": 2.308516638465877, + "grad_norm": 1.2496134042739868, + "learning_rate": 3.846023688663283e-05, + "loss": 0.8118, + "step": 4093 + }, + { + "epoch": 2.3090806542583193, + "grad_norm": 1.4085454940795898, + "learning_rate": 3.8457416807670614e-05, + "loss": 0.8154, + "step": 4094 + }, + { + "epoch": 2.3096446700507616, + "grad_norm": 1.9944932460784912, + "learning_rate": 3.8454596728708406e-05, + "loss": 0.8448, + "step": 4095 + }, + { + "epoch": 2.310208685843204, + "grad_norm": 1.4704509973526, + "learning_rate": 3.84517766497462e-05, + "loss": 0.6837, + "step": 4096 + }, + { + "epoch": 2.3107727016356456, + "grad_norm": 1.073840856552124, + "learning_rate": 3.8448956570783984e-05, + "loss": 0.6703, + "step": 4097 + }, + { + "epoch": 2.311336717428088, + "grad_norm": 1.2038096189498901, + "learning_rate": 3.844613649182177e-05, + "loss": 0.6869, + "step": 4098 + }, + { + "epoch": 2.31190073322053, + "grad_norm": 1.9109232425689697, + "learning_rate": 3.844331641285956e-05, + "loss": 0.779, + "step": 4099 + }, + { + "epoch": 2.3124647490129724, + "grad_norm": 1.1473872661590576, + "learning_rate": 3.8440496333897353e-05, + "loss": 0.7872, + "step": 4100 + }, + { + "epoch": 2.3130287648054146, + "grad_norm": 1.3170857429504395, + "learning_rate": 3.843767625493514e-05, + "loss": 0.8085, + "step": 4101 + }, + { + "epoch": 2.313592780597857, + "grad_norm": 2.1604912281036377, + "learning_rate": 3.8434856175972924e-05, + "loss": 0.9108, + "step": 4102 + }, + { + "epoch": 2.314156796390299, + "grad_norm": 1.3577048778533936, + "learning_rate": 3.843203609701072e-05, + "loss": 0.7342, + "step": 4103 + }, + { + "epoch": 2.314720812182741, + "grad_norm": 2.238844871520996, + "learning_rate": 3.842921601804851e-05, + "loss": 0.9143, + "step": 4104 + }, + { + "epoch": 2.315284827975183, + "grad_norm": 1.4964293241500854, + "learning_rate": 3.8426395939086294e-05, + "loss": 0.6813, + "step": 4105 + }, + { + "epoch": 2.3158488437676255, + "grad_norm": 1.1678621768951416, + "learning_rate": 3.8423575860124086e-05, + "loss": 0.8526, + "step": 4106 + }, + { + "epoch": 2.3164128595600677, + "grad_norm": 3.718189239501953, + "learning_rate": 3.842075578116188e-05, + "loss": 0.8595, + "step": 4107 + }, + { + "epoch": 2.31697687535251, + "grad_norm": 1.4390537738800049, + "learning_rate": 3.8417935702199664e-05, + "loss": 0.8122, + "step": 4108 + }, + { + "epoch": 2.317540891144952, + "grad_norm": 0.9023476243019104, + "learning_rate": 3.841511562323745e-05, + "loss": 0.5828, + "step": 4109 + }, + { + "epoch": 2.3181049069373945, + "grad_norm": 1.7557306289672852, + "learning_rate": 3.841229554427524e-05, + "loss": 0.8802, + "step": 4110 + }, + { + "epoch": 2.3186689227298363, + "grad_norm": 1.1420199871063232, + "learning_rate": 3.8409475465313034e-05, + "loss": 0.6947, + "step": 4111 + }, + { + "epoch": 2.3192329385222785, + "grad_norm": 1.459568977355957, + "learning_rate": 3.840665538635082e-05, + "loss": 0.7464, + "step": 4112 + }, + { + "epoch": 2.3197969543147208, + "grad_norm": 1.732977032661438, + "learning_rate": 3.840383530738861e-05, + "loss": 0.7629, + "step": 4113 + }, + { + "epoch": 2.320360970107163, + "grad_norm": 1.8033342361450195, + "learning_rate": 3.8401015228426397e-05, + "loss": 0.6653, + "step": 4114 + }, + { + "epoch": 2.3209249858996053, + "grad_norm": 3.0726630687713623, + "learning_rate": 3.839819514946419e-05, + "loss": 1.0974, + "step": 4115 + }, + { + "epoch": 2.3214890016920475, + "grad_norm": 1.40825355052948, + "learning_rate": 3.8395375070501974e-05, + "loss": 0.6607, + "step": 4116 + }, + { + "epoch": 2.32205301748449, + "grad_norm": 1.1263622045516968, + "learning_rate": 3.8392554991539766e-05, + "loss": 0.6433, + "step": 4117 + }, + { + "epoch": 2.3226170332769316, + "grad_norm": 3.985067129135132, + "learning_rate": 3.838973491257755e-05, + "loss": 0.9312, + "step": 4118 + }, + { + "epoch": 2.323181049069374, + "grad_norm": 1.6657482385635376, + "learning_rate": 3.8386914833615344e-05, + "loss": 0.8183, + "step": 4119 + }, + { + "epoch": 2.323745064861816, + "grad_norm": 2.3010001182556152, + "learning_rate": 3.838409475465313e-05, + "loss": 0.6958, + "step": 4120 + }, + { + "epoch": 2.3243090806542583, + "grad_norm": 1.4391157627105713, + "learning_rate": 3.838127467569092e-05, + "loss": 0.7341, + "step": 4121 + }, + { + "epoch": 2.3248730964467006, + "grad_norm": 5.25543212890625, + "learning_rate": 3.837845459672871e-05, + "loss": 0.8925, + "step": 4122 + }, + { + "epoch": 2.325437112239143, + "grad_norm": 1.9972602128982544, + "learning_rate": 3.83756345177665e-05, + "loss": 0.6585, + "step": 4123 + }, + { + "epoch": 2.326001128031585, + "grad_norm": 3.939441442489624, + "learning_rate": 3.837281443880429e-05, + "loss": 0.8757, + "step": 4124 + }, + { + "epoch": 2.326565143824027, + "grad_norm": 1.0773470401763916, + "learning_rate": 3.836999435984208e-05, + "loss": 0.7508, + "step": 4125 + }, + { + "epoch": 2.327129159616469, + "grad_norm": 1.2893574237823486, + "learning_rate": 3.836717428087986e-05, + "loss": 0.7966, + "step": 4126 + }, + { + "epoch": 2.3276931754089114, + "grad_norm": 1.2598549127578735, + "learning_rate": 3.8364354201917654e-05, + "loss": 0.7478, + "step": 4127 + }, + { + "epoch": 2.3282571912013537, + "grad_norm": 2.246861457824707, + "learning_rate": 3.8361534122955447e-05, + "loss": 0.7993, + "step": 4128 + }, + { + "epoch": 2.328821206993796, + "grad_norm": 1.4739301204681396, + "learning_rate": 3.835871404399323e-05, + "loss": 0.7437, + "step": 4129 + }, + { + "epoch": 2.329385222786238, + "grad_norm": 3.518036365509033, + "learning_rate": 3.835589396503102e-05, + "loss": 0.9073, + "step": 4130 + }, + { + "epoch": 2.3299492385786804, + "grad_norm": 2.313232183456421, + "learning_rate": 3.8353073886068816e-05, + "loss": 0.8714, + "step": 4131 + }, + { + "epoch": 2.3305132543711222, + "grad_norm": 3.3954555988311768, + "learning_rate": 3.83502538071066e-05, + "loss": 0.9141, + "step": 4132 + }, + { + "epoch": 2.3310772701635645, + "grad_norm": 1.1255989074707031, + "learning_rate": 3.834743372814439e-05, + "loss": 0.7341, + "step": 4133 + }, + { + "epoch": 2.3316412859560067, + "grad_norm": 2.7222557067871094, + "learning_rate": 3.834461364918218e-05, + "loss": 0.8256, + "step": 4134 + }, + { + "epoch": 2.332205301748449, + "grad_norm": 1.251917839050293, + "learning_rate": 3.834179357021997e-05, + "loss": 0.8125, + "step": 4135 + }, + { + "epoch": 2.3327693175408912, + "grad_norm": 1.4561132192611694, + "learning_rate": 3.833897349125776e-05, + "loss": 0.7879, + "step": 4136 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.3953408002853394, + "learning_rate": 3.833615341229554e-05, + "loss": 0.8332, + "step": 4137 + }, + { + "epoch": 2.3338973491257757, + "grad_norm": 1.6294844150543213, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.8501, + "step": 4138 + }, + { + "epoch": 2.3344613649182175, + "grad_norm": 3.0402910709381104, + "learning_rate": 3.833051325437113e-05, + "loss": 0.948, + "step": 4139 + }, + { + "epoch": 2.33502538071066, + "grad_norm": 1.019917607307434, + "learning_rate": 3.832769317540891e-05, + "loss": 0.6973, + "step": 4140 + }, + { + "epoch": 2.335589396503102, + "grad_norm": 5.086674213409424, + "learning_rate": 3.83248730964467e-05, + "loss": 0.9054, + "step": 4141 + }, + { + "epoch": 2.3361534122955443, + "grad_norm": 3.8788936138153076, + "learning_rate": 3.8322053017484496e-05, + "loss": 0.7708, + "step": 4142 + }, + { + "epoch": 2.3367174280879865, + "grad_norm": 1.397735595703125, + "learning_rate": 3.831923293852228e-05, + "loss": 0.7307, + "step": 4143 + }, + { + "epoch": 2.337281443880429, + "grad_norm": 1.6134892702102661, + "learning_rate": 3.831641285956007e-05, + "loss": 0.7891, + "step": 4144 + }, + { + "epoch": 2.337845459672871, + "grad_norm": 2.4349260330200195, + "learning_rate": 3.831359278059786e-05, + "loss": 0.8906, + "step": 4145 + }, + { + "epoch": 2.338409475465313, + "grad_norm": 3.1441636085510254, + "learning_rate": 3.831077270163565e-05, + "loss": 0.7375, + "step": 4146 + }, + { + "epoch": 2.338973491257755, + "grad_norm": 1.602626085281372, + "learning_rate": 3.830795262267344e-05, + "loss": 0.8948, + "step": 4147 + }, + { + "epoch": 2.3395375070501974, + "grad_norm": 1.781261682510376, + "learning_rate": 3.830513254371122e-05, + "loss": 0.8276, + "step": 4148 + }, + { + "epoch": 2.3401015228426396, + "grad_norm": 2.0734806060791016, + "learning_rate": 3.8302312464749015e-05, + "loss": 0.799, + "step": 4149 + }, + { + "epoch": 2.340665538635082, + "grad_norm": 1.6941120624542236, + "learning_rate": 3.829949238578681e-05, + "loss": 0.7889, + "step": 4150 + }, + { + "epoch": 2.341229554427524, + "grad_norm": 2.116110324859619, + "learning_rate": 3.829667230682459e-05, + "loss": 0.9095, + "step": 4151 + }, + { + "epoch": 2.3417935702199664, + "grad_norm": 1.8148555755615234, + "learning_rate": 3.8293852227862384e-05, + "loss": 0.6964, + "step": 4152 + }, + { + "epoch": 2.342357586012408, + "grad_norm": 1.6677796840667725, + "learning_rate": 3.829103214890017e-05, + "loss": 0.8391, + "step": 4153 + }, + { + "epoch": 2.3429216018048504, + "grad_norm": 1.7410295009613037, + "learning_rate": 3.828821206993796e-05, + "loss": 0.7955, + "step": 4154 + }, + { + "epoch": 2.3434856175972927, + "grad_norm": 1.061815619468689, + "learning_rate": 3.828539199097575e-05, + "loss": 0.7833, + "step": 4155 + }, + { + "epoch": 2.344049633389735, + "grad_norm": 1.9817299842834473, + "learning_rate": 3.828257191201354e-05, + "loss": 0.7482, + "step": 4156 + }, + { + "epoch": 2.344613649182177, + "grad_norm": 1.5047581195831299, + "learning_rate": 3.8279751833051325e-05, + "loss": 0.83, + "step": 4157 + }, + { + "epoch": 2.3451776649746194, + "grad_norm": 1.6163618564605713, + "learning_rate": 3.827693175408912e-05, + "loss": 0.797, + "step": 4158 + }, + { + "epoch": 2.3457416807670617, + "grad_norm": 1.2829991579055786, + "learning_rate": 3.82741116751269e-05, + "loss": 0.8761, + "step": 4159 + }, + { + "epoch": 2.3463056965595035, + "grad_norm": 2.9383223056793213, + "learning_rate": 3.8271291596164695e-05, + "loss": 0.9767, + "step": 4160 + }, + { + "epoch": 2.3468697123519457, + "grad_norm": 1.9276928901672363, + "learning_rate": 3.826847151720248e-05, + "loss": 0.8712, + "step": 4161 + }, + { + "epoch": 2.347433728144388, + "grad_norm": 2.2604684829711914, + "learning_rate": 3.826565143824027e-05, + "loss": 0.8582, + "step": 4162 + }, + { + "epoch": 2.3479977439368303, + "grad_norm": 1.6405280828475952, + "learning_rate": 3.8262831359278065e-05, + "loss": 0.7449, + "step": 4163 + }, + { + "epoch": 2.3485617597292725, + "grad_norm": 1.326155185699463, + "learning_rate": 3.826001128031585e-05, + "loss": 0.6898, + "step": 4164 + }, + { + "epoch": 2.3491257755217148, + "grad_norm": 1.1384623050689697, + "learning_rate": 3.8257191201353635e-05, + "loss": 0.7118, + "step": 4165 + }, + { + "epoch": 2.349689791314157, + "grad_norm": 4.338139057159424, + "learning_rate": 3.825437112239143e-05, + "loss": 0.8844, + "step": 4166 + }, + { + "epoch": 2.350253807106599, + "grad_norm": 1.5199404954910278, + "learning_rate": 3.825155104342922e-05, + "loss": 0.8092, + "step": 4167 + }, + { + "epoch": 2.350817822899041, + "grad_norm": 1.6016108989715576, + "learning_rate": 3.8248730964467005e-05, + "loss": 0.7108, + "step": 4168 + }, + { + "epoch": 2.3513818386914833, + "grad_norm": 0.9818236827850342, + "learning_rate": 3.82459108855048e-05, + "loss": 0.7094, + "step": 4169 + }, + { + "epoch": 2.3519458544839256, + "grad_norm": 1.7476426362991333, + "learning_rate": 3.824309080654259e-05, + "loss": 0.8148, + "step": 4170 + }, + { + "epoch": 2.352509870276368, + "grad_norm": 1.2412185668945312, + "learning_rate": 3.8240270727580375e-05, + "loss": 0.842, + "step": 4171 + }, + { + "epoch": 2.35307388606881, + "grad_norm": 2.13908314704895, + "learning_rate": 3.823745064861816e-05, + "loss": 0.7435, + "step": 4172 + }, + { + "epoch": 2.3536379018612523, + "grad_norm": 2.1219170093536377, + "learning_rate": 3.823463056965595e-05, + "loss": 0.6964, + "step": 4173 + }, + { + "epoch": 2.354201917653694, + "grad_norm": 1.1084129810333252, + "learning_rate": 3.8231810490693745e-05, + "loss": 0.7606, + "step": 4174 + }, + { + "epoch": 2.3547659334461364, + "grad_norm": 2.417541980743408, + "learning_rate": 3.822899041173153e-05, + "loss": 0.973, + "step": 4175 + }, + { + "epoch": 2.3553299492385786, + "grad_norm": 1.259131908416748, + "learning_rate": 3.8226170332769316e-05, + "loss": 0.7585, + "step": 4176 + }, + { + "epoch": 2.355893965031021, + "grad_norm": 1.5034900903701782, + "learning_rate": 3.822335025380711e-05, + "loss": 0.7681, + "step": 4177 + }, + { + "epoch": 2.356457980823463, + "grad_norm": 1.7102066278457642, + "learning_rate": 3.82205301748449e-05, + "loss": 0.8324, + "step": 4178 + }, + { + "epoch": 2.3570219966159054, + "grad_norm": 2.276075601577759, + "learning_rate": 3.8217710095882685e-05, + "loss": 0.8582, + "step": 4179 + }, + { + "epoch": 2.3575860124083476, + "grad_norm": 1.545075535774231, + "learning_rate": 3.821489001692047e-05, + "loss": 0.8588, + "step": 4180 + }, + { + "epoch": 2.3581500282007894, + "grad_norm": 1.0648555755615234, + "learning_rate": 3.821206993795827e-05, + "loss": 0.6571, + "step": 4181 + }, + { + "epoch": 2.3587140439932317, + "grad_norm": 1.5050455331802368, + "learning_rate": 3.8209249858996055e-05, + "loss": 0.8321, + "step": 4182 + }, + { + "epoch": 2.359278059785674, + "grad_norm": 2.2596232891082764, + "learning_rate": 3.820642978003384e-05, + "loss": 0.8708, + "step": 4183 + }, + { + "epoch": 2.359842075578116, + "grad_norm": 1.895553708076477, + "learning_rate": 3.820360970107163e-05, + "loss": 0.7987, + "step": 4184 + }, + { + "epoch": 2.3604060913705585, + "grad_norm": 1.4007360935211182, + "learning_rate": 3.8200789622109425e-05, + "loss": 0.8802, + "step": 4185 + }, + { + "epoch": 2.3609701071630007, + "grad_norm": 1.2608084678649902, + "learning_rate": 3.819796954314721e-05, + "loss": 0.7649, + "step": 4186 + }, + { + "epoch": 2.361534122955443, + "grad_norm": 2.628876209259033, + "learning_rate": 3.8195149464184996e-05, + "loss": 0.8314, + "step": 4187 + }, + { + "epoch": 2.3620981387478848, + "grad_norm": 1.3481630086898804, + "learning_rate": 3.819232938522279e-05, + "loss": 0.8346, + "step": 4188 + }, + { + "epoch": 2.362662154540327, + "grad_norm": 5.57774543762207, + "learning_rate": 3.818950930626058e-05, + "loss": 0.716, + "step": 4189 + }, + { + "epoch": 2.3632261703327693, + "grad_norm": 2.0147526264190674, + "learning_rate": 3.8186689227298365e-05, + "loss": 0.8101, + "step": 4190 + }, + { + "epoch": 2.3637901861252115, + "grad_norm": 1.210970163345337, + "learning_rate": 3.818386914833616e-05, + "loss": 0.7959, + "step": 4191 + }, + { + "epoch": 2.3643542019176538, + "grad_norm": 2.2386436462402344, + "learning_rate": 3.818104906937394e-05, + "loss": 0.8436, + "step": 4192 + }, + { + "epoch": 2.364918217710096, + "grad_norm": 3.294814109802246, + "learning_rate": 3.8178228990411735e-05, + "loss": 0.7738, + "step": 4193 + }, + { + "epoch": 2.3654822335025383, + "grad_norm": 1.4291930198669434, + "learning_rate": 3.817540891144952e-05, + "loss": 0.7924, + "step": 4194 + }, + { + "epoch": 2.36604624929498, + "grad_norm": 1.3618242740631104, + "learning_rate": 3.817258883248731e-05, + "loss": 0.7723, + "step": 4195 + }, + { + "epoch": 2.3666102650874223, + "grad_norm": 3.054961681365967, + "learning_rate": 3.81697687535251e-05, + "loss": 0.8123, + "step": 4196 + }, + { + "epoch": 2.3671742808798646, + "grad_norm": 1.7782599925994873, + "learning_rate": 3.816694867456289e-05, + "loss": 0.7866, + "step": 4197 + }, + { + "epoch": 2.367738296672307, + "grad_norm": 1.2564347982406616, + "learning_rate": 3.8164128595600676e-05, + "loss": 0.7124, + "step": 4198 + }, + { + "epoch": 2.368302312464749, + "grad_norm": 1.9454777240753174, + "learning_rate": 3.816130851663847e-05, + "loss": 0.8782, + "step": 4199 + }, + { + "epoch": 2.3688663282571913, + "grad_norm": 1.5796470642089844, + "learning_rate": 3.8158488437676253e-05, + "loss": 0.7447, + "step": 4200 + }, + { + "epoch": 2.3694303440496336, + "grad_norm": 2.066194772720337, + "learning_rate": 3.8155668358714046e-05, + "loss": 0.7879, + "step": 4201 + }, + { + "epoch": 2.3699943598420754, + "grad_norm": 1.0767444372177124, + "learning_rate": 3.815284827975184e-05, + "loss": 0.591, + "step": 4202 + }, + { + "epoch": 2.3705583756345177, + "grad_norm": 1.3831429481506348, + "learning_rate": 3.815002820078962e-05, + "loss": 0.768, + "step": 4203 + }, + { + "epoch": 2.37112239142696, + "grad_norm": 1.1683646440505981, + "learning_rate": 3.8147208121827415e-05, + "loss": 0.6998, + "step": 4204 + }, + { + "epoch": 2.371686407219402, + "grad_norm": 1.9862871170043945, + "learning_rate": 3.81443880428652e-05, + "loss": 0.9123, + "step": 4205 + }, + { + "epoch": 2.3722504230118444, + "grad_norm": 1.1301944255828857, + "learning_rate": 3.814156796390299e-05, + "loss": 0.6867, + "step": 4206 + }, + { + "epoch": 2.3728144388042867, + "grad_norm": 1.5486834049224854, + "learning_rate": 3.813874788494078e-05, + "loss": 0.682, + "step": 4207 + }, + { + "epoch": 2.373378454596729, + "grad_norm": 2.1375415325164795, + "learning_rate": 3.813592780597857e-05, + "loss": 0.7199, + "step": 4208 + }, + { + "epoch": 2.3739424703891707, + "grad_norm": 1.5337603092193604, + "learning_rate": 3.813310772701636e-05, + "loss": 0.7848, + "step": 4209 + }, + { + "epoch": 2.374506486181613, + "grad_norm": 1.871781349182129, + "learning_rate": 3.813028764805415e-05, + "loss": 0.7669, + "step": 4210 + }, + { + "epoch": 2.3750705019740552, + "grad_norm": 1.785889744758606, + "learning_rate": 3.8127467569091934e-05, + "loss": 0.7517, + "step": 4211 + }, + { + "epoch": 2.3756345177664975, + "grad_norm": 2.8396048545837402, + "learning_rate": 3.8124647490129726e-05, + "loss": 0.8506, + "step": 4212 + }, + { + "epoch": 2.3761985335589397, + "grad_norm": 10.391283988952637, + "learning_rate": 3.812182741116752e-05, + "loss": 0.8237, + "step": 4213 + }, + { + "epoch": 2.376762549351382, + "grad_norm": 1.2172489166259766, + "learning_rate": 3.81190073322053e-05, + "loss": 0.7928, + "step": 4214 + }, + { + "epoch": 2.3773265651438242, + "grad_norm": 1.0911732912063599, + "learning_rate": 3.811618725324309e-05, + "loss": 0.7476, + "step": 4215 + }, + { + "epoch": 2.377890580936266, + "grad_norm": 2.500042200088501, + "learning_rate": 3.811336717428088e-05, + "loss": 0.8434, + "step": 4216 + }, + { + "epoch": 2.3784545967287083, + "grad_norm": 1.5197068452835083, + "learning_rate": 3.811054709531867e-05, + "loss": 0.7713, + "step": 4217 + }, + { + "epoch": 2.3790186125211505, + "grad_norm": 2.3369812965393066, + "learning_rate": 3.810772701635646e-05, + "loss": 0.7395, + "step": 4218 + }, + { + "epoch": 2.379582628313593, + "grad_norm": 1.790091633796692, + "learning_rate": 3.8104906937394244e-05, + "loss": 0.8137, + "step": 4219 + }, + { + "epoch": 2.380146644106035, + "grad_norm": 1.1291688680648804, + "learning_rate": 3.810208685843204e-05, + "loss": 0.6972, + "step": 4220 + }, + { + "epoch": 2.3807106598984773, + "grad_norm": 1.474265217781067, + "learning_rate": 3.809926677946983e-05, + "loss": 0.8354, + "step": 4221 + }, + { + "epoch": 2.3812746756909196, + "grad_norm": 1.6544934511184692, + "learning_rate": 3.8096446700507614e-05, + "loss": 0.8653, + "step": 4222 + }, + { + "epoch": 2.3818386914833614, + "grad_norm": 1.8783557415008545, + "learning_rate": 3.8093626621545406e-05, + "loss": 0.9179, + "step": 4223 + }, + { + "epoch": 2.3824027072758036, + "grad_norm": 1.812854290008545, + "learning_rate": 3.80908065425832e-05, + "loss": 0.7987, + "step": 4224 + }, + { + "epoch": 2.382966723068246, + "grad_norm": 1.6641794443130493, + "learning_rate": 3.8087986463620984e-05, + "loss": 0.7862, + "step": 4225 + }, + { + "epoch": 2.383530738860688, + "grad_norm": 1.8724396228790283, + "learning_rate": 3.808516638465877e-05, + "loss": 0.7508, + "step": 4226 + }, + { + "epoch": 2.3840947546531304, + "grad_norm": 1.482783317565918, + "learning_rate": 3.808234630569656e-05, + "loss": 0.7665, + "step": 4227 + }, + { + "epoch": 2.3846587704455726, + "grad_norm": 1.4722788333892822, + "learning_rate": 3.807952622673435e-05, + "loss": 0.7253, + "step": 4228 + }, + { + "epoch": 2.385222786238015, + "grad_norm": 2.3277273178100586, + "learning_rate": 3.807670614777214e-05, + "loss": 0.8714, + "step": 4229 + }, + { + "epoch": 2.3857868020304567, + "grad_norm": 1.6994754076004028, + "learning_rate": 3.8073886068809924e-05, + "loss": 0.818, + "step": 4230 + }, + { + "epoch": 2.386350817822899, + "grad_norm": 1.570056676864624, + "learning_rate": 3.8071065989847716e-05, + "loss": 0.7357, + "step": 4231 + }, + { + "epoch": 2.386914833615341, + "grad_norm": 1.9455288648605347, + "learning_rate": 3.806824591088551e-05, + "loss": 0.7449, + "step": 4232 + }, + { + "epoch": 2.3874788494077834, + "grad_norm": 1.3871351480484009, + "learning_rate": 3.8065425831923294e-05, + "loss": 0.8372, + "step": 4233 + }, + { + "epoch": 2.3880428652002257, + "grad_norm": 1.13671875, + "learning_rate": 3.8062605752961086e-05, + "loss": 0.6446, + "step": 4234 + }, + { + "epoch": 2.388606880992668, + "grad_norm": 1.6898572444915771, + "learning_rate": 3.805978567399887e-05, + "loss": 0.8442, + "step": 4235 + }, + { + "epoch": 2.38917089678511, + "grad_norm": 2.5239675045013428, + "learning_rate": 3.8056965595036664e-05, + "loss": 0.7026, + "step": 4236 + }, + { + "epoch": 2.389734912577552, + "grad_norm": 2.036555528640747, + "learning_rate": 3.805414551607445e-05, + "loss": 0.7778, + "step": 4237 + }, + { + "epoch": 2.3902989283699942, + "grad_norm": 1.3355499505996704, + "learning_rate": 3.805132543711224e-05, + "loss": 0.6475, + "step": 4238 + }, + { + "epoch": 2.3908629441624365, + "grad_norm": 1.2684075832366943, + "learning_rate": 3.804850535815003e-05, + "loss": 0.7947, + "step": 4239 + }, + { + "epoch": 2.3914269599548788, + "grad_norm": 1.9742119312286377, + "learning_rate": 3.804568527918782e-05, + "loss": 0.7386, + "step": 4240 + }, + { + "epoch": 2.391990975747321, + "grad_norm": 1.3668212890625, + "learning_rate": 3.804286520022561e-05, + "loss": 0.6301, + "step": 4241 + }, + { + "epoch": 2.3925549915397633, + "grad_norm": 1.732024908065796, + "learning_rate": 3.8040045121263396e-05, + "loss": 0.8239, + "step": 4242 + }, + { + "epoch": 2.3931190073322055, + "grad_norm": 1.0242832899093628, + "learning_rate": 3.803722504230119e-05, + "loss": 0.741, + "step": 4243 + }, + { + "epoch": 2.3936830231246473, + "grad_norm": 2.268066644668579, + "learning_rate": 3.8034404963338974e-05, + "loss": 0.6286, + "step": 4244 + }, + { + "epoch": 2.3942470389170896, + "grad_norm": 2.4279088973999023, + "learning_rate": 3.8031584884376766e-05, + "loss": 0.7678, + "step": 4245 + }, + { + "epoch": 2.394811054709532, + "grad_norm": 0.9730605483055115, + "learning_rate": 3.802876480541455e-05, + "loss": 0.7283, + "step": 4246 + }, + { + "epoch": 2.395375070501974, + "grad_norm": 1.9844164848327637, + "learning_rate": 3.8025944726452344e-05, + "loss": 0.8389, + "step": 4247 + }, + { + "epoch": 2.3959390862944163, + "grad_norm": 5.352534294128418, + "learning_rate": 3.802312464749013e-05, + "loss": 0.8265, + "step": 4248 + }, + { + "epoch": 2.3965031020868586, + "grad_norm": 1.6313180923461914, + "learning_rate": 3.802030456852792e-05, + "loss": 0.7539, + "step": 4249 + }, + { + "epoch": 2.397067117879301, + "grad_norm": 1.8161237239837646, + "learning_rate": 3.801748448956571e-05, + "loss": 0.8245, + "step": 4250 + }, + { + "epoch": 2.3976311336717426, + "grad_norm": 1.7449734210968018, + "learning_rate": 3.80146644106035e-05, + "loss": 0.7383, + "step": 4251 + }, + { + "epoch": 2.398195149464185, + "grad_norm": 1.3852330446243286, + "learning_rate": 3.801184433164129e-05, + "loss": 0.7727, + "step": 4252 + }, + { + "epoch": 2.398759165256627, + "grad_norm": 1.2185834646224976, + "learning_rate": 3.8009024252679077e-05, + "loss": 0.7579, + "step": 4253 + }, + { + "epoch": 2.3993231810490694, + "grad_norm": 1.6313782930374146, + "learning_rate": 3.800620417371686e-05, + "loss": 0.7511, + "step": 4254 + }, + { + "epoch": 2.3998871968415116, + "grad_norm": 2.033790111541748, + "learning_rate": 3.8003384094754654e-05, + "loss": 0.8673, + "step": 4255 + }, + { + "epoch": 2.400451212633954, + "grad_norm": 2.6653761863708496, + "learning_rate": 3.8000564015792446e-05, + "loss": 0.7279, + "step": 4256 + }, + { + "epoch": 2.401015228426396, + "grad_norm": 1.1957272291183472, + "learning_rate": 3.799774393683023e-05, + "loss": 0.7369, + "step": 4257 + }, + { + "epoch": 2.401579244218838, + "grad_norm": 1.4706863164901733, + "learning_rate": 3.799492385786802e-05, + "loss": 0.7409, + "step": 4258 + }, + { + "epoch": 2.40214326001128, + "grad_norm": 1.615254521369934, + "learning_rate": 3.7992103778905816e-05, + "loss": 0.7854, + "step": 4259 + }, + { + "epoch": 2.4027072758037225, + "grad_norm": 2.6301753520965576, + "learning_rate": 3.79892836999436e-05, + "loss": 0.8924, + "step": 4260 + }, + { + "epoch": 2.4032712915961647, + "grad_norm": 3.138383626937866, + "learning_rate": 3.798646362098139e-05, + "loss": 0.8459, + "step": 4261 + }, + { + "epoch": 2.403835307388607, + "grad_norm": 1.5975104570388794, + "learning_rate": 3.798364354201918e-05, + "loss": 0.811, + "step": 4262 + }, + { + "epoch": 2.404399323181049, + "grad_norm": 1.3229881525039673, + "learning_rate": 3.798082346305697e-05, + "loss": 0.7284, + "step": 4263 + }, + { + "epoch": 2.4049633389734915, + "grad_norm": 1.7743805646896362, + "learning_rate": 3.797800338409476e-05, + "loss": 0.7477, + "step": 4264 + }, + { + "epoch": 2.4055273547659333, + "grad_norm": 1.5292633771896362, + "learning_rate": 3.797518330513254e-05, + "loss": 0.7603, + "step": 4265 + }, + { + "epoch": 2.4060913705583755, + "grad_norm": 2.0938286781311035, + "learning_rate": 3.7972363226170334e-05, + "loss": 0.9169, + "step": 4266 + }, + { + "epoch": 2.4066553863508178, + "grad_norm": 2.8709776401519775, + "learning_rate": 3.7969543147208127e-05, + "loss": 0.7481, + "step": 4267 + }, + { + "epoch": 2.40721940214326, + "grad_norm": 1.2738566398620605, + "learning_rate": 3.796672306824591e-05, + "loss": 0.821, + "step": 4268 + }, + { + "epoch": 2.4077834179357023, + "grad_norm": 4.291167736053467, + "learning_rate": 3.79639029892837e-05, + "loss": 0.8573, + "step": 4269 + }, + { + "epoch": 2.4083474337281445, + "grad_norm": 1.7061396837234497, + "learning_rate": 3.796108291032149e-05, + "loss": 0.6692, + "step": 4270 + }, + { + "epoch": 2.408911449520587, + "grad_norm": 1.6930949687957764, + "learning_rate": 3.795826283135928e-05, + "loss": 0.7546, + "step": 4271 + }, + { + "epoch": 2.4094754653130286, + "grad_norm": 1.542667031288147, + "learning_rate": 3.795544275239707e-05, + "loss": 0.7559, + "step": 4272 + }, + { + "epoch": 2.410039481105471, + "grad_norm": 1.2443617582321167, + "learning_rate": 3.795262267343486e-05, + "loss": 0.7383, + "step": 4273 + }, + { + "epoch": 2.410603496897913, + "grad_norm": 1.0585789680480957, + "learning_rate": 3.7949802594472645e-05, + "loss": 0.6492, + "step": 4274 + }, + { + "epoch": 2.4111675126903553, + "grad_norm": 1.1155571937561035, + "learning_rate": 3.794698251551044e-05, + "loss": 0.7899, + "step": 4275 + }, + { + "epoch": 2.4117315284827976, + "grad_norm": 1.695431113243103, + "learning_rate": 3.794416243654822e-05, + "loss": 0.7171, + "step": 4276 + }, + { + "epoch": 2.41229554427524, + "grad_norm": 1.6798169612884521, + "learning_rate": 3.7941342357586014e-05, + "loss": 0.8335, + "step": 4277 + }, + { + "epoch": 2.412859560067682, + "grad_norm": 3.2771098613739014, + "learning_rate": 3.793852227862381e-05, + "loss": 0.8878, + "step": 4278 + }, + { + "epoch": 2.413423575860124, + "grad_norm": 1.4172029495239258, + "learning_rate": 3.793570219966159e-05, + "loss": 0.762, + "step": 4279 + }, + { + "epoch": 2.413987591652566, + "grad_norm": 2.379870891571045, + "learning_rate": 3.7932882120699384e-05, + "loss": 0.7519, + "step": 4280 + }, + { + "epoch": 2.4145516074450084, + "grad_norm": 1.1434049606323242, + "learning_rate": 3.793006204173717e-05, + "loss": 0.7903, + "step": 4281 + }, + { + "epoch": 2.4151156232374507, + "grad_norm": 2.8087334632873535, + "learning_rate": 3.792724196277496e-05, + "loss": 0.7254, + "step": 4282 + }, + { + "epoch": 2.415679639029893, + "grad_norm": 4.2126030921936035, + "learning_rate": 3.792442188381275e-05, + "loss": 0.8537, + "step": 4283 + }, + { + "epoch": 2.416243654822335, + "grad_norm": 2.2999930381774902, + "learning_rate": 3.792160180485054e-05, + "loss": 0.8226, + "step": 4284 + }, + { + "epoch": 2.4168076706147774, + "grad_norm": 2.2614388465881348, + "learning_rate": 3.7918781725888325e-05, + "loss": 0.7537, + "step": 4285 + }, + { + "epoch": 2.4173716864072192, + "grad_norm": 3.4039926528930664, + "learning_rate": 3.791596164692612e-05, + "loss": 0.9567, + "step": 4286 + }, + { + "epoch": 2.4179357021996615, + "grad_norm": 1.37233304977417, + "learning_rate": 3.79131415679639e-05, + "loss": 0.8152, + "step": 4287 + }, + { + "epoch": 2.4184997179921037, + "grad_norm": 1.6258798837661743, + "learning_rate": 3.7910321489001695e-05, + "loss": 0.7705, + "step": 4288 + }, + { + "epoch": 2.419063733784546, + "grad_norm": 1.4471783638000488, + "learning_rate": 3.790750141003948e-05, + "loss": 0.7326, + "step": 4289 + }, + { + "epoch": 2.4196277495769882, + "grad_norm": 3.911803722381592, + "learning_rate": 3.790468133107727e-05, + "loss": 0.9286, + "step": 4290 + }, + { + "epoch": 2.4201917653694305, + "grad_norm": 1.2002558708190918, + "learning_rate": 3.7901861252115064e-05, + "loss": 0.7474, + "step": 4291 + }, + { + "epoch": 2.4207557811618727, + "grad_norm": 1.718485713005066, + "learning_rate": 3.789904117315285e-05, + "loss": 0.8826, + "step": 4292 + }, + { + "epoch": 2.4213197969543145, + "grad_norm": 1.3475528955459595, + "learning_rate": 3.7896221094190635e-05, + "loss": 0.6763, + "step": 4293 + }, + { + "epoch": 2.421883812746757, + "grad_norm": 2.0876359939575195, + "learning_rate": 3.789340101522843e-05, + "loss": 0.8268, + "step": 4294 + }, + { + "epoch": 2.422447828539199, + "grad_norm": 1.3002711534500122, + "learning_rate": 3.789058093626622e-05, + "loss": 0.7765, + "step": 4295 + }, + { + "epoch": 2.4230118443316413, + "grad_norm": 1.593274474143982, + "learning_rate": 3.7887760857304005e-05, + "loss": 0.7657, + "step": 4296 + }, + { + "epoch": 2.4235758601240835, + "grad_norm": 2.0933446884155273, + "learning_rate": 3.788494077834179e-05, + "loss": 0.7808, + "step": 4297 + }, + { + "epoch": 2.424139875916526, + "grad_norm": 1.9166125059127808, + "learning_rate": 3.788212069937959e-05, + "loss": 0.862, + "step": 4298 + }, + { + "epoch": 2.424703891708968, + "grad_norm": 1.2239457368850708, + "learning_rate": 3.7879300620417375e-05, + "loss": 0.6087, + "step": 4299 + }, + { + "epoch": 2.42526790750141, + "grad_norm": 1.1502474546432495, + "learning_rate": 3.787648054145516e-05, + "loss": 0.6887, + "step": 4300 + }, + { + "epoch": 2.425831923293852, + "grad_norm": 2.256903648376465, + "learning_rate": 3.787366046249295e-05, + "loss": 0.8509, + "step": 4301 + }, + { + "epoch": 2.4263959390862944, + "grad_norm": 2.1819939613342285, + "learning_rate": 3.7870840383530745e-05, + "loss": 0.8407, + "step": 4302 + }, + { + "epoch": 2.4269599548787366, + "grad_norm": 1.1459838151931763, + "learning_rate": 3.786802030456853e-05, + "loss": 0.7454, + "step": 4303 + }, + { + "epoch": 2.427523970671179, + "grad_norm": 1.4178191423416138, + "learning_rate": 3.7865200225606315e-05, + "loss": 0.6717, + "step": 4304 + }, + { + "epoch": 2.428087986463621, + "grad_norm": 1.2112609148025513, + "learning_rate": 3.786238014664411e-05, + "loss": 0.7089, + "step": 4305 + }, + { + "epoch": 2.4286520022560634, + "grad_norm": 1.6500297784805298, + "learning_rate": 3.78595600676819e-05, + "loss": 0.6933, + "step": 4306 + }, + { + "epoch": 2.429216018048505, + "grad_norm": 2.912858247756958, + "learning_rate": 3.7856739988719685e-05, + "loss": 0.8023, + "step": 4307 + }, + { + "epoch": 2.4297800338409474, + "grad_norm": 1.2362736463546753, + "learning_rate": 3.785391990975747e-05, + "loss": 0.7925, + "step": 4308 + }, + { + "epoch": 2.4303440496333897, + "grad_norm": 1.4235992431640625, + "learning_rate": 3.785109983079526e-05, + "loss": 0.723, + "step": 4309 + }, + { + "epoch": 2.430908065425832, + "grad_norm": 2.5226922035217285, + "learning_rate": 3.7848279751833055e-05, + "loss": 0.6937, + "step": 4310 + }, + { + "epoch": 2.431472081218274, + "grad_norm": 1.7686916589736938, + "learning_rate": 3.784545967287084e-05, + "loss": 0.7558, + "step": 4311 + }, + { + "epoch": 2.4320360970107164, + "grad_norm": 2.4313669204711914, + "learning_rate": 3.784263959390863e-05, + "loss": 0.8582, + "step": 4312 + }, + { + "epoch": 2.4326001128031587, + "grad_norm": 7.5087890625, + "learning_rate": 3.7839819514946425e-05, + "loss": 0.9062, + "step": 4313 + }, + { + "epoch": 2.4331641285956005, + "grad_norm": 1.6737895011901855, + "learning_rate": 3.783699943598421e-05, + "loss": 0.83, + "step": 4314 + }, + { + "epoch": 2.4337281443880427, + "grad_norm": 1.3172059059143066, + "learning_rate": 3.7834179357021996e-05, + "loss": 0.7771, + "step": 4315 + }, + { + "epoch": 2.434292160180485, + "grad_norm": 1.759864330291748, + "learning_rate": 3.783135927805979e-05, + "loss": 0.7817, + "step": 4316 + }, + { + "epoch": 2.4348561759729273, + "grad_norm": 1.2742342948913574, + "learning_rate": 3.782853919909758e-05, + "loss": 0.7784, + "step": 4317 + }, + { + "epoch": 2.4354201917653695, + "grad_norm": 2.637709856033325, + "learning_rate": 3.7825719120135365e-05, + "loss": 0.8894, + "step": 4318 + }, + { + "epoch": 2.4359842075578118, + "grad_norm": 2.7423667907714844, + "learning_rate": 3.782289904117316e-05, + "loss": 0.8083, + "step": 4319 + }, + { + "epoch": 2.436548223350254, + "grad_norm": 1.9336351156234741, + "learning_rate": 3.782007896221094e-05, + "loss": 0.8427, + "step": 4320 + }, + { + "epoch": 2.437112239142696, + "grad_norm": 1.0851316452026367, + "learning_rate": 3.7817258883248735e-05, + "loss": 0.7731, + "step": 4321 + }, + { + "epoch": 2.437676254935138, + "grad_norm": 1.367384433746338, + "learning_rate": 3.781443880428652e-05, + "loss": 0.6601, + "step": 4322 + }, + { + "epoch": 2.4382402707275803, + "grad_norm": 1.6422247886657715, + "learning_rate": 3.781161872532431e-05, + "loss": 0.6883, + "step": 4323 + }, + { + "epoch": 2.4388042865200226, + "grad_norm": 1.5721391439437866, + "learning_rate": 3.78087986463621e-05, + "loss": 0.6941, + "step": 4324 + }, + { + "epoch": 2.439368302312465, + "grad_norm": 1.624330997467041, + "learning_rate": 3.780597856739989e-05, + "loss": 0.7197, + "step": 4325 + }, + { + "epoch": 2.439932318104907, + "grad_norm": 2.0790441036224365, + "learning_rate": 3.7803158488437676e-05, + "loss": 0.7366, + "step": 4326 + }, + { + "epoch": 2.4404963338973493, + "grad_norm": 1.2163503170013428, + "learning_rate": 3.780033840947547e-05, + "loss": 0.746, + "step": 4327 + }, + { + "epoch": 2.441060349689791, + "grad_norm": 1.425275444984436, + "learning_rate": 3.779751833051325e-05, + "loss": 0.7543, + "step": 4328 + }, + { + "epoch": 2.4416243654822334, + "grad_norm": 3.236504077911377, + "learning_rate": 3.7794698251551045e-05, + "loss": 0.8615, + "step": 4329 + }, + { + "epoch": 2.4421883812746756, + "grad_norm": 1.3825727701187134, + "learning_rate": 3.779187817258884e-05, + "loss": 0.7229, + "step": 4330 + }, + { + "epoch": 2.442752397067118, + "grad_norm": 0.9982205629348755, + "learning_rate": 3.778905809362662e-05, + "loss": 0.7321, + "step": 4331 + }, + { + "epoch": 2.44331641285956, + "grad_norm": 3.248659610748291, + "learning_rate": 3.778623801466441e-05, + "loss": 0.8286, + "step": 4332 + }, + { + "epoch": 2.4438804286520024, + "grad_norm": 1.6181086301803589, + "learning_rate": 3.77834179357022e-05, + "loss": 0.7315, + "step": 4333 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 1.2528043985366821, + "learning_rate": 3.778059785673999e-05, + "loss": 0.7446, + "step": 4334 + }, + { + "epoch": 2.4450084602368864, + "grad_norm": 1.3913739919662476, + "learning_rate": 3.777777777777778e-05, + "loss": 0.8577, + "step": 4335 + }, + { + "epoch": 2.4455724760293287, + "grad_norm": 1.4595309495925903, + "learning_rate": 3.7774957698815564e-05, + "loss": 0.7791, + "step": 4336 + }, + { + "epoch": 2.446136491821771, + "grad_norm": 1.2781999111175537, + "learning_rate": 3.777213761985336e-05, + "loss": 0.7908, + "step": 4337 + }, + { + "epoch": 2.446700507614213, + "grad_norm": 1.3154332637786865, + "learning_rate": 3.776931754089115e-05, + "loss": 0.6665, + "step": 4338 + }, + { + "epoch": 2.4472645234066555, + "grad_norm": 2.968759536743164, + "learning_rate": 3.7766497461928933e-05, + "loss": 0.8748, + "step": 4339 + }, + { + "epoch": 2.4478285391990977, + "grad_norm": 2.8771350383758545, + "learning_rate": 3.7763677382966726e-05, + "loss": 0.7839, + "step": 4340 + }, + { + "epoch": 2.44839255499154, + "grad_norm": 2.7244460582733154, + "learning_rate": 3.776085730400452e-05, + "loss": 0.7812, + "step": 4341 + }, + { + "epoch": 2.4489565707839818, + "grad_norm": 1.1229355335235596, + "learning_rate": 3.77580372250423e-05, + "loss": 0.6814, + "step": 4342 + }, + { + "epoch": 2.449520586576424, + "grad_norm": 1.9664324522018433, + "learning_rate": 3.775521714608009e-05, + "loss": 0.8323, + "step": 4343 + }, + { + "epoch": 2.4500846023688663, + "grad_norm": 1.279440999031067, + "learning_rate": 3.775239706711788e-05, + "loss": 0.7624, + "step": 4344 + }, + { + "epoch": 2.4506486181613085, + "grad_norm": 1.192280888557434, + "learning_rate": 3.774957698815567e-05, + "loss": 0.6425, + "step": 4345 + }, + { + "epoch": 2.4512126339537508, + "grad_norm": 1.1487611532211304, + "learning_rate": 3.774675690919346e-05, + "loss": 0.7837, + "step": 4346 + }, + { + "epoch": 2.451776649746193, + "grad_norm": 1.5138351917266846, + "learning_rate": 3.7743936830231244e-05, + "loss": 0.8283, + "step": 4347 + }, + { + "epoch": 2.4523406655386353, + "grad_norm": 5.703322410583496, + "learning_rate": 3.774111675126904e-05, + "loss": 0.8754, + "step": 4348 + }, + { + "epoch": 2.452904681331077, + "grad_norm": 1.1540641784667969, + "learning_rate": 3.773829667230683e-05, + "loss": 0.8074, + "step": 4349 + }, + { + "epoch": 2.4534686971235193, + "grad_norm": 1.7427937984466553, + "learning_rate": 3.7735476593344614e-05, + "loss": 0.7719, + "step": 4350 + }, + { + "epoch": 2.4540327129159616, + "grad_norm": 1.1979212760925293, + "learning_rate": 3.7732656514382406e-05, + "loss": 0.7528, + "step": 4351 + }, + { + "epoch": 2.454596728708404, + "grad_norm": 1.5976338386535645, + "learning_rate": 3.77298364354202e-05, + "loss": 0.7375, + "step": 4352 + }, + { + "epoch": 2.455160744500846, + "grad_norm": 1.606420874595642, + "learning_rate": 3.772701635645798e-05, + "loss": 0.7417, + "step": 4353 + }, + { + "epoch": 2.4557247602932883, + "grad_norm": 1.060125708580017, + "learning_rate": 3.772419627749577e-05, + "loss": 0.7439, + "step": 4354 + }, + { + "epoch": 2.4562887760857306, + "grad_norm": 1.6842106580734253, + "learning_rate": 3.772137619853356e-05, + "loss": 0.7779, + "step": 4355 + }, + { + "epoch": 2.4568527918781724, + "grad_norm": 1.4703938961029053, + "learning_rate": 3.771855611957135e-05, + "loss": 0.7689, + "step": 4356 + }, + { + "epoch": 2.4574168076706147, + "grad_norm": 2.9276533126831055, + "learning_rate": 3.771573604060914e-05, + "loss": 0.831, + "step": 4357 + }, + { + "epoch": 2.457980823463057, + "grad_norm": 1.306656002998352, + "learning_rate": 3.771291596164693e-05, + "loss": 0.8533, + "step": 4358 + }, + { + "epoch": 2.458544839255499, + "grad_norm": 2.7495031356811523, + "learning_rate": 3.7710095882684716e-05, + "loss": 0.8631, + "step": 4359 + }, + { + "epoch": 2.4591088550479414, + "grad_norm": 2.0595686435699463, + "learning_rate": 3.770727580372251e-05, + "loss": 0.8967, + "step": 4360 + }, + { + "epoch": 2.4596728708403837, + "grad_norm": 1.8698065280914307, + "learning_rate": 3.7704455724760294e-05, + "loss": 0.7889, + "step": 4361 + }, + { + "epoch": 2.460236886632826, + "grad_norm": 1.5057376623153687, + "learning_rate": 3.7701635645798086e-05, + "loss": 0.7788, + "step": 4362 + }, + { + "epoch": 2.4608009024252677, + "grad_norm": 1.3948349952697754, + "learning_rate": 3.769881556683587e-05, + "loss": 0.757, + "step": 4363 + }, + { + "epoch": 2.46136491821771, + "grad_norm": 1.5554733276367188, + "learning_rate": 3.7695995487873663e-05, + "loss": 0.8088, + "step": 4364 + }, + { + "epoch": 2.4619289340101522, + "grad_norm": 4.319594383239746, + "learning_rate": 3.769317540891145e-05, + "loss": 0.7545, + "step": 4365 + }, + { + "epoch": 2.4624929498025945, + "grad_norm": 1.8985170125961304, + "learning_rate": 3.769035532994924e-05, + "loss": 0.9226, + "step": 4366 + }, + { + "epoch": 2.4630569655950367, + "grad_norm": 1.3814361095428467, + "learning_rate": 3.7687535250987026e-05, + "loss": 0.7113, + "step": 4367 + }, + { + "epoch": 2.463620981387479, + "grad_norm": 1.8680990934371948, + "learning_rate": 3.768471517202482e-05, + "loss": 0.8794, + "step": 4368 + }, + { + "epoch": 2.4641849971799212, + "grad_norm": 1.1227864027023315, + "learning_rate": 3.768189509306261e-05, + "loss": 0.7619, + "step": 4369 + }, + { + "epoch": 2.464749012972363, + "grad_norm": 1.9287322759628296, + "learning_rate": 3.7679075014100396e-05, + "loss": 0.8438, + "step": 4370 + }, + { + "epoch": 2.4653130287648053, + "grad_norm": 2.3126325607299805, + "learning_rate": 3.767625493513818e-05, + "loss": 0.8644, + "step": 4371 + }, + { + "epoch": 2.4658770445572475, + "grad_norm": 1.7756311893463135, + "learning_rate": 3.7673434856175974e-05, + "loss": 0.8624, + "step": 4372 + }, + { + "epoch": 2.46644106034969, + "grad_norm": 1.1124073266983032, + "learning_rate": 3.7670614777213766e-05, + "loss": 0.6681, + "step": 4373 + }, + { + "epoch": 2.467005076142132, + "grad_norm": 3.3212087154388428, + "learning_rate": 3.766779469825155e-05, + "loss": 0.7455, + "step": 4374 + }, + { + "epoch": 2.4675690919345743, + "grad_norm": 1.8720715045928955, + "learning_rate": 3.766497461928934e-05, + "loss": 0.7667, + "step": 4375 + }, + { + "epoch": 2.4681331077270166, + "grad_norm": 5.494768142700195, + "learning_rate": 3.7662154540327136e-05, + "loss": 1.1092, + "step": 4376 + }, + { + "epoch": 2.4686971235194584, + "grad_norm": 1.7445237636566162, + "learning_rate": 3.765933446136492e-05, + "loss": 0.7661, + "step": 4377 + }, + { + "epoch": 2.4692611393119006, + "grad_norm": 3.0982000827789307, + "learning_rate": 3.765651438240271e-05, + "loss": 0.9273, + "step": 4378 + }, + { + "epoch": 2.469825155104343, + "grad_norm": 1.737053632736206, + "learning_rate": 3.765369430344049e-05, + "loss": 0.7818, + "step": 4379 + }, + { + "epoch": 2.470389170896785, + "grad_norm": 1.941421627998352, + "learning_rate": 3.765087422447829e-05, + "loss": 0.9267, + "step": 4380 + }, + { + "epoch": 2.4709531866892274, + "grad_norm": 1.5984026193618774, + "learning_rate": 3.7648054145516076e-05, + "loss": 0.8634, + "step": 4381 + }, + { + "epoch": 2.4715172024816696, + "grad_norm": 1.5952117443084717, + "learning_rate": 3.764523406655386e-05, + "loss": 0.6943, + "step": 4382 + }, + { + "epoch": 2.472081218274112, + "grad_norm": 1.292813777923584, + "learning_rate": 3.7642413987591654e-05, + "loss": 0.6884, + "step": 4383 + }, + { + "epoch": 2.4726452340665537, + "grad_norm": 1.5665441751480103, + "learning_rate": 3.7639593908629446e-05, + "loss": 0.746, + "step": 4384 + }, + { + "epoch": 2.473209249858996, + "grad_norm": 1.6026160717010498, + "learning_rate": 3.763677382966723e-05, + "loss": 0.7792, + "step": 4385 + }, + { + "epoch": 2.473773265651438, + "grad_norm": 1.4967350959777832, + "learning_rate": 3.763395375070502e-05, + "loss": 0.8008, + "step": 4386 + }, + { + "epoch": 2.4743372814438804, + "grad_norm": 1.370393991470337, + "learning_rate": 3.7631133671742816e-05, + "loss": 0.719, + "step": 4387 + }, + { + "epoch": 2.4749012972363227, + "grad_norm": 1.735375165939331, + "learning_rate": 3.76283135927806e-05, + "loss": 0.8401, + "step": 4388 + }, + { + "epoch": 2.475465313028765, + "grad_norm": 1.775395154953003, + "learning_rate": 3.762549351381839e-05, + "loss": 0.7574, + "step": 4389 + }, + { + "epoch": 2.476029328821207, + "grad_norm": 1.8256022930145264, + "learning_rate": 3.762267343485618e-05, + "loss": 0.7399, + "step": 4390 + }, + { + "epoch": 2.476593344613649, + "grad_norm": 1.6733388900756836, + "learning_rate": 3.761985335589397e-05, + "loss": 0.7271, + "step": 4391 + }, + { + "epoch": 2.4771573604060912, + "grad_norm": 2.280492067337036, + "learning_rate": 3.7617033276931757e-05, + "loss": 1.0133, + "step": 4392 + }, + { + "epoch": 2.4777213761985335, + "grad_norm": 1.3130221366882324, + "learning_rate": 3.761421319796954e-05, + "loss": 0.6907, + "step": 4393 + }, + { + "epoch": 2.4782853919909758, + "grad_norm": 1.2370532751083374, + "learning_rate": 3.7611393119007334e-05, + "loss": 0.8883, + "step": 4394 + }, + { + "epoch": 2.478849407783418, + "grad_norm": 2.454714059829712, + "learning_rate": 3.7608573040045126e-05, + "loss": 0.7674, + "step": 4395 + }, + { + "epoch": 2.4794134235758603, + "grad_norm": 1.9195526838302612, + "learning_rate": 3.760575296108291e-05, + "loss": 0.8974, + "step": 4396 + }, + { + "epoch": 2.4799774393683025, + "grad_norm": 1.4057940244674683, + "learning_rate": 3.76029328821207e-05, + "loss": 0.7764, + "step": 4397 + }, + { + "epoch": 2.4805414551607443, + "grad_norm": 1.0583746433258057, + "learning_rate": 3.760011280315849e-05, + "loss": 0.6637, + "step": 4398 + }, + { + "epoch": 2.4811054709531866, + "grad_norm": 1.5315879583358765, + "learning_rate": 3.759729272419628e-05, + "loss": 0.7445, + "step": 4399 + }, + { + "epoch": 2.481669486745629, + "grad_norm": 1.9808884859085083, + "learning_rate": 3.759447264523407e-05, + "loss": 0.8331, + "step": 4400 + }, + { + "epoch": 2.482233502538071, + "grad_norm": 1.861488699913025, + "learning_rate": 3.759165256627186e-05, + "loss": 0.8186, + "step": 4401 + }, + { + "epoch": 2.4827975183305133, + "grad_norm": 1.2528120279312134, + "learning_rate": 3.7588832487309645e-05, + "loss": 0.779, + "step": 4402 + }, + { + "epoch": 2.4833615341229556, + "grad_norm": 1.4204429388046265, + "learning_rate": 3.758601240834744e-05, + "loss": 0.7148, + "step": 4403 + }, + { + "epoch": 2.483925549915398, + "grad_norm": 1.0801392793655396, + "learning_rate": 3.758319232938522e-05, + "loss": 0.6782, + "step": 4404 + }, + { + "epoch": 2.4844895657078396, + "grad_norm": 2.0045862197875977, + "learning_rate": 3.7580372250423014e-05, + "loss": 0.8641, + "step": 4405 + }, + { + "epoch": 2.485053581500282, + "grad_norm": 1.7368013858795166, + "learning_rate": 3.75775521714608e-05, + "loss": 0.8047, + "step": 4406 + }, + { + "epoch": 2.485617597292724, + "grad_norm": 1.4469884634017944, + "learning_rate": 3.757473209249859e-05, + "loss": 0.6448, + "step": 4407 + }, + { + "epoch": 2.4861816130851664, + "grad_norm": 1.8730380535125732, + "learning_rate": 3.7571912013536384e-05, + "loss": 0.7001, + "step": 4408 + }, + { + "epoch": 2.4867456288776086, + "grad_norm": 1.7643741369247437, + "learning_rate": 3.756909193457417e-05, + "loss": 0.8107, + "step": 4409 + }, + { + "epoch": 2.487309644670051, + "grad_norm": 1.4992226362228394, + "learning_rate": 3.7566271855611955e-05, + "loss": 0.6647, + "step": 4410 + }, + { + "epoch": 2.487873660462493, + "grad_norm": 2.205383539199829, + "learning_rate": 3.756345177664975e-05, + "loss": 0.8442, + "step": 4411 + }, + { + "epoch": 2.488437676254935, + "grad_norm": 1.7325786352157593, + "learning_rate": 3.756063169768754e-05, + "loss": 0.8073, + "step": 4412 + }, + { + "epoch": 2.489001692047377, + "grad_norm": 1.5766772031784058, + "learning_rate": 3.7557811618725325e-05, + "loss": 0.8572, + "step": 4413 + }, + { + "epoch": 2.4895657078398195, + "grad_norm": 1.0375566482543945, + "learning_rate": 3.755499153976311e-05, + "loss": 0.8016, + "step": 4414 + }, + { + "epoch": 2.4901297236322617, + "grad_norm": 1.5375927686691284, + "learning_rate": 3.75521714608009e-05, + "loss": 0.8701, + "step": 4415 + }, + { + "epoch": 2.490693739424704, + "grad_norm": 1.0653690099716187, + "learning_rate": 3.7549351381838694e-05, + "loss": 0.7446, + "step": 4416 + }, + { + "epoch": 2.491257755217146, + "grad_norm": 1.5762988328933716, + "learning_rate": 3.754653130287648e-05, + "loss": 0.8453, + "step": 4417 + }, + { + "epoch": 2.4918217710095885, + "grad_norm": 1.1430631875991821, + "learning_rate": 3.754371122391427e-05, + "loss": 0.6501, + "step": 4418 + }, + { + "epoch": 2.4923857868020303, + "grad_norm": 1.50095796585083, + "learning_rate": 3.7540891144952064e-05, + "loss": 0.7333, + "step": 4419 + }, + { + "epoch": 2.4929498025944725, + "grad_norm": 2.0748231410980225, + "learning_rate": 3.753807106598985e-05, + "loss": 0.9558, + "step": 4420 + }, + { + "epoch": 2.4935138183869148, + "grad_norm": 1.6613446474075317, + "learning_rate": 3.7535250987027635e-05, + "loss": 0.7749, + "step": 4421 + }, + { + "epoch": 2.494077834179357, + "grad_norm": 1.2805798053741455, + "learning_rate": 3.753243090806543e-05, + "loss": 0.7554, + "step": 4422 + }, + { + "epoch": 2.4946418499717993, + "grad_norm": 1.1070263385772705, + "learning_rate": 3.752961082910322e-05, + "loss": 0.7402, + "step": 4423 + }, + { + "epoch": 2.4952058657642415, + "grad_norm": 1.6008806228637695, + "learning_rate": 3.7526790750141005e-05, + "loss": 0.8081, + "step": 4424 + }, + { + "epoch": 2.495769881556684, + "grad_norm": 1.750071406364441, + "learning_rate": 3.752397067117879e-05, + "loss": 0.8368, + "step": 4425 + }, + { + "epoch": 2.4963338973491256, + "grad_norm": 1.0843379497528076, + "learning_rate": 3.752115059221659e-05, + "loss": 0.7635, + "step": 4426 + }, + { + "epoch": 2.496897913141568, + "grad_norm": 3.3931198120117188, + "learning_rate": 3.7518330513254375e-05, + "loss": 0.81, + "step": 4427 + }, + { + "epoch": 2.49746192893401, + "grad_norm": 1.5560587644577026, + "learning_rate": 3.751551043429216e-05, + "loss": 0.6399, + "step": 4428 + }, + { + "epoch": 2.4980259447264523, + "grad_norm": 2.704648971557617, + "learning_rate": 3.751269035532995e-05, + "loss": 0.93, + "step": 4429 + }, + { + "epoch": 2.4985899605188946, + "grad_norm": 1.1112045049667358, + "learning_rate": 3.7509870276367744e-05, + "loss": 0.7049, + "step": 4430 + }, + { + "epoch": 2.499153976311337, + "grad_norm": 1.0754038095474243, + "learning_rate": 3.750705019740553e-05, + "loss": 0.6767, + "step": 4431 + }, + { + "epoch": 2.499717992103779, + "grad_norm": 2.0219533443450928, + "learning_rate": 3.7504230118443315e-05, + "loss": 0.8242, + "step": 4432 + }, + { + "epoch": 2.500282007896221, + "grad_norm": 1.5000885725021362, + "learning_rate": 3.750141003948111e-05, + "loss": 0.8469, + "step": 4433 + }, + { + "epoch": 2.500846023688663, + "grad_norm": 1.604529619216919, + "learning_rate": 3.74985899605189e-05, + "loss": 0.7365, + "step": 4434 + }, + { + "epoch": 2.5014100394811054, + "grad_norm": 1.6046701669692993, + "learning_rate": 3.7495769881556685e-05, + "loss": 0.8392, + "step": 4435 + }, + { + "epoch": 2.5019740552735477, + "grad_norm": 2.2339425086975098, + "learning_rate": 3.749294980259447e-05, + "loss": 0.7429, + "step": 4436 + }, + { + "epoch": 2.50253807106599, + "grad_norm": 1.1306681632995605, + "learning_rate": 3.749012972363226e-05, + "loss": 0.8487, + "step": 4437 + }, + { + "epoch": 2.503102086858432, + "grad_norm": 1.3498822450637817, + "learning_rate": 3.7487309644670055e-05, + "loss": 0.7045, + "step": 4438 + }, + { + "epoch": 2.5036661026508744, + "grad_norm": 3.044671058654785, + "learning_rate": 3.748448956570784e-05, + "loss": 0.9474, + "step": 4439 + }, + { + "epoch": 2.504230118443316, + "grad_norm": 1.5068479776382446, + "learning_rate": 3.748166948674563e-05, + "loss": 0.7363, + "step": 4440 + }, + { + "epoch": 2.5047941342357585, + "grad_norm": 1.4542990922927856, + "learning_rate": 3.747884940778342e-05, + "loss": 0.7892, + "step": 4441 + }, + { + "epoch": 2.5053581500282007, + "grad_norm": 1.1880545616149902, + "learning_rate": 3.747602932882121e-05, + "loss": 0.7469, + "step": 4442 + }, + { + "epoch": 2.505922165820643, + "grad_norm": 1.6300756931304932, + "learning_rate": 3.7473209249858995e-05, + "loss": 0.7689, + "step": 4443 + }, + { + "epoch": 2.5064861816130852, + "grad_norm": 1.3162578344345093, + "learning_rate": 3.747038917089679e-05, + "loss": 0.6306, + "step": 4444 + }, + { + "epoch": 2.5070501974055275, + "grad_norm": 2.413998603820801, + "learning_rate": 3.746756909193457e-05, + "loss": 0.9056, + "step": 4445 + }, + { + "epoch": 2.5076142131979697, + "grad_norm": 2.0194849967956543, + "learning_rate": 3.7464749012972365e-05, + "loss": 0.7563, + "step": 4446 + }, + { + "epoch": 2.5081782289904115, + "grad_norm": 3.896456718444824, + "learning_rate": 3.746192893401016e-05, + "loss": 0.7155, + "step": 4447 + }, + { + "epoch": 2.508742244782854, + "grad_norm": 1.1395326852798462, + "learning_rate": 3.745910885504794e-05, + "loss": 0.6778, + "step": 4448 + }, + { + "epoch": 2.509306260575296, + "grad_norm": 2.0683400630950928, + "learning_rate": 3.745628877608573e-05, + "loss": 0.7367, + "step": 4449 + }, + { + "epoch": 2.5098702763677383, + "grad_norm": 2.442870616912842, + "learning_rate": 3.745346869712352e-05, + "loss": 0.8741, + "step": 4450 + }, + { + "epoch": 2.5104342921601805, + "grad_norm": 1.0958002805709839, + "learning_rate": 3.745064861816131e-05, + "loss": 0.7046, + "step": 4451 + }, + { + "epoch": 2.510998307952623, + "grad_norm": 1.7340261936187744, + "learning_rate": 3.74478285391991e-05, + "loss": 0.7649, + "step": 4452 + }, + { + "epoch": 2.511562323745065, + "grad_norm": 1.1577355861663818, + "learning_rate": 3.744500846023689e-05, + "loss": 0.7109, + "step": 4453 + }, + { + "epoch": 2.512126339537507, + "grad_norm": 1.595916748046875, + "learning_rate": 3.7442188381274676e-05, + "loss": 0.822, + "step": 4454 + }, + { + "epoch": 2.512690355329949, + "grad_norm": 2.4131197929382324, + "learning_rate": 3.743936830231247e-05, + "loss": 0.8952, + "step": 4455 + }, + { + "epoch": 2.5132543711223914, + "grad_norm": 1.3806604146957397, + "learning_rate": 3.743654822335025e-05, + "loss": 0.7559, + "step": 4456 + }, + { + "epoch": 2.5138183869148336, + "grad_norm": 1.6604130268096924, + "learning_rate": 3.7433728144388045e-05, + "loss": 0.7587, + "step": 4457 + }, + { + "epoch": 2.514382402707276, + "grad_norm": 1.8902521133422852, + "learning_rate": 3.743090806542584e-05, + "loss": 0.7699, + "step": 4458 + }, + { + "epoch": 2.514946418499718, + "grad_norm": 0.8797606229782104, + "learning_rate": 3.742808798646362e-05, + "loss": 0.641, + "step": 4459 + }, + { + "epoch": 2.5155104342921604, + "grad_norm": 1.1636934280395508, + "learning_rate": 3.742526790750141e-05, + "loss": 0.685, + "step": 4460 + }, + { + "epoch": 2.516074450084602, + "grad_norm": 0.9820253252983093, + "learning_rate": 3.74224478285392e-05, + "loss": 0.6941, + "step": 4461 + }, + { + "epoch": 2.5166384658770444, + "grad_norm": 1.3081450462341309, + "learning_rate": 3.741962774957699e-05, + "loss": 0.7124, + "step": 4462 + }, + { + "epoch": 2.5172024816694867, + "grad_norm": 1.2925670146942139, + "learning_rate": 3.741680767061478e-05, + "loss": 0.755, + "step": 4463 + }, + { + "epoch": 2.517766497461929, + "grad_norm": 3.269944429397583, + "learning_rate": 3.7413987591652563e-05, + "loss": 1.0064, + "step": 4464 + }, + { + "epoch": 2.518330513254371, + "grad_norm": 1.9121735095977783, + "learning_rate": 3.741116751269036e-05, + "loss": 0.8166, + "step": 4465 + }, + { + "epoch": 2.5188945290468134, + "grad_norm": 2.0004186630249023, + "learning_rate": 3.740834743372815e-05, + "loss": 0.8701, + "step": 4466 + }, + { + "epoch": 2.5194585448392557, + "grad_norm": 1.9155081510543823, + "learning_rate": 3.740552735476593e-05, + "loss": 0.8005, + "step": 4467 + }, + { + "epoch": 2.5200225606316975, + "grad_norm": 1.2449194192886353, + "learning_rate": 3.7402707275803725e-05, + "loss": 0.7076, + "step": 4468 + }, + { + "epoch": 2.5205865764241397, + "grad_norm": 1.747649908065796, + "learning_rate": 3.739988719684152e-05, + "loss": 0.7339, + "step": 4469 + }, + { + "epoch": 2.521150592216582, + "grad_norm": 1.5038758516311646, + "learning_rate": 3.73970671178793e-05, + "loss": 0.789, + "step": 4470 + }, + { + "epoch": 2.5217146080090242, + "grad_norm": 1.4197758436203003, + "learning_rate": 3.739424703891709e-05, + "loss": 0.696, + "step": 4471 + }, + { + "epoch": 2.5222786238014665, + "grad_norm": 1.4082894325256348, + "learning_rate": 3.739142695995488e-05, + "loss": 0.8, + "step": 4472 + }, + { + "epoch": 2.5228426395939088, + "grad_norm": 1.2969615459442139, + "learning_rate": 3.738860688099267e-05, + "loss": 0.8079, + "step": 4473 + }, + { + "epoch": 2.523406655386351, + "grad_norm": 3.254626750946045, + "learning_rate": 3.738578680203046e-05, + "loss": 0.8309, + "step": 4474 + }, + { + "epoch": 2.523970671178793, + "grad_norm": 1.4672725200653076, + "learning_rate": 3.7382966723068244e-05, + "loss": 0.793, + "step": 4475 + }, + { + "epoch": 2.524534686971235, + "grad_norm": 1.0509099960327148, + "learning_rate": 3.7380146644106036e-05, + "loss": 0.7885, + "step": 4476 + }, + { + "epoch": 2.5250987027636773, + "grad_norm": 1.462099313735962, + "learning_rate": 3.737732656514383e-05, + "loss": 0.7015, + "step": 4477 + }, + { + "epoch": 2.5256627185561196, + "grad_norm": 1.5240525007247925, + "learning_rate": 3.7374506486181613e-05, + "loss": 0.7652, + "step": 4478 + }, + { + "epoch": 2.526226734348562, + "grad_norm": 2.118467330932617, + "learning_rate": 3.7371686407219406e-05, + "loss": 0.895, + "step": 4479 + }, + { + "epoch": 2.526790750141004, + "grad_norm": 2.806723117828369, + "learning_rate": 3.736886632825719e-05, + "loss": 0.7653, + "step": 4480 + }, + { + "epoch": 2.5273547659334463, + "grad_norm": 2.107492446899414, + "learning_rate": 3.736604624929498e-05, + "loss": 0.8282, + "step": 4481 + }, + { + "epoch": 2.527918781725888, + "grad_norm": 1.3547694683074951, + "learning_rate": 3.736322617033277e-05, + "loss": 0.7688, + "step": 4482 + }, + { + "epoch": 2.5284827975183304, + "grad_norm": 3.0595169067382812, + "learning_rate": 3.736040609137056e-05, + "loss": 0.8813, + "step": 4483 + }, + { + "epoch": 2.5290468133107726, + "grad_norm": 1.218176245689392, + "learning_rate": 3.7357586012408346e-05, + "loss": 0.72, + "step": 4484 + }, + { + "epoch": 2.529610829103215, + "grad_norm": 1.5312689542770386, + "learning_rate": 3.735476593344614e-05, + "loss": 0.6571, + "step": 4485 + }, + { + "epoch": 2.530174844895657, + "grad_norm": 1.9626554250717163, + "learning_rate": 3.735194585448393e-05, + "loss": 0.7218, + "step": 4486 + }, + { + "epoch": 2.5307388606880994, + "grad_norm": 1.5262762308120728, + "learning_rate": 3.7349125775521716e-05, + "loss": 0.84, + "step": 4487 + }, + { + "epoch": 2.5313028764805416, + "grad_norm": 1.2722768783569336, + "learning_rate": 3.734630569655951e-05, + "loss": 0.8149, + "step": 4488 + }, + { + "epoch": 2.5318668922729834, + "grad_norm": 2.9597413539886475, + "learning_rate": 3.7343485617597294e-05, + "loss": 0.8701, + "step": 4489 + }, + { + "epoch": 2.5324309080654257, + "grad_norm": 2.661111831665039, + "learning_rate": 3.7340665538635086e-05, + "loss": 0.9619, + "step": 4490 + }, + { + "epoch": 2.532994923857868, + "grad_norm": 1.2595335245132446, + "learning_rate": 3.733784545967287e-05, + "loss": 0.742, + "step": 4491 + }, + { + "epoch": 2.53355893965031, + "grad_norm": 1.0489615201950073, + "learning_rate": 3.733502538071066e-05, + "loss": 0.7861, + "step": 4492 + }, + { + "epoch": 2.5341229554427525, + "grad_norm": 1.0421168804168701, + "learning_rate": 3.733220530174845e-05, + "loss": 0.7636, + "step": 4493 + }, + { + "epoch": 2.5346869712351947, + "grad_norm": 1.5356405973434448, + "learning_rate": 3.732938522278624e-05, + "loss": 0.7839, + "step": 4494 + }, + { + "epoch": 2.535250987027637, + "grad_norm": 1.5457159280776978, + "learning_rate": 3.7326565143824026e-05, + "loss": 0.8351, + "step": 4495 + }, + { + "epoch": 2.5358150028200788, + "grad_norm": 0.9021584391593933, + "learning_rate": 3.732374506486182e-05, + "loss": 0.6636, + "step": 4496 + }, + { + "epoch": 2.536379018612521, + "grad_norm": 2.654984712600708, + "learning_rate": 3.732092498589961e-05, + "loss": 0.7622, + "step": 4497 + }, + { + "epoch": 2.5369430344049633, + "grad_norm": 1.2537591457366943, + "learning_rate": 3.7318104906937396e-05, + "loss": 0.6515, + "step": 4498 + }, + { + "epoch": 2.5375070501974055, + "grad_norm": 3.9955780506134033, + "learning_rate": 3.731528482797518e-05, + "loss": 0.9936, + "step": 4499 + }, + { + "epoch": 2.5380710659898478, + "grad_norm": 3.7539801597595215, + "learning_rate": 3.7312464749012974e-05, + "loss": 0.8278, + "step": 4500 + }, + { + "epoch": 2.53863508178229, + "grad_norm": 1.1470224857330322, + "learning_rate": 3.7309644670050766e-05, + "loss": 0.7986, + "step": 4501 + }, + { + "epoch": 2.5391990975747323, + "grad_norm": 1.3277709484100342, + "learning_rate": 3.730682459108855e-05, + "loss": 0.753, + "step": 4502 + }, + { + "epoch": 2.539763113367174, + "grad_norm": 1.857107162475586, + "learning_rate": 3.730400451212634e-05, + "loss": 0.797, + "step": 4503 + }, + { + "epoch": 2.5403271291596163, + "grad_norm": 1.3019859790802002, + "learning_rate": 3.7301184433164136e-05, + "loss": 0.7747, + "step": 4504 + }, + { + "epoch": 2.5408911449520586, + "grad_norm": 1.6342942714691162, + "learning_rate": 3.729836435420192e-05, + "loss": 0.8852, + "step": 4505 + }, + { + "epoch": 2.541455160744501, + "grad_norm": 1.9503992795944214, + "learning_rate": 3.7295544275239706e-05, + "loss": 0.7887, + "step": 4506 + }, + { + "epoch": 2.542019176536943, + "grad_norm": 1.4861195087432861, + "learning_rate": 3.72927241962775e-05, + "loss": 0.6603, + "step": 4507 + }, + { + "epoch": 2.5425831923293853, + "grad_norm": 3.215782403945923, + "learning_rate": 3.728990411731529e-05, + "loss": 0.9556, + "step": 4508 + }, + { + "epoch": 2.5431472081218276, + "grad_norm": 1.5232340097427368, + "learning_rate": 3.7287084038353076e-05, + "loss": 0.8227, + "step": 4509 + }, + { + "epoch": 2.5437112239142694, + "grad_norm": 4.301727771759033, + "learning_rate": 3.728426395939086e-05, + "loss": 1.052, + "step": 4510 + }, + { + "epoch": 2.5442752397067117, + "grad_norm": 2.9793238639831543, + "learning_rate": 3.7281443880428654e-05, + "loss": 0.9189, + "step": 4511 + }, + { + "epoch": 2.544839255499154, + "grad_norm": 1.8068972826004028, + "learning_rate": 3.7278623801466446e-05, + "loss": 0.8749, + "step": 4512 + }, + { + "epoch": 2.545403271291596, + "grad_norm": 1.2161543369293213, + "learning_rate": 3.727580372250423e-05, + "loss": 0.7822, + "step": 4513 + }, + { + "epoch": 2.5459672870840384, + "grad_norm": 1.2903437614440918, + "learning_rate": 3.727298364354202e-05, + "loss": 0.6848, + "step": 4514 + }, + { + "epoch": 2.5465313028764807, + "grad_norm": 1.2237539291381836, + "learning_rate": 3.727016356457981e-05, + "loss": 0.6113, + "step": 4515 + }, + { + "epoch": 2.547095318668923, + "grad_norm": 1.305870532989502, + "learning_rate": 3.72673434856176e-05, + "loss": 0.7701, + "step": 4516 + }, + { + "epoch": 2.5476593344613647, + "grad_norm": 1.6668955087661743, + "learning_rate": 3.726452340665539e-05, + "loss": 0.7571, + "step": 4517 + }, + { + "epoch": 2.548223350253807, + "grad_norm": 2.7369344234466553, + "learning_rate": 3.726170332769318e-05, + "loss": 0.8161, + "step": 4518 + }, + { + "epoch": 2.5487873660462492, + "grad_norm": 1.9946810007095337, + "learning_rate": 3.7258883248730964e-05, + "loss": 0.8353, + "step": 4519 + }, + { + "epoch": 2.5493513818386915, + "grad_norm": 1.6999633312225342, + "learning_rate": 3.7256063169768756e-05, + "loss": 0.7595, + "step": 4520 + }, + { + "epoch": 2.5499153976311337, + "grad_norm": 1.1797558069229126, + "learning_rate": 3.725324309080654e-05, + "loss": 0.7458, + "step": 4521 + }, + { + "epoch": 2.550479413423576, + "grad_norm": 2.9479541778564453, + "learning_rate": 3.7250423011844334e-05, + "loss": 0.8509, + "step": 4522 + }, + { + "epoch": 2.5510434292160182, + "grad_norm": 2.1920359134674072, + "learning_rate": 3.7247602932882126e-05, + "loss": 0.8342, + "step": 4523 + }, + { + "epoch": 2.55160744500846, + "grad_norm": 1.4080787897109985, + "learning_rate": 3.724478285391991e-05, + "loss": 0.6926, + "step": 4524 + }, + { + "epoch": 2.5521714608009023, + "grad_norm": 2.0476467609405518, + "learning_rate": 3.7241962774957704e-05, + "loss": 0.8827, + "step": 4525 + }, + { + "epoch": 2.5527354765933445, + "grad_norm": 2.351870059967041, + "learning_rate": 3.723914269599549e-05, + "loss": 0.9259, + "step": 4526 + }, + { + "epoch": 2.553299492385787, + "grad_norm": 1.201342225074768, + "learning_rate": 3.723632261703328e-05, + "loss": 0.715, + "step": 4527 + }, + { + "epoch": 2.553863508178229, + "grad_norm": 1.060089349746704, + "learning_rate": 3.723350253807107e-05, + "loss": 0.7473, + "step": 4528 + }, + { + "epoch": 2.5544275239706713, + "grad_norm": 6.345795631408691, + "learning_rate": 3.723068245910886e-05, + "loss": 0.9511, + "step": 4529 + }, + { + "epoch": 2.5549915397631136, + "grad_norm": 1.3042365312576294, + "learning_rate": 3.7227862380146644e-05, + "loss": 0.7433, + "step": 4530 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 1.2402116060256958, + "learning_rate": 3.7225042301184437e-05, + "loss": 0.6272, + "step": 4531 + }, + { + "epoch": 2.5561195713479976, + "grad_norm": 2.0523345470428467, + "learning_rate": 3.722222222222222e-05, + "loss": 0.7823, + "step": 4532 + }, + { + "epoch": 2.55668358714044, + "grad_norm": 4.1411213874816895, + "learning_rate": 3.7219402143260014e-05, + "loss": 0.7398, + "step": 4533 + }, + { + "epoch": 2.557247602932882, + "grad_norm": 1.2283709049224854, + "learning_rate": 3.72165820642978e-05, + "loss": 0.7689, + "step": 4534 + }, + { + "epoch": 2.5578116187253244, + "grad_norm": 2.7027852535247803, + "learning_rate": 3.721376198533559e-05, + "loss": 0.7279, + "step": 4535 + }, + { + "epoch": 2.5583756345177666, + "grad_norm": 1.7566039562225342, + "learning_rate": 3.7210941906373384e-05, + "loss": 0.8282, + "step": 4536 + }, + { + "epoch": 2.558939650310209, + "grad_norm": 1.3697630167007446, + "learning_rate": 3.720812182741117e-05, + "loss": 0.6805, + "step": 4537 + }, + { + "epoch": 2.5595036661026507, + "grad_norm": 4.05826997756958, + "learning_rate": 3.7205301748448955e-05, + "loss": 0.7955, + "step": 4538 + }, + { + "epoch": 2.560067681895093, + "grad_norm": 1.564286470413208, + "learning_rate": 3.720248166948675e-05, + "loss": 0.8607, + "step": 4539 + }, + { + "epoch": 2.560631697687535, + "grad_norm": 1.8718783855438232, + "learning_rate": 3.719966159052454e-05, + "loss": 0.716, + "step": 4540 + }, + { + "epoch": 2.5611957134799774, + "grad_norm": 2.003221273422241, + "learning_rate": 3.7196841511562325e-05, + "loss": 0.8317, + "step": 4541 + }, + { + "epoch": 2.5617597292724197, + "grad_norm": 1.2901523113250732, + "learning_rate": 3.719402143260011e-05, + "loss": 0.6424, + "step": 4542 + }, + { + "epoch": 2.562323745064862, + "grad_norm": 1.827522873878479, + "learning_rate": 3.719120135363791e-05, + "loss": 0.7953, + "step": 4543 + }, + { + "epoch": 2.562887760857304, + "grad_norm": 2.3984920978546143, + "learning_rate": 3.7188381274675694e-05, + "loss": 0.8833, + "step": 4544 + }, + { + "epoch": 2.563451776649746, + "grad_norm": 1.210383653640747, + "learning_rate": 3.718556119571348e-05, + "loss": 0.7425, + "step": 4545 + }, + { + "epoch": 2.5640157924421882, + "grad_norm": 1.1481655836105347, + "learning_rate": 3.7182741116751265e-05, + "loss": 0.6537, + "step": 4546 + }, + { + "epoch": 2.5645798082346305, + "grad_norm": 1.3138822317123413, + "learning_rate": 3.7179921037789064e-05, + "loss": 0.7411, + "step": 4547 + }, + { + "epoch": 2.5651438240270727, + "grad_norm": 1.2678836584091187, + "learning_rate": 3.717710095882685e-05, + "loss": 0.7406, + "step": 4548 + }, + { + "epoch": 2.565707839819515, + "grad_norm": 1.0674917697906494, + "learning_rate": 3.7174280879864635e-05, + "loss": 0.7653, + "step": 4549 + }, + { + "epoch": 2.5662718556119573, + "grad_norm": 1.104506015777588, + "learning_rate": 3.717146080090243e-05, + "loss": 0.7405, + "step": 4550 + }, + { + "epoch": 2.5668358714043995, + "grad_norm": 1.3528512716293335, + "learning_rate": 3.716864072194022e-05, + "loss": 0.6111, + "step": 4551 + }, + { + "epoch": 2.5673998871968413, + "grad_norm": 1.2350969314575195, + "learning_rate": 3.7165820642978005e-05, + "loss": 0.7959, + "step": 4552 + }, + { + "epoch": 2.5679639029892836, + "grad_norm": 1.6141815185546875, + "learning_rate": 3.716300056401579e-05, + "loss": 0.7795, + "step": 4553 + }, + { + "epoch": 2.568527918781726, + "grad_norm": 1.5269044637680054, + "learning_rate": 3.716018048505358e-05, + "loss": 0.8032, + "step": 4554 + }, + { + "epoch": 2.569091934574168, + "grad_norm": 1.746118426322937, + "learning_rate": 3.7157360406091374e-05, + "loss": 0.9425, + "step": 4555 + }, + { + "epoch": 2.5696559503666103, + "grad_norm": 1.435403823852539, + "learning_rate": 3.715454032712916e-05, + "loss": 0.8102, + "step": 4556 + }, + { + "epoch": 2.5702199661590526, + "grad_norm": 1.0994175672531128, + "learning_rate": 3.715172024816695e-05, + "loss": 0.721, + "step": 4557 + }, + { + "epoch": 2.570783981951495, + "grad_norm": 1.3874207735061646, + "learning_rate": 3.7148900169204744e-05, + "loss": 0.6588, + "step": 4558 + }, + { + "epoch": 2.5713479977439366, + "grad_norm": 2.2331228256225586, + "learning_rate": 3.714608009024253e-05, + "loss": 0.923, + "step": 4559 + }, + { + "epoch": 2.571912013536379, + "grad_norm": 1.610399842262268, + "learning_rate": 3.7143260011280315e-05, + "loss": 0.7103, + "step": 4560 + }, + { + "epoch": 2.572476029328821, + "grad_norm": 1.3803101778030396, + "learning_rate": 3.714043993231811e-05, + "loss": 0.7549, + "step": 4561 + }, + { + "epoch": 2.5730400451212634, + "grad_norm": 2.0253281593322754, + "learning_rate": 3.71376198533559e-05, + "loss": 0.7873, + "step": 4562 + }, + { + "epoch": 2.5736040609137056, + "grad_norm": 1.9827250242233276, + "learning_rate": 3.7134799774393685e-05, + "loss": 0.846, + "step": 4563 + }, + { + "epoch": 2.574168076706148, + "grad_norm": 1.3576347827911377, + "learning_rate": 3.713197969543147e-05, + "loss": 0.82, + "step": 4564 + }, + { + "epoch": 2.57473209249859, + "grad_norm": 1.9680030345916748, + "learning_rate": 3.712915961646926e-05, + "loss": 0.853, + "step": 4565 + }, + { + "epoch": 2.575296108291032, + "grad_norm": 1.330319881439209, + "learning_rate": 3.7126339537507055e-05, + "loss": 0.8545, + "step": 4566 + }, + { + "epoch": 2.575860124083474, + "grad_norm": 1.5089925527572632, + "learning_rate": 3.712351945854484e-05, + "loss": 0.6668, + "step": 4567 + }, + { + "epoch": 2.5764241398759165, + "grad_norm": 1.1189707517623901, + "learning_rate": 3.712069937958263e-05, + "loss": 0.7634, + "step": 4568 + }, + { + "epoch": 2.5769881556683587, + "grad_norm": 0.9890088438987732, + "learning_rate": 3.711787930062042e-05, + "loss": 0.7206, + "step": 4569 + }, + { + "epoch": 2.577552171460801, + "grad_norm": 1.517849326133728, + "learning_rate": 3.711505922165821e-05, + "loss": 0.7507, + "step": 4570 + }, + { + "epoch": 2.578116187253243, + "grad_norm": 1.7770240306854248, + "learning_rate": 3.7112239142695995e-05, + "loss": 0.8238, + "step": 4571 + }, + { + "epoch": 2.5786802030456855, + "grad_norm": 1.7985609769821167, + "learning_rate": 3.710941906373379e-05, + "loss": 0.8379, + "step": 4572 + }, + { + "epoch": 2.5792442188381273, + "grad_norm": 0.9341646432876587, + "learning_rate": 3.710659898477157e-05, + "loss": 0.6901, + "step": 4573 + }, + { + "epoch": 2.5798082346305695, + "grad_norm": 1.9566086530685425, + "learning_rate": 3.7103778905809365e-05, + "loss": 0.7994, + "step": 4574 + }, + { + "epoch": 2.5803722504230118, + "grad_norm": 1.4560548067092896, + "learning_rate": 3.710095882684716e-05, + "loss": 0.6743, + "step": 4575 + }, + { + "epoch": 2.580936266215454, + "grad_norm": 2.445833683013916, + "learning_rate": 3.709813874788494e-05, + "loss": 0.824, + "step": 4576 + }, + { + "epoch": 2.5815002820078963, + "grad_norm": 1.2979217767715454, + "learning_rate": 3.709531866892273e-05, + "loss": 0.8223, + "step": 4577 + }, + { + "epoch": 2.5820642978003385, + "grad_norm": 3.026008367538452, + "learning_rate": 3.709249858996052e-05, + "loss": 0.9005, + "step": 4578 + }, + { + "epoch": 2.5826283135927808, + "grad_norm": 1.6206945180892944, + "learning_rate": 3.708967851099831e-05, + "loss": 0.724, + "step": 4579 + }, + { + "epoch": 2.5831923293852226, + "grad_norm": 1.4256489276885986, + "learning_rate": 3.70868584320361e-05, + "loss": 0.7859, + "step": 4580 + }, + { + "epoch": 2.583756345177665, + "grad_norm": 1.954579472541809, + "learning_rate": 3.708403835307388e-05, + "loss": 0.7248, + "step": 4581 + }, + { + "epoch": 2.584320360970107, + "grad_norm": 1.3714854717254639, + "learning_rate": 3.7081218274111675e-05, + "loss": 0.8481, + "step": 4582 + }, + { + "epoch": 2.5848843767625493, + "grad_norm": 5.37931489944458, + "learning_rate": 3.707839819514947e-05, + "loss": 0.8014, + "step": 4583 + }, + { + "epoch": 2.5854483925549916, + "grad_norm": 1.3871556520462036, + "learning_rate": 3.707557811618725e-05, + "loss": 0.7017, + "step": 4584 + }, + { + "epoch": 2.586012408347434, + "grad_norm": 2.3273770809173584, + "learning_rate": 3.707275803722504e-05, + "loss": 0.8564, + "step": 4585 + }, + { + "epoch": 2.586576424139876, + "grad_norm": 0.9982866644859314, + "learning_rate": 3.706993795826284e-05, + "loss": 0.6751, + "step": 4586 + }, + { + "epoch": 2.587140439932318, + "grad_norm": 1.3432340621948242, + "learning_rate": 3.706711787930062e-05, + "loss": 0.8465, + "step": 4587 + }, + { + "epoch": 2.58770445572476, + "grad_norm": 1.193343162536621, + "learning_rate": 3.706429780033841e-05, + "loss": 0.7855, + "step": 4588 + }, + { + "epoch": 2.5882684715172024, + "grad_norm": 1.6874557733535767, + "learning_rate": 3.70614777213762e-05, + "loss": 0.78, + "step": 4589 + }, + { + "epoch": 2.5888324873096447, + "grad_norm": 2.712373971939087, + "learning_rate": 3.705865764241399e-05, + "loss": 0.8732, + "step": 4590 + }, + { + "epoch": 2.589396503102087, + "grad_norm": 2.7662370204925537, + "learning_rate": 3.705583756345178e-05, + "loss": 0.8047, + "step": 4591 + }, + { + "epoch": 2.589960518894529, + "grad_norm": 1.1940042972564697, + "learning_rate": 3.705301748448956e-05, + "loss": 0.7147, + "step": 4592 + }, + { + "epoch": 2.5905245346869714, + "grad_norm": 1.6214615106582642, + "learning_rate": 3.705019740552736e-05, + "loss": 0.7248, + "step": 4593 + }, + { + "epoch": 2.591088550479413, + "grad_norm": 1.3393549919128418, + "learning_rate": 3.704737732656515e-05, + "loss": 0.7319, + "step": 4594 + }, + { + "epoch": 2.5916525662718555, + "grad_norm": 2.3323259353637695, + "learning_rate": 3.704455724760293e-05, + "loss": 0.8607, + "step": 4595 + }, + { + "epoch": 2.5922165820642977, + "grad_norm": 1.3652269840240479, + "learning_rate": 3.7041737168640725e-05, + "loss": 0.7649, + "step": 4596 + }, + { + "epoch": 2.59278059785674, + "grad_norm": 1.4414434432983398, + "learning_rate": 3.703891708967852e-05, + "loss": 0.7538, + "step": 4597 + }, + { + "epoch": 2.5933446136491822, + "grad_norm": 1.2097948789596558, + "learning_rate": 3.70360970107163e-05, + "loss": 0.8138, + "step": 4598 + }, + { + "epoch": 2.5939086294416245, + "grad_norm": 0.9947838187217712, + "learning_rate": 3.703327693175409e-05, + "loss": 0.621, + "step": 4599 + }, + { + "epoch": 2.5944726452340667, + "grad_norm": 1.915388584136963, + "learning_rate": 3.703045685279188e-05, + "loss": 0.787, + "step": 4600 + }, + { + "epoch": 2.5950366610265085, + "grad_norm": 1.2951912879943848, + "learning_rate": 3.702763677382967e-05, + "loss": 0.7798, + "step": 4601 + }, + { + "epoch": 2.595600676818951, + "grad_norm": 1.365172028541565, + "learning_rate": 3.702481669486746e-05, + "loss": 0.8351, + "step": 4602 + }, + { + "epoch": 2.596164692611393, + "grad_norm": 2.162311553955078, + "learning_rate": 3.7021996615905243e-05, + "loss": 0.7625, + "step": 4603 + }, + { + "epoch": 2.5967287084038353, + "grad_norm": 1.4394780397415161, + "learning_rate": 3.7019176536943036e-05, + "loss": 0.8793, + "step": 4604 + }, + { + "epoch": 2.5972927241962775, + "grad_norm": 1.094388723373413, + "learning_rate": 3.701635645798083e-05, + "loss": 0.6765, + "step": 4605 + }, + { + "epoch": 2.59785673998872, + "grad_norm": 1.5854887962341309, + "learning_rate": 3.701353637901861e-05, + "loss": 0.7972, + "step": 4606 + }, + { + "epoch": 2.598420755781162, + "grad_norm": 1.3338193893432617, + "learning_rate": 3.7010716300056405e-05, + "loss": 0.8442, + "step": 4607 + }, + { + "epoch": 2.598984771573604, + "grad_norm": 2.0029842853546143, + "learning_rate": 3.700789622109419e-05, + "loss": 0.8069, + "step": 4608 + }, + { + "epoch": 2.599548787366046, + "grad_norm": 1.1393506526947021, + "learning_rate": 3.700507614213198e-05, + "loss": 0.7913, + "step": 4609 + }, + { + "epoch": 2.6001128031584884, + "grad_norm": 1.0313498973846436, + "learning_rate": 3.700225606316977e-05, + "loss": 0.7395, + "step": 4610 + }, + { + "epoch": 2.6006768189509306, + "grad_norm": 0.9956133365631104, + "learning_rate": 3.699943598420756e-05, + "loss": 0.7668, + "step": 4611 + }, + { + "epoch": 2.601240834743373, + "grad_norm": 3.456251382827759, + "learning_rate": 3.6996615905245346e-05, + "loss": 0.8752, + "step": 4612 + }, + { + "epoch": 2.601804850535815, + "grad_norm": 1.4574917554855347, + "learning_rate": 3.699379582628314e-05, + "loss": 0.7335, + "step": 4613 + }, + { + "epoch": 2.6023688663282574, + "grad_norm": 2.4384708404541016, + "learning_rate": 3.699097574732093e-05, + "loss": 0.7783, + "step": 4614 + }, + { + "epoch": 2.602932882120699, + "grad_norm": 0.9754536747932434, + "learning_rate": 3.6988155668358716e-05, + "loss": 0.6861, + "step": 4615 + }, + { + "epoch": 2.6034968979131414, + "grad_norm": 1.6975810527801514, + "learning_rate": 3.69853355893965e-05, + "loss": 0.886, + "step": 4616 + }, + { + "epoch": 2.6040609137055837, + "grad_norm": 2.2842299938201904, + "learning_rate": 3.6982515510434293e-05, + "loss": 0.8278, + "step": 4617 + }, + { + "epoch": 2.604624929498026, + "grad_norm": 1.33036470413208, + "learning_rate": 3.6979695431472086e-05, + "loss": 0.6662, + "step": 4618 + }, + { + "epoch": 2.605188945290468, + "grad_norm": 1.1792320013046265, + "learning_rate": 3.697687535250987e-05, + "loss": 0.7321, + "step": 4619 + }, + { + "epoch": 2.6057529610829104, + "grad_norm": 1.7364267110824585, + "learning_rate": 3.6974055273547656e-05, + "loss": 0.7818, + "step": 4620 + }, + { + "epoch": 2.6063169768753527, + "grad_norm": 2.041741132736206, + "learning_rate": 3.697123519458545e-05, + "loss": 0.8388, + "step": 4621 + }, + { + "epoch": 2.6068809926677945, + "grad_norm": 1.9486713409423828, + "learning_rate": 3.696841511562324e-05, + "loss": 0.7429, + "step": 4622 + }, + { + "epoch": 2.6074450084602367, + "grad_norm": 1.2012414932250977, + "learning_rate": 3.6965595036661026e-05, + "loss": 0.7486, + "step": 4623 + }, + { + "epoch": 2.608009024252679, + "grad_norm": 1.6170802116394043, + "learning_rate": 3.696277495769881e-05, + "loss": 0.7853, + "step": 4624 + }, + { + "epoch": 2.6085730400451212, + "grad_norm": 5.732732772827148, + "learning_rate": 3.695995487873661e-05, + "loss": 0.8607, + "step": 4625 + }, + { + "epoch": 2.6091370558375635, + "grad_norm": 1.7548532485961914, + "learning_rate": 3.6957134799774396e-05, + "loss": 0.7422, + "step": 4626 + }, + { + "epoch": 2.6097010716300058, + "grad_norm": 0.9422653913497925, + "learning_rate": 3.695431472081218e-05, + "loss": 0.6525, + "step": 4627 + }, + { + "epoch": 2.610265087422448, + "grad_norm": 1.6762590408325195, + "learning_rate": 3.6951494641849974e-05, + "loss": 0.8521, + "step": 4628 + }, + { + "epoch": 2.61082910321489, + "grad_norm": 1.3471101522445679, + "learning_rate": 3.6948674562887766e-05, + "loss": 0.7197, + "step": 4629 + }, + { + "epoch": 2.611393119007332, + "grad_norm": 1.3684686422348022, + "learning_rate": 3.694585448392555e-05, + "loss": 0.6378, + "step": 4630 + }, + { + "epoch": 2.6119571347997743, + "grad_norm": 1.2661564350128174, + "learning_rate": 3.6943034404963337e-05, + "loss": 0.8319, + "step": 4631 + }, + { + "epoch": 2.6125211505922166, + "grad_norm": 1.826434850692749, + "learning_rate": 3.6940214326001136e-05, + "loss": 0.8995, + "step": 4632 + }, + { + "epoch": 2.613085166384659, + "grad_norm": 1.6537030935287476, + "learning_rate": 3.693739424703892e-05, + "loss": 0.7906, + "step": 4633 + }, + { + "epoch": 2.613649182177101, + "grad_norm": 2.4395904541015625, + "learning_rate": 3.6934574168076706e-05, + "loss": 0.6879, + "step": 4634 + }, + { + "epoch": 2.6142131979695433, + "grad_norm": 1.2260955572128296, + "learning_rate": 3.69317540891145e-05, + "loss": 0.7327, + "step": 4635 + }, + { + "epoch": 2.614777213761985, + "grad_norm": 1.685632586479187, + "learning_rate": 3.692893401015229e-05, + "loss": 0.7606, + "step": 4636 + }, + { + "epoch": 2.6153412295544274, + "grad_norm": 0.9311856031417847, + "learning_rate": 3.6926113931190076e-05, + "loss": 0.6039, + "step": 4637 + }, + { + "epoch": 2.6159052453468696, + "grad_norm": 2.346775531768799, + "learning_rate": 3.692329385222786e-05, + "loss": 0.7959, + "step": 4638 + }, + { + "epoch": 2.616469261139312, + "grad_norm": 2.0602867603302, + "learning_rate": 3.6920473773265654e-05, + "loss": 0.7525, + "step": 4639 + }, + { + "epoch": 2.617033276931754, + "grad_norm": 1.0458984375, + "learning_rate": 3.6917653694303446e-05, + "loss": 0.763, + "step": 4640 + }, + { + "epoch": 2.6175972927241964, + "grad_norm": 1.6811600923538208, + "learning_rate": 3.691483361534123e-05, + "loss": 0.9636, + "step": 4641 + }, + { + "epoch": 2.6181613085166386, + "grad_norm": 1.8494259119033813, + "learning_rate": 3.691201353637902e-05, + "loss": 0.7783, + "step": 4642 + }, + { + "epoch": 2.6187253243090804, + "grad_norm": 1.9967617988586426, + "learning_rate": 3.690919345741681e-05, + "loss": 0.7711, + "step": 4643 + }, + { + "epoch": 2.6192893401015227, + "grad_norm": 1.3370025157928467, + "learning_rate": 3.69063733784546e-05, + "loss": 0.7993, + "step": 4644 + }, + { + "epoch": 2.619853355893965, + "grad_norm": 1.0239025354385376, + "learning_rate": 3.6903553299492386e-05, + "loss": 0.7224, + "step": 4645 + }, + { + "epoch": 2.620417371686407, + "grad_norm": 1.0978318452835083, + "learning_rate": 3.690073322053018e-05, + "loss": 0.7011, + "step": 4646 + }, + { + "epoch": 2.6209813874788495, + "grad_norm": 1.3254905939102173, + "learning_rate": 3.6897913141567964e-05, + "loss": 0.6891, + "step": 4647 + }, + { + "epoch": 2.6215454032712917, + "grad_norm": 2.1350483894348145, + "learning_rate": 3.6895093062605756e-05, + "loss": 0.719, + "step": 4648 + }, + { + "epoch": 2.622109419063734, + "grad_norm": 2.230175495147705, + "learning_rate": 3.689227298364354e-05, + "loss": 0.85, + "step": 4649 + }, + { + "epoch": 2.6226734348561758, + "grad_norm": 3.1746273040771484, + "learning_rate": 3.6889452904681334e-05, + "loss": 0.9824, + "step": 4650 + }, + { + "epoch": 2.623237450648618, + "grad_norm": 1.8573102951049805, + "learning_rate": 3.688663282571912e-05, + "loss": 0.7602, + "step": 4651 + }, + { + "epoch": 2.6238014664410603, + "grad_norm": 3.0772714614868164, + "learning_rate": 3.688381274675691e-05, + "loss": 0.7054, + "step": 4652 + }, + { + "epoch": 2.6243654822335025, + "grad_norm": 1.2969262599945068, + "learning_rate": 3.6880992667794704e-05, + "loss": 0.7633, + "step": 4653 + }, + { + "epoch": 2.6249294980259448, + "grad_norm": 1.5970107316970825, + "learning_rate": 3.687817258883249e-05, + "loss": 0.7387, + "step": 4654 + }, + { + "epoch": 2.625493513818387, + "grad_norm": 1.8474440574645996, + "learning_rate": 3.6875352509870274e-05, + "loss": 0.8076, + "step": 4655 + }, + { + "epoch": 2.6260575296108293, + "grad_norm": 1.8688863515853882, + "learning_rate": 3.687253243090807e-05, + "loss": 1.0011, + "step": 4656 + }, + { + "epoch": 2.626621545403271, + "grad_norm": 1.501447081565857, + "learning_rate": 3.686971235194586e-05, + "loss": 0.6818, + "step": 4657 + }, + { + "epoch": 2.6271855611957133, + "grad_norm": 1.4446452856063843, + "learning_rate": 3.6866892272983644e-05, + "loss": 0.7136, + "step": 4658 + }, + { + "epoch": 2.6277495769881556, + "grad_norm": 1.4883445501327515, + "learning_rate": 3.686407219402143e-05, + "loss": 0.8667, + "step": 4659 + }, + { + "epoch": 2.628313592780598, + "grad_norm": 1.472672462463379, + "learning_rate": 3.686125211505922e-05, + "loss": 0.843, + "step": 4660 + }, + { + "epoch": 2.62887760857304, + "grad_norm": 4.091024398803711, + "learning_rate": 3.6858432036097014e-05, + "loss": 0.8522, + "step": 4661 + }, + { + "epoch": 2.6294416243654823, + "grad_norm": 1.8482105731964111, + "learning_rate": 3.68556119571348e-05, + "loss": 0.8279, + "step": 4662 + }, + { + "epoch": 2.6300056401579246, + "grad_norm": 1.4511030912399292, + "learning_rate": 3.6852791878172585e-05, + "loss": 0.6883, + "step": 4663 + }, + { + "epoch": 2.6305696559503664, + "grad_norm": 1.456647515296936, + "learning_rate": 3.6849971799210384e-05, + "loss": 0.7425, + "step": 4664 + }, + { + "epoch": 2.6311336717428087, + "grad_norm": 1.1067395210266113, + "learning_rate": 3.684715172024817e-05, + "loss": 0.7786, + "step": 4665 + }, + { + "epoch": 2.631697687535251, + "grad_norm": 2.279919147491455, + "learning_rate": 3.6844331641285955e-05, + "loss": 0.9251, + "step": 4666 + }, + { + "epoch": 2.632261703327693, + "grad_norm": 1.233009696006775, + "learning_rate": 3.684151156232375e-05, + "loss": 0.7083, + "step": 4667 + }, + { + "epoch": 2.6328257191201354, + "grad_norm": 1.311530351638794, + "learning_rate": 3.683869148336154e-05, + "loss": 0.6291, + "step": 4668 + }, + { + "epoch": 2.6333897349125777, + "grad_norm": 1.2077081203460693, + "learning_rate": 3.6835871404399324e-05, + "loss": 0.8274, + "step": 4669 + }, + { + "epoch": 2.63395375070502, + "grad_norm": 1.7157227993011475, + "learning_rate": 3.683305132543711e-05, + "loss": 0.6387, + "step": 4670 + }, + { + "epoch": 2.6345177664974617, + "grad_norm": 1.4724866151809692, + "learning_rate": 3.683023124647491e-05, + "loss": 0.867, + "step": 4671 + }, + { + "epoch": 2.635081782289904, + "grad_norm": 2.114405870437622, + "learning_rate": 3.6827411167512694e-05, + "loss": 0.891, + "step": 4672 + }, + { + "epoch": 2.6356457980823462, + "grad_norm": 2.073690176010132, + "learning_rate": 3.682459108855048e-05, + "loss": 0.7968, + "step": 4673 + }, + { + "epoch": 2.6362098138747885, + "grad_norm": 1.7069562673568726, + "learning_rate": 3.682177100958827e-05, + "loss": 0.8631, + "step": 4674 + }, + { + "epoch": 2.6367738296672307, + "grad_norm": 1.416678547859192, + "learning_rate": 3.6818950930626064e-05, + "loss": 0.7721, + "step": 4675 + }, + { + "epoch": 2.637337845459673, + "grad_norm": 2.094090461730957, + "learning_rate": 3.681613085166385e-05, + "loss": 0.8036, + "step": 4676 + }, + { + "epoch": 2.6379018612521152, + "grad_norm": 7.145820617675781, + "learning_rate": 3.6813310772701635e-05, + "loss": 0.8531, + "step": 4677 + }, + { + "epoch": 2.638465877044557, + "grad_norm": 1.4528104066848755, + "learning_rate": 3.681049069373943e-05, + "loss": 0.7391, + "step": 4678 + }, + { + "epoch": 2.6390298928369993, + "grad_norm": 1.9147250652313232, + "learning_rate": 3.680767061477722e-05, + "loss": 0.6893, + "step": 4679 + }, + { + "epoch": 2.6395939086294415, + "grad_norm": 2.248922348022461, + "learning_rate": 3.6804850535815005e-05, + "loss": 0.7961, + "step": 4680 + }, + { + "epoch": 2.640157924421884, + "grad_norm": 1.3542442321777344, + "learning_rate": 3.680203045685279e-05, + "loss": 0.7793, + "step": 4681 + }, + { + "epoch": 2.640721940214326, + "grad_norm": 1.1501243114471436, + "learning_rate": 3.679921037789058e-05, + "loss": 0.7436, + "step": 4682 + }, + { + "epoch": 2.6412859560067683, + "grad_norm": 1.750244140625, + "learning_rate": 3.6796390298928374e-05, + "loss": 0.7721, + "step": 4683 + }, + { + "epoch": 2.6418499717992106, + "grad_norm": 1.3017834424972534, + "learning_rate": 3.679357021996616e-05, + "loss": 0.6931, + "step": 4684 + }, + { + "epoch": 2.6424139875916524, + "grad_norm": 1.53607177734375, + "learning_rate": 3.679075014100395e-05, + "loss": 0.7869, + "step": 4685 + }, + { + "epoch": 2.6429780033840946, + "grad_norm": 1.4634170532226562, + "learning_rate": 3.678793006204174e-05, + "loss": 0.7221, + "step": 4686 + }, + { + "epoch": 2.643542019176537, + "grad_norm": 2.6918790340423584, + "learning_rate": 3.678510998307953e-05, + "loss": 0.8455, + "step": 4687 + }, + { + "epoch": 2.644106034968979, + "grad_norm": 1.8638315200805664, + "learning_rate": 3.6782289904117315e-05, + "loss": 0.7046, + "step": 4688 + }, + { + "epoch": 2.6446700507614214, + "grad_norm": 2.012871265411377, + "learning_rate": 3.677946982515511e-05, + "loss": 0.8149, + "step": 4689 + }, + { + "epoch": 2.6452340665538636, + "grad_norm": 1.744794487953186, + "learning_rate": 3.677664974619289e-05, + "loss": 0.867, + "step": 4690 + }, + { + "epoch": 2.645798082346306, + "grad_norm": 2.5556275844573975, + "learning_rate": 3.6773829667230685e-05, + "loss": 0.9985, + "step": 4691 + }, + { + "epoch": 2.6463620981387477, + "grad_norm": 3.797506332397461, + "learning_rate": 3.677100958826848e-05, + "loss": 0.9114, + "step": 4692 + }, + { + "epoch": 2.64692611393119, + "grad_norm": 0.9308549761772156, + "learning_rate": 3.676818950930626e-05, + "loss": 0.6785, + "step": 4693 + }, + { + "epoch": 2.647490129723632, + "grad_norm": 1.3943864107131958, + "learning_rate": 3.676536943034405e-05, + "loss": 0.7161, + "step": 4694 + }, + { + "epoch": 2.6480541455160744, + "grad_norm": 1.907315731048584, + "learning_rate": 3.676254935138184e-05, + "loss": 0.7974, + "step": 4695 + }, + { + "epoch": 2.6486181613085167, + "grad_norm": 1.2749361991882324, + "learning_rate": 3.675972927241963e-05, + "loss": 0.7364, + "step": 4696 + }, + { + "epoch": 2.649182177100959, + "grad_norm": 1.720593810081482, + "learning_rate": 3.675690919345742e-05, + "loss": 0.7628, + "step": 4697 + }, + { + "epoch": 2.649746192893401, + "grad_norm": 1.3939635753631592, + "learning_rate": 3.67540891144952e-05, + "loss": 0.7972, + "step": 4698 + }, + { + "epoch": 2.650310208685843, + "grad_norm": 1.6875163316726685, + "learning_rate": 3.6751269035532995e-05, + "loss": 0.7513, + "step": 4699 + }, + { + "epoch": 2.6508742244782852, + "grad_norm": 2.3541982173919678, + "learning_rate": 3.674844895657079e-05, + "loss": 0.7551, + "step": 4700 + }, + { + "epoch": 2.6514382402707275, + "grad_norm": 1.2357089519500732, + "learning_rate": 3.674562887760857e-05, + "loss": 0.777, + "step": 4701 + }, + { + "epoch": 2.6520022560631697, + "grad_norm": 1.2892489433288574, + "learning_rate": 3.6742808798646365e-05, + "loss": 0.7094, + "step": 4702 + }, + { + "epoch": 2.652566271855612, + "grad_norm": 1.4688773155212402, + "learning_rate": 3.673998871968416e-05, + "loss": 0.8231, + "step": 4703 + }, + { + "epoch": 2.6531302876480543, + "grad_norm": 0.9425245523452759, + "learning_rate": 3.673716864072194e-05, + "loss": 0.5854, + "step": 4704 + }, + { + "epoch": 2.6536943034404965, + "grad_norm": 1.5968906879425049, + "learning_rate": 3.673434856175973e-05, + "loss": 0.749, + "step": 4705 + }, + { + "epoch": 2.6542583192329383, + "grad_norm": 2.3606603145599365, + "learning_rate": 3.673152848279752e-05, + "loss": 0.7725, + "step": 4706 + }, + { + "epoch": 2.6548223350253806, + "grad_norm": 2.812427520751953, + "learning_rate": 3.672870840383531e-05, + "loss": 0.7984, + "step": 4707 + }, + { + "epoch": 2.655386350817823, + "grad_norm": 0.9766665697097778, + "learning_rate": 3.67258883248731e-05, + "loss": 0.7077, + "step": 4708 + }, + { + "epoch": 2.655950366610265, + "grad_norm": 1.9540870189666748, + "learning_rate": 3.672306824591088e-05, + "loss": 0.7709, + "step": 4709 + }, + { + "epoch": 2.6565143824027073, + "grad_norm": 1.3238272666931152, + "learning_rate": 3.672024816694868e-05, + "loss": 0.6882, + "step": 4710 + }, + { + "epoch": 2.6570783981951496, + "grad_norm": 1.1464872360229492, + "learning_rate": 3.671742808798647e-05, + "loss": 0.7098, + "step": 4711 + }, + { + "epoch": 2.657642413987592, + "grad_norm": 2.112374782562256, + "learning_rate": 3.671460800902425e-05, + "loss": 0.8256, + "step": 4712 + }, + { + "epoch": 2.6582064297800336, + "grad_norm": 1.1358901262283325, + "learning_rate": 3.671178793006204e-05, + "loss": 0.8203, + "step": 4713 + }, + { + "epoch": 2.658770445572476, + "grad_norm": 1.3006843328475952, + "learning_rate": 3.670896785109984e-05, + "loss": 0.8573, + "step": 4714 + }, + { + "epoch": 2.659334461364918, + "grad_norm": 1.53341805934906, + "learning_rate": 3.670614777213762e-05, + "loss": 0.7837, + "step": 4715 + }, + { + "epoch": 2.6598984771573604, + "grad_norm": 2.1158554553985596, + "learning_rate": 3.670332769317541e-05, + "loss": 0.7835, + "step": 4716 + }, + { + "epoch": 2.6604624929498026, + "grad_norm": 1.4463210105895996, + "learning_rate": 3.67005076142132e-05, + "loss": 0.75, + "step": 4717 + }, + { + "epoch": 2.661026508742245, + "grad_norm": 1.201442837715149, + "learning_rate": 3.669768753525099e-05, + "loss": 0.7255, + "step": 4718 + }, + { + "epoch": 2.661590524534687, + "grad_norm": 1.1322671175003052, + "learning_rate": 3.669486745628878e-05, + "loss": 0.6164, + "step": 4719 + }, + { + "epoch": 2.662154540327129, + "grad_norm": 1.167025089263916, + "learning_rate": 3.669204737732656e-05, + "loss": 0.7401, + "step": 4720 + }, + { + "epoch": 2.662718556119571, + "grad_norm": 1.8718875646591187, + "learning_rate": 3.6689227298364355e-05, + "loss": 0.8508, + "step": 4721 + }, + { + "epoch": 2.6632825719120135, + "grad_norm": 1.8064249753952026, + "learning_rate": 3.668640721940215e-05, + "loss": 0.7999, + "step": 4722 + }, + { + "epoch": 2.6638465877044557, + "grad_norm": 3.171867847442627, + "learning_rate": 3.668358714043993e-05, + "loss": 0.7891, + "step": 4723 + }, + { + "epoch": 2.664410603496898, + "grad_norm": 1.565495491027832, + "learning_rate": 3.6680767061477725e-05, + "loss": 0.8165, + "step": 4724 + }, + { + "epoch": 2.66497461928934, + "grad_norm": 1.7279266119003296, + "learning_rate": 3.667794698251551e-05, + "loss": 0.8061, + "step": 4725 + }, + { + "epoch": 2.6655386350817825, + "grad_norm": 1.1343978643417358, + "learning_rate": 3.66751269035533e-05, + "loss": 0.7822, + "step": 4726 + }, + { + "epoch": 2.6661026508742243, + "grad_norm": 1.3891886472702026, + "learning_rate": 3.667230682459109e-05, + "loss": 0.7672, + "step": 4727 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.3409398794174194, + "learning_rate": 3.666948674562888e-05, + "loss": 0.7696, + "step": 4728 + }, + { + "epoch": 2.6672306824591088, + "grad_norm": 1.520253300666809, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.7351, + "step": 4729 + }, + { + "epoch": 2.667794698251551, + "grad_norm": 1.110926628112793, + "learning_rate": 3.666384658770446e-05, + "loss": 0.7392, + "step": 4730 + }, + { + "epoch": 2.6683587140439933, + "grad_norm": 1.619615912437439, + "learning_rate": 3.666102650874224e-05, + "loss": 0.8085, + "step": 4731 + }, + { + "epoch": 2.6689227298364355, + "grad_norm": 1.1421959400177002, + "learning_rate": 3.6658206429780035e-05, + "loss": 0.6782, + "step": 4732 + }, + { + "epoch": 2.6694867456288778, + "grad_norm": 2.8419811725616455, + "learning_rate": 3.665538635081782e-05, + "loss": 0.7832, + "step": 4733 + }, + { + "epoch": 2.6700507614213196, + "grad_norm": 1.279409646987915, + "learning_rate": 3.665256627185561e-05, + "loss": 0.6256, + "step": 4734 + }, + { + "epoch": 2.670614777213762, + "grad_norm": 2.6669139862060547, + "learning_rate": 3.6649746192893405e-05, + "loss": 0.7592, + "step": 4735 + }, + { + "epoch": 2.671178793006204, + "grad_norm": 1.5251175165176392, + "learning_rate": 3.664692611393119e-05, + "loss": 0.7448, + "step": 4736 + }, + { + "epoch": 2.6717428087986463, + "grad_norm": 1.7346786260604858, + "learning_rate": 3.664410603496898e-05, + "loss": 0.805, + "step": 4737 + }, + { + "epoch": 2.6723068245910886, + "grad_norm": 1.3561097383499146, + "learning_rate": 3.664128595600677e-05, + "loss": 0.7455, + "step": 4738 + }, + { + "epoch": 2.672870840383531, + "grad_norm": 1.839356780052185, + "learning_rate": 3.663846587704456e-05, + "loss": 0.7226, + "step": 4739 + }, + { + "epoch": 2.673434856175973, + "grad_norm": 2.130901336669922, + "learning_rate": 3.6635645798082346e-05, + "loss": 1.0527, + "step": 4740 + }, + { + "epoch": 2.673998871968415, + "grad_norm": 2.832141637802124, + "learning_rate": 3.663282571912014e-05, + "loss": 0.7651, + "step": 4741 + }, + { + "epoch": 2.674562887760857, + "grad_norm": 1.287414789199829, + "learning_rate": 3.663000564015793e-05, + "loss": 0.7772, + "step": 4742 + }, + { + "epoch": 2.6751269035532994, + "grad_norm": 2.177854537963867, + "learning_rate": 3.6627185561195716e-05, + "loss": 0.8509, + "step": 4743 + }, + { + "epoch": 2.6756909193457417, + "grad_norm": 0.9712371230125427, + "learning_rate": 3.66243654822335e-05, + "loss": 0.7393, + "step": 4744 + }, + { + "epoch": 2.676254935138184, + "grad_norm": 1.4526687860488892, + "learning_rate": 3.662154540327129e-05, + "loss": 0.7537, + "step": 4745 + }, + { + "epoch": 2.676818950930626, + "grad_norm": 1.6999257802963257, + "learning_rate": 3.6618725324309085e-05, + "loss": 0.8299, + "step": 4746 + }, + { + "epoch": 2.6773829667230684, + "grad_norm": 1.6834956407546997, + "learning_rate": 3.661590524534687e-05, + "loss": 0.6639, + "step": 4747 + }, + { + "epoch": 2.67794698251551, + "grad_norm": 1.08090078830719, + "learning_rate": 3.6613085166384656e-05, + "loss": 0.7214, + "step": 4748 + }, + { + "epoch": 2.6785109983079525, + "grad_norm": 1.1259666681289673, + "learning_rate": 3.661026508742245e-05, + "loss": 0.751, + "step": 4749 + }, + { + "epoch": 2.6790750141003947, + "grad_norm": 1.965246319770813, + "learning_rate": 3.660744500846024e-05, + "loss": 0.8785, + "step": 4750 + }, + { + "epoch": 2.679639029892837, + "grad_norm": 2.4323227405548096, + "learning_rate": 3.6604624929498026e-05, + "loss": 0.9045, + "step": 4751 + }, + { + "epoch": 2.6802030456852792, + "grad_norm": 1.0573526620864868, + "learning_rate": 3.660180485053581e-05, + "loss": 0.7846, + "step": 4752 + }, + { + "epoch": 2.6807670614777215, + "grad_norm": 1.7993266582489014, + "learning_rate": 3.659898477157361e-05, + "loss": 0.737, + "step": 4753 + }, + { + "epoch": 2.6813310772701637, + "grad_norm": 1.3817867040634155, + "learning_rate": 3.6596164692611396e-05, + "loss": 0.7118, + "step": 4754 + }, + { + "epoch": 2.6818950930626055, + "grad_norm": 2.0204315185546875, + "learning_rate": 3.659334461364918e-05, + "loss": 0.7968, + "step": 4755 + }, + { + "epoch": 2.682459108855048, + "grad_norm": 1.3550137281417847, + "learning_rate": 3.659052453468697e-05, + "loss": 0.6987, + "step": 4756 + }, + { + "epoch": 2.68302312464749, + "grad_norm": 1.2286165952682495, + "learning_rate": 3.6587704455724766e-05, + "loss": 0.8557, + "step": 4757 + }, + { + "epoch": 2.6835871404399323, + "grad_norm": 1.8194241523742676, + "learning_rate": 3.658488437676255e-05, + "loss": 0.7905, + "step": 4758 + }, + { + "epoch": 2.6841511562323745, + "grad_norm": 1.4694240093231201, + "learning_rate": 3.6582064297800336e-05, + "loss": 0.8662, + "step": 4759 + }, + { + "epoch": 2.684715172024817, + "grad_norm": 1.8574243783950806, + "learning_rate": 3.657924421883813e-05, + "loss": 0.8454, + "step": 4760 + }, + { + "epoch": 2.685279187817259, + "grad_norm": 1.4293341636657715, + "learning_rate": 3.657642413987592e-05, + "loss": 0.8451, + "step": 4761 + }, + { + "epoch": 2.685843203609701, + "grad_norm": 1.5686882734298706, + "learning_rate": 3.6573604060913706e-05, + "loss": 0.8625, + "step": 4762 + }, + { + "epoch": 2.686407219402143, + "grad_norm": 1.1168296337127686, + "learning_rate": 3.65707839819515e-05, + "loss": 0.6936, + "step": 4763 + }, + { + "epoch": 2.6869712351945854, + "grad_norm": 1.2932920455932617, + "learning_rate": 3.6567963902989284e-05, + "loss": 0.7699, + "step": 4764 + }, + { + "epoch": 2.6875352509870276, + "grad_norm": 2.46457839012146, + "learning_rate": 3.6565143824027076e-05, + "loss": 0.8839, + "step": 4765 + }, + { + "epoch": 2.68809926677947, + "grad_norm": 1.0887490510940552, + "learning_rate": 3.656232374506486e-05, + "loss": 0.7278, + "step": 4766 + }, + { + "epoch": 2.688663282571912, + "grad_norm": 1.1571754217147827, + "learning_rate": 3.6559503666102654e-05, + "loss": 0.8038, + "step": 4767 + }, + { + "epoch": 2.6892272983643544, + "grad_norm": 1.664354920387268, + "learning_rate": 3.655668358714044e-05, + "loss": 0.7643, + "step": 4768 + }, + { + "epoch": 2.689791314156796, + "grad_norm": 1.7875900268554688, + "learning_rate": 3.655386350817823e-05, + "loss": 0.827, + "step": 4769 + }, + { + "epoch": 2.6903553299492384, + "grad_norm": 1.9490115642547607, + "learning_rate": 3.6551043429216017e-05, + "loss": 0.8324, + "step": 4770 + }, + { + "epoch": 2.6909193457416807, + "grad_norm": 2.103599786758423, + "learning_rate": 3.654822335025381e-05, + "loss": 0.8699, + "step": 4771 + }, + { + "epoch": 2.691483361534123, + "grad_norm": 1.9119923114776611, + "learning_rate": 3.65454032712916e-05, + "loss": 0.8194, + "step": 4772 + }, + { + "epoch": 2.692047377326565, + "grad_norm": 1.6038916110992432, + "learning_rate": 3.6542583192329386e-05, + "loss": 0.7735, + "step": 4773 + }, + { + "epoch": 2.6926113931190074, + "grad_norm": 1.6838961839675903, + "learning_rate": 3.653976311336718e-05, + "loss": 0.8606, + "step": 4774 + }, + { + "epoch": 2.6931754089114497, + "grad_norm": 1.0292549133300781, + "learning_rate": 3.6536943034404964e-05, + "loss": 0.6966, + "step": 4775 + }, + { + "epoch": 2.6937394247038915, + "grad_norm": 2.778895616531372, + "learning_rate": 3.6534122955442756e-05, + "loss": 0.8226, + "step": 4776 + }, + { + "epoch": 2.6943034404963337, + "grad_norm": 1.069286823272705, + "learning_rate": 3.653130287648054e-05, + "loss": 0.7407, + "step": 4777 + }, + { + "epoch": 2.694867456288776, + "grad_norm": 1.7285075187683105, + "learning_rate": 3.6528482797518334e-05, + "loss": 0.7431, + "step": 4778 + }, + { + "epoch": 2.6954314720812182, + "grad_norm": 1.0734556913375854, + "learning_rate": 3.652566271855612e-05, + "loss": 0.7778, + "step": 4779 + }, + { + "epoch": 2.6959954878736605, + "grad_norm": 1.8001697063446045, + "learning_rate": 3.652284263959391e-05, + "loss": 0.8094, + "step": 4780 + }, + { + "epoch": 2.6965595036661028, + "grad_norm": 1.4700651168823242, + "learning_rate": 3.6520022560631703e-05, + "loss": 0.834, + "step": 4781 + }, + { + "epoch": 2.697123519458545, + "grad_norm": 1.2352668046951294, + "learning_rate": 3.651720248166949e-05, + "loss": 0.7999, + "step": 4782 + }, + { + "epoch": 2.697687535250987, + "grad_norm": 1.1096389293670654, + "learning_rate": 3.6514382402707274e-05, + "loss": 0.7397, + "step": 4783 + }, + { + "epoch": 2.698251551043429, + "grad_norm": 1.3011687994003296, + "learning_rate": 3.6511562323745066e-05, + "loss": 0.7263, + "step": 4784 + }, + { + "epoch": 2.6988155668358713, + "grad_norm": 1.200297474861145, + "learning_rate": 3.650874224478286e-05, + "loss": 0.7833, + "step": 4785 + }, + { + "epoch": 2.6993795826283136, + "grad_norm": 1.0228639841079712, + "learning_rate": 3.6505922165820644e-05, + "loss": 0.6892, + "step": 4786 + }, + { + "epoch": 2.699943598420756, + "grad_norm": 2.4413199424743652, + "learning_rate": 3.650310208685843e-05, + "loss": 0.8274, + "step": 4787 + }, + { + "epoch": 2.700507614213198, + "grad_norm": 1.438771367073059, + "learning_rate": 3.650028200789622e-05, + "loss": 0.8133, + "step": 4788 + }, + { + "epoch": 2.7010716300056403, + "grad_norm": 2.6391844749450684, + "learning_rate": 3.6497461928934014e-05, + "loss": 0.8728, + "step": 4789 + }, + { + "epoch": 2.701635645798082, + "grad_norm": 1.0734612941741943, + "learning_rate": 3.64946418499718e-05, + "loss": 0.7594, + "step": 4790 + }, + { + "epoch": 2.7021996615905244, + "grad_norm": 1.6049035787582397, + "learning_rate": 3.6491821771009585e-05, + "loss": 0.77, + "step": 4791 + }, + { + "epoch": 2.7027636773829666, + "grad_norm": 1.2592662572860718, + "learning_rate": 3.6489001692047384e-05, + "loss": 0.8309, + "step": 4792 + }, + { + "epoch": 2.703327693175409, + "grad_norm": 1.468422770500183, + "learning_rate": 3.648618161308517e-05, + "loss": 0.831, + "step": 4793 + }, + { + "epoch": 2.703891708967851, + "grad_norm": 1.1936346292495728, + "learning_rate": 3.6483361534122954e-05, + "loss": 0.7456, + "step": 4794 + }, + { + "epoch": 2.7044557247602934, + "grad_norm": 1.4792077541351318, + "learning_rate": 3.648054145516075e-05, + "loss": 0.6852, + "step": 4795 + }, + { + "epoch": 2.7050197405527356, + "grad_norm": 1.1128387451171875, + "learning_rate": 3.647772137619854e-05, + "loss": 0.643, + "step": 4796 + }, + { + "epoch": 2.7055837563451774, + "grad_norm": 1.3294575214385986, + "learning_rate": 3.6474901297236324e-05, + "loss": 0.8732, + "step": 4797 + }, + { + "epoch": 2.7061477721376197, + "grad_norm": 1.1272259950637817, + "learning_rate": 3.647208121827411e-05, + "loss": 0.6358, + "step": 4798 + }, + { + "epoch": 2.706711787930062, + "grad_norm": 1.5210726261138916, + "learning_rate": 3.64692611393119e-05, + "loss": 0.778, + "step": 4799 + }, + { + "epoch": 2.707275803722504, + "grad_norm": 1.1590478420257568, + "learning_rate": 3.6466441060349694e-05, + "loss": 0.6539, + "step": 4800 + }, + { + "epoch": 2.7078398195149465, + "grad_norm": 1.3415002822875977, + "learning_rate": 3.646362098138748e-05, + "loss": 0.7655, + "step": 4801 + }, + { + "epoch": 2.7084038353073887, + "grad_norm": 1.5042321681976318, + "learning_rate": 3.646080090242527e-05, + "loss": 0.8031, + "step": 4802 + }, + { + "epoch": 2.708967851099831, + "grad_norm": 1.1706874370574951, + "learning_rate": 3.645798082346306e-05, + "loss": 0.6903, + "step": 4803 + }, + { + "epoch": 2.7095318668922728, + "grad_norm": 2.181605577468872, + "learning_rate": 3.645516074450085e-05, + "loss": 0.7528, + "step": 4804 + }, + { + "epoch": 2.710095882684715, + "grad_norm": 1.454461693763733, + "learning_rate": 3.6452340665538635e-05, + "loss": 0.7755, + "step": 4805 + }, + { + "epoch": 2.7106598984771573, + "grad_norm": 2.602041244506836, + "learning_rate": 3.644952058657643e-05, + "loss": 0.8146, + "step": 4806 + }, + { + "epoch": 2.7112239142695995, + "grad_norm": 1.4601901769638062, + "learning_rate": 3.644670050761422e-05, + "loss": 0.8578, + "step": 4807 + }, + { + "epoch": 2.7117879300620418, + "grad_norm": 1.3499191999435425, + "learning_rate": 3.6443880428652004e-05, + "loss": 0.8104, + "step": 4808 + }, + { + "epoch": 2.712351945854484, + "grad_norm": 1.6228779554367065, + "learning_rate": 3.644106034968979e-05, + "loss": 0.8258, + "step": 4809 + }, + { + "epoch": 2.7129159616469263, + "grad_norm": 2.402904510498047, + "learning_rate": 3.643824027072758e-05, + "loss": 0.835, + "step": 4810 + }, + { + "epoch": 2.713479977439368, + "grad_norm": 1.1833679676055908, + "learning_rate": 3.6435420191765374e-05, + "loss": 0.6456, + "step": 4811 + }, + { + "epoch": 2.7140439932318103, + "grad_norm": 1.3513455390930176, + "learning_rate": 3.643260011280316e-05, + "loss": 0.6968, + "step": 4812 + }, + { + "epoch": 2.7146080090242526, + "grad_norm": 1.5595911741256714, + "learning_rate": 3.642978003384095e-05, + "loss": 0.8505, + "step": 4813 + }, + { + "epoch": 2.715172024816695, + "grad_norm": 1.291529893875122, + "learning_rate": 3.642695995487874e-05, + "loss": 0.727, + "step": 4814 + }, + { + "epoch": 2.715736040609137, + "grad_norm": 1.1704567670822144, + "learning_rate": 3.642413987591653e-05, + "loss": 0.7782, + "step": 4815 + }, + { + "epoch": 2.7163000564015793, + "grad_norm": 1.6051980257034302, + "learning_rate": 3.6421319796954315e-05, + "loss": 0.7752, + "step": 4816 + }, + { + "epoch": 2.7168640721940216, + "grad_norm": 1.3038668632507324, + "learning_rate": 3.641849971799211e-05, + "loss": 0.7245, + "step": 4817 + }, + { + "epoch": 2.7174280879864634, + "grad_norm": 1.4800615310668945, + "learning_rate": 3.641567963902989e-05, + "loss": 0.7319, + "step": 4818 + }, + { + "epoch": 2.7179921037789057, + "grad_norm": 1.552430272102356, + "learning_rate": 3.6412859560067685e-05, + "loss": 0.7552, + "step": 4819 + }, + { + "epoch": 2.718556119571348, + "grad_norm": 1.5594757795333862, + "learning_rate": 3.641003948110548e-05, + "loss": 0.7125, + "step": 4820 + }, + { + "epoch": 2.71912013536379, + "grad_norm": 1.5068689584732056, + "learning_rate": 3.640721940214326e-05, + "loss": 0.7526, + "step": 4821 + }, + { + "epoch": 2.7196841511562324, + "grad_norm": 1.0000568628311157, + "learning_rate": 3.640439932318105e-05, + "loss": 0.6553, + "step": 4822 + }, + { + "epoch": 2.7202481669486747, + "grad_norm": 1.985864281654358, + "learning_rate": 3.640157924421884e-05, + "loss": 0.8143, + "step": 4823 + }, + { + "epoch": 2.720812182741117, + "grad_norm": 1.1955105066299438, + "learning_rate": 3.639875916525663e-05, + "loss": 0.7005, + "step": 4824 + }, + { + "epoch": 2.7213761985335587, + "grad_norm": 1.403176188468933, + "learning_rate": 3.639593908629442e-05, + "loss": 0.6587, + "step": 4825 + }, + { + "epoch": 2.721940214326001, + "grad_norm": 3.133958101272583, + "learning_rate": 3.63931190073322e-05, + "loss": 0.7885, + "step": 4826 + }, + { + "epoch": 2.7225042301184432, + "grad_norm": 1.9671103954315186, + "learning_rate": 3.6390298928369995e-05, + "loss": 0.8546, + "step": 4827 + }, + { + "epoch": 2.7230682459108855, + "grad_norm": 1.2883493900299072, + "learning_rate": 3.638747884940779e-05, + "loss": 0.6697, + "step": 4828 + }, + { + "epoch": 2.7236322617033277, + "grad_norm": 1.776261806488037, + "learning_rate": 3.638465877044557e-05, + "loss": 0.7252, + "step": 4829 + }, + { + "epoch": 2.72419627749577, + "grad_norm": 1.1979544162750244, + "learning_rate": 3.638183869148336e-05, + "loss": 0.7789, + "step": 4830 + }, + { + "epoch": 2.7247602932882122, + "grad_norm": 4.410787582397461, + "learning_rate": 3.637901861252116e-05, + "loss": 0.8453, + "step": 4831 + }, + { + "epoch": 2.725324309080654, + "grad_norm": 0.983568012714386, + "learning_rate": 3.637619853355894e-05, + "loss": 0.6879, + "step": 4832 + }, + { + "epoch": 2.7258883248730963, + "grad_norm": 1.0583457946777344, + "learning_rate": 3.637337845459673e-05, + "loss": 0.7228, + "step": 4833 + }, + { + "epoch": 2.7264523406655385, + "grad_norm": 1.1706205606460571, + "learning_rate": 3.637055837563452e-05, + "loss": 0.7011, + "step": 4834 + }, + { + "epoch": 2.727016356457981, + "grad_norm": 2.0285284519195557, + "learning_rate": 3.636773829667231e-05, + "loss": 0.8491, + "step": 4835 + }, + { + "epoch": 2.727580372250423, + "grad_norm": 2.684518575668335, + "learning_rate": 3.63649182177101e-05, + "loss": 0.8619, + "step": 4836 + }, + { + "epoch": 2.7281443880428653, + "grad_norm": 1.043033480644226, + "learning_rate": 3.636209813874788e-05, + "loss": 0.7217, + "step": 4837 + }, + { + "epoch": 2.7287084038353075, + "grad_norm": 1.0603502988815308, + "learning_rate": 3.6359278059785675e-05, + "loss": 0.8555, + "step": 4838 + }, + { + "epoch": 2.7292724196277494, + "grad_norm": 1.536653757095337, + "learning_rate": 3.635645798082347e-05, + "loss": 0.7601, + "step": 4839 + }, + { + "epoch": 2.7298364354201916, + "grad_norm": 1.4246833324432373, + "learning_rate": 3.635363790186125e-05, + "loss": 0.7361, + "step": 4840 + }, + { + "epoch": 2.730400451212634, + "grad_norm": 1.3773049116134644, + "learning_rate": 3.6350817822899045e-05, + "loss": 0.7104, + "step": 4841 + }, + { + "epoch": 2.730964467005076, + "grad_norm": 1.9376276731491089, + "learning_rate": 3.634799774393684e-05, + "loss": 0.8948, + "step": 4842 + }, + { + "epoch": 2.7315284827975184, + "grad_norm": 3.940323829650879, + "learning_rate": 3.634517766497462e-05, + "loss": 0.9217, + "step": 4843 + }, + { + "epoch": 2.7320924985899606, + "grad_norm": 1.992957592010498, + "learning_rate": 3.634235758601241e-05, + "loss": 0.7727, + "step": 4844 + }, + { + "epoch": 2.732656514382403, + "grad_norm": 1.257796287536621, + "learning_rate": 3.63395375070502e-05, + "loss": 0.7939, + "step": 4845 + }, + { + "epoch": 2.7332205301748447, + "grad_norm": 2.0824077129364014, + "learning_rate": 3.633671742808799e-05, + "loss": 0.9127, + "step": 4846 + }, + { + "epoch": 2.733784545967287, + "grad_norm": 1.2232555150985718, + "learning_rate": 3.633389734912578e-05, + "loss": 0.6154, + "step": 4847 + }, + { + "epoch": 2.734348561759729, + "grad_norm": 1.4366039037704468, + "learning_rate": 3.633107727016356e-05, + "loss": 0.7886, + "step": 4848 + }, + { + "epoch": 2.7349125775521714, + "grad_norm": 1.3861265182495117, + "learning_rate": 3.6328257191201355e-05, + "loss": 0.7804, + "step": 4849 + }, + { + "epoch": 2.7354765933446137, + "grad_norm": 1.4197280406951904, + "learning_rate": 3.632543711223915e-05, + "loss": 0.8479, + "step": 4850 + }, + { + "epoch": 2.736040609137056, + "grad_norm": 2.3899614810943604, + "learning_rate": 3.632261703327693e-05, + "loss": 0.7023, + "step": 4851 + }, + { + "epoch": 2.736604624929498, + "grad_norm": 1.766207218170166, + "learning_rate": 3.6319796954314725e-05, + "loss": 0.7902, + "step": 4852 + }, + { + "epoch": 2.73716864072194, + "grad_norm": 1.3264029026031494, + "learning_rate": 3.631697687535251e-05, + "loss": 0.7129, + "step": 4853 + }, + { + "epoch": 2.7377326565143822, + "grad_norm": 2.07415509223938, + "learning_rate": 3.63141567963903e-05, + "loss": 0.6059, + "step": 4854 + }, + { + "epoch": 2.7382966723068245, + "grad_norm": 2.6059484481811523, + "learning_rate": 3.631133671742809e-05, + "loss": 0.8658, + "step": 4855 + }, + { + "epoch": 2.7388606880992667, + "grad_norm": 1.0388861894607544, + "learning_rate": 3.630851663846588e-05, + "loss": 0.7749, + "step": 4856 + }, + { + "epoch": 2.739424703891709, + "grad_norm": 1.7167870998382568, + "learning_rate": 3.6305696559503666e-05, + "loss": 0.7573, + "step": 4857 + }, + { + "epoch": 2.7399887196841513, + "grad_norm": 1.724664568901062, + "learning_rate": 3.630287648054146e-05, + "loss": 0.7587, + "step": 4858 + }, + { + "epoch": 2.7405527354765935, + "grad_norm": 1.4371967315673828, + "learning_rate": 3.630005640157925e-05, + "loss": 0.7693, + "step": 4859 + }, + { + "epoch": 2.7411167512690353, + "grad_norm": 1.1628514528274536, + "learning_rate": 3.6297236322617035e-05, + "loss": 0.6906, + "step": 4860 + }, + { + "epoch": 2.7416807670614776, + "grad_norm": 0.9979652762413025, + "learning_rate": 3.629441624365482e-05, + "loss": 0.7382, + "step": 4861 + }, + { + "epoch": 2.74224478285392, + "grad_norm": 1.2774518728256226, + "learning_rate": 3.629159616469261e-05, + "loss": 0.6681, + "step": 4862 + }, + { + "epoch": 2.742808798646362, + "grad_norm": 2.1332929134368896, + "learning_rate": 3.6288776085730405e-05, + "loss": 0.9374, + "step": 4863 + }, + { + "epoch": 2.7433728144388043, + "grad_norm": 0.9630194306373596, + "learning_rate": 3.628595600676819e-05, + "loss": 0.6072, + "step": 4864 + }, + { + "epoch": 2.7439368302312466, + "grad_norm": 3.1516904830932617, + "learning_rate": 3.6283135927805976e-05, + "loss": 0.8968, + "step": 4865 + }, + { + "epoch": 2.744500846023689, + "grad_norm": 1.226872444152832, + "learning_rate": 3.628031584884377e-05, + "loss": 0.6733, + "step": 4866 + }, + { + "epoch": 2.7450648618161306, + "grad_norm": 1.3219091892242432, + "learning_rate": 3.627749576988156e-05, + "loss": 0.7378, + "step": 4867 + }, + { + "epoch": 2.745628877608573, + "grad_norm": 2.027543783187866, + "learning_rate": 3.6274675690919346e-05, + "loss": 0.8442, + "step": 4868 + }, + { + "epoch": 2.746192893401015, + "grad_norm": 2.9311633110046387, + "learning_rate": 3.627185561195713e-05, + "loss": 0.7183, + "step": 4869 + }, + { + "epoch": 2.7467569091934574, + "grad_norm": 1.5934617519378662, + "learning_rate": 3.626903553299493e-05, + "loss": 0.7089, + "step": 4870 + }, + { + "epoch": 2.7473209249858996, + "grad_norm": 1.8219237327575684, + "learning_rate": 3.6266215454032715e-05, + "loss": 0.8136, + "step": 4871 + }, + { + "epoch": 2.747884940778342, + "grad_norm": 1.5109699964523315, + "learning_rate": 3.62633953750705e-05, + "loss": 0.8428, + "step": 4872 + }, + { + "epoch": 2.748448956570784, + "grad_norm": 2.227489471435547, + "learning_rate": 3.626057529610829e-05, + "loss": 0.725, + "step": 4873 + }, + { + "epoch": 2.749012972363226, + "grad_norm": 1.9843955039978027, + "learning_rate": 3.6257755217146085e-05, + "loss": 0.7352, + "step": 4874 + }, + { + "epoch": 2.749576988155668, + "grad_norm": 2.416576862335205, + "learning_rate": 3.625493513818387e-05, + "loss": 0.7315, + "step": 4875 + }, + { + "epoch": 2.7501410039481105, + "grad_norm": 1.0707536935806274, + "learning_rate": 3.6252115059221656e-05, + "loss": 0.8344, + "step": 4876 + }, + { + "epoch": 2.7507050197405527, + "grad_norm": 1.2184795141220093, + "learning_rate": 3.6249294980259455e-05, + "loss": 0.754, + "step": 4877 + }, + { + "epoch": 2.751269035532995, + "grad_norm": 1.4984766244888306, + "learning_rate": 3.624647490129724e-05, + "loss": 0.7864, + "step": 4878 + }, + { + "epoch": 2.751833051325437, + "grad_norm": 1.4933706521987915, + "learning_rate": 3.6243654822335026e-05, + "loss": 0.7292, + "step": 4879 + }, + { + "epoch": 2.7523970671178795, + "grad_norm": 1.3239473104476929, + "learning_rate": 3.624083474337281e-05, + "loss": 0.7875, + "step": 4880 + }, + { + "epoch": 2.7529610829103213, + "grad_norm": 1.259231686592102, + "learning_rate": 3.623801466441061e-05, + "loss": 0.6874, + "step": 4881 + }, + { + "epoch": 2.7535250987027635, + "grad_norm": 1.0620059967041016, + "learning_rate": 3.6235194585448396e-05, + "loss": 0.6366, + "step": 4882 + }, + { + "epoch": 2.7540891144952058, + "grad_norm": 1.0250835418701172, + "learning_rate": 3.623237450648618e-05, + "loss": 0.73, + "step": 4883 + }, + { + "epoch": 2.754653130287648, + "grad_norm": 1.4987821578979492, + "learning_rate": 3.622955442752397e-05, + "loss": 0.8503, + "step": 4884 + }, + { + "epoch": 2.7552171460800903, + "grad_norm": 1.8129056692123413, + "learning_rate": 3.6226734348561765e-05, + "loss": 0.7714, + "step": 4885 + }, + { + "epoch": 2.7557811618725325, + "grad_norm": 1.4240896701812744, + "learning_rate": 3.622391426959955e-05, + "loss": 0.8237, + "step": 4886 + }, + { + "epoch": 2.7563451776649748, + "grad_norm": 1.9270573854446411, + "learning_rate": 3.6221094190637336e-05, + "loss": 0.8202, + "step": 4887 + }, + { + "epoch": 2.7569091934574166, + "grad_norm": 0.9132212996482849, + "learning_rate": 3.621827411167513e-05, + "loss": 0.6449, + "step": 4888 + }, + { + "epoch": 2.757473209249859, + "grad_norm": 2.462268590927124, + "learning_rate": 3.621545403271292e-05, + "loss": 0.8911, + "step": 4889 + }, + { + "epoch": 2.758037225042301, + "grad_norm": 2.783615827560425, + "learning_rate": 3.6212633953750706e-05, + "loss": 0.8986, + "step": 4890 + }, + { + "epoch": 2.7586012408347433, + "grad_norm": 1.9069323539733887, + "learning_rate": 3.62098138747885e-05, + "loss": 0.7809, + "step": 4891 + }, + { + "epoch": 2.7591652566271856, + "grad_norm": 1.431980848312378, + "learning_rate": 3.6206993795826284e-05, + "loss": 0.7531, + "step": 4892 + }, + { + "epoch": 2.759729272419628, + "grad_norm": 1.5990875959396362, + "learning_rate": 3.6204173716864076e-05, + "loss": 0.8302, + "step": 4893 + }, + { + "epoch": 2.76029328821207, + "grad_norm": 1.5678685903549194, + "learning_rate": 3.620135363790186e-05, + "loss": 0.9178, + "step": 4894 + }, + { + "epoch": 2.760857304004512, + "grad_norm": 1.6040995121002197, + "learning_rate": 3.619853355893965e-05, + "loss": 0.7161, + "step": 4895 + }, + { + "epoch": 2.761421319796954, + "grad_norm": 1.2681264877319336, + "learning_rate": 3.619571347997744e-05, + "loss": 0.754, + "step": 4896 + }, + { + "epoch": 2.7619853355893964, + "grad_norm": 1.8002510070800781, + "learning_rate": 3.619289340101523e-05, + "loss": 0.7399, + "step": 4897 + }, + { + "epoch": 2.7625493513818387, + "grad_norm": 1.4495959281921387, + "learning_rate": 3.6190073322053016e-05, + "loss": 0.793, + "step": 4898 + }, + { + "epoch": 2.763113367174281, + "grad_norm": 1.0946542024612427, + "learning_rate": 3.618725324309081e-05, + "loss": 0.7518, + "step": 4899 + }, + { + "epoch": 2.763677382966723, + "grad_norm": 1.3441277742385864, + "learning_rate": 3.6184433164128594e-05, + "loss": 0.7955, + "step": 4900 + }, + { + "epoch": 2.7642413987591654, + "grad_norm": 1.01449716091156, + "learning_rate": 3.6181613085166386e-05, + "loss": 0.6297, + "step": 4901 + }, + { + "epoch": 2.764805414551607, + "grad_norm": 1.0519548654556274, + "learning_rate": 3.617879300620418e-05, + "loss": 0.7016, + "step": 4902 + }, + { + "epoch": 2.7653694303440495, + "grad_norm": 1.3649752140045166, + "learning_rate": 3.6175972927241964e-05, + "loss": 0.7319, + "step": 4903 + }, + { + "epoch": 2.7659334461364917, + "grad_norm": 0.9486994743347168, + "learning_rate": 3.617315284827975e-05, + "loss": 0.6948, + "step": 4904 + }, + { + "epoch": 2.766497461928934, + "grad_norm": 1.0837254524230957, + "learning_rate": 3.617033276931754e-05, + "loss": 0.7286, + "step": 4905 + }, + { + "epoch": 2.7670614777213762, + "grad_norm": 1.3511337041854858, + "learning_rate": 3.6167512690355334e-05, + "loss": 0.7483, + "step": 4906 + }, + { + "epoch": 2.7676254935138185, + "grad_norm": 1.0248892307281494, + "learning_rate": 3.616469261139312e-05, + "loss": 0.6775, + "step": 4907 + }, + { + "epoch": 2.7681895093062607, + "grad_norm": 1.9071098566055298, + "learning_rate": 3.6161872532430904e-05, + "loss": 0.8369, + "step": 4908 + }, + { + "epoch": 2.7687535250987025, + "grad_norm": 2.407444477081299, + "learning_rate": 3.61590524534687e-05, + "loss": 0.8363, + "step": 4909 + }, + { + "epoch": 2.769317540891145, + "grad_norm": 1.3625481128692627, + "learning_rate": 3.615623237450649e-05, + "loss": 0.8251, + "step": 4910 + }, + { + "epoch": 2.769881556683587, + "grad_norm": 4.180976390838623, + "learning_rate": 3.6153412295544274e-05, + "loss": 0.9151, + "step": 4911 + }, + { + "epoch": 2.7704455724760293, + "grad_norm": 1.3116743564605713, + "learning_rate": 3.6150592216582066e-05, + "loss": 0.7352, + "step": 4912 + }, + { + "epoch": 2.7710095882684715, + "grad_norm": 3.0495336055755615, + "learning_rate": 3.614777213761986e-05, + "loss": 0.8712, + "step": 4913 + }, + { + "epoch": 2.771573604060914, + "grad_norm": 1.0706580877304077, + "learning_rate": 3.6144952058657644e-05, + "loss": 0.7532, + "step": 4914 + }, + { + "epoch": 2.772137619853356, + "grad_norm": 1.786864995956421, + "learning_rate": 3.614213197969543e-05, + "loss": 0.8179, + "step": 4915 + }, + { + "epoch": 2.772701635645798, + "grad_norm": 1.5516940355300903, + "learning_rate": 3.613931190073322e-05, + "loss": 0.7285, + "step": 4916 + }, + { + "epoch": 2.77326565143824, + "grad_norm": 1.2104485034942627, + "learning_rate": 3.6136491821771014e-05, + "loss": 0.727, + "step": 4917 + }, + { + "epoch": 2.7738296672306824, + "grad_norm": 1.47706937789917, + "learning_rate": 3.61336717428088e-05, + "loss": 0.7515, + "step": 4918 + }, + { + "epoch": 2.7743936830231246, + "grad_norm": 1.2012404203414917, + "learning_rate": 3.6130851663846584e-05, + "loss": 0.7376, + "step": 4919 + }, + { + "epoch": 2.774957698815567, + "grad_norm": 1.4966704845428467, + "learning_rate": 3.6128031584884383e-05, + "loss": 0.7572, + "step": 4920 + }, + { + "epoch": 2.775521714608009, + "grad_norm": 1.366039514541626, + "learning_rate": 3.612521150592217e-05, + "loss": 0.6926, + "step": 4921 + }, + { + "epoch": 2.7760857304004514, + "grad_norm": 2.0520129203796387, + "learning_rate": 3.6122391426959954e-05, + "loss": 0.7262, + "step": 4922 + }, + { + "epoch": 2.776649746192893, + "grad_norm": 1.461326003074646, + "learning_rate": 3.6119571347997746e-05, + "loss": 0.7748, + "step": 4923 + }, + { + "epoch": 2.7772137619853354, + "grad_norm": 2.6600449085235596, + "learning_rate": 3.611675126903554e-05, + "loss": 0.6465, + "step": 4924 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 1.7899713516235352, + "learning_rate": 3.6113931190073324e-05, + "loss": 0.6814, + "step": 4925 + }, + { + "epoch": 2.77834179357022, + "grad_norm": 1.0373200178146362, + "learning_rate": 3.611111111111111e-05, + "loss": 0.7032, + "step": 4926 + }, + { + "epoch": 2.778905809362662, + "grad_norm": 2.1896004676818848, + "learning_rate": 3.61082910321489e-05, + "loss": 0.7868, + "step": 4927 + }, + { + "epoch": 2.7794698251551044, + "grad_norm": 1.197743535041809, + "learning_rate": 3.6105470953186694e-05, + "loss": 0.7009, + "step": 4928 + }, + { + "epoch": 2.7800338409475467, + "grad_norm": 1.371680498123169, + "learning_rate": 3.610265087422448e-05, + "loss": 0.7429, + "step": 4929 + }, + { + "epoch": 2.7805978567399885, + "grad_norm": 2.0822904109954834, + "learning_rate": 3.609983079526227e-05, + "loss": 0.7941, + "step": 4930 + }, + { + "epoch": 2.7811618725324307, + "grad_norm": 1.0305134057998657, + "learning_rate": 3.609701071630006e-05, + "loss": 0.6834, + "step": 4931 + }, + { + "epoch": 2.781725888324873, + "grad_norm": 1.4198076725006104, + "learning_rate": 3.609419063733785e-05, + "loss": 0.705, + "step": 4932 + }, + { + "epoch": 2.7822899041173152, + "grad_norm": 1.1482062339782715, + "learning_rate": 3.6091370558375634e-05, + "loss": 0.8062, + "step": 4933 + }, + { + "epoch": 2.7828539199097575, + "grad_norm": 1.421528935432434, + "learning_rate": 3.6088550479413427e-05, + "loss": 0.769, + "step": 4934 + }, + { + "epoch": 2.7834179357021998, + "grad_norm": 1.2353570461273193, + "learning_rate": 3.608573040045121e-05, + "loss": 0.8691, + "step": 4935 + }, + { + "epoch": 2.783981951494642, + "grad_norm": 1.4470453262329102, + "learning_rate": 3.6082910321489004e-05, + "loss": 0.7872, + "step": 4936 + }, + { + "epoch": 2.784545967287084, + "grad_norm": 1.721861481666565, + "learning_rate": 3.608009024252679e-05, + "loss": 0.8082, + "step": 4937 + }, + { + "epoch": 2.785109983079526, + "grad_norm": 1.8074418306350708, + "learning_rate": 3.607727016356458e-05, + "loss": 0.9275, + "step": 4938 + }, + { + "epoch": 2.7856739988719683, + "grad_norm": 1.2445214986801147, + "learning_rate": 3.607445008460237e-05, + "loss": 0.8067, + "step": 4939 + }, + { + "epoch": 2.7862380146644106, + "grad_norm": 1.8997670412063599, + "learning_rate": 3.607163000564016e-05, + "loss": 0.7191, + "step": 4940 + }, + { + "epoch": 2.786802030456853, + "grad_norm": 1.7987881898880005, + "learning_rate": 3.606880992667795e-05, + "loss": 0.8447, + "step": 4941 + }, + { + "epoch": 2.787366046249295, + "grad_norm": 0.9616972804069519, + "learning_rate": 3.606598984771574e-05, + "loss": 0.6942, + "step": 4942 + }, + { + "epoch": 2.7879300620417373, + "grad_norm": 1.0449737310409546, + "learning_rate": 3.606316976875352e-05, + "loss": 0.7476, + "step": 4943 + }, + { + "epoch": 2.788494077834179, + "grad_norm": 1.290661096572876, + "learning_rate": 3.6060349689791315e-05, + "loss": 0.7293, + "step": 4944 + }, + { + "epoch": 2.7890580936266214, + "grad_norm": 1.4462281465530396, + "learning_rate": 3.605752961082911e-05, + "loss": 0.7436, + "step": 4945 + }, + { + "epoch": 2.7896221094190636, + "grad_norm": 1.4687793254852295, + "learning_rate": 3.605470953186689e-05, + "loss": 0.7129, + "step": 4946 + }, + { + "epoch": 2.790186125211506, + "grad_norm": 2.0641238689422607, + "learning_rate": 3.6051889452904684e-05, + "loss": 0.8109, + "step": 4947 + }, + { + "epoch": 2.790750141003948, + "grad_norm": 1.4696886539459229, + "learning_rate": 3.6049069373942477e-05, + "loss": 0.863, + "step": 4948 + }, + { + "epoch": 2.7913141567963904, + "grad_norm": 1.227148413658142, + "learning_rate": 3.604624929498026e-05, + "loss": 0.8654, + "step": 4949 + }, + { + "epoch": 2.7918781725888326, + "grad_norm": 1.0378905534744263, + "learning_rate": 3.604342921601805e-05, + "loss": 0.7143, + "step": 4950 + }, + { + "epoch": 2.7924421883812744, + "grad_norm": 1.497017741203308, + "learning_rate": 3.604060913705584e-05, + "loss": 0.7256, + "step": 4951 + }, + { + "epoch": 2.7930062041737167, + "grad_norm": 1.1553651094436646, + "learning_rate": 3.603778905809363e-05, + "loss": 0.7211, + "step": 4952 + }, + { + "epoch": 2.793570219966159, + "grad_norm": 0.7832198143005371, + "learning_rate": 3.603496897913142e-05, + "loss": 0.5278, + "step": 4953 + }, + { + "epoch": 2.794134235758601, + "grad_norm": 1.5289373397827148, + "learning_rate": 3.60321489001692e-05, + "loss": 0.9023, + "step": 4954 + }, + { + "epoch": 2.7946982515510435, + "grad_norm": 1.1982940435409546, + "learning_rate": 3.6029328821206995e-05, + "loss": 0.7237, + "step": 4955 + }, + { + "epoch": 2.7952622673434857, + "grad_norm": 1.5228102207183838, + "learning_rate": 3.602650874224479e-05, + "loss": 0.8792, + "step": 4956 + }, + { + "epoch": 2.795826283135928, + "grad_norm": 1.2951759099960327, + "learning_rate": 3.602368866328257e-05, + "loss": 0.7692, + "step": 4957 + }, + { + "epoch": 2.7963902989283698, + "grad_norm": 1.7180404663085938, + "learning_rate": 3.602086858432036e-05, + "loss": 0.7732, + "step": 4958 + }, + { + "epoch": 2.796954314720812, + "grad_norm": 1.2818409204483032, + "learning_rate": 3.601804850535816e-05, + "loss": 0.8194, + "step": 4959 + }, + { + "epoch": 2.7975183305132543, + "grad_norm": 1.6626765727996826, + "learning_rate": 3.601522842639594e-05, + "loss": 0.7827, + "step": 4960 + }, + { + "epoch": 2.7980823463056965, + "grad_norm": 1.1671142578125, + "learning_rate": 3.601240834743373e-05, + "loss": 0.7884, + "step": 4961 + }, + { + "epoch": 2.7986463620981388, + "grad_norm": 1.1068974733352661, + "learning_rate": 3.600958826847152e-05, + "loss": 0.6968, + "step": 4962 + }, + { + "epoch": 2.799210377890581, + "grad_norm": 2.1694061756134033, + "learning_rate": 3.600676818950931e-05, + "loss": 0.675, + "step": 4963 + }, + { + "epoch": 2.7997743936830233, + "grad_norm": 1.3333700895309448, + "learning_rate": 3.60039481105471e-05, + "loss": 0.8508, + "step": 4964 + }, + { + "epoch": 2.800338409475465, + "grad_norm": 1.2825827598571777, + "learning_rate": 3.600112803158488e-05, + "loss": 0.8118, + "step": 4965 + }, + { + "epoch": 2.8009024252679073, + "grad_norm": 2.622441291809082, + "learning_rate": 3.5998307952622675e-05, + "loss": 0.9164, + "step": 4966 + }, + { + "epoch": 2.8014664410603496, + "grad_norm": 2.4930922985076904, + "learning_rate": 3.599548787366047e-05, + "loss": 0.9281, + "step": 4967 + }, + { + "epoch": 2.802030456852792, + "grad_norm": 1.0602622032165527, + "learning_rate": 3.599266779469825e-05, + "loss": 0.7675, + "step": 4968 + }, + { + "epoch": 2.802594472645234, + "grad_norm": 1.4714100360870361, + "learning_rate": 3.5989847715736045e-05, + "loss": 0.7799, + "step": 4969 + }, + { + "epoch": 2.8031584884376763, + "grad_norm": 2.1952011585235596, + "learning_rate": 3.598702763677383e-05, + "loss": 0.7267, + "step": 4970 + }, + { + "epoch": 2.8037225042301186, + "grad_norm": 1.5652046203613281, + "learning_rate": 3.598420755781162e-05, + "loss": 0.7103, + "step": 4971 + }, + { + "epoch": 2.8042865200225604, + "grad_norm": 1.0250643491744995, + "learning_rate": 3.598138747884941e-05, + "loss": 0.7782, + "step": 4972 + }, + { + "epoch": 2.8048505358150027, + "grad_norm": 1.0706712007522583, + "learning_rate": 3.59785673998872e-05, + "loss": 0.7435, + "step": 4973 + }, + { + "epoch": 2.805414551607445, + "grad_norm": 1.3922637701034546, + "learning_rate": 3.5975747320924985e-05, + "loss": 0.7199, + "step": 4974 + }, + { + "epoch": 2.805978567399887, + "grad_norm": 1.4777835607528687, + "learning_rate": 3.597292724196278e-05, + "loss": 0.8253, + "step": 4975 + }, + { + "epoch": 2.8065425831923294, + "grad_norm": 1.5206612348556519, + "learning_rate": 3.597010716300056e-05, + "loss": 0.7907, + "step": 4976 + }, + { + "epoch": 2.8071065989847717, + "grad_norm": 2.424769401550293, + "learning_rate": 3.5967287084038355e-05, + "loss": 0.824, + "step": 4977 + }, + { + "epoch": 2.807670614777214, + "grad_norm": 1.2394754886627197, + "learning_rate": 3.596446700507614e-05, + "loss": 0.6756, + "step": 4978 + }, + { + "epoch": 2.8082346305696557, + "grad_norm": 1.265147089958191, + "learning_rate": 3.596164692611393e-05, + "loss": 0.8333, + "step": 4979 + }, + { + "epoch": 2.808798646362098, + "grad_norm": 2.1902718544006348, + "learning_rate": 3.5958826847151725e-05, + "loss": 0.9037, + "step": 4980 + }, + { + "epoch": 2.8093626621545402, + "grad_norm": 1.1805747747421265, + "learning_rate": 3.595600676818951e-05, + "loss": 0.6832, + "step": 4981 + }, + { + "epoch": 2.8099266779469825, + "grad_norm": 1.6049036979675293, + "learning_rate": 3.59531866892273e-05, + "loss": 0.8427, + "step": 4982 + }, + { + "epoch": 2.8104906937394247, + "grad_norm": 1.072511076927185, + "learning_rate": 3.595036661026509e-05, + "loss": 0.694, + "step": 4983 + }, + { + "epoch": 2.811054709531867, + "grad_norm": 1.395917296409607, + "learning_rate": 3.594754653130288e-05, + "loss": 0.7828, + "step": 4984 + }, + { + "epoch": 2.8116187253243092, + "grad_norm": 1.1519280672073364, + "learning_rate": 3.5944726452340665e-05, + "loss": 0.7763, + "step": 4985 + }, + { + "epoch": 2.812182741116751, + "grad_norm": 2.0463411808013916, + "learning_rate": 3.594190637337846e-05, + "loss": 0.9299, + "step": 4986 + }, + { + "epoch": 2.8127467569091937, + "grad_norm": 1.112114429473877, + "learning_rate": 3.593908629441625e-05, + "loss": 0.7065, + "step": 4987 + }, + { + "epoch": 2.8133107727016355, + "grad_norm": 1.234641194343567, + "learning_rate": 3.5936266215454035e-05, + "loss": 0.7328, + "step": 4988 + }, + { + "epoch": 2.813874788494078, + "grad_norm": 2.481292486190796, + "learning_rate": 3.593344613649182e-05, + "loss": 0.7671, + "step": 4989 + }, + { + "epoch": 2.81443880428652, + "grad_norm": 2.1414682865142822, + "learning_rate": 3.593062605752961e-05, + "loss": 0.8351, + "step": 4990 + }, + { + "epoch": 2.8150028200789623, + "grad_norm": 1.5505577325820923, + "learning_rate": 3.5927805978567405e-05, + "loss": 0.8855, + "step": 4991 + }, + { + "epoch": 2.8155668358714045, + "grad_norm": 1.293513536453247, + "learning_rate": 3.592498589960519e-05, + "loss": 0.7364, + "step": 4992 + }, + { + "epoch": 2.8161308516638464, + "grad_norm": 2.12551212310791, + "learning_rate": 3.5922165820642976e-05, + "loss": 0.9258, + "step": 4993 + }, + { + "epoch": 2.816694867456289, + "grad_norm": 1.0413907766342163, + "learning_rate": 3.591934574168077e-05, + "loss": 0.6669, + "step": 4994 + }, + { + "epoch": 2.817258883248731, + "grad_norm": 1.3338109254837036, + "learning_rate": 3.591652566271856e-05, + "loss": 0.7435, + "step": 4995 + }, + { + "epoch": 2.817822899041173, + "grad_norm": 0.9654446840286255, + "learning_rate": 3.5913705583756346e-05, + "loss": 0.6751, + "step": 4996 + }, + { + "epoch": 2.8183869148336154, + "grad_norm": 1.7087571620941162, + "learning_rate": 3.591088550479413e-05, + "loss": 0.7313, + "step": 4997 + }, + { + "epoch": 2.8189509306260576, + "grad_norm": 1.6117157936096191, + "learning_rate": 3.590806542583193e-05, + "loss": 0.7558, + "step": 4998 + }, + { + "epoch": 2.8195149464185, + "grad_norm": 1.159475564956665, + "learning_rate": 3.5905245346869715e-05, + "loss": 0.8448, + "step": 4999 + }, + { + "epoch": 2.8200789622109417, + "grad_norm": 1.308187484741211, + "learning_rate": 3.59024252679075e-05, + "loss": 0.7329, + "step": 5000 + }, + { + "epoch": 2.8206429780033844, + "grad_norm": 1.4936449527740479, + "learning_rate": 3.589960518894529e-05, + "loss": 0.706, + "step": 5001 + }, + { + "epoch": 2.821206993795826, + "grad_norm": 1.015722632408142, + "learning_rate": 3.5896785109983085e-05, + "loss": 0.6769, + "step": 5002 + }, + { + "epoch": 2.8217710095882684, + "grad_norm": 1.5416457653045654, + "learning_rate": 3.589396503102087e-05, + "loss": 0.8508, + "step": 5003 + }, + { + "epoch": 2.8223350253807107, + "grad_norm": 1.5682330131530762, + "learning_rate": 3.5891144952058656e-05, + "loss": 0.7279, + "step": 5004 + }, + { + "epoch": 2.822899041173153, + "grad_norm": 1.544968843460083, + "learning_rate": 3.588832487309645e-05, + "loss": 0.775, + "step": 5005 + }, + { + "epoch": 2.823463056965595, + "grad_norm": 1.3548167943954468, + "learning_rate": 3.588550479413424e-05, + "loss": 0.8265, + "step": 5006 + }, + { + "epoch": 2.824027072758037, + "grad_norm": 1.8369879722595215, + "learning_rate": 3.5882684715172026e-05, + "loss": 0.7698, + "step": 5007 + }, + { + "epoch": 2.8245910885504797, + "grad_norm": 1.523626685142517, + "learning_rate": 3.587986463620982e-05, + "loss": 0.7153, + "step": 5008 + }, + { + "epoch": 2.8251551043429215, + "grad_norm": 1.8856927156448364, + "learning_rate": 3.58770445572476e-05, + "loss": 0.7686, + "step": 5009 + }, + { + "epoch": 2.8257191201353637, + "grad_norm": 1.1483659744262695, + "learning_rate": 3.5874224478285395e-05, + "loss": 0.7722, + "step": 5010 + }, + { + "epoch": 2.826283135927806, + "grad_norm": 1.5691959857940674, + "learning_rate": 3.587140439932318e-05, + "loss": 0.7502, + "step": 5011 + }, + { + "epoch": 2.8268471517202483, + "grad_norm": 3.7470927238464355, + "learning_rate": 3.586858432036097e-05, + "loss": 0.8922, + "step": 5012 + }, + { + "epoch": 2.8274111675126905, + "grad_norm": 1.5573678016662598, + "learning_rate": 3.586576424139876e-05, + "loss": 0.8076, + "step": 5013 + }, + { + "epoch": 2.8279751833051323, + "grad_norm": 2.28695011138916, + "learning_rate": 3.586294416243655e-05, + "loss": 0.7971, + "step": 5014 + }, + { + "epoch": 2.828539199097575, + "grad_norm": 1.213567852973938, + "learning_rate": 3.5860124083474336e-05, + "loss": 0.7, + "step": 5015 + }, + { + "epoch": 2.829103214890017, + "grad_norm": 1.422788381576538, + "learning_rate": 3.585730400451213e-05, + "loss": 0.7619, + "step": 5016 + }, + { + "epoch": 2.829667230682459, + "grad_norm": 1.3097549676895142, + "learning_rate": 3.585448392554992e-05, + "loss": 0.8008, + "step": 5017 + }, + { + "epoch": 2.8302312464749013, + "grad_norm": 1.216148018836975, + "learning_rate": 3.5851663846587706e-05, + "loss": 0.8133, + "step": 5018 + }, + { + "epoch": 2.8307952622673436, + "grad_norm": 1.319933533668518, + "learning_rate": 3.58488437676255e-05, + "loss": 0.8718, + "step": 5019 + }, + { + "epoch": 2.831359278059786, + "grad_norm": 1.4365962743759155, + "learning_rate": 3.5846023688663283e-05, + "loss": 0.7348, + "step": 5020 + }, + { + "epoch": 2.8319232938522276, + "grad_norm": 2.1887495517730713, + "learning_rate": 3.5843203609701076e-05, + "loss": 0.7961, + "step": 5021 + }, + { + "epoch": 2.8324873096446703, + "grad_norm": 1.3729735612869263, + "learning_rate": 3.584038353073886e-05, + "loss": 0.8224, + "step": 5022 + }, + { + "epoch": 2.833051325437112, + "grad_norm": 1.3039391040802002, + "learning_rate": 3.583756345177665e-05, + "loss": 0.7793, + "step": 5023 + }, + { + "epoch": 2.8336153412295544, + "grad_norm": 1.5517897605895996, + "learning_rate": 3.583474337281444e-05, + "loss": 0.7964, + "step": 5024 + }, + { + "epoch": 2.8341793570219966, + "grad_norm": 1.2766914367675781, + "learning_rate": 3.583192329385223e-05, + "loss": 0.6306, + "step": 5025 + }, + { + "epoch": 2.834743372814439, + "grad_norm": 1.8562507629394531, + "learning_rate": 3.582910321489002e-05, + "loss": 0.8547, + "step": 5026 + }, + { + "epoch": 2.835307388606881, + "grad_norm": 1.736722469329834, + "learning_rate": 3.582628313592781e-05, + "loss": 0.8616, + "step": 5027 + }, + { + "epoch": 2.835871404399323, + "grad_norm": 1.7108139991760254, + "learning_rate": 3.5823463056965594e-05, + "loss": 0.7204, + "step": 5028 + }, + { + "epoch": 2.8364354201917656, + "grad_norm": 1.6812479496002197, + "learning_rate": 3.5820642978003386e-05, + "loss": 0.8165, + "step": 5029 + }, + { + "epoch": 2.8369994359842075, + "grad_norm": 1.8098232746124268, + "learning_rate": 3.581782289904118e-05, + "loss": 0.8449, + "step": 5030 + }, + { + "epoch": 2.8375634517766497, + "grad_norm": 1.402275800704956, + "learning_rate": 3.5815002820078964e-05, + "loss": 0.7461, + "step": 5031 + }, + { + "epoch": 2.838127467569092, + "grad_norm": 2.2723207473754883, + "learning_rate": 3.581218274111675e-05, + "loss": 0.7633, + "step": 5032 + }, + { + "epoch": 2.838691483361534, + "grad_norm": 1.1573950052261353, + "learning_rate": 3.580936266215454e-05, + "loss": 0.6756, + "step": 5033 + }, + { + "epoch": 2.8392554991539765, + "grad_norm": 2.028144359588623, + "learning_rate": 3.580654258319233e-05, + "loss": 0.8137, + "step": 5034 + }, + { + "epoch": 2.8398195149464183, + "grad_norm": 1.3134909868240356, + "learning_rate": 3.580372250423012e-05, + "loss": 0.7212, + "step": 5035 + }, + { + "epoch": 2.840383530738861, + "grad_norm": 1.300691843032837, + "learning_rate": 3.5800902425267904e-05, + "loss": 0.7819, + "step": 5036 + }, + { + "epoch": 2.8409475465313028, + "grad_norm": 1.3533010482788086, + "learning_rate": 3.57980823463057e-05, + "loss": 0.8147, + "step": 5037 + }, + { + "epoch": 2.841511562323745, + "grad_norm": 1.3344945907592773, + "learning_rate": 3.579526226734349e-05, + "loss": 0.8074, + "step": 5038 + }, + { + "epoch": 2.8420755781161873, + "grad_norm": 7.280612945556641, + "learning_rate": 3.5792442188381274e-05, + "loss": 0.893, + "step": 5039 + }, + { + "epoch": 2.8426395939086295, + "grad_norm": 1.0810669660568237, + "learning_rate": 3.5789622109419066e-05, + "loss": 0.7155, + "step": 5040 + }, + { + "epoch": 2.8432036097010718, + "grad_norm": 1.1497782468795776, + "learning_rate": 3.578680203045686e-05, + "loss": 0.7448, + "step": 5041 + }, + { + "epoch": 2.8437676254935136, + "grad_norm": 1.171316146850586, + "learning_rate": 3.5783981951494644e-05, + "loss": 0.8095, + "step": 5042 + }, + { + "epoch": 2.8443316412859563, + "grad_norm": 1.1869665384292603, + "learning_rate": 3.578116187253243e-05, + "loss": 0.6581, + "step": 5043 + }, + { + "epoch": 2.844895657078398, + "grad_norm": 1.4500523805618286, + "learning_rate": 3.577834179357022e-05, + "loss": 0.7411, + "step": 5044 + }, + { + "epoch": 2.8454596728708403, + "grad_norm": 1.0187304019927979, + "learning_rate": 3.5775521714608014e-05, + "loss": 0.8129, + "step": 5045 + }, + { + "epoch": 2.8460236886632826, + "grad_norm": 1.6835063695907593, + "learning_rate": 3.57727016356458e-05, + "loss": 0.697, + "step": 5046 + }, + { + "epoch": 2.846587704455725, + "grad_norm": 2.2320215702056885, + "learning_rate": 3.5769881556683584e-05, + "loss": 0.9086, + "step": 5047 + }, + { + "epoch": 2.847151720248167, + "grad_norm": 1.4450551271438599, + "learning_rate": 3.5767061477721377e-05, + "loss": 0.7525, + "step": 5048 + }, + { + "epoch": 2.847715736040609, + "grad_norm": 1.8717098236083984, + "learning_rate": 3.576424139875917e-05, + "loss": 0.7139, + "step": 5049 + }, + { + "epoch": 2.8482797518330516, + "grad_norm": 1.7192754745483398, + "learning_rate": 3.5761421319796954e-05, + "loss": 0.7831, + "step": 5050 + }, + { + "epoch": 2.8488437676254934, + "grad_norm": 1.2630348205566406, + "learning_rate": 3.5758601240834746e-05, + "loss": 0.7937, + "step": 5051 + }, + { + "epoch": 2.8494077834179357, + "grad_norm": 1.1435115337371826, + "learning_rate": 3.575578116187254e-05, + "loss": 0.7302, + "step": 5052 + }, + { + "epoch": 2.849971799210378, + "grad_norm": 1.649595022201538, + "learning_rate": 3.5752961082910324e-05, + "loss": 0.7438, + "step": 5053 + }, + { + "epoch": 2.85053581500282, + "grad_norm": 1.6258643865585327, + "learning_rate": 3.575014100394811e-05, + "loss": 0.7709, + "step": 5054 + }, + { + "epoch": 2.8510998307952624, + "grad_norm": 1.4774127006530762, + "learning_rate": 3.57473209249859e-05, + "loss": 0.7853, + "step": 5055 + }, + { + "epoch": 2.851663846587704, + "grad_norm": 1.1366838216781616, + "learning_rate": 3.5744500846023694e-05, + "loss": 0.7605, + "step": 5056 + }, + { + "epoch": 2.852227862380147, + "grad_norm": 1.4171016216278076, + "learning_rate": 3.574168076706148e-05, + "loss": 0.7503, + "step": 5057 + }, + { + "epoch": 2.8527918781725887, + "grad_norm": 1.5626707077026367, + "learning_rate": 3.573886068809927e-05, + "loss": 0.794, + "step": 5058 + }, + { + "epoch": 2.853355893965031, + "grad_norm": 1.694316029548645, + "learning_rate": 3.573604060913706e-05, + "loss": 0.7932, + "step": 5059 + }, + { + "epoch": 2.8539199097574732, + "grad_norm": 1.2403815984725952, + "learning_rate": 3.573322053017485e-05, + "loss": 0.8022, + "step": 5060 + }, + { + "epoch": 2.8544839255499155, + "grad_norm": 1.4316462278366089, + "learning_rate": 3.5730400451212634e-05, + "loss": 0.7469, + "step": 5061 + }, + { + "epoch": 2.8550479413423577, + "grad_norm": 1.086928129196167, + "learning_rate": 3.5727580372250426e-05, + "loss": 0.7325, + "step": 5062 + }, + { + "epoch": 2.8556119571347995, + "grad_norm": 1.9895174503326416, + "learning_rate": 3.572476029328821e-05, + "loss": 0.9275, + "step": 5063 + }, + { + "epoch": 2.8561759729272422, + "grad_norm": 1.6892586946487427, + "learning_rate": 3.5721940214326004e-05, + "loss": 0.8075, + "step": 5064 + }, + { + "epoch": 2.856739988719684, + "grad_norm": 1.0323439836502075, + "learning_rate": 3.571912013536379e-05, + "loss": 0.7891, + "step": 5065 + }, + { + "epoch": 2.8573040045121263, + "grad_norm": 1.221414566040039, + "learning_rate": 3.571630005640158e-05, + "loss": 0.739, + "step": 5066 + }, + { + "epoch": 2.8578680203045685, + "grad_norm": 1.046651005744934, + "learning_rate": 3.571347997743937e-05, + "loss": 0.7141, + "step": 5067 + }, + { + "epoch": 2.858432036097011, + "grad_norm": 3.102886915206909, + "learning_rate": 3.571065989847716e-05, + "loss": 0.7724, + "step": 5068 + }, + { + "epoch": 2.858996051889453, + "grad_norm": 2.009310722351074, + "learning_rate": 3.570783981951495e-05, + "loss": 0.8718, + "step": 5069 + }, + { + "epoch": 2.859560067681895, + "grad_norm": 2.2875795364379883, + "learning_rate": 3.570501974055274e-05, + "loss": 0.9041, + "step": 5070 + }, + { + "epoch": 2.8601240834743376, + "grad_norm": 1.2741032838821411, + "learning_rate": 3.570219966159052e-05, + "loss": 0.6227, + "step": 5071 + }, + { + "epoch": 2.8606880992667794, + "grad_norm": 1.5158793926239014, + "learning_rate": 3.5699379582628314e-05, + "loss": 0.7972, + "step": 5072 + }, + { + "epoch": 2.8612521150592216, + "grad_norm": 1.173370361328125, + "learning_rate": 3.5696559503666107e-05, + "loss": 0.6749, + "step": 5073 + }, + { + "epoch": 2.861816130851664, + "grad_norm": 0.9732911586761475, + "learning_rate": 3.569373942470389e-05, + "loss": 0.6866, + "step": 5074 + }, + { + "epoch": 2.862380146644106, + "grad_norm": 1.6888628005981445, + "learning_rate": 3.569091934574168e-05, + "loss": 0.7539, + "step": 5075 + }, + { + "epoch": 2.8629441624365484, + "grad_norm": 5.115172863006592, + "learning_rate": 3.5688099266779476e-05, + "loss": 0.7788, + "step": 5076 + }, + { + "epoch": 2.86350817822899, + "grad_norm": 1.6015177965164185, + "learning_rate": 3.568527918781726e-05, + "loss": 0.7759, + "step": 5077 + }, + { + "epoch": 2.864072194021433, + "grad_norm": 1.3461310863494873, + "learning_rate": 3.568245910885505e-05, + "loss": 0.704, + "step": 5078 + }, + { + "epoch": 2.8646362098138747, + "grad_norm": 1.008044719696045, + "learning_rate": 3.567963902989284e-05, + "loss": 0.7227, + "step": 5079 + }, + { + "epoch": 2.865200225606317, + "grad_norm": 2.430941343307495, + "learning_rate": 3.567681895093063e-05, + "loss": 0.8812, + "step": 5080 + }, + { + "epoch": 2.865764241398759, + "grad_norm": 5.865020751953125, + "learning_rate": 3.567399887196842e-05, + "loss": 1.0674, + "step": 5081 + }, + { + "epoch": 2.8663282571912014, + "grad_norm": 2.5764477252960205, + "learning_rate": 3.56711787930062e-05, + "loss": 0.8282, + "step": 5082 + }, + { + "epoch": 2.8668922729836437, + "grad_norm": 2.275310516357422, + "learning_rate": 3.5668358714043995e-05, + "loss": 0.6511, + "step": 5083 + }, + { + "epoch": 2.8674562887760855, + "grad_norm": 2.0714969635009766, + "learning_rate": 3.566553863508179e-05, + "loss": 0.781, + "step": 5084 + }, + { + "epoch": 2.868020304568528, + "grad_norm": 1.273498296737671, + "learning_rate": 3.566271855611957e-05, + "loss": 0.6829, + "step": 5085 + }, + { + "epoch": 2.86858432036097, + "grad_norm": 1.2169650793075562, + "learning_rate": 3.565989847715736e-05, + "loss": 0.6512, + "step": 5086 + }, + { + "epoch": 2.8691483361534122, + "grad_norm": 1.2868396043777466, + "learning_rate": 3.5657078398195157e-05, + "loss": 0.9069, + "step": 5087 + }, + { + "epoch": 2.8697123519458545, + "grad_norm": 1.7508240938186646, + "learning_rate": 3.565425831923294e-05, + "loss": 0.7863, + "step": 5088 + }, + { + "epoch": 2.8702763677382968, + "grad_norm": 1.8479502201080322, + "learning_rate": 3.565143824027073e-05, + "loss": 0.7645, + "step": 5089 + }, + { + "epoch": 2.870840383530739, + "grad_norm": 1.4023202657699585, + "learning_rate": 3.564861816130852e-05, + "loss": 0.8408, + "step": 5090 + }, + { + "epoch": 2.871404399323181, + "grad_norm": 1.3544923067092896, + "learning_rate": 3.564579808234631e-05, + "loss": 0.7183, + "step": 5091 + }, + { + "epoch": 2.8719684151156235, + "grad_norm": 1.2838428020477295, + "learning_rate": 3.56429780033841e-05, + "loss": 0.6955, + "step": 5092 + }, + { + "epoch": 2.8725324309080653, + "grad_norm": 4.122363090515137, + "learning_rate": 3.564015792442188e-05, + "loss": 0.7708, + "step": 5093 + }, + { + "epoch": 2.8730964467005076, + "grad_norm": 1.0867252349853516, + "learning_rate": 3.5637337845459675e-05, + "loss": 0.681, + "step": 5094 + }, + { + "epoch": 2.87366046249295, + "grad_norm": 1.2372360229492188, + "learning_rate": 3.563451776649747e-05, + "loss": 0.5715, + "step": 5095 + }, + { + "epoch": 2.874224478285392, + "grad_norm": 1.6824679374694824, + "learning_rate": 3.563169768753525e-05, + "loss": 0.8011, + "step": 5096 + }, + { + "epoch": 2.8747884940778343, + "grad_norm": 1.1778755187988281, + "learning_rate": 3.5628877608573044e-05, + "loss": 0.8457, + "step": 5097 + }, + { + "epoch": 2.875352509870276, + "grad_norm": 1.3546990156173706, + "learning_rate": 3.562605752961083e-05, + "loss": 0.7566, + "step": 5098 + }, + { + "epoch": 2.875916525662719, + "grad_norm": 2.2310619354248047, + "learning_rate": 3.562323745064862e-05, + "loss": 0.7677, + "step": 5099 + }, + { + "epoch": 2.8764805414551606, + "grad_norm": 1.3081551790237427, + "learning_rate": 3.562041737168641e-05, + "loss": 0.8294, + "step": 5100 + }, + { + "epoch": 2.877044557247603, + "grad_norm": 1.2065801620483398, + "learning_rate": 3.56175972927242e-05, + "loss": 0.6992, + "step": 5101 + }, + { + "epoch": 2.877608573040045, + "grad_norm": 1.8825311660766602, + "learning_rate": 3.5614777213761985e-05, + "loss": 0.8338, + "step": 5102 + }, + { + "epoch": 2.8781725888324874, + "grad_norm": 2.4688329696655273, + "learning_rate": 3.561195713479978e-05, + "loss": 0.8694, + "step": 5103 + }, + { + "epoch": 2.8787366046249296, + "grad_norm": 1.4906684160232544, + "learning_rate": 3.560913705583756e-05, + "loss": 0.7117, + "step": 5104 + }, + { + "epoch": 2.8793006204173714, + "grad_norm": 1.327063798904419, + "learning_rate": 3.5606316976875355e-05, + "loss": 0.7568, + "step": 5105 + }, + { + "epoch": 2.879864636209814, + "grad_norm": 1.2293487787246704, + "learning_rate": 3.560349689791314e-05, + "loss": 0.6447, + "step": 5106 + }, + { + "epoch": 2.880428652002256, + "grad_norm": 2.4379074573516846, + "learning_rate": 3.560067681895093e-05, + "loss": 0.7832, + "step": 5107 + }, + { + "epoch": 2.880992667794698, + "grad_norm": 3.409668445587158, + "learning_rate": 3.5597856739988725e-05, + "loss": 1.0858, + "step": 5108 + }, + { + "epoch": 2.8815566835871405, + "grad_norm": 1.0945894718170166, + "learning_rate": 3.559503666102651e-05, + "loss": 0.7515, + "step": 5109 + }, + { + "epoch": 2.8821206993795827, + "grad_norm": 1.4558472633361816, + "learning_rate": 3.5592216582064295e-05, + "loss": 0.8254, + "step": 5110 + }, + { + "epoch": 2.882684715172025, + "grad_norm": 2.3264575004577637, + "learning_rate": 3.558939650310209e-05, + "loss": 0.9164, + "step": 5111 + }, + { + "epoch": 2.8832487309644668, + "grad_norm": 1.0599011182785034, + "learning_rate": 3.558657642413988e-05, + "loss": 0.7319, + "step": 5112 + }, + { + "epoch": 2.8838127467569095, + "grad_norm": 1.967612862586975, + "learning_rate": 3.5583756345177665e-05, + "loss": 0.8064, + "step": 5113 + }, + { + "epoch": 2.8843767625493513, + "grad_norm": 1.6731576919555664, + "learning_rate": 3.558093626621545e-05, + "loss": 0.8115, + "step": 5114 + }, + { + "epoch": 2.8849407783417935, + "grad_norm": 1.7809828519821167, + "learning_rate": 3.557811618725325e-05, + "loss": 0.8281, + "step": 5115 + }, + { + "epoch": 2.8855047941342358, + "grad_norm": 4.594448089599609, + "learning_rate": 3.5575296108291035e-05, + "loss": 0.7679, + "step": 5116 + }, + { + "epoch": 2.886068809926678, + "grad_norm": 1.565303921699524, + "learning_rate": 3.557247602932882e-05, + "loss": 0.7969, + "step": 5117 + }, + { + "epoch": 2.8866328257191203, + "grad_norm": 1.6266670227050781, + "learning_rate": 3.556965595036661e-05, + "loss": 0.7234, + "step": 5118 + }, + { + "epoch": 2.887196841511562, + "grad_norm": 1.9427905082702637, + "learning_rate": 3.5566835871404405e-05, + "loss": 0.8348, + "step": 5119 + }, + { + "epoch": 2.887760857304005, + "grad_norm": 1.453325629234314, + "learning_rate": 3.556401579244219e-05, + "loss": 0.8002, + "step": 5120 + }, + { + "epoch": 2.8883248730964466, + "grad_norm": 1.2679029703140259, + "learning_rate": 3.5561195713479976e-05, + "loss": 0.6781, + "step": 5121 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 1.1205003261566162, + "learning_rate": 3.555837563451777e-05, + "loss": 0.7933, + "step": 5122 + }, + { + "epoch": 2.889452904681331, + "grad_norm": 4.3735833168029785, + "learning_rate": 3.555555555555556e-05, + "loss": 0.7273, + "step": 5123 + }, + { + "epoch": 2.8900169204737733, + "grad_norm": 1.9139126539230347, + "learning_rate": 3.5552735476593345e-05, + "loss": 0.804, + "step": 5124 + }, + { + "epoch": 2.8905809362662156, + "grad_norm": 1.7702075242996216, + "learning_rate": 3.554991539763113e-05, + "loss": 0.7219, + "step": 5125 + }, + { + "epoch": 2.8911449520586574, + "grad_norm": 2.539844512939453, + "learning_rate": 3.554709531866893e-05, + "loss": 0.6497, + "step": 5126 + }, + { + "epoch": 2.8917089678511, + "grad_norm": 1.601339340209961, + "learning_rate": 3.5544275239706715e-05, + "loss": 0.8316, + "step": 5127 + }, + { + "epoch": 2.892272983643542, + "grad_norm": 1.4255014657974243, + "learning_rate": 3.55414551607445e-05, + "loss": 0.7134, + "step": 5128 + }, + { + "epoch": 2.892836999435984, + "grad_norm": 1.6633285284042358, + "learning_rate": 3.553863508178229e-05, + "loss": 0.7612, + "step": 5129 + }, + { + "epoch": 2.8934010152284264, + "grad_norm": 1.111611008644104, + "learning_rate": 3.5535815002820085e-05, + "loss": 0.7349, + "step": 5130 + }, + { + "epoch": 2.8939650310208687, + "grad_norm": 1.5589654445648193, + "learning_rate": 3.553299492385787e-05, + "loss": 0.6682, + "step": 5131 + }, + { + "epoch": 2.894529046813311, + "grad_norm": 1.4336998462677002, + "learning_rate": 3.5530174844895656e-05, + "loss": 0.771, + "step": 5132 + }, + { + "epoch": 2.8950930626057527, + "grad_norm": 2.0409088134765625, + "learning_rate": 3.552735476593345e-05, + "loss": 0.8457, + "step": 5133 + }, + { + "epoch": 2.8956570783981954, + "grad_norm": 1.468640923500061, + "learning_rate": 3.552453468697124e-05, + "loss": 0.849, + "step": 5134 + }, + { + "epoch": 2.8962210941906372, + "grad_norm": 1.7723591327667236, + "learning_rate": 3.5521714608009026e-05, + "loss": 0.9069, + "step": 5135 + }, + { + "epoch": 2.8967851099830795, + "grad_norm": 1.3085148334503174, + "learning_rate": 3.551889452904682e-05, + "loss": 0.8324, + "step": 5136 + }, + { + "epoch": 2.8973491257755217, + "grad_norm": 1.5094842910766602, + "learning_rate": 3.55160744500846e-05, + "loss": 0.8404, + "step": 5137 + }, + { + "epoch": 2.897913141567964, + "grad_norm": 1.458851933479309, + "learning_rate": 3.5513254371122395e-05, + "loss": 0.8366, + "step": 5138 + }, + { + "epoch": 2.8984771573604062, + "grad_norm": 2.0044732093811035, + "learning_rate": 3.551043429216018e-05, + "loss": 0.9245, + "step": 5139 + }, + { + "epoch": 2.899041173152848, + "grad_norm": 1.356710433959961, + "learning_rate": 3.550761421319797e-05, + "loss": 0.7368, + "step": 5140 + }, + { + "epoch": 2.8996051889452907, + "grad_norm": 1.9391679763793945, + "learning_rate": 3.550479413423576e-05, + "loss": 0.7822, + "step": 5141 + }, + { + "epoch": 2.9001692047377325, + "grad_norm": 1.0071605443954468, + "learning_rate": 3.550197405527355e-05, + "loss": 0.6751, + "step": 5142 + }, + { + "epoch": 2.900733220530175, + "grad_norm": 1.7187280654907227, + "learning_rate": 3.5499153976311336e-05, + "loss": 0.7527, + "step": 5143 + }, + { + "epoch": 2.901297236322617, + "grad_norm": 1.9637573957443237, + "learning_rate": 3.549633389734913e-05, + "loss": 0.833, + "step": 5144 + }, + { + "epoch": 2.9018612521150593, + "grad_norm": 1.266482949256897, + "learning_rate": 3.5493513818386913e-05, + "loss": 0.7769, + "step": 5145 + }, + { + "epoch": 2.9024252679075015, + "grad_norm": 1.8109632730484009, + "learning_rate": 3.5490693739424706e-05, + "loss": 0.7379, + "step": 5146 + }, + { + "epoch": 2.9029892836999434, + "grad_norm": 1.3091473579406738, + "learning_rate": 3.54878736604625e-05, + "loss": 0.7732, + "step": 5147 + }, + { + "epoch": 2.903553299492386, + "grad_norm": 1.9623682498931885, + "learning_rate": 3.548505358150028e-05, + "loss": 0.8614, + "step": 5148 + }, + { + "epoch": 2.904117315284828, + "grad_norm": 1.6483557224273682, + "learning_rate": 3.548223350253807e-05, + "loss": 0.8131, + "step": 5149 + }, + { + "epoch": 2.90468133107727, + "grad_norm": 1.500203013420105, + "learning_rate": 3.547941342357586e-05, + "loss": 0.7359, + "step": 5150 + }, + { + "epoch": 2.9052453468697124, + "grad_norm": 1.2229657173156738, + "learning_rate": 3.547659334461365e-05, + "loss": 0.7063, + "step": 5151 + }, + { + "epoch": 2.9058093626621546, + "grad_norm": 2.7649118900299072, + "learning_rate": 3.547377326565144e-05, + "loss": 0.7132, + "step": 5152 + }, + { + "epoch": 2.906373378454597, + "grad_norm": 2.42484450340271, + "learning_rate": 3.5470953186689224e-05, + "loss": 0.9604, + "step": 5153 + }, + { + "epoch": 2.9069373942470387, + "grad_norm": 1.8086555004119873, + "learning_rate": 3.546813310772702e-05, + "loss": 0.7898, + "step": 5154 + }, + { + "epoch": 2.9075014100394814, + "grad_norm": 1.8618918657302856, + "learning_rate": 3.546531302876481e-05, + "loss": 0.9161, + "step": 5155 + }, + { + "epoch": 2.908065425831923, + "grad_norm": 1.5813409090042114, + "learning_rate": 3.5462492949802594e-05, + "loss": 0.7601, + "step": 5156 + }, + { + "epoch": 2.9086294416243654, + "grad_norm": 1.2465486526489258, + "learning_rate": 3.5459672870840386e-05, + "loss": 0.7162, + "step": 5157 + }, + { + "epoch": 2.9091934574168077, + "grad_norm": 1.4506988525390625, + "learning_rate": 3.545685279187818e-05, + "loss": 0.7006, + "step": 5158 + }, + { + "epoch": 2.90975747320925, + "grad_norm": 2.7914202213287354, + "learning_rate": 3.5454032712915963e-05, + "loss": 0.7822, + "step": 5159 + }, + { + "epoch": 2.910321489001692, + "grad_norm": 1.330726146697998, + "learning_rate": 3.545121263395375e-05, + "loss": 0.7122, + "step": 5160 + }, + { + "epoch": 2.910885504794134, + "grad_norm": 2.2028586864471436, + "learning_rate": 3.544839255499154e-05, + "loss": 0.7188, + "step": 5161 + }, + { + "epoch": 2.9114495205865767, + "grad_norm": 1.5792711973190308, + "learning_rate": 3.544557247602933e-05, + "loss": 0.774, + "step": 5162 + }, + { + "epoch": 2.9120135363790185, + "grad_norm": 1.5237085819244385, + "learning_rate": 3.544275239706712e-05, + "loss": 0.8327, + "step": 5163 + }, + { + "epoch": 2.9125775521714607, + "grad_norm": 1.1193121671676636, + "learning_rate": 3.5439932318104904e-05, + "loss": 0.7875, + "step": 5164 + }, + { + "epoch": 2.913141567963903, + "grad_norm": 1.7535526752471924, + "learning_rate": 3.54371122391427e-05, + "loss": 0.698, + "step": 5165 + }, + { + "epoch": 2.9137055837563453, + "grad_norm": 1.1952077150344849, + "learning_rate": 3.543429216018049e-05, + "loss": 0.7937, + "step": 5166 + }, + { + "epoch": 2.9142695995487875, + "grad_norm": 2.075803518295288, + "learning_rate": 3.5431472081218274e-05, + "loss": 0.7682, + "step": 5167 + }, + { + "epoch": 2.9148336153412293, + "grad_norm": 1.1652902364730835, + "learning_rate": 3.5428652002256066e-05, + "loss": 0.738, + "step": 5168 + }, + { + "epoch": 2.915397631133672, + "grad_norm": 1.1543129682540894, + "learning_rate": 3.542583192329386e-05, + "loss": 0.7793, + "step": 5169 + }, + { + "epoch": 2.915961646926114, + "grad_norm": 1.2132995128631592, + "learning_rate": 3.5423011844331644e-05, + "loss": 0.7374, + "step": 5170 + }, + { + "epoch": 2.916525662718556, + "grad_norm": 2.7077534198760986, + "learning_rate": 3.542019176536943e-05, + "loss": 0.6429, + "step": 5171 + }, + { + "epoch": 2.9170896785109983, + "grad_norm": 1.201992392539978, + "learning_rate": 3.541737168640722e-05, + "loss": 0.7194, + "step": 5172 + }, + { + "epoch": 2.9176536943034406, + "grad_norm": 1.343896508216858, + "learning_rate": 3.541455160744501e-05, + "loss": 0.6659, + "step": 5173 + }, + { + "epoch": 2.918217710095883, + "grad_norm": 1.716086506843567, + "learning_rate": 3.54117315284828e-05, + "loss": 0.7569, + "step": 5174 + }, + { + "epoch": 2.9187817258883246, + "grad_norm": 3.60217022895813, + "learning_rate": 3.540891144952059e-05, + "loss": 0.8142, + "step": 5175 + }, + { + "epoch": 2.9193457416807673, + "grad_norm": 1.041835904121399, + "learning_rate": 3.5406091370558376e-05, + "loss": 0.7905, + "step": 5176 + }, + { + "epoch": 2.919909757473209, + "grad_norm": 1.5862946510314941, + "learning_rate": 3.540327129159617e-05, + "loss": 0.8197, + "step": 5177 + }, + { + "epoch": 2.9204737732656514, + "grad_norm": 1.0835376977920532, + "learning_rate": 3.5400451212633954e-05, + "loss": 0.7024, + "step": 5178 + }, + { + "epoch": 2.9210377890580936, + "grad_norm": 1.4224913120269775, + "learning_rate": 3.5397631133671746e-05, + "loss": 0.7263, + "step": 5179 + }, + { + "epoch": 2.921601804850536, + "grad_norm": 1.2590742111206055, + "learning_rate": 3.539481105470953e-05, + "loss": 0.7503, + "step": 5180 + }, + { + "epoch": 2.922165820642978, + "grad_norm": 1.6656159162521362, + "learning_rate": 3.5391990975747324e-05, + "loss": 0.7215, + "step": 5181 + }, + { + "epoch": 2.92272983643542, + "grad_norm": 1.1774442195892334, + "learning_rate": 3.538917089678511e-05, + "loss": 0.7602, + "step": 5182 + }, + { + "epoch": 2.9232938522278626, + "grad_norm": 1.0746279954910278, + "learning_rate": 3.53863508178229e-05, + "loss": 0.6923, + "step": 5183 + }, + { + "epoch": 2.9238578680203045, + "grad_norm": 1.7995394468307495, + "learning_rate": 3.538353073886069e-05, + "loss": 0.7812, + "step": 5184 + }, + { + "epoch": 2.9244218838127467, + "grad_norm": 1.3651039600372314, + "learning_rate": 3.538071065989848e-05, + "loss": 0.7895, + "step": 5185 + }, + { + "epoch": 2.924985899605189, + "grad_norm": 13.023636817932129, + "learning_rate": 3.537789058093627e-05, + "loss": 0.8097, + "step": 5186 + }, + { + "epoch": 2.925549915397631, + "grad_norm": 1.7551738023757935, + "learning_rate": 3.5375070501974057e-05, + "loss": 0.6424, + "step": 5187 + }, + { + "epoch": 2.9261139311900735, + "grad_norm": 1.2858790159225464, + "learning_rate": 3.537225042301184e-05, + "loss": 0.7304, + "step": 5188 + }, + { + "epoch": 2.9266779469825153, + "grad_norm": 1.1111400127410889, + "learning_rate": 3.5369430344049634e-05, + "loss": 0.7517, + "step": 5189 + }, + { + "epoch": 2.927241962774958, + "grad_norm": 2.0502161979675293, + "learning_rate": 3.5366610265087426e-05, + "loss": 0.8825, + "step": 5190 + }, + { + "epoch": 2.9278059785673998, + "grad_norm": 2.3391733169555664, + "learning_rate": 3.536379018612521e-05, + "loss": 0.8696, + "step": 5191 + }, + { + "epoch": 2.928369994359842, + "grad_norm": 1.0278058052062988, + "learning_rate": 3.5360970107163e-05, + "loss": 0.6933, + "step": 5192 + }, + { + "epoch": 2.9289340101522843, + "grad_norm": 1.4589496850967407, + "learning_rate": 3.5358150028200796e-05, + "loss": 0.7978, + "step": 5193 + }, + { + "epoch": 2.9294980259447265, + "grad_norm": 1.5229746103286743, + "learning_rate": 3.535532994923858e-05, + "loss": 0.7685, + "step": 5194 + }, + { + "epoch": 2.9300620417371688, + "grad_norm": 2.2416131496429443, + "learning_rate": 3.535250987027637e-05, + "loss": 0.81, + "step": 5195 + }, + { + "epoch": 2.9306260575296106, + "grad_norm": 1.2237838506698608, + "learning_rate": 3.534968979131416e-05, + "loss": 0.7801, + "step": 5196 + }, + { + "epoch": 2.9311900733220533, + "grad_norm": 1.7729389667510986, + "learning_rate": 3.534686971235195e-05, + "loss": 0.8152, + "step": 5197 + }, + { + "epoch": 2.931754089114495, + "grad_norm": 1.609981656074524, + "learning_rate": 3.534404963338974e-05, + "loss": 0.9111, + "step": 5198 + }, + { + "epoch": 2.9323181049069373, + "grad_norm": 2.176649332046509, + "learning_rate": 3.534122955442752e-05, + "loss": 0.8488, + "step": 5199 + }, + { + "epoch": 2.9328821206993796, + "grad_norm": 2.571518659591675, + "learning_rate": 3.5338409475465314e-05, + "loss": 0.8383, + "step": 5200 + }, + { + "epoch": 2.933446136491822, + "grad_norm": 1.7831530570983887, + "learning_rate": 3.5335589396503106e-05, + "loss": 0.8167, + "step": 5201 + }, + { + "epoch": 2.934010152284264, + "grad_norm": 2.1890878677368164, + "learning_rate": 3.533276931754089e-05, + "loss": 0.9588, + "step": 5202 + }, + { + "epoch": 2.934574168076706, + "grad_norm": 1.635040283203125, + "learning_rate": 3.532994923857868e-05, + "loss": 0.7904, + "step": 5203 + }, + { + "epoch": 2.9351381838691486, + "grad_norm": 0.945482075214386, + "learning_rate": 3.5327129159616476e-05, + "loss": 0.7046, + "step": 5204 + }, + { + "epoch": 2.9357021996615904, + "grad_norm": 1.6750471591949463, + "learning_rate": 3.532430908065426e-05, + "loss": 0.7945, + "step": 5205 + }, + { + "epoch": 2.9362662154540327, + "grad_norm": 1.1014727354049683, + "learning_rate": 3.532148900169205e-05, + "loss": 0.8424, + "step": 5206 + }, + { + "epoch": 2.936830231246475, + "grad_norm": 1.552628517150879, + "learning_rate": 3.531866892272984e-05, + "loss": 0.7606, + "step": 5207 + }, + { + "epoch": 2.937394247038917, + "grad_norm": 1.4469074010849, + "learning_rate": 3.531584884376763e-05, + "loss": 0.8393, + "step": 5208 + }, + { + "epoch": 2.9379582628313594, + "grad_norm": 1.1769939661026, + "learning_rate": 3.531302876480542e-05, + "loss": 0.7168, + "step": 5209 + }, + { + "epoch": 2.938522278623801, + "grad_norm": 2.4828271865844727, + "learning_rate": 3.53102086858432e-05, + "loss": 0.9584, + "step": 5210 + }, + { + "epoch": 2.939086294416244, + "grad_norm": 1.8197894096374512, + "learning_rate": 3.5307388606880994e-05, + "loss": 0.724, + "step": 5211 + }, + { + "epoch": 2.9396503102086857, + "grad_norm": 0.9775641560554504, + "learning_rate": 3.5304568527918787e-05, + "loss": 0.7796, + "step": 5212 + }, + { + "epoch": 2.940214326001128, + "grad_norm": 1.699252724647522, + "learning_rate": 3.530174844895657e-05, + "loss": 0.8142, + "step": 5213 + }, + { + "epoch": 2.9407783417935702, + "grad_norm": 2.0897886753082275, + "learning_rate": 3.529892836999436e-05, + "loss": 0.8345, + "step": 5214 + }, + { + "epoch": 2.9413423575860125, + "grad_norm": 0.9866510033607483, + "learning_rate": 3.529610829103215e-05, + "loss": 0.7052, + "step": 5215 + }, + { + "epoch": 2.9419063733784547, + "grad_norm": 1.2223873138427734, + "learning_rate": 3.529328821206994e-05, + "loss": 0.7149, + "step": 5216 + }, + { + "epoch": 2.9424703891708965, + "grad_norm": 1.5809615850448608, + "learning_rate": 3.529046813310773e-05, + "loss": 0.7829, + "step": 5217 + }, + { + "epoch": 2.9430344049633392, + "grad_norm": 1.8336679935455322, + "learning_rate": 3.528764805414552e-05, + "loss": 0.796, + "step": 5218 + }, + { + "epoch": 2.943598420755781, + "grad_norm": 1.114122748374939, + "learning_rate": 3.5284827975183305e-05, + "loss": 0.7035, + "step": 5219 + }, + { + "epoch": 2.9441624365482233, + "grad_norm": 1.8864521980285645, + "learning_rate": 3.52820078962211e-05, + "loss": 0.8826, + "step": 5220 + }, + { + "epoch": 2.9447264523406655, + "grad_norm": 1.9164211750030518, + "learning_rate": 3.527918781725888e-05, + "loss": 0.8513, + "step": 5221 + }, + { + "epoch": 2.945290468133108, + "grad_norm": 1.3842003345489502, + "learning_rate": 3.5276367738296675e-05, + "loss": 0.8067, + "step": 5222 + }, + { + "epoch": 2.94585448392555, + "grad_norm": 1.0745970010757446, + "learning_rate": 3.527354765933446e-05, + "loss": 0.6319, + "step": 5223 + }, + { + "epoch": 2.946418499717992, + "grad_norm": 1.2931088209152222, + "learning_rate": 3.527072758037225e-05, + "loss": 0.726, + "step": 5224 + }, + { + "epoch": 2.9469825155104346, + "grad_norm": 2.161780834197998, + "learning_rate": 3.5267907501410044e-05, + "loss": 0.8418, + "step": 5225 + }, + { + "epoch": 2.9475465313028764, + "grad_norm": 1.0568071603775024, + "learning_rate": 3.526508742244783e-05, + "loss": 0.7322, + "step": 5226 + }, + { + "epoch": 2.9481105470953186, + "grad_norm": 1.1529297828674316, + "learning_rate": 3.5262267343485615e-05, + "loss": 0.7283, + "step": 5227 + }, + { + "epoch": 2.948674562887761, + "grad_norm": 1.2773123979568481, + "learning_rate": 3.525944726452341e-05, + "loss": 0.7484, + "step": 5228 + }, + { + "epoch": 2.949238578680203, + "grad_norm": 1.803906798362732, + "learning_rate": 3.52566271855612e-05, + "loss": 0.8636, + "step": 5229 + }, + { + "epoch": 2.9498025944726454, + "grad_norm": 1.143790602684021, + "learning_rate": 3.5253807106598985e-05, + "loss": 0.6865, + "step": 5230 + }, + { + "epoch": 2.950366610265087, + "grad_norm": 1.2156388759613037, + "learning_rate": 3.525098702763678e-05, + "loss": 0.7237, + "step": 5231 + }, + { + "epoch": 2.95093062605753, + "grad_norm": 1.2641326189041138, + "learning_rate": 3.524816694867456e-05, + "loss": 0.6896, + "step": 5232 + }, + { + "epoch": 2.9514946418499717, + "grad_norm": 1.5669670104980469, + "learning_rate": 3.5245346869712355e-05, + "loss": 0.7096, + "step": 5233 + }, + { + "epoch": 2.952058657642414, + "grad_norm": 4.139676094055176, + "learning_rate": 3.524252679075014e-05, + "loss": 0.7752, + "step": 5234 + }, + { + "epoch": 2.952622673434856, + "grad_norm": 1.4616827964782715, + "learning_rate": 3.523970671178793e-05, + "loss": 0.7234, + "step": 5235 + }, + { + "epoch": 2.9531866892272984, + "grad_norm": 1.263140082359314, + "learning_rate": 3.5236886632825724e-05, + "loss": 0.749, + "step": 5236 + }, + { + "epoch": 2.9537507050197407, + "grad_norm": 3.95914626121521, + "learning_rate": 3.523406655386351e-05, + "loss": 0.958, + "step": 5237 + }, + { + "epoch": 2.9543147208121825, + "grad_norm": 1.5178720951080322, + "learning_rate": 3.5231246474901295e-05, + "loss": 0.7877, + "step": 5238 + }, + { + "epoch": 2.954878736604625, + "grad_norm": 1.6300195455551147, + "learning_rate": 3.522842639593909e-05, + "loss": 0.7577, + "step": 5239 + }, + { + "epoch": 2.955442752397067, + "grad_norm": 1.0107141733169556, + "learning_rate": 3.522560631697688e-05, + "loss": 0.6674, + "step": 5240 + }, + { + "epoch": 2.9560067681895092, + "grad_norm": 1.3513516187667847, + "learning_rate": 3.5222786238014665e-05, + "loss": 0.929, + "step": 5241 + }, + { + "epoch": 2.9565707839819515, + "grad_norm": 1.255391240119934, + "learning_rate": 3.521996615905245e-05, + "loss": 0.6484, + "step": 5242 + }, + { + "epoch": 2.9571347997743938, + "grad_norm": 1.0793999433517456, + "learning_rate": 3.521714608009025e-05, + "loss": 0.6331, + "step": 5243 + }, + { + "epoch": 2.957698815566836, + "grad_norm": 1.6223173141479492, + "learning_rate": 3.5214326001128035e-05, + "loss": 0.7234, + "step": 5244 + }, + { + "epoch": 2.958262831359278, + "grad_norm": 1.5735793113708496, + "learning_rate": 3.521150592216582e-05, + "loss": 0.8035, + "step": 5245 + }, + { + "epoch": 2.9588268471517205, + "grad_norm": 1.1522771120071411, + "learning_rate": 3.520868584320361e-05, + "loss": 0.6631, + "step": 5246 + }, + { + "epoch": 2.9593908629441623, + "grad_norm": 2.3131086826324463, + "learning_rate": 3.5205865764241405e-05, + "loss": 0.7808, + "step": 5247 + }, + { + "epoch": 2.9599548787366046, + "grad_norm": 1.1160589456558228, + "learning_rate": 3.520304568527919e-05, + "loss": 0.7079, + "step": 5248 + }, + { + "epoch": 2.960518894529047, + "grad_norm": 1.165769100189209, + "learning_rate": 3.5200225606316975e-05, + "loss": 0.6986, + "step": 5249 + }, + { + "epoch": 2.961082910321489, + "grad_norm": 1.0224900245666504, + "learning_rate": 3.519740552735477e-05, + "loss": 0.6802, + "step": 5250 + }, + { + "epoch": 2.9616469261139313, + "grad_norm": 1.5284881591796875, + "learning_rate": 3.519458544839256e-05, + "loss": 0.8361, + "step": 5251 + }, + { + "epoch": 2.962210941906373, + "grad_norm": 1.2513470649719238, + "learning_rate": 3.5191765369430345e-05, + "loss": 0.6604, + "step": 5252 + }, + { + "epoch": 2.962774957698816, + "grad_norm": 1.4891295433044434, + "learning_rate": 3.518894529046813e-05, + "loss": 0.8049, + "step": 5253 + }, + { + "epoch": 2.9633389734912576, + "grad_norm": 1.053916335105896, + "learning_rate": 3.518612521150592e-05, + "loss": 0.6579, + "step": 5254 + }, + { + "epoch": 2.9639029892837, + "grad_norm": 2.7599406242370605, + "learning_rate": 3.5183305132543715e-05, + "loss": 0.67, + "step": 5255 + }, + { + "epoch": 2.964467005076142, + "grad_norm": 1.1556950807571411, + "learning_rate": 3.51804850535815e-05, + "loss": 0.7714, + "step": 5256 + }, + { + "epoch": 2.9650310208685844, + "grad_norm": 1.6568654775619507, + "learning_rate": 3.517766497461929e-05, + "loss": 0.8774, + "step": 5257 + }, + { + "epoch": 2.9655950366610266, + "grad_norm": 2.67429256439209, + "learning_rate": 3.517484489565708e-05, + "loss": 0.8026, + "step": 5258 + }, + { + "epoch": 2.9661590524534684, + "grad_norm": 1.520246982574463, + "learning_rate": 3.517202481669487e-05, + "loss": 0.7374, + "step": 5259 + }, + { + "epoch": 2.966723068245911, + "grad_norm": 1.421830654144287, + "learning_rate": 3.5169204737732656e-05, + "loss": 0.7569, + "step": 5260 + }, + { + "epoch": 2.967287084038353, + "grad_norm": 1.3958945274353027, + "learning_rate": 3.516638465877045e-05, + "loss": 0.7519, + "step": 5261 + }, + { + "epoch": 2.967851099830795, + "grad_norm": 2.151994228363037, + "learning_rate": 3.516356457980823e-05, + "loss": 0.8076, + "step": 5262 + }, + { + "epoch": 2.9684151156232375, + "grad_norm": 4.21859073638916, + "learning_rate": 3.5160744500846025e-05, + "loss": 0.7709, + "step": 5263 + }, + { + "epoch": 2.9689791314156797, + "grad_norm": 1.0102312564849854, + "learning_rate": 3.515792442188382e-05, + "loss": 0.7396, + "step": 5264 + }, + { + "epoch": 2.969543147208122, + "grad_norm": 1.0199308395385742, + "learning_rate": 3.51551043429216e-05, + "loss": 0.7042, + "step": 5265 + }, + { + "epoch": 2.9701071630005638, + "grad_norm": 1.861587405204773, + "learning_rate": 3.5152284263959395e-05, + "loss": 0.7995, + "step": 5266 + }, + { + "epoch": 2.9706711787930065, + "grad_norm": 1.499611258506775, + "learning_rate": 3.514946418499718e-05, + "loss": 0.7544, + "step": 5267 + }, + { + "epoch": 2.9712351945854483, + "grad_norm": 1.570209264755249, + "learning_rate": 3.514664410603497e-05, + "loss": 0.848, + "step": 5268 + }, + { + "epoch": 2.9717992103778905, + "grad_norm": 1.4099538326263428, + "learning_rate": 3.514382402707276e-05, + "loss": 0.8049, + "step": 5269 + }, + { + "epoch": 2.9723632261703328, + "grad_norm": 2.3325257301330566, + "learning_rate": 3.514100394811055e-05, + "loss": 0.9516, + "step": 5270 + }, + { + "epoch": 2.972927241962775, + "grad_norm": 1.7486168146133423, + "learning_rate": 3.5138183869148336e-05, + "loss": 0.8836, + "step": 5271 + }, + { + "epoch": 2.9734912577552173, + "grad_norm": 3.3993289470672607, + "learning_rate": 3.513536379018613e-05, + "loss": 0.7946, + "step": 5272 + }, + { + "epoch": 2.974055273547659, + "grad_norm": 2.0125832557678223, + "learning_rate": 3.513254371122391e-05, + "loss": 0.7262, + "step": 5273 + }, + { + "epoch": 2.974619289340102, + "grad_norm": 1.2295492887496948, + "learning_rate": 3.5129723632261706e-05, + "loss": 0.8158, + "step": 5274 + }, + { + "epoch": 2.9751833051325436, + "grad_norm": 1.5148173570632935, + "learning_rate": 3.51269035532995e-05, + "loss": 0.7758, + "step": 5275 + }, + { + "epoch": 2.975747320924986, + "grad_norm": 1.698516607284546, + "learning_rate": 3.512408347433728e-05, + "loss": 0.6696, + "step": 5276 + }, + { + "epoch": 2.976311336717428, + "grad_norm": 1.048239827156067, + "learning_rate": 3.512126339537507e-05, + "loss": 0.7314, + "step": 5277 + }, + { + "epoch": 2.9768753525098703, + "grad_norm": 8.78715991973877, + "learning_rate": 3.511844331641286e-05, + "loss": 0.825, + "step": 5278 + }, + { + "epoch": 2.9774393683023126, + "grad_norm": 1.9710824489593506, + "learning_rate": 3.511562323745065e-05, + "loss": 0.8371, + "step": 5279 + }, + { + "epoch": 2.9780033840947544, + "grad_norm": 1.1735038757324219, + "learning_rate": 3.511280315848844e-05, + "loss": 0.7095, + "step": 5280 + }, + { + "epoch": 2.978567399887197, + "grad_norm": 2.032517671585083, + "learning_rate": 3.5109983079526224e-05, + "loss": 0.7593, + "step": 5281 + }, + { + "epoch": 2.979131415679639, + "grad_norm": 1.1447546482086182, + "learning_rate": 3.510716300056402e-05, + "loss": 0.8285, + "step": 5282 + }, + { + "epoch": 2.979695431472081, + "grad_norm": 1.2308675050735474, + "learning_rate": 3.510434292160181e-05, + "loss": 0.8427, + "step": 5283 + }, + { + "epoch": 2.9802594472645234, + "grad_norm": 2.5067665576934814, + "learning_rate": 3.5101522842639593e-05, + "loss": 0.8676, + "step": 5284 + }, + { + "epoch": 2.9808234630569657, + "grad_norm": 0.9244096875190735, + "learning_rate": 3.5098702763677386e-05, + "loss": 0.7117, + "step": 5285 + }, + { + "epoch": 2.981387478849408, + "grad_norm": 1.2365413904190063, + "learning_rate": 3.509588268471518e-05, + "loss": 0.8309, + "step": 5286 + }, + { + "epoch": 2.9819514946418497, + "grad_norm": 2.059475898742676, + "learning_rate": 3.509306260575296e-05, + "loss": 0.7851, + "step": 5287 + }, + { + "epoch": 2.9825155104342924, + "grad_norm": 0.9832132458686829, + "learning_rate": 3.509024252679075e-05, + "loss": 0.7114, + "step": 5288 + }, + { + "epoch": 2.9830795262267342, + "grad_norm": 1.3637841939926147, + "learning_rate": 3.508742244782854e-05, + "loss": 0.8734, + "step": 5289 + }, + { + "epoch": 2.9836435420191765, + "grad_norm": 1.1595669984817505, + "learning_rate": 3.508460236886633e-05, + "loss": 0.701, + "step": 5290 + }, + { + "epoch": 2.9842075578116187, + "grad_norm": 1.4550303220748901, + "learning_rate": 3.508178228990412e-05, + "loss": 0.8198, + "step": 5291 + }, + { + "epoch": 2.984771573604061, + "grad_norm": 1.1803655624389648, + "learning_rate": 3.5078962210941904e-05, + "loss": 0.801, + "step": 5292 + }, + { + "epoch": 2.9853355893965032, + "grad_norm": 0.982180655002594, + "learning_rate": 3.5076142131979696e-05, + "loss": 0.696, + "step": 5293 + }, + { + "epoch": 2.985899605188945, + "grad_norm": 2.2853503227233887, + "learning_rate": 3.507332205301749e-05, + "loss": 0.8831, + "step": 5294 + }, + { + "epoch": 2.9864636209813877, + "grad_norm": 1.9145252704620361, + "learning_rate": 3.5070501974055274e-05, + "loss": 0.6633, + "step": 5295 + }, + { + "epoch": 2.9870276367738295, + "grad_norm": 1.005075454711914, + "learning_rate": 3.5067681895093066e-05, + "loss": 0.7226, + "step": 5296 + }, + { + "epoch": 2.987591652566272, + "grad_norm": 1.9665803909301758, + "learning_rate": 3.506486181613085e-05, + "loss": 0.888, + "step": 5297 + }, + { + "epoch": 2.988155668358714, + "grad_norm": 1.7165381908416748, + "learning_rate": 3.5062041737168643e-05, + "loss": 0.6631, + "step": 5298 + }, + { + "epoch": 2.9887196841511563, + "grad_norm": 1.8999887704849243, + "learning_rate": 3.505922165820643e-05, + "loss": 0.8608, + "step": 5299 + }, + { + "epoch": 2.9892836999435985, + "grad_norm": 1.1669169664382935, + "learning_rate": 3.505640157924422e-05, + "loss": 0.854, + "step": 5300 + }, + { + "epoch": 2.9898477157360404, + "grad_norm": 0.7894555330276489, + "learning_rate": 3.505358150028201e-05, + "loss": 0.663, + "step": 5301 + }, + { + "epoch": 2.990411731528483, + "grad_norm": 1.2671810388565063, + "learning_rate": 3.50507614213198e-05, + "loss": 0.7245, + "step": 5302 + }, + { + "epoch": 2.990975747320925, + "grad_norm": 2.000046968460083, + "learning_rate": 3.504794134235759e-05, + "loss": 0.7852, + "step": 5303 + }, + { + "epoch": 2.991539763113367, + "grad_norm": 1.1048399209976196, + "learning_rate": 3.5045121263395376e-05, + "loss": 0.7449, + "step": 5304 + }, + { + "epoch": 2.9921037789058094, + "grad_norm": 1.4673618078231812, + "learning_rate": 3.504230118443317e-05, + "loss": 0.8259, + "step": 5305 + }, + { + "epoch": 2.9926677946982516, + "grad_norm": 1.2701573371887207, + "learning_rate": 3.5039481105470954e-05, + "loss": 0.8294, + "step": 5306 + }, + { + "epoch": 2.993231810490694, + "grad_norm": 1.8400615453720093, + "learning_rate": 3.5036661026508746e-05, + "loss": 0.7518, + "step": 5307 + }, + { + "epoch": 2.9937958262831357, + "grad_norm": 1.3473267555236816, + "learning_rate": 3.503384094754653e-05, + "loss": 0.7133, + "step": 5308 + }, + { + "epoch": 2.9943598420755784, + "grad_norm": 1.967867374420166, + "learning_rate": 3.5031020868584324e-05, + "loss": 0.9578, + "step": 5309 + }, + { + "epoch": 2.99492385786802, + "grad_norm": 1.7026355266571045, + "learning_rate": 3.502820078962211e-05, + "loss": 0.8578, + "step": 5310 + }, + { + "epoch": 2.9954878736604624, + "grad_norm": 0.8385064005851746, + "learning_rate": 3.50253807106599e-05, + "loss": 0.6766, + "step": 5311 + }, + { + "epoch": 2.9960518894529047, + "grad_norm": 0.9732752442359924, + "learning_rate": 3.5022560631697687e-05, + "loss": 0.7096, + "step": 5312 + }, + { + "epoch": 2.996615905245347, + "grad_norm": 1.4746527671813965, + "learning_rate": 3.501974055273548e-05, + "loss": 0.7484, + "step": 5313 + }, + { + "epoch": 2.997179921037789, + "grad_norm": 1.147400140762329, + "learning_rate": 3.501692047377327e-05, + "loss": 0.7663, + "step": 5314 + }, + { + "epoch": 2.997743936830231, + "grad_norm": 0.9526527523994446, + "learning_rate": 3.5014100394811056e-05, + "loss": 0.7648, + "step": 5315 + }, + { + "epoch": 2.9983079526226737, + "grad_norm": 1.1990350484848022, + "learning_rate": 3.501128031584884e-05, + "loss": 0.7364, + "step": 5316 + }, + { + "epoch": 2.9988719684151155, + "grad_norm": 1.5398015975952148, + "learning_rate": 3.5008460236886634e-05, + "loss": 0.8201, + "step": 5317 + }, + { + "epoch": 2.9994359842075577, + "grad_norm": 1.5052604675292969, + "learning_rate": 3.5005640157924426e-05, + "loss": 0.6483, + "step": 5318 + }, + { + "epoch": 3.0, + "grad_norm": 2.337231397628784, + "learning_rate": 3.500282007896221e-05, + "loss": 0.8242, + "step": 5319 + }, + { + "epoch": 3.0005640157924423, + "grad_norm": 1.5858460664749146, + "learning_rate": 3.5e-05, + "loss": 0.7913, + "step": 5320 + }, + { + "epoch": 3.0011280315848845, + "grad_norm": 1.0854666233062744, + "learning_rate": 3.4997179921037796e-05, + "loss": 0.8109, + "step": 5321 + }, + { + "epoch": 3.0016920473773268, + "grad_norm": 2.374823808670044, + "learning_rate": 3.499435984207558e-05, + "loss": 0.8204, + "step": 5322 + }, + { + "epoch": 3.0022560631697686, + "grad_norm": 1.2633973360061646, + "learning_rate": 3.499153976311337e-05, + "loss": 0.7854, + "step": 5323 + }, + { + "epoch": 3.002820078962211, + "grad_norm": 2.416114091873169, + "learning_rate": 3.498871968415116e-05, + "loss": 0.6802, + "step": 5324 + }, + { + "epoch": 3.003384094754653, + "grad_norm": 1.368937611579895, + "learning_rate": 3.498589960518895e-05, + "loss": 0.7602, + "step": 5325 + }, + { + "epoch": 3.0039481105470953, + "grad_norm": 1.3965678215026855, + "learning_rate": 3.4983079526226736e-05, + "loss": 0.6994, + "step": 5326 + }, + { + "epoch": 3.0045121263395376, + "grad_norm": 4.247986793518066, + "learning_rate": 3.498025944726452e-05, + "loss": 0.974, + "step": 5327 + }, + { + "epoch": 3.00507614213198, + "grad_norm": 1.9941530227661133, + "learning_rate": 3.4977439368302314e-05, + "loss": 0.7331, + "step": 5328 + }, + { + "epoch": 3.005640157924422, + "grad_norm": 1.1866850852966309, + "learning_rate": 3.4974619289340106e-05, + "loss": 0.6897, + "step": 5329 + }, + { + "epoch": 3.006204173716864, + "grad_norm": 1.2376821041107178, + "learning_rate": 3.497179921037789e-05, + "loss": 0.7364, + "step": 5330 + }, + { + "epoch": 3.006768189509306, + "grad_norm": 1.8085728883743286, + "learning_rate": 3.496897913141568e-05, + "loss": 0.8926, + "step": 5331 + }, + { + "epoch": 3.0073322053017484, + "grad_norm": 1.1166876554489136, + "learning_rate": 3.496615905245347e-05, + "loss": 0.7649, + "step": 5332 + }, + { + "epoch": 3.0078962210941906, + "grad_norm": 0.875982403755188, + "learning_rate": 3.496333897349126e-05, + "loss": 0.6267, + "step": 5333 + }, + { + "epoch": 3.008460236886633, + "grad_norm": 1.1906152963638306, + "learning_rate": 3.496051889452905e-05, + "loss": 0.8052, + "step": 5334 + }, + { + "epoch": 3.009024252679075, + "grad_norm": 1.7327971458435059, + "learning_rate": 3.495769881556684e-05, + "loss": 0.8382, + "step": 5335 + }, + { + "epoch": 3.0095882684715174, + "grad_norm": 1.2347124814987183, + "learning_rate": 3.495487873660463e-05, + "loss": 0.7323, + "step": 5336 + }, + { + "epoch": 3.010152284263959, + "grad_norm": 3.0648391246795654, + "learning_rate": 3.495205865764242e-05, + "loss": 0.7877, + "step": 5337 + }, + { + "epoch": 3.0107163000564015, + "grad_norm": 1.440242886543274, + "learning_rate": 3.49492385786802e-05, + "loss": 0.6754, + "step": 5338 + }, + { + "epoch": 3.0112803158488437, + "grad_norm": 1.3008722066879272, + "learning_rate": 3.4946418499717994e-05, + "loss": 0.7293, + "step": 5339 + }, + { + "epoch": 3.011844331641286, + "grad_norm": 1.334465503692627, + "learning_rate": 3.4943598420755786e-05, + "loss": 0.7597, + "step": 5340 + }, + { + "epoch": 3.012408347433728, + "grad_norm": 0.9899925589561462, + "learning_rate": 3.494077834179357e-05, + "loss": 0.6823, + "step": 5341 + }, + { + "epoch": 3.0129723632261705, + "grad_norm": 1.394951343536377, + "learning_rate": 3.4937958262831364e-05, + "loss": 0.8111, + "step": 5342 + }, + { + "epoch": 3.0135363790186127, + "grad_norm": 1.559773564338684, + "learning_rate": 3.493513818386915e-05, + "loss": 0.7215, + "step": 5343 + }, + { + "epoch": 3.0141003948110545, + "grad_norm": 2.080475330352783, + "learning_rate": 3.493231810490694e-05, + "loss": 0.8507, + "step": 5344 + }, + { + "epoch": 3.0146644106034968, + "grad_norm": 1.3852298259735107, + "learning_rate": 3.492949802594473e-05, + "loss": 0.7583, + "step": 5345 + }, + { + "epoch": 3.015228426395939, + "grad_norm": 1.088636875152588, + "learning_rate": 3.492667794698252e-05, + "loss": 0.7168, + "step": 5346 + }, + { + "epoch": 3.0157924421883813, + "grad_norm": 1.3651378154754639, + "learning_rate": 3.4923857868020305e-05, + "loss": 0.7079, + "step": 5347 + }, + { + "epoch": 3.0163564579808235, + "grad_norm": 1.0507100820541382, + "learning_rate": 3.49210377890581e-05, + "loss": 0.8124, + "step": 5348 + }, + { + "epoch": 3.0169204737732658, + "grad_norm": 1.9854308366775513, + "learning_rate": 3.491821771009588e-05, + "loss": 0.8258, + "step": 5349 + }, + { + "epoch": 3.017484489565708, + "grad_norm": 1.2457563877105713, + "learning_rate": 3.4915397631133674e-05, + "loss": 0.7256, + "step": 5350 + }, + { + "epoch": 3.01804850535815, + "grad_norm": 1.6448336839675903, + "learning_rate": 3.491257755217146e-05, + "loss": 0.7364, + "step": 5351 + }, + { + "epoch": 3.018612521150592, + "grad_norm": 1.0115329027175903, + "learning_rate": 3.490975747320925e-05, + "loss": 0.772, + "step": 5352 + }, + { + "epoch": 3.0191765369430343, + "grad_norm": 1.0776920318603516, + "learning_rate": 3.4906937394247044e-05, + "loss": 0.723, + "step": 5353 + }, + { + "epoch": 3.0197405527354766, + "grad_norm": 1.9401233196258545, + "learning_rate": 3.490411731528483e-05, + "loss": 0.7406, + "step": 5354 + }, + { + "epoch": 3.020304568527919, + "grad_norm": 1.129099726676941, + "learning_rate": 3.4901297236322615e-05, + "loss": 0.7533, + "step": 5355 + }, + { + "epoch": 3.020868584320361, + "grad_norm": 1.1422474384307861, + "learning_rate": 3.489847715736041e-05, + "loss": 0.7298, + "step": 5356 + }, + { + "epoch": 3.0214326001128033, + "grad_norm": 1.4060860872268677, + "learning_rate": 3.48956570783982e-05, + "loss": 0.6896, + "step": 5357 + }, + { + "epoch": 3.021996615905245, + "grad_norm": 1.1049646139144897, + "learning_rate": 3.4892836999435985e-05, + "loss": 0.7378, + "step": 5358 + }, + { + "epoch": 3.0225606316976874, + "grad_norm": 1.2888001203536987, + "learning_rate": 3.489001692047377e-05, + "loss": 0.695, + "step": 5359 + }, + { + "epoch": 3.0231246474901297, + "grad_norm": 1.2685937881469727, + "learning_rate": 3.488719684151157e-05, + "loss": 0.7009, + "step": 5360 + }, + { + "epoch": 3.023688663282572, + "grad_norm": 1.341180443763733, + "learning_rate": 3.4884376762549355e-05, + "loss": 0.6699, + "step": 5361 + }, + { + "epoch": 3.024252679075014, + "grad_norm": 1.1595600843429565, + "learning_rate": 3.488155668358714e-05, + "loss": 0.7309, + "step": 5362 + }, + { + "epoch": 3.0248166948674564, + "grad_norm": 1.6280285120010376, + "learning_rate": 3.4878736604624925e-05, + "loss": 0.6294, + "step": 5363 + }, + { + "epoch": 3.0253807106598987, + "grad_norm": 3.269077777862549, + "learning_rate": 3.4875916525662724e-05, + "loss": 0.7168, + "step": 5364 + }, + { + "epoch": 3.0259447264523405, + "grad_norm": 1.61896550655365, + "learning_rate": 3.487309644670051e-05, + "loss": 0.8295, + "step": 5365 + }, + { + "epoch": 3.0265087422447827, + "grad_norm": 1.4897775650024414, + "learning_rate": 3.4870276367738295e-05, + "loss": 0.784, + "step": 5366 + }, + { + "epoch": 3.027072758037225, + "grad_norm": 2.292579412460327, + "learning_rate": 3.486745628877609e-05, + "loss": 0.8107, + "step": 5367 + }, + { + "epoch": 3.0276367738296672, + "grad_norm": 1.9097257852554321, + "learning_rate": 3.486463620981388e-05, + "loss": 0.729, + "step": 5368 + }, + { + "epoch": 3.0282007896221095, + "grad_norm": 2.0286335945129395, + "learning_rate": 3.4861816130851665e-05, + "loss": 0.8703, + "step": 5369 + }, + { + "epoch": 3.0287648054145517, + "grad_norm": 1.4925894737243652, + "learning_rate": 3.485899605188945e-05, + "loss": 0.8562, + "step": 5370 + }, + { + "epoch": 3.029328821206994, + "grad_norm": 1.062406301498413, + "learning_rate": 3.485617597292725e-05, + "loss": 0.6821, + "step": 5371 + }, + { + "epoch": 3.029892836999436, + "grad_norm": 1.043054223060608, + "learning_rate": 3.4853355893965035e-05, + "loss": 0.6912, + "step": 5372 + }, + { + "epoch": 3.030456852791878, + "grad_norm": 1.0730137825012207, + "learning_rate": 3.485053581500282e-05, + "loss": 0.7021, + "step": 5373 + }, + { + "epoch": 3.0310208685843203, + "grad_norm": 1.224481225013733, + "learning_rate": 3.484771573604061e-05, + "loss": 0.7639, + "step": 5374 + }, + { + "epoch": 3.0315848843767625, + "grad_norm": 1.8901340961456299, + "learning_rate": 3.4844895657078404e-05, + "loss": 0.7189, + "step": 5375 + }, + { + "epoch": 3.032148900169205, + "grad_norm": 1.5139063596725464, + "learning_rate": 3.484207557811619e-05, + "loss": 0.7035, + "step": 5376 + }, + { + "epoch": 3.032712915961647, + "grad_norm": 1.839371681213379, + "learning_rate": 3.4839255499153975e-05, + "loss": 0.769, + "step": 5377 + }, + { + "epoch": 3.0332769317540893, + "grad_norm": 1.2557955980300903, + "learning_rate": 3.483643542019177e-05, + "loss": 0.7545, + "step": 5378 + }, + { + "epoch": 3.033840947546531, + "grad_norm": 1.9636085033416748, + "learning_rate": 3.483361534122956e-05, + "loss": 0.7717, + "step": 5379 + }, + { + "epoch": 3.0344049633389734, + "grad_norm": 1.6300138235092163, + "learning_rate": 3.4830795262267345e-05, + "loss": 0.8218, + "step": 5380 + }, + { + "epoch": 3.0349689791314156, + "grad_norm": 0.9967474341392517, + "learning_rate": 3.482797518330513e-05, + "loss": 0.687, + "step": 5381 + }, + { + "epoch": 3.035532994923858, + "grad_norm": 1.2988414764404297, + "learning_rate": 3.482515510434292e-05, + "loss": 0.6218, + "step": 5382 + }, + { + "epoch": 3.0360970107163, + "grad_norm": 2.1273486614227295, + "learning_rate": 3.4822335025380715e-05, + "loss": 0.8907, + "step": 5383 + }, + { + "epoch": 3.0366610265087424, + "grad_norm": 1.015977144241333, + "learning_rate": 3.48195149464185e-05, + "loss": 0.6914, + "step": 5384 + }, + { + "epoch": 3.0372250423011846, + "grad_norm": 1.8430590629577637, + "learning_rate": 3.481669486745629e-05, + "loss": 0.8536, + "step": 5385 + }, + { + "epoch": 3.0377890580936264, + "grad_norm": 1.9760929346084595, + "learning_rate": 3.481387478849408e-05, + "loss": 0.7274, + "step": 5386 + }, + { + "epoch": 3.0383530738860687, + "grad_norm": 2.026038408279419, + "learning_rate": 3.481105470953187e-05, + "loss": 0.8609, + "step": 5387 + }, + { + "epoch": 3.038917089678511, + "grad_norm": 1.8202383518218994, + "learning_rate": 3.4808234630569655e-05, + "loss": 0.8194, + "step": 5388 + }, + { + "epoch": 3.039481105470953, + "grad_norm": 1.4296754598617554, + "learning_rate": 3.480541455160745e-05, + "loss": 0.6967, + "step": 5389 + }, + { + "epoch": 3.0400451212633954, + "grad_norm": 1.9346568584442139, + "learning_rate": 3.480259447264523e-05, + "loss": 0.8191, + "step": 5390 + }, + { + "epoch": 3.0406091370558377, + "grad_norm": 1.0757452249526978, + "learning_rate": 3.4799774393683025e-05, + "loss": 0.7405, + "step": 5391 + }, + { + "epoch": 3.04117315284828, + "grad_norm": 0.9017037153244019, + "learning_rate": 3.479695431472082e-05, + "loss": 0.5842, + "step": 5392 + }, + { + "epoch": 3.0417371686407217, + "grad_norm": 1.2356514930725098, + "learning_rate": 3.47941342357586e-05, + "loss": 0.7793, + "step": 5393 + }, + { + "epoch": 3.042301184433164, + "grad_norm": 1.5035799741744995, + "learning_rate": 3.479131415679639e-05, + "loss": 0.7671, + "step": 5394 + }, + { + "epoch": 3.0428652002256062, + "grad_norm": 1.7543175220489502, + "learning_rate": 3.478849407783418e-05, + "loss": 0.7798, + "step": 5395 + }, + { + "epoch": 3.0434292160180485, + "grad_norm": 0.8197963237762451, + "learning_rate": 3.478567399887197e-05, + "loss": 0.6151, + "step": 5396 + }, + { + "epoch": 3.0439932318104908, + "grad_norm": 1.2008908987045288, + "learning_rate": 3.478285391990976e-05, + "loss": 0.806, + "step": 5397 + }, + { + "epoch": 3.044557247602933, + "grad_norm": 1.4444607496261597, + "learning_rate": 3.4780033840947543e-05, + "loss": 0.7432, + "step": 5398 + }, + { + "epoch": 3.0451212633953753, + "grad_norm": 1.9719561338424683, + "learning_rate": 3.4777213761985336e-05, + "loss": 0.8303, + "step": 5399 + }, + { + "epoch": 3.045685279187817, + "grad_norm": 1.3943098783493042, + "learning_rate": 3.477439368302313e-05, + "loss": 0.7114, + "step": 5400 + }, + { + "epoch": 3.0462492949802593, + "grad_norm": 1.0151946544647217, + "learning_rate": 3.477157360406091e-05, + "loss": 0.7336, + "step": 5401 + }, + { + "epoch": 3.0468133107727016, + "grad_norm": 2.478933572769165, + "learning_rate": 3.47687535250987e-05, + "loss": 0.9275, + "step": 5402 + }, + { + "epoch": 3.047377326565144, + "grad_norm": 1.6049935817718506, + "learning_rate": 3.47659334461365e-05, + "loss": 0.8518, + "step": 5403 + }, + { + "epoch": 3.047941342357586, + "grad_norm": 1.130598545074463, + "learning_rate": 3.476311336717428e-05, + "loss": 0.754, + "step": 5404 + }, + { + "epoch": 3.0485053581500283, + "grad_norm": 1.468432068824768, + "learning_rate": 3.476029328821207e-05, + "loss": 0.6934, + "step": 5405 + }, + { + "epoch": 3.0490693739424706, + "grad_norm": 0.9960411190986633, + "learning_rate": 3.475747320924986e-05, + "loss": 0.7011, + "step": 5406 + }, + { + "epoch": 3.0496333897349124, + "grad_norm": 2.1322999000549316, + "learning_rate": 3.475465313028765e-05, + "loss": 0.7594, + "step": 5407 + }, + { + "epoch": 3.0501974055273546, + "grad_norm": 1.4504964351654053, + "learning_rate": 3.475183305132544e-05, + "loss": 0.7473, + "step": 5408 + }, + { + "epoch": 3.050761421319797, + "grad_norm": 1.3814811706542969, + "learning_rate": 3.4749012972363224e-05, + "loss": 0.7734, + "step": 5409 + }, + { + "epoch": 3.051325437112239, + "grad_norm": 1.257180094718933, + "learning_rate": 3.474619289340102e-05, + "loss": 0.7269, + "step": 5410 + }, + { + "epoch": 3.0518894529046814, + "grad_norm": 0.9607113003730774, + "learning_rate": 3.474337281443881e-05, + "loss": 0.7471, + "step": 5411 + }, + { + "epoch": 3.0524534686971236, + "grad_norm": 0.9731194376945496, + "learning_rate": 3.474055273547659e-05, + "loss": 0.6777, + "step": 5412 + }, + { + "epoch": 3.053017484489566, + "grad_norm": 1.4041693210601807, + "learning_rate": 3.4737732656514386e-05, + "loss": 0.7256, + "step": 5413 + }, + { + "epoch": 3.0535815002820077, + "grad_norm": 1.7456165552139282, + "learning_rate": 3.473491257755218e-05, + "loss": 0.7014, + "step": 5414 + }, + { + "epoch": 3.05414551607445, + "grad_norm": 0.9378483891487122, + "learning_rate": 3.473209249858996e-05, + "loss": 0.5964, + "step": 5415 + }, + { + "epoch": 3.054709531866892, + "grad_norm": 1.1929296255111694, + "learning_rate": 3.472927241962775e-05, + "loss": 0.6997, + "step": 5416 + }, + { + "epoch": 3.0552735476593345, + "grad_norm": 1.7791422605514526, + "learning_rate": 3.472645234066554e-05, + "loss": 0.8344, + "step": 5417 + }, + { + "epoch": 3.0558375634517767, + "grad_norm": 1.255711317062378, + "learning_rate": 3.472363226170333e-05, + "loss": 0.7707, + "step": 5418 + }, + { + "epoch": 3.056401579244219, + "grad_norm": 3.4562838077545166, + "learning_rate": 3.472081218274112e-05, + "loss": 0.7934, + "step": 5419 + }, + { + "epoch": 3.056965595036661, + "grad_norm": 0.9877322912216187, + "learning_rate": 3.4717992103778904e-05, + "loss": 0.6271, + "step": 5420 + }, + { + "epoch": 3.057529610829103, + "grad_norm": 1.5152616500854492, + "learning_rate": 3.4715172024816696e-05, + "loss": 0.8267, + "step": 5421 + }, + { + "epoch": 3.0580936266215453, + "grad_norm": 1.2489594221115112, + "learning_rate": 3.471235194585449e-05, + "loss": 0.7392, + "step": 5422 + }, + { + "epoch": 3.0586576424139875, + "grad_norm": 1.822610855102539, + "learning_rate": 3.4709531866892273e-05, + "loss": 0.875, + "step": 5423 + }, + { + "epoch": 3.0592216582064298, + "grad_norm": 1.1351383924484253, + "learning_rate": 3.4706711787930066e-05, + "loss": 0.6993, + "step": 5424 + }, + { + "epoch": 3.059785673998872, + "grad_norm": 1.184533953666687, + "learning_rate": 3.470389170896785e-05, + "loss": 0.7731, + "step": 5425 + }, + { + "epoch": 3.0603496897913143, + "grad_norm": 1.3897420167922974, + "learning_rate": 3.470107163000564e-05, + "loss": 0.6896, + "step": 5426 + }, + { + "epoch": 3.0609137055837565, + "grad_norm": 1.2820795774459839, + "learning_rate": 3.469825155104343e-05, + "loss": 0.7981, + "step": 5427 + }, + { + "epoch": 3.0614777213761983, + "grad_norm": 1.5230038166046143, + "learning_rate": 3.469543147208122e-05, + "loss": 0.7328, + "step": 5428 + }, + { + "epoch": 3.0620417371686406, + "grad_norm": 1.2961490154266357, + "learning_rate": 3.4692611393119006e-05, + "loss": 0.6928, + "step": 5429 + }, + { + "epoch": 3.062605752961083, + "grad_norm": 1.5819569826126099, + "learning_rate": 3.46897913141568e-05, + "loss": 0.7693, + "step": 5430 + }, + { + "epoch": 3.063169768753525, + "grad_norm": 1.3373943567276, + "learning_rate": 3.468697123519459e-05, + "loss": 0.7158, + "step": 5431 + }, + { + "epoch": 3.0637337845459673, + "grad_norm": 1.233603596687317, + "learning_rate": 3.4684151156232376e-05, + "loss": 0.6583, + "step": 5432 + }, + { + "epoch": 3.0642978003384096, + "grad_norm": 1.0537523031234741, + "learning_rate": 3.468133107727016e-05, + "loss": 0.7998, + "step": 5433 + }, + { + "epoch": 3.064861816130852, + "grad_norm": 1.419140338897705, + "learning_rate": 3.4678510998307954e-05, + "loss": 0.8248, + "step": 5434 + }, + { + "epoch": 3.0654258319232937, + "grad_norm": 1.059381365776062, + "learning_rate": 3.4675690919345746e-05, + "loss": 0.7359, + "step": 5435 + }, + { + "epoch": 3.065989847715736, + "grad_norm": 1.6053274869918823, + "learning_rate": 3.467287084038353e-05, + "loss": 0.8264, + "step": 5436 + }, + { + "epoch": 3.066553863508178, + "grad_norm": 2.176851749420166, + "learning_rate": 3.467005076142132e-05, + "loss": 0.7259, + "step": 5437 + }, + { + "epoch": 3.0671178793006204, + "grad_norm": 2.2161078453063965, + "learning_rate": 3.466723068245911e-05, + "loss": 0.7667, + "step": 5438 + }, + { + "epoch": 3.0676818950930627, + "grad_norm": 1.3592857122421265, + "learning_rate": 3.46644106034969e-05, + "loss": 0.7986, + "step": 5439 + }, + { + "epoch": 3.068245910885505, + "grad_norm": 1.390512466430664, + "learning_rate": 3.4661590524534686e-05, + "loss": 0.8022, + "step": 5440 + }, + { + "epoch": 3.068809926677947, + "grad_norm": 1.1152993440628052, + "learning_rate": 3.465877044557248e-05, + "loss": 0.7007, + "step": 5441 + }, + { + "epoch": 3.069373942470389, + "grad_norm": 1.2477152347564697, + "learning_rate": 3.465595036661027e-05, + "loss": 0.7732, + "step": 5442 + }, + { + "epoch": 3.0699379582628312, + "grad_norm": 1.5782545804977417, + "learning_rate": 3.4653130287648056e-05, + "loss": 0.8291, + "step": 5443 + }, + { + "epoch": 3.0705019740552735, + "grad_norm": 1.3791054487228394, + "learning_rate": 3.465031020868584e-05, + "loss": 0.8325, + "step": 5444 + }, + { + "epoch": 3.0710659898477157, + "grad_norm": 1.6909189224243164, + "learning_rate": 3.4647490129723634e-05, + "loss": 0.8368, + "step": 5445 + }, + { + "epoch": 3.071630005640158, + "grad_norm": 1.4832054376602173, + "learning_rate": 3.4644670050761426e-05, + "loss": 0.7127, + "step": 5446 + }, + { + "epoch": 3.0721940214326002, + "grad_norm": 2.2897963523864746, + "learning_rate": 3.464184997179921e-05, + "loss": 0.8196, + "step": 5447 + }, + { + "epoch": 3.0727580372250425, + "grad_norm": 2.2038581371307373, + "learning_rate": 3.4639029892837e-05, + "loss": 0.7667, + "step": 5448 + }, + { + "epoch": 3.0733220530174843, + "grad_norm": 1.0458084344863892, + "learning_rate": 3.4636209813874796e-05, + "loss": 0.6923, + "step": 5449 + }, + { + "epoch": 3.0738860688099265, + "grad_norm": 1.4955493211746216, + "learning_rate": 3.463338973491258e-05, + "loss": 0.6706, + "step": 5450 + }, + { + "epoch": 3.074450084602369, + "grad_norm": 1.2853437662124634, + "learning_rate": 3.4630569655950367e-05, + "loss": 0.7498, + "step": 5451 + }, + { + "epoch": 3.075014100394811, + "grad_norm": 2.7196710109710693, + "learning_rate": 3.462774957698816e-05, + "loss": 0.6763, + "step": 5452 + }, + { + "epoch": 3.0755781161872533, + "grad_norm": 1.4584969282150269, + "learning_rate": 3.462492949802595e-05, + "loss": 0.669, + "step": 5453 + }, + { + "epoch": 3.0761421319796955, + "grad_norm": 1.68522310256958, + "learning_rate": 3.4622109419063736e-05, + "loss": 0.7377, + "step": 5454 + }, + { + "epoch": 3.076706147772138, + "grad_norm": 2.114872455596924, + "learning_rate": 3.461928934010152e-05, + "loss": 0.8419, + "step": 5455 + }, + { + "epoch": 3.0772701635645796, + "grad_norm": 1.6759276390075684, + "learning_rate": 3.4616469261139314e-05, + "loss": 0.8318, + "step": 5456 + }, + { + "epoch": 3.077834179357022, + "grad_norm": 1.3205817937850952, + "learning_rate": 3.4613649182177106e-05, + "loss": 0.7, + "step": 5457 + }, + { + "epoch": 3.078398195149464, + "grad_norm": 1.3711316585540771, + "learning_rate": 3.461082910321489e-05, + "loss": 0.7498, + "step": 5458 + }, + { + "epoch": 3.0789622109419064, + "grad_norm": 1.3174079656600952, + "learning_rate": 3.460800902425268e-05, + "loss": 0.7311, + "step": 5459 + }, + { + "epoch": 3.0795262267343486, + "grad_norm": 1.301640510559082, + "learning_rate": 3.460518894529047e-05, + "loss": 0.8637, + "step": 5460 + }, + { + "epoch": 3.080090242526791, + "grad_norm": 1.743299961090088, + "learning_rate": 3.460236886632826e-05, + "loss": 0.8509, + "step": 5461 + }, + { + "epoch": 3.080654258319233, + "grad_norm": 1.2909075021743774, + "learning_rate": 3.459954878736605e-05, + "loss": 0.6707, + "step": 5462 + }, + { + "epoch": 3.081218274111675, + "grad_norm": 2.680821180343628, + "learning_rate": 3.459672870840384e-05, + "loss": 0.7768, + "step": 5463 + }, + { + "epoch": 3.081782289904117, + "grad_norm": 1.1102393865585327, + "learning_rate": 3.4593908629441624e-05, + "loss": 0.7094, + "step": 5464 + }, + { + "epoch": 3.0823463056965594, + "grad_norm": 1.0106416940689087, + "learning_rate": 3.4591088550479416e-05, + "loss": 0.7747, + "step": 5465 + }, + { + "epoch": 3.0829103214890017, + "grad_norm": 1.1007487773895264, + "learning_rate": 3.45882684715172e-05, + "loss": 0.7192, + "step": 5466 + }, + { + "epoch": 3.083474337281444, + "grad_norm": 1.7588094472885132, + "learning_rate": 3.4585448392554994e-05, + "loss": 0.7747, + "step": 5467 + }, + { + "epoch": 3.084038353073886, + "grad_norm": 1.0854053497314453, + "learning_rate": 3.458262831359278e-05, + "loss": 0.7569, + "step": 5468 + }, + { + "epoch": 3.0846023688663284, + "grad_norm": 1.7974201440811157, + "learning_rate": 3.457980823463057e-05, + "loss": 0.8267, + "step": 5469 + }, + { + "epoch": 3.0851663846587702, + "grad_norm": 1.0102028846740723, + "learning_rate": 3.4576988155668364e-05, + "loss": 0.7124, + "step": 5470 + }, + { + "epoch": 3.0857304004512125, + "grad_norm": 1.2184334993362427, + "learning_rate": 3.457416807670615e-05, + "loss": 0.7609, + "step": 5471 + }, + { + "epoch": 3.0862944162436547, + "grad_norm": 2.5477092266082764, + "learning_rate": 3.4571347997743935e-05, + "loss": 0.7445, + "step": 5472 + }, + { + "epoch": 3.086858432036097, + "grad_norm": 1.0361618995666504, + "learning_rate": 3.456852791878173e-05, + "loss": 0.6979, + "step": 5473 + }, + { + "epoch": 3.0874224478285393, + "grad_norm": 1.1665396690368652, + "learning_rate": 3.456570783981952e-05, + "loss": 0.7515, + "step": 5474 + }, + { + "epoch": 3.0879864636209815, + "grad_norm": 1.234960675239563, + "learning_rate": 3.4562887760857304e-05, + "loss": 0.7847, + "step": 5475 + }, + { + "epoch": 3.0885504794134238, + "grad_norm": 1.8779327869415283, + "learning_rate": 3.45600676818951e-05, + "loss": 0.7412, + "step": 5476 + }, + { + "epoch": 3.0891144952058656, + "grad_norm": 2.0819783210754395, + "learning_rate": 3.455724760293288e-05, + "loss": 0.7539, + "step": 5477 + }, + { + "epoch": 3.089678510998308, + "grad_norm": 1.1904748678207397, + "learning_rate": 3.4554427523970674e-05, + "loss": 0.7334, + "step": 5478 + }, + { + "epoch": 3.09024252679075, + "grad_norm": 2.228386640548706, + "learning_rate": 3.455160744500846e-05, + "loss": 0.8927, + "step": 5479 + }, + { + "epoch": 3.0908065425831923, + "grad_norm": 0.981625497341156, + "learning_rate": 3.454878736604625e-05, + "loss": 0.6884, + "step": 5480 + }, + { + "epoch": 3.0913705583756346, + "grad_norm": 1.9318687915802002, + "learning_rate": 3.4545967287084044e-05, + "loss": 0.8693, + "step": 5481 + }, + { + "epoch": 3.091934574168077, + "grad_norm": 1.4372544288635254, + "learning_rate": 3.454314720812183e-05, + "loss": 0.734, + "step": 5482 + }, + { + "epoch": 3.092498589960519, + "grad_norm": 1.0504611730575562, + "learning_rate": 3.4540327129159615e-05, + "loss": 0.6941, + "step": 5483 + }, + { + "epoch": 3.093062605752961, + "grad_norm": 4.212674140930176, + "learning_rate": 3.453750705019741e-05, + "loss": 0.8944, + "step": 5484 + }, + { + "epoch": 3.093626621545403, + "grad_norm": 1.9469029903411865, + "learning_rate": 3.45346869712352e-05, + "loss": 0.7002, + "step": 5485 + }, + { + "epoch": 3.0941906373378454, + "grad_norm": 1.3332849740982056, + "learning_rate": 3.4531866892272985e-05, + "loss": 0.8483, + "step": 5486 + }, + { + "epoch": 3.0947546531302876, + "grad_norm": 1.3629783391952515, + "learning_rate": 3.452904681331077e-05, + "loss": 0.872, + "step": 5487 + }, + { + "epoch": 3.09531866892273, + "grad_norm": 1.1780983209609985, + "learning_rate": 3.452622673434857e-05, + "loss": 0.6667, + "step": 5488 + }, + { + "epoch": 3.095882684715172, + "grad_norm": 3.0073606967926025, + "learning_rate": 3.4523406655386354e-05, + "loss": 0.7578, + "step": 5489 + }, + { + "epoch": 3.0964467005076144, + "grad_norm": 1.4067445993423462, + "learning_rate": 3.452058657642414e-05, + "loss": 0.6613, + "step": 5490 + }, + { + "epoch": 3.097010716300056, + "grad_norm": 1.3034497499465942, + "learning_rate": 3.451776649746193e-05, + "loss": 0.8137, + "step": 5491 + }, + { + "epoch": 3.0975747320924985, + "grad_norm": 1.0912765264511108, + "learning_rate": 3.4514946418499724e-05, + "loss": 0.5958, + "step": 5492 + }, + { + "epoch": 3.0981387478849407, + "grad_norm": 1.9103387594223022, + "learning_rate": 3.451212633953751e-05, + "loss": 0.6861, + "step": 5493 + }, + { + "epoch": 3.098702763677383, + "grad_norm": 3.46647310256958, + "learning_rate": 3.4509306260575295e-05, + "loss": 0.9413, + "step": 5494 + }, + { + "epoch": 3.099266779469825, + "grad_norm": 0.9357880353927612, + "learning_rate": 3.450648618161309e-05, + "loss": 0.6045, + "step": 5495 + }, + { + "epoch": 3.0998307952622675, + "grad_norm": 1.6814861297607422, + "learning_rate": 3.450366610265088e-05, + "loss": 0.7933, + "step": 5496 + }, + { + "epoch": 3.1003948110547097, + "grad_norm": 2.5150249004364014, + "learning_rate": 3.4500846023688665e-05, + "loss": 0.7227, + "step": 5497 + }, + { + "epoch": 3.1009588268471515, + "grad_norm": 2.174682855606079, + "learning_rate": 3.449802594472645e-05, + "loss": 0.7954, + "step": 5498 + }, + { + "epoch": 3.1015228426395938, + "grad_norm": 2.35013747215271, + "learning_rate": 3.449520586576424e-05, + "loss": 0.8048, + "step": 5499 + }, + { + "epoch": 3.102086858432036, + "grad_norm": 1.514682412147522, + "learning_rate": 3.4492385786802035e-05, + "loss": 0.7445, + "step": 5500 + }, + { + "epoch": 3.1026508742244783, + "grad_norm": 1.829744577407837, + "learning_rate": 3.448956570783982e-05, + "loss": 0.8041, + "step": 5501 + }, + { + "epoch": 3.1032148900169205, + "grad_norm": 2.5218517780303955, + "learning_rate": 3.448674562887761e-05, + "loss": 0.8851, + "step": 5502 + }, + { + "epoch": 3.1037789058093628, + "grad_norm": 1.0489734411239624, + "learning_rate": 3.44839255499154e-05, + "loss": 0.617, + "step": 5503 + }, + { + "epoch": 3.104342921601805, + "grad_norm": 1.7773900032043457, + "learning_rate": 3.448110547095319e-05, + "loss": 0.7632, + "step": 5504 + }, + { + "epoch": 3.104906937394247, + "grad_norm": 0.9866594076156616, + "learning_rate": 3.4478285391990975e-05, + "loss": 0.6828, + "step": 5505 + }, + { + "epoch": 3.105470953186689, + "grad_norm": 1.486332893371582, + "learning_rate": 3.447546531302877e-05, + "loss": 0.7162, + "step": 5506 + }, + { + "epoch": 3.1060349689791313, + "grad_norm": 1.271120548248291, + "learning_rate": 3.447264523406655e-05, + "loss": 0.7348, + "step": 5507 + }, + { + "epoch": 3.1065989847715736, + "grad_norm": 1.4167964458465576, + "learning_rate": 3.4469825155104345e-05, + "loss": 0.6966, + "step": 5508 + }, + { + "epoch": 3.107163000564016, + "grad_norm": 1.7664259672164917, + "learning_rate": 3.446700507614214e-05, + "loss": 0.6999, + "step": 5509 + }, + { + "epoch": 3.107727016356458, + "grad_norm": 2.4207658767700195, + "learning_rate": 3.446418499717992e-05, + "loss": 0.9143, + "step": 5510 + }, + { + "epoch": 3.1082910321489003, + "grad_norm": 0.9905707836151123, + "learning_rate": 3.4461364918217715e-05, + "loss": 0.7046, + "step": 5511 + }, + { + "epoch": 3.108855047941342, + "grad_norm": 1.0929515361785889, + "learning_rate": 3.44585448392555e-05, + "loss": 0.6573, + "step": 5512 + }, + { + "epoch": 3.1094190637337844, + "grad_norm": 1.4676673412322998, + "learning_rate": 3.445572476029329e-05, + "loss": 0.7266, + "step": 5513 + }, + { + "epoch": 3.1099830795262267, + "grad_norm": 1.3876346349716187, + "learning_rate": 3.445290468133108e-05, + "loss": 0.7119, + "step": 5514 + }, + { + "epoch": 3.110547095318669, + "grad_norm": 1.5141794681549072, + "learning_rate": 3.445008460236887e-05, + "loss": 0.7815, + "step": 5515 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 2.2828452587127686, + "learning_rate": 3.4447264523406655e-05, + "loss": 0.6695, + "step": 5516 + }, + { + "epoch": 3.1116751269035534, + "grad_norm": 1.1431933641433716, + "learning_rate": 3.444444444444445e-05, + "loss": 0.7669, + "step": 5517 + }, + { + "epoch": 3.1122391426959957, + "grad_norm": 1.2520852088928223, + "learning_rate": 3.444162436548223e-05, + "loss": 0.7346, + "step": 5518 + }, + { + "epoch": 3.1128031584884375, + "grad_norm": 1.1400927305221558, + "learning_rate": 3.4438804286520025e-05, + "loss": 0.8102, + "step": 5519 + }, + { + "epoch": 3.1133671742808797, + "grad_norm": 1.3541618585586548, + "learning_rate": 3.443598420755782e-05, + "loss": 0.6475, + "step": 5520 + }, + { + "epoch": 3.113931190073322, + "grad_norm": 0.9246187210083008, + "learning_rate": 3.44331641285956e-05, + "loss": 0.5741, + "step": 5521 + }, + { + "epoch": 3.1144952058657642, + "grad_norm": 1.9152889251708984, + "learning_rate": 3.443034404963339e-05, + "loss": 0.779, + "step": 5522 + }, + { + "epoch": 3.1150592216582065, + "grad_norm": 1.0387749671936035, + "learning_rate": 3.442752397067118e-05, + "loss": 0.7845, + "step": 5523 + }, + { + "epoch": 3.1156232374506487, + "grad_norm": 1.7393782138824463, + "learning_rate": 3.442470389170897e-05, + "loss": 0.7554, + "step": 5524 + }, + { + "epoch": 3.116187253243091, + "grad_norm": 1.5197162628173828, + "learning_rate": 3.442188381274676e-05, + "loss": 0.862, + "step": 5525 + }, + { + "epoch": 3.116751269035533, + "grad_norm": 1.6715689897537231, + "learning_rate": 3.441906373378454e-05, + "loss": 0.7487, + "step": 5526 + }, + { + "epoch": 3.117315284827975, + "grad_norm": 1.3339506387710571, + "learning_rate": 3.441624365482234e-05, + "loss": 0.7742, + "step": 5527 + }, + { + "epoch": 3.1178793006204173, + "grad_norm": 1.6808977127075195, + "learning_rate": 3.441342357586013e-05, + "loss": 0.8145, + "step": 5528 + }, + { + "epoch": 3.1184433164128595, + "grad_norm": 2.018932819366455, + "learning_rate": 3.441060349689791e-05, + "loss": 0.8349, + "step": 5529 + }, + { + "epoch": 3.119007332205302, + "grad_norm": 1.8538877964019775, + "learning_rate": 3.44077834179357e-05, + "loss": 0.8384, + "step": 5530 + }, + { + "epoch": 3.119571347997744, + "grad_norm": 1.4242368936538696, + "learning_rate": 3.44049633389735e-05, + "loss": 0.7129, + "step": 5531 + }, + { + "epoch": 3.1201353637901863, + "grad_norm": 1.7947310209274292, + "learning_rate": 3.440214326001128e-05, + "loss": 0.8248, + "step": 5532 + }, + { + "epoch": 3.120699379582628, + "grad_norm": 1.6134436130523682, + "learning_rate": 3.439932318104907e-05, + "loss": 0.7557, + "step": 5533 + }, + { + "epoch": 3.1212633953750704, + "grad_norm": 1.3886737823486328, + "learning_rate": 3.439650310208686e-05, + "loss": 0.7576, + "step": 5534 + }, + { + "epoch": 3.1218274111675126, + "grad_norm": 1.148814082145691, + "learning_rate": 3.439368302312465e-05, + "loss": 0.729, + "step": 5535 + }, + { + "epoch": 3.122391426959955, + "grad_norm": 1.828250765800476, + "learning_rate": 3.439086294416244e-05, + "loss": 0.8677, + "step": 5536 + }, + { + "epoch": 3.122955442752397, + "grad_norm": 1.4915943145751953, + "learning_rate": 3.438804286520022e-05, + "loss": 0.8024, + "step": 5537 + }, + { + "epoch": 3.1235194585448394, + "grad_norm": 1.397336483001709, + "learning_rate": 3.4385222786238016e-05, + "loss": 0.8566, + "step": 5538 + }, + { + "epoch": 3.1240834743372816, + "grad_norm": 1.102821707725525, + "learning_rate": 3.438240270727581e-05, + "loss": 0.7071, + "step": 5539 + }, + { + "epoch": 3.1246474901297234, + "grad_norm": 1.0162334442138672, + "learning_rate": 3.437958262831359e-05, + "loss": 0.7159, + "step": 5540 + }, + { + "epoch": 3.1252115059221657, + "grad_norm": 1.2828441858291626, + "learning_rate": 3.4376762549351385e-05, + "loss": 0.7672, + "step": 5541 + }, + { + "epoch": 3.125775521714608, + "grad_norm": 0.8548522591590881, + "learning_rate": 3.437394247038917e-05, + "loss": 0.6104, + "step": 5542 + }, + { + "epoch": 3.12633953750705, + "grad_norm": 1.0815380811691284, + "learning_rate": 3.437112239142696e-05, + "loss": 0.8151, + "step": 5543 + }, + { + "epoch": 3.1269035532994924, + "grad_norm": 1.1011905670166016, + "learning_rate": 3.436830231246475e-05, + "loss": 0.7436, + "step": 5544 + }, + { + "epoch": 3.1274675690919347, + "grad_norm": 2.1954495906829834, + "learning_rate": 3.436548223350254e-05, + "loss": 0.9102, + "step": 5545 + }, + { + "epoch": 3.128031584884377, + "grad_norm": 1.0310065746307373, + "learning_rate": 3.4362662154540326e-05, + "loss": 0.5576, + "step": 5546 + }, + { + "epoch": 3.1285956006768187, + "grad_norm": 4.8194966316223145, + "learning_rate": 3.435984207557812e-05, + "loss": 0.8397, + "step": 5547 + }, + { + "epoch": 3.129159616469261, + "grad_norm": 1.204056739807129, + "learning_rate": 3.4357021996615904e-05, + "loss": 0.8134, + "step": 5548 + }, + { + "epoch": 3.1297236322617032, + "grad_norm": 1.0861318111419678, + "learning_rate": 3.4354201917653696e-05, + "loss": 0.6734, + "step": 5549 + }, + { + "epoch": 3.1302876480541455, + "grad_norm": 1.2707151174545288, + "learning_rate": 3.435138183869149e-05, + "loss": 0.7344, + "step": 5550 + }, + { + "epoch": 3.1308516638465878, + "grad_norm": 1.925061821937561, + "learning_rate": 3.434856175972927e-05, + "loss": 0.8187, + "step": 5551 + }, + { + "epoch": 3.13141567963903, + "grad_norm": 3.20914363861084, + "learning_rate": 3.4345741680767065e-05, + "loss": 0.7113, + "step": 5552 + }, + { + "epoch": 3.1319796954314723, + "grad_norm": 1.1004451513290405, + "learning_rate": 3.434292160180485e-05, + "loss": 0.7178, + "step": 5553 + }, + { + "epoch": 3.132543711223914, + "grad_norm": 1.249193549156189, + "learning_rate": 3.434010152284264e-05, + "loss": 0.7226, + "step": 5554 + }, + { + "epoch": 3.1331077270163563, + "grad_norm": 1.188631296157837, + "learning_rate": 3.433728144388043e-05, + "loss": 0.7223, + "step": 5555 + }, + { + "epoch": 3.1336717428087986, + "grad_norm": 1.0297514200210571, + "learning_rate": 3.433446136491822e-05, + "loss": 0.7031, + "step": 5556 + }, + { + "epoch": 3.134235758601241, + "grad_norm": 1.4170323610305786, + "learning_rate": 3.4331641285956006e-05, + "loss": 0.7878, + "step": 5557 + }, + { + "epoch": 3.134799774393683, + "grad_norm": 1.5883219242095947, + "learning_rate": 3.43288212069938e-05, + "loss": 0.7053, + "step": 5558 + }, + { + "epoch": 3.1353637901861253, + "grad_norm": 1.2681516408920288, + "learning_rate": 3.432600112803159e-05, + "loss": 0.8508, + "step": 5559 + }, + { + "epoch": 3.1359278059785676, + "grad_norm": 1.8171086311340332, + "learning_rate": 3.4323181049069376e-05, + "loss": 0.8829, + "step": 5560 + }, + { + "epoch": 3.1364918217710094, + "grad_norm": 1.2522064447402954, + "learning_rate": 3.432036097010716e-05, + "loss": 0.7677, + "step": 5561 + }, + { + "epoch": 3.1370558375634516, + "grad_norm": 1.385923981666565, + "learning_rate": 3.4317540891144953e-05, + "loss": 0.8156, + "step": 5562 + }, + { + "epoch": 3.137619853355894, + "grad_norm": 1.3869054317474365, + "learning_rate": 3.4314720812182746e-05, + "loss": 0.754, + "step": 5563 + }, + { + "epoch": 3.138183869148336, + "grad_norm": 1.2684073448181152, + "learning_rate": 3.431190073322053e-05, + "loss": 0.7589, + "step": 5564 + }, + { + "epoch": 3.1387478849407784, + "grad_norm": 1.5908031463623047, + "learning_rate": 3.4309080654258316e-05, + "loss": 0.7429, + "step": 5565 + }, + { + "epoch": 3.1393119007332206, + "grad_norm": 1.198461651802063, + "learning_rate": 3.430626057529611e-05, + "loss": 0.8152, + "step": 5566 + }, + { + "epoch": 3.139875916525663, + "grad_norm": 1.1149871349334717, + "learning_rate": 3.43034404963339e-05, + "loss": 0.8025, + "step": 5567 + }, + { + "epoch": 3.1404399323181047, + "grad_norm": 1.450128436088562, + "learning_rate": 3.4300620417371686e-05, + "loss": 0.7497, + "step": 5568 + }, + { + "epoch": 3.141003948110547, + "grad_norm": 3.483546733856201, + "learning_rate": 3.429780033840947e-05, + "loss": 0.8409, + "step": 5569 + }, + { + "epoch": 3.141567963902989, + "grad_norm": 1.1264480352401733, + "learning_rate": 3.429498025944727e-05, + "loss": 0.7291, + "step": 5570 + }, + { + "epoch": 3.1421319796954315, + "grad_norm": 1.435597538948059, + "learning_rate": 3.4292160180485056e-05, + "loss": 0.6169, + "step": 5571 + }, + { + "epoch": 3.1426959954878737, + "grad_norm": 1.2847024202346802, + "learning_rate": 3.428934010152284e-05, + "loss": 0.7818, + "step": 5572 + }, + { + "epoch": 3.143260011280316, + "grad_norm": 1.6528042554855347, + "learning_rate": 3.4286520022560634e-05, + "loss": 0.7926, + "step": 5573 + }, + { + "epoch": 3.143824027072758, + "grad_norm": 1.9498118162155151, + "learning_rate": 3.4283699943598426e-05, + "loss": 0.6957, + "step": 5574 + }, + { + "epoch": 3.1443880428652005, + "grad_norm": 1.839006781578064, + "learning_rate": 3.428087986463621e-05, + "loss": 0.7417, + "step": 5575 + }, + { + "epoch": 3.1449520586576423, + "grad_norm": 1.352607250213623, + "learning_rate": 3.4278059785673997e-05, + "loss": 0.7299, + "step": 5576 + }, + { + "epoch": 3.1455160744500845, + "grad_norm": 1.9853910207748413, + "learning_rate": 3.427523970671179e-05, + "loss": 0.8315, + "step": 5577 + }, + { + "epoch": 3.1460800902425268, + "grad_norm": 1.3243541717529297, + "learning_rate": 3.427241962774958e-05, + "loss": 0.7586, + "step": 5578 + }, + { + "epoch": 3.146644106034969, + "grad_norm": 0.9695037007331848, + "learning_rate": 3.4269599548787366e-05, + "loss": 0.7263, + "step": 5579 + }, + { + "epoch": 3.1472081218274113, + "grad_norm": 0.9359468817710876, + "learning_rate": 3.426677946982516e-05, + "loss": 0.6626, + "step": 5580 + }, + { + "epoch": 3.1477721376198535, + "grad_norm": 0.8443858027458191, + "learning_rate": 3.4263959390862944e-05, + "loss": 0.6369, + "step": 5581 + }, + { + "epoch": 3.148336153412296, + "grad_norm": 1.4690093994140625, + "learning_rate": 3.4261139311900736e-05, + "loss": 0.7598, + "step": 5582 + }, + { + "epoch": 3.1489001692047376, + "grad_norm": 1.24175226688385, + "learning_rate": 3.425831923293852e-05, + "loss": 0.7469, + "step": 5583 + }, + { + "epoch": 3.14946418499718, + "grad_norm": 0.9579851627349854, + "learning_rate": 3.4255499153976314e-05, + "loss": 0.6561, + "step": 5584 + }, + { + "epoch": 3.150028200789622, + "grad_norm": 0.918980598449707, + "learning_rate": 3.4252679075014106e-05, + "loss": 0.695, + "step": 5585 + }, + { + "epoch": 3.1505922165820643, + "grad_norm": 1.8143937587738037, + "learning_rate": 3.424985899605189e-05, + "loss": 0.8832, + "step": 5586 + }, + { + "epoch": 3.1511562323745066, + "grad_norm": 2.238259792327881, + "learning_rate": 3.424703891708968e-05, + "loss": 0.8728, + "step": 5587 + }, + { + "epoch": 3.151720248166949, + "grad_norm": 1.167911410331726, + "learning_rate": 3.424421883812747e-05, + "loss": 0.7242, + "step": 5588 + }, + { + "epoch": 3.152284263959391, + "grad_norm": 1.8631342649459839, + "learning_rate": 3.424139875916526e-05, + "loss": 0.6493, + "step": 5589 + }, + { + "epoch": 3.152848279751833, + "grad_norm": 1.8140634298324585, + "learning_rate": 3.4238578680203047e-05, + "loss": 0.7312, + "step": 5590 + }, + { + "epoch": 3.153412295544275, + "grad_norm": 1.1218843460083008, + "learning_rate": 3.423575860124084e-05, + "loss": 0.7032, + "step": 5591 + }, + { + "epoch": 3.1539763113367174, + "grad_norm": 1.2597410678863525, + "learning_rate": 3.4232938522278624e-05, + "loss": 0.7491, + "step": 5592 + }, + { + "epoch": 3.1545403271291597, + "grad_norm": 1.2672176361083984, + "learning_rate": 3.4230118443316416e-05, + "loss": 0.8389, + "step": 5593 + }, + { + "epoch": 3.155104342921602, + "grad_norm": 1.734974980354309, + "learning_rate": 3.42272983643542e-05, + "loss": 0.8612, + "step": 5594 + }, + { + "epoch": 3.155668358714044, + "grad_norm": 3.421815872192383, + "learning_rate": 3.4224478285391994e-05, + "loss": 0.9254, + "step": 5595 + }, + { + "epoch": 3.1562323745064864, + "grad_norm": 1.4958150386810303, + "learning_rate": 3.422165820642978e-05, + "loss": 0.8055, + "step": 5596 + }, + { + "epoch": 3.1567963902989282, + "grad_norm": 1.1469459533691406, + "learning_rate": 3.421883812746757e-05, + "loss": 0.7965, + "step": 5597 + }, + { + "epoch": 3.1573604060913705, + "grad_norm": 1.0922582149505615, + "learning_rate": 3.4216018048505364e-05, + "loss": 0.7123, + "step": 5598 + }, + { + "epoch": 3.1579244218838127, + "grad_norm": 1.0737634897232056, + "learning_rate": 3.421319796954315e-05, + "loss": 0.7388, + "step": 5599 + }, + { + "epoch": 3.158488437676255, + "grad_norm": 1.1568152904510498, + "learning_rate": 3.4210377890580935e-05, + "loss": 0.7582, + "step": 5600 + }, + { + "epoch": 3.1590524534686972, + "grad_norm": 1.0088382959365845, + "learning_rate": 3.420755781161873e-05, + "loss": 0.6811, + "step": 5601 + }, + { + "epoch": 3.1596164692611395, + "grad_norm": 1.7050532102584839, + "learning_rate": 3.420473773265652e-05, + "loss": 0.779, + "step": 5602 + }, + { + "epoch": 3.1601804850535817, + "grad_norm": 1.6146385669708252, + "learning_rate": 3.4201917653694304e-05, + "loss": 0.857, + "step": 5603 + }, + { + "epoch": 3.1607445008460235, + "grad_norm": 1.1906417608261108, + "learning_rate": 3.419909757473209e-05, + "loss": 0.7716, + "step": 5604 + }, + { + "epoch": 3.161308516638466, + "grad_norm": 1.290120005607605, + "learning_rate": 3.419627749576988e-05, + "loss": 0.7815, + "step": 5605 + }, + { + "epoch": 3.161872532430908, + "grad_norm": 1.440936803817749, + "learning_rate": 3.4193457416807674e-05, + "loss": 0.8481, + "step": 5606 + }, + { + "epoch": 3.1624365482233503, + "grad_norm": 1.499876618385315, + "learning_rate": 3.419063733784546e-05, + "loss": 0.7548, + "step": 5607 + }, + { + "epoch": 3.1630005640157925, + "grad_norm": 1.2327203750610352, + "learning_rate": 3.4187817258883245e-05, + "loss": 0.786, + "step": 5608 + }, + { + "epoch": 3.163564579808235, + "grad_norm": 1.0861589908599854, + "learning_rate": 3.4184997179921044e-05, + "loss": 0.7093, + "step": 5609 + }, + { + "epoch": 3.164128595600677, + "grad_norm": 1.6900557279586792, + "learning_rate": 3.418217710095883e-05, + "loss": 0.808, + "step": 5610 + }, + { + "epoch": 3.164692611393119, + "grad_norm": 1.1761358976364136, + "learning_rate": 3.4179357021996615e-05, + "loss": 0.7471, + "step": 5611 + }, + { + "epoch": 3.165256627185561, + "grad_norm": 1.2950842380523682, + "learning_rate": 3.417653694303441e-05, + "loss": 0.7705, + "step": 5612 + }, + { + "epoch": 3.1658206429780034, + "grad_norm": 1.434152603149414, + "learning_rate": 3.41737168640722e-05, + "loss": 0.8118, + "step": 5613 + }, + { + "epoch": 3.1663846587704456, + "grad_norm": 1.957171082496643, + "learning_rate": 3.4170896785109984e-05, + "loss": 0.8646, + "step": 5614 + }, + { + "epoch": 3.166948674562888, + "grad_norm": 0.989643394947052, + "learning_rate": 3.416807670614777e-05, + "loss": 0.7034, + "step": 5615 + }, + { + "epoch": 3.16751269035533, + "grad_norm": 1.1173131465911865, + "learning_rate": 3.416525662718556e-05, + "loss": 0.7122, + "step": 5616 + }, + { + "epoch": 3.1680767061477724, + "grad_norm": 1.360968828201294, + "learning_rate": 3.4162436548223354e-05, + "loss": 0.6624, + "step": 5617 + }, + { + "epoch": 3.168640721940214, + "grad_norm": 1.3753869533538818, + "learning_rate": 3.415961646926114e-05, + "loss": 0.6542, + "step": 5618 + }, + { + "epoch": 3.1692047377326564, + "grad_norm": 1.305587649345398, + "learning_rate": 3.415679639029893e-05, + "loss": 0.7327, + "step": 5619 + }, + { + "epoch": 3.1697687535250987, + "grad_norm": 1.3101575374603271, + "learning_rate": 3.4153976311336724e-05, + "loss": 0.6849, + "step": 5620 + }, + { + "epoch": 3.170332769317541, + "grad_norm": 2.3850462436676025, + "learning_rate": 3.415115623237451e-05, + "loss": 0.8394, + "step": 5621 + }, + { + "epoch": 3.170896785109983, + "grad_norm": 1.1841928958892822, + "learning_rate": 3.4148336153412295e-05, + "loss": 0.7524, + "step": 5622 + }, + { + "epoch": 3.1714608009024254, + "grad_norm": 1.5501891374588013, + "learning_rate": 3.414551607445009e-05, + "loss": 0.8598, + "step": 5623 + }, + { + "epoch": 3.1720248166948677, + "grad_norm": 1.1166491508483887, + "learning_rate": 3.414269599548788e-05, + "loss": 0.7, + "step": 5624 + }, + { + "epoch": 3.1725888324873095, + "grad_norm": 1.6202937364578247, + "learning_rate": 3.4139875916525665e-05, + "loss": 0.7787, + "step": 5625 + }, + { + "epoch": 3.1731528482797517, + "grad_norm": 1.0915544033050537, + "learning_rate": 3.413705583756345e-05, + "loss": 0.7136, + "step": 5626 + }, + { + "epoch": 3.173716864072194, + "grad_norm": 2.804715394973755, + "learning_rate": 3.413423575860124e-05, + "loss": 0.7815, + "step": 5627 + }, + { + "epoch": 3.1742808798646363, + "grad_norm": 1.8239482641220093, + "learning_rate": 3.4131415679639034e-05, + "loss": 0.8991, + "step": 5628 + }, + { + "epoch": 3.1748448956570785, + "grad_norm": 2.0178380012512207, + "learning_rate": 3.412859560067682e-05, + "loss": 0.7238, + "step": 5629 + }, + { + "epoch": 3.1754089114495208, + "grad_norm": 1.3251259326934814, + "learning_rate": 3.412577552171461e-05, + "loss": 0.7654, + "step": 5630 + }, + { + "epoch": 3.175972927241963, + "grad_norm": 1.0511949062347412, + "learning_rate": 3.41229554427524e-05, + "loss": 0.7267, + "step": 5631 + }, + { + "epoch": 3.176536943034405, + "grad_norm": 1.2981547117233276, + "learning_rate": 3.412013536379019e-05, + "loss": 0.8671, + "step": 5632 + }, + { + "epoch": 3.177100958826847, + "grad_norm": 1.0007489919662476, + "learning_rate": 3.4117315284827975e-05, + "loss": 0.671, + "step": 5633 + }, + { + "epoch": 3.1776649746192893, + "grad_norm": 2.577162027359009, + "learning_rate": 3.411449520586577e-05, + "loss": 0.7475, + "step": 5634 + }, + { + "epoch": 3.1782289904117316, + "grad_norm": 1.7339115142822266, + "learning_rate": 3.411167512690355e-05, + "loss": 0.7861, + "step": 5635 + }, + { + "epoch": 3.178793006204174, + "grad_norm": 1.5084922313690186, + "learning_rate": 3.4108855047941345e-05, + "loss": 0.7704, + "step": 5636 + }, + { + "epoch": 3.179357021996616, + "grad_norm": 1.7074618339538574, + "learning_rate": 3.410603496897914e-05, + "loss": 0.839, + "step": 5637 + }, + { + "epoch": 3.1799210377890583, + "grad_norm": 1.5640361309051514, + "learning_rate": 3.410321489001692e-05, + "loss": 0.8186, + "step": 5638 + }, + { + "epoch": 3.1804850535815, + "grad_norm": 1.4856891632080078, + "learning_rate": 3.410039481105471e-05, + "loss": 0.8137, + "step": 5639 + }, + { + "epoch": 3.1810490693739424, + "grad_norm": 1.157490611076355, + "learning_rate": 3.40975747320925e-05, + "loss": 0.6995, + "step": 5640 + }, + { + "epoch": 3.1816130851663846, + "grad_norm": 1.2235437631607056, + "learning_rate": 3.409475465313029e-05, + "loss": 0.6548, + "step": 5641 + }, + { + "epoch": 3.182177100958827, + "grad_norm": 1.7161883115768433, + "learning_rate": 3.409193457416808e-05, + "loss": 0.9158, + "step": 5642 + }, + { + "epoch": 3.182741116751269, + "grad_norm": 1.344417929649353, + "learning_rate": 3.408911449520586e-05, + "loss": 0.6237, + "step": 5643 + }, + { + "epoch": 3.1833051325437114, + "grad_norm": 1.3342164754867554, + "learning_rate": 3.4086294416243655e-05, + "loss": 0.785, + "step": 5644 + }, + { + "epoch": 3.1838691483361536, + "grad_norm": 1.1472805738449097, + "learning_rate": 3.408347433728145e-05, + "loss": 0.8136, + "step": 5645 + }, + { + "epoch": 3.1844331641285955, + "grad_norm": 1.1364067792892456, + "learning_rate": 3.408065425831923e-05, + "loss": 0.7257, + "step": 5646 + }, + { + "epoch": 3.1849971799210377, + "grad_norm": 1.131205677986145, + "learning_rate": 3.407783417935702e-05, + "loss": 0.7618, + "step": 5647 + }, + { + "epoch": 3.18556119571348, + "grad_norm": 1.6237164735794067, + "learning_rate": 3.407501410039482e-05, + "loss": 0.7899, + "step": 5648 + }, + { + "epoch": 3.186125211505922, + "grad_norm": 1.109871506690979, + "learning_rate": 3.40721940214326e-05, + "loss": 0.8203, + "step": 5649 + }, + { + "epoch": 3.1866892272983645, + "grad_norm": 0.9398954510688782, + "learning_rate": 3.406937394247039e-05, + "loss": 0.7066, + "step": 5650 + }, + { + "epoch": 3.1872532430908067, + "grad_norm": 1.4610475301742554, + "learning_rate": 3.406655386350818e-05, + "loss": 0.8032, + "step": 5651 + }, + { + "epoch": 3.187817258883249, + "grad_norm": 0.8366550803184509, + "learning_rate": 3.406373378454597e-05, + "loss": 0.6299, + "step": 5652 + }, + { + "epoch": 3.1883812746756908, + "grad_norm": 1.0250072479248047, + "learning_rate": 3.406091370558376e-05, + "loss": 0.7628, + "step": 5653 + }, + { + "epoch": 3.188945290468133, + "grad_norm": 1.024994134902954, + "learning_rate": 3.405809362662154e-05, + "loss": 0.7241, + "step": 5654 + }, + { + "epoch": 3.1895093062605753, + "grad_norm": 1.7308217287063599, + "learning_rate": 3.405527354765934e-05, + "loss": 0.7528, + "step": 5655 + }, + { + "epoch": 3.1900733220530175, + "grad_norm": 1.53082275390625, + "learning_rate": 3.405245346869713e-05, + "loss": 0.8385, + "step": 5656 + }, + { + "epoch": 3.1906373378454598, + "grad_norm": 1.2232837677001953, + "learning_rate": 3.404963338973491e-05, + "loss": 0.7936, + "step": 5657 + }, + { + "epoch": 3.191201353637902, + "grad_norm": 1.52603018283844, + "learning_rate": 3.40468133107727e-05, + "loss": 0.7392, + "step": 5658 + }, + { + "epoch": 3.1917653694303443, + "grad_norm": 1.0690438747406006, + "learning_rate": 3.40439932318105e-05, + "loss": 0.7292, + "step": 5659 + }, + { + "epoch": 3.192329385222786, + "grad_norm": 1.5517722368240356, + "learning_rate": 3.404117315284828e-05, + "loss": 0.8076, + "step": 5660 + }, + { + "epoch": 3.1928934010152283, + "grad_norm": 1.2878435850143433, + "learning_rate": 3.403835307388607e-05, + "loss": 0.7906, + "step": 5661 + }, + { + "epoch": 3.1934574168076706, + "grad_norm": 1.6271944046020508, + "learning_rate": 3.403553299492386e-05, + "loss": 0.7724, + "step": 5662 + }, + { + "epoch": 3.194021432600113, + "grad_norm": 1.2554994821548462, + "learning_rate": 3.403271291596165e-05, + "loss": 0.738, + "step": 5663 + }, + { + "epoch": 3.194585448392555, + "grad_norm": 1.859877586364746, + "learning_rate": 3.402989283699944e-05, + "loss": 0.8011, + "step": 5664 + }, + { + "epoch": 3.1951494641849973, + "grad_norm": 2.0899100303649902, + "learning_rate": 3.402707275803722e-05, + "loss": 0.8857, + "step": 5665 + }, + { + "epoch": 3.1957134799774396, + "grad_norm": 0.8777434229850769, + "learning_rate": 3.4024252679075015e-05, + "loss": 0.7094, + "step": 5666 + }, + { + "epoch": 3.1962774957698814, + "grad_norm": 3.4518513679504395, + "learning_rate": 3.402143260011281e-05, + "loss": 0.7885, + "step": 5667 + }, + { + "epoch": 3.1968415115623237, + "grad_norm": 1.0238677263259888, + "learning_rate": 3.401861252115059e-05, + "loss": 0.785, + "step": 5668 + }, + { + "epoch": 3.197405527354766, + "grad_norm": 1.0638993978500366, + "learning_rate": 3.4015792442188385e-05, + "loss": 0.7212, + "step": 5669 + }, + { + "epoch": 3.197969543147208, + "grad_norm": 1.0981897115707397, + "learning_rate": 3.401297236322617e-05, + "loss": 0.8933, + "step": 5670 + }, + { + "epoch": 3.1985335589396504, + "grad_norm": 1.1647636890411377, + "learning_rate": 3.401015228426396e-05, + "loss": 0.718, + "step": 5671 + }, + { + "epoch": 3.1990975747320927, + "grad_norm": 1.4898418188095093, + "learning_rate": 3.400733220530175e-05, + "loss": 0.7329, + "step": 5672 + }, + { + "epoch": 3.199661590524535, + "grad_norm": 1.0277327299118042, + "learning_rate": 3.400451212633954e-05, + "loss": 0.7995, + "step": 5673 + }, + { + "epoch": 3.2002256063169767, + "grad_norm": 1.7163903713226318, + "learning_rate": 3.4001692047377326e-05, + "loss": 0.6733, + "step": 5674 + }, + { + "epoch": 3.200789622109419, + "grad_norm": 1.1669375896453857, + "learning_rate": 3.399887196841512e-05, + "loss": 0.8111, + "step": 5675 + }, + { + "epoch": 3.2013536379018612, + "grad_norm": 0.790995717048645, + "learning_rate": 3.39960518894529e-05, + "loss": 0.542, + "step": 5676 + }, + { + "epoch": 3.2019176536943035, + "grad_norm": 1.5982271432876587, + "learning_rate": 3.3993231810490696e-05, + "loss": 0.8143, + "step": 5677 + }, + { + "epoch": 3.2024816694867457, + "grad_norm": 1.7544358968734741, + "learning_rate": 3.399041173152848e-05, + "loss": 0.7567, + "step": 5678 + }, + { + "epoch": 3.203045685279188, + "grad_norm": 1.8187273740768433, + "learning_rate": 3.398759165256627e-05, + "loss": 0.7883, + "step": 5679 + }, + { + "epoch": 3.2036097010716302, + "grad_norm": 1.6455137729644775, + "learning_rate": 3.3984771573604065e-05, + "loss": 0.7372, + "step": 5680 + }, + { + "epoch": 3.204173716864072, + "grad_norm": 1.5381474494934082, + "learning_rate": 3.398195149464185e-05, + "loss": 0.7588, + "step": 5681 + }, + { + "epoch": 3.2047377326565143, + "grad_norm": 1.2183997631072998, + "learning_rate": 3.3979131415679636e-05, + "loss": 0.7617, + "step": 5682 + }, + { + "epoch": 3.2053017484489565, + "grad_norm": 1.7221564054489136, + "learning_rate": 3.397631133671743e-05, + "loss": 0.781, + "step": 5683 + }, + { + "epoch": 3.205865764241399, + "grad_norm": 1.7878056764602661, + "learning_rate": 3.397349125775522e-05, + "loss": 0.7899, + "step": 5684 + }, + { + "epoch": 3.206429780033841, + "grad_norm": 1.1434401273727417, + "learning_rate": 3.3970671178793006e-05, + "loss": 0.7696, + "step": 5685 + }, + { + "epoch": 3.2069937958262833, + "grad_norm": 1.7320303916931152, + "learning_rate": 3.396785109983079e-05, + "loss": 0.917, + "step": 5686 + }, + { + "epoch": 3.2075578116187256, + "grad_norm": 1.498845100402832, + "learning_rate": 3.396503102086859e-05, + "loss": 0.8004, + "step": 5687 + }, + { + "epoch": 3.2081218274111674, + "grad_norm": 1.0801345109939575, + "learning_rate": 3.3962210941906376e-05, + "loss": 0.7496, + "step": 5688 + }, + { + "epoch": 3.2086858432036096, + "grad_norm": 1.6392048597335815, + "learning_rate": 3.395939086294416e-05, + "loss": 0.7512, + "step": 5689 + }, + { + "epoch": 3.209249858996052, + "grad_norm": 1.2284260988235474, + "learning_rate": 3.395657078398195e-05, + "loss": 0.8376, + "step": 5690 + }, + { + "epoch": 3.209813874788494, + "grad_norm": 1.5296926498413086, + "learning_rate": 3.3953750705019745e-05, + "loss": 0.7463, + "step": 5691 + }, + { + "epoch": 3.2103778905809364, + "grad_norm": 1.4649039506912231, + "learning_rate": 3.395093062605753e-05, + "loss": 0.7724, + "step": 5692 + }, + { + "epoch": 3.2109419063733786, + "grad_norm": 1.1592333316802979, + "learning_rate": 3.3948110547095316e-05, + "loss": 0.7023, + "step": 5693 + }, + { + "epoch": 3.211505922165821, + "grad_norm": 1.0462610721588135, + "learning_rate": 3.394529046813311e-05, + "loss": 0.671, + "step": 5694 + }, + { + "epoch": 3.2120699379582627, + "grad_norm": 2.4096925258636475, + "learning_rate": 3.39424703891709e-05, + "loss": 0.9093, + "step": 5695 + }, + { + "epoch": 3.212633953750705, + "grad_norm": 3.169431209564209, + "learning_rate": 3.3939650310208686e-05, + "loss": 0.7445, + "step": 5696 + }, + { + "epoch": 3.213197969543147, + "grad_norm": 1.028133749961853, + "learning_rate": 3.393683023124647e-05, + "loss": 0.7506, + "step": 5697 + }, + { + "epoch": 3.2137619853355894, + "grad_norm": 2.3914194107055664, + "learning_rate": 3.393401015228427e-05, + "loss": 0.7563, + "step": 5698 + }, + { + "epoch": 3.2143260011280317, + "grad_norm": 1.2578554153442383, + "learning_rate": 3.3931190073322056e-05, + "loss": 0.6637, + "step": 5699 + }, + { + "epoch": 3.214890016920474, + "grad_norm": 1.0975186824798584, + "learning_rate": 3.392836999435984e-05, + "loss": 0.7698, + "step": 5700 + }, + { + "epoch": 3.215454032712916, + "grad_norm": 1.1230334043502808, + "learning_rate": 3.3925549915397633e-05, + "loss": 0.8076, + "step": 5701 + }, + { + "epoch": 3.216018048505358, + "grad_norm": 1.6829332113265991, + "learning_rate": 3.3922729836435426e-05, + "loss": 0.866, + "step": 5702 + }, + { + "epoch": 3.2165820642978002, + "grad_norm": 2.8697779178619385, + "learning_rate": 3.391990975747321e-05, + "loss": 0.795, + "step": 5703 + }, + { + "epoch": 3.2171460800902425, + "grad_norm": 1.63060462474823, + "learning_rate": 3.3917089678510996e-05, + "loss": 0.8589, + "step": 5704 + }, + { + "epoch": 3.2177100958826848, + "grad_norm": 1.3638191223144531, + "learning_rate": 3.391426959954879e-05, + "loss": 0.7936, + "step": 5705 + }, + { + "epoch": 3.218274111675127, + "grad_norm": 1.3098841905593872, + "learning_rate": 3.391144952058658e-05, + "loss": 0.827, + "step": 5706 + }, + { + "epoch": 3.2188381274675693, + "grad_norm": 2.841935396194458, + "learning_rate": 3.3908629441624366e-05, + "loss": 0.7426, + "step": 5707 + }, + { + "epoch": 3.2194021432600115, + "grad_norm": 1.4618176221847534, + "learning_rate": 3.390580936266216e-05, + "loss": 0.7778, + "step": 5708 + }, + { + "epoch": 3.2199661590524533, + "grad_norm": 2.318459987640381, + "learning_rate": 3.3902989283699944e-05, + "loss": 0.7905, + "step": 5709 + }, + { + "epoch": 3.2205301748448956, + "grad_norm": 1.153128743171692, + "learning_rate": 3.3900169204737736e-05, + "loss": 0.7815, + "step": 5710 + }, + { + "epoch": 3.221094190637338, + "grad_norm": 1.6824082136154175, + "learning_rate": 3.389734912577552e-05, + "loss": 0.7182, + "step": 5711 + }, + { + "epoch": 3.22165820642978, + "grad_norm": 1.3732502460479736, + "learning_rate": 3.3894529046813314e-05, + "loss": 0.818, + "step": 5712 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 2.512193441390991, + "learning_rate": 3.38917089678511e-05, + "loss": 0.8899, + "step": 5713 + }, + { + "epoch": 3.2227862380146646, + "grad_norm": 1.5276325941085815, + "learning_rate": 3.388888888888889e-05, + "loss": 0.7718, + "step": 5714 + }, + { + "epoch": 3.223350253807107, + "grad_norm": 1.2077511548995972, + "learning_rate": 3.3886068809926677e-05, + "loss": 0.6895, + "step": 5715 + }, + { + "epoch": 3.2239142695995486, + "grad_norm": 1.9146628379821777, + "learning_rate": 3.388324873096447e-05, + "loss": 0.7747, + "step": 5716 + }, + { + "epoch": 3.224478285391991, + "grad_norm": 1.1529040336608887, + "learning_rate": 3.3880428652002254e-05, + "loss": 0.8395, + "step": 5717 + }, + { + "epoch": 3.225042301184433, + "grad_norm": 1.7593204975128174, + "learning_rate": 3.3877608573040046e-05, + "loss": 0.8889, + "step": 5718 + }, + { + "epoch": 3.2256063169768754, + "grad_norm": 0.9149988293647766, + "learning_rate": 3.387478849407784e-05, + "loss": 0.5888, + "step": 5719 + }, + { + "epoch": 3.2261703327693176, + "grad_norm": 1.2414805889129639, + "learning_rate": 3.3871968415115624e-05, + "loss": 0.7488, + "step": 5720 + }, + { + "epoch": 3.22673434856176, + "grad_norm": 1.2646700143814087, + "learning_rate": 3.386914833615341e-05, + "loss": 0.6725, + "step": 5721 + }, + { + "epoch": 3.227298364354202, + "grad_norm": 1.2129197120666504, + "learning_rate": 3.38663282571912e-05, + "loss": 0.8386, + "step": 5722 + }, + { + "epoch": 3.227862380146644, + "grad_norm": 2.1745445728302, + "learning_rate": 3.3863508178228994e-05, + "loss": 0.9329, + "step": 5723 + }, + { + "epoch": 3.228426395939086, + "grad_norm": 1.338452696800232, + "learning_rate": 3.386068809926678e-05, + "loss": 0.8365, + "step": 5724 + }, + { + "epoch": 3.2289904117315285, + "grad_norm": 1.8903522491455078, + "learning_rate": 3.385786802030457e-05, + "loss": 0.9711, + "step": 5725 + }, + { + "epoch": 3.2295544275239707, + "grad_norm": 1.3934893608093262, + "learning_rate": 3.3855047941342364e-05, + "loss": 0.7046, + "step": 5726 + }, + { + "epoch": 3.230118443316413, + "grad_norm": 0.9544268846511841, + "learning_rate": 3.385222786238015e-05, + "loss": 0.7343, + "step": 5727 + }, + { + "epoch": 3.230682459108855, + "grad_norm": 1.1361356973648071, + "learning_rate": 3.3849407783417934e-05, + "loss": 0.6539, + "step": 5728 + }, + { + "epoch": 3.2312464749012975, + "grad_norm": 1.3583831787109375, + "learning_rate": 3.3846587704455727e-05, + "loss": 0.7971, + "step": 5729 + }, + { + "epoch": 3.2318104906937393, + "grad_norm": 1.6317023038864136, + "learning_rate": 3.384376762549352e-05, + "loss": 0.8195, + "step": 5730 + }, + { + "epoch": 3.2323745064861815, + "grad_norm": 4.555150508880615, + "learning_rate": 3.3840947546531304e-05, + "loss": 0.7968, + "step": 5731 + }, + { + "epoch": 3.2329385222786238, + "grad_norm": 1.9233667850494385, + "learning_rate": 3.383812746756909e-05, + "loss": 0.8183, + "step": 5732 + }, + { + "epoch": 3.233502538071066, + "grad_norm": 1.7793076038360596, + "learning_rate": 3.383530738860688e-05, + "loss": 0.8825, + "step": 5733 + }, + { + "epoch": 3.2340665538635083, + "grad_norm": 1.686890721321106, + "learning_rate": 3.3832487309644674e-05, + "loss": 0.8826, + "step": 5734 + }, + { + "epoch": 3.2346305696559505, + "grad_norm": 1.0569735765457153, + "learning_rate": 3.382966723068246e-05, + "loss": 0.7619, + "step": 5735 + }, + { + "epoch": 3.235194585448393, + "grad_norm": 1.5664807558059692, + "learning_rate": 3.3826847151720245e-05, + "loss": 0.9416, + "step": 5736 + }, + { + "epoch": 3.2357586012408346, + "grad_norm": 2.422377347946167, + "learning_rate": 3.3824027072758044e-05, + "loss": 0.9501, + "step": 5737 + }, + { + "epoch": 3.236322617033277, + "grad_norm": 1.168586254119873, + "learning_rate": 3.382120699379583e-05, + "loss": 0.7622, + "step": 5738 + }, + { + "epoch": 3.236886632825719, + "grad_norm": 1.4091815948486328, + "learning_rate": 3.3818386914833614e-05, + "loss": 0.7701, + "step": 5739 + }, + { + "epoch": 3.2374506486181613, + "grad_norm": 1.1334714889526367, + "learning_rate": 3.381556683587141e-05, + "loss": 0.6744, + "step": 5740 + }, + { + "epoch": 3.2380146644106036, + "grad_norm": 1.5056265592575073, + "learning_rate": 3.38127467569092e-05, + "loss": 0.8683, + "step": 5741 + }, + { + "epoch": 3.238578680203046, + "grad_norm": 1.0770020484924316, + "learning_rate": 3.3809926677946984e-05, + "loss": 0.6913, + "step": 5742 + }, + { + "epoch": 3.239142695995488, + "grad_norm": 1.3196462392807007, + "learning_rate": 3.380710659898477e-05, + "loss": 0.7396, + "step": 5743 + }, + { + "epoch": 3.23970671178793, + "grad_norm": 3.8340437412261963, + "learning_rate": 3.380428652002256e-05, + "loss": 0.9367, + "step": 5744 + }, + { + "epoch": 3.240270727580372, + "grad_norm": 1.1006611585617065, + "learning_rate": 3.3801466441060354e-05, + "loss": 0.683, + "step": 5745 + }, + { + "epoch": 3.2408347433728144, + "grad_norm": 1.4724634885787964, + "learning_rate": 3.379864636209814e-05, + "loss": 0.8497, + "step": 5746 + }, + { + "epoch": 3.2413987591652567, + "grad_norm": 1.285098910331726, + "learning_rate": 3.379582628313593e-05, + "loss": 0.8599, + "step": 5747 + }, + { + "epoch": 3.241962774957699, + "grad_norm": 0.8794533014297485, + "learning_rate": 3.379300620417372e-05, + "loss": 0.6609, + "step": 5748 + }, + { + "epoch": 3.242526790750141, + "grad_norm": 1.3058305978775024, + "learning_rate": 3.379018612521151e-05, + "loss": 0.8794, + "step": 5749 + }, + { + "epoch": 3.2430908065425834, + "grad_norm": 1.168738603591919, + "learning_rate": 3.3787366046249295e-05, + "loss": 0.8051, + "step": 5750 + }, + { + "epoch": 3.2436548223350252, + "grad_norm": 2.266206741333008, + "learning_rate": 3.378454596728709e-05, + "loss": 0.8271, + "step": 5751 + }, + { + "epoch": 3.2442188381274675, + "grad_norm": 1.6222224235534668, + "learning_rate": 3.378172588832487e-05, + "loss": 0.8458, + "step": 5752 + }, + { + "epoch": 3.2447828539199097, + "grad_norm": 3.477837085723877, + "learning_rate": 3.3778905809362664e-05, + "loss": 0.7939, + "step": 5753 + }, + { + "epoch": 3.245346869712352, + "grad_norm": 0.9717532992362976, + "learning_rate": 3.377608573040045e-05, + "loss": 0.6017, + "step": 5754 + }, + { + "epoch": 3.2459108855047942, + "grad_norm": 1.979442834854126, + "learning_rate": 3.377326565143824e-05, + "loss": 0.8869, + "step": 5755 + }, + { + "epoch": 3.2464749012972365, + "grad_norm": 4.571866512298584, + "learning_rate": 3.377044557247603e-05, + "loss": 0.7983, + "step": 5756 + }, + { + "epoch": 3.2470389170896787, + "grad_norm": 1.1882667541503906, + "learning_rate": 3.376762549351382e-05, + "loss": 0.7499, + "step": 5757 + }, + { + "epoch": 3.2476029328821205, + "grad_norm": 1.7238820791244507, + "learning_rate": 3.376480541455161e-05, + "loss": 0.7539, + "step": 5758 + }, + { + "epoch": 3.248166948674563, + "grad_norm": 1.2126710414886475, + "learning_rate": 3.37619853355894e-05, + "loss": 0.7808, + "step": 5759 + }, + { + "epoch": 3.248730964467005, + "grad_norm": 1.224709391593933, + "learning_rate": 3.375916525662719e-05, + "loss": 0.6705, + "step": 5760 + }, + { + "epoch": 3.2492949802594473, + "grad_norm": 1.9499391317367554, + "learning_rate": 3.3756345177664975e-05, + "loss": 0.8082, + "step": 5761 + }, + { + "epoch": 3.2498589960518895, + "grad_norm": 1.624570608139038, + "learning_rate": 3.375352509870277e-05, + "loss": 0.8054, + "step": 5762 + }, + { + "epoch": 3.250423011844332, + "grad_norm": 1.23528254032135, + "learning_rate": 3.375070501974055e-05, + "loss": 0.7362, + "step": 5763 + }, + { + "epoch": 3.250987027636774, + "grad_norm": 1.054046392440796, + "learning_rate": 3.3747884940778345e-05, + "loss": 0.8307, + "step": 5764 + }, + { + "epoch": 3.251551043429216, + "grad_norm": 1.0894134044647217, + "learning_rate": 3.374506486181614e-05, + "loss": 0.6903, + "step": 5765 + }, + { + "epoch": 3.252115059221658, + "grad_norm": 1.9302978515625, + "learning_rate": 3.374224478285392e-05, + "loss": 0.807, + "step": 5766 + }, + { + "epoch": 3.2526790750141004, + "grad_norm": 1.5377044677734375, + "learning_rate": 3.373942470389171e-05, + "loss": 0.8037, + "step": 5767 + }, + { + "epoch": 3.2532430908065426, + "grad_norm": 1.2646229267120361, + "learning_rate": 3.37366046249295e-05, + "loss": 0.8307, + "step": 5768 + }, + { + "epoch": 3.253807106598985, + "grad_norm": 1.1439902782440186, + "learning_rate": 3.373378454596729e-05, + "loss": 0.6463, + "step": 5769 + }, + { + "epoch": 3.254371122391427, + "grad_norm": 0.9744473695755005, + "learning_rate": 3.373096446700508e-05, + "loss": 0.7152, + "step": 5770 + }, + { + "epoch": 3.2549351381838694, + "grad_norm": 1.746068000793457, + "learning_rate": 3.372814438804286e-05, + "loss": 0.798, + "step": 5771 + }, + { + "epoch": 3.255499153976311, + "grad_norm": 1.1408771276474, + "learning_rate": 3.3725324309080655e-05, + "loss": 0.6394, + "step": 5772 + }, + { + "epoch": 3.2560631697687534, + "grad_norm": 1.4771850109100342, + "learning_rate": 3.372250423011845e-05, + "loss": 0.7951, + "step": 5773 + }, + { + "epoch": 3.2566271855611957, + "grad_norm": 1.2993085384368896, + "learning_rate": 3.371968415115623e-05, + "loss": 0.7601, + "step": 5774 + }, + { + "epoch": 3.257191201353638, + "grad_norm": 1.5512560606002808, + "learning_rate": 3.371686407219402e-05, + "loss": 0.7831, + "step": 5775 + }, + { + "epoch": 3.25775521714608, + "grad_norm": 1.3143768310546875, + "learning_rate": 3.371404399323182e-05, + "loss": 0.7944, + "step": 5776 + }, + { + "epoch": 3.2583192329385224, + "grad_norm": 1.1122984886169434, + "learning_rate": 3.37112239142696e-05, + "loss": 0.7593, + "step": 5777 + }, + { + "epoch": 3.2588832487309647, + "grad_norm": 1.628251314163208, + "learning_rate": 3.370840383530739e-05, + "loss": 0.7647, + "step": 5778 + }, + { + "epoch": 3.2594472645234065, + "grad_norm": 1.3042387962341309, + "learning_rate": 3.370558375634518e-05, + "loss": 0.7626, + "step": 5779 + }, + { + "epoch": 3.2600112803158487, + "grad_norm": 1.7352463006973267, + "learning_rate": 3.370276367738297e-05, + "loss": 0.7213, + "step": 5780 + }, + { + "epoch": 3.260575296108291, + "grad_norm": 1.3220151662826538, + "learning_rate": 3.369994359842076e-05, + "loss": 0.6917, + "step": 5781 + }, + { + "epoch": 3.2611393119007333, + "grad_norm": 2.2614634037017822, + "learning_rate": 3.369712351945854e-05, + "loss": 0.8704, + "step": 5782 + }, + { + "epoch": 3.2617033276931755, + "grad_norm": 1.356263518333435, + "learning_rate": 3.3694303440496335e-05, + "loss": 0.7827, + "step": 5783 + }, + { + "epoch": 3.2622673434856178, + "grad_norm": 1.079001545906067, + "learning_rate": 3.369148336153413e-05, + "loss": 0.6727, + "step": 5784 + }, + { + "epoch": 3.26283135927806, + "grad_norm": 1.3855078220367432, + "learning_rate": 3.368866328257191e-05, + "loss": 0.8711, + "step": 5785 + }, + { + "epoch": 3.263395375070502, + "grad_norm": 1.575613021850586, + "learning_rate": 3.3685843203609705e-05, + "loss": 0.761, + "step": 5786 + }, + { + "epoch": 3.263959390862944, + "grad_norm": 1.4540350437164307, + "learning_rate": 3.368302312464749e-05, + "loss": 0.785, + "step": 5787 + }, + { + "epoch": 3.2645234066553863, + "grad_norm": 1.2484259605407715, + "learning_rate": 3.368020304568528e-05, + "loss": 0.66, + "step": 5788 + }, + { + "epoch": 3.2650874224478286, + "grad_norm": 1.6570894718170166, + "learning_rate": 3.367738296672307e-05, + "loss": 0.8892, + "step": 5789 + }, + { + "epoch": 3.265651438240271, + "grad_norm": 1.777673363685608, + "learning_rate": 3.367456288776086e-05, + "loss": 0.7142, + "step": 5790 + }, + { + "epoch": 3.266215454032713, + "grad_norm": 0.9655534625053406, + "learning_rate": 3.3671742808798645e-05, + "loss": 0.7104, + "step": 5791 + }, + { + "epoch": 3.2667794698251553, + "grad_norm": 1.196000337600708, + "learning_rate": 3.366892272983644e-05, + "loss": 0.7117, + "step": 5792 + }, + { + "epoch": 3.267343485617597, + "grad_norm": 1.3664456605911255, + "learning_rate": 3.366610265087422e-05, + "loss": 0.7294, + "step": 5793 + }, + { + "epoch": 3.2679075014100394, + "grad_norm": 1.669800043106079, + "learning_rate": 3.3663282571912015e-05, + "loss": 0.8327, + "step": 5794 + }, + { + "epoch": 3.2684715172024816, + "grad_norm": 1.0386126041412354, + "learning_rate": 3.366046249294981e-05, + "loss": 0.7223, + "step": 5795 + }, + { + "epoch": 3.269035532994924, + "grad_norm": 1.4572877883911133, + "learning_rate": 3.365764241398759e-05, + "loss": 0.7883, + "step": 5796 + }, + { + "epoch": 3.269599548787366, + "grad_norm": 1.6509127616882324, + "learning_rate": 3.3654822335025385e-05, + "loss": 0.7451, + "step": 5797 + }, + { + "epoch": 3.2701635645798084, + "grad_norm": 1.3245741128921509, + "learning_rate": 3.365200225606317e-05, + "loss": 0.7008, + "step": 5798 + }, + { + "epoch": 3.2707275803722506, + "grad_norm": 2.257344961166382, + "learning_rate": 3.364918217710096e-05, + "loss": 0.8284, + "step": 5799 + }, + { + "epoch": 3.2712915961646925, + "grad_norm": 1.3695907592773438, + "learning_rate": 3.364636209813875e-05, + "loss": 0.8231, + "step": 5800 + }, + { + "epoch": 3.2718556119571347, + "grad_norm": 1.1755452156066895, + "learning_rate": 3.364354201917654e-05, + "loss": 0.7692, + "step": 5801 + }, + { + "epoch": 3.272419627749577, + "grad_norm": 5.829773426055908, + "learning_rate": 3.3640721940214326e-05, + "loss": 0.7206, + "step": 5802 + }, + { + "epoch": 3.272983643542019, + "grad_norm": 1.9921128749847412, + "learning_rate": 3.363790186125212e-05, + "loss": 0.6444, + "step": 5803 + }, + { + "epoch": 3.2735476593344615, + "grad_norm": 1.7829056978225708, + "learning_rate": 3.363508178228991e-05, + "loss": 0.7599, + "step": 5804 + }, + { + "epoch": 3.2741116751269037, + "grad_norm": 1.3307007551193237, + "learning_rate": 3.3632261703327695e-05, + "loss": 0.7255, + "step": 5805 + }, + { + "epoch": 3.274675690919346, + "grad_norm": 1.3018344640731812, + "learning_rate": 3.362944162436548e-05, + "loss": 0.7896, + "step": 5806 + }, + { + "epoch": 3.2752397067117878, + "grad_norm": 1.1979752779006958, + "learning_rate": 3.362662154540327e-05, + "loss": 0.6986, + "step": 5807 + }, + { + "epoch": 3.27580372250423, + "grad_norm": 2.7248520851135254, + "learning_rate": 3.3623801466441065e-05, + "loss": 0.8555, + "step": 5808 + }, + { + "epoch": 3.2763677382966723, + "grad_norm": 1.6016892194747925, + "learning_rate": 3.362098138747885e-05, + "loss": 0.7832, + "step": 5809 + }, + { + "epoch": 3.2769317540891145, + "grad_norm": 1.5805745124816895, + "learning_rate": 3.3618161308516636e-05, + "loss": 0.7086, + "step": 5810 + }, + { + "epoch": 3.2774957698815568, + "grad_norm": 1.1291229724884033, + "learning_rate": 3.361534122955443e-05, + "loss": 0.7715, + "step": 5811 + }, + { + "epoch": 3.278059785673999, + "grad_norm": 1.181694507598877, + "learning_rate": 3.361252115059222e-05, + "loss": 0.8058, + "step": 5812 + }, + { + "epoch": 3.2786238014664413, + "grad_norm": 1.0255616903305054, + "learning_rate": 3.3609701071630006e-05, + "loss": 0.6505, + "step": 5813 + }, + { + "epoch": 3.279187817258883, + "grad_norm": 1.2185084819793701, + "learning_rate": 3.360688099266779e-05, + "loss": 0.8167, + "step": 5814 + }, + { + "epoch": 3.2797518330513253, + "grad_norm": 1.3002196550369263, + "learning_rate": 3.360406091370559e-05, + "loss": 0.7394, + "step": 5815 + }, + { + "epoch": 3.2803158488437676, + "grad_norm": 0.972511351108551, + "learning_rate": 3.3601240834743376e-05, + "loss": 0.6541, + "step": 5816 + }, + { + "epoch": 3.28087986463621, + "grad_norm": 1.1241375207901, + "learning_rate": 3.359842075578116e-05, + "loss": 0.7296, + "step": 5817 + }, + { + "epoch": 3.281443880428652, + "grad_norm": 1.0912060737609863, + "learning_rate": 3.359560067681895e-05, + "loss": 0.6493, + "step": 5818 + }, + { + "epoch": 3.2820078962210943, + "grad_norm": 0.7787783741950989, + "learning_rate": 3.3592780597856745e-05, + "loss": 0.5475, + "step": 5819 + }, + { + "epoch": 3.2825719120135366, + "grad_norm": 1.0144951343536377, + "learning_rate": 3.358996051889453e-05, + "loss": 0.6973, + "step": 5820 + }, + { + "epoch": 3.2831359278059784, + "grad_norm": 1.2082377672195435, + "learning_rate": 3.3587140439932316e-05, + "loss": 0.78, + "step": 5821 + }, + { + "epoch": 3.2836999435984207, + "grad_norm": 2.3979759216308594, + "learning_rate": 3.358432036097011e-05, + "loss": 0.992, + "step": 5822 + }, + { + "epoch": 3.284263959390863, + "grad_norm": 1.760275959968567, + "learning_rate": 3.35815002820079e-05, + "loss": 0.6898, + "step": 5823 + }, + { + "epoch": 3.284827975183305, + "grad_norm": 1.9101591110229492, + "learning_rate": 3.3578680203045686e-05, + "loss": 0.8624, + "step": 5824 + }, + { + "epoch": 3.2853919909757474, + "grad_norm": 1.487568974494934, + "learning_rate": 3.357586012408347e-05, + "loss": 0.7275, + "step": 5825 + }, + { + "epoch": 3.2859560067681897, + "grad_norm": 1.850010871887207, + "learning_rate": 3.3573040045121264e-05, + "loss": 0.7092, + "step": 5826 + }, + { + "epoch": 3.286520022560632, + "grad_norm": 1.2381008863449097, + "learning_rate": 3.3570219966159056e-05, + "loss": 0.8488, + "step": 5827 + }, + { + "epoch": 3.2870840383530737, + "grad_norm": 1.1520308256149292, + "learning_rate": 3.356739988719684e-05, + "loss": 0.7208, + "step": 5828 + }, + { + "epoch": 3.287648054145516, + "grad_norm": 1.3990199565887451, + "learning_rate": 3.356457980823463e-05, + "loss": 0.8382, + "step": 5829 + }, + { + "epoch": 3.2882120699379582, + "grad_norm": 1.116603136062622, + "learning_rate": 3.3561759729272425e-05, + "loss": 0.8104, + "step": 5830 + }, + { + "epoch": 3.2887760857304005, + "grad_norm": 2.4759652614593506, + "learning_rate": 3.355893965031021e-05, + "loss": 0.8768, + "step": 5831 + }, + { + "epoch": 3.2893401015228427, + "grad_norm": 1.2541353702545166, + "learning_rate": 3.3556119571347996e-05, + "loss": 0.8062, + "step": 5832 + }, + { + "epoch": 3.289904117315285, + "grad_norm": 1.3795007467269897, + "learning_rate": 3.355329949238579e-05, + "loss": 0.8077, + "step": 5833 + }, + { + "epoch": 3.2904681331077272, + "grad_norm": 1.9704803228378296, + "learning_rate": 3.355047941342358e-05, + "loss": 0.8105, + "step": 5834 + }, + { + "epoch": 3.291032148900169, + "grad_norm": 1.0661566257476807, + "learning_rate": 3.3547659334461366e-05, + "loss": 0.6804, + "step": 5835 + }, + { + "epoch": 3.2915961646926113, + "grad_norm": 1.3890348672866821, + "learning_rate": 3.354483925549916e-05, + "loss": 0.875, + "step": 5836 + }, + { + "epoch": 3.2921601804850535, + "grad_norm": 1.086404800415039, + "learning_rate": 3.3542019176536944e-05, + "loss": 0.7868, + "step": 5837 + }, + { + "epoch": 3.292724196277496, + "grad_norm": 1.1229525804519653, + "learning_rate": 3.3539199097574736e-05, + "loss": 0.7461, + "step": 5838 + }, + { + "epoch": 3.293288212069938, + "grad_norm": 1.2705154418945312, + "learning_rate": 3.353637901861252e-05, + "loss": 0.8413, + "step": 5839 + }, + { + "epoch": 3.2938522278623803, + "grad_norm": 1.3008779287338257, + "learning_rate": 3.3533558939650313e-05, + "loss": 0.8239, + "step": 5840 + }, + { + "epoch": 3.2944162436548226, + "grad_norm": 1.4431856870651245, + "learning_rate": 3.35307388606881e-05, + "loss": 0.679, + "step": 5841 + }, + { + "epoch": 3.2949802594472644, + "grad_norm": 1.0877549648284912, + "learning_rate": 3.352791878172589e-05, + "loss": 0.6498, + "step": 5842 + }, + { + "epoch": 3.2955442752397066, + "grad_norm": 1.2493400573730469, + "learning_rate": 3.3525098702763676e-05, + "loss": 0.8037, + "step": 5843 + }, + { + "epoch": 3.296108291032149, + "grad_norm": 1.3858715295791626, + "learning_rate": 3.352227862380147e-05, + "loss": 0.6922, + "step": 5844 + }, + { + "epoch": 3.296672306824591, + "grad_norm": 1.4153826236724854, + "learning_rate": 3.3519458544839254e-05, + "loss": 0.7728, + "step": 5845 + }, + { + "epoch": 3.2972363226170334, + "grad_norm": 1.313567876815796, + "learning_rate": 3.3516638465877046e-05, + "loss": 0.6661, + "step": 5846 + }, + { + "epoch": 3.2978003384094756, + "grad_norm": 1.1613945960998535, + "learning_rate": 3.351381838691484e-05, + "loss": 0.7966, + "step": 5847 + }, + { + "epoch": 3.298364354201918, + "grad_norm": 1.294851303100586, + "learning_rate": 3.3510998307952624e-05, + "loss": 0.7681, + "step": 5848 + }, + { + "epoch": 3.2989283699943597, + "grad_norm": 1.9686272144317627, + "learning_rate": 3.350817822899041e-05, + "loss": 0.795, + "step": 5849 + }, + { + "epoch": 3.299492385786802, + "grad_norm": 2.2596139907836914, + "learning_rate": 3.35053581500282e-05, + "loss": 0.8481, + "step": 5850 + }, + { + "epoch": 3.300056401579244, + "grad_norm": 1.6352734565734863, + "learning_rate": 3.3502538071065994e-05, + "loss": 0.6269, + "step": 5851 + }, + { + "epoch": 3.3006204173716864, + "grad_norm": 1.2802352905273438, + "learning_rate": 3.349971799210378e-05, + "loss": 0.8302, + "step": 5852 + }, + { + "epoch": 3.3011844331641287, + "grad_norm": 1.1239542961120605, + "learning_rate": 3.3496897913141564e-05, + "loss": 0.7778, + "step": 5853 + }, + { + "epoch": 3.301748448956571, + "grad_norm": 0.9468363523483276, + "learning_rate": 3.349407783417936e-05, + "loss": 0.6776, + "step": 5854 + }, + { + "epoch": 3.302312464749013, + "grad_norm": 1.154557466506958, + "learning_rate": 3.349125775521715e-05, + "loss": 0.7286, + "step": 5855 + }, + { + "epoch": 3.302876480541455, + "grad_norm": 1.1113108396530151, + "learning_rate": 3.3488437676254934e-05, + "loss": 0.7657, + "step": 5856 + }, + { + "epoch": 3.3034404963338972, + "grad_norm": 1.4663515090942383, + "learning_rate": 3.3485617597292726e-05, + "loss": 0.735, + "step": 5857 + }, + { + "epoch": 3.3040045121263395, + "grad_norm": 1.151698112487793, + "learning_rate": 3.348279751833052e-05, + "loss": 0.7595, + "step": 5858 + }, + { + "epoch": 3.3045685279187818, + "grad_norm": 1.063844919204712, + "learning_rate": 3.3479977439368304e-05, + "loss": 0.719, + "step": 5859 + }, + { + "epoch": 3.305132543711224, + "grad_norm": 1.1215472221374512, + "learning_rate": 3.347715736040609e-05, + "loss": 0.7748, + "step": 5860 + }, + { + "epoch": 3.3056965595036663, + "grad_norm": 1.383344054222107, + "learning_rate": 3.347433728144388e-05, + "loss": 0.697, + "step": 5861 + }, + { + "epoch": 3.3062605752961085, + "grad_norm": 1.1033761501312256, + "learning_rate": 3.3471517202481674e-05, + "loss": 0.8148, + "step": 5862 + }, + { + "epoch": 3.3068245910885503, + "grad_norm": 1.931015968322754, + "learning_rate": 3.346869712351946e-05, + "loss": 0.7951, + "step": 5863 + }, + { + "epoch": 3.3073886068809926, + "grad_norm": 1.2929425239562988, + "learning_rate": 3.3465877044557245e-05, + "loss": 0.7057, + "step": 5864 + }, + { + "epoch": 3.307952622673435, + "grad_norm": 1.1207658052444458, + "learning_rate": 3.3463056965595044e-05, + "loss": 0.7421, + "step": 5865 + }, + { + "epoch": 3.308516638465877, + "grad_norm": 1.131466269493103, + "learning_rate": 3.346023688663283e-05, + "loss": 0.7421, + "step": 5866 + }, + { + "epoch": 3.3090806542583193, + "grad_norm": 1.3440792560577393, + "learning_rate": 3.3457416807670614e-05, + "loss": 0.846, + "step": 5867 + }, + { + "epoch": 3.3096446700507616, + "grad_norm": 1.6218657493591309, + "learning_rate": 3.3454596728708407e-05, + "loss": 0.8792, + "step": 5868 + }, + { + "epoch": 3.310208685843204, + "grad_norm": 1.0149182081222534, + "learning_rate": 3.34517766497462e-05, + "loss": 0.8099, + "step": 5869 + }, + { + "epoch": 3.3107727016356456, + "grad_norm": 1.2648671865463257, + "learning_rate": 3.3448956570783984e-05, + "loss": 0.8131, + "step": 5870 + }, + { + "epoch": 3.311336717428088, + "grad_norm": 2.833770751953125, + "learning_rate": 3.344613649182177e-05, + "loss": 0.8778, + "step": 5871 + }, + { + "epoch": 3.31190073322053, + "grad_norm": 2.297112226486206, + "learning_rate": 3.344331641285956e-05, + "loss": 0.7782, + "step": 5872 + }, + { + "epoch": 3.3124647490129724, + "grad_norm": 3.469269275665283, + "learning_rate": 3.3440496333897354e-05, + "loss": 0.7766, + "step": 5873 + }, + { + "epoch": 3.3130287648054146, + "grad_norm": 1.4145551919937134, + "learning_rate": 3.343767625493514e-05, + "loss": 0.7487, + "step": 5874 + }, + { + "epoch": 3.313592780597857, + "grad_norm": 1.0019222497940063, + "learning_rate": 3.343485617597293e-05, + "loss": 0.7518, + "step": 5875 + }, + { + "epoch": 3.314156796390299, + "grad_norm": 2.349616289138794, + "learning_rate": 3.343203609701072e-05, + "loss": 0.9265, + "step": 5876 + }, + { + "epoch": 3.314720812182741, + "grad_norm": 1.5180076360702515, + "learning_rate": 3.342921601804851e-05, + "loss": 0.673, + "step": 5877 + }, + { + "epoch": 3.315284827975183, + "grad_norm": 1.3483304977416992, + "learning_rate": 3.3426395939086294e-05, + "loss": 0.8108, + "step": 5878 + }, + { + "epoch": 3.3158488437676255, + "grad_norm": 1.1721872091293335, + "learning_rate": 3.342357586012409e-05, + "loss": 0.8013, + "step": 5879 + }, + { + "epoch": 3.3164128595600677, + "grad_norm": 1.4551950693130493, + "learning_rate": 3.342075578116187e-05, + "loss": 0.7342, + "step": 5880 + }, + { + "epoch": 3.31697687535251, + "grad_norm": 1.244197130203247, + "learning_rate": 3.3417935702199664e-05, + "loss": 0.7496, + "step": 5881 + }, + { + "epoch": 3.317540891144952, + "grad_norm": 1.8240445852279663, + "learning_rate": 3.341511562323745e-05, + "loss": 0.8661, + "step": 5882 + }, + { + "epoch": 3.3181049069373945, + "grad_norm": 0.9023531675338745, + "learning_rate": 3.341229554427524e-05, + "loss": 0.6265, + "step": 5883 + }, + { + "epoch": 3.3186689227298363, + "grad_norm": 1.2575653791427612, + "learning_rate": 3.340947546531303e-05, + "loss": 0.6583, + "step": 5884 + }, + { + "epoch": 3.3192329385222785, + "grad_norm": 0.935534656047821, + "learning_rate": 3.340665538635082e-05, + "loss": 0.7254, + "step": 5885 + }, + { + "epoch": 3.3197969543147208, + "grad_norm": 1.3270732164382935, + "learning_rate": 3.340383530738861e-05, + "loss": 0.7809, + "step": 5886 + }, + { + "epoch": 3.320360970107163, + "grad_norm": 1.0725975036621094, + "learning_rate": 3.34010152284264e-05, + "loss": 0.7817, + "step": 5887 + }, + { + "epoch": 3.3209249858996053, + "grad_norm": 0.9460633993148804, + "learning_rate": 3.339819514946418e-05, + "loss": 0.7289, + "step": 5888 + }, + { + "epoch": 3.3214890016920475, + "grad_norm": 1.8313959836959839, + "learning_rate": 3.3395375070501975e-05, + "loss": 0.802, + "step": 5889 + }, + { + "epoch": 3.32205301748449, + "grad_norm": 1.3060648441314697, + "learning_rate": 3.339255499153977e-05, + "loss": 0.766, + "step": 5890 + }, + { + "epoch": 3.3226170332769316, + "grad_norm": 1.1859666109085083, + "learning_rate": 3.338973491257755e-05, + "loss": 0.6626, + "step": 5891 + }, + { + "epoch": 3.323181049069374, + "grad_norm": 1.4627699851989746, + "learning_rate": 3.338691483361534e-05, + "loss": 0.7984, + "step": 5892 + }, + { + "epoch": 3.323745064861816, + "grad_norm": 0.9156669974327087, + "learning_rate": 3.3384094754653137e-05, + "loss": 0.6444, + "step": 5893 + }, + { + "epoch": 3.3243090806542583, + "grad_norm": 0.9748626947402954, + "learning_rate": 3.338127467569092e-05, + "loss": 0.6586, + "step": 5894 + }, + { + "epoch": 3.3248730964467006, + "grad_norm": 1.5491610765457153, + "learning_rate": 3.337845459672871e-05, + "loss": 0.78, + "step": 5895 + }, + { + "epoch": 3.325437112239143, + "grad_norm": 1.9007395505905151, + "learning_rate": 3.33756345177665e-05, + "loss": 0.8565, + "step": 5896 + }, + { + "epoch": 3.326001128031585, + "grad_norm": 1.082636833190918, + "learning_rate": 3.337281443880429e-05, + "loss": 0.737, + "step": 5897 + }, + { + "epoch": 3.326565143824027, + "grad_norm": 1.0953292846679688, + "learning_rate": 3.336999435984208e-05, + "loss": 0.7259, + "step": 5898 + }, + { + "epoch": 3.327129159616469, + "grad_norm": 2.171391248703003, + "learning_rate": 3.336717428087986e-05, + "loss": 0.8195, + "step": 5899 + }, + { + "epoch": 3.3276931754089114, + "grad_norm": 1.1835262775421143, + "learning_rate": 3.3364354201917655e-05, + "loss": 0.6803, + "step": 5900 + }, + { + "epoch": 3.3282571912013537, + "grad_norm": 1.131967306137085, + "learning_rate": 3.336153412295545e-05, + "loss": 0.7347, + "step": 5901 + }, + { + "epoch": 3.328821206993796, + "grad_norm": 2.2872707843780518, + "learning_rate": 3.335871404399323e-05, + "loss": 0.8051, + "step": 5902 + }, + { + "epoch": 3.329385222786238, + "grad_norm": 1.6942728757858276, + "learning_rate": 3.335589396503102e-05, + "loss": 0.7765, + "step": 5903 + }, + { + "epoch": 3.3299492385786804, + "grad_norm": 1.5785763263702393, + "learning_rate": 3.335307388606882e-05, + "loss": 0.6783, + "step": 5904 + }, + { + "epoch": 3.3305132543711222, + "grad_norm": 1.0951669216156006, + "learning_rate": 3.33502538071066e-05, + "loss": 0.7836, + "step": 5905 + }, + { + "epoch": 3.3310772701635645, + "grad_norm": 1.4169560670852661, + "learning_rate": 3.334743372814439e-05, + "loss": 0.6204, + "step": 5906 + }, + { + "epoch": 3.3316412859560067, + "grad_norm": 1.212157130241394, + "learning_rate": 3.334461364918218e-05, + "loss": 0.7821, + "step": 5907 + }, + { + "epoch": 3.332205301748449, + "grad_norm": 1.3305089473724365, + "learning_rate": 3.334179357021997e-05, + "loss": 0.9281, + "step": 5908 + }, + { + "epoch": 3.3327693175408912, + "grad_norm": 1.5470564365386963, + "learning_rate": 3.333897349125776e-05, + "loss": 0.7483, + "step": 5909 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 1.6880168914794922, + "learning_rate": 3.333615341229554e-05, + "loss": 0.8608, + "step": 5910 + }, + { + "epoch": 3.3338973491257757, + "grad_norm": 1.5056190490722656, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.8213, + "step": 5911 + }, + { + "epoch": 3.3344613649182175, + "grad_norm": 1.5594887733459473, + "learning_rate": 3.333051325437113e-05, + "loss": 0.6604, + "step": 5912 + }, + { + "epoch": 3.33502538071066, + "grad_norm": 1.5874608755111694, + "learning_rate": 3.332769317540891e-05, + "loss": 0.7408, + "step": 5913 + }, + { + "epoch": 3.335589396503102, + "grad_norm": 1.7747867107391357, + "learning_rate": 3.3324873096446705e-05, + "loss": 0.7258, + "step": 5914 + }, + { + "epoch": 3.3361534122955443, + "grad_norm": 2.055694341659546, + "learning_rate": 3.332205301748449e-05, + "loss": 0.7824, + "step": 5915 + }, + { + "epoch": 3.3367174280879865, + "grad_norm": 1.5543817281723022, + "learning_rate": 3.331923293852228e-05, + "loss": 0.7498, + "step": 5916 + }, + { + "epoch": 3.337281443880429, + "grad_norm": 1.1613267660140991, + "learning_rate": 3.331641285956007e-05, + "loss": 0.7835, + "step": 5917 + }, + { + "epoch": 3.337845459672871, + "grad_norm": 1.5492234230041504, + "learning_rate": 3.331359278059786e-05, + "loss": 0.7286, + "step": 5918 + }, + { + "epoch": 3.338409475465313, + "grad_norm": 1.8777729272842407, + "learning_rate": 3.3310772701635645e-05, + "loss": 0.8968, + "step": 5919 + }, + { + "epoch": 3.338973491257755, + "grad_norm": 1.0162723064422607, + "learning_rate": 3.330795262267344e-05, + "loss": 0.6936, + "step": 5920 + }, + { + "epoch": 3.3395375070501974, + "grad_norm": 1.3776912689208984, + "learning_rate": 3.330513254371122e-05, + "loss": 0.6908, + "step": 5921 + }, + { + "epoch": 3.3401015228426396, + "grad_norm": 1.630392074584961, + "learning_rate": 3.3302312464749015e-05, + "loss": 0.8012, + "step": 5922 + }, + { + "epoch": 3.340665538635082, + "grad_norm": 2.5339601039886475, + "learning_rate": 3.32994923857868e-05, + "loss": 0.6646, + "step": 5923 + }, + { + "epoch": 3.341229554427524, + "grad_norm": 1.1906211376190186, + "learning_rate": 3.329667230682459e-05, + "loss": 0.7701, + "step": 5924 + }, + { + "epoch": 3.3417935702199664, + "grad_norm": 1.7961821556091309, + "learning_rate": 3.3293852227862385e-05, + "loss": 0.8133, + "step": 5925 + }, + { + "epoch": 3.342357586012408, + "grad_norm": 1.2486416101455688, + "learning_rate": 3.329103214890017e-05, + "loss": 0.7037, + "step": 5926 + }, + { + "epoch": 3.3429216018048504, + "grad_norm": 2.1825740337371826, + "learning_rate": 3.3288212069937956e-05, + "loss": 0.8888, + "step": 5927 + }, + { + "epoch": 3.3434856175972927, + "grad_norm": 0.9602975249290466, + "learning_rate": 3.328539199097575e-05, + "loss": 0.7814, + "step": 5928 + }, + { + "epoch": 3.344049633389735, + "grad_norm": 1.5041210651397705, + "learning_rate": 3.328257191201354e-05, + "loss": 0.7605, + "step": 5929 + }, + { + "epoch": 3.344613649182177, + "grad_norm": 0.9489894509315491, + "learning_rate": 3.3279751833051325e-05, + "loss": 0.771, + "step": 5930 + }, + { + "epoch": 3.3451776649746194, + "grad_norm": 1.0727587938308716, + "learning_rate": 3.327693175408911e-05, + "loss": 0.7905, + "step": 5931 + }, + { + "epoch": 3.3457416807670617, + "grad_norm": 1.649381399154663, + "learning_rate": 3.327411167512691e-05, + "loss": 0.8118, + "step": 5932 + }, + { + "epoch": 3.3463056965595035, + "grad_norm": 1.738005518913269, + "learning_rate": 3.3271291596164695e-05, + "loss": 0.8311, + "step": 5933 + }, + { + "epoch": 3.3468697123519457, + "grad_norm": 1.0997871160507202, + "learning_rate": 3.326847151720248e-05, + "loss": 0.7964, + "step": 5934 + }, + { + "epoch": 3.347433728144388, + "grad_norm": 1.5388027429580688, + "learning_rate": 3.326565143824027e-05, + "loss": 0.7759, + "step": 5935 + }, + { + "epoch": 3.3479977439368303, + "grad_norm": 1.5273809432983398, + "learning_rate": 3.3262831359278065e-05, + "loss": 0.6897, + "step": 5936 + }, + { + "epoch": 3.3485617597292725, + "grad_norm": 1.3730394840240479, + "learning_rate": 3.326001128031585e-05, + "loss": 0.7724, + "step": 5937 + }, + { + "epoch": 3.3491257755217148, + "grad_norm": 1.4324724674224854, + "learning_rate": 3.3257191201353636e-05, + "loss": 0.8832, + "step": 5938 + }, + { + "epoch": 3.349689791314157, + "grad_norm": 0.9131780862808228, + "learning_rate": 3.325437112239143e-05, + "loss": 0.638, + "step": 5939 + }, + { + "epoch": 3.350253807106599, + "grad_norm": 1.3741261959075928, + "learning_rate": 3.325155104342922e-05, + "loss": 0.7468, + "step": 5940 + }, + { + "epoch": 3.350817822899041, + "grad_norm": 1.203094720840454, + "learning_rate": 3.3248730964467006e-05, + "loss": 0.6789, + "step": 5941 + }, + { + "epoch": 3.3513818386914833, + "grad_norm": 0.8429622650146484, + "learning_rate": 3.324591088550479e-05, + "loss": 0.6727, + "step": 5942 + }, + { + "epoch": 3.3519458544839256, + "grad_norm": 1.2065088748931885, + "learning_rate": 3.324309080654259e-05, + "loss": 0.759, + "step": 5943 + }, + { + "epoch": 3.352509870276368, + "grad_norm": 1.3252332210540771, + "learning_rate": 3.3240270727580375e-05, + "loss": 0.786, + "step": 5944 + }, + { + "epoch": 3.35307388606881, + "grad_norm": 1.3698590993881226, + "learning_rate": 3.323745064861816e-05, + "loss": 0.8734, + "step": 5945 + }, + { + "epoch": 3.3536379018612523, + "grad_norm": 1.3249231576919556, + "learning_rate": 3.323463056965595e-05, + "loss": 0.8343, + "step": 5946 + }, + { + "epoch": 3.354201917653694, + "grad_norm": 1.9363030195236206, + "learning_rate": 3.3231810490693745e-05, + "loss": 0.8006, + "step": 5947 + }, + { + "epoch": 3.3547659334461364, + "grad_norm": 1.336272954940796, + "learning_rate": 3.322899041173153e-05, + "loss": 0.7895, + "step": 5948 + }, + { + "epoch": 3.3553299492385786, + "grad_norm": 0.9411410689353943, + "learning_rate": 3.3226170332769316e-05, + "loss": 0.7065, + "step": 5949 + }, + { + "epoch": 3.355893965031021, + "grad_norm": 1.3645671606063843, + "learning_rate": 3.322335025380711e-05, + "loss": 0.7568, + "step": 5950 + }, + { + "epoch": 3.356457980823463, + "grad_norm": 1.3891026973724365, + "learning_rate": 3.32205301748449e-05, + "loss": 0.6993, + "step": 5951 + }, + { + "epoch": 3.3570219966159054, + "grad_norm": 1.3562601804733276, + "learning_rate": 3.3217710095882686e-05, + "loss": 0.7174, + "step": 5952 + }, + { + "epoch": 3.3575860124083476, + "grad_norm": 1.4175360202789307, + "learning_rate": 3.321489001692048e-05, + "loss": 0.762, + "step": 5953 + }, + { + "epoch": 3.3581500282007894, + "grad_norm": 1.9872524738311768, + "learning_rate": 3.321206993795826e-05, + "loss": 0.8128, + "step": 5954 + }, + { + "epoch": 3.3587140439932317, + "grad_norm": 1.035081148147583, + "learning_rate": 3.3209249858996056e-05, + "loss": 0.7182, + "step": 5955 + }, + { + "epoch": 3.359278059785674, + "grad_norm": 1.6328970193862915, + "learning_rate": 3.320642978003384e-05, + "loss": 0.7363, + "step": 5956 + }, + { + "epoch": 3.359842075578116, + "grad_norm": 1.3486433029174805, + "learning_rate": 3.320360970107163e-05, + "loss": 0.7964, + "step": 5957 + }, + { + "epoch": 3.3604060913705585, + "grad_norm": 1.3242994546890259, + "learning_rate": 3.320078962210942e-05, + "loss": 0.8559, + "step": 5958 + }, + { + "epoch": 3.3609701071630007, + "grad_norm": 1.2954983711242676, + "learning_rate": 3.319796954314721e-05, + "loss": 0.8075, + "step": 5959 + }, + { + "epoch": 3.361534122955443, + "grad_norm": 1.3409581184387207, + "learning_rate": 3.3195149464184996e-05, + "loss": 0.7633, + "step": 5960 + }, + { + "epoch": 3.3620981387478848, + "grad_norm": 1.343142032623291, + "learning_rate": 3.319232938522279e-05, + "loss": 0.821, + "step": 5961 + }, + { + "epoch": 3.362662154540327, + "grad_norm": 2.2300524711608887, + "learning_rate": 3.3189509306260574e-05, + "loss": 0.8983, + "step": 5962 + }, + { + "epoch": 3.3632261703327693, + "grad_norm": 1.7717617750167847, + "learning_rate": 3.3186689227298366e-05, + "loss": 0.8605, + "step": 5963 + }, + { + "epoch": 3.3637901861252115, + "grad_norm": 1.4395346641540527, + "learning_rate": 3.318386914833616e-05, + "loss": 0.762, + "step": 5964 + }, + { + "epoch": 3.3643542019176538, + "grad_norm": 1.2964345216751099, + "learning_rate": 3.3181049069373943e-05, + "loss": 0.8079, + "step": 5965 + }, + { + "epoch": 3.364918217710096, + "grad_norm": 1.1055008172988892, + "learning_rate": 3.317822899041173e-05, + "loss": 0.765, + "step": 5966 + }, + { + "epoch": 3.3654822335025383, + "grad_norm": 1.517945647239685, + "learning_rate": 3.317540891144952e-05, + "loss": 0.7417, + "step": 5967 + }, + { + "epoch": 3.36604624929498, + "grad_norm": 1.2225604057312012, + "learning_rate": 3.317258883248731e-05, + "loss": 0.6886, + "step": 5968 + }, + { + "epoch": 3.3666102650874223, + "grad_norm": 1.156570553779602, + "learning_rate": 3.31697687535251e-05, + "loss": 0.7586, + "step": 5969 + }, + { + "epoch": 3.3671742808798646, + "grad_norm": 1.1359667778015137, + "learning_rate": 3.316694867456289e-05, + "loss": 0.7063, + "step": 5970 + }, + { + "epoch": 3.367738296672307, + "grad_norm": 1.0835344791412354, + "learning_rate": 3.316412859560068e-05, + "loss": 0.6781, + "step": 5971 + }, + { + "epoch": 3.368302312464749, + "grad_norm": 2.597184419631958, + "learning_rate": 3.316130851663847e-05, + "loss": 0.8798, + "step": 5972 + }, + { + "epoch": 3.3688663282571913, + "grad_norm": 2.1205756664276123, + "learning_rate": 3.3158488437676254e-05, + "loss": 0.7603, + "step": 5973 + }, + { + "epoch": 3.3694303440496336, + "grad_norm": 2.211571455001831, + "learning_rate": 3.3155668358714046e-05, + "loss": 0.717, + "step": 5974 + }, + { + "epoch": 3.3699943598420754, + "grad_norm": 1.0133463144302368, + "learning_rate": 3.315284827975184e-05, + "loss": 0.6496, + "step": 5975 + }, + { + "epoch": 3.3705583756345177, + "grad_norm": 1.8579902648925781, + "learning_rate": 3.3150028200789624e-05, + "loss": 0.6812, + "step": 5976 + }, + { + "epoch": 3.37112239142696, + "grad_norm": 1.595048189163208, + "learning_rate": 3.314720812182741e-05, + "loss": 0.7875, + "step": 5977 + }, + { + "epoch": 3.371686407219402, + "grad_norm": 3.154452085494995, + "learning_rate": 3.31443880428652e-05, + "loss": 0.8023, + "step": 5978 + }, + { + "epoch": 3.3722504230118444, + "grad_norm": 1.2996844053268433, + "learning_rate": 3.3141567963902993e-05, + "loss": 0.6942, + "step": 5979 + }, + { + "epoch": 3.3728144388042867, + "grad_norm": 1.185442328453064, + "learning_rate": 3.313874788494078e-05, + "loss": 0.6836, + "step": 5980 + }, + { + "epoch": 3.373378454596729, + "grad_norm": 2.1299962997436523, + "learning_rate": 3.3135927805978564e-05, + "loss": 0.8501, + "step": 5981 + }, + { + "epoch": 3.3739424703891707, + "grad_norm": 1.6003100872039795, + "learning_rate": 3.313310772701636e-05, + "loss": 0.756, + "step": 5982 + }, + { + "epoch": 3.374506486181613, + "grad_norm": 2.354863166809082, + "learning_rate": 3.313028764805415e-05, + "loss": 0.6843, + "step": 5983 + }, + { + "epoch": 3.3750705019740552, + "grad_norm": 1.383545994758606, + "learning_rate": 3.3127467569091934e-05, + "loss": 0.7567, + "step": 5984 + }, + { + "epoch": 3.3756345177664975, + "grad_norm": 0.8842910528182983, + "learning_rate": 3.3124647490129726e-05, + "loss": 0.6381, + "step": 5985 + }, + { + "epoch": 3.3761985335589397, + "grad_norm": 2.407806396484375, + "learning_rate": 3.312182741116752e-05, + "loss": 0.8197, + "step": 5986 + }, + { + "epoch": 3.376762549351382, + "grad_norm": 2.7070109844207764, + "learning_rate": 3.3119007332205304e-05, + "loss": 0.7769, + "step": 5987 + }, + { + "epoch": 3.3773265651438242, + "grad_norm": 1.5184099674224854, + "learning_rate": 3.311618725324309e-05, + "loss": 0.7765, + "step": 5988 + }, + { + "epoch": 3.377890580936266, + "grad_norm": 1.6084542274475098, + "learning_rate": 3.311336717428088e-05, + "loss": 0.7314, + "step": 5989 + }, + { + "epoch": 3.3784545967287083, + "grad_norm": 1.3276056051254272, + "learning_rate": 3.3110547095318674e-05, + "loss": 0.7206, + "step": 5990 + }, + { + "epoch": 3.3790186125211505, + "grad_norm": 1.401200532913208, + "learning_rate": 3.310772701635646e-05, + "loss": 0.7765, + "step": 5991 + }, + { + "epoch": 3.379582628313593, + "grad_norm": 1.1060516834259033, + "learning_rate": 3.3104906937394244e-05, + "loss": 0.8437, + "step": 5992 + }, + { + "epoch": 3.380146644106035, + "grad_norm": 1.5901780128479004, + "learning_rate": 3.3102086858432037e-05, + "loss": 0.8381, + "step": 5993 + }, + { + "epoch": 3.3807106598984773, + "grad_norm": 1.3007760047912598, + "learning_rate": 3.309926677946983e-05, + "loss": 0.8783, + "step": 5994 + }, + { + "epoch": 3.3812746756909196, + "grad_norm": 2.585623264312744, + "learning_rate": 3.3096446700507614e-05, + "loss": 0.8802, + "step": 5995 + }, + { + "epoch": 3.3818386914833614, + "grad_norm": 0.9718672633171082, + "learning_rate": 3.3093626621545406e-05, + "loss": 0.6793, + "step": 5996 + }, + { + "epoch": 3.3824027072758036, + "grad_norm": 2.2477269172668457, + "learning_rate": 3.309080654258319e-05, + "loss": 0.7357, + "step": 5997 + }, + { + "epoch": 3.382966723068246, + "grad_norm": 1.238193154335022, + "learning_rate": 3.3087986463620984e-05, + "loss": 0.7563, + "step": 5998 + }, + { + "epoch": 3.383530738860688, + "grad_norm": 1.5765316486358643, + "learning_rate": 3.308516638465877e-05, + "loss": 0.841, + "step": 5999 + }, + { + "epoch": 3.3840947546531304, + "grad_norm": 1.4330226182937622, + "learning_rate": 3.308234630569656e-05, + "loss": 0.8778, + "step": 6000 + }, + { + "epoch": 3.3846587704455726, + "grad_norm": 45.7878303527832, + "learning_rate": 3.307952622673435e-05, + "loss": 0.9332, + "step": 6001 + }, + { + "epoch": 3.385222786238015, + "grad_norm": 2.533677339553833, + "learning_rate": 3.307670614777214e-05, + "loss": 0.7447, + "step": 6002 + }, + { + "epoch": 3.3857868020304567, + "grad_norm": 1.830029845237732, + "learning_rate": 3.307388606880993e-05, + "loss": 0.8397, + "step": 6003 + }, + { + "epoch": 3.386350817822899, + "grad_norm": 0.9617888331413269, + "learning_rate": 3.307106598984772e-05, + "loss": 0.7137, + "step": 6004 + }, + { + "epoch": 3.386914833615341, + "grad_norm": 1.2229984998703003, + "learning_rate": 3.30682459108855e-05, + "loss": 0.7506, + "step": 6005 + }, + { + "epoch": 3.3874788494077834, + "grad_norm": 3.170426368713379, + "learning_rate": 3.3065425831923294e-05, + "loss": 0.8478, + "step": 6006 + }, + { + "epoch": 3.3880428652002257, + "grad_norm": 1.2093971967697144, + "learning_rate": 3.3062605752961087e-05, + "loss": 0.7645, + "step": 6007 + }, + { + "epoch": 3.388606880992668, + "grad_norm": 0.9716242551803589, + "learning_rate": 3.305978567399887e-05, + "loss": 0.6687, + "step": 6008 + }, + { + "epoch": 3.38917089678511, + "grad_norm": 1.8389651775360107, + "learning_rate": 3.3056965595036664e-05, + "loss": 0.7787, + "step": 6009 + }, + { + "epoch": 3.389734912577552, + "grad_norm": 1.2661941051483154, + "learning_rate": 3.305414551607445e-05, + "loss": 0.7868, + "step": 6010 + }, + { + "epoch": 3.3902989283699942, + "grad_norm": 1.381704330444336, + "learning_rate": 3.305132543711224e-05, + "loss": 0.735, + "step": 6011 + }, + { + "epoch": 3.3908629441624365, + "grad_norm": 1.5375990867614746, + "learning_rate": 3.304850535815003e-05, + "loss": 0.7591, + "step": 6012 + }, + { + "epoch": 3.3914269599548788, + "grad_norm": 2.8496601581573486, + "learning_rate": 3.304568527918782e-05, + "loss": 0.7987, + "step": 6013 + }, + { + "epoch": 3.391990975747321, + "grad_norm": 1.3658033609390259, + "learning_rate": 3.304286520022561e-05, + "loss": 0.8149, + "step": 6014 + }, + { + "epoch": 3.3925549915397633, + "grad_norm": 1.737308382987976, + "learning_rate": 3.30400451212634e-05, + "loss": 0.797, + "step": 6015 + }, + { + "epoch": 3.3931190073322055, + "grad_norm": 0.7822115421295166, + "learning_rate": 3.303722504230118e-05, + "loss": 0.5819, + "step": 6016 + }, + { + "epoch": 3.3936830231246473, + "grad_norm": 1.5184836387634277, + "learning_rate": 3.3034404963338974e-05, + "loss": 0.7225, + "step": 6017 + }, + { + "epoch": 3.3942470389170896, + "grad_norm": 1.1496120691299438, + "learning_rate": 3.303158488437677e-05, + "loss": 0.6939, + "step": 6018 + }, + { + "epoch": 3.394811054709532, + "grad_norm": 1.2736550569534302, + "learning_rate": 3.302876480541455e-05, + "loss": 0.7736, + "step": 6019 + }, + { + "epoch": 3.395375070501974, + "grad_norm": 1.3883891105651855, + "learning_rate": 3.302594472645234e-05, + "loss": 0.689, + "step": 6020 + }, + { + "epoch": 3.3959390862944163, + "grad_norm": 1.7679023742675781, + "learning_rate": 3.3023124647490136e-05, + "loss": 0.9725, + "step": 6021 + }, + { + "epoch": 3.3965031020868586, + "grad_norm": 1.5724897384643555, + "learning_rate": 3.302030456852792e-05, + "loss": 0.8274, + "step": 6022 + }, + { + "epoch": 3.397067117879301, + "grad_norm": 1.6328941583633423, + "learning_rate": 3.301748448956571e-05, + "loss": 0.8266, + "step": 6023 + }, + { + "epoch": 3.3976311336717426, + "grad_norm": 1.259330153465271, + "learning_rate": 3.30146644106035e-05, + "loss": 0.7877, + "step": 6024 + }, + { + "epoch": 3.398195149464185, + "grad_norm": 1.1734095811843872, + "learning_rate": 3.301184433164129e-05, + "loss": 0.6559, + "step": 6025 + }, + { + "epoch": 3.398759165256627, + "grad_norm": 1.0664796829223633, + "learning_rate": 3.300902425267908e-05, + "loss": 0.6298, + "step": 6026 + }, + { + "epoch": 3.3993231810490694, + "grad_norm": 1.275609016418457, + "learning_rate": 3.300620417371686e-05, + "loss": 0.7257, + "step": 6027 + }, + { + "epoch": 3.3998871968415116, + "grad_norm": 1.2721320390701294, + "learning_rate": 3.3003384094754655e-05, + "loss": 0.6588, + "step": 6028 + }, + { + "epoch": 3.400451212633954, + "grad_norm": 1.9759047031402588, + "learning_rate": 3.300056401579245e-05, + "loss": 0.8611, + "step": 6029 + }, + { + "epoch": 3.401015228426396, + "grad_norm": 1.7156009674072266, + "learning_rate": 3.299774393683023e-05, + "loss": 0.8657, + "step": 6030 + }, + { + "epoch": 3.401579244218838, + "grad_norm": 1.15489661693573, + "learning_rate": 3.299492385786802e-05, + "loss": 0.7397, + "step": 6031 + }, + { + "epoch": 3.40214326001128, + "grad_norm": 0.9435311555862427, + "learning_rate": 3.299210377890581e-05, + "loss": 0.6393, + "step": 6032 + }, + { + "epoch": 3.4027072758037225, + "grad_norm": 1.1436338424682617, + "learning_rate": 3.29892836999436e-05, + "loss": 0.6935, + "step": 6033 + }, + { + "epoch": 3.4032712915961647, + "grad_norm": 1.195976972579956, + "learning_rate": 3.298646362098139e-05, + "loss": 0.769, + "step": 6034 + }, + { + "epoch": 3.403835307388607, + "grad_norm": 1.3839290142059326, + "learning_rate": 3.298364354201918e-05, + "loss": 0.7371, + "step": 6035 + }, + { + "epoch": 3.404399323181049, + "grad_norm": 3.0059216022491455, + "learning_rate": 3.2980823463056965e-05, + "loss": 1.0787, + "step": 6036 + }, + { + "epoch": 3.4049633389734915, + "grad_norm": 1.4927705526351929, + "learning_rate": 3.297800338409476e-05, + "loss": 0.7042, + "step": 6037 + }, + { + "epoch": 3.4055273547659333, + "grad_norm": 1.6573482751846313, + "learning_rate": 3.297518330513254e-05, + "loss": 0.8082, + "step": 6038 + }, + { + "epoch": 3.4060913705583755, + "grad_norm": 1.244741678237915, + "learning_rate": 3.2972363226170335e-05, + "loss": 0.7718, + "step": 6039 + }, + { + "epoch": 3.4066553863508178, + "grad_norm": 1.4229689836502075, + "learning_rate": 3.296954314720812e-05, + "loss": 0.8593, + "step": 6040 + }, + { + "epoch": 3.40721940214326, + "grad_norm": 1.1574150323867798, + "learning_rate": 3.296672306824591e-05, + "loss": 0.7359, + "step": 6041 + }, + { + "epoch": 3.4077834179357023, + "grad_norm": 1.0016474723815918, + "learning_rate": 3.2963902989283705e-05, + "loss": 0.7103, + "step": 6042 + }, + { + "epoch": 3.4083474337281445, + "grad_norm": 1.1238739490509033, + "learning_rate": 3.296108291032149e-05, + "loss": 0.7495, + "step": 6043 + }, + { + "epoch": 3.408911449520587, + "grad_norm": 1.723650336265564, + "learning_rate": 3.295826283135928e-05, + "loss": 0.8182, + "step": 6044 + }, + { + "epoch": 3.4094754653130286, + "grad_norm": 1.0924688577651978, + "learning_rate": 3.295544275239707e-05, + "loss": 0.8011, + "step": 6045 + }, + { + "epoch": 3.410039481105471, + "grad_norm": 1.2562977075576782, + "learning_rate": 3.295262267343486e-05, + "loss": 0.7128, + "step": 6046 + }, + { + "epoch": 3.410603496897913, + "grad_norm": 1.1563141345977783, + "learning_rate": 3.2949802594472645e-05, + "loss": 0.7285, + "step": 6047 + }, + { + "epoch": 3.4111675126903553, + "grad_norm": 1.0223300457000732, + "learning_rate": 3.294698251551044e-05, + "loss": 0.6111, + "step": 6048 + }, + { + "epoch": 3.4117315284827976, + "grad_norm": 0.8784473538398743, + "learning_rate": 3.294416243654822e-05, + "loss": 0.638, + "step": 6049 + }, + { + "epoch": 3.41229554427524, + "grad_norm": 1.0557132959365845, + "learning_rate": 3.2941342357586015e-05, + "loss": 0.8261, + "step": 6050 + }, + { + "epoch": 3.412859560067682, + "grad_norm": 1.2391661405563354, + "learning_rate": 3.29385222786238e-05, + "loss": 0.7224, + "step": 6051 + }, + { + "epoch": 3.413423575860124, + "grad_norm": 0.896231472492218, + "learning_rate": 3.293570219966159e-05, + "loss": 0.624, + "step": 6052 + }, + { + "epoch": 3.413987591652566, + "grad_norm": 1.6015671491622925, + "learning_rate": 3.2932882120699385e-05, + "loss": 0.8188, + "step": 6053 + }, + { + "epoch": 3.4145516074450084, + "grad_norm": 1.2440071105957031, + "learning_rate": 3.293006204173717e-05, + "loss": 0.6946, + "step": 6054 + }, + { + "epoch": 3.4151156232374507, + "grad_norm": 2.2928857803344727, + "learning_rate": 3.2927241962774956e-05, + "loss": 0.8825, + "step": 6055 + }, + { + "epoch": 3.415679639029893, + "grad_norm": 1.228778600692749, + "learning_rate": 3.292442188381275e-05, + "loss": 0.7743, + "step": 6056 + }, + { + "epoch": 3.416243654822335, + "grad_norm": 1.0089836120605469, + "learning_rate": 3.292160180485054e-05, + "loss": 0.748, + "step": 6057 + }, + { + "epoch": 3.4168076706147774, + "grad_norm": 1.6430611610412598, + "learning_rate": 3.2918781725888325e-05, + "loss": 0.8393, + "step": 6058 + }, + { + "epoch": 3.4173716864072192, + "grad_norm": 1.3201491832733154, + "learning_rate": 3.291596164692611e-05, + "loss": 0.6915, + "step": 6059 + }, + { + "epoch": 3.4179357021996615, + "grad_norm": 1.8049399852752686, + "learning_rate": 3.291314156796391e-05, + "loss": 0.8138, + "step": 6060 + }, + { + "epoch": 3.4184997179921037, + "grad_norm": 1.0400491952896118, + "learning_rate": 3.2910321489001695e-05, + "loss": 0.6798, + "step": 6061 + }, + { + "epoch": 3.419063733784546, + "grad_norm": 2.990953207015991, + "learning_rate": 3.290750141003948e-05, + "loss": 0.8852, + "step": 6062 + }, + { + "epoch": 3.4196277495769882, + "grad_norm": 1.5868605375289917, + "learning_rate": 3.290468133107727e-05, + "loss": 0.6945, + "step": 6063 + }, + { + "epoch": 3.4201917653694305, + "grad_norm": 1.3210716247558594, + "learning_rate": 3.2901861252115065e-05, + "loss": 0.761, + "step": 6064 + }, + { + "epoch": 3.4207557811618727, + "grad_norm": 1.0449566841125488, + "learning_rate": 3.289904117315285e-05, + "loss": 0.7666, + "step": 6065 + }, + { + "epoch": 3.4213197969543145, + "grad_norm": 1.3562555313110352, + "learning_rate": 3.2896221094190636e-05, + "loss": 0.7549, + "step": 6066 + }, + { + "epoch": 3.421883812746757, + "grad_norm": 1.022244930267334, + "learning_rate": 3.289340101522843e-05, + "loss": 0.7193, + "step": 6067 + }, + { + "epoch": 3.422447828539199, + "grad_norm": 1.0242582559585571, + "learning_rate": 3.289058093626622e-05, + "loss": 0.6937, + "step": 6068 + }, + { + "epoch": 3.4230118443316413, + "grad_norm": 1.544330358505249, + "learning_rate": 3.2887760857304005e-05, + "loss": 0.9241, + "step": 6069 + }, + { + "epoch": 3.4235758601240835, + "grad_norm": 0.9395970106124878, + "learning_rate": 3.288494077834179e-05, + "loss": 0.7511, + "step": 6070 + }, + { + "epoch": 3.424139875916526, + "grad_norm": 0.9583370685577393, + "learning_rate": 3.288212069937958e-05, + "loss": 0.611, + "step": 6071 + }, + { + "epoch": 3.424703891708968, + "grad_norm": 1.8842558860778809, + "learning_rate": 3.2879300620417375e-05, + "loss": 0.8516, + "step": 6072 + }, + { + "epoch": 3.42526790750141, + "grad_norm": 1.1527687311172485, + "learning_rate": 3.287648054145516e-05, + "loss": 0.8422, + "step": 6073 + }, + { + "epoch": 3.425831923293852, + "grad_norm": 1.3987784385681152, + "learning_rate": 3.287366046249295e-05, + "loss": 0.7692, + "step": 6074 + }, + { + "epoch": 3.4263959390862944, + "grad_norm": 1.757792592048645, + "learning_rate": 3.287084038353074e-05, + "loss": 0.8064, + "step": 6075 + }, + { + "epoch": 3.4269599548787366, + "grad_norm": 1.8671691417694092, + "learning_rate": 3.286802030456853e-05, + "loss": 0.7959, + "step": 6076 + }, + { + "epoch": 3.427523970671179, + "grad_norm": 1.3074697256088257, + "learning_rate": 3.2865200225606316e-05, + "loss": 0.7258, + "step": 6077 + }, + { + "epoch": 3.428087986463621, + "grad_norm": 1.1760886907577515, + "learning_rate": 3.286238014664411e-05, + "loss": 0.7114, + "step": 6078 + }, + { + "epoch": 3.4286520022560634, + "grad_norm": 1.5671107769012451, + "learning_rate": 3.28595600676819e-05, + "loss": 0.8118, + "step": 6079 + }, + { + "epoch": 3.429216018048505, + "grad_norm": 1.130251407623291, + "learning_rate": 3.2856739988719686e-05, + "loss": 0.7246, + "step": 6080 + }, + { + "epoch": 3.4297800338409474, + "grad_norm": 3.0593926906585693, + "learning_rate": 3.285391990975748e-05, + "loss": 0.8188, + "step": 6081 + }, + { + "epoch": 3.4303440496333897, + "grad_norm": 0.9317616820335388, + "learning_rate": 3.285109983079526e-05, + "loss": 0.5883, + "step": 6082 + }, + { + "epoch": 3.430908065425832, + "grad_norm": 1.47529137134552, + "learning_rate": 3.2848279751833055e-05, + "loss": 0.7647, + "step": 6083 + }, + { + "epoch": 3.431472081218274, + "grad_norm": 1.7933458089828491, + "learning_rate": 3.284545967287084e-05, + "loss": 0.8226, + "step": 6084 + }, + { + "epoch": 3.4320360970107164, + "grad_norm": 6.939945220947266, + "learning_rate": 3.284263959390863e-05, + "loss": 0.707, + "step": 6085 + }, + { + "epoch": 3.4326001128031587, + "grad_norm": 1.4360259771347046, + "learning_rate": 3.283981951494642e-05, + "loss": 0.7799, + "step": 6086 + }, + { + "epoch": 3.4331641285956005, + "grad_norm": 1.3722056150436401, + "learning_rate": 3.283699943598421e-05, + "loss": 0.7188, + "step": 6087 + }, + { + "epoch": 3.4337281443880427, + "grad_norm": 1.236414909362793, + "learning_rate": 3.2834179357021996e-05, + "loss": 0.7252, + "step": 6088 + }, + { + "epoch": 3.434292160180485, + "grad_norm": 1.3123146295547485, + "learning_rate": 3.283135927805979e-05, + "loss": 0.7633, + "step": 6089 + }, + { + "epoch": 3.4348561759729273, + "grad_norm": 1.25961434841156, + "learning_rate": 3.2828539199097574e-05, + "loss": 0.7589, + "step": 6090 + }, + { + "epoch": 3.4354201917653695, + "grad_norm": 1.6029636859893799, + "learning_rate": 3.2825719120135366e-05, + "loss": 0.703, + "step": 6091 + }, + { + "epoch": 3.4359842075578118, + "grad_norm": 1.232375979423523, + "learning_rate": 3.282289904117316e-05, + "loss": 0.7955, + "step": 6092 + }, + { + "epoch": 3.436548223350254, + "grad_norm": 1.2823573350906372, + "learning_rate": 3.282007896221094e-05, + "loss": 0.7961, + "step": 6093 + }, + { + "epoch": 3.437112239142696, + "grad_norm": 1.2734870910644531, + "learning_rate": 3.281725888324873e-05, + "loss": 0.7695, + "step": 6094 + }, + { + "epoch": 3.437676254935138, + "grad_norm": 1.8253177404403687, + "learning_rate": 3.281443880428652e-05, + "loss": 0.789, + "step": 6095 + }, + { + "epoch": 3.4382402707275803, + "grad_norm": 2.2530672550201416, + "learning_rate": 3.281161872532431e-05, + "loss": 0.7136, + "step": 6096 + }, + { + "epoch": 3.4388042865200226, + "grad_norm": 0.9924137592315674, + "learning_rate": 3.28087986463621e-05, + "loss": 0.6663, + "step": 6097 + }, + { + "epoch": 3.439368302312465, + "grad_norm": 1.6539642810821533, + "learning_rate": 3.2805978567399884e-05, + "loss": 0.8072, + "step": 6098 + }, + { + "epoch": 3.439932318104907, + "grad_norm": 1.9825129508972168, + "learning_rate": 3.280315848843768e-05, + "loss": 0.7013, + "step": 6099 + }, + { + "epoch": 3.4404963338973493, + "grad_norm": 2.593069076538086, + "learning_rate": 3.280033840947547e-05, + "loss": 0.6431, + "step": 6100 + }, + { + "epoch": 3.441060349689791, + "grad_norm": 1.1518467664718628, + "learning_rate": 3.2797518330513254e-05, + "loss": 0.8015, + "step": 6101 + }, + { + "epoch": 3.4416243654822334, + "grad_norm": 0.8811089992523193, + "learning_rate": 3.2794698251551046e-05, + "loss": 0.6624, + "step": 6102 + }, + { + "epoch": 3.4421883812746756, + "grad_norm": 1.197019338607788, + "learning_rate": 3.279187817258884e-05, + "loss": 0.7434, + "step": 6103 + }, + { + "epoch": 3.442752397067118, + "grad_norm": 1.5419714450836182, + "learning_rate": 3.2789058093626623e-05, + "loss": 0.8549, + "step": 6104 + }, + { + "epoch": 3.44331641285956, + "grad_norm": 1.2281663417816162, + "learning_rate": 3.278623801466441e-05, + "loss": 0.7402, + "step": 6105 + }, + { + "epoch": 3.4438804286520024, + "grad_norm": 1.809636116027832, + "learning_rate": 3.27834179357022e-05, + "loss": 0.8317, + "step": 6106 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 1.0437004566192627, + "learning_rate": 3.278059785673999e-05, + "loss": 0.6559, + "step": 6107 + }, + { + "epoch": 3.4450084602368864, + "grad_norm": 1.8607759475708008, + "learning_rate": 3.277777777777778e-05, + "loss": 0.7721, + "step": 6108 + }, + { + "epoch": 3.4455724760293287, + "grad_norm": 1.1095020771026611, + "learning_rate": 3.2774957698815564e-05, + "loss": 0.6768, + "step": 6109 + }, + { + "epoch": 3.446136491821771, + "grad_norm": 1.5474332571029663, + "learning_rate": 3.2772137619853356e-05, + "loss": 0.7466, + "step": 6110 + }, + { + "epoch": 3.446700507614213, + "grad_norm": 0.9997957944869995, + "learning_rate": 3.276931754089115e-05, + "loss": 0.7484, + "step": 6111 + }, + { + "epoch": 3.4472645234066555, + "grad_norm": 1.252105474472046, + "learning_rate": 3.2766497461928934e-05, + "loss": 0.75, + "step": 6112 + }, + { + "epoch": 3.4478285391990977, + "grad_norm": 1.3290538787841797, + "learning_rate": 3.2763677382966726e-05, + "loss": 0.7054, + "step": 6113 + }, + { + "epoch": 3.44839255499154, + "grad_norm": 1.3981539011001587, + "learning_rate": 3.276085730400452e-05, + "loss": 0.7919, + "step": 6114 + }, + { + "epoch": 3.4489565707839818, + "grad_norm": 1.6960688829421997, + "learning_rate": 3.2758037225042304e-05, + "loss": 0.6979, + "step": 6115 + }, + { + "epoch": 3.449520586576424, + "grad_norm": 1.0148290395736694, + "learning_rate": 3.275521714608009e-05, + "loss": 0.6558, + "step": 6116 + }, + { + "epoch": 3.4500846023688663, + "grad_norm": 1.4311987161636353, + "learning_rate": 3.275239706711788e-05, + "loss": 0.8803, + "step": 6117 + }, + { + "epoch": 3.4506486181613085, + "grad_norm": 1.357190728187561, + "learning_rate": 3.2749576988155673e-05, + "loss": 0.8483, + "step": 6118 + }, + { + "epoch": 3.4512126339537508, + "grad_norm": 1.645817518234253, + "learning_rate": 3.274675690919346e-05, + "loss": 0.8851, + "step": 6119 + }, + { + "epoch": 3.451776649746193, + "grad_norm": 1.4132095575332642, + "learning_rate": 3.274393683023125e-05, + "loss": 0.6603, + "step": 6120 + }, + { + "epoch": 3.4523406655386353, + "grad_norm": 3.2085719108581543, + "learning_rate": 3.2741116751269036e-05, + "loss": 0.8825, + "step": 6121 + }, + { + "epoch": 3.452904681331077, + "grad_norm": 1.509200930595398, + "learning_rate": 3.273829667230683e-05, + "loss": 0.8499, + "step": 6122 + }, + { + "epoch": 3.4534686971235193, + "grad_norm": 1.3516258001327515, + "learning_rate": 3.2735476593344614e-05, + "loss": 0.8231, + "step": 6123 + }, + { + "epoch": 3.4540327129159616, + "grad_norm": 1.324279546737671, + "learning_rate": 3.2732656514382406e-05, + "loss": 0.7199, + "step": 6124 + }, + { + "epoch": 3.454596728708404, + "grad_norm": 1.0109715461730957, + "learning_rate": 3.272983643542019e-05, + "loss": 0.6348, + "step": 6125 + }, + { + "epoch": 3.455160744500846, + "grad_norm": 1.3097000122070312, + "learning_rate": 3.2727016356457984e-05, + "loss": 0.88, + "step": 6126 + }, + { + "epoch": 3.4557247602932883, + "grad_norm": 1.0360406637191772, + "learning_rate": 3.272419627749577e-05, + "loss": 0.7922, + "step": 6127 + }, + { + "epoch": 3.4562887760857306, + "grad_norm": 1.0069713592529297, + "learning_rate": 3.272137619853356e-05, + "loss": 0.7449, + "step": 6128 + }, + { + "epoch": 3.4568527918781724, + "grad_norm": 1.532023310661316, + "learning_rate": 3.271855611957135e-05, + "loss": 0.8146, + "step": 6129 + }, + { + "epoch": 3.4574168076706147, + "grad_norm": 1.321244478225708, + "learning_rate": 3.271573604060914e-05, + "loss": 0.7807, + "step": 6130 + }, + { + "epoch": 3.457980823463057, + "grad_norm": 1.0524674654006958, + "learning_rate": 3.271291596164693e-05, + "loss": 0.68, + "step": 6131 + }, + { + "epoch": 3.458544839255499, + "grad_norm": 1.2251405715942383, + "learning_rate": 3.2710095882684717e-05, + "loss": 0.7121, + "step": 6132 + }, + { + "epoch": 3.4591088550479414, + "grad_norm": 1.2945444583892822, + "learning_rate": 3.27072758037225e-05, + "loss": 0.8533, + "step": 6133 + }, + { + "epoch": 3.4596728708403837, + "grad_norm": 1.2155534029006958, + "learning_rate": 3.2704455724760294e-05, + "loss": 0.7062, + "step": 6134 + }, + { + "epoch": 3.460236886632826, + "grad_norm": 0.9793747067451477, + "learning_rate": 3.2701635645798086e-05, + "loss": 0.6575, + "step": 6135 + }, + { + "epoch": 3.4608009024252677, + "grad_norm": 1.6584171056747437, + "learning_rate": 3.269881556683587e-05, + "loss": 0.8241, + "step": 6136 + }, + { + "epoch": 3.46136491821771, + "grad_norm": 1.0194305181503296, + "learning_rate": 3.269599548787366e-05, + "loss": 0.7583, + "step": 6137 + }, + { + "epoch": 3.4619289340101522, + "grad_norm": 0.9115114808082581, + "learning_rate": 3.2693175408911456e-05, + "loss": 0.7004, + "step": 6138 + }, + { + "epoch": 3.4624929498025945, + "grad_norm": 1.0541411638259888, + "learning_rate": 3.269035532994924e-05, + "loss": 0.6835, + "step": 6139 + }, + { + "epoch": 3.4630569655950367, + "grad_norm": 2.059570789337158, + "learning_rate": 3.268753525098703e-05, + "loss": 0.8704, + "step": 6140 + }, + { + "epoch": 3.463620981387479, + "grad_norm": 1.5452613830566406, + "learning_rate": 3.268471517202481e-05, + "loss": 0.8883, + "step": 6141 + }, + { + "epoch": 3.4641849971799212, + "grad_norm": 1.5537407398223877, + "learning_rate": 3.268189509306261e-05, + "loss": 0.8039, + "step": 6142 + }, + { + "epoch": 3.464749012972363, + "grad_norm": 0.9819119572639465, + "learning_rate": 3.26790750141004e-05, + "loss": 0.7826, + "step": 6143 + }, + { + "epoch": 3.4653130287648053, + "grad_norm": 1.7318617105484009, + "learning_rate": 3.267625493513818e-05, + "loss": 0.7374, + "step": 6144 + }, + { + "epoch": 3.4658770445572475, + "grad_norm": 1.1386125087738037, + "learning_rate": 3.2673434856175974e-05, + "loss": 0.8113, + "step": 6145 + }, + { + "epoch": 3.46644106034969, + "grad_norm": 1.4464070796966553, + "learning_rate": 3.2670614777213766e-05, + "loss": 0.6519, + "step": 6146 + }, + { + "epoch": 3.467005076142132, + "grad_norm": 0.9952981472015381, + "learning_rate": 3.266779469825155e-05, + "loss": 0.7293, + "step": 6147 + }, + { + "epoch": 3.4675690919345743, + "grad_norm": 0.9250311255455017, + "learning_rate": 3.266497461928934e-05, + "loss": 0.6457, + "step": 6148 + }, + { + "epoch": 3.4681331077270166, + "grad_norm": 1.0210514068603516, + "learning_rate": 3.2662154540327136e-05, + "loss": 0.7504, + "step": 6149 + }, + { + "epoch": 3.4686971235194584, + "grad_norm": 1.18464994430542, + "learning_rate": 3.265933446136492e-05, + "loss": 0.8084, + "step": 6150 + }, + { + "epoch": 3.4692611393119006, + "grad_norm": 1.0783482789993286, + "learning_rate": 3.265651438240271e-05, + "loss": 0.7831, + "step": 6151 + }, + { + "epoch": 3.469825155104343, + "grad_norm": 1.7409254312515259, + "learning_rate": 3.26536943034405e-05, + "loss": 0.7945, + "step": 6152 + }, + { + "epoch": 3.470389170896785, + "grad_norm": 1.745208501815796, + "learning_rate": 3.265087422447829e-05, + "loss": 0.6991, + "step": 6153 + }, + { + "epoch": 3.4709531866892274, + "grad_norm": 1.5134023427963257, + "learning_rate": 3.264805414551608e-05, + "loss": 0.7598, + "step": 6154 + }, + { + "epoch": 3.4715172024816696, + "grad_norm": 1.1635887622833252, + "learning_rate": 3.264523406655386e-05, + "loss": 0.6714, + "step": 6155 + }, + { + "epoch": 3.472081218274112, + "grad_norm": 1.2070190906524658, + "learning_rate": 3.2642413987591654e-05, + "loss": 0.6697, + "step": 6156 + }, + { + "epoch": 3.4726452340665537, + "grad_norm": 1.1053615808486938, + "learning_rate": 3.263959390862945e-05, + "loss": 0.8175, + "step": 6157 + }, + { + "epoch": 3.473209249858996, + "grad_norm": 2.321896553039551, + "learning_rate": 3.263677382966723e-05, + "loss": 0.7649, + "step": 6158 + }, + { + "epoch": 3.473773265651438, + "grad_norm": 1.1093451976776123, + "learning_rate": 3.263395375070502e-05, + "loss": 0.7458, + "step": 6159 + }, + { + "epoch": 3.4743372814438804, + "grad_norm": 1.0297232866287231, + "learning_rate": 3.263113367174281e-05, + "loss": 0.6149, + "step": 6160 + }, + { + "epoch": 3.4749012972363227, + "grad_norm": 1.2333513498306274, + "learning_rate": 3.26283135927806e-05, + "loss": 0.6954, + "step": 6161 + }, + { + "epoch": 3.475465313028765, + "grad_norm": 1.0247528553009033, + "learning_rate": 3.262549351381839e-05, + "loss": 0.7453, + "step": 6162 + }, + { + "epoch": 3.476029328821207, + "grad_norm": 3.4342334270477295, + "learning_rate": 3.262267343485618e-05, + "loss": 0.7863, + "step": 6163 + }, + { + "epoch": 3.476593344613649, + "grad_norm": 1.3772245645523071, + "learning_rate": 3.2619853355893965e-05, + "loss": 0.7765, + "step": 6164 + }, + { + "epoch": 3.4771573604060912, + "grad_norm": 1.1144558191299438, + "learning_rate": 3.261703327693176e-05, + "loss": 0.7559, + "step": 6165 + }, + { + "epoch": 3.4777213761985335, + "grad_norm": 1.2099157571792603, + "learning_rate": 3.261421319796954e-05, + "loss": 0.7169, + "step": 6166 + }, + { + "epoch": 3.4782853919909758, + "grad_norm": 1.6076462268829346, + "learning_rate": 3.2611393119007335e-05, + "loss": 0.7672, + "step": 6167 + }, + { + "epoch": 3.478849407783418, + "grad_norm": 1.2325785160064697, + "learning_rate": 3.260857304004512e-05, + "loss": 0.7002, + "step": 6168 + }, + { + "epoch": 3.4794134235758603, + "grad_norm": 1.1510425806045532, + "learning_rate": 3.260575296108291e-05, + "loss": 0.6743, + "step": 6169 + }, + { + "epoch": 3.4799774393683025, + "grad_norm": 1.2992613315582275, + "learning_rate": 3.2602932882120704e-05, + "loss": 0.7308, + "step": 6170 + }, + { + "epoch": 3.4805414551607443, + "grad_norm": 1.0943677425384521, + "learning_rate": 3.260011280315849e-05, + "loss": 0.7487, + "step": 6171 + }, + { + "epoch": 3.4811054709531866, + "grad_norm": 1.140324592590332, + "learning_rate": 3.2597292724196275e-05, + "loss": 0.775, + "step": 6172 + }, + { + "epoch": 3.481669486745629, + "grad_norm": 1.6375921964645386, + "learning_rate": 3.259447264523407e-05, + "loss": 0.8204, + "step": 6173 + }, + { + "epoch": 3.482233502538071, + "grad_norm": 1.094485878944397, + "learning_rate": 3.259165256627186e-05, + "loss": 0.8209, + "step": 6174 + }, + { + "epoch": 3.4827975183305133, + "grad_norm": 0.9611552357673645, + "learning_rate": 3.2588832487309645e-05, + "loss": 0.7274, + "step": 6175 + }, + { + "epoch": 3.4833615341229556, + "grad_norm": 1.2398611307144165, + "learning_rate": 3.258601240834743e-05, + "loss": 0.8172, + "step": 6176 + }, + { + "epoch": 3.483925549915398, + "grad_norm": 1.615638256072998, + "learning_rate": 3.258319232938522e-05, + "loss": 0.7857, + "step": 6177 + }, + { + "epoch": 3.4844895657078396, + "grad_norm": 1.3650530576705933, + "learning_rate": 3.2580372250423015e-05, + "loss": 0.6822, + "step": 6178 + }, + { + "epoch": 3.485053581500282, + "grad_norm": 1.3865995407104492, + "learning_rate": 3.25775521714608e-05, + "loss": 0.8262, + "step": 6179 + }, + { + "epoch": 3.485617597292724, + "grad_norm": 1.3489224910736084, + "learning_rate": 3.2574732092498586e-05, + "loss": 0.6559, + "step": 6180 + }, + { + "epoch": 3.4861816130851664, + "grad_norm": 1.4231746196746826, + "learning_rate": 3.2571912013536385e-05, + "loss": 0.6927, + "step": 6181 + }, + { + "epoch": 3.4867456288776086, + "grad_norm": 1.8608746528625488, + "learning_rate": 3.256909193457417e-05, + "loss": 0.8009, + "step": 6182 + }, + { + "epoch": 3.487309644670051, + "grad_norm": 1.8448880910873413, + "learning_rate": 3.2566271855611955e-05, + "loss": 0.6969, + "step": 6183 + }, + { + "epoch": 3.487873660462493, + "grad_norm": 1.0872348546981812, + "learning_rate": 3.256345177664975e-05, + "loss": 0.7217, + "step": 6184 + }, + { + "epoch": 3.488437676254935, + "grad_norm": 1.4043086767196655, + "learning_rate": 3.256063169768754e-05, + "loss": 0.7474, + "step": 6185 + }, + { + "epoch": 3.489001692047377, + "grad_norm": 1.1215450763702393, + "learning_rate": 3.2557811618725325e-05, + "loss": 0.6982, + "step": 6186 + }, + { + "epoch": 3.4895657078398195, + "grad_norm": 1.2453289031982422, + "learning_rate": 3.255499153976311e-05, + "loss": 0.6693, + "step": 6187 + }, + { + "epoch": 3.4901297236322617, + "grad_norm": 1.270354151725769, + "learning_rate": 3.255217146080091e-05, + "loss": 0.7204, + "step": 6188 + }, + { + "epoch": 3.490693739424704, + "grad_norm": 1.4062310457229614, + "learning_rate": 3.2549351381838695e-05, + "loss": 0.7693, + "step": 6189 + }, + { + "epoch": 3.491257755217146, + "grad_norm": 2.1478233337402344, + "learning_rate": 3.254653130287648e-05, + "loss": 0.8992, + "step": 6190 + }, + { + "epoch": 3.4918217710095885, + "grad_norm": 0.9469218850135803, + "learning_rate": 3.254371122391427e-05, + "loss": 0.6594, + "step": 6191 + }, + { + "epoch": 3.4923857868020303, + "grad_norm": 5.25871467590332, + "learning_rate": 3.2540891144952065e-05, + "loss": 0.9561, + "step": 6192 + }, + { + "epoch": 3.4929498025944725, + "grad_norm": 1.2182809114456177, + "learning_rate": 3.253807106598985e-05, + "loss": 0.7956, + "step": 6193 + }, + { + "epoch": 3.4935138183869148, + "grad_norm": 0.889734148979187, + "learning_rate": 3.2535250987027636e-05, + "loss": 0.7515, + "step": 6194 + }, + { + "epoch": 3.494077834179357, + "grad_norm": 1.8540147542953491, + "learning_rate": 3.253243090806543e-05, + "loss": 0.7362, + "step": 6195 + }, + { + "epoch": 3.4946418499717993, + "grad_norm": 1.1195862293243408, + "learning_rate": 3.252961082910322e-05, + "loss": 0.6156, + "step": 6196 + }, + { + "epoch": 3.4952058657642415, + "grad_norm": 1.4107328653335571, + "learning_rate": 3.2526790750141005e-05, + "loss": 0.7158, + "step": 6197 + }, + { + "epoch": 3.495769881556684, + "grad_norm": 1.5463483333587646, + "learning_rate": 3.252397067117879e-05, + "loss": 0.7938, + "step": 6198 + }, + { + "epoch": 3.4963338973491256, + "grad_norm": 1.0446264743804932, + "learning_rate": 3.252115059221658e-05, + "loss": 0.6379, + "step": 6199 + }, + { + "epoch": 3.496897913141568, + "grad_norm": 3.733189821243286, + "learning_rate": 3.2518330513254375e-05, + "loss": 0.8802, + "step": 6200 + }, + { + "epoch": 3.49746192893401, + "grad_norm": 1.3300944566726685, + "learning_rate": 3.251551043429216e-05, + "loss": 0.7723, + "step": 6201 + }, + { + "epoch": 3.4980259447264523, + "grad_norm": 1.6791012287139893, + "learning_rate": 3.251269035532995e-05, + "loss": 0.8224, + "step": 6202 + }, + { + "epoch": 3.4985899605188946, + "grad_norm": 1.1461268663406372, + "learning_rate": 3.250987027636774e-05, + "loss": 0.8473, + "step": 6203 + }, + { + "epoch": 3.499153976311337, + "grad_norm": 1.64085054397583, + "learning_rate": 3.250705019740553e-05, + "loss": 0.7804, + "step": 6204 + }, + { + "epoch": 3.499717992103779, + "grad_norm": 1.1415640115737915, + "learning_rate": 3.2504230118443316e-05, + "loss": 0.7372, + "step": 6205 + }, + { + "epoch": 3.500282007896221, + "grad_norm": 1.2778582572937012, + "learning_rate": 3.250141003948111e-05, + "loss": 0.7737, + "step": 6206 + }, + { + "epoch": 3.500846023688663, + "grad_norm": 1.5202713012695312, + "learning_rate": 3.249858996051889e-05, + "loss": 0.7617, + "step": 6207 + }, + { + "epoch": 3.5014100394811054, + "grad_norm": 1.9397356510162354, + "learning_rate": 3.2495769881556685e-05, + "loss": 0.7007, + "step": 6208 + }, + { + "epoch": 3.5019740552735477, + "grad_norm": 1.279099464416504, + "learning_rate": 3.249294980259448e-05, + "loss": 0.7806, + "step": 6209 + }, + { + "epoch": 3.50253807106599, + "grad_norm": 1.2244174480438232, + "learning_rate": 3.249012972363226e-05, + "loss": 0.7107, + "step": 6210 + }, + { + "epoch": 3.503102086858432, + "grad_norm": 2.0639476776123047, + "learning_rate": 3.248730964467005e-05, + "loss": 0.7452, + "step": 6211 + }, + { + "epoch": 3.5036661026508744, + "grad_norm": 1.1234256029129028, + "learning_rate": 3.248448956570784e-05, + "loss": 0.7212, + "step": 6212 + }, + { + "epoch": 3.504230118443316, + "grad_norm": 2.127016305923462, + "learning_rate": 3.248166948674563e-05, + "loss": 0.9253, + "step": 6213 + }, + { + "epoch": 3.5047941342357585, + "grad_norm": 1.5122308731079102, + "learning_rate": 3.247884940778342e-05, + "loss": 0.7437, + "step": 6214 + }, + { + "epoch": 3.5053581500282007, + "grad_norm": 1.8047467470169067, + "learning_rate": 3.2476029328821204e-05, + "loss": 1.0205, + "step": 6215 + }, + { + "epoch": 3.505922165820643, + "grad_norm": 0.9330095052719116, + "learning_rate": 3.2473209249858996e-05, + "loss": 0.7125, + "step": 6216 + }, + { + "epoch": 3.5064861816130852, + "grad_norm": 1.0189608335494995, + "learning_rate": 3.247038917089679e-05, + "loss": 0.7242, + "step": 6217 + }, + { + "epoch": 3.5070501974055275, + "grad_norm": 1.644675374031067, + "learning_rate": 3.2467569091934573e-05, + "loss": 0.8629, + "step": 6218 + }, + { + "epoch": 3.5076142131979697, + "grad_norm": 0.8712964653968811, + "learning_rate": 3.2464749012972366e-05, + "loss": 0.685, + "step": 6219 + }, + { + "epoch": 3.5081782289904115, + "grad_norm": 1.0890018939971924, + "learning_rate": 3.246192893401016e-05, + "loss": 0.7853, + "step": 6220 + }, + { + "epoch": 3.508742244782854, + "grad_norm": 1.2721951007843018, + "learning_rate": 3.245910885504794e-05, + "loss": 0.7689, + "step": 6221 + }, + { + "epoch": 3.509306260575296, + "grad_norm": 1.1633974313735962, + "learning_rate": 3.245628877608573e-05, + "loss": 0.7318, + "step": 6222 + }, + { + "epoch": 3.5098702763677383, + "grad_norm": 1.5214958190917969, + "learning_rate": 3.245346869712352e-05, + "loss": 0.8155, + "step": 6223 + }, + { + "epoch": 3.5104342921601805, + "grad_norm": 1.8527244329452515, + "learning_rate": 3.245064861816131e-05, + "loss": 0.8173, + "step": 6224 + }, + { + "epoch": 3.510998307952623, + "grad_norm": 1.6605396270751953, + "learning_rate": 3.24478285391991e-05, + "loss": 0.739, + "step": 6225 + }, + { + "epoch": 3.511562323745065, + "grad_norm": 1.3292268514633179, + "learning_rate": 3.2445008460236884e-05, + "loss": 0.7607, + "step": 6226 + }, + { + "epoch": 3.512126339537507, + "grad_norm": 1.433720588684082, + "learning_rate": 3.244218838127468e-05, + "loss": 0.8713, + "step": 6227 + }, + { + "epoch": 3.512690355329949, + "grad_norm": 1.4779740571975708, + "learning_rate": 3.243936830231247e-05, + "loss": 0.6671, + "step": 6228 + }, + { + "epoch": 3.5132543711223914, + "grad_norm": 1.5719767808914185, + "learning_rate": 3.2436548223350254e-05, + "loss": 0.7664, + "step": 6229 + }, + { + "epoch": 3.5138183869148336, + "grad_norm": 1.2324589490890503, + "learning_rate": 3.2433728144388046e-05, + "loss": 0.7967, + "step": 6230 + }, + { + "epoch": 3.514382402707276, + "grad_norm": 1.4360957145690918, + "learning_rate": 3.243090806542584e-05, + "loss": 0.7568, + "step": 6231 + }, + { + "epoch": 3.514946418499718, + "grad_norm": 1.1631929874420166, + "learning_rate": 3.242808798646362e-05, + "loss": 0.7431, + "step": 6232 + }, + { + "epoch": 3.5155104342921604, + "grad_norm": 1.2847633361816406, + "learning_rate": 3.242526790750141e-05, + "loss": 0.8201, + "step": 6233 + }, + { + "epoch": 3.516074450084602, + "grad_norm": 1.0921061038970947, + "learning_rate": 3.24224478285392e-05, + "loss": 0.7711, + "step": 6234 + }, + { + "epoch": 3.5166384658770444, + "grad_norm": 1.4964892864227295, + "learning_rate": 3.241962774957699e-05, + "loss": 0.7414, + "step": 6235 + }, + { + "epoch": 3.5172024816694867, + "grad_norm": 0.8112980127334595, + "learning_rate": 3.241680767061478e-05, + "loss": 0.6082, + "step": 6236 + }, + { + "epoch": 3.517766497461929, + "grad_norm": 1.3356245756149292, + "learning_rate": 3.2413987591652564e-05, + "loss": 0.7704, + "step": 6237 + }, + { + "epoch": 3.518330513254371, + "grad_norm": 1.2658835649490356, + "learning_rate": 3.2411167512690356e-05, + "loss": 0.8288, + "step": 6238 + }, + { + "epoch": 3.5188945290468134, + "grad_norm": 1.3455501794815063, + "learning_rate": 3.240834743372815e-05, + "loss": 0.7821, + "step": 6239 + }, + { + "epoch": 3.5194585448392557, + "grad_norm": 1.5769476890563965, + "learning_rate": 3.2405527354765934e-05, + "loss": 0.8582, + "step": 6240 + }, + { + "epoch": 3.5200225606316975, + "grad_norm": 2.0352916717529297, + "learning_rate": 3.2402707275803726e-05, + "loss": 0.8757, + "step": 6241 + }, + { + "epoch": 3.5205865764241397, + "grad_norm": 1.2071373462677002, + "learning_rate": 3.239988719684151e-05, + "loss": 0.7146, + "step": 6242 + }, + { + "epoch": 3.521150592216582, + "grad_norm": 1.1524194478988647, + "learning_rate": 3.2397067117879303e-05, + "loss": 0.6545, + "step": 6243 + }, + { + "epoch": 3.5217146080090242, + "grad_norm": 2.1292285919189453, + "learning_rate": 3.239424703891709e-05, + "loss": 0.8957, + "step": 6244 + }, + { + "epoch": 3.5222786238014665, + "grad_norm": 2.2945969104766846, + "learning_rate": 3.239142695995488e-05, + "loss": 0.7851, + "step": 6245 + }, + { + "epoch": 3.5228426395939088, + "grad_norm": 1.5713496208190918, + "learning_rate": 3.2388606880992666e-05, + "loss": 0.8112, + "step": 6246 + }, + { + "epoch": 3.523406655386351, + "grad_norm": 3.331760883331299, + "learning_rate": 3.238578680203046e-05, + "loss": 0.7975, + "step": 6247 + }, + { + "epoch": 3.523970671178793, + "grad_norm": 1.7299107313156128, + "learning_rate": 3.238296672306825e-05, + "loss": 0.8862, + "step": 6248 + }, + { + "epoch": 3.524534686971235, + "grad_norm": 2.976073741912842, + "learning_rate": 3.2380146644106036e-05, + "loss": 0.965, + "step": 6249 + }, + { + "epoch": 3.5250987027636773, + "grad_norm": 2.1563403606414795, + "learning_rate": 3.237732656514382e-05, + "loss": 0.865, + "step": 6250 + }, + { + "epoch": 3.5256627185561196, + "grad_norm": 1.8113245964050293, + "learning_rate": 3.2374506486181614e-05, + "loss": 0.7783, + "step": 6251 + }, + { + "epoch": 3.526226734348562, + "grad_norm": 1.88075590133667, + "learning_rate": 3.2371686407219406e-05, + "loss": 0.8308, + "step": 6252 + }, + { + "epoch": 3.526790750141004, + "grad_norm": 1.4557173252105713, + "learning_rate": 3.236886632825719e-05, + "loss": 0.8844, + "step": 6253 + }, + { + "epoch": 3.5273547659334463, + "grad_norm": 1.1646987199783325, + "learning_rate": 3.2366046249294984e-05, + "loss": 0.6453, + "step": 6254 + }, + { + "epoch": 3.527918781725888, + "grad_norm": 1.583805799484253, + "learning_rate": 3.236322617033277e-05, + "loss": 0.7999, + "step": 6255 + }, + { + "epoch": 3.5284827975183304, + "grad_norm": 1.5221972465515137, + "learning_rate": 3.236040609137056e-05, + "loss": 0.7799, + "step": 6256 + }, + { + "epoch": 3.5290468133107726, + "grad_norm": 1.7543349266052246, + "learning_rate": 3.235758601240835e-05, + "loss": 0.7925, + "step": 6257 + }, + { + "epoch": 3.529610829103215, + "grad_norm": 2.189941883087158, + "learning_rate": 3.235476593344614e-05, + "loss": 0.9423, + "step": 6258 + }, + { + "epoch": 3.530174844895657, + "grad_norm": 1.3310625553131104, + "learning_rate": 3.235194585448393e-05, + "loss": 0.7485, + "step": 6259 + }, + { + "epoch": 3.5307388606880994, + "grad_norm": 1.1754868030548096, + "learning_rate": 3.2349125775521716e-05, + "loss": 0.731, + "step": 6260 + }, + { + "epoch": 3.5313028764805416, + "grad_norm": 1.3450548648834229, + "learning_rate": 3.23463056965595e-05, + "loss": 0.7795, + "step": 6261 + }, + { + "epoch": 3.5318668922729834, + "grad_norm": 1.043283224105835, + "learning_rate": 3.2343485617597294e-05, + "loss": 0.6749, + "step": 6262 + }, + { + "epoch": 3.5324309080654257, + "grad_norm": 1.4440784454345703, + "learning_rate": 3.2340665538635086e-05, + "loss": 0.7425, + "step": 6263 + }, + { + "epoch": 3.532994923857868, + "grad_norm": 0.8805188536643982, + "learning_rate": 3.233784545967287e-05, + "loss": 0.6312, + "step": 6264 + }, + { + "epoch": 3.53355893965031, + "grad_norm": 1.3333208560943604, + "learning_rate": 3.233502538071066e-05, + "loss": 0.726, + "step": 6265 + }, + { + "epoch": 3.5341229554427525, + "grad_norm": 0.9542872905731201, + "learning_rate": 3.2332205301748456e-05, + "loss": 0.6494, + "step": 6266 + }, + { + "epoch": 3.5346869712351947, + "grad_norm": 2.3613202571868896, + "learning_rate": 3.232938522278624e-05, + "loss": 0.6514, + "step": 6267 + }, + { + "epoch": 3.535250987027637, + "grad_norm": 1.8003747463226318, + "learning_rate": 3.232656514382403e-05, + "loss": 0.7498, + "step": 6268 + }, + { + "epoch": 3.5358150028200788, + "grad_norm": 1.3041898012161255, + "learning_rate": 3.232374506486182e-05, + "loss": 0.8805, + "step": 6269 + }, + { + "epoch": 3.536379018612521, + "grad_norm": 0.8999313712120056, + "learning_rate": 3.232092498589961e-05, + "loss": 0.6381, + "step": 6270 + }, + { + "epoch": 3.5369430344049633, + "grad_norm": 1.3399038314819336, + "learning_rate": 3.2318104906937397e-05, + "loss": 0.8031, + "step": 6271 + }, + { + "epoch": 3.5375070501974055, + "grad_norm": 1.005401372909546, + "learning_rate": 3.231528482797518e-05, + "loss": 0.6558, + "step": 6272 + }, + { + "epoch": 3.5380710659898478, + "grad_norm": 1.248084306716919, + "learning_rate": 3.2312464749012974e-05, + "loss": 0.7956, + "step": 6273 + }, + { + "epoch": 3.53863508178229, + "grad_norm": 1.4477919340133667, + "learning_rate": 3.2309644670050766e-05, + "loss": 0.79, + "step": 6274 + }, + { + "epoch": 3.5391990975747323, + "grad_norm": 1.26422917842865, + "learning_rate": 3.230682459108855e-05, + "loss": 0.7616, + "step": 6275 + }, + { + "epoch": 3.539763113367174, + "grad_norm": 1.3235836029052734, + "learning_rate": 3.230400451212634e-05, + "loss": 0.7891, + "step": 6276 + }, + { + "epoch": 3.5403271291596163, + "grad_norm": 1.4283831119537354, + "learning_rate": 3.230118443316413e-05, + "loss": 0.7875, + "step": 6277 + }, + { + "epoch": 3.5408911449520586, + "grad_norm": 2.362579107284546, + "learning_rate": 3.229836435420192e-05, + "loss": 0.8509, + "step": 6278 + }, + { + "epoch": 3.541455160744501, + "grad_norm": 1.2470622062683105, + "learning_rate": 3.229554427523971e-05, + "loss": 0.8976, + "step": 6279 + }, + { + "epoch": 3.542019176536943, + "grad_norm": 0.8934418559074402, + "learning_rate": 3.22927241962775e-05, + "loss": 0.6452, + "step": 6280 + }, + { + "epoch": 3.5425831923293853, + "grad_norm": 1.0684773921966553, + "learning_rate": 3.2289904117315285e-05, + "loss": 0.6632, + "step": 6281 + }, + { + "epoch": 3.5431472081218276, + "grad_norm": 1.0208261013031006, + "learning_rate": 3.228708403835308e-05, + "loss": 0.713, + "step": 6282 + }, + { + "epoch": 3.5437112239142694, + "grad_norm": 0.868873655796051, + "learning_rate": 3.228426395939086e-05, + "loss": 0.6788, + "step": 6283 + }, + { + "epoch": 3.5442752397067117, + "grad_norm": 1.4988150596618652, + "learning_rate": 3.2281443880428654e-05, + "loss": 0.7624, + "step": 6284 + }, + { + "epoch": 3.544839255499154, + "grad_norm": 1.6987472772598267, + "learning_rate": 3.227862380146644e-05, + "loss": 1.0078, + "step": 6285 + }, + { + "epoch": 3.545403271291596, + "grad_norm": 1.2213797569274902, + "learning_rate": 3.227580372250423e-05, + "loss": 0.7624, + "step": 6286 + }, + { + "epoch": 3.5459672870840384, + "grad_norm": 1.1984933614730835, + "learning_rate": 3.2272983643542024e-05, + "loss": 0.8214, + "step": 6287 + }, + { + "epoch": 3.5465313028764807, + "grad_norm": 1.9890673160552979, + "learning_rate": 3.227016356457981e-05, + "loss": 0.8544, + "step": 6288 + }, + { + "epoch": 3.547095318668923, + "grad_norm": 2.6175265312194824, + "learning_rate": 3.22673434856176e-05, + "loss": 0.7468, + "step": 6289 + }, + { + "epoch": 3.5476593344613647, + "grad_norm": 1.4187445640563965, + "learning_rate": 3.226452340665539e-05, + "loss": 0.8122, + "step": 6290 + }, + { + "epoch": 3.548223350253807, + "grad_norm": 1.4541891813278198, + "learning_rate": 3.226170332769318e-05, + "loss": 0.7941, + "step": 6291 + }, + { + "epoch": 3.5487873660462492, + "grad_norm": 1.0953274965286255, + "learning_rate": 3.2258883248730965e-05, + "loss": 0.6917, + "step": 6292 + }, + { + "epoch": 3.5493513818386915, + "grad_norm": 2.400822162628174, + "learning_rate": 3.225606316976876e-05, + "loss": 0.7385, + "step": 6293 + }, + { + "epoch": 3.5499153976311337, + "grad_norm": 1.3164167404174805, + "learning_rate": 3.225324309080654e-05, + "loss": 0.7373, + "step": 6294 + }, + { + "epoch": 3.550479413423576, + "grad_norm": 1.1860078573226929, + "learning_rate": 3.2250423011844334e-05, + "loss": 0.7923, + "step": 6295 + }, + { + "epoch": 3.5510434292160182, + "grad_norm": 0.9823527932167053, + "learning_rate": 3.224760293288212e-05, + "loss": 0.7627, + "step": 6296 + }, + { + "epoch": 3.55160744500846, + "grad_norm": 1.3865348100662231, + "learning_rate": 3.224478285391991e-05, + "loss": 0.7928, + "step": 6297 + }, + { + "epoch": 3.5521714608009023, + "grad_norm": 1.251409649848938, + "learning_rate": 3.2241962774957704e-05, + "loss": 0.6475, + "step": 6298 + }, + { + "epoch": 3.5527354765933445, + "grad_norm": 1.1081424951553345, + "learning_rate": 3.223914269599549e-05, + "loss": 0.6933, + "step": 6299 + }, + { + "epoch": 3.553299492385787, + "grad_norm": 1.2268290519714355, + "learning_rate": 3.2236322617033275e-05, + "loss": 0.7351, + "step": 6300 + }, + { + "epoch": 3.553863508178229, + "grad_norm": 1.7916666269302368, + "learning_rate": 3.223350253807107e-05, + "loss": 0.7489, + "step": 6301 + }, + { + "epoch": 3.5544275239706713, + "grad_norm": 1.177505612373352, + "learning_rate": 3.223068245910886e-05, + "loss": 0.8237, + "step": 6302 + }, + { + "epoch": 3.5549915397631136, + "grad_norm": 0.9272423386573792, + "learning_rate": 3.2227862380146645e-05, + "loss": 0.7294, + "step": 6303 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.7719138860702515, + "learning_rate": 3.222504230118443e-05, + "loss": 0.8406, + "step": 6304 + }, + { + "epoch": 3.5561195713479976, + "grad_norm": 1.0846961736679077, + "learning_rate": 3.222222222222223e-05, + "loss": 0.7175, + "step": 6305 + }, + { + "epoch": 3.55668358714044, + "grad_norm": 1.5414551496505737, + "learning_rate": 3.2219402143260015e-05, + "loss": 0.738, + "step": 6306 + }, + { + "epoch": 3.557247602932882, + "grad_norm": 1.3476489782333374, + "learning_rate": 3.22165820642978e-05, + "loss": 0.7864, + "step": 6307 + }, + { + "epoch": 3.5578116187253244, + "grad_norm": 1.4048867225646973, + "learning_rate": 3.2213761985335585e-05, + "loss": 0.8149, + "step": 6308 + }, + { + "epoch": 3.5583756345177666, + "grad_norm": 1.7078750133514404, + "learning_rate": 3.2210941906373384e-05, + "loss": 0.8749, + "step": 6309 + }, + { + "epoch": 3.558939650310209, + "grad_norm": 0.9676898121833801, + "learning_rate": 3.220812182741117e-05, + "loss": 0.7185, + "step": 6310 + }, + { + "epoch": 3.5595036661026507, + "grad_norm": 1.2141451835632324, + "learning_rate": 3.2205301748448955e-05, + "loss": 0.8191, + "step": 6311 + }, + { + "epoch": 3.560067681895093, + "grad_norm": 1.052295446395874, + "learning_rate": 3.220248166948675e-05, + "loss": 0.7754, + "step": 6312 + }, + { + "epoch": 3.560631697687535, + "grad_norm": 0.8883212804794312, + "learning_rate": 3.219966159052454e-05, + "loss": 0.5764, + "step": 6313 + }, + { + "epoch": 3.5611957134799774, + "grad_norm": 2.146008014678955, + "learning_rate": 3.2196841511562325e-05, + "loss": 0.8934, + "step": 6314 + }, + { + "epoch": 3.5617597292724197, + "grad_norm": 1.5373495817184448, + "learning_rate": 3.219402143260011e-05, + "loss": 0.7376, + "step": 6315 + }, + { + "epoch": 3.562323745064862, + "grad_norm": 1.1360546350479126, + "learning_rate": 3.21912013536379e-05, + "loss": 0.62, + "step": 6316 + }, + { + "epoch": 3.562887760857304, + "grad_norm": 1.4976305961608887, + "learning_rate": 3.2188381274675695e-05, + "loss": 0.8879, + "step": 6317 + }, + { + "epoch": 3.563451776649746, + "grad_norm": 1.0341852903366089, + "learning_rate": 3.218556119571348e-05, + "loss": 0.767, + "step": 6318 + }, + { + "epoch": 3.5640157924421882, + "grad_norm": 0.9248468279838562, + "learning_rate": 3.218274111675127e-05, + "loss": 0.6352, + "step": 6319 + }, + { + "epoch": 3.5645798082346305, + "grad_norm": 2.094421148300171, + "learning_rate": 3.217992103778906e-05, + "loss": 0.8329, + "step": 6320 + }, + { + "epoch": 3.5651438240270727, + "grad_norm": 1.3141918182373047, + "learning_rate": 3.217710095882685e-05, + "loss": 0.7985, + "step": 6321 + }, + { + "epoch": 3.565707839819515, + "grad_norm": 1.5052425861358643, + "learning_rate": 3.2174280879864635e-05, + "loss": 0.813, + "step": 6322 + }, + { + "epoch": 3.5662718556119573, + "grad_norm": 1.207857608795166, + "learning_rate": 3.217146080090243e-05, + "loss": 0.6754, + "step": 6323 + }, + { + "epoch": 3.5668358714043995, + "grad_norm": 1.5834072828292847, + "learning_rate": 3.216864072194022e-05, + "loss": 0.775, + "step": 6324 + }, + { + "epoch": 3.5673998871968413, + "grad_norm": 1.5942994356155396, + "learning_rate": 3.2165820642978005e-05, + "loss": 0.747, + "step": 6325 + }, + { + "epoch": 3.5679639029892836, + "grad_norm": 0.9390817880630493, + "learning_rate": 3.216300056401579e-05, + "loss": 0.6755, + "step": 6326 + }, + { + "epoch": 3.568527918781726, + "grad_norm": 1.6644092798233032, + "learning_rate": 3.216018048505358e-05, + "loss": 0.7153, + "step": 6327 + }, + { + "epoch": 3.569091934574168, + "grad_norm": 0.9989796280860901, + "learning_rate": 3.2157360406091375e-05, + "loss": 0.8113, + "step": 6328 + }, + { + "epoch": 3.5696559503666103, + "grad_norm": 1.5269006490707397, + "learning_rate": 3.215454032712916e-05, + "loss": 0.7872, + "step": 6329 + }, + { + "epoch": 3.5702199661590526, + "grad_norm": 1.2105259895324707, + "learning_rate": 3.215172024816695e-05, + "loss": 0.6551, + "step": 6330 + }, + { + "epoch": 3.570783981951495, + "grad_norm": 1.306248426437378, + "learning_rate": 3.214890016920474e-05, + "loss": 0.7175, + "step": 6331 + }, + { + "epoch": 3.5713479977439366, + "grad_norm": 1.274039626121521, + "learning_rate": 3.214608009024253e-05, + "loss": 0.7467, + "step": 6332 + }, + { + "epoch": 3.571912013536379, + "grad_norm": 0.9586502909660339, + "learning_rate": 3.2143260011280315e-05, + "loss": 0.7524, + "step": 6333 + }, + { + "epoch": 3.572476029328821, + "grad_norm": 1.2612590789794922, + "learning_rate": 3.214043993231811e-05, + "loss": 0.6868, + "step": 6334 + }, + { + "epoch": 3.5730400451212634, + "grad_norm": 1.5402858257293701, + "learning_rate": 3.213761985335589e-05, + "loss": 0.708, + "step": 6335 + }, + { + "epoch": 3.5736040609137056, + "grad_norm": 1.2845218181610107, + "learning_rate": 3.2134799774393685e-05, + "loss": 0.7483, + "step": 6336 + }, + { + "epoch": 3.574168076706148, + "grad_norm": 1.5512737035751343, + "learning_rate": 3.213197969543148e-05, + "loss": 0.797, + "step": 6337 + }, + { + "epoch": 3.57473209249859, + "grad_norm": 1.401044487953186, + "learning_rate": 3.212915961646926e-05, + "loss": 0.7994, + "step": 6338 + }, + { + "epoch": 3.575296108291032, + "grad_norm": 2.0869131088256836, + "learning_rate": 3.212633953750705e-05, + "loss": 0.7968, + "step": 6339 + }, + { + "epoch": 3.575860124083474, + "grad_norm": 1.4046530723571777, + "learning_rate": 3.212351945854484e-05, + "loss": 0.7485, + "step": 6340 + }, + { + "epoch": 3.5764241398759165, + "grad_norm": 3.255955219268799, + "learning_rate": 3.212069937958263e-05, + "loss": 0.9027, + "step": 6341 + }, + { + "epoch": 3.5769881556683587, + "grad_norm": 1.1561771631240845, + "learning_rate": 3.211787930062042e-05, + "loss": 0.7213, + "step": 6342 + }, + { + "epoch": 3.577552171460801, + "grad_norm": 1.0283921957015991, + "learning_rate": 3.2115059221658203e-05, + "loss": 0.6998, + "step": 6343 + }, + { + "epoch": 3.578116187253243, + "grad_norm": 1.2246856689453125, + "learning_rate": 3.2112239142695996e-05, + "loss": 0.7763, + "step": 6344 + }, + { + "epoch": 3.5786802030456855, + "grad_norm": 1.5038708448410034, + "learning_rate": 3.210941906373379e-05, + "loss": 0.7439, + "step": 6345 + }, + { + "epoch": 3.5792442188381273, + "grad_norm": 1.583160638809204, + "learning_rate": 3.210659898477157e-05, + "loss": 0.8302, + "step": 6346 + }, + { + "epoch": 3.5798082346305695, + "grad_norm": 0.8997402191162109, + "learning_rate": 3.210377890580936e-05, + "loss": 0.7003, + "step": 6347 + }, + { + "epoch": 3.5803722504230118, + "grad_norm": 1.287355899810791, + "learning_rate": 3.210095882684716e-05, + "loss": 0.7874, + "step": 6348 + }, + { + "epoch": 3.580936266215454, + "grad_norm": 1.6332969665527344, + "learning_rate": 3.209813874788494e-05, + "loss": 0.8752, + "step": 6349 + }, + { + "epoch": 3.5815002820078963, + "grad_norm": 1.09163236618042, + "learning_rate": 3.209531866892273e-05, + "loss": 0.7232, + "step": 6350 + }, + { + "epoch": 3.5820642978003385, + "grad_norm": 1.1049751043319702, + "learning_rate": 3.209249858996052e-05, + "loss": 0.8525, + "step": 6351 + }, + { + "epoch": 3.5826283135927808, + "grad_norm": 1.0864368677139282, + "learning_rate": 3.208967851099831e-05, + "loss": 0.6609, + "step": 6352 + }, + { + "epoch": 3.5831923293852226, + "grad_norm": 1.3650063276290894, + "learning_rate": 3.20868584320361e-05, + "loss": 0.7511, + "step": 6353 + }, + { + "epoch": 3.583756345177665, + "grad_norm": 1.7556920051574707, + "learning_rate": 3.2084038353073884e-05, + "loss": 0.7911, + "step": 6354 + }, + { + "epoch": 3.584320360970107, + "grad_norm": 1.141140103340149, + "learning_rate": 3.2081218274111676e-05, + "loss": 0.6908, + "step": 6355 + }, + { + "epoch": 3.5848843767625493, + "grad_norm": 1.1653014421463013, + "learning_rate": 3.207839819514947e-05, + "loss": 0.7458, + "step": 6356 + }, + { + "epoch": 3.5854483925549916, + "grad_norm": 0.9820500612258911, + "learning_rate": 3.207557811618725e-05, + "loss": 0.6675, + "step": 6357 + }, + { + "epoch": 3.586012408347434, + "grad_norm": 1.3284944295883179, + "learning_rate": 3.2072758037225046e-05, + "loss": 0.8136, + "step": 6358 + }, + { + "epoch": 3.586576424139876, + "grad_norm": 1.0801243782043457, + "learning_rate": 3.206993795826284e-05, + "loss": 0.685, + "step": 6359 + }, + { + "epoch": 3.587140439932318, + "grad_norm": 1.3506568670272827, + "learning_rate": 3.206711787930062e-05, + "loss": 0.8419, + "step": 6360 + }, + { + "epoch": 3.58770445572476, + "grad_norm": 1.04178786277771, + "learning_rate": 3.206429780033841e-05, + "loss": 0.759, + "step": 6361 + }, + { + "epoch": 3.5882684715172024, + "grad_norm": 3.5507562160491943, + "learning_rate": 3.20614777213762e-05, + "loss": 0.7904, + "step": 6362 + }, + { + "epoch": 3.5888324873096447, + "grad_norm": 1.2726410627365112, + "learning_rate": 3.205865764241399e-05, + "loss": 0.6591, + "step": 6363 + }, + { + "epoch": 3.589396503102087, + "grad_norm": 1.1832941770553589, + "learning_rate": 3.205583756345178e-05, + "loss": 0.6395, + "step": 6364 + }, + { + "epoch": 3.589960518894529, + "grad_norm": 3.4582679271698, + "learning_rate": 3.2053017484489564e-05, + "loss": 0.9739, + "step": 6365 + }, + { + "epoch": 3.5905245346869714, + "grad_norm": 1.7039563655853271, + "learning_rate": 3.2050197405527356e-05, + "loss": 0.8178, + "step": 6366 + }, + { + "epoch": 3.591088550479413, + "grad_norm": 1.2145264148712158, + "learning_rate": 3.204737732656515e-05, + "loss": 0.795, + "step": 6367 + }, + { + "epoch": 3.5916525662718555, + "grad_norm": 1.3314356803894043, + "learning_rate": 3.2044557247602934e-05, + "loss": 0.7546, + "step": 6368 + }, + { + "epoch": 3.5922165820642977, + "grad_norm": 1.007781744003296, + "learning_rate": 3.2041737168640726e-05, + "loss": 0.6515, + "step": 6369 + }, + { + "epoch": 3.59278059785674, + "grad_norm": 2.2165307998657227, + "learning_rate": 3.203891708967851e-05, + "loss": 0.9453, + "step": 6370 + }, + { + "epoch": 3.5933446136491822, + "grad_norm": 1.7066805362701416, + "learning_rate": 3.20360970107163e-05, + "loss": 0.8735, + "step": 6371 + }, + { + "epoch": 3.5939086294416245, + "grad_norm": 1.20063316822052, + "learning_rate": 3.203327693175409e-05, + "loss": 0.7352, + "step": 6372 + }, + { + "epoch": 3.5944726452340667, + "grad_norm": 0.9874749183654785, + "learning_rate": 3.203045685279188e-05, + "loss": 0.7323, + "step": 6373 + }, + { + "epoch": 3.5950366610265085, + "grad_norm": 0.9837151169776917, + "learning_rate": 3.2027636773829666e-05, + "loss": 0.6355, + "step": 6374 + }, + { + "epoch": 3.595600676818951, + "grad_norm": 0.9116043448448181, + "learning_rate": 3.202481669486746e-05, + "loss": 0.7314, + "step": 6375 + }, + { + "epoch": 3.596164692611393, + "grad_norm": 0.9646072387695312, + "learning_rate": 3.202199661590525e-05, + "loss": 0.6962, + "step": 6376 + }, + { + "epoch": 3.5967287084038353, + "grad_norm": 1.4394476413726807, + "learning_rate": 3.2019176536943036e-05, + "loss": 0.6551, + "step": 6377 + }, + { + "epoch": 3.5972927241962775, + "grad_norm": 2.2539215087890625, + "learning_rate": 3.201635645798082e-05, + "loss": 0.662, + "step": 6378 + }, + { + "epoch": 3.59785673998872, + "grad_norm": 1.7519840002059937, + "learning_rate": 3.2013536379018614e-05, + "loss": 0.8453, + "step": 6379 + }, + { + "epoch": 3.598420755781162, + "grad_norm": 1.4768249988555908, + "learning_rate": 3.2010716300056406e-05, + "loss": 0.759, + "step": 6380 + }, + { + "epoch": 3.598984771573604, + "grad_norm": 1.1825140714645386, + "learning_rate": 3.200789622109419e-05, + "loss": 0.7551, + "step": 6381 + }, + { + "epoch": 3.599548787366046, + "grad_norm": 2.2133257389068604, + "learning_rate": 3.200507614213198e-05, + "loss": 0.8307, + "step": 6382 + }, + { + "epoch": 3.6001128031584884, + "grad_norm": 1.4631348848342896, + "learning_rate": 3.200225606316977e-05, + "loss": 0.7188, + "step": 6383 + }, + { + "epoch": 3.6006768189509306, + "grad_norm": 1.5824686288833618, + "learning_rate": 3.199943598420756e-05, + "loss": 0.8064, + "step": 6384 + }, + { + "epoch": 3.601240834743373, + "grad_norm": 1.4001089334487915, + "learning_rate": 3.1996615905245346e-05, + "loss": 0.8836, + "step": 6385 + }, + { + "epoch": 3.601804850535815, + "grad_norm": 0.9745133519172668, + "learning_rate": 3.199379582628313e-05, + "loss": 0.697, + "step": 6386 + }, + { + "epoch": 3.6023688663282574, + "grad_norm": 1.5888755321502686, + "learning_rate": 3.199097574732093e-05, + "loss": 0.8026, + "step": 6387 + }, + { + "epoch": 3.602932882120699, + "grad_norm": 1.0273957252502441, + "learning_rate": 3.1988155668358716e-05, + "loss": 0.6897, + "step": 6388 + }, + { + "epoch": 3.6034968979131414, + "grad_norm": 1.6995056867599487, + "learning_rate": 3.19853355893965e-05, + "loss": 0.7529, + "step": 6389 + }, + { + "epoch": 3.6040609137055837, + "grad_norm": 1.4534692764282227, + "learning_rate": 3.1982515510434294e-05, + "loss": 0.8697, + "step": 6390 + }, + { + "epoch": 3.604624929498026, + "grad_norm": 1.166191816329956, + "learning_rate": 3.1979695431472086e-05, + "loss": 0.7614, + "step": 6391 + }, + { + "epoch": 3.605188945290468, + "grad_norm": 2.5506365299224854, + "learning_rate": 3.197687535250987e-05, + "loss": 0.9618, + "step": 6392 + }, + { + "epoch": 3.6057529610829104, + "grad_norm": 1.3329704999923706, + "learning_rate": 3.197405527354766e-05, + "loss": 0.7242, + "step": 6393 + }, + { + "epoch": 3.6063169768753527, + "grad_norm": 1.1574798822402954, + "learning_rate": 3.1971235194585456e-05, + "loss": 0.7779, + "step": 6394 + }, + { + "epoch": 3.6068809926677945, + "grad_norm": 1.4006009101867676, + "learning_rate": 3.196841511562324e-05, + "loss": 0.726, + "step": 6395 + }, + { + "epoch": 3.6074450084602367, + "grad_norm": 1.9912188053131104, + "learning_rate": 3.196559503666103e-05, + "loss": 0.951, + "step": 6396 + }, + { + "epoch": 3.608009024252679, + "grad_norm": 1.2349509000778198, + "learning_rate": 3.196277495769882e-05, + "loss": 0.8692, + "step": 6397 + }, + { + "epoch": 3.6085730400451212, + "grad_norm": 1.161952018737793, + "learning_rate": 3.195995487873661e-05, + "loss": 0.8346, + "step": 6398 + }, + { + "epoch": 3.6091370558375635, + "grad_norm": 1.2584537267684937, + "learning_rate": 3.1957134799774396e-05, + "loss": 0.7659, + "step": 6399 + }, + { + "epoch": 3.6097010716300058, + "grad_norm": 1.549554467201233, + "learning_rate": 3.195431472081218e-05, + "loss": 0.7758, + "step": 6400 + }, + { + "epoch": 3.610265087422448, + "grad_norm": 1.2402267456054688, + "learning_rate": 3.1951494641849974e-05, + "loss": 0.7793, + "step": 6401 + }, + { + "epoch": 3.61082910321489, + "grad_norm": 1.0206468105316162, + "learning_rate": 3.1948674562887766e-05, + "loss": 0.7765, + "step": 6402 + }, + { + "epoch": 3.611393119007332, + "grad_norm": 1.093268871307373, + "learning_rate": 3.194585448392555e-05, + "loss": 0.7432, + "step": 6403 + }, + { + "epoch": 3.6119571347997743, + "grad_norm": 1.2733148336410522, + "learning_rate": 3.194303440496334e-05, + "loss": 0.7864, + "step": 6404 + }, + { + "epoch": 3.6125211505922166, + "grad_norm": 1.3049451112747192, + "learning_rate": 3.194021432600113e-05, + "loss": 0.6069, + "step": 6405 + }, + { + "epoch": 3.613085166384659, + "grad_norm": 1.31002938747406, + "learning_rate": 3.193739424703892e-05, + "loss": 0.7686, + "step": 6406 + }, + { + "epoch": 3.613649182177101, + "grad_norm": 1.7579952478408813, + "learning_rate": 3.193457416807671e-05, + "loss": 0.7985, + "step": 6407 + }, + { + "epoch": 3.6142131979695433, + "grad_norm": 1.29407799243927, + "learning_rate": 3.19317540891145e-05, + "loss": 0.7516, + "step": 6408 + }, + { + "epoch": 3.614777213761985, + "grad_norm": 1.4467464685440063, + "learning_rate": 3.1928934010152284e-05, + "loss": 0.8243, + "step": 6409 + }, + { + "epoch": 3.6153412295544274, + "grad_norm": 7.108229637145996, + "learning_rate": 3.1926113931190077e-05, + "loss": 0.953, + "step": 6410 + }, + { + "epoch": 3.6159052453468696, + "grad_norm": 1.3810783624649048, + "learning_rate": 3.192329385222786e-05, + "loss": 0.7352, + "step": 6411 + }, + { + "epoch": 3.616469261139312, + "grad_norm": 1.1028972864151, + "learning_rate": 3.1920473773265654e-05, + "loss": 0.7462, + "step": 6412 + }, + { + "epoch": 3.617033276931754, + "grad_norm": 1.4758102893829346, + "learning_rate": 3.191765369430344e-05, + "loss": 0.7654, + "step": 6413 + }, + { + "epoch": 3.6175972927241964, + "grad_norm": 2.142298936843872, + "learning_rate": 3.191483361534123e-05, + "loss": 0.798, + "step": 6414 + }, + { + "epoch": 3.6181613085166386, + "grad_norm": 0.9065330028533936, + "learning_rate": 3.1912013536379024e-05, + "loss": 0.7024, + "step": 6415 + }, + { + "epoch": 3.6187253243090804, + "grad_norm": 1.472309947013855, + "learning_rate": 3.190919345741681e-05, + "loss": 0.7936, + "step": 6416 + }, + { + "epoch": 3.6192893401015227, + "grad_norm": 1.5573545694351196, + "learning_rate": 3.1906373378454595e-05, + "loss": 0.7181, + "step": 6417 + }, + { + "epoch": 3.619853355893965, + "grad_norm": 1.5291099548339844, + "learning_rate": 3.190355329949239e-05, + "loss": 0.8979, + "step": 6418 + }, + { + "epoch": 3.620417371686407, + "grad_norm": 1.4693100452423096, + "learning_rate": 3.190073322053018e-05, + "loss": 0.8036, + "step": 6419 + }, + { + "epoch": 3.6209813874788495, + "grad_norm": 0.8948105573654175, + "learning_rate": 3.1897913141567965e-05, + "loss": 0.7156, + "step": 6420 + }, + { + "epoch": 3.6215454032712917, + "grad_norm": 1.249625325202942, + "learning_rate": 3.189509306260575e-05, + "loss": 0.6989, + "step": 6421 + }, + { + "epoch": 3.622109419063734, + "grad_norm": 1.6400299072265625, + "learning_rate": 3.189227298364354e-05, + "loss": 0.8161, + "step": 6422 + }, + { + "epoch": 3.6226734348561758, + "grad_norm": 1.163960576057434, + "learning_rate": 3.1889452904681334e-05, + "loss": 0.7792, + "step": 6423 + }, + { + "epoch": 3.623237450648618, + "grad_norm": 1.1295231580734253, + "learning_rate": 3.188663282571912e-05, + "loss": 0.6785, + "step": 6424 + }, + { + "epoch": 3.6238014664410603, + "grad_norm": 1.4619524478912354, + "learning_rate": 3.1883812746756905e-05, + "loss": 0.6851, + "step": 6425 + }, + { + "epoch": 3.6243654822335025, + "grad_norm": 1.706164002418518, + "learning_rate": 3.1880992667794704e-05, + "loss": 0.7416, + "step": 6426 + }, + { + "epoch": 3.6249294980259448, + "grad_norm": 1.5113236904144287, + "learning_rate": 3.187817258883249e-05, + "loss": 0.6703, + "step": 6427 + }, + { + "epoch": 3.625493513818387, + "grad_norm": 2.4077529907226562, + "learning_rate": 3.1875352509870275e-05, + "loss": 0.9568, + "step": 6428 + }, + { + "epoch": 3.6260575296108293, + "grad_norm": 1.2227897644042969, + "learning_rate": 3.187253243090807e-05, + "loss": 0.8304, + "step": 6429 + }, + { + "epoch": 3.626621545403271, + "grad_norm": 1.0900013446807861, + "learning_rate": 3.186971235194586e-05, + "loss": 0.7005, + "step": 6430 + }, + { + "epoch": 3.6271855611957133, + "grad_norm": 1.368381381034851, + "learning_rate": 3.1866892272983645e-05, + "loss": 0.7968, + "step": 6431 + }, + { + "epoch": 3.6277495769881556, + "grad_norm": 1.1315118074417114, + "learning_rate": 3.186407219402143e-05, + "loss": 0.7834, + "step": 6432 + }, + { + "epoch": 3.628313592780598, + "grad_norm": 1.1827421188354492, + "learning_rate": 3.186125211505923e-05, + "loss": 0.7978, + "step": 6433 + }, + { + "epoch": 3.62887760857304, + "grad_norm": 1.072627067565918, + "learning_rate": 3.1858432036097014e-05, + "loss": 0.7392, + "step": 6434 + }, + { + "epoch": 3.6294416243654823, + "grad_norm": 1.2976816892623901, + "learning_rate": 3.18556119571348e-05, + "loss": 0.7467, + "step": 6435 + }, + { + "epoch": 3.6300056401579246, + "grad_norm": 2.432227611541748, + "learning_rate": 3.185279187817259e-05, + "loss": 1.1516, + "step": 6436 + }, + { + "epoch": 3.6305696559503664, + "grad_norm": 1.1720871925354004, + "learning_rate": 3.1849971799210384e-05, + "loss": 0.7102, + "step": 6437 + }, + { + "epoch": 3.6311336717428087, + "grad_norm": 1.226250171661377, + "learning_rate": 3.184715172024817e-05, + "loss": 0.7392, + "step": 6438 + }, + { + "epoch": 3.631697687535251, + "grad_norm": 1.5780473947525024, + "learning_rate": 3.1844331641285955e-05, + "loss": 0.7259, + "step": 6439 + }, + { + "epoch": 3.632261703327693, + "grad_norm": 1.4790852069854736, + "learning_rate": 3.184151156232375e-05, + "loss": 0.7647, + "step": 6440 + }, + { + "epoch": 3.6328257191201354, + "grad_norm": 1.9390138387680054, + "learning_rate": 3.183869148336154e-05, + "loss": 0.8511, + "step": 6441 + }, + { + "epoch": 3.6333897349125777, + "grad_norm": 0.9738144874572754, + "learning_rate": 3.1835871404399325e-05, + "loss": 0.7737, + "step": 6442 + }, + { + "epoch": 3.63395375070502, + "grad_norm": 1.0806351900100708, + "learning_rate": 3.183305132543711e-05, + "loss": 0.7698, + "step": 6443 + }, + { + "epoch": 3.6345177664974617, + "grad_norm": 0.9331944584846497, + "learning_rate": 3.18302312464749e-05, + "loss": 0.7391, + "step": 6444 + }, + { + "epoch": 3.635081782289904, + "grad_norm": 1.2022031545639038, + "learning_rate": 3.1827411167512695e-05, + "loss": 0.6299, + "step": 6445 + }, + { + "epoch": 3.6356457980823462, + "grad_norm": 1.24078369140625, + "learning_rate": 3.182459108855048e-05, + "loss": 0.7306, + "step": 6446 + }, + { + "epoch": 3.6362098138747885, + "grad_norm": 1.0323104858398438, + "learning_rate": 3.182177100958827e-05, + "loss": 0.7072, + "step": 6447 + }, + { + "epoch": 3.6367738296672307, + "grad_norm": 1.8025034666061401, + "learning_rate": 3.181895093062606e-05, + "loss": 0.6651, + "step": 6448 + }, + { + "epoch": 3.637337845459673, + "grad_norm": 2.241332530975342, + "learning_rate": 3.181613085166385e-05, + "loss": 0.8799, + "step": 6449 + }, + { + "epoch": 3.6379018612521152, + "grad_norm": 1.3813774585723877, + "learning_rate": 3.1813310772701635e-05, + "loss": 0.8705, + "step": 6450 + }, + { + "epoch": 3.638465877044557, + "grad_norm": 1.102372169494629, + "learning_rate": 3.181049069373943e-05, + "loss": 0.6563, + "step": 6451 + }, + { + "epoch": 3.6390298928369993, + "grad_norm": 1.1005750894546509, + "learning_rate": 3.180767061477721e-05, + "loss": 0.6599, + "step": 6452 + }, + { + "epoch": 3.6395939086294415, + "grad_norm": 1.5599313974380493, + "learning_rate": 3.1804850535815005e-05, + "loss": 0.9039, + "step": 6453 + }, + { + "epoch": 3.640157924421884, + "grad_norm": 0.9831977486610413, + "learning_rate": 3.18020304568528e-05, + "loss": 0.8022, + "step": 6454 + }, + { + "epoch": 3.640721940214326, + "grad_norm": 1.155051589012146, + "learning_rate": 3.179921037789058e-05, + "loss": 0.6766, + "step": 6455 + }, + { + "epoch": 3.6412859560067683, + "grad_norm": 0.7963878512382507, + "learning_rate": 3.179639029892837e-05, + "loss": 0.6798, + "step": 6456 + }, + { + "epoch": 3.6418499717992106, + "grad_norm": 1.1947768926620483, + "learning_rate": 3.179357021996616e-05, + "loss": 0.7357, + "step": 6457 + }, + { + "epoch": 3.6424139875916524, + "grad_norm": 1.2204231023788452, + "learning_rate": 3.179075014100395e-05, + "loss": 0.8385, + "step": 6458 + }, + { + "epoch": 3.6429780033840946, + "grad_norm": 1.4477336406707764, + "learning_rate": 3.178793006204174e-05, + "loss": 0.775, + "step": 6459 + }, + { + "epoch": 3.643542019176537, + "grad_norm": 1.299521803855896, + "learning_rate": 3.178510998307952e-05, + "loss": 0.643, + "step": 6460 + }, + { + "epoch": 3.644106034968979, + "grad_norm": 1.0433542728424072, + "learning_rate": 3.1782289904117315e-05, + "loss": 0.7262, + "step": 6461 + }, + { + "epoch": 3.6446700507614214, + "grad_norm": 1.3798675537109375, + "learning_rate": 3.177946982515511e-05, + "loss": 0.6946, + "step": 6462 + }, + { + "epoch": 3.6452340665538636, + "grad_norm": 0.8317168951034546, + "learning_rate": 3.177664974619289e-05, + "loss": 0.6765, + "step": 6463 + }, + { + "epoch": 3.645798082346306, + "grad_norm": 1.3037368059158325, + "learning_rate": 3.177382966723068e-05, + "loss": 0.8832, + "step": 6464 + }, + { + "epoch": 3.6463620981387477, + "grad_norm": 1.0855027437210083, + "learning_rate": 3.177100958826848e-05, + "loss": 0.7546, + "step": 6465 + }, + { + "epoch": 3.64692611393119, + "grad_norm": 3.174534559249878, + "learning_rate": 3.176818950930626e-05, + "loss": 0.7055, + "step": 6466 + }, + { + "epoch": 3.647490129723632, + "grad_norm": 1.1180965900421143, + "learning_rate": 3.176536943034405e-05, + "loss": 0.7374, + "step": 6467 + }, + { + "epoch": 3.6480541455160744, + "grad_norm": 1.6102304458618164, + "learning_rate": 3.176254935138184e-05, + "loss": 0.8471, + "step": 6468 + }, + { + "epoch": 3.6486181613085167, + "grad_norm": 1.1939213275909424, + "learning_rate": 3.175972927241963e-05, + "loss": 0.7457, + "step": 6469 + }, + { + "epoch": 3.649182177100959, + "grad_norm": 1.2453409433364868, + "learning_rate": 3.175690919345742e-05, + "loss": 0.7421, + "step": 6470 + }, + { + "epoch": 3.649746192893401, + "grad_norm": 3.810720443725586, + "learning_rate": 3.17540891144952e-05, + "loss": 0.9175, + "step": 6471 + }, + { + "epoch": 3.650310208685843, + "grad_norm": 1.1651779413223267, + "learning_rate": 3.1751269035533e-05, + "loss": 0.7936, + "step": 6472 + }, + { + "epoch": 3.6508742244782852, + "grad_norm": 2.0176198482513428, + "learning_rate": 3.174844895657079e-05, + "loss": 0.8702, + "step": 6473 + }, + { + "epoch": 3.6514382402707275, + "grad_norm": 1.189711093902588, + "learning_rate": 3.174562887760857e-05, + "loss": 0.7197, + "step": 6474 + }, + { + "epoch": 3.6520022560631697, + "grad_norm": 1.1115669012069702, + "learning_rate": 3.174280879864636e-05, + "loss": 0.6743, + "step": 6475 + }, + { + "epoch": 3.652566271855612, + "grad_norm": 0.986182451248169, + "learning_rate": 3.173998871968416e-05, + "loss": 0.7038, + "step": 6476 + }, + { + "epoch": 3.6531302876480543, + "grad_norm": 1.2382862567901611, + "learning_rate": 3.173716864072194e-05, + "loss": 0.8354, + "step": 6477 + }, + { + "epoch": 3.6536943034404965, + "grad_norm": 1.432434320449829, + "learning_rate": 3.173434856175973e-05, + "loss": 0.6933, + "step": 6478 + }, + { + "epoch": 3.6542583192329383, + "grad_norm": 1.1393836736679077, + "learning_rate": 3.173152848279752e-05, + "loss": 0.7411, + "step": 6479 + }, + { + "epoch": 3.6548223350253806, + "grad_norm": 1.1011202335357666, + "learning_rate": 3.172870840383531e-05, + "loss": 0.6881, + "step": 6480 + }, + { + "epoch": 3.655386350817823, + "grad_norm": 1.1537110805511475, + "learning_rate": 3.17258883248731e-05, + "loss": 0.7441, + "step": 6481 + }, + { + "epoch": 3.655950366610265, + "grad_norm": 2.2222683429718018, + "learning_rate": 3.1723068245910883e-05, + "loss": 0.9046, + "step": 6482 + }, + { + "epoch": 3.6565143824027073, + "grad_norm": 1.1499226093292236, + "learning_rate": 3.1720248166948676e-05, + "loss": 0.8251, + "step": 6483 + }, + { + "epoch": 3.6570783981951496, + "grad_norm": 1.9365452527999878, + "learning_rate": 3.171742808798647e-05, + "loss": 0.8841, + "step": 6484 + }, + { + "epoch": 3.657642413987592, + "grad_norm": 1.3272167444229126, + "learning_rate": 3.171460800902425e-05, + "loss": 0.78, + "step": 6485 + }, + { + "epoch": 3.6582064297800336, + "grad_norm": 1.315826416015625, + "learning_rate": 3.1711787930062045e-05, + "loss": 0.7254, + "step": 6486 + }, + { + "epoch": 3.658770445572476, + "grad_norm": 1.5934101343154907, + "learning_rate": 3.170896785109983e-05, + "loss": 0.776, + "step": 6487 + }, + { + "epoch": 3.659334461364918, + "grad_norm": 1.1044104099273682, + "learning_rate": 3.170614777213762e-05, + "loss": 0.7759, + "step": 6488 + }, + { + "epoch": 3.6598984771573604, + "grad_norm": 1.2050827741622925, + "learning_rate": 3.170332769317541e-05, + "loss": 0.7246, + "step": 6489 + }, + { + "epoch": 3.6604624929498026, + "grad_norm": 1.4967200756072998, + "learning_rate": 3.17005076142132e-05, + "loss": 0.7759, + "step": 6490 + }, + { + "epoch": 3.661026508742245, + "grad_norm": 1.405254602432251, + "learning_rate": 3.1697687535250986e-05, + "loss": 0.8612, + "step": 6491 + }, + { + "epoch": 3.661590524534687, + "grad_norm": 1.0123167037963867, + "learning_rate": 3.169486745628878e-05, + "loss": 0.7051, + "step": 6492 + }, + { + "epoch": 3.662154540327129, + "grad_norm": 1.0409328937530518, + "learning_rate": 3.1692047377326564e-05, + "loss": 0.7902, + "step": 6493 + }, + { + "epoch": 3.662718556119571, + "grad_norm": 0.9672085642814636, + "learning_rate": 3.1689227298364356e-05, + "loss": 0.6639, + "step": 6494 + }, + { + "epoch": 3.6632825719120135, + "grad_norm": 1.034106969833374, + "learning_rate": 3.168640721940214e-05, + "loss": 0.7215, + "step": 6495 + }, + { + "epoch": 3.6638465877044557, + "grad_norm": 1.1055296659469604, + "learning_rate": 3.168358714043993e-05, + "loss": 0.7847, + "step": 6496 + }, + { + "epoch": 3.664410603496898, + "grad_norm": 1.828428864479065, + "learning_rate": 3.1680767061477726e-05, + "loss": 0.8204, + "step": 6497 + }, + { + "epoch": 3.66497461928934, + "grad_norm": 1.3398476839065552, + "learning_rate": 3.167794698251551e-05, + "loss": 0.7318, + "step": 6498 + }, + { + "epoch": 3.6655386350817825, + "grad_norm": 0.7258148193359375, + "learning_rate": 3.1675126903553296e-05, + "loss": 0.5989, + "step": 6499 + }, + { + "epoch": 3.6661026508742243, + "grad_norm": 1.273784875869751, + "learning_rate": 3.167230682459109e-05, + "loss": 0.6996, + "step": 6500 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 2.351418972015381, + "learning_rate": 3.166948674562888e-05, + "loss": 0.7153, + "step": 6501 + }, + { + "epoch": 3.6672306824591088, + "grad_norm": 1.5885690450668335, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.7653, + "step": 6502 + }, + { + "epoch": 3.667794698251551, + "grad_norm": 0.9559596180915833, + "learning_rate": 3.166384658770446e-05, + "loss": 0.6975, + "step": 6503 + }, + { + "epoch": 3.6683587140439933, + "grad_norm": 1.2158766984939575, + "learning_rate": 3.166102650874225e-05, + "loss": 0.8054, + "step": 6504 + }, + { + "epoch": 3.6689227298364355, + "grad_norm": 1.045864224433899, + "learning_rate": 3.1658206429780036e-05, + "loss": 0.7271, + "step": 6505 + }, + { + "epoch": 3.6694867456288778, + "grad_norm": 1.5422585010528564, + "learning_rate": 3.165538635081782e-05, + "loss": 0.7416, + "step": 6506 + }, + { + "epoch": 3.6700507614213196, + "grad_norm": 1.2739392518997192, + "learning_rate": 3.1652566271855614e-05, + "loss": 0.8471, + "step": 6507 + }, + { + "epoch": 3.670614777213762, + "grad_norm": 1.4949793815612793, + "learning_rate": 3.1649746192893406e-05, + "loss": 0.8747, + "step": 6508 + }, + { + "epoch": 3.671178793006204, + "grad_norm": 1.600306510925293, + "learning_rate": 3.164692611393119e-05, + "loss": 0.7898, + "step": 6509 + }, + { + "epoch": 3.6717428087986463, + "grad_norm": 1.704193353652954, + "learning_rate": 3.1644106034968977e-05, + "loss": 0.9942, + "step": 6510 + }, + { + "epoch": 3.6723068245910886, + "grad_norm": 1.0970607995986938, + "learning_rate": 3.164128595600677e-05, + "loss": 0.7668, + "step": 6511 + }, + { + "epoch": 3.672870840383531, + "grad_norm": 1.7296706438064575, + "learning_rate": 3.163846587704456e-05, + "loss": 0.7934, + "step": 6512 + }, + { + "epoch": 3.673434856175973, + "grad_norm": 1.1482527256011963, + "learning_rate": 3.1635645798082346e-05, + "loss": 0.7893, + "step": 6513 + }, + { + "epoch": 3.673998871968415, + "grad_norm": 0.9930245280265808, + "learning_rate": 3.163282571912013e-05, + "loss": 0.7943, + "step": 6514 + }, + { + "epoch": 3.674562887760857, + "grad_norm": 0.9477379322052002, + "learning_rate": 3.163000564015793e-05, + "loss": 0.803, + "step": 6515 + }, + { + "epoch": 3.6751269035532994, + "grad_norm": 0.888786792755127, + "learning_rate": 3.1627185561195716e-05, + "loss": 0.6952, + "step": 6516 + }, + { + "epoch": 3.6756909193457417, + "grad_norm": 1.1407109498977661, + "learning_rate": 3.16243654822335e-05, + "loss": 0.7436, + "step": 6517 + }, + { + "epoch": 3.676254935138184, + "grad_norm": 2.4000306129455566, + "learning_rate": 3.1621545403271294e-05, + "loss": 0.9896, + "step": 6518 + }, + { + "epoch": 3.676818950930626, + "grad_norm": 1.3490123748779297, + "learning_rate": 3.1618725324309086e-05, + "loss": 0.7573, + "step": 6519 + }, + { + "epoch": 3.6773829667230684, + "grad_norm": 1.1222158670425415, + "learning_rate": 3.161590524534687e-05, + "loss": 0.8818, + "step": 6520 + }, + { + "epoch": 3.67794698251551, + "grad_norm": 1.5292823314666748, + "learning_rate": 3.161308516638466e-05, + "loss": 0.8823, + "step": 6521 + }, + { + "epoch": 3.6785109983079525, + "grad_norm": 1.4519853591918945, + "learning_rate": 3.161026508742245e-05, + "loss": 0.8096, + "step": 6522 + }, + { + "epoch": 3.6790750141003947, + "grad_norm": 1.2539334297180176, + "learning_rate": 3.160744500846024e-05, + "loss": 0.6944, + "step": 6523 + }, + { + "epoch": 3.679639029892837, + "grad_norm": 1.3957902193069458, + "learning_rate": 3.1604624929498026e-05, + "loss": 0.7115, + "step": 6524 + }, + { + "epoch": 3.6802030456852792, + "grad_norm": 1.7846869230270386, + "learning_rate": 3.160180485053582e-05, + "loss": 0.8928, + "step": 6525 + }, + { + "epoch": 3.6807670614777215, + "grad_norm": 0.991299033164978, + "learning_rate": 3.1598984771573604e-05, + "loss": 0.7795, + "step": 6526 + }, + { + "epoch": 3.6813310772701637, + "grad_norm": 1.2753850221633911, + "learning_rate": 3.1596164692611396e-05, + "loss": 0.8241, + "step": 6527 + }, + { + "epoch": 3.6818950930626055, + "grad_norm": 0.9120011329650879, + "learning_rate": 3.159334461364918e-05, + "loss": 0.6786, + "step": 6528 + }, + { + "epoch": 3.682459108855048, + "grad_norm": 3.4128100872039795, + "learning_rate": 3.1590524534686974e-05, + "loss": 0.6535, + "step": 6529 + }, + { + "epoch": 3.68302312464749, + "grad_norm": 1.2557865381240845, + "learning_rate": 3.158770445572476e-05, + "loss": 0.8656, + "step": 6530 + }, + { + "epoch": 3.6835871404399323, + "grad_norm": 1.0283777713775635, + "learning_rate": 3.158488437676255e-05, + "loss": 0.663, + "step": 6531 + }, + { + "epoch": 3.6841511562323745, + "grad_norm": 1.2525558471679688, + "learning_rate": 3.158206429780034e-05, + "loss": 0.7646, + "step": 6532 + }, + { + "epoch": 3.684715172024817, + "grad_norm": 1.2438280582427979, + "learning_rate": 3.157924421883813e-05, + "loss": 0.7667, + "step": 6533 + }, + { + "epoch": 3.685279187817259, + "grad_norm": 1.132690668106079, + "learning_rate": 3.1576424139875914e-05, + "loss": 0.7809, + "step": 6534 + }, + { + "epoch": 3.685843203609701, + "grad_norm": 0.9725260734558105, + "learning_rate": 3.1573604060913707e-05, + "loss": 0.7742, + "step": 6535 + }, + { + "epoch": 3.686407219402143, + "grad_norm": 1.5699856281280518, + "learning_rate": 3.15707839819515e-05, + "loss": 0.8898, + "step": 6536 + }, + { + "epoch": 3.6869712351945854, + "grad_norm": 2.005948781967163, + "learning_rate": 3.1567963902989284e-05, + "loss": 0.8583, + "step": 6537 + }, + { + "epoch": 3.6875352509870276, + "grad_norm": 0.8919313549995422, + "learning_rate": 3.1565143824027076e-05, + "loss": 0.6853, + "step": 6538 + }, + { + "epoch": 3.68809926677947, + "grad_norm": 1.6730595827102661, + "learning_rate": 3.156232374506486e-05, + "loss": 0.754, + "step": 6539 + }, + { + "epoch": 3.688663282571912, + "grad_norm": 1.2912185192108154, + "learning_rate": 3.1559503666102654e-05, + "loss": 0.6128, + "step": 6540 + }, + { + "epoch": 3.6892272983643544, + "grad_norm": 1.0956395864486694, + "learning_rate": 3.155668358714044e-05, + "loss": 0.6866, + "step": 6541 + }, + { + "epoch": 3.689791314156796, + "grad_norm": 1.1289132833480835, + "learning_rate": 3.155386350817823e-05, + "loss": 0.8197, + "step": 6542 + }, + { + "epoch": 3.6903553299492384, + "grad_norm": 1.2400559186935425, + "learning_rate": 3.1551043429216024e-05, + "loss": 0.7507, + "step": 6543 + }, + { + "epoch": 3.6909193457416807, + "grad_norm": 1.115543246269226, + "learning_rate": 3.154822335025381e-05, + "loss": 0.7901, + "step": 6544 + }, + { + "epoch": 3.691483361534123, + "grad_norm": 1.5217962265014648, + "learning_rate": 3.1545403271291595e-05, + "loss": 0.8124, + "step": 6545 + }, + { + "epoch": 3.692047377326565, + "grad_norm": 1.9951670169830322, + "learning_rate": 3.154258319232939e-05, + "loss": 0.7153, + "step": 6546 + }, + { + "epoch": 3.6926113931190074, + "grad_norm": 1.7494254112243652, + "learning_rate": 3.153976311336718e-05, + "loss": 0.8174, + "step": 6547 + }, + { + "epoch": 3.6931754089114497, + "grad_norm": 1.47707200050354, + "learning_rate": 3.1536943034404964e-05, + "loss": 0.7589, + "step": 6548 + }, + { + "epoch": 3.6937394247038915, + "grad_norm": 1.3860570192337036, + "learning_rate": 3.153412295544275e-05, + "loss": 0.6924, + "step": 6549 + }, + { + "epoch": 3.6943034404963337, + "grad_norm": 1.6439313888549805, + "learning_rate": 3.153130287648054e-05, + "loss": 0.857, + "step": 6550 + }, + { + "epoch": 3.694867456288776, + "grad_norm": 1.0960924625396729, + "learning_rate": 3.1528482797518334e-05, + "loss": 0.7731, + "step": 6551 + }, + { + "epoch": 3.6954314720812182, + "grad_norm": 1.0032755136489868, + "learning_rate": 3.152566271855612e-05, + "loss": 0.7467, + "step": 6552 + }, + { + "epoch": 3.6959954878736605, + "grad_norm": 1.3156534433364868, + "learning_rate": 3.1522842639593905e-05, + "loss": 0.6947, + "step": 6553 + }, + { + "epoch": 3.6965595036661028, + "grad_norm": 1.3542804718017578, + "learning_rate": 3.1520022560631704e-05, + "loss": 0.8429, + "step": 6554 + }, + { + "epoch": 3.697123519458545, + "grad_norm": 1.5640934705734253, + "learning_rate": 3.151720248166949e-05, + "loss": 0.8149, + "step": 6555 + }, + { + "epoch": 3.697687535250987, + "grad_norm": 2.0854413509368896, + "learning_rate": 3.1514382402707275e-05, + "loss": 0.8969, + "step": 6556 + }, + { + "epoch": 3.698251551043429, + "grad_norm": 1.1679633855819702, + "learning_rate": 3.151156232374507e-05, + "loss": 0.8211, + "step": 6557 + }, + { + "epoch": 3.6988155668358713, + "grad_norm": 1.4200204610824585, + "learning_rate": 3.150874224478286e-05, + "loss": 0.6775, + "step": 6558 + }, + { + "epoch": 3.6993795826283136, + "grad_norm": 1.0227060317993164, + "learning_rate": 3.1505922165820644e-05, + "loss": 0.6654, + "step": 6559 + }, + { + "epoch": 3.699943598420756, + "grad_norm": 1.92538583278656, + "learning_rate": 3.150310208685843e-05, + "loss": 0.9058, + "step": 6560 + }, + { + "epoch": 3.700507614213198, + "grad_norm": 1.1359703540802002, + "learning_rate": 3.150028200789622e-05, + "loss": 0.6644, + "step": 6561 + }, + { + "epoch": 3.7010716300056403, + "grad_norm": 1.2057017087936401, + "learning_rate": 3.1497461928934014e-05, + "loss": 0.7165, + "step": 6562 + }, + { + "epoch": 3.701635645798082, + "grad_norm": 0.8881958723068237, + "learning_rate": 3.14946418499718e-05, + "loss": 0.6818, + "step": 6563 + }, + { + "epoch": 3.7021996615905244, + "grad_norm": 2.260310173034668, + "learning_rate": 3.149182177100959e-05, + "loss": 0.8861, + "step": 6564 + }, + { + "epoch": 3.7027636773829666, + "grad_norm": 1.6953909397125244, + "learning_rate": 3.148900169204738e-05, + "loss": 0.8085, + "step": 6565 + }, + { + "epoch": 3.703327693175409, + "grad_norm": 0.9846319556236267, + "learning_rate": 3.148618161308517e-05, + "loss": 0.7846, + "step": 6566 + }, + { + "epoch": 3.703891708967851, + "grad_norm": 1.3718534708023071, + "learning_rate": 3.1483361534122955e-05, + "loss": 0.732, + "step": 6567 + }, + { + "epoch": 3.7044557247602934, + "grad_norm": 1.2067530155181885, + "learning_rate": 3.148054145516075e-05, + "loss": 0.6635, + "step": 6568 + }, + { + "epoch": 3.7050197405527356, + "grad_norm": 1.7032477855682373, + "learning_rate": 3.147772137619853e-05, + "loss": 0.7451, + "step": 6569 + }, + { + "epoch": 3.7055837563451774, + "grad_norm": 1.6365582942962646, + "learning_rate": 3.1474901297236325e-05, + "loss": 0.7347, + "step": 6570 + }, + { + "epoch": 3.7061477721376197, + "grad_norm": 1.4799972772598267, + "learning_rate": 3.147208121827411e-05, + "loss": 0.7502, + "step": 6571 + }, + { + "epoch": 3.706711787930062, + "grad_norm": 1.6689486503601074, + "learning_rate": 3.14692611393119e-05, + "loss": 0.9147, + "step": 6572 + }, + { + "epoch": 3.707275803722504, + "grad_norm": 1.9700593948364258, + "learning_rate": 3.1466441060349694e-05, + "loss": 0.8387, + "step": 6573 + }, + { + "epoch": 3.7078398195149465, + "grad_norm": 1.893408179283142, + "learning_rate": 3.146362098138748e-05, + "loss": 0.9124, + "step": 6574 + }, + { + "epoch": 3.7084038353073887, + "grad_norm": 0.9996896386146545, + "learning_rate": 3.146080090242527e-05, + "loss": 0.7462, + "step": 6575 + }, + { + "epoch": 3.708967851099831, + "grad_norm": 1.2418264150619507, + "learning_rate": 3.145798082346306e-05, + "loss": 0.8592, + "step": 6576 + }, + { + "epoch": 3.7095318668922728, + "grad_norm": 1.6257061958312988, + "learning_rate": 3.145516074450085e-05, + "loss": 0.767, + "step": 6577 + }, + { + "epoch": 3.710095882684715, + "grad_norm": 0.9718237519264221, + "learning_rate": 3.1452340665538635e-05, + "loss": 0.7434, + "step": 6578 + }, + { + "epoch": 3.7106598984771573, + "grad_norm": 1.5828485488891602, + "learning_rate": 3.144952058657643e-05, + "loss": 0.6671, + "step": 6579 + }, + { + "epoch": 3.7112239142695995, + "grad_norm": 1.3612743616104126, + "learning_rate": 3.144670050761421e-05, + "loss": 0.8094, + "step": 6580 + }, + { + "epoch": 3.7117879300620418, + "grad_norm": 1.4036189317703247, + "learning_rate": 3.1443880428652005e-05, + "loss": 0.8024, + "step": 6581 + }, + { + "epoch": 3.712351945854484, + "grad_norm": 1.5775353908538818, + "learning_rate": 3.14410603496898e-05, + "loss": 0.8233, + "step": 6582 + }, + { + "epoch": 3.7129159616469263, + "grad_norm": 1.195363998413086, + "learning_rate": 3.143824027072758e-05, + "loss": 0.6355, + "step": 6583 + }, + { + "epoch": 3.713479977439368, + "grad_norm": 1.0396430492401123, + "learning_rate": 3.143542019176537e-05, + "loss": 0.7729, + "step": 6584 + }, + { + "epoch": 3.7140439932318103, + "grad_norm": 1.5165907144546509, + "learning_rate": 3.143260011280316e-05, + "loss": 0.7809, + "step": 6585 + }, + { + "epoch": 3.7146080090242526, + "grad_norm": 0.9035001397132874, + "learning_rate": 3.142978003384095e-05, + "loss": 0.6751, + "step": 6586 + }, + { + "epoch": 3.715172024816695, + "grad_norm": 1.7191110849380493, + "learning_rate": 3.142695995487874e-05, + "loss": 0.8004, + "step": 6587 + }, + { + "epoch": 3.715736040609137, + "grad_norm": 0.9337326884269714, + "learning_rate": 3.142413987591652e-05, + "loss": 0.705, + "step": 6588 + }, + { + "epoch": 3.7163000564015793, + "grad_norm": 1.1000373363494873, + "learning_rate": 3.1421319796954315e-05, + "loss": 0.8013, + "step": 6589 + }, + { + "epoch": 3.7168640721940216, + "grad_norm": 1.0409499406814575, + "learning_rate": 3.141849971799211e-05, + "loss": 0.7513, + "step": 6590 + }, + { + "epoch": 3.7174280879864634, + "grad_norm": 0.8813404440879822, + "learning_rate": 3.141567963902989e-05, + "loss": 0.6621, + "step": 6591 + }, + { + "epoch": 3.7179921037789057, + "grad_norm": 1.249656319618225, + "learning_rate": 3.141285956006768e-05, + "loss": 0.8326, + "step": 6592 + }, + { + "epoch": 3.718556119571348, + "grad_norm": 0.9958592057228088, + "learning_rate": 3.141003948110548e-05, + "loss": 0.7086, + "step": 6593 + }, + { + "epoch": 3.71912013536379, + "grad_norm": 1.0140933990478516, + "learning_rate": 3.140721940214326e-05, + "loss": 0.6899, + "step": 6594 + }, + { + "epoch": 3.7196841511562324, + "grad_norm": 1.415124773979187, + "learning_rate": 3.140439932318105e-05, + "loss": 0.8302, + "step": 6595 + }, + { + "epoch": 3.7202481669486747, + "grad_norm": 1.7900841236114502, + "learning_rate": 3.140157924421884e-05, + "loss": 0.9956, + "step": 6596 + }, + { + "epoch": 3.720812182741117, + "grad_norm": 1.178980827331543, + "learning_rate": 3.139875916525663e-05, + "loss": 0.868, + "step": 6597 + }, + { + "epoch": 3.7213761985335587, + "grad_norm": 1.2774420976638794, + "learning_rate": 3.139593908629442e-05, + "loss": 0.6688, + "step": 6598 + }, + { + "epoch": 3.721940214326001, + "grad_norm": 1.6234530210494995, + "learning_rate": 3.13931190073322e-05, + "loss": 0.8279, + "step": 6599 + }, + { + "epoch": 3.7225042301184432, + "grad_norm": 1.7201298475265503, + "learning_rate": 3.1390298928369995e-05, + "loss": 0.8505, + "step": 6600 + }, + { + "epoch": 3.7230682459108855, + "grad_norm": 1.6077252626419067, + "learning_rate": 3.138747884940779e-05, + "loss": 0.8696, + "step": 6601 + }, + { + "epoch": 3.7236322617033277, + "grad_norm": 1.4228019714355469, + "learning_rate": 3.138465877044557e-05, + "loss": 0.7357, + "step": 6602 + }, + { + "epoch": 3.72419627749577, + "grad_norm": 1.3029474020004272, + "learning_rate": 3.1381838691483365e-05, + "loss": 0.6928, + "step": 6603 + }, + { + "epoch": 3.7247602932882122, + "grad_norm": 1.523637056350708, + "learning_rate": 3.137901861252115e-05, + "loss": 0.7236, + "step": 6604 + }, + { + "epoch": 3.725324309080654, + "grad_norm": 1.0247873067855835, + "learning_rate": 3.137619853355894e-05, + "loss": 0.7338, + "step": 6605 + }, + { + "epoch": 3.7258883248730963, + "grad_norm": 1.7432427406311035, + "learning_rate": 3.137337845459673e-05, + "loss": 0.7375, + "step": 6606 + }, + { + "epoch": 3.7264523406655385, + "grad_norm": 1.5003352165222168, + "learning_rate": 3.137055837563452e-05, + "loss": 0.83, + "step": 6607 + }, + { + "epoch": 3.727016356457981, + "grad_norm": 1.3304266929626465, + "learning_rate": 3.136773829667231e-05, + "loss": 0.8227, + "step": 6608 + }, + { + "epoch": 3.727580372250423, + "grad_norm": 1.0710421800613403, + "learning_rate": 3.13649182177101e-05, + "loss": 0.7976, + "step": 6609 + }, + { + "epoch": 3.7281443880428653, + "grad_norm": 4.649355411529541, + "learning_rate": 3.136209813874788e-05, + "loss": 0.8674, + "step": 6610 + }, + { + "epoch": 3.7287084038353075, + "grad_norm": 1.2307581901550293, + "learning_rate": 3.1359278059785675e-05, + "loss": 0.686, + "step": 6611 + }, + { + "epoch": 3.7292724196277494, + "grad_norm": 1.547216773033142, + "learning_rate": 3.135645798082347e-05, + "loss": 0.7827, + "step": 6612 + }, + { + "epoch": 3.7298364354201916, + "grad_norm": 1.3689693212509155, + "learning_rate": 3.135363790186125e-05, + "loss": 0.8059, + "step": 6613 + }, + { + "epoch": 3.730400451212634, + "grad_norm": 0.9032647609710693, + "learning_rate": 3.1350817822899045e-05, + "loss": 0.7064, + "step": 6614 + }, + { + "epoch": 3.730964467005076, + "grad_norm": 2.827622175216675, + "learning_rate": 3.134799774393683e-05, + "loss": 0.8147, + "step": 6615 + }, + { + "epoch": 3.7315284827975184, + "grad_norm": 1.6383134126663208, + "learning_rate": 3.134517766497462e-05, + "loss": 0.7984, + "step": 6616 + }, + { + "epoch": 3.7320924985899606, + "grad_norm": 1.0139193534851074, + "learning_rate": 3.134235758601241e-05, + "loss": 0.6882, + "step": 6617 + }, + { + "epoch": 3.732656514382403, + "grad_norm": 2.6645090579986572, + "learning_rate": 3.13395375070502e-05, + "loss": 0.7571, + "step": 6618 + }, + { + "epoch": 3.7332205301748447, + "grad_norm": 1.4316833019256592, + "learning_rate": 3.1336717428087986e-05, + "loss": 0.7033, + "step": 6619 + }, + { + "epoch": 3.733784545967287, + "grad_norm": 1.0213643312454224, + "learning_rate": 3.133389734912578e-05, + "loss": 0.7277, + "step": 6620 + }, + { + "epoch": 3.734348561759729, + "grad_norm": 1.2305959463119507, + "learning_rate": 3.133107727016357e-05, + "loss": 0.7715, + "step": 6621 + }, + { + "epoch": 3.7349125775521714, + "grad_norm": 1.7147648334503174, + "learning_rate": 3.1328257191201356e-05, + "loss": 0.8052, + "step": 6622 + }, + { + "epoch": 3.7354765933446137, + "grad_norm": 1.2529081106185913, + "learning_rate": 3.132543711223914e-05, + "loss": 0.6992, + "step": 6623 + }, + { + "epoch": 3.736040609137056, + "grad_norm": 1.720534086227417, + "learning_rate": 3.132261703327693e-05, + "loss": 0.7845, + "step": 6624 + }, + { + "epoch": 3.736604624929498, + "grad_norm": 1.4406583309173584, + "learning_rate": 3.1319796954314725e-05, + "loss": 0.9984, + "step": 6625 + }, + { + "epoch": 3.73716864072194, + "grad_norm": 1.9548592567443848, + "learning_rate": 3.131697687535251e-05, + "loss": 0.7611, + "step": 6626 + }, + { + "epoch": 3.7377326565143822, + "grad_norm": 2.6896941661834717, + "learning_rate": 3.1314156796390296e-05, + "loss": 0.6913, + "step": 6627 + }, + { + "epoch": 3.7382966723068245, + "grad_norm": 1.08397376537323, + "learning_rate": 3.131133671742809e-05, + "loss": 0.6515, + "step": 6628 + }, + { + "epoch": 3.7388606880992667, + "grad_norm": 1.0849076509475708, + "learning_rate": 3.130851663846588e-05, + "loss": 0.6826, + "step": 6629 + }, + { + "epoch": 3.739424703891709, + "grad_norm": 1.5439969301223755, + "learning_rate": 3.1305696559503666e-05, + "loss": 0.6325, + "step": 6630 + }, + { + "epoch": 3.7399887196841513, + "grad_norm": 1.3016263246536255, + "learning_rate": 3.130287648054145e-05, + "loss": 0.7312, + "step": 6631 + }, + { + "epoch": 3.7405527354765935, + "grad_norm": 1.3929712772369385, + "learning_rate": 3.130005640157925e-05, + "loss": 0.7536, + "step": 6632 + }, + { + "epoch": 3.7411167512690353, + "grad_norm": 1.572603702545166, + "learning_rate": 3.1297236322617036e-05, + "loss": 0.8063, + "step": 6633 + }, + { + "epoch": 3.7416807670614776, + "grad_norm": 1.1248064041137695, + "learning_rate": 3.129441624365482e-05, + "loss": 0.7618, + "step": 6634 + }, + { + "epoch": 3.74224478285392, + "grad_norm": 1.1714333295822144, + "learning_rate": 3.129159616469261e-05, + "loss": 0.7251, + "step": 6635 + }, + { + "epoch": 3.742808798646362, + "grad_norm": 1.330395221710205, + "learning_rate": 3.1288776085730406e-05, + "loss": 0.7097, + "step": 6636 + }, + { + "epoch": 3.7433728144388043, + "grad_norm": 1.3089901208877563, + "learning_rate": 3.128595600676819e-05, + "loss": 0.6986, + "step": 6637 + }, + { + "epoch": 3.7439368302312466, + "grad_norm": 1.047876000404358, + "learning_rate": 3.1283135927805976e-05, + "loss": 0.7835, + "step": 6638 + }, + { + "epoch": 3.744500846023689, + "grad_norm": 1.250688076019287, + "learning_rate": 3.128031584884377e-05, + "loss": 0.7256, + "step": 6639 + }, + { + "epoch": 3.7450648618161306, + "grad_norm": 1.4403657913208008, + "learning_rate": 3.127749576988156e-05, + "loss": 0.8565, + "step": 6640 + }, + { + "epoch": 3.745628877608573, + "grad_norm": 1.607193112373352, + "learning_rate": 3.1274675690919346e-05, + "loss": 0.7499, + "step": 6641 + }, + { + "epoch": 3.746192893401015, + "grad_norm": 1.2158292531967163, + "learning_rate": 3.127185561195713e-05, + "loss": 0.8098, + "step": 6642 + }, + { + "epoch": 3.7467569091934574, + "grad_norm": 1.7020150423049927, + "learning_rate": 3.126903553299493e-05, + "loss": 0.833, + "step": 6643 + }, + { + "epoch": 3.7473209249858996, + "grad_norm": 1.4306302070617676, + "learning_rate": 3.1266215454032716e-05, + "loss": 0.7158, + "step": 6644 + }, + { + "epoch": 3.747884940778342, + "grad_norm": 1.0634605884552002, + "learning_rate": 3.12633953750705e-05, + "loss": 0.6606, + "step": 6645 + }, + { + "epoch": 3.748448956570784, + "grad_norm": 1.1122781038284302, + "learning_rate": 3.1260575296108294e-05, + "loss": 0.836, + "step": 6646 + }, + { + "epoch": 3.749012972363226, + "grad_norm": 1.213357925415039, + "learning_rate": 3.1257755217146086e-05, + "loss": 0.7822, + "step": 6647 + }, + { + "epoch": 3.749576988155668, + "grad_norm": 1.5116090774536133, + "learning_rate": 3.125493513818387e-05, + "loss": 0.8382, + "step": 6648 + }, + { + "epoch": 3.7501410039481105, + "grad_norm": 0.943088948726654, + "learning_rate": 3.1252115059221657e-05, + "loss": 0.7721, + "step": 6649 + }, + { + "epoch": 3.7507050197405527, + "grad_norm": 0.9528307318687439, + "learning_rate": 3.124929498025945e-05, + "loss": 0.7514, + "step": 6650 + }, + { + "epoch": 3.751269035532995, + "grad_norm": 1.3928598165512085, + "learning_rate": 3.124647490129724e-05, + "loss": 0.7407, + "step": 6651 + }, + { + "epoch": 3.751833051325437, + "grad_norm": 1.473408818244934, + "learning_rate": 3.1243654822335026e-05, + "loss": 0.8353, + "step": 6652 + }, + { + "epoch": 3.7523970671178795, + "grad_norm": 1.0801506042480469, + "learning_rate": 3.124083474337282e-05, + "loss": 0.7801, + "step": 6653 + }, + { + "epoch": 3.7529610829103213, + "grad_norm": 0.9452170133590698, + "learning_rate": 3.1238014664410604e-05, + "loss": 0.707, + "step": 6654 + }, + { + "epoch": 3.7535250987027635, + "grad_norm": 1.05149507522583, + "learning_rate": 3.1235194585448396e-05, + "loss": 0.7129, + "step": 6655 + }, + { + "epoch": 3.7540891144952058, + "grad_norm": 1.2138980627059937, + "learning_rate": 3.123237450648618e-05, + "loss": 0.7469, + "step": 6656 + }, + { + "epoch": 3.754653130287648, + "grad_norm": 1.7567696571350098, + "learning_rate": 3.1229554427523974e-05, + "loss": 0.8408, + "step": 6657 + }, + { + "epoch": 3.7552171460800903, + "grad_norm": 1.3780244588851929, + "learning_rate": 3.122673434856176e-05, + "loss": 0.6777, + "step": 6658 + }, + { + "epoch": 3.7557811618725325, + "grad_norm": 1.4686496257781982, + "learning_rate": 3.122391426959955e-05, + "loss": 0.8019, + "step": 6659 + }, + { + "epoch": 3.7563451776649748, + "grad_norm": 1.4886443614959717, + "learning_rate": 3.122109419063734e-05, + "loss": 0.6628, + "step": 6660 + }, + { + "epoch": 3.7569091934574166, + "grad_norm": 2.4647715091705322, + "learning_rate": 3.121827411167513e-05, + "loss": 0.7673, + "step": 6661 + }, + { + "epoch": 3.757473209249859, + "grad_norm": 0.8827227354049683, + "learning_rate": 3.1215454032712914e-05, + "loss": 0.6333, + "step": 6662 + }, + { + "epoch": 3.758037225042301, + "grad_norm": 1.1081626415252686, + "learning_rate": 3.1212633953750706e-05, + "loss": 0.7359, + "step": 6663 + }, + { + "epoch": 3.7586012408347433, + "grad_norm": 1.0611982345581055, + "learning_rate": 3.12098138747885e-05, + "loss": 0.7314, + "step": 6664 + }, + { + "epoch": 3.7591652566271856, + "grad_norm": 1.3389434814453125, + "learning_rate": 3.1206993795826284e-05, + "loss": 0.7531, + "step": 6665 + }, + { + "epoch": 3.759729272419628, + "grad_norm": 1.1783535480499268, + "learning_rate": 3.120417371686407e-05, + "loss": 0.8518, + "step": 6666 + }, + { + "epoch": 3.76029328821207, + "grad_norm": 1.4510366916656494, + "learning_rate": 3.120135363790186e-05, + "loss": 0.6992, + "step": 6667 + }, + { + "epoch": 3.760857304004512, + "grad_norm": 1.3054172992706299, + "learning_rate": 3.1198533558939654e-05, + "loss": 0.7739, + "step": 6668 + }, + { + "epoch": 3.761421319796954, + "grad_norm": 1.6530771255493164, + "learning_rate": 3.119571347997744e-05, + "loss": 0.8145, + "step": 6669 + }, + { + "epoch": 3.7619853355893964, + "grad_norm": 1.7304348945617676, + "learning_rate": 3.1192893401015225e-05, + "loss": 0.8694, + "step": 6670 + }, + { + "epoch": 3.7625493513818387, + "grad_norm": 1.163500189781189, + "learning_rate": 3.1190073322053024e-05, + "loss": 0.7784, + "step": 6671 + }, + { + "epoch": 3.763113367174281, + "grad_norm": 1.2576000690460205, + "learning_rate": 3.118725324309081e-05, + "loss": 0.7168, + "step": 6672 + }, + { + "epoch": 3.763677382966723, + "grad_norm": 1.8481848239898682, + "learning_rate": 3.1184433164128594e-05, + "loss": 0.7366, + "step": 6673 + }, + { + "epoch": 3.7642413987591654, + "grad_norm": 1.2169092893600464, + "learning_rate": 3.1181613085166387e-05, + "loss": 0.7158, + "step": 6674 + }, + { + "epoch": 3.764805414551607, + "grad_norm": 1.1882110834121704, + "learning_rate": 3.117879300620418e-05, + "loss": 0.7214, + "step": 6675 + }, + { + "epoch": 3.7653694303440495, + "grad_norm": 1.5902564525604248, + "learning_rate": 3.1175972927241964e-05, + "loss": 0.8286, + "step": 6676 + }, + { + "epoch": 3.7659334461364917, + "grad_norm": 1.5421096086502075, + "learning_rate": 3.117315284827975e-05, + "loss": 0.7633, + "step": 6677 + }, + { + "epoch": 3.766497461928934, + "grad_norm": 1.2159290313720703, + "learning_rate": 3.117033276931754e-05, + "loss": 0.804, + "step": 6678 + }, + { + "epoch": 3.7670614777213762, + "grad_norm": 1.2964078187942505, + "learning_rate": 3.1167512690355334e-05, + "loss": 0.7862, + "step": 6679 + }, + { + "epoch": 3.7676254935138185, + "grad_norm": 3.49359393119812, + "learning_rate": 3.116469261139312e-05, + "loss": 0.875, + "step": 6680 + }, + { + "epoch": 3.7681895093062607, + "grad_norm": 1.4943416118621826, + "learning_rate": 3.1161872532430905e-05, + "loss": 0.9043, + "step": 6681 + }, + { + "epoch": 3.7687535250987025, + "grad_norm": 1.3048430681228638, + "learning_rate": 3.1159052453468704e-05, + "loss": 0.7249, + "step": 6682 + }, + { + "epoch": 3.769317540891145, + "grad_norm": 1.746619462966919, + "learning_rate": 3.115623237450649e-05, + "loss": 0.834, + "step": 6683 + }, + { + "epoch": 3.769881556683587, + "grad_norm": 1.609586238861084, + "learning_rate": 3.1153412295544275e-05, + "loss": 0.793, + "step": 6684 + }, + { + "epoch": 3.7704455724760293, + "grad_norm": 1.3224812746047974, + "learning_rate": 3.115059221658207e-05, + "loss": 0.7191, + "step": 6685 + }, + { + "epoch": 3.7710095882684715, + "grad_norm": 1.6042499542236328, + "learning_rate": 3.114777213761986e-05, + "loss": 0.8006, + "step": 6686 + }, + { + "epoch": 3.771573604060914, + "grad_norm": 1.1019542217254639, + "learning_rate": 3.1144952058657644e-05, + "loss": 0.6794, + "step": 6687 + }, + { + "epoch": 3.772137619853356, + "grad_norm": 1.271060824394226, + "learning_rate": 3.114213197969543e-05, + "loss": 0.7118, + "step": 6688 + }, + { + "epoch": 3.772701635645798, + "grad_norm": 1.3279904127120972, + "learning_rate": 3.113931190073322e-05, + "loss": 0.7448, + "step": 6689 + }, + { + "epoch": 3.77326565143824, + "grad_norm": 1.1614627838134766, + "learning_rate": 3.1136491821771014e-05, + "loss": 0.6763, + "step": 6690 + }, + { + "epoch": 3.7738296672306824, + "grad_norm": 2.1797235012054443, + "learning_rate": 3.11336717428088e-05, + "loss": 0.7354, + "step": 6691 + }, + { + "epoch": 3.7743936830231246, + "grad_norm": 1.1863280534744263, + "learning_rate": 3.113085166384659e-05, + "loss": 0.7403, + "step": 6692 + }, + { + "epoch": 3.774957698815567, + "grad_norm": 2.0659096240997314, + "learning_rate": 3.112803158488438e-05, + "loss": 0.8621, + "step": 6693 + }, + { + "epoch": 3.775521714608009, + "grad_norm": 1.2219096422195435, + "learning_rate": 3.112521150592217e-05, + "loss": 0.7581, + "step": 6694 + }, + { + "epoch": 3.7760857304004514, + "grad_norm": 0.9514010548591614, + "learning_rate": 3.1122391426959955e-05, + "loss": 0.7285, + "step": 6695 + }, + { + "epoch": 3.776649746192893, + "grad_norm": 1.8123611211776733, + "learning_rate": 3.111957134799775e-05, + "loss": 0.8843, + "step": 6696 + }, + { + "epoch": 3.7772137619853354, + "grad_norm": 1.27156662940979, + "learning_rate": 3.111675126903553e-05, + "loss": 0.8419, + "step": 6697 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 1.9348864555358887, + "learning_rate": 3.1113931190073324e-05, + "loss": 0.8128, + "step": 6698 + }, + { + "epoch": 3.77834179357022, + "grad_norm": 0.9992871880531311, + "learning_rate": 3.111111111111111e-05, + "loss": 0.6765, + "step": 6699 + }, + { + "epoch": 3.778905809362662, + "grad_norm": 1.1430397033691406, + "learning_rate": 3.11082910321489e-05, + "loss": 0.7173, + "step": 6700 + }, + { + "epoch": 3.7794698251551044, + "grad_norm": 1.748494029045105, + "learning_rate": 3.110547095318669e-05, + "loss": 0.807, + "step": 6701 + }, + { + "epoch": 3.7800338409475467, + "grad_norm": 2.2117438316345215, + "learning_rate": 3.110265087422448e-05, + "loss": 0.8332, + "step": 6702 + }, + { + "epoch": 3.7805978567399885, + "grad_norm": 1.2268577814102173, + "learning_rate": 3.109983079526227e-05, + "loss": 0.6666, + "step": 6703 + }, + { + "epoch": 3.7811618725324307, + "grad_norm": 1.2543580532073975, + "learning_rate": 3.109701071630006e-05, + "loss": 0.6681, + "step": 6704 + }, + { + "epoch": 3.781725888324873, + "grad_norm": 1.1560534238815308, + "learning_rate": 3.109419063733784e-05, + "loss": 0.7179, + "step": 6705 + }, + { + "epoch": 3.7822899041173152, + "grad_norm": 1.3548616170883179, + "learning_rate": 3.1091370558375635e-05, + "loss": 0.7288, + "step": 6706 + }, + { + "epoch": 3.7828539199097575, + "grad_norm": 1.2742669582366943, + "learning_rate": 3.108855047941343e-05, + "loss": 0.763, + "step": 6707 + }, + { + "epoch": 3.7834179357021998, + "grad_norm": 1.4947097301483154, + "learning_rate": 3.108573040045121e-05, + "loss": 0.7401, + "step": 6708 + }, + { + "epoch": 3.783981951494642, + "grad_norm": 0.9418726563453674, + "learning_rate": 3.1082910321489e-05, + "loss": 0.7055, + "step": 6709 + }, + { + "epoch": 3.784545967287084, + "grad_norm": 1.4864559173583984, + "learning_rate": 3.10800902425268e-05, + "loss": 0.7242, + "step": 6710 + }, + { + "epoch": 3.785109983079526, + "grad_norm": 2.3468620777130127, + "learning_rate": 3.107727016356458e-05, + "loss": 0.9107, + "step": 6711 + }, + { + "epoch": 3.7856739988719683, + "grad_norm": 1.6345449686050415, + "learning_rate": 3.107445008460237e-05, + "loss": 0.8303, + "step": 6712 + }, + { + "epoch": 3.7862380146644106, + "grad_norm": 5.314533710479736, + "learning_rate": 3.107163000564016e-05, + "loss": 0.8705, + "step": 6713 + }, + { + "epoch": 3.786802030456853, + "grad_norm": 1.748913049697876, + "learning_rate": 3.106880992667795e-05, + "loss": 0.8399, + "step": 6714 + }, + { + "epoch": 3.787366046249295, + "grad_norm": 1.5384221076965332, + "learning_rate": 3.106598984771574e-05, + "loss": 0.8138, + "step": 6715 + }, + { + "epoch": 3.7879300620417373, + "grad_norm": 1.2314796447753906, + "learning_rate": 3.106316976875352e-05, + "loss": 0.6705, + "step": 6716 + }, + { + "epoch": 3.788494077834179, + "grad_norm": 1.8613982200622559, + "learning_rate": 3.1060349689791315e-05, + "loss": 0.7639, + "step": 6717 + }, + { + "epoch": 3.7890580936266214, + "grad_norm": 1.657619595527649, + "learning_rate": 3.105752961082911e-05, + "loss": 0.836, + "step": 6718 + }, + { + "epoch": 3.7896221094190636, + "grad_norm": 1.1087048053741455, + "learning_rate": 3.105470953186689e-05, + "loss": 0.8266, + "step": 6719 + }, + { + "epoch": 3.790186125211506, + "grad_norm": 1.2291077375411987, + "learning_rate": 3.105188945290468e-05, + "loss": 0.8247, + "step": 6720 + }, + { + "epoch": 3.790750141003948, + "grad_norm": 1.0241081714630127, + "learning_rate": 3.104906937394248e-05, + "loss": 0.693, + "step": 6721 + }, + { + "epoch": 3.7913141567963904, + "grad_norm": 1.3007277250289917, + "learning_rate": 3.104624929498026e-05, + "loss": 0.657, + "step": 6722 + }, + { + "epoch": 3.7918781725888326, + "grad_norm": 1.0237573385238647, + "learning_rate": 3.104342921601805e-05, + "loss": 0.7973, + "step": 6723 + }, + { + "epoch": 3.7924421883812744, + "grad_norm": 1.2429959774017334, + "learning_rate": 3.104060913705584e-05, + "loss": 0.8092, + "step": 6724 + }, + { + "epoch": 3.7930062041737167, + "grad_norm": 1.4766446352005005, + "learning_rate": 3.103778905809363e-05, + "loss": 0.7253, + "step": 6725 + }, + { + "epoch": 3.793570219966159, + "grad_norm": 1.1975899934768677, + "learning_rate": 3.103496897913142e-05, + "loss": 0.6771, + "step": 6726 + }, + { + "epoch": 3.794134235758601, + "grad_norm": 1.7223656177520752, + "learning_rate": 3.10321489001692e-05, + "loss": 0.8269, + "step": 6727 + }, + { + "epoch": 3.7946982515510435, + "grad_norm": 1.6200143098831177, + "learning_rate": 3.1029328821206995e-05, + "loss": 0.8362, + "step": 6728 + }, + { + "epoch": 3.7952622673434857, + "grad_norm": 1.4463282823562622, + "learning_rate": 3.102650874224479e-05, + "loss": 0.7321, + "step": 6729 + }, + { + "epoch": 3.795826283135928, + "grad_norm": 2.522980213165283, + "learning_rate": 3.102368866328257e-05, + "loss": 0.9105, + "step": 6730 + }, + { + "epoch": 3.7963902989283698, + "grad_norm": 0.9215388894081116, + "learning_rate": 3.1020868584320365e-05, + "loss": 0.6628, + "step": 6731 + }, + { + "epoch": 3.796954314720812, + "grad_norm": 1.2014102935791016, + "learning_rate": 3.101804850535815e-05, + "loss": 0.6774, + "step": 6732 + }, + { + "epoch": 3.7975183305132543, + "grad_norm": 1.4084265232086182, + "learning_rate": 3.101522842639594e-05, + "loss": 0.8563, + "step": 6733 + }, + { + "epoch": 3.7980823463056965, + "grad_norm": 1.7437026500701904, + "learning_rate": 3.101240834743373e-05, + "loss": 0.8364, + "step": 6734 + }, + { + "epoch": 3.7986463620981388, + "grad_norm": 1.1930400133132935, + "learning_rate": 3.100958826847152e-05, + "loss": 0.8113, + "step": 6735 + }, + { + "epoch": 3.799210377890581, + "grad_norm": 1.12479567527771, + "learning_rate": 3.1006768189509306e-05, + "loss": 0.7865, + "step": 6736 + }, + { + "epoch": 3.7997743936830233, + "grad_norm": 1.0760301351547241, + "learning_rate": 3.10039481105471e-05, + "loss": 0.595, + "step": 6737 + }, + { + "epoch": 3.800338409475465, + "grad_norm": 1.0987327098846436, + "learning_rate": 3.100112803158488e-05, + "loss": 0.7414, + "step": 6738 + }, + { + "epoch": 3.8009024252679073, + "grad_norm": 1.483599066734314, + "learning_rate": 3.0998307952622675e-05, + "loss": 0.7674, + "step": 6739 + }, + { + "epoch": 3.8014664410603496, + "grad_norm": 1.6363860368728638, + "learning_rate": 3.099548787366046e-05, + "loss": 0.8156, + "step": 6740 + }, + { + "epoch": 3.802030456852792, + "grad_norm": 0.9917011857032776, + "learning_rate": 3.099266779469825e-05, + "loss": 0.8005, + "step": 6741 + }, + { + "epoch": 3.802594472645234, + "grad_norm": 1.6908448934555054, + "learning_rate": 3.0989847715736045e-05, + "loss": 0.7396, + "step": 6742 + }, + { + "epoch": 3.8031584884376763, + "grad_norm": 1.2496519088745117, + "learning_rate": 3.098702763677383e-05, + "loss": 0.7447, + "step": 6743 + }, + { + "epoch": 3.8037225042301186, + "grad_norm": 0.9287325739860535, + "learning_rate": 3.0984207557811616e-05, + "loss": 0.6674, + "step": 6744 + }, + { + "epoch": 3.8042865200225604, + "grad_norm": 1.0139328241348267, + "learning_rate": 3.098138747884941e-05, + "loss": 0.6884, + "step": 6745 + }, + { + "epoch": 3.8048505358150027, + "grad_norm": 1.1052234172821045, + "learning_rate": 3.09785673998872e-05, + "loss": 0.7227, + "step": 6746 + }, + { + "epoch": 3.805414551607445, + "grad_norm": 1.0273624658584595, + "learning_rate": 3.0975747320924986e-05, + "loss": 0.6664, + "step": 6747 + }, + { + "epoch": 3.805978567399887, + "grad_norm": 1.3958687782287598, + "learning_rate": 3.097292724196278e-05, + "loss": 0.6738, + "step": 6748 + }, + { + "epoch": 3.8065425831923294, + "grad_norm": 1.2177811861038208, + "learning_rate": 3.097010716300057e-05, + "loss": 0.6532, + "step": 6749 + }, + { + "epoch": 3.8071065989847717, + "grad_norm": 1.7657595872879028, + "learning_rate": 3.0967287084038355e-05, + "loss": 0.7748, + "step": 6750 + }, + { + "epoch": 3.807670614777214, + "grad_norm": 1.1842401027679443, + "learning_rate": 3.096446700507614e-05, + "loss": 0.6411, + "step": 6751 + }, + { + "epoch": 3.8082346305696557, + "grad_norm": 1.4424562454223633, + "learning_rate": 3.096164692611393e-05, + "loss": 0.7692, + "step": 6752 + }, + { + "epoch": 3.808798646362098, + "grad_norm": 0.950943112373352, + "learning_rate": 3.0958826847151725e-05, + "loss": 0.7172, + "step": 6753 + }, + { + "epoch": 3.8093626621545402, + "grad_norm": 1.5396703481674194, + "learning_rate": 3.095600676818951e-05, + "loss": 0.8053, + "step": 6754 + }, + { + "epoch": 3.8099266779469825, + "grad_norm": 1.039280891418457, + "learning_rate": 3.0953186689227296e-05, + "loss": 0.7235, + "step": 6755 + }, + { + "epoch": 3.8104906937394247, + "grad_norm": 1.505823016166687, + "learning_rate": 3.095036661026509e-05, + "loss": 0.8004, + "step": 6756 + }, + { + "epoch": 3.811054709531867, + "grad_norm": 1.0417029857635498, + "learning_rate": 3.094754653130288e-05, + "loss": 0.7578, + "step": 6757 + }, + { + "epoch": 3.8116187253243092, + "grad_norm": 1.0101059675216675, + "learning_rate": 3.0944726452340666e-05, + "loss": 0.8062, + "step": 6758 + }, + { + "epoch": 3.812182741116751, + "grad_norm": 1.5209729671478271, + "learning_rate": 3.094190637337845e-05, + "loss": 0.8448, + "step": 6759 + }, + { + "epoch": 3.8127467569091937, + "grad_norm": 2.2491912841796875, + "learning_rate": 3.093908629441625e-05, + "loss": 0.8042, + "step": 6760 + }, + { + "epoch": 3.8133107727016355, + "grad_norm": 1.0749465227127075, + "learning_rate": 3.0936266215454036e-05, + "loss": 0.7433, + "step": 6761 + }, + { + "epoch": 3.813874788494078, + "grad_norm": 1.4562424421310425, + "learning_rate": 3.093344613649182e-05, + "loss": 0.7497, + "step": 6762 + }, + { + "epoch": 3.81443880428652, + "grad_norm": 1.1740468740463257, + "learning_rate": 3.093062605752961e-05, + "loss": 0.8288, + "step": 6763 + }, + { + "epoch": 3.8150028200789623, + "grad_norm": 1.507620930671692, + "learning_rate": 3.0927805978567405e-05, + "loss": 0.684, + "step": 6764 + }, + { + "epoch": 3.8155668358714045, + "grad_norm": 1.4858869314193726, + "learning_rate": 3.092498589960519e-05, + "loss": 0.7777, + "step": 6765 + }, + { + "epoch": 3.8161308516638464, + "grad_norm": 1.1119794845581055, + "learning_rate": 3.0922165820642976e-05, + "loss": 0.7475, + "step": 6766 + }, + { + "epoch": 3.816694867456289, + "grad_norm": 1.096927523612976, + "learning_rate": 3.091934574168077e-05, + "loss": 0.7377, + "step": 6767 + }, + { + "epoch": 3.817258883248731, + "grad_norm": 0.9728274941444397, + "learning_rate": 3.091652566271856e-05, + "loss": 0.7103, + "step": 6768 + }, + { + "epoch": 3.817822899041173, + "grad_norm": 1.1296411752700806, + "learning_rate": 3.0913705583756346e-05, + "loss": 0.6909, + "step": 6769 + }, + { + "epoch": 3.8183869148336154, + "grad_norm": 1.3405187129974365, + "learning_rate": 3.091088550479414e-05, + "loss": 0.7336, + "step": 6770 + }, + { + "epoch": 3.8189509306260576, + "grad_norm": 1.4114357233047485, + "learning_rate": 3.0908065425831924e-05, + "loss": 0.8085, + "step": 6771 + }, + { + "epoch": 3.8195149464185, + "grad_norm": 1.6218007802963257, + "learning_rate": 3.0905245346869716e-05, + "loss": 0.7778, + "step": 6772 + }, + { + "epoch": 3.8200789622109417, + "grad_norm": 1.5732039213180542, + "learning_rate": 3.09024252679075e-05, + "loss": 0.7146, + "step": 6773 + }, + { + "epoch": 3.8206429780033844, + "grad_norm": 1.8068606853485107, + "learning_rate": 3.089960518894529e-05, + "loss": 0.8165, + "step": 6774 + }, + { + "epoch": 3.821206993795826, + "grad_norm": 1.7282310724258423, + "learning_rate": 3.089678510998308e-05, + "loss": 0.8515, + "step": 6775 + }, + { + "epoch": 3.8217710095882684, + "grad_norm": 1.290687084197998, + "learning_rate": 3.089396503102087e-05, + "loss": 0.7846, + "step": 6776 + }, + { + "epoch": 3.8223350253807107, + "grad_norm": 1.4648017883300781, + "learning_rate": 3.0891144952058656e-05, + "loss": 0.771, + "step": 6777 + }, + { + "epoch": 3.822899041173153, + "grad_norm": 1.6376073360443115, + "learning_rate": 3.088832487309645e-05, + "loss": 0.7188, + "step": 6778 + }, + { + "epoch": 3.823463056965595, + "grad_norm": 1.2829148769378662, + "learning_rate": 3.0885504794134234e-05, + "loss": 0.7362, + "step": 6779 + }, + { + "epoch": 3.824027072758037, + "grad_norm": 1.2179867029190063, + "learning_rate": 3.0882684715172026e-05, + "loss": 0.7896, + "step": 6780 + }, + { + "epoch": 3.8245910885504797, + "grad_norm": 1.2191007137298584, + "learning_rate": 3.087986463620982e-05, + "loss": 0.7077, + "step": 6781 + }, + { + "epoch": 3.8251551043429215, + "grad_norm": 1.082360029220581, + "learning_rate": 3.0877044557247604e-05, + "loss": 0.7331, + "step": 6782 + }, + { + "epoch": 3.8257191201353637, + "grad_norm": 1.4956587553024292, + "learning_rate": 3.0874224478285396e-05, + "loss": 0.8138, + "step": 6783 + }, + { + "epoch": 3.826283135927806, + "grad_norm": 1.2645213603973389, + "learning_rate": 3.087140439932318e-05, + "loss": 0.6899, + "step": 6784 + }, + { + "epoch": 3.8268471517202483, + "grad_norm": 1.7249737977981567, + "learning_rate": 3.0868584320360973e-05, + "loss": 0.8329, + "step": 6785 + }, + { + "epoch": 3.8274111675126905, + "grad_norm": 1.5681195259094238, + "learning_rate": 3.086576424139876e-05, + "loss": 0.7925, + "step": 6786 + }, + { + "epoch": 3.8279751833051323, + "grad_norm": 1.3611207008361816, + "learning_rate": 3.086294416243655e-05, + "loss": 0.6665, + "step": 6787 + }, + { + "epoch": 3.828539199097575, + "grad_norm": 1.0530575513839722, + "learning_rate": 3.086012408347434e-05, + "loss": 0.6558, + "step": 6788 + }, + { + "epoch": 3.829103214890017, + "grad_norm": 1.1587270498275757, + "learning_rate": 3.085730400451213e-05, + "loss": 0.6487, + "step": 6789 + }, + { + "epoch": 3.829667230682459, + "grad_norm": 1.965791940689087, + "learning_rate": 3.0854483925549914e-05, + "loss": 0.7618, + "step": 6790 + }, + { + "epoch": 3.8302312464749013, + "grad_norm": 1.5806469917297363, + "learning_rate": 3.0851663846587706e-05, + "loss": 0.9244, + "step": 6791 + }, + { + "epoch": 3.8307952622673436, + "grad_norm": 0.8677186965942383, + "learning_rate": 3.08488437676255e-05, + "loss": 0.6986, + "step": 6792 + }, + { + "epoch": 3.831359278059786, + "grad_norm": 1.0127888917922974, + "learning_rate": 3.0846023688663284e-05, + "loss": 0.7489, + "step": 6793 + }, + { + "epoch": 3.8319232938522276, + "grad_norm": 0.9743685722351074, + "learning_rate": 3.084320360970107e-05, + "loss": 0.7203, + "step": 6794 + }, + { + "epoch": 3.8324873096446703, + "grad_norm": 1.6616885662078857, + "learning_rate": 3.084038353073886e-05, + "loss": 0.775, + "step": 6795 + }, + { + "epoch": 3.833051325437112, + "grad_norm": 2.07399320602417, + "learning_rate": 3.0837563451776654e-05, + "loss": 0.8416, + "step": 6796 + }, + { + "epoch": 3.8336153412295544, + "grad_norm": 1.7480686902999878, + "learning_rate": 3.083474337281444e-05, + "loss": 0.8535, + "step": 6797 + }, + { + "epoch": 3.8341793570219966, + "grad_norm": 0.8611979484558105, + "learning_rate": 3.0831923293852224e-05, + "loss": 0.6375, + "step": 6798 + }, + { + "epoch": 3.834743372814439, + "grad_norm": 1.0643882751464844, + "learning_rate": 3.0829103214890023e-05, + "loss": 0.7336, + "step": 6799 + }, + { + "epoch": 3.835307388606881, + "grad_norm": 1.6201846599578857, + "learning_rate": 3.082628313592781e-05, + "loss": 0.7034, + "step": 6800 + }, + { + "epoch": 3.835871404399323, + "grad_norm": 2.4318363666534424, + "learning_rate": 3.0823463056965594e-05, + "loss": 0.7468, + "step": 6801 + }, + { + "epoch": 3.8364354201917656, + "grad_norm": 1.0223522186279297, + "learning_rate": 3.0820642978003386e-05, + "loss": 0.7036, + "step": 6802 + }, + { + "epoch": 3.8369994359842075, + "grad_norm": 1.235891580581665, + "learning_rate": 3.081782289904118e-05, + "loss": 0.6993, + "step": 6803 + }, + { + "epoch": 3.8375634517766497, + "grad_norm": 1.0576130151748657, + "learning_rate": 3.0815002820078964e-05, + "loss": 0.7414, + "step": 6804 + }, + { + "epoch": 3.838127467569092, + "grad_norm": 1.0357606410980225, + "learning_rate": 3.081218274111675e-05, + "loss": 0.7962, + "step": 6805 + }, + { + "epoch": 3.838691483361534, + "grad_norm": 2.132061243057251, + "learning_rate": 3.080936266215454e-05, + "loss": 0.7514, + "step": 6806 + }, + { + "epoch": 3.8392554991539765, + "grad_norm": 1.0598024129867554, + "learning_rate": 3.0806542583192334e-05, + "loss": 0.8035, + "step": 6807 + }, + { + "epoch": 3.8398195149464183, + "grad_norm": 1.79752516746521, + "learning_rate": 3.080372250423012e-05, + "loss": 0.8775, + "step": 6808 + }, + { + "epoch": 3.840383530738861, + "grad_norm": 0.929111123085022, + "learning_rate": 3.0800902425267905e-05, + "loss": 0.6995, + "step": 6809 + }, + { + "epoch": 3.8409475465313028, + "grad_norm": 1.014265775680542, + "learning_rate": 3.07980823463057e-05, + "loss": 0.7057, + "step": 6810 + }, + { + "epoch": 3.841511562323745, + "grad_norm": 2.260734796524048, + "learning_rate": 3.079526226734349e-05, + "loss": 0.7662, + "step": 6811 + }, + { + "epoch": 3.8420755781161873, + "grad_norm": 1.2435506582260132, + "learning_rate": 3.0792442188381274e-05, + "loss": 0.8649, + "step": 6812 + }, + { + "epoch": 3.8426395939086295, + "grad_norm": 1.3306143283843994, + "learning_rate": 3.0789622109419067e-05, + "loss": 0.8149, + "step": 6813 + }, + { + "epoch": 3.8432036097010718, + "grad_norm": 1.0177499055862427, + "learning_rate": 3.078680203045685e-05, + "loss": 0.801, + "step": 6814 + }, + { + "epoch": 3.8437676254935136, + "grad_norm": 2.187274694442749, + "learning_rate": 3.0783981951494644e-05, + "loss": 0.8488, + "step": 6815 + }, + { + "epoch": 3.8443316412859563, + "grad_norm": 1.2267409563064575, + "learning_rate": 3.078116187253243e-05, + "loss": 0.7075, + "step": 6816 + }, + { + "epoch": 3.844895657078398, + "grad_norm": 1.2118709087371826, + "learning_rate": 3.077834179357022e-05, + "loss": 0.7493, + "step": 6817 + }, + { + "epoch": 3.8454596728708403, + "grad_norm": 1.8025586605072021, + "learning_rate": 3.0775521714608014e-05, + "loss": 0.7143, + "step": 6818 + }, + { + "epoch": 3.8460236886632826, + "grad_norm": 1.7506352663040161, + "learning_rate": 3.07727016356458e-05, + "loss": 0.8052, + "step": 6819 + }, + { + "epoch": 3.846587704455725, + "grad_norm": 1.3254035711288452, + "learning_rate": 3.076988155668359e-05, + "loss": 0.8108, + "step": 6820 + }, + { + "epoch": 3.847151720248167, + "grad_norm": 1.5642756223678589, + "learning_rate": 3.076706147772138e-05, + "loss": 0.7016, + "step": 6821 + }, + { + "epoch": 3.847715736040609, + "grad_norm": 1.4956212043762207, + "learning_rate": 3.076424139875917e-05, + "loss": 0.7794, + "step": 6822 + }, + { + "epoch": 3.8482797518330516, + "grad_norm": 0.9871727228164673, + "learning_rate": 3.0761421319796955e-05, + "loss": 0.699, + "step": 6823 + }, + { + "epoch": 3.8488437676254934, + "grad_norm": 1.0900640487670898, + "learning_rate": 3.075860124083475e-05, + "loss": 0.827, + "step": 6824 + }, + { + "epoch": 3.8494077834179357, + "grad_norm": 1.4871851205825806, + "learning_rate": 3.075578116187253e-05, + "loss": 0.7564, + "step": 6825 + }, + { + "epoch": 3.849971799210378, + "grad_norm": 1.0719845294952393, + "learning_rate": 3.0752961082910324e-05, + "loss": 0.7896, + "step": 6826 + }, + { + "epoch": 3.85053581500282, + "grad_norm": 1.1505728960037231, + "learning_rate": 3.075014100394811e-05, + "loss": 0.7536, + "step": 6827 + }, + { + "epoch": 3.8510998307952624, + "grad_norm": 1.0959137678146362, + "learning_rate": 3.07473209249859e-05, + "loss": 0.6546, + "step": 6828 + }, + { + "epoch": 3.851663846587704, + "grad_norm": 3.0320587158203125, + "learning_rate": 3.074450084602369e-05, + "loss": 0.8107, + "step": 6829 + }, + { + "epoch": 3.852227862380147, + "grad_norm": 0.7980020642280579, + "learning_rate": 3.074168076706148e-05, + "loss": 0.6449, + "step": 6830 + }, + { + "epoch": 3.8527918781725887, + "grad_norm": 1.241891860961914, + "learning_rate": 3.073886068809927e-05, + "loss": 0.7279, + "step": 6831 + }, + { + "epoch": 3.853355893965031, + "grad_norm": 1.1279773712158203, + "learning_rate": 3.073604060913706e-05, + "loss": 0.7568, + "step": 6832 + }, + { + "epoch": 3.8539199097574732, + "grad_norm": 2.1646130084991455, + "learning_rate": 3.073322053017484e-05, + "loss": 0.7731, + "step": 6833 + }, + { + "epoch": 3.8544839255499155, + "grad_norm": 1.6317425966262817, + "learning_rate": 3.0730400451212635e-05, + "loss": 0.7032, + "step": 6834 + }, + { + "epoch": 3.8550479413423577, + "grad_norm": 1.6372588872909546, + "learning_rate": 3.072758037225043e-05, + "loss": 0.7467, + "step": 6835 + }, + { + "epoch": 3.8556119571347995, + "grad_norm": 1.014786720275879, + "learning_rate": 3.072476029328821e-05, + "loss": 0.6678, + "step": 6836 + }, + { + "epoch": 3.8561759729272422, + "grad_norm": 1.6960575580596924, + "learning_rate": 3.0721940214326e-05, + "loss": 0.8224, + "step": 6837 + }, + { + "epoch": 3.856739988719684, + "grad_norm": 0.9611683487892151, + "learning_rate": 3.07191201353638e-05, + "loss": 0.7868, + "step": 6838 + }, + { + "epoch": 3.8573040045121263, + "grad_norm": 1.8541682958602905, + "learning_rate": 3.071630005640158e-05, + "loss": 0.8196, + "step": 6839 + }, + { + "epoch": 3.8578680203045685, + "grad_norm": 1.2574551105499268, + "learning_rate": 3.071347997743937e-05, + "loss": 0.7593, + "step": 6840 + }, + { + "epoch": 3.858432036097011, + "grad_norm": 1.3956458568572998, + "learning_rate": 3.071065989847716e-05, + "loss": 0.8235, + "step": 6841 + }, + { + "epoch": 3.858996051889453, + "grad_norm": 1.6937358379364014, + "learning_rate": 3.070783981951495e-05, + "loss": 0.8308, + "step": 6842 + }, + { + "epoch": 3.859560067681895, + "grad_norm": 1.0382546186447144, + "learning_rate": 3.070501974055274e-05, + "loss": 0.5883, + "step": 6843 + }, + { + "epoch": 3.8601240834743376, + "grad_norm": 1.4563781023025513, + "learning_rate": 3.070219966159052e-05, + "loss": 0.7754, + "step": 6844 + }, + { + "epoch": 3.8606880992667794, + "grad_norm": 1.343794584274292, + "learning_rate": 3.0699379582628315e-05, + "loss": 0.74, + "step": 6845 + }, + { + "epoch": 3.8612521150592216, + "grad_norm": 2.285032033920288, + "learning_rate": 3.069655950366611e-05, + "loss": 0.9112, + "step": 6846 + }, + { + "epoch": 3.861816130851664, + "grad_norm": 1.1627177000045776, + "learning_rate": 3.069373942470389e-05, + "loss": 0.7556, + "step": 6847 + }, + { + "epoch": 3.862380146644106, + "grad_norm": 1.1437124013900757, + "learning_rate": 3.069091934574168e-05, + "loss": 0.8138, + "step": 6848 + }, + { + "epoch": 3.8629441624365484, + "grad_norm": 1.0486360788345337, + "learning_rate": 3.068809926677947e-05, + "loss": 0.7728, + "step": 6849 + }, + { + "epoch": 3.86350817822899, + "grad_norm": 1.482807993888855, + "learning_rate": 3.068527918781726e-05, + "loss": 0.7445, + "step": 6850 + }, + { + "epoch": 3.864072194021433, + "grad_norm": 1.3478472232818604, + "learning_rate": 3.068245910885505e-05, + "loss": 0.784, + "step": 6851 + }, + { + "epoch": 3.8646362098138747, + "grad_norm": 0.8551536798477173, + "learning_rate": 3.067963902989284e-05, + "loss": 0.6331, + "step": 6852 + }, + { + "epoch": 3.865200225606317, + "grad_norm": 1.0139349699020386, + "learning_rate": 3.067681895093063e-05, + "loss": 0.8431, + "step": 6853 + }, + { + "epoch": 3.865764241398759, + "grad_norm": 2.0413646697998047, + "learning_rate": 3.067399887196842e-05, + "loss": 0.8748, + "step": 6854 + }, + { + "epoch": 3.8663282571912014, + "grad_norm": 1.0224305391311646, + "learning_rate": 3.06711787930062e-05, + "loss": 0.6568, + "step": 6855 + }, + { + "epoch": 3.8668922729836437, + "grad_norm": 1.235233187675476, + "learning_rate": 3.0668358714043995e-05, + "loss": 0.7748, + "step": 6856 + }, + { + "epoch": 3.8674562887760855, + "grad_norm": 4.0534987449646, + "learning_rate": 3.066553863508179e-05, + "loss": 0.8966, + "step": 6857 + }, + { + "epoch": 3.868020304568528, + "grad_norm": 1.2542767524719238, + "learning_rate": 3.066271855611957e-05, + "loss": 0.7341, + "step": 6858 + }, + { + "epoch": 3.86858432036097, + "grad_norm": 1.2078992128372192, + "learning_rate": 3.0659898477157365e-05, + "loss": 0.7705, + "step": 6859 + }, + { + "epoch": 3.8691483361534122, + "grad_norm": 0.986579418182373, + "learning_rate": 3.065707839819515e-05, + "loss": 0.7668, + "step": 6860 + }, + { + "epoch": 3.8697123519458545, + "grad_norm": 1.0717731714248657, + "learning_rate": 3.065425831923294e-05, + "loss": 0.6738, + "step": 6861 + }, + { + "epoch": 3.8702763677382968, + "grad_norm": 1.3841710090637207, + "learning_rate": 3.065143824027073e-05, + "loss": 0.7093, + "step": 6862 + }, + { + "epoch": 3.870840383530739, + "grad_norm": 1.2832789421081543, + "learning_rate": 3.064861816130852e-05, + "loss": 0.795, + "step": 6863 + }, + { + "epoch": 3.871404399323181, + "grad_norm": 1.7568562030792236, + "learning_rate": 3.0645798082346305e-05, + "loss": 0.7735, + "step": 6864 + }, + { + "epoch": 3.8719684151156235, + "grad_norm": 1.297277569770813, + "learning_rate": 3.06429780033841e-05, + "loss": 0.8022, + "step": 6865 + }, + { + "epoch": 3.8725324309080653, + "grad_norm": 1.5628490447998047, + "learning_rate": 3.064015792442188e-05, + "loss": 0.862, + "step": 6866 + }, + { + "epoch": 3.8730964467005076, + "grad_norm": 2.0147926807403564, + "learning_rate": 3.0637337845459675e-05, + "loss": 0.9932, + "step": 6867 + }, + { + "epoch": 3.87366046249295, + "grad_norm": 1.6376464366912842, + "learning_rate": 3.063451776649746e-05, + "loss": 0.7921, + "step": 6868 + }, + { + "epoch": 3.874224478285392, + "grad_norm": 1.348096251487732, + "learning_rate": 3.063169768753525e-05, + "loss": 0.749, + "step": 6869 + }, + { + "epoch": 3.8747884940778343, + "grad_norm": 1.6164910793304443, + "learning_rate": 3.0628877608573045e-05, + "loss": 0.8431, + "step": 6870 + }, + { + "epoch": 3.875352509870276, + "grad_norm": 1.2963062524795532, + "learning_rate": 3.062605752961083e-05, + "loss": 0.7632, + "step": 6871 + }, + { + "epoch": 3.875916525662719, + "grad_norm": 1.1243139505386353, + "learning_rate": 3.0623237450648616e-05, + "loss": 0.8111, + "step": 6872 + }, + { + "epoch": 3.8764805414551606, + "grad_norm": 0.9811694025993347, + "learning_rate": 3.062041737168641e-05, + "loss": 0.6768, + "step": 6873 + }, + { + "epoch": 3.877044557247603, + "grad_norm": 1.8843573331832886, + "learning_rate": 3.06175972927242e-05, + "loss": 0.8094, + "step": 6874 + }, + { + "epoch": 3.877608573040045, + "grad_norm": 0.9607158899307251, + "learning_rate": 3.0614777213761986e-05, + "loss": 0.6011, + "step": 6875 + }, + { + "epoch": 3.8781725888324874, + "grad_norm": 1.3236454725265503, + "learning_rate": 3.061195713479977e-05, + "loss": 0.7724, + "step": 6876 + }, + { + "epoch": 3.8787366046249296, + "grad_norm": 1.0697202682495117, + "learning_rate": 3.060913705583757e-05, + "loss": 0.638, + "step": 6877 + }, + { + "epoch": 3.8793006204173714, + "grad_norm": 1.6109479665756226, + "learning_rate": 3.0606316976875355e-05, + "loss": 0.813, + "step": 6878 + }, + { + "epoch": 3.879864636209814, + "grad_norm": 2.3178560733795166, + "learning_rate": 3.060349689791314e-05, + "loss": 0.7427, + "step": 6879 + }, + { + "epoch": 3.880428652002256, + "grad_norm": 1.5191659927368164, + "learning_rate": 3.060067681895093e-05, + "loss": 0.8266, + "step": 6880 + }, + { + "epoch": 3.880992667794698, + "grad_norm": 1.0358799695968628, + "learning_rate": 3.0597856739988725e-05, + "loss": 0.7739, + "step": 6881 + }, + { + "epoch": 3.8815566835871405, + "grad_norm": 1.1180998086929321, + "learning_rate": 3.059503666102651e-05, + "loss": 0.8433, + "step": 6882 + }, + { + "epoch": 3.8821206993795827, + "grad_norm": 1.6915525197982788, + "learning_rate": 3.0592216582064296e-05, + "loss": 0.8342, + "step": 6883 + }, + { + "epoch": 3.882684715172025, + "grad_norm": 2.282060384750366, + "learning_rate": 3.058939650310209e-05, + "loss": 0.7035, + "step": 6884 + }, + { + "epoch": 3.8832487309644668, + "grad_norm": 1.8028643131256104, + "learning_rate": 3.058657642413988e-05, + "loss": 0.8596, + "step": 6885 + }, + { + "epoch": 3.8838127467569095, + "grad_norm": 0.9282116889953613, + "learning_rate": 3.0583756345177666e-05, + "loss": 0.6695, + "step": 6886 + }, + { + "epoch": 3.8843767625493513, + "grad_norm": 5.880528926849365, + "learning_rate": 3.058093626621545e-05, + "loss": 0.7718, + "step": 6887 + }, + { + "epoch": 3.8849407783417935, + "grad_norm": 1.909151554107666, + "learning_rate": 3.057811618725324e-05, + "loss": 0.9987, + "step": 6888 + }, + { + "epoch": 3.8855047941342358, + "grad_norm": 1.3626011610031128, + "learning_rate": 3.0575296108291035e-05, + "loss": 0.7836, + "step": 6889 + }, + { + "epoch": 3.886068809926678, + "grad_norm": 1.1139105558395386, + "learning_rate": 3.057247602932882e-05, + "loss": 0.6915, + "step": 6890 + }, + { + "epoch": 3.8866328257191203, + "grad_norm": 1.2437654733657837, + "learning_rate": 3.056965595036661e-05, + "loss": 0.6942, + "step": 6891 + }, + { + "epoch": 3.887196841511562, + "grad_norm": 1.5807931423187256, + "learning_rate": 3.0566835871404405e-05, + "loss": 0.7888, + "step": 6892 + }, + { + "epoch": 3.887760857304005, + "grad_norm": 1.1242033243179321, + "learning_rate": 3.056401579244219e-05, + "loss": 0.6464, + "step": 6893 + }, + { + "epoch": 3.8883248730964466, + "grad_norm": 1.0532931089401245, + "learning_rate": 3.0561195713479976e-05, + "loss": 0.768, + "step": 6894 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.9771828651428223, + "learning_rate": 3.055837563451777e-05, + "loss": 0.6789, + "step": 6895 + }, + { + "epoch": 3.889452904681331, + "grad_norm": 1.5157989263534546, + "learning_rate": 3.055555555555556e-05, + "loss": 0.744, + "step": 6896 + }, + { + "epoch": 3.8900169204737733, + "grad_norm": 1.7777485847473145, + "learning_rate": 3.0552735476593346e-05, + "loss": 0.7774, + "step": 6897 + }, + { + "epoch": 3.8905809362662156, + "grad_norm": 1.5685886144638062, + "learning_rate": 3.054991539763114e-05, + "loss": 0.7834, + "step": 6898 + }, + { + "epoch": 3.8911449520586574, + "grad_norm": 1.8406261205673218, + "learning_rate": 3.0547095318668923e-05, + "loss": 0.8414, + "step": 6899 + }, + { + "epoch": 3.8917089678511, + "grad_norm": 1.1904776096343994, + "learning_rate": 3.0544275239706716e-05, + "loss": 0.83, + "step": 6900 + }, + { + "epoch": 3.892272983643542, + "grad_norm": 1.4952385425567627, + "learning_rate": 3.05414551607445e-05, + "loss": 0.7343, + "step": 6901 + }, + { + "epoch": 3.892836999435984, + "grad_norm": 1.039419412612915, + "learning_rate": 3.053863508178229e-05, + "loss": 0.7726, + "step": 6902 + }, + { + "epoch": 3.8934010152284264, + "grad_norm": 0.9466871619224548, + "learning_rate": 3.053581500282008e-05, + "loss": 0.707, + "step": 6903 + }, + { + "epoch": 3.8939650310208687, + "grad_norm": 1.022469401359558, + "learning_rate": 3.053299492385787e-05, + "loss": 0.7498, + "step": 6904 + }, + { + "epoch": 3.894529046813311, + "grad_norm": 1.3102562427520752, + "learning_rate": 3.0530174844895656e-05, + "loss": 0.6949, + "step": 6905 + }, + { + "epoch": 3.8950930626057527, + "grad_norm": 1.254721999168396, + "learning_rate": 3.052735476593345e-05, + "loss": 0.7378, + "step": 6906 + }, + { + "epoch": 3.8956570783981954, + "grad_norm": 1.1779370307922363, + "learning_rate": 3.0524534686971234e-05, + "loss": 0.7769, + "step": 6907 + }, + { + "epoch": 3.8962210941906372, + "grad_norm": 1.3215595483779907, + "learning_rate": 3.0521714608009026e-05, + "loss": 0.7726, + "step": 6908 + }, + { + "epoch": 3.8967851099830795, + "grad_norm": 1.9763262271881104, + "learning_rate": 3.051889452904682e-05, + "loss": 0.9372, + "step": 6909 + }, + { + "epoch": 3.8973491257755217, + "grad_norm": 1.6020554304122925, + "learning_rate": 3.0516074450084604e-05, + "loss": 0.7303, + "step": 6910 + }, + { + "epoch": 3.897913141567964, + "grad_norm": 0.8648461699485779, + "learning_rate": 3.051325437112239e-05, + "loss": 0.6787, + "step": 6911 + }, + { + "epoch": 3.8984771573604062, + "grad_norm": 1.1193490028381348, + "learning_rate": 3.0510434292160185e-05, + "loss": 0.7423, + "step": 6912 + }, + { + "epoch": 3.899041173152848, + "grad_norm": 1.1626516580581665, + "learning_rate": 3.050761421319797e-05, + "loss": 0.7561, + "step": 6913 + }, + { + "epoch": 3.8996051889452907, + "grad_norm": 1.1513994932174683, + "learning_rate": 3.050479413423576e-05, + "loss": 0.7608, + "step": 6914 + }, + { + "epoch": 3.9001692047377325, + "grad_norm": 1.0876692533493042, + "learning_rate": 3.0501974055273548e-05, + "loss": 0.7182, + "step": 6915 + }, + { + "epoch": 3.900733220530175, + "grad_norm": 1.0080355405807495, + "learning_rate": 3.049915397631134e-05, + "loss": 0.7543, + "step": 6916 + }, + { + "epoch": 3.901297236322617, + "grad_norm": 1.0764294862747192, + "learning_rate": 3.049633389734913e-05, + "loss": 0.8285, + "step": 6917 + }, + { + "epoch": 3.9018612521150593, + "grad_norm": 1.521127700805664, + "learning_rate": 3.0493513818386914e-05, + "loss": 0.7239, + "step": 6918 + }, + { + "epoch": 3.9024252679075015, + "grad_norm": 1.0011934041976929, + "learning_rate": 3.0490693739424703e-05, + "loss": 0.7707, + "step": 6919 + }, + { + "epoch": 3.9029892836999434, + "grad_norm": 1.1018027067184448, + "learning_rate": 3.0487873660462495e-05, + "loss": 0.7409, + "step": 6920 + }, + { + "epoch": 3.903553299492386, + "grad_norm": 1.3120427131652832, + "learning_rate": 3.0485053581500284e-05, + "loss": 0.7397, + "step": 6921 + }, + { + "epoch": 3.904117315284828, + "grad_norm": 1.8379539251327515, + "learning_rate": 3.0482233502538073e-05, + "loss": 0.8736, + "step": 6922 + }, + { + "epoch": 3.90468133107727, + "grad_norm": 1.2788363695144653, + "learning_rate": 3.0479413423575858e-05, + "loss": 0.7892, + "step": 6923 + }, + { + "epoch": 3.9052453468697124, + "grad_norm": 1.61379075050354, + "learning_rate": 3.0476593344613653e-05, + "loss": 0.8322, + "step": 6924 + }, + { + "epoch": 3.9058093626621546, + "grad_norm": 1.0795484781265259, + "learning_rate": 3.047377326565144e-05, + "loss": 0.7965, + "step": 6925 + }, + { + "epoch": 3.906373378454597, + "grad_norm": 1.505115032196045, + "learning_rate": 3.0470953186689228e-05, + "loss": 0.7554, + "step": 6926 + }, + { + "epoch": 3.9069373942470387, + "grad_norm": 0.9417849183082581, + "learning_rate": 3.046813310772702e-05, + "loss": 0.7013, + "step": 6927 + }, + { + "epoch": 3.9075014100394814, + "grad_norm": 1.4794522523880005, + "learning_rate": 3.046531302876481e-05, + "loss": 0.7633, + "step": 6928 + }, + { + "epoch": 3.908065425831923, + "grad_norm": 1.02364981174469, + "learning_rate": 3.0462492949802594e-05, + "loss": 0.6424, + "step": 6929 + }, + { + "epoch": 3.9086294416243654, + "grad_norm": 1.1307299137115479, + "learning_rate": 3.0459672870840383e-05, + "loss": 0.709, + "step": 6930 + }, + { + "epoch": 3.9091934574168077, + "grad_norm": 1.2521047592163086, + "learning_rate": 3.0456852791878175e-05, + "loss": 0.7801, + "step": 6931 + }, + { + "epoch": 3.90975747320925, + "grad_norm": 1.4469342231750488, + "learning_rate": 3.0454032712915964e-05, + "loss": 0.7967, + "step": 6932 + }, + { + "epoch": 3.910321489001692, + "grad_norm": 2.4241974353790283, + "learning_rate": 3.0451212633953753e-05, + "loss": 0.7655, + "step": 6933 + }, + { + "epoch": 3.910885504794134, + "grad_norm": 0.8674084544181824, + "learning_rate": 3.0448392554991538e-05, + "loss": 0.6894, + "step": 6934 + }, + { + "epoch": 3.9114495205865767, + "grad_norm": 3.366577386856079, + "learning_rate": 3.0445572476029334e-05, + "loss": 0.725, + "step": 6935 + }, + { + "epoch": 3.9120135363790185, + "grad_norm": 1.1423625946044922, + "learning_rate": 3.044275239706712e-05, + "loss": 0.6895, + "step": 6936 + }, + { + "epoch": 3.9125775521714607, + "grad_norm": 1.642510175704956, + "learning_rate": 3.0439932318104908e-05, + "loss": 0.8518, + "step": 6937 + }, + { + "epoch": 3.913141567963903, + "grad_norm": 1.2139848470687866, + "learning_rate": 3.0437112239142697e-05, + "loss": 0.7563, + "step": 6938 + }, + { + "epoch": 3.9137055837563453, + "grad_norm": 1.0516502857208252, + "learning_rate": 3.043429216018049e-05, + "loss": 0.6965, + "step": 6939 + }, + { + "epoch": 3.9142695995487875, + "grad_norm": 1.2906320095062256, + "learning_rate": 3.0431472081218278e-05, + "loss": 0.6896, + "step": 6940 + }, + { + "epoch": 3.9148336153412293, + "grad_norm": 0.9848943948745728, + "learning_rate": 3.0428652002256063e-05, + "loss": 0.671, + "step": 6941 + }, + { + "epoch": 3.915397631133672, + "grad_norm": 1.5811195373535156, + "learning_rate": 3.0425831923293852e-05, + "loss": 0.7887, + "step": 6942 + }, + { + "epoch": 3.915961646926114, + "grad_norm": 1.0981950759887695, + "learning_rate": 3.0423011844331644e-05, + "loss": 0.7074, + "step": 6943 + }, + { + "epoch": 3.916525662718556, + "grad_norm": 1.6338317394256592, + "learning_rate": 3.0420191765369433e-05, + "loss": 0.8769, + "step": 6944 + }, + { + "epoch": 3.9170896785109983, + "grad_norm": 1.1974008083343506, + "learning_rate": 3.041737168640722e-05, + "loss": 0.7193, + "step": 6945 + }, + { + "epoch": 3.9176536943034406, + "grad_norm": 2.1619315147399902, + "learning_rate": 3.0414551607445007e-05, + "loss": 0.8712, + "step": 6946 + }, + { + "epoch": 3.918217710095883, + "grad_norm": 1.0411609411239624, + "learning_rate": 3.04117315284828e-05, + "loss": 0.7909, + "step": 6947 + }, + { + "epoch": 3.9187817258883246, + "grad_norm": 1.271402359008789, + "learning_rate": 3.0408911449520588e-05, + "loss": 0.8351, + "step": 6948 + }, + { + "epoch": 3.9193457416807673, + "grad_norm": 1.4001796245574951, + "learning_rate": 3.0406091370558377e-05, + "loss": 0.7953, + "step": 6949 + }, + { + "epoch": 3.919909757473209, + "grad_norm": 1.402313470840454, + "learning_rate": 3.0403271291596162e-05, + "loss": 0.7425, + "step": 6950 + }, + { + "epoch": 3.9204737732656514, + "grad_norm": 1.2478582859039307, + "learning_rate": 3.0400451212633958e-05, + "loss": 0.7967, + "step": 6951 + }, + { + "epoch": 3.9210377890580936, + "grad_norm": 1.0033268928527832, + "learning_rate": 3.0397631133671743e-05, + "loss": 0.7197, + "step": 6952 + }, + { + "epoch": 3.921601804850536, + "grad_norm": 1.1189298629760742, + "learning_rate": 3.0394811054709532e-05, + "loss": 0.7418, + "step": 6953 + }, + { + "epoch": 3.922165820642978, + "grad_norm": 1.3501094579696655, + "learning_rate": 3.039199097574732e-05, + "loss": 0.7847, + "step": 6954 + }, + { + "epoch": 3.92272983643542, + "grad_norm": 1.6663110256195068, + "learning_rate": 3.0389170896785113e-05, + "loss": 0.8994, + "step": 6955 + }, + { + "epoch": 3.9232938522278626, + "grad_norm": 1.038336157798767, + "learning_rate": 3.0386350817822902e-05, + "loss": 0.8163, + "step": 6956 + }, + { + "epoch": 3.9238578680203045, + "grad_norm": 1.3490335941314697, + "learning_rate": 3.0383530738860687e-05, + "loss": 0.7276, + "step": 6957 + }, + { + "epoch": 3.9244218838127467, + "grad_norm": 1.2548998594284058, + "learning_rate": 3.0380710659898476e-05, + "loss": 0.7966, + "step": 6958 + }, + { + "epoch": 3.924985899605189, + "grad_norm": 1.8107621669769287, + "learning_rate": 3.0377890580936268e-05, + "loss": 0.7889, + "step": 6959 + }, + { + "epoch": 3.925549915397631, + "grad_norm": 1.1803133487701416, + "learning_rate": 3.0375070501974057e-05, + "loss": 0.8078, + "step": 6960 + }, + { + "epoch": 3.9261139311900735, + "grad_norm": 1.0692561864852905, + "learning_rate": 3.0372250423011846e-05, + "loss": 0.7039, + "step": 6961 + }, + { + "epoch": 3.9266779469825153, + "grad_norm": 1.4294698238372803, + "learning_rate": 3.0369430344049638e-05, + "loss": 0.7151, + "step": 6962 + }, + { + "epoch": 3.927241962774958, + "grad_norm": 1.0815658569335938, + "learning_rate": 3.0366610265087427e-05, + "loss": 0.7476, + "step": 6963 + }, + { + "epoch": 3.9278059785673998, + "grad_norm": 1.1474159955978394, + "learning_rate": 3.0363790186125212e-05, + "loss": 0.7611, + "step": 6964 + }, + { + "epoch": 3.928369994359842, + "grad_norm": 1.4519321918487549, + "learning_rate": 3.0360970107163e-05, + "loss": 0.8208, + "step": 6965 + }, + { + "epoch": 3.9289340101522843, + "grad_norm": 1.522058129310608, + "learning_rate": 3.0358150028200793e-05, + "loss": 0.788, + "step": 6966 + }, + { + "epoch": 3.9294980259447265, + "grad_norm": 1.2211424112319946, + "learning_rate": 3.0355329949238582e-05, + "loss": 0.7955, + "step": 6967 + }, + { + "epoch": 3.9300620417371688, + "grad_norm": 1.4596693515777588, + "learning_rate": 3.0352509870276367e-05, + "loss": 0.6757, + "step": 6968 + }, + { + "epoch": 3.9306260575296106, + "grad_norm": 1.5522432327270508, + "learning_rate": 3.0349689791314156e-05, + "loss": 0.7167, + "step": 6969 + }, + { + "epoch": 3.9311900733220533, + "grad_norm": 1.4647212028503418, + "learning_rate": 3.0346869712351948e-05, + "loss": 0.8233, + "step": 6970 + }, + { + "epoch": 3.931754089114495, + "grad_norm": 1.2483370304107666, + "learning_rate": 3.0344049633389737e-05, + "loss": 0.8431, + "step": 6971 + }, + { + "epoch": 3.9323181049069373, + "grad_norm": 1.043725848197937, + "learning_rate": 3.0341229554427526e-05, + "loss": 0.6724, + "step": 6972 + }, + { + "epoch": 3.9328821206993796, + "grad_norm": 1.7930113077163696, + "learning_rate": 3.033840947546531e-05, + "loss": 0.8059, + "step": 6973 + }, + { + "epoch": 3.933446136491822, + "grad_norm": 1.1961729526519775, + "learning_rate": 3.0335589396503107e-05, + "loss": 0.7185, + "step": 6974 + }, + { + "epoch": 3.934010152284264, + "grad_norm": 1.1227444410324097, + "learning_rate": 3.0332769317540892e-05, + "loss": 0.6688, + "step": 6975 + }, + { + "epoch": 3.934574168076706, + "grad_norm": 1.5744272470474243, + "learning_rate": 3.032994923857868e-05, + "loss": 0.7542, + "step": 6976 + }, + { + "epoch": 3.9351381838691486, + "grad_norm": 1.7565982341766357, + "learning_rate": 3.032712915961647e-05, + "loss": 0.7488, + "step": 6977 + }, + { + "epoch": 3.9357021996615904, + "grad_norm": 1.2440526485443115, + "learning_rate": 3.0324309080654262e-05, + "loss": 0.7686, + "step": 6978 + }, + { + "epoch": 3.9362662154540327, + "grad_norm": 1.3720760345458984, + "learning_rate": 3.032148900169205e-05, + "loss": 0.5926, + "step": 6979 + }, + { + "epoch": 3.936830231246475, + "grad_norm": 0.9058757424354553, + "learning_rate": 3.0318668922729836e-05, + "loss": 0.7666, + "step": 6980 + }, + { + "epoch": 3.937394247038917, + "grad_norm": 1.68398118019104, + "learning_rate": 3.0315848843767625e-05, + "loss": 0.81, + "step": 6981 + }, + { + "epoch": 3.9379582628313594, + "grad_norm": 0.973071813583374, + "learning_rate": 3.0313028764805417e-05, + "loss": 0.7458, + "step": 6982 + }, + { + "epoch": 3.938522278623801, + "grad_norm": 2.0173263549804688, + "learning_rate": 3.0310208685843206e-05, + "loss": 0.7839, + "step": 6983 + }, + { + "epoch": 3.939086294416244, + "grad_norm": 1.1180442571640015, + "learning_rate": 3.030738860688099e-05, + "loss": 0.8328, + "step": 6984 + }, + { + "epoch": 3.9396503102086857, + "grad_norm": 1.3929789066314697, + "learning_rate": 3.030456852791878e-05, + "loss": 0.7987, + "step": 6985 + }, + { + "epoch": 3.940214326001128, + "grad_norm": 1.347956657409668, + "learning_rate": 3.0301748448956572e-05, + "loss": 0.8058, + "step": 6986 + }, + { + "epoch": 3.9407783417935702, + "grad_norm": 1.295893669128418, + "learning_rate": 3.029892836999436e-05, + "loss": 0.6513, + "step": 6987 + }, + { + "epoch": 3.9413423575860125, + "grad_norm": 1.1398415565490723, + "learning_rate": 3.029610829103215e-05, + "loss": 0.7379, + "step": 6988 + }, + { + "epoch": 3.9419063733784547, + "grad_norm": 1.091251015663147, + "learning_rate": 3.0293288212069935e-05, + "loss": 0.8179, + "step": 6989 + }, + { + "epoch": 3.9424703891708965, + "grad_norm": 1.6249301433563232, + "learning_rate": 3.029046813310773e-05, + "loss": 0.7636, + "step": 6990 + }, + { + "epoch": 3.9430344049633392, + "grad_norm": 1.1485404968261719, + "learning_rate": 3.0287648054145516e-05, + "loss": 0.8164, + "step": 6991 + }, + { + "epoch": 3.943598420755781, + "grad_norm": 1.6853481531143188, + "learning_rate": 3.0284827975183305e-05, + "loss": 0.7453, + "step": 6992 + }, + { + "epoch": 3.9441624365482233, + "grad_norm": 1.0275263786315918, + "learning_rate": 3.0282007896221094e-05, + "loss": 0.7199, + "step": 6993 + }, + { + "epoch": 3.9447264523406655, + "grad_norm": 1.7684051990509033, + "learning_rate": 3.0279187817258886e-05, + "loss": 0.7721, + "step": 6994 + }, + { + "epoch": 3.945290468133108, + "grad_norm": 1.1181820631027222, + "learning_rate": 3.0276367738296675e-05, + "loss": 0.7735, + "step": 6995 + }, + { + "epoch": 3.94585448392555, + "grad_norm": 1.5807945728302002, + "learning_rate": 3.027354765933446e-05, + "loss": 0.8186, + "step": 6996 + }, + { + "epoch": 3.946418499717992, + "grad_norm": 1.1314154863357544, + "learning_rate": 3.0270727580372256e-05, + "loss": 0.7896, + "step": 6997 + }, + { + "epoch": 3.9469825155104346, + "grad_norm": 1.4533549547195435, + "learning_rate": 3.026790750141004e-05, + "loss": 0.8759, + "step": 6998 + }, + { + "epoch": 3.9475465313028764, + "grad_norm": 1.1339412927627563, + "learning_rate": 3.026508742244783e-05, + "loss": 0.7319, + "step": 6999 + }, + { + "epoch": 3.9481105470953186, + "grad_norm": 1.5066819190979004, + "learning_rate": 3.026226734348562e-05, + "loss": 0.8256, + "step": 7000 + }, + { + "epoch": 3.948674562887761, + "grad_norm": 1.0192575454711914, + "learning_rate": 3.025944726452341e-05, + "loss": 0.6755, + "step": 7001 + }, + { + "epoch": 3.949238578680203, + "grad_norm": 1.0187804698944092, + "learning_rate": 3.0256627185561197e-05, + "loss": 0.7754, + "step": 7002 + }, + { + "epoch": 3.9498025944726454, + "grad_norm": 0.9983628988265991, + "learning_rate": 3.0253807106598985e-05, + "loss": 0.8041, + "step": 7003 + }, + { + "epoch": 3.950366610265087, + "grad_norm": 1.4986755847930908, + "learning_rate": 3.0250987027636774e-05, + "loss": 0.7646, + "step": 7004 + }, + { + "epoch": 3.95093062605753, + "grad_norm": 0.8671907186508179, + "learning_rate": 3.0248166948674566e-05, + "loss": 0.6744, + "step": 7005 + }, + { + "epoch": 3.9514946418499717, + "grad_norm": 1.3665404319763184, + "learning_rate": 3.0245346869712355e-05, + "loss": 0.7556, + "step": 7006 + }, + { + "epoch": 3.952058657642414, + "grad_norm": 1.3699955940246582, + "learning_rate": 3.024252679075014e-05, + "loss": 0.8774, + "step": 7007 + }, + { + "epoch": 3.952622673434856, + "grad_norm": 1.165099024772644, + "learning_rate": 3.023970671178793e-05, + "loss": 0.7063, + "step": 7008 + }, + { + "epoch": 3.9531866892272984, + "grad_norm": 1.099150538444519, + "learning_rate": 3.023688663282572e-05, + "loss": 0.7917, + "step": 7009 + }, + { + "epoch": 3.9537507050197407, + "grad_norm": 1.6766057014465332, + "learning_rate": 3.023406655386351e-05, + "loss": 0.8013, + "step": 7010 + }, + { + "epoch": 3.9543147208121825, + "grad_norm": 1.255160927772522, + "learning_rate": 3.02312464749013e-05, + "loss": 0.7037, + "step": 7011 + }, + { + "epoch": 3.954878736604625, + "grad_norm": 0.7828295826911926, + "learning_rate": 3.0228426395939085e-05, + "loss": 0.6756, + "step": 7012 + }, + { + "epoch": 3.955442752397067, + "grad_norm": 1.0746477842330933, + "learning_rate": 3.022560631697688e-05, + "loss": 0.6313, + "step": 7013 + }, + { + "epoch": 3.9560067681895092, + "grad_norm": 1.124215006828308, + "learning_rate": 3.0222786238014666e-05, + "loss": 0.6641, + "step": 7014 + }, + { + "epoch": 3.9565707839819515, + "grad_norm": 1.2092440128326416, + "learning_rate": 3.0219966159052454e-05, + "loss": 0.7975, + "step": 7015 + }, + { + "epoch": 3.9571347997743938, + "grad_norm": 1.2887415885925293, + "learning_rate": 3.0217146080090243e-05, + "loss": 0.7897, + "step": 7016 + }, + { + "epoch": 3.957698815566836, + "grad_norm": 1.2801337242126465, + "learning_rate": 3.0214326001128035e-05, + "loss": 0.7724, + "step": 7017 + }, + { + "epoch": 3.958262831359278, + "grad_norm": 0.9815800189971924, + "learning_rate": 3.0211505922165824e-05, + "loss": 0.6874, + "step": 7018 + }, + { + "epoch": 3.9588268471517205, + "grad_norm": 1.4903807640075684, + "learning_rate": 3.020868584320361e-05, + "loss": 0.8106, + "step": 7019 + }, + { + "epoch": 3.9593908629441623, + "grad_norm": 1.1941684484481812, + "learning_rate": 3.0205865764241398e-05, + "loss": 0.8412, + "step": 7020 + }, + { + "epoch": 3.9599548787366046, + "grad_norm": 1.3317570686340332, + "learning_rate": 3.020304568527919e-05, + "loss": 0.7299, + "step": 7021 + }, + { + "epoch": 3.960518894529047, + "grad_norm": 0.9628488421440125, + "learning_rate": 3.020022560631698e-05, + "loss": 0.7792, + "step": 7022 + }, + { + "epoch": 3.961082910321489, + "grad_norm": 1.0313928127288818, + "learning_rate": 3.0197405527354765e-05, + "loss": 0.6887, + "step": 7023 + }, + { + "epoch": 3.9616469261139313, + "grad_norm": 1.2588788270950317, + "learning_rate": 3.0194585448392553e-05, + "loss": 0.7842, + "step": 7024 + }, + { + "epoch": 3.962210941906373, + "grad_norm": 1.2022327184677124, + "learning_rate": 3.0191765369430346e-05, + "loss": 0.7276, + "step": 7025 + }, + { + "epoch": 3.962774957698816, + "grad_norm": 1.1614376306533813, + "learning_rate": 3.0188945290468134e-05, + "loss": 0.7768, + "step": 7026 + }, + { + "epoch": 3.9633389734912576, + "grad_norm": 1.7213789224624634, + "learning_rate": 3.0186125211505923e-05, + "loss": 0.6709, + "step": 7027 + }, + { + "epoch": 3.9639029892837, + "grad_norm": 1.4383749961853027, + "learning_rate": 3.018330513254371e-05, + "loss": 0.7782, + "step": 7028 + }, + { + "epoch": 3.964467005076142, + "grad_norm": 1.0473653078079224, + "learning_rate": 3.0180485053581504e-05, + "loss": 0.8107, + "step": 7029 + }, + { + "epoch": 3.9650310208685844, + "grad_norm": 1.15763521194458, + "learning_rate": 3.017766497461929e-05, + "loss": 0.7603, + "step": 7030 + }, + { + "epoch": 3.9655950366610266, + "grad_norm": 1.3664655685424805, + "learning_rate": 3.017484489565708e-05, + "loss": 0.7203, + "step": 7031 + }, + { + "epoch": 3.9661590524534684, + "grad_norm": 0.9872208833694458, + "learning_rate": 3.017202481669487e-05, + "loss": 0.7534, + "step": 7032 + }, + { + "epoch": 3.966723068245911, + "grad_norm": 1.2836863994598389, + "learning_rate": 3.016920473773266e-05, + "loss": 0.7735, + "step": 7033 + }, + { + "epoch": 3.967287084038353, + "grad_norm": 1.133715271949768, + "learning_rate": 3.0166384658770448e-05, + "loss": 0.6992, + "step": 7034 + }, + { + "epoch": 3.967851099830795, + "grad_norm": 0.9108738303184509, + "learning_rate": 3.0163564579808234e-05, + "loss": 0.6903, + "step": 7035 + }, + { + "epoch": 3.9684151156232375, + "grad_norm": 1.54414963722229, + "learning_rate": 3.016074450084603e-05, + "loss": 0.8178, + "step": 7036 + }, + { + "epoch": 3.9689791314156797, + "grad_norm": 0.9639269709587097, + "learning_rate": 3.0157924421883815e-05, + "loss": 0.6905, + "step": 7037 + }, + { + "epoch": 3.969543147208122, + "grad_norm": 1.0612363815307617, + "learning_rate": 3.0155104342921603e-05, + "loss": 0.7174, + "step": 7038 + }, + { + "epoch": 3.9701071630005638, + "grad_norm": 1.0257508754730225, + "learning_rate": 3.0152284263959392e-05, + "loss": 0.7017, + "step": 7039 + }, + { + "epoch": 3.9706711787930065, + "grad_norm": 1.0880186557769775, + "learning_rate": 3.0149464184997184e-05, + "loss": 0.6609, + "step": 7040 + }, + { + "epoch": 3.9712351945854483, + "grad_norm": 1.2383379936218262, + "learning_rate": 3.014664410603497e-05, + "loss": 0.6558, + "step": 7041 + }, + { + "epoch": 3.9717992103778905, + "grad_norm": 1.249084711074829, + "learning_rate": 3.014382402707276e-05, + "loss": 0.7553, + "step": 7042 + }, + { + "epoch": 3.9723632261703328, + "grad_norm": 1.519101858139038, + "learning_rate": 3.0141003948110547e-05, + "loss": 0.8692, + "step": 7043 + }, + { + "epoch": 3.972927241962775, + "grad_norm": 1.3160488605499268, + "learning_rate": 3.013818386914834e-05, + "loss": 0.8235, + "step": 7044 + }, + { + "epoch": 3.9734912577552173, + "grad_norm": 1.7031804323196411, + "learning_rate": 3.013536379018613e-05, + "loss": 0.7414, + "step": 7045 + }, + { + "epoch": 3.974055273547659, + "grad_norm": 1.1389458179473877, + "learning_rate": 3.0132543711223914e-05, + "loss": 0.758, + "step": 7046 + }, + { + "epoch": 3.974619289340102, + "grad_norm": 0.9924626350402832, + "learning_rate": 3.0129723632261703e-05, + "loss": 0.8274, + "step": 7047 + }, + { + "epoch": 3.9751833051325436, + "grad_norm": 1.7511056661605835, + "learning_rate": 3.0126903553299495e-05, + "loss": 0.8485, + "step": 7048 + }, + { + "epoch": 3.975747320924986, + "grad_norm": 1.6499974727630615, + "learning_rate": 3.0124083474337284e-05, + "loss": 0.7657, + "step": 7049 + }, + { + "epoch": 3.976311336717428, + "grad_norm": 1.3829492330551147, + "learning_rate": 3.0121263395375072e-05, + "loss": 0.7629, + "step": 7050 + }, + { + "epoch": 3.9768753525098703, + "grad_norm": 1.426169753074646, + "learning_rate": 3.0118443316412858e-05, + "loss": 0.7212, + "step": 7051 + }, + { + "epoch": 3.9774393683023126, + "grad_norm": 1.344746708869934, + "learning_rate": 3.0115623237450653e-05, + "loss": 0.7141, + "step": 7052 + }, + { + "epoch": 3.9780033840947544, + "grad_norm": 0.9514457583427429, + "learning_rate": 3.011280315848844e-05, + "loss": 0.6448, + "step": 7053 + }, + { + "epoch": 3.978567399887197, + "grad_norm": 1.2661361694335938, + "learning_rate": 3.0109983079526228e-05, + "loss": 0.7657, + "step": 7054 + }, + { + "epoch": 3.979131415679639, + "grad_norm": 1.3456112146377563, + "learning_rate": 3.0107163000564016e-05, + "loss": 0.7716, + "step": 7055 + }, + { + "epoch": 3.979695431472081, + "grad_norm": 1.4011059999465942, + "learning_rate": 3.010434292160181e-05, + "loss": 0.6525, + "step": 7056 + }, + { + "epoch": 3.9802594472645234, + "grad_norm": 1.5198900699615479, + "learning_rate": 3.0101522842639597e-05, + "loss": 0.795, + "step": 7057 + }, + { + "epoch": 3.9808234630569657, + "grad_norm": 1.6706085205078125, + "learning_rate": 3.0098702763677383e-05, + "loss": 0.7455, + "step": 7058 + }, + { + "epoch": 3.981387478849408, + "grad_norm": 1.7426562309265137, + "learning_rate": 3.009588268471517e-05, + "loss": 0.8426, + "step": 7059 + }, + { + "epoch": 3.9819514946418497, + "grad_norm": 1.0538442134857178, + "learning_rate": 3.0093062605752964e-05, + "loss": 0.6648, + "step": 7060 + }, + { + "epoch": 3.9825155104342924, + "grad_norm": 1.3703869581222534, + "learning_rate": 3.0090242526790752e-05, + "loss": 0.6616, + "step": 7061 + }, + { + "epoch": 3.9830795262267342, + "grad_norm": 1.1749682426452637, + "learning_rate": 3.0087422447828538e-05, + "loss": 0.7299, + "step": 7062 + }, + { + "epoch": 3.9836435420191765, + "grad_norm": 1.625362753868103, + "learning_rate": 3.0084602368866327e-05, + "loss": 0.9432, + "step": 7063 + }, + { + "epoch": 3.9842075578116187, + "grad_norm": 1.3792786598205566, + "learning_rate": 3.008178228990412e-05, + "loss": 0.8479, + "step": 7064 + }, + { + "epoch": 3.984771573604061, + "grad_norm": 2.5013251304626465, + "learning_rate": 3.0078962210941908e-05, + "loss": 0.6429, + "step": 7065 + }, + { + "epoch": 3.9853355893965032, + "grad_norm": 1.7857754230499268, + "learning_rate": 3.0076142131979696e-05, + "loss": 0.827, + "step": 7066 + }, + { + "epoch": 3.985899605188945, + "grad_norm": 1.221062421798706, + "learning_rate": 3.007332205301749e-05, + "loss": 0.8075, + "step": 7067 + }, + { + "epoch": 3.9864636209813877, + "grad_norm": 0.9680389761924744, + "learning_rate": 3.0070501974055277e-05, + "loss": 0.7592, + "step": 7068 + }, + { + "epoch": 3.9870276367738295, + "grad_norm": 1.3945934772491455, + "learning_rate": 3.0067681895093063e-05, + "loss": 0.7142, + "step": 7069 + }, + { + "epoch": 3.987591652566272, + "grad_norm": 1.8958213329315186, + "learning_rate": 3.006486181613085e-05, + "loss": 0.8523, + "step": 7070 + }, + { + "epoch": 3.988155668358714, + "grad_norm": 1.3471713066101074, + "learning_rate": 3.0062041737168644e-05, + "loss": 0.8842, + "step": 7071 + }, + { + "epoch": 3.9887196841511563, + "grad_norm": 1.1077653169631958, + "learning_rate": 3.0059221658206433e-05, + "loss": 0.7163, + "step": 7072 + }, + { + "epoch": 3.9892836999435985, + "grad_norm": 1.3038990497589111, + "learning_rate": 3.005640157924422e-05, + "loss": 0.8061, + "step": 7073 + }, + { + "epoch": 3.9898477157360404, + "grad_norm": 1.2556939125061035, + "learning_rate": 3.0053581500282007e-05, + "loss": 0.8062, + "step": 7074 + }, + { + "epoch": 3.990411731528483, + "grad_norm": 1.0404025316238403, + "learning_rate": 3.0050761421319802e-05, + "loss": 0.7271, + "step": 7075 + }, + { + "epoch": 3.990975747320925, + "grad_norm": 0.876685619354248, + "learning_rate": 3.0047941342357588e-05, + "loss": 0.6938, + "step": 7076 + }, + { + "epoch": 3.991539763113367, + "grad_norm": 1.0928173065185547, + "learning_rate": 3.0045121263395377e-05, + "loss": 0.684, + "step": 7077 + }, + { + "epoch": 3.9921037789058094, + "grad_norm": 0.9461342096328735, + "learning_rate": 3.0042301184433162e-05, + "loss": 0.6532, + "step": 7078 + }, + { + "epoch": 3.9926677946982516, + "grad_norm": 2.1973156929016113, + "learning_rate": 3.0039481105470958e-05, + "loss": 0.8409, + "step": 7079 + }, + { + "epoch": 3.993231810490694, + "grad_norm": 1.054807186126709, + "learning_rate": 3.0036661026508743e-05, + "loss": 0.6338, + "step": 7080 + }, + { + "epoch": 3.9937958262831357, + "grad_norm": 1.6376621723175049, + "learning_rate": 3.0033840947546532e-05, + "loss": 0.8073, + "step": 7081 + }, + { + "epoch": 3.9943598420755784, + "grad_norm": 1.0413662195205688, + "learning_rate": 3.003102086858432e-05, + "loss": 0.6689, + "step": 7082 + }, + { + "epoch": 3.99492385786802, + "grad_norm": 1.0027658939361572, + "learning_rate": 3.0028200789622113e-05, + "loss": 0.7069, + "step": 7083 + }, + { + "epoch": 3.9954878736604624, + "grad_norm": 2.3650991916656494, + "learning_rate": 3.00253807106599e-05, + "loss": 0.6962, + "step": 7084 + }, + { + "epoch": 3.9960518894529047, + "grad_norm": 1.360927700996399, + "learning_rate": 3.0022560631697687e-05, + "loss": 0.8523, + "step": 7085 + }, + { + "epoch": 3.996615905245347, + "grad_norm": 1.0400152206420898, + "learning_rate": 3.0019740552735476e-05, + "loss": 0.7089, + "step": 7086 + }, + { + "epoch": 3.997179921037789, + "grad_norm": 1.2846651077270508, + "learning_rate": 3.0016920473773268e-05, + "loss": 0.7438, + "step": 7087 + }, + { + "epoch": 3.997743936830231, + "grad_norm": 1.1127532720565796, + "learning_rate": 3.0014100394811057e-05, + "loss": 0.7625, + "step": 7088 + }, + { + "epoch": 3.9983079526226737, + "grad_norm": 1.0109045505523682, + "learning_rate": 3.0011280315848846e-05, + "loss": 0.7867, + "step": 7089 + }, + { + "epoch": 3.9988719684151155, + "grad_norm": 1.523675799369812, + "learning_rate": 3.000846023688663e-05, + "loss": 0.8074, + "step": 7090 + }, + { + "epoch": 3.9994359842075577, + "grad_norm": 1.9907784461975098, + "learning_rate": 3.0005640157924427e-05, + "loss": 0.7839, + "step": 7091 + }, + { + "epoch": 4.0, + "grad_norm": 1.326237678527832, + "learning_rate": 3.0002820078962212e-05, + "loss": 0.7161, + "step": 7092 + }, + { + "epoch": 4.000564015792442, + "grad_norm": 1.3905961513519287, + "learning_rate": 3e-05, + "loss": 0.7552, + "step": 7093 + }, + { + "epoch": 4.0011280315848845, + "grad_norm": 1.457708477973938, + "learning_rate": 2.999717992103779e-05, + "loss": 0.7626, + "step": 7094 + }, + { + "epoch": 4.001692047377326, + "grad_norm": 1.4564639329910278, + "learning_rate": 2.9994359842075582e-05, + "loss": 0.849, + "step": 7095 + }, + { + "epoch": 4.002256063169769, + "grad_norm": 1.5347716808319092, + "learning_rate": 2.9991539763113367e-05, + "loss": 0.8239, + "step": 7096 + }, + { + "epoch": 4.002820078962211, + "grad_norm": 1.1608283519744873, + "learning_rate": 2.9988719684151156e-05, + "loss": 0.8274, + "step": 7097 + }, + { + "epoch": 4.0033840947546535, + "grad_norm": 1.1154751777648926, + "learning_rate": 2.9985899605188945e-05, + "loss": 0.743, + "step": 7098 + }, + { + "epoch": 4.003948110547095, + "grad_norm": 1.2000186443328857, + "learning_rate": 2.9983079526226737e-05, + "loss": 0.69, + "step": 7099 + }, + { + "epoch": 4.004512126339537, + "grad_norm": 1.0359102487564087, + "learning_rate": 2.9980259447264526e-05, + "loss": 0.7973, + "step": 7100 + }, + { + "epoch": 4.00507614213198, + "grad_norm": 1.7192823886871338, + "learning_rate": 2.997743936830231e-05, + "loss": 0.8042, + "step": 7101 + }, + { + "epoch": 4.005640157924422, + "grad_norm": 1.0243949890136719, + "learning_rate": 2.9974619289340107e-05, + "loss": 0.7611, + "step": 7102 + }, + { + "epoch": 4.006204173716864, + "grad_norm": 1.3037338256835938, + "learning_rate": 2.9971799210377892e-05, + "loss": 0.8584, + "step": 7103 + }, + { + "epoch": 4.006768189509306, + "grad_norm": 1.4005930423736572, + "learning_rate": 2.996897913141568e-05, + "loss": 0.6598, + "step": 7104 + }, + { + "epoch": 4.007332205301749, + "grad_norm": 1.2984267473220825, + "learning_rate": 2.996615905245347e-05, + "loss": 0.6703, + "step": 7105 + }, + { + "epoch": 4.007896221094191, + "grad_norm": 1.1586332321166992, + "learning_rate": 2.9963338973491262e-05, + "loss": 0.7186, + "step": 7106 + }, + { + "epoch": 4.008460236886632, + "grad_norm": 1.2282923460006714, + "learning_rate": 2.996051889452905e-05, + "loss": 0.7819, + "step": 7107 + }, + { + "epoch": 4.009024252679075, + "grad_norm": 1.8611208200454712, + "learning_rate": 2.9957698815566836e-05, + "loss": 0.8326, + "step": 7108 + }, + { + "epoch": 4.009588268471517, + "grad_norm": 0.9734347462654114, + "learning_rate": 2.9954878736604625e-05, + "loss": 0.6552, + "step": 7109 + }, + { + "epoch": 4.01015228426396, + "grad_norm": 1.5233033895492554, + "learning_rate": 2.9952058657642417e-05, + "loss": 0.8323, + "step": 7110 + }, + { + "epoch": 4.0107163000564015, + "grad_norm": 1.8048012256622314, + "learning_rate": 2.9949238578680206e-05, + "loss": 0.8163, + "step": 7111 + }, + { + "epoch": 4.011280315848844, + "grad_norm": 1.512772798538208, + "learning_rate": 2.9946418499717995e-05, + "loss": 0.8438, + "step": 7112 + }, + { + "epoch": 4.011844331641286, + "grad_norm": 1.1215766668319702, + "learning_rate": 2.994359842075578e-05, + "loss": 0.7598, + "step": 7113 + }, + { + "epoch": 4.012408347433728, + "grad_norm": 1.0616103410720825, + "learning_rate": 2.9940778341793572e-05, + "loss": 0.7187, + "step": 7114 + }, + { + "epoch": 4.0129723632261705, + "grad_norm": 1.3413695096969604, + "learning_rate": 2.993795826283136e-05, + "loss": 0.7961, + "step": 7115 + }, + { + "epoch": 4.013536379018612, + "grad_norm": 1.4676659107208252, + "learning_rate": 2.993513818386915e-05, + "loss": 0.7861, + "step": 7116 + }, + { + "epoch": 4.014100394811055, + "grad_norm": 0.9159256815910339, + "learning_rate": 2.9932318104906935e-05, + "loss": 0.7257, + "step": 7117 + }, + { + "epoch": 4.014664410603497, + "grad_norm": 2.3236424922943115, + "learning_rate": 2.992949802594473e-05, + "loss": 0.805, + "step": 7118 + }, + { + "epoch": 4.0152284263959395, + "grad_norm": 1.4584945440292358, + "learning_rate": 2.9926677946982516e-05, + "loss": 0.7684, + "step": 7119 + }, + { + "epoch": 4.015792442188381, + "grad_norm": 1.550033450126648, + "learning_rate": 2.9923857868020305e-05, + "loss": 0.7941, + "step": 7120 + }, + { + "epoch": 4.016356457980823, + "grad_norm": 1.0868663787841797, + "learning_rate": 2.9921037789058094e-05, + "loss": 0.7152, + "step": 7121 + }, + { + "epoch": 4.016920473773266, + "grad_norm": 1.3023477792739868, + "learning_rate": 2.9918217710095886e-05, + "loss": 0.7295, + "step": 7122 + }, + { + "epoch": 4.017484489565708, + "grad_norm": 1.1017574071884155, + "learning_rate": 2.9915397631133675e-05, + "loss": 0.7637, + "step": 7123 + }, + { + "epoch": 4.01804850535815, + "grad_norm": 1.310776948928833, + "learning_rate": 2.991257755217146e-05, + "loss": 0.8386, + "step": 7124 + }, + { + "epoch": 4.018612521150592, + "grad_norm": 1.0151314735412598, + "learning_rate": 2.990975747320925e-05, + "loss": 0.7576, + "step": 7125 + }, + { + "epoch": 4.019176536943035, + "grad_norm": 1.0802425146102905, + "learning_rate": 2.990693739424704e-05, + "loss": 0.7799, + "step": 7126 + }, + { + "epoch": 4.019740552735477, + "grad_norm": 1.3720940351486206, + "learning_rate": 2.990411731528483e-05, + "loss": 0.8321, + "step": 7127 + }, + { + "epoch": 4.020304568527918, + "grad_norm": 1.0881316661834717, + "learning_rate": 2.990129723632262e-05, + "loss": 0.7371, + "step": 7128 + }, + { + "epoch": 4.020868584320361, + "grad_norm": 1.5235018730163574, + "learning_rate": 2.9898477157360404e-05, + "loss": 0.7477, + "step": 7129 + }, + { + "epoch": 4.021432600112803, + "grad_norm": 1.5693109035491943, + "learning_rate": 2.98956570783982e-05, + "loss": 0.8296, + "step": 7130 + }, + { + "epoch": 4.021996615905246, + "grad_norm": 1.5644420385360718, + "learning_rate": 2.9892836999435985e-05, + "loss": 0.7803, + "step": 7131 + }, + { + "epoch": 4.022560631697687, + "grad_norm": 1.9566612243652344, + "learning_rate": 2.9890016920473774e-05, + "loss": 0.7297, + "step": 7132 + }, + { + "epoch": 4.02312464749013, + "grad_norm": 1.0229893922805786, + "learning_rate": 2.988719684151156e-05, + "loss": 0.6984, + "step": 7133 + }, + { + "epoch": 4.023688663282572, + "grad_norm": 1.0393925905227661, + "learning_rate": 2.9884376762549355e-05, + "loss": 0.6427, + "step": 7134 + }, + { + "epoch": 4.024252679075014, + "grad_norm": 0.8941148519515991, + "learning_rate": 2.988155668358714e-05, + "loss": 0.6229, + "step": 7135 + }, + { + "epoch": 4.024816694867456, + "grad_norm": 1.0583866834640503, + "learning_rate": 2.987873660462493e-05, + "loss": 0.7672, + "step": 7136 + }, + { + "epoch": 4.025380710659898, + "grad_norm": 1.0124177932739258, + "learning_rate": 2.987591652566272e-05, + "loss": 0.7497, + "step": 7137 + }, + { + "epoch": 4.025944726452341, + "grad_norm": 0.9340419173240662, + "learning_rate": 2.987309644670051e-05, + "loss": 0.684, + "step": 7138 + }, + { + "epoch": 4.026508742244783, + "grad_norm": 1.4622260332107544, + "learning_rate": 2.98702763677383e-05, + "loss": 0.7914, + "step": 7139 + }, + { + "epoch": 4.027072758037225, + "grad_norm": 1.0148086547851562, + "learning_rate": 2.9867456288776084e-05, + "loss": 0.672, + "step": 7140 + }, + { + "epoch": 4.027636773829667, + "grad_norm": 1.5084317922592163, + "learning_rate": 2.986463620981388e-05, + "loss": 0.8262, + "step": 7141 + }, + { + "epoch": 4.028200789622109, + "grad_norm": 1.539806604385376, + "learning_rate": 2.9861816130851665e-05, + "loss": 0.7752, + "step": 7142 + }, + { + "epoch": 4.028764805414552, + "grad_norm": 1.2443232536315918, + "learning_rate": 2.9858996051889454e-05, + "loss": 0.7707, + "step": 7143 + }, + { + "epoch": 4.0293288212069935, + "grad_norm": 1.276864767074585, + "learning_rate": 2.9856175972927243e-05, + "loss": 0.6829, + "step": 7144 + }, + { + "epoch": 4.029892836999436, + "grad_norm": 0.9598891139030457, + "learning_rate": 2.9853355893965035e-05, + "loss": 0.6304, + "step": 7145 + }, + { + "epoch": 4.030456852791878, + "grad_norm": 0.9582312107086182, + "learning_rate": 2.9850535815002824e-05, + "loss": 0.6628, + "step": 7146 + }, + { + "epoch": 4.031020868584321, + "grad_norm": 1.0465445518493652, + "learning_rate": 2.984771573604061e-05, + "loss": 0.7092, + "step": 7147 + }, + { + "epoch": 4.0315848843767625, + "grad_norm": 1.263271689414978, + "learning_rate": 2.9844895657078398e-05, + "loss": 0.7435, + "step": 7148 + }, + { + "epoch": 4.032148900169204, + "grad_norm": 1.378187894821167, + "learning_rate": 2.984207557811619e-05, + "loss": 0.6752, + "step": 7149 + }, + { + "epoch": 4.032712915961647, + "grad_norm": 1.1876693964004517, + "learning_rate": 2.983925549915398e-05, + "loss": 0.6537, + "step": 7150 + }, + { + "epoch": 4.033276931754089, + "grad_norm": 1.3187217712402344, + "learning_rate": 2.9836435420191765e-05, + "loss": 0.7048, + "step": 7151 + }, + { + "epoch": 4.0338409475465316, + "grad_norm": 1.3742361068725586, + "learning_rate": 2.9833615341229553e-05, + "loss": 0.6984, + "step": 7152 + }, + { + "epoch": 4.034404963338973, + "grad_norm": 1.1189759969711304, + "learning_rate": 2.9830795262267345e-05, + "loss": 0.7192, + "step": 7153 + }, + { + "epoch": 4.034968979131416, + "grad_norm": 1.5905067920684814, + "learning_rate": 2.9827975183305134e-05, + "loss": 0.7985, + "step": 7154 + }, + { + "epoch": 4.035532994923858, + "grad_norm": 2.0011537075042725, + "learning_rate": 2.9825155104342923e-05, + "loss": 0.873, + "step": 7155 + }, + { + "epoch": 4.0360970107163, + "grad_norm": 1.3071987628936768, + "learning_rate": 2.982233502538071e-05, + "loss": 0.6724, + "step": 7156 + }, + { + "epoch": 4.036661026508742, + "grad_norm": 1.690152645111084, + "learning_rate": 2.9819514946418504e-05, + "loss": 0.8636, + "step": 7157 + }, + { + "epoch": 4.037225042301184, + "grad_norm": 1.100243330001831, + "learning_rate": 2.981669486745629e-05, + "loss": 0.7324, + "step": 7158 + }, + { + "epoch": 4.037789058093627, + "grad_norm": 0.8163221478462219, + "learning_rate": 2.9813874788494078e-05, + "loss": 0.6312, + "step": 7159 + }, + { + "epoch": 4.038353073886069, + "grad_norm": 1.270582675933838, + "learning_rate": 2.9811054709531867e-05, + "loss": 0.8043, + "step": 7160 + }, + { + "epoch": 4.038917089678511, + "grad_norm": 0.9935411810874939, + "learning_rate": 2.980823463056966e-05, + "loss": 0.7756, + "step": 7161 + }, + { + "epoch": 4.039481105470953, + "grad_norm": 3.240086555480957, + "learning_rate": 2.9805414551607448e-05, + "loss": 0.8071, + "step": 7162 + }, + { + "epoch": 4.040045121263395, + "grad_norm": 1.742782473564148, + "learning_rate": 2.9802594472645233e-05, + "loss": 0.78, + "step": 7163 + }, + { + "epoch": 4.040609137055838, + "grad_norm": 1.1983853578567505, + "learning_rate": 2.9799774393683022e-05, + "loss": 0.6573, + "step": 7164 + }, + { + "epoch": 4.0411731528482795, + "grad_norm": 1.1612807512283325, + "learning_rate": 2.9796954314720814e-05, + "loss": 0.7664, + "step": 7165 + }, + { + "epoch": 4.041737168640722, + "grad_norm": 0.9909684658050537, + "learning_rate": 2.9794134235758603e-05, + "loss": 0.7804, + "step": 7166 + }, + { + "epoch": 4.042301184433164, + "grad_norm": 1.5052664279937744, + "learning_rate": 2.9791314156796392e-05, + "loss": 0.6509, + "step": 7167 + }, + { + "epoch": 4.042865200225607, + "grad_norm": 1.3250094652175903, + "learning_rate": 2.9788494077834177e-05, + "loss": 0.6973, + "step": 7168 + }, + { + "epoch": 4.0434292160180485, + "grad_norm": 1.3323149681091309, + "learning_rate": 2.978567399887197e-05, + "loss": 0.7553, + "step": 7169 + }, + { + "epoch": 4.04399323181049, + "grad_norm": 0.930840253829956, + "learning_rate": 2.978285391990976e-05, + "loss": 0.7159, + "step": 7170 + }, + { + "epoch": 4.044557247602933, + "grad_norm": 2.16108775138855, + "learning_rate": 2.9780033840947547e-05, + "loss": 0.8059, + "step": 7171 + }, + { + "epoch": 4.045121263395375, + "grad_norm": 2.1380269527435303, + "learning_rate": 2.977721376198534e-05, + "loss": 0.8404, + "step": 7172 + }, + { + "epoch": 4.0456852791878175, + "grad_norm": 1.0445667505264282, + "learning_rate": 2.9774393683023128e-05, + "loss": 0.7324, + "step": 7173 + }, + { + "epoch": 4.046249294980259, + "grad_norm": 1.2120411396026611, + "learning_rate": 2.9771573604060914e-05, + "loss": 0.6652, + "step": 7174 + }, + { + "epoch": 4.046813310772702, + "grad_norm": 1.17043137550354, + "learning_rate": 2.9768753525098702e-05, + "loss": 0.695, + "step": 7175 + }, + { + "epoch": 4.047377326565144, + "grad_norm": 0.935356080532074, + "learning_rate": 2.9765933446136495e-05, + "loss": 0.6437, + "step": 7176 + }, + { + "epoch": 4.047941342357586, + "grad_norm": 1.7009652853012085, + "learning_rate": 2.9763113367174283e-05, + "loss": 0.7623, + "step": 7177 + }, + { + "epoch": 4.048505358150028, + "grad_norm": 0.9119917750358582, + "learning_rate": 2.9760293288212072e-05, + "loss": 0.6939, + "step": 7178 + }, + { + "epoch": 4.04906937394247, + "grad_norm": 0.9456127285957336, + "learning_rate": 2.9757473209249858e-05, + "loss": 0.7278, + "step": 7179 + }, + { + "epoch": 4.049633389734913, + "grad_norm": 1.3851370811462402, + "learning_rate": 2.9754653130287653e-05, + "loss": 0.7658, + "step": 7180 + }, + { + "epoch": 4.050197405527355, + "grad_norm": 1.0927038192749023, + "learning_rate": 2.975183305132544e-05, + "loss": 0.7909, + "step": 7181 + }, + { + "epoch": 4.050761421319797, + "grad_norm": 1.7099417448043823, + "learning_rate": 2.9749012972363227e-05, + "loss": 0.9006, + "step": 7182 + }, + { + "epoch": 4.051325437112239, + "grad_norm": 1.173612356185913, + "learning_rate": 2.9746192893401016e-05, + "loss": 0.8216, + "step": 7183 + }, + { + "epoch": 4.051889452904681, + "grad_norm": 1.682969570159912, + "learning_rate": 2.974337281443881e-05, + "loss": 0.7243, + "step": 7184 + }, + { + "epoch": 4.052453468697124, + "grad_norm": 1.3384714126586914, + "learning_rate": 2.9740552735476597e-05, + "loss": 0.8896, + "step": 7185 + }, + { + "epoch": 4.0530174844895654, + "grad_norm": 1.4734536409378052, + "learning_rate": 2.9737732656514383e-05, + "loss": 0.7726, + "step": 7186 + }, + { + "epoch": 4.053581500282008, + "grad_norm": 1.3352863788604736, + "learning_rate": 2.973491257755217e-05, + "loss": 0.8051, + "step": 7187 + }, + { + "epoch": 4.05414551607445, + "grad_norm": 1.5933237075805664, + "learning_rate": 2.9732092498589964e-05, + "loss": 0.8118, + "step": 7188 + }, + { + "epoch": 4.054709531866893, + "grad_norm": 0.7979554533958435, + "learning_rate": 2.9729272419627752e-05, + "loss": 0.6164, + "step": 7189 + }, + { + "epoch": 4.0552735476593345, + "grad_norm": 1.9682426452636719, + "learning_rate": 2.9726452340665538e-05, + "loss": 0.8306, + "step": 7190 + }, + { + "epoch": 4.055837563451776, + "grad_norm": 1.1254172325134277, + "learning_rate": 2.9723632261703327e-05, + "loss": 0.6924, + "step": 7191 + }, + { + "epoch": 4.056401579244219, + "grad_norm": 1.4603731632232666, + "learning_rate": 2.972081218274112e-05, + "loss": 0.8623, + "step": 7192 + }, + { + "epoch": 4.056965595036661, + "grad_norm": 1.186004638671875, + "learning_rate": 2.9717992103778908e-05, + "loss": 0.781, + "step": 7193 + }, + { + "epoch": 4.0575296108291035, + "grad_norm": 0.8796769976615906, + "learning_rate": 2.9715172024816696e-05, + "loss": 0.5899, + "step": 7194 + }, + { + "epoch": 4.058093626621545, + "grad_norm": 1.1068209409713745, + "learning_rate": 2.9712351945854482e-05, + "loss": 0.7753, + "step": 7195 + }, + { + "epoch": 4.058657642413988, + "grad_norm": 1.587481141090393, + "learning_rate": 2.9709531866892277e-05, + "loss": 0.7987, + "step": 7196 + }, + { + "epoch": 4.05922165820643, + "grad_norm": 1.383974313735962, + "learning_rate": 2.9706711787930063e-05, + "loss": 0.7302, + "step": 7197 + }, + { + "epoch": 4.059785673998872, + "grad_norm": 1.3033232688903809, + "learning_rate": 2.970389170896785e-05, + "loss": 0.7508, + "step": 7198 + }, + { + "epoch": 4.060349689791314, + "grad_norm": 1.244996428489685, + "learning_rate": 2.970107163000564e-05, + "loss": 0.7187, + "step": 7199 + }, + { + "epoch": 4.060913705583756, + "grad_norm": 1.3954306840896606, + "learning_rate": 2.9698251551043432e-05, + "loss": 0.808, + "step": 7200 + }, + { + "epoch": 4.061477721376199, + "grad_norm": 0.9359831213951111, + "learning_rate": 2.969543147208122e-05, + "loss": 0.6658, + "step": 7201 + }, + { + "epoch": 4.062041737168641, + "grad_norm": 1.1566426753997803, + "learning_rate": 2.9692611393119007e-05, + "loss": 0.748, + "step": 7202 + }, + { + "epoch": 4.062605752961083, + "grad_norm": 1.86312735080719, + "learning_rate": 2.9689791314156795e-05, + "loss": 0.912, + "step": 7203 + }, + { + "epoch": 4.063169768753525, + "grad_norm": 1.7282949686050415, + "learning_rate": 2.9686971235194588e-05, + "loss": 0.8471, + "step": 7204 + }, + { + "epoch": 4.063733784545967, + "grad_norm": 1.260417103767395, + "learning_rate": 2.9684151156232376e-05, + "loss": 0.7639, + "step": 7205 + }, + { + "epoch": 4.06429780033841, + "grad_norm": 1.2693952322006226, + "learning_rate": 2.9681331077270165e-05, + "loss": 0.6811, + "step": 7206 + }, + { + "epoch": 4.064861816130851, + "grad_norm": 1.3106300830841064, + "learning_rate": 2.9678510998307957e-05, + "loss": 0.684, + "step": 7207 + }, + { + "epoch": 4.065425831923294, + "grad_norm": 1.1499660015106201, + "learning_rate": 2.9675690919345743e-05, + "loss": 0.7084, + "step": 7208 + }, + { + "epoch": 4.065989847715736, + "grad_norm": 0.8865706324577332, + "learning_rate": 2.967287084038353e-05, + "loss": 0.6954, + "step": 7209 + }, + { + "epoch": 4.066553863508179, + "grad_norm": 1.2760432958602905, + "learning_rate": 2.967005076142132e-05, + "loss": 0.7278, + "step": 7210 + }, + { + "epoch": 4.06711787930062, + "grad_norm": 1.2088162899017334, + "learning_rate": 2.9667230682459113e-05, + "loss": 0.7746, + "step": 7211 + }, + { + "epoch": 4.067681895093062, + "grad_norm": 1.0951483249664307, + "learning_rate": 2.96644106034969e-05, + "loss": 0.6373, + "step": 7212 + }, + { + "epoch": 4.068245910885505, + "grad_norm": 1.146074652671814, + "learning_rate": 2.9661590524534687e-05, + "loss": 0.7328, + "step": 7213 + }, + { + "epoch": 4.068809926677947, + "grad_norm": 1.1742992401123047, + "learning_rate": 2.9658770445572476e-05, + "loss": 0.7559, + "step": 7214 + }, + { + "epoch": 4.069373942470389, + "grad_norm": 1.0377042293548584, + "learning_rate": 2.9655950366610268e-05, + "loss": 0.748, + "step": 7215 + }, + { + "epoch": 4.069937958262831, + "grad_norm": 1.5057353973388672, + "learning_rate": 2.9653130287648057e-05, + "loss": 0.7114, + "step": 7216 + }, + { + "epoch": 4.070501974055274, + "grad_norm": 1.3107577562332153, + "learning_rate": 2.9650310208685845e-05, + "loss": 0.874, + "step": 7217 + }, + { + "epoch": 4.071065989847716, + "grad_norm": 1.3019468784332275, + "learning_rate": 2.964749012972363e-05, + "loss": 0.7381, + "step": 7218 + }, + { + "epoch": 4.0716300056401575, + "grad_norm": 1.4460997581481934, + "learning_rate": 2.9644670050761426e-05, + "loss": 0.768, + "step": 7219 + }, + { + "epoch": 4.0721940214326, + "grad_norm": 1.1601325273513794, + "learning_rate": 2.9641849971799212e-05, + "loss": 0.6845, + "step": 7220 + }, + { + "epoch": 4.072758037225042, + "grad_norm": 1.4713853597640991, + "learning_rate": 2.9639029892837e-05, + "loss": 0.7157, + "step": 7221 + }, + { + "epoch": 4.073322053017485, + "grad_norm": 1.001245141029358, + "learning_rate": 2.963620981387479e-05, + "loss": 0.7183, + "step": 7222 + }, + { + "epoch": 4.0738860688099265, + "grad_norm": 1.14930260181427, + "learning_rate": 2.963338973491258e-05, + "loss": 0.6193, + "step": 7223 + }, + { + "epoch": 4.074450084602369, + "grad_norm": 1.348183035850525, + "learning_rate": 2.963056965595037e-05, + "loss": 0.7002, + "step": 7224 + }, + { + "epoch": 4.075014100394811, + "grad_norm": 1.047295093536377, + "learning_rate": 2.9627749576988156e-05, + "loss": 0.7497, + "step": 7225 + }, + { + "epoch": 4.075578116187253, + "grad_norm": 1.077890396118164, + "learning_rate": 2.9624929498025945e-05, + "loss": 0.6453, + "step": 7226 + }, + { + "epoch": 4.0761421319796955, + "grad_norm": 1.4381296634674072, + "learning_rate": 2.9622109419063737e-05, + "loss": 0.758, + "step": 7227 + }, + { + "epoch": 4.076706147772137, + "grad_norm": 1.1611320972442627, + "learning_rate": 2.9619289340101526e-05, + "loss": 0.8469, + "step": 7228 + }, + { + "epoch": 4.07727016356458, + "grad_norm": 1.0427873134613037, + "learning_rate": 2.961646926113931e-05, + "loss": 0.6055, + "step": 7229 + }, + { + "epoch": 4.077834179357022, + "grad_norm": 0.9956427216529846, + "learning_rate": 2.96136491821771e-05, + "loss": 0.7116, + "step": 7230 + }, + { + "epoch": 4.0783981951494646, + "grad_norm": 1.6691648960113525, + "learning_rate": 2.9610829103214892e-05, + "loss": 0.7582, + "step": 7231 + }, + { + "epoch": 4.078962210941906, + "grad_norm": 1.544386386871338, + "learning_rate": 2.960800902425268e-05, + "loss": 0.8024, + "step": 7232 + }, + { + "epoch": 4.079526226734348, + "grad_norm": 0.907722532749176, + "learning_rate": 2.960518894529047e-05, + "loss": 0.6731, + "step": 7233 + }, + { + "epoch": 4.080090242526791, + "grad_norm": 1.2446144819259644, + "learning_rate": 2.9602368866328255e-05, + "loss": 0.6558, + "step": 7234 + }, + { + "epoch": 4.080654258319233, + "grad_norm": 1.3995240926742554, + "learning_rate": 2.959954878736605e-05, + "loss": 0.8439, + "step": 7235 + }, + { + "epoch": 4.081218274111675, + "grad_norm": 1.1575652360916138, + "learning_rate": 2.9596728708403836e-05, + "loss": 0.6983, + "step": 7236 + }, + { + "epoch": 4.081782289904117, + "grad_norm": 1.1180707216262817, + "learning_rate": 2.9593908629441625e-05, + "loss": 0.8491, + "step": 7237 + }, + { + "epoch": 4.08234630569656, + "grad_norm": 0.9774967432022095, + "learning_rate": 2.9591088550479414e-05, + "loss": 0.7629, + "step": 7238 + }, + { + "epoch": 4.082910321489002, + "grad_norm": 1.2620716094970703, + "learning_rate": 2.9588268471517206e-05, + "loss": 0.8063, + "step": 7239 + }, + { + "epoch": 4.0834743372814435, + "grad_norm": 1.0216622352600098, + "learning_rate": 2.9585448392554995e-05, + "loss": 0.6529, + "step": 7240 + }, + { + "epoch": 4.084038353073886, + "grad_norm": 1.1512560844421387, + "learning_rate": 2.958262831359278e-05, + "loss": 0.7225, + "step": 7241 + }, + { + "epoch": 4.084602368866328, + "grad_norm": 1.3521250486373901, + "learning_rate": 2.9579808234630575e-05, + "loss": 0.7908, + "step": 7242 + }, + { + "epoch": 4.085166384658771, + "grad_norm": 1.503287672996521, + "learning_rate": 2.957698815566836e-05, + "loss": 0.759, + "step": 7243 + }, + { + "epoch": 4.0857304004512125, + "grad_norm": 1.7788995504379272, + "learning_rate": 2.957416807670615e-05, + "loss": 0.7512, + "step": 7244 + }, + { + "epoch": 4.086294416243655, + "grad_norm": 1.1418765783309937, + "learning_rate": 2.9571347997743935e-05, + "loss": 0.7551, + "step": 7245 + }, + { + "epoch": 4.086858432036097, + "grad_norm": 0.9333507418632507, + "learning_rate": 2.956852791878173e-05, + "loss": 0.7121, + "step": 7246 + }, + { + "epoch": 4.087422447828539, + "grad_norm": 1.655034065246582, + "learning_rate": 2.9565707839819516e-05, + "loss": 0.7924, + "step": 7247 + }, + { + "epoch": 4.0879864636209815, + "grad_norm": 1.1684801578521729, + "learning_rate": 2.9562887760857305e-05, + "loss": 0.8055, + "step": 7248 + }, + { + "epoch": 4.088550479413423, + "grad_norm": 1.3078534603118896, + "learning_rate": 2.9560067681895094e-05, + "loss": 0.758, + "step": 7249 + }, + { + "epoch": 4.089114495205866, + "grad_norm": 1.512797474861145, + "learning_rate": 2.9557247602932886e-05, + "loss": 0.7071, + "step": 7250 + }, + { + "epoch": 4.089678510998308, + "grad_norm": 1.1509519815444946, + "learning_rate": 2.9554427523970675e-05, + "loss": 0.7447, + "step": 7251 + }, + { + "epoch": 4.0902425267907505, + "grad_norm": 1.1386998891830444, + "learning_rate": 2.955160744500846e-05, + "loss": 0.6867, + "step": 7252 + }, + { + "epoch": 4.090806542583192, + "grad_norm": 1.5175293684005737, + "learning_rate": 2.954878736604625e-05, + "loss": 0.8353, + "step": 7253 + }, + { + "epoch": 4.091370558375634, + "grad_norm": 1.5513694286346436, + "learning_rate": 2.954596728708404e-05, + "loss": 0.7546, + "step": 7254 + }, + { + "epoch": 4.091934574168077, + "grad_norm": 1.3860361576080322, + "learning_rate": 2.954314720812183e-05, + "loss": 0.7895, + "step": 7255 + }, + { + "epoch": 4.092498589960519, + "grad_norm": 1.490984320640564, + "learning_rate": 2.954032712915962e-05, + "loss": 0.7535, + "step": 7256 + }, + { + "epoch": 4.093062605752961, + "grad_norm": 1.1893681287765503, + "learning_rate": 2.9537507050197404e-05, + "loss": 0.726, + "step": 7257 + }, + { + "epoch": 4.093626621545403, + "grad_norm": 0.9472885131835938, + "learning_rate": 2.95346869712352e-05, + "loss": 0.706, + "step": 7258 + }, + { + "epoch": 4.094190637337846, + "grad_norm": 1.6062798500061035, + "learning_rate": 2.9531866892272985e-05, + "loss": 0.6902, + "step": 7259 + }, + { + "epoch": 4.094754653130288, + "grad_norm": 1.409144639968872, + "learning_rate": 2.9529046813310774e-05, + "loss": 0.7074, + "step": 7260 + }, + { + "epoch": 4.095318668922729, + "grad_norm": 1.3398905992507935, + "learning_rate": 2.9526226734348563e-05, + "loss": 0.8055, + "step": 7261 + }, + { + "epoch": 4.095882684715172, + "grad_norm": 1.8813931941986084, + "learning_rate": 2.9523406655386355e-05, + "loss": 0.77, + "step": 7262 + }, + { + "epoch": 4.096446700507614, + "grad_norm": 1.314356803894043, + "learning_rate": 2.952058657642414e-05, + "loss": 0.8433, + "step": 7263 + }, + { + "epoch": 4.097010716300057, + "grad_norm": 1.824635624885559, + "learning_rate": 2.951776649746193e-05, + "loss": 0.9063, + "step": 7264 + }, + { + "epoch": 4.0975747320924985, + "grad_norm": 1.232885718345642, + "learning_rate": 2.9514946418499718e-05, + "loss": 0.7602, + "step": 7265 + }, + { + "epoch": 4.098138747884941, + "grad_norm": 0.9483507871627808, + "learning_rate": 2.951212633953751e-05, + "loss": 0.7068, + "step": 7266 + }, + { + "epoch": 4.098702763677383, + "grad_norm": 2.294335126876831, + "learning_rate": 2.95093062605753e-05, + "loss": 0.6549, + "step": 7267 + }, + { + "epoch": 4.099266779469825, + "grad_norm": 1.0342515707015991, + "learning_rate": 2.9506486181613084e-05, + "loss": 0.7304, + "step": 7268 + }, + { + "epoch": 4.0998307952622675, + "grad_norm": 1.2373123168945312, + "learning_rate": 2.9503666102650873e-05, + "loss": 0.643, + "step": 7269 + }, + { + "epoch": 4.100394811054709, + "grad_norm": 1.3167998790740967, + "learning_rate": 2.9500846023688665e-05, + "loss": 0.7847, + "step": 7270 + }, + { + "epoch": 4.100958826847152, + "grad_norm": 1.3906598091125488, + "learning_rate": 2.9498025944726454e-05, + "loss": 0.7781, + "step": 7271 + }, + { + "epoch": 4.101522842639594, + "grad_norm": 1.1262634992599487, + "learning_rate": 2.9495205865764243e-05, + "loss": 0.8556, + "step": 7272 + }, + { + "epoch": 4.1020868584320365, + "grad_norm": 0.8340985774993896, + "learning_rate": 2.9492385786802028e-05, + "loss": 0.5978, + "step": 7273 + }, + { + "epoch": 4.102650874224478, + "grad_norm": 1.3804664611816406, + "learning_rate": 2.9489565707839824e-05, + "loss": 0.7621, + "step": 7274 + }, + { + "epoch": 4.10321489001692, + "grad_norm": 0.9149504899978638, + "learning_rate": 2.948674562887761e-05, + "loss": 0.6416, + "step": 7275 + }, + { + "epoch": 4.103778905809363, + "grad_norm": 0.9967199563980103, + "learning_rate": 2.9483925549915398e-05, + "loss": 0.7906, + "step": 7276 + }, + { + "epoch": 4.104342921601805, + "grad_norm": 1.0681856870651245, + "learning_rate": 2.948110547095319e-05, + "loss": 0.7617, + "step": 7277 + }, + { + "epoch": 4.104906937394247, + "grad_norm": 3.3266348838806152, + "learning_rate": 2.947828539199098e-05, + "loss": 0.8117, + "step": 7278 + }, + { + "epoch": 4.105470953186689, + "grad_norm": 1.0112215280532837, + "learning_rate": 2.9475465313028768e-05, + "loss": 0.7779, + "step": 7279 + }, + { + "epoch": 4.106034968979132, + "grad_norm": 1.276869773864746, + "learning_rate": 2.9472645234066553e-05, + "loss": 0.7624, + "step": 7280 + }, + { + "epoch": 4.106598984771574, + "grad_norm": 1.9553226232528687, + "learning_rate": 2.9469825155104345e-05, + "loss": 0.7959, + "step": 7281 + }, + { + "epoch": 4.107163000564015, + "grad_norm": 1.4735302925109863, + "learning_rate": 2.9467005076142134e-05, + "loss": 0.7185, + "step": 7282 + }, + { + "epoch": 4.107727016356458, + "grad_norm": 1.3087986707687378, + "learning_rate": 2.9464184997179923e-05, + "loss": 0.7527, + "step": 7283 + }, + { + "epoch": 4.1082910321489, + "grad_norm": 1.004058599472046, + "learning_rate": 2.946136491821771e-05, + "loss": 0.6431, + "step": 7284 + }, + { + "epoch": 4.108855047941343, + "grad_norm": 1.465553879737854, + "learning_rate": 2.9458544839255504e-05, + "loss": 0.7224, + "step": 7285 + }, + { + "epoch": 4.109419063733784, + "grad_norm": 1.0392500162124634, + "learning_rate": 2.945572476029329e-05, + "loss": 0.6652, + "step": 7286 + }, + { + "epoch": 4.109983079526227, + "grad_norm": 1.2806609869003296, + "learning_rate": 2.9452904681331078e-05, + "loss": 0.7008, + "step": 7287 + }, + { + "epoch": 4.110547095318669, + "grad_norm": 2.007162094116211, + "learning_rate": 2.9450084602368867e-05, + "loss": 0.8936, + "step": 7288 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 1.0210779905319214, + "learning_rate": 2.944726452340666e-05, + "loss": 0.7125, + "step": 7289 + }, + { + "epoch": 4.111675126903553, + "grad_norm": 1.1421352624893188, + "learning_rate": 2.9444444444444448e-05, + "loss": 0.6999, + "step": 7290 + }, + { + "epoch": 4.112239142695995, + "grad_norm": 1.2998677492141724, + "learning_rate": 2.9441624365482233e-05, + "loss": 0.7685, + "step": 7291 + }, + { + "epoch": 4.112803158488438, + "grad_norm": 1.3232495784759521, + "learning_rate": 2.9438804286520022e-05, + "loss": 0.8709, + "step": 7292 + }, + { + "epoch": 4.11336717428088, + "grad_norm": 1.657104730606079, + "learning_rate": 2.9435984207557814e-05, + "loss": 0.7864, + "step": 7293 + }, + { + "epoch": 4.113931190073322, + "grad_norm": 0.8818696141242981, + "learning_rate": 2.9433164128595603e-05, + "loss": 0.6466, + "step": 7294 + }, + { + "epoch": 4.114495205865764, + "grad_norm": 1.585519552230835, + "learning_rate": 2.9430344049633392e-05, + "loss": 0.7667, + "step": 7295 + }, + { + "epoch": 4.115059221658206, + "grad_norm": 2.459195613861084, + "learning_rate": 2.9427523970671177e-05, + "loss": 0.7136, + "step": 7296 + }, + { + "epoch": 4.115623237450649, + "grad_norm": 1.2214964628219604, + "learning_rate": 2.9424703891708973e-05, + "loss": 0.7366, + "step": 7297 + }, + { + "epoch": 4.1161872532430905, + "grad_norm": 1.0008244514465332, + "learning_rate": 2.9421883812746758e-05, + "loss": 0.7532, + "step": 7298 + }, + { + "epoch": 4.116751269035533, + "grad_norm": 1.8608802556991577, + "learning_rate": 2.9419063733784547e-05, + "loss": 0.7293, + "step": 7299 + }, + { + "epoch": 4.117315284827975, + "grad_norm": 1.61432945728302, + "learning_rate": 2.9416243654822332e-05, + "loss": 0.7224, + "step": 7300 + }, + { + "epoch": 4.117879300620418, + "grad_norm": 1.8994171619415283, + "learning_rate": 2.9413423575860128e-05, + "loss": 0.8196, + "step": 7301 + }, + { + "epoch": 4.1184433164128595, + "grad_norm": 1.4072312116622925, + "learning_rate": 2.9410603496897913e-05, + "loss": 0.8049, + "step": 7302 + }, + { + "epoch": 4.119007332205301, + "grad_norm": 1.0233216285705566, + "learning_rate": 2.9407783417935702e-05, + "loss": 0.7157, + "step": 7303 + }, + { + "epoch": 4.119571347997744, + "grad_norm": 0.9388818740844727, + "learning_rate": 2.940496333897349e-05, + "loss": 0.7368, + "step": 7304 + }, + { + "epoch": 4.120135363790186, + "grad_norm": 1.5019656419754028, + "learning_rate": 2.9402143260011283e-05, + "loss": 0.8137, + "step": 7305 + }, + { + "epoch": 4.1206993795826286, + "grad_norm": 0.968132495880127, + "learning_rate": 2.9399323181049072e-05, + "loss": 0.6935, + "step": 7306 + }, + { + "epoch": 4.12126339537507, + "grad_norm": 1.8797537088394165, + "learning_rate": 2.9396503102086857e-05, + "loss": 0.7857, + "step": 7307 + }, + { + "epoch": 4.121827411167513, + "grad_norm": 1.5193641185760498, + "learning_rate": 2.9393683023124646e-05, + "loss": 0.8357, + "step": 7308 + }, + { + "epoch": 4.122391426959955, + "grad_norm": 1.4226752519607544, + "learning_rate": 2.939086294416244e-05, + "loss": 0.7523, + "step": 7309 + }, + { + "epoch": 4.122955442752397, + "grad_norm": 1.0655790567398071, + "learning_rate": 2.9388042865200227e-05, + "loss": 0.6102, + "step": 7310 + }, + { + "epoch": 4.123519458544839, + "grad_norm": 0.8869748115539551, + "learning_rate": 2.9385222786238016e-05, + "loss": 0.7531, + "step": 7311 + }, + { + "epoch": 4.124083474337281, + "grad_norm": 1.6200923919677734, + "learning_rate": 2.9382402707275808e-05, + "loss": 0.8206, + "step": 7312 + }, + { + "epoch": 4.124647490129724, + "grad_norm": 1.46846342086792, + "learning_rate": 2.9379582628313597e-05, + "loss": 0.8837, + "step": 7313 + }, + { + "epoch": 4.125211505922166, + "grad_norm": 1.179491400718689, + "learning_rate": 2.9376762549351382e-05, + "loss": 0.7471, + "step": 7314 + }, + { + "epoch": 4.125775521714608, + "grad_norm": 1.1802564859390259, + "learning_rate": 2.937394247038917e-05, + "loss": 0.7816, + "step": 7315 + }, + { + "epoch": 4.12633953750705, + "grad_norm": 1.4354242086410522, + "learning_rate": 2.9371122391426963e-05, + "loss": 0.6922, + "step": 7316 + }, + { + "epoch": 4.126903553299492, + "grad_norm": 1.204710841178894, + "learning_rate": 2.9368302312464752e-05, + "loss": 0.7404, + "step": 7317 + }, + { + "epoch": 4.127467569091935, + "grad_norm": 1.2317306995391846, + "learning_rate": 2.9365482233502538e-05, + "loss": 0.6286, + "step": 7318 + }, + { + "epoch": 4.1280315848843765, + "grad_norm": 1.800484538078308, + "learning_rate": 2.9362662154540326e-05, + "loss": 0.8092, + "step": 7319 + }, + { + "epoch": 4.128595600676819, + "grad_norm": 1.360711693763733, + "learning_rate": 2.935984207557812e-05, + "loss": 0.6075, + "step": 7320 + }, + { + "epoch": 4.129159616469261, + "grad_norm": 1.2495315074920654, + "learning_rate": 2.9357021996615907e-05, + "loss": 0.7904, + "step": 7321 + }, + { + "epoch": 4.129723632261704, + "grad_norm": 1.0052237510681152, + "learning_rate": 2.9354201917653696e-05, + "loss": 0.7873, + "step": 7322 + }, + { + "epoch": 4.1302876480541455, + "grad_norm": 2.0920300483703613, + "learning_rate": 2.935138183869148e-05, + "loss": 0.8542, + "step": 7323 + }, + { + "epoch": 4.130851663846587, + "grad_norm": 1.4103940725326538, + "learning_rate": 2.9348561759729277e-05, + "loss": 0.7145, + "step": 7324 + }, + { + "epoch": 4.13141567963903, + "grad_norm": 1.3879151344299316, + "learning_rate": 2.9345741680767063e-05, + "loss": 0.7302, + "step": 7325 + }, + { + "epoch": 4.131979695431472, + "grad_norm": 1.0688679218292236, + "learning_rate": 2.934292160180485e-05, + "loss": 0.785, + "step": 7326 + }, + { + "epoch": 4.1325437112239145, + "grad_norm": 1.1702179908752441, + "learning_rate": 2.934010152284264e-05, + "loss": 0.7174, + "step": 7327 + }, + { + "epoch": 4.133107727016356, + "grad_norm": 1.2426679134368896, + "learning_rate": 2.9337281443880432e-05, + "loss": 0.7691, + "step": 7328 + }, + { + "epoch": 4.133671742808799, + "grad_norm": 1.2468554973602295, + "learning_rate": 2.933446136491822e-05, + "loss": 0.7654, + "step": 7329 + }, + { + "epoch": 4.134235758601241, + "grad_norm": 1.4814362525939941, + "learning_rate": 2.9331641285956007e-05, + "loss": 0.7558, + "step": 7330 + }, + { + "epoch": 4.134799774393683, + "grad_norm": 3.029637575149536, + "learning_rate": 2.9328821206993795e-05, + "loss": 0.74, + "step": 7331 + }, + { + "epoch": 4.135363790186125, + "grad_norm": 1.4690608978271484, + "learning_rate": 2.9326001128031588e-05, + "loss": 0.6429, + "step": 7332 + }, + { + "epoch": 4.135927805978567, + "grad_norm": 1.5132571458816528, + "learning_rate": 2.9323181049069376e-05, + "loss": 0.7872, + "step": 7333 + }, + { + "epoch": 4.13649182177101, + "grad_norm": 1.4467540979385376, + "learning_rate": 2.9320360970107165e-05, + "loss": 0.7861, + "step": 7334 + }, + { + "epoch": 4.137055837563452, + "grad_norm": 1.3882991075515747, + "learning_rate": 2.931754089114495e-05, + "loss": 0.7149, + "step": 7335 + }, + { + "epoch": 4.137619853355894, + "grad_norm": 1.1785598993301392, + "learning_rate": 2.9314720812182743e-05, + "loss": 0.7711, + "step": 7336 + }, + { + "epoch": 4.138183869148336, + "grad_norm": 1.0346096754074097, + "learning_rate": 2.931190073322053e-05, + "loss": 0.7026, + "step": 7337 + }, + { + "epoch": 4.138747884940778, + "grad_norm": 1.7306232452392578, + "learning_rate": 2.930908065425832e-05, + "loss": 0.7841, + "step": 7338 + }, + { + "epoch": 4.139311900733221, + "grad_norm": 1.0788506269454956, + "learning_rate": 2.9306260575296106e-05, + "loss": 0.8394, + "step": 7339 + }, + { + "epoch": 4.1398759165256624, + "grad_norm": 1.3114646673202515, + "learning_rate": 2.93034404963339e-05, + "loss": 0.7646, + "step": 7340 + }, + { + "epoch": 4.140439932318105, + "grad_norm": 1.861124038696289, + "learning_rate": 2.9300620417371687e-05, + "loss": 0.836, + "step": 7341 + }, + { + "epoch": 4.141003948110547, + "grad_norm": 1.1666334867477417, + "learning_rate": 2.9297800338409475e-05, + "loss": 0.7809, + "step": 7342 + }, + { + "epoch": 4.14156796390299, + "grad_norm": 1.3580727577209473, + "learning_rate": 2.9294980259447264e-05, + "loss": 0.6792, + "step": 7343 + }, + { + "epoch": 4.1421319796954315, + "grad_norm": 1.2163246870040894, + "learning_rate": 2.9292160180485056e-05, + "loss": 0.7465, + "step": 7344 + }, + { + "epoch": 4.142695995487873, + "grad_norm": 1.5040147304534912, + "learning_rate": 2.9289340101522845e-05, + "loss": 0.6591, + "step": 7345 + }, + { + "epoch": 4.143260011280316, + "grad_norm": 1.0072888135910034, + "learning_rate": 2.928652002256063e-05, + "loss": 0.6955, + "step": 7346 + }, + { + "epoch": 4.143824027072758, + "grad_norm": 1.4772967100143433, + "learning_rate": 2.928369994359842e-05, + "loss": 0.7655, + "step": 7347 + }, + { + "epoch": 4.1443880428652005, + "grad_norm": 1.1749544143676758, + "learning_rate": 2.928087986463621e-05, + "loss": 0.7967, + "step": 7348 + }, + { + "epoch": 4.144952058657642, + "grad_norm": 1.1486541032791138, + "learning_rate": 2.9278059785674e-05, + "loss": 0.6833, + "step": 7349 + }, + { + "epoch": 4.145516074450085, + "grad_norm": 1.5317413806915283, + "learning_rate": 2.927523970671179e-05, + "loss": 0.8394, + "step": 7350 + }, + { + "epoch": 4.146080090242527, + "grad_norm": 1.825321912765503, + "learning_rate": 2.927241962774958e-05, + "loss": 0.8006, + "step": 7351 + }, + { + "epoch": 4.146644106034969, + "grad_norm": 1.8185174465179443, + "learning_rate": 2.926959954878737e-05, + "loss": 0.8441, + "step": 7352 + }, + { + "epoch": 4.147208121827411, + "grad_norm": 1.7766133546829224, + "learning_rate": 2.9266779469825156e-05, + "loss": 0.8582, + "step": 7353 + }, + { + "epoch": 4.147772137619853, + "grad_norm": 1.2579047679901123, + "learning_rate": 2.9263959390862944e-05, + "loss": 0.7922, + "step": 7354 + }, + { + "epoch": 4.148336153412296, + "grad_norm": 1.074658989906311, + "learning_rate": 2.9261139311900737e-05, + "loss": 0.7175, + "step": 7355 + }, + { + "epoch": 4.148900169204738, + "grad_norm": 1.0186654329299927, + "learning_rate": 2.9258319232938525e-05, + "loss": 0.728, + "step": 7356 + }, + { + "epoch": 4.14946418499718, + "grad_norm": 1.2123432159423828, + "learning_rate": 2.925549915397631e-05, + "loss": 0.759, + "step": 7357 + }, + { + "epoch": 4.150028200789622, + "grad_norm": 1.1066114902496338, + "learning_rate": 2.92526790750141e-05, + "loss": 0.8668, + "step": 7358 + }, + { + "epoch": 4.150592216582064, + "grad_norm": 1.3145573139190674, + "learning_rate": 2.9249858996051892e-05, + "loss": 0.7676, + "step": 7359 + }, + { + "epoch": 4.151156232374507, + "grad_norm": 1.0190306901931763, + "learning_rate": 2.924703891708968e-05, + "loss": 0.6817, + "step": 7360 + }, + { + "epoch": 4.151720248166948, + "grad_norm": 1.4471497535705566, + "learning_rate": 2.924421883812747e-05, + "loss": 0.7288, + "step": 7361 + }, + { + "epoch": 4.152284263959391, + "grad_norm": 1.1844364404678345, + "learning_rate": 2.9241398759165255e-05, + "loss": 0.6506, + "step": 7362 + }, + { + "epoch": 4.152848279751833, + "grad_norm": 1.367889165878296, + "learning_rate": 2.923857868020305e-05, + "loss": 0.8248, + "step": 7363 + }, + { + "epoch": 4.153412295544276, + "grad_norm": 1.28171706199646, + "learning_rate": 2.9235758601240836e-05, + "loss": 0.8002, + "step": 7364 + }, + { + "epoch": 4.153976311336717, + "grad_norm": 1.0485588312149048, + "learning_rate": 2.9232938522278625e-05, + "loss": 0.6574, + "step": 7365 + }, + { + "epoch": 4.154540327129159, + "grad_norm": 0.9751696586608887, + "learning_rate": 2.9230118443316413e-05, + "loss": 0.704, + "step": 7366 + }, + { + "epoch": 4.155104342921602, + "grad_norm": 1.639731764793396, + "learning_rate": 2.9227298364354206e-05, + "loss": 0.7728, + "step": 7367 + }, + { + "epoch": 4.155668358714044, + "grad_norm": 1.3764383792877197, + "learning_rate": 2.9224478285391994e-05, + "loss": 0.7182, + "step": 7368 + }, + { + "epoch": 4.156232374506486, + "grad_norm": 1.5765769481658936, + "learning_rate": 2.922165820642978e-05, + "loss": 0.8526, + "step": 7369 + }, + { + "epoch": 4.156796390298928, + "grad_norm": 1.3434207439422607, + "learning_rate": 2.921883812746757e-05, + "loss": 0.7429, + "step": 7370 + }, + { + "epoch": 4.157360406091371, + "grad_norm": 1.4400140047073364, + "learning_rate": 2.921601804850536e-05, + "loss": 0.7979, + "step": 7371 + }, + { + "epoch": 4.157924421883813, + "grad_norm": 1.1340104341506958, + "learning_rate": 2.921319796954315e-05, + "loss": 0.6726, + "step": 7372 + }, + { + "epoch": 4.1584884376762545, + "grad_norm": 2.146251916885376, + "learning_rate": 2.921037789058094e-05, + "loss": 0.8156, + "step": 7373 + }, + { + "epoch": 4.159052453468697, + "grad_norm": 1.3704848289489746, + "learning_rate": 2.9207557811618724e-05, + "loss": 0.7376, + "step": 7374 + }, + { + "epoch": 4.159616469261139, + "grad_norm": 2.0294556617736816, + "learning_rate": 2.9204737732656516e-05, + "loss": 0.8245, + "step": 7375 + }, + { + "epoch": 4.160180485053582, + "grad_norm": 1.160467267036438, + "learning_rate": 2.9201917653694305e-05, + "loss": 0.7584, + "step": 7376 + }, + { + "epoch": 4.1607445008460235, + "grad_norm": 1.2247822284698486, + "learning_rate": 2.9199097574732094e-05, + "loss": 0.7552, + "step": 7377 + }, + { + "epoch": 4.161308516638466, + "grad_norm": 1.2906190156936646, + "learning_rate": 2.919627749576988e-05, + "loss": 0.7787, + "step": 7378 + }, + { + "epoch": 4.161872532430908, + "grad_norm": 1.579561471939087, + "learning_rate": 2.9193457416807674e-05, + "loss": 0.7832, + "step": 7379 + }, + { + "epoch": 4.16243654822335, + "grad_norm": 1.3806830644607544, + "learning_rate": 2.919063733784546e-05, + "loss": 0.8169, + "step": 7380 + }, + { + "epoch": 4.1630005640157925, + "grad_norm": 0.9083021879196167, + "learning_rate": 2.918781725888325e-05, + "loss": 0.6413, + "step": 7381 + }, + { + "epoch": 4.163564579808234, + "grad_norm": 1.300988793373108, + "learning_rate": 2.9184997179921037e-05, + "loss": 0.8904, + "step": 7382 + }, + { + "epoch": 4.164128595600677, + "grad_norm": 1.3474498987197876, + "learning_rate": 2.918217710095883e-05, + "loss": 0.7822, + "step": 7383 + }, + { + "epoch": 4.164692611393119, + "grad_norm": 1.137424349784851, + "learning_rate": 2.917935702199662e-05, + "loss": 0.7746, + "step": 7384 + }, + { + "epoch": 4.1652566271855616, + "grad_norm": 0.9866982102394104, + "learning_rate": 2.9176536943034404e-05, + "loss": 0.6752, + "step": 7385 + }, + { + "epoch": 4.165820642978003, + "grad_norm": 1.1243690252304077, + "learning_rate": 2.91737168640722e-05, + "loss": 0.7579, + "step": 7386 + }, + { + "epoch": 4.166384658770445, + "grad_norm": 1.2770106792449951, + "learning_rate": 2.9170896785109985e-05, + "loss": 0.835, + "step": 7387 + }, + { + "epoch": 4.166948674562888, + "grad_norm": 0.8662923574447632, + "learning_rate": 2.9168076706147774e-05, + "loss": 0.6576, + "step": 7388 + }, + { + "epoch": 4.16751269035533, + "grad_norm": 1.7128015756607056, + "learning_rate": 2.9165256627185562e-05, + "loss": 0.6448, + "step": 7389 + }, + { + "epoch": 4.168076706147772, + "grad_norm": 0.9110943078994751, + "learning_rate": 2.9162436548223355e-05, + "loss": 0.679, + "step": 7390 + }, + { + "epoch": 4.168640721940214, + "grad_norm": 1.1977927684783936, + "learning_rate": 2.9159616469261143e-05, + "loss": 0.718, + "step": 7391 + }, + { + "epoch": 4.169204737732657, + "grad_norm": 1.1111438274383545, + "learning_rate": 2.915679639029893e-05, + "loss": 0.7476, + "step": 7392 + }, + { + "epoch": 4.169768753525099, + "grad_norm": 1.3958885669708252, + "learning_rate": 2.9153976311336718e-05, + "loss": 0.7893, + "step": 7393 + }, + { + "epoch": 4.1703327693175405, + "grad_norm": 1.3863195180892944, + "learning_rate": 2.915115623237451e-05, + "loss": 0.7603, + "step": 7394 + }, + { + "epoch": 4.170896785109983, + "grad_norm": 1.2410259246826172, + "learning_rate": 2.91483361534123e-05, + "loss": 0.7316, + "step": 7395 + }, + { + "epoch": 4.171460800902425, + "grad_norm": 1.8145923614501953, + "learning_rate": 2.9145516074450084e-05, + "loss": 0.8432, + "step": 7396 + }, + { + "epoch": 4.172024816694868, + "grad_norm": 4.86771297454834, + "learning_rate": 2.9142695995487873e-05, + "loss": 0.7286, + "step": 7397 + }, + { + "epoch": 4.1725888324873095, + "grad_norm": 1.2782469987869263, + "learning_rate": 2.9139875916525665e-05, + "loss": 0.6857, + "step": 7398 + }, + { + "epoch": 4.173152848279752, + "grad_norm": 0.9716362357139587, + "learning_rate": 2.9137055837563454e-05, + "loss": 0.8082, + "step": 7399 + }, + { + "epoch": 4.173716864072194, + "grad_norm": 1.3365226984024048, + "learning_rate": 2.9134235758601243e-05, + "loss": 0.841, + "step": 7400 + }, + { + "epoch": 4.174280879864636, + "grad_norm": 1.4887583255767822, + "learning_rate": 2.9131415679639028e-05, + "loss": 0.6406, + "step": 7401 + }, + { + "epoch": 4.1748448956570785, + "grad_norm": 1.539617657661438, + "learning_rate": 2.9128595600676824e-05, + "loss": 0.8401, + "step": 7402 + }, + { + "epoch": 4.17540891144952, + "grad_norm": 1.1204580068588257, + "learning_rate": 2.912577552171461e-05, + "loss": 0.7488, + "step": 7403 + }, + { + "epoch": 4.175972927241963, + "grad_norm": 1.4825910329818726, + "learning_rate": 2.9122955442752398e-05, + "loss": 0.8151, + "step": 7404 + }, + { + "epoch": 4.176536943034405, + "grad_norm": 1.2296122312545776, + "learning_rate": 2.9120135363790187e-05, + "loss": 0.8268, + "step": 7405 + }, + { + "epoch": 4.1771009588268475, + "grad_norm": 1.1539522409439087, + "learning_rate": 2.911731528482798e-05, + "loss": 0.7411, + "step": 7406 + }, + { + "epoch": 4.177664974619289, + "grad_norm": 1.502611756324768, + "learning_rate": 2.9114495205865768e-05, + "loss": 0.829, + "step": 7407 + }, + { + "epoch": 4.178228990411731, + "grad_norm": 1.0948576927185059, + "learning_rate": 2.9111675126903553e-05, + "loss": 0.7448, + "step": 7408 + }, + { + "epoch": 4.178793006204174, + "grad_norm": 1.2993348836898804, + "learning_rate": 2.9108855047941342e-05, + "loss": 0.8274, + "step": 7409 + }, + { + "epoch": 4.179357021996616, + "grad_norm": 1.2840576171875, + "learning_rate": 2.9106034968979134e-05, + "loss": 0.7638, + "step": 7410 + }, + { + "epoch": 4.179921037789058, + "grad_norm": 4.355712413787842, + "learning_rate": 2.9103214890016923e-05, + "loss": 0.8054, + "step": 7411 + }, + { + "epoch": 4.1804850535815, + "grad_norm": 2.9976673126220703, + "learning_rate": 2.9100394811054708e-05, + "loss": 0.6881, + "step": 7412 + }, + { + "epoch": 4.181049069373943, + "grad_norm": 1.3582007884979248, + "learning_rate": 2.9097574732092497e-05, + "loss": 0.7414, + "step": 7413 + }, + { + "epoch": 4.181613085166385, + "grad_norm": 0.9957648515701294, + "learning_rate": 2.909475465313029e-05, + "loss": 0.7768, + "step": 7414 + }, + { + "epoch": 4.182177100958826, + "grad_norm": 1.581167459487915, + "learning_rate": 2.9091934574168078e-05, + "loss": 0.7062, + "step": 7415 + }, + { + "epoch": 4.182741116751269, + "grad_norm": 1.5339040756225586, + "learning_rate": 2.9089114495205867e-05, + "loss": 0.8237, + "step": 7416 + }, + { + "epoch": 4.183305132543711, + "grad_norm": 1.3419595956802368, + "learning_rate": 2.9086294416243652e-05, + "loss": 0.7949, + "step": 7417 + }, + { + "epoch": 4.183869148336154, + "grad_norm": 1.432178020477295, + "learning_rate": 2.9083474337281448e-05, + "loss": 0.7044, + "step": 7418 + }, + { + "epoch": 4.1844331641285955, + "grad_norm": 1.5273312330245972, + "learning_rate": 2.9080654258319233e-05, + "loss": 0.8291, + "step": 7419 + }, + { + "epoch": 4.184997179921038, + "grad_norm": 1.5109100341796875, + "learning_rate": 2.9077834179357022e-05, + "loss": 0.7714, + "step": 7420 + }, + { + "epoch": 4.18556119571348, + "grad_norm": 1.1098859310150146, + "learning_rate": 2.9075014100394814e-05, + "loss": 0.6849, + "step": 7421 + }, + { + "epoch": 4.186125211505922, + "grad_norm": 1.296301245689392, + "learning_rate": 2.9072194021432603e-05, + "loss": 0.7544, + "step": 7422 + }, + { + "epoch": 4.1866892272983645, + "grad_norm": 0.8709320425987244, + "learning_rate": 2.9069373942470392e-05, + "loss": 0.6539, + "step": 7423 + }, + { + "epoch": 4.187253243090806, + "grad_norm": 1.265250325202942, + "learning_rate": 2.9066553863508177e-05, + "loss": 0.8558, + "step": 7424 + }, + { + "epoch": 4.187817258883249, + "grad_norm": 1.229979395866394, + "learning_rate": 2.9063733784545973e-05, + "loss": 0.7246, + "step": 7425 + }, + { + "epoch": 4.188381274675691, + "grad_norm": 1.0658034086227417, + "learning_rate": 2.9060913705583758e-05, + "loss": 0.6623, + "step": 7426 + }, + { + "epoch": 4.1889452904681335, + "grad_norm": 0.8444039225578308, + "learning_rate": 2.9058093626621547e-05, + "loss": 0.6818, + "step": 7427 + }, + { + "epoch": 4.189509306260575, + "grad_norm": 1.1133105754852295, + "learning_rate": 2.9055273547659336e-05, + "loss": 0.7719, + "step": 7428 + }, + { + "epoch": 4.190073322053017, + "grad_norm": 1.2504092454910278, + "learning_rate": 2.9052453468697128e-05, + "loss": 0.7154, + "step": 7429 + }, + { + "epoch": 4.19063733784546, + "grad_norm": 1.3953933715820312, + "learning_rate": 2.9049633389734913e-05, + "loss": 0.7244, + "step": 7430 + }, + { + "epoch": 4.191201353637902, + "grad_norm": 1.0837231874465942, + "learning_rate": 2.9046813310772702e-05, + "loss": 0.6542, + "step": 7431 + }, + { + "epoch": 4.191765369430344, + "grad_norm": 1.3716115951538086, + "learning_rate": 2.904399323181049e-05, + "loss": 0.7154, + "step": 7432 + }, + { + "epoch": 4.192329385222786, + "grad_norm": 1.1164768934249878, + "learning_rate": 2.9041173152848283e-05, + "loss": 0.6803, + "step": 7433 + }, + { + "epoch": 4.192893401015229, + "grad_norm": 1.1872515678405762, + "learning_rate": 2.9038353073886072e-05, + "loss": 0.6498, + "step": 7434 + }, + { + "epoch": 4.193457416807671, + "grad_norm": 1.423502802848816, + "learning_rate": 2.9035532994923857e-05, + "loss": 0.7431, + "step": 7435 + }, + { + "epoch": 4.194021432600112, + "grad_norm": 1.3352141380310059, + "learning_rate": 2.9032712915961646e-05, + "loss": 0.8883, + "step": 7436 + }, + { + "epoch": 4.194585448392555, + "grad_norm": 1.527648687362671, + "learning_rate": 2.9029892836999438e-05, + "loss": 0.824, + "step": 7437 + }, + { + "epoch": 4.195149464184997, + "grad_norm": 1.4438451528549194, + "learning_rate": 2.9027072758037227e-05, + "loss": 0.8201, + "step": 7438 + }, + { + "epoch": 4.19571347997744, + "grad_norm": 1.684901475906372, + "learning_rate": 2.9024252679075016e-05, + "loss": 0.7287, + "step": 7439 + }, + { + "epoch": 4.196277495769881, + "grad_norm": 1.5712018013000488, + "learning_rate": 2.90214326001128e-05, + "loss": 0.8048, + "step": 7440 + }, + { + "epoch": 4.196841511562324, + "grad_norm": 1.592553973197937, + "learning_rate": 2.9018612521150597e-05, + "loss": 0.8209, + "step": 7441 + }, + { + "epoch": 4.197405527354766, + "grad_norm": 1.2974241971969604, + "learning_rate": 2.9015792442188382e-05, + "loss": 0.7498, + "step": 7442 + }, + { + "epoch": 4.197969543147208, + "grad_norm": 1.1044915914535522, + "learning_rate": 2.901297236322617e-05, + "loss": 0.7652, + "step": 7443 + }, + { + "epoch": 4.19853355893965, + "grad_norm": 1.1907480955123901, + "learning_rate": 2.901015228426396e-05, + "loss": 0.8122, + "step": 7444 + }, + { + "epoch": 4.199097574732092, + "grad_norm": 1.1884511709213257, + "learning_rate": 2.9007332205301752e-05, + "loss": 0.7518, + "step": 7445 + }, + { + "epoch": 4.199661590524535, + "grad_norm": 1.28415846824646, + "learning_rate": 2.900451212633954e-05, + "loss": 0.7898, + "step": 7446 + }, + { + "epoch": 4.200225606316977, + "grad_norm": 1.6321463584899902, + "learning_rate": 2.9001692047377326e-05, + "loss": 0.7361, + "step": 7447 + }, + { + "epoch": 4.200789622109419, + "grad_norm": 1.383212685585022, + "learning_rate": 2.8998871968415115e-05, + "loss": 0.8216, + "step": 7448 + }, + { + "epoch": 4.201353637901861, + "grad_norm": 1.0881156921386719, + "learning_rate": 2.8996051889452907e-05, + "loss": 0.7789, + "step": 7449 + }, + { + "epoch": 4.201917653694303, + "grad_norm": 1.2229971885681152, + "learning_rate": 2.8993231810490696e-05, + "loss": 0.7973, + "step": 7450 + }, + { + "epoch": 4.202481669486746, + "grad_norm": 1.6354767084121704, + "learning_rate": 2.899041173152848e-05, + "loss": 0.8109, + "step": 7451 + }, + { + "epoch": 4.2030456852791875, + "grad_norm": 1.0732413530349731, + "learning_rate": 2.898759165256627e-05, + "loss": 0.7169, + "step": 7452 + }, + { + "epoch": 4.20360970107163, + "grad_norm": 1.220512866973877, + "learning_rate": 2.8984771573604062e-05, + "loss": 0.6783, + "step": 7453 + }, + { + "epoch": 4.204173716864072, + "grad_norm": 1.4865845441818237, + "learning_rate": 2.898195149464185e-05, + "loss": 0.785, + "step": 7454 + }, + { + "epoch": 4.204737732656515, + "grad_norm": 1.6454914808273315, + "learning_rate": 2.897913141567964e-05, + "loss": 0.7929, + "step": 7455 + }, + { + "epoch": 4.2053017484489565, + "grad_norm": 1.4064222574234009, + "learning_rate": 2.8976311336717432e-05, + "loss": 0.725, + "step": 7456 + }, + { + "epoch": 4.205865764241398, + "grad_norm": 1.2050362825393677, + "learning_rate": 2.897349125775522e-05, + "loss": 0.8112, + "step": 7457 + }, + { + "epoch": 4.206429780033841, + "grad_norm": 1.1174520254135132, + "learning_rate": 2.8970671178793006e-05, + "loss": 0.8123, + "step": 7458 + }, + { + "epoch": 4.206993795826283, + "grad_norm": 1.2994647026062012, + "learning_rate": 2.8967851099830795e-05, + "loss": 0.6967, + "step": 7459 + }, + { + "epoch": 4.2075578116187256, + "grad_norm": 0.9888596534729004, + "learning_rate": 2.8965031020868587e-05, + "loss": 0.7428, + "step": 7460 + }, + { + "epoch": 4.208121827411167, + "grad_norm": 4.483127117156982, + "learning_rate": 2.8962210941906376e-05, + "loss": 0.8074, + "step": 7461 + }, + { + "epoch": 4.20868584320361, + "grad_norm": 0.9928138852119446, + "learning_rate": 2.8959390862944165e-05, + "loss": 0.6782, + "step": 7462 + }, + { + "epoch": 4.209249858996052, + "grad_norm": 1.5280171632766724, + "learning_rate": 2.895657078398195e-05, + "loss": 0.7962, + "step": 7463 + }, + { + "epoch": 4.209813874788494, + "grad_norm": 1.0244241952896118, + "learning_rate": 2.8953750705019746e-05, + "loss": 0.59, + "step": 7464 + }, + { + "epoch": 4.210377890580936, + "grad_norm": 1.7350447177886963, + "learning_rate": 2.895093062605753e-05, + "loss": 0.87, + "step": 7465 + }, + { + "epoch": 4.210941906373378, + "grad_norm": 1.29668390750885, + "learning_rate": 2.894811054709532e-05, + "loss": 0.6504, + "step": 7466 + }, + { + "epoch": 4.211505922165821, + "grad_norm": 1.0736548900604248, + "learning_rate": 2.8945290468133106e-05, + "loss": 0.6308, + "step": 7467 + }, + { + "epoch": 4.212069937958263, + "grad_norm": 1.0866085290908813, + "learning_rate": 2.89424703891709e-05, + "loss": 0.8132, + "step": 7468 + }, + { + "epoch": 4.212633953750705, + "grad_norm": 1.1525853872299194, + "learning_rate": 2.8939650310208687e-05, + "loss": 0.7066, + "step": 7469 + }, + { + "epoch": 4.213197969543147, + "grad_norm": 1.1951894760131836, + "learning_rate": 2.8936830231246475e-05, + "loss": 0.7927, + "step": 7470 + }, + { + "epoch": 4.213761985335589, + "grad_norm": 1.1270670890808105, + "learning_rate": 2.8934010152284264e-05, + "loss": 0.6571, + "step": 7471 + }, + { + "epoch": 4.214326001128032, + "grad_norm": 0.9981789588928223, + "learning_rate": 2.8931190073322056e-05, + "loss": 0.6846, + "step": 7472 + }, + { + "epoch": 4.2148900169204735, + "grad_norm": 1.276862621307373, + "learning_rate": 2.8928369994359845e-05, + "loss": 0.7451, + "step": 7473 + }, + { + "epoch": 4.215454032712916, + "grad_norm": 1.02895188331604, + "learning_rate": 2.892554991539763e-05, + "loss": 0.7169, + "step": 7474 + }, + { + "epoch": 4.216018048505358, + "grad_norm": 1.450885534286499, + "learning_rate": 2.892272983643542e-05, + "loss": 0.8729, + "step": 7475 + }, + { + "epoch": 4.216582064297801, + "grad_norm": 0.8635163903236389, + "learning_rate": 2.891990975747321e-05, + "loss": 0.6318, + "step": 7476 + }, + { + "epoch": 4.2171460800902425, + "grad_norm": 1.4558134078979492, + "learning_rate": 2.8917089678511e-05, + "loss": 0.7637, + "step": 7477 + }, + { + "epoch": 4.217710095882684, + "grad_norm": 1.1474354267120361, + "learning_rate": 2.891426959954879e-05, + "loss": 0.8186, + "step": 7478 + }, + { + "epoch": 4.218274111675127, + "grad_norm": 1.4167612791061401, + "learning_rate": 2.8911449520586574e-05, + "loss": 0.7794, + "step": 7479 + }, + { + "epoch": 4.218838127467569, + "grad_norm": 1.1063816547393799, + "learning_rate": 2.890862944162437e-05, + "loss": 0.7735, + "step": 7480 + }, + { + "epoch": 4.2194021432600115, + "grad_norm": 2.2590343952178955, + "learning_rate": 2.8905809362662155e-05, + "loss": 0.8418, + "step": 7481 + }, + { + "epoch": 4.219966159052453, + "grad_norm": 1.234124779701233, + "learning_rate": 2.8902989283699944e-05, + "loss": 0.737, + "step": 7482 + }, + { + "epoch": 4.220530174844896, + "grad_norm": 2.1710996627807617, + "learning_rate": 2.8900169204737733e-05, + "loss": 0.711, + "step": 7483 + }, + { + "epoch": 4.221094190637338, + "grad_norm": 1.583486557006836, + "learning_rate": 2.8897349125775525e-05, + "loss": 0.764, + "step": 7484 + }, + { + "epoch": 4.22165820642978, + "grad_norm": 1.0688797235488892, + "learning_rate": 2.889452904681331e-05, + "loss": 0.7418, + "step": 7485 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 1.4261890649795532, + "learning_rate": 2.88917089678511e-05, + "loss": 0.9156, + "step": 7486 + }, + { + "epoch": 4.222786238014664, + "grad_norm": 0.9665696024894714, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.7164, + "step": 7487 + }, + { + "epoch": 4.223350253807107, + "grad_norm": 1.5568124055862427, + "learning_rate": 2.888606880992668e-05, + "loss": 0.8282, + "step": 7488 + }, + { + "epoch": 4.223914269599549, + "grad_norm": 0.8876916766166687, + "learning_rate": 2.888324873096447e-05, + "loss": 0.6891, + "step": 7489 + }, + { + "epoch": 4.224478285391991, + "grad_norm": 1.5381646156311035, + "learning_rate": 2.8880428652002255e-05, + "loss": 0.7952, + "step": 7490 + }, + { + "epoch": 4.225042301184433, + "grad_norm": 0.8975207805633545, + "learning_rate": 2.887760857304005e-05, + "loss": 0.6164, + "step": 7491 + }, + { + "epoch": 4.225606316976875, + "grad_norm": 1.853811502456665, + "learning_rate": 2.8874788494077836e-05, + "loss": 0.8209, + "step": 7492 + }, + { + "epoch": 4.226170332769318, + "grad_norm": 1.3800510168075562, + "learning_rate": 2.8871968415115624e-05, + "loss": 0.8407, + "step": 7493 + }, + { + "epoch": 4.2267343485617594, + "grad_norm": 0.9254677891731262, + "learning_rate": 2.8869148336153413e-05, + "loss": 0.7061, + "step": 7494 + }, + { + "epoch": 4.227298364354202, + "grad_norm": 1.2868678569793701, + "learning_rate": 2.8866328257191205e-05, + "loss": 0.7384, + "step": 7495 + }, + { + "epoch": 4.227862380146644, + "grad_norm": 1.4131410121917725, + "learning_rate": 2.8863508178228994e-05, + "loss": 0.8335, + "step": 7496 + }, + { + "epoch": 4.228426395939087, + "grad_norm": 0.9219481945037842, + "learning_rate": 2.886068809926678e-05, + "loss": 0.6492, + "step": 7497 + }, + { + "epoch": 4.2289904117315285, + "grad_norm": 1.4442131519317627, + "learning_rate": 2.885786802030457e-05, + "loss": 0.8136, + "step": 7498 + }, + { + "epoch": 4.22955442752397, + "grad_norm": 1.7943435907363892, + "learning_rate": 2.885504794134236e-05, + "loss": 0.8214, + "step": 7499 + }, + { + "epoch": 4.230118443316413, + "grad_norm": 1.2944220304489136, + "learning_rate": 2.885222786238015e-05, + "loss": 0.7505, + "step": 7500 + }, + { + "epoch": 4.230682459108855, + "grad_norm": 1.245905876159668, + "learning_rate": 2.8849407783417938e-05, + "loss": 0.6477, + "step": 7501 + }, + { + "epoch": 4.2312464749012975, + "grad_norm": 1.1213454008102417, + "learning_rate": 2.8846587704455724e-05, + "loss": 0.7589, + "step": 7502 + }, + { + "epoch": 4.231810490693739, + "grad_norm": 1.086135745048523, + "learning_rate": 2.8843767625493516e-05, + "loss": 0.735, + "step": 7503 + }, + { + "epoch": 4.232374506486182, + "grad_norm": 1.0334019660949707, + "learning_rate": 2.8840947546531305e-05, + "loss": 0.683, + "step": 7504 + }, + { + "epoch": 4.232938522278624, + "grad_norm": 1.3889083862304688, + "learning_rate": 2.8838127467569093e-05, + "loss": 0.75, + "step": 7505 + }, + { + "epoch": 4.233502538071066, + "grad_norm": 1.1153361797332764, + "learning_rate": 2.883530738860688e-05, + "loss": 0.6481, + "step": 7506 + }, + { + "epoch": 4.234066553863508, + "grad_norm": 1.0079246759414673, + "learning_rate": 2.8832487309644674e-05, + "loss": 0.7213, + "step": 7507 + }, + { + "epoch": 4.23463056965595, + "grad_norm": 1.7045546770095825, + "learning_rate": 2.882966723068246e-05, + "loss": 0.8075, + "step": 7508 + }, + { + "epoch": 4.235194585448393, + "grad_norm": 1.0503665208816528, + "learning_rate": 2.882684715172025e-05, + "loss": 0.7183, + "step": 7509 + }, + { + "epoch": 4.235758601240835, + "grad_norm": 1.1561461687088013, + "learning_rate": 2.8824027072758037e-05, + "loss": 0.713, + "step": 7510 + }, + { + "epoch": 4.236322617033277, + "grad_norm": 1.1602522134780884, + "learning_rate": 2.882120699379583e-05, + "loss": 0.8276, + "step": 7511 + }, + { + "epoch": 4.236886632825719, + "grad_norm": 1.6722087860107422, + "learning_rate": 2.881838691483362e-05, + "loss": 0.7219, + "step": 7512 + }, + { + "epoch": 4.237450648618161, + "grad_norm": 1.4450684785842896, + "learning_rate": 2.8815566835871404e-05, + "loss": 0.6853, + "step": 7513 + }, + { + "epoch": 4.238014664410604, + "grad_norm": 1.2192161083221436, + "learning_rate": 2.8812746756909193e-05, + "loss": 0.6604, + "step": 7514 + }, + { + "epoch": 4.238578680203045, + "grad_norm": 1.2438310384750366, + "learning_rate": 2.8809926677946985e-05, + "loss": 0.7588, + "step": 7515 + }, + { + "epoch": 4.239142695995488, + "grad_norm": 0.8189388513565063, + "learning_rate": 2.8807106598984774e-05, + "loss": 0.6497, + "step": 7516 + }, + { + "epoch": 4.23970671178793, + "grad_norm": 1.2928599119186401, + "learning_rate": 2.8804286520022562e-05, + "loss": 0.7424, + "step": 7517 + }, + { + "epoch": 4.240270727580373, + "grad_norm": 0.9131361842155457, + "learning_rate": 2.8801466441060348e-05, + "loss": 0.7026, + "step": 7518 + }, + { + "epoch": 4.240834743372814, + "grad_norm": 1.5980441570281982, + "learning_rate": 2.8798646362098143e-05, + "loss": 0.7425, + "step": 7519 + }, + { + "epoch": 4.241398759165256, + "grad_norm": 1.6410648822784424, + "learning_rate": 2.879582628313593e-05, + "loss": 0.6623, + "step": 7520 + }, + { + "epoch": 4.241962774957699, + "grad_norm": 1.181356430053711, + "learning_rate": 2.8793006204173717e-05, + "loss": 0.6818, + "step": 7521 + }, + { + "epoch": 4.242526790750141, + "grad_norm": 1.0191928148269653, + "learning_rate": 2.8790186125211506e-05, + "loss": 0.7638, + "step": 7522 + }, + { + "epoch": 4.243090806542583, + "grad_norm": 1.7895221710205078, + "learning_rate": 2.87873660462493e-05, + "loss": 0.8046, + "step": 7523 + }, + { + "epoch": 4.243654822335025, + "grad_norm": 0.924102246761322, + "learning_rate": 2.8784545967287084e-05, + "loss": 0.6376, + "step": 7524 + }, + { + "epoch": 4.244218838127468, + "grad_norm": 1.0379314422607422, + "learning_rate": 2.8781725888324873e-05, + "loss": 0.7946, + "step": 7525 + }, + { + "epoch": 4.24478285391991, + "grad_norm": 1.4277286529541016, + "learning_rate": 2.8778905809362665e-05, + "loss": 0.7513, + "step": 7526 + }, + { + "epoch": 4.2453468697123515, + "grad_norm": 1.0332897901535034, + "learning_rate": 2.8776085730400454e-05, + "loss": 0.6988, + "step": 7527 + }, + { + "epoch": 4.245910885504794, + "grad_norm": 1.2996742725372314, + "learning_rate": 2.8773265651438242e-05, + "loss": 0.7441, + "step": 7528 + }, + { + "epoch": 4.246474901297236, + "grad_norm": 1.0791727304458618, + "learning_rate": 2.8770445572476028e-05, + "loss": 0.7835, + "step": 7529 + }, + { + "epoch": 4.247038917089679, + "grad_norm": 1.4706426858901978, + "learning_rate": 2.8767625493513823e-05, + "loss": 0.7515, + "step": 7530 + }, + { + "epoch": 4.2476029328821205, + "grad_norm": 1.0757787227630615, + "learning_rate": 2.876480541455161e-05, + "loss": 0.7617, + "step": 7531 + }, + { + "epoch": 4.248166948674563, + "grad_norm": 1.7745949029922485, + "learning_rate": 2.8761985335589398e-05, + "loss": 0.7151, + "step": 7532 + }, + { + "epoch": 4.248730964467005, + "grad_norm": 1.5479316711425781, + "learning_rate": 2.8759165256627186e-05, + "loss": 0.6854, + "step": 7533 + }, + { + "epoch": 4.249294980259447, + "grad_norm": 1.1608080863952637, + "learning_rate": 2.875634517766498e-05, + "loss": 0.712, + "step": 7534 + }, + { + "epoch": 4.2498589960518895, + "grad_norm": 1.6047624349594116, + "learning_rate": 2.8753525098702767e-05, + "loss": 0.8013, + "step": 7535 + }, + { + "epoch": 4.250423011844331, + "grad_norm": 1.655909776687622, + "learning_rate": 2.8750705019740553e-05, + "loss": 0.8284, + "step": 7536 + }, + { + "epoch": 4.250987027636774, + "grad_norm": 1.2181638479232788, + "learning_rate": 2.874788494077834e-05, + "loss": 0.7761, + "step": 7537 + }, + { + "epoch": 4.251551043429216, + "grad_norm": 1.3980913162231445, + "learning_rate": 2.8745064861816134e-05, + "loss": 0.7425, + "step": 7538 + }, + { + "epoch": 4.2521150592216586, + "grad_norm": 1.2727243900299072, + "learning_rate": 2.8742244782853923e-05, + "loss": 0.7438, + "step": 7539 + }, + { + "epoch": 4.2526790750141, + "grad_norm": 1.0473496913909912, + "learning_rate": 2.873942470389171e-05, + "loss": 0.6293, + "step": 7540 + }, + { + "epoch": 4.253243090806542, + "grad_norm": 1.7819674015045166, + "learning_rate": 2.8736604624929497e-05, + "loss": 0.9459, + "step": 7541 + }, + { + "epoch": 4.253807106598985, + "grad_norm": 1.5206936597824097, + "learning_rate": 2.873378454596729e-05, + "loss": 0.6896, + "step": 7542 + }, + { + "epoch": 4.254371122391427, + "grad_norm": 1.4548892974853516, + "learning_rate": 2.8730964467005078e-05, + "loss": 0.6866, + "step": 7543 + }, + { + "epoch": 4.254935138183869, + "grad_norm": 1.7381502389907837, + "learning_rate": 2.8728144388042867e-05, + "loss": 0.7927, + "step": 7544 + }, + { + "epoch": 4.255499153976311, + "grad_norm": 0.9262317419052124, + "learning_rate": 2.8725324309080652e-05, + "loss": 0.6938, + "step": 7545 + }, + { + "epoch": 4.256063169768754, + "grad_norm": 1.3107541799545288, + "learning_rate": 2.8722504230118448e-05, + "loss": 0.8791, + "step": 7546 + }, + { + "epoch": 4.256627185561196, + "grad_norm": 0.9323747754096985, + "learning_rate": 2.8719684151156233e-05, + "loss": 0.7531, + "step": 7547 + }, + { + "epoch": 4.2571912013536375, + "grad_norm": 1.1483802795410156, + "learning_rate": 2.8716864072194022e-05, + "loss": 0.8381, + "step": 7548 + }, + { + "epoch": 4.25775521714608, + "grad_norm": 0.8562392592430115, + "learning_rate": 2.871404399323181e-05, + "loss": 0.6133, + "step": 7549 + }, + { + "epoch": 4.258319232938522, + "grad_norm": 1.4374216794967651, + "learning_rate": 2.8711223914269603e-05, + "loss": 0.7643, + "step": 7550 + }, + { + "epoch": 4.258883248730965, + "grad_norm": 1.2385369539260864, + "learning_rate": 2.870840383530739e-05, + "loss": 0.7422, + "step": 7551 + }, + { + "epoch": 4.2594472645234065, + "grad_norm": 2.0248303413391113, + "learning_rate": 2.8705583756345177e-05, + "loss": 0.661, + "step": 7552 + }, + { + "epoch": 4.260011280315849, + "grad_norm": 1.1934309005737305, + "learning_rate": 2.8702763677382966e-05, + "loss": 0.7433, + "step": 7553 + }, + { + "epoch": 4.260575296108291, + "grad_norm": 1.0299971103668213, + "learning_rate": 2.8699943598420758e-05, + "loss": 0.7492, + "step": 7554 + }, + { + "epoch": 4.261139311900733, + "grad_norm": 1.1454414129257202, + "learning_rate": 2.8697123519458547e-05, + "loss": 0.7617, + "step": 7555 + }, + { + "epoch": 4.2617033276931755, + "grad_norm": 1.1759881973266602, + "learning_rate": 2.8694303440496336e-05, + "loss": 0.7089, + "step": 7556 + }, + { + "epoch": 4.262267343485617, + "grad_norm": 1.7586027383804321, + "learning_rate": 2.869148336153412e-05, + "loss": 0.7797, + "step": 7557 + }, + { + "epoch": 4.26283135927806, + "grad_norm": 1.108970284461975, + "learning_rate": 2.8688663282571917e-05, + "loss": 0.674, + "step": 7558 + }, + { + "epoch": 4.263395375070502, + "grad_norm": 1.7605571746826172, + "learning_rate": 2.8685843203609702e-05, + "loss": 0.8355, + "step": 7559 + }, + { + "epoch": 4.2639593908629445, + "grad_norm": 1.8711320161819458, + "learning_rate": 2.868302312464749e-05, + "loss": 0.8684, + "step": 7560 + }, + { + "epoch": 4.264523406655386, + "grad_norm": 0.9456261992454529, + "learning_rate": 2.8680203045685283e-05, + "loss": 0.7663, + "step": 7561 + }, + { + "epoch": 4.265087422447828, + "grad_norm": 1.1637065410614014, + "learning_rate": 2.8677382966723072e-05, + "loss": 0.7826, + "step": 7562 + }, + { + "epoch": 4.265651438240271, + "grad_norm": 1.2413731813430786, + "learning_rate": 2.8674562887760857e-05, + "loss": 0.7263, + "step": 7563 + }, + { + "epoch": 4.266215454032713, + "grad_norm": 1.5690093040466309, + "learning_rate": 2.8671742808798646e-05, + "loss": 0.831, + "step": 7564 + }, + { + "epoch": 4.266779469825155, + "grad_norm": 1.2838894128799438, + "learning_rate": 2.8668922729836438e-05, + "loss": 0.7093, + "step": 7565 + }, + { + "epoch": 4.267343485617597, + "grad_norm": 1.2740285396575928, + "learning_rate": 2.8666102650874227e-05, + "loss": 0.7611, + "step": 7566 + }, + { + "epoch": 4.26790750141004, + "grad_norm": 1.0722485780715942, + "learning_rate": 2.8663282571912016e-05, + "loss": 0.7225, + "step": 7567 + }, + { + "epoch": 4.268471517202482, + "grad_norm": 1.6605055332183838, + "learning_rate": 2.86604624929498e-05, + "loss": 0.7758, + "step": 7568 + }, + { + "epoch": 4.269035532994923, + "grad_norm": 1.258410930633545, + "learning_rate": 2.8657642413987597e-05, + "loss": 0.6516, + "step": 7569 + }, + { + "epoch": 4.269599548787366, + "grad_norm": 1.2729847431182861, + "learning_rate": 2.8654822335025382e-05, + "loss": 0.7498, + "step": 7570 + }, + { + "epoch": 4.270163564579808, + "grad_norm": 1.0219472646713257, + "learning_rate": 2.865200225606317e-05, + "loss": 0.8415, + "step": 7571 + }, + { + "epoch": 4.270727580372251, + "grad_norm": 1.136000156402588, + "learning_rate": 2.864918217710096e-05, + "loss": 0.6933, + "step": 7572 + }, + { + "epoch": 4.2712915961646925, + "grad_norm": 1.2666465044021606, + "learning_rate": 2.8646362098138752e-05, + "loss": 0.7186, + "step": 7573 + }, + { + "epoch": 4.271855611957135, + "grad_norm": 1.309251070022583, + "learning_rate": 2.864354201917654e-05, + "loss": 0.8228, + "step": 7574 + }, + { + "epoch": 4.272419627749577, + "grad_norm": 1.5783779621124268, + "learning_rate": 2.8640721940214326e-05, + "loss": 0.8045, + "step": 7575 + }, + { + "epoch": 4.272983643542019, + "grad_norm": 1.6756813526153564, + "learning_rate": 2.8637901861252115e-05, + "loss": 0.7866, + "step": 7576 + }, + { + "epoch": 4.2735476593344615, + "grad_norm": 1.6256859302520752, + "learning_rate": 2.8635081782289907e-05, + "loss": 0.7639, + "step": 7577 + }, + { + "epoch": 4.274111675126903, + "grad_norm": 1.1028156280517578, + "learning_rate": 2.8632261703327696e-05, + "loss": 0.7404, + "step": 7578 + }, + { + "epoch": 4.274675690919346, + "grad_norm": 1.172734260559082, + "learning_rate": 2.862944162436548e-05, + "loss": 0.689, + "step": 7579 + }, + { + "epoch": 4.275239706711788, + "grad_norm": 1.6162075996398926, + "learning_rate": 2.862662154540327e-05, + "loss": 0.8962, + "step": 7580 + }, + { + "epoch": 4.2758037225042305, + "grad_norm": 1.3372570276260376, + "learning_rate": 2.8623801466441062e-05, + "loss": 0.8035, + "step": 7581 + }, + { + "epoch": 4.276367738296672, + "grad_norm": 1.1537551879882812, + "learning_rate": 2.862098138747885e-05, + "loss": 0.73, + "step": 7582 + }, + { + "epoch": 4.276931754089114, + "grad_norm": 0.9278579950332642, + "learning_rate": 2.861816130851664e-05, + "loss": 0.6332, + "step": 7583 + }, + { + "epoch": 4.277495769881557, + "grad_norm": 1.1273692846298218, + "learning_rate": 2.8615341229554425e-05, + "loss": 0.7071, + "step": 7584 + }, + { + "epoch": 4.278059785673999, + "grad_norm": 1.155627965927124, + "learning_rate": 2.861252115059222e-05, + "loss": 0.8262, + "step": 7585 + }, + { + "epoch": 4.278623801466441, + "grad_norm": 1.134741187095642, + "learning_rate": 2.8609701071630006e-05, + "loss": 0.7309, + "step": 7586 + }, + { + "epoch": 4.279187817258883, + "grad_norm": 3.216761350631714, + "learning_rate": 2.8606880992667795e-05, + "loss": 0.7372, + "step": 7587 + }, + { + "epoch": 4.279751833051326, + "grad_norm": 0.8852941989898682, + "learning_rate": 2.8604060913705584e-05, + "loss": 0.6693, + "step": 7588 + }, + { + "epoch": 4.280315848843768, + "grad_norm": 1.5200071334838867, + "learning_rate": 2.8601240834743376e-05, + "loss": 0.783, + "step": 7589 + }, + { + "epoch": 4.280879864636209, + "grad_norm": 0.9664500951766968, + "learning_rate": 2.8598420755781165e-05, + "loss": 0.672, + "step": 7590 + }, + { + "epoch": 4.281443880428652, + "grad_norm": 1.3549087047576904, + "learning_rate": 2.859560067681895e-05, + "loss": 0.7458, + "step": 7591 + }, + { + "epoch": 4.282007896221094, + "grad_norm": 1.3628872632980347, + "learning_rate": 2.859278059785674e-05, + "loss": 0.7301, + "step": 7592 + }, + { + "epoch": 4.282571912013537, + "grad_norm": 1.3923221826553345, + "learning_rate": 2.858996051889453e-05, + "loss": 0.7816, + "step": 7593 + }, + { + "epoch": 4.283135927805978, + "grad_norm": 0.9882336258888245, + "learning_rate": 2.858714043993232e-05, + "loss": 0.7483, + "step": 7594 + }, + { + "epoch": 4.283699943598421, + "grad_norm": 1.2188427448272705, + "learning_rate": 2.858432036097011e-05, + "loss": 0.7089, + "step": 7595 + }, + { + "epoch": 4.284263959390863, + "grad_norm": 1.0906882286071777, + "learning_rate": 2.85815002820079e-05, + "loss": 0.7515, + "step": 7596 + }, + { + "epoch": 4.284827975183305, + "grad_norm": 2.733816623687744, + "learning_rate": 2.8578680203045686e-05, + "loss": 0.741, + "step": 7597 + }, + { + "epoch": 4.285391990975747, + "grad_norm": 1.125624656677246, + "learning_rate": 2.8575860124083475e-05, + "loss": 0.6925, + "step": 7598 + }, + { + "epoch": 4.285956006768189, + "grad_norm": 0.9930005073547363, + "learning_rate": 2.8573040045121264e-05, + "loss": 0.8202, + "step": 7599 + }, + { + "epoch": 4.286520022560632, + "grad_norm": 1.209054946899414, + "learning_rate": 2.8570219966159056e-05, + "loss": 0.7687, + "step": 7600 + }, + { + "epoch": 4.287084038353074, + "grad_norm": 0.9055995345115662, + "learning_rate": 2.8567399887196845e-05, + "loss": 0.7434, + "step": 7601 + }, + { + "epoch": 4.287648054145516, + "grad_norm": 1.2315168380737305, + "learning_rate": 2.856457980823463e-05, + "loss": 0.692, + "step": 7602 + }, + { + "epoch": 4.288212069937958, + "grad_norm": 1.0515260696411133, + "learning_rate": 2.856175972927242e-05, + "loss": 0.713, + "step": 7603 + }, + { + "epoch": 4.288776085730401, + "grad_norm": 1.2886873483657837, + "learning_rate": 2.855893965031021e-05, + "loss": 0.7102, + "step": 7604 + }, + { + "epoch": 4.289340101522843, + "grad_norm": 1.7750325202941895, + "learning_rate": 2.8556119571348e-05, + "loss": 0.7556, + "step": 7605 + }, + { + "epoch": 4.2899041173152845, + "grad_norm": 3.127697706222534, + "learning_rate": 2.855329949238579e-05, + "loss": 0.7957, + "step": 7606 + }, + { + "epoch": 4.290468133107727, + "grad_norm": 1.1866313219070435, + "learning_rate": 2.8550479413423574e-05, + "loss": 0.6982, + "step": 7607 + }, + { + "epoch": 4.291032148900169, + "grad_norm": 1.4430077075958252, + "learning_rate": 2.854765933446137e-05, + "loss": 0.7329, + "step": 7608 + }, + { + "epoch": 4.291596164692612, + "grad_norm": 0.9088682532310486, + "learning_rate": 2.8544839255499155e-05, + "loss": 0.5776, + "step": 7609 + }, + { + "epoch": 4.2921601804850535, + "grad_norm": 1.0976403951644897, + "learning_rate": 2.8542019176536944e-05, + "loss": 0.7456, + "step": 7610 + }, + { + "epoch": 4.292724196277495, + "grad_norm": 1.2700021266937256, + "learning_rate": 2.8539199097574733e-05, + "loss": 0.7663, + "step": 7611 + }, + { + "epoch": 4.293288212069938, + "grad_norm": 1.8128645420074463, + "learning_rate": 2.8536379018612525e-05, + "loss": 0.777, + "step": 7612 + }, + { + "epoch": 4.29385222786238, + "grad_norm": 2.229588747024536, + "learning_rate": 2.8533558939650314e-05, + "loss": 0.7466, + "step": 7613 + }, + { + "epoch": 4.2944162436548226, + "grad_norm": 1.5326924324035645, + "learning_rate": 2.85307388606881e-05, + "loss": 0.8223, + "step": 7614 + }, + { + "epoch": 4.294980259447264, + "grad_norm": 1.549063801765442, + "learning_rate": 2.8527918781725888e-05, + "loss": 0.8245, + "step": 7615 + }, + { + "epoch": 4.295544275239707, + "grad_norm": 1.3078484535217285, + "learning_rate": 2.852509870276368e-05, + "loss": 0.7147, + "step": 7616 + }, + { + "epoch": 4.296108291032149, + "grad_norm": 1.0290215015411377, + "learning_rate": 2.852227862380147e-05, + "loss": 0.7009, + "step": 7617 + }, + { + "epoch": 4.296672306824592, + "grad_norm": 0.9771449565887451, + "learning_rate": 2.8519458544839254e-05, + "loss": 0.7868, + "step": 7618 + }, + { + "epoch": 4.297236322617033, + "grad_norm": 1.0251127481460571, + "learning_rate": 2.8516638465877043e-05, + "loss": 0.6948, + "step": 7619 + }, + { + "epoch": 4.297800338409475, + "grad_norm": 1.049201250076294, + "learning_rate": 2.8513818386914835e-05, + "loss": 0.7186, + "step": 7620 + }, + { + "epoch": 4.298364354201918, + "grad_norm": 1.3586363792419434, + "learning_rate": 2.8510998307952624e-05, + "loss": 0.7527, + "step": 7621 + }, + { + "epoch": 4.29892836999436, + "grad_norm": 1.1222045421600342, + "learning_rate": 2.8508178228990413e-05, + "loss": 0.7624, + "step": 7622 + }, + { + "epoch": 4.299492385786802, + "grad_norm": 1.0040611028671265, + "learning_rate": 2.85053581500282e-05, + "loss": 0.6588, + "step": 7623 + }, + { + "epoch": 4.300056401579244, + "grad_norm": 1.1066683530807495, + "learning_rate": 2.8502538071065994e-05, + "loss": 0.681, + "step": 7624 + }, + { + "epoch": 4.300620417371686, + "grad_norm": 1.5871635675430298, + "learning_rate": 2.849971799210378e-05, + "loss": 0.7603, + "step": 7625 + }, + { + "epoch": 4.301184433164129, + "grad_norm": 0.9824357032775879, + "learning_rate": 2.8496897913141568e-05, + "loss": 0.6779, + "step": 7626 + }, + { + "epoch": 4.3017484489565705, + "grad_norm": 1.0576450824737549, + "learning_rate": 2.8494077834179357e-05, + "loss": 0.7583, + "step": 7627 + }, + { + "epoch": 4.302312464749013, + "grad_norm": 0.9285808205604553, + "learning_rate": 2.849125775521715e-05, + "loss": 0.7539, + "step": 7628 + }, + { + "epoch": 4.302876480541455, + "grad_norm": 3.5437092781066895, + "learning_rate": 2.8488437676254938e-05, + "loss": 0.9102, + "step": 7629 + }, + { + "epoch": 4.303440496333898, + "grad_norm": 1.6546483039855957, + "learning_rate": 2.8485617597292723e-05, + "loss": 0.827, + "step": 7630 + }, + { + "epoch": 4.3040045121263395, + "grad_norm": 1.1743422746658325, + "learning_rate": 2.848279751833052e-05, + "loss": 0.7671, + "step": 7631 + }, + { + "epoch": 4.304568527918782, + "grad_norm": 1.335115671157837, + "learning_rate": 2.8479977439368304e-05, + "loss": 0.7675, + "step": 7632 + }, + { + "epoch": 4.305132543711224, + "grad_norm": 0.887910783290863, + "learning_rate": 2.8477157360406093e-05, + "loss": 0.7291, + "step": 7633 + }, + { + "epoch": 4.305696559503666, + "grad_norm": 1.7291845083236694, + "learning_rate": 2.847433728144388e-05, + "loss": 0.8449, + "step": 7634 + }, + { + "epoch": 4.3062605752961085, + "grad_norm": 2.676635980606079, + "learning_rate": 2.8471517202481674e-05, + "loss": 0.6774, + "step": 7635 + }, + { + "epoch": 4.30682459108855, + "grad_norm": 0.8830721378326416, + "learning_rate": 2.846869712351946e-05, + "loss": 0.6479, + "step": 7636 + }, + { + "epoch": 4.307388606880993, + "grad_norm": 1.3478097915649414, + "learning_rate": 2.846587704455725e-05, + "loss": 0.7569, + "step": 7637 + }, + { + "epoch": 4.307952622673435, + "grad_norm": 1.6141982078552246, + "learning_rate": 2.8463056965595037e-05, + "loss": 0.7068, + "step": 7638 + }, + { + "epoch": 4.308516638465877, + "grad_norm": 1.2364509105682373, + "learning_rate": 2.846023688663283e-05, + "loss": 0.7528, + "step": 7639 + }, + { + "epoch": 4.309080654258319, + "grad_norm": 1.1837552785873413, + "learning_rate": 2.8457416807670618e-05, + "loss": 0.856, + "step": 7640 + }, + { + "epoch": 4.309644670050761, + "grad_norm": 1.1428947448730469, + "learning_rate": 2.8454596728708404e-05, + "loss": 0.7567, + "step": 7641 + }, + { + "epoch": 4.310208685843204, + "grad_norm": 1.4396138191223145, + "learning_rate": 2.8451776649746192e-05, + "loss": 0.6661, + "step": 7642 + }, + { + "epoch": 4.310772701635646, + "grad_norm": 1.4917232990264893, + "learning_rate": 2.8448956570783985e-05, + "loss": 0.7627, + "step": 7643 + }, + { + "epoch": 4.311336717428088, + "grad_norm": 0.9654418230056763, + "learning_rate": 2.8446136491821773e-05, + "loss": 0.7035, + "step": 7644 + }, + { + "epoch": 4.31190073322053, + "grad_norm": 1.2315261363983154, + "learning_rate": 2.8443316412859562e-05, + "loss": 0.6657, + "step": 7645 + }, + { + "epoch": 4.312464749012973, + "grad_norm": 0.9906793832778931, + "learning_rate": 2.8440496333897348e-05, + "loss": 0.7448, + "step": 7646 + }, + { + "epoch": 4.313028764805415, + "grad_norm": 1.5686004161834717, + "learning_rate": 2.8437676254935143e-05, + "loss": 0.835, + "step": 7647 + }, + { + "epoch": 4.3135927805978564, + "grad_norm": 1.0773870944976807, + "learning_rate": 2.843485617597293e-05, + "loss": 0.7201, + "step": 7648 + }, + { + "epoch": 4.314156796390299, + "grad_norm": 1.5544445514678955, + "learning_rate": 2.8432036097010717e-05, + "loss": 0.7859, + "step": 7649 + }, + { + "epoch": 4.314720812182741, + "grad_norm": 2.2386488914489746, + "learning_rate": 2.8429216018048506e-05, + "loss": 0.9022, + "step": 7650 + }, + { + "epoch": 4.315284827975184, + "grad_norm": 0.9811075925827026, + "learning_rate": 2.84263959390863e-05, + "loss": 0.762, + "step": 7651 + }, + { + "epoch": 4.3158488437676255, + "grad_norm": 1.0487874746322632, + "learning_rate": 2.8423575860124084e-05, + "loss": 0.7024, + "step": 7652 + }, + { + "epoch": 4.316412859560067, + "grad_norm": 1.0811535120010376, + "learning_rate": 2.8420755781161873e-05, + "loss": 0.7415, + "step": 7653 + }, + { + "epoch": 4.31697687535251, + "grad_norm": 1.1293922662734985, + "learning_rate": 2.841793570219966e-05, + "loss": 0.7511, + "step": 7654 + }, + { + "epoch": 4.317540891144952, + "grad_norm": 1.3936147689819336, + "learning_rate": 2.8415115623237453e-05, + "loss": 0.7595, + "step": 7655 + }, + { + "epoch": 4.3181049069373945, + "grad_norm": 1.579390525817871, + "learning_rate": 2.8412295544275242e-05, + "loss": 0.9356, + "step": 7656 + }, + { + "epoch": 4.318668922729836, + "grad_norm": 1.2316919565200806, + "learning_rate": 2.8409475465313028e-05, + "loss": 0.6792, + "step": 7657 + }, + { + "epoch": 4.319232938522279, + "grad_norm": 1.1563637256622314, + "learning_rate": 2.8406655386350816e-05, + "loss": 0.7124, + "step": 7658 + }, + { + "epoch": 4.319796954314721, + "grad_norm": 1.1332780122756958, + "learning_rate": 2.840383530738861e-05, + "loss": 0.6939, + "step": 7659 + }, + { + "epoch": 4.3203609701071635, + "grad_norm": 1.4405357837677002, + "learning_rate": 2.8401015228426397e-05, + "loss": 0.7903, + "step": 7660 + }, + { + "epoch": 4.320924985899605, + "grad_norm": 1.1842281818389893, + "learning_rate": 2.8398195149464186e-05, + "loss": 0.7473, + "step": 7661 + }, + { + "epoch": 4.321489001692047, + "grad_norm": 1.1675339937210083, + "learning_rate": 2.839537507050197e-05, + "loss": 0.6333, + "step": 7662 + }, + { + "epoch": 4.32205301748449, + "grad_norm": 1.504186987876892, + "learning_rate": 2.8392554991539767e-05, + "loss": 0.8819, + "step": 7663 + }, + { + "epoch": 4.322617033276932, + "grad_norm": 1.5181589126586914, + "learning_rate": 2.8389734912577553e-05, + "loss": 0.8979, + "step": 7664 + }, + { + "epoch": 4.323181049069374, + "grad_norm": 1.4764212369918823, + "learning_rate": 2.838691483361534e-05, + "loss": 0.762, + "step": 7665 + }, + { + "epoch": 4.323745064861816, + "grad_norm": 0.9726526737213135, + "learning_rate": 2.8384094754653134e-05, + "loss": 0.6529, + "step": 7666 + }, + { + "epoch": 4.324309080654258, + "grad_norm": 1.8127307891845703, + "learning_rate": 2.8381274675690922e-05, + "loss": 0.7852, + "step": 7667 + }, + { + "epoch": 4.324873096446701, + "grad_norm": 1.072893738746643, + "learning_rate": 2.837845459672871e-05, + "loss": 0.7041, + "step": 7668 + }, + { + "epoch": 4.325437112239142, + "grad_norm": 1.2689608335494995, + "learning_rate": 2.8375634517766497e-05, + "loss": 0.6997, + "step": 7669 + }, + { + "epoch": 4.326001128031585, + "grad_norm": 1.01645028591156, + "learning_rate": 2.837281443880429e-05, + "loss": 0.7182, + "step": 7670 + }, + { + "epoch": 4.326565143824027, + "grad_norm": 1.1121047735214233, + "learning_rate": 2.8369994359842078e-05, + "loss": 0.784, + "step": 7671 + }, + { + "epoch": 4.32712915961647, + "grad_norm": 1.2648202180862427, + "learning_rate": 2.8367174280879866e-05, + "loss": 0.7739, + "step": 7672 + }, + { + "epoch": 4.327693175408911, + "grad_norm": 1.3099817037582397, + "learning_rate": 2.8364354201917652e-05, + "loss": 0.8343, + "step": 7673 + }, + { + "epoch": 4.328257191201354, + "grad_norm": 1.93722403049469, + "learning_rate": 2.8361534122955447e-05, + "loss": 0.7803, + "step": 7674 + }, + { + "epoch": 4.328821206993796, + "grad_norm": 1.1605881452560425, + "learning_rate": 2.8358714043993233e-05, + "loss": 0.7572, + "step": 7675 + }, + { + "epoch": 4.329385222786238, + "grad_norm": 1.6315380334854126, + "learning_rate": 2.835589396503102e-05, + "loss": 0.8656, + "step": 7676 + }, + { + "epoch": 4.32994923857868, + "grad_norm": 0.9703378081321716, + "learning_rate": 2.835307388606881e-05, + "loss": 0.7678, + "step": 7677 + }, + { + "epoch": 4.330513254371122, + "grad_norm": 1.2863290309906006, + "learning_rate": 2.8350253807106603e-05, + "loss": 0.7573, + "step": 7678 + }, + { + "epoch": 4.331077270163565, + "grad_norm": 1.6006826162338257, + "learning_rate": 2.834743372814439e-05, + "loss": 0.7406, + "step": 7679 + }, + { + "epoch": 4.331641285956007, + "grad_norm": 1.310964822769165, + "learning_rate": 2.8344613649182177e-05, + "loss": 0.7901, + "step": 7680 + }, + { + "epoch": 4.3322053017484485, + "grad_norm": 0.9848299026489258, + "learning_rate": 2.8341793570219966e-05, + "loss": 0.7408, + "step": 7681 + }, + { + "epoch": 4.332769317540891, + "grad_norm": 1.2533682584762573, + "learning_rate": 2.8338973491257758e-05, + "loss": 0.7956, + "step": 7682 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.087390661239624, + "learning_rate": 2.8336153412295547e-05, + "loss": 0.7363, + "step": 7683 + }, + { + "epoch": 4.333897349125776, + "grad_norm": 1.2984281778335571, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.7812, + "step": 7684 + }, + { + "epoch": 4.3344613649182175, + "grad_norm": 0.9992433190345764, + "learning_rate": 2.833051325437112e-05, + "loss": 0.7076, + "step": 7685 + }, + { + "epoch": 4.33502538071066, + "grad_norm": 0.9762792587280273, + "learning_rate": 2.8327693175408916e-05, + "loss": 0.7105, + "step": 7686 + }, + { + "epoch": 4.335589396503102, + "grad_norm": 1.1735339164733887, + "learning_rate": 2.8324873096446702e-05, + "loss": 0.7206, + "step": 7687 + }, + { + "epoch": 4.336153412295545, + "grad_norm": 0.9006327390670776, + "learning_rate": 2.832205301748449e-05, + "loss": 0.7283, + "step": 7688 + }, + { + "epoch": 4.3367174280879865, + "grad_norm": 1.8795621395111084, + "learning_rate": 2.8319232938522276e-05, + "loss": 0.7737, + "step": 7689 + }, + { + "epoch": 4.337281443880428, + "grad_norm": 3.9246511459350586, + "learning_rate": 2.831641285956007e-05, + "loss": 0.8135, + "step": 7690 + }, + { + "epoch": 4.337845459672871, + "grad_norm": 0.9983042478561401, + "learning_rate": 2.8313592780597857e-05, + "loss": 0.7474, + "step": 7691 + }, + { + "epoch": 4.338409475465313, + "grad_norm": 1.7765430212020874, + "learning_rate": 2.8310772701635646e-05, + "loss": 0.897, + "step": 7692 + }, + { + "epoch": 4.3389734912577556, + "grad_norm": 1.1098212003707886, + "learning_rate": 2.8307952622673435e-05, + "loss": 0.7409, + "step": 7693 + }, + { + "epoch": 4.339537507050197, + "grad_norm": 1.8278402090072632, + "learning_rate": 2.8305132543711227e-05, + "loss": 0.8593, + "step": 7694 + }, + { + "epoch": 4.340101522842639, + "grad_norm": 1.0182485580444336, + "learning_rate": 2.8302312464749016e-05, + "loss": 0.6936, + "step": 7695 + }, + { + "epoch": 4.340665538635082, + "grad_norm": 1.53148353099823, + "learning_rate": 2.82994923857868e-05, + "loss": 0.7455, + "step": 7696 + }, + { + "epoch": 4.341229554427524, + "grad_norm": 0.7380218505859375, + "learning_rate": 2.829667230682459e-05, + "loss": 0.5727, + "step": 7697 + }, + { + "epoch": 4.341793570219966, + "grad_norm": 3.525778293609619, + "learning_rate": 2.8293852227862382e-05, + "loss": 0.7094, + "step": 7698 + }, + { + "epoch": 4.342357586012408, + "grad_norm": 1.292281985282898, + "learning_rate": 2.829103214890017e-05, + "loss": 0.7137, + "step": 7699 + }, + { + "epoch": 4.342921601804851, + "grad_norm": 4.893954277038574, + "learning_rate": 2.828821206993796e-05, + "loss": 0.7715, + "step": 7700 + }, + { + "epoch": 4.343485617597293, + "grad_norm": 1.3945815563201904, + "learning_rate": 2.828539199097575e-05, + "loss": 0.8175, + "step": 7701 + }, + { + "epoch": 4.344049633389735, + "grad_norm": 0.9567761421203613, + "learning_rate": 2.828257191201354e-05, + "loss": 0.6891, + "step": 7702 + }, + { + "epoch": 4.344613649182177, + "grad_norm": 1.167527437210083, + "learning_rate": 2.8279751833051326e-05, + "loss": 0.7503, + "step": 7703 + }, + { + "epoch": 4.345177664974619, + "grad_norm": 1.1191253662109375, + "learning_rate": 2.8276931754089115e-05, + "loss": 0.8041, + "step": 7704 + }, + { + "epoch": 4.345741680767062, + "grad_norm": 1.3187255859375, + "learning_rate": 2.8274111675126907e-05, + "loss": 0.7797, + "step": 7705 + }, + { + "epoch": 4.3463056965595035, + "grad_norm": 1.447105050086975, + "learning_rate": 2.8271291596164696e-05, + "loss": 0.7361, + "step": 7706 + }, + { + "epoch": 4.346869712351946, + "grad_norm": 1.277083396911621, + "learning_rate": 2.826847151720248e-05, + "loss": 0.7699, + "step": 7707 + }, + { + "epoch": 4.347433728144388, + "grad_norm": 1.3996073007583618, + "learning_rate": 2.826565143824027e-05, + "loss": 0.802, + "step": 7708 + }, + { + "epoch": 4.34799774393683, + "grad_norm": 1.468860387802124, + "learning_rate": 2.8262831359278062e-05, + "loss": 0.8664, + "step": 7709 + }, + { + "epoch": 4.3485617597292725, + "grad_norm": 0.9666085243225098, + "learning_rate": 2.826001128031585e-05, + "loss": 0.6998, + "step": 7710 + }, + { + "epoch": 4.349125775521714, + "grad_norm": 1.2757220268249512, + "learning_rate": 2.825719120135364e-05, + "loss": 0.8288, + "step": 7711 + }, + { + "epoch": 4.349689791314157, + "grad_norm": 1.442526936531067, + "learning_rate": 2.8254371122391425e-05, + "loss": 0.8593, + "step": 7712 + }, + { + "epoch": 4.350253807106599, + "grad_norm": 1.4268895387649536, + "learning_rate": 2.825155104342922e-05, + "loss": 0.8075, + "step": 7713 + }, + { + "epoch": 4.3508178228990415, + "grad_norm": 1.096364974975586, + "learning_rate": 2.8248730964467006e-05, + "loss": 0.8568, + "step": 7714 + }, + { + "epoch": 4.351381838691483, + "grad_norm": 0.7220455408096313, + "learning_rate": 2.8245910885504795e-05, + "loss": 0.55, + "step": 7715 + }, + { + "epoch": 4.351945854483926, + "grad_norm": 1.061663031578064, + "learning_rate": 2.8243090806542584e-05, + "loss": 0.774, + "step": 7716 + }, + { + "epoch": 4.352509870276368, + "grad_norm": 1.6672099828720093, + "learning_rate": 2.8240270727580376e-05, + "loss": 0.7266, + "step": 7717 + }, + { + "epoch": 4.35307388606881, + "grad_norm": 0.9888852834701538, + "learning_rate": 2.8237450648618165e-05, + "loss": 0.5896, + "step": 7718 + }, + { + "epoch": 4.353637901861252, + "grad_norm": 1.0103774070739746, + "learning_rate": 2.823463056965595e-05, + "loss": 0.7119, + "step": 7719 + }, + { + "epoch": 4.354201917653694, + "grad_norm": 0.9086347222328186, + "learning_rate": 2.823181049069374e-05, + "loss": 0.6893, + "step": 7720 + }, + { + "epoch": 4.354765933446137, + "grad_norm": 1.245914340019226, + "learning_rate": 2.822899041173153e-05, + "loss": 0.7776, + "step": 7721 + }, + { + "epoch": 4.355329949238579, + "grad_norm": 1.7426981925964355, + "learning_rate": 2.822617033276932e-05, + "loss": 0.8034, + "step": 7722 + }, + { + "epoch": 4.35589396503102, + "grad_norm": 1.2527499198913574, + "learning_rate": 2.822335025380711e-05, + "loss": 0.7727, + "step": 7723 + }, + { + "epoch": 4.356457980823463, + "grad_norm": 1.1851006746292114, + "learning_rate": 2.8220530174844894e-05, + "loss": 0.7161, + "step": 7724 + }, + { + "epoch": 4.357021996615905, + "grad_norm": 1.9735888242721558, + "learning_rate": 2.8217710095882686e-05, + "loss": 0.6815, + "step": 7725 + }, + { + "epoch": 4.357586012408348, + "grad_norm": 1.5722932815551758, + "learning_rate": 2.8214890016920475e-05, + "loss": 0.7587, + "step": 7726 + }, + { + "epoch": 4.3581500282007894, + "grad_norm": 0.9914368391036987, + "learning_rate": 2.8212069937958264e-05, + "loss": 0.6699, + "step": 7727 + }, + { + "epoch": 4.358714043993232, + "grad_norm": 1.6378098726272583, + "learning_rate": 2.820924985899605e-05, + "loss": 0.7539, + "step": 7728 + }, + { + "epoch": 4.359278059785674, + "grad_norm": 1.4919116497039795, + "learning_rate": 2.8206429780033845e-05, + "loss": 0.7776, + "step": 7729 + }, + { + "epoch": 4.359842075578117, + "grad_norm": 1.9790775775909424, + "learning_rate": 2.820360970107163e-05, + "loss": 0.8965, + "step": 7730 + }, + { + "epoch": 4.3604060913705585, + "grad_norm": 0.8787882924079895, + "learning_rate": 2.820078962210942e-05, + "loss": 0.654, + "step": 7731 + }, + { + "epoch": 4.360970107163, + "grad_norm": 1.2300126552581787, + "learning_rate": 2.8197969543147208e-05, + "loss": 0.8784, + "step": 7732 + }, + { + "epoch": 4.361534122955443, + "grad_norm": 1.3162825107574463, + "learning_rate": 2.8195149464185e-05, + "loss": 0.7805, + "step": 7733 + }, + { + "epoch": 4.362098138747885, + "grad_norm": 1.3184447288513184, + "learning_rate": 2.819232938522279e-05, + "loss": 0.8134, + "step": 7734 + }, + { + "epoch": 4.3626621545403275, + "grad_norm": 1.6046785116195679, + "learning_rate": 2.8189509306260574e-05, + "loss": 0.9488, + "step": 7735 + }, + { + "epoch": 4.363226170332769, + "grad_norm": 1.22018563747406, + "learning_rate": 2.818668922729837e-05, + "loss": 0.7976, + "step": 7736 + }, + { + "epoch": 4.363790186125211, + "grad_norm": 1.1038966178894043, + "learning_rate": 2.8183869148336155e-05, + "loss": 0.7764, + "step": 7737 + }, + { + "epoch": 4.364354201917654, + "grad_norm": 1.2360683679580688, + "learning_rate": 2.8181049069373944e-05, + "loss": 0.6938, + "step": 7738 + }, + { + "epoch": 4.364918217710096, + "grad_norm": 1.5997456312179565, + "learning_rate": 2.8178228990411733e-05, + "loss": 0.8406, + "step": 7739 + }, + { + "epoch": 4.365482233502538, + "grad_norm": 1.5775675773620605, + "learning_rate": 2.8175408911449525e-05, + "loss": 0.8282, + "step": 7740 + }, + { + "epoch": 4.36604624929498, + "grad_norm": 1.4079927206039429, + "learning_rate": 2.8172588832487314e-05, + "loss": 0.7193, + "step": 7741 + }, + { + "epoch": 4.366610265087423, + "grad_norm": 1.820389986038208, + "learning_rate": 2.81697687535251e-05, + "loss": 0.8439, + "step": 7742 + }, + { + "epoch": 4.367174280879865, + "grad_norm": 0.9302222728729248, + "learning_rate": 2.8166948674562888e-05, + "loss": 0.6608, + "step": 7743 + }, + { + "epoch": 4.367738296672307, + "grad_norm": 1.2706114053726196, + "learning_rate": 2.816412859560068e-05, + "loss": 0.7005, + "step": 7744 + }, + { + "epoch": 4.368302312464749, + "grad_norm": 1.22764253616333, + "learning_rate": 2.816130851663847e-05, + "loss": 0.7861, + "step": 7745 + }, + { + "epoch": 4.368866328257191, + "grad_norm": 2.075408935546875, + "learning_rate": 2.8158488437676254e-05, + "loss": 0.7928, + "step": 7746 + }, + { + "epoch": 4.369430344049634, + "grad_norm": 1.204465389251709, + "learning_rate": 2.8155668358714043e-05, + "loss": 0.7783, + "step": 7747 + }, + { + "epoch": 4.369994359842075, + "grad_norm": 1.0010626316070557, + "learning_rate": 2.8152848279751835e-05, + "loss": 0.7053, + "step": 7748 + }, + { + "epoch": 4.370558375634518, + "grad_norm": 1.2057998180389404, + "learning_rate": 2.8150028200789624e-05, + "loss": 0.6057, + "step": 7749 + }, + { + "epoch": 4.37112239142696, + "grad_norm": 1.0817034244537354, + "learning_rate": 2.8147208121827413e-05, + "loss": 0.7816, + "step": 7750 + }, + { + "epoch": 4.371686407219402, + "grad_norm": 1.3209952116012573, + "learning_rate": 2.8144388042865198e-05, + "loss": 0.7746, + "step": 7751 + }, + { + "epoch": 4.372250423011844, + "grad_norm": 1.861583948135376, + "learning_rate": 2.8141567963902994e-05, + "loss": 0.8121, + "step": 7752 + }, + { + "epoch": 4.372814438804286, + "grad_norm": 1.3429932594299316, + "learning_rate": 2.813874788494078e-05, + "loss": 0.7912, + "step": 7753 + }, + { + "epoch": 4.373378454596729, + "grad_norm": 0.8256375789642334, + "learning_rate": 2.8135927805978568e-05, + "loss": 0.6076, + "step": 7754 + }, + { + "epoch": 4.373942470389171, + "grad_norm": 1.4395219087600708, + "learning_rate": 2.8133107727016357e-05, + "loss": 0.8415, + "step": 7755 + }, + { + "epoch": 4.374506486181613, + "grad_norm": 13.835236549377441, + "learning_rate": 2.813028764805415e-05, + "loss": 0.7763, + "step": 7756 + }, + { + "epoch": 4.375070501974055, + "grad_norm": 1.174875259399414, + "learning_rate": 2.8127467569091938e-05, + "loss": 0.6487, + "step": 7757 + }, + { + "epoch": 4.375634517766498, + "grad_norm": 1.5919336080551147, + "learning_rate": 2.8124647490129723e-05, + "loss": 0.9667, + "step": 7758 + }, + { + "epoch": 4.37619853355894, + "grad_norm": 1.4046199321746826, + "learning_rate": 2.8121827411167512e-05, + "loss": 0.8019, + "step": 7759 + }, + { + "epoch": 4.3767625493513815, + "grad_norm": 5.12776517868042, + "learning_rate": 2.8119007332205304e-05, + "loss": 0.835, + "step": 7760 + }, + { + "epoch": 4.377326565143824, + "grad_norm": 1.5111050605773926, + "learning_rate": 2.8116187253243093e-05, + "loss": 0.7642, + "step": 7761 + }, + { + "epoch": 4.377890580936266, + "grad_norm": 1.2167541980743408, + "learning_rate": 2.8113367174280882e-05, + "loss": 0.7898, + "step": 7762 + }, + { + "epoch": 4.378454596728709, + "grad_norm": 1.2590800523757935, + "learning_rate": 2.8110547095318667e-05, + "loss": 0.7586, + "step": 7763 + }, + { + "epoch": 4.3790186125211505, + "grad_norm": 1.1447041034698486, + "learning_rate": 2.810772701635646e-05, + "loss": 0.7221, + "step": 7764 + }, + { + "epoch": 4.379582628313592, + "grad_norm": 1.4885456562042236, + "learning_rate": 2.8104906937394248e-05, + "loss": 0.8023, + "step": 7765 + }, + { + "epoch": 4.380146644106035, + "grad_norm": 1.2799041271209717, + "learning_rate": 2.8102086858432037e-05, + "loss": 0.7922, + "step": 7766 + }, + { + "epoch": 4.380710659898477, + "grad_norm": 1.134682059288025, + "learning_rate": 2.8099266779469822e-05, + "loss": 0.6621, + "step": 7767 + }, + { + "epoch": 4.3812746756909196, + "grad_norm": 0.8104646801948547, + "learning_rate": 2.8096446700507618e-05, + "loss": 0.6548, + "step": 7768 + }, + { + "epoch": 4.381838691483361, + "grad_norm": 2.558155059814453, + "learning_rate": 2.8093626621545403e-05, + "loss": 0.6689, + "step": 7769 + }, + { + "epoch": 4.382402707275804, + "grad_norm": 1.3917343616485596, + "learning_rate": 2.8090806542583192e-05, + "loss": 0.7956, + "step": 7770 + }, + { + "epoch": 4.382966723068246, + "grad_norm": 0.9795765280723572, + "learning_rate": 2.808798646362098e-05, + "loss": 0.7184, + "step": 7771 + }, + { + "epoch": 4.383530738860689, + "grad_norm": 1.1673299074172974, + "learning_rate": 2.8085166384658773e-05, + "loss": 0.7795, + "step": 7772 + }, + { + "epoch": 4.38409475465313, + "grad_norm": 1.0241196155548096, + "learning_rate": 2.8082346305696562e-05, + "loss": 0.7052, + "step": 7773 + }, + { + "epoch": 4.384658770445572, + "grad_norm": 4.439658164978027, + "learning_rate": 2.8079526226734347e-05, + "loss": 0.8467, + "step": 7774 + }, + { + "epoch": 4.385222786238015, + "grad_norm": 1.5495586395263672, + "learning_rate": 2.8076706147772143e-05, + "loss": 0.7183, + "step": 7775 + }, + { + "epoch": 4.385786802030457, + "grad_norm": 1.0653672218322754, + "learning_rate": 2.807388606880993e-05, + "loss": 0.7892, + "step": 7776 + }, + { + "epoch": 4.386350817822899, + "grad_norm": 1.3353604078292847, + "learning_rate": 2.8071065989847717e-05, + "loss": 0.7884, + "step": 7777 + }, + { + "epoch": 4.386914833615341, + "grad_norm": 1.7924225330352783, + "learning_rate": 2.8068245910885506e-05, + "loss": 0.8446, + "step": 7778 + }, + { + "epoch": 4.387478849407783, + "grad_norm": 0.9921491146087646, + "learning_rate": 2.8065425831923298e-05, + "loss": 0.6823, + "step": 7779 + }, + { + "epoch": 4.388042865200226, + "grad_norm": 1.3837181329727173, + "learning_rate": 2.8062605752961087e-05, + "loss": 0.7397, + "step": 7780 + }, + { + "epoch": 4.3886068809926675, + "grad_norm": 1.368569016456604, + "learning_rate": 2.8059785673998872e-05, + "loss": 0.8389, + "step": 7781 + }, + { + "epoch": 4.38917089678511, + "grad_norm": 2.421445846557617, + "learning_rate": 2.805696559503666e-05, + "loss": 0.7625, + "step": 7782 + }, + { + "epoch": 4.389734912577552, + "grad_norm": 1.4051955938339233, + "learning_rate": 2.8054145516074453e-05, + "loss": 0.746, + "step": 7783 + }, + { + "epoch": 4.390298928369995, + "grad_norm": 1.393864393234253, + "learning_rate": 2.8051325437112242e-05, + "loss": 0.8762, + "step": 7784 + }, + { + "epoch": 4.3908629441624365, + "grad_norm": 1.6294876337051392, + "learning_rate": 2.8048505358150028e-05, + "loss": 0.8508, + "step": 7785 + }, + { + "epoch": 4.391426959954879, + "grad_norm": 1.2858312129974365, + "learning_rate": 2.8045685279187816e-05, + "loss": 0.7455, + "step": 7786 + }, + { + "epoch": 4.391990975747321, + "grad_norm": 1.069641351699829, + "learning_rate": 2.804286520022561e-05, + "loss": 0.6908, + "step": 7787 + }, + { + "epoch": 4.392554991539763, + "grad_norm": 0.890540599822998, + "learning_rate": 2.8040045121263397e-05, + "loss": 0.7428, + "step": 7788 + }, + { + "epoch": 4.3931190073322055, + "grad_norm": 1.3621538877487183, + "learning_rate": 2.8037225042301186e-05, + "loss": 0.753, + "step": 7789 + }, + { + "epoch": 4.393683023124647, + "grad_norm": 1.2102913856506348, + "learning_rate": 2.803440496333897e-05, + "loss": 0.7791, + "step": 7790 + }, + { + "epoch": 4.39424703891709, + "grad_norm": 1.055444598197937, + "learning_rate": 2.8031584884376767e-05, + "loss": 0.7495, + "step": 7791 + }, + { + "epoch": 4.394811054709532, + "grad_norm": 1.0945168733596802, + "learning_rate": 2.8028764805414553e-05, + "loss": 0.8653, + "step": 7792 + }, + { + "epoch": 4.395375070501974, + "grad_norm": 1.67923104763031, + "learning_rate": 2.802594472645234e-05, + "loss": 0.7548, + "step": 7793 + }, + { + "epoch": 4.395939086294416, + "grad_norm": 1.4600465297698975, + "learning_rate": 2.802312464749013e-05, + "loss": 0.7004, + "step": 7794 + }, + { + "epoch": 4.396503102086858, + "grad_norm": 1.035143494606018, + "learning_rate": 2.8020304568527922e-05, + "loss": 0.7387, + "step": 7795 + }, + { + "epoch": 4.397067117879301, + "grad_norm": 1.0280953645706177, + "learning_rate": 2.801748448956571e-05, + "loss": 0.6664, + "step": 7796 + }, + { + "epoch": 4.397631133671743, + "grad_norm": 1.0499093532562256, + "learning_rate": 2.8014664410603496e-05, + "loss": 0.7291, + "step": 7797 + }, + { + "epoch": 4.398195149464185, + "grad_norm": 1.0749760866165161, + "learning_rate": 2.8011844331641285e-05, + "loss": 0.8001, + "step": 7798 + }, + { + "epoch": 4.398759165256627, + "grad_norm": 1.3375493288040161, + "learning_rate": 2.8009024252679077e-05, + "loss": 0.8171, + "step": 7799 + }, + { + "epoch": 4.39932318104907, + "grad_norm": 1.0770409107208252, + "learning_rate": 2.8006204173716866e-05, + "loss": 0.7813, + "step": 7800 + }, + { + "epoch": 4.399887196841512, + "grad_norm": 1.179721713066101, + "learning_rate": 2.800338409475465e-05, + "loss": 0.724, + "step": 7801 + }, + { + "epoch": 4.4004512126339534, + "grad_norm": 1.0879814624786377, + "learning_rate": 2.800056401579244e-05, + "loss": 0.6166, + "step": 7802 + }, + { + "epoch": 4.401015228426396, + "grad_norm": 1.4094977378845215, + "learning_rate": 2.7997743936830233e-05, + "loss": 0.7544, + "step": 7803 + }, + { + "epoch": 4.401579244218838, + "grad_norm": 1.329872488975525, + "learning_rate": 2.799492385786802e-05, + "loss": 0.686, + "step": 7804 + }, + { + "epoch": 4.402143260011281, + "grad_norm": 1.7095826864242554, + "learning_rate": 2.799210377890581e-05, + "loss": 0.9199, + "step": 7805 + }, + { + "epoch": 4.4027072758037225, + "grad_norm": 0.9305998682975769, + "learning_rate": 2.7989283699943596e-05, + "loss": 0.6707, + "step": 7806 + }, + { + "epoch": 4.403271291596164, + "grad_norm": 1.1210578680038452, + "learning_rate": 2.798646362098139e-05, + "loss": 0.7643, + "step": 7807 + }, + { + "epoch": 4.403835307388607, + "grad_norm": 0.925287127494812, + "learning_rate": 2.7983643542019177e-05, + "loss": 0.6868, + "step": 7808 + }, + { + "epoch": 4.404399323181049, + "grad_norm": 1.204323410987854, + "learning_rate": 2.7980823463056965e-05, + "loss": 0.7382, + "step": 7809 + }, + { + "epoch": 4.4049633389734915, + "grad_norm": 1.7542446851730347, + "learning_rate": 2.7978003384094758e-05, + "loss": 0.769, + "step": 7810 + }, + { + "epoch": 4.405527354765933, + "grad_norm": 1.0246723890304565, + "learning_rate": 2.7975183305132546e-05, + "loss": 0.5953, + "step": 7811 + }, + { + "epoch": 4.406091370558376, + "grad_norm": 0.8524156808853149, + "learning_rate": 2.7972363226170335e-05, + "loss": 0.6455, + "step": 7812 + }, + { + "epoch": 4.406655386350818, + "grad_norm": 1.6144676208496094, + "learning_rate": 2.796954314720812e-05, + "loss": 0.7989, + "step": 7813 + }, + { + "epoch": 4.4072194021432605, + "grad_norm": 1.1548302173614502, + "learning_rate": 2.7966723068245916e-05, + "loss": 0.709, + "step": 7814 + }, + { + "epoch": 4.407783417935702, + "grad_norm": 1.2070196866989136, + "learning_rate": 2.79639029892837e-05, + "loss": 0.8288, + "step": 7815 + }, + { + "epoch": 4.408347433728144, + "grad_norm": 0.9131433367729187, + "learning_rate": 2.796108291032149e-05, + "loss": 0.762, + "step": 7816 + }, + { + "epoch": 4.408911449520587, + "grad_norm": 1.8400177955627441, + "learning_rate": 2.795826283135928e-05, + "loss": 0.8864, + "step": 7817 + }, + { + "epoch": 4.409475465313029, + "grad_norm": 1.4314018487930298, + "learning_rate": 2.795544275239707e-05, + "loss": 0.7672, + "step": 7818 + }, + { + "epoch": 4.410039481105471, + "grad_norm": 1.3091464042663574, + "learning_rate": 2.7952622673434857e-05, + "loss": 0.719, + "step": 7819 + }, + { + "epoch": 4.410603496897913, + "grad_norm": 1.45077645778656, + "learning_rate": 2.7949802594472646e-05, + "loss": 0.7209, + "step": 7820 + }, + { + "epoch": 4.411167512690355, + "grad_norm": 1.223375678062439, + "learning_rate": 2.7946982515510434e-05, + "loss": 0.6481, + "step": 7821 + }, + { + "epoch": 4.411731528482798, + "grad_norm": 1.3703887462615967, + "learning_rate": 2.7944162436548227e-05, + "loss": 0.737, + "step": 7822 + }, + { + "epoch": 4.412295544275239, + "grad_norm": 0.833962082862854, + "learning_rate": 2.7941342357586015e-05, + "loss": 0.6607, + "step": 7823 + }, + { + "epoch": 4.412859560067682, + "grad_norm": 1.4134718179702759, + "learning_rate": 2.79385222786238e-05, + "loss": 0.6865, + "step": 7824 + }, + { + "epoch": 4.413423575860124, + "grad_norm": 1.3377989530563354, + "learning_rate": 2.793570219966159e-05, + "loss": 0.8158, + "step": 7825 + }, + { + "epoch": 4.413987591652567, + "grad_norm": 2.24003529548645, + "learning_rate": 2.7932882120699382e-05, + "loss": 0.7902, + "step": 7826 + }, + { + "epoch": 4.414551607445008, + "grad_norm": 1.321770191192627, + "learning_rate": 2.793006204173717e-05, + "loss": 0.7742, + "step": 7827 + }, + { + "epoch": 4.415115623237451, + "grad_norm": 1.531746506690979, + "learning_rate": 2.792724196277496e-05, + "loss": 0.8176, + "step": 7828 + }, + { + "epoch": 4.415679639029893, + "grad_norm": 0.9782936573028564, + "learning_rate": 2.7924421883812745e-05, + "loss": 0.7334, + "step": 7829 + }, + { + "epoch": 4.416243654822335, + "grad_norm": 1.5740883350372314, + "learning_rate": 2.792160180485054e-05, + "loss": 0.7672, + "step": 7830 + }, + { + "epoch": 4.416807670614777, + "grad_norm": 1.2276872396469116, + "learning_rate": 2.7918781725888326e-05, + "loss": 0.7548, + "step": 7831 + }, + { + "epoch": 4.417371686407219, + "grad_norm": 1.1870465278625488, + "learning_rate": 2.7915961646926115e-05, + "loss": 0.7807, + "step": 7832 + }, + { + "epoch": 4.417935702199662, + "grad_norm": 1.1696300506591797, + "learning_rate": 2.7913141567963903e-05, + "loss": 0.7185, + "step": 7833 + }, + { + "epoch": 4.418499717992104, + "grad_norm": 1.2150886058807373, + "learning_rate": 2.7910321489001696e-05, + "loss": 0.8315, + "step": 7834 + }, + { + "epoch": 4.4190637337845455, + "grad_norm": 1.4779787063598633, + "learning_rate": 2.7907501410039484e-05, + "loss": 0.7733, + "step": 7835 + }, + { + "epoch": 4.419627749576988, + "grad_norm": 1.1077741384506226, + "learning_rate": 2.790468133107727e-05, + "loss": 0.6877, + "step": 7836 + }, + { + "epoch": 4.42019176536943, + "grad_norm": 0.9944502711296082, + "learning_rate": 2.790186125211506e-05, + "loss": 0.7218, + "step": 7837 + }, + { + "epoch": 4.420755781161873, + "grad_norm": 0.9226536750793457, + "learning_rate": 2.789904117315285e-05, + "loss": 0.6984, + "step": 7838 + }, + { + "epoch": 4.4213197969543145, + "grad_norm": 1.275018572807312, + "learning_rate": 2.789622109419064e-05, + "loss": 0.7453, + "step": 7839 + }, + { + "epoch": 4.421883812746757, + "grad_norm": 1.427428960800171, + "learning_rate": 2.7893401015228425e-05, + "loss": 0.7904, + "step": 7840 + }, + { + "epoch": 4.422447828539199, + "grad_norm": 0.9510368704795837, + "learning_rate": 2.7890580936266214e-05, + "loss": 0.7022, + "step": 7841 + }, + { + "epoch": 4.423011844331642, + "grad_norm": 1.501806616783142, + "learning_rate": 2.7887760857304006e-05, + "loss": 0.7609, + "step": 7842 + }, + { + "epoch": 4.4235758601240835, + "grad_norm": 0.9619936347007751, + "learning_rate": 2.7884940778341795e-05, + "loss": 0.7665, + "step": 7843 + }, + { + "epoch": 4.424139875916525, + "grad_norm": 1.0713129043579102, + "learning_rate": 2.7882120699379583e-05, + "loss": 0.7767, + "step": 7844 + }, + { + "epoch": 4.424703891708968, + "grad_norm": 1.2840155363082886, + "learning_rate": 2.7879300620417376e-05, + "loss": 0.7988, + "step": 7845 + }, + { + "epoch": 4.42526790750141, + "grad_norm": 1.5209310054779053, + "learning_rate": 2.7876480541455164e-05, + "loss": 0.7406, + "step": 7846 + }, + { + "epoch": 4.4258319232938526, + "grad_norm": 1.126114845275879, + "learning_rate": 2.787366046249295e-05, + "loss": 0.6875, + "step": 7847 + }, + { + "epoch": 4.426395939086294, + "grad_norm": 1.2683497667312622, + "learning_rate": 2.787084038353074e-05, + "loss": 0.7753, + "step": 7848 + }, + { + "epoch": 4.426959954878736, + "grad_norm": 1.3469469547271729, + "learning_rate": 2.786802030456853e-05, + "loss": 0.7737, + "step": 7849 + }, + { + "epoch": 4.427523970671179, + "grad_norm": 1.4205286502838135, + "learning_rate": 2.786520022560632e-05, + "loss": 0.8413, + "step": 7850 + }, + { + "epoch": 4.428087986463621, + "grad_norm": 1.2909127473831177, + "learning_rate": 2.786238014664411e-05, + "loss": 0.8198, + "step": 7851 + }, + { + "epoch": 4.428652002256063, + "grad_norm": 1.2056976556777954, + "learning_rate": 2.7859560067681894e-05, + "loss": 0.6342, + "step": 7852 + }, + { + "epoch": 4.429216018048505, + "grad_norm": 1.106584072113037, + "learning_rate": 2.785673998871969e-05, + "loss": 0.7361, + "step": 7853 + }, + { + "epoch": 4.429780033840948, + "grad_norm": 1.704590082168579, + "learning_rate": 2.7853919909757475e-05, + "loss": 0.8656, + "step": 7854 + }, + { + "epoch": 4.43034404963339, + "grad_norm": 0.8869720697402954, + "learning_rate": 2.7851099830795264e-05, + "loss": 0.6818, + "step": 7855 + }, + { + "epoch": 4.430908065425832, + "grad_norm": 1.0837827920913696, + "learning_rate": 2.784827975183305e-05, + "loss": 0.7572, + "step": 7856 + }, + { + "epoch": 4.431472081218274, + "grad_norm": 0.9747019410133362, + "learning_rate": 2.7845459672870845e-05, + "loss": 0.7202, + "step": 7857 + }, + { + "epoch": 4.432036097010716, + "grad_norm": 0.9463246464729309, + "learning_rate": 2.784263959390863e-05, + "loss": 0.7809, + "step": 7858 + }, + { + "epoch": 4.432600112803159, + "grad_norm": 1.3044337034225464, + "learning_rate": 2.783981951494642e-05, + "loss": 0.6376, + "step": 7859 + }, + { + "epoch": 4.4331641285956005, + "grad_norm": 2.108431339263916, + "learning_rate": 2.7836999435984208e-05, + "loss": 0.8901, + "step": 7860 + }, + { + "epoch": 4.433728144388043, + "grad_norm": 1.260980248451233, + "learning_rate": 2.7834179357022e-05, + "loss": 0.6962, + "step": 7861 + }, + { + "epoch": 4.434292160180485, + "grad_norm": 1.1842292547225952, + "learning_rate": 2.783135927805979e-05, + "loss": 0.7673, + "step": 7862 + }, + { + "epoch": 4.434856175972927, + "grad_norm": 1.3219767808914185, + "learning_rate": 2.7828539199097574e-05, + "loss": 0.7116, + "step": 7863 + }, + { + "epoch": 4.4354201917653695, + "grad_norm": 1.070684552192688, + "learning_rate": 2.7825719120135363e-05, + "loss": 0.8205, + "step": 7864 + }, + { + "epoch": 4.435984207557811, + "grad_norm": 1.0197266340255737, + "learning_rate": 2.7822899041173155e-05, + "loss": 0.6732, + "step": 7865 + }, + { + "epoch": 4.436548223350254, + "grad_norm": 1.664245843887329, + "learning_rate": 2.7820078962210944e-05, + "loss": 0.7505, + "step": 7866 + }, + { + "epoch": 4.437112239142696, + "grad_norm": 1.6288961172103882, + "learning_rate": 2.7817258883248733e-05, + "loss": 0.7996, + "step": 7867 + }, + { + "epoch": 4.4376762549351385, + "grad_norm": 1.5506784915924072, + "learning_rate": 2.7814438804286518e-05, + "loss": 0.6742, + "step": 7868 + }, + { + "epoch": 4.43824027072758, + "grad_norm": 0.9389987587928772, + "learning_rate": 2.7811618725324314e-05, + "loss": 0.6691, + "step": 7869 + }, + { + "epoch": 4.438804286520023, + "grad_norm": 1.0620964765548706, + "learning_rate": 2.78087986463621e-05, + "loss": 0.8093, + "step": 7870 + }, + { + "epoch": 4.439368302312465, + "grad_norm": 1.3464382886886597, + "learning_rate": 2.7805978567399888e-05, + "loss": 0.8339, + "step": 7871 + }, + { + "epoch": 4.439932318104907, + "grad_norm": 1.5747380256652832, + "learning_rate": 2.7803158488437677e-05, + "loss": 0.8092, + "step": 7872 + }, + { + "epoch": 4.440496333897349, + "grad_norm": 1.0110893249511719, + "learning_rate": 2.780033840947547e-05, + "loss": 0.7607, + "step": 7873 + }, + { + "epoch": 4.441060349689791, + "grad_norm": 0.9994455575942993, + "learning_rate": 2.7797518330513254e-05, + "loss": 0.7582, + "step": 7874 + }, + { + "epoch": 4.441624365482234, + "grad_norm": 1.0898665189743042, + "learning_rate": 2.7794698251551043e-05, + "loss": 0.6977, + "step": 7875 + }, + { + "epoch": 4.442188381274676, + "grad_norm": 1.7564858198165894, + "learning_rate": 2.7791878172588832e-05, + "loss": 0.8897, + "step": 7876 + }, + { + "epoch": 4.442752397067117, + "grad_norm": 1.221510410308838, + "learning_rate": 2.7789058093626624e-05, + "loss": 0.7049, + "step": 7877 + }, + { + "epoch": 4.44331641285956, + "grad_norm": 2.364335298538208, + "learning_rate": 2.7786238014664413e-05, + "loss": 0.9381, + "step": 7878 + }, + { + "epoch": 4.443880428652002, + "grad_norm": 1.2955061197280884, + "learning_rate": 2.7783417935702198e-05, + "loss": 0.689, + "step": 7879 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.556610345840454, + "learning_rate": 2.7780597856739994e-05, + "loss": 0.8592, + "step": 7880 + }, + { + "epoch": 4.4450084602368864, + "grad_norm": 1.6513779163360596, + "learning_rate": 2.777777777777778e-05, + "loss": 0.8455, + "step": 7881 + }, + { + "epoch": 4.445572476029329, + "grad_norm": 0.9932650327682495, + "learning_rate": 2.7774957698815568e-05, + "loss": 0.6911, + "step": 7882 + }, + { + "epoch": 4.446136491821771, + "grad_norm": 1.451695442199707, + "learning_rate": 2.7772137619853357e-05, + "loss": 0.7883, + "step": 7883 + }, + { + "epoch": 4.446700507614214, + "grad_norm": 1.1860965490341187, + "learning_rate": 2.776931754089115e-05, + "loss": 0.7511, + "step": 7884 + }, + { + "epoch": 4.4472645234066555, + "grad_norm": 2.529273748397827, + "learning_rate": 2.7766497461928938e-05, + "loss": 0.8643, + "step": 7885 + }, + { + "epoch": 4.447828539199097, + "grad_norm": 1.336841106414795, + "learning_rate": 2.7763677382966723e-05, + "loss": 0.6891, + "step": 7886 + }, + { + "epoch": 4.44839255499154, + "grad_norm": 1.2180861234664917, + "learning_rate": 2.7760857304004512e-05, + "loss": 0.7615, + "step": 7887 + }, + { + "epoch": 4.448956570783982, + "grad_norm": 2.019392967224121, + "learning_rate": 2.7758037225042304e-05, + "loss": 0.8135, + "step": 7888 + }, + { + "epoch": 4.4495205865764245, + "grad_norm": 1.8265875577926636, + "learning_rate": 2.7755217146080093e-05, + "loss": 0.7794, + "step": 7889 + }, + { + "epoch": 4.450084602368866, + "grad_norm": 1.2699692249298096, + "learning_rate": 2.775239706711788e-05, + "loss": 0.738, + "step": 7890 + }, + { + "epoch": 4.450648618161308, + "grad_norm": 1.2249058485031128, + "learning_rate": 2.7749576988155667e-05, + "loss": 0.7259, + "step": 7891 + }, + { + "epoch": 4.451212633953751, + "grad_norm": 1.479358434677124, + "learning_rate": 2.774675690919346e-05, + "loss": 0.8031, + "step": 7892 + }, + { + "epoch": 4.451776649746193, + "grad_norm": 1.1927990913391113, + "learning_rate": 2.7743936830231248e-05, + "loss": 0.7901, + "step": 7893 + }, + { + "epoch": 4.452340665538635, + "grad_norm": 1.157591700553894, + "learning_rate": 2.7741116751269037e-05, + "loss": 0.6716, + "step": 7894 + }, + { + "epoch": 4.452904681331077, + "grad_norm": 1.3254834413528442, + "learning_rate": 2.7738296672306822e-05, + "loss": 0.8738, + "step": 7895 + }, + { + "epoch": 4.45346869712352, + "grad_norm": 1.1699117422103882, + "learning_rate": 2.7735476593344618e-05, + "loss": 0.7296, + "step": 7896 + }, + { + "epoch": 4.454032712915962, + "grad_norm": 1.6305667161941528, + "learning_rate": 2.7732656514382403e-05, + "loss": 0.6957, + "step": 7897 + }, + { + "epoch": 4.454596728708404, + "grad_norm": 1.384198784828186, + "learning_rate": 2.7729836435420192e-05, + "loss": 0.7251, + "step": 7898 + }, + { + "epoch": 4.455160744500846, + "grad_norm": 1.918563723564148, + "learning_rate": 2.772701635645798e-05, + "loss": 0.6786, + "step": 7899 + }, + { + "epoch": 4.455724760293288, + "grad_norm": 1.1590969562530518, + "learning_rate": 2.7724196277495773e-05, + "loss": 0.7364, + "step": 7900 + }, + { + "epoch": 4.456288776085731, + "grad_norm": 1.4760558605194092, + "learning_rate": 2.7721376198533562e-05, + "loss": 0.7784, + "step": 7901 + }, + { + "epoch": 4.456852791878172, + "grad_norm": 1.1489639282226562, + "learning_rate": 2.7718556119571347e-05, + "loss": 0.736, + "step": 7902 + }, + { + "epoch": 4.457416807670615, + "grad_norm": 1.0728205442428589, + "learning_rate": 2.7715736040609136e-05, + "loss": 0.7526, + "step": 7903 + }, + { + "epoch": 4.457980823463057, + "grad_norm": 1.5858906507492065, + "learning_rate": 2.7712915961646928e-05, + "loss": 0.8282, + "step": 7904 + }, + { + "epoch": 4.458544839255499, + "grad_norm": 1.236367106437683, + "learning_rate": 2.7710095882684717e-05, + "loss": 0.7773, + "step": 7905 + }, + { + "epoch": 4.459108855047941, + "grad_norm": 1.4390647411346436, + "learning_rate": 2.7707275803722506e-05, + "loss": 0.8234, + "step": 7906 + }, + { + "epoch": 4.459672870840383, + "grad_norm": 1.5024551153182983, + "learning_rate": 2.770445572476029e-05, + "loss": 0.8368, + "step": 7907 + }, + { + "epoch": 4.460236886632826, + "grad_norm": 1.105708122253418, + "learning_rate": 2.7701635645798087e-05, + "loss": 0.8544, + "step": 7908 + }, + { + "epoch": 4.460800902425268, + "grad_norm": 1.1386010646820068, + "learning_rate": 2.7698815566835872e-05, + "loss": 0.7017, + "step": 7909 + }, + { + "epoch": 4.46136491821771, + "grad_norm": 1.4386709928512573, + "learning_rate": 2.769599548787366e-05, + "loss": 0.902, + "step": 7910 + }, + { + "epoch": 4.461928934010152, + "grad_norm": 1.0551269054412842, + "learning_rate": 2.769317540891145e-05, + "loss": 0.733, + "step": 7911 + }, + { + "epoch": 4.462492949802595, + "grad_norm": 1.1459826231002808, + "learning_rate": 2.7690355329949242e-05, + "loss": 0.7687, + "step": 7912 + }, + { + "epoch": 4.463056965595037, + "grad_norm": 1.4318020343780518, + "learning_rate": 2.7687535250987027e-05, + "loss": 0.6858, + "step": 7913 + }, + { + "epoch": 4.4636209813874785, + "grad_norm": 1.4080861806869507, + "learning_rate": 2.7684715172024816e-05, + "loss": 0.7983, + "step": 7914 + }, + { + "epoch": 4.464184997179921, + "grad_norm": 1.3383618593215942, + "learning_rate": 2.768189509306261e-05, + "loss": 0.7321, + "step": 7915 + }, + { + "epoch": 4.464749012972363, + "grad_norm": 1.8249664306640625, + "learning_rate": 2.7679075014100397e-05, + "loss": 0.7504, + "step": 7916 + }, + { + "epoch": 4.465313028764806, + "grad_norm": 1.069496512413025, + "learning_rate": 2.7676254935138186e-05, + "loss": 0.6977, + "step": 7917 + }, + { + "epoch": 4.4658770445572475, + "grad_norm": 1.2327373027801514, + "learning_rate": 2.767343485617597e-05, + "loss": 0.7656, + "step": 7918 + }, + { + "epoch": 4.466441060349689, + "grad_norm": 1.2823644876480103, + "learning_rate": 2.7670614777213767e-05, + "loss": 0.7648, + "step": 7919 + }, + { + "epoch": 4.467005076142132, + "grad_norm": 1.6653841733932495, + "learning_rate": 2.7667794698251552e-05, + "loss": 0.8396, + "step": 7920 + }, + { + "epoch": 4.467569091934574, + "grad_norm": 2.615938186645508, + "learning_rate": 2.766497461928934e-05, + "loss": 0.8017, + "step": 7921 + }, + { + "epoch": 4.4681331077270166, + "grad_norm": 1.4951905012130737, + "learning_rate": 2.766215454032713e-05, + "loss": 0.7661, + "step": 7922 + }, + { + "epoch": 4.468697123519458, + "grad_norm": 1.4938396215438843, + "learning_rate": 2.7659334461364922e-05, + "loss": 0.8303, + "step": 7923 + }, + { + "epoch": 4.469261139311901, + "grad_norm": 1.1651314496994019, + "learning_rate": 2.765651438240271e-05, + "loss": 0.738, + "step": 7924 + }, + { + "epoch": 4.469825155104343, + "grad_norm": 1.3956387042999268, + "learning_rate": 2.7653694303440496e-05, + "loss": 0.6287, + "step": 7925 + }, + { + "epoch": 4.470389170896786, + "grad_norm": 1.0129112005233765, + "learning_rate": 2.7650874224478285e-05, + "loss": 0.8314, + "step": 7926 + }, + { + "epoch": 4.470953186689227, + "grad_norm": 1.0972778797149658, + "learning_rate": 2.7648054145516077e-05, + "loss": 0.8008, + "step": 7927 + }, + { + "epoch": 4.471517202481669, + "grad_norm": 1.1693408489227295, + "learning_rate": 2.7645234066553866e-05, + "loss": 0.6297, + "step": 7928 + }, + { + "epoch": 4.472081218274112, + "grad_norm": 1.1330887079238892, + "learning_rate": 2.7642413987591655e-05, + "loss": 0.7283, + "step": 7929 + }, + { + "epoch": 4.472645234066554, + "grad_norm": 1.318712830543518, + "learning_rate": 2.763959390862944e-05, + "loss": 0.7456, + "step": 7930 + }, + { + "epoch": 4.473209249858996, + "grad_norm": 1.2609647512435913, + "learning_rate": 2.7636773829667232e-05, + "loss": 0.7926, + "step": 7931 + }, + { + "epoch": 4.473773265651438, + "grad_norm": 1.1981717348098755, + "learning_rate": 2.763395375070502e-05, + "loss": 0.8121, + "step": 7932 + }, + { + "epoch": 4.47433728144388, + "grad_norm": 1.1007208824157715, + "learning_rate": 2.763113367174281e-05, + "loss": 0.6512, + "step": 7933 + }, + { + "epoch": 4.474901297236323, + "grad_norm": 1.3216415643692017, + "learning_rate": 2.7628313592780595e-05, + "loss": 0.7506, + "step": 7934 + }, + { + "epoch": 4.4754653130287645, + "grad_norm": 0.8384783267974854, + "learning_rate": 2.762549351381839e-05, + "loss": 0.6728, + "step": 7935 + }, + { + "epoch": 4.476029328821207, + "grad_norm": 1.2462891340255737, + "learning_rate": 2.7622673434856176e-05, + "loss": 0.7206, + "step": 7936 + }, + { + "epoch": 4.476593344613649, + "grad_norm": 1.4384130239486694, + "learning_rate": 2.7619853355893965e-05, + "loss": 0.7544, + "step": 7937 + }, + { + "epoch": 4.477157360406092, + "grad_norm": 1.2658425569534302, + "learning_rate": 2.7617033276931754e-05, + "loss": 0.7533, + "step": 7938 + }, + { + "epoch": 4.4777213761985335, + "grad_norm": 1.8400726318359375, + "learning_rate": 2.7614213197969546e-05, + "loss": 0.8885, + "step": 7939 + }, + { + "epoch": 4.478285391990976, + "grad_norm": 1.4017326831817627, + "learning_rate": 2.7611393119007335e-05, + "loss": 0.7569, + "step": 7940 + }, + { + "epoch": 4.478849407783418, + "grad_norm": 1.9973745346069336, + "learning_rate": 2.760857304004512e-05, + "loss": 0.7689, + "step": 7941 + }, + { + "epoch": 4.47941342357586, + "grad_norm": 0.9706562161445618, + "learning_rate": 2.760575296108291e-05, + "loss": 0.6398, + "step": 7942 + }, + { + "epoch": 4.4799774393683025, + "grad_norm": 0.8846675753593445, + "learning_rate": 2.76029328821207e-05, + "loss": 0.7454, + "step": 7943 + }, + { + "epoch": 4.480541455160744, + "grad_norm": 1.1339365243911743, + "learning_rate": 2.760011280315849e-05, + "loss": 0.7046, + "step": 7944 + }, + { + "epoch": 4.481105470953187, + "grad_norm": 2.117461919784546, + "learning_rate": 2.759729272419628e-05, + "loss": 0.9762, + "step": 7945 + }, + { + "epoch": 4.481669486745629, + "grad_norm": 1.4442260265350342, + "learning_rate": 2.7594472645234064e-05, + "loss": 0.9288, + "step": 7946 + }, + { + "epoch": 4.482233502538071, + "grad_norm": 1.4617167711257935, + "learning_rate": 2.759165256627186e-05, + "loss": 0.7911, + "step": 7947 + }, + { + "epoch": 4.482797518330513, + "grad_norm": 1.0536854267120361, + "learning_rate": 2.7588832487309645e-05, + "loss": 0.7866, + "step": 7948 + }, + { + "epoch": 4.483361534122955, + "grad_norm": 1.633105754852295, + "learning_rate": 2.7586012408347434e-05, + "loss": 0.8459, + "step": 7949 + }, + { + "epoch": 4.483925549915398, + "grad_norm": 0.922346293926239, + "learning_rate": 2.7583192329385226e-05, + "loss": 0.6586, + "step": 7950 + }, + { + "epoch": 4.48448956570784, + "grad_norm": 1.2902454137802124, + "learning_rate": 2.7580372250423015e-05, + "loss": 0.7523, + "step": 7951 + }, + { + "epoch": 4.485053581500282, + "grad_norm": 1.050187110900879, + "learning_rate": 2.75775521714608e-05, + "loss": 0.6227, + "step": 7952 + }, + { + "epoch": 4.485617597292724, + "grad_norm": 0.8528400659561157, + "learning_rate": 2.757473209249859e-05, + "loss": 0.5648, + "step": 7953 + }, + { + "epoch": 4.486181613085167, + "grad_norm": 1.1093069314956665, + "learning_rate": 2.757191201353638e-05, + "loss": 0.7502, + "step": 7954 + }, + { + "epoch": 4.486745628877609, + "grad_norm": 1.7130024433135986, + "learning_rate": 2.756909193457417e-05, + "loss": 0.8173, + "step": 7955 + }, + { + "epoch": 4.4873096446700504, + "grad_norm": 1.188951849937439, + "learning_rate": 2.756627185561196e-05, + "loss": 0.7449, + "step": 7956 + }, + { + "epoch": 4.487873660462493, + "grad_norm": 0.9484512805938721, + "learning_rate": 2.7563451776649745e-05, + "loss": 0.7486, + "step": 7957 + }, + { + "epoch": 4.488437676254935, + "grad_norm": 1.212955355644226, + "learning_rate": 2.756063169768754e-05, + "loss": 0.7182, + "step": 7958 + }, + { + "epoch": 4.489001692047378, + "grad_norm": 1.3476731777191162, + "learning_rate": 2.7557811618725326e-05, + "loss": 0.8167, + "step": 7959 + }, + { + "epoch": 4.4895657078398195, + "grad_norm": 1.0895435810089111, + "learning_rate": 2.7554991539763114e-05, + "loss": 0.7238, + "step": 7960 + }, + { + "epoch": 4.490129723632261, + "grad_norm": 1.4308762550354004, + "learning_rate": 2.7552171460800903e-05, + "loss": 0.7948, + "step": 7961 + }, + { + "epoch": 4.490693739424704, + "grad_norm": 1.6038644313812256, + "learning_rate": 2.7549351381838695e-05, + "loss": 0.8433, + "step": 7962 + }, + { + "epoch": 4.491257755217146, + "grad_norm": 1.2406734228134155, + "learning_rate": 2.7546531302876484e-05, + "loss": 0.6936, + "step": 7963 + }, + { + "epoch": 4.4918217710095885, + "grad_norm": 1.1957300901412964, + "learning_rate": 2.754371122391427e-05, + "loss": 0.8292, + "step": 7964 + }, + { + "epoch": 4.49238578680203, + "grad_norm": 1.1038849353790283, + "learning_rate": 2.754089114495206e-05, + "loss": 0.8435, + "step": 7965 + }, + { + "epoch": 4.492949802594473, + "grad_norm": 1.8544846773147583, + "learning_rate": 2.753807106598985e-05, + "loss": 0.8074, + "step": 7966 + }, + { + "epoch": 4.493513818386915, + "grad_norm": 2.050154685974121, + "learning_rate": 2.753525098702764e-05, + "loss": 0.8282, + "step": 7967 + }, + { + "epoch": 4.4940778341793575, + "grad_norm": 1.4403573274612427, + "learning_rate": 2.7532430908065425e-05, + "loss": 0.7397, + "step": 7968 + }, + { + "epoch": 4.494641849971799, + "grad_norm": 0.9875137805938721, + "learning_rate": 2.7529610829103214e-05, + "loss": 0.6653, + "step": 7969 + }, + { + "epoch": 4.495205865764241, + "grad_norm": 1.1718251705169678, + "learning_rate": 2.7526790750141006e-05, + "loss": 0.7444, + "step": 7970 + }, + { + "epoch": 4.495769881556684, + "grad_norm": 1.2458274364471436, + "learning_rate": 2.7523970671178795e-05, + "loss": 0.7129, + "step": 7971 + }, + { + "epoch": 4.496333897349126, + "grad_norm": 1.2133488655090332, + "learning_rate": 2.7521150592216583e-05, + "loss": 0.7166, + "step": 7972 + }, + { + "epoch": 4.496897913141568, + "grad_norm": 1.015228509902954, + "learning_rate": 2.751833051325437e-05, + "loss": 0.7268, + "step": 7973 + }, + { + "epoch": 4.49746192893401, + "grad_norm": 2.0460705757141113, + "learning_rate": 2.7515510434292164e-05, + "loss": 0.8511, + "step": 7974 + }, + { + "epoch": 4.498025944726452, + "grad_norm": 1.452454686164856, + "learning_rate": 2.751269035532995e-05, + "loss": 0.7658, + "step": 7975 + }, + { + "epoch": 4.498589960518895, + "grad_norm": 1.5052322149276733, + "learning_rate": 2.750987027636774e-05, + "loss": 0.8143, + "step": 7976 + }, + { + "epoch": 4.499153976311336, + "grad_norm": 1.0802433490753174, + "learning_rate": 2.7507050197405527e-05, + "loss": 0.7749, + "step": 7977 + }, + { + "epoch": 4.499717992103779, + "grad_norm": 1.077263593673706, + "learning_rate": 2.750423011844332e-05, + "loss": 0.6451, + "step": 7978 + }, + { + "epoch": 4.500282007896221, + "grad_norm": 1.6322647333145142, + "learning_rate": 2.7501410039481108e-05, + "loss": 0.7274, + "step": 7979 + }, + { + "epoch": 4.500846023688664, + "grad_norm": 0.9310643076896667, + "learning_rate": 2.7498589960518894e-05, + "loss": 0.6265, + "step": 7980 + }, + { + "epoch": 4.501410039481105, + "grad_norm": 1.2432043552398682, + "learning_rate": 2.7495769881556682e-05, + "loss": 0.7917, + "step": 7981 + }, + { + "epoch": 4.501974055273548, + "grad_norm": 1.2451082468032837, + "learning_rate": 2.7492949802594475e-05, + "loss": 0.7515, + "step": 7982 + }, + { + "epoch": 4.50253807106599, + "grad_norm": 1.1306226253509521, + "learning_rate": 2.7490129723632263e-05, + "loss": 0.6673, + "step": 7983 + }, + { + "epoch": 4.503102086858432, + "grad_norm": 1.1296231746673584, + "learning_rate": 2.7487309644670052e-05, + "loss": 0.7895, + "step": 7984 + }, + { + "epoch": 4.503666102650874, + "grad_norm": 1.4161683320999146, + "learning_rate": 2.7484489565707844e-05, + "loss": 0.7995, + "step": 7985 + }, + { + "epoch": 4.504230118443316, + "grad_norm": 1.1553009748458862, + "learning_rate": 2.748166948674563e-05, + "loss": 0.7075, + "step": 7986 + }, + { + "epoch": 4.504794134235759, + "grad_norm": 0.9976653456687927, + "learning_rate": 2.747884940778342e-05, + "loss": 0.692, + "step": 7987 + }, + { + "epoch": 4.505358150028201, + "grad_norm": 1.8123928308486938, + "learning_rate": 2.7476029328821207e-05, + "loss": 0.8174, + "step": 7988 + }, + { + "epoch": 4.5059221658206425, + "grad_norm": 1.176949143409729, + "learning_rate": 2.7473209249859e-05, + "loss": 0.8328, + "step": 7989 + }, + { + "epoch": 4.506486181613085, + "grad_norm": 1.263251781463623, + "learning_rate": 2.747038917089679e-05, + "loss": 0.8137, + "step": 7990 + }, + { + "epoch": 4.507050197405527, + "grad_norm": 1.3381409645080566, + "learning_rate": 2.7467569091934574e-05, + "loss": 0.7824, + "step": 7991 + }, + { + "epoch": 4.50761421319797, + "grad_norm": 1.3420151472091675, + "learning_rate": 2.7464749012972363e-05, + "loss": 0.8116, + "step": 7992 + }, + { + "epoch": 4.5081782289904115, + "grad_norm": 1.3358381986618042, + "learning_rate": 2.7461928934010155e-05, + "loss": 0.7181, + "step": 7993 + }, + { + "epoch": 4.508742244782854, + "grad_norm": 1.879418969154358, + "learning_rate": 2.7459108855047944e-05, + "loss": 0.6904, + "step": 7994 + }, + { + "epoch": 4.509306260575296, + "grad_norm": 1.2653111219406128, + "learning_rate": 2.7456288776085732e-05, + "loss": 0.9311, + "step": 7995 + }, + { + "epoch": 4.509870276367739, + "grad_norm": 0.9628810882568359, + "learning_rate": 2.7453468697123518e-05, + "loss": 0.7829, + "step": 7996 + }, + { + "epoch": 4.5104342921601805, + "grad_norm": 1.312954306602478, + "learning_rate": 2.7450648618161313e-05, + "loss": 0.7433, + "step": 7997 + }, + { + "epoch": 4.510998307952622, + "grad_norm": 1.2871360778808594, + "learning_rate": 2.74478285391991e-05, + "loss": 0.7239, + "step": 7998 + }, + { + "epoch": 4.511562323745065, + "grad_norm": 1.1900944709777832, + "learning_rate": 2.7445008460236888e-05, + "loss": 0.7422, + "step": 7999 + }, + { + "epoch": 4.512126339537507, + "grad_norm": 1.0627062320709229, + "learning_rate": 2.7442188381274676e-05, + "loss": 0.6208, + "step": 8000 + }, + { + "epoch": 4.5126903553299496, + "grad_norm": 1.8188141584396362, + "learning_rate": 2.743936830231247e-05, + "loss": 0.8857, + "step": 8001 + }, + { + "epoch": 4.513254371122391, + "grad_norm": 1.2962315082550049, + "learning_rate": 2.7436548223350257e-05, + "loss": 0.7562, + "step": 8002 + }, + { + "epoch": 4.513818386914833, + "grad_norm": 1.6570883989334106, + "learning_rate": 2.7433728144388043e-05, + "loss": 0.7731, + "step": 8003 + }, + { + "epoch": 4.514382402707276, + "grad_norm": 0.9757227301597595, + "learning_rate": 2.743090806542583e-05, + "loss": 0.8163, + "step": 8004 + }, + { + "epoch": 4.514946418499718, + "grad_norm": 1.2422914505004883, + "learning_rate": 2.7428087986463624e-05, + "loss": 0.7029, + "step": 8005 + }, + { + "epoch": 4.51551043429216, + "grad_norm": 1.2118529081344604, + "learning_rate": 2.7425267907501413e-05, + "loss": 0.6891, + "step": 8006 + }, + { + "epoch": 4.516074450084602, + "grad_norm": 1.06470787525177, + "learning_rate": 2.7422447828539198e-05, + "loss": 0.7348, + "step": 8007 + }, + { + "epoch": 4.516638465877045, + "grad_norm": 1.2173393964767456, + "learning_rate": 2.7419627749576987e-05, + "loss": 0.8117, + "step": 8008 + }, + { + "epoch": 4.517202481669487, + "grad_norm": 0.9870374798774719, + "learning_rate": 2.741680767061478e-05, + "loss": 0.7513, + "step": 8009 + }, + { + "epoch": 4.517766497461929, + "grad_norm": 0.967252790927887, + "learning_rate": 2.7413987591652568e-05, + "loss": 0.7332, + "step": 8010 + }, + { + "epoch": 4.518330513254371, + "grad_norm": 1.773478388786316, + "learning_rate": 2.7411167512690357e-05, + "loss": 0.872, + "step": 8011 + }, + { + "epoch": 4.518894529046813, + "grad_norm": 1.008494257926941, + "learning_rate": 2.7408347433728142e-05, + "loss": 0.7176, + "step": 8012 + }, + { + "epoch": 4.519458544839256, + "grad_norm": 1.7028956413269043, + "learning_rate": 2.7405527354765938e-05, + "loss": 0.805, + "step": 8013 + }, + { + "epoch": 4.5200225606316975, + "grad_norm": 1.225818157196045, + "learning_rate": 2.7402707275803723e-05, + "loss": 0.7929, + "step": 8014 + }, + { + "epoch": 4.52058657642414, + "grad_norm": 1.3641321659088135, + "learning_rate": 2.7399887196841512e-05, + "loss": 0.733, + "step": 8015 + }, + { + "epoch": 4.521150592216582, + "grad_norm": 1.0531129837036133, + "learning_rate": 2.73970671178793e-05, + "loss": 0.7173, + "step": 8016 + }, + { + "epoch": 4.521714608009024, + "grad_norm": 1.2356510162353516, + "learning_rate": 2.7394247038917093e-05, + "loss": 0.5984, + "step": 8017 + }, + { + "epoch": 4.5222786238014665, + "grad_norm": 1.6098136901855469, + "learning_rate": 2.739142695995488e-05, + "loss": 0.8613, + "step": 8018 + }, + { + "epoch": 4.522842639593908, + "grad_norm": 0.9359008073806763, + "learning_rate": 2.7388606880992667e-05, + "loss": 0.7864, + "step": 8019 + }, + { + "epoch": 4.523406655386351, + "grad_norm": 2.588550329208374, + "learning_rate": 2.7385786802030462e-05, + "loss": 0.8636, + "step": 8020 + }, + { + "epoch": 4.523970671178793, + "grad_norm": 1.4904630184173584, + "learning_rate": 2.7382966723068248e-05, + "loss": 0.8013, + "step": 8021 + }, + { + "epoch": 4.5245346869712355, + "grad_norm": 1.1709176301956177, + "learning_rate": 2.7380146644106037e-05, + "loss": 0.7088, + "step": 8022 + }, + { + "epoch": 4.525098702763677, + "grad_norm": 1.416947364807129, + "learning_rate": 2.7377326565143822e-05, + "loss": 0.7611, + "step": 8023 + }, + { + "epoch": 4.52566271855612, + "grad_norm": 1.1049916744232178, + "learning_rate": 2.7374506486181618e-05, + "loss": 0.6886, + "step": 8024 + }, + { + "epoch": 4.526226734348562, + "grad_norm": 1.2388840913772583, + "learning_rate": 2.7371686407219403e-05, + "loss": 0.6707, + "step": 8025 + }, + { + "epoch": 4.526790750141004, + "grad_norm": 1.2087689638137817, + "learning_rate": 2.7368866328257192e-05, + "loss": 0.7523, + "step": 8026 + }, + { + "epoch": 4.527354765933446, + "grad_norm": 1.1538933515548706, + "learning_rate": 2.736604624929498e-05, + "loss": 0.7, + "step": 8027 + }, + { + "epoch": 4.527918781725888, + "grad_norm": 1.2488157749176025, + "learning_rate": 2.7363226170332773e-05, + "loss": 0.6797, + "step": 8028 + }, + { + "epoch": 4.528482797518331, + "grad_norm": 1.6645183563232422, + "learning_rate": 2.736040609137056e-05, + "loss": 0.8186, + "step": 8029 + }, + { + "epoch": 4.529046813310773, + "grad_norm": 1.6408928632736206, + "learning_rate": 2.7357586012408347e-05, + "loss": 0.7793, + "step": 8030 + }, + { + "epoch": 4.529610829103214, + "grad_norm": 1.5658766031265259, + "learning_rate": 2.7354765933446136e-05, + "loss": 0.8601, + "step": 8031 + }, + { + "epoch": 4.530174844895657, + "grad_norm": 0.9768356084823608, + "learning_rate": 2.7351945854483928e-05, + "loss": 0.6264, + "step": 8032 + }, + { + "epoch": 4.530738860688099, + "grad_norm": 1.3569313287734985, + "learning_rate": 2.7349125775521717e-05, + "loss": 0.744, + "step": 8033 + }, + { + "epoch": 4.531302876480542, + "grad_norm": 0.9643110632896423, + "learning_rate": 2.7346305696559506e-05, + "loss": 0.7839, + "step": 8034 + }, + { + "epoch": 4.5318668922729834, + "grad_norm": 0.9289801120758057, + "learning_rate": 2.734348561759729e-05, + "loss": 0.7495, + "step": 8035 + }, + { + "epoch": 4.532430908065426, + "grad_norm": 1.59648597240448, + "learning_rate": 2.7340665538635087e-05, + "loss": 0.7813, + "step": 8036 + }, + { + "epoch": 4.532994923857868, + "grad_norm": 1.5485954284667969, + "learning_rate": 2.7337845459672872e-05, + "loss": 0.7161, + "step": 8037 + }, + { + "epoch": 4.533558939650311, + "grad_norm": 1.1993775367736816, + "learning_rate": 2.733502538071066e-05, + "loss": 0.7279, + "step": 8038 + }, + { + "epoch": 4.5341229554427525, + "grad_norm": 0.9670122265815735, + "learning_rate": 2.733220530174845e-05, + "loss": 0.6615, + "step": 8039 + }, + { + "epoch": 4.534686971235194, + "grad_norm": 1.1789612770080566, + "learning_rate": 2.7329385222786242e-05, + "loss": 0.714, + "step": 8040 + }, + { + "epoch": 4.535250987027637, + "grad_norm": 1.2608141899108887, + "learning_rate": 2.7326565143824027e-05, + "loss": 0.8396, + "step": 8041 + }, + { + "epoch": 4.535815002820079, + "grad_norm": 1.8277554512023926, + "learning_rate": 2.7323745064861816e-05, + "loss": 0.7218, + "step": 8042 + }, + { + "epoch": 4.5363790186125215, + "grad_norm": 1.8076460361480713, + "learning_rate": 2.7320924985899605e-05, + "loss": 0.8269, + "step": 8043 + }, + { + "epoch": 4.536943034404963, + "grad_norm": 1.083612084388733, + "learning_rate": 2.7318104906937397e-05, + "loss": 0.6739, + "step": 8044 + }, + { + "epoch": 4.537507050197405, + "grad_norm": 1.2487272024154663, + "learning_rate": 2.7315284827975186e-05, + "loss": 0.6997, + "step": 8045 + }, + { + "epoch": 4.538071065989848, + "grad_norm": 0.9870285391807556, + "learning_rate": 2.731246474901297e-05, + "loss": 0.6529, + "step": 8046 + }, + { + "epoch": 4.53863508178229, + "grad_norm": 1.3708252906799316, + "learning_rate": 2.730964467005076e-05, + "loss": 0.846, + "step": 8047 + }, + { + "epoch": 4.539199097574732, + "grad_norm": 1.083740472793579, + "learning_rate": 2.7306824591088552e-05, + "loss": 0.6676, + "step": 8048 + }, + { + "epoch": 4.539763113367174, + "grad_norm": 1.3472577333450317, + "learning_rate": 2.730400451212634e-05, + "loss": 0.6616, + "step": 8049 + }, + { + "epoch": 4.540327129159617, + "grad_norm": 0.9035170078277588, + "learning_rate": 2.730118443316413e-05, + "loss": 0.6006, + "step": 8050 + }, + { + "epoch": 4.540891144952059, + "grad_norm": 1.3256490230560303, + "learning_rate": 2.7298364354201915e-05, + "loss": 0.8467, + "step": 8051 + }, + { + "epoch": 4.541455160744501, + "grad_norm": 0.9697229266166687, + "learning_rate": 2.729554427523971e-05, + "loss": 0.756, + "step": 8052 + }, + { + "epoch": 4.542019176536943, + "grad_norm": 1.4848092794418335, + "learning_rate": 2.7292724196277496e-05, + "loss": 0.7585, + "step": 8053 + }, + { + "epoch": 4.542583192329385, + "grad_norm": 1.6045182943344116, + "learning_rate": 2.7289904117315285e-05, + "loss": 0.8141, + "step": 8054 + }, + { + "epoch": 4.543147208121828, + "grad_norm": 1.1839722394943237, + "learning_rate": 2.7287084038353077e-05, + "loss": 0.7366, + "step": 8055 + }, + { + "epoch": 4.543711223914269, + "grad_norm": 1.0886529684066772, + "learning_rate": 2.7284263959390866e-05, + "loss": 0.7547, + "step": 8056 + }, + { + "epoch": 4.544275239706712, + "grad_norm": 0.9424518942832947, + "learning_rate": 2.7281443880428655e-05, + "loss": 0.6936, + "step": 8057 + }, + { + "epoch": 4.544839255499154, + "grad_norm": 1.315578579902649, + "learning_rate": 2.727862380146644e-05, + "loss": 0.6749, + "step": 8058 + }, + { + "epoch": 4.545403271291596, + "grad_norm": 1.7961657047271729, + "learning_rate": 2.7275803722504232e-05, + "loss": 0.8605, + "step": 8059 + }, + { + "epoch": 4.545967287084038, + "grad_norm": 1.1383788585662842, + "learning_rate": 2.727298364354202e-05, + "loss": 0.6777, + "step": 8060 + }, + { + "epoch": 4.54653130287648, + "grad_norm": 1.013398289680481, + "learning_rate": 2.727016356457981e-05, + "loss": 0.7315, + "step": 8061 + }, + { + "epoch": 4.547095318668923, + "grad_norm": 1.903404951095581, + "learning_rate": 2.7267343485617595e-05, + "loss": 0.8055, + "step": 8062 + }, + { + "epoch": 4.547659334461365, + "grad_norm": 1.3780938386917114, + "learning_rate": 2.726452340665539e-05, + "loss": 0.7512, + "step": 8063 + }, + { + "epoch": 4.548223350253807, + "grad_norm": 0.9686315655708313, + "learning_rate": 2.7261703327693176e-05, + "loss": 0.7529, + "step": 8064 + }, + { + "epoch": 4.548787366046249, + "grad_norm": 1.043375849723816, + "learning_rate": 2.7258883248730965e-05, + "loss": 0.6864, + "step": 8065 + }, + { + "epoch": 4.549351381838692, + "grad_norm": 1.1075788736343384, + "learning_rate": 2.7256063169768754e-05, + "loss": 0.641, + "step": 8066 + }, + { + "epoch": 4.549915397631134, + "grad_norm": 1.6821062564849854, + "learning_rate": 2.7253243090806546e-05, + "loss": 0.8774, + "step": 8067 + }, + { + "epoch": 4.5504794134235755, + "grad_norm": 0.8100453019142151, + "learning_rate": 2.7250423011844335e-05, + "loss": 0.6455, + "step": 8068 + }, + { + "epoch": 4.551043429216018, + "grad_norm": 1.7191294431686401, + "learning_rate": 2.724760293288212e-05, + "loss": 0.8845, + "step": 8069 + }, + { + "epoch": 4.55160744500846, + "grad_norm": 1.5845147371292114, + "learning_rate": 2.724478285391991e-05, + "loss": 0.7653, + "step": 8070 + }, + { + "epoch": 4.552171460800903, + "grad_norm": 1.2893319129943848, + "learning_rate": 2.72419627749577e-05, + "loss": 0.7687, + "step": 8071 + }, + { + "epoch": 4.5527354765933445, + "grad_norm": 2.092350482940674, + "learning_rate": 2.723914269599549e-05, + "loss": 0.7853, + "step": 8072 + }, + { + "epoch": 4.553299492385786, + "grad_norm": 0.8371962308883667, + "learning_rate": 2.723632261703328e-05, + "loss": 0.674, + "step": 8073 + }, + { + "epoch": 4.553863508178229, + "grad_norm": 0.7747265100479126, + "learning_rate": 2.7233502538071064e-05, + "loss": 0.6049, + "step": 8074 + }, + { + "epoch": 4.554427523970671, + "grad_norm": 2.2793283462524414, + "learning_rate": 2.723068245910886e-05, + "loss": 0.8743, + "step": 8075 + }, + { + "epoch": 4.5549915397631136, + "grad_norm": 1.4527971744537354, + "learning_rate": 2.7227862380146645e-05, + "loss": 0.7544, + "step": 8076 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.1859880685806274, + "learning_rate": 2.7225042301184434e-05, + "loss": 0.7146, + "step": 8077 + }, + { + "epoch": 4.556119571347998, + "grad_norm": 1.2389464378356934, + "learning_rate": 2.7222222222222223e-05, + "loss": 0.7619, + "step": 8078 + }, + { + "epoch": 4.55668358714044, + "grad_norm": 1.0283236503601074, + "learning_rate": 2.7219402143260015e-05, + "loss": 0.6157, + "step": 8079 + }, + { + "epoch": 4.557247602932883, + "grad_norm": 1.3395459651947021, + "learning_rate": 2.72165820642978e-05, + "loss": 0.7825, + "step": 8080 + }, + { + "epoch": 4.557811618725324, + "grad_norm": 2.1666905879974365, + "learning_rate": 2.721376198533559e-05, + "loss": 0.8701, + "step": 8081 + }, + { + "epoch": 4.558375634517766, + "grad_norm": 1.20436692237854, + "learning_rate": 2.7210941906373378e-05, + "loss": 0.8097, + "step": 8082 + }, + { + "epoch": 4.558939650310209, + "grad_norm": 0.8993039131164551, + "learning_rate": 2.720812182741117e-05, + "loss": 0.7078, + "step": 8083 + }, + { + "epoch": 4.559503666102651, + "grad_norm": 1.1231766939163208, + "learning_rate": 2.720530174844896e-05, + "loss": 0.715, + "step": 8084 + }, + { + "epoch": 4.560067681895093, + "grad_norm": 1.0157058238983154, + "learning_rate": 2.7202481669486744e-05, + "loss": 0.803, + "step": 8085 + }, + { + "epoch": 4.560631697687535, + "grad_norm": 0.8425105214118958, + "learning_rate": 2.7199661590524533e-05, + "loss": 0.6752, + "step": 8086 + }, + { + "epoch": 4.561195713479977, + "grad_norm": 1.2126343250274658, + "learning_rate": 2.7196841511562325e-05, + "loss": 0.8262, + "step": 8087 + }, + { + "epoch": 4.56175972927242, + "grad_norm": 1.192361831665039, + "learning_rate": 2.7194021432600114e-05, + "loss": 0.7767, + "step": 8088 + }, + { + "epoch": 4.5623237450648615, + "grad_norm": 1.572264313697815, + "learning_rate": 2.7191201353637903e-05, + "loss": 0.7945, + "step": 8089 + }, + { + "epoch": 4.562887760857304, + "grad_norm": 1.344309687614441, + "learning_rate": 2.7188381274675695e-05, + "loss": 0.8259, + "step": 8090 + }, + { + "epoch": 4.563451776649746, + "grad_norm": 1.6310311555862427, + "learning_rate": 2.7185561195713484e-05, + "loss": 0.7431, + "step": 8091 + }, + { + "epoch": 4.564015792442189, + "grad_norm": 1.1971914768218994, + "learning_rate": 2.718274111675127e-05, + "loss": 0.7575, + "step": 8092 + }, + { + "epoch": 4.5645798082346305, + "grad_norm": 1.3698488473892212, + "learning_rate": 2.7179921037789058e-05, + "loss": 0.7581, + "step": 8093 + }, + { + "epoch": 4.565143824027073, + "grad_norm": 1.310232162475586, + "learning_rate": 2.717710095882685e-05, + "loss": 0.816, + "step": 8094 + }, + { + "epoch": 4.565707839819515, + "grad_norm": 1.2565078735351562, + "learning_rate": 2.717428087986464e-05, + "loss": 0.7061, + "step": 8095 + }, + { + "epoch": 4.566271855611957, + "grad_norm": 1.1493419408798218, + "learning_rate": 2.7171460800902428e-05, + "loss": 0.7662, + "step": 8096 + }, + { + "epoch": 4.5668358714043995, + "grad_norm": 1.0099272727966309, + "learning_rate": 2.7168640721940213e-05, + "loss": 0.6869, + "step": 8097 + }, + { + "epoch": 4.567399887196841, + "grad_norm": 1.6928309202194214, + "learning_rate": 2.7165820642978006e-05, + "loss": 0.847, + "step": 8098 + }, + { + "epoch": 4.567963902989284, + "grad_norm": 1.7460973262786865, + "learning_rate": 2.7163000564015794e-05, + "loss": 0.875, + "step": 8099 + }, + { + "epoch": 4.568527918781726, + "grad_norm": 1.1546897888183594, + "learning_rate": 2.7160180485053583e-05, + "loss": 0.6942, + "step": 8100 + }, + { + "epoch": 4.569091934574168, + "grad_norm": 1.2924442291259766, + "learning_rate": 2.715736040609137e-05, + "loss": 0.7381, + "step": 8101 + }, + { + "epoch": 4.56965595036661, + "grad_norm": 1.2076711654663086, + "learning_rate": 2.7154540327129164e-05, + "loss": 0.8133, + "step": 8102 + }, + { + "epoch": 4.570219966159052, + "grad_norm": 1.2501667737960815, + "learning_rate": 2.715172024816695e-05, + "loss": 0.708, + "step": 8103 + }, + { + "epoch": 4.570783981951495, + "grad_norm": 1.9368764162063599, + "learning_rate": 2.714890016920474e-05, + "loss": 0.8136, + "step": 8104 + }, + { + "epoch": 4.571347997743937, + "grad_norm": 1.5371242761611938, + "learning_rate": 2.7146080090242527e-05, + "loss": 0.8052, + "step": 8105 + }, + { + "epoch": 4.571912013536379, + "grad_norm": 1.1063650846481323, + "learning_rate": 2.714326001128032e-05, + "loss": 0.7877, + "step": 8106 + }, + { + "epoch": 4.572476029328821, + "grad_norm": 0.9444076418876648, + "learning_rate": 2.7140439932318108e-05, + "loss": 0.7507, + "step": 8107 + }, + { + "epoch": 4.573040045121264, + "grad_norm": 1.2938740253448486, + "learning_rate": 2.7137619853355894e-05, + "loss": 0.8112, + "step": 8108 + }, + { + "epoch": 4.573604060913706, + "grad_norm": 1.4850025177001953, + "learning_rate": 2.7134799774393682e-05, + "loss": 0.7361, + "step": 8109 + }, + { + "epoch": 4.5741680767061474, + "grad_norm": 1.1514499187469482, + "learning_rate": 2.7131979695431475e-05, + "loss": 0.6816, + "step": 8110 + }, + { + "epoch": 4.57473209249859, + "grad_norm": 0.9522131681442261, + "learning_rate": 2.7129159616469263e-05, + "loss": 0.732, + "step": 8111 + }, + { + "epoch": 4.575296108291032, + "grad_norm": 1.0292596817016602, + "learning_rate": 2.7126339537507052e-05, + "loss": 0.6382, + "step": 8112 + }, + { + "epoch": 4.575860124083475, + "grad_norm": 1.364276647567749, + "learning_rate": 2.7123519458544838e-05, + "loss": 0.704, + "step": 8113 + }, + { + "epoch": 4.5764241398759165, + "grad_norm": 1.2385932207107544, + "learning_rate": 2.7120699379582633e-05, + "loss": 0.719, + "step": 8114 + }, + { + "epoch": 4.576988155668358, + "grad_norm": 1.6726802587509155, + "learning_rate": 2.711787930062042e-05, + "loss": 0.81, + "step": 8115 + }, + { + "epoch": 4.577552171460801, + "grad_norm": 1.4418696165084839, + "learning_rate": 2.7115059221658207e-05, + "loss": 0.8871, + "step": 8116 + }, + { + "epoch": 4.578116187253243, + "grad_norm": 1.221822738647461, + "learning_rate": 2.7112239142695993e-05, + "loss": 0.7459, + "step": 8117 + }, + { + "epoch": 4.5786802030456855, + "grad_norm": 1.5103105306625366, + "learning_rate": 2.7109419063733788e-05, + "loss": 0.7676, + "step": 8118 + }, + { + "epoch": 4.579244218838127, + "grad_norm": 0.9597957134246826, + "learning_rate": 2.7106598984771574e-05, + "loss": 0.7457, + "step": 8119 + }, + { + "epoch": 4.57980823463057, + "grad_norm": 1.2437760829925537, + "learning_rate": 2.7103778905809362e-05, + "loss": 0.8343, + "step": 8120 + }, + { + "epoch": 4.580372250423012, + "grad_norm": 1.2382175922393799, + "learning_rate": 2.710095882684715e-05, + "loss": 0.7468, + "step": 8121 + }, + { + "epoch": 4.5809362662154545, + "grad_norm": 1.1955524682998657, + "learning_rate": 2.7098138747884943e-05, + "loss": 0.7533, + "step": 8122 + }, + { + "epoch": 4.581500282007896, + "grad_norm": 1.1422232389450073, + "learning_rate": 2.7095318668922732e-05, + "loss": 0.7808, + "step": 8123 + }, + { + "epoch": 4.582064297800338, + "grad_norm": 2.7328908443450928, + "learning_rate": 2.7092498589960518e-05, + "loss": 0.7049, + "step": 8124 + }, + { + "epoch": 4.582628313592781, + "grad_norm": 1.0864887237548828, + "learning_rate": 2.7089678510998313e-05, + "loss": 0.7728, + "step": 8125 + }, + { + "epoch": 4.583192329385223, + "grad_norm": 1.0271179676055908, + "learning_rate": 2.70868584320361e-05, + "loss": 0.7549, + "step": 8126 + }, + { + "epoch": 4.583756345177665, + "grad_norm": 1.0070723295211792, + "learning_rate": 2.7084038353073887e-05, + "loss": 0.7312, + "step": 8127 + }, + { + "epoch": 4.584320360970107, + "grad_norm": 1.486029028892517, + "learning_rate": 2.7081218274111676e-05, + "loss": 0.7438, + "step": 8128 + }, + { + "epoch": 4.584884376762549, + "grad_norm": 1.7190947532653809, + "learning_rate": 2.707839819514947e-05, + "loss": 0.8108, + "step": 8129 + }, + { + "epoch": 4.585448392554992, + "grad_norm": 0.9992444515228271, + "learning_rate": 2.7075578116187257e-05, + "loss": 0.7409, + "step": 8130 + }, + { + "epoch": 4.586012408347433, + "grad_norm": 1.2206835746765137, + "learning_rate": 2.7072758037225043e-05, + "loss": 0.7554, + "step": 8131 + }, + { + "epoch": 4.586576424139876, + "grad_norm": 0.9668298959732056, + "learning_rate": 2.706993795826283e-05, + "loss": 0.6725, + "step": 8132 + }, + { + "epoch": 4.587140439932318, + "grad_norm": 1.4129152297973633, + "learning_rate": 2.7067117879300624e-05, + "loss": 0.7894, + "step": 8133 + }, + { + "epoch": 4.587704455724761, + "grad_norm": 1.3310500383377075, + "learning_rate": 2.7064297800338412e-05, + "loss": 0.7471, + "step": 8134 + }, + { + "epoch": 4.588268471517202, + "grad_norm": 0.9094542860984802, + "learning_rate": 2.7061477721376198e-05, + "loss": 0.7484, + "step": 8135 + }, + { + "epoch": 4.588832487309645, + "grad_norm": 0.8812330961227417, + "learning_rate": 2.7058657642413987e-05, + "loss": 0.7077, + "step": 8136 + }, + { + "epoch": 4.589396503102087, + "grad_norm": 1.0717130899429321, + "learning_rate": 2.705583756345178e-05, + "loss": 0.7478, + "step": 8137 + }, + { + "epoch": 4.589960518894529, + "grad_norm": 1.1310954093933105, + "learning_rate": 2.7053017484489568e-05, + "loss": 0.7826, + "step": 8138 + }, + { + "epoch": 4.590524534686971, + "grad_norm": 1.6038291454315186, + "learning_rate": 2.7050197405527356e-05, + "loss": 0.8364, + "step": 8139 + }, + { + "epoch": 4.591088550479413, + "grad_norm": 1.1809762716293335, + "learning_rate": 2.7047377326565142e-05, + "loss": 0.816, + "step": 8140 + }, + { + "epoch": 4.591652566271856, + "grad_norm": 0.9227509498596191, + "learning_rate": 2.7044557247602937e-05, + "loss": 0.6996, + "step": 8141 + }, + { + "epoch": 4.592216582064298, + "grad_norm": 0.9993546009063721, + "learning_rate": 2.7041737168640723e-05, + "loss": 0.7442, + "step": 8142 + }, + { + "epoch": 4.5927805978567395, + "grad_norm": 3.946467161178589, + "learning_rate": 2.703891708967851e-05, + "loss": 0.882, + "step": 8143 + }, + { + "epoch": 4.593344613649182, + "grad_norm": 0.9501612782478333, + "learning_rate": 2.70360970107163e-05, + "loss": 0.7546, + "step": 8144 + }, + { + "epoch": 4.593908629441624, + "grad_norm": 0.8882330656051636, + "learning_rate": 2.7033276931754093e-05, + "loss": 0.7121, + "step": 8145 + }, + { + "epoch": 4.594472645234067, + "grad_norm": 1.4512298107147217, + "learning_rate": 2.703045685279188e-05, + "loss": 0.7314, + "step": 8146 + }, + { + "epoch": 4.5950366610265085, + "grad_norm": 1.4153070449829102, + "learning_rate": 2.7027636773829667e-05, + "loss": 0.7778, + "step": 8147 + }, + { + "epoch": 4.595600676818951, + "grad_norm": 0.8456439971923828, + "learning_rate": 2.7024816694867456e-05, + "loss": 0.6784, + "step": 8148 + }, + { + "epoch": 4.596164692611393, + "grad_norm": 1.074131965637207, + "learning_rate": 2.7021996615905248e-05, + "loss": 0.7155, + "step": 8149 + }, + { + "epoch": 4.596728708403836, + "grad_norm": 1.0093350410461426, + "learning_rate": 2.7019176536943037e-05, + "loss": 0.7167, + "step": 8150 + }, + { + "epoch": 4.5972927241962775, + "grad_norm": 1.003488302230835, + "learning_rate": 2.7016356457980825e-05, + "loss": 0.6971, + "step": 8151 + }, + { + "epoch": 4.597856739988719, + "grad_norm": 0.9370847940444946, + "learning_rate": 2.701353637901861e-05, + "loss": 0.7329, + "step": 8152 + }, + { + "epoch": 4.598420755781162, + "grad_norm": 1.4466925859451294, + "learning_rate": 2.7010716300056403e-05, + "loss": 0.8288, + "step": 8153 + }, + { + "epoch": 4.598984771573604, + "grad_norm": 1.3804813623428345, + "learning_rate": 2.7007896221094192e-05, + "loss": 0.7341, + "step": 8154 + }, + { + "epoch": 4.5995487873660466, + "grad_norm": 1.2964533567428589, + "learning_rate": 2.700507614213198e-05, + "loss": 0.8003, + "step": 8155 + }, + { + "epoch": 4.600112803158488, + "grad_norm": 1.1060402393341064, + "learning_rate": 2.7002256063169766e-05, + "loss": 0.6695, + "step": 8156 + }, + { + "epoch": 4.60067681895093, + "grad_norm": 1.4075489044189453, + "learning_rate": 2.699943598420756e-05, + "loss": 0.8322, + "step": 8157 + }, + { + "epoch": 4.601240834743373, + "grad_norm": 1.16313636302948, + "learning_rate": 2.6996615905245347e-05, + "loss": 0.8123, + "step": 8158 + }, + { + "epoch": 4.601804850535815, + "grad_norm": 1.1352421045303345, + "learning_rate": 2.6993795826283136e-05, + "loss": 0.6484, + "step": 8159 + }, + { + "epoch": 4.602368866328257, + "grad_norm": 1.3266091346740723, + "learning_rate": 2.6990975747320928e-05, + "loss": 0.7196, + "step": 8160 + }, + { + "epoch": 4.602932882120699, + "grad_norm": 1.0341322422027588, + "learning_rate": 2.6988155668358717e-05, + "loss": 0.6519, + "step": 8161 + }, + { + "epoch": 4.603496897913142, + "grad_norm": 0.9289688467979431, + "learning_rate": 2.6985335589396505e-05, + "loss": 0.6645, + "step": 8162 + }, + { + "epoch": 4.604060913705584, + "grad_norm": 1.2581645250320435, + "learning_rate": 2.698251551043429e-05, + "loss": 0.6765, + "step": 8163 + }, + { + "epoch": 4.604624929498026, + "grad_norm": 1.366866946220398, + "learning_rate": 2.6979695431472086e-05, + "loss": 0.7908, + "step": 8164 + }, + { + "epoch": 4.605188945290468, + "grad_norm": 1.207604169845581, + "learning_rate": 2.6976875352509872e-05, + "loss": 0.778, + "step": 8165 + }, + { + "epoch": 4.60575296108291, + "grad_norm": 1.6586357355117798, + "learning_rate": 2.697405527354766e-05, + "loss": 0.8257, + "step": 8166 + }, + { + "epoch": 4.606316976875353, + "grad_norm": 1.0779286623001099, + "learning_rate": 2.697123519458545e-05, + "loss": 0.8384, + "step": 8167 + }, + { + "epoch": 4.6068809926677945, + "grad_norm": 1.452081561088562, + "learning_rate": 2.696841511562324e-05, + "loss": 0.8612, + "step": 8168 + }, + { + "epoch": 4.607445008460237, + "grad_norm": 0.9123396873474121, + "learning_rate": 2.696559503666103e-05, + "loss": 0.7369, + "step": 8169 + }, + { + "epoch": 4.608009024252679, + "grad_norm": 1.155196189880371, + "learning_rate": 2.6962774957698816e-05, + "loss": 0.7012, + "step": 8170 + }, + { + "epoch": 4.608573040045121, + "grad_norm": 1.2814968824386597, + "learning_rate": 2.6959954878736605e-05, + "loss": 0.8068, + "step": 8171 + }, + { + "epoch": 4.6091370558375635, + "grad_norm": 2.0611953735351562, + "learning_rate": 2.6957134799774397e-05, + "loss": 0.6628, + "step": 8172 + }, + { + "epoch": 4.609701071630005, + "grad_norm": 1.3551603555679321, + "learning_rate": 2.6954314720812186e-05, + "loss": 0.7813, + "step": 8173 + }, + { + "epoch": 4.610265087422448, + "grad_norm": 1.4720028638839722, + "learning_rate": 2.695149464184997e-05, + "loss": 0.7929, + "step": 8174 + }, + { + "epoch": 4.61082910321489, + "grad_norm": 1.1865637302398682, + "learning_rate": 2.694867456288776e-05, + "loss": 0.6178, + "step": 8175 + }, + { + "epoch": 4.6113931190073325, + "grad_norm": 1.0945483446121216, + "learning_rate": 2.6945854483925552e-05, + "loss": 0.6536, + "step": 8176 + }, + { + "epoch": 4.611957134799774, + "grad_norm": 1.2392383813858032, + "learning_rate": 2.694303440496334e-05, + "loss": 0.6242, + "step": 8177 + }, + { + "epoch": 4.612521150592217, + "grad_norm": 1.3237828016281128, + "learning_rate": 2.694021432600113e-05, + "loss": 0.8019, + "step": 8178 + }, + { + "epoch": 4.613085166384659, + "grad_norm": 1.3906424045562744, + "learning_rate": 2.6937394247038915e-05, + "loss": 0.767, + "step": 8179 + }, + { + "epoch": 4.613649182177101, + "grad_norm": 1.5433396100997925, + "learning_rate": 2.693457416807671e-05, + "loss": 0.7563, + "step": 8180 + }, + { + "epoch": 4.614213197969543, + "grad_norm": 1.0607086420059204, + "learning_rate": 2.6931754089114496e-05, + "loss": 0.7974, + "step": 8181 + }, + { + "epoch": 4.614777213761985, + "grad_norm": 1.183859944343567, + "learning_rate": 2.6928934010152285e-05, + "loss": 0.6711, + "step": 8182 + }, + { + "epoch": 4.615341229554428, + "grad_norm": 1.0469319820404053, + "learning_rate": 2.6926113931190074e-05, + "loss": 0.8152, + "step": 8183 + }, + { + "epoch": 4.61590524534687, + "grad_norm": 1.2148023843765259, + "learning_rate": 2.6923293852227866e-05, + "loss": 0.7655, + "step": 8184 + }, + { + "epoch": 4.616469261139311, + "grad_norm": 1.6986414194107056, + "learning_rate": 2.6920473773265655e-05, + "loss": 0.8213, + "step": 8185 + }, + { + "epoch": 4.617033276931754, + "grad_norm": 1.28317391872406, + "learning_rate": 2.691765369430344e-05, + "loss": 0.7031, + "step": 8186 + }, + { + "epoch": 4.617597292724196, + "grad_norm": 1.33539879322052, + "learning_rate": 2.691483361534123e-05, + "loss": 0.7817, + "step": 8187 + }, + { + "epoch": 4.618161308516639, + "grad_norm": 1.1767401695251465, + "learning_rate": 2.691201353637902e-05, + "loss": 0.7641, + "step": 8188 + }, + { + "epoch": 4.6187253243090804, + "grad_norm": 0.9438018202781677, + "learning_rate": 2.690919345741681e-05, + "loss": 0.6459, + "step": 8189 + }, + { + "epoch": 4.619289340101523, + "grad_norm": 1.5833356380462646, + "learning_rate": 2.6906373378454595e-05, + "loss": 0.8314, + "step": 8190 + }, + { + "epoch": 4.619853355893965, + "grad_norm": 1.1879756450653076, + "learning_rate": 2.6903553299492384e-05, + "loss": 0.8261, + "step": 8191 + }, + { + "epoch": 4.620417371686408, + "grad_norm": 0.8844581246376038, + "learning_rate": 2.6900733220530176e-05, + "loss": 0.7109, + "step": 8192 + }, + { + "epoch": 4.6209813874788495, + "grad_norm": 0.9572518467903137, + "learning_rate": 2.6897913141567965e-05, + "loss": 0.6574, + "step": 8193 + }, + { + "epoch": 4.621545403271291, + "grad_norm": 0.8805217742919922, + "learning_rate": 2.6895093062605754e-05, + "loss": 0.6418, + "step": 8194 + }, + { + "epoch": 4.622109419063734, + "grad_norm": 1.4946520328521729, + "learning_rate": 2.6892272983643546e-05, + "loss": 0.8117, + "step": 8195 + }, + { + "epoch": 4.622673434856176, + "grad_norm": 1.230328917503357, + "learning_rate": 2.6889452904681335e-05, + "loss": 0.7194, + "step": 8196 + }, + { + "epoch": 4.6232374506486185, + "grad_norm": 1.2107679843902588, + "learning_rate": 2.688663282571912e-05, + "loss": 0.7483, + "step": 8197 + }, + { + "epoch": 4.62380146644106, + "grad_norm": 1.4062018394470215, + "learning_rate": 2.688381274675691e-05, + "loss": 0.8273, + "step": 8198 + }, + { + "epoch": 4.624365482233502, + "grad_norm": 1.5607631206512451, + "learning_rate": 2.68809926677947e-05, + "loss": 0.7879, + "step": 8199 + }, + { + "epoch": 4.624929498025945, + "grad_norm": 1.4308658838272095, + "learning_rate": 2.687817258883249e-05, + "loss": 0.7829, + "step": 8200 + }, + { + "epoch": 4.625493513818387, + "grad_norm": 1.364711880683899, + "learning_rate": 2.687535250987028e-05, + "loss": 0.7332, + "step": 8201 + }, + { + "epoch": 4.626057529610829, + "grad_norm": 1.047344446182251, + "learning_rate": 2.6872532430908064e-05, + "loss": 0.7147, + "step": 8202 + }, + { + "epoch": 4.626621545403271, + "grad_norm": 3.7056679725646973, + "learning_rate": 2.686971235194586e-05, + "loss": 0.9466, + "step": 8203 + }, + { + "epoch": 4.627185561195714, + "grad_norm": 1.3189103603363037, + "learning_rate": 2.6866892272983645e-05, + "loss": 0.7599, + "step": 8204 + }, + { + "epoch": 4.627749576988156, + "grad_norm": 1.2769631147384644, + "learning_rate": 2.6864072194021434e-05, + "loss": 0.7463, + "step": 8205 + }, + { + "epoch": 4.628313592780598, + "grad_norm": 1.042541265487671, + "learning_rate": 2.6861252115059223e-05, + "loss": 0.7128, + "step": 8206 + }, + { + "epoch": 4.62887760857304, + "grad_norm": 1.0187231302261353, + "learning_rate": 2.6858432036097015e-05, + "loss": 0.7999, + "step": 8207 + }, + { + "epoch": 4.629441624365482, + "grad_norm": 1.2742029428482056, + "learning_rate": 2.68556119571348e-05, + "loss": 0.6495, + "step": 8208 + }, + { + "epoch": 4.630005640157925, + "grad_norm": 1.3856549263000488, + "learning_rate": 2.685279187817259e-05, + "loss": 0.734, + "step": 8209 + }, + { + "epoch": 4.630569655950366, + "grad_norm": 1.2259548902511597, + "learning_rate": 2.6849971799210378e-05, + "loss": 0.7232, + "step": 8210 + }, + { + "epoch": 4.631133671742809, + "grad_norm": 1.6166719198226929, + "learning_rate": 2.684715172024817e-05, + "loss": 0.7438, + "step": 8211 + }, + { + "epoch": 4.631697687535251, + "grad_norm": 1.1416215896606445, + "learning_rate": 2.684433164128596e-05, + "loss": 0.7957, + "step": 8212 + }, + { + "epoch": 4.632261703327693, + "grad_norm": 1.040229082107544, + "learning_rate": 2.6841511562323744e-05, + "loss": 0.7398, + "step": 8213 + }, + { + "epoch": 4.632825719120135, + "grad_norm": 1.5613689422607422, + "learning_rate": 2.6838691483361533e-05, + "loss": 0.7156, + "step": 8214 + }, + { + "epoch": 4.633389734912577, + "grad_norm": 1.0823599100112915, + "learning_rate": 2.6835871404399325e-05, + "loss": 0.7365, + "step": 8215 + }, + { + "epoch": 4.63395375070502, + "grad_norm": 1.1227530241012573, + "learning_rate": 2.6833051325437114e-05, + "loss": 0.7922, + "step": 8216 + }, + { + "epoch": 4.634517766497462, + "grad_norm": 1.3353790044784546, + "learning_rate": 2.6830231246474903e-05, + "loss": 0.8204, + "step": 8217 + }, + { + "epoch": 4.635081782289904, + "grad_norm": 0.9049696922302246, + "learning_rate": 2.6827411167512688e-05, + "loss": 0.7551, + "step": 8218 + }, + { + "epoch": 4.635645798082346, + "grad_norm": 1.26494562625885, + "learning_rate": 2.6824591088550484e-05, + "loss": 0.7162, + "step": 8219 + }, + { + "epoch": 4.636209813874789, + "grad_norm": 1.1759456396102905, + "learning_rate": 2.682177100958827e-05, + "loss": 0.7928, + "step": 8220 + }, + { + "epoch": 4.636773829667231, + "grad_norm": 1.4542056322097778, + "learning_rate": 2.6818950930626058e-05, + "loss": 0.7238, + "step": 8221 + }, + { + "epoch": 4.6373378454596725, + "grad_norm": 1.17453134059906, + "learning_rate": 2.6816130851663847e-05, + "loss": 0.6618, + "step": 8222 + }, + { + "epoch": 4.637901861252115, + "grad_norm": 1.3077163696289062, + "learning_rate": 2.681331077270164e-05, + "loss": 0.7709, + "step": 8223 + }, + { + "epoch": 4.638465877044557, + "grad_norm": 0.939543604850769, + "learning_rate": 2.6810490693739428e-05, + "loss": 0.7053, + "step": 8224 + }, + { + "epoch": 4.639029892837, + "grad_norm": 1.1981655359268188, + "learning_rate": 2.6807670614777213e-05, + "loss": 0.8564, + "step": 8225 + }, + { + "epoch": 4.6395939086294415, + "grad_norm": 1.2170311212539673, + "learning_rate": 2.6804850535815002e-05, + "loss": 0.7155, + "step": 8226 + }, + { + "epoch": 4.640157924421883, + "grad_norm": 1.0153086185455322, + "learning_rate": 2.6802030456852794e-05, + "loss": 0.7202, + "step": 8227 + }, + { + "epoch": 4.640721940214326, + "grad_norm": 1.242586374282837, + "learning_rate": 2.6799210377890583e-05, + "loss": 0.8162, + "step": 8228 + }, + { + "epoch": 4.641285956006768, + "grad_norm": 0.8692457675933838, + "learning_rate": 2.679639029892837e-05, + "loss": 0.6567, + "step": 8229 + }, + { + "epoch": 4.6418499717992106, + "grad_norm": 0.9106494784355164, + "learning_rate": 2.6793570219966157e-05, + "loss": 0.7321, + "step": 8230 + }, + { + "epoch": 4.642413987591652, + "grad_norm": 1.18397057056427, + "learning_rate": 2.679075014100395e-05, + "loss": 0.7784, + "step": 8231 + }, + { + "epoch": 4.642978003384095, + "grad_norm": 1.1512460708618164, + "learning_rate": 2.6787930062041738e-05, + "loss": 0.7808, + "step": 8232 + }, + { + "epoch": 4.643542019176537, + "grad_norm": 1.612568974494934, + "learning_rate": 2.6785109983079527e-05, + "loss": 0.9436, + "step": 8233 + }, + { + "epoch": 4.64410603496898, + "grad_norm": 1.1054445505142212, + "learning_rate": 2.678228990411732e-05, + "loss": 0.7465, + "step": 8234 + }, + { + "epoch": 4.644670050761421, + "grad_norm": 1.0181565284729004, + "learning_rate": 2.6779469825155108e-05, + "loss": 0.6757, + "step": 8235 + }, + { + "epoch": 4.645234066553863, + "grad_norm": 1.5433441400527954, + "learning_rate": 2.6776649746192893e-05, + "loss": 0.6997, + "step": 8236 + }, + { + "epoch": 4.645798082346306, + "grad_norm": 1.6123101711273193, + "learning_rate": 2.6773829667230682e-05, + "loss": 0.7634, + "step": 8237 + }, + { + "epoch": 4.646362098138748, + "grad_norm": 1.0991034507751465, + "learning_rate": 2.6771009588268474e-05, + "loss": 0.7928, + "step": 8238 + }, + { + "epoch": 4.64692611393119, + "grad_norm": 1.0402402877807617, + "learning_rate": 2.6768189509306263e-05, + "loss": 0.6651, + "step": 8239 + }, + { + "epoch": 4.647490129723632, + "grad_norm": 1.511385440826416, + "learning_rate": 2.6765369430344052e-05, + "loss": 0.8056, + "step": 8240 + }, + { + "epoch": 4.648054145516074, + "grad_norm": 1.1254355907440186, + "learning_rate": 2.6762549351381837e-05, + "loss": 0.7536, + "step": 8241 + }, + { + "epoch": 4.648618161308517, + "grad_norm": 1.4508479833602905, + "learning_rate": 2.6759729272419633e-05, + "loss": 0.671, + "step": 8242 + }, + { + "epoch": 4.6491821771009585, + "grad_norm": 1.3153572082519531, + "learning_rate": 2.675690919345742e-05, + "loss": 0.8066, + "step": 8243 + }, + { + "epoch": 4.649746192893401, + "grad_norm": 0.9376722574234009, + "learning_rate": 2.6754089114495207e-05, + "loss": 0.7624, + "step": 8244 + }, + { + "epoch": 4.650310208685843, + "grad_norm": 1.2863713502883911, + "learning_rate": 2.6751269035532996e-05, + "loss": 0.7543, + "step": 8245 + }, + { + "epoch": 4.650874224478286, + "grad_norm": 1.559058666229248, + "learning_rate": 2.6748448956570788e-05, + "loss": 0.8454, + "step": 8246 + }, + { + "epoch": 4.6514382402707275, + "grad_norm": 1.0606462955474854, + "learning_rate": 2.6745628877608574e-05, + "loss": 0.8152, + "step": 8247 + }, + { + "epoch": 4.65200225606317, + "grad_norm": 0.9747870564460754, + "learning_rate": 2.6742808798646362e-05, + "loss": 0.7358, + "step": 8248 + }, + { + "epoch": 4.652566271855612, + "grad_norm": 1.3351603746414185, + "learning_rate": 2.673998871968415e-05, + "loss": 0.792, + "step": 8249 + }, + { + "epoch": 4.653130287648054, + "grad_norm": 1.0814156532287598, + "learning_rate": 2.6737168640721943e-05, + "loss": 0.7852, + "step": 8250 + }, + { + "epoch": 4.6536943034404965, + "grad_norm": 1.4427834749221802, + "learning_rate": 2.6734348561759732e-05, + "loss": 0.7497, + "step": 8251 + }, + { + "epoch": 4.654258319232938, + "grad_norm": 1.032303810119629, + "learning_rate": 2.6731528482797517e-05, + "loss": 0.7526, + "step": 8252 + }, + { + "epoch": 4.654822335025381, + "grad_norm": 0.9170224070549011, + "learning_rate": 2.6728708403835306e-05, + "loss": 0.7503, + "step": 8253 + }, + { + "epoch": 4.655386350817823, + "grad_norm": 1.6110020875930786, + "learning_rate": 2.67258883248731e-05, + "loss": 0.7812, + "step": 8254 + }, + { + "epoch": 4.655950366610265, + "grad_norm": 1.1501703262329102, + "learning_rate": 2.6723068245910887e-05, + "loss": 0.7833, + "step": 8255 + }, + { + "epoch": 4.656514382402707, + "grad_norm": 1.0931880474090576, + "learning_rate": 2.6720248166948676e-05, + "loss": 0.6613, + "step": 8256 + }, + { + "epoch": 4.657078398195149, + "grad_norm": 1.127100944519043, + "learning_rate": 2.671742808798646e-05, + "loss": 0.6401, + "step": 8257 + }, + { + "epoch": 4.657642413987592, + "grad_norm": 1.0693494081497192, + "learning_rate": 2.6714608009024257e-05, + "loss": 0.7243, + "step": 8258 + }, + { + "epoch": 4.658206429780034, + "grad_norm": 1.1267199516296387, + "learning_rate": 2.6711787930062042e-05, + "loss": 0.6985, + "step": 8259 + }, + { + "epoch": 4.658770445572476, + "grad_norm": 1.586668848991394, + "learning_rate": 2.670896785109983e-05, + "loss": 0.7688, + "step": 8260 + }, + { + "epoch": 4.659334461364918, + "grad_norm": 1.0471515655517578, + "learning_rate": 2.670614777213762e-05, + "loss": 0.7749, + "step": 8261 + }, + { + "epoch": 4.659898477157361, + "grad_norm": 1.3878048658370972, + "learning_rate": 2.6703327693175412e-05, + "loss": 0.7539, + "step": 8262 + }, + { + "epoch": 4.660462492949803, + "grad_norm": 1.0471028089523315, + "learning_rate": 2.67005076142132e-05, + "loss": 0.7087, + "step": 8263 + }, + { + "epoch": 4.6610265087422444, + "grad_norm": 0.9578248858451843, + "learning_rate": 2.6697687535250986e-05, + "loss": 0.7555, + "step": 8264 + }, + { + "epoch": 4.661590524534687, + "grad_norm": 1.0170531272888184, + "learning_rate": 2.6694867456288775e-05, + "loss": 0.7923, + "step": 8265 + }, + { + "epoch": 4.662154540327129, + "grad_norm": 0.8864668011665344, + "learning_rate": 2.6692047377326567e-05, + "loss": 0.655, + "step": 8266 + }, + { + "epoch": 4.662718556119572, + "grad_norm": 1.2699466943740845, + "learning_rate": 2.6689227298364356e-05, + "loss": 0.8066, + "step": 8267 + }, + { + "epoch": 4.6632825719120135, + "grad_norm": 3.7426204681396484, + "learning_rate": 2.668640721940214e-05, + "loss": 0.7561, + "step": 8268 + }, + { + "epoch": 4.663846587704455, + "grad_norm": 1.3514870405197144, + "learning_rate": 2.6683587140439937e-05, + "loss": 0.7495, + "step": 8269 + }, + { + "epoch": 4.664410603496898, + "grad_norm": 1.1810383796691895, + "learning_rate": 2.6680767061477723e-05, + "loss": 0.7605, + "step": 8270 + }, + { + "epoch": 4.66497461928934, + "grad_norm": 1.1602425575256348, + "learning_rate": 2.667794698251551e-05, + "loss": 0.8461, + "step": 8271 + }, + { + "epoch": 4.6655386350817825, + "grad_norm": 1.6592974662780762, + "learning_rate": 2.66751269035533e-05, + "loss": 0.8031, + "step": 8272 + }, + { + "epoch": 4.666102650874224, + "grad_norm": 0.973226010799408, + "learning_rate": 2.6672306824591092e-05, + "loss": 0.7213, + "step": 8273 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.23264479637146, + "learning_rate": 2.666948674562888e-05, + "loss": 0.7992, + "step": 8274 + }, + { + "epoch": 4.667230682459109, + "grad_norm": 1.306802749633789, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.7381, + "step": 8275 + }, + { + "epoch": 4.6677946982515515, + "grad_norm": 0.8701847791671753, + "learning_rate": 2.6663846587704455e-05, + "loss": 0.711, + "step": 8276 + }, + { + "epoch": 4.668358714043993, + "grad_norm": 1.2839068174362183, + "learning_rate": 2.6661026508742248e-05, + "loss": 0.6927, + "step": 8277 + }, + { + "epoch": 4.668922729836435, + "grad_norm": 1.116214632987976, + "learning_rate": 2.6658206429780036e-05, + "loss": 0.6701, + "step": 8278 + }, + { + "epoch": 4.669486745628878, + "grad_norm": 0.9262681603431702, + "learning_rate": 2.6655386350817825e-05, + "loss": 0.592, + "step": 8279 + }, + { + "epoch": 4.67005076142132, + "grad_norm": 0.9007713794708252, + "learning_rate": 2.665256627185561e-05, + "loss": 0.6773, + "step": 8280 + }, + { + "epoch": 4.670614777213762, + "grad_norm": 0.9142090678215027, + "learning_rate": 2.6649746192893406e-05, + "loss": 0.7064, + "step": 8281 + }, + { + "epoch": 4.671178793006204, + "grad_norm": 1.1047859191894531, + "learning_rate": 2.664692611393119e-05, + "loss": 0.6994, + "step": 8282 + }, + { + "epoch": 4.671742808798646, + "grad_norm": 1.1888608932495117, + "learning_rate": 2.664410603496898e-05, + "loss": 0.7319, + "step": 8283 + }, + { + "epoch": 4.672306824591089, + "grad_norm": 0.9206052422523499, + "learning_rate": 2.6641285956006766e-05, + "loss": 0.6722, + "step": 8284 + }, + { + "epoch": 4.67287084038353, + "grad_norm": 0.8883689045906067, + "learning_rate": 2.663846587704456e-05, + "loss": 0.6481, + "step": 8285 + }, + { + "epoch": 4.673434856175973, + "grad_norm": 0.9716887474060059, + "learning_rate": 2.6635645798082347e-05, + "loss": 0.6931, + "step": 8286 + }, + { + "epoch": 4.673998871968415, + "grad_norm": 1.459653377532959, + "learning_rate": 2.6632825719120136e-05, + "loss": 0.7066, + "step": 8287 + }, + { + "epoch": 4.674562887760858, + "grad_norm": 1.2577542066574097, + "learning_rate": 2.6630005640157924e-05, + "loss": 0.6717, + "step": 8288 + }, + { + "epoch": 4.675126903553299, + "grad_norm": 1.256015658378601, + "learning_rate": 2.6627185561195717e-05, + "loss": 0.7675, + "step": 8289 + }, + { + "epoch": 4.675690919345742, + "grad_norm": 1.1672265529632568, + "learning_rate": 2.6624365482233505e-05, + "loss": 0.8185, + "step": 8290 + }, + { + "epoch": 4.676254935138184, + "grad_norm": 1.6082878112792969, + "learning_rate": 2.662154540327129e-05, + "loss": 0.8835, + "step": 8291 + }, + { + "epoch": 4.676818950930626, + "grad_norm": 1.0309066772460938, + "learning_rate": 2.661872532430908e-05, + "loss": 0.7709, + "step": 8292 + }, + { + "epoch": 4.677382966723068, + "grad_norm": 1.1262290477752686, + "learning_rate": 2.6615905245346872e-05, + "loss": 0.7201, + "step": 8293 + }, + { + "epoch": 4.67794698251551, + "grad_norm": 1.160887360572815, + "learning_rate": 2.661308516638466e-05, + "loss": 0.7676, + "step": 8294 + }, + { + "epoch": 4.678510998307953, + "grad_norm": 1.2518885135650635, + "learning_rate": 2.661026508742245e-05, + "loss": 0.776, + "step": 8295 + }, + { + "epoch": 4.679075014100395, + "grad_norm": 1.4199597835540771, + "learning_rate": 2.6607445008460235e-05, + "loss": 0.7471, + "step": 8296 + }, + { + "epoch": 4.6796390298928365, + "grad_norm": 1.3809864521026611, + "learning_rate": 2.660462492949803e-05, + "loss": 0.7166, + "step": 8297 + }, + { + "epoch": 4.680203045685279, + "grad_norm": 1.266440987586975, + "learning_rate": 2.6601804850535816e-05, + "loss": 0.8244, + "step": 8298 + }, + { + "epoch": 4.680767061477721, + "grad_norm": 1.2679320573806763, + "learning_rate": 2.6598984771573604e-05, + "loss": 0.7205, + "step": 8299 + }, + { + "epoch": 4.681331077270164, + "grad_norm": 1.5445395708084106, + "learning_rate": 2.6596164692611393e-05, + "loss": 0.7379, + "step": 8300 + }, + { + "epoch": 4.6818950930626055, + "grad_norm": 1.073165774345398, + "learning_rate": 2.6593344613649185e-05, + "loss": 0.7051, + "step": 8301 + }, + { + "epoch": 4.682459108855048, + "grad_norm": 0.964981198310852, + "learning_rate": 2.659052453468697e-05, + "loss": 0.689, + "step": 8302 + }, + { + "epoch": 4.68302312464749, + "grad_norm": 1.3098982572555542, + "learning_rate": 2.658770445572476e-05, + "loss": 0.734, + "step": 8303 + }, + { + "epoch": 4.683587140439933, + "grad_norm": 1.4110652208328247, + "learning_rate": 2.6584884376762552e-05, + "loss": 0.7109, + "step": 8304 + }, + { + "epoch": 4.6841511562323745, + "grad_norm": 1.409659504890442, + "learning_rate": 2.658206429780034e-05, + "loss": 0.7366, + "step": 8305 + }, + { + "epoch": 4.684715172024816, + "grad_norm": 1.4615272283554077, + "learning_rate": 2.657924421883813e-05, + "loss": 0.7372, + "step": 8306 + }, + { + "epoch": 4.685279187817259, + "grad_norm": 0.9750277400016785, + "learning_rate": 2.6576424139875915e-05, + "loss": 0.6745, + "step": 8307 + }, + { + "epoch": 4.685843203609701, + "grad_norm": 1.800940990447998, + "learning_rate": 2.657360406091371e-05, + "loss": 0.9345, + "step": 8308 + }, + { + "epoch": 4.6864072194021436, + "grad_norm": 1.6504921913146973, + "learning_rate": 2.6570783981951496e-05, + "loss": 0.7275, + "step": 8309 + }, + { + "epoch": 4.686971235194585, + "grad_norm": 1.474428653717041, + "learning_rate": 2.6567963902989285e-05, + "loss": 0.6075, + "step": 8310 + }, + { + "epoch": 4.687535250987027, + "grad_norm": 1.889243721961975, + "learning_rate": 2.6565143824027073e-05, + "loss": 0.7916, + "step": 8311 + }, + { + "epoch": 4.68809926677947, + "grad_norm": 1.1542249917984009, + "learning_rate": 2.6562323745064866e-05, + "loss": 0.7745, + "step": 8312 + }, + { + "epoch": 4.688663282571912, + "grad_norm": 1.547569990158081, + "learning_rate": 2.6559503666102654e-05, + "loss": 0.7463, + "step": 8313 + }, + { + "epoch": 4.689227298364354, + "grad_norm": 1.0113731622695923, + "learning_rate": 2.655668358714044e-05, + "loss": 0.6881, + "step": 8314 + }, + { + "epoch": 4.689791314156796, + "grad_norm": 1.7130619287490845, + "learning_rate": 2.655386350817823e-05, + "loss": 0.8096, + "step": 8315 + }, + { + "epoch": 4.690355329949239, + "grad_norm": 1.4896949529647827, + "learning_rate": 2.655104342921602e-05, + "loss": 0.8571, + "step": 8316 + }, + { + "epoch": 4.690919345741681, + "grad_norm": 0.8511640429496765, + "learning_rate": 2.654822335025381e-05, + "loss": 0.5944, + "step": 8317 + }, + { + "epoch": 4.691483361534123, + "grad_norm": 1.113708257675171, + "learning_rate": 2.65454032712916e-05, + "loss": 0.7279, + "step": 8318 + }, + { + "epoch": 4.692047377326565, + "grad_norm": 1.5047054290771484, + "learning_rate": 2.6542583192329384e-05, + "loss": 0.7755, + "step": 8319 + }, + { + "epoch": 4.692611393119007, + "grad_norm": 1.2401204109191895, + "learning_rate": 2.6539763113367176e-05, + "loss": 0.7996, + "step": 8320 + }, + { + "epoch": 4.69317540891145, + "grad_norm": 0.9991405010223389, + "learning_rate": 2.6536943034404965e-05, + "loss": 0.671, + "step": 8321 + }, + { + "epoch": 4.6937394247038915, + "grad_norm": 1.5827159881591797, + "learning_rate": 2.6534122955442754e-05, + "loss": 0.7854, + "step": 8322 + }, + { + "epoch": 4.694303440496334, + "grad_norm": 1.341780424118042, + "learning_rate": 2.653130287648054e-05, + "loss": 0.8044, + "step": 8323 + }, + { + "epoch": 4.694867456288776, + "grad_norm": 1.226337194442749, + "learning_rate": 2.6528482797518335e-05, + "loss": 0.6527, + "step": 8324 + }, + { + "epoch": 4.695431472081218, + "grad_norm": 1.3879384994506836, + "learning_rate": 2.652566271855612e-05, + "loss": 0.7774, + "step": 8325 + }, + { + "epoch": 4.6959954878736605, + "grad_norm": 0.9526144862174988, + "learning_rate": 2.652284263959391e-05, + "loss": 0.611, + "step": 8326 + }, + { + "epoch": 4.696559503666102, + "grad_norm": 1.025287389755249, + "learning_rate": 2.6520022560631698e-05, + "loss": 0.6735, + "step": 8327 + }, + { + "epoch": 4.697123519458545, + "grad_norm": 1.12052321434021, + "learning_rate": 2.651720248166949e-05, + "loss": 0.6877, + "step": 8328 + }, + { + "epoch": 4.697687535250987, + "grad_norm": 1.0616477727890015, + "learning_rate": 2.651438240270728e-05, + "loss": 0.7713, + "step": 8329 + }, + { + "epoch": 4.6982515510434295, + "grad_norm": 0.8647138476371765, + "learning_rate": 2.6511562323745064e-05, + "loss": 0.6602, + "step": 8330 + }, + { + "epoch": 4.698815566835871, + "grad_norm": 1.4433372020721436, + "learning_rate": 2.6508742244782853e-05, + "loss": 0.7592, + "step": 8331 + }, + { + "epoch": 4.699379582628314, + "grad_norm": 0.9096046686172485, + "learning_rate": 2.6505922165820645e-05, + "loss": 0.7077, + "step": 8332 + }, + { + "epoch": 4.699943598420756, + "grad_norm": 0.7965997457504272, + "learning_rate": 2.6503102086858434e-05, + "loss": 0.6012, + "step": 8333 + }, + { + "epoch": 4.700507614213198, + "grad_norm": 1.1132452487945557, + "learning_rate": 2.6500282007896223e-05, + "loss": 0.7187, + "step": 8334 + }, + { + "epoch": 4.70107163000564, + "grad_norm": 2.575812339782715, + "learning_rate": 2.6497461928934008e-05, + "loss": 0.7173, + "step": 8335 + }, + { + "epoch": 4.701635645798082, + "grad_norm": 1.1398824453353882, + "learning_rate": 2.6494641849971804e-05, + "loss": 0.7188, + "step": 8336 + }, + { + "epoch": 4.702199661590525, + "grad_norm": 1.507504940032959, + "learning_rate": 2.649182177100959e-05, + "loss": 0.8454, + "step": 8337 + }, + { + "epoch": 4.702763677382967, + "grad_norm": 1.2046011686325073, + "learning_rate": 2.6489001692047378e-05, + "loss": 0.7055, + "step": 8338 + }, + { + "epoch": 4.703327693175408, + "grad_norm": 0.9933191537857056, + "learning_rate": 2.648618161308517e-05, + "loss": 0.7084, + "step": 8339 + }, + { + "epoch": 4.703891708967851, + "grad_norm": 1.3339089155197144, + "learning_rate": 2.648336153412296e-05, + "loss": 0.783, + "step": 8340 + }, + { + "epoch": 4.704455724760293, + "grad_norm": 1.397047758102417, + "learning_rate": 2.6480541455160744e-05, + "loss": 0.6871, + "step": 8341 + }, + { + "epoch": 4.705019740552736, + "grad_norm": 0.8363891243934631, + "learning_rate": 2.6477721376198533e-05, + "loss": 0.6917, + "step": 8342 + }, + { + "epoch": 4.7055837563451774, + "grad_norm": 1.2484318017959595, + "learning_rate": 2.6474901297236325e-05, + "loss": 0.7134, + "step": 8343 + }, + { + "epoch": 4.70614777213762, + "grad_norm": 1.2725521326065063, + "learning_rate": 2.6472081218274114e-05, + "loss": 0.8075, + "step": 8344 + }, + { + "epoch": 4.706711787930062, + "grad_norm": 1.4424265623092651, + "learning_rate": 2.6469261139311903e-05, + "loss": 0.7571, + "step": 8345 + }, + { + "epoch": 4.707275803722505, + "grad_norm": 1.1204544305801392, + "learning_rate": 2.6466441060349688e-05, + "loss": 0.7648, + "step": 8346 + }, + { + "epoch": 4.7078398195149465, + "grad_norm": 1.0127753019332886, + "learning_rate": 2.6463620981387484e-05, + "loss": 0.823, + "step": 8347 + }, + { + "epoch": 4.708403835307388, + "grad_norm": 1.2665443420410156, + "learning_rate": 2.646080090242527e-05, + "loss": 0.7656, + "step": 8348 + }, + { + "epoch": 4.708967851099831, + "grad_norm": 1.008277416229248, + "learning_rate": 2.6457980823463058e-05, + "loss": 0.7768, + "step": 8349 + }, + { + "epoch": 4.709531866892273, + "grad_norm": 1.3988659381866455, + "learning_rate": 2.6455160744500847e-05, + "loss": 0.7567, + "step": 8350 + }, + { + "epoch": 4.7100958826847155, + "grad_norm": 1.2537600994110107, + "learning_rate": 2.645234066553864e-05, + "loss": 0.6377, + "step": 8351 + }, + { + "epoch": 4.710659898477157, + "grad_norm": 1.156864047050476, + "learning_rate": 2.6449520586576428e-05, + "loss": 0.7172, + "step": 8352 + }, + { + "epoch": 4.711223914269599, + "grad_norm": 1.1351468563079834, + "learning_rate": 2.6446700507614213e-05, + "loss": 0.7103, + "step": 8353 + }, + { + "epoch": 4.711787930062042, + "grad_norm": 0.983992338180542, + "learning_rate": 2.6443880428652002e-05, + "loss": 0.6081, + "step": 8354 + }, + { + "epoch": 4.712351945854484, + "grad_norm": 1.2501275539398193, + "learning_rate": 2.6441060349689794e-05, + "loss": 0.7942, + "step": 8355 + }, + { + "epoch": 4.712915961646926, + "grad_norm": 1.1117533445358276, + "learning_rate": 2.6438240270727583e-05, + "loss": 0.6619, + "step": 8356 + }, + { + "epoch": 4.713479977439368, + "grad_norm": 1.463073968887329, + "learning_rate": 2.6435420191765368e-05, + "loss": 0.6955, + "step": 8357 + }, + { + "epoch": 4.714043993231811, + "grad_norm": 0.9100636839866638, + "learning_rate": 2.6432600112803157e-05, + "loss": 0.7075, + "step": 8358 + }, + { + "epoch": 4.714608009024253, + "grad_norm": 0.9446659684181213, + "learning_rate": 2.642978003384095e-05, + "loss": 0.6693, + "step": 8359 + }, + { + "epoch": 4.715172024816695, + "grad_norm": 1.226434350013733, + "learning_rate": 2.6426959954878738e-05, + "loss": 0.8211, + "step": 8360 + }, + { + "epoch": 4.715736040609137, + "grad_norm": 1.8470460176467896, + "learning_rate": 2.6424139875916527e-05, + "loss": 0.826, + "step": 8361 + }, + { + "epoch": 4.716300056401579, + "grad_norm": 1.2335574626922607, + "learning_rate": 2.6421319796954312e-05, + "loss": 0.6305, + "step": 8362 + }, + { + "epoch": 4.716864072194022, + "grad_norm": 1.2335704565048218, + "learning_rate": 2.6418499717992108e-05, + "loss": 0.6982, + "step": 8363 + }, + { + "epoch": 4.717428087986463, + "grad_norm": 0.9783929586410522, + "learning_rate": 2.6415679639029893e-05, + "loss": 0.7214, + "step": 8364 + }, + { + "epoch": 4.717992103778906, + "grad_norm": 1.665022850036621, + "learning_rate": 2.6412859560067682e-05, + "loss": 0.7898, + "step": 8365 + }, + { + "epoch": 4.718556119571348, + "grad_norm": 1.1388170719146729, + "learning_rate": 2.641003948110547e-05, + "loss": 0.8451, + "step": 8366 + }, + { + "epoch": 4.71912013536379, + "grad_norm": 0.9025022983551025, + "learning_rate": 2.6407219402143263e-05, + "loss": 0.575, + "step": 8367 + }, + { + "epoch": 4.719684151156232, + "grad_norm": 1.5191503763198853, + "learning_rate": 2.6404399323181052e-05, + "loss": 0.8686, + "step": 8368 + }, + { + "epoch": 4.720248166948674, + "grad_norm": 1.564003825187683, + "learning_rate": 2.6401579244218837e-05, + "loss": 0.8423, + "step": 8369 + }, + { + "epoch": 4.720812182741117, + "grad_norm": 0.8914083242416382, + "learning_rate": 2.6398759165256626e-05, + "loss": 0.7895, + "step": 8370 + }, + { + "epoch": 4.721376198533559, + "grad_norm": 1.5378636121749878, + "learning_rate": 2.6395939086294418e-05, + "loss": 0.7408, + "step": 8371 + }, + { + "epoch": 4.721940214326001, + "grad_norm": 0.9478761553764343, + "learning_rate": 2.6393119007332207e-05, + "loss": 0.7745, + "step": 8372 + }, + { + "epoch": 4.722504230118443, + "grad_norm": 1.6283948421478271, + "learning_rate": 2.6390298928369996e-05, + "loss": 0.7084, + "step": 8373 + }, + { + "epoch": 4.723068245910886, + "grad_norm": 1.0654088258743286, + "learning_rate": 2.6387478849407788e-05, + "loss": 0.7055, + "step": 8374 + }, + { + "epoch": 4.723632261703328, + "grad_norm": 1.2251787185668945, + "learning_rate": 2.6384658770445573e-05, + "loss": 0.7548, + "step": 8375 + }, + { + "epoch": 4.7241962774957695, + "grad_norm": 2.2267117500305176, + "learning_rate": 2.6381838691483362e-05, + "loss": 0.9567, + "step": 8376 + }, + { + "epoch": 4.724760293288212, + "grad_norm": 1.3617295026779175, + "learning_rate": 2.637901861252115e-05, + "loss": 0.7864, + "step": 8377 + }, + { + "epoch": 4.725324309080654, + "grad_norm": 0.9711377620697021, + "learning_rate": 2.6376198533558943e-05, + "loss": 0.709, + "step": 8378 + }, + { + "epoch": 4.725888324873097, + "grad_norm": 1.1273339986801147, + "learning_rate": 2.6373378454596732e-05, + "loss": 0.7511, + "step": 8379 + }, + { + "epoch": 4.7264523406655385, + "grad_norm": 1.239964246749878, + "learning_rate": 2.6370558375634517e-05, + "loss": 0.8218, + "step": 8380 + }, + { + "epoch": 4.72701635645798, + "grad_norm": 1.0314862728118896, + "learning_rate": 2.6367738296672306e-05, + "loss": 0.6821, + "step": 8381 + }, + { + "epoch": 4.727580372250423, + "grad_norm": 1.7306556701660156, + "learning_rate": 2.63649182177101e-05, + "loss": 0.8411, + "step": 8382 + }, + { + "epoch": 4.728144388042865, + "grad_norm": 1.0207226276397705, + "learning_rate": 2.6362098138747887e-05, + "loss": 0.6831, + "step": 8383 + }, + { + "epoch": 4.7287084038353075, + "grad_norm": 1.5032762289047241, + "learning_rate": 2.6359278059785676e-05, + "loss": 0.7856, + "step": 8384 + }, + { + "epoch": 4.729272419627749, + "grad_norm": 1.4923897981643677, + "learning_rate": 2.635645798082346e-05, + "loss": 0.7264, + "step": 8385 + }, + { + "epoch": 4.729836435420192, + "grad_norm": 4.133388996124268, + "learning_rate": 2.6353637901861257e-05, + "loss": 0.9687, + "step": 8386 + }, + { + "epoch": 4.730400451212634, + "grad_norm": 1.3860297203063965, + "learning_rate": 2.6350817822899042e-05, + "loss": 0.7041, + "step": 8387 + }, + { + "epoch": 4.730964467005077, + "grad_norm": 1.1101534366607666, + "learning_rate": 2.634799774393683e-05, + "loss": 0.7191, + "step": 8388 + }, + { + "epoch": 4.731528482797518, + "grad_norm": 1.0334324836730957, + "learning_rate": 2.634517766497462e-05, + "loss": 0.6033, + "step": 8389 + }, + { + "epoch": 4.73209249858996, + "grad_norm": 1.10657799243927, + "learning_rate": 2.6342357586012412e-05, + "loss": 0.6757, + "step": 8390 + }, + { + "epoch": 4.732656514382403, + "grad_norm": 1.1354963779449463, + "learning_rate": 2.63395375070502e-05, + "loss": 0.7002, + "step": 8391 + }, + { + "epoch": 4.733220530174845, + "grad_norm": 0.8762738108634949, + "learning_rate": 2.6336717428087986e-05, + "loss": 0.673, + "step": 8392 + }, + { + "epoch": 4.733784545967287, + "grad_norm": 1.1125658750534058, + "learning_rate": 2.6333897349125775e-05, + "loss": 0.7684, + "step": 8393 + }, + { + "epoch": 4.734348561759729, + "grad_norm": 0.878182053565979, + "learning_rate": 2.6331077270163567e-05, + "loss": 0.6977, + "step": 8394 + }, + { + "epoch": 4.734912577552171, + "grad_norm": 1.2323216199874878, + "learning_rate": 2.6328257191201356e-05, + "loss": 0.8144, + "step": 8395 + }, + { + "epoch": 4.735476593344614, + "grad_norm": 1.3392231464385986, + "learning_rate": 2.632543711223914e-05, + "loss": 0.8301, + "step": 8396 + }, + { + "epoch": 4.7360406091370555, + "grad_norm": 1.2015440464019775, + "learning_rate": 2.632261703327693e-05, + "loss": 0.8303, + "step": 8397 + }, + { + "epoch": 4.736604624929498, + "grad_norm": 1.2687597274780273, + "learning_rate": 2.6319796954314722e-05, + "loss": 0.8589, + "step": 8398 + }, + { + "epoch": 4.73716864072194, + "grad_norm": 1.310705542564392, + "learning_rate": 2.631697687535251e-05, + "loss": 0.734, + "step": 8399 + }, + { + "epoch": 4.737732656514383, + "grad_norm": 1.063872218132019, + "learning_rate": 2.63141567963903e-05, + "loss": 0.8, + "step": 8400 + }, + { + "epoch": 4.7382966723068245, + "grad_norm": 1.204511284828186, + "learning_rate": 2.6311336717428085e-05, + "loss": 0.8273, + "step": 8401 + }, + { + "epoch": 4.738860688099267, + "grad_norm": 0.9393495917320251, + "learning_rate": 2.630851663846588e-05, + "loss": 0.6893, + "step": 8402 + }, + { + "epoch": 4.739424703891709, + "grad_norm": 1.1677906513214111, + "learning_rate": 2.6305696559503666e-05, + "loss": 0.6337, + "step": 8403 + }, + { + "epoch": 4.739988719684151, + "grad_norm": 1.3248820304870605, + "learning_rate": 2.6302876480541455e-05, + "loss": 0.8089, + "step": 8404 + }, + { + "epoch": 4.7405527354765935, + "grad_norm": 1.7278364896774292, + "learning_rate": 2.6300056401579244e-05, + "loss": 0.7964, + "step": 8405 + }, + { + "epoch": 4.741116751269035, + "grad_norm": 1.5904664993286133, + "learning_rate": 2.6297236322617036e-05, + "loss": 0.9141, + "step": 8406 + }, + { + "epoch": 4.741680767061478, + "grad_norm": 1.164823293685913, + "learning_rate": 2.6294416243654825e-05, + "loss": 0.7569, + "step": 8407 + }, + { + "epoch": 4.74224478285392, + "grad_norm": 1.1126052141189575, + "learning_rate": 2.629159616469261e-05, + "loss": 0.7787, + "step": 8408 + }, + { + "epoch": 4.742808798646362, + "grad_norm": 1.0924774408340454, + "learning_rate": 2.6288776085730406e-05, + "loss": 0.8103, + "step": 8409 + }, + { + "epoch": 4.743372814438804, + "grad_norm": 1.0436760187149048, + "learning_rate": 2.628595600676819e-05, + "loss": 0.693, + "step": 8410 + }, + { + "epoch": 4.743936830231246, + "grad_norm": 0.875411331653595, + "learning_rate": 2.628313592780598e-05, + "loss": 0.6893, + "step": 8411 + }, + { + "epoch": 4.744500846023689, + "grad_norm": 0.8785144090652466, + "learning_rate": 2.628031584884377e-05, + "loss": 0.6513, + "step": 8412 + }, + { + "epoch": 4.745064861816131, + "grad_norm": 1.1505388021469116, + "learning_rate": 2.627749576988156e-05, + "loss": 0.7951, + "step": 8413 + }, + { + "epoch": 4.745628877608573, + "grad_norm": 1.4062063694000244, + "learning_rate": 2.6274675690919347e-05, + "loss": 0.7551, + "step": 8414 + }, + { + "epoch": 4.746192893401015, + "grad_norm": 1.602241039276123, + "learning_rate": 2.6271855611957135e-05, + "loss": 0.7054, + "step": 8415 + }, + { + "epoch": 4.746756909193458, + "grad_norm": 1.0597702264785767, + "learning_rate": 2.6269035532994924e-05, + "loss": 0.8146, + "step": 8416 + }, + { + "epoch": 4.7473209249859, + "grad_norm": 1.8955578804016113, + "learning_rate": 2.6266215454032716e-05, + "loss": 0.9191, + "step": 8417 + }, + { + "epoch": 4.7478849407783414, + "grad_norm": 1.310459017753601, + "learning_rate": 2.6263395375070505e-05, + "loss": 0.6985, + "step": 8418 + }, + { + "epoch": 4.748448956570784, + "grad_norm": 1.177262783050537, + "learning_rate": 2.626057529610829e-05, + "loss": 0.7339, + "step": 8419 + }, + { + "epoch": 4.749012972363226, + "grad_norm": 1.219351887702942, + "learning_rate": 2.625775521714608e-05, + "loss": 0.8251, + "step": 8420 + }, + { + "epoch": 4.749576988155669, + "grad_norm": 1.398093581199646, + "learning_rate": 2.625493513818387e-05, + "loss": 0.9076, + "step": 8421 + }, + { + "epoch": 4.7501410039481105, + "grad_norm": 1.1605525016784668, + "learning_rate": 2.625211505922166e-05, + "loss": 0.6367, + "step": 8422 + }, + { + "epoch": 4.750705019740552, + "grad_norm": 1.2790095806121826, + "learning_rate": 2.624929498025945e-05, + "loss": 0.7788, + "step": 8423 + }, + { + "epoch": 4.751269035532995, + "grad_norm": 1.0039938688278198, + "learning_rate": 2.6246474901297235e-05, + "loss": 0.7574, + "step": 8424 + }, + { + "epoch": 4.751833051325437, + "grad_norm": 1.340935468673706, + "learning_rate": 2.624365482233503e-05, + "loss": 0.8194, + "step": 8425 + }, + { + "epoch": 4.7523970671178795, + "grad_norm": 1.2215549945831299, + "learning_rate": 2.6240834743372816e-05, + "loss": 0.8051, + "step": 8426 + }, + { + "epoch": 4.752961082910321, + "grad_norm": 1.1435023546218872, + "learning_rate": 2.6238014664410604e-05, + "loss": 0.7849, + "step": 8427 + }, + { + "epoch": 4.753525098702764, + "grad_norm": 1.0936167240142822, + "learning_rate": 2.6235194585448393e-05, + "loss": 0.6973, + "step": 8428 + }, + { + "epoch": 4.754089114495206, + "grad_norm": 1.3785052299499512, + "learning_rate": 2.6232374506486185e-05, + "loss": 0.7667, + "step": 8429 + }, + { + "epoch": 4.7546531302876485, + "grad_norm": 1.217725396156311, + "learning_rate": 2.6229554427523974e-05, + "loss": 0.672, + "step": 8430 + }, + { + "epoch": 4.75521714608009, + "grad_norm": 1.1036807298660278, + "learning_rate": 2.622673434856176e-05, + "loss": 0.7455, + "step": 8431 + }, + { + "epoch": 4.755781161872532, + "grad_norm": 1.5191938877105713, + "learning_rate": 2.6223914269599548e-05, + "loss": 0.6725, + "step": 8432 + }, + { + "epoch": 4.756345177664975, + "grad_norm": 1.145039439201355, + "learning_rate": 2.622109419063734e-05, + "loss": 0.723, + "step": 8433 + }, + { + "epoch": 4.756909193457417, + "grad_norm": 1.3941717147827148, + "learning_rate": 2.621827411167513e-05, + "loss": 0.6626, + "step": 8434 + }, + { + "epoch": 4.757473209249859, + "grad_norm": 1.3685448169708252, + "learning_rate": 2.6215454032712915e-05, + "loss": 0.8087, + "step": 8435 + }, + { + "epoch": 4.758037225042301, + "grad_norm": 1.3597301244735718, + "learning_rate": 2.6212633953750703e-05, + "loss": 0.772, + "step": 8436 + }, + { + "epoch": 4.758601240834743, + "grad_norm": 0.8659186959266663, + "learning_rate": 2.6209813874788496e-05, + "loss": 0.6927, + "step": 8437 + }, + { + "epoch": 4.759165256627186, + "grad_norm": 1.029958724975586, + "learning_rate": 2.6206993795826284e-05, + "loss": 0.774, + "step": 8438 + }, + { + "epoch": 4.759729272419627, + "grad_norm": 1.0287936925888062, + "learning_rate": 2.6204173716864073e-05, + "loss": 0.8236, + "step": 8439 + }, + { + "epoch": 4.76029328821207, + "grad_norm": 0.9910601377487183, + "learning_rate": 2.620135363790186e-05, + "loss": 0.7527, + "step": 8440 + }, + { + "epoch": 4.760857304004512, + "grad_norm": 1.374100923538208, + "learning_rate": 2.6198533558939654e-05, + "loss": 0.7569, + "step": 8441 + }, + { + "epoch": 4.761421319796955, + "grad_norm": 1.444835901260376, + "learning_rate": 2.619571347997744e-05, + "loss": 0.7672, + "step": 8442 + }, + { + "epoch": 4.761985335589396, + "grad_norm": 0.8280646800994873, + "learning_rate": 2.619289340101523e-05, + "loss": 0.6922, + "step": 8443 + }, + { + "epoch": 4.762549351381839, + "grad_norm": 1.2527453899383545, + "learning_rate": 2.619007332205302e-05, + "loss": 0.7567, + "step": 8444 + }, + { + "epoch": 4.763113367174281, + "grad_norm": 1.3370119333267212, + "learning_rate": 2.618725324309081e-05, + "loss": 0.8127, + "step": 8445 + }, + { + "epoch": 4.763677382966723, + "grad_norm": 1.437544822692871, + "learning_rate": 2.6184433164128598e-05, + "loss": 0.9278, + "step": 8446 + }, + { + "epoch": 4.764241398759165, + "grad_norm": 1.7684195041656494, + "learning_rate": 2.6181613085166384e-05, + "loss": 0.8414, + "step": 8447 + }, + { + "epoch": 4.764805414551607, + "grad_norm": 0.8851845264434814, + "learning_rate": 2.6178793006204176e-05, + "loss": 0.6346, + "step": 8448 + }, + { + "epoch": 4.76536943034405, + "grad_norm": 1.3128005266189575, + "learning_rate": 2.6175972927241965e-05, + "loss": 0.8647, + "step": 8449 + }, + { + "epoch": 4.765933446136492, + "grad_norm": 1.4833651781082153, + "learning_rate": 2.6173152848279753e-05, + "loss": 0.8739, + "step": 8450 + }, + { + "epoch": 4.7664974619289335, + "grad_norm": 1.0078420639038086, + "learning_rate": 2.617033276931754e-05, + "loss": 0.7895, + "step": 8451 + }, + { + "epoch": 4.767061477721376, + "grad_norm": 1.5999562740325928, + "learning_rate": 2.6167512690355334e-05, + "loss": 0.7182, + "step": 8452 + }, + { + "epoch": 4.767625493513818, + "grad_norm": 0.9756954908370972, + "learning_rate": 2.616469261139312e-05, + "loss": 0.7144, + "step": 8453 + }, + { + "epoch": 4.768189509306261, + "grad_norm": 1.2620189189910889, + "learning_rate": 2.616187253243091e-05, + "loss": 0.7604, + "step": 8454 + }, + { + "epoch": 4.7687535250987025, + "grad_norm": 1.4946112632751465, + "learning_rate": 2.6159052453468697e-05, + "loss": 0.8033, + "step": 8455 + }, + { + "epoch": 4.769317540891145, + "grad_norm": 1.3771233558654785, + "learning_rate": 2.615623237450649e-05, + "loss": 0.761, + "step": 8456 + }, + { + "epoch": 4.769881556683587, + "grad_norm": 1.83327317237854, + "learning_rate": 2.615341229554428e-05, + "loss": 0.8367, + "step": 8457 + }, + { + "epoch": 4.77044557247603, + "grad_norm": 1.4146015644073486, + "learning_rate": 2.6150592216582064e-05, + "loss": 0.7865, + "step": 8458 + }, + { + "epoch": 4.7710095882684715, + "grad_norm": 1.0504963397979736, + "learning_rate": 2.6147772137619853e-05, + "loss": 0.6253, + "step": 8459 + }, + { + "epoch": 4.771573604060913, + "grad_norm": 1.4935787916183472, + "learning_rate": 2.6144952058657645e-05, + "loss": 0.8339, + "step": 8460 + }, + { + "epoch": 4.772137619853356, + "grad_norm": 1.2188568115234375, + "learning_rate": 2.6142131979695434e-05, + "loss": 0.7177, + "step": 8461 + }, + { + "epoch": 4.772701635645798, + "grad_norm": 1.6016589403152466, + "learning_rate": 2.6139311900733222e-05, + "loss": 0.7892, + "step": 8462 + }, + { + "epoch": 4.7732656514382406, + "grad_norm": 1.5664445161819458, + "learning_rate": 2.6136491821771008e-05, + "loss": 0.7781, + "step": 8463 + }, + { + "epoch": 4.773829667230682, + "grad_norm": 1.111824870109558, + "learning_rate": 2.6133671742808803e-05, + "loss": 0.6566, + "step": 8464 + }, + { + "epoch": 4.774393683023124, + "grad_norm": 1.083177089691162, + "learning_rate": 2.613085166384659e-05, + "loss": 0.6643, + "step": 8465 + }, + { + "epoch": 4.774957698815567, + "grad_norm": 1.804124355316162, + "learning_rate": 2.6128031584884378e-05, + "loss": 0.8599, + "step": 8466 + }, + { + "epoch": 4.775521714608009, + "grad_norm": 1.1096042394638062, + "learning_rate": 2.6125211505922166e-05, + "loss": 0.6932, + "step": 8467 + }, + { + "epoch": 4.776085730400451, + "grad_norm": 1.9992481470108032, + "learning_rate": 2.612239142695996e-05, + "loss": 0.9016, + "step": 8468 + }, + { + "epoch": 4.776649746192893, + "grad_norm": 1.301414966583252, + "learning_rate": 2.6119571347997744e-05, + "loss": 0.6996, + "step": 8469 + }, + { + "epoch": 4.777213761985336, + "grad_norm": 1.0818305015563965, + "learning_rate": 2.6116751269035533e-05, + "loss": 0.7194, + "step": 8470 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 1.4308651685714722, + "learning_rate": 2.611393119007332e-05, + "loss": 0.7684, + "step": 8471 + }, + { + "epoch": 4.77834179357022, + "grad_norm": 0.8633376955986023, + "learning_rate": 2.6111111111111114e-05, + "loss": 0.551, + "step": 8472 + }, + { + "epoch": 4.778905809362662, + "grad_norm": 1.331750512123108, + "learning_rate": 2.6108291032148903e-05, + "loss": 0.7719, + "step": 8473 + }, + { + "epoch": 4.779469825155104, + "grad_norm": 2.654916524887085, + "learning_rate": 2.6105470953186688e-05, + "loss": 0.9004, + "step": 8474 + }, + { + "epoch": 4.780033840947547, + "grad_norm": 1.2842161655426025, + "learning_rate": 2.6102650874224477e-05, + "loss": 0.8089, + "step": 8475 + }, + { + "epoch": 4.7805978567399885, + "grad_norm": 0.9484731554985046, + "learning_rate": 2.609983079526227e-05, + "loss": 0.6285, + "step": 8476 + }, + { + "epoch": 4.781161872532431, + "grad_norm": 1.0060535669326782, + "learning_rate": 2.6097010716300058e-05, + "loss": 0.7174, + "step": 8477 + }, + { + "epoch": 4.781725888324873, + "grad_norm": 1.1260219812393188, + "learning_rate": 2.6094190637337846e-05, + "loss": 0.7248, + "step": 8478 + }, + { + "epoch": 4.782289904117315, + "grad_norm": 1.4597365856170654, + "learning_rate": 2.609137055837564e-05, + "loss": 0.811, + "step": 8479 + }, + { + "epoch": 4.7828539199097575, + "grad_norm": 1.58738374710083, + "learning_rate": 2.6088550479413427e-05, + "loss": 0.685, + "step": 8480 + }, + { + "epoch": 4.783417935702199, + "grad_norm": 1.4153696298599243, + "learning_rate": 2.6085730400451213e-05, + "loss": 0.7819, + "step": 8481 + }, + { + "epoch": 4.783981951494642, + "grad_norm": 1.4631530046463013, + "learning_rate": 2.6082910321489e-05, + "loss": 0.8192, + "step": 8482 + }, + { + "epoch": 4.784545967287084, + "grad_norm": 0.9899834394454956, + "learning_rate": 2.6080090242526794e-05, + "loss": 0.8194, + "step": 8483 + }, + { + "epoch": 4.7851099830795265, + "grad_norm": 1.3146851062774658, + "learning_rate": 2.6077270163564583e-05, + "loss": 0.7116, + "step": 8484 + }, + { + "epoch": 4.785673998871968, + "grad_norm": 1.7345716953277588, + "learning_rate": 2.607445008460237e-05, + "loss": 0.8959, + "step": 8485 + }, + { + "epoch": 4.786238014664411, + "grad_norm": 1.5104764699935913, + "learning_rate": 2.6071630005640157e-05, + "loss": 0.8715, + "step": 8486 + }, + { + "epoch": 4.786802030456853, + "grad_norm": 1.2024742364883423, + "learning_rate": 2.606880992667795e-05, + "loss": 0.7639, + "step": 8487 + }, + { + "epoch": 4.787366046249295, + "grad_norm": 0.9514695405960083, + "learning_rate": 2.6065989847715738e-05, + "loss": 0.7891, + "step": 8488 + }, + { + "epoch": 4.787930062041737, + "grad_norm": 1.5120248794555664, + "learning_rate": 2.6063169768753527e-05, + "loss": 0.8032, + "step": 8489 + }, + { + "epoch": 4.788494077834179, + "grad_norm": 0.9601136445999146, + "learning_rate": 2.6060349689791312e-05, + "loss": 0.6825, + "step": 8490 + }, + { + "epoch": 4.789058093626622, + "grad_norm": 0.9867238998413086, + "learning_rate": 2.6057529610829108e-05, + "loss": 0.8017, + "step": 8491 + }, + { + "epoch": 4.789622109419064, + "grad_norm": 1.0633903741836548, + "learning_rate": 2.6054709531866893e-05, + "loss": 0.7425, + "step": 8492 + }, + { + "epoch": 4.790186125211505, + "grad_norm": 1.5554934740066528, + "learning_rate": 2.6051889452904682e-05, + "loss": 0.8271, + "step": 8493 + }, + { + "epoch": 4.790750141003948, + "grad_norm": 1.3297556638717651, + "learning_rate": 2.604906937394247e-05, + "loss": 0.7357, + "step": 8494 + }, + { + "epoch": 4.79131415679639, + "grad_norm": 1.5277395248413086, + "learning_rate": 2.6046249294980263e-05, + "loss": 0.6912, + "step": 8495 + }, + { + "epoch": 4.791878172588833, + "grad_norm": 1.0661300420761108, + "learning_rate": 2.604342921601805e-05, + "loss": 0.7624, + "step": 8496 + }, + { + "epoch": 4.7924421883812744, + "grad_norm": 1.3252222537994385, + "learning_rate": 2.6040609137055837e-05, + "loss": 0.7977, + "step": 8497 + }, + { + "epoch": 4.793006204173717, + "grad_norm": 1.5643147230148315, + "learning_rate": 2.6037789058093626e-05, + "loss": 0.7388, + "step": 8498 + }, + { + "epoch": 4.793570219966159, + "grad_norm": 1.4848493337631226, + "learning_rate": 2.6034968979131418e-05, + "loss": 0.8024, + "step": 8499 + }, + { + "epoch": 4.794134235758602, + "grad_norm": 1.0616700649261475, + "learning_rate": 2.6032148900169207e-05, + "loss": 0.7503, + "step": 8500 + }, + { + "epoch": 4.7946982515510435, + "grad_norm": 1.4627795219421387, + "learning_rate": 2.6029328821206996e-05, + "loss": 0.7603, + "step": 8501 + }, + { + "epoch": 4.795262267343485, + "grad_norm": 0.7167789340019226, + "learning_rate": 2.602650874224478e-05, + "loss": 0.6219, + "step": 8502 + }, + { + "epoch": 4.795826283135928, + "grad_norm": 1.1787701845169067, + "learning_rate": 2.6023688663282577e-05, + "loss": 0.7263, + "step": 8503 + }, + { + "epoch": 4.79639029892837, + "grad_norm": 1.496411681175232, + "learning_rate": 2.6020868584320362e-05, + "loss": 0.6259, + "step": 8504 + }, + { + "epoch": 4.7969543147208125, + "grad_norm": 1.1113601922988892, + "learning_rate": 2.601804850535815e-05, + "loss": 0.8587, + "step": 8505 + }, + { + "epoch": 4.797518330513254, + "grad_norm": 1.2622648477554321, + "learning_rate": 2.6015228426395936e-05, + "loss": 0.8293, + "step": 8506 + }, + { + "epoch": 4.798082346305696, + "grad_norm": 1.338503360748291, + "learning_rate": 2.6012408347433732e-05, + "loss": 0.7579, + "step": 8507 + }, + { + "epoch": 4.798646362098139, + "grad_norm": 1.1500097513198853, + "learning_rate": 2.6009588268471517e-05, + "loss": 0.738, + "step": 8508 + }, + { + "epoch": 4.799210377890581, + "grad_norm": 1.3278197050094604, + "learning_rate": 2.6006768189509306e-05, + "loss": 0.8105, + "step": 8509 + }, + { + "epoch": 4.799774393683023, + "grad_norm": 1.4773520231246948, + "learning_rate": 2.6003948110547095e-05, + "loss": 0.8355, + "step": 8510 + }, + { + "epoch": 4.800338409475465, + "grad_norm": 0.9132173657417297, + "learning_rate": 2.6001128031584887e-05, + "loss": 0.7649, + "step": 8511 + }, + { + "epoch": 4.800902425267908, + "grad_norm": 1.3070757389068604, + "learning_rate": 2.5998307952622676e-05, + "loss": 0.693, + "step": 8512 + }, + { + "epoch": 4.80146644106035, + "grad_norm": 1.3921736478805542, + "learning_rate": 2.599548787366046e-05, + "loss": 0.8482, + "step": 8513 + }, + { + "epoch": 4.802030456852792, + "grad_norm": 1.460697054862976, + "learning_rate": 2.5992667794698257e-05, + "loss": 0.8336, + "step": 8514 + }, + { + "epoch": 4.802594472645234, + "grad_norm": 0.9282457232475281, + "learning_rate": 2.5989847715736042e-05, + "loss": 0.7641, + "step": 8515 + }, + { + "epoch": 4.803158488437676, + "grad_norm": 1.0860282182693481, + "learning_rate": 2.598702763677383e-05, + "loss": 0.7582, + "step": 8516 + }, + { + "epoch": 4.803722504230119, + "grad_norm": 1.799089789390564, + "learning_rate": 2.598420755781162e-05, + "loss": 0.8729, + "step": 8517 + }, + { + "epoch": 4.80428652002256, + "grad_norm": 1.2686829566955566, + "learning_rate": 2.5981387478849412e-05, + "loss": 0.682, + "step": 8518 + }, + { + "epoch": 4.804850535815003, + "grad_norm": 0.9797212481498718, + "learning_rate": 2.59785673998872e-05, + "loss": 0.6614, + "step": 8519 + }, + { + "epoch": 4.805414551607445, + "grad_norm": 1.2144991159439087, + "learning_rate": 2.5975747320924986e-05, + "loss": 0.8262, + "step": 8520 + }, + { + "epoch": 4.805978567399887, + "grad_norm": 1.2771337032318115, + "learning_rate": 2.5972927241962775e-05, + "loss": 0.8119, + "step": 8521 + }, + { + "epoch": 4.806542583192329, + "grad_norm": 1.0628178119659424, + "learning_rate": 2.5970107163000567e-05, + "loss": 0.7947, + "step": 8522 + }, + { + "epoch": 4.807106598984771, + "grad_norm": 1.1782500743865967, + "learning_rate": 2.5967287084038356e-05, + "loss": 0.671, + "step": 8523 + }, + { + "epoch": 4.807670614777214, + "grad_norm": 0.8782236576080322, + "learning_rate": 2.596446700507614e-05, + "loss": 0.7443, + "step": 8524 + }, + { + "epoch": 4.808234630569656, + "grad_norm": 1.0194634199142456, + "learning_rate": 2.596164692611393e-05, + "loss": 0.6527, + "step": 8525 + }, + { + "epoch": 4.808798646362098, + "grad_norm": 1.0706137418746948, + "learning_rate": 2.5958826847151722e-05, + "loss": 0.6753, + "step": 8526 + }, + { + "epoch": 4.80936266215454, + "grad_norm": 1.1406104564666748, + "learning_rate": 2.595600676818951e-05, + "loss": 0.6464, + "step": 8527 + }, + { + "epoch": 4.809926677946983, + "grad_norm": 1.4066134691238403, + "learning_rate": 2.59531866892273e-05, + "loss": 0.8289, + "step": 8528 + }, + { + "epoch": 4.810490693739425, + "grad_norm": 1.5226811170578003, + "learning_rate": 2.5950366610265085e-05, + "loss": 0.7912, + "step": 8529 + }, + { + "epoch": 4.8110547095318665, + "grad_norm": 1.0254241228103638, + "learning_rate": 2.594754653130288e-05, + "loss": 0.6805, + "step": 8530 + }, + { + "epoch": 4.811618725324309, + "grad_norm": 1.1528512239456177, + "learning_rate": 2.5944726452340666e-05, + "loss": 0.7987, + "step": 8531 + }, + { + "epoch": 4.812182741116751, + "grad_norm": 1.5120866298675537, + "learning_rate": 2.5941906373378455e-05, + "loss": 0.7636, + "step": 8532 + }, + { + "epoch": 4.812746756909194, + "grad_norm": 1.2311291694641113, + "learning_rate": 2.5939086294416244e-05, + "loss": 0.6747, + "step": 8533 + }, + { + "epoch": 4.8133107727016355, + "grad_norm": 0.9442390203475952, + "learning_rate": 2.5936266215454036e-05, + "loss": 0.6436, + "step": 8534 + }, + { + "epoch": 4.813874788494077, + "grad_norm": 1.2329139709472656, + "learning_rate": 2.5933446136491825e-05, + "loss": 0.7551, + "step": 8535 + }, + { + "epoch": 4.81443880428652, + "grad_norm": 1.311579704284668, + "learning_rate": 2.593062605752961e-05, + "loss": 0.7167, + "step": 8536 + }, + { + "epoch": 4.815002820078962, + "grad_norm": 0.8026832342147827, + "learning_rate": 2.59278059785674e-05, + "loss": 0.6946, + "step": 8537 + }, + { + "epoch": 4.8155668358714045, + "grad_norm": 1.126634120941162, + "learning_rate": 2.592498589960519e-05, + "loss": 0.8559, + "step": 8538 + }, + { + "epoch": 4.816130851663846, + "grad_norm": 1.1531713008880615, + "learning_rate": 2.592216582064298e-05, + "loss": 0.8127, + "step": 8539 + }, + { + "epoch": 4.816694867456289, + "grad_norm": 1.2940328121185303, + "learning_rate": 2.591934574168077e-05, + "loss": 0.79, + "step": 8540 + }, + { + "epoch": 4.817258883248731, + "grad_norm": 1.592140793800354, + "learning_rate": 2.5916525662718554e-05, + "loss": 0.7606, + "step": 8541 + }, + { + "epoch": 4.817822899041174, + "grad_norm": 1.0140674114227295, + "learning_rate": 2.5913705583756346e-05, + "loss": 0.7285, + "step": 8542 + }, + { + "epoch": 4.818386914833615, + "grad_norm": 2.0414600372314453, + "learning_rate": 2.5910885504794135e-05, + "loss": 0.8682, + "step": 8543 + }, + { + "epoch": 4.818950930626057, + "grad_norm": 1.3873409032821655, + "learning_rate": 2.5908065425831924e-05, + "loss": 0.8804, + "step": 8544 + }, + { + "epoch": 4.8195149464185, + "grad_norm": 1.4280030727386475, + "learning_rate": 2.590524534686971e-05, + "loss": 0.7301, + "step": 8545 + }, + { + "epoch": 4.820078962210942, + "grad_norm": 0.9744502902030945, + "learning_rate": 2.5902425267907505e-05, + "loss": 0.749, + "step": 8546 + }, + { + "epoch": 4.820642978003384, + "grad_norm": 1.262412190437317, + "learning_rate": 2.589960518894529e-05, + "loss": 0.737, + "step": 8547 + }, + { + "epoch": 4.821206993795826, + "grad_norm": 1.4548990726470947, + "learning_rate": 2.589678510998308e-05, + "loss": 0.8016, + "step": 8548 + }, + { + "epoch": 4.821771009588268, + "grad_norm": 1.1220097541809082, + "learning_rate": 2.589396503102087e-05, + "loss": 0.7189, + "step": 8549 + }, + { + "epoch": 4.822335025380711, + "grad_norm": 1.4856524467468262, + "learning_rate": 2.589114495205866e-05, + "loss": 0.7895, + "step": 8550 + }, + { + "epoch": 4.8228990411731525, + "grad_norm": 1.6927025318145752, + "learning_rate": 2.588832487309645e-05, + "loss": 0.781, + "step": 8551 + }, + { + "epoch": 4.823463056965595, + "grad_norm": 1.7999004125595093, + "learning_rate": 2.5885504794134234e-05, + "loss": 0.7917, + "step": 8552 + }, + { + "epoch": 4.824027072758037, + "grad_norm": 1.0805250406265259, + "learning_rate": 2.588268471517203e-05, + "loss": 0.7071, + "step": 8553 + }, + { + "epoch": 4.82459108855048, + "grad_norm": 1.076160192489624, + "learning_rate": 2.5879864636209815e-05, + "loss": 0.7606, + "step": 8554 + }, + { + "epoch": 4.8251551043429215, + "grad_norm": 1.0611588954925537, + "learning_rate": 2.5877044557247604e-05, + "loss": 0.7348, + "step": 8555 + }, + { + "epoch": 4.825719120135364, + "grad_norm": 1.2059351205825806, + "learning_rate": 2.5874224478285393e-05, + "loss": 0.7151, + "step": 8556 + }, + { + "epoch": 4.826283135927806, + "grad_norm": 1.3321630954742432, + "learning_rate": 2.5871404399323185e-05, + "loss": 0.8629, + "step": 8557 + }, + { + "epoch": 4.826847151720248, + "grad_norm": 1.0321564674377441, + "learning_rate": 2.5868584320360974e-05, + "loss": 0.7301, + "step": 8558 + }, + { + "epoch": 4.8274111675126905, + "grad_norm": 1.3892085552215576, + "learning_rate": 2.586576424139876e-05, + "loss": 0.8013, + "step": 8559 + }, + { + "epoch": 4.827975183305132, + "grad_norm": 0.9936296939849854, + "learning_rate": 2.5862944162436548e-05, + "loss": 0.7061, + "step": 8560 + }, + { + "epoch": 4.828539199097575, + "grad_norm": 1.05750572681427, + "learning_rate": 2.586012408347434e-05, + "loss": 0.699, + "step": 8561 + }, + { + "epoch": 4.829103214890017, + "grad_norm": 1.4457144737243652, + "learning_rate": 2.585730400451213e-05, + "loss": 0.8427, + "step": 8562 + }, + { + "epoch": 4.829667230682459, + "grad_norm": 1.3666253089904785, + "learning_rate": 2.5854483925549915e-05, + "loss": 0.8046, + "step": 8563 + }, + { + "epoch": 4.830231246474901, + "grad_norm": 1.5875850915908813, + "learning_rate": 2.5851663846587703e-05, + "loss": 0.8123, + "step": 8564 + }, + { + "epoch": 4.830795262267343, + "grad_norm": 0.8660316467285156, + "learning_rate": 2.5848843767625496e-05, + "loss": 0.6772, + "step": 8565 + }, + { + "epoch": 4.831359278059786, + "grad_norm": 1.3319480419158936, + "learning_rate": 2.5846023688663284e-05, + "loss": 0.7108, + "step": 8566 + }, + { + "epoch": 4.831923293852228, + "grad_norm": 1.4622020721435547, + "learning_rate": 2.5843203609701073e-05, + "loss": 0.7689, + "step": 8567 + }, + { + "epoch": 4.83248730964467, + "grad_norm": 1.2483749389648438, + "learning_rate": 2.584038353073886e-05, + "loss": 0.812, + "step": 8568 + }, + { + "epoch": 4.833051325437112, + "grad_norm": 1.0345476865768433, + "learning_rate": 2.5837563451776654e-05, + "loss": 0.7915, + "step": 8569 + }, + { + "epoch": 4.833615341229555, + "grad_norm": 1.186002254486084, + "learning_rate": 2.583474337281444e-05, + "loss": 0.7355, + "step": 8570 + }, + { + "epoch": 4.834179357021997, + "grad_norm": 0.9586919546127319, + "learning_rate": 2.5831923293852228e-05, + "loss": 0.77, + "step": 8571 + }, + { + "epoch": 4.8347433728144384, + "grad_norm": 1.1716231107711792, + "learning_rate": 2.5829103214890017e-05, + "loss": 0.7388, + "step": 8572 + }, + { + "epoch": 4.835307388606881, + "grad_norm": 1.4330315589904785, + "learning_rate": 2.582628313592781e-05, + "loss": 0.7991, + "step": 8573 + }, + { + "epoch": 4.835871404399323, + "grad_norm": 1.2394037246704102, + "learning_rate": 2.5823463056965598e-05, + "loss": 0.7738, + "step": 8574 + }, + { + "epoch": 4.836435420191766, + "grad_norm": 1.0335001945495605, + "learning_rate": 2.5820642978003383e-05, + "loss": 0.8368, + "step": 8575 + }, + { + "epoch": 4.8369994359842075, + "grad_norm": 1.2842469215393066, + "learning_rate": 2.5817822899041172e-05, + "loss": 0.7909, + "step": 8576 + }, + { + "epoch": 4.837563451776649, + "grad_norm": 1.464725136756897, + "learning_rate": 2.5815002820078964e-05, + "loss": 0.8325, + "step": 8577 + }, + { + "epoch": 4.838127467569092, + "grad_norm": 1.308337688446045, + "learning_rate": 2.5812182741116753e-05, + "loss": 0.5796, + "step": 8578 + }, + { + "epoch": 4.838691483361534, + "grad_norm": 1.2496798038482666, + "learning_rate": 2.580936266215454e-05, + "loss": 0.727, + "step": 8579 + }, + { + "epoch": 4.8392554991539765, + "grad_norm": 0.9300209283828735, + "learning_rate": 2.5806542583192327e-05, + "loss": 0.713, + "step": 8580 + }, + { + "epoch": 4.839819514946418, + "grad_norm": 0.9459128975868225, + "learning_rate": 2.580372250423012e-05, + "loss": 0.722, + "step": 8581 + }, + { + "epoch": 4.840383530738861, + "grad_norm": 1.0024627447128296, + "learning_rate": 2.580090242526791e-05, + "loss": 0.7142, + "step": 8582 + }, + { + "epoch": 4.840947546531303, + "grad_norm": 1.313043475151062, + "learning_rate": 2.5798082346305697e-05, + "loss": 0.6977, + "step": 8583 + }, + { + "epoch": 4.8415115623237455, + "grad_norm": 1.2284607887268066, + "learning_rate": 2.579526226734349e-05, + "loss": 0.6078, + "step": 8584 + }, + { + "epoch": 4.842075578116187, + "grad_norm": 0.9615808725357056, + "learning_rate": 2.5792442188381278e-05, + "loss": 0.6895, + "step": 8585 + }, + { + "epoch": 4.842639593908629, + "grad_norm": 1.0345594882965088, + "learning_rate": 2.5789622109419064e-05, + "loss": 0.7852, + "step": 8586 + }, + { + "epoch": 4.843203609701072, + "grad_norm": 1.013258457183838, + "learning_rate": 2.5786802030456852e-05, + "loss": 0.7019, + "step": 8587 + }, + { + "epoch": 4.843767625493514, + "grad_norm": 0.9607200622558594, + "learning_rate": 2.5783981951494645e-05, + "loss": 0.6706, + "step": 8588 + }, + { + "epoch": 4.844331641285956, + "grad_norm": 1.5235910415649414, + "learning_rate": 2.5781161872532433e-05, + "loss": 0.7997, + "step": 8589 + }, + { + "epoch": 4.844895657078398, + "grad_norm": 1.0613291263580322, + "learning_rate": 2.5778341793570222e-05, + "loss": 0.7353, + "step": 8590 + }, + { + "epoch": 4.84545967287084, + "grad_norm": 1.7105318307876587, + "learning_rate": 2.5775521714608008e-05, + "loss": 0.8435, + "step": 8591 + }, + { + "epoch": 4.846023688663283, + "grad_norm": 2.2690415382385254, + "learning_rate": 2.5772701635645803e-05, + "loss": 0.7647, + "step": 8592 + }, + { + "epoch": 4.846587704455724, + "grad_norm": 1.0518708229064941, + "learning_rate": 2.576988155668359e-05, + "loss": 0.7198, + "step": 8593 + }, + { + "epoch": 4.847151720248167, + "grad_norm": 0.7670772671699524, + "learning_rate": 2.5767061477721377e-05, + "loss": 0.6817, + "step": 8594 + }, + { + "epoch": 4.847715736040609, + "grad_norm": 0.8652122616767883, + "learning_rate": 2.5764241398759166e-05, + "loss": 0.6811, + "step": 8595 + }, + { + "epoch": 4.848279751833052, + "grad_norm": 1.807963490486145, + "learning_rate": 2.576142131979696e-05, + "loss": 0.8872, + "step": 8596 + }, + { + "epoch": 4.848843767625493, + "grad_norm": 1.1059740781784058, + "learning_rate": 2.5758601240834744e-05, + "loss": 0.7102, + "step": 8597 + }, + { + "epoch": 4.849407783417936, + "grad_norm": 1.1038377285003662, + "learning_rate": 2.5755781161872533e-05, + "loss": 0.7104, + "step": 8598 + }, + { + "epoch": 4.849971799210378, + "grad_norm": 1.6917085647583008, + "learning_rate": 2.575296108291032e-05, + "loss": 0.7664, + "step": 8599 + }, + { + "epoch": 4.85053581500282, + "grad_norm": 0.9338497519493103, + "learning_rate": 2.5750141003948114e-05, + "loss": 0.6549, + "step": 8600 + }, + { + "epoch": 4.851099830795262, + "grad_norm": 0.9144843816757202, + "learning_rate": 2.5747320924985902e-05, + "loss": 0.6423, + "step": 8601 + }, + { + "epoch": 4.851663846587704, + "grad_norm": 2.1159088611602783, + "learning_rate": 2.5744500846023688e-05, + "loss": 0.8751, + "step": 8602 + }, + { + "epoch": 4.852227862380147, + "grad_norm": 1.4756135940551758, + "learning_rate": 2.5741680767061477e-05, + "loss": 0.701, + "step": 8603 + }, + { + "epoch": 4.852791878172589, + "grad_norm": 3.7522621154785156, + "learning_rate": 2.573886068809927e-05, + "loss": 0.7973, + "step": 8604 + }, + { + "epoch": 4.8533558939650305, + "grad_norm": 1.238901138305664, + "learning_rate": 2.5736040609137058e-05, + "loss": 0.7253, + "step": 8605 + }, + { + "epoch": 4.853919909757473, + "grad_norm": 1.3194023370742798, + "learning_rate": 2.5733220530174846e-05, + "loss": 0.7998, + "step": 8606 + }, + { + "epoch": 4.854483925549915, + "grad_norm": 1.0171345472335815, + "learning_rate": 2.5730400451212632e-05, + "loss": 0.7681, + "step": 8607 + }, + { + "epoch": 4.855047941342358, + "grad_norm": 1.3345822095870972, + "learning_rate": 2.5727580372250427e-05, + "loss": 0.7535, + "step": 8608 + }, + { + "epoch": 4.8556119571347995, + "grad_norm": 1.2357665300369263, + "learning_rate": 2.5724760293288213e-05, + "loss": 0.7998, + "step": 8609 + }, + { + "epoch": 4.856175972927242, + "grad_norm": 0.9362502098083496, + "learning_rate": 2.5721940214326e-05, + "loss": 0.7168, + "step": 8610 + }, + { + "epoch": 4.856739988719684, + "grad_norm": 1.2307909727096558, + "learning_rate": 2.571912013536379e-05, + "loss": 0.7956, + "step": 8611 + }, + { + "epoch": 4.857304004512127, + "grad_norm": 1.7600772380828857, + "learning_rate": 2.5716300056401583e-05, + "loss": 0.9159, + "step": 8612 + }, + { + "epoch": 4.8578680203045685, + "grad_norm": 0.9788634777069092, + "learning_rate": 2.571347997743937e-05, + "loss": 0.766, + "step": 8613 + }, + { + "epoch": 4.85843203609701, + "grad_norm": 1.169327974319458, + "learning_rate": 2.5710659898477157e-05, + "loss": 0.7144, + "step": 8614 + }, + { + "epoch": 4.858996051889453, + "grad_norm": 0.9774419069290161, + "learning_rate": 2.5707839819514946e-05, + "loss": 0.7443, + "step": 8615 + }, + { + "epoch": 4.859560067681895, + "grad_norm": 1.4543566703796387, + "learning_rate": 2.5705019740552738e-05, + "loss": 0.7304, + "step": 8616 + }, + { + "epoch": 4.8601240834743376, + "grad_norm": 1.1384384632110596, + "learning_rate": 2.5702199661590526e-05, + "loss": 0.7511, + "step": 8617 + }, + { + "epoch": 4.860688099266779, + "grad_norm": 1.3525155782699585, + "learning_rate": 2.5699379582628312e-05, + "loss": 0.8675, + "step": 8618 + }, + { + "epoch": 4.861252115059221, + "grad_norm": 1.1669888496398926, + "learning_rate": 2.5696559503666107e-05, + "loss": 0.7229, + "step": 8619 + }, + { + "epoch": 4.861816130851664, + "grad_norm": 1.3279818296432495, + "learning_rate": 2.5693739424703893e-05, + "loss": 0.6913, + "step": 8620 + }, + { + "epoch": 4.862380146644106, + "grad_norm": 0.9562972187995911, + "learning_rate": 2.569091934574168e-05, + "loss": 0.6893, + "step": 8621 + }, + { + "epoch": 4.862944162436548, + "grad_norm": 1.2236138582229614, + "learning_rate": 2.568809926677947e-05, + "loss": 0.834, + "step": 8622 + }, + { + "epoch": 4.86350817822899, + "grad_norm": 1.18997323513031, + "learning_rate": 2.5685279187817263e-05, + "loss": 0.724, + "step": 8623 + }, + { + "epoch": 4.864072194021433, + "grad_norm": 1.1665364503860474, + "learning_rate": 2.568245910885505e-05, + "loss": 0.8415, + "step": 8624 + }, + { + "epoch": 4.864636209813875, + "grad_norm": 1.0614745616912842, + "learning_rate": 2.5679639029892837e-05, + "loss": 0.7532, + "step": 8625 + }, + { + "epoch": 4.865200225606317, + "grad_norm": 1.3122100830078125, + "learning_rate": 2.5676818950930626e-05, + "loss": 0.6257, + "step": 8626 + }, + { + "epoch": 4.865764241398759, + "grad_norm": 1.4592105150222778, + "learning_rate": 2.5673998871968418e-05, + "loss": 0.7054, + "step": 8627 + }, + { + "epoch": 4.866328257191201, + "grad_norm": 1.4174443483352661, + "learning_rate": 2.5671178793006207e-05, + "loss": 0.7635, + "step": 8628 + }, + { + "epoch": 4.866892272983644, + "grad_norm": 1.404818058013916, + "learning_rate": 2.5668358714043995e-05, + "loss": 0.7988, + "step": 8629 + }, + { + "epoch": 4.8674562887760855, + "grad_norm": 0.996178925037384, + "learning_rate": 2.566553863508178e-05, + "loss": 0.7044, + "step": 8630 + }, + { + "epoch": 4.868020304568528, + "grad_norm": 1.1859806776046753, + "learning_rate": 2.5662718556119576e-05, + "loss": 0.7447, + "step": 8631 + }, + { + "epoch": 4.86858432036097, + "grad_norm": 2.0087435245513916, + "learning_rate": 2.5659898477157362e-05, + "loss": 0.9009, + "step": 8632 + }, + { + "epoch": 4.869148336153412, + "grad_norm": 0.965545117855072, + "learning_rate": 2.565707839819515e-05, + "loss": 0.7212, + "step": 8633 + }, + { + "epoch": 4.8697123519458545, + "grad_norm": 1.3236390352249146, + "learning_rate": 2.565425831923294e-05, + "loss": 0.7307, + "step": 8634 + }, + { + "epoch": 4.870276367738296, + "grad_norm": 0.813263475894928, + "learning_rate": 2.565143824027073e-05, + "loss": 0.6347, + "step": 8635 + }, + { + "epoch": 4.870840383530739, + "grad_norm": 1.2470332384109497, + "learning_rate": 2.5648618161308517e-05, + "loss": 0.668, + "step": 8636 + }, + { + "epoch": 4.871404399323181, + "grad_norm": 1.2003228664398193, + "learning_rate": 2.5645798082346306e-05, + "loss": 0.7601, + "step": 8637 + }, + { + "epoch": 4.8719684151156235, + "grad_norm": 1.2697139978408813, + "learning_rate": 2.5642978003384095e-05, + "loss": 0.7869, + "step": 8638 + }, + { + "epoch": 4.872532430908065, + "grad_norm": 0.9687620997428894, + "learning_rate": 2.5640157924421887e-05, + "loss": 0.6858, + "step": 8639 + }, + { + "epoch": 4.873096446700508, + "grad_norm": 1.4591726064682007, + "learning_rate": 2.5637337845459676e-05, + "loss": 0.7408, + "step": 8640 + }, + { + "epoch": 4.87366046249295, + "grad_norm": 1.0367722511291504, + "learning_rate": 2.563451776649746e-05, + "loss": 0.7141, + "step": 8641 + }, + { + "epoch": 4.874224478285392, + "grad_norm": 1.4211281538009644, + "learning_rate": 2.563169768753525e-05, + "loss": 0.7177, + "step": 8642 + }, + { + "epoch": 4.874788494077834, + "grad_norm": 0.8774756193161011, + "learning_rate": 2.5628877608573042e-05, + "loss": 0.7312, + "step": 8643 + }, + { + "epoch": 4.875352509870276, + "grad_norm": 1.3678269386291504, + "learning_rate": 2.562605752961083e-05, + "loss": 0.7027, + "step": 8644 + }, + { + "epoch": 4.875916525662719, + "grad_norm": 1.0478872060775757, + "learning_rate": 2.562323745064862e-05, + "loss": 0.7115, + "step": 8645 + }, + { + "epoch": 4.876480541455161, + "grad_norm": 1.2785893678665161, + "learning_rate": 2.5620417371686405e-05, + "loss": 0.736, + "step": 8646 + }, + { + "epoch": 4.877044557247602, + "grad_norm": 0.955639660358429, + "learning_rate": 2.56175972927242e-05, + "loss": 0.7763, + "step": 8647 + }, + { + "epoch": 4.877608573040045, + "grad_norm": 1.3337478637695312, + "learning_rate": 2.5614777213761986e-05, + "loss": 0.8393, + "step": 8648 + }, + { + "epoch": 4.878172588832487, + "grad_norm": 1.0125869512557983, + "learning_rate": 2.5611957134799775e-05, + "loss": 0.7512, + "step": 8649 + }, + { + "epoch": 4.87873660462493, + "grad_norm": 1.1001927852630615, + "learning_rate": 2.5609137055837564e-05, + "loss": 0.787, + "step": 8650 + }, + { + "epoch": 4.8793006204173714, + "grad_norm": 0.9059483408927917, + "learning_rate": 2.5606316976875356e-05, + "loss": 0.7329, + "step": 8651 + }, + { + "epoch": 4.879864636209814, + "grad_norm": 1.5665078163146973, + "learning_rate": 2.5603496897913145e-05, + "loss": 0.8277, + "step": 8652 + }, + { + "epoch": 4.880428652002256, + "grad_norm": 1.2051738500595093, + "learning_rate": 2.560067681895093e-05, + "loss": 0.6827, + "step": 8653 + }, + { + "epoch": 4.880992667794699, + "grad_norm": 1.2425669431686401, + "learning_rate": 2.559785673998872e-05, + "loss": 0.7508, + "step": 8654 + }, + { + "epoch": 4.8815566835871405, + "grad_norm": 1.36454176902771, + "learning_rate": 2.559503666102651e-05, + "loss": 0.7827, + "step": 8655 + }, + { + "epoch": 4.882120699379582, + "grad_norm": 1.0295724868774414, + "learning_rate": 2.55922165820643e-05, + "loss": 0.8797, + "step": 8656 + }, + { + "epoch": 4.882684715172025, + "grad_norm": 1.1586560010910034, + "learning_rate": 2.5589396503102085e-05, + "loss": 0.8335, + "step": 8657 + }, + { + "epoch": 4.883248730964467, + "grad_norm": 1.3476825952529907, + "learning_rate": 2.558657642413988e-05, + "loss": 0.7982, + "step": 8658 + }, + { + "epoch": 4.8838127467569095, + "grad_norm": 1.5096218585968018, + "learning_rate": 2.5583756345177666e-05, + "loss": 0.8621, + "step": 8659 + }, + { + "epoch": 4.884376762549351, + "grad_norm": 1.3209880590438843, + "learning_rate": 2.5580936266215455e-05, + "loss": 0.7662, + "step": 8660 + }, + { + "epoch": 4.884940778341793, + "grad_norm": 1.1793674230575562, + "learning_rate": 2.5578116187253244e-05, + "loss": 0.7282, + "step": 8661 + }, + { + "epoch": 4.885504794134236, + "grad_norm": 0.8087019324302673, + "learning_rate": 2.5575296108291036e-05, + "loss": 0.5375, + "step": 8662 + }, + { + "epoch": 4.886068809926678, + "grad_norm": 0.9630120396614075, + "learning_rate": 2.5572476029328825e-05, + "loss": 0.6705, + "step": 8663 + }, + { + "epoch": 4.88663282571912, + "grad_norm": 1.6011461019515991, + "learning_rate": 2.556965595036661e-05, + "loss": 0.8548, + "step": 8664 + }, + { + "epoch": 4.887196841511562, + "grad_norm": 1.1817275285720825, + "learning_rate": 2.55668358714044e-05, + "loss": 0.7709, + "step": 8665 + }, + { + "epoch": 4.887760857304005, + "grad_norm": 1.4901785850524902, + "learning_rate": 2.556401579244219e-05, + "loss": 0.7319, + "step": 8666 + }, + { + "epoch": 4.888324873096447, + "grad_norm": 1.8241621255874634, + "learning_rate": 2.556119571347998e-05, + "loss": 0.8342, + "step": 8667 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 1.6322011947631836, + "learning_rate": 2.555837563451777e-05, + "loss": 0.9153, + "step": 8668 + }, + { + "epoch": 4.889452904681331, + "grad_norm": 1.5823683738708496, + "learning_rate": 2.5555555555555554e-05, + "loss": 0.8519, + "step": 8669 + }, + { + "epoch": 4.890016920473773, + "grad_norm": 2.6129913330078125, + "learning_rate": 2.555273547659335e-05, + "loss": 0.9215, + "step": 8670 + }, + { + "epoch": 4.890580936266216, + "grad_norm": 1.5211966037750244, + "learning_rate": 2.5549915397631135e-05, + "loss": 0.7521, + "step": 8671 + }, + { + "epoch": 4.891144952058657, + "grad_norm": 1.2516573667526245, + "learning_rate": 2.5547095318668924e-05, + "loss": 0.7441, + "step": 8672 + }, + { + "epoch": 4.8917089678511, + "grad_norm": 1.9419206380844116, + "learning_rate": 2.554427523970671e-05, + "loss": 0.7198, + "step": 8673 + }, + { + "epoch": 4.892272983643542, + "grad_norm": 1.6414803266525269, + "learning_rate": 2.5541455160744505e-05, + "loss": 0.8347, + "step": 8674 + }, + { + "epoch": 4.892836999435984, + "grad_norm": 1.3007522821426392, + "learning_rate": 2.553863508178229e-05, + "loss": 0.7452, + "step": 8675 + }, + { + "epoch": 4.893401015228426, + "grad_norm": 1.4933249950408936, + "learning_rate": 2.553581500282008e-05, + "loss": 0.7773, + "step": 8676 + }, + { + "epoch": 4.893965031020868, + "grad_norm": 1.5695794820785522, + "learning_rate": 2.5532994923857868e-05, + "loss": 0.8284, + "step": 8677 + }, + { + "epoch": 4.894529046813311, + "grad_norm": 1.1415531635284424, + "learning_rate": 2.553017484489566e-05, + "loss": 0.6608, + "step": 8678 + }, + { + "epoch": 4.895093062605753, + "grad_norm": 0.9139989614486694, + "learning_rate": 2.552735476593345e-05, + "loss": 0.7296, + "step": 8679 + }, + { + "epoch": 4.895657078398195, + "grad_norm": 1.4966639280319214, + "learning_rate": 2.5524534686971234e-05, + "loss": 0.7947, + "step": 8680 + }, + { + "epoch": 4.896221094190637, + "grad_norm": 1.5466331243515015, + "learning_rate": 2.5521714608009023e-05, + "loss": 0.6904, + "step": 8681 + }, + { + "epoch": 4.89678510998308, + "grad_norm": 1.3796169757843018, + "learning_rate": 2.5518894529046815e-05, + "loss": 0.7653, + "step": 8682 + }, + { + "epoch": 4.897349125775522, + "grad_norm": 1.3743584156036377, + "learning_rate": 2.5516074450084604e-05, + "loss": 0.8554, + "step": 8683 + }, + { + "epoch": 4.8979131415679635, + "grad_norm": 1.6595840454101562, + "learning_rate": 2.5513254371122393e-05, + "loss": 0.849, + "step": 8684 + }, + { + "epoch": 4.898477157360406, + "grad_norm": 1.4092721939086914, + "learning_rate": 2.5510434292160178e-05, + "loss": 0.8488, + "step": 8685 + }, + { + "epoch": 4.899041173152848, + "grad_norm": 1.1218682527542114, + "learning_rate": 2.5507614213197974e-05, + "loss": 0.7611, + "step": 8686 + }, + { + "epoch": 4.899605188945291, + "grad_norm": 1.040789246559143, + "learning_rate": 2.550479413423576e-05, + "loss": 0.6168, + "step": 8687 + }, + { + "epoch": 4.9001692047377325, + "grad_norm": 1.4648280143737793, + "learning_rate": 2.5501974055273548e-05, + "loss": 0.7167, + "step": 8688 + }, + { + "epoch": 4.900733220530174, + "grad_norm": 1.0190412998199463, + "learning_rate": 2.5499153976311337e-05, + "loss": 0.6318, + "step": 8689 + }, + { + "epoch": 4.901297236322617, + "grad_norm": 1.1582025289535522, + "learning_rate": 2.549633389734913e-05, + "loss": 0.8129, + "step": 8690 + }, + { + "epoch": 4.901861252115059, + "grad_norm": 1.0544190406799316, + "learning_rate": 2.5493513818386914e-05, + "loss": 0.7522, + "step": 8691 + }, + { + "epoch": 4.9024252679075015, + "grad_norm": 1.1538805961608887, + "learning_rate": 2.5490693739424703e-05, + "loss": 0.7453, + "step": 8692 + }, + { + "epoch": 4.902989283699943, + "grad_norm": 1.1750432252883911, + "learning_rate": 2.5487873660462495e-05, + "loss": 0.6981, + "step": 8693 + }, + { + "epoch": 4.903553299492386, + "grad_norm": 1.176182746887207, + "learning_rate": 2.5485053581500284e-05, + "loss": 0.7471, + "step": 8694 + }, + { + "epoch": 4.904117315284828, + "grad_norm": 1.5049924850463867, + "learning_rate": 2.5482233502538073e-05, + "loss": 0.8414, + "step": 8695 + }, + { + "epoch": 4.904681331077271, + "grad_norm": 0.9979784488677979, + "learning_rate": 2.547941342357586e-05, + "loss": 0.7184, + "step": 8696 + }, + { + "epoch": 4.905245346869712, + "grad_norm": 1.06387197971344, + "learning_rate": 2.5476593344613654e-05, + "loss": 0.6659, + "step": 8697 + }, + { + "epoch": 4.905809362662154, + "grad_norm": 0.9830855131149292, + "learning_rate": 2.547377326565144e-05, + "loss": 0.6969, + "step": 8698 + }, + { + "epoch": 4.906373378454597, + "grad_norm": 1.5114030838012695, + "learning_rate": 2.5470953186689228e-05, + "loss": 0.8086, + "step": 8699 + }, + { + "epoch": 4.906937394247039, + "grad_norm": 0.9378460049629211, + "learning_rate": 2.5468133107727017e-05, + "loss": 0.6723, + "step": 8700 + }, + { + "epoch": 4.907501410039481, + "grad_norm": 1.6734063625335693, + "learning_rate": 2.546531302876481e-05, + "loss": 0.8237, + "step": 8701 + }, + { + "epoch": 4.908065425831923, + "grad_norm": 0.933930516242981, + "learning_rate": 2.5462492949802598e-05, + "loss": 0.7449, + "step": 8702 + }, + { + "epoch": 4.908629441624365, + "grad_norm": 1.2736023664474487, + "learning_rate": 2.5459672870840383e-05, + "loss": 0.8308, + "step": 8703 + }, + { + "epoch": 4.909193457416808, + "grad_norm": 1.091691255569458, + "learning_rate": 2.5456852791878172e-05, + "loss": 0.6693, + "step": 8704 + }, + { + "epoch": 4.9097574732092495, + "grad_norm": 1.2950390577316284, + "learning_rate": 2.5454032712915964e-05, + "loss": 0.7473, + "step": 8705 + }, + { + "epoch": 4.910321489001692, + "grad_norm": 1.8437741994857788, + "learning_rate": 2.5451212633953753e-05, + "loss": 0.8455, + "step": 8706 + }, + { + "epoch": 4.910885504794134, + "grad_norm": 1.1964832544326782, + "learning_rate": 2.5448392554991542e-05, + "loss": 0.7519, + "step": 8707 + }, + { + "epoch": 4.911449520586577, + "grad_norm": 1.462019443511963, + "learning_rate": 2.5445572476029327e-05, + "loss": 0.7186, + "step": 8708 + }, + { + "epoch": 4.9120135363790185, + "grad_norm": 1.1790223121643066, + "learning_rate": 2.544275239706712e-05, + "loss": 0.7055, + "step": 8709 + }, + { + "epoch": 4.912577552171461, + "grad_norm": 1.5493813753128052, + "learning_rate": 2.5439932318104908e-05, + "loss": 0.8306, + "step": 8710 + }, + { + "epoch": 4.913141567963903, + "grad_norm": 0.8104262948036194, + "learning_rate": 2.5437112239142697e-05, + "loss": 0.6734, + "step": 8711 + }, + { + "epoch": 4.913705583756345, + "grad_norm": 1.3909037113189697, + "learning_rate": 2.5434292160180482e-05, + "loss": 0.7642, + "step": 8712 + }, + { + "epoch": 4.9142695995487875, + "grad_norm": 1.0690754652023315, + "learning_rate": 2.5431472081218278e-05, + "loss": 0.7193, + "step": 8713 + }, + { + "epoch": 4.914833615341229, + "grad_norm": 0.7911039590835571, + "learning_rate": 2.5428652002256063e-05, + "loss": 0.6051, + "step": 8714 + }, + { + "epoch": 4.915397631133672, + "grad_norm": 0.9880651831626892, + "learning_rate": 2.5425831923293852e-05, + "loss": 0.6553, + "step": 8715 + }, + { + "epoch": 4.915961646926114, + "grad_norm": 1.5401091575622559, + "learning_rate": 2.542301184433164e-05, + "loss": 0.8017, + "step": 8716 + }, + { + "epoch": 4.916525662718556, + "grad_norm": 1.2112902402877808, + "learning_rate": 2.5420191765369433e-05, + "loss": 0.7901, + "step": 8717 + }, + { + "epoch": 4.917089678510998, + "grad_norm": 1.1308318376541138, + "learning_rate": 2.5417371686407222e-05, + "loss": 0.6744, + "step": 8718 + }, + { + "epoch": 4.91765369430344, + "grad_norm": 0.929852306842804, + "learning_rate": 2.5414551607445007e-05, + "loss": 0.6317, + "step": 8719 + }, + { + "epoch": 4.918217710095883, + "grad_norm": 0.9124884009361267, + "learning_rate": 2.5411731528482796e-05, + "loss": 0.7797, + "step": 8720 + }, + { + "epoch": 4.918781725888325, + "grad_norm": 0.9440932869911194, + "learning_rate": 2.540891144952059e-05, + "loss": 0.6614, + "step": 8721 + }, + { + "epoch": 4.919345741680767, + "grad_norm": 1.1849361658096313, + "learning_rate": 2.5406091370558377e-05, + "loss": 0.6762, + "step": 8722 + }, + { + "epoch": 4.919909757473209, + "grad_norm": 1.224407434463501, + "learning_rate": 2.5403271291596166e-05, + "loss": 0.7348, + "step": 8723 + }, + { + "epoch": 4.920473773265652, + "grad_norm": 0.9188922643661499, + "learning_rate": 2.540045121263395e-05, + "loss": 0.7134, + "step": 8724 + }, + { + "epoch": 4.921037789058094, + "grad_norm": 1.0238187313079834, + "learning_rate": 2.5397631133671747e-05, + "loss": 0.7806, + "step": 8725 + }, + { + "epoch": 4.9216018048505354, + "grad_norm": 1.402449607849121, + "learning_rate": 2.5394811054709532e-05, + "loss": 0.711, + "step": 8726 + }, + { + "epoch": 4.922165820642978, + "grad_norm": 1.0320183038711548, + "learning_rate": 2.539199097574732e-05, + "loss": 0.6692, + "step": 8727 + }, + { + "epoch": 4.92272983643542, + "grad_norm": 1.4012349843978882, + "learning_rate": 2.5389170896785113e-05, + "loss": 0.7533, + "step": 8728 + }, + { + "epoch": 4.923293852227863, + "grad_norm": 2.132901668548584, + "learning_rate": 2.5386350817822902e-05, + "loss": 0.7927, + "step": 8729 + }, + { + "epoch": 4.9238578680203045, + "grad_norm": 1.1684684753417969, + "learning_rate": 2.5383530738860688e-05, + "loss": 0.7567, + "step": 8730 + }, + { + "epoch": 4.924421883812746, + "grad_norm": 1.2696044445037842, + "learning_rate": 2.5380710659898476e-05, + "loss": 0.7658, + "step": 8731 + }, + { + "epoch": 4.924985899605189, + "grad_norm": 1.3437992334365845, + "learning_rate": 2.537789058093627e-05, + "loss": 0.8645, + "step": 8732 + }, + { + "epoch": 4.925549915397631, + "grad_norm": 0.9397892355918884, + "learning_rate": 2.5375070501974057e-05, + "loss": 0.6355, + "step": 8733 + }, + { + "epoch": 4.9261139311900735, + "grad_norm": 1.1482993364334106, + "learning_rate": 2.5372250423011846e-05, + "loss": 0.7555, + "step": 8734 + }, + { + "epoch": 4.926677946982515, + "grad_norm": 0.7966890931129456, + "learning_rate": 2.536943034404963e-05, + "loss": 0.664, + "step": 8735 + }, + { + "epoch": 4.927241962774958, + "grad_norm": 1.0514470338821411, + "learning_rate": 2.5366610265087427e-05, + "loss": 0.6932, + "step": 8736 + }, + { + "epoch": 4.9278059785674, + "grad_norm": 0.8910326361656189, + "learning_rate": 2.5363790186125213e-05, + "loss": 0.7558, + "step": 8737 + }, + { + "epoch": 4.9283699943598425, + "grad_norm": 1.4246279001235962, + "learning_rate": 2.5360970107163e-05, + "loss": 0.7535, + "step": 8738 + }, + { + "epoch": 4.928934010152284, + "grad_norm": 1.3381470441818237, + "learning_rate": 2.535815002820079e-05, + "loss": 0.7823, + "step": 8739 + }, + { + "epoch": 4.929498025944726, + "grad_norm": 1.3575338125228882, + "learning_rate": 2.5355329949238582e-05, + "loss": 0.7162, + "step": 8740 + }, + { + "epoch": 4.930062041737169, + "grad_norm": 0.911263108253479, + "learning_rate": 2.535250987027637e-05, + "loss": 0.6601, + "step": 8741 + }, + { + "epoch": 4.930626057529611, + "grad_norm": 1.3713396787643433, + "learning_rate": 2.5349689791314157e-05, + "loss": 0.828, + "step": 8742 + }, + { + "epoch": 4.931190073322053, + "grad_norm": 1.9508373737335205, + "learning_rate": 2.5346869712351945e-05, + "loss": 0.9092, + "step": 8743 + }, + { + "epoch": 4.931754089114495, + "grad_norm": 1.3620996475219727, + "learning_rate": 2.5344049633389738e-05, + "loss": 0.8171, + "step": 8744 + }, + { + "epoch": 4.932318104906937, + "grad_norm": 1.2525253295898438, + "learning_rate": 2.5341229554427526e-05, + "loss": 0.7777, + "step": 8745 + }, + { + "epoch": 4.93288212069938, + "grad_norm": 0.9664528965950012, + "learning_rate": 2.5338409475465312e-05, + "loss": 0.696, + "step": 8746 + }, + { + "epoch": 4.933446136491821, + "grad_norm": 1.0757148265838623, + "learning_rate": 2.53355893965031e-05, + "loss": 0.6857, + "step": 8747 + }, + { + "epoch": 4.934010152284264, + "grad_norm": 1.1754429340362549, + "learning_rate": 2.5332769317540893e-05, + "loss": 0.8112, + "step": 8748 + }, + { + "epoch": 4.934574168076706, + "grad_norm": 1.4135185480117798, + "learning_rate": 2.532994923857868e-05, + "loss": 0.6986, + "step": 8749 + }, + { + "epoch": 4.935138183869149, + "grad_norm": 1.4107235670089722, + "learning_rate": 2.532712915961647e-05, + "loss": 0.7407, + "step": 8750 + }, + { + "epoch": 4.93570219966159, + "grad_norm": 1.1154340505599976, + "learning_rate": 2.5324309080654256e-05, + "loss": 0.7538, + "step": 8751 + }, + { + "epoch": 4.936266215454033, + "grad_norm": 1.1174674034118652, + "learning_rate": 2.532148900169205e-05, + "loss": 0.7065, + "step": 8752 + }, + { + "epoch": 4.936830231246475, + "grad_norm": 1.338051199913025, + "learning_rate": 2.5318668922729837e-05, + "loss": 0.7046, + "step": 8753 + }, + { + "epoch": 4.937394247038917, + "grad_norm": 3.020419120788574, + "learning_rate": 2.5315848843767625e-05, + "loss": 0.8844, + "step": 8754 + }, + { + "epoch": 4.937958262831359, + "grad_norm": 1.1641148328781128, + "learning_rate": 2.5313028764805414e-05, + "loss": 0.7597, + "step": 8755 + }, + { + "epoch": 4.938522278623801, + "grad_norm": 1.414749026298523, + "learning_rate": 2.5310208685843206e-05, + "loss": 0.8002, + "step": 8756 + }, + { + "epoch": 4.939086294416244, + "grad_norm": 1.1584999561309814, + "learning_rate": 2.5307388606880995e-05, + "loss": 0.6757, + "step": 8757 + }, + { + "epoch": 4.939650310208686, + "grad_norm": 1.0661929845809937, + "learning_rate": 2.530456852791878e-05, + "loss": 0.6741, + "step": 8758 + }, + { + "epoch": 4.940214326001128, + "grad_norm": 0.8558828830718994, + "learning_rate": 2.530174844895657e-05, + "loss": 0.7168, + "step": 8759 + }, + { + "epoch": 4.94077834179357, + "grad_norm": 1.0149145126342773, + "learning_rate": 2.529892836999436e-05, + "loss": 0.7008, + "step": 8760 + }, + { + "epoch": 4.941342357586012, + "grad_norm": 1.5794650316238403, + "learning_rate": 2.529610829103215e-05, + "loss": 0.863, + "step": 8761 + }, + { + "epoch": 4.941906373378455, + "grad_norm": 1.3963313102722168, + "learning_rate": 2.529328821206994e-05, + "loss": 0.7964, + "step": 8762 + }, + { + "epoch": 4.9424703891708965, + "grad_norm": 1.2719216346740723, + "learning_rate": 2.529046813310773e-05, + "loss": 0.7254, + "step": 8763 + }, + { + "epoch": 4.943034404963339, + "grad_norm": 0.8817718625068665, + "learning_rate": 2.5287648054145517e-05, + "loss": 0.7483, + "step": 8764 + }, + { + "epoch": 4.943598420755781, + "grad_norm": 0.8328838348388672, + "learning_rate": 2.5284827975183306e-05, + "loss": 0.6534, + "step": 8765 + }, + { + "epoch": 4.944162436548224, + "grad_norm": 1.4402120113372803, + "learning_rate": 2.5282007896221094e-05, + "loss": 0.8723, + "step": 8766 + }, + { + "epoch": 4.9447264523406655, + "grad_norm": 1.2627888917922974, + "learning_rate": 2.5279187817258887e-05, + "loss": 0.7791, + "step": 8767 + }, + { + "epoch": 4.945290468133107, + "grad_norm": 1.8214221000671387, + "learning_rate": 2.5276367738296675e-05, + "loss": 0.8559, + "step": 8768 + }, + { + "epoch": 4.94585448392555, + "grad_norm": 1.0843391418457031, + "learning_rate": 2.527354765933446e-05, + "loss": 0.7541, + "step": 8769 + }, + { + "epoch": 4.946418499717992, + "grad_norm": 1.491212010383606, + "learning_rate": 2.527072758037225e-05, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 4.9469825155104346, + "grad_norm": 2.264946699142456, + "learning_rate": 2.5267907501410042e-05, + "loss": 0.8684, + "step": 8771 + }, + { + "epoch": 4.947546531302876, + "grad_norm": 1.2059597969055176, + "learning_rate": 2.526508742244783e-05, + "loss": 0.6953, + "step": 8772 + }, + { + "epoch": 4.948110547095319, + "grad_norm": 0.8666388392448425, + "learning_rate": 2.526226734348562e-05, + "loss": 0.7411, + "step": 8773 + }, + { + "epoch": 4.948674562887761, + "grad_norm": 1.4589115381240845, + "learning_rate": 2.5259447264523405e-05, + "loss": 0.7679, + "step": 8774 + }, + { + "epoch": 4.949238578680203, + "grad_norm": 1.1440004110336304, + "learning_rate": 2.52566271855612e-05, + "loss": 0.8465, + "step": 8775 + }, + { + "epoch": 4.949802594472645, + "grad_norm": 0.9548677802085876, + "learning_rate": 2.5253807106598986e-05, + "loss": 0.6707, + "step": 8776 + }, + { + "epoch": 4.950366610265087, + "grad_norm": 1.2616122961044312, + "learning_rate": 2.5250987027636775e-05, + "loss": 0.6652, + "step": 8777 + }, + { + "epoch": 4.95093062605753, + "grad_norm": 0.7981822490692139, + "learning_rate": 2.5248166948674563e-05, + "loss": 0.6462, + "step": 8778 + }, + { + "epoch": 4.951494641849972, + "grad_norm": 1.3118023872375488, + "learning_rate": 2.5245346869712356e-05, + "loss": 0.7651, + "step": 8779 + }, + { + "epoch": 4.952058657642414, + "grad_norm": 1.4353606700897217, + "learning_rate": 2.5242526790750144e-05, + "loss": 0.7289, + "step": 8780 + }, + { + "epoch": 4.952622673434856, + "grad_norm": 1.2763041257858276, + "learning_rate": 2.523970671178793e-05, + "loss": 0.8847, + "step": 8781 + }, + { + "epoch": 4.953186689227298, + "grad_norm": 1.1065988540649414, + "learning_rate": 2.523688663282572e-05, + "loss": 0.7342, + "step": 8782 + }, + { + "epoch": 4.953750705019741, + "grad_norm": 1.0209336280822754, + "learning_rate": 2.523406655386351e-05, + "loss": 0.7537, + "step": 8783 + }, + { + "epoch": 4.9543147208121825, + "grad_norm": 1.0151599645614624, + "learning_rate": 2.52312464749013e-05, + "loss": 0.7661, + "step": 8784 + }, + { + "epoch": 4.954878736604625, + "grad_norm": 1.547844648361206, + "learning_rate": 2.5228426395939085e-05, + "loss": 0.6576, + "step": 8785 + }, + { + "epoch": 4.955442752397067, + "grad_norm": 1.132771372795105, + "learning_rate": 2.5225606316976874e-05, + "loss": 0.8013, + "step": 8786 + }, + { + "epoch": 4.95600676818951, + "grad_norm": 1.0372709035873413, + "learning_rate": 2.5222786238014666e-05, + "loss": 0.8022, + "step": 8787 + }, + { + "epoch": 4.9565707839819515, + "grad_norm": 1.1676138639450073, + "learning_rate": 2.5219966159052455e-05, + "loss": 0.6316, + "step": 8788 + }, + { + "epoch": 4.957134799774393, + "grad_norm": 1.0951265096664429, + "learning_rate": 2.5217146080090244e-05, + "loss": 0.7745, + "step": 8789 + }, + { + "epoch": 4.957698815566836, + "grad_norm": 0.9432402849197388, + "learning_rate": 2.521432600112803e-05, + "loss": 0.7293, + "step": 8790 + }, + { + "epoch": 4.958262831359278, + "grad_norm": 0.8183540105819702, + "learning_rate": 2.5211505922165825e-05, + "loss": 0.6522, + "step": 8791 + }, + { + "epoch": 4.9588268471517205, + "grad_norm": 0.9861758947372437, + "learning_rate": 2.520868584320361e-05, + "loss": 0.7757, + "step": 8792 + }, + { + "epoch": 4.959390862944162, + "grad_norm": 1.1674731969833374, + "learning_rate": 2.52058657642414e-05, + "loss": 0.7666, + "step": 8793 + }, + { + "epoch": 4.959954878736605, + "grad_norm": 1.2295864820480347, + "learning_rate": 2.5203045685279188e-05, + "loss": 0.6883, + "step": 8794 + }, + { + "epoch": 4.960518894529047, + "grad_norm": 1.3436484336853027, + "learning_rate": 2.520022560631698e-05, + "loss": 0.7314, + "step": 8795 + }, + { + "epoch": 4.961082910321489, + "grad_norm": 0.9656965136528015, + "learning_rate": 2.519740552735477e-05, + "loss": 0.6977, + "step": 8796 + }, + { + "epoch": 4.961646926113931, + "grad_norm": 1.5143771171569824, + "learning_rate": 2.5194585448392554e-05, + "loss": 0.8862, + "step": 8797 + }, + { + "epoch": 4.962210941906373, + "grad_norm": 1.2924107313156128, + "learning_rate": 2.519176536943035e-05, + "loss": 0.712, + "step": 8798 + }, + { + "epoch": 4.962774957698816, + "grad_norm": 1.1006306409835815, + "learning_rate": 2.5188945290468135e-05, + "loss": 0.7562, + "step": 8799 + }, + { + "epoch": 4.963338973491258, + "grad_norm": 1.4560986757278442, + "learning_rate": 2.5186125211505924e-05, + "loss": 0.7557, + "step": 8800 + }, + { + "epoch": 4.9639029892837, + "grad_norm": 1.4622077941894531, + "learning_rate": 2.5183305132543712e-05, + "loss": 0.8308, + "step": 8801 + }, + { + "epoch": 4.964467005076142, + "grad_norm": 1.2667019367218018, + "learning_rate": 2.5180485053581505e-05, + "loss": 0.7426, + "step": 8802 + }, + { + "epoch": 4.965031020868584, + "grad_norm": 1.7325456142425537, + "learning_rate": 2.517766497461929e-05, + "loss": 0.8899, + "step": 8803 + }, + { + "epoch": 4.965595036661027, + "grad_norm": 0.8907786011695862, + "learning_rate": 2.517484489565708e-05, + "loss": 0.7234, + "step": 8804 + }, + { + "epoch": 4.9661590524534684, + "grad_norm": 0.9446206092834473, + "learning_rate": 2.5172024816694868e-05, + "loss": 0.6567, + "step": 8805 + }, + { + "epoch": 4.966723068245911, + "grad_norm": 1.4261360168457031, + "learning_rate": 2.516920473773266e-05, + "loss": 0.7642, + "step": 8806 + }, + { + "epoch": 4.967287084038353, + "grad_norm": 1.7260234355926514, + "learning_rate": 2.516638465877045e-05, + "loss": 0.8192, + "step": 8807 + }, + { + "epoch": 4.967851099830796, + "grad_norm": 1.1515870094299316, + "learning_rate": 2.5163564579808234e-05, + "loss": 0.7242, + "step": 8808 + }, + { + "epoch": 4.9684151156232375, + "grad_norm": 1.57420015335083, + "learning_rate": 2.5160744500846023e-05, + "loss": 0.8429, + "step": 8809 + }, + { + "epoch": 4.968979131415679, + "grad_norm": 1.2307926416397095, + "learning_rate": 2.5157924421883815e-05, + "loss": 0.719, + "step": 8810 + }, + { + "epoch": 4.969543147208122, + "grad_norm": 1.3812905550003052, + "learning_rate": 2.5155104342921604e-05, + "loss": 0.7541, + "step": 8811 + }, + { + "epoch": 4.970107163000564, + "grad_norm": 1.2161121368408203, + "learning_rate": 2.5152284263959393e-05, + "loss": 0.7783, + "step": 8812 + }, + { + "epoch": 4.9706711787930065, + "grad_norm": 1.1385623216629028, + "learning_rate": 2.5149464184997178e-05, + "loss": 0.748, + "step": 8813 + }, + { + "epoch": 4.971235194585448, + "grad_norm": 1.3000956773757935, + "learning_rate": 2.5146644106034974e-05, + "loss": 0.7601, + "step": 8814 + }, + { + "epoch": 4.971799210377891, + "grad_norm": 1.6361198425292969, + "learning_rate": 2.514382402707276e-05, + "loss": 0.8257, + "step": 8815 + }, + { + "epoch": 4.972363226170333, + "grad_norm": 1.0883954763412476, + "learning_rate": 2.5141003948110548e-05, + "loss": 0.7389, + "step": 8816 + }, + { + "epoch": 4.972927241962775, + "grad_norm": 0.998385488986969, + "learning_rate": 2.5138183869148337e-05, + "loss": 0.7351, + "step": 8817 + }, + { + "epoch": 4.973491257755217, + "grad_norm": 0.9274818897247314, + "learning_rate": 2.513536379018613e-05, + "loss": 0.7521, + "step": 8818 + }, + { + "epoch": 4.974055273547659, + "grad_norm": 1.044885516166687, + "learning_rate": 2.5132543711223918e-05, + "loss": 0.7535, + "step": 8819 + }, + { + "epoch": 4.974619289340102, + "grad_norm": 1.0380045175552368, + "learning_rate": 2.5129723632261703e-05, + "loss": 0.7004, + "step": 8820 + }, + { + "epoch": 4.975183305132544, + "grad_norm": 0.7400694489479065, + "learning_rate": 2.5126903553299492e-05, + "loss": 0.5974, + "step": 8821 + }, + { + "epoch": 4.975747320924986, + "grad_norm": 1.2400734424591064, + "learning_rate": 2.5124083474337284e-05, + "loss": 0.7438, + "step": 8822 + }, + { + "epoch": 4.976311336717428, + "grad_norm": 0.940747082233429, + "learning_rate": 2.5121263395375073e-05, + "loss": 0.7979, + "step": 8823 + }, + { + "epoch": 4.97687535250987, + "grad_norm": 1.375603199005127, + "learning_rate": 2.5118443316412858e-05, + "loss": 0.8749, + "step": 8824 + }, + { + "epoch": 4.977439368302313, + "grad_norm": 1.5330191850662231, + "learning_rate": 2.5115623237450647e-05, + "loss": 0.7665, + "step": 8825 + }, + { + "epoch": 4.978003384094754, + "grad_norm": 1.6069761514663696, + "learning_rate": 2.511280315848844e-05, + "loss": 0.769, + "step": 8826 + }, + { + "epoch": 4.978567399887197, + "grad_norm": 1.4859812259674072, + "learning_rate": 2.5109983079526228e-05, + "loss": 0.8408, + "step": 8827 + }, + { + "epoch": 4.979131415679639, + "grad_norm": 1.4093375205993652, + "learning_rate": 2.5107163000564017e-05, + "loss": 0.766, + "step": 8828 + }, + { + "epoch": 4.979695431472082, + "grad_norm": 1.1776620149612427, + "learning_rate": 2.5104342921601802e-05, + "loss": 0.7858, + "step": 8829 + }, + { + "epoch": 4.980259447264523, + "grad_norm": 1.3401970863342285, + "learning_rate": 2.5101522842639598e-05, + "loss": 0.8085, + "step": 8830 + }, + { + "epoch": 4.980823463056965, + "grad_norm": 0.965613603591919, + "learning_rate": 2.5098702763677383e-05, + "loss": 0.7387, + "step": 8831 + }, + { + "epoch": 4.981387478849408, + "grad_norm": 1.0761812925338745, + "learning_rate": 2.5095882684715172e-05, + "loss": 0.7689, + "step": 8832 + }, + { + "epoch": 4.98195149464185, + "grad_norm": 1.1466773748397827, + "learning_rate": 2.5093062605752964e-05, + "loss": 0.7406, + "step": 8833 + }, + { + "epoch": 4.982515510434292, + "grad_norm": 0.9252434968948364, + "learning_rate": 2.5090242526790753e-05, + "loss": 0.6922, + "step": 8834 + }, + { + "epoch": 4.983079526226734, + "grad_norm": 1.017746090888977, + "learning_rate": 2.5087422447828542e-05, + "loss": 0.6482, + "step": 8835 + }, + { + "epoch": 4.983643542019177, + "grad_norm": 1.2243772745132446, + "learning_rate": 2.5084602368866327e-05, + "loss": 0.7758, + "step": 8836 + }, + { + "epoch": 4.984207557811619, + "grad_norm": 1.373437762260437, + "learning_rate": 2.5081782289904123e-05, + "loss": 0.8786, + "step": 8837 + }, + { + "epoch": 4.9847715736040605, + "grad_norm": 1.4386296272277832, + "learning_rate": 2.5078962210941908e-05, + "loss": 0.767, + "step": 8838 + }, + { + "epoch": 4.985335589396503, + "grad_norm": 1.193951964378357, + "learning_rate": 2.5076142131979697e-05, + "loss": 0.7054, + "step": 8839 + }, + { + "epoch": 4.985899605188945, + "grad_norm": 1.8451480865478516, + "learning_rate": 2.5073322053017482e-05, + "loss": 0.8128, + "step": 8840 + }, + { + "epoch": 4.986463620981388, + "grad_norm": 0.9533169865608215, + "learning_rate": 2.5070501974055278e-05, + "loss": 0.7543, + "step": 8841 + }, + { + "epoch": 4.9870276367738295, + "grad_norm": 1.0705950260162354, + "learning_rate": 2.5067681895093063e-05, + "loss": 0.8606, + "step": 8842 + }, + { + "epoch": 4.987591652566272, + "grad_norm": 0.9336458444595337, + "learning_rate": 2.5064861816130852e-05, + "loss": 0.7124, + "step": 8843 + }, + { + "epoch": 4.988155668358714, + "grad_norm": 1.2844367027282715, + "learning_rate": 2.506204173716864e-05, + "loss": 0.8881, + "step": 8844 + }, + { + "epoch": 4.988719684151156, + "grad_norm": 1.252480149269104, + "learning_rate": 2.5059221658206433e-05, + "loss": 0.7365, + "step": 8845 + }, + { + "epoch": 4.9892836999435985, + "grad_norm": 1.382333755493164, + "learning_rate": 2.5056401579244222e-05, + "loss": 0.7321, + "step": 8846 + }, + { + "epoch": 4.98984771573604, + "grad_norm": 1.1598821878433228, + "learning_rate": 2.5053581500282007e-05, + "loss": 0.7495, + "step": 8847 + }, + { + "epoch": 4.990411731528483, + "grad_norm": 1.1593519449234009, + "learning_rate": 2.5050761421319796e-05, + "loss": 0.7892, + "step": 8848 + }, + { + "epoch": 4.990975747320925, + "grad_norm": 1.0106353759765625, + "learning_rate": 2.5047941342357588e-05, + "loss": 0.6524, + "step": 8849 + }, + { + "epoch": 4.991539763113368, + "grad_norm": 0.9248888492584229, + "learning_rate": 2.5045121263395377e-05, + "loss": 0.7482, + "step": 8850 + }, + { + "epoch": 4.992103778905809, + "grad_norm": 1.7497389316558838, + "learning_rate": 2.5042301184433166e-05, + "loss": 0.8165, + "step": 8851 + }, + { + "epoch": 4.992667794698251, + "grad_norm": 1.230764389038086, + "learning_rate": 2.503948110547095e-05, + "loss": 0.845, + "step": 8852 + }, + { + "epoch": 4.993231810490694, + "grad_norm": 1.798249363899231, + "learning_rate": 2.5036661026508747e-05, + "loss": 0.8435, + "step": 8853 + }, + { + "epoch": 4.993795826283136, + "grad_norm": 1.4657868146896362, + "learning_rate": 2.5033840947546532e-05, + "loss": 0.7659, + "step": 8854 + }, + { + "epoch": 4.994359842075578, + "grad_norm": 1.1134436130523682, + "learning_rate": 2.503102086858432e-05, + "loss": 0.712, + "step": 8855 + }, + { + "epoch": 4.99492385786802, + "grad_norm": 1.1307345628738403, + "learning_rate": 2.502820078962211e-05, + "loss": 0.8517, + "step": 8856 + }, + { + "epoch": 4.995487873660463, + "grad_norm": 0.9914299249649048, + "learning_rate": 2.5025380710659902e-05, + "loss": 0.8193, + "step": 8857 + }, + { + "epoch": 4.996051889452905, + "grad_norm": 1.308624267578125, + "learning_rate": 2.5022560631697687e-05, + "loss": 0.8639, + "step": 8858 + }, + { + "epoch": 4.9966159052453465, + "grad_norm": 0.9576606750488281, + "learning_rate": 2.5019740552735476e-05, + "loss": 0.7045, + "step": 8859 + }, + { + "epoch": 4.997179921037789, + "grad_norm": 1.352341651916504, + "learning_rate": 2.5016920473773265e-05, + "loss": 0.6253, + "step": 8860 + }, + { + "epoch": 4.997743936830231, + "grad_norm": 1.2533316612243652, + "learning_rate": 2.5014100394811057e-05, + "loss": 0.7096, + "step": 8861 + }, + { + "epoch": 4.998307952622674, + "grad_norm": 1.5628637075424194, + "learning_rate": 2.5011280315848846e-05, + "loss": 0.8375, + "step": 8862 + }, + { + "epoch": 4.9988719684151155, + "grad_norm": 1.1409953832626343, + "learning_rate": 2.500846023688663e-05, + "loss": 0.7967, + "step": 8863 + }, + { + "epoch": 4.999435984207558, + "grad_norm": 1.3458105325698853, + "learning_rate": 2.500564015792442e-05, + "loss": 0.7367, + "step": 8864 + }, + { + "epoch": 5.0, + "grad_norm": 1.880089521408081, + "learning_rate": 2.5002820078962212e-05, + "loss": 0.606, + "step": 8865 + }, + { + "epoch": 5.000564015792442, + "grad_norm": 1.3038374185562134, + "learning_rate": 2.5e-05, + "loss": 0.7227, + "step": 8866 + }, + { + "epoch": 5.0011280315848845, + "grad_norm": 1.299896001815796, + "learning_rate": 2.499717992103779e-05, + "loss": 0.7225, + "step": 8867 + }, + { + "epoch": 5.001692047377326, + "grad_norm": 1.322834849357605, + "learning_rate": 2.499435984207558e-05, + "loss": 0.7784, + "step": 8868 + }, + { + "epoch": 5.002256063169769, + "grad_norm": 28.173442840576172, + "learning_rate": 2.499153976311337e-05, + "loss": 0.8438, + "step": 8869 + }, + { + "epoch": 5.002820078962211, + "grad_norm": 1.4014678001403809, + "learning_rate": 2.4988719684151156e-05, + "loss": 0.8106, + "step": 8870 + }, + { + "epoch": 5.0033840947546535, + "grad_norm": 1.0502835512161255, + "learning_rate": 2.498589960518895e-05, + "loss": 0.7846, + "step": 8871 + }, + { + "epoch": 5.003948110547095, + "grad_norm": 0.9669914245605469, + "learning_rate": 2.4983079526226734e-05, + "loss": 0.7677, + "step": 8872 + }, + { + "epoch": 5.004512126339537, + "grad_norm": 1.0007145404815674, + "learning_rate": 2.4980259447264526e-05, + "loss": 0.7161, + "step": 8873 + }, + { + "epoch": 5.00507614213198, + "grad_norm": 6.659524917602539, + "learning_rate": 2.4977439368302315e-05, + "loss": 0.8051, + "step": 8874 + }, + { + "epoch": 5.005640157924422, + "grad_norm": 0.9473444819450378, + "learning_rate": 2.4974619289340104e-05, + "loss": 0.7663, + "step": 8875 + }, + { + "epoch": 5.006204173716864, + "grad_norm": 1.2889963388442993, + "learning_rate": 2.4971799210377893e-05, + "loss": 0.8057, + "step": 8876 + }, + { + "epoch": 5.006768189509306, + "grad_norm": 1.0877364873886108, + "learning_rate": 2.496897913141568e-05, + "loss": 0.663, + "step": 8877 + }, + { + "epoch": 5.007332205301749, + "grad_norm": 1.112358808517456, + "learning_rate": 2.496615905245347e-05, + "loss": 0.7672, + "step": 8878 + }, + { + "epoch": 5.007896221094191, + "grad_norm": 0.8079631924629211, + "learning_rate": 2.496333897349126e-05, + "loss": 0.5979, + "step": 8879 + }, + { + "epoch": 5.008460236886632, + "grad_norm": 1.0755267143249512, + "learning_rate": 2.4960518894529048e-05, + "loss": 0.7047, + "step": 8880 + }, + { + "epoch": 5.009024252679075, + "grad_norm": 1.4378372430801392, + "learning_rate": 2.4957698815566837e-05, + "loss": 0.8036, + "step": 8881 + }, + { + "epoch": 5.009588268471517, + "grad_norm": 1.1779553890228271, + "learning_rate": 2.4954878736604625e-05, + "loss": 0.7872, + "step": 8882 + }, + { + "epoch": 5.01015228426396, + "grad_norm": 1.2122374773025513, + "learning_rate": 2.4952058657642418e-05, + "loss": 0.6567, + "step": 8883 + }, + { + "epoch": 5.0107163000564015, + "grad_norm": 1.0206527709960938, + "learning_rate": 2.4949238578680203e-05, + "loss": 0.7041, + "step": 8884 + }, + { + "epoch": 5.011280315848844, + "grad_norm": 0.9674568772315979, + "learning_rate": 2.4946418499717995e-05, + "loss": 0.7415, + "step": 8885 + }, + { + "epoch": 5.011844331641286, + "grad_norm": 1.2354025840759277, + "learning_rate": 2.494359842075578e-05, + "loss": 0.6129, + "step": 8886 + }, + { + "epoch": 5.012408347433728, + "grad_norm": 1.7446469068527222, + "learning_rate": 2.4940778341793573e-05, + "loss": 0.7619, + "step": 8887 + }, + { + "epoch": 5.0129723632261705, + "grad_norm": 1.026669979095459, + "learning_rate": 2.4937958262831358e-05, + "loss": 0.6878, + "step": 8888 + }, + { + "epoch": 5.013536379018612, + "grad_norm": 0.9105440378189087, + "learning_rate": 2.493513818386915e-05, + "loss": 0.6584, + "step": 8889 + }, + { + "epoch": 5.014100394811055, + "grad_norm": 1.252654790878296, + "learning_rate": 2.493231810490694e-05, + "loss": 0.8152, + "step": 8890 + }, + { + "epoch": 5.014664410603497, + "grad_norm": 1.1635215282440186, + "learning_rate": 2.4929498025944728e-05, + "loss": 0.748, + "step": 8891 + }, + { + "epoch": 5.0152284263959395, + "grad_norm": 1.5996134281158447, + "learning_rate": 2.4926677946982517e-05, + "loss": 0.7491, + "step": 8892 + }, + { + "epoch": 5.015792442188381, + "grad_norm": 1.7637224197387695, + "learning_rate": 2.4923857868020305e-05, + "loss": 0.8426, + "step": 8893 + }, + { + "epoch": 5.016356457980823, + "grad_norm": 1.6307517290115356, + "learning_rate": 2.4921037789058094e-05, + "loss": 0.8008, + "step": 8894 + }, + { + "epoch": 5.016920473773266, + "grad_norm": 0.8984596133232117, + "learning_rate": 2.4918217710095883e-05, + "loss": 0.5967, + "step": 8895 + }, + { + "epoch": 5.017484489565708, + "grad_norm": 1.4150127172470093, + "learning_rate": 2.4915397631133672e-05, + "loss": 0.8125, + "step": 8896 + }, + { + "epoch": 5.01804850535815, + "grad_norm": 1.4491039514541626, + "learning_rate": 2.491257755217146e-05, + "loss": 0.6895, + "step": 8897 + }, + { + "epoch": 5.018612521150592, + "grad_norm": 1.1263160705566406, + "learning_rate": 2.490975747320925e-05, + "loss": 0.7422, + "step": 8898 + }, + { + "epoch": 5.019176536943035, + "grad_norm": 2.4277679920196533, + "learning_rate": 2.490693739424704e-05, + "loss": 0.7419, + "step": 8899 + }, + { + "epoch": 5.019740552735477, + "grad_norm": 1.0620615482330322, + "learning_rate": 2.4904117315284827e-05, + "loss": 0.758, + "step": 8900 + }, + { + "epoch": 5.020304568527918, + "grad_norm": 1.4228039979934692, + "learning_rate": 2.490129723632262e-05, + "loss": 0.8448, + "step": 8901 + }, + { + "epoch": 5.020868584320361, + "grad_norm": 1.3416134119033813, + "learning_rate": 2.4898477157360408e-05, + "loss": 0.7081, + "step": 8902 + }, + { + "epoch": 5.021432600112803, + "grad_norm": 1.0377382040023804, + "learning_rate": 2.4895657078398197e-05, + "loss": 0.6442, + "step": 8903 + }, + { + "epoch": 5.021996615905246, + "grad_norm": 1.4393209218978882, + "learning_rate": 2.4892836999435986e-05, + "loss": 0.7566, + "step": 8904 + }, + { + "epoch": 5.022560631697687, + "grad_norm": 1.2026958465576172, + "learning_rate": 2.4890016920473774e-05, + "loss": 0.701, + "step": 8905 + }, + { + "epoch": 5.02312464749013, + "grad_norm": 1.2630895376205444, + "learning_rate": 2.4887196841511563e-05, + "loss": 0.7279, + "step": 8906 + }, + { + "epoch": 5.023688663282572, + "grad_norm": 0.8572716116905212, + "learning_rate": 2.4884376762549352e-05, + "loss": 0.622, + "step": 8907 + }, + { + "epoch": 5.024252679075014, + "grad_norm": 1.3171061277389526, + "learning_rate": 2.4881556683587144e-05, + "loss": 0.8359, + "step": 8908 + }, + { + "epoch": 5.024816694867456, + "grad_norm": 1.3644049167633057, + "learning_rate": 2.487873660462493e-05, + "loss": 0.7981, + "step": 8909 + }, + { + "epoch": 5.025380710659898, + "grad_norm": 1.36012601852417, + "learning_rate": 2.4875916525662722e-05, + "loss": 0.707, + "step": 8910 + }, + { + "epoch": 5.025944726452341, + "grad_norm": 1.2463101148605347, + "learning_rate": 2.4873096446700507e-05, + "loss": 0.7182, + "step": 8911 + }, + { + "epoch": 5.026508742244783, + "grad_norm": 1.368984580039978, + "learning_rate": 2.48702763677383e-05, + "loss": 0.7247, + "step": 8912 + }, + { + "epoch": 5.027072758037225, + "grad_norm": 1.068786382675171, + "learning_rate": 2.4867456288776085e-05, + "loss": 0.7174, + "step": 8913 + }, + { + "epoch": 5.027636773829667, + "grad_norm": 1.0685138702392578, + "learning_rate": 2.4864636209813877e-05, + "loss": 0.7265, + "step": 8914 + }, + { + "epoch": 5.028200789622109, + "grad_norm": 1.502489686012268, + "learning_rate": 2.4861816130851666e-05, + "loss": 0.7742, + "step": 8915 + }, + { + "epoch": 5.028764805414552, + "grad_norm": 1.6215754747390747, + "learning_rate": 2.4858996051889455e-05, + "loss": 0.8154, + "step": 8916 + }, + { + "epoch": 5.0293288212069935, + "grad_norm": 1.1718567609786987, + "learning_rate": 2.4856175972927243e-05, + "loss": 0.7286, + "step": 8917 + }, + { + "epoch": 5.029892836999436, + "grad_norm": 1.0254065990447998, + "learning_rate": 2.4853355893965032e-05, + "loss": 0.698, + "step": 8918 + }, + { + "epoch": 5.030456852791878, + "grad_norm": 1.0411627292633057, + "learning_rate": 2.485053581500282e-05, + "loss": 0.7031, + "step": 8919 + }, + { + "epoch": 5.031020868584321, + "grad_norm": 1.2060236930847168, + "learning_rate": 2.484771573604061e-05, + "loss": 0.7108, + "step": 8920 + }, + { + "epoch": 5.0315848843767625, + "grad_norm": 1.3795033693313599, + "learning_rate": 2.48448956570784e-05, + "loss": 0.7661, + "step": 8921 + }, + { + "epoch": 5.032148900169204, + "grad_norm": 1.7924388647079468, + "learning_rate": 2.4842075578116187e-05, + "loss": 0.7707, + "step": 8922 + }, + { + "epoch": 5.032712915961647, + "grad_norm": 1.0743218660354614, + "learning_rate": 2.4839255499153976e-05, + "loss": 0.7997, + "step": 8923 + }, + { + "epoch": 5.033276931754089, + "grad_norm": 1.5264854431152344, + "learning_rate": 2.483643542019177e-05, + "loss": 0.7313, + "step": 8924 + }, + { + "epoch": 5.0338409475465316, + "grad_norm": 1.0493563413619995, + "learning_rate": 2.4833615341229554e-05, + "loss": 0.8106, + "step": 8925 + }, + { + "epoch": 5.034404963338973, + "grad_norm": 0.9233822822570801, + "learning_rate": 2.4830795262267346e-05, + "loss": 0.6944, + "step": 8926 + }, + { + "epoch": 5.034968979131416, + "grad_norm": 1.255068063735962, + "learning_rate": 2.482797518330513e-05, + "loss": 0.7394, + "step": 8927 + }, + { + "epoch": 5.035532994923858, + "grad_norm": 1.3534880876541138, + "learning_rate": 2.4825155104342924e-05, + "loss": 0.7441, + "step": 8928 + }, + { + "epoch": 5.0360970107163, + "grad_norm": 1.2077667713165283, + "learning_rate": 2.4822335025380712e-05, + "loss": 0.6809, + "step": 8929 + }, + { + "epoch": 5.036661026508742, + "grad_norm": 1.3068314790725708, + "learning_rate": 2.48195149464185e-05, + "loss": 0.7483, + "step": 8930 + }, + { + "epoch": 5.037225042301184, + "grad_norm": 1.1837067604064941, + "learning_rate": 2.481669486745629e-05, + "loss": 0.7802, + "step": 8931 + }, + { + "epoch": 5.037789058093627, + "grad_norm": 1.3444890975952148, + "learning_rate": 2.481387478849408e-05, + "loss": 0.7855, + "step": 8932 + }, + { + "epoch": 5.038353073886069, + "grad_norm": 1.695351243019104, + "learning_rate": 2.4811054709531868e-05, + "loss": 0.7474, + "step": 8933 + }, + { + "epoch": 5.038917089678511, + "grad_norm": 1.3244264125823975, + "learning_rate": 2.4808234630569656e-05, + "loss": 0.7785, + "step": 8934 + }, + { + "epoch": 5.039481105470953, + "grad_norm": 1.1906899213790894, + "learning_rate": 2.4805414551607445e-05, + "loss": 0.7839, + "step": 8935 + }, + { + "epoch": 5.040045121263395, + "grad_norm": 0.9646005630493164, + "learning_rate": 2.4802594472645234e-05, + "loss": 0.7344, + "step": 8936 + }, + { + "epoch": 5.040609137055838, + "grad_norm": 1.0338670015335083, + "learning_rate": 2.4799774393683026e-05, + "loss": 0.6807, + "step": 8937 + }, + { + "epoch": 5.0411731528482795, + "grad_norm": 1.694243311882019, + "learning_rate": 2.4796954314720815e-05, + "loss": 0.8048, + "step": 8938 + }, + { + "epoch": 5.041737168640722, + "grad_norm": 1.1251823902130127, + "learning_rate": 2.4794134235758604e-05, + "loss": 0.8218, + "step": 8939 + }, + { + "epoch": 5.042301184433164, + "grad_norm": 1.3872493505477905, + "learning_rate": 2.4791314156796392e-05, + "loss": 0.7904, + "step": 8940 + }, + { + "epoch": 5.042865200225607, + "grad_norm": 1.0759508609771729, + "learning_rate": 2.478849407783418e-05, + "loss": 0.7554, + "step": 8941 + }, + { + "epoch": 5.0434292160180485, + "grad_norm": 1.0130566358566284, + "learning_rate": 2.478567399887197e-05, + "loss": 0.748, + "step": 8942 + }, + { + "epoch": 5.04399323181049, + "grad_norm": 1.1829733848571777, + "learning_rate": 2.478285391990976e-05, + "loss": 0.7567, + "step": 8943 + }, + { + "epoch": 5.044557247602933, + "grad_norm": 1.2318652868270874, + "learning_rate": 2.4780033840947548e-05, + "loss": 0.7122, + "step": 8944 + }, + { + "epoch": 5.045121263395375, + "grad_norm": 1.4113857746124268, + "learning_rate": 2.4777213761985336e-05, + "loss": 0.7373, + "step": 8945 + }, + { + "epoch": 5.0456852791878175, + "grad_norm": 1.1279007196426392, + "learning_rate": 2.4774393683023125e-05, + "loss": 0.7003, + "step": 8946 + }, + { + "epoch": 5.046249294980259, + "grad_norm": 1.508223533630371, + "learning_rate": 2.4771573604060917e-05, + "loss": 0.8891, + "step": 8947 + }, + { + "epoch": 5.046813310772702, + "grad_norm": 1.4380995035171509, + "learning_rate": 2.4768753525098703e-05, + "loss": 0.7859, + "step": 8948 + }, + { + "epoch": 5.047377326565144, + "grad_norm": 0.9026389122009277, + "learning_rate": 2.4765933446136495e-05, + "loss": 0.761, + "step": 8949 + }, + { + "epoch": 5.047941342357586, + "grad_norm": 1.4467670917510986, + "learning_rate": 2.476311336717428e-05, + "loss": 0.7264, + "step": 8950 + }, + { + "epoch": 5.048505358150028, + "grad_norm": 1.1590343713760376, + "learning_rate": 2.4760293288212073e-05, + "loss": 0.8489, + "step": 8951 + }, + { + "epoch": 5.04906937394247, + "grad_norm": 0.6786795258522034, + "learning_rate": 2.4757473209249858e-05, + "loss": 0.5666, + "step": 8952 + }, + { + "epoch": 5.049633389734913, + "grad_norm": 1.504798173904419, + "learning_rate": 2.475465313028765e-05, + "loss": 0.7507, + "step": 8953 + }, + { + "epoch": 5.050197405527355, + "grad_norm": 0.9046757817268372, + "learning_rate": 2.475183305132544e-05, + "loss": 0.7371, + "step": 8954 + }, + { + "epoch": 5.050761421319797, + "grad_norm": 1.3914138078689575, + "learning_rate": 2.4749012972363228e-05, + "loss": 0.7369, + "step": 8955 + }, + { + "epoch": 5.051325437112239, + "grad_norm": 1.4223439693450928, + "learning_rate": 2.4746192893401017e-05, + "loss": 0.7759, + "step": 8956 + }, + { + "epoch": 5.051889452904681, + "grad_norm": 0.9383448958396912, + "learning_rate": 2.4743372814438805e-05, + "loss": 0.7006, + "step": 8957 + }, + { + "epoch": 5.052453468697124, + "grad_norm": 1.2435518503189087, + "learning_rate": 2.4740552735476594e-05, + "loss": 0.6846, + "step": 8958 + }, + { + "epoch": 5.0530174844895654, + "grad_norm": 1.0616309642791748, + "learning_rate": 2.4737732656514383e-05, + "loss": 0.6893, + "step": 8959 + }, + { + "epoch": 5.053581500282008, + "grad_norm": 1.243648886680603, + "learning_rate": 2.4734912577552172e-05, + "loss": 0.8216, + "step": 8960 + }, + { + "epoch": 5.05414551607445, + "grad_norm": 1.0353877544403076, + "learning_rate": 2.473209249858996e-05, + "loss": 0.762, + "step": 8961 + }, + { + "epoch": 5.054709531866893, + "grad_norm": 1.1100374460220337, + "learning_rate": 2.472927241962775e-05, + "loss": 0.693, + "step": 8962 + }, + { + "epoch": 5.0552735476593345, + "grad_norm": 1.2490911483764648, + "learning_rate": 2.472645234066554e-05, + "loss": 0.7454, + "step": 8963 + }, + { + "epoch": 5.055837563451776, + "grad_norm": 1.154792070388794, + "learning_rate": 2.4723632261703327e-05, + "loss": 0.7152, + "step": 8964 + }, + { + "epoch": 5.056401579244219, + "grad_norm": 0.8957259058952332, + "learning_rate": 2.472081218274112e-05, + "loss": 0.637, + "step": 8965 + }, + { + "epoch": 5.056965595036661, + "grad_norm": 1.234694004058838, + "learning_rate": 2.4717992103778905e-05, + "loss": 0.7895, + "step": 8966 + }, + { + "epoch": 5.0575296108291035, + "grad_norm": 1.194405198097229, + "learning_rate": 2.4715172024816697e-05, + "loss": 0.6082, + "step": 8967 + }, + { + "epoch": 5.058093626621545, + "grad_norm": 1.2714592218399048, + "learning_rate": 2.4712351945854486e-05, + "loss": 0.7574, + "step": 8968 + }, + { + "epoch": 5.058657642413988, + "grad_norm": 1.1784716844558716, + "learning_rate": 2.4709531866892274e-05, + "loss": 0.6341, + "step": 8969 + }, + { + "epoch": 5.05922165820643, + "grad_norm": 1.2451984882354736, + "learning_rate": 2.4706711787930063e-05, + "loss": 0.641, + "step": 8970 + }, + { + "epoch": 5.059785673998872, + "grad_norm": 1.0262055397033691, + "learning_rate": 2.4703891708967852e-05, + "loss": 0.8575, + "step": 8971 + }, + { + "epoch": 5.060349689791314, + "grad_norm": 1.1176323890686035, + "learning_rate": 2.4701071630005644e-05, + "loss": 0.744, + "step": 8972 + }, + { + "epoch": 5.060913705583756, + "grad_norm": 1.2351164817810059, + "learning_rate": 2.469825155104343e-05, + "loss": 0.6891, + "step": 8973 + }, + { + "epoch": 5.061477721376199, + "grad_norm": 0.991428554058075, + "learning_rate": 2.4695431472081222e-05, + "loss": 0.6801, + "step": 8974 + }, + { + "epoch": 5.062041737168641, + "grad_norm": 1.2799232006072998, + "learning_rate": 2.4692611393119007e-05, + "loss": 0.7176, + "step": 8975 + }, + { + "epoch": 5.062605752961083, + "grad_norm": 1.495028018951416, + "learning_rate": 2.46897913141568e-05, + "loss": 0.867, + "step": 8976 + }, + { + "epoch": 5.063169768753525, + "grad_norm": 1.1436597108840942, + "learning_rate": 2.4686971235194588e-05, + "loss": 0.7568, + "step": 8977 + }, + { + "epoch": 5.063733784545967, + "grad_norm": 1.0373252630233765, + "learning_rate": 2.4684151156232377e-05, + "loss": 0.723, + "step": 8978 + }, + { + "epoch": 5.06429780033841, + "grad_norm": 1.0834825038909912, + "learning_rate": 2.4681331077270166e-05, + "loss": 0.7959, + "step": 8979 + }, + { + "epoch": 5.064861816130851, + "grad_norm": 1.1583482027053833, + "learning_rate": 2.4678510998307954e-05, + "loss": 0.7254, + "step": 8980 + }, + { + "epoch": 5.065425831923294, + "grad_norm": 0.9997695684432983, + "learning_rate": 2.4675690919345743e-05, + "loss": 0.6884, + "step": 8981 + }, + { + "epoch": 5.065989847715736, + "grad_norm": 1.182288646697998, + "learning_rate": 2.4672870840383532e-05, + "loss": 0.7853, + "step": 8982 + }, + { + "epoch": 5.066553863508179, + "grad_norm": 1.2114272117614746, + "learning_rate": 2.467005076142132e-05, + "loss": 0.8085, + "step": 8983 + }, + { + "epoch": 5.06711787930062, + "grad_norm": 1.3294243812561035, + "learning_rate": 2.466723068245911e-05, + "loss": 0.7622, + "step": 8984 + }, + { + "epoch": 5.067681895093062, + "grad_norm": 1.0204919576644897, + "learning_rate": 2.46644106034969e-05, + "loss": 0.8274, + "step": 8985 + }, + { + "epoch": 5.068245910885505, + "grad_norm": 1.1563236713409424, + "learning_rate": 2.466159052453469e-05, + "loss": 0.8404, + "step": 8986 + }, + { + "epoch": 5.068809926677947, + "grad_norm": 1.042771577835083, + "learning_rate": 2.4658770445572476e-05, + "loss": 0.67, + "step": 8987 + }, + { + "epoch": 5.069373942470389, + "grad_norm": 1.3698102235794067, + "learning_rate": 2.4655950366610268e-05, + "loss": 0.8135, + "step": 8988 + }, + { + "epoch": 5.069937958262831, + "grad_norm": 1.0503766536712646, + "learning_rate": 2.4653130287648054e-05, + "loss": 0.643, + "step": 8989 + }, + { + "epoch": 5.070501974055274, + "grad_norm": 1.0890461206436157, + "learning_rate": 2.4650310208685846e-05, + "loss": 0.6366, + "step": 8990 + }, + { + "epoch": 5.071065989847716, + "grad_norm": 0.9676222205162048, + "learning_rate": 2.464749012972363e-05, + "loss": 0.6501, + "step": 8991 + }, + { + "epoch": 5.0716300056401575, + "grad_norm": 1.3833746910095215, + "learning_rate": 2.4644670050761423e-05, + "loss": 0.7603, + "step": 8992 + }, + { + "epoch": 5.0721940214326, + "grad_norm": 1.2882065773010254, + "learning_rate": 2.4641849971799212e-05, + "loss": 0.7832, + "step": 8993 + }, + { + "epoch": 5.072758037225042, + "grad_norm": 1.0773766040802002, + "learning_rate": 2.4639029892837e-05, + "loss": 0.7509, + "step": 8994 + }, + { + "epoch": 5.073322053017485, + "grad_norm": 1.2991836071014404, + "learning_rate": 2.463620981387479e-05, + "loss": 0.7529, + "step": 8995 + }, + { + "epoch": 5.0738860688099265, + "grad_norm": 1.3156458139419556, + "learning_rate": 2.463338973491258e-05, + "loss": 0.7402, + "step": 8996 + }, + { + "epoch": 5.074450084602369, + "grad_norm": 1.0774158239364624, + "learning_rate": 2.4630569655950367e-05, + "loss": 0.7902, + "step": 8997 + }, + { + "epoch": 5.075014100394811, + "grad_norm": 1.6053268909454346, + "learning_rate": 2.4627749576988156e-05, + "loss": 0.8393, + "step": 8998 + }, + { + "epoch": 5.075578116187253, + "grad_norm": 0.9859835505485535, + "learning_rate": 2.4624929498025945e-05, + "loss": 0.6798, + "step": 8999 + }, + { + "epoch": 5.0761421319796955, + "grad_norm": 0.988562822341919, + "learning_rate": 2.4622109419063734e-05, + "loss": 0.6861, + "step": 9000 + }, + { + "epoch": 5.076706147772137, + "grad_norm": 1.109222173690796, + "learning_rate": 2.4619289340101523e-05, + "loss": 0.7271, + "step": 9001 + }, + { + "epoch": 5.07727016356458, + "grad_norm": 1.256163239479065, + "learning_rate": 2.4616469261139315e-05, + "loss": 0.7527, + "step": 9002 + }, + { + "epoch": 5.077834179357022, + "grad_norm": 1.0920039415359497, + "learning_rate": 2.46136491821771e-05, + "loss": 0.7247, + "step": 9003 + }, + { + "epoch": 5.0783981951494646, + "grad_norm": 1.0342434644699097, + "learning_rate": 2.4610829103214892e-05, + "loss": 0.7644, + "step": 9004 + }, + { + "epoch": 5.078962210941906, + "grad_norm": 1.2680085897445679, + "learning_rate": 2.4608009024252678e-05, + "loss": 0.7725, + "step": 9005 + }, + { + "epoch": 5.079526226734348, + "grad_norm": 1.5278480052947998, + "learning_rate": 2.460518894529047e-05, + "loss": 0.8848, + "step": 9006 + }, + { + "epoch": 5.080090242526791, + "grad_norm": 1.4956190586090088, + "learning_rate": 2.4602368866328255e-05, + "loss": 0.7386, + "step": 9007 + }, + { + "epoch": 5.080654258319233, + "grad_norm": 1.37944495677948, + "learning_rate": 2.4599548787366048e-05, + "loss": 0.7179, + "step": 9008 + }, + { + "epoch": 5.081218274111675, + "grad_norm": 0.7836048603057861, + "learning_rate": 2.4596728708403836e-05, + "loss": 0.668, + "step": 9009 + }, + { + "epoch": 5.081782289904117, + "grad_norm": 0.9884456992149353, + "learning_rate": 2.4593908629441625e-05, + "loss": 0.7025, + "step": 9010 + }, + { + "epoch": 5.08234630569656, + "grad_norm": 0.9323941469192505, + "learning_rate": 2.4591088550479417e-05, + "loss": 0.6881, + "step": 9011 + }, + { + "epoch": 5.082910321489002, + "grad_norm": 1.0681793689727783, + "learning_rate": 2.4588268471517203e-05, + "loss": 0.6575, + "step": 9012 + }, + { + "epoch": 5.0834743372814435, + "grad_norm": 1.1641243696212769, + "learning_rate": 2.4585448392554995e-05, + "loss": 0.6474, + "step": 9013 + }, + { + "epoch": 5.084038353073886, + "grad_norm": 1.6759076118469238, + "learning_rate": 2.458262831359278e-05, + "loss": 0.8235, + "step": 9014 + }, + { + "epoch": 5.084602368866328, + "grad_norm": 1.2556817531585693, + "learning_rate": 2.4579808234630573e-05, + "loss": 0.7076, + "step": 9015 + }, + { + "epoch": 5.085166384658771, + "grad_norm": 1.5654255151748657, + "learning_rate": 2.4576988155668358e-05, + "loss": 0.9163, + "step": 9016 + }, + { + "epoch": 5.0857304004512125, + "grad_norm": 1.3219021558761597, + "learning_rate": 2.457416807670615e-05, + "loss": 0.6449, + "step": 9017 + }, + { + "epoch": 5.086294416243655, + "grad_norm": 0.8976204991340637, + "learning_rate": 2.457134799774394e-05, + "loss": 0.6984, + "step": 9018 + }, + { + "epoch": 5.086858432036097, + "grad_norm": 1.3241260051727295, + "learning_rate": 2.4568527918781728e-05, + "loss": 0.7522, + "step": 9019 + }, + { + "epoch": 5.087422447828539, + "grad_norm": 1.0411491394042969, + "learning_rate": 2.4565707839819517e-05, + "loss": 0.821, + "step": 9020 + }, + { + "epoch": 5.0879864636209815, + "grad_norm": 0.949330747127533, + "learning_rate": 2.4562887760857305e-05, + "loss": 0.7569, + "step": 9021 + }, + { + "epoch": 5.088550479413423, + "grad_norm": 1.4303061962127686, + "learning_rate": 2.4560067681895094e-05, + "loss": 0.809, + "step": 9022 + }, + { + "epoch": 5.089114495205866, + "grad_norm": 1.8674501180648804, + "learning_rate": 2.4557247602932883e-05, + "loss": 0.867, + "step": 9023 + }, + { + "epoch": 5.089678510998308, + "grad_norm": 1.016187071800232, + "learning_rate": 2.4554427523970672e-05, + "loss": 0.7034, + "step": 9024 + }, + { + "epoch": 5.0902425267907505, + "grad_norm": 1.1747422218322754, + "learning_rate": 2.455160744500846e-05, + "loss": 0.7914, + "step": 9025 + }, + { + "epoch": 5.090806542583192, + "grad_norm": 1.2835005521774292, + "learning_rate": 2.454878736604625e-05, + "loss": 0.6996, + "step": 9026 + }, + { + "epoch": 5.091370558375634, + "grad_norm": 1.2051557302474976, + "learning_rate": 2.454596728708404e-05, + "loss": 0.8506, + "step": 9027 + }, + { + "epoch": 5.091934574168077, + "grad_norm": 1.0186668634414673, + "learning_rate": 2.4543147208121827e-05, + "loss": 0.7805, + "step": 9028 + }, + { + "epoch": 5.092498589960519, + "grad_norm": 1.0192166566848755, + "learning_rate": 2.454032712915962e-05, + "loss": 0.7035, + "step": 9029 + }, + { + "epoch": 5.093062605752961, + "grad_norm": 0.9701921343803406, + "learning_rate": 2.4537507050197404e-05, + "loss": 0.6483, + "step": 9030 + }, + { + "epoch": 5.093626621545403, + "grad_norm": 1.248824954032898, + "learning_rate": 2.4534686971235197e-05, + "loss": 0.714, + "step": 9031 + }, + { + "epoch": 5.094190637337846, + "grad_norm": 1.6031575202941895, + "learning_rate": 2.4531866892272985e-05, + "loss": 0.8197, + "step": 9032 + }, + { + "epoch": 5.094754653130288, + "grad_norm": 1.7063802480697632, + "learning_rate": 2.4529046813310774e-05, + "loss": 0.8062, + "step": 9033 + }, + { + "epoch": 5.095318668922729, + "grad_norm": 0.9506387710571289, + "learning_rate": 2.4526226734348563e-05, + "loss": 0.6468, + "step": 9034 + }, + { + "epoch": 5.095882684715172, + "grad_norm": 1.0962231159210205, + "learning_rate": 2.4523406655386352e-05, + "loss": 0.6789, + "step": 9035 + }, + { + "epoch": 5.096446700507614, + "grad_norm": 1.1375713348388672, + "learning_rate": 2.452058657642414e-05, + "loss": 0.6759, + "step": 9036 + }, + { + "epoch": 5.097010716300057, + "grad_norm": 1.0354455709457397, + "learning_rate": 2.451776649746193e-05, + "loss": 0.7804, + "step": 9037 + }, + { + "epoch": 5.0975747320924985, + "grad_norm": 1.742665410041809, + "learning_rate": 2.4514946418499718e-05, + "loss": 0.7816, + "step": 9038 + }, + { + "epoch": 5.098138747884941, + "grad_norm": 1.0579015016555786, + "learning_rate": 2.4512126339537507e-05, + "loss": 0.7822, + "step": 9039 + }, + { + "epoch": 5.098702763677383, + "grad_norm": 1.4619495868682861, + "learning_rate": 2.4509306260575296e-05, + "loss": 0.7681, + "step": 9040 + }, + { + "epoch": 5.099266779469825, + "grad_norm": 1.0710995197296143, + "learning_rate": 2.4506486181613088e-05, + "loss": 0.7182, + "step": 9041 + }, + { + "epoch": 5.0998307952622675, + "grad_norm": 1.1666862964630127, + "learning_rate": 2.4503666102650873e-05, + "loss": 0.8214, + "step": 9042 + }, + { + "epoch": 5.100394811054709, + "grad_norm": 1.2346402406692505, + "learning_rate": 2.4500846023688666e-05, + "loss": 0.7768, + "step": 9043 + }, + { + "epoch": 5.100958826847152, + "grad_norm": 1.2209014892578125, + "learning_rate": 2.4498025944726454e-05, + "loss": 0.7116, + "step": 9044 + }, + { + "epoch": 5.101522842639594, + "grad_norm": 0.9647657871246338, + "learning_rate": 2.4495205865764243e-05, + "loss": 0.6712, + "step": 9045 + }, + { + "epoch": 5.1020868584320365, + "grad_norm": 1.1825090646743774, + "learning_rate": 2.4492385786802032e-05, + "loss": 0.7275, + "step": 9046 + }, + { + "epoch": 5.102650874224478, + "grad_norm": 1.4905245304107666, + "learning_rate": 2.448956570783982e-05, + "loss": 0.7789, + "step": 9047 + }, + { + "epoch": 5.10321489001692, + "grad_norm": 1.2183082103729248, + "learning_rate": 2.448674562887761e-05, + "loss": 0.7254, + "step": 9048 + }, + { + "epoch": 5.103778905809363, + "grad_norm": 1.0284193754196167, + "learning_rate": 2.44839255499154e-05, + "loss": 0.7932, + "step": 9049 + }, + { + "epoch": 5.104342921601805, + "grad_norm": 1.0593558549880981, + "learning_rate": 2.448110547095319e-05, + "loss": 0.7555, + "step": 9050 + }, + { + "epoch": 5.104906937394247, + "grad_norm": 1.1098518371582031, + "learning_rate": 2.4478285391990976e-05, + "loss": 0.7153, + "step": 9051 + }, + { + "epoch": 5.105470953186689, + "grad_norm": 1.7077831029891968, + "learning_rate": 2.4475465313028768e-05, + "loss": 0.781, + "step": 9052 + }, + { + "epoch": 5.106034968979132, + "grad_norm": 1.1380894184112549, + "learning_rate": 2.4472645234066554e-05, + "loss": 0.7835, + "step": 9053 + }, + { + "epoch": 5.106598984771574, + "grad_norm": 0.887462854385376, + "learning_rate": 2.4469825155104346e-05, + "loss": 0.614, + "step": 9054 + }, + { + "epoch": 5.107163000564015, + "grad_norm": 1.4336377382278442, + "learning_rate": 2.446700507614213e-05, + "loss": 0.7152, + "step": 9055 + }, + { + "epoch": 5.107727016356458, + "grad_norm": 0.9767991900444031, + "learning_rate": 2.4464184997179923e-05, + "loss": 0.7034, + "step": 9056 + }, + { + "epoch": 5.1082910321489, + "grad_norm": 1.1612749099731445, + "learning_rate": 2.4461364918217712e-05, + "loss": 0.7265, + "step": 9057 + }, + { + "epoch": 5.108855047941343, + "grad_norm": 1.1956607103347778, + "learning_rate": 2.44585448392555e-05, + "loss": 0.6231, + "step": 9058 + }, + { + "epoch": 5.109419063733784, + "grad_norm": 1.049538254737854, + "learning_rate": 2.445572476029329e-05, + "loss": 0.6801, + "step": 9059 + }, + { + "epoch": 5.109983079526227, + "grad_norm": 1.2756409645080566, + "learning_rate": 2.445290468133108e-05, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 5.110547095318669, + "grad_norm": 1.2783135175704956, + "learning_rate": 2.4450084602368867e-05, + "loss": 0.7288, + "step": 9061 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 1.463236689567566, + "learning_rate": 2.4447264523406656e-05, + "loss": 0.7584, + "step": 9062 + }, + { + "epoch": 5.111675126903553, + "grad_norm": 1.1592333316802979, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.7489, + "step": 9063 + }, + { + "epoch": 5.112239142695995, + "grad_norm": 1.5047029256820679, + "learning_rate": 2.4441624365482234e-05, + "loss": 0.8816, + "step": 9064 + }, + { + "epoch": 5.112803158488438, + "grad_norm": 0.9286021590232849, + "learning_rate": 2.4438804286520023e-05, + "loss": 0.7131, + "step": 9065 + }, + { + "epoch": 5.11336717428088, + "grad_norm": 1.3339753150939941, + "learning_rate": 2.4435984207557815e-05, + "loss": 0.7616, + "step": 9066 + }, + { + "epoch": 5.113931190073322, + "grad_norm": 0.9927828311920166, + "learning_rate": 2.44331641285956e-05, + "loss": 0.7885, + "step": 9067 + }, + { + "epoch": 5.114495205865764, + "grad_norm": 0.9920218586921692, + "learning_rate": 2.4430344049633392e-05, + "loss": 0.6843, + "step": 9068 + }, + { + "epoch": 5.115059221658206, + "grad_norm": 1.09622061252594, + "learning_rate": 2.4427523970671178e-05, + "loss": 0.813, + "step": 9069 + }, + { + "epoch": 5.115623237450649, + "grad_norm": 1.3763885498046875, + "learning_rate": 2.442470389170897e-05, + "loss": 0.8865, + "step": 9070 + }, + { + "epoch": 5.1161872532430905, + "grad_norm": 1.3739826679229736, + "learning_rate": 2.4421883812746755e-05, + "loss": 0.7011, + "step": 9071 + }, + { + "epoch": 5.116751269035533, + "grad_norm": 0.9756321907043457, + "learning_rate": 2.4419063733784547e-05, + "loss": 0.7431, + "step": 9072 + }, + { + "epoch": 5.117315284827975, + "grad_norm": 1.3108497858047485, + "learning_rate": 2.4416243654822336e-05, + "loss": 0.7096, + "step": 9073 + }, + { + "epoch": 5.117879300620418, + "grad_norm": 0.9917362928390503, + "learning_rate": 2.4413423575860125e-05, + "loss": 0.7099, + "step": 9074 + }, + { + "epoch": 5.1184433164128595, + "grad_norm": 1.3093823194503784, + "learning_rate": 2.4410603496897914e-05, + "loss": 0.683, + "step": 9075 + }, + { + "epoch": 5.119007332205301, + "grad_norm": 1.0703046321868896, + "learning_rate": 2.4407783417935703e-05, + "loss": 0.7897, + "step": 9076 + }, + { + "epoch": 5.119571347997744, + "grad_norm": 1.0357234477996826, + "learning_rate": 2.440496333897349e-05, + "loss": 0.7162, + "step": 9077 + }, + { + "epoch": 5.120135363790186, + "grad_norm": 1.0654321908950806, + "learning_rate": 2.440214326001128e-05, + "loss": 0.7857, + "step": 9078 + }, + { + "epoch": 5.1206993795826286, + "grad_norm": 1.0065295696258545, + "learning_rate": 2.4399323181049072e-05, + "loss": 0.7283, + "step": 9079 + }, + { + "epoch": 5.12126339537507, + "grad_norm": 0.9057678580284119, + "learning_rate": 2.4396503102086858e-05, + "loss": 0.7113, + "step": 9080 + }, + { + "epoch": 5.121827411167513, + "grad_norm": 1.0780006647109985, + "learning_rate": 2.439368302312465e-05, + "loss": 0.7018, + "step": 9081 + }, + { + "epoch": 5.122391426959955, + "grad_norm": 1.2769986391067505, + "learning_rate": 2.439086294416244e-05, + "loss": 0.829, + "step": 9082 + }, + { + "epoch": 5.122955442752397, + "grad_norm": 1.1398768424987793, + "learning_rate": 2.4388042865200228e-05, + "loss": 0.6774, + "step": 9083 + }, + { + "epoch": 5.123519458544839, + "grad_norm": 2.127359628677368, + "learning_rate": 2.4385222786238016e-05, + "loss": 0.9034, + "step": 9084 + }, + { + "epoch": 5.124083474337281, + "grad_norm": 1.5672154426574707, + "learning_rate": 2.4382402707275805e-05, + "loss": 0.8243, + "step": 9085 + }, + { + "epoch": 5.124647490129724, + "grad_norm": 1.4692635536193848, + "learning_rate": 2.4379582628313594e-05, + "loss": 0.6772, + "step": 9086 + }, + { + "epoch": 5.125211505922166, + "grad_norm": 1.4125800132751465, + "learning_rate": 2.4376762549351383e-05, + "loss": 0.8316, + "step": 9087 + }, + { + "epoch": 5.125775521714608, + "grad_norm": 0.9211674928665161, + "learning_rate": 2.437394247038917e-05, + "loss": 0.7737, + "step": 9088 + }, + { + "epoch": 5.12633953750705, + "grad_norm": 1.5360774993896484, + "learning_rate": 2.437112239142696e-05, + "loss": 0.6832, + "step": 9089 + }, + { + "epoch": 5.126903553299492, + "grad_norm": 0.9319660663604736, + "learning_rate": 2.436830231246475e-05, + "loss": 0.6693, + "step": 9090 + }, + { + "epoch": 5.127467569091935, + "grad_norm": 1.5998352766036987, + "learning_rate": 2.436548223350254e-05, + "loss": 0.7813, + "step": 9091 + }, + { + "epoch": 5.1280315848843765, + "grad_norm": 1.1611109972000122, + "learning_rate": 2.4362662154540327e-05, + "loss": 0.7413, + "step": 9092 + }, + { + "epoch": 5.128595600676819, + "grad_norm": 1.203127145767212, + "learning_rate": 2.435984207557812e-05, + "loss": 0.6965, + "step": 9093 + }, + { + "epoch": 5.129159616469261, + "grad_norm": 1.3744075298309326, + "learning_rate": 2.4357021996615904e-05, + "loss": 0.7797, + "step": 9094 + }, + { + "epoch": 5.129723632261704, + "grad_norm": 1.4297444820404053, + "learning_rate": 2.4354201917653697e-05, + "loss": 0.8081, + "step": 9095 + }, + { + "epoch": 5.1302876480541455, + "grad_norm": 1.041799545288086, + "learning_rate": 2.4351381838691485e-05, + "loss": 0.7977, + "step": 9096 + }, + { + "epoch": 5.130851663846587, + "grad_norm": 1.0645686388015747, + "learning_rate": 2.4348561759729274e-05, + "loss": 0.7648, + "step": 9097 + }, + { + "epoch": 5.13141567963903, + "grad_norm": 1.168570876121521, + "learning_rate": 2.4345741680767063e-05, + "loss": 0.7113, + "step": 9098 + }, + { + "epoch": 5.131979695431472, + "grad_norm": 1.1627142429351807, + "learning_rate": 2.4342921601804852e-05, + "loss": 0.7656, + "step": 9099 + }, + { + "epoch": 5.1325437112239145, + "grad_norm": 0.8956966996192932, + "learning_rate": 2.434010152284264e-05, + "loss": 0.7558, + "step": 9100 + }, + { + "epoch": 5.133107727016356, + "grad_norm": 1.0152955055236816, + "learning_rate": 2.433728144388043e-05, + "loss": 0.6343, + "step": 9101 + }, + { + "epoch": 5.133671742808799, + "grad_norm": 1.2077809572219849, + "learning_rate": 2.4334461364918218e-05, + "loss": 0.8076, + "step": 9102 + }, + { + "epoch": 5.134235758601241, + "grad_norm": 0.9799327254295349, + "learning_rate": 2.4331641285956007e-05, + "loss": 0.6989, + "step": 9103 + }, + { + "epoch": 5.134799774393683, + "grad_norm": 1.5988073348999023, + "learning_rate": 2.4328821206993796e-05, + "loss": 0.8332, + "step": 9104 + }, + { + "epoch": 5.135363790186125, + "grad_norm": 0.919545590877533, + "learning_rate": 2.4326001128031588e-05, + "loss": 0.6661, + "step": 9105 + }, + { + "epoch": 5.135927805978567, + "grad_norm": 1.5194344520568848, + "learning_rate": 2.4323181049069373e-05, + "loss": 0.8185, + "step": 9106 + }, + { + "epoch": 5.13649182177101, + "grad_norm": 1.0722603797912598, + "learning_rate": 2.4320360970107166e-05, + "loss": 0.7053, + "step": 9107 + }, + { + "epoch": 5.137055837563452, + "grad_norm": 1.0302473306655884, + "learning_rate": 2.431754089114495e-05, + "loss": 0.761, + "step": 9108 + }, + { + "epoch": 5.137619853355894, + "grad_norm": 1.2034211158752441, + "learning_rate": 2.4314720812182743e-05, + "loss": 0.7121, + "step": 9109 + }, + { + "epoch": 5.138183869148336, + "grad_norm": 1.0130888223648071, + "learning_rate": 2.431190073322053e-05, + "loss": 0.795, + "step": 9110 + }, + { + "epoch": 5.138747884940778, + "grad_norm": 0.7948675751686096, + "learning_rate": 2.430908065425832e-05, + "loss": 0.6424, + "step": 9111 + }, + { + "epoch": 5.139311900733221, + "grad_norm": 1.1191047430038452, + "learning_rate": 2.430626057529611e-05, + "loss": 0.6456, + "step": 9112 + }, + { + "epoch": 5.1398759165256624, + "grad_norm": 1.3727178573608398, + "learning_rate": 2.43034404963339e-05, + "loss": 0.7975, + "step": 9113 + }, + { + "epoch": 5.140439932318105, + "grad_norm": 1.4104048013687134, + "learning_rate": 2.430062041737169e-05, + "loss": 0.8104, + "step": 9114 + }, + { + "epoch": 5.141003948110547, + "grad_norm": 2.010784387588501, + "learning_rate": 2.4297800338409476e-05, + "loss": 0.8228, + "step": 9115 + }, + { + "epoch": 5.14156796390299, + "grad_norm": 1.1671284437179565, + "learning_rate": 2.4294980259447268e-05, + "loss": 0.7961, + "step": 9116 + }, + { + "epoch": 5.1421319796954315, + "grad_norm": 1.2860078811645508, + "learning_rate": 2.4292160180485054e-05, + "loss": 0.7441, + "step": 9117 + }, + { + "epoch": 5.142695995487873, + "grad_norm": 1.13407301902771, + "learning_rate": 2.4289340101522846e-05, + "loss": 0.7019, + "step": 9118 + }, + { + "epoch": 5.143260011280316, + "grad_norm": 1.2358916997909546, + "learning_rate": 2.428652002256063e-05, + "loss": 0.7441, + "step": 9119 + }, + { + "epoch": 5.143824027072758, + "grad_norm": 0.8627371191978455, + "learning_rate": 2.4283699943598423e-05, + "loss": 0.7079, + "step": 9120 + }, + { + "epoch": 5.1443880428652005, + "grad_norm": 1.2149372100830078, + "learning_rate": 2.4280879864636212e-05, + "loss": 0.7326, + "step": 9121 + }, + { + "epoch": 5.144952058657642, + "grad_norm": 1.051257848739624, + "learning_rate": 2.4278059785674e-05, + "loss": 0.6827, + "step": 9122 + }, + { + "epoch": 5.145516074450085, + "grad_norm": 2.082305908203125, + "learning_rate": 2.427523970671179e-05, + "loss": 0.9812, + "step": 9123 + }, + { + "epoch": 5.146080090242527, + "grad_norm": 1.1052356958389282, + "learning_rate": 2.427241962774958e-05, + "loss": 0.6862, + "step": 9124 + }, + { + "epoch": 5.146644106034969, + "grad_norm": 1.0535533428192139, + "learning_rate": 2.4269599548787367e-05, + "loss": 0.7757, + "step": 9125 + }, + { + "epoch": 5.147208121827411, + "grad_norm": 1.1698654890060425, + "learning_rate": 2.4266779469825156e-05, + "loss": 0.7278, + "step": 9126 + }, + { + "epoch": 5.147772137619853, + "grad_norm": 1.01431405544281, + "learning_rate": 2.4263959390862945e-05, + "loss": 0.7552, + "step": 9127 + }, + { + "epoch": 5.148336153412296, + "grad_norm": 1.1546467542648315, + "learning_rate": 2.4261139311900734e-05, + "loss": 0.7715, + "step": 9128 + }, + { + "epoch": 5.148900169204738, + "grad_norm": 1.289730191230774, + "learning_rate": 2.4258319232938522e-05, + "loss": 0.671, + "step": 9129 + }, + { + "epoch": 5.14946418499718, + "grad_norm": 0.8167390823364258, + "learning_rate": 2.4255499153976315e-05, + "loss": 0.6157, + "step": 9130 + }, + { + "epoch": 5.150028200789622, + "grad_norm": 1.3817330598831177, + "learning_rate": 2.42526790750141e-05, + "loss": 0.7411, + "step": 9131 + }, + { + "epoch": 5.150592216582064, + "grad_norm": 1.5184077024459839, + "learning_rate": 2.4249858996051892e-05, + "loss": 0.7795, + "step": 9132 + }, + { + "epoch": 5.151156232374507, + "grad_norm": 1.1491832733154297, + "learning_rate": 2.4247038917089678e-05, + "loss": 0.7716, + "step": 9133 + }, + { + "epoch": 5.151720248166948, + "grad_norm": 1.3852043151855469, + "learning_rate": 2.424421883812747e-05, + "loss": 0.7834, + "step": 9134 + }, + { + "epoch": 5.152284263959391, + "grad_norm": 1.2108545303344727, + "learning_rate": 2.424139875916526e-05, + "loss": 0.7396, + "step": 9135 + }, + { + "epoch": 5.152848279751833, + "grad_norm": 1.0432193279266357, + "learning_rate": 2.4238578680203047e-05, + "loss": 0.6922, + "step": 9136 + }, + { + "epoch": 5.153412295544276, + "grad_norm": 1.7635388374328613, + "learning_rate": 2.4235758601240836e-05, + "loss": 0.878, + "step": 9137 + }, + { + "epoch": 5.153976311336717, + "grad_norm": 1.5654301643371582, + "learning_rate": 2.4232938522278625e-05, + "loss": 0.7828, + "step": 9138 + }, + { + "epoch": 5.154540327129159, + "grad_norm": 1.7017666101455688, + "learning_rate": 2.4230118443316414e-05, + "loss": 0.8266, + "step": 9139 + }, + { + "epoch": 5.155104342921602, + "grad_norm": 1.3716198205947876, + "learning_rate": 2.4227298364354203e-05, + "loss": 0.7693, + "step": 9140 + }, + { + "epoch": 5.155668358714044, + "grad_norm": 1.593615174293518, + "learning_rate": 2.422447828539199e-05, + "loss": 0.7127, + "step": 9141 + }, + { + "epoch": 5.156232374506486, + "grad_norm": 1.4935539960861206, + "learning_rate": 2.422165820642978e-05, + "loss": 0.883, + "step": 9142 + }, + { + "epoch": 5.156796390298928, + "grad_norm": 0.9178966283798218, + "learning_rate": 2.421883812746757e-05, + "loss": 0.715, + "step": 9143 + }, + { + "epoch": 5.157360406091371, + "grad_norm": 1.3269422054290771, + "learning_rate": 2.421601804850536e-05, + "loss": 0.7409, + "step": 9144 + }, + { + "epoch": 5.157924421883813, + "grad_norm": 0.913885235786438, + "learning_rate": 2.4213197969543147e-05, + "loss": 0.7292, + "step": 9145 + }, + { + "epoch": 5.1584884376762545, + "grad_norm": 1.3241825103759766, + "learning_rate": 2.421037789058094e-05, + "loss": 0.8354, + "step": 9146 + }, + { + "epoch": 5.159052453468697, + "grad_norm": 1.1303194761276245, + "learning_rate": 2.4207557811618724e-05, + "loss": 0.8172, + "step": 9147 + }, + { + "epoch": 5.159616469261139, + "grad_norm": 1.220565676689148, + "learning_rate": 2.4204737732656516e-05, + "loss": 0.8141, + "step": 9148 + }, + { + "epoch": 5.160180485053582, + "grad_norm": 1.219292163848877, + "learning_rate": 2.4201917653694305e-05, + "loss": 0.705, + "step": 9149 + }, + { + "epoch": 5.1607445008460235, + "grad_norm": 6.738746166229248, + "learning_rate": 2.4199097574732094e-05, + "loss": 0.7466, + "step": 9150 + }, + { + "epoch": 5.161308516638466, + "grad_norm": 1.140930414199829, + "learning_rate": 2.4196277495769883e-05, + "loss": 0.7386, + "step": 9151 + }, + { + "epoch": 5.161872532430908, + "grad_norm": 1.1219099760055542, + "learning_rate": 2.419345741680767e-05, + "loss": 0.7863, + "step": 9152 + }, + { + "epoch": 5.16243654822335, + "grad_norm": 0.977911651134491, + "learning_rate": 2.4190637337845464e-05, + "loss": 0.7501, + "step": 9153 + }, + { + "epoch": 5.1630005640157925, + "grad_norm": 1.4276142120361328, + "learning_rate": 2.418781725888325e-05, + "loss": 0.8164, + "step": 9154 + }, + { + "epoch": 5.163564579808234, + "grad_norm": 1.2601664066314697, + "learning_rate": 2.418499717992104e-05, + "loss": 0.802, + "step": 9155 + }, + { + "epoch": 5.164128595600677, + "grad_norm": 1.3958415985107422, + "learning_rate": 2.4182177100958827e-05, + "loss": 0.8111, + "step": 9156 + }, + { + "epoch": 5.164692611393119, + "grad_norm": 1.3963875770568848, + "learning_rate": 2.417935702199662e-05, + "loss": 0.7812, + "step": 9157 + }, + { + "epoch": 5.1652566271855616, + "grad_norm": 1.2496739625930786, + "learning_rate": 2.4176536943034404e-05, + "loss": 0.7956, + "step": 9158 + }, + { + "epoch": 5.165820642978003, + "grad_norm": 2.1091599464416504, + "learning_rate": 2.4173716864072197e-05, + "loss": 0.9113, + "step": 9159 + }, + { + "epoch": 5.166384658770445, + "grad_norm": 1.1362526416778564, + "learning_rate": 2.4170896785109985e-05, + "loss": 0.6601, + "step": 9160 + }, + { + "epoch": 5.166948674562888, + "grad_norm": 1.3989213705062866, + "learning_rate": 2.4168076706147774e-05, + "loss": 0.84, + "step": 9161 + }, + { + "epoch": 5.16751269035533, + "grad_norm": 1.0573397874832153, + "learning_rate": 2.4165256627185563e-05, + "loss": 0.7826, + "step": 9162 + }, + { + "epoch": 5.168076706147772, + "grad_norm": 1.1653211116790771, + "learning_rate": 2.4162436548223352e-05, + "loss": 0.7884, + "step": 9163 + }, + { + "epoch": 5.168640721940214, + "grad_norm": 0.9165202975273132, + "learning_rate": 2.415961646926114e-05, + "loss": 0.7778, + "step": 9164 + }, + { + "epoch": 5.169204737732657, + "grad_norm": 1.47770094871521, + "learning_rate": 2.415679639029893e-05, + "loss": 0.7212, + "step": 9165 + }, + { + "epoch": 5.169768753525099, + "grad_norm": 1.075750708580017, + "learning_rate": 2.4153976311336718e-05, + "loss": 0.7902, + "step": 9166 + }, + { + "epoch": 5.1703327693175405, + "grad_norm": 1.0807597637176514, + "learning_rate": 2.4151156232374507e-05, + "loss": 0.674, + "step": 9167 + }, + { + "epoch": 5.170896785109983, + "grad_norm": 1.2540647983551025, + "learning_rate": 2.4148336153412296e-05, + "loss": 0.7573, + "step": 9168 + }, + { + "epoch": 5.171460800902425, + "grad_norm": 1.1784577369689941, + "learning_rate": 2.4145516074450088e-05, + "loss": 0.7615, + "step": 9169 + }, + { + "epoch": 5.172024816694868, + "grad_norm": 1.6630135774612427, + "learning_rate": 2.4142695995487873e-05, + "loss": 0.7819, + "step": 9170 + }, + { + "epoch": 5.1725888324873095, + "grad_norm": 1.3411566019058228, + "learning_rate": 2.4139875916525665e-05, + "loss": 0.6695, + "step": 9171 + }, + { + "epoch": 5.173152848279752, + "grad_norm": 1.6331583261489868, + "learning_rate": 2.413705583756345e-05, + "loss": 0.8001, + "step": 9172 + }, + { + "epoch": 5.173716864072194, + "grad_norm": 1.089050531387329, + "learning_rate": 2.4134235758601243e-05, + "loss": 0.7632, + "step": 9173 + }, + { + "epoch": 5.174280879864636, + "grad_norm": 0.942014753818512, + "learning_rate": 2.413141567963903e-05, + "loss": 0.7342, + "step": 9174 + }, + { + "epoch": 5.1748448956570785, + "grad_norm": 1.689388394355774, + "learning_rate": 2.412859560067682e-05, + "loss": 0.8323, + "step": 9175 + }, + { + "epoch": 5.17540891144952, + "grad_norm": 0.8221925497055054, + "learning_rate": 2.412577552171461e-05, + "loss": 0.5877, + "step": 9176 + }, + { + "epoch": 5.175972927241963, + "grad_norm": 1.0988911390304565, + "learning_rate": 2.4122955442752398e-05, + "loss": 0.7949, + "step": 9177 + }, + { + "epoch": 5.176536943034405, + "grad_norm": 1.047595500946045, + "learning_rate": 2.4120135363790187e-05, + "loss": 0.8131, + "step": 9178 + }, + { + "epoch": 5.1771009588268475, + "grad_norm": 0.8842352032661438, + "learning_rate": 2.4117315284827976e-05, + "loss": 0.7096, + "step": 9179 + }, + { + "epoch": 5.177664974619289, + "grad_norm": 1.2714765071868896, + "learning_rate": 2.4114495205865765e-05, + "loss": 0.8405, + "step": 9180 + }, + { + "epoch": 5.178228990411731, + "grad_norm": 1.273914098739624, + "learning_rate": 2.4111675126903553e-05, + "loss": 0.8125, + "step": 9181 + }, + { + "epoch": 5.178793006204174, + "grad_norm": 2.132174491882324, + "learning_rate": 2.4108855047941342e-05, + "loss": 0.7588, + "step": 9182 + }, + { + "epoch": 5.179357021996616, + "grad_norm": 1.6599795818328857, + "learning_rate": 2.410603496897913e-05, + "loss": 0.7565, + "step": 9183 + }, + { + "epoch": 5.179921037789058, + "grad_norm": 1.1413875818252563, + "learning_rate": 2.4103214890016923e-05, + "loss": 0.7096, + "step": 9184 + }, + { + "epoch": 5.1804850535815, + "grad_norm": 1.4985015392303467, + "learning_rate": 2.4100394811054712e-05, + "loss": 0.7477, + "step": 9185 + }, + { + "epoch": 5.181049069373943, + "grad_norm": 1.0708578824996948, + "learning_rate": 2.40975747320925e-05, + "loss": 0.7387, + "step": 9186 + }, + { + "epoch": 5.181613085166385, + "grad_norm": 1.2921953201293945, + "learning_rate": 2.409475465313029e-05, + "loss": 0.8171, + "step": 9187 + }, + { + "epoch": 5.182177100958826, + "grad_norm": 1.1398239135742188, + "learning_rate": 2.409193457416808e-05, + "loss": 0.792, + "step": 9188 + }, + { + "epoch": 5.182741116751269, + "grad_norm": 1.3082890510559082, + "learning_rate": 2.4089114495205867e-05, + "loss": 0.7338, + "step": 9189 + }, + { + "epoch": 5.183305132543711, + "grad_norm": 1.1762229204177856, + "learning_rate": 2.4086294416243656e-05, + "loss": 0.7419, + "step": 9190 + }, + { + "epoch": 5.183869148336154, + "grad_norm": 0.8058336973190308, + "learning_rate": 2.4083474337281445e-05, + "loss": 0.6588, + "step": 9191 + }, + { + "epoch": 5.1844331641285955, + "grad_norm": 0.8601130247116089, + "learning_rate": 2.4080654258319234e-05, + "loss": 0.6685, + "step": 9192 + }, + { + "epoch": 5.184997179921038, + "grad_norm": 1.2018866539001465, + "learning_rate": 2.4077834179357022e-05, + "loss": 0.6983, + "step": 9193 + }, + { + "epoch": 5.18556119571348, + "grad_norm": 1.1554996967315674, + "learning_rate": 2.4075014100394815e-05, + "loss": 0.7431, + "step": 9194 + }, + { + "epoch": 5.186125211505922, + "grad_norm": 1.0020955801010132, + "learning_rate": 2.40721940214326e-05, + "loss": 0.639, + "step": 9195 + }, + { + "epoch": 5.1866892272983645, + "grad_norm": 1.0091487169265747, + "learning_rate": 2.4069373942470392e-05, + "loss": 0.6571, + "step": 9196 + }, + { + "epoch": 5.187253243090806, + "grad_norm": 1.360862374305725, + "learning_rate": 2.4066553863508178e-05, + "loss": 0.7511, + "step": 9197 + }, + { + "epoch": 5.187817258883249, + "grad_norm": 1.3109670877456665, + "learning_rate": 2.406373378454597e-05, + "loss": 0.7529, + "step": 9198 + }, + { + "epoch": 5.188381274675691, + "grad_norm": 1.1210238933563232, + "learning_rate": 2.406091370558376e-05, + "loss": 0.7471, + "step": 9199 + }, + { + "epoch": 5.1889452904681335, + "grad_norm": 1.087329387664795, + "learning_rate": 2.4058093626621547e-05, + "loss": 0.7956, + "step": 9200 + }, + { + "epoch": 5.189509306260575, + "grad_norm": 1.836256504058838, + "learning_rate": 2.4055273547659336e-05, + "loss": 0.8487, + "step": 9201 + }, + { + "epoch": 5.190073322053017, + "grad_norm": 1.1694084405899048, + "learning_rate": 2.4052453468697125e-05, + "loss": 0.887, + "step": 9202 + }, + { + "epoch": 5.19063733784546, + "grad_norm": 1.3456531763076782, + "learning_rate": 2.4049633389734914e-05, + "loss": 0.7344, + "step": 9203 + }, + { + "epoch": 5.191201353637902, + "grad_norm": 1.7364964485168457, + "learning_rate": 2.4046813310772703e-05, + "loss": 0.7658, + "step": 9204 + }, + { + "epoch": 5.191765369430344, + "grad_norm": 1.212285041809082, + "learning_rate": 2.404399323181049e-05, + "loss": 0.7424, + "step": 9205 + }, + { + "epoch": 5.192329385222786, + "grad_norm": 0.9454047083854675, + "learning_rate": 2.404117315284828e-05, + "loss": 0.7854, + "step": 9206 + }, + { + "epoch": 5.192893401015229, + "grad_norm": 1.1783337593078613, + "learning_rate": 2.403835307388607e-05, + "loss": 0.7675, + "step": 9207 + }, + { + "epoch": 5.193457416807671, + "grad_norm": 1.1527276039123535, + "learning_rate": 2.403553299492386e-05, + "loss": 0.7515, + "step": 9208 + }, + { + "epoch": 5.194021432600112, + "grad_norm": 0.9737351536750793, + "learning_rate": 2.4032712915961647e-05, + "loss": 0.7596, + "step": 9209 + }, + { + "epoch": 5.194585448392555, + "grad_norm": 1.048427939414978, + "learning_rate": 2.402989283699944e-05, + "loss": 0.6936, + "step": 9210 + }, + { + "epoch": 5.195149464184997, + "grad_norm": 1.3063615560531616, + "learning_rate": 2.4027072758037224e-05, + "loss": 0.7469, + "step": 9211 + }, + { + "epoch": 5.19571347997744, + "grad_norm": 1.093549132347107, + "learning_rate": 2.4024252679075016e-05, + "loss": 0.8057, + "step": 9212 + }, + { + "epoch": 5.196277495769881, + "grad_norm": 1.452853798866272, + "learning_rate": 2.40214326001128e-05, + "loss": 0.7405, + "step": 9213 + }, + { + "epoch": 5.196841511562324, + "grad_norm": 1.3913264274597168, + "learning_rate": 2.4018612521150594e-05, + "loss": 0.7405, + "step": 9214 + }, + { + "epoch": 5.197405527354766, + "grad_norm": 1.0065817832946777, + "learning_rate": 2.4015792442188383e-05, + "loss": 0.6753, + "step": 9215 + }, + { + "epoch": 5.197969543147208, + "grad_norm": 1.0401896238327026, + "learning_rate": 2.401297236322617e-05, + "loss": 0.7359, + "step": 9216 + }, + { + "epoch": 5.19853355893965, + "grad_norm": 1.639914631843567, + "learning_rate": 2.401015228426396e-05, + "loss": 0.7808, + "step": 9217 + }, + { + "epoch": 5.199097574732092, + "grad_norm": 1.1221351623535156, + "learning_rate": 2.400733220530175e-05, + "loss": 0.7743, + "step": 9218 + }, + { + "epoch": 5.199661590524535, + "grad_norm": 1.3438056707382202, + "learning_rate": 2.4004512126339538e-05, + "loss": 0.834, + "step": 9219 + }, + { + "epoch": 5.200225606316977, + "grad_norm": 1.1950780153274536, + "learning_rate": 2.4001692047377327e-05, + "loss": 0.7224, + "step": 9220 + }, + { + "epoch": 5.200789622109419, + "grad_norm": 1.27397620677948, + "learning_rate": 2.399887196841512e-05, + "loss": 0.6986, + "step": 9221 + }, + { + "epoch": 5.201353637901861, + "grad_norm": 1.6145288944244385, + "learning_rate": 2.3996051889452904e-05, + "loss": 0.7337, + "step": 9222 + }, + { + "epoch": 5.201917653694303, + "grad_norm": 1.8461339473724365, + "learning_rate": 2.3993231810490696e-05, + "loss": 0.81, + "step": 9223 + }, + { + "epoch": 5.202481669486746, + "grad_norm": 1.2588040828704834, + "learning_rate": 2.3990411731528485e-05, + "loss": 0.6862, + "step": 9224 + }, + { + "epoch": 5.2030456852791875, + "grad_norm": 1.8910588026046753, + "learning_rate": 2.3987591652566274e-05, + "loss": 0.785, + "step": 9225 + }, + { + "epoch": 5.20360970107163, + "grad_norm": 0.9278778433799744, + "learning_rate": 2.3984771573604063e-05, + "loss": 0.6787, + "step": 9226 + }, + { + "epoch": 5.204173716864072, + "grad_norm": 0.9133346676826477, + "learning_rate": 2.398195149464185e-05, + "loss": 0.7216, + "step": 9227 + }, + { + "epoch": 5.204737732656515, + "grad_norm": 1.4691014289855957, + "learning_rate": 2.397913141567964e-05, + "loss": 0.7964, + "step": 9228 + }, + { + "epoch": 5.2053017484489565, + "grad_norm": 1.0350592136383057, + "learning_rate": 2.397631133671743e-05, + "loss": 0.7024, + "step": 9229 + }, + { + "epoch": 5.205865764241398, + "grad_norm": 1.63254976272583, + "learning_rate": 2.3973491257755218e-05, + "loss": 0.8866, + "step": 9230 + }, + { + "epoch": 5.206429780033841, + "grad_norm": 0.9318413138389587, + "learning_rate": 2.3970671178793007e-05, + "loss": 0.7542, + "step": 9231 + }, + { + "epoch": 5.206993795826283, + "grad_norm": 0.9732080698013306, + "learning_rate": 2.3967851099830796e-05, + "loss": 0.6574, + "step": 9232 + }, + { + "epoch": 5.2075578116187256, + "grad_norm": 1.0002113580703735, + "learning_rate": 2.3965031020868588e-05, + "loss": 0.7442, + "step": 9233 + }, + { + "epoch": 5.208121827411167, + "grad_norm": 1.5633461475372314, + "learning_rate": 2.3962210941906373e-05, + "loss": 0.7797, + "step": 9234 + }, + { + "epoch": 5.20868584320361, + "grad_norm": 1.1275826692581177, + "learning_rate": 2.3959390862944165e-05, + "loss": 0.6459, + "step": 9235 + }, + { + "epoch": 5.209249858996052, + "grad_norm": 1.0730420351028442, + "learning_rate": 2.395657078398195e-05, + "loss": 0.7306, + "step": 9236 + }, + { + "epoch": 5.209813874788494, + "grad_norm": 1.1050337553024292, + "learning_rate": 2.3953750705019743e-05, + "loss": 0.8017, + "step": 9237 + }, + { + "epoch": 5.210377890580936, + "grad_norm": 1.055834412574768, + "learning_rate": 2.395093062605753e-05, + "loss": 0.8145, + "step": 9238 + }, + { + "epoch": 5.210941906373378, + "grad_norm": 1.350386142730713, + "learning_rate": 2.394811054709532e-05, + "loss": 0.8987, + "step": 9239 + }, + { + "epoch": 5.211505922165821, + "grad_norm": 0.9887906312942505, + "learning_rate": 2.394529046813311e-05, + "loss": 0.706, + "step": 9240 + }, + { + "epoch": 5.212069937958263, + "grad_norm": 1.4792168140411377, + "learning_rate": 2.3942470389170898e-05, + "loss": 0.8255, + "step": 9241 + }, + { + "epoch": 5.212633953750705, + "grad_norm": 1.4443113803863525, + "learning_rate": 2.3939650310208687e-05, + "loss": 0.6641, + "step": 9242 + }, + { + "epoch": 5.213197969543147, + "grad_norm": 0.9285457730293274, + "learning_rate": 2.3936830231246476e-05, + "loss": 0.6302, + "step": 9243 + }, + { + "epoch": 5.213761985335589, + "grad_norm": 1.0199360847473145, + "learning_rate": 2.3934010152284265e-05, + "loss": 0.7338, + "step": 9244 + }, + { + "epoch": 5.214326001128032, + "grad_norm": 1.1511306762695312, + "learning_rate": 2.3931190073322053e-05, + "loss": 0.7477, + "step": 9245 + }, + { + "epoch": 5.2148900169204735, + "grad_norm": 1.1566157341003418, + "learning_rate": 2.3928369994359842e-05, + "loss": 0.8526, + "step": 9246 + }, + { + "epoch": 5.215454032712916, + "grad_norm": 1.0521653890609741, + "learning_rate": 2.392554991539763e-05, + "loss": 0.6233, + "step": 9247 + }, + { + "epoch": 5.216018048505358, + "grad_norm": 1.010563850402832, + "learning_rate": 2.392272983643542e-05, + "loss": 0.7539, + "step": 9248 + }, + { + "epoch": 5.216582064297801, + "grad_norm": 1.1303040981292725, + "learning_rate": 2.3919909757473212e-05, + "loss": 0.6113, + "step": 9249 + }, + { + "epoch": 5.2171460800902425, + "grad_norm": 1.0339933633804321, + "learning_rate": 2.3917089678510997e-05, + "loss": 0.768, + "step": 9250 + }, + { + "epoch": 5.217710095882684, + "grad_norm": 1.298516869544983, + "learning_rate": 2.391426959954879e-05, + "loss": 0.713, + "step": 9251 + }, + { + "epoch": 5.218274111675127, + "grad_norm": 0.9336246848106384, + "learning_rate": 2.3911449520586575e-05, + "loss": 0.7537, + "step": 9252 + }, + { + "epoch": 5.218838127467569, + "grad_norm": 1.0397017002105713, + "learning_rate": 2.3908629441624367e-05, + "loss": 0.699, + "step": 9253 + }, + { + "epoch": 5.2194021432600115, + "grad_norm": 1.249405860900879, + "learning_rate": 2.3905809362662156e-05, + "loss": 0.7904, + "step": 9254 + }, + { + "epoch": 5.219966159052453, + "grad_norm": 1.0895580053329468, + "learning_rate": 2.3902989283699945e-05, + "loss": 0.7528, + "step": 9255 + }, + { + "epoch": 5.220530174844896, + "grad_norm": 1.468550205230713, + "learning_rate": 2.3900169204737733e-05, + "loss": 0.7722, + "step": 9256 + }, + { + "epoch": 5.221094190637338, + "grad_norm": 1.0290528535842896, + "learning_rate": 2.3897349125775522e-05, + "loss": 0.7099, + "step": 9257 + }, + { + "epoch": 5.22165820642978, + "grad_norm": 1.467636227607727, + "learning_rate": 2.3894529046813314e-05, + "loss": 0.7632, + "step": 9258 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 1.1575891971588135, + "learning_rate": 2.38917089678511e-05, + "loss": 0.7756, + "step": 9259 + }, + { + "epoch": 5.222786238014664, + "grad_norm": 0.8888334631919861, + "learning_rate": 2.3888888888888892e-05, + "loss": 0.6599, + "step": 9260 + }, + { + "epoch": 5.223350253807107, + "grad_norm": 1.2327791452407837, + "learning_rate": 2.3886068809926677e-05, + "loss": 0.7504, + "step": 9261 + }, + { + "epoch": 5.223914269599549, + "grad_norm": 1.897143006324768, + "learning_rate": 2.388324873096447e-05, + "loss": 0.7864, + "step": 9262 + }, + { + "epoch": 5.224478285391991, + "grad_norm": 1.3866184949874878, + "learning_rate": 2.388042865200226e-05, + "loss": 0.7358, + "step": 9263 + }, + { + "epoch": 5.225042301184433, + "grad_norm": 1.3497484922409058, + "learning_rate": 2.3877608573040047e-05, + "loss": 0.7957, + "step": 9264 + }, + { + "epoch": 5.225606316976875, + "grad_norm": 1.717993974685669, + "learning_rate": 2.3874788494077836e-05, + "loss": 0.8158, + "step": 9265 + }, + { + "epoch": 5.226170332769318, + "grad_norm": 1.1983706951141357, + "learning_rate": 2.3871968415115625e-05, + "loss": 0.6999, + "step": 9266 + }, + { + "epoch": 5.2267343485617594, + "grad_norm": 1.2934036254882812, + "learning_rate": 2.3869148336153414e-05, + "loss": 0.8555, + "step": 9267 + }, + { + "epoch": 5.227298364354202, + "grad_norm": 1.2909349203109741, + "learning_rate": 2.3866328257191202e-05, + "loss": 0.7985, + "step": 9268 + }, + { + "epoch": 5.227862380146644, + "grad_norm": 0.8749362230300903, + "learning_rate": 2.386350817822899e-05, + "loss": 0.6062, + "step": 9269 + }, + { + "epoch": 5.228426395939087, + "grad_norm": 1.5695083141326904, + "learning_rate": 2.386068809926678e-05, + "loss": 0.8178, + "step": 9270 + }, + { + "epoch": 5.2289904117315285, + "grad_norm": 1.2438528537750244, + "learning_rate": 2.385786802030457e-05, + "loss": 0.6445, + "step": 9271 + }, + { + "epoch": 5.22955442752397, + "grad_norm": 1.2526893615722656, + "learning_rate": 2.385504794134236e-05, + "loss": 0.7109, + "step": 9272 + }, + { + "epoch": 5.230118443316413, + "grad_norm": 1.278718113899231, + "learning_rate": 2.3852227862380146e-05, + "loss": 0.7506, + "step": 9273 + }, + { + "epoch": 5.230682459108855, + "grad_norm": 1.3011620044708252, + "learning_rate": 2.384940778341794e-05, + "loss": 0.7835, + "step": 9274 + }, + { + "epoch": 5.2312464749012975, + "grad_norm": 1.1364840269088745, + "learning_rate": 2.3846587704455724e-05, + "loss": 0.7061, + "step": 9275 + }, + { + "epoch": 5.231810490693739, + "grad_norm": 1.3348069190979004, + "learning_rate": 2.3843767625493516e-05, + "loss": 0.7377, + "step": 9276 + }, + { + "epoch": 5.232374506486182, + "grad_norm": 0.9385221600532532, + "learning_rate": 2.38409475465313e-05, + "loss": 0.6524, + "step": 9277 + }, + { + "epoch": 5.232938522278624, + "grad_norm": 1.5019850730895996, + "learning_rate": 2.3838127467569094e-05, + "loss": 0.7204, + "step": 9278 + }, + { + "epoch": 5.233502538071066, + "grad_norm": 1.164769172668457, + "learning_rate": 2.3835307388606883e-05, + "loss": 0.6991, + "step": 9279 + }, + { + "epoch": 5.234066553863508, + "grad_norm": 1.2374341487884521, + "learning_rate": 2.383248730964467e-05, + "loss": 0.8381, + "step": 9280 + }, + { + "epoch": 5.23463056965595, + "grad_norm": 1.5358268022537231, + "learning_rate": 2.382966723068246e-05, + "loss": 0.7369, + "step": 9281 + }, + { + "epoch": 5.235194585448393, + "grad_norm": 1.5708097219467163, + "learning_rate": 2.382684715172025e-05, + "loss": 0.858, + "step": 9282 + }, + { + "epoch": 5.235758601240835, + "grad_norm": 1.396201729774475, + "learning_rate": 2.3824027072758038e-05, + "loss": 0.7233, + "step": 9283 + }, + { + "epoch": 5.236322617033277, + "grad_norm": 1.0211522579193115, + "learning_rate": 2.3821206993795827e-05, + "loss": 0.6976, + "step": 9284 + }, + { + "epoch": 5.236886632825719, + "grad_norm": 1.2432916164398193, + "learning_rate": 2.3818386914833615e-05, + "loss": 0.7911, + "step": 9285 + }, + { + "epoch": 5.237450648618161, + "grad_norm": 1.01417875289917, + "learning_rate": 2.3815566835871404e-05, + "loss": 0.6828, + "step": 9286 + }, + { + "epoch": 5.238014664410604, + "grad_norm": 1.4470100402832031, + "learning_rate": 2.3812746756909193e-05, + "loss": 0.7102, + "step": 9287 + }, + { + "epoch": 5.238578680203045, + "grad_norm": 1.010351300239563, + "learning_rate": 2.3809926677946985e-05, + "loss": 0.7867, + "step": 9288 + }, + { + "epoch": 5.239142695995488, + "grad_norm": 1.261142373085022, + "learning_rate": 2.380710659898477e-05, + "loss": 0.7249, + "step": 9289 + }, + { + "epoch": 5.23970671178793, + "grad_norm": 0.863020122051239, + "learning_rate": 2.3804286520022563e-05, + "loss": 0.771, + "step": 9290 + }, + { + "epoch": 5.240270727580373, + "grad_norm": 0.9708882570266724, + "learning_rate": 2.380146644106035e-05, + "loss": 0.5726, + "step": 9291 + }, + { + "epoch": 5.240834743372814, + "grad_norm": 0.93086838722229, + "learning_rate": 2.379864636209814e-05, + "loss": 0.6436, + "step": 9292 + }, + { + "epoch": 5.241398759165256, + "grad_norm": 1.0957196950912476, + "learning_rate": 2.379582628313593e-05, + "loss": 0.774, + "step": 9293 + }, + { + "epoch": 5.241962774957699, + "grad_norm": 1.3480411767959595, + "learning_rate": 2.3793006204173718e-05, + "loss": 0.833, + "step": 9294 + }, + { + "epoch": 5.242526790750141, + "grad_norm": 1.4171704053878784, + "learning_rate": 2.3790186125211507e-05, + "loss": 0.8292, + "step": 9295 + }, + { + "epoch": 5.243090806542583, + "grad_norm": 1.1639713048934937, + "learning_rate": 2.3787366046249296e-05, + "loss": 0.6898, + "step": 9296 + }, + { + "epoch": 5.243654822335025, + "grad_norm": 0.9252480268478394, + "learning_rate": 2.3784545967287088e-05, + "loss": 0.7504, + "step": 9297 + }, + { + "epoch": 5.244218838127468, + "grad_norm": 1.0753211975097656, + "learning_rate": 2.3781725888324873e-05, + "loss": 0.7518, + "step": 9298 + }, + { + "epoch": 5.24478285391991, + "grad_norm": 1.1838932037353516, + "learning_rate": 2.3778905809362665e-05, + "loss": 0.8124, + "step": 9299 + }, + { + "epoch": 5.2453468697123515, + "grad_norm": 1.5608927011489868, + "learning_rate": 2.377608573040045e-05, + "loss": 0.6891, + "step": 9300 + }, + { + "epoch": 5.245910885504794, + "grad_norm": 0.9534503221511841, + "learning_rate": 2.3773265651438243e-05, + "loss": 0.7553, + "step": 9301 + }, + { + "epoch": 5.246474901297236, + "grad_norm": 1.154222846031189, + "learning_rate": 2.3770445572476028e-05, + "loss": 0.7371, + "step": 9302 + }, + { + "epoch": 5.247038917089679, + "grad_norm": 0.9338168501853943, + "learning_rate": 2.376762549351382e-05, + "loss": 0.7818, + "step": 9303 + }, + { + "epoch": 5.2476029328821205, + "grad_norm": 0.9360251426696777, + "learning_rate": 2.376480541455161e-05, + "loss": 0.814, + "step": 9304 + }, + { + "epoch": 5.248166948674563, + "grad_norm": 1.3953197002410889, + "learning_rate": 2.3761985335589398e-05, + "loss": 0.7678, + "step": 9305 + }, + { + "epoch": 5.248730964467005, + "grad_norm": 1.1066372394561768, + "learning_rate": 2.3759165256627187e-05, + "loss": 0.7757, + "step": 9306 + }, + { + "epoch": 5.249294980259447, + "grad_norm": 1.2286309003829956, + "learning_rate": 2.3756345177664976e-05, + "loss": 0.75, + "step": 9307 + }, + { + "epoch": 5.2498589960518895, + "grad_norm": 1.317941665649414, + "learning_rate": 2.3753525098702764e-05, + "loss": 0.725, + "step": 9308 + }, + { + "epoch": 5.250423011844331, + "grad_norm": 1.0715999603271484, + "learning_rate": 2.3750705019740553e-05, + "loss": 0.6849, + "step": 9309 + }, + { + "epoch": 5.250987027636774, + "grad_norm": 1.1206068992614746, + "learning_rate": 2.3747884940778342e-05, + "loss": 0.6609, + "step": 9310 + }, + { + "epoch": 5.251551043429216, + "grad_norm": 1.0989452600479126, + "learning_rate": 2.374506486181613e-05, + "loss": 0.7387, + "step": 9311 + }, + { + "epoch": 5.2521150592216586, + "grad_norm": 0.8333925604820251, + "learning_rate": 2.374224478285392e-05, + "loss": 0.6405, + "step": 9312 + }, + { + "epoch": 5.2526790750141, + "grad_norm": 1.1806987524032593, + "learning_rate": 2.3739424703891712e-05, + "loss": 0.7951, + "step": 9313 + }, + { + "epoch": 5.253243090806542, + "grad_norm": 1.3171536922454834, + "learning_rate": 2.3736604624929497e-05, + "loss": 0.7208, + "step": 9314 + }, + { + "epoch": 5.253807106598985, + "grad_norm": 1.1733731031417847, + "learning_rate": 2.373378454596729e-05, + "loss": 0.7827, + "step": 9315 + }, + { + "epoch": 5.254371122391427, + "grad_norm": 1.4975935220718384, + "learning_rate": 2.3730964467005075e-05, + "loss": 0.8079, + "step": 9316 + }, + { + "epoch": 5.254935138183869, + "grad_norm": 1.2689470052719116, + "learning_rate": 2.3728144388042867e-05, + "loss": 0.7159, + "step": 9317 + }, + { + "epoch": 5.255499153976311, + "grad_norm": 1.3881314992904663, + "learning_rate": 2.3725324309080656e-05, + "loss": 0.8076, + "step": 9318 + }, + { + "epoch": 5.256063169768754, + "grad_norm": 1.3836164474487305, + "learning_rate": 2.3722504230118445e-05, + "loss": 0.6581, + "step": 9319 + }, + { + "epoch": 5.256627185561196, + "grad_norm": 1.2516459226608276, + "learning_rate": 2.3719684151156233e-05, + "loss": 0.8426, + "step": 9320 + }, + { + "epoch": 5.2571912013536375, + "grad_norm": 1.011025309562683, + "learning_rate": 2.3716864072194022e-05, + "loss": 0.6193, + "step": 9321 + }, + { + "epoch": 5.25775521714608, + "grad_norm": 1.0799096822738647, + "learning_rate": 2.371404399323181e-05, + "loss": 0.7147, + "step": 9322 + }, + { + "epoch": 5.258319232938522, + "grad_norm": 1.0402930974960327, + "learning_rate": 2.37112239142696e-05, + "loss": 0.7155, + "step": 9323 + }, + { + "epoch": 5.258883248730965, + "grad_norm": 1.5398298501968384, + "learning_rate": 2.370840383530739e-05, + "loss": 0.8089, + "step": 9324 + }, + { + "epoch": 5.2594472645234065, + "grad_norm": 1.1106891632080078, + "learning_rate": 2.3705583756345177e-05, + "loss": 0.7008, + "step": 9325 + }, + { + "epoch": 5.260011280315849, + "grad_norm": 0.9598763585090637, + "learning_rate": 2.370276367738297e-05, + "loss": 0.6557, + "step": 9326 + }, + { + "epoch": 5.260575296108291, + "grad_norm": 1.3107255697250366, + "learning_rate": 2.369994359842076e-05, + "loss": 0.7843, + "step": 9327 + }, + { + "epoch": 5.261139311900733, + "grad_norm": 1.814447045326233, + "learning_rate": 2.3697123519458547e-05, + "loss": 0.7803, + "step": 9328 + }, + { + "epoch": 5.2617033276931755, + "grad_norm": 1.3076906204223633, + "learning_rate": 2.3694303440496336e-05, + "loss": 0.7584, + "step": 9329 + }, + { + "epoch": 5.262267343485617, + "grad_norm": 1.0541775226593018, + "learning_rate": 2.3691483361534125e-05, + "loss": 0.7192, + "step": 9330 + }, + { + "epoch": 5.26283135927806, + "grad_norm": 1.0283700227737427, + "learning_rate": 2.3688663282571914e-05, + "loss": 0.711, + "step": 9331 + }, + { + "epoch": 5.263395375070502, + "grad_norm": 1.0443166494369507, + "learning_rate": 2.3685843203609702e-05, + "loss": 0.7542, + "step": 9332 + }, + { + "epoch": 5.2639593908629445, + "grad_norm": 0.8931341171264648, + "learning_rate": 2.368302312464749e-05, + "loss": 0.7535, + "step": 9333 + }, + { + "epoch": 5.264523406655386, + "grad_norm": 1.6751604080200195, + "learning_rate": 2.368020304568528e-05, + "loss": 0.8026, + "step": 9334 + }, + { + "epoch": 5.265087422447828, + "grad_norm": 1.0932930707931519, + "learning_rate": 2.367738296672307e-05, + "loss": 0.77, + "step": 9335 + }, + { + "epoch": 5.265651438240271, + "grad_norm": 1.1579508781433105, + "learning_rate": 2.367456288776086e-05, + "loss": 0.7542, + "step": 9336 + }, + { + "epoch": 5.266215454032713, + "grad_norm": 0.9959549903869629, + "learning_rate": 2.3671742808798646e-05, + "loss": 0.796, + "step": 9337 + }, + { + "epoch": 5.266779469825155, + "grad_norm": 1.5030406713485718, + "learning_rate": 2.366892272983644e-05, + "loss": 0.8033, + "step": 9338 + }, + { + "epoch": 5.267343485617597, + "grad_norm": 1.1223653554916382, + "learning_rate": 2.3666102650874224e-05, + "loss": 0.7064, + "step": 9339 + }, + { + "epoch": 5.26790750141004, + "grad_norm": 0.8994312286376953, + "learning_rate": 2.3663282571912016e-05, + "loss": 0.7168, + "step": 9340 + }, + { + "epoch": 5.268471517202482, + "grad_norm": 1.203165054321289, + "learning_rate": 2.36604624929498e-05, + "loss": 0.7514, + "step": 9341 + }, + { + "epoch": 5.269035532994923, + "grad_norm": 0.9740228056907654, + "learning_rate": 2.3657642413987594e-05, + "loss": 0.7075, + "step": 9342 + }, + { + "epoch": 5.269599548787366, + "grad_norm": 0.9706597924232483, + "learning_rate": 2.3654822335025383e-05, + "loss": 0.7426, + "step": 9343 + }, + { + "epoch": 5.270163564579808, + "grad_norm": 1.188464641571045, + "learning_rate": 2.365200225606317e-05, + "loss": 0.7561, + "step": 9344 + }, + { + "epoch": 5.270727580372251, + "grad_norm": 1.129006028175354, + "learning_rate": 2.364918217710096e-05, + "loss": 0.7663, + "step": 9345 + }, + { + "epoch": 5.2712915961646925, + "grad_norm": 1.5517280101776123, + "learning_rate": 2.364636209813875e-05, + "loss": 0.6922, + "step": 9346 + }, + { + "epoch": 5.271855611957135, + "grad_norm": 1.5193796157836914, + "learning_rate": 2.3643542019176538e-05, + "loss": 0.7691, + "step": 9347 + }, + { + "epoch": 5.272419627749577, + "grad_norm": 1.4378122091293335, + "learning_rate": 2.3640721940214326e-05, + "loss": 0.7396, + "step": 9348 + }, + { + "epoch": 5.272983643542019, + "grad_norm": 1.7908393144607544, + "learning_rate": 2.3637901861252115e-05, + "loss": 0.7822, + "step": 9349 + }, + { + "epoch": 5.2735476593344615, + "grad_norm": 1.1063358783721924, + "learning_rate": 2.3635081782289904e-05, + "loss": 0.7521, + "step": 9350 + }, + { + "epoch": 5.274111675126903, + "grad_norm": 1.0874314308166504, + "learning_rate": 2.3632261703327693e-05, + "loss": 0.7203, + "step": 9351 + }, + { + "epoch": 5.274675690919346, + "grad_norm": 1.1466777324676514, + "learning_rate": 2.3629441624365485e-05, + "loss": 0.8014, + "step": 9352 + }, + { + "epoch": 5.275239706711788, + "grad_norm": 0.8973371386528015, + "learning_rate": 2.362662154540327e-05, + "loss": 0.7023, + "step": 9353 + }, + { + "epoch": 5.2758037225042305, + "grad_norm": 1.1398019790649414, + "learning_rate": 2.3623801466441063e-05, + "loss": 0.6543, + "step": 9354 + }, + { + "epoch": 5.276367738296672, + "grad_norm": 1.2874318361282349, + "learning_rate": 2.3620981387478848e-05, + "loss": 0.7279, + "step": 9355 + }, + { + "epoch": 5.276931754089114, + "grad_norm": 1.4906656742095947, + "learning_rate": 2.361816130851664e-05, + "loss": 0.8461, + "step": 9356 + }, + { + "epoch": 5.277495769881557, + "grad_norm": 0.9066860675811768, + "learning_rate": 2.361534122955443e-05, + "loss": 0.6611, + "step": 9357 + }, + { + "epoch": 5.278059785673999, + "grad_norm": 0.8476051092147827, + "learning_rate": 2.3612521150592218e-05, + "loss": 0.6808, + "step": 9358 + }, + { + "epoch": 5.278623801466441, + "grad_norm": 1.3361095190048218, + "learning_rate": 2.3609701071630007e-05, + "loss": 0.7257, + "step": 9359 + }, + { + "epoch": 5.279187817258883, + "grad_norm": 1.9325356483459473, + "learning_rate": 2.3606880992667795e-05, + "loss": 0.9122, + "step": 9360 + }, + { + "epoch": 5.279751833051326, + "grad_norm": 1.1444461345672607, + "learning_rate": 2.3604060913705588e-05, + "loss": 0.7514, + "step": 9361 + }, + { + "epoch": 5.280315848843768, + "grad_norm": 0.900065541267395, + "learning_rate": 2.3601240834743373e-05, + "loss": 0.7393, + "step": 9362 + }, + { + "epoch": 5.280879864636209, + "grad_norm": 1.0995995998382568, + "learning_rate": 2.3598420755781165e-05, + "loss": 0.7135, + "step": 9363 + }, + { + "epoch": 5.281443880428652, + "grad_norm": 1.1572738885879517, + "learning_rate": 2.359560067681895e-05, + "loss": 0.7221, + "step": 9364 + }, + { + "epoch": 5.282007896221094, + "grad_norm": 1.3512747287750244, + "learning_rate": 2.3592780597856743e-05, + "loss": 0.6894, + "step": 9365 + }, + { + "epoch": 5.282571912013537, + "grad_norm": 1.000486135482788, + "learning_rate": 2.358996051889453e-05, + "loss": 0.6829, + "step": 9366 + }, + { + "epoch": 5.283135927805978, + "grad_norm": 1.11508309841156, + "learning_rate": 2.358714043993232e-05, + "loss": 0.7737, + "step": 9367 + }, + { + "epoch": 5.283699943598421, + "grad_norm": 1.374550461769104, + "learning_rate": 2.358432036097011e-05, + "loss": 0.7289, + "step": 9368 + }, + { + "epoch": 5.284263959390863, + "grad_norm": 1.2205991744995117, + "learning_rate": 2.3581500282007898e-05, + "loss": 0.7181, + "step": 9369 + }, + { + "epoch": 5.284827975183305, + "grad_norm": 1.52883780002594, + "learning_rate": 2.3578680203045687e-05, + "loss": 0.8603, + "step": 9370 + }, + { + "epoch": 5.285391990975747, + "grad_norm": 0.9526699185371399, + "learning_rate": 2.3575860124083476e-05, + "loss": 0.6673, + "step": 9371 + }, + { + "epoch": 5.285956006768189, + "grad_norm": 1.2636046409606934, + "learning_rate": 2.3573040045121264e-05, + "loss": 0.7258, + "step": 9372 + }, + { + "epoch": 5.286520022560632, + "grad_norm": 1.2486567497253418, + "learning_rate": 2.3570219966159053e-05, + "loss": 0.7753, + "step": 9373 + }, + { + "epoch": 5.287084038353074, + "grad_norm": 1.0648421049118042, + "learning_rate": 2.3567399887196842e-05, + "loss": 0.7109, + "step": 9374 + }, + { + "epoch": 5.287648054145516, + "grad_norm": 0.8862990140914917, + "learning_rate": 2.3564579808234634e-05, + "loss": 0.6909, + "step": 9375 + }, + { + "epoch": 5.288212069937958, + "grad_norm": 0.9758118391036987, + "learning_rate": 2.356175972927242e-05, + "loss": 0.6953, + "step": 9376 + }, + { + "epoch": 5.288776085730401, + "grad_norm": 1.399340033531189, + "learning_rate": 2.3558939650310212e-05, + "loss": 0.7758, + "step": 9377 + }, + { + "epoch": 5.289340101522843, + "grad_norm": 0.9632096886634827, + "learning_rate": 2.3556119571347997e-05, + "loss": 0.7095, + "step": 9378 + }, + { + "epoch": 5.2899041173152845, + "grad_norm": 1.1213715076446533, + "learning_rate": 2.355329949238579e-05, + "loss": 0.7388, + "step": 9379 + }, + { + "epoch": 5.290468133107727, + "grad_norm": 1.0973094701766968, + "learning_rate": 2.3550479413423575e-05, + "loss": 0.6805, + "step": 9380 + }, + { + "epoch": 5.291032148900169, + "grad_norm": 1.355683445930481, + "learning_rate": 2.3547659334461367e-05, + "loss": 0.7987, + "step": 9381 + }, + { + "epoch": 5.291596164692612, + "grad_norm": 0.9145416021347046, + "learning_rate": 2.3544839255499156e-05, + "loss": 0.693, + "step": 9382 + }, + { + "epoch": 5.2921601804850535, + "grad_norm": 1.2356759309768677, + "learning_rate": 2.3542019176536945e-05, + "loss": 0.7531, + "step": 9383 + }, + { + "epoch": 5.292724196277495, + "grad_norm": 1.0552622079849243, + "learning_rate": 2.3539199097574733e-05, + "loss": 0.8294, + "step": 9384 + }, + { + "epoch": 5.293288212069938, + "grad_norm": 1.0666590929031372, + "learning_rate": 2.3536379018612522e-05, + "loss": 0.7945, + "step": 9385 + }, + { + "epoch": 5.29385222786238, + "grad_norm": 1.0042846202850342, + "learning_rate": 2.353355893965031e-05, + "loss": 0.7307, + "step": 9386 + }, + { + "epoch": 5.2944162436548226, + "grad_norm": 2.6529147624969482, + "learning_rate": 2.35307388606881e-05, + "loss": 0.8225, + "step": 9387 + }, + { + "epoch": 5.294980259447264, + "grad_norm": 1.2384421825408936, + "learning_rate": 2.352791878172589e-05, + "loss": 0.7284, + "step": 9388 + }, + { + "epoch": 5.295544275239707, + "grad_norm": 1.1043009757995605, + "learning_rate": 2.3525098702763677e-05, + "loss": 0.8047, + "step": 9389 + }, + { + "epoch": 5.296108291032149, + "grad_norm": 1.0580931901931763, + "learning_rate": 2.3522278623801466e-05, + "loss": 0.7167, + "step": 9390 + }, + { + "epoch": 5.296672306824592, + "grad_norm": 1.4327538013458252, + "learning_rate": 2.3519458544839258e-05, + "loss": 0.8079, + "step": 9391 + }, + { + "epoch": 5.297236322617033, + "grad_norm": 1.1272413730621338, + "learning_rate": 2.3516638465877044e-05, + "loss": 0.8434, + "step": 9392 + }, + { + "epoch": 5.297800338409475, + "grad_norm": 1.2602753639221191, + "learning_rate": 2.3513818386914836e-05, + "loss": 0.7746, + "step": 9393 + }, + { + "epoch": 5.298364354201918, + "grad_norm": 0.9354076385498047, + "learning_rate": 2.351099830795262e-05, + "loss": 0.7101, + "step": 9394 + }, + { + "epoch": 5.29892836999436, + "grad_norm": 1.3776651620864868, + "learning_rate": 2.3508178228990413e-05, + "loss": 0.8209, + "step": 9395 + }, + { + "epoch": 5.299492385786802, + "grad_norm": 1.1249216794967651, + "learning_rate": 2.3505358150028202e-05, + "loss": 0.7083, + "step": 9396 + }, + { + "epoch": 5.300056401579244, + "grad_norm": 0.9817894697189331, + "learning_rate": 2.350253807106599e-05, + "loss": 0.6515, + "step": 9397 + }, + { + "epoch": 5.300620417371686, + "grad_norm": 1.2404674291610718, + "learning_rate": 2.349971799210378e-05, + "loss": 0.7906, + "step": 9398 + }, + { + "epoch": 5.301184433164129, + "grad_norm": 1.025147557258606, + "learning_rate": 2.349689791314157e-05, + "loss": 0.6577, + "step": 9399 + }, + { + "epoch": 5.3017484489565705, + "grad_norm": 1.230078101158142, + "learning_rate": 2.349407783417936e-05, + "loss": 0.7603, + "step": 9400 + }, + { + "epoch": 5.302312464749013, + "grad_norm": 1.3445634841918945, + "learning_rate": 2.3491257755217146e-05, + "loss": 0.8365, + "step": 9401 + }, + { + "epoch": 5.302876480541455, + "grad_norm": 1.5207949876785278, + "learning_rate": 2.348843767625494e-05, + "loss": 0.7737, + "step": 9402 + }, + { + "epoch": 5.303440496333898, + "grad_norm": 1.0025399923324585, + "learning_rate": 2.3485617597292724e-05, + "loss": 0.7689, + "step": 9403 + }, + { + "epoch": 5.3040045121263395, + "grad_norm": 0.8195226788520813, + "learning_rate": 2.3482797518330516e-05, + "loss": 0.6657, + "step": 9404 + }, + { + "epoch": 5.304568527918782, + "grad_norm": 1.0611491203308105, + "learning_rate": 2.34799774393683e-05, + "loss": 0.6004, + "step": 9405 + }, + { + "epoch": 5.305132543711224, + "grad_norm": 0.8641297817230225, + "learning_rate": 2.3477157360406094e-05, + "loss": 0.7285, + "step": 9406 + }, + { + "epoch": 5.305696559503666, + "grad_norm": 1.1975302696228027, + "learning_rate": 2.3474337281443882e-05, + "loss": 0.7264, + "step": 9407 + }, + { + "epoch": 5.3062605752961085, + "grad_norm": 1.5596004724502563, + "learning_rate": 2.347151720248167e-05, + "loss": 0.7444, + "step": 9408 + }, + { + "epoch": 5.30682459108855, + "grad_norm": 1.0785585641860962, + "learning_rate": 2.346869712351946e-05, + "loss": 0.7154, + "step": 9409 + }, + { + "epoch": 5.307388606880993, + "grad_norm": 0.9770358800888062, + "learning_rate": 2.346587704455725e-05, + "loss": 0.6709, + "step": 9410 + }, + { + "epoch": 5.307952622673435, + "grad_norm": 1.2659192085266113, + "learning_rate": 2.3463056965595038e-05, + "loss": 0.7495, + "step": 9411 + }, + { + "epoch": 5.308516638465877, + "grad_norm": 1.090497612953186, + "learning_rate": 2.3460236886632826e-05, + "loss": 0.6611, + "step": 9412 + }, + { + "epoch": 5.309080654258319, + "grad_norm": 1.1255627870559692, + "learning_rate": 2.3457416807670615e-05, + "loss": 0.6676, + "step": 9413 + }, + { + "epoch": 5.309644670050761, + "grad_norm": 1.4249529838562012, + "learning_rate": 2.3454596728708404e-05, + "loss": 0.6873, + "step": 9414 + }, + { + "epoch": 5.310208685843204, + "grad_norm": 1.1859313249588013, + "learning_rate": 2.3451776649746193e-05, + "loss": 0.7199, + "step": 9415 + }, + { + "epoch": 5.310772701635646, + "grad_norm": 1.3246270418167114, + "learning_rate": 2.3448956570783985e-05, + "loss": 0.7523, + "step": 9416 + }, + { + "epoch": 5.311336717428088, + "grad_norm": 1.2200567722320557, + "learning_rate": 2.344613649182177e-05, + "loss": 0.7724, + "step": 9417 + }, + { + "epoch": 5.31190073322053, + "grad_norm": 1.2884314060211182, + "learning_rate": 2.3443316412859563e-05, + "loss": 0.8186, + "step": 9418 + }, + { + "epoch": 5.312464749012973, + "grad_norm": 1.5149695873260498, + "learning_rate": 2.3440496333897348e-05, + "loss": 0.7677, + "step": 9419 + }, + { + "epoch": 5.313028764805415, + "grad_norm": 1.1972931623458862, + "learning_rate": 2.343767625493514e-05, + "loss": 0.6526, + "step": 9420 + }, + { + "epoch": 5.3135927805978564, + "grad_norm": 1.1776987314224243, + "learning_rate": 2.343485617597293e-05, + "loss": 0.7679, + "step": 9421 + }, + { + "epoch": 5.314156796390299, + "grad_norm": 1.0290091037750244, + "learning_rate": 2.3432036097010718e-05, + "loss": 0.7241, + "step": 9422 + }, + { + "epoch": 5.314720812182741, + "grad_norm": 0.9518216848373413, + "learning_rate": 2.3429216018048507e-05, + "loss": 0.6856, + "step": 9423 + }, + { + "epoch": 5.315284827975184, + "grad_norm": 1.0173726081848145, + "learning_rate": 2.3426395939086295e-05, + "loss": 0.7431, + "step": 9424 + }, + { + "epoch": 5.3158488437676255, + "grad_norm": 1.8926658630371094, + "learning_rate": 2.3423575860124084e-05, + "loss": 0.7169, + "step": 9425 + }, + { + "epoch": 5.316412859560067, + "grad_norm": 1.2064117193222046, + "learning_rate": 2.3420755781161873e-05, + "loss": 0.7363, + "step": 9426 + }, + { + "epoch": 5.31697687535251, + "grad_norm": 1.2105778455734253, + "learning_rate": 2.3417935702199662e-05, + "loss": 0.6944, + "step": 9427 + }, + { + "epoch": 5.317540891144952, + "grad_norm": 1.3457108736038208, + "learning_rate": 2.341511562323745e-05, + "loss": 0.804, + "step": 9428 + }, + { + "epoch": 5.3181049069373945, + "grad_norm": 0.8770219683647156, + "learning_rate": 2.341229554427524e-05, + "loss": 0.6779, + "step": 9429 + }, + { + "epoch": 5.318668922729836, + "grad_norm": 1.523879051208496, + "learning_rate": 2.340947546531303e-05, + "loss": 0.8506, + "step": 9430 + }, + { + "epoch": 5.319232938522279, + "grad_norm": 1.3876922130584717, + "learning_rate": 2.3406655386350817e-05, + "loss": 0.8507, + "step": 9431 + }, + { + "epoch": 5.319796954314721, + "grad_norm": 1.0294835567474365, + "learning_rate": 2.340383530738861e-05, + "loss": 0.7116, + "step": 9432 + }, + { + "epoch": 5.3203609701071635, + "grad_norm": 1.1459676027297974, + "learning_rate": 2.3401015228426398e-05, + "loss": 0.8099, + "step": 9433 + }, + { + "epoch": 5.320924985899605, + "grad_norm": 0.8994395732879639, + "learning_rate": 2.3398195149464187e-05, + "loss": 0.6838, + "step": 9434 + }, + { + "epoch": 5.321489001692047, + "grad_norm": 1.237510323524475, + "learning_rate": 2.3395375070501976e-05, + "loss": 0.7172, + "step": 9435 + }, + { + "epoch": 5.32205301748449, + "grad_norm": 1.246864676475525, + "learning_rate": 2.3392554991539764e-05, + "loss": 0.6739, + "step": 9436 + }, + { + "epoch": 5.322617033276932, + "grad_norm": 1.880751371383667, + "learning_rate": 2.3389734912577553e-05, + "loss": 0.8449, + "step": 9437 + }, + { + "epoch": 5.323181049069374, + "grad_norm": 0.9630811214447021, + "learning_rate": 2.3386914833615342e-05, + "loss": 0.7101, + "step": 9438 + }, + { + "epoch": 5.323745064861816, + "grad_norm": 1.70271897315979, + "learning_rate": 2.3384094754653134e-05, + "loss": 0.6798, + "step": 9439 + }, + { + "epoch": 5.324309080654258, + "grad_norm": 1.0922209024429321, + "learning_rate": 2.338127467569092e-05, + "loss": 0.6468, + "step": 9440 + }, + { + "epoch": 5.324873096446701, + "grad_norm": 1.6972237825393677, + "learning_rate": 2.337845459672871e-05, + "loss": 0.7657, + "step": 9441 + }, + { + "epoch": 5.325437112239142, + "grad_norm": 0.9366665482521057, + "learning_rate": 2.3375634517766497e-05, + "loss": 0.7989, + "step": 9442 + }, + { + "epoch": 5.326001128031585, + "grad_norm": 1.0862531661987305, + "learning_rate": 2.337281443880429e-05, + "loss": 0.6767, + "step": 9443 + }, + { + "epoch": 5.326565143824027, + "grad_norm": 1.041310429573059, + "learning_rate": 2.3369994359842075e-05, + "loss": 0.6436, + "step": 9444 + }, + { + "epoch": 5.32712915961647, + "grad_norm": 1.6043530702590942, + "learning_rate": 2.3367174280879867e-05, + "loss": 0.7632, + "step": 9445 + }, + { + "epoch": 5.327693175408911, + "grad_norm": 1.0590122938156128, + "learning_rate": 2.3364354201917656e-05, + "loss": 0.6859, + "step": 9446 + }, + { + "epoch": 5.328257191201354, + "grad_norm": 1.36518394947052, + "learning_rate": 2.3361534122955444e-05, + "loss": 0.7731, + "step": 9447 + }, + { + "epoch": 5.328821206993796, + "grad_norm": 1.6122652292251587, + "learning_rate": 2.3358714043993233e-05, + "loss": 0.7466, + "step": 9448 + }, + { + "epoch": 5.329385222786238, + "grad_norm": 1.0065131187438965, + "learning_rate": 2.3355893965031022e-05, + "loss": 0.725, + "step": 9449 + }, + { + "epoch": 5.32994923857868, + "grad_norm": 1.2959187030792236, + "learning_rate": 2.335307388606881e-05, + "loss": 0.7555, + "step": 9450 + }, + { + "epoch": 5.330513254371122, + "grad_norm": 1.351271390914917, + "learning_rate": 2.33502538071066e-05, + "loss": 0.8143, + "step": 9451 + }, + { + "epoch": 5.331077270163565, + "grad_norm": 2.6470301151275635, + "learning_rate": 2.334743372814439e-05, + "loss": 0.8129, + "step": 9452 + }, + { + "epoch": 5.331641285956007, + "grad_norm": 1.062502384185791, + "learning_rate": 2.3344613649182177e-05, + "loss": 0.6349, + "step": 9453 + }, + { + "epoch": 5.3322053017484485, + "grad_norm": 0.8839182257652283, + "learning_rate": 2.3341793570219966e-05, + "loss": 0.7199, + "step": 9454 + }, + { + "epoch": 5.332769317540891, + "grad_norm": 1.1612097024917603, + "learning_rate": 2.3338973491257758e-05, + "loss": 0.7104, + "step": 9455 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 1.3772774934768677, + "learning_rate": 2.3336153412295544e-05, + "loss": 0.6899, + "step": 9456 + }, + { + "epoch": 5.333897349125776, + "grad_norm": 1.2871829271316528, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.7533, + "step": 9457 + }, + { + "epoch": 5.3344613649182175, + "grad_norm": 1.2840993404388428, + "learning_rate": 2.333051325437112e-05, + "loss": 0.7848, + "step": 9458 + }, + { + "epoch": 5.33502538071066, + "grad_norm": 1.096124291419983, + "learning_rate": 2.3327693175408913e-05, + "loss": 0.7146, + "step": 9459 + }, + { + "epoch": 5.335589396503102, + "grad_norm": 0.9994836449623108, + "learning_rate": 2.33248730964467e-05, + "loss": 0.6434, + "step": 9460 + }, + { + "epoch": 5.336153412295545, + "grad_norm": 0.9837965369224548, + "learning_rate": 2.332205301748449e-05, + "loss": 0.7746, + "step": 9461 + }, + { + "epoch": 5.3367174280879865, + "grad_norm": 1.0755212306976318, + "learning_rate": 2.331923293852228e-05, + "loss": 0.7071, + "step": 9462 + }, + { + "epoch": 5.337281443880428, + "grad_norm": 1.879425048828125, + "learning_rate": 2.331641285956007e-05, + "loss": 0.7431, + "step": 9463 + }, + { + "epoch": 5.337845459672871, + "grad_norm": 1.1344454288482666, + "learning_rate": 2.3313592780597857e-05, + "loss": 0.749, + "step": 9464 + }, + { + "epoch": 5.338409475465313, + "grad_norm": 1.1139200925827026, + "learning_rate": 2.3310772701635646e-05, + "loss": 0.8149, + "step": 9465 + }, + { + "epoch": 5.3389734912577556, + "grad_norm": 1.2296767234802246, + "learning_rate": 2.3307952622673435e-05, + "loss": 0.6897, + "step": 9466 + }, + { + "epoch": 5.339537507050197, + "grad_norm": 0.8694438934326172, + "learning_rate": 2.3305132543711224e-05, + "loss": 0.676, + "step": 9467 + }, + { + "epoch": 5.340101522842639, + "grad_norm": 0.8417538404464722, + "learning_rate": 2.3302312464749016e-05, + "loss": 0.5631, + "step": 9468 + }, + { + "epoch": 5.340665538635082, + "grad_norm": 1.3526115417480469, + "learning_rate": 2.32994923857868e-05, + "loss": 0.7729, + "step": 9469 + }, + { + "epoch": 5.341229554427524, + "grad_norm": 0.9393528699874878, + "learning_rate": 2.3296672306824594e-05, + "loss": 0.7994, + "step": 9470 + }, + { + "epoch": 5.341793570219966, + "grad_norm": 1.116883635520935, + "learning_rate": 2.3293852227862382e-05, + "loss": 0.6979, + "step": 9471 + }, + { + "epoch": 5.342357586012408, + "grad_norm": 1.2150040864944458, + "learning_rate": 2.329103214890017e-05, + "loss": 0.7721, + "step": 9472 + }, + { + "epoch": 5.342921601804851, + "grad_norm": 0.9622020721435547, + "learning_rate": 2.328821206993796e-05, + "loss": 0.7456, + "step": 9473 + }, + { + "epoch": 5.343485617597293, + "grad_norm": 1.229841709136963, + "learning_rate": 2.328539199097575e-05, + "loss": 0.705, + "step": 9474 + }, + { + "epoch": 5.344049633389735, + "grad_norm": 1.5842775106430054, + "learning_rate": 2.3282571912013538e-05, + "loss": 0.7639, + "step": 9475 + }, + { + "epoch": 5.344613649182177, + "grad_norm": 1.3013700246810913, + "learning_rate": 2.3279751833051326e-05, + "loss": 0.7214, + "step": 9476 + }, + { + "epoch": 5.345177664974619, + "grad_norm": 1.1207953691482544, + "learning_rate": 2.3276931754089115e-05, + "loss": 0.7832, + "step": 9477 + }, + { + "epoch": 5.345741680767062, + "grad_norm": 1.1656360626220703, + "learning_rate": 2.3274111675126904e-05, + "loss": 0.7944, + "step": 9478 + }, + { + "epoch": 5.3463056965595035, + "grad_norm": 1.1442843675613403, + "learning_rate": 2.3271291596164693e-05, + "loss": 0.7953, + "step": 9479 + }, + { + "epoch": 5.346869712351946, + "grad_norm": 1.4844419956207275, + "learning_rate": 2.3268471517202485e-05, + "loss": 0.8417, + "step": 9480 + }, + { + "epoch": 5.347433728144388, + "grad_norm": 1.2755790948867798, + "learning_rate": 2.326565143824027e-05, + "loss": 0.6648, + "step": 9481 + }, + { + "epoch": 5.34799774393683, + "grad_norm": 1.048895001411438, + "learning_rate": 2.3262831359278062e-05, + "loss": 0.7157, + "step": 9482 + }, + { + "epoch": 5.3485617597292725, + "grad_norm": 1.0891817808151245, + "learning_rate": 2.3260011280315848e-05, + "loss": 0.7737, + "step": 9483 + }, + { + "epoch": 5.349125775521714, + "grad_norm": 0.911840558052063, + "learning_rate": 2.325719120135364e-05, + "loss": 0.671, + "step": 9484 + }, + { + "epoch": 5.349689791314157, + "grad_norm": 1.3341671228408813, + "learning_rate": 2.325437112239143e-05, + "loss": 0.6706, + "step": 9485 + }, + { + "epoch": 5.350253807106599, + "grad_norm": 1.0620675086975098, + "learning_rate": 2.3251551043429218e-05, + "loss": 0.6752, + "step": 9486 + }, + { + "epoch": 5.3508178228990415, + "grad_norm": 0.802545964717865, + "learning_rate": 2.3248730964467006e-05, + "loss": 0.6746, + "step": 9487 + }, + { + "epoch": 5.351381838691483, + "grad_norm": 1.3889011144638062, + "learning_rate": 2.3245910885504795e-05, + "loss": 0.6962, + "step": 9488 + }, + { + "epoch": 5.351945854483926, + "grad_norm": 1.0115876197814941, + "learning_rate": 2.3243090806542584e-05, + "loss": 0.733, + "step": 9489 + }, + { + "epoch": 5.352509870276368, + "grad_norm": 0.938983142375946, + "learning_rate": 2.3240270727580373e-05, + "loss": 0.6728, + "step": 9490 + }, + { + "epoch": 5.35307388606881, + "grad_norm": 1.0398122072219849, + "learning_rate": 2.323745064861816e-05, + "loss": 0.9062, + "step": 9491 + }, + { + "epoch": 5.353637901861252, + "grad_norm": 1.3232873678207397, + "learning_rate": 2.323463056965595e-05, + "loss": 0.7219, + "step": 9492 + }, + { + "epoch": 5.354201917653694, + "grad_norm": 1.2590558528900146, + "learning_rate": 2.323181049069374e-05, + "loss": 0.6505, + "step": 9493 + }, + { + "epoch": 5.354765933446137, + "grad_norm": 1.0008809566497803, + "learning_rate": 2.322899041173153e-05, + "loss": 0.7974, + "step": 9494 + }, + { + "epoch": 5.355329949238579, + "grad_norm": 1.1801364421844482, + "learning_rate": 2.3226170332769317e-05, + "loss": 0.7308, + "step": 9495 + }, + { + "epoch": 5.35589396503102, + "grad_norm": 0.8587358593940735, + "learning_rate": 2.322335025380711e-05, + "loss": 0.7642, + "step": 9496 + }, + { + "epoch": 5.356457980823463, + "grad_norm": 0.7856799364089966, + "learning_rate": 2.3220530174844894e-05, + "loss": 0.6496, + "step": 9497 + }, + { + "epoch": 5.357021996615905, + "grad_norm": 1.4370758533477783, + "learning_rate": 2.3217710095882687e-05, + "loss": 0.7617, + "step": 9498 + }, + { + "epoch": 5.357586012408348, + "grad_norm": 1.2333701848983765, + "learning_rate": 2.3214890016920472e-05, + "loss": 0.6963, + "step": 9499 + }, + { + "epoch": 5.3581500282007894, + "grad_norm": 1.1793745756149292, + "learning_rate": 2.3212069937958264e-05, + "loss": 0.7034, + "step": 9500 + }, + { + "epoch": 5.358714043993232, + "grad_norm": 1.0360403060913086, + "learning_rate": 2.3209249858996053e-05, + "loss": 0.7245, + "step": 9501 + }, + { + "epoch": 5.359278059785674, + "grad_norm": 1.1637778282165527, + "learning_rate": 2.3206429780033842e-05, + "loss": 0.7115, + "step": 9502 + }, + { + "epoch": 5.359842075578117, + "grad_norm": 1.011515498161316, + "learning_rate": 2.3203609701071634e-05, + "loss": 0.7215, + "step": 9503 + }, + { + "epoch": 5.3604060913705585, + "grad_norm": 1.320739507675171, + "learning_rate": 2.320078962210942e-05, + "loss": 0.7034, + "step": 9504 + }, + { + "epoch": 5.360970107163, + "grad_norm": 1.212391972541809, + "learning_rate": 2.319796954314721e-05, + "loss": 0.688, + "step": 9505 + }, + { + "epoch": 5.361534122955443, + "grad_norm": 1.0648225545883179, + "learning_rate": 2.3195149464184997e-05, + "loss": 0.8017, + "step": 9506 + }, + { + "epoch": 5.362098138747885, + "grad_norm": 1.2969640493392944, + "learning_rate": 2.319232938522279e-05, + "loss": 0.7515, + "step": 9507 + }, + { + "epoch": 5.3626621545403275, + "grad_norm": 1.437687635421753, + "learning_rate": 2.3189509306260575e-05, + "loss": 0.8443, + "step": 9508 + }, + { + "epoch": 5.363226170332769, + "grad_norm": 1.4423048496246338, + "learning_rate": 2.3186689227298367e-05, + "loss": 0.7883, + "step": 9509 + }, + { + "epoch": 5.363790186125211, + "grad_norm": 1.0279234647750854, + "learning_rate": 2.3183869148336156e-05, + "loss": 0.7605, + "step": 9510 + }, + { + "epoch": 5.364354201917654, + "grad_norm": 0.7628734707832336, + "learning_rate": 2.3181049069373944e-05, + "loss": 0.6195, + "step": 9511 + }, + { + "epoch": 5.364918217710096, + "grad_norm": 0.9677740931510925, + "learning_rate": 2.3178228990411733e-05, + "loss": 0.6398, + "step": 9512 + }, + { + "epoch": 5.365482233502538, + "grad_norm": 0.8717389106750488, + "learning_rate": 2.3175408911449522e-05, + "loss": 0.7623, + "step": 9513 + }, + { + "epoch": 5.36604624929498, + "grad_norm": 1.2389674186706543, + "learning_rate": 2.317258883248731e-05, + "loss": 0.8337, + "step": 9514 + }, + { + "epoch": 5.366610265087423, + "grad_norm": 1.355407953262329, + "learning_rate": 2.31697687535251e-05, + "loss": 0.7692, + "step": 9515 + }, + { + "epoch": 5.367174280879865, + "grad_norm": 1.0422285795211792, + "learning_rate": 2.316694867456289e-05, + "loss": 0.6821, + "step": 9516 + }, + { + "epoch": 5.367738296672307, + "grad_norm": 1.174258828163147, + "learning_rate": 2.3164128595600677e-05, + "loss": 0.7788, + "step": 9517 + }, + { + "epoch": 5.368302312464749, + "grad_norm": 1.0644550323486328, + "learning_rate": 2.3161308516638466e-05, + "loss": 0.7773, + "step": 9518 + }, + { + "epoch": 5.368866328257191, + "grad_norm": 0.922942042350769, + "learning_rate": 2.3158488437676258e-05, + "loss": 0.5739, + "step": 9519 + }, + { + "epoch": 5.369430344049634, + "grad_norm": 1.6454987525939941, + "learning_rate": 2.3155668358714044e-05, + "loss": 0.7671, + "step": 9520 + }, + { + "epoch": 5.369994359842075, + "grad_norm": 1.0273051261901855, + "learning_rate": 2.3152848279751836e-05, + "loss": 0.5505, + "step": 9521 + }, + { + "epoch": 5.370558375634518, + "grad_norm": 1.120558738708496, + "learning_rate": 2.315002820078962e-05, + "loss": 0.6785, + "step": 9522 + }, + { + "epoch": 5.37112239142696, + "grad_norm": 1.4417119026184082, + "learning_rate": 2.3147208121827413e-05, + "loss": 0.8207, + "step": 9523 + }, + { + "epoch": 5.371686407219402, + "grad_norm": 1.2148818969726562, + "learning_rate": 2.3144388042865202e-05, + "loss": 0.7748, + "step": 9524 + }, + { + "epoch": 5.372250423011844, + "grad_norm": 1.3860859870910645, + "learning_rate": 2.314156796390299e-05, + "loss": 0.8663, + "step": 9525 + }, + { + "epoch": 5.372814438804286, + "grad_norm": 0.9033552408218384, + "learning_rate": 2.313874788494078e-05, + "loss": 0.5818, + "step": 9526 + }, + { + "epoch": 5.373378454596729, + "grad_norm": 1.4030992984771729, + "learning_rate": 2.313592780597857e-05, + "loss": 0.7735, + "step": 9527 + }, + { + "epoch": 5.373942470389171, + "grad_norm": 1.1282098293304443, + "learning_rate": 2.3133107727016357e-05, + "loss": 0.7034, + "step": 9528 + }, + { + "epoch": 5.374506486181613, + "grad_norm": 1.1021095514297485, + "learning_rate": 2.3130287648054146e-05, + "loss": 0.7959, + "step": 9529 + }, + { + "epoch": 5.375070501974055, + "grad_norm": 1.0174024105072021, + "learning_rate": 2.3127467569091935e-05, + "loss": 0.6815, + "step": 9530 + }, + { + "epoch": 5.375634517766498, + "grad_norm": 2.1464974880218506, + "learning_rate": 2.3124647490129724e-05, + "loss": 0.7498, + "step": 9531 + }, + { + "epoch": 5.37619853355894, + "grad_norm": 1.2309433221817017, + "learning_rate": 2.3121827411167512e-05, + "loss": 0.7957, + "step": 9532 + }, + { + "epoch": 5.3767625493513815, + "grad_norm": 1.5050835609436035, + "learning_rate": 2.3119007332205305e-05, + "loss": 0.797, + "step": 9533 + }, + { + "epoch": 5.377326565143824, + "grad_norm": 1.469483494758606, + "learning_rate": 2.311618725324309e-05, + "loss": 0.9068, + "step": 9534 + }, + { + "epoch": 5.377890580936266, + "grad_norm": 1.4268051385879517, + "learning_rate": 2.3113367174280882e-05, + "loss": 0.7995, + "step": 9535 + }, + { + "epoch": 5.378454596728709, + "grad_norm": 1.2805466651916504, + "learning_rate": 2.3110547095318668e-05, + "loss": 0.7315, + "step": 9536 + }, + { + "epoch": 5.3790186125211505, + "grad_norm": 1.230334758758545, + "learning_rate": 2.310772701635646e-05, + "loss": 0.8275, + "step": 9537 + }, + { + "epoch": 5.379582628313592, + "grad_norm": 0.9716184139251709, + "learning_rate": 2.310490693739425e-05, + "loss": 0.623, + "step": 9538 + }, + { + "epoch": 5.380146644106035, + "grad_norm": 1.3350707292556763, + "learning_rate": 2.3102086858432037e-05, + "loss": 0.7671, + "step": 9539 + }, + { + "epoch": 5.380710659898477, + "grad_norm": 1.3511890172958374, + "learning_rate": 2.3099266779469826e-05, + "loss": 0.7571, + "step": 9540 + }, + { + "epoch": 5.3812746756909196, + "grad_norm": 0.8160867691040039, + "learning_rate": 2.3096446700507615e-05, + "loss": 0.6054, + "step": 9541 + }, + { + "epoch": 5.381838691483361, + "grad_norm": 0.9512621164321899, + "learning_rate": 2.3093626621545407e-05, + "loss": 0.7389, + "step": 9542 + }, + { + "epoch": 5.382402707275804, + "grad_norm": 1.3227962255477905, + "learning_rate": 2.3090806542583193e-05, + "loss": 0.7622, + "step": 9543 + }, + { + "epoch": 5.382966723068246, + "grad_norm": 1.532221794128418, + "learning_rate": 2.3087986463620985e-05, + "loss": 0.7451, + "step": 9544 + }, + { + "epoch": 5.383530738860689, + "grad_norm": 1.8134533166885376, + "learning_rate": 2.308516638465877e-05, + "loss": 0.868, + "step": 9545 + }, + { + "epoch": 5.38409475465313, + "grad_norm": 1.091858983039856, + "learning_rate": 2.3082346305696562e-05, + "loss": 0.8208, + "step": 9546 + }, + { + "epoch": 5.384658770445572, + "grad_norm": 1.5226900577545166, + "learning_rate": 2.3079526226734348e-05, + "loss": 0.8895, + "step": 9547 + }, + { + "epoch": 5.385222786238015, + "grad_norm": 1.1232515573501587, + "learning_rate": 2.307670614777214e-05, + "loss": 0.7681, + "step": 9548 + }, + { + "epoch": 5.385786802030457, + "grad_norm": 1.088366150856018, + "learning_rate": 2.307388606880993e-05, + "loss": 0.6393, + "step": 9549 + }, + { + "epoch": 5.386350817822899, + "grad_norm": 1.4139615297317505, + "learning_rate": 2.3071065989847718e-05, + "loss": 0.7136, + "step": 9550 + }, + { + "epoch": 5.386914833615341, + "grad_norm": 0.8522089123725891, + "learning_rate": 2.3068245910885506e-05, + "loss": 0.6561, + "step": 9551 + }, + { + "epoch": 5.387478849407783, + "grad_norm": 1.524105191230774, + "learning_rate": 2.3065425831923295e-05, + "loss": 0.8255, + "step": 9552 + }, + { + "epoch": 5.388042865200226, + "grad_norm": 1.063434362411499, + "learning_rate": 2.3062605752961084e-05, + "loss": 0.7883, + "step": 9553 + }, + { + "epoch": 5.3886068809926675, + "grad_norm": 1.9095981121063232, + "learning_rate": 2.3059785673998873e-05, + "loss": 0.7907, + "step": 9554 + }, + { + "epoch": 5.38917089678511, + "grad_norm": 1.0407017469406128, + "learning_rate": 2.305696559503666e-05, + "loss": 0.7519, + "step": 9555 + }, + { + "epoch": 5.389734912577552, + "grad_norm": 1.6315019130706787, + "learning_rate": 2.305414551607445e-05, + "loss": 0.7912, + "step": 9556 + }, + { + "epoch": 5.390298928369995, + "grad_norm": 1.168796181678772, + "learning_rate": 2.305132543711224e-05, + "loss": 0.741, + "step": 9557 + }, + { + "epoch": 5.3908629441624365, + "grad_norm": 1.621677041053772, + "learning_rate": 2.304850535815003e-05, + "loss": 0.7425, + "step": 9558 + }, + { + "epoch": 5.391426959954879, + "grad_norm": 1.6345993280410767, + "learning_rate": 2.3045685279187817e-05, + "loss": 0.8057, + "step": 9559 + }, + { + "epoch": 5.391990975747321, + "grad_norm": 1.6913206577301025, + "learning_rate": 2.304286520022561e-05, + "loss": 0.9083, + "step": 9560 + }, + { + "epoch": 5.392554991539763, + "grad_norm": 1.3554410934448242, + "learning_rate": 2.3040045121263394e-05, + "loss": 0.7222, + "step": 9561 + }, + { + "epoch": 5.3931190073322055, + "grad_norm": 1.207461953163147, + "learning_rate": 2.3037225042301187e-05, + "loss": 0.808, + "step": 9562 + }, + { + "epoch": 5.393683023124647, + "grad_norm": 1.0589840412139893, + "learning_rate": 2.3034404963338972e-05, + "loss": 0.6814, + "step": 9563 + }, + { + "epoch": 5.39424703891709, + "grad_norm": 1.498752474784851, + "learning_rate": 2.3031584884376764e-05, + "loss": 0.6999, + "step": 9564 + }, + { + "epoch": 5.394811054709532, + "grad_norm": 1.4439252614974976, + "learning_rate": 2.3028764805414553e-05, + "loss": 0.6801, + "step": 9565 + }, + { + "epoch": 5.395375070501974, + "grad_norm": 1.1739798784255981, + "learning_rate": 2.3025944726452342e-05, + "loss": 0.6835, + "step": 9566 + }, + { + "epoch": 5.395939086294416, + "grad_norm": 1.2353754043579102, + "learning_rate": 2.302312464749013e-05, + "loss": 0.7465, + "step": 9567 + }, + { + "epoch": 5.396503102086858, + "grad_norm": 1.1883741617202759, + "learning_rate": 2.302030456852792e-05, + "loss": 0.7264, + "step": 9568 + }, + { + "epoch": 5.397067117879301, + "grad_norm": 1.1487538814544678, + "learning_rate": 2.3017484489565708e-05, + "loss": 0.7934, + "step": 9569 + }, + { + "epoch": 5.397631133671743, + "grad_norm": 1.27131187915802, + "learning_rate": 2.3014664410603497e-05, + "loss": 0.6663, + "step": 9570 + }, + { + "epoch": 5.398195149464185, + "grad_norm": 1.2829790115356445, + "learning_rate": 2.3011844331641286e-05, + "loss": 0.7012, + "step": 9571 + }, + { + "epoch": 5.398759165256627, + "grad_norm": 1.1120208501815796, + "learning_rate": 2.3009024252679075e-05, + "loss": 0.7368, + "step": 9572 + }, + { + "epoch": 5.39932318104907, + "grad_norm": 0.8830032348632812, + "learning_rate": 2.3006204173716867e-05, + "loss": 0.6182, + "step": 9573 + }, + { + "epoch": 5.399887196841512, + "grad_norm": 1.2653928995132446, + "learning_rate": 2.3003384094754655e-05, + "loss": 0.735, + "step": 9574 + }, + { + "epoch": 5.4004512126339534, + "grad_norm": 0.9556828737258911, + "learning_rate": 2.3000564015792444e-05, + "loss": 0.725, + "step": 9575 + }, + { + "epoch": 5.401015228426396, + "grad_norm": 1.0107260942459106, + "learning_rate": 2.2997743936830233e-05, + "loss": 0.7686, + "step": 9576 + }, + { + "epoch": 5.401579244218838, + "grad_norm": 0.9088018536567688, + "learning_rate": 2.2994923857868022e-05, + "loss": 0.6456, + "step": 9577 + }, + { + "epoch": 5.402143260011281, + "grad_norm": 1.398514747619629, + "learning_rate": 2.299210377890581e-05, + "loss": 0.7572, + "step": 9578 + }, + { + "epoch": 5.4027072758037225, + "grad_norm": 0.8074427247047424, + "learning_rate": 2.29892836999436e-05, + "loss": 0.6058, + "step": 9579 + }, + { + "epoch": 5.403271291596164, + "grad_norm": 1.4482734203338623, + "learning_rate": 2.2986463620981388e-05, + "loss": 0.7701, + "step": 9580 + }, + { + "epoch": 5.403835307388607, + "grad_norm": 0.8431611061096191, + "learning_rate": 2.2983643542019177e-05, + "loss": 0.6713, + "step": 9581 + }, + { + "epoch": 5.404399323181049, + "grad_norm": 1.2590723037719727, + "learning_rate": 2.2980823463056966e-05, + "loss": 0.6965, + "step": 9582 + }, + { + "epoch": 5.4049633389734915, + "grad_norm": 0.9260280132293701, + "learning_rate": 2.2978003384094758e-05, + "loss": 0.7825, + "step": 9583 + }, + { + "epoch": 5.405527354765933, + "grad_norm": 0.8383905291557312, + "learning_rate": 2.2975183305132543e-05, + "loss": 0.6833, + "step": 9584 + }, + { + "epoch": 5.406091370558376, + "grad_norm": 1.0954664945602417, + "learning_rate": 2.2972363226170336e-05, + "loss": 0.8156, + "step": 9585 + }, + { + "epoch": 5.406655386350818, + "grad_norm": 1.3052586317062378, + "learning_rate": 2.296954314720812e-05, + "loss": 0.7513, + "step": 9586 + }, + { + "epoch": 5.4072194021432605, + "grad_norm": 1.1882303953170776, + "learning_rate": 2.2966723068245913e-05, + "loss": 0.7093, + "step": 9587 + }, + { + "epoch": 5.407783417935702, + "grad_norm": 1.5143711566925049, + "learning_rate": 2.2963902989283702e-05, + "loss": 0.756, + "step": 9588 + }, + { + "epoch": 5.408347433728144, + "grad_norm": 1.33376145362854, + "learning_rate": 2.296108291032149e-05, + "loss": 0.7124, + "step": 9589 + }, + { + "epoch": 5.408911449520587, + "grad_norm": 1.2699201107025146, + "learning_rate": 2.295826283135928e-05, + "loss": 0.6782, + "step": 9590 + }, + { + "epoch": 5.409475465313029, + "grad_norm": 1.2829265594482422, + "learning_rate": 2.295544275239707e-05, + "loss": 0.6609, + "step": 9591 + }, + { + "epoch": 5.410039481105471, + "grad_norm": 1.0589427947998047, + "learning_rate": 2.2952622673434857e-05, + "loss": 0.6868, + "step": 9592 + }, + { + "epoch": 5.410603496897913, + "grad_norm": 1.1401976346969604, + "learning_rate": 2.2949802594472646e-05, + "loss": 0.787, + "step": 9593 + }, + { + "epoch": 5.411167512690355, + "grad_norm": 1.312058448791504, + "learning_rate": 2.2946982515510435e-05, + "loss": 0.816, + "step": 9594 + }, + { + "epoch": 5.411731528482798, + "grad_norm": 0.9766390323638916, + "learning_rate": 2.2944162436548224e-05, + "loss": 0.6997, + "step": 9595 + }, + { + "epoch": 5.412295544275239, + "grad_norm": 1.4837632179260254, + "learning_rate": 2.2941342357586012e-05, + "loss": 0.8335, + "step": 9596 + }, + { + "epoch": 5.412859560067682, + "grad_norm": 0.9440619945526123, + "learning_rate": 2.2938522278623805e-05, + "loss": 0.771, + "step": 9597 + }, + { + "epoch": 5.413423575860124, + "grad_norm": 1.0588524341583252, + "learning_rate": 2.293570219966159e-05, + "loss": 0.7197, + "step": 9598 + }, + { + "epoch": 5.413987591652567, + "grad_norm": 1.0699533224105835, + "learning_rate": 2.2932882120699382e-05, + "loss": 0.7662, + "step": 9599 + }, + { + "epoch": 5.414551607445008, + "grad_norm": 0.9315105080604553, + "learning_rate": 2.2930062041737168e-05, + "loss": 0.7083, + "step": 9600 + }, + { + "epoch": 5.415115623237451, + "grad_norm": 1.1357412338256836, + "learning_rate": 2.292724196277496e-05, + "loss": 0.77, + "step": 9601 + }, + { + "epoch": 5.415679639029893, + "grad_norm": 0.9422044157981873, + "learning_rate": 2.2924421883812745e-05, + "loss": 0.6451, + "step": 9602 + }, + { + "epoch": 5.416243654822335, + "grad_norm": 1.0072190761566162, + "learning_rate": 2.2921601804850537e-05, + "loss": 0.8115, + "step": 9603 + }, + { + "epoch": 5.416807670614777, + "grad_norm": 1.026131272315979, + "learning_rate": 2.2918781725888326e-05, + "loss": 0.7018, + "step": 9604 + }, + { + "epoch": 5.417371686407219, + "grad_norm": 1.4086499214172363, + "learning_rate": 2.2915961646926115e-05, + "loss": 0.8638, + "step": 9605 + }, + { + "epoch": 5.417935702199662, + "grad_norm": 1.1836447715759277, + "learning_rate": 2.2913141567963904e-05, + "loss": 0.7117, + "step": 9606 + }, + { + "epoch": 5.418499717992104, + "grad_norm": 1.1361076831817627, + "learning_rate": 2.2910321489001693e-05, + "loss": 0.7496, + "step": 9607 + }, + { + "epoch": 5.4190637337845455, + "grad_norm": 1.1427075862884521, + "learning_rate": 2.2907501410039485e-05, + "loss": 0.665, + "step": 9608 + }, + { + "epoch": 5.419627749576988, + "grad_norm": 1.6039457321166992, + "learning_rate": 2.290468133107727e-05, + "loss": 0.7989, + "step": 9609 + }, + { + "epoch": 5.42019176536943, + "grad_norm": 2.563190460205078, + "learning_rate": 2.2901861252115062e-05, + "loss": 0.6999, + "step": 9610 + }, + { + "epoch": 5.420755781161873, + "grad_norm": 1.2595652341842651, + "learning_rate": 2.2899041173152848e-05, + "loss": 0.7791, + "step": 9611 + }, + { + "epoch": 5.4213197969543145, + "grad_norm": 1.2256648540496826, + "learning_rate": 2.289622109419064e-05, + "loss": 0.7231, + "step": 9612 + }, + { + "epoch": 5.421883812746757, + "grad_norm": 1.1511247158050537, + "learning_rate": 2.289340101522843e-05, + "loss": 0.7738, + "step": 9613 + }, + { + "epoch": 5.422447828539199, + "grad_norm": 0.9558553695678711, + "learning_rate": 2.2890580936266218e-05, + "loss": 0.664, + "step": 9614 + }, + { + "epoch": 5.423011844331642, + "grad_norm": 1.3103513717651367, + "learning_rate": 2.2887760857304006e-05, + "loss": 0.8398, + "step": 9615 + }, + { + "epoch": 5.4235758601240835, + "grad_norm": 1.1605331897735596, + "learning_rate": 2.2884940778341795e-05, + "loss": 0.7891, + "step": 9616 + }, + { + "epoch": 5.424139875916525, + "grad_norm": 1.5893796682357788, + "learning_rate": 2.2882120699379584e-05, + "loss": 0.8171, + "step": 9617 + }, + { + "epoch": 5.424703891708968, + "grad_norm": 1.089961051940918, + "learning_rate": 2.2879300620417373e-05, + "loss": 0.6551, + "step": 9618 + }, + { + "epoch": 5.42526790750141, + "grad_norm": 1.0295556783676147, + "learning_rate": 2.287648054145516e-05, + "loss": 0.6878, + "step": 9619 + }, + { + "epoch": 5.4258319232938526, + "grad_norm": 1.3821598291397095, + "learning_rate": 2.287366046249295e-05, + "loss": 0.8436, + "step": 9620 + }, + { + "epoch": 5.426395939086294, + "grad_norm": 0.9787992238998413, + "learning_rate": 2.287084038353074e-05, + "loss": 0.7422, + "step": 9621 + }, + { + "epoch": 5.426959954878736, + "grad_norm": 1.0548162460327148, + "learning_rate": 2.286802030456853e-05, + "loss": 0.7274, + "step": 9622 + }, + { + "epoch": 5.427523970671179, + "grad_norm": 1.3653444051742554, + "learning_rate": 2.2865200225606317e-05, + "loss": 0.7152, + "step": 9623 + }, + { + "epoch": 5.428087986463621, + "grad_norm": 1.0964007377624512, + "learning_rate": 2.286238014664411e-05, + "loss": 0.6715, + "step": 9624 + }, + { + "epoch": 5.428652002256063, + "grad_norm": 1.526536226272583, + "learning_rate": 2.2859560067681894e-05, + "loss": 0.8508, + "step": 9625 + }, + { + "epoch": 5.429216018048505, + "grad_norm": 1.1941850185394287, + "learning_rate": 2.2856739988719686e-05, + "loss": 0.8379, + "step": 9626 + }, + { + "epoch": 5.429780033840948, + "grad_norm": 1.1849207878112793, + "learning_rate": 2.2853919909757472e-05, + "loss": 0.6783, + "step": 9627 + }, + { + "epoch": 5.43034404963339, + "grad_norm": 1.4426034688949585, + "learning_rate": 2.2851099830795264e-05, + "loss": 0.781, + "step": 9628 + }, + { + "epoch": 5.430908065425832, + "grad_norm": 0.9401556253433228, + "learning_rate": 2.2848279751833053e-05, + "loss": 0.6619, + "step": 9629 + }, + { + "epoch": 5.431472081218274, + "grad_norm": 1.3375498056411743, + "learning_rate": 2.284545967287084e-05, + "loss": 0.6961, + "step": 9630 + }, + { + "epoch": 5.432036097010716, + "grad_norm": 1.5088692903518677, + "learning_rate": 2.284263959390863e-05, + "loss": 0.7715, + "step": 9631 + }, + { + "epoch": 5.432600112803159, + "grad_norm": 0.9250931739807129, + "learning_rate": 2.283981951494642e-05, + "loss": 0.6249, + "step": 9632 + }, + { + "epoch": 5.4331641285956005, + "grad_norm": 1.1996586322784424, + "learning_rate": 2.2836999435984208e-05, + "loss": 0.7555, + "step": 9633 + }, + { + "epoch": 5.433728144388043, + "grad_norm": 1.0345020294189453, + "learning_rate": 2.2834179357021997e-05, + "loss": 0.6668, + "step": 9634 + }, + { + "epoch": 5.434292160180485, + "grad_norm": 1.0154422521591187, + "learning_rate": 2.2831359278059786e-05, + "loss": 0.8075, + "step": 9635 + }, + { + "epoch": 5.434856175972927, + "grad_norm": 1.181187391281128, + "learning_rate": 2.2828539199097574e-05, + "loss": 0.7433, + "step": 9636 + }, + { + "epoch": 5.4354201917653695, + "grad_norm": 1.2082990407943726, + "learning_rate": 2.2825719120135363e-05, + "loss": 0.6798, + "step": 9637 + }, + { + "epoch": 5.435984207557811, + "grad_norm": 1.009209156036377, + "learning_rate": 2.2822899041173155e-05, + "loss": 0.7437, + "step": 9638 + }, + { + "epoch": 5.436548223350254, + "grad_norm": 1.1667994260787964, + "learning_rate": 2.282007896221094e-05, + "loss": 0.7089, + "step": 9639 + }, + { + "epoch": 5.437112239142696, + "grad_norm": 0.8614265322685242, + "learning_rate": 2.2817258883248733e-05, + "loss": 0.723, + "step": 9640 + }, + { + "epoch": 5.4376762549351385, + "grad_norm": 0.8585981726646423, + "learning_rate": 2.281443880428652e-05, + "loss": 0.6786, + "step": 9641 + }, + { + "epoch": 5.43824027072758, + "grad_norm": 0.9574729204177856, + "learning_rate": 2.281161872532431e-05, + "loss": 0.6611, + "step": 9642 + }, + { + "epoch": 5.438804286520023, + "grad_norm": 0.9781889915466309, + "learning_rate": 2.28087986463621e-05, + "loss": 0.6459, + "step": 9643 + }, + { + "epoch": 5.439368302312465, + "grad_norm": 1.0144078731536865, + "learning_rate": 2.2805978567399888e-05, + "loss": 0.7174, + "step": 9644 + }, + { + "epoch": 5.439932318104907, + "grad_norm": 1.4903957843780518, + "learning_rate": 2.2803158488437677e-05, + "loss": 0.7652, + "step": 9645 + }, + { + "epoch": 5.440496333897349, + "grad_norm": 0.9909846186637878, + "learning_rate": 2.2800338409475466e-05, + "loss": 0.723, + "step": 9646 + }, + { + "epoch": 5.441060349689791, + "grad_norm": 1.2816702127456665, + "learning_rate": 2.2797518330513258e-05, + "loss": 0.7574, + "step": 9647 + }, + { + "epoch": 5.441624365482234, + "grad_norm": 1.2049837112426758, + "learning_rate": 2.2794698251551043e-05, + "loss": 0.7159, + "step": 9648 + }, + { + "epoch": 5.442188381274676, + "grad_norm": 1.4156543016433716, + "learning_rate": 2.2791878172588836e-05, + "loss": 0.7683, + "step": 9649 + }, + { + "epoch": 5.442752397067117, + "grad_norm": 1.0711565017700195, + "learning_rate": 2.278905809362662e-05, + "loss": 0.7343, + "step": 9650 + }, + { + "epoch": 5.44331641285956, + "grad_norm": 1.0088093280792236, + "learning_rate": 2.2786238014664413e-05, + "loss": 0.8165, + "step": 9651 + }, + { + "epoch": 5.443880428652002, + "grad_norm": 1.5681039094924927, + "learning_rate": 2.2783417935702202e-05, + "loss": 0.7563, + "step": 9652 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.2502585649490356, + "learning_rate": 2.278059785673999e-05, + "loss": 0.7369, + "step": 9653 + }, + { + "epoch": 5.4450084602368864, + "grad_norm": 1.0901501178741455, + "learning_rate": 2.277777777777778e-05, + "loss": 0.7447, + "step": 9654 + }, + { + "epoch": 5.445572476029329, + "grad_norm": 1.0490238666534424, + "learning_rate": 2.277495769881557e-05, + "loss": 0.7257, + "step": 9655 + }, + { + "epoch": 5.446136491821771, + "grad_norm": 1.3574920892715454, + "learning_rate": 2.2772137619853357e-05, + "loss": 0.7367, + "step": 9656 + }, + { + "epoch": 5.446700507614214, + "grad_norm": 0.937465488910675, + "learning_rate": 2.2769317540891146e-05, + "loss": 0.7126, + "step": 9657 + }, + { + "epoch": 5.4472645234066555, + "grad_norm": 1.3396185636520386, + "learning_rate": 2.2766497461928935e-05, + "loss": 0.834, + "step": 9658 + }, + { + "epoch": 5.447828539199097, + "grad_norm": 0.9613165855407715, + "learning_rate": 2.2763677382966724e-05, + "loss": 0.7508, + "step": 9659 + }, + { + "epoch": 5.44839255499154, + "grad_norm": 1.1459556818008423, + "learning_rate": 2.2760857304004512e-05, + "loss": 0.7524, + "step": 9660 + }, + { + "epoch": 5.448956570783982, + "grad_norm": 1.1444917917251587, + "learning_rate": 2.2758037225042305e-05, + "loss": 0.813, + "step": 9661 + }, + { + "epoch": 5.4495205865764245, + "grad_norm": 0.9682584404945374, + "learning_rate": 2.275521714608009e-05, + "loss": 0.6793, + "step": 9662 + }, + { + "epoch": 5.450084602368866, + "grad_norm": 1.0766987800598145, + "learning_rate": 2.2752397067117882e-05, + "loss": 0.8142, + "step": 9663 + }, + { + "epoch": 5.450648618161308, + "grad_norm": 0.8439182639122009, + "learning_rate": 2.2749576988155668e-05, + "loss": 0.6218, + "step": 9664 + }, + { + "epoch": 5.451212633953751, + "grad_norm": 1.4288729429244995, + "learning_rate": 2.274675690919346e-05, + "loss": 0.7877, + "step": 9665 + }, + { + "epoch": 5.451776649746193, + "grad_norm": 1.2755056619644165, + "learning_rate": 2.2743936830231245e-05, + "loss": 0.7047, + "step": 9666 + }, + { + "epoch": 5.452340665538635, + "grad_norm": 42.399173736572266, + "learning_rate": 2.2741116751269037e-05, + "loss": 1.0296, + "step": 9667 + }, + { + "epoch": 5.452904681331077, + "grad_norm": 1.5636783838272095, + "learning_rate": 2.2738296672306826e-05, + "loss": 0.7238, + "step": 9668 + }, + { + "epoch": 5.45346869712352, + "grad_norm": 1.604406476020813, + "learning_rate": 2.2735476593344615e-05, + "loss": 0.8506, + "step": 9669 + }, + { + "epoch": 5.454032712915962, + "grad_norm": 1.26551353931427, + "learning_rate": 2.2732656514382404e-05, + "loss": 0.7887, + "step": 9670 + }, + { + "epoch": 5.454596728708404, + "grad_norm": 1.2187247276306152, + "learning_rate": 2.2729836435420192e-05, + "loss": 0.7175, + "step": 9671 + }, + { + "epoch": 5.455160744500846, + "grad_norm": 1.1164766550064087, + "learning_rate": 2.272701635645798e-05, + "loss": 0.7028, + "step": 9672 + }, + { + "epoch": 5.455724760293288, + "grad_norm": 1.2470674514770508, + "learning_rate": 2.272419627749577e-05, + "loss": 0.7967, + "step": 9673 + }, + { + "epoch": 5.456288776085731, + "grad_norm": 1.0257558822631836, + "learning_rate": 2.272137619853356e-05, + "loss": 0.7109, + "step": 9674 + }, + { + "epoch": 5.456852791878172, + "grad_norm": 1.2873642444610596, + "learning_rate": 2.2718556119571348e-05, + "loss": 0.8353, + "step": 9675 + }, + { + "epoch": 5.457416807670615, + "grad_norm": 1.4216846227645874, + "learning_rate": 2.2715736040609136e-05, + "loss": 0.7844, + "step": 9676 + }, + { + "epoch": 5.457980823463057, + "grad_norm": 1.2008023262023926, + "learning_rate": 2.271291596164693e-05, + "loss": 0.8099, + "step": 9677 + }, + { + "epoch": 5.458544839255499, + "grad_norm": 0.9161457419395447, + "learning_rate": 2.2710095882684714e-05, + "loss": 0.6828, + "step": 9678 + }, + { + "epoch": 5.459108855047941, + "grad_norm": 1.133055329322815, + "learning_rate": 2.2707275803722506e-05, + "loss": 0.6786, + "step": 9679 + }, + { + "epoch": 5.459672870840383, + "grad_norm": 1.4945108890533447, + "learning_rate": 2.2704455724760295e-05, + "loss": 0.8005, + "step": 9680 + }, + { + "epoch": 5.460236886632826, + "grad_norm": 1.4308983087539673, + "learning_rate": 2.2701635645798084e-05, + "loss": 0.8648, + "step": 9681 + }, + { + "epoch": 5.460800902425268, + "grad_norm": 1.6491930484771729, + "learning_rate": 2.2698815566835873e-05, + "loss": 0.8706, + "step": 9682 + }, + { + "epoch": 5.46136491821771, + "grad_norm": 1.0207639932632446, + "learning_rate": 2.269599548787366e-05, + "loss": 0.7567, + "step": 9683 + }, + { + "epoch": 5.461928934010152, + "grad_norm": 1.2946534156799316, + "learning_rate": 2.269317540891145e-05, + "loss": 0.6481, + "step": 9684 + }, + { + "epoch": 5.462492949802595, + "grad_norm": 1.0329840183258057, + "learning_rate": 2.269035532994924e-05, + "loss": 0.691, + "step": 9685 + }, + { + "epoch": 5.463056965595037, + "grad_norm": 1.1931805610656738, + "learning_rate": 2.268753525098703e-05, + "loss": 0.8096, + "step": 9686 + }, + { + "epoch": 5.4636209813874785, + "grad_norm": 1.5403350591659546, + "learning_rate": 2.2684715172024817e-05, + "loss": 0.8406, + "step": 9687 + }, + { + "epoch": 5.464184997179921, + "grad_norm": 1.1038480997085571, + "learning_rate": 2.268189509306261e-05, + "loss": 0.742, + "step": 9688 + }, + { + "epoch": 5.464749012972363, + "grad_norm": 1.137256145477295, + "learning_rate": 2.2679075014100394e-05, + "loss": 0.6871, + "step": 9689 + }, + { + "epoch": 5.465313028764806, + "grad_norm": 1.063859462738037, + "learning_rate": 2.2676254935138186e-05, + "loss": 0.7065, + "step": 9690 + }, + { + "epoch": 5.4658770445572475, + "grad_norm": 1.0655517578125, + "learning_rate": 2.2673434856175975e-05, + "loss": 0.6328, + "step": 9691 + }, + { + "epoch": 5.466441060349689, + "grad_norm": 1.3371561765670776, + "learning_rate": 2.2670614777213764e-05, + "loss": 0.7941, + "step": 9692 + }, + { + "epoch": 5.467005076142132, + "grad_norm": 0.8905954360961914, + "learning_rate": 2.2667794698251553e-05, + "loss": 0.7379, + "step": 9693 + }, + { + "epoch": 5.467569091934574, + "grad_norm": 1.0840139389038086, + "learning_rate": 2.266497461928934e-05, + "loss": 0.7208, + "step": 9694 + }, + { + "epoch": 5.4681331077270166, + "grad_norm": 1.352781891822815, + "learning_rate": 2.266215454032713e-05, + "loss": 0.7533, + "step": 9695 + }, + { + "epoch": 5.468697123519458, + "grad_norm": 0.9220440983772278, + "learning_rate": 2.265933446136492e-05, + "loss": 0.6279, + "step": 9696 + }, + { + "epoch": 5.469261139311901, + "grad_norm": 0.8303223848342896, + "learning_rate": 2.2656514382402708e-05, + "loss": 0.7101, + "step": 9697 + }, + { + "epoch": 5.469825155104343, + "grad_norm": 1.2655948400497437, + "learning_rate": 2.2653694303440497e-05, + "loss": 0.7203, + "step": 9698 + }, + { + "epoch": 5.470389170896786, + "grad_norm": 1.284629225730896, + "learning_rate": 2.2650874224478286e-05, + "loss": 0.7772, + "step": 9699 + }, + { + "epoch": 5.470953186689227, + "grad_norm": 1.5019429922103882, + "learning_rate": 2.2648054145516078e-05, + "loss": 0.8074, + "step": 9700 + }, + { + "epoch": 5.471517202481669, + "grad_norm": 0.8676204681396484, + "learning_rate": 2.2645234066553863e-05, + "loss": 0.728, + "step": 9701 + }, + { + "epoch": 5.472081218274112, + "grad_norm": 1.7637945413589478, + "learning_rate": 2.2642413987591655e-05, + "loss": 0.9417, + "step": 9702 + }, + { + "epoch": 5.472645234066554, + "grad_norm": 1.1465163230895996, + "learning_rate": 2.263959390862944e-05, + "loss": 0.7248, + "step": 9703 + }, + { + "epoch": 5.473209249858996, + "grad_norm": 1.055288553237915, + "learning_rate": 2.2636773829667233e-05, + "loss": 0.7758, + "step": 9704 + }, + { + "epoch": 5.473773265651438, + "grad_norm": 1.3790277242660522, + "learning_rate": 2.263395375070502e-05, + "loss": 0.7625, + "step": 9705 + }, + { + "epoch": 5.47433728144388, + "grad_norm": 1.6440229415893555, + "learning_rate": 2.263113367174281e-05, + "loss": 0.8255, + "step": 9706 + }, + { + "epoch": 5.474901297236323, + "grad_norm": 0.7908397316932678, + "learning_rate": 2.26283135927806e-05, + "loss": 0.6026, + "step": 9707 + }, + { + "epoch": 5.4754653130287645, + "grad_norm": 2.1183364391326904, + "learning_rate": 2.2625493513818388e-05, + "loss": 0.9237, + "step": 9708 + }, + { + "epoch": 5.476029328821207, + "grad_norm": 1.4034911394119263, + "learning_rate": 2.2622673434856177e-05, + "loss": 0.8212, + "step": 9709 + }, + { + "epoch": 5.476593344613649, + "grad_norm": 1.293953776359558, + "learning_rate": 2.2619853355893966e-05, + "loss": 0.8758, + "step": 9710 + }, + { + "epoch": 5.477157360406092, + "grad_norm": 0.9453669786453247, + "learning_rate": 2.2617033276931755e-05, + "loss": 0.7227, + "step": 9711 + }, + { + "epoch": 5.4777213761985335, + "grad_norm": 1.184675693511963, + "learning_rate": 2.2614213197969543e-05, + "loss": 0.7706, + "step": 9712 + }, + { + "epoch": 5.478285391990976, + "grad_norm": 1.1529535055160522, + "learning_rate": 2.2611393119007332e-05, + "loss": 0.7576, + "step": 9713 + }, + { + "epoch": 5.478849407783418, + "grad_norm": 1.2730096578598022, + "learning_rate": 2.260857304004512e-05, + "loss": 0.7708, + "step": 9714 + }, + { + "epoch": 5.47941342357586, + "grad_norm": 1.4364205598831177, + "learning_rate": 2.2605752961082913e-05, + "loss": 0.7366, + "step": 9715 + }, + { + "epoch": 5.4799774393683025, + "grad_norm": 1.4857314825057983, + "learning_rate": 2.2602932882120702e-05, + "loss": 0.7007, + "step": 9716 + }, + { + "epoch": 5.480541455160744, + "grad_norm": 1.0332634449005127, + "learning_rate": 2.260011280315849e-05, + "loss": 0.7874, + "step": 9717 + }, + { + "epoch": 5.481105470953187, + "grad_norm": 1.0505998134613037, + "learning_rate": 2.259729272419628e-05, + "loss": 0.5726, + "step": 9718 + }, + { + "epoch": 5.481669486745629, + "grad_norm": 1.020260214805603, + "learning_rate": 2.2594472645234068e-05, + "loss": 0.6696, + "step": 9719 + }, + { + "epoch": 5.482233502538071, + "grad_norm": 1.086585283279419, + "learning_rate": 2.2591652566271857e-05, + "loss": 0.7811, + "step": 9720 + }, + { + "epoch": 5.482797518330513, + "grad_norm": 1.7802307605743408, + "learning_rate": 2.2588832487309646e-05, + "loss": 0.8654, + "step": 9721 + }, + { + "epoch": 5.483361534122955, + "grad_norm": 1.0638799667358398, + "learning_rate": 2.2586012408347435e-05, + "loss": 0.7208, + "step": 9722 + }, + { + "epoch": 5.483925549915398, + "grad_norm": 1.091943621635437, + "learning_rate": 2.2583192329385223e-05, + "loss": 0.7041, + "step": 9723 + }, + { + "epoch": 5.48448956570784, + "grad_norm": 1.0036736726760864, + "learning_rate": 2.2580372250423012e-05, + "loss": 0.7529, + "step": 9724 + }, + { + "epoch": 5.485053581500282, + "grad_norm": 0.9299917221069336, + "learning_rate": 2.2577552171460804e-05, + "loss": 0.7202, + "step": 9725 + }, + { + "epoch": 5.485617597292724, + "grad_norm": 1.0042989253997803, + "learning_rate": 2.257473209249859e-05, + "loss": 0.7773, + "step": 9726 + }, + { + "epoch": 5.486181613085167, + "grad_norm": 1.1000900268554688, + "learning_rate": 2.2571912013536382e-05, + "loss": 0.8328, + "step": 9727 + }, + { + "epoch": 5.486745628877609, + "grad_norm": 1.6309765577316284, + "learning_rate": 2.2569091934574167e-05, + "loss": 0.8192, + "step": 9728 + }, + { + "epoch": 5.4873096446700504, + "grad_norm": 0.9339597225189209, + "learning_rate": 2.256627185561196e-05, + "loss": 0.7082, + "step": 9729 + }, + { + "epoch": 5.487873660462493, + "grad_norm": 1.4140042066574097, + "learning_rate": 2.2563451776649745e-05, + "loss": 0.6786, + "step": 9730 + }, + { + "epoch": 5.488437676254935, + "grad_norm": 1.5092706680297852, + "learning_rate": 2.2560631697687537e-05, + "loss": 0.7625, + "step": 9731 + }, + { + "epoch": 5.489001692047378, + "grad_norm": 1.4578189849853516, + "learning_rate": 2.2557811618725326e-05, + "loss": 0.8213, + "step": 9732 + }, + { + "epoch": 5.4895657078398195, + "grad_norm": 1.8860199451446533, + "learning_rate": 2.2554991539763115e-05, + "loss": 0.7988, + "step": 9733 + }, + { + "epoch": 5.490129723632261, + "grad_norm": 1.5942695140838623, + "learning_rate": 2.2552171460800904e-05, + "loss": 0.8063, + "step": 9734 + }, + { + "epoch": 5.490693739424704, + "grad_norm": 0.8763282895088196, + "learning_rate": 2.2549351381838692e-05, + "loss": 0.7095, + "step": 9735 + }, + { + "epoch": 5.491257755217146, + "grad_norm": 0.9489726424217224, + "learning_rate": 2.254653130287648e-05, + "loss": 0.6331, + "step": 9736 + }, + { + "epoch": 5.4918217710095885, + "grad_norm": 0.9881574511528015, + "learning_rate": 2.254371122391427e-05, + "loss": 0.7578, + "step": 9737 + }, + { + "epoch": 5.49238578680203, + "grad_norm": 1.4848779439926147, + "learning_rate": 2.254089114495206e-05, + "loss": 0.8171, + "step": 9738 + }, + { + "epoch": 5.492949802594473, + "grad_norm": 1.1152679920196533, + "learning_rate": 2.2538071065989848e-05, + "loss": 0.7572, + "step": 9739 + }, + { + "epoch": 5.493513818386915, + "grad_norm": 1.1155263185501099, + "learning_rate": 2.2535250987027636e-05, + "loss": 0.8102, + "step": 9740 + }, + { + "epoch": 5.4940778341793575, + "grad_norm": 1.3022620677947998, + "learning_rate": 2.253243090806543e-05, + "loss": 0.786, + "step": 9741 + }, + { + "epoch": 5.494641849971799, + "grad_norm": 1.1630686521530151, + "learning_rate": 2.2529610829103214e-05, + "loss": 0.8034, + "step": 9742 + }, + { + "epoch": 5.495205865764241, + "grad_norm": 0.8765363693237305, + "learning_rate": 2.2526790750141006e-05, + "loss": 0.7327, + "step": 9743 + }, + { + "epoch": 5.495769881556684, + "grad_norm": 1.6062322854995728, + "learning_rate": 2.252397067117879e-05, + "loss": 0.9158, + "step": 9744 + }, + { + "epoch": 5.496333897349126, + "grad_norm": 1.2178412675857544, + "learning_rate": 2.2521150592216584e-05, + "loss": 0.662, + "step": 9745 + }, + { + "epoch": 5.496897913141568, + "grad_norm": 1.2600051164627075, + "learning_rate": 2.2518330513254373e-05, + "loss": 0.7275, + "step": 9746 + }, + { + "epoch": 5.49746192893401, + "grad_norm": 0.9916000366210938, + "learning_rate": 2.251551043429216e-05, + "loss": 0.6554, + "step": 9747 + }, + { + "epoch": 5.498025944726452, + "grad_norm": 1.1870208978652954, + "learning_rate": 2.251269035532995e-05, + "loss": 0.77, + "step": 9748 + }, + { + "epoch": 5.498589960518895, + "grad_norm": 1.034176230430603, + "learning_rate": 2.250987027636774e-05, + "loss": 0.7571, + "step": 9749 + }, + { + "epoch": 5.499153976311336, + "grad_norm": 1.0284367799758911, + "learning_rate": 2.250705019740553e-05, + "loss": 0.6718, + "step": 9750 + }, + { + "epoch": 5.499717992103779, + "grad_norm": 1.0676106214523315, + "learning_rate": 2.2504230118443317e-05, + "loss": 0.784, + "step": 9751 + }, + { + "epoch": 5.500282007896221, + "grad_norm": 0.9996857047080994, + "learning_rate": 2.250141003948111e-05, + "loss": 0.7471, + "step": 9752 + }, + { + "epoch": 5.500846023688664, + "grad_norm": 1.2240514755249023, + "learning_rate": 2.2498589960518894e-05, + "loss": 0.6877, + "step": 9753 + }, + { + "epoch": 5.501410039481105, + "grad_norm": 1.2828456163406372, + "learning_rate": 2.2495769881556686e-05, + "loss": 0.723, + "step": 9754 + }, + { + "epoch": 5.501974055273548, + "grad_norm": 1.4237185716629028, + "learning_rate": 2.2492949802594475e-05, + "loss": 0.746, + "step": 9755 + }, + { + "epoch": 5.50253807106599, + "grad_norm": 1.0342179536819458, + "learning_rate": 2.2490129723632264e-05, + "loss": 0.7925, + "step": 9756 + }, + { + "epoch": 5.503102086858432, + "grad_norm": 1.138704538345337, + "learning_rate": 2.2487309644670053e-05, + "loss": 0.7838, + "step": 9757 + }, + { + "epoch": 5.503666102650874, + "grad_norm": 1.1899950504302979, + "learning_rate": 2.248448956570784e-05, + "loss": 0.7578, + "step": 9758 + }, + { + "epoch": 5.504230118443316, + "grad_norm": 1.4366707801818848, + "learning_rate": 2.248166948674563e-05, + "loss": 0.7017, + "step": 9759 + }, + { + "epoch": 5.504794134235759, + "grad_norm": 1.320613145828247, + "learning_rate": 2.247884940778342e-05, + "loss": 0.7307, + "step": 9760 + }, + { + "epoch": 5.505358150028201, + "grad_norm": 0.9303173422813416, + "learning_rate": 2.2476029328821208e-05, + "loss": 0.7653, + "step": 9761 + }, + { + "epoch": 5.5059221658206425, + "grad_norm": 0.955819845199585, + "learning_rate": 2.2473209249858997e-05, + "loss": 0.8173, + "step": 9762 + }, + { + "epoch": 5.506486181613085, + "grad_norm": 1.0538170337677002, + "learning_rate": 2.2470389170896785e-05, + "loss": 0.8229, + "step": 9763 + }, + { + "epoch": 5.507050197405527, + "grad_norm": 0.8053732514381409, + "learning_rate": 2.2467569091934578e-05, + "loss": 0.6436, + "step": 9764 + }, + { + "epoch": 5.50761421319797, + "grad_norm": 0.8006096482276917, + "learning_rate": 2.2464749012972363e-05, + "loss": 0.6531, + "step": 9765 + }, + { + "epoch": 5.5081782289904115, + "grad_norm": 1.2325356006622314, + "learning_rate": 2.2461928934010155e-05, + "loss": 0.7681, + "step": 9766 + }, + { + "epoch": 5.508742244782854, + "grad_norm": 0.9264138340950012, + "learning_rate": 2.245910885504794e-05, + "loss": 0.72, + "step": 9767 + }, + { + "epoch": 5.509306260575296, + "grad_norm": 1.2886204719543457, + "learning_rate": 2.2456288776085733e-05, + "loss": 0.9083, + "step": 9768 + }, + { + "epoch": 5.509870276367739, + "grad_norm": 0.8836408853530884, + "learning_rate": 2.2453468697123518e-05, + "loss": 0.6152, + "step": 9769 + }, + { + "epoch": 5.5104342921601805, + "grad_norm": 0.9870207905769348, + "learning_rate": 2.245064861816131e-05, + "loss": 0.7257, + "step": 9770 + }, + { + "epoch": 5.510998307952622, + "grad_norm": 0.9416919350624084, + "learning_rate": 2.24478285391991e-05, + "loss": 0.682, + "step": 9771 + }, + { + "epoch": 5.511562323745065, + "grad_norm": 1.0096075534820557, + "learning_rate": 2.2445008460236888e-05, + "loss": 0.7797, + "step": 9772 + }, + { + "epoch": 5.512126339537507, + "grad_norm": 1.733046054840088, + "learning_rate": 2.2442188381274677e-05, + "loss": 0.7914, + "step": 9773 + }, + { + "epoch": 5.5126903553299496, + "grad_norm": 1.5568349361419678, + "learning_rate": 2.2439368302312466e-05, + "loss": 0.7015, + "step": 9774 + }, + { + "epoch": 5.513254371122391, + "grad_norm": 1.0891237258911133, + "learning_rate": 2.2436548223350254e-05, + "loss": 0.7935, + "step": 9775 + }, + { + "epoch": 5.513818386914833, + "grad_norm": 1.5076392889022827, + "learning_rate": 2.2433728144388043e-05, + "loss": 0.8344, + "step": 9776 + }, + { + "epoch": 5.514382402707276, + "grad_norm": 1.1381947994232178, + "learning_rate": 2.2430908065425832e-05, + "loss": 0.6644, + "step": 9777 + }, + { + "epoch": 5.514946418499718, + "grad_norm": 1.5135334730148315, + "learning_rate": 2.242808798646362e-05, + "loss": 0.7746, + "step": 9778 + }, + { + "epoch": 5.51551043429216, + "grad_norm": 1.2729721069335938, + "learning_rate": 2.242526790750141e-05, + "loss": 0.6569, + "step": 9779 + }, + { + "epoch": 5.516074450084602, + "grad_norm": 1.6609405279159546, + "learning_rate": 2.2422447828539202e-05, + "loss": 0.8796, + "step": 9780 + }, + { + "epoch": 5.516638465877045, + "grad_norm": 1.266183614730835, + "learning_rate": 2.2419627749576987e-05, + "loss": 0.8206, + "step": 9781 + }, + { + "epoch": 5.517202481669487, + "grad_norm": 1.0972729921340942, + "learning_rate": 2.241680767061478e-05, + "loss": 0.7459, + "step": 9782 + }, + { + "epoch": 5.517766497461929, + "grad_norm": 1.3629271984100342, + "learning_rate": 2.2413987591652565e-05, + "loss": 0.7576, + "step": 9783 + }, + { + "epoch": 5.518330513254371, + "grad_norm": 1.4715354442596436, + "learning_rate": 2.2411167512690357e-05, + "loss": 0.8751, + "step": 9784 + }, + { + "epoch": 5.518894529046813, + "grad_norm": 1.4966020584106445, + "learning_rate": 2.2408347433728146e-05, + "loss": 0.7885, + "step": 9785 + }, + { + "epoch": 5.519458544839256, + "grad_norm": 1.0023727416992188, + "learning_rate": 2.2405527354765935e-05, + "loss": 0.7424, + "step": 9786 + }, + { + "epoch": 5.5200225606316975, + "grad_norm": 1.2060484886169434, + "learning_rate": 2.2402707275803723e-05, + "loss": 0.6687, + "step": 9787 + }, + { + "epoch": 5.52058657642414, + "grad_norm": 0.9685783386230469, + "learning_rate": 2.2399887196841512e-05, + "loss": 0.706, + "step": 9788 + }, + { + "epoch": 5.521150592216582, + "grad_norm": 1.0691118240356445, + "learning_rate": 2.2397067117879304e-05, + "loss": 0.7629, + "step": 9789 + }, + { + "epoch": 5.521714608009024, + "grad_norm": 0.9582865834236145, + "learning_rate": 2.239424703891709e-05, + "loss": 0.7035, + "step": 9790 + }, + { + "epoch": 5.5222786238014665, + "grad_norm": 1.1036142110824585, + "learning_rate": 2.2391426959954882e-05, + "loss": 0.6946, + "step": 9791 + }, + { + "epoch": 5.522842639593908, + "grad_norm": 0.9261386394500732, + "learning_rate": 2.2388606880992667e-05, + "loss": 0.7047, + "step": 9792 + }, + { + "epoch": 5.523406655386351, + "grad_norm": 1.3468512296676636, + "learning_rate": 2.238578680203046e-05, + "loss": 0.7576, + "step": 9793 + }, + { + "epoch": 5.523970671178793, + "grad_norm": 1.0534392595291138, + "learning_rate": 2.2382966723068245e-05, + "loss": 0.7147, + "step": 9794 + }, + { + "epoch": 5.5245346869712355, + "grad_norm": 1.1717759370803833, + "learning_rate": 2.2380146644106037e-05, + "loss": 0.6549, + "step": 9795 + }, + { + "epoch": 5.525098702763677, + "grad_norm": 1.0314935445785522, + "learning_rate": 2.2377326565143826e-05, + "loss": 0.7683, + "step": 9796 + }, + { + "epoch": 5.52566271855612, + "grad_norm": 0.9239526987075806, + "learning_rate": 2.2374506486181615e-05, + "loss": 0.6921, + "step": 9797 + }, + { + "epoch": 5.526226734348562, + "grad_norm": 1.2538098096847534, + "learning_rate": 2.2371686407219404e-05, + "loss": 0.6608, + "step": 9798 + }, + { + "epoch": 5.526790750141004, + "grad_norm": 1.086954116821289, + "learning_rate": 2.2368866328257192e-05, + "loss": 0.8419, + "step": 9799 + }, + { + "epoch": 5.527354765933446, + "grad_norm": 1.1310920715332031, + "learning_rate": 2.236604624929498e-05, + "loss": 0.7254, + "step": 9800 + }, + { + "epoch": 5.527918781725888, + "grad_norm": 1.125230073928833, + "learning_rate": 2.236322617033277e-05, + "loss": 0.6899, + "step": 9801 + }, + { + "epoch": 5.528482797518331, + "grad_norm": 1.1742196083068848, + "learning_rate": 2.236040609137056e-05, + "loss": 0.8177, + "step": 9802 + }, + { + "epoch": 5.529046813310773, + "grad_norm": 0.903105616569519, + "learning_rate": 2.2357586012408348e-05, + "loss": 0.6279, + "step": 9803 + }, + { + "epoch": 5.529610829103214, + "grad_norm": 1.106855034828186, + "learning_rate": 2.2354765933446136e-05, + "loss": 0.6373, + "step": 9804 + }, + { + "epoch": 5.530174844895657, + "grad_norm": 1.11920166015625, + "learning_rate": 2.235194585448393e-05, + "loss": 0.8327, + "step": 9805 + }, + { + "epoch": 5.530738860688099, + "grad_norm": 1.1238059997558594, + "learning_rate": 2.2349125775521714e-05, + "loss": 0.7747, + "step": 9806 + }, + { + "epoch": 5.531302876480542, + "grad_norm": 1.2182537317276, + "learning_rate": 2.2346305696559506e-05, + "loss": 0.8147, + "step": 9807 + }, + { + "epoch": 5.5318668922729834, + "grad_norm": 0.972678542137146, + "learning_rate": 2.234348561759729e-05, + "loss": 0.7399, + "step": 9808 + }, + { + "epoch": 5.532430908065426, + "grad_norm": 1.670527458190918, + "learning_rate": 2.2340665538635084e-05, + "loss": 0.8356, + "step": 9809 + }, + { + "epoch": 5.532994923857868, + "grad_norm": 1.293713092803955, + "learning_rate": 2.2337845459672872e-05, + "loss": 0.8603, + "step": 9810 + }, + { + "epoch": 5.533558939650311, + "grad_norm": 1.4739024639129639, + "learning_rate": 2.233502538071066e-05, + "loss": 0.7365, + "step": 9811 + }, + { + "epoch": 5.5341229554427525, + "grad_norm": 0.9304956197738647, + "learning_rate": 2.233220530174845e-05, + "loss": 0.806, + "step": 9812 + }, + { + "epoch": 5.534686971235194, + "grad_norm": 1.4681518077850342, + "learning_rate": 2.232938522278624e-05, + "loss": 0.73, + "step": 9813 + }, + { + "epoch": 5.535250987027637, + "grad_norm": 1.1660618782043457, + "learning_rate": 2.2326565143824028e-05, + "loss": 0.7376, + "step": 9814 + }, + { + "epoch": 5.535815002820079, + "grad_norm": 0.9410316348075867, + "learning_rate": 2.2323745064861816e-05, + "loss": 0.7706, + "step": 9815 + }, + { + "epoch": 5.5363790186125215, + "grad_norm": 1.0914735794067383, + "learning_rate": 2.2320924985899605e-05, + "loss": 0.7783, + "step": 9816 + }, + { + "epoch": 5.536943034404963, + "grad_norm": 1.1667912006378174, + "learning_rate": 2.2318104906937394e-05, + "loss": 0.7982, + "step": 9817 + }, + { + "epoch": 5.537507050197405, + "grad_norm": 1.3399139642715454, + "learning_rate": 2.2315284827975183e-05, + "loss": 0.7832, + "step": 9818 + }, + { + "epoch": 5.538071065989848, + "grad_norm": 1.0747309923171997, + "learning_rate": 2.2312464749012975e-05, + "loss": 0.6875, + "step": 9819 + }, + { + "epoch": 5.53863508178229, + "grad_norm": 1.4604703187942505, + "learning_rate": 2.2309644670050764e-05, + "loss": 0.7955, + "step": 9820 + }, + { + "epoch": 5.539199097574732, + "grad_norm": 1.1383724212646484, + "learning_rate": 2.2306824591088553e-05, + "loss": 0.7813, + "step": 9821 + }, + { + "epoch": 5.539763113367174, + "grad_norm": 1.3452759981155396, + "learning_rate": 2.230400451212634e-05, + "loss": 0.7471, + "step": 9822 + }, + { + "epoch": 5.540327129159617, + "grad_norm": 1.2947680950164795, + "learning_rate": 2.230118443316413e-05, + "loss": 0.7894, + "step": 9823 + }, + { + "epoch": 5.540891144952059, + "grad_norm": 0.8444526791572571, + "learning_rate": 2.229836435420192e-05, + "loss": 0.6801, + "step": 9824 + }, + { + "epoch": 5.541455160744501, + "grad_norm": 1.3915804624557495, + "learning_rate": 2.2295544275239708e-05, + "loss": 0.6876, + "step": 9825 + }, + { + "epoch": 5.542019176536943, + "grad_norm": 1.2139012813568115, + "learning_rate": 2.2292724196277497e-05, + "loss": 0.7139, + "step": 9826 + }, + { + "epoch": 5.542583192329385, + "grad_norm": 1.2648545503616333, + "learning_rate": 2.2289904117315285e-05, + "loss": 0.7528, + "step": 9827 + }, + { + "epoch": 5.543147208121828, + "grad_norm": 1.3160524368286133, + "learning_rate": 2.2287084038353078e-05, + "loss": 0.7917, + "step": 9828 + }, + { + "epoch": 5.543711223914269, + "grad_norm": 1.5502591133117676, + "learning_rate": 2.2284263959390863e-05, + "loss": 0.8814, + "step": 9829 + }, + { + "epoch": 5.544275239706712, + "grad_norm": 1.0860576629638672, + "learning_rate": 2.2281443880428655e-05, + "loss": 0.7368, + "step": 9830 + }, + { + "epoch": 5.544839255499154, + "grad_norm": 1.1421046257019043, + "learning_rate": 2.227862380146644e-05, + "loss": 0.6937, + "step": 9831 + }, + { + "epoch": 5.545403271291596, + "grad_norm": 0.9157379269599915, + "learning_rate": 2.2275803722504233e-05, + "loss": 0.7323, + "step": 9832 + }, + { + "epoch": 5.545967287084038, + "grad_norm": 1.0844364166259766, + "learning_rate": 2.2272983643542018e-05, + "loss": 0.8232, + "step": 9833 + }, + { + "epoch": 5.54653130287648, + "grad_norm": 1.8207178115844727, + "learning_rate": 2.227016356457981e-05, + "loss": 0.7458, + "step": 9834 + }, + { + "epoch": 5.547095318668923, + "grad_norm": 1.2986962795257568, + "learning_rate": 2.22673434856176e-05, + "loss": 0.746, + "step": 9835 + }, + { + "epoch": 5.547659334461365, + "grad_norm": 1.2602365016937256, + "learning_rate": 2.2264523406655388e-05, + "loss": 0.7447, + "step": 9836 + }, + { + "epoch": 5.548223350253807, + "grad_norm": 0.9879333972930908, + "learning_rate": 2.2261703327693177e-05, + "loss": 0.7731, + "step": 9837 + }, + { + "epoch": 5.548787366046249, + "grad_norm": 1.370261788368225, + "learning_rate": 2.2258883248730966e-05, + "loss": 0.8264, + "step": 9838 + }, + { + "epoch": 5.549351381838692, + "grad_norm": 1.4594802856445312, + "learning_rate": 2.2256063169768754e-05, + "loss": 0.7435, + "step": 9839 + }, + { + "epoch": 5.549915397631134, + "grad_norm": 0.8275618553161621, + "learning_rate": 2.2253243090806543e-05, + "loss": 0.6388, + "step": 9840 + }, + { + "epoch": 5.5504794134235755, + "grad_norm": 1.0979266166687012, + "learning_rate": 2.2250423011844332e-05, + "loss": 0.7368, + "step": 9841 + }, + { + "epoch": 5.551043429216018, + "grad_norm": 1.1049968004226685, + "learning_rate": 2.224760293288212e-05, + "loss": 0.8633, + "step": 9842 + }, + { + "epoch": 5.55160744500846, + "grad_norm": 1.2938828468322754, + "learning_rate": 2.224478285391991e-05, + "loss": 0.8266, + "step": 9843 + }, + { + "epoch": 5.552171460800903, + "grad_norm": 1.3264131546020508, + "learning_rate": 2.2241962774957702e-05, + "loss": 0.8233, + "step": 9844 + }, + { + "epoch": 5.5527354765933445, + "grad_norm": 1.1594719886779785, + "learning_rate": 2.2239142695995487e-05, + "loss": 0.7633, + "step": 9845 + }, + { + "epoch": 5.553299492385786, + "grad_norm": 1.1719365119934082, + "learning_rate": 2.223632261703328e-05, + "loss": 0.7395, + "step": 9846 + }, + { + "epoch": 5.553863508178229, + "grad_norm": 0.9993629455566406, + "learning_rate": 2.2233502538071065e-05, + "loss": 0.6998, + "step": 9847 + }, + { + "epoch": 5.554427523970671, + "grad_norm": 2.4667415618896484, + "learning_rate": 2.2230682459108857e-05, + "loss": 0.7915, + "step": 9848 + }, + { + "epoch": 5.5549915397631136, + "grad_norm": 0.8506662845611572, + "learning_rate": 2.2227862380146646e-05, + "loss": 0.67, + "step": 9849 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.9912572503089905, + "learning_rate": 2.2225042301184434e-05, + "loss": 0.7449, + "step": 9850 + }, + { + "epoch": 5.556119571347998, + "grad_norm": 1.364230751991272, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.7669, + "step": 9851 + }, + { + "epoch": 5.55668358714044, + "grad_norm": 1.1885253190994263, + "learning_rate": 2.2219402143260012e-05, + "loss": 0.7113, + "step": 9852 + }, + { + "epoch": 5.557247602932883, + "grad_norm": 1.5918245315551758, + "learning_rate": 2.22165820642978e-05, + "loss": 0.7981, + "step": 9853 + }, + { + "epoch": 5.557811618725324, + "grad_norm": 1.2287622690200806, + "learning_rate": 2.221376198533559e-05, + "loss": 0.7045, + "step": 9854 + }, + { + "epoch": 5.558375634517766, + "grad_norm": 1.225943684577942, + "learning_rate": 2.2210941906373382e-05, + "loss": 0.7525, + "step": 9855 + }, + { + "epoch": 5.558939650310209, + "grad_norm": 1.4065817594528198, + "learning_rate": 2.2208121827411167e-05, + "loss": 0.8316, + "step": 9856 + }, + { + "epoch": 5.559503666102651, + "grad_norm": 1.1487215757369995, + "learning_rate": 2.220530174844896e-05, + "loss": 0.6797, + "step": 9857 + }, + { + "epoch": 5.560067681895093, + "grad_norm": 1.1039607524871826, + "learning_rate": 2.2202481669486748e-05, + "loss": 0.6952, + "step": 9858 + }, + { + "epoch": 5.560631697687535, + "grad_norm": 0.809675395488739, + "learning_rate": 2.2199661590524537e-05, + "loss": 0.601, + "step": 9859 + }, + { + "epoch": 5.561195713479977, + "grad_norm": 1.4885399341583252, + "learning_rate": 2.2196841511562326e-05, + "loss": 0.8066, + "step": 9860 + }, + { + "epoch": 5.56175972927242, + "grad_norm": 0.9342008829116821, + "learning_rate": 2.2194021432600115e-05, + "loss": 0.6698, + "step": 9861 + }, + { + "epoch": 5.5623237450648615, + "grad_norm": 0.9480478167533875, + "learning_rate": 2.2191201353637903e-05, + "loss": 0.7521, + "step": 9862 + }, + { + "epoch": 5.562887760857304, + "grad_norm": 0.9372832775115967, + "learning_rate": 2.2188381274675692e-05, + "loss": 0.7508, + "step": 9863 + }, + { + "epoch": 5.563451776649746, + "grad_norm": 2.046750545501709, + "learning_rate": 2.218556119571348e-05, + "loss": 0.7564, + "step": 9864 + }, + { + "epoch": 5.564015792442189, + "grad_norm": 1.077440857887268, + "learning_rate": 2.218274111675127e-05, + "loss": 0.6918, + "step": 9865 + }, + { + "epoch": 5.5645798082346305, + "grad_norm": 1.1553242206573486, + "learning_rate": 2.217992103778906e-05, + "loss": 0.7196, + "step": 9866 + }, + { + "epoch": 5.565143824027073, + "grad_norm": 1.1091188192367554, + "learning_rate": 2.217710095882685e-05, + "loss": 0.6986, + "step": 9867 + }, + { + "epoch": 5.565707839819515, + "grad_norm": 1.346698522567749, + "learning_rate": 2.2174280879864636e-05, + "loss": 0.7527, + "step": 9868 + }, + { + "epoch": 5.566271855611957, + "grad_norm": 0.9952457547187805, + "learning_rate": 2.217146080090243e-05, + "loss": 0.7713, + "step": 9869 + }, + { + "epoch": 5.5668358714043995, + "grad_norm": 1.0381057262420654, + "learning_rate": 2.2168640721940214e-05, + "loss": 0.6449, + "step": 9870 + }, + { + "epoch": 5.567399887196841, + "grad_norm": 0.9654024839401245, + "learning_rate": 2.2165820642978006e-05, + "loss": 0.7827, + "step": 9871 + }, + { + "epoch": 5.567963902989284, + "grad_norm": 1.0105341672897339, + "learning_rate": 2.216300056401579e-05, + "loss": 0.7127, + "step": 9872 + }, + { + "epoch": 5.568527918781726, + "grad_norm": 1.3070576190948486, + "learning_rate": 2.2160180485053584e-05, + "loss": 0.7466, + "step": 9873 + }, + { + "epoch": 5.569091934574168, + "grad_norm": 0.9564566016197205, + "learning_rate": 2.2157360406091372e-05, + "loss": 0.6583, + "step": 9874 + }, + { + "epoch": 5.56965595036661, + "grad_norm": 0.9456357359886169, + "learning_rate": 2.215454032712916e-05, + "loss": 0.7393, + "step": 9875 + }, + { + "epoch": 5.570219966159052, + "grad_norm": 0.9591564536094666, + "learning_rate": 2.215172024816695e-05, + "loss": 0.7599, + "step": 9876 + }, + { + "epoch": 5.570783981951495, + "grad_norm": 1.150158166885376, + "learning_rate": 2.214890016920474e-05, + "loss": 0.7427, + "step": 9877 + }, + { + "epoch": 5.571347997743937, + "grad_norm": 1.0692152976989746, + "learning_rate": 2.2146080090242528e-05, + "loss": 0.7754, + "step": 9878 + }, + { + "epoch": 5.571912013536379, + "grad_norm": 0.926566481590271, + "learning_rate": 2.2143260011280316e-05, + "loss": 0.7457, + "step": 9879 + }, + { + "epoch": 5.572476029328821, + "grad_norm": 0.9680757522583008, + "learning_rate": 2.2140439932318105e-05, + "loss": 0.6369, + "step": 9880 + }, + { + "epoch": 5.573040045121264, + "grad_norm": 1.3215887546539307, + "learning_rate": 2.2137619853355894e-05, + "loss": 0.7202, + "step": 9881 + }, + { + "epoch": 5.573604060913706, + "grad_norm": 1.0752038955688477, + "learning_rate": 2.2134799774393683e-05, + "loss": 0.6706, + "step": 9882 + }, + { + "epoch": 5.5741680767061474, + "grad_norm": 2.375749111175537, + "learning_rate": 2.2131979695431475e-05, + "loss": 0.8594, + "step": 9883 + }, + { + "epoch": 5.57473209249859, + "grad_norm": 1.389838695526123, + "learning_rate": 2.212915961646926e-05, + "loss": 0.8016, + "step": 9884 + }, + { + "epoch": 5.575296108291032, + "grad_norm": 1.410646915435791, + "learning_rate": 2.2126339537507053e-05, + "loss": 0.8084, + "step": 9885 + }, + { + "epoch": 5.575860124083475, + "grad_norm": 1.1280182600021362, + "learning_rate": 2.2123519458544838e-05, + "loss": 0.796, + "step": 9886 + }, + { + "epoch": 5.5764241398759165, + "grad_norm": 1.501784324645996, + "learning_rate": 2.212069937958263e-05, + "loss": 0.7125, + "step": 9887 + }, + { + "epoch": 5.576988155668358, + "grad_norm": 1.1757307052612305, + "learning_rate": 2.2117879300620416e-05, + "loss": 0.6985, + "step": 9888 + }, + { + "epoch": 5.577552171460801, + "grad_norm": 1.67729914188385, + "learning_rate": 2.2115059221658208e-05, + "loss": 0.8372, + "step": 9889 + }, + { + "epoch": 5.578116187253243, + "grad_norm": 0.8773844242095947, + "learning_rate": 2.2112239142695997e-05, + "loss": 0.7335, + "step": 9890 + }, + { + "epoch": 5.5786802030456855, + "grad_norm": 1.0837759971618652, + "learning_rate": 2.2109419063733785e-05, + "loss": 0.714, + "step": 9891 + }, + { + "epoch": 5.579244218838127, + "grad_norm": 1.0567749738693237, + "learning_rate": 2.2106598984771577e-05, + "loss": 0.7229, + "step": 9892 + }, + { + "epoch": 5.57980823463057, + "grad_norm": 1.1312987804412842, + "learning_rate": 2.2103778905809363e-05, + "loss": 0.7112, + "step": 9893 + }, + { + "epoch": 5.580372250423012, + "grad_norm": 0.9366505742073059, + "learning_rate": 2.2100958826847155e-05, + "loss": 0.6852, + "step": 9894 + }, + { + "epoch": 5.5809362662154545, + "grad_norm": 1.0879757404327393, + "learning_rate": 2.209813874788494e-05, + "loss": 0.8777, + "step": 9895 + }, + { + "epoch": 5.581500282007896, + "grad_norm": 1.3282349109649658, + "learning_rate": 2.2095318668922733e-05, + "loss": 0.7751, + "step": 9896 + }, + { + "epoch": 5.582064297800338, + "grad_norm": 1.1757941246032715, + "learning_rate": 2.2092498589960518e-05, + "loss": 0.714, + "step": 9897 + }, + { + "epoch": 5.582628313592781, + "grad_norm": 1.1896471977233887, + "learning_rate": 2.208967851099831e-05, + "loss": 0.7701, + "step": 9898 + }, + { + "epoch": 5.583192329385223, + "grad_norm": 1.4063563346862793, + "learning_rate": 2.20868584320361e-05, + "loss": 0.7719, + "step": 9899 + }, + { + "epoch": 5.583756345177665, + "grad_norm": 1.4919698238372803, + "learning_rate": 2.2084038353073888e-05, + "loss": 0.7513, + "step": 9900 + }, + { + "epoch": 5.584320360970107, + "grad_norm": 1.8041651248931885, + "learning_rate": 2.2081218274111677e-05, + "loss": 0.8478, + "step": 9901 + }, + { + "epoch": 5.584884376762549, + "grad_norm": 1.1094328165054321, + "learning_rate": 2.2078398195149465e-05, + "loss": 0.7045, + "step": 9902 + }, + { + "epoch": 5.585448392554992, + "grad_norm": 0.8916254043579102, + "learning_rate": 2.2075578116187254e-05, + "loss": 0.6995, + "step": 9903 + }, + { + "epoch": 5.586012408347433, + "grad_norm": 0.9634565114974976, + "learning_rate": 2.2072758037225043e-05, + "loss": 0.8225, + "step": 9904 + }, + { + "epoch": 5.586576424139876, + "grad_norm": 1.0678867101669312, + "learning_rate": 2.2069937958262832e-05, + "loss": 0.6601, + "step": 9905 + }, + { + "epoch": 5.587140439932318, + "grad_norm": 1.67549729347229, + "learning_rate": 2.206711787930062e-05, + "loss": 0.8496, + "step": 9906 + }, + { + "epoch": 5.587704455724761, + "grad_norm": 1.0570329427719116, + "learning_rate": 2.206429780033841e-05, + "loss": 0.7347, + "step": 9907 + }, + { + "epoch": 5.588268471517202, + "grad_norm": 2.1732113361358643, + "learning_rate": 2.20614777213762e-05, + "loss": 0.8254, + "step": 9908 + }, + { + "epoch": 5.588832487309645, + "grad_norm": 1.1156539916992188, + "learning_rate": 2.2058657642413987e-05, + "loss": 0.7248, + "step": 9909 + }, + { + "epoch": 5.589396503102087, + "grad_norm": 0.9420917630195618, + "learning_rate": 2.205583756345178e-05, + "loss": 0.6863, + "step": 9910 + }, + { + "epoch": 5.589960518894529, + "grad_norm": 1.0214710235595703, + "learning_rate": 2.2053017484489565e-05, + "loss": 0.713, + "step": 9911 + }, + { + "epoch": 5.590524534686971, + "grad_norm": 1.1905676126480103, + "learning_rate": 2.2050197405527357e-05, + "loss": 0.7597, + "step": 9912 + }, + { + "epoch": 5.591088550479413, + "grad_norm": 0.9959805607795715, + "learning_rate": 2.2047377326565146e-05, + "loss": 0.7099, + "step": 9913 + }, + { + "epoch": 5.591652566271856, + "grad_norm": 1.2591382265090942, + "learning_rate": 2.2044557247602934e-05, + "loss": 0.8464, + "step": 9914 + }, + { + "epoch": 5.592216582064298, + "grad_norm": 1.0410892963409424, + "learning_rate": 2.2041737168640723e-05, + "loss": 0.8279, + "step": 9915 + }, + { + "epoch": 5.5927805978567395, + "grad_norm": 1.5000293254852295, + "learning_rate": 2.2038917089678512e-05, + "loss": 0.8201, + "step": 9916 + }, + { + "epoch": 5.593344613649182, + "grad_norm": 1.126343846321106, + "learning_rate": 2.20360970107163e-05, + "loss": 0.7565, + "step": 9917 + }, + { + "epoch": 5.593908629441624, + "grad_norm": 1.4549280405044556, + "learning_rate": 2.203327693175409e-05, + "loss": 0.8109, + "step": 9918 + }, + { + "epoch": 5.594472645234067, + "grad_norm": 1.0342249870300293, + "learning_rate": 2.203045685279188e-05, + "loss": 0.7029, + "step": 9919 + }, + { + "epoch": 5.5950366610265085, + "grad_norm": 1.0171544551849365, + "learning_rate": 2.2027636773829667e-05, + "loss": 0.7323, + "step": 9920 + }, + { + "epoch": 5.595600676818951, + "grad_norm": 1.1623272895812988, + "learning_rate": 2.2024816694867456e-05, + "loss": 0.6231, + "step": 9921 + }, + { + "epoch": 5.596164692611393, + "grad_norm": 1.0397076606750488, + "learning_rate": 2.2021996615905248e-05, + "loss": 0.7437, + "step": 9922 + }, + { + "epoch": 5.596728708403836, + "grad_norm": 0.8960543870925903, + "learning_rate": 2.2019176536943034e-05, + "loss": 0.7396, + "step": 9923 + }, + { + "epoch": 5.5972927241962775, + "grad_norm": 0.7621214389801025, + "learning_rate": 2.2016356457980826e-05, + "loss": 0.6455, + "step": 9924 + }, + { + "epoch": 5.597856739988719, + "grad_norm": 1.0144731998443604, + "learning_rate": 2.201353637901861e-05, + "loss": 0.6911, + "step": 9925 + }, + { + "epoch": 5.598420755781162, + "grad_norm": 2.1862220764160156, + "learning_rate": 2.2010716300056403e-05, + "loss": 0.9043, + "step": 9926 + }, + { + "epoch": 5.598984771573604, + "grad_norm": 1.4644219875335693, + "learning_rate": 2.2007896221094192e-05, + "loss": 0.8172, + "step": 9927 + }, + { + "epoch": 5.5995487873660466, + "grad_norm": 1.317191481590271, + "learning_rate": 2.200507614213198e-05, + "loss": 0.8359, + "step": 9928 + }, + { + "epoch": 5.600112803158488, + "grad_norm": 0.9539303779602051, + "learning_rate": 2.200225606316977e-05, + "loss": 0.8059, + "step": 9929 + }, + { + "epoch": 5.60067681895093, + "grad_norm": 1.4187389612197876, + "learning_rate": 2.199943598420756e-05, + "loss": 0.745, + "step": 9930 + }, + { + "epoch": 5.601240834743373, + "grad_norm": 1.044615626335144, + "learning_rate": 2.199661590524535e-05, + "loss": 0.7687, + "step": 9931 + }, + { + "epoch": 5.601804850535815, + "grad_norm": 1.1825158596038818, + "learning_rate": 2.1993795826283136e-05, + "loss": 0.8958, + "step": 9932 + }, + { + "epoch": 5.602368866328257, + "grad_norm": 1.0652045011520386, + "learning_rate": 2.199097574732093e-05, + "loss": 0.7043, + "step": 9933 + }, + { + "epoch": 5.602932882120699, + "grad_norm": 1.5711308717727661, + "learning_rate": 2.1988155668358714e-05, + "loss": 0.7546, + "step": 9934 + }, + { + "epoch": 5.603496897913142, + "grad_norm": 1.180494785308838, + "learning_rate": 2.1985335589396506e-05, + "loss": 0.8297, + "step": 9935 + }, + { + "epoch": 5.604060913705584, + "grad_norm": 1.0332908630371094, + "learning_rate": 2.198251551043429e-05, + "loss": 0.794, + "step": 9936 + }, + { + "epoch": 5.604624929498026, + "grad_norm": 0.8914054036140442, + "learning_rate": 2.1979695431472084e-05, + "loss": 0.7222, + "step": 9937 + }, + { + "epoch": 5.605188945290468, + "grad_norm": 1.5501946210861206, + "learning_rate": 2.1976875352509872e-05, + "loss": 0.7867, + "step": 9938 + }, + { + "epoch": 5.60575296108291, + "grad_norm": 1.523630976676941, + "learning_rate": 2.197405527354766e-05, + "loss": 0.818, + "step": 9939 + }, + { + "epoch": 5.606316976875353, + "grad_norm": 1.4115971326828003, + "learning_rate": 2.197123519458545e-05, + "loss": 0.8593, + "step": 9940 + }, + { + "epoch": 5.6068809926677945, + "grad_norm": 0.9196008443832397, + "learning_rate": 2.196841511562324e-05, + "loss": 0.6363, + "step": 9941 + }, + { + "epoch": 5.607445008460237, + "grad_norm": 1.0750130414962769, + "learning_rate": 2.1965595036661027e-05, + "loss": 0.6944, + "step": 9942 + }, + { + "epoch": 5.608009024252679, + "grad_norm": 1.4499597549438477, + "learning_rate": 2.1962774957698816e-05, + "loss": 0.7945, + "step": 9943 + }, + { + "epoch": 5.608573040045121, + "grad_norm": 1.1373116970062256, + "learning_rate": 2.1959954878736605e-05, + "loss": 0.7654, + "step": 9944 + }, + { + "epoch": 5.6091370558375635, + "grad_norm": 1.1310838460922241, + "learning_rate": 2.1957134799774394e-05, + "loss": 0.6673, + "step": 9945 + }, + { + "epoch": 5.609701071630005, + "grad_norm": 1.0245542526245117, + "learning_rate": 2.1954314720812183e-05, + "loss": 0.8749, + "step": 9946 + }, + { + "epoch": 5.610265087422448, + "grad_norm": 0.9450960755348206, + "learning_rate": 2.1951494641849975e-05, + "loss": 0.735, + "step": 9947 + }, + { + "epoch": 5.61082910321489, + "grad_norm": 1.2608659267425537, + "learning_rate": 2.194867456288776e-05, + "loss": 0.7293, + "step": 9948 + }, + { + "epoch": 5.6113931190073325, + "grad_norm": 1.3537888526916504, + "learning_rate": 2.1945854483925552e-05, + "loss": 0.8142, + "step": 9949 + }, + { + "epoch": 5.611957134799774, + "grad_norm": 1.0705063343048096, + "learning_rate": 2.1943034404963338e-05, + "loss": 0.6729, + "step": 9950 + }, + { + "epoch": 5.612521150592217, + "grad_norm": 1.2906690835952759, + "learning_rate": 2.194021432600113e-05, + "loss": 0.8159, + "step": 9951 + }, + { + "epoch": 5.613085166384659, + "grad_norm": 1.3185193538665771, + "learning_rate": 2.1937394247038915e-05, + "loss": 0.8031, + "step": 9952 + }, + { + "epoch": 5.613649182177101, + "grad_norm": 1.0667599439620972, + "learning_rate": 2.1934574168076708e-05, + "loss": 0.779, + "step": 9953 + }, + { + "epoch": 5.614213197969543, + "grad_norm": 1.2882044315338135, + "learning_rate": 2.1931754089114496e-05, + "loss": 0.649, + "step": 9954 + }, + { + "epoch": 5.614777213761985, + "grad_norm": 1.1248366832733154, + "learning_rate": 2.1928934010152285e-05, + "loss": 0.6738, + "step": 9955 + }, + { + "epoch": 5.615341229554428, + "grad_norm": 0.9657866954803467, + "learning_rate": 2.1926113931190074e-05, + "loss": 0.7648, + "step": 9956 + }, + { + "epoch": 5.61590524534687, + "grad_norm": 1.2126744985580444, + "learning_rate": 2.1923293852227863e-05, + "loss": 0.8772, + "step": 9957 + }, + { + "epoch": 5.616469261139311, + "grad_norm": 1.761336326599121, + "learning_rate": 2.192047377326565e-05, + "loss": 0.7709, + "step": 9958 + }, + { + "epoch": 5.617033276931754, + "grad_norm": 1.0340251922607422, + "learning_rate": 2.191765369430344e-05, + "loss": 0.7102, + "step": 9959 + }, + { + "epoch": 5.617597292724196, + "grad_norm": 2.2373030185699463, + "learning_rate": 2.191483361534123e-05, + "loss": 0.8113, + "step": 9960 + }, + { + "epoch": 5.618161308516639, + "grad_norm": 1.4285591840744019, + "learning_rate": 2.1912013536379018e-05, + "loss": 0.6281, + "step": 9961 + }, + { + "epoch": 5.6187253243090804, + "grad_norm": 1.423182487487793, + "learning_rate": 2.190919345741681e-05, + "loss": 0.8859, + "step": 9962 + }, + { + "epoch": 5.619289340101523, + "grad_norm": 0.9346237182617188, + "learning_rate": 2.19063733784546e-05, + "loss": 0.6832, + "step": 9963 + }, + { + "epoch": 5.619853355893965, + "grad_norm": 1.0091732740402222, + "learning_rate": 2.1903553299492388e-05, + "loss": 0.7991, + "step": 9964 + }, + { + "epoch": 5.620417371686408, + "grad_norm": 1.131801724433899, + "learning_rate": 2.1900733220530177e-05, + "loss": 0.6652, + "step": 9965 + }, + { + "epoch": 5.6209813874788495, + "grad_norm": 0.9676613211631775, + "learning_rate": 2.1897913141567965e-05, + "loss": 0.7706, + "step": 9966 + }, + { + "epoch": 5.621545403271291, + "grad_norm": 0.9011164307594299, + "learning_rate": 2.1895093062605754e-05, + "loss": 0.7083, + "step": 9967 + }, + { + "epoch": 5.622109419063734, + "grad_norm": 1.078241229057312, + "learning_rate": 2.1892272983643543e-05, + "loss": 0.8267, + "step": 9968 + }, + { + "epoch": 5.622673434856176, + "grad_norm": 1.3340457677841187, + "learning_rate": 2.1889452904681332e-05, + "loss": 0.7593, + "step": 9969 + }, + { + "epoch": 5.6232374506486185, + "grad_norm": 1.256860375404358, + "learning_rate": 2.188663282571912e-05, + "loss": 0.7272, + "step": 9970 + }, + { + "epoch": 5.62380146644106, + "grad_norm": 0.9782193303108215, + "learning_rate": 2.188381274675691e-05, + "loss": 0.6791, + "step": 9971 + }, + { + "epoch": 5.624365482233502, + "grad_norm": 1.3724271059036255, + "learning_rate": 2.18809926677947e-05, + "loss": 0.7625, + "step": 9972 + }, + { + "epoch": 5.624929498025945, + "grad_norm": 1.0730518102645874, + "learning_rate": 2.1878172588832487e-05, + "loss": 0.7393, + "step": 9973 + }, + { + "epoch": 5.625493513818387, + "grad_norm": 0.9313830137252808, + "learning_rate": 2.187535250987028e-05, + "loss": 0.7147, + "step": 9974 + }, + { + "epoch": 5.626057529610829, + "grad_norm": 1.126033067703247, + "learning_rate": 2.1872532430908065e-05, + "loss": 0.8287, + "step": 9975 + }, + { + "epoch": 5.626621545403271, + "grad_norm": 1.0741455554962158, + "learning_rate": 2.1869712351945857e-05, + "loss": 0.7985, + "step": 9976 + }, + { + "epoch": 5.627185561195714, + "grad_norm": 1.26939058303833, + "learning_rate": 2.1866892272983646e-05, + "loss": 0.8178, + "step": 9977 + }, + { + "epoch": 5.627749576988156, + "grad_norm": 3.1869256496429443, + "learning_rate": 2.1864072194021434e-05, + "loss": 0.8649, + "step": 9978 + }, + { + "epoch": 5.628313592780598, + "grad_norm": 1.0377564430236816, + "learning_rate": 2.1861252115059223e-05, + "loss": 0.7269, + "step": 9979 + }, + { + "epoch": 5.62887760857304, + "grad_norm": 1.251015067100525, + "learning_rate": 2.1858432036097012e-05, + "loss": 0.6772, + "step": 9980 + }, + { + "epoch": 5.629441624365482, + "grad_norm": 1.5977680683135986, + "learning_rate": 2.18556119571348e-05, + "loss": 0.8937, + "step": 9981 + }, + { + "epoch": 5.630005640157925, + "grad_norm": 0.9731741547584534, + "learning_rate": 2.185279187817259e-05, + "loss": 0.7112, + "step": 9982 + }, + { + "epoch": 5.630569655950366, + "grad_norm": 0.8784654140472412, + "learning_rate": 2.184997179921038e-05, + "loss": 0.7348, + "step": 9983 + }, + { + "epoch": 5.631133671742809, + "grad_norm": 1.0361409187316895, + "learning_rate": 2.1847151720248167e-05, + "loss": 0.7019, + "step": 9984 + }, + { + "epoch": 5.631697687535251, + "grad_norm": 0.8787285685539246, + "learning_rate": 2.1844331641285956e-05, + "loss": 0.6403, + "step": 9985 + }, + { + "epoch": 5.632261703327693, + "grad_norm": 1.277635931968689, + "learning_rate": 2.1841511562323748e-05, + "loss": 0.7532, + "step": 9986 + }, + { + "epoch": 5.632825719120135, + "grad_norm": 1.278127670288086, + "learning_rate": 2.1838691483361533e-05, + "loss": 0.7403, + "step": 9987 + }, + { + "epoch": 5.633389734912577, + "grad_norm": 1.3188260793685913, + "learning_rate": 2.1835871404399326e-05, + "loss": 0.7983, + "step": 9988 + }, + { + "epoch": 5.63395375070502, + "grad_norm": 0.9985328912734985, + "learning_rate": 2.183305132543711e-05, + "loss": 0.7086, + "step": 9989 + }, + { + "epoch": 5.634517766497462, + "grad_norm": 1.1520940065383911, + "learning_rate": 2.1830231246474903e-05, + "loss": 0.653, + "step": 9990 + }, + { + "epoch": 5.635081782289904, + "grad_norm": 0.9107814431190491, + "learning_rate": 2.182741116751269e-05, + "loss": 0.7206, + "step": 9991 + }, + { + "epoch": 5.635645798082346, + "grad_norm": 1.1933906078338623, + "learning_rate": 2.182459108855048e-05, + "loss": 0.7522, + "step": 9992 + }, + { + "epoch": 5.636209813874789, + "grad_norm": 1.1650723218917847, + "learning_rate": 2.182177100958827e-05, + "loss": 0.7643, + "step": 9993 + }, + { + "epoch": 5.636773829667231, + "grad_norm": 0.9647825360298157, + "learning_rate": 2.181895093062606e-05, + "loss": 0.7487, + "step": 9994 + }, + { + "epoch": 5.6373378454596725, + "grad_norm": 0.9504395723342896, + "learning_rate": 2.1816130851663847e-05, + "loss": 0.7003, + "step": 9995 + }, + { + "epoch": 5.637901861252115, + "grad_norm": 15.183887481689453, + "learning_rate": 2.1813310772701636e-05, + "loss": 0.7981, + "step": 9996 + }, + { + "epoch": 5.638465877044557, + "grad_norm": 1.1957145929336548, + "learning_rate": 2.1810490693739428e-05, + "loss": 0.7769, + "step": 9997 + }, + { + "epoch": 5.639029892837, + "grad_norm": 0.9595609903335571, + "learning_rate": 2.1807670614777214e-05, + "loss": 0.6555, + "step": 9998 + }, + { + "epoch": 5.6395939086294415, + "grad_norm": 1.0711984634399414, + "learning_rate": 2.1804850535815006e-05, + "loss": 0.6577, + "step": 9999 + }, + { + "epoch": 5.640157924421883, + "grad_norm": 1.1091707944869995, + "learning_rate": 2.180203045685279e-05, + "loss": 0.7726, + "step": 10000 + }, + { + "epoch": 5.640721940214326, + "grad_norm": 1.2896289825439453, + "learning_rate": 2.1799210377890583e-05, + "loss": 0.8039, + "step": 10001 + }, + { + "epoch": 5.641285956006768, + "grad_norm": 1.173435091972351, + "learning_rate": 2.1796390298928372e-05, + "loss": 0.8718, + "step": 10002 + }, + { + "epoch": 5.6418499717992106, + "grad_norm": 1.3018962144851685, + "learning_rate": 2.179357021996616e-05, + "loss": 0.6947, + "step": 10003 + }, + { + "epoch": 5.642413987591652, + "grad_norm": 1.0861037969589233, + "learning_rate": 2.179075014100395e-05, + "loss": 0.6604, + "step": 10004 + }, + { + "epoch": 5.642978003384095, + "grad_norm": 1.1748970746994019, + "learning_rate": 2.178793006204174e-05, + "loss": 0.7529, + "step": 10005 + }, + { + "epoch": 5.643542019176537, + "grad_norm": 1.4900338649749756, + "learning_rate": 2.1785109983079527e-05, + "loss": 0.7905, + "step": 10006 + }, + { + "epoch": 5.64410603496898, + "grad_norm": 1.3486748933792114, + "learning_rate": 2.1782289904117316e-05, + "loss": 0.8292, + "step": 10007 + }, + { + "epoch": 5.644670050761421, + "grad_norm": 1.2924072742462158, + "learning_rate": 2.1779469825155105e-05, + "loss": 0.8483, + "step": 10008 + }, + { + "epoch": 5.645234066553863, + "grad_norm": 1.2880910634994507, + "learning_rate": 2.1776649746192894e-05, + "loss": 0.7652, + "step": 10009 + }, + { + "epoch": 5.645798082346306, + "grad_norm": 1.080166220664978, + "learning_rate": 2.1773829667230683e-05, + "loss": 0.707, + "step": 10010 + }, + { + "epoch": 5.646362098138748, + "grad_norm": 1.4199388027191162, + "learning_rate": 2.1771009588268475e-05, + "loss": 0.7515, + "step": 10011 + }, + { + "epoch": 5.64692611393119, + "grad_norm": 1.211152195930481, + "learning_rate": 2.176818950930626e-05, + "loss": 0.7653, + "step": 10012 + }, + { + "epoch": 5.647490129723632, + "grad_norm": 0.961672842502594, + "learning_rate": 2.1765369430344052e-05, + "loss": 0.8232, + "step": 10013 + }, + { + "epoch": 5.648054145516074, + "grad_norm": 1.0849708318710327, + "learning_rate": 2.1762549351381838e-05, + "loss": 0.7162, + "step": 10014 + }, + { + "epoch": 5.648618161308517, + "grad_norm": 1.0740289688110352, + "learning_rate": 2.175972927241963e-05, + "loss": 0.6666, + "step": 10015 + }, + { + "epoch": 5.6491821771009585, + "grad_norm": 1.1944124698638916, + "learning_rate": 2.175690919345742e-05, + "loss": 0.6681, + "step": 10016 + }, + { + "epoch": 5.649746192893401, + "grad_norm": 0.8585992455482483, + "learning_rate": 2.1754089114495208e-05, + "loss": 0.6398, + "step": 10017 + }, + { + "epoch": 5.650310208685843, + "grad_norm": 1.3170791864395142, + "learning_rate": 2.1751269035532996e-05, + "loss": 0.8141, + "step": 10018 + }, + { + "epoch": 5.650874224478286, + "grad_norm": 1.1636340618133545, + "learning_rate": 2.1748448956570785e-05, + "loss": 0.7, + "step": 10019 + }, + { + "epoch": 5.6514382402707275, + "grad_norm": 1.2875157594680786, + "learning_rate": 2.1745628877608574e-05, + "loss": 0.7829, + "step": 10020 + }, + { + "epoch": 5.65200225606317, + "grad_norm": 1.0265998840332031, + "learning_rate": 2.1742808798646363e-05, + "loss": 0.6718, + "step": 10021 + }, + { + "epoch": 5.652566271855612, + "grad_norm": 1.5472437143325806, + "learning_rate": 2.173998871968415e-05, + "loss": 0.8497, + "step": 10022 + }, + { + "epoch": 5.653130287648054, + "grad_norm": 0.854595959186554, + "learning_rate": 2.173716864072194e-05, + "loss": 0.6409, + "step": 10023 + }, + { + "epoch": 5.6536943034404965, + "grad_norm": 1.269713044166565, + "learning_rate": 2.173434856175973e-05, + "loss": 0.8007, + "step": 10024 + }, + { + "epoch": 5.654258319232938, + "grad_norm": 1.2652497291564941, + "learning_rate": 2.173152848279752e-05, + "loss": 0.7894, + "step": 10025 + }, + { + "epoch": 5.654822335025381, + "grad_norm": 1.4969841241836548, + "learning_rate": 2.1728708403835307e-05, + "loss": 0.6923, + "step": 10026 + }, + { + "epoch": 5.655386350817823, + "grad_norm": 1.231437087059021, + "learning_rate": 2.17258883248731e-05, + "loss": 0.699, + "step": 10027 + }, + { + "epoch": 5.655950366610265, + "grad_norm": 1.4682934284210205, + "learning_rate": 2.1723068245910884e-05, + "loss": 0.839, + "step": 10028 + }, + { + "epoch": 5.656514382402707, + "grad_norm": 1.0002269744873047, + "learning_rate": 2.1720248166948677e-05, + "loss": 0.7301, + "step": 10029 + }, + { + "epoch": 5.657078398195149, + "grad_norm": 1.9349017143249512, + "learning_rate": 2.1717428087986462e-05, + "loss": 0.8106, + "step": 10030 + }, + { + "epoch": 5.657642413987592, + "grad_norm": 0.9407554864883423, + "learning_rate": 2.1714608009024254e-05, + "loss": 0.6911, + "step": 10031 + }, + { + "epoch": 5.658206429780034, + "grad_norm": 1.7984998226165771, + "learning_rate": 2.1711787930062043e-05, + "loss": 0.7256, + "step": 10032 + }, + { + "epoch": 5.658770445572476, + "grad_norm": 1.064581036567688, + "learning_rate": 2.170896785109983e-05, + "loss": 0.6578, + "step": 10033 + }, + { + "epoch": 5.659334461364918, + "grad_norm": 1.331148624420166, + "learning_rate": 2.1706147772137624e-05, + "loss": 0.7615, + "step": 10034 + }, + { + "epoch": 5.659898477157361, + "grad_norm": 1.4361506700515747, + "learning_rate": 2.170332769317541e-05, + "loss": 0.7722, + "step": 10035 + }, + { + "epoch": 5.660462492949803, + "grad_norm": 1.2687406539916992, + "learning_rate": 2.17005076142132e-05, + "loss": 0.8464, + "step": 10036 + }, + { + "epoch": 5.6610265087422444, + "grad_norm": 1.285323143005371, + "learning_rate": 2.1697687535250987e-05, + "loss": 0.865, + "step": 10037 + }, + { + "epoch": 5.661590524534687, + "grad_norm": 0.9397885203361511, + "learning_rate": 2.169486745628878e-05, + "loss": 0.7204, + "step": 10038 + }, + { + "epoch": 5.662154540327129, + "grad_norm": 1.5653446912765503, + "learning_rate": 2.1692047377326564e-05, + "loss": 0.782, + "step": 10039 + }, + { + "epoch": 5.662718556119572, + "grad_norm": 1.5568065643310547, + "learning_rate": 2.1689227298364357e-05, + "loss": 0.7726, + "step": 10040 + }, + { + "epoch": 5.6632825719120135, + "grad_norm": 0.8882908225059509, + "learning_rate": 2.1686407219402145e-05, + "loss": 0.7528, + "step": 10041 + }, + { + "epoch": 5.663846587704455, + "grad_norm": 1.29019296169281, + "learning_rate": 2.1683587140439934e-05, + "loss": 0.7552, + "step": 10042 + }, + { + "epoch": 5.664410603496898, + "grad_norm": 1.3637782335281372, + "learning_rate": 2.1680767061477723e-05, + "loss": 0.8865, + "step": 10043 + }, + { + "epoch": 5.66497461928934, + "grad_norm": 1.0896133184432983, + "learning_rate": 2.1677946982515512e-05, + "loss": 0.7018, + "step": 10044 + }, + { + "epoch": 5.6655386350817825, + "grad_norm": 1.3541297912597656, + "learning_rate": 2.16751269035533e-05, + "loss": 0.7216, + "step": 10045 + }, + { + "epoch": 5.666102650874224, + "grad_norm": 1.1361794471740723, + "learning_rate": 2.167230682459109e-05, + "loss": 0.7038, + "step": 10046 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 1.229910969734192, + "learning_rate": 2.1669486745628878e-05, + "loss": 0.7655, + "step": 10047 + }, + { + "epoch": 5.667230682459109, + "grad_norm": 0.8317655324935913, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.6686, + "step": 10048 + }, + { + "epoch": 5.6677946982515515, + "grad_norm": 1.2761547565460205, + "learning_rate": 2.1663846587704456e-05, + "loss": 0.8024, + "step": 10049 + }, + { + "epoch": 5.668358714043993, + "grad_norm": 1.286130666732788, + "learning_rate": 2.1661026508742248e-05, + "loss": 0.7838, + "step": 10050 + }, + { + "epoch": 5.668922729836435, + "grad_norm": 1.1443952322006226, + "learning_rate": 2.1658206429780033e-05, + "loss": 0.7739, + "step": 10051 + }, + { + "epoch": 5.669486745628878, + "grad_norm": 0.959799587726593, + "learning_rate": 2.1655386350817826e-05, + "loss": 0.7515, + "step": 10052 + }, + { + "epoch": 5.67005076142132, + "grad_norm": 1.0078784227371216, + "learning_rate": 2.165256627185561e-05, + "loss": 0.7492, + "step": 10053 + }, + { + "epoch": 5.670614777213762, + "grad_norm": 0.9630573987960815, + "learning_rate": 2.1649746192893403e-05, + "loss": 0.7332, + "step": 10054 + }, + { + "epoch": 5.671178793006204, + "grad_norm": 0.8565779328346252, + "learning_rate": 2.164692611393119e-05, + "loss": 0.7544, + "step": 10055 + }, + { + "epoch": 5.671742808798646, + "grad_norm": 1.8834421634674072, + "learning_rate": 2.164410603496898e-05, + "loss": 0.8376, + "step": 10056 + }, + { + "epoch": 5.672306824591089, + "grad_norm": 1.2343144416809082, + "learning_rate": 2.164128595600677e-05, + "loss": 0.7552, + "step": 10057 + }, + { + "epoch": 5.67287084038353, + "grad_norm": 1.2565363645553589, + "learning_rate": 2.163846587704456e-05, + "loss": 0.715, + "step": 10058 + }, + { + "epoch": 5.673434856175973, + "grad_norm": 1.241108775138855, + "learning_rate": 2.1635645798082347e-05, + "loss": 0.7556, + "step": 10059 + }, + { + "epoch": 5.673998871968415, + "grad_norm": 1.2169463634490967, + "learning_rate": 2.1632825719120136e-05, + "loss": 0.7824, + "step": 10060 + }, + { + "epoch": 5.674562887760858, + "grad_norm": 0.8297600746154785, + "learning_rate": 2.1630005640157925e-05, + "loss": 0.659, + "step": 10061 + }, + { + "epoch": 5.675126903553299, + "grad_norm": 3.233808994293213, + "learning_rate": 2.1627185561195714e-05, + "loss": 0.8409, + "step": 10062 + }, + { + "epoch": 5.675690919345742, + "grad_norm": 0.9907492995262146, + "learning_rate": 2.1624365482233502e-05, + "loss": 0.6745, + "step": 10063 + }, + { + "epoch": 5.676254935138184, + "grad_norm": 0.8576064109802246, + "learning_rate": 2.162154540327129e-05, + "loss": 0.6828, + "step": 10064 + }, + { + "epoch": 5.676818950930626, + "grad_norm": 1.2662628889083862, + "learning_rate": 2.161872532430908e-05, + "loss": 0.814, + "step": 10065 + }, + { + "epoch": 5.677382966723068, + "grad_norm": 1.2394142150878906, + "learning_rate": 2.1615905245346872e-05, + "loss": 0.8143, + "step": 10066 + }, + { + "epoch": 5.67794698251551, + "grad_norm": 1.2527326345443726, + "learning_rate": 2.161308516638466e-05, + "loss": 0.6577, + "step": 10067 + }, + { + "epoch": 5.678510998307953, + "grad_norm": 1.297347068786621, + "learning_rate": 2.161026508742245e-05, + "loss": 0.8523, + "step": 10068 + }, + { + "epoch": 5.679075014100395, + "grad_norm": 1.3974318504333496, + "learning_rate": 2.160744500846024e-05, + "loss": 0.7589, + "step": 10069 + }, + { + "epoch": 5.6796390298928365, + "grad_norm": 1.0077965259552002, + "learning_rate": 2.1604624929498027e-05, + "loss": 0.6833, + "step": 10070 + }, + { + "epoch": 5.680203045685279, + "grad_norm": 0.9001182317733765, + "learning_rate": 2.1601804850535816e-05, + "loss": 0.729, + "step": 10071 + }, + { + "epoch": 5.680767061477721, + "grad_norm": 1.4224237203598022, + "learning_rate": 2.1598984771573605e-05, + "loss": 0.6697, + "step": 10072 + }, + { + "epoch": 5.681331077270164, + "grad_norm": 1.0960079431533813, + "learning_rate": 2.1596164692611394e-05, + "loss": 0.8636, + "step": 10073 + }, + { + "epoch": 5.6818950930626055, + "grad_norm": 1.2104071378707886, + "learning_rate": 2.1593344613649183e-05, + "loss": 0.6726, + "step": 10074 + }, + { + "epoch": 5.682459108855048, + "grad_norm": 1.1204053163528442, + "learning_rate": 2.1590524534686975e-05, + "loss": 0.6438, + "step": 10075 + }, + { + "epoch": 5.68302312464749, + "grad_norm": 1.3896105289459229, + "learning_rate": 2.158770445572476e-05, + "loss": 0.7567, + "step": 10076 + }, + { + "epoch": 5.683587140439933, + "grad_norm": 1.0367754697799683, + "learning_rate": 2.1584884376762552e-05, + "loss": 0.8007, + "step": 10077 + }, + { + "epoch": 5.6841511562323745, + "grad_norm": 1.191828727722168, + "learning_rate": 2.1582064297800338e-05, + "loss": 0.7826, + "step": 10078 + }, + { + "epoch": 5.684715172024816, + "grad_norm": 1.1056245565414429, + "learning_rate": 2.157924421883813e-05, + "loss": 0.7183, + "step": 10079 + }, + { + "epoch": 5.685279187817259, + "grad_norm": 1.3324718475341797, + "learning_rate": 2.157642413987592e-05, + "loss": 0.7189, + "step": 10080 + }, + { + "epoch": 5.685843203609701, + "grad_norm": 1.3069998025894165, + "learning_rate": 2.1573604060913707e-05, + "loss": 0.8041, + "step": 10081 + }, + { + "epoch": 5.6864072194021436, + "grad_norm": 0.8392359018325806, + "learning_rate": 2.1570783981951496e-05, + "loss": 0.6566, + "step": 10082 + }, + { + "epoch": 5.686971235194585, + "grad_norm": 1.2802153825759888, + "learning_rate": 2.1567963902989285e-05, + "loss": 0.7699, + "step": 10083 + }, + { + "epoch": 5.687535250987027, + "grad_norm": 0.8748158812522888, + "learning_rate": 2.1565143824027074e-05, + "loss": 0.7323, + "step": 10084 + }, + { + "epoch": 5.68809926677947, + "grad_norm": 1.0470703840255737, + "learning_rate": 2.1562323745064863e-05, + "loss": 0.6345, + "step": 10085 + }, + { + "epoch": 5.688663282571912, + "grad_norm": 4.3126630783081055, + "learning_rate": 2.155950366610265e-05, + "loss": 0.8024, + "step": 10086 + }, + { + "epoch": 5.689227298364354, + "grad_norm": 1.3979593515396118, + "learning_rate": 2.155668358714044e-05, + "loss": 0.7848, + "step": 10087 + }, + { + "epoch": 5.689791314156796, + "grad_norm": 0.9429330229759216, + "learning_rate": 2.155386350817823e-05, + "loss": 0.6609, + "step": 10088 + }, + { + "epoch": 5.690355329949239, + "grad_norm": 0.9502332806587219, + "learning_rate": 2.155104342921602e-05, + "loss": 0.7754, + "step": 10089 + }, + { + "epoch": 5.690919345741681, + "grad_norm": 1.0242717266082764, + "learning_rate": 2.1548223350253807e-05, + "loss": 0.6879, + "step": 10090 + }, + { + "epoch": 5.691483361534123, + "grad_norm": 1.0866366624832153, + "learning_rate": 2.15454032712916e-05, + "loss": 0.7516, + "step": 10091 + }, + { + "epoch": 5.692047377326565, + "grad_norm": 0.8927890658378601, + "learning_rate": 2.1542583192329384e-05, + "loss": 0.7294, + "step": 10092 + }, + { + "epoch": 5.692611393119007, + "grad_norm": 1.303646445274353, + "learning_rate": 2.1539763113367176e-05, + "loss": 0.7902, + "step": 10093 + }, + { + "epoch": 5.69317540891145, + "grad_norm": 1.152469277381897, + "learning_rate": 2.1536943034404962e-05, + "loss": 0.6854, + "step": 10094 + }, + { + "epoch": 5.6937394247038915, + "grad_norm": 0.732330322265625, + "learning_rate": 2.1534122955442754e-05, + "loss": 0.6406, + "step": 10095 + }, + { + "epoch": 5.694303440496334, + "grad_norm": 0.9640153050422668, + "learning_rate": 2.1531302876480543e-05, + "loss": 0.7152, + "step": 10096 + }, + { + "epoch": 5.694867456288776, + "grad_norm": 3.1887028217315674, + "learning_rate": 2.152848279751833e-05, + "loss": 0.775, + "step": 10097 + }, + { + "epoch": 5.695431472081218, + "grad_norm": 1.0086610317230225, + "learning_rate": 2.152566271855612e-05, + "loss": 0.7601, + "step": 10098 + }, + { + "epoch": 5.6959954878736605, + "grad_norm": 1.2829551696777344, + "learning_rate": 2.152284263959391e-05, + "loss": 0.7373, + "step": 10099 + }, + { + "epoch": 5.696559503666102, + "grad_norm": 1.0751516819000244, + "learning_rate": 2.1520022560631698e-05, + "loss": 0.6895, + "step": 10100 + }, + { + "epoch": 5.697123519458545, + "grad_norm": 1.0350805521011353, + "learning_rate": 2.1517202481669487e-05, + "loss": 0.7056, + "step": 10101 + }, + { + "epoch": 5.697687535250987, + "grad_norm": 1.282867193222046, + "learning_rate": 2.1514382402707276e-05, + "loss": 0.6484, + "step": 10102 + }, + { + "epoch": 5.6982515510434295, + "grad_norm": 1.9507503509521484, + "learning_rate": 2.1511562323745064e-05, + "loss": 0.7667, + "step": 10103 + }, + { + "epoch": 5.698815566835871, + "grad_norm": 1.6870779991149902, + "learning_rate": 2.1508742244782857e-05, + "loss": 0.831, + "step": 10104 + }, + { + "epoch": 5.699379582628314, + "grad_norm": 1.1379170417785645, + "learning_rate": 2.1505922165820645e-05, + "loss": 0.6542, + "step": 10105 + }, + { + "epoch": 5.699943598420756, + "grad_norm": 1.1176139116287231, + "learning_rate": 2.1503102086858434e-05, + "loss": 0.8723, + "step": 10106 + }, + { + "epoch": 5.700507614213198, + "grad_norm": 1.4911768436431885, + "learning_rate": 2.1500282007896223e-05, + "loss": 0.7301, + "step": 10107 + }, + { + "epoch": 5.70107163000564, + "grad_norm": 1.2432681322097778, + "learning_rate": 2.1497461928934012e-05, + "loss": 0.78, + "step": 10108 + }, + { + "epoch": 5.701635645798082, + "grad_norm": 1.709999442100525, + "learning_rate": 2.14946418499718e-05, + "loss": 0.7543, + "step": 10109 + }, + { + "epoch": 5.702199661590525, + "grad_norm": 1.2673200368881226, + "learning_rate": 2.149182177100959e-05, + "loss": 0.6988, + "step": 10110 + }, + { + "epoch": 5.702763677382967, + "grad_norm": 1.1059439182281494, + "learning_rate": 2.1489001692047378e-05, + "loss": 0.6878, + "step": 10111 + }, + { + "epoch": 5.703327693175408, + "grad_norm": 1.4353880882263184, + "learning_rate": 2.1486181613085167e-05, + "loss": 0.8228, + "step": 10112 + }, + { + "epoch": 5.703891708967851, + "grad_norm": 1.116050124168396, + "learning_rate": 2.1483361534122956e-05, + "loss": 0.6224, + "step": 10113 + }, + { + "epoch": 5.704455724760293, + "grad_norm": 1.1862070560455322, + "learning_rate": 2.1480541455160748e-05, + "loss": 0.8169, + "step": 10114 + }, + { + "epoch": 5.705019740552736, + "grad_norm": 2.488039970397949, + "learning_rate": 2.1477721376198533e-05, + "loss": 0.7625, + "step": 10115 + }, + { + "epoch": 5.7055837563451774, + "grad_norm": 1.101944923400879, + "learning_rate": 2.1474901297236326e-05, + "loss": 0.6159, + "step": 10116 + }, + { + "epoch": 5.70614777213762, + "grad_norm": 1.267340898513794, + "learning_rate": 2.147208121827411e-05, + "loss": 0.7026, + "step": 10117 + }, + { + "epoch": 5.706711787930062, + "grad_norm": 1.0948435068130493, + "learning_rate": 2.1469261139311903e-05, + "loss": 0.6793, + "step": 10118 + }, + { + "epoch": 5.707275803722505, + "grad_norm": 1.1554710865020752, + "learning_rate": 2.146644106034969e-05, + "loss": 0.7567, + "step": 10119 + }, + { + "epoch": 5.7078398195149465, + "grad_norm": 1.3714556694030762, + "learning_rate": 2.146362098138748e-05, + "loss": 0.7547, + "step": 10120 + }, + { + "epoch": 5.708403835307388, + "grad_norm": 0.8218474984169006, + "learning_rate": 2.146080090242527e-05, + "loss": 0.7438, + "step": 10121 + }, + { + "epoch": 5.708967851099831, + "grad_norm": 1.0526320934295654, + "learning_rate": 2.1457980823463058e-05, + "loss": 0.7083, + "step": 10122 + }, + { + "epoch": 5.709531866892273, + "grad_norm": 1.2816954851150513, + "learning_rate": 2.1455160744500847e-05, + "loss": 0.7436, + "step": 10123 + }, + { + "epoch": 5.7100958826847155, + "grad_norm": 1.0593774318695068, + "learning_rate": 2.1452340665538636e-05, + "loss": 0.702, + "step": 10124 + }, + { + "epoch": 5.710659898477157, + "grad_norm": 1.1477081775665283, + "learning_rate": 2.1449520586576425e-05, + "loss": 0.6807, + "step": 10125 + }, + { + "epoch": 5.711223914269599, + "grad_norm": 0.938200831413269, + "learning_rate": 2.1446700507614213e-05, + "loss": 0.7261, + "step": 10126 + }, + { + "epoch": 5.711787930062042, + "grad_norm": 1.2681814432144165, + "learning_rate": 2.1443880428652002e-05, + "loss": 0.6798, + "step": 10127 + }, + { + "epoch": 5.712351945854484, + "grad_norm": 1.038359522819519, + "learning_rate": 2.144106034968979e-05, + "loss": 0.7029, + "step": 10128 + }, + { + "epoch": 5.712915961646926, + "grad_norm": 1.1335769891738892, + "learning_rate": 2.143824027072758e-05, + "loss": 0.7336, + "step": 10129 + }, + { + "epoch": 5.713479977439368, + "grad_norm": 1.1324907541275024, + "learning_rate": 2.1435420191765372e-05, + "loss": 0.6791, + "step": 10130 + }, + { + "epoch": 5.714043993231811, + "grad_norm": 1.0708403587341309, + "learning_rate": 2.1432600112803157e-05, + "loss": 0.7098, + "step": 10131 + }, + { + "epoch": 5.714608009024253, + "grad_norm": 1.3622366189956665, + "learning_rate": 2.142978003384095e-05, + "loss": 0.7691, + "step": 10132 + }, + { + "epoch": 5.715172024816695, + "grad_norm": 1.2064887285232544, + "learning_rate": 2.1426959954878735e-05, + "loss": 0.734, + "step": 10133 + }, + { + "epoch": 5.715736040609137, + "grad_norm": 1.4114270210266113, + "learning_rate": 2.1424139875916527e-05, + "loss": 0.8475, + "step": 10134 + }, + { + "epoch": 5.716300056401579, + "grad_norm": 1.012355923652649, + "learning_rate": 2.1421319796954316e-05, + "loss": 0.7713, + "step": 10135 + }, + { + "epoch": 5.716864072194022, + "grad_norm": 1.4084007740020752, + "learning_rate": 2.1418499717992105e-05, + "loss": 0.8737, + "step": 10136 + }, + { + "epoch": 5.717428087986463, + "grad_norm": 0.8082910776138306, + "learning_rate": 2.1415679639029894e-05, + "loss": 0.6767, + "step": 10137 + }, + { + "epoch": 5.717992103778906, + "grad_norm": 1.5473755598068237, + "learning_rate": 2.1412859560067682e-05, + "loss": 0.8308, + "step": 10138 + }, + { + "epoch": 5.718556119571348, + "grad_norm": 0.905455470085144, + "learning_rate": 2.1410039481105475e-05, + "loss": 0.6731, + "step": 10139 + }, + { + "epoch": 5.71912013536379, + "grad_norm": 1.091281771659851, + "learning_rate": 2.140721940214326e-05, + "loss": 0.7309, + "step": 10140 + }, + { + "epoch": 5.719684151156232, + "grad_norm": 1.1277943849563599, + "learning_rate": 2.1404399323181052e-05, + "loss": 0.8006, + "step": 10141 + }, + { + "epoch": 5.720248166948674, + "grad_norm": 0.9096745848655701, + "learning_rate": 2.1401579244218838e-05, + "loss": 0.6686, + "step": 10142 + }, + { + "epoch": 5.720812182741117, + "grad_norm": 1.1098089218139648, + "learning_rate": 2.139875916525663e-05, + "loss": 0.7033, + "step": 10143 + }, + { + "epoch": 5.721376198533559, + "grad_norm": 0.8559788465499878, + "learning_rate": 2.139593908629442e-05, + "loss": 0.6767, + "step": 10144 + }, + { + "epoch": 5.721940214326001, + "grad_norm": 1.0769202709197998, + "learning_rate": 2.1393119007332207e-05, + "loss": 0.6579, + "step": 10145 + }, + { + "epoch": 5.722504230118443, + "grad_norm": 1.2171547412872314, + "learning_rate": 2.1390298928369996e-05, + "loss": 0.7452, + "step": 10146 + }, + { + "epoch": 5.723068245910886, + "grad_norm": 0.9689061641693115, + "learning_rate": 2.1387478849407785e-05, + "loss": 0.7048, + "step": 10147 + }, + { + "epoch": 5.723632261703328, + "grad_norm": 0.8016725182533264, + "learning_rate": 2.1384658770445574e-05, + "loss": 0.6959, + "step": 10148 + }, + { + "epoch": 5.7241962774957695, + "grad_norm": 1.139869213104248, + "learning_rate": 2.1381838691483363e-05, + "loss": 0.6767, + "step": 10149 + }, + { + "epoch": 5.724760293288212, + "grad_norm": 1.955506682395935, + "learning_rate": 2.137901861252115e-05, + "loss": 0.8895, + "step": 10150 + }, + { + "epoch": 5.725324309080654, + "grad_norm": 1.20160710811615, + "learning_rate": 2.137619853355894e-05, + "loss": 0.81, + "step": 10151 + }, + { + "epoch": 5.725888324873097, + "grad_norm": 0.8983718156814575, + "learning_rate": 2.137337845459673e-05, + "loss": 0.6255, + "step": 10152 + }, + { + "epoch": 5.7264523406655385, + "grad_norm": 1.0870414972305298, + "learning_rate": 2.137055837563452e-05, + "loss": 0.793, + "step": 10153 + }, + { + "epoch": 5.72701635645798, + "grad_norm": 1.4264295101165771, + "learning_rate": 2.1367738296672307e-05, + "loss": 0.7923, + "step": 10154 + }, + { + "epoch": 5.727580372250423, + "grad_norm": 1.193525791168213, + "learning_rate": 2.13649182177101e-05, + "loss": 0.8141, + "step": 10155 + }, + { + "epoch": 5.728144388042865, + "grad_norm": 1.1275498867034912, + "learning_rate": 2.1362098138747884e-05, + "loss": 0.7136, + "step": 10156 + }, + { + "epoch": 5.7287084038353075, + "grad_norm": 1.0080939531326294, + "learning_rate": 2.1359278059785676e-05, + "loss": 0.8514, + "step": 10157 + }, + { + "epoch": 5.729272419627749, + "grad_norm": 1.2133710384368896, + "learning_rate": 2.1356457980823462e-05, + "loss": 0.7953, + "step": 10158 + }, + { + "epoch": 5.729836435420192, + "grad_norm": 1.1002802848815918, + "learning_rate": 2.1353637901861254e-05, + "loss": 0.8293, + "step": 10159 + }, + { + "epoch": 5.730400451212634, + "grad_norm": 1.3731271028518677, + "learning_rate": 2.1350817822899043e-05, + "loss": 0.821, + "step": 10160 + }, + { + "epoch": 5.730964467005077, + "grad_norm": 1.1439074277877808, + "learning_rate": 2.134799774393683e-05, + "loss": 0.8151, + "step": 10161 + }, + { + "epoch": 5.731528482797518, + "grad_norm": 1.5013066530227661, + "learning_rate": 2.134517766497462e-05, + "loss": 0.7461, + "step": 10162 + }, + { + "epoch": 5.73209249858996, + "grad_norm": 1.095855474472046, + "learning_rate": 2.134235758601241e-05, + "loss": 0.7116, + "step": 10163 + }, + { + "epoch": 5.732656514382403, + "grad_norm": 1.1634461879730225, + "learning_rate": 2.1339537507050198e-05, + "loss": 0.7195, + "step": 10164 + }, + { + "epoch": 5.733220530174845, + "grad_norm": 1.1894537210464478, + "learning_rate": 2.1336717428087987e-05, + "loss": 0.6329, + "step": 10165 + }, + { + "epoch": 5.733784545967287, + "grad_norm": 1.3586474657058716, + "learning_rate": 2.1333897349125776e-05, + "loss": 0.8657, + "step": 10166 + }, + { + "epoch": 5.734348561759729, + "grad_norm": 1.665130853652954, + "learning_rate": 2.1331077270163564e-05, + "loss": 0.8118, + "step": 10167 + }, + { + "epoch": 5.734912577552171, + "grad_norm": 1.7288721799850464, + "learning_rate": 2.1328257191201353e-05, + "loss": 0.9182, + "step": 10168 + }, + { + "epoch": 5.735476593344614, + "grad_norm": 0.6984575986862183, + "learning_rate": 2.1325437112239145e-05, + "loss": 0.6126, + "step": 10169 + }, + { + "epoch": 5.7360406091370555, + "grad_norm": 1.2327383756637573, + "learning_rate": 2.132261703327693e-05, + "loss": 0.6693, + "step": 10170 + }, + { + "epoch": 5.736604624929498, + "grad_norm": 0.8074305057525635, + "learning_rate": 2.1319796954314723e-05, + "loss": 0.707, + "step": 10171 + }, + { + "epoch": 5.73716864072194, + "grad_norm": 1.0708980560302734, + "learning_rate": 2.1316976875352508e-05, + "loss": 0.7507, + "step": 10172 + }, + { + "epoch": 5.737732656514383, + "grad_norm": 1.3755613565444946, + "learning_rate": 2.13141567963903e-05, + "loss": 0.8267, + "step": 10173 + }, + { + "epoch": 5.7382966723068245, + "grad_norm": 1.1222012042999268, + "learning_rate": 2.131133671742809e-05, + "loss": 0.734, + "step": 10174 + }, + { + "epoch": 5.738860688099267, + "grad_norm": 1.1462059020996094, + "learning_rate": 2.1308516638465878e-05, + "loss": 0.7247, + "step": 10175 + }, + { + "epoch": 5.739424703891709, + "grad_norm": 1.2867991924285889, + "learning_rate": 2.1305696559503667e-05, + "loss": 0.7679, + "step": 10176 + }, + { + "epoch": 5.739988719684151, + "grad_norm": 1.1710333824157715, + "learning_rate": 2.1302876480541456e-05, + "loss": 0.7477, + "step": 10177 + }, + { + "epoch": 5.7405527354765935, + "grad_norm": 0.9333131909370422, + "learning_rate": 2.1300056401579248e-05, + "loss": 0.7208, + "step": 10178 + }, + { + "epoch": 5.741116751269035, + "grad_norm": 1.6656562089920044, + "learning_rate": 2.1297236322617033e-05, + "loss": 0.7944, + "step": 10179 + }, + { + "epoch": 5.741680767061478, + "grad_norm": 0.9727149605751038, + "learning_rate": 2.1294416243654825e-05, + "loss": 0.5961, + "step": 10180 + }, + { + "epoch": 5.74224478285392, + "grad_norm": 0.8424580693244934, + "learning_rate": 2.129159616469261e-05, + "loss": 0.6901, + "step": 10181 + }, + { + "epoch": 5.742808798646362, + "grad_norm": 1.266339659690857, + "learning_rate": 2.1288776085730403e-05, + "loss": 0.8106, + "step": 10182 + }, + { + "epoch": 5.743372814438804, + "grad_norm": 1.3606455326080322, + "learning_rate": 2.128595600676819e-05, + "loss": 0.7459, + "step": 10183 + }, + { + "epoch": 5.743936830231246, + "grad_norm": 1.22843337059021, + "learning_rate": 2.128313592780598e-05, + "loss": 0.7288, + "step": 10184 + }, + { + "epoch": 5.744500846023689, + "grad_norm": 1.1785149574279785, + "learning_rate": 2.128031584884377e-05, + "loss": 0.7893, + "step": 10185 + }, + { + "epoch": 5.745064861816131, + "grad_norm": 1.5164047479629517, + "learning_rate": 2.1277495769881558e-05, + "loss": 0.7642, + "step": 10186 + }, + { + "epoch": 5.745628877608573, + "grad_norm": 1.290663719177246, + "learning_rate": 2.1274675690919347e-05, + "loss": 0.7723, + "step": 10187 + }, + { + "epoch": 5.746192893401015, + "grad_norm": 1.381800651550293, + "learning_rate": 2.1271855611957136e-05, + "loss": 0.7928, + "step": 10188 + }, + { + "epoch": 5.746756909193458, + "grad_norm": 1.145526647567749, + "learning_rate": 2.1269035532994925e-05, + "loss": 0.6329, + "step": 10189 + }, + { + "epoch": 5.7473209249859, + "grad_norm": 1.0588253736495972, + "learning_rate": 2.1266215454032713e-05, + "loss": 0.727, + "step": 10190 + }, + { + "epoch": 5.7478849407783414, + "grad_norm": 1.2106249332427979, + "learning_rate": 2.1263395375070502e-05, + "loss": 0.7529, + "step": 10191 + }, + { + "epoch": 5.748448956570784, + "grad_norm": 1.090151309967041, + "learning_rate": 2.126057529610829e-05, + "loss": 0.7137, + "step": 10192 + }, + { + "epoch": 5.749012972363226, + "grad_norm": 1.0469672679901123, + "learning_rate": 2.125775521714608e-05, + "loss": 0.6692, + "step": 10193 + }, + { + "epoch": 5.749576988155669, + "grad_norm": 0.9871541857719421, + "learning_rate": 2.1254935138183872e-05, + "loss": 0.7274, + "step": 10194 + }, + { + "epoch": 5.7501410039481105, + "grad_norm": 1.1039490699768066, + "learning_rate": 2.1252115059221657e-05, + "loss": 0.6219, + "step": 10195 + }, + { + "epoch": 5.750705019740552, + "grad_norm": 0.9409885406494141, + "learning_rate": 2.124929498025945e-05, + "loss": 0.6952, + "step": 10196 + }, + { + "epoch": 5.751269035532995, + "grad_norm": 0.8965805172920227, + "learning_rate": 2.1246474901297235e-05, + "loss": 0.6653, + "step": 10197 + }, + { + "epoch": 5.751833051325437, + "grad_norm": 1.0808042287826538, + "learning_rate": 2.1243654822335027e-05, + "loss": 0.7475, + "step": 10198 + }, + { + "epoch": 5.7523970671178795, + "grad_norm": 1.007414698600769, + "learning_rate": 2.1240834743372816e-05, + "loss": 0.7343, + "step": 10199 + }, + { + "epoch": 5.752961082910321, + "grad_norm": 0.9440162777900696, + "learning_rate": 2.1238014664410605e-05, + "loss": 0.6921, + "step": 10200 + }, + { + "epoch": 5.753525098702764, + "grad_norm": 0.8880353569984436, + "learning_rate": 2.1235194585448394e-05, + "loss": 0.6931, + "step": 10201 + }, + { + "epoch": 5.754089114495206, + "grad_norm": 1.1339460611343384, + "learning_rate": 2.1232374506486182e-05, + "loss": 0.7365, + "step": 10202 + }, + { + "epoch": 5.7546531302876485, + "grad_norm": 1.089513897895813, + "learning_rate": 2.122955442752397e-05, + "loss": 0.7477, + "step": 10203 + }, + { + "epoch": 5.75521714608009, + "grad_norm": 1.423825979232788, + "learning_rate": 2.122673434856176e-05, + "loss": 0.6887, + "step": 10204 + }, + { + "epoch": 5.755781161872532, + "grad_norm": 1.9010660648345947, + "learning_rate": 2.122391426959955e-05, + "loss": 0.8161, + "step": 10205 + }, + { + "epoch": 5.756345177664975, + "grad_norm": 1.1020176410675049, + "learning_rate": 2.1221094190637338e-05, + "loss": 0.7007, + "step": 10206 + }, + { + "epoch": 5.756909193457417, + "grad_norm": 1.0266975164413452, + "learning_rate": 2.1218274111675126e-05, + "loss": 0.7621, + "step": 10207 + }, + { + "epoch": 5.757473209249859, + "grad_norm": 1.8582398891448975, + "learning_rate": 2.121545403271292e-05, + "loss": 0.9308, + "step": 10208 + }, + { + "epoch": 5.758037225042301, + "grad_norm": 1.0628793239593506, + "learning_rate": 2.1212633953750707e-05, + "loss": 0.6811, + "step": 10209 + }, + { + "epoch": 5.758601240834743, + "grad_norm": 1.206849455833435, + "learning_rate": 2.1209813874788496e-05, + "loss": 0.5918, + "step": 10210 + }, + { + "epoch": 5.759165256627186, + "grad_norm": 0.9601861238479614, + "learning_rate": 2.1206993795826285e-05, + "loss": 0.6912, + "step": 10211 + }, + { + "epoch": 5.759729272419627, + "grad_norm": 1.138798713684082, + "learning_rate": 2.1204173716864074e-05, + "loss": 0.8178, + "step": 10212 + }, + { + "epoch": 5.76029328821207, + "grad_norm": 1.2529747486114502, + "learning_rate": 2.1201353637901863e-05, + "loss": 0.8504, + "step": 10213 + }, + { + "epoch": 5.760857304004512, + "grad_norm": 1.4443683624267578, + "learning_rate": 2.119853355893965e-05, + "loss": 0.8558, + "step": 10214 + }, + { + "epoch": 5.761421319796955, + "grad_norm": 1.1017298698425293, + "learning_rate": 2.119571347997744e-05, + "loss": 0.7517, + "step": 10215 + }, + { + "epoch": 5.761985335589396, + "grad_norm": 1.7246551513671875, + "learning_rate": 2.119289340101523e-05, + "loss": 0.7048, + "step": 10216 + }, + { + "epoch": 5.762549351381839, + "grad_norm": 0.9363606572151184, + "learning_rate": 2.119007332205302e-05, + "loss": 0.7052, + "step": 10217 + }, + { + "epoch": 5.763113367174281, + "grad_norm": 1.1212750673294067, + "learning_rate": 2.1187253243090806e-05, + "loss": 0.8208, + "step": 10218 + }, + { + "epoch": 5.763677382966723, + "grad_norm": 2.0319087505340576, + "learning_rate": 2.11844331641286e-05, + "loss": 1.0264, + "step": 10219 + }, + { + "epoch": 5.764241398759165, + "grad_norm": 1.6677451133728027, + "learning_rate": 2.1181613085166384e-05, + "loss": 0.8042, + "step": 10220 + }, + { + "epoch": 5.764805414551607, + "grad_norm": 1.2921507358551025, + "learning_rate": 2.1178793006204176e-05, + "loss": 0.7623, + "step": 10221 + }, + { + "epoch": 5.76536943034405, + "grad_norm": 1.1610019207000732, + "learning_rate": 2.117597292724196e-05, + "loss": 0.6977, + "step": 10222 + }, + { + "epoch": 5.765933446136492, + "grad_norm": 1.1931170225143433, + "learning_rate": 2.1173152848279754e-05, + "loss": 0.8178, + "step": 10223 + }, + { + "epoch": 5.7664974619289335, + "grad_norm": 1.0321472883224487, + "learning_rate": 2.1170332769317543e-05, + "loss": 0.7641, + "step": 10224 + }, + { + "epoch": 5.767061477721376, + "grad_norm": 0.9731965065002441, + "learning_rate": 2.116751269035533e-05, + "loss": 0.7627, + "step": 10225 + }, + { + "epoch": 5.767625493513818, + "grad_norm": 1.1467396020889282, + "learning_rate": 2.116469261139312e-05, + "loss": 0.796, + "step": 10226 + }, + { + "epoch": 5.768189509306261, + "grad_norm": 1.1029530763626099, + "learning_rate": 2.116187253243091e-05, + "loss": 0.796, + "step": 10227 + }, + { + "epoch": 5.7687535250987025, + "grad_norm": 1.0953319072723389, + "learning_rate": 2.1159052453468698e-05, + "loss": 0.878, + "step": 10228 + }, + { + "epoch": 5.769317540891145, + "grad_norm": 1.0875605344772339, + "learning_rate": 2.1156232374506487e-05, + "loss": 0.6999, + "step": 10229 + }, + { + "epoch": 5.769881556683587, + "grad_norm": 1.4263155460357666, + "learning_rate": 2.1153412295544275e-05, + "loss": 0.6923, + "step": 10230 + }, + { + "epoch": 5.77044557247603, + "grad_norm": 1.065405011177063, + "learning_rate": 2.1150592216582064e-05, + "loss": 0.7674, + "step": 10231 + }, + { + "epoch": 5.7710095882684715, + "grad_norm": 1.0387883186340332, + "learning_rate": 2.1147772137619853e-05, + "loss": 0.7074, + "step": 10232 + }, + { + "epoch": 5.771573604060913, + "grad_norm": 1.385266900062561, + "learning_rate": 2.1144952058657645e-05, + "loss": 0.7963, + "step": 10233 + }, + { + "epoch": 5.772137619853356, + "grad_norm": 0.9128821492195129, + "learning_rate": 2.114213197969543e-05, + "loss": 0.7285, + "step": 10234 + }, + { + "epoch": 5.772701635645798, + "grad_norm": 1.2418804168701172, + "learning_rate": 2.1139311900733223e-05, + "loss": 0.803, + "step": 10235 + }, + { + "epoch": 5.7732656514382406, + "grad_norm": 1.6001256704330444, + "learning_rate": 2.1136491821771008e-05, + "loss": 0.712, + "step": 10236 + }, + { + "epoch": 5.773829667230682, + "grad_norm": 1.171077013015747, + "learning_rate": 2.11336717428088e-05, + "loss": 0.6723, + "step": 10237 + }, + { + "epoch": 5.774393683023124, + "grad_norm": 1.235867977142334, + "learning_rate": 2.113085166384659e-05, + "loss": 0.7704, + "step": 10238 + }, + { + "epoch": 5.774957698815567, + "grad_norm": 0.9975724816322327, + "learning_rate": 2.1128031584884378e-05, + "loss": 0.7886, + "step": 10239 + }, + { + "epoch": 5.775521714608009, + "grad_norm": 1.358473777770996, + "learning_rate": 2.1125211505922167e-05, + "loss": 0.7724, + "step": 10240 + }, + { + "epoch": 5.776085730400451, + "grad_norm": 1.0582778453826904, + "learning_rate": 2.1122391426959956e-05, + "loss": 0.6609, + "step": 10241 + }, + { + "epoch": 5.776649746192893, + "grad_norm": 0.9899654984474182, + "learning_rate": 2.1119571347997744e-05, + "loss": 0.694, + "step": 10242 + }, + { + "epoch": 5.777213761985336, + "grad_norm": 1.2658939361572266, + "learning_rate": 2.1116751269035533e-05, + "loss": 0.774, + "step": 10243 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 1.2750625610351562, + "learning_rate": 2.1113931190073325e-05, + "loss": 0.6677, + "step": 10244 + }, + { + "epoch": 5.77834179357022, + "grad_norm": 1.1259403228759766, + "learning_rate": 2.111111111111111e-05, + "loss": 0.7087, + "step": 10245 + }, + { + "epoch": 5.778905809362662, + "grad_norm": 1.2010682821273804, + "learning_rate": 2.1108291032148903e-05, + "loss": 0.8151, + "step": 10246 + }, + { + "epoch": 5.779469825155104, + "grad_norm": 1.321757197380066, + "learning_rate": 2.1105470953186692e-05, + "loss": 0.7219, + "step": 10247 + }, + { + "epoch": 5.780033840947547, + "grad_norm": 1.0147128105163574, + "learning_rate": 2.110265087422448e-05, + "loss": 0.7658, + "step": 10248 + }, + { + "epoch": 5.7805978567399885, + "grad_norm": 0.9388353228569031, + "learning_rate": 2.109983079526227e-05, + "loss": 0.7501, + "step": 10249 + }, + { + "epoch": 5.781161872532431, + "grad_norm": 1.0102256536483765, + "learning_rate": 2.1097010716300058e-05, + "loss": 0.7546, + "step": 10250 + }, + { + "epoch": 5.781725888324873, + "grad_norm": 0.9531445503234863, + "learning_rate": 2.1094190637337847e-05, + "loss": 0.7643, + "step": 10251 + }, + { + "epoch": 5.782289904117315, + "grad_norm": 0.9433715343475342, + "learning_rate": 2.1091370558375636e-05, + "loss": 0.7139, + "step": 10252 + }, + { + "epoch": 5.7828539199097575, + "grad_norm": 1.2810571193695068, + "learning_rate": 2.1088550479413425e-05, + "loss": 0.6923, + "step": 10253 + }, + { + "epoch": 5.783417935702199, + "grad_norm": 0.9733343124389648, + "learning_rate": 2.1085730400451213e-05, + "loss": 0.7312, + "step": 10254 + }, + { + "epoch": 5.783981951494642, + "grad_norm": 1.1100729703903198, + "learning_rate": 2.1082910321489002e-05, + "loss": 0.6724, + "step": 10255 + }, + { + "epoch": 5.784545967287084, + "grad_norm": 1.1616413593292236, + "learning_rate": 2.1080090242526794e-05, + "loss": 0.7383, + "step": 10256 + }, + { + "epoch": 5.7851099830795265, + "grad_norm": 1.167474389076233, + "learning_rate": 2.107727016356458e-05, + "loss": 0.7871, + "step": 10257 + }, + { + "epoch": 5.785673998871968, + "grad_norm": 1.5610407590866089, + "learning_rate": 2.1074450084602372e-05, + "loss": 0.8356, + "step": 10258 + }, + { + "epoch": 5.786238014664411, + "grad_norm": 1.103281021118164, + "learning_rate": 2.1071630005640157e-05, + "loss": 0.7424, + "step": 10259 + }, + { + "epoch": 5.786802030456853, + "grad_norm": 1.0812422037124634, + "learning_rate": 2.106880992667795e-05, + "loss": 0.673, + "step": 10260 + }, + { + "epoch": 5.787366046249295, + "grad_norm": 1.0344552993774414, + "learning_rate": 2.1065989847715735e-05, + "loss": 0.6648, + "step": 10261 + }, + { + "epoch": 5.787930062041737, + "grad_norm": 0.9565141797065735, + "learning_rate": 2.1063169768753527e-05, + "loss": 0.6959, + "step": 10262 + }, + { + "epoch": 5.788494077834179, + "grad_norm": 1.2461594343185425, + "learning_rate": 2.1060349689791316e-05, + "loss": 0.7461, + "step": 10263 + }, + { + "epoch": 5.789058093626622, + "grad_norm": 1.0776418447494507, + "learning_rate": 2.1057529610829105e-05, + "loss": 0.677, + "step": 10264 + }, + { + "epoch": 5.789622109419064, + "grad_norm": 1.007263422012329, + "learning_rate": 2.1054709531866893e-05, + "loss": 0.7126, + "step": 10265 + }, + { + "epoch": 5.790186125211505, + "grad_norm": 1.162896990776062, + "learning_rate": 2.1051889452904682e-05, + "loss": 0.6183, + "step": 10266 + }, + { + "epoch": 5.790750141003948, + "grad_norm": 0.9124925136566162, + "learning_rate": 2.104906937394247e-05, + "loss": 0.6038, + "step": 10267 + }, + { + "epoch": 5.79131415679639, + "grad_norm": 1.234656810760498, + "learning_rate": 2.104624929498026e-05, + "loss": 0.7248, + "step": 10268 + }, + { + "epoch": 5.791878172588833, + "grad_norm": 0.9380079507827759, + "learning_rate": 2.104342921601805e-05, + "loss": 0.737, + "step": 10269 + }, + { + "epoch": 5.7924421883812744, + "grad_norm": 1.1708694696426392, + "learning_rate": 2.1040609137055837e-05, + "loss": 0.735, + "step": 10270 + }, + { + "epoch": 5.793006204173717, + "grad_norm": 1.033779263496399, + "learning_rate": 2.1037789058093626e-05, + "loss": 0.7641, + "step": 10271 + }, + { + "epoch": 5.793570219966159, + "grad_norm": 1.1083154678344727, + "learning_rate": 2.103496897913142e-05, + "loss": 0.7747, + "step": 10272 + }, + { + "epoch": 5.794134235758602, + "grad_norm": 1.834172248840332, + "learning_rate": 2.1032148900169204e-05, + "loss": 0.8614, + "step": 10273 + }, + { + "epoch": 5.7946982515510435, + "grad_norm": 0.9342424869537354, + "learning_rate": 2.1029328821206996e-05, + "loss": 0.6446, + "step": 10274 + }, + { + "epoch": 5.795262267343485, + "grad_norm": 1.0591199398040771, + "learning_rate": 2.102650874224478e-05, + "loss": 0.7352, + "step": 10275 + }, + { + "epoch": 5.795826283135928, + "grad_norm": 1.1879124641418457, + "learning_rate": 2.1023688663282574e-05, + "loss": 0.8812, + "step": 10276 + }, + { + "epoch": 5.79639029892837, + "grad_norm": 1.2235982418060303, + "learning_rate": 2.102086858432036e-05, + "loss": 0.7911, + "step": 10277 + }, + { + "epoch": 5.7969543147208125, + "grad_norm": 1.2499216794967651, + "learning_rate": 2.101804850535815e-05, + "loss": 0.6736, + "step": 10278 + }, + { + "epoch": 5.797518330513254, + "grad_norm": 1.7421183586120605, + "learning_rate": 2.101522842639594e-05, + "loss": 0.8351, + "step": 10279 + }, + { + "epoch": 5.798082346305696, + "grad_norm": 1.5987293720245361, + "learning_rate": 2.101240834743373e-05, + "loss": 0.8357, + "step": 10280 + }, + { + "epoch": 5.798646362098139, + "grad_norm": 0.960813045501709, + "learning_rate": 2.100958826847152e-05, + "loss": 0.6771, + "step": 10281 + }, + { + "epoch": 5.799210377890581, + "grad_norm": 1.1440958976745605, + "learning_rate": 2.1006768189509306e-05, + "loss": 0.7312, + "step": 10282 + }, + { + "epoch": 5.799774393683023, + "grad_norm": 1.4759774208068848, + "learning_rate": 2.10039481105471e-05, + "loss": 0.7716, + "step": 10283 + }, + { + "epoch": 5.800338409475465, + "grad_norm": 1.188049077987671, + "learning_rate": 2.1001128031584884e-05, + "loss": 0.774, + "step": 10284 + }, + { + "epoch": 5.800902425267908, + "grad_norm": 1.0036290884017944, + "learning_rate": 2.0998307952622676e-05, + "loss": 0.7341, + "step": 10285 + }, + { + "epoch": 5.80146644106035, + "grad_norm": 1.0201674699783325, + "learning_rate": 2.099548787366046e-05, + "loss": 0.7721, + "step": 10286 + }, + { + "epoch": 5.802030456852792, + "grad_norm": 1.2132132053375244, + "learning_rate": 2.0992667794698254e-05, + "loss": 0.6894, + "step": 10287 + }, + { + "epoch": 5.802594472645234, + "grad_norm": 1.1989458799362183, + "learning_rate": 2.0989847715736043e-05, + "loss": 0.6904, + "step": 10288 + }, + { + "epoch": 5.803158488437676, + "grad_norm": 1.0240466594696045, + "learning_rate": 2.098702763677383e-05, + "loss": 0.6834, + "step": 10289 + }, + { + "epoch": 5.803722504230119, + "grad_norm": 0.8640313744544983, + "learning_rate": 2.098420755781162e-05, + "loss": 0.6515, + "step": 10290 + }, + { + "epoch": 5.80428652002256, + "grad_norm": 0.8529193997383118, + "learning_rate": 2.098138747884941e-05, + "loss": 0.7352, + "step": 10291 + }, + { + "epoch": 5.804850535815003, + "grad_norm": 1.217355728149414, + "learning_rate": 2.0978567399887198e-05, + "loss": 0.7509, + "step": 10292 + }, + { + "epoch": 5.805414551607445, + "grad_norm": 1.2400124073028564, + "learning_rate": 2.0975747320924987e-05, + "loss": 0.8495, + "step": 10293 + }, + { + "epoch": 5.805978567399887, + "grad_norm": 1.3673454523086548, + "learning_rate": 2.0972927241962775e-05, + "loss": 0.8139, + "step": 10294 + }, + { + "epoch": 5.806542583192329, + "grad_norm": 1.3814032077789307, + "learning_rate": 2.0970107163000564e-05, + "loss": 0.778, + "step": 10295 + }, + { + "epoch": 5.807106598984771, + "grad_norm": 1.6247446537017822, + "learning_rate": 2.0967287084038353e-05, + "loss": 0.8629, + "step": 10296 + }, + { + "epoch": 5.807670614777214, + "grad_norm": 0.817731499671936, + "learning_rate": 2.0964467005076145e-05, + "loss": 0.6521, + "step": 10297 + }, + { + "epoch": 5.808234630569656, + "grad_norm": 1.4986772537231445, + "learning_rate": 2.096164692611393e-05, + "loss": 0.7651, + "step": 10298 + }, + { + "epoch": 5.808798646362098, + "grad_norm": 1.1566500663757324, + "learning_rate": 2.0958826847151723e-05, + "loss": 0.8188, + "step": 10299 + }, + { + "epoch": 5.80936266215454, + "grad_norm": 1.309269905090332, + "learning_rate": 2.0956006768189508e-05, + "loss": 0.7543, + "step": 10300 + }, + { + "epoch": 5.809926677946983, + "grad_norm": 1.159575343132019, + "learning_rate": 2.09531866892273e-05, + "loss": 0.7404, + "step": 10301 + }, + { + "epoch": 5.810490693739425, + "grad_norm": 1.5213462114334106, + "learning_rate": 2.095036661026509e-05, + "loss": 0.738, + "step": 10302 + }, + { + "epoch": 5.8110547095318665, + "grad_norm": 1.516301155090332, + "learning_rate": 2.0947546531302878e-05, + "loss": 0.6881, + "step": 10303 + }, + { + "epoch": 5.811618725324309, + "grad_norm": 0.9928414821624756, + "learning_rate": 2.0944726452340667e-05, + "loss": 0.7605, + "step": 10304 + }, + { + "epoch": 5.812182741116751, + "grad_norm": 1.3591554164886475, + "learning_rate": 2.0941906373378456e-05, + "loss": 0.74, + "step": 10305 + }, + { + "epoch": 5.812746756909194, + "grad_norm": 1.2468572854995728, + "learning_rate": 2.0939086294416244e-05, + "loss": 0.8501, + "step": 10306 + }, + { + "epoch": 5.8133107727016355, + "grad_norm": 1.336046576499939, + "learning_rate": 2.0936266215454033e-05, + "loss": 0.6981, + "step": 10307 + }, + { + "epoch": 5.813874788494077, + "grad_norm": 1.2819615602493286, + "learning_rate": 2.0933446136491822e-05, + "loss": 0.7373, + "step": 10308 + }, + { + "epoch": 5.81443880428652, + "grad_norm": 1.087969183921814, + "learning_rate": 2.093062605752961e-05, + "loss": 0.781, + "step": 10309 + }, + { + "epoch": 5.815002820078962, + "grad_norm": 1.433969497680664, + "learning_rate": 2.09278059785674e-05, + "loss": 0.7671, + "step": 10310 + }, + { + "epoch": 5.8155668358714045, + "grad_norm": 0.9809502363204956, + "learning_rate": 2.092498589960519e-05, + "loss": 0.727, + "step": 10311 + }, + { + "epoch": 5.816130851663846, + "grad_norm": 1.379939079284668, + "learning_rate": 2.0922165820642977e-05, + "loss": 0.6946, + "step": 10312 + }, + { + "epoch": 5.816694867456289, + "grad_norm": 0.9492905139923096, + "learning_rate": 2.091934574168077e-05, + "loss": 0.747, + "step": 10313 + }, + { + "epoch": 5.817258883248731, + "grad_norm": 1.004873275756836, + "learning_rate": 2.0916525662718555e-05, + "loss": 0.7743, + "step": 10314 + }, + { + "epoch": 5.817822899041174, + "grad_norm": 1.0002844333648682, + "learning_rate": 2.0913705583756347e-05, + "loss": 0.6471, + "step": 10315 + }, + { + "epoch": 5.818386914833615, + "grad_norm": 1.1237188577651978, + "learning_rate": 2.0910885504794136e-05, + "loss": 0.7607, + "step": 10316 + }, + { + "epoch": 5.818950930626057, + "grad_norm": 1.1203968524932861, + "learning_rate": 2.0908065425831924e-05, + "loss": 0.6483, + "step": 10317 + }, + { + "epoch": 5.8195149464185, + "grad_norm": 1.4737471342086792, + "learning_rate": 2.0905245346869713e-05, + "loss": 0.814, + "step": 10318 + }, + { + "epoch": 5.820078962210942, + "grad_norm": 1.2297064065933228, + "learning_rate": 2.0902425267907502e-05, + "loss": 0.7754, + "step": 10319 + }, + { + "epoch": 5.820642978003384, + "grad_norm": 0.843859851360321, + "learning_rate": 2.0899605188945294e-05, + "loss": 0.684, + "step": 10320 + }, + { + "epoch": 5.821206993795826, + "grad_norm": 1.0956752300262451, + "learning_rate": 2.089678510998308e-05, + "loss": 0.7576, + "step": 10321 + }, + { + "epoch": 5.821771009588268, + "grad_norm": 1.2277166843414307, + "learning_rate": 2.0893965031020872e-05, + "loss": 0.7208, + "step": 10322 + }, + { + "epoch": 5.822335025380711, + "grad_norm": 1.2124803066253662, + "learning_rate": 2.0891144952058657e-05, + "loss": 0.6628, + "step": 10323 + }, + { + "epoch": 5.8228990411731525, + "grad_norm": 1.1394404172897339, + "learning_rate": 2.088832487309645e-05, + "loss": 0.821, + "step": 10324 + }, + { + "epoch": 5.823463056965595, + "grad_norm": 1.4209551811218262, + "learning_rate": 2.0885504794134235e-05, + "loss": 0.8015, + "step": 10325 + }, + { + "epoch": 5.824027072758037, + "grad_norm": 1.197966456413269, + "learning_rate": 2.0882684715172027e-05, + "loss": 0.745, + "step": 10326 + }, + { + "epoch": 5.82459108855048, + "grad_norm": 1.2219656705856323, + "learning_rate": 2.0879864636209816e-05, + "loss": 0.7302, + "step": 10327 + }, + { + "epoch": 5.8251551043429215, + "grad_norm": 1.265270471572876, + "learning_rate": 2.0877044557247605e-05, + "loss": 0.7946, + "step": 10328 + }, + { + "epoch": 5.825719120135364, + "grad_norm": 1.103991150856018, + "learning_rate": 2.0874224478285393e-05, + "loss": 0.7247, + "step": 10329 + }, + { + "epoch": 5.826283135927806, + "grad_norm": 1.1955243349075317, + "learning_rate": 2.0871404399323182e-05, + "loss": 0.7288, + "step": 10330 + }, + { + "epoch": 5.826847151720248, + "grad_norm": 1.4361165761947632, + "learning_rate": 2.086858432036097e-05, + "loss": 0.7497, + "step": 10331 + }, + { + "epoch": 5.8274111675126905, + "grad_norm": 1.4955273866653442, + "learning_rate": 2.086576424139876e-05, + "loss": 0.7591, + "step": 10332 + }, + { + "epoch": 5.827975183305132, + "grad_norm": 0.9304388761520386, + "learning_rate": 2.086294416243655e-05, + "loss": 0.7836, + "step": 10333 + }, + { + "epoch": 5.828539199097575, + "grad_norm": 1.4114372730255127, + "learning_rate": 2.0860124083474337e-05, + "loss": 0.88, + "step": 10334 + }, + { + "epoch": 5.829103214890017, + "grad_norm": 1.5988013744354248, + "learning_rate": 2.0857304004512126e-05, + "loss": 0.7487, + "step": 10335 + }, + { + "epoch": 5.829667230682459, + "grad_norm": 1.3426518440246582, + "learning_rate": 2.085448392554992e-05, + "loss": 0.7558, + "step": 10336 + }, + { + "epoch": 5.830231246474901, + "grad_norm": 1.4420315027236938, + "learning_rate": 2.0851663846587704e-05, + "loss": 0.835, + "step": 10337 + }, + { + "epoch": 5.830795262267343, + "grad_norm": 1.2440743446350098, + "learning_rate": 2.0848843767625496e-05, + "loss": 0.7852, + "step": 10338 + }, + { + "epoch": 5.831359278059786, + "grad_norm": 1.4373687505722046, + "learning_rate": 2.084602368866328e-05, + "loss": 0.7944, + "step": 10339 + }, + { + "epoch": 5.831923293852228, + "grad_norm": 0.8314909934997559, + "learning_rate": 2.0843203609701074e-05, + "loss": 0.7386, + "step": 10340 + }, + { + "epoch": 5.83248730964467, + "grad_norm": 1.0685604810714722, + "learning_rate": 2.084038353073886e-05, + "loss": 0.763, + "step": 10341 + }, + { + "epoch": 5.833051325437112, + "grad_norm": 1.1479110717773438, + "learning_rate": 2.083756345177665e-05, + "loss": 0.6811, + "step": 10342 + }, + { + "epoch": 5.833615341229555, + "grad_norm": 0.9862860441207886, + "learning_rate": 2.083474337281444e-05, + "loss": 0.7976, + "step": 10343 + }, + { + "epoch": 5.834179357021997, + "grad_norm": 0.9150574207305908, + "learning_rate": 2.083192329385223e-05, + "loss": 0.7039, + "step": 10344 + }, + { + "epoch": 5.8347433728144384, + "grad_norm": 1.6050617694854736, + "learning_rate": 2.0829103214890018e-05, + "loss": 0.8968, + "step": 10345 + }, + { + "epoch": 5.835307388606881, + "grad_norm": 1.4003524780273438, + "learning_rate": 2.0826283135927806e-05, + "loss": 0.657, + "step": 10346 + }, + { + "epoch": 5.835871404399323, + "grad_norm": 0.9667115211486816, + "learning_rate": 2.0823463056965595e-05, + "loss": 0.6095, + "step": 10347 + }, + { + "epoch": 5.836435420191766, + "grad_norm": 1.1238205432891846, + "learning_rate": 2.0820642978003384e-05, + "loss": 0.7126, + "step": 10348 + }, + { + "epoch": 5.8369994359842075, + "grad_norm": 1.083050012588501, + "learning_rate": 2.0817822899041173e-05, + "loss": 0.7761, + "step": 10349 + }, + { + "epoch": 5.837563451776649, + "grad_norm": 1.2461789846420288, + "learning_rate": 2.081500282007896e-05, + "loss": 0.7698, + "step": 10350 + }, + { + "epoch": 5.838127467569092, + "grad_norm": 1.2079684734344482, + "learning_rate": 2.0812182741116754e-05, + "loss": 0.7851, + "step": 10351 + }, + { + "epoch": 5.838691483361534, + "grad_norm": 1.287621259689331, + "learning_rate": 2.0809362662154542e-05, + "loss": 0.8497, + "step": 10352 + }, + { + "epoch": 5.8392554991539765, + "grad_norm": 1.3259046077728271, + "learning_rate": 2.080654258319233e-05, + "loss": 0.6982, + "step": 10353 + }, + { + "epoch": 5.839819514946418, + "grad_norm": 1.3736721277236938, + "learning_rate": 2.080372250423012e-05, + "loss": 0.7077, + "step": 10354 + }, + { + "epoch": 5.840383530738861, + "grad_norm": 1.3021469116210938, + "learning_rate": 2.080090242526791e-05, + "loss": 0.7595, + "step": 10355 + }, + { + "epoch": 5.840947546531303, + "grad_norm": 0.8184933662414551, + "learning_rate": 2.0798082346305698e-05, + "loss": 0.6884, + "step": 10356 + }, + { + "epoch": 5.8415115623237455, + "grad_norm": 1.3110703229904175, + "learning_rate": 2.0795262267343486e-05, + "loss": 0.7385, + "step": 10357 + }, + { + "epoch": 5.842075578116187, + "grad_norm": 1.3470929861068726, + "learning_rate": 2.0792442188381275e-05, + "loss": 0.722, + "step": 10358 + }, + { + "epoch": 5.842639593908629, + "grad_norm": 1.1561768054962158, + "learning_rate": 2.0789622109419064e-05, + "loss": 0.8458, + "step": 10359 + }, + { + "epoch": 5.843203609701072, + "grad_norm": 1.1118109226226807, + "learning_rate": 2.0786802030456853e-05, + "loss": 0.6735, + "step": 10360 + }, + { + "epoch": 5.843767625493514, + "grad_norm": 2.071638345718384, + "learning_rate": 2.0783981951494645e-05, + "loss": 0.8355, + "step": 10361 + }, + { + "epoch": 5.844331641285956, + "grad_norm": 1.4723135232925415, + "learning_rate": 2.078116187253243e-05, + "loss": 0.7672, + "step": 10362 + }, + { + "epoch": 5.844895657078398, + "grad_norm": 1.3828150033950806, + "learning_rate": 2.0778341793570223e-05, + "loss": 0.7647, + "step": 10363 + }, + { + "epoch": 5.84545967287084, + "grad_norm": 1.4076138734817505, + "learning_rate": 2.0775521714608008e-05, + "loss": 0.8747, + "step": 10364 + }, + { + "epoch": 5.846023688663283, + "grad_norm": 1.2101635932922363, + "learning_rate": 2.07727016356458e-05, + "loss": 0.6866, + "step": 10365 + }, + { + "epoch": 5.846587704455724, + "grad_norm": 1.170800805091858, + "learning_rate": 2.076988155668359e-05, + "loss": 0.7075, + "step": 10366 + }, + { + "epoch": 5.847151720248167, + "grad_norm": 1.352901577949524, + "learning_rate": 2.0767061477721378e-05, + "loss": 0.7153, + "step": 10367 + }, + { + "epoch": 5.847715736040609, + "grad_norm": 1.0514737367630005, + "learning_rate": 2.0764241398759167e-05, + "loss": 0.6142, + "step": 10368 + }, + { + "epoch": 5.848279751833052, + "grad_norm": 0.921158492565155, + "learning_rate": 2.0761421319796955e-05, + "loss": 0.6669, + "step": 10369 + }, + { + "epoch": 5.848843767625493, + "grad_norm": 0.8853246569633484, + "learning_rate": 2.0758601240834744e-05, + "loss": 0.6832, + "step": 10370 + }, + { + "epoch": 5.849407783417936, + "grad_norm": 1.0198323726654053, + "learning_rate": 2.0755781161872533e-05, + "loss": 0.7304, + "step": 10371 + }, + { + "epoch": 5.849971799210378, + "grad_norm": 1.1157255172729492, + "learning_rate": 2.0752961082910322e-05, + "loss": 0.9053, + "step": 10372 + }, + { + "epoch": 5.85053581500282, + "grad_norm": 1.0675629377365112, + "learning_rate": 2.075014100394811e-05, + "loss": 0.9367, + "step": 10373 + }, + { + "epoch": 5.851099830795262, + "grad_norm": 1.1604666709899902, + "learning_rate": 2.07473209249859e-05, + "loss": 0.7283, + "step": 10374 + }, + { + "epoch": 5.851663846587704, + "grad_norm": 1.0999799966812134, + "learning_rate": 2.074450084602369e-05, + "loss": 0.6606, + "step": 10375 + }, + { + "epoch": 5.852227862380147, + "grad_norm": 1.5305136442184448, + "learning_rate": 2.0741680767061477e-05, + "loss": 0.8134, + "step": 10376 + }, + { + "epoch": 5.852791878172589, + "grad_norm": 1.326855182647705, + "learning_rate": 2.073886068809927e-05, + "loss": 0.77, + "step": 10377 + }, + { + "epoch": 5.8533558939650305, + "grad_norm": 1.0859252214431763, + "learning_rate": 2.0736040609137055e-05, + "loss": 0.7626, + "step": 10378 + }, + { + "epoch": 5.853919909757473, + "grad_norm": 1.0459983348846436, + "learning_rate": 2.0733220530174847e-05, + "loss": 0.7318, + "step": 10379 + }, + { + "epoch": 5.854483925549915, + "grad_norm": 0.880793571472168, + "learning_rate": 2.0730400451212632e-05, + "loss": 0.6845, + "step": 10380 + }, + { + "epoch": 5.855047941342358, + "grad_norm": 1.1051052808761597, + "learning_rate": 2.0727580372250424e-05, + "loss": 0.7523, + "step": 10381 + }, + { + "epoch": 5.8556119571347995, + "grad_norm": 1.2490922212600708, + "learning_rate": 2.0724760293288213e-05, + "loss": 0.8094, + "step": 10382 + }, + { + "epoch": 5.856175972927242, + "grad_norm": 1.1541666984558105, + "learning_rate": 2.0721940214326002e-05, + "loss": 0.7307, + "step": 10383 + }, + { + "epoch": 5.856739988719684, + "grad_norm": 0.8535008430480957, + "learning_rate": 2.071912013536379e-05, + "loss": 0.7387, + "step": 10384 + }, + { + "epoch": 5.857304004512127, + "grad_norm": 1.1199300289154053, + "learning_rate": 2.071630005640158e-05, + "loss": 0.8319, + "step": 10385 + }, + { + "epoch": 5.8578680203045685, + "grad_norm": 0.9047502875328064, + "learning_rate": 2.0713479977439372e-05, + "loss": 0.6995, + "step": 10386 + }, + { + "epoch": 5.85843203609701, + "grad_norm": 1.1925071477890015, + "learning_rate": 2.0710659898477157e-05, + "loss": 0.7544, + "step": 10387 + }, + { + "epoch": 5.858996051889453, + "grad_norm": 1.0934990644454956, + "learning_rate": 2.070783981951495e-05, + "loss": 0.7815, + "step": 10388 + }, + { + "epoch": 5.859560067681895, + "grad_norm": 1.0696672201156616, + "learning_rate": 2.0705019740552735e-05, + "loss": 0.7748, + "step": 10389 + }, + { + "epoch": 5.8601240834743376, + "grad_norm": 1.2181237936019897, + "learning_rate": 2.0702199661590527e-05, + "loss": 0.7233, + "step": 10390 + }, + { + "epoch": 5.860688099266779, + "grad_norm": 1.0062673091888428, + "learning_rate": 2.0699379582628316e-05, + "loss": 0.7572, + "step": 10391 + }, + { + "epoch": 5.861252115059221, + "grad_norm": 0.9895596504211426, + "learning_rate": 2.0696559503666105e-05, + "loss": 0.785, + "step": 10392 + }, + { + "epoch": 5.861816130851664, + "grad_norm": 1.454849362373352, + "learning_rate": 2.0693739424703893e-05, + "loss": 0.6816, + "step": 10393 + }, + { + "epoch": 5.862380146644106, + "grad_norm": 1.5346319675445557, + "learning_rate": 2.0690919345741682e-05, + "loss": 0.7909, + "step": 10394 + }, + { + "epoch": 5.862944162436548, + "grad_norm": 1.0346888303756714, + "learning_rate": 2.068809926677947e-05, + "loss": 0.7726, + "step": 10395 + }, + { + "epoch": 5.86350817822899, + "grad_norm": 0.9773055911064148, + "learning_rate": 2.068527918781726e-05, + "loss": 0.6851, + "step": 10396 + }, + { + "epoch": 5.864072194021433, + "grad_norm": 1.5738978385925293, + "learning_rate": 2.068245910885505e-05, + "loss": 0.8667, + "step": 10397 + }, + { + "epoch": 5.864636209813875, + "grad_norm": 1.360961675643921, + "learning_rate": 2.0679639029892837e-05, + "loss": 0.7105, + "step": 10398 + }, + { + "epoch": 5.865200225606317, + "grad_norm": 1.0492914915084839, + "learning_rate": 2.0676818950930626e-05, + "loss": 0.7057, + "step": 10399 + }, + { + "epoch": 5.865764241398759, + "grad_norm": 1.2178103923797607, + "learning_rate": 2.0673998871968418e-05, + "loss": 0.7597, + "step": 10400 + }, + { + "epoch": 5.866328257191201, + "grad_norm": 1.0677953958511353, + "learning_rate": 2.0671178793006204e-05, + "loss": 0.7606, + "step": 10401 + }, + { + "epoch": 5.866892272983644, + "grad_norm": 0.7895663976669312, + "learning_rate": 2.0668358714043996e-05, + "loss": 0.6978, + "step": 10402 + }, + { + "epoch": 5.8674562887760855, + "grad_norm": 1.1882929801940918, + "learning_rate": 2.066553863508178e-05, + "loss": 0.7396, + "step": 10403 + }, + { + "epoch": 5.868020304568528, + "grad_norm": 0.8968794941902161, + "learning_rate": 2.0662718556119573e-05, + "loss": 0.6557, + "step": 10404 + }, + { + "epoch": 5.86858432036097, + "grad_norm": 1.0919311046600342, + "learning_rate": 2.0659898477157362e-05, + "loss": 0.6931, + "step": 10405 + }, + { + "epoch": 5.869148336153412, + "grad_norm": 1.1222496032714844, + "learning_rate": 2.065707839819515e-05, + "loss": 0.7442, + "step": 10406 + }, + { + "epoch": 5.8697123519458545, + "grad_norm": 0.9862481951713562, + "learning_rate": 2.065425831923294e-05, + "loss": 0.7031, + "step": 10407 + }, + { + "epoch": 5.870276367738296, + "grad_norm": 1.6389003992080688, + "learning_rate": 2.065143824027073e-05, + "loss": 0.7502, + "step": 10408 + }, + { + "epoch": 5.870840383530739, + "grad_norm": 0.954629123210907, + "learning_rate": 2.0648618161308517e-05, + "loss": 0.7506, + "step": 10409 + }, + { + "epoch": 5.871404399323181, + "grad_norm": 1.7692816257476807, + "learning_rate": 2.0645798082346306e-05, + "loss": 0.8198, + "step": 10410 + }, + { + "epoch": 5.8719684151156235, + "grad_norm": 1.1572808027267456, + "learning_rate": 2.0642978003384095e-05, + "loss": 0.7409, + "step": 10411 + }, + { + "epoch": 5.872532430908065, + "grad_norm": 1.226414680480957, + "learning_rate": 2.0640157924421884e-05, + "loss": 0.836, + "step": 10412 + }, + { + "epoch": 5.873096446700508, + "grad_norm": 1.2296332120895386, + "learning_rate": 2.0637337845459673e-05, + "loss": 0.6983, + "step": 10413 + }, + { + "epoch": 5.87366046249295, + "grad_norm": 1.032759428024292, + "learning_rate": 2.0634517766497465e-05, + "loss": 0.7955, + "step": 10414 + }, + { + "epoch": 5.874224478285392, + "grad_norm": 1.0920847654342651, + "learning_rate": 2.063169768753525e-05, + "loss": 0.7266, + "step": 10415 + }, + { + "epoch": 5.874788494077834, + "grad_norm": 0.885216474533081, + "learning_rate": 2.0628877608573042e-05, + "loss": 0.7415, + "step": 10416 + }, + { + "epoch": 5.875352509870276, + "grad_norm": 1.1632152795791626, + "learning_rate": 2.0626057529610828e-05, + "loss": 0.6874, + "step": 10417 + }, + { + "epoch": 5.875916525662719, + "grad_norm": 1.1575311422348022, + "learning_rate": 2.062323745064862e-05, + "loss": 0.721, + "step": 10418 + }, + { + "epoch": 5.876480541455161, + "grad_norm": 1.5121301412582397, + "learning_rate": 2.0620417371686405e-05, + "loss": 0.8113, + "step": 10419 + }, + { + "epoch": 5.877044557247602, + "grad_norm": 0.9125060439109802, + "learning_rate": 2.0617597292724198e-05, + "loss": 0.5567, + "step": 10420 + }, + { + "epoch": 5.877608573040045, + "grad_norm": 1.442146897315979, + "learning_rate": 2.0614777213761986e-05, + "loss": 0.8189, + "step": 10421 + }, + { + "epoch": 5.878172588832487, + "grad_norm": 1.184535264968872, + "learning_rate": 2.0611957134799775e-05, + "loss": 0.6351, + "step": 10422 + }, + { + "epoch": 5.87873660462493, + "grad_norm": 1.1288559436798096, + "learning_rate": 2.0609137055837567e-05, + "loss": 0.7567, + "step": 10423 + }, + { + "epoch": 5.8793006204173714, + "grad_norm": 1.0645464658737183, + "learning_rate": 2.0606316976875353e-05, + "loss": 0.7153, + "step": 10424 + }, + { + "epoch": 5.879864636209814, + "grad_norm": 0.9446566700935364, + "learning_rate": 2.0603496897913145e-05, + "loss": 0.7624, + "step": 10425 + }, + { + "epoch": 5.880428652002256, + "grad_norm": 1.0942307710647583, + "learning_rate": 2.060067681895093e-05, + "loss": 0.7678, + "step": 10426 + }, + { + "epoch": 5.880992667794699, + "grad_norm": 1.3029714822769165, + "learning_rate": 2.0597856739988723e-05, + "loss": 0.8363, + "step": 10427 + }, + { + "epoch": 5.8815566835871405, + "grad_norm": 1.9151787757873535, + "learning_rate": 2.0595036661026508e-05, + "loss": 0.7938, + "step": 10428 + }, + { + "epoch": 5.882120699379582, + "grad_norm": 1.090903878211975, + "learning_rate": 2.05922165820643e-05, + "loss": 0.7007, + "step": 10429 + }, + { + "epoch": 5.882684715172025, + "grad_norm": 1.2973047494888306, + "learning_rate": 2.058939650310209e-05, + "loss": 0.6749, + "step": 10430 + }, + { + "epoch": 5.883248730964467, + "grad_norm": 1.4193973541259766, + "learning_rate": 2.0586576424139878e-05, + "loss": 0.8565, + "step": 10431 + }, + { + "epoch": 5.8838127467569095, + "grad_norm": 1.3789421319961548, + "learning_rate": 2.0583756345177667e-05, + "loss": 0.7758, + "step": 10432 + }, + { + "epoch": 5.884376762549351, + "grad_norm": 1.1067699193954468, + "learning_rate": 2.0580936266215455e-05, + "loss": 0.6723, + "step": 10433 + }, + { + "epoch": 5.884940778341793, + "grad_norm": 1.0649610757827759, + "learning_rate": 2.0578116187253244e-05, + "loss": 0.6724, + "step": 10434 + }, + { + "epoch": 5.885504794134236, + "grad_norm": 1.5533119440078735, + "learning_rate": 2.0575296108291033e-05, + "loss": 0.7763, + "step": 10435 + }, + { + "epoch": 5.886068809926678, + "grad_norm": 2.6423897743225098, + "learning_rate": 2.0572476029328822e-05, + "loss": 0.7442, + "step": 10436 + }, + { + "epoch": 5.88663282571912, + "grad_norm": 1.5161513090133667, + "learning_rate": 2.056965595036661e-05, + "loss": 0.6861, + "step": 10437 + }, + { + "epoch": 5.887196841511562, + "grad_norm": 1.0959755182266235, + "learning_rate": 2.05668358714044e-05, + "loss": 0.85, + "step": 10438 + }, + { + "epoch": 5.887760857304005, + "grad_norm": 0.8807245492935181, + "learning_rate": 2.056401579244219e-05, + "loss": 0.6985, + "step": 10439 + }, + { + "epoch": 5.888324873096447, + "grad_norm": 1.1039031744003296, + "learning_rate": 2.0561195713479977e-05, + "loss": 0.701, + "step": 10440 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 1.0172971487045288, + "learning_rate": 2.055837563451777e-05, + "loss": 0.7749, + "step": 10441 + }, + { + "epoch": 5.889452904681331, + "grad_norm": 1.4779160022735596, + "learning_rate": 2.0555555555555555e-05, + "loss": 0.7548, + "step": 10442 + }, + { + "epoch": 5.890016920473773, + "grad_norm": 0.9879912734031677, + "learning_rate": 2.0552735476593347e-05, + "loss": 0.7515, + "step": 10443 + }, + { + "epoch": 5.890580936266216, + "grad_norm": 1.4436084032058716, + "learning_rate": 2.0549915397631132e-05, + "loss": 0.824, + "step": 10444 + }, + { + "epoch": 5.891144952058657, + "grad_norm": 1.0952692031860352, + "learning_rate": 2.0547095318668924e-05, + "loss": 0.6235, + "step": 10445 + }, + { + "epoch": 5.8917089678511, + "grad_norm": 1.3304184675216675, + "learning_rate": 2.0544275239706713e-05, + "loss": 0.6607, + "step": 10446 + }, + { + "epoch": 5.892272983643542, + "grad_norm": 0.888594925403595, + "learning_rate": 2.0541455160744502e-05, + "loss": 0.7494, + "step": 10447 + }, + { + "epoch": 5.892836999435984, + "grad_norm": 1.221816897392273, + "learning_rate": 2.053863508178229e-05, + "loss": 0.861, + "step": 10448 + }, + { + "epoch": 5.893401015228426, + "grad_norm": 1.279172420501709, + "learning_rate": 2.053581500282008e-05, + "loss": 0.7483, + "step": 10449 + }, + { + "epoch": 5.893965031020868, + "grad_norm": 0.9629322290420532, + "learning_rate": 2.0532994923857868e-05, + "loss": 0.7553, + "step": 10450 + }, + { + "epoch": 5.894529046813311, + "grad_norm": 1.2750576734542847, + "learning_rate": 2.0530174844895657e-05, + "loss": 0.7701, + "step": 10451 + }, + { + "epoch": 5.895093062605753, + "grad_norm": 1.0432591438293457, + "learning_rate": 2.0527354765933446e-05, + "loss": 0.6703, + "step": 10452 + }, + { + "epoch": 5.895657078398195, + "grad_norm": 0.9750784635543823, + "learning_rate": 2.0524534686971235e-05, + "loss": 0.682, + "step": 10453 + }, + { + "epoch": 5.896221094190637, + "grad_norm": 1.2466247081756592, + "learning_rate": 2.0521714608009023e-05, + "loss": 0.8026, + "step": 10454 + }, + { + "epoch": 5.89678510998308, + "grad_norm": 0.8729748725891113, + "learning_rate": 2.0518894529046816e-05, + "loss": 0.7535, + "step": 10455 + }, + { + "epoch": 5.897349125775522, + "grad_norm": 1.0928833484649658, + "learning_rate": 2.0516074450084604e-05, + "loss": 0.7261, + "step": 10456 + }, + { + "epoch": 5.8979131415679635, + "grad_norm": 1.096250057220459, + "learning_rate": 2.0513254371122393e-05, + "loss": 0.6027, + "step": 10457 + }, + { + "epoch": 5.898477157360406, + "grad_norm": 1.7902512550354004, + "learning_rate": 2.0510434292160182e-05, + "loss": 0.6973, + "step": 10458 + }, + { + "epoch": 5.899041173152848, + "grad_norm": 1.4166938066482544, + "learning_rate": 2.050761421319797e-05, + "loss": 0.7516, + "step": 10459 + }, + { + "epoch": 5.899605188945291, + "grad_norm": 1.019536018371582, + "learning_rate": 2.050479413423576e-05, + "loss": 0.6812, + "step": 10460 + }, + { + "epoch": 5.9001692047377325, + "grad_norm": 0.9721775054931641, + "learning_rate": 2.050197405527355e-05, + "loss": 0.719, + "step": 10461 + }, + { + "epoch": 5.900733220530174, + "grad_norm": 0.9468548893928528, + "learning_rate": 2.0499153976311337e-05, + "loss": 0.6545, + "step": 10462 + }, + { + "epoch": 5.901297236322617, + "grad_norm": 1.2168008089065552, + "learning_rate": 2.0496333897349126e-05, + "loss": 0.7631, + "step": 10463 + }, + { + "epoch": 5.901861252115059, + "grad_norm": 1.3311227560043335, + "learning_rate": 2.0493513818386918e-05, + "loss": 0.7721, + "step": 10464 + }, + { + "epoch": 5.9024252679075015, + "grad_norm": 1.19210684299469, + "learning_rate": 2.0490693739424704e-05, + "loss": 0.6799, + "step": 10465 + }, + { + "epoch": 5.902989283699943, + "grad_norm": 1.4807771444320679, + "learning_rate": 2.0487873660462496e-05, + "loss": 0.8321, + "step": 10466 + }, + { + "epoch": 5.903553299492386, + "grad_norm": 0.9534193873405457, + "learning_rate": 2.048505358150028e-05, + "loss": 0.6074, + "step": 10467 + }, + { + "epoch": 5.904117315284828, + "grad_norm": 1.1358790397644043, + "learning_rate": 2.0482233502538073e-05, + "loss": 0.6599, + "step": 10468 + }, + { + "epoch": 5.904681331077271, + "grad_norm": 1.2129572629928589, + "learning_rate": 2.0479413423575862e-05, + "loss": 0.7415, + "step": 10469 + }, + { + "epoch": 5.905245346869712, + "grad_norm": 0.9207345247268677, + "learning_rate": 2.047659334461365e-05, + "loss": 0.7232, + "step": 10470 + }, + { + "epoch": 5.905809362662154, + "grad_norm": 1.7721116542816162, + "learning_rate": 2.047377326565144e-05, + "loss": 0.8231, + "step": 10471 + }, + { + "epoch": 5.906373378454597, + "grad_norm": 1.452792763710022, + "learning_rate": 2.047095318668923e-05, + "loss": 0.7848, + "step": 10472 + }, + { + "epoch": 5.906937394247039, + "grad_norm": 0.9572867155075073, + "learning_rate": 2.0468133107727017e-05, + "loss": 0.6218, + "step": 10473 + }, + { + "epoch": 5.907501410039481, + "grad_norm": 0.8520444631576538, + "learning_rate": 2.0465313028764806e-05, + "loss": 0.6535, + "step": 10474 + }, + { + "epoch": 5.908065425831923, + "grad_norm": 0.9277737140655518, + "learning_rate": 2.0462492949802595e-05, + "loss": 0.6516, + "step": 10475 + }, + { + "epoch": 5.908629441624365, + "grad_norm": 1.341932773590088, + "learning_rate": 2.0459672870840384e-05, + "loss": 0.794, + "step": 10476 + }, + { + "epoch": 5.909193457416808, + "grad_norm": 1.3822250366210938, + "learning_rate": 2.0456852791878173e-05, + "loss": 0.7606, + "step": 10477 + }, + { + "epoch": 5.9097574732092495, + "grad_norm": 0.930888831615448, + "learning_rate": 2.0454032712915965e-05, + "loss": 0.6378, + "step": 10478 + }, + { + "epoch": 5.910321489001692, + "grad_norm": 1.2045024633407593, + "learning_rate": 2.045121263395375e-05, + "loss": 0.6455, + "step": 10479 + }, + { + "epoch": 5.910885504794134, + "grad_norm": 1.186229944229126, + "learning_rate": 2.0448392554991542e-05, + "loss": 0.6972, + "step": 10480 + }, + { + "epoch": 5.911449520586577, + "grad_norm": 0.8162946701049805, + "learning_rate": 2.0445572476029328e-05, + "loss": 0.7239, + "step": 10481 + }, + { + "epoch": 5.9120135363790185, + "grad_norm": 1.6487878561019897, + "learning_rate": 2.044275239706712e-05, + "loss": 0.7802, + "step": 10482 + }, + { + "epoch": 5.912577552171461, + "grad_norm": 1.1073566675186157, + "learning_rate": 2.0439932318104905e-05, + "loss": 0.7328, + "step": 10483 + }, + { + "epoch": 5.913141567963903, + "grad_norm": 1.0058461427688599, + "learning_rate": 2.0437112239142698e-05, + "loss": 0.7493, + "step": 10484 + }, + { + "epoch": 5.913705583756345, + "grad_norm": 1.2529692649841309, + "learning_rate": 2.0434292160180486e-05, + "loss": 0.7202, + "step": 10485 + }, + { + "epoch": 5.9142695995487875, + "grad_norm": 1.0513191223144531, + "learning_rate": 2.0431472081218275e-05, + "loss": 0.6851, + "step": 10486 + }, + { + "epoch": 5.914833615341229, + "grad_norm": 0.8515836596488953, + "learning_rate": 2.0428652002256064e-05, + "loss": 0.6194, + "step": 10487 + }, + { + "epoch": 5.915397631133672, + "grad_norm": 1.1439900398254395, + "learning_rate": 2.0425831923293853e-05, + "loss": 0.6849, + "step": 10488 + }, + { + "epoch": 5.915961646926114, + "grad_norm": 0.901187539100647, + "learning_rate": 2.042301184433164e-05, + "loss": 0.7144, + "step": 10489 + }, + { + "epoch": 5.916525662718556, + "grad_norm": 0.9425804615020752, + "learning_rate": 2.042019176536943e-05, + "loss": 0.6503, + "step": 10490 + }, + { + "epoch": 5.917089678510998, + "grad_norm": 1.0482910871505737, + "learning_rate": 2.0417371686407222e-05, + "loss": 0.7629, + "step": 10491 + }, + { + "epoch": 5.91765369430344, + "grad_norm": 1.1147058010101318, + "learning_rate": 2.0414551607445008e-05, + "loss": 0.803, + "step": 10492 + }, + { + "epoch": 5.918217710095883, + "grad_norm": 1.0226918458938599, + "learning_rate": 2.04117315284828e-05, + "loss": 0.681, + "step": 10493 + }, + { + "epoch": 5.918781725888325, + "grad_norm": 1.033752202987671, + "learning_rate": 2.040891144952059e-05, + "loss": 0.756, + "step": 10494 + }, + { + "epoch": 5.919345741680767, + "grad_norm": 1.3800348043441772, + "learning_rate": 2.0406091370558378e-05, + "loss": 0.7271, + "step": 10495 + }, + { + "epoch": 5.919909757473209, + "grad_norm": 1.4478381872177124, + "learning_rate": 2.0403271291596166e-05, + "loss": 0.7203, + "step": 10496 + }, + { + "epoch": 5.920473773265652, + "grad_norm": 1.0660656690597534, + "learning_rate": 2.0400451212633955e-05, + "loss": 0.8853, + "step": 10497 + }, + { + "epoch": 5.921037789058094, + "grad_norm": 1.0242153406143188, + "learning_rate": 2.0397631133671744e-05, + "loss": 0.7159, + "step": 10498 + }, + { + "epoch": 5.9216018048505354, + "grad_norm": 0.9593003988265991, + "learning_rate": 2.0394811054709533e-05, + "loss": 0.739, + "step": 10499 + }, + { + "epoch": 5.922165820642978, + "grad_norm": 1.316802740097046, + "learning_rate": 2.039199097574732e-05, + "loss": 0.6816, + "step": 10500 + }, + { + "epoch": 5.92272983643542, + "grad_norm": 0.9132954478263855, + "learning_rate": 2.038917089678511e-05, + "loss": 0.6834, + "step": 10501 + }, + { + "epoch": 5.923293852227863, + "grad_norm": 1.0748995542526245, + "learning_rate": 2.03863508178229e-05, + "loss": 0.6296, + "step": 10502 + }, + { + "epoch": 5.9238578680203045, + "grad_norm": 1.075657844543457, + "learning_rate": 2.038353073886069e-05, + "loss": 0.6822, + "step": 10503 + }, + { + "epoch": 5.924421883812746, + "grad_norm": 0.9285945296287537, + "learning_rate": 2.0380710659898477e-05, + "loss": 0.7689, + "step": 10504 + }, + { + "epoch": 5.924985899605189, + "grad_norm": 1.158036470413208, + "learning_rate": 2.037789058093627e-05, + "loss": 0.8719, + "step": 10505 + }, + { + "epoch": 5.925549915397631, + "grad_norm": 1.087553858757019, + "learning_rate": 2.0375070501974054e-05, + "loss": 0.7563, + "step": 10506 + }, + { + "epoch": 5.9261139311900735, + "grad_norm": 1.1974892616271973, + "learning_rate": 2.0372250423011847e-05, + "loss": 0.7298, + "step": 10507 + }, + { + "epoch": 5.926677946982515, + "grad_norm": 0.9487780332565308, + "learning_rate": 2.0369430344049632e-05, + "loss": 0.7399, + "step": 10508 + }, + { + "epoch": 5.927241962774958, + "grad_norm": 1.2076613903045654, + "learning_rate": 2.0366610265087424e-05, + "loss": 0.7276, + "step": 10509 + }, + { + "epoch": 5.9278059785674, + "grad_norm": 0.8610526919364929, + "learning_rate": 2.0363790186125213e-05, + "loss": 0.672, + "step": 10510 + }, + { + "epoch": 5.9283699943598425, + "grad_norm": 1.0205094814300537, + "learning_rate": 2.0360970107163002e-05, + "loss": 0.7275, + "step": 10511 + }, + { + "epoch": 5.928934010152284, + "grad_norm": 1.780886173248291, + "learning_rate": 2.035815002820079e-05, + "loss": 0.9179, + "step": 10512 + }, + { + "epoch": 5.929498025944726, + "grad_norm": 0.9406675100326538, + "learning_rate": 2.035532994923858e-05, + "loss": 0.6946, + "step": 10513 + }, + { + "epoch": 5.930062041737169, + "grad_norm": 1.4115339517593384, + "learning_rate": 2.0352509870276368e-05, + "loss": 0.7643, + "step": 10514 + }, + { + "epoch": 5.930626057529611, + "grad_norm": 1.4629065990447998, + "learning_rate": 2.0349689791314157e-05, + "loss": 0.8041, + "step": 10515 + }, + { + "epoch": 5.931190073322053, + "grad_norm": 1.7083334922790527, + "learning_rate": 2.0346869712351946e-05, + "loss": 0.8078, + "step": 10516 + }, + { + "epoch": 5.931754089114495, + "grad_norm": 1.1332917213439941, + "learning_rate": 2.0344049633389735e-05, + "loss": 0.7115, + "step": 10517 + }, + { + "epoch": 5.932318104906937, + "grad_norm": 1.0995533466339111, + "learning_rate": 2.0341229554427523e-05, + "loss": 0.6816, + "step": 10518 + }, + { + "epoch": 5.93288212069938, + "grad_norm": 1.1457951068878174, + "learning_rate": 2.0338409475465316e-05, + "loss": 0.7128, + "step": 10519 + }, + { + "epoch": 5.933446136491821, + "grad_norm": 0.9728316068649292, + "learning_rate": 2.03355893965031e-05, + "loss": 0.7058, + "step": 10520 + }, + { + "epoch": 5.934010152284264, + "grad_norm": 1.1914680004119873, + "learning_rate": 2.0332769317540893e-05, + "loss": 0.6713, + "step": 10521 + }, + { + "epoch": 5.934574168076706, + "grad_norm": 1.0160017013549805, + "learning_rate": 2.032994923857868e-05, + "loss": 0.6912, + "step": 10522 + }, + { + "epoch": 5.935138183869149, + "grad_norm": 1.000402569770813, + "learning_rate": 2.032712915961647e-05, + "loss": 0.6388, + "step": 10523 + }, + { + "epoch": 5.93570219966159, + "grad_norm": 0.9486568570137024, + "learning_rate": 2.032430908065426e-05, + "loss": 0.7109, + "step": 10524 + }, + { + "epoch": 5.936266215454033, + "grad_norm": 1.0980442762374878, + "learning_rate": 2.032148900169205e-05, + "loss": 0.7523, + "step": 10525 + }, + { + "epoch": 5.936830231246475, + "grad_norm": 1.2799071073532104, + "learning_rate": 2.0318668922729837e-05, + "loss": 0.8767, + "step": 10526 + }, + { + "epoch": 5.937394247038917, + "grad_norm": 0.9685234427452087, + "learning_rate": 2.0315848843767626e-05, + "loss": 0.6637, + "step": 10527 + }, + { + "epoch": 5.937958262831359, + "grad_norm": 1.0638600587844849, + "learning_rate": 2.0313028764805418e-05, + "loss": 0.7306, + "step": 10528 + }, + { + "epoch": 5.938522278623801, + "grad_norm": 1.0066821575164795, + "learning_rate": 2.0310208685843204e-05, + "loss": 0.6283, + "step": 10529 + }, + { + "epoch": 5.939086294416244, + "grad_norm": 1.2811676263809204, + "learning_rate": 2.0307388606880996e-05, + "loss": 0.6099, + "step": 10530 + }, + { + "epoch": 5.939650310208686, + "grad_norm": 1.172647476196289, + "learning_rate": 2.030456852791878e-05, + "loss": 0.7162, + "step": 10531 + }, + { + "epoch": 5.940214326001128, + "grad_norm": 1.4370402097702026, + "learning_rate": 2.0301748448956573e-05, + "loss": 0.8782, + "step": 10532 + }, + { + "epoch": 5.94077834179357, + "grad_norm": 1.1890878677368164, + "learning_rate": 2.0298928369994362e-05, + "loss": 0.6553, + "step": 10533 + }, + { + "epoch": 5.941342357586012, + "grad_norm": 1.161668062210083, + "learning_rate": 2.029610829103215e-05, + "loss": 0.7481, + "step": 10534 + }, + { + "epoch": 5.941906373378455, + "grad_norm": 1.3986247777938843, + "learning_rate": 2.029328821206994e-05, + "loss": 0.8829, + "step": 10535 + }, + { + "epoch": 5.9424703891708965, + "grad_norm": 0.8112261891365051, + "learning_rate": 2.029046813310773e-05, + "loss": 0.666, + "step": 10536 + }, + { + "epoch": 5.943034404963339, + "grad_norm": 1.3638886213302612, + "learning_rate": 2.0287648054145517e-05, + "loss": 0.6496, + "step": 10537 + }, + { + "epoch": 5.943598420755781, + "grad_norm": 2.0031960010528564, + "learning_rate": 2.0284827975183306e-05, + "loss": 0.9717, + "step": 10538 + }, + { + "epoch": 5.944162436548224, + "grad_norm": 0.7891957759857178, + "learning_rate": 2.0282007896221095e-05, + "loss": 0.6753, + "step": 10539 + }, + { + "epoch": 5.9447264523406655, + "grad_norm": 0.8994110226631165, + "learning_rate": 2.0279187817258884e-05, + "loss": 0.6667, + "step": 10540 + }, + { + "epoch": 5.945290468133107, + "grad_norm": 1.0391308069229126, + "learning_rate": 2.0276367738296672e-05, + "loss": 0.7299, + "step": 10541 + }, + { + "epoch": 5.94585448392555, + "grad_norm": 0.93760746717453, + "learning_rate": 2.0273547659334465e-05, + "loss": 0.7706, + "step": 10542 + }, + { + "epoch": 5.946418499717992, + "grad_norm": 0.8489275574684143, + "learning_rate": 2.027072758037225e-05, + "loss": 0.7264, + "step": 10543 + }, + { + "epoch": 5.9469825155104346, + "grad_norm": 1.0967557430267334, + "learning_rate": 2.0267907501410042e-05, + "loss": 0.7996, + "step": 10544 + }, + { + "epoch": 5.947546531302876, + "grad_norm": 1.102463722229004, + "learning_rate": 2.0265087422447828e-05, + "loss": 0.6893, + "step": 10545 + }, + { + "epoch": 5.948110547095319, + "grad_norm": 1.6163681745529175, + "learning_rate": 2.026226734348562e-05, + "loss": 0.8207, + "step": 10546 + }, + { + "epoch": 5.948674562887761, + "grad_norm": 1.1556286811828613, + "learning_rate": 2.0259447264523405e-05, + "loss": 0.7474, + "step": 10547 + }, + { + "epoch": 5.949238578680203, + "grad_norm": 0.9267195463180542, + "learning_rate": 2.0256627185561197e-05, + "loss": 0.6891, + "step": 10548 + }, + { + "epoch": 5.949802594472645, + "grad_norm": 1.6220797300338745, + "learning_rate": 2.0253807106598986e-05, + "loss": 0.7655, + "step": 10549 + }, + { + "epoch": 5.950366610265087, + "grad_norm": 1.2206257581710815, + "learning_rate": 2.0250987027636775e-05, + "loss": 0.7313, + "step": 10550 + }, + { + "epoch": 5.95093062605753, + "grad_norm": 1.3735549449920654, + "learning_rate": 2.0248166948674564e-05, + "loss": 0.7537, + "step": 10551 + }, + { + "epoch": 5.951494641849972, + "grad_norm": 0.7462092638015747, + "learning_rate": 2.0245346869712353e-05, + "loss": 0.6574, + "step": 10552 + }, + { + "epoch": 5.952058657642414, + "grad_norm": 1.168302297592163, + "learning_rate": 2.024252679075014e-05, + "loss": 0.8303, + "step": 10553 + }, + { + "epoch": 5.952622673434856, + "grad_norm": 1.1718157529830933, + "learning_rate": 2.023970671178793e-05, + "loss": 0.811, + "step": 10554 + }, + { + "epoch": 5.953186689227298, + "grad_norm": 1.2995778322219849, + "learning_rate": 2.023688663282572e-05, + "loss": 0.8126, + "step": 10555 + }, + { + "epoch": 5.953750705019741, + "grad_norm": 1.338700771331787, + "learning_rate": 2.0234066553863508e-05, + "loss": 0.7184, + "step": 10556 + }, + { + "epoch": 5.9543147208121825, + "grad_norm": 1.5334776639938354, + "learning_rate": 2.0231246474901297e-05, + "loss": 0.9179, + "step": 10557 + }, + { + "epoch": 5.954878736604625, + "grad_norm": 1.1191260814666748, + "learning_rate": 2.022842639593909e-05, + "loss": 0.6582, + "step": 10558 + }, + { + "epoch": 5.955442752397067, + "grad_norm": 1.1799700260162354, + "learning_rate": 2.0225606316976874e-05, + "loss": 0.8467, + "step": 10559 + }, + { + "epoch": 5.95600676818951, + "grad_norm": 1.161602258682251, + "learning_rate": 2.0222786238014666e-05, + "loss": 0.7396, + "step": 10560 + }, + { + "epoch": 5.9565707839819515, + "grad_norm": 1.1552373170852661, + "learning_rate": 2.0219966159052452e-05, + "loss": 0.7633, + "step": 10561 + }, + { + "epoch": 5.957134799774393, + "grad_norm": 1.4272810220718384, + "learning_rate": 2.0217146080090244e-05, + "loss": 0.7967, + "step": 10562 + }, + { + "epoch": 5.957698815566836, + "grad_norm": 1.6838929653167725, + "learning_rate": 2.0214326001128033e-05, + "loss": 0.9, + "step": 10563 + }, + { + "epoch": 5.958262831359278, + "grad_norm": 1.053084373474121, + "learning_rate": 2.021150592216582e-05, + "loss": 0.7436, + "step": 10564 + }, + { + "epoch": 5.9588268471517205, + "grad_norm": 1.2007157802581787, + "learning_rate": 2.020868584320361e-05, + "loss": 0.7732, + "step": 10565 + }, + { + "epoch": 5.959390862944162, + "grad_norm": 1.0844995975494385, + "learning_rate": 2.02058657642414e-05, + "loss": 0.765, + "step": 10566 + }, + { + "epoch": 5.959954878736605, + "grad_norm": 1.1421186923980713, + "learning_rate": 2.020304568527919e-05, + "loss": 0.841, + "step": 10567 + }, + { + "epoch": 5.960518894529047, + "grad_norm": 1.0724668502807617, + "learning_rate": 2.0200225606316977e-05, + "loss": 0.699, + "step": 10568 + }, + { + "epoch": 5.961082910321489, + "grad_norm": 1.7230920791625977, + "learning_rate": 2.019740552735477e-05, + "loss": 0.8332, + "step": 10569 + }, + { + "epoch": 5.961646926113931, + "grad_norm": 1.061848759651184, + "learning_rate": 2.0194585448392554e-05, + "loss": 0.7251, + "step": 10570 + }, + { + "epoch": 5.962210941906373, + "grad_norm": 2.55826735496521, + "learning_rate": 2.0191765369430347e-05, + "loss": 0.765, + "step": 10571 + }, + { + "epoch": 5.962774957698816, + "grad_norm": 1.1817357540130615, + "learning_rate": 2.0188945290468135e-05, + "loss": 0.752, + "step": 10572 + }, + { + "epoch": 5.963338973491258, + "grad_norm": 1.7187645435333252, + "learning_rate": 2.0186125211505924e-05, + "loss": 0.8889, + "step": 10573 + }, + { + "epoch": 5.9639029892837, + "grad_norm": 0.9647889733314514, + "learning_rate": 2.0183305132543713e-05, + "loss": 0.6814, + "step": 10574 + }, + { + "epoch": 5.964467005076142, + "grad_norm": 1.1175175905227661, + "learning_rate": 2.0180485053581502e-05, + "loss": 0.7936, + "step": 10575 + }, + { + "epoch": 5.965031020868584, + "grad_norm": 1.1742671728134155, + "learning_rate": 2.017766497461929e-05, + "loss": 0.7603, + "step": 10576 + }, + { + "epoch": 5.965595036661027, + "grad_norm": 1.0194165706634521, + "learning_rate": 2.017484489565708e-05, + "loss": 0.7158, + "step": 10577 + }, + { + "epoch": 5.9661590524534684, + "grad_norm": 1.4825726747512817, + "learning_rate": 2.0172024816694868e-05, + "loss": 0.768, + "step": 10578 + }, + { + "epoch": 5.966723068245911, + "grad_norm": 1.251847505569458, + "learning_rate": 2.0169204737732657e-05, + "loss": 0.9154, + "step": 10579 + }, + { + "epoch": 5.967287084038353, + "grad_norm": 1.1796451807022095, + "learning_rate": 2.0166384658770446e-05, + "loss": 0.7796, + "step": 10580 + }, + { + "epoch": 5.967851099830796, + "grad_norm": 0.9910492300987244, + "learning_rate": 2.0163564579808238e-05, + "loss": 0.7274, + "step": 10581 + }, + { + "epoch": 5.9684151156232375, + "grad_norm": 0.9571711421012878, + "learning_rate": 2.0160744500846023e-05, + "loss": 0.7423, + "step": 10582 + }, + { + "epoch": 5.968979131415679, + "grad_norm": 1.0981110334396362, + "learning_rate": 2.0157924421883815e-05, + "loss": 0.6699, + "step": 10583 + }, + { + "epoch": 5.969543147208122, + "grad_norm": 1.0949825048446655, + "learning_rate": 2.01551043429216e-05, + "loss": 0.8284, + "step": 10584 + }, + { + "epoch": 5.970107163000564, + "grad_norm": 1.4312052726745605, + "learning_rate": 2.0152284263959393e-05, + "loss": 0.8215, + "step": 10585 + }, + { + "epoch": 5.9706711787930065, + "grad_norm": 0.9487213492393494, + "learning_rate": 2.014946418499718e-05, + "loss": 0.6697, + "step": 10586 + }, + { + "epoch": 5.971235194585448, + "grad_norm": 1.1784580945968628, + "learning_rate": 2.014664410603497e-05, + "loss": 0.8156, + "step": 10587 + }, + { + "epoch": 5.971799210377891, + "grad_norm": 1.1964397430419922, + "learning_rate": 2.014382402707276e-05, + "loss": 0.8698, + "step": 10588 + }, + { + "epoch": 5.972363226170333, + "grad_norm": 1.1375211477279663, + "learning_rate": 2.0141003948110548e-05, + "loss": 0.7584, + "step": 10589 + }, + { + "epoch": 5.972927241962775, + "grad_norm": 1.0370492935180664, + "learning_rate": 2.0138183869148337e-05, + "loss": 0.7501, + "step": 10590 + }, + { + "epoch": 5.973491257755217, + "grad_norm": 1.0035569667816162, + "learning_rate": 2.0135363790186126e-05, + "loss": 0.7322, + "step": 10591 + }, + { + "epoch": 5.974055273547659, + "grad_norm": 4.114938259124756, + "learning_rate": 2.0132543711223915e-05, + "loss": 0.7409, + "step": 10592 + }, + { + "epoch": 5.974619289340102, + "grad_norm": 1.1891138553619385, + "learning_rate": 2.0129723632261703e-05, + "loss": 0.809, + "step": 10593 + }, + { + "epoch": 5.975183305132544, + "grad_norm": 1.0468693971633911, + "learning_rate": 2.0126903553299492e-05, + "loss": 0.7585, + "step": 10594 + }, + { + "epoch": 5.975747320924986, + "grad_norm": 1.4889099597930908, + "learning_rate": 2.012408347433728e-05, + "loss": 0.7377, + "step": 10595 + }, + { + "epoch": 5.976311336717428, + "grad_norm": 1.256626009941101, + "learning_rate": 2.012126339537507e-05, + "loss": 0.7766, + "step": 10596 + }, + { + "epoch": 5.97687535250987, + "grad_norm": 0.9201790690422058, + "learning_rate": 2.0118443316412862e-05, + "loss": 0.6623, + "step": 10597 + }, + { + "epoch": 5.977439368302313, + "grad_norm": 1.2052721977233887, + "learning_rate": 2.011562323745065e-05, + "loss": 0.7775, + "step": 10598 + }, + { + "epoch": 5.978003384094754, + "grad_norm": 1.0038400888442993, + "learning_rate": 2.011280315848844e-05, + "loss": 0.6912, + "step": 10599 + }, + { + "epoch": 5.978567399887197, + "grad_norm": 1.5944691896438599, + "learning_rate": 2.010998307952623e-05, + "loss": 0.8673, + "step": 10600 + }, + { + "epoch": 5.979131415679639, + "grad_norm": 1.1894696950912476, + "learning_rate": 2.0107163000564017e-05, + "loss": 0.728, + "step": 10601 + }, + { + "epoch": 5.979695431472082, + "grad_norm": 2.3016562461853027, + "learning_rate": 2.0104342921601806e-05, + "loss": 0.726, + "step": 10602 + }, + { + "epoch": 5.980259447264523, + "grad_norm": 1.0536456108093262, + "learning_rate": 2.0101522842639595e-05, + "loss": 0.685, + "step": 10603 + }, + { + "epoch": 5.980823463056965, + "grad_norm": 1.0415772199630737, + "learning_rate": 2.0098702763677384e-05, + "loss": 0.6959, + "step": 10604 + }, + { + "epoch": 5.981387478849408, + "grad_norm": 0.8044096231460571, + "learning_rate": 2.0095882684715172e-05, + "loss": 0.6724, + "step": 10605 + }, + { + "epoch": 5.98195149464185, + "grad_norm": 1.4139573574066162, + "learning_rate": 2.0093062605752965e-05, + "loss": 0.801, + "step": 10606 + }, + { + "epoch": 5.982515510434292, + "grad_norm": 1.0055736303329468, + "learning_rate": 2.009024252679075e-05, + "loss": 0.7806, + "step": 10607 + }, + { + "epoch": 5.983079526226734, + "grad_norm": 1.074540615081787, + "learning_rate": 2.0087422447828542e-05, + "loss": 0.7967, + "step": 10608 + }, + { + "epoch": 5.983643542019177, + "grad_norm": 1.085881233215332, + "learning_rate": 2.0084602368866328e-05, + "loss": 0.6542, + "step": 10609 + }, + { + "epoch": 5.984207557811619, + "grad_norm": 1.2591015100479126, + "learning_rate": 2.008178228990412e-05, + "loss": 0.7056, + "step": 10610 + }, + { + "epoch": 5.9847715736040605, + "grad_norm": 0.8288905024528503, + "learning_rate": 2.0078962210941905e-05, + "loss": 0.6753, + "step": 10611 + }, + { + "epoch": 5.985335589396503, + "grad_norm": 1.1252202987670898, + "learning_rate": 2.0076142131979697e-05, + "loss": 0.8606, + "step": 10612 + }, + { + "epoch": 5.985899605188945, + "grad_norm": 1.2145750522613525, + "learning_rate": 2.0073322053017486e-05, + "loss": 0.7862, + "step": 10613 + }, + { + "epoch": 5.986463620981388, + "grad_norm": 1.188681721687317, + "learning_rate": 2.0070501974055275e-05, + "loss": 0.7561, + "step": 10614 + }, + { + "epoch": 5.9870276367738295, + "grad_norm": 1.2096092700958252, + "learning_rate": 2.0067681895093064e-05, + "loss": 0.8065, + "step": 10615 + }, + { + "epoch": 5.987591652566272, + "grad_norm": 1.0981992483139038, + "learning_rate": 2.0064861816130853e-05, + "loss": 0.691, + "step": 10616 + }, + { + "epoch": 5.988155668358714, + "grad_norm": 1.0411765575408936, + "learning_rate": 2.006204173716864e-05, + "loss": 0.633, + "step": 10617 + }, + { + "epoch": 5.988719684151156, + "grad_norm": 0.9536280035972595, + "learning_rate": 2.005922165820643e-05, + "loss": 0.8306, + "step": 10618 + }, + { + "epoch": 5.9892836999435985, + "grad_norm": 0.9713473916053772, + "learning_rate": 2.005640157924422e-05, + "loss": 0.7671, + "step": 10619 + }, + { + "epoch": 5.98984771573604, + "grad_norm": 1.5206931829452515, + "learning_rate": 2.0053581500282008e-05, + "loss": 0.8056, + "step": 10620 + }, + { + "epoch": 5.990411731528483, + "grad_norm": 1.840725064277649, + "learning_rate": 2.0050761421319797e-05, + "loss": 0.7496, + "step": 10621 + }, + { + "epoch": 5.990975747320925, + "grad_norm": 1.8796117305755615, + "learning_rate": 2.004794134235759e-05, + "loss": 0.7925, + "step": 10622 + }, + { + "epoch": 5.991539763113368, + "grad_norm": 5.512580871582031, + "learning_rate": 2.0045121263395374e-05, + "loss": 0.7735, + "step": 10623 + }, + { + "epoch": 5.992103778905809, + "grad_norm": 1.1471726894378662, + "learning_rate": 2.0042301184433166e-05, + "loss": 0.7724, + "step": 10624 + }, + { + "epoch": 5.992667794698251, + "grad_norm": 1.0260207653045654, + "learning_rate": 2.0039481105470952e-05, + "loss": 0.6934, + "step": 10625 + }, + { + "epoch": 5.993231810490694, + "grad_norm": 1.305055856704712, + "learning_rate": 2.0036661026508744e-05, + "loss": 0.7534, + "step": 10626 + }, + { + "epoch": 5.993795826283136, + "grad_norm": 1.12186598777771, + "learning_rate": 2.0033840947546533e-05, + "loss": 0.7442, + "step": 10627 + }, + { + "epoch": 5.994359842075578, + "grad_norm": 1.6283787488937378, + "learning_rate": 2.003102086858432e-05, + "loss": 0.7651, + "step": 10628 + }, + { + "epoch": 5.99492385786802, + "grad_norm": 1.0534625053405762, + "learning_rate": 2.002820078962211e-05, + "loss": 0.6801, + "step": 10629 + }, + { + "epoch": 5.995487873660463, + "grad_norm": 0.983040988445282, + "learning_rate": 2.00253807106599e-05, + "loss": 0.6745, + "step": 10630 + }, + { + "epoch": 5.996051889452905, + "grad_norm": 0.9619300961494446, + "learning_rate": 2.0022560631697688e-05, + "loss": 0.6931, + "step": 10631 + }, + { + "epoch": 5.9966159052453465, + "grad_norm": 1.078843593597412, + "learning_rate": 2.0019740552735477e-05, + "loss": 0.683, + "step": 10632 + }, + { + "epoch": 5.997179921037789, + "grad_norm": 0.9601010084152222, + "learning_rate": 2.001692047377327e-05, + "loss": 0.741, + "step": 10633 + }, + { + "epoch": 5.997743936830231, + "grad_norm": 1.39041006565094, + "learning_rate": 2.0014100394811054e-05, + "loss": 0.7768, + "step": 10634 + }, + { + "epoch": 5.998307952622674, + "grad_norm": 1.1883444786071777, + "learning_rate": 2.0011280315848846e-05, + "loss": 0.6003, + "step": 10635 + }, + { + "epoch": 5.9988719684151155, + "grad_norm": 1.2328473329544067, + "learning_rate": 2.0008460236886635e-05, + "loss": 0.832, + "step": 10636 + }, + { + "epoch": 5.999435984207558, + "grad_norm": 1.0012056827545166, + "learning_rate": 2.0005640157924424e-05, + "loss": 0.7036, + "step": 10637 + }, + { + "epoch": 6.0, + "grad_norm": 2.028561592102051, + "learning_rate": 2.0002820078962213e-05, + "loss": 0.9035, + "step": 10638 + }, + { + "epoch": 6.000564015792442, + "grad_norm": 0.9974243640899658, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 10639 + }, + { + "epoch": 6.0011280315848845, + "grad_norm": 1.2094433307647705, + "learning_rate": 1.999717992103779e-05, + "loss": 0.744, + "step": 10640 + }, + { + "epoch": 6.001692047377326, + "grad_norm": 0.9518552422523499, + "learning_rate": 1.999435984207558e-05, + "loss": 0.7216, + "step": 10641 + }, + { + "epoch": 6.002256063169769, + "grad_norm": 1.4393208026885986, + "learning_rate": 1.9991539763113368e-05, + "loss": 0.8687, + "step": 10642 + }, + { + "epoch": 6.002820078962211, + "grad_norm": 0.9718635082244873, + "learning_rate": 1.9988719684151157e-05, + "loss": 0.7176, + "step": 10643 + }, + { + "epoch": 6.0033840947546535, + "grad_norm": 0.9794616103172302, + "learning_rate": 1.9985899605188946e-05, + "loss": 0.7368, + "step": 10644 + }, + { + "epoch": 6.003948110547095, + "grad_norm": 1.3881218433380127, + "learning_rate": 1.9983079526226738e-05, + "loss": 0.7553, + "step": 10645 + }, + { + "epoch": 6.004512126339537, + "grad_norm": 1.1670998334884644, + "learning_rate": 1.9980259447264523e-05, + "loss": 0.6554, + "step": 10646 + }, + { + "epoch": 6.00507614213198, + "grad_norm": 1.1870617866516113, + "learning_rate": 1.9977439368302315e-05, + "loss": 0.7994, + "step": 10647 + }, + { + "epoch": 6.005640157924422, + "grad_norm": 1.2230281829833984, + "learning_rate": 1.99746192893401e-05, + "loss": 0.7763, + "step": 10648 + }, + { + "epoch": 6.006204173716864, + "grad_norm": 1.404217004776001, + "learning_rate": 1.9971799210377893e-05, + "loss": 0.7494, + "step": 10649 + }, + { + "epoch": 6.006768189509306, + "grad_norm": 1.0192261934280396, + "learning_rate": 1.996897913141568e-05, + "loss": 0.7107, + "step": 10650 + }, + { + "epoch": 6.007332205301749, + "grad_norm": 0.8471778631210327, + "learning_rate": 1.996615905245347e-05, + "loss": 0.6721, + "step": 10651 + }, + { + "epoch": 6.007896221094191, + "grad_norm": 1.2796969413757324, + "learning_rate": 1.996333897349126e-05, + "loss": 0.6847, + "step": 10652 + }, + { + "epoch": 6.008460236886632, + "grad_norm": 1.5620695352554321, + "learning_rate": 1.9960518894529048e-05, + "loss": 0.768, + "step": 10653 + }, + { + "epoch": 6.009024252679075, + "grad_norm": 1.003352165222168, + "learning_rate": 1.9957698815566837e-05, + "loss": 0.7885, + "step": 10654 + }, + { + "epoch": 6.009588268471517, + "grad_norm": 0.9388307929039001, + "learning_rate": 1.9954878736604626e-05, + "loss": 0.7498, + "step": 10655 + }, + { + "epoch": 6.01015228426396, + "grad_norm": 1.254453420639038, + "learning_rate": 1.9952058657642415e-05, + "loss": 0.8098, + "step": 10656 + }, + { + "epoch": 6.0107163000564015, + "grad_norm": 1.3254634141921997, + "learning_rate": 1.9949238578680203e-05, + "loss": 0.794, + "step": 10657 + }, + { + "epoch": 6.011280315848844, + "grad_norm": 1.4542555809020996, + "learning_rate": 1.9946418499717992e-05, + "loss": 0.8123, + "step": 10658 + }, + { + "epoch": 6.011844331641286, + "grad_norm": 0.982172966003418, + "learning_rate": 1.994359842075578e-05, + "loss": 0.805, + "step": 10659 + }, + { + "epoch": 6.012408347433728, + "grad_norm": 1.3376480340957642, + "learning_rate": 1.994077834179357e-05, + "loss": 0.7585, + "step": 10660 + }, + { + "epoch": 6.0129723632261705, + "grad_norm": 1.0524548292160034, + "learning_rate": 1.9937958262831362e-05, + "loss": 0.7548, + "step": 10661 + }, + { + "epoch": 6.013536379018612, + "grad_norm": 1.3956010341644287, + "learning_rate": 1.9935138183869147e-05, + "loss": 0.8272, + "step": 10662 + }, + { + "epoch": 6.014100394811055, + "grad_norm": 0.911392092704773, + "learning_rate": 1.993231810490694e-05, + "loss": 0.6795, + "step": 10663 + }, + { + "epoch": 6.014664410603497, + "grad_norm": 1.1481726169586182, + "learning_rate": 1.9929498025944725e-05, + "loss": 0.8005, + "step": 10664 + }, + { + "epoch": 6.0152284263959395, + "grad_norm": 1.3794978857040405, + "learning_rate": 1.9926677946982517e-05, + "loss": 0.8427, + "step": 10665 + }, + { + "epoch": 6.015792442188381, + "grad_norm": 1.2437959909439087, + "learning_rate": 1.9923857868020303e-05, + "loss": 0.7185, + "step": 10666 + }, + { + "epoch": 6.016356457980823, + "grad_norm": 1.399409532546997, + "learning_rate": 1.9921037789058095e-05, + "loss": 0.7216, + "step": 10667 + }, + { + "epoch": 6.016920473773266, + "grad_norm": 1.1587111949920654, + "learning_rate": 1.9918217710095884e-05, + "loss": 0.6349, + "step": 10668 + }, + { + "epoch": 6.017484489565708, + "grad_norm": 1.4195712804794312, + "learning_rate": 1.9915397631133672e-05, + "loss": 0.8155, + "step": 10669 + }, + { + "epoch": 6.01804850535815, + "grad_norm": 1.0895440578460693, + "learning_rate": 1.9912577552171464e-05, + "loss": 0.7389, + "step": 10670 + }, + { + "epoch": 6.018612521150592, + "grad_norm": 0.8522146940231323, + "learning_rate": 1.990975747320925e-05, + "loss": 0.6692, + "step": 10671 + }, + { + "epoch": 6.019176536943035, + "grad_norm": 0.9579347372055054, + "learning_rate": 1.9906937394247042e-05, + "loss": 0.7069, + "step": 10672 + }, + { + "epoch": 6.019740552735477, + "grad_norm": 1.703454613685608, + "learning_rate": 1.9904117315284827e-05, + "loss": 0.8495, + "step": 10673 + }, + { + "epoch": 6.020304568527918, + "grad_norm": 1.2053871154785156, + "learning_rate": 1.990129723632262e-05, + "loss": 0.6554, + "step": 10674 + }, + { + "epoch": 6.020868584320361, + "grad_norm": 0.9924936294555664, + "learning_rate": 1.9898477157360405e-05, + "loss": 0.7265, + "step": 10675 + }, + { + "epoch": 6.021432600112803, + "grad_norm": 1.0584924221038818, + "learning_rate": 1.9895657078398197e-05, + "loss": 0.7831, + "step": 10676 + }, + { + "epoch": 6.021996615905246, + "grad_norm": 1.4073020219802856, + "learning_rate": 1.9892836999435986e-05, + "loss": 0.8128, + "step": 10677 + }, + { + "epoch": 6.022560631697687, + "grad_norm": 0.9644148349761963, + "learning_rate": 1.9890016920473775e-05, + "loss": 0.7569, + "step": 10678 + }, + { + "epoch": 6.02312464749013, + "grad_norm": 1.004822850227356, + "learning_rate": 1.9887196841511564e-05, + "loss": 0.67, + "step": 10679 + }, + { + "epoch": 6.023688663282572, + "grad_norm": 1.0839903354644775, + "learning_rate": 1.9884376762549352e-05, + "loss": 0.7827, + "step": 10680 + }, + { + "epoch": 6.024252679075014, + "grad_norm": 1.3784266710281372, + "learning_rate": 1.988155668358714e-05, + "loss": 0.7571, + "step": 10681 + }, + { + "epoch": 6.024816694867456, + "grad_norm": 1.2702741622924805, + "learning_rate": 1.987873660462493e-05, + "loss": 0.811, + "step": 10682 + }, + { + "epoch": 6.025380710659898, + "grad_norm": 1.4409613609313965, + "learning_rate": 1.987591652566272e-05, + "loss": 0.7192, + "step": 10683 + }, + { + "epoch": 6.025944726452341, + "grad_norm": 1.3942605257034302, + "learning_rate": 1.9873096446700508e-05, + "loss": 0.7868, + "step": 10684 + }, + { + "epoch": 6.026508742244783, + "grad_norm": 1.30878484249115, + "learning_rate": 1.9870276367738296e-05, + "loss": 0.6642, + "step": 10685 + }, + { + "epoch": 6.027072758037225, + "grad_norm": 1.1570134162902832, + "learning_rate": 1.986745628877609e-05, + "loss": 0.7846, + "step": 10686 + }, + { + "epoch": 6.027636773829667, + "grad_norm": 1.060621738433838, + "learning_rate": 1.9864636209813874e-05, + "loss": 0.7977, + "step": 10687 + }, + { + "epoch": 6.028200789622109, + "grad_norm": 1.0125268697738647, + "learning_rate": 1.9861816130851666e-05, + "loss": 0.7632, + "step": 10688 + }, + { + "epoch": 6.028764805414552, + "grad_norm": 1.1199893951416016, + "learning_rate": 1.985899605188945e-05, + "loss": 0.6792, + "step": 10689 + }, + { + "epoch": 6.0293288212069935, + "grad_norm": 1.1376551389694214, + "learning_rate": 1.9856175972927244e-05, + "loss": 0.6499, + "step": 10690 + }, + { + "epoch": 6.029892836999436, + "grad_norm": 1.4063873291015625, + "learning_rate": 1.9853355893965033e-05, + "loss": 0.8059, + "step": 10691 + }, + { + "epoch": 6.030456852791878, + "grad_norm": 1.1874502897262573, + "learning_rate": 1.985053581500282e-05, + "loss": 0.7434, + "step": 10692 + }, + { + "epoch": 6.031020868584321, + "grad_norm": 0.976309597492218, + "learning_rate": 1.984771573604061e-05, + "loss": 0.7006, + "step": 10693 + }, + { + "epoch": 6.0315848843767625, + "grad_norm": 1.5931200981140137, + "learning_rate": 1.98448956570784e-05, + "loss": 0.829, + "step": 10694 + }, + { + "epoch": 6.032148900169204, + "grad_norm": 1.1271402835845947, + "learning_rate": 1.9842075578116188e-05, + "loss": 0.7234, + "step": 10695 + }, + { + "epoch": 6.032712915961647, + "grad_norm": 1.3333078622817993, + "learning_rate": 1.9839255499153977e-05, + "loss": 0.7866, + "step": 10696 + }, + { + "epoch": 6.033276931754089, + "grad_norm": 0.9462042450904846, + "learning_rate": 1.9836435420191765e-05, + "loss": 0.7136, + "step": 10697 + }, + { + "epoch": 6.0338409475465316, + "grad_norm": 0.9652673006057739, + "learning_rate": 1.9833615341229554e-05, + "loss": 0.667, + "step": 10698 + }, + { + "epoch": 6.034404963338973, + "grad_norm": 1.5153716802597046, + "learning_rate": 1.9830795262267343e-05, + "loss": 0.7989, + "step": 10699 + }, + { + "epoch": 6.034968979131416, + "grad_norm": 1.7341797351837158, + "learning_rate": 1.9827975183305135e-05, + "loss": 0.7136, + "step": 10700 + }, + { + "epoch": 6.035532994923858, + "grad_norm": 1.1273047924041748, + "learning_rate": 1.982515510434292e-05, + "loss": 0.7106, + "step": 10701 + }, + { + "epoch": 6.0360970107163, + "grad_norm": 1.0913852453231812, + "learning_rate": 1.9822335025380713e-05, + "loss": 0.7145, + "step": 10702 + }, + { + "epoch": 6.036661026508742, + "grad_norm": 1.202828288078308, + "learning_rate": 1.98195149464185e-05, + "loss": 0.7252, + "step": 10703 + }, + { + "epoch": 6.037225042301184, + "grad_norm": 1.3477071523666382, + "learning_rate": 1.981669486745629e-05, + "loss": 0.736, + "step": 10704 + }, + { + "epoch": 6.037789058093627, + "grad_norm": 1.6705968379974365, + "learning_rate": 1.981387478849408e-05, + "loss": 0.9049, + "step": 10705 + }, + { + "epoch": 6.038353073886069, + "grad_norm": 1.1317126750946045, + "learning_rate": 1.9811054709531868e-05, + "loss": 0.8718, + "step": 10706 + }, + { + "epoch": 6.038917089678511, + "grad_norm": 1.2016714811325073, + "learning_rate": 1.9808234630569657e-05, + "loss": 0.748, + "step": 10707 + }, + { + "epoch": 6.039481105470953, + "grad_norm": 1.2803444862365723, + "learning_rate": 1.9805414551607446e-05, + "loss": 0.7733, + "step": 10708 + }, + { + "epoch": 6.040045121263395, + "grad_norm": 1.0797145366668701, + "learning_rate": 1.9802594472645238e-05, + "loss": 0.6197, + "step": 10709 + }, + { + "epoch": 6.040609137055838, + "grad_norm": 1.297336220741272, + "learning_rate": 1.9799774393683023e-05, + "loss": 0.7218, + "step": 10710 + }, + { + "epoch": 6.0411731528482795, + "grad_norm": 1.3428130149841309, + "learning_rate": 1.9796954314720815e-05, + "loss": 0.7561, + "step": 10711 + }, + { + "epoch": 6.041737168640722, + "grad_norm": 1.0398257970809937, + "learning_rate": 1.97941342357586e-05, + "loss": 0.7411, + "step": 10712 + }, + { + "epoch": 6.042301184433164, + "grad_norm": 1.4327181577682495, + "learning_rate": 1.9791314156796393e-05, + "loss": 0.855, + "step": 10713 + }, + { + "epoch": 6.042865200225607, + "grad_norm": 1.126462459564209, + "learning_rate": 1.978849407783418e-05, + "loss": 0.7163, + "step": 10714 + }, + { + "epoch": 6.0434292160180485, + "grad_norm": 1.6042810678482056, + "learning_rate": 1.978567399887197e-05, + "loss": 0.8424, + "step": 10715 + }, + { + "epoch": 6.04399323181049, + "grad_norm": 1.1941596269607544, + "learning_rate": 1.978285391990976e-05, + "loss": 0.786, + "step": 10716 + }, + { + "epoch": 6.044557247602933, + "grad_norm": 1.0250065326690674, + "learning_rate": 1.9780033840947548e-05, + "loss": 0.7573, + "step": 10717 + }, + { + "epoch": 6.045121263395375, + "grad_norm": 1.08114492893219, + "learning_rate": 1.9777213761985337e-05, + "loss": 0.7202, + "step": 10718 + }, + { + "epoch": 6.0456852791878175, + "grad_norm": 0.8515208959579468, + "learning_rate": 1.9774393683023126e-05, + "loss": 0.7137, + "step": 10719 + }, + { + "epoch": 6.046249294980259, + "grad_norm": 1.0180681943893433, + "learning_rate": 1.9771573604060914e-05, + "loss": 0.7007, + "step": 10720 + }, + { + "epoch": 6.046813310772702, + "grad_norm": 1.087882399559021, + "learning_rate": 1.9768753525098703e-05, + "loss": 0.6875, + "step": 10721 + }, + { + "epoch": 6.047377326565144, + "grad_norm": 1.0189913511276245, + "learning_rate": 1.9765933446136492e-05, + "loss": 0.7187, + "step": 10722 + }, + { + "epoch": 6.047941342357586, + "grad_norm": 5.734337329864502, + "learning_rate": 1.976311336717428e-05, + "loss": 0.8027, + "step": 10723 + }, + { + "epoch": 6.048505358150028, + "grad_norm": 1.0434463024139404, + "learning_rate": 1.976029328821207e-05, + "loss": 0.7225, + "step": 10724 + }, + { + "epoch": 6.04906937394247, + "grad_norm": 1.868075966835022, + "learning_rate": 1.9757473209249862e-05, + "loss": 0.7701, + "step": 10725 + }, + { + "epoch": 6.049633389734913, + "grad_norm": 1.0406981706619263, + "learning_rate": 1.9754653130287647e-05, + "loss": 0.7163, + "step": 10726 + }, + { + "epoch": 6.050197405527355, + "grad_norm": 1.337268590927124, + "learning_rate": 1.975183305132544e-05, + "loss": 0.81, + "step": 10727 + }, + { + "epoch": 6.050761421319797, + "grad_norm": 0.9715641140937805, + "learning_rate": 1.9749012972363225e-05, + "loss": 0.688, + "step": 10728 + }, + { + "epoch": 6.051325437112239, + "grad_norm": 1.002236247062683, + "learning_rate": 1.9746192893401017e-05, + "loss": 0.6635, + "step": 10729 + }, + { + "epoch": 6.051889452904681, + "grad_norm": 0.9796140789985657, + "learning_rate": 1.9743372814438806e-05, + "loss": 0.7199, + "step": 10730 + }, + { + "epoch": 6.052453468697124, + "grad_norm": 1.7284467220306396, + "learning_rate": 1.9740552735476595e-05, + "loss": 0.6976, + "step": 10731 + }, + { + "epoch": 6.0530174844895654, + "grad_norm": 1.0943406820297241, + "learning_rate": 1.9737732656514383e-05, + "loss": 0.759, + "step": 10732 + }, + { + "epoch": 6.053581500282008, + "grad_norm": 1.2979663610458374, + "learning_rate": 1.9734912577552172e-05, + "loss": 0.6905, + "step": 10733 + }, + { + "epoch": 6.05414551607445, + "grad_norm": 0.9292561411857605, + "learning_rate": 1.973209249858996e-05, + "loss": 0.727, + "step": 10734 + }, + { + "epoch": 6.054709531866893, + "grad_norm": 1.2158046960830688, + "learning_rate": 1.972927241962775e-05, + "loss": 0.7989, + "step": 10735 + }, + { + "epoch": 6.0552735476593345, + "grad_norm": 1.1113293170928955, + "learning_rate": 1.972645234066554e-05, + "loss": 0.7226, + "step": 10736 + }, + { + "epoch": 6.055837563451776, + "grad_norm": 1.034735083580017, + "learning_rate": 1.9723632261703327e-05, + "loss": 0.7714, + "step": 10737 + }, + { + "epoch": 6.056401579244219, + "grad_norm": 0.9774123430252075, + "learning_rate": 1.972081218274112e-05, + "loss": 0.6768, + "step": 10738 + }, + { + "epoch": 6.056965595036661, + "grad_norm": 1.2169119119644165, + "learning_rate": 1.971799210377891e-05, + "loss": 0.673, + "step": 10739 + }, + { + "epoch": 6.0575296108291035, + "grad_norm": 1.0247308015823364, + "learning_rate": 1.9715172024816697e-05, + "loss": 0.6919, + "step": 10740 + }, + { + "epoch": 6.058093626621545, + "grad_norm": 1.0377901792526245, + "learning_rate": 1.9712351945854486e-05, + "loss": 0.718, + "step": 10741 + }, + { + "epoch": 6.058657642413988, + "grad_norm": 1.239364504814148, + "learning_rate": 1.9709531866892275e-05, + "loss": 0.7498, + "step": 10742 + }, + { + "epoch": 6.05922165820643, + "grad_norm": 1.452962875366211, + "learning_rate": 1.9706711787930064e-05, + "loss": 0.7963, + "step": 10743 + }, + { + "epoch": 6.059785673998872, + "grad_norm": 1.0098798274993896, + "learning_rate": 1.9703891708967852e-05, + "loss": 0.699, + "step": 10744 + }, + { + "epoch": 6.060349689791314, + "grad_norm": 1.47590172290802, + "learning_rate": 1.970107163000564e-05, + "loss": 0.7761, + "step": 10745 + }, + { + "epoch": 6.060913705583756, + "grad_norm": 1.2502025365829468, + "learning_rate": 1.969825155104343e-05, + "loss": 0.7486, + "step": 10746 + }, + { + "epoch": 6.061477721376199, + "grad_norm": 0.9649451971054077, + "learning_rate": 1.969543147208122e-05, + "loss": 0.7675, + "step": 10747 + }, + { + "epoch": 6.062041737168641, + "grad_norm": 17.14453887939453, + "learning_rate": 1.969261139311901e-05, + "loss": 1.1032, + "step": 10748 + }, + { + "epoch": 6.062605752961083, + "grad_norm": 1.446341872215271, + "learning_rate": 1.9689791314156796e-05, + "loss": 0.8649, + "step": 10749 + }, + { + "epoch": 6.063169768753525, + "grad_norm": 1.204288125038147, + "learning_rate": 1.968697123519459e-05, + "loss": 0.6926, + "step": 10750 + }, + { + "epoch": 6.063733784545967, + "grad_norm": 1.3890001773834229, + "learning_rate": 1.9684151156232374e-05, + "loss": 0.7622, + "step": 10751 + }, + { + "epoch": 6.06429780033841, + "grad_norm": 1.2295304536819458, + "learning_rate": 1.9681331077270166e-05, + "loss": 0.8297, + "step": 10752 + }, + { + "epoch": 6.064861816130851, + "grad_norm": 1.4684715270996094, + "learning_rate": 1.967851099830795e-05, + "loss": 0.7451, + "step": 10753 + }, + { + "epoch": 6.065425831923294, + "grad_norm": 1.1441468000411987, + "learning_rate": 1.9675690919345744e-05, + "loss": 0.774, + "step": 10754 + }, + { + "epoch": 6.065989847715736, + "grad_norm": 1.3495943546295166, + "learning_rate": 1.9672870840383533e-05, + "loss": 0.8747, + "step": 10755 + }, + { + "epoch": 6.066553863508179, + "grad_norm": 1.5501290559768677, + "learning_rate": 1.967005076142132e-05, + "loss": 0.8153, + "step": 10756 + }, + { + "epoch": 6.06711787930062, + "grad_norm": 0.9333034157752991, + "learning_rate": 1.966723068245911e-05, + "loss": 0.7331, + "step": 10757 + }, + { + "epoch": 6.067681895093062, + "grad_norm": 1.3356519937515259, + "learning_rate": 1.96644106034969e-05, + "loss": 0.8099, + "step": 10758 + }, + { + "epoch": 6.068245910885505, + "grad_norm": 1.0298283100128174, + "learning_rate": 1.9661590524534688e-05, + "loss": 0.753, + "step": 10759 + }, + { + "epoch": 6.068809926677947, + "grad_norm": 1.1062687635421753, + "learning_rate": 1.9658770445572477e-05, + "loss": 0.7184, + "step": 10760 + }, + { + "epoch": 6.069373942470389, + "grad_norm": 1.1687310934066772, + "learning_rate": 1.9655950366610265e-05, + "loss": 0.7014, + "step": 10761 + }, + { + "epoch": 6.069937958262831, + "grad_norm": 1.3799554109573364, + "learning_rate": 1.9653130287648054e-05, + "loss": 0.7152, + "step": 10762 + }, + { + "epoch": 6.070501974055274, + "grad_norm": 1.19923734664917, + "learning_rate": 1.9650310208685843e-05, + "loss": 0.6745, + "step": 10763 + }, + { + "epoch": 6.071065989847716, + "grad_norm": 1.13346529006958, + "learning_rate": 1.9647490129723635e-05, + "loss": 0.7502, + "step": 10764 + }, + { + "epoch": 6.0716300056401575, + "grad_norm": 1.6345431804656982, + "learning_rate": 1.964467005076142e-05, + "loss": 0.8504, + "step": 10765 + }, + { + "epoch": 6.0721940214326, + "grad_norm": 1.0667277574539185, + "learning_rate": 1.9641849971799213e-05, + "loss": 0.7207, + "step": 10766 + }, + { + "epoch": 6.072758037225042, + "grad_norm": 1.078487515449524, + "learning_rate": 1.9639029892836998e-05, + "loss": 0.6539, + "step": 10767 + }, + { + "epoch": 6.073322053017485, + "grad_norm": 1.7735774517059326, + "learning_rate": 1.963620981387479e-05, + "loss": 0.7579, + "step": 10768 + }, + { + "epoch": 6.0738860688099265, + "grad_norm": 0.9654979109764099, + "learning_rate": 1.9633389734912576e-05, + "loss": 0.82, + "step": 10769 + }, + { + "epoch": 6.074450084602369, + "grad_norm": 1.0125954151153564, + "learning_rate": 1.9630569655950368e-05, + "loss": 0.6039, + "step": 10770 + }, + { + "epoch": 6.075014100394811, + "grad_norm": 1.390116572380066, + "learning_rate": 1.9627749576988157e-05, + "loss": 0.7918, + "step": 10771 + }, + { + "epoch": 6.075578116187253, + "grad_norm": 0.8850312232971191, + "learning_rate": 1.9624929498025945e-05, + "loss": 0.6322, + "step": 10772 + }, + { + "epoch": 6.0761421319796955, + "grad_norm": 0.9914840459823608, + "learning_rate": 1.9622109419063734e-05, + "loss": 0.6623, + "step": 10773 + }, + { + "epoch": 6.076706147772137, + "grad_norm": 1.2304801940917969, + "learning_rate": 1.9619289340101523e-05, + "loss": 0.7082, + "step": 10774 + }, + { + "epoch": 6.07727016356458, + "grad_norm": 0.9654090404510498, + "learning_rate": 1.9616469261139315e-05, + "loss": 0.6432, + "step": 10775 + }, + { + "epoch": 6.077834179357022, + "grad_norm": 0.981464684009552, + "learning_rate": 1.96136491821771e-05, + "loss": 0.725, + "step": 10776 + }, + { + "epoch": 6.0783981951494646, + "grad_norm": 1.1930654048919678, + "learning_rate": 1.9610829103214893e-05, + "loss": 0.7856, + "step": 10777 + }, + { + "epoch": 6.078962210941906, + "grad_norm": 1.4893823862075806, + "learning_rate": 1.9608009024252678e-05, + "loss": 0.7885, + "step": 10778 + }, + { + "epoch": 6.079526226734348, + "grad_norm": 1.3832603693008423, + "learning_rate": 1.960518894529047e-05, + "loss": 0.7826, + "step": 10779 + }, + { + "epoch": 6.080090242526791, + "grad_norm": 1.2753221988677979, + "learning_rate": 1.960236886632826e-05, + "loss": 0.7435, + "step": 10780 + }, + { + "epoch": 6.080654258319233, + "grad_norm": 1.3696049451828003, + "learning_rate": 1.9599548787366048e-05, + "loss": 0.8712, + "step": 10781 + }, + { + "epoch": 6.081218274111675, + "grad_norm": 1.0048516988754272, + "learning_rate": 1.9596728708403837e-05, + "loss": 0.705, + "step": 10782 + }, + { + "epoch": 6.081782289904117, + "grad_norm": 1.4841090440750122, + "learning_rate": 1.9593908629441626e-05, + "loss": 0.8113, + "step": 10783 + }, + { + "epoch": 6.08234630569656, + "grad_norm": 1.002677321434021, + "learning_rate": 1.9591088550479414e-05, + "loss": 0.6448, + "step": 10784 + }, + { + "epoch": 6.082910321489002, + "grad_norm": 1.1306666135787964, + "learning_rate": 1.9588268471517203e-05, + "loss": 0.6587, + "step": 10785 + }, + { + "epoch": 6.0834743372814435, + "grad_norm": 1.3725197315216064, + "learning_rate": 1.9585448392554992e-05, + "loss": 0.9031, + "step": 10786 + }, + { + "epoch": 6.084038353073886, + "grad_norm": 1.1182283163070679, + "learning_rate": 1.958262831359278e-05, + "loss": 0.7419, + "step": 10787 + }, + { + "epoch": 6.084602368866328, + "grad_norm": 1.0074297189712524, + "learning_rate": 1.957980823463057e-05, + "loss": 0.723, + "step": 10788 + }, + { + "epoch": 6.085166384658771, + "grad_norm": 1.1232067346572876, + "learning_rate": 1.9576988155668362e-05, + "loss": 0.8325, + "step": 10789 + }, + { + "epoch": 6.0857304004512125, + "grad_norm": 1.212904453277588, + "learning_rate": 1.9574168076706147e-05, + "loss": 0.727, + "step": 10790 + }, + { + "epoch": 6.086294416243655, + "grad_norm": 3.94256854057312, + "learning_rate": 1.957134799774394e-05, + "loss": 0.7106, + "step": 10791 + }, + { + "epoch": 6.086858432036097, + "grad_norm": 0.9246199727058411, + "learning_rate": 1.9568527918781725e-05, + "loss": 0.7527, + "step": 10792 + }, + { + "epoch": 6.087422447828539, + "grad_norm": 1.3919661045074463, + "learning_rate": 1.9565707839819517e-05, + "loss": 0.7525, + "step": 10793 + }, + { + "epoch": 6.0879864636209815, + "grad_norm": 1.097495436668396, + "learning_rate": 1.9562887760857306e-05, + "loss": 0.7671, + "step": 10794 + }, + { + "epoch": 6.088550479413423, + "grad_norm": 1.0903276205062866, + "learning_rate": 1.9560067681895095e-05, + "loss": 0.7415, + "step": 10795 + }, + { + "epoch": 6.089114495205866, + "grad_norm": 1.1159311532974243, + "learning_rate": 1.9557247602932883e-05, + "loss": 0.7248, + "step": 10796 + }, + { + "epoch": 6.089678510998308, + "grad_norm": 0.9850367903709412, + "learning_rate": 1.9554427523970672e-05, + "loss": 0.6606, + "step": 10797 + }, + { + "epoch": 6.0902425267907505, + "grad_norm": 0.8555707931518555, + "learning_rate": 1.955160744500846e-05, + "loss": 0.5785, + "step": 10798 + }, + { + "epoch": 6.090806542583192, + "grad_norm": 1.3033831119537354, + "learning_rate": 1.954878736604625e-05, + "loss": 0.8193, + "step": 10799 + }, + { + "epoch": 6.091370558375634, + "grad_norm": 1.169226884841919, + "learning_rate": 1.954596728708404e-05, + "loss": 0.7345, + "step": 10800 + }, + { + "epoch": 6.091934574168077, + "grad_norm": 1.3484041690826416, + "learning_rate": 1.9543147208121827e-05, + "loss": 0.8512, + "step": 10801 + }, + { + "epoch": 6.092498589960519, + "grad_norm": 1.1296154260635376, + "learning_rate": 1.9540327129159616e-05, + "loss": 0.7773, + "step": 10802 + }, + { + "epoch": 6.093062605752961, + "grad_norm": 0.951311469078064, + "learning_rate": 1.953750705019741e-05, + "loss": 0.6755, + "step": 10803 + }, + { + "epoch": 6.093626621545403, + "grad_norm": 0.9024459719657898, + "learning_rate": 1.9534686971235194e-05, + "loss": 0.6435, + "step": 10804 + }, + { + "epoch": 6.094190637337846, + "grad_norm": 1.3756221532821655, + "learning_rate": 1.9531866892272986e-05, + "loss": 0.7469, + "step": 10805 + }, + { + "epoch": 6.094754653130288, + "grad_norm": 1.1470253467559814, + "learning_rate": 1.952904681331077e-05, + "loss": 0.6643, + "step": 10806 + }, + { + "epoch": 6.095318668922729, + "grad_norm": 1.2522310018539429, + "learning_rate": 1.9526226734348564e-05, + "loss": 0.759, + "step": 10807 + }, + { + "epoch": 6.095882684715172, + "grad_norm": 1.0990455150604248, + "learning_rate": 1.952340665538635e-05, + "loss": 0.6959, + "step": 10808 + }, + { + "epoch": 6.096446700507614, + "grad_norm": 1.2064452171325684, + "learning_rate": 1.952058657642414e-05, + "loss": 0.7271, + "step": 10809 + }, + { + "epoch": 6.097010716300057, + "grad_norm": 1.025922179222107, + "learning_rate": 1.951776649746193e-05, + "loss": 0.6927, + "step": 10810 + }, + { + "epoch": 6.0975747320924985, + "grad_norm": 0.9920139312744141, + "learning_rate": 1.951494641849972e-05, + "loss": 0.7759, + "step": 10811 + }, + { + "epoch": 6.098138747884941, + "grad_norm": 1.0243693590164185, + "learning_rate": 1.951212633953751e-05, + "loss": 0.729, + "step": 10812 + }, + { + "epoch": 6.098702763677383, + "grad_norm": 1.5794414281845093, + "learning_rate": 1.9509306260575296e-05, + "loss": 0.7909, + "step": 10813 + }, + { + "epoch": 6.099266779469825, + "grad_norm": 1.3814973831176758, + "learning_rate": 1.950648618161309e-05, + "loss": 0.8304, + "step": 10814 + }, + { + "epoch": 6.0998307952622675, + "grad_norm": 1.7694456577301025, + "learning_rate": 1.9503666102650874e-05, + "loss": 0.8694, + "step": 10815 + }, + { + "epoch": 6.100394811054709, + "grad_norm": 1.1724203824996948, + "learning_rate": 1.9500846023688666e-05, + "loss": 0.7027, + "step": 10816 + }, + { + "epoch": 6.100958826847152, + "grad_norm": 1.3806493282318115, + "learning_rate": 1.949802594472645e-05, + "loss": 0.8502, + "step": 10817 + }, + { + "epoch": 6.101522842639594, + "grad_norm": 1.0370161533355713, + "learning_rate": 1.9495205865764244e-05, + "loss": 0.7678, + "step": 10818 + }, + { + "epoch": 6.1020868584320365, + "grad_norm": 1.6015231609344482, + "learning_rate": 1.9492385786802032e-05, + "loss": 0.7701, + "step": 10819 + }, + { + "epoch": 6.102650874224478, + "grad_norm": 1.5386055707931519, + "learning_rate": 1.948956570783982e-05, + "loss": 0.6853, + "step": 10820 + }, + { + "epoch": 6.10321489001692, + "grad_norm": 0.8513607382774353, + "learning_rate": 1.948674562887761e-05, + "loss": 0.6561, + "step": 10821 + }, + { + "epoch": 6.103778905809363, + "grad_norm": 1.4016411304473877, + "learning_rate": 1.94839255499154e-05, + "loss": 0.8232, + "step": 10822 + }, + { + "epoch": 6.104342921601805, + "grad_norm": 1.2308202981948853, + "learning_rate": 1.9481105470953188e-05, + "loss": 0.7197, + "step": 10823 + }, + { + "epoch": 6.104906937394247, + "grad_norm": 1.0839978456497192, + "learning_rate": 1.9478285391990976e-05, + "loss": 0.7541, + "step": 10824 + }, + { + "epoch": 6.105470953186689, + "grad_norm": 0.9458489418029785, + "learning_rate": 1.9475465313028765e-05, + "loss": 0.7441, + "step": 10825 + }, + { + "epoch": 6.106034968979132, + "grad_norm": 1.3590142726898193, + "learning_rate": 1.9472645234066554e-05, + "loss": 0.7531, + "step": 10826 + }, + { + "epoch": 6.106598984771574, + "grad_norm": 0.8915112018585205, + "learning_rate": 1.9469825155104343e-05, + "loss": 0.7342, + "step": 10827 + }, + { + "epoch": 6.107163000564015, + "grad_norm": 1.256093144416809, + "learning_rate": 1.9467005076142135e-05, + "loss": 0.6969, + "step": 10828 + }, + { + "epoch": 6.107727016356458, + "grad_norm": 1.1145893335342407, + "learning_rate": 1.946418499717992e-05, + "loss": 0.7499, + "step": 10829 + }, + { + "epoch": 6.1082910321489, + "grad_norm": 1.0563125610351562, + "learning_rate": 1.9461364918217713e-05, + "loss": 0.5875, + "step": 10830 + }, + { + "epoch": 6.108855047941343, + "grad_norm": 1.0874212980270386, + "learning_rate": 1.9458544839255498e-05, + "loss": 0.6881, + "step": 10831 + }, + { + "epoch": 6.109419063733784, + "grad_norm": 0.845644474029541, + "learning_rate": 1.945572476029329e-05, + "loss": 0.6298, + "step": 10832 + }, + { + "epoch": 6.109983079526227, + "grad_norm": 1.807312250137329, + "learning_rate": 1.9452904681331076e-05, + "loss": 0.8483, + "step": 10833 + }, + { + "epoch": 6.110547095318669, + "grad_norm": 1.1223106384277344, + "learning_rate": 1.9450084602368868e-05, + "loss": 0.75, + "step": 10834 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.8630914092063904, + "learning_rate": 1.9447264523406657e-05, + "loss": 0.7142, + "step": 10835 + }, + { + "epoch": 6.111675126903553, + "grad_norm": 1.2413203716278076, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.6194, + "step": 10836 + }, + { + "epoch": 6.112239142695995, + "grad_norm": 1.3324930667877197, + "learning_rate": 1.9441624365482234e-05, + "loss": 0.6833, + "step": 10837 + }, + { + "epoch": 6.112803158488438, + "grad_norm": 1.4320051670074463, + "learning_rate": 1.9438804286520023e-05, + "loss": 0.7593, + "step": 10838 + }, + { + "epoch": 6.11336717428088, + "grad_norm": 1.2493716478347778, + "learning_rate": 1.9435984207557812e-05, + "loss": 0.7764, + "step": 10839 + }, + { + "epoch": 6.113931190073322, + "grad_norm": 1.2176166772842407, + "learning_rate": 1.94331641285956e-05, + "loss": 0.7722, + "step": 10840 + }, + { + "epoch": 6.114495205865764, + "grad_norm": 1.0924358367919922, + "learning_rate": 1.943034404963339e-05, + "loss": 0.7161, + "step": 10841 + }, + { + "epoch": 6.115059221658206, + "grad_norm": 1.1731772422790527, + "learning_rate": 1.9427523970671178e-05, + "loss": 0.7149, + "step": 10842 + }, + { + "epoch": 6.115623237450649, + "grad_norm": 0.9036427736282349, + "learning_rate": 1.9424703891708967e-05, + "loss": 0.7274, + "step": 10843 + }, + { + "epoch": 6.1161872532430905, + "grad_norm": 1.470563292503357, + "learning_rate": 1.942188381274676e-05, + "loss": 0.7811, + "step": 10844 + }, + { + "epoch": 6.116751269035533, + "grad_norm": 1.0802559852600098, + "learning_rate": 1.9419063733784548e-05, + "loss": 0.71, + "step": 10845 + }, + { + "epoch": 6.117315284827975, + "grad_norm": 2.1447174549102783, + "learning_rate": 1.9416243654822337e-05, + "loss": 0.7826, + "step": 10846 + }, + { + "epoch": 6.117879300620418, + "grad_norm": 1.4852546453475952, + "learning_rate": 1.9413423575860126e-05, + "loss": 0.6843, + "step": 10847 + }, + { + "epoch": 6.1184433164128595, + "grad_norm": 0.9973083138465881, + "learning_rate": 1.9410603496897914e-05, + "loss": 0.7623, + "step": 10848 + }, + { + "epoch": 6.119007332205301, + "grad_norm": 0.8620861172676086, + "learning_rate": 1.9407783417935703e-05, + "loss": 0.6231, + "step": 10849 + }, + { + "epoch": 6.119571347997744, + "grad_norm": 1.3913800716400146, + "learning_rate": 1.9404963338973492e-05, + "loss": 0.8847, + "step": 10850 + }, + { + "epoch": 6.120135363790186, + "grad_norm": 0.8476279377937317, + "learning_rate": 1.940214326001128e-05, + "loss": 0.6919, + "step": 10851 + }, + { + "epoch": 6.1206993795826286, + "grad_norm": 0.9407113790512085, + "learning_rate": 1.939932318104907e-05, + "loss": 0.7751, + "step": 10852 + }, + { + "epoch": 6.12126339537507, + "grad_norm": 1.1548956632614136, + "learning_rate": 1.939650310208686e-05, + "loss": 0.7033, + "step": 10853 + }, + { + "epoch": 6.121827411167513, + "grad_norm": 0.78803551197052, + "learning_rate": 1.9393683023124647e-05, + "loss": 0.6231, + "step": 10854 + }, + { + "epoch": 6.122391426959955, + "grad_norm": 1.191227674484253, + "learning_rate": 1.939086294416244e-05, + "loss": 0.7263, + "step": 10855 + }, + { + "epoch": 6.122955442752397, + "grad_norm": 1.0709519386291504, + "learning_rate": 1.9388042865200225e-05, + "loss": 0.6761, + "step": 10856 + }, + { + "epoch": 6.123519458544839, + "grad_norm": 1.0859348773956299, + "learning_rate": 1.9385222786238017e-05, + "loss": 0.7141, + "step": 10857 + }, + { + "epoch": 6.124083474337281, + "grad_norm": 1.066923975944519, + "learning_rate": 1.9382402707275806e-05, + "loss": 0.7975, + "step": 10858 + }, + { + "epoch": 6.124647490129724, + "grad_norm": 0.9951345324516296, + "learning_rate": 1.9379582628313594e-05, + "loss": 0.7893, + "step": 10859 + }, + { + "epoch": 6.125211505922166, + "grad_norm": 1.002539038658142, + "learning_rate": 1.9376762549351383e-05, + "loss": 0.7152, + "step": 10860 + }, + { + "epoch": 6.125775521714608, + "grad_norm": 0.8988202214241028, + "learning_rate": 1.9373942470389172e-05, + "loss": 0.7469, + "step": 10861 + }, + { + "epoch": 6.12633953750705, + "grad_norm": 1.238853096961975, + "learning_rate": 1.937112239142696e-05, + "loss": 0.8003, + "step": 10862 + }, + { + "epoch": 6.126903553299492, + "grad_norm": 1.0647612810134888, + "learning_rate": 1.936830231246475e-05, + "loss": 0.761, + "step": 10863 + }, + { + "epoch": 6.127467569091935, + "grad_norm": 1.349487543106079, + "learning_rate": 1.936548223350254e-05, + "loss": 0.7093, + "step": 10864 + }, + { + "epoch": 6.1280315848843765, + "grad_norm": 1.1567606925964355, + "learning_rate": 1.9362662154540327e-05, + "loss": 0.672, + "step": 10865 + }, + { + "epoch": 6.128595600676819, + "grad_norm": 1.15083909034729, + "learning_rate": 1.9359842075578116e-05, + "loss": 0.7569, + "step": 10866 + }, + { + "epoch": 6.129159616469261, + "grad_norm": 1.7477377653121948, + "learning_rate": 1.9357021996615908e-05, + "loss": 0.8808, + "step": 10867 + }, + { + "epoch": 6.129723632261704, + "grad_norm": 0.8391011357307434, + "learning_rate": 1.9354201917653694e-05, + "loss": 0.5769, + "step": 10868 + }, + { + "epoch": 6.1302876480541455, + "grad_norm": 1.637311339378357, + "learning_rate": 1.9351381838691486e-05, + "loss": 0.8399, + "step": 10869 + }, + { + "epoch": 6.130851663846587, + "grad_norm": 1.4945999383926392, + "learning_rate": 1.934856175972927e-05, + "loss": 0.8179, + "step": 10870 + }, + { + "epoch": 6.13141567963903, + "grad_norm": 1.5259292125701904, + "learning_rate": 1.9345741680767063e-05, + "loss": 0.828, + "step": 10871 + }, + { + "epoch": 6.131979695431472, + "grad_norm": 0.9563044309616089, + "learning_rate": 1.934292160180485e-05, + "loss": 0.6842, + "step": 10872 + }, + { + "epoch": 6.1325437112239145, + "grad_norm": 1.4402915239334106, + "learning_rate": 1.934010152284264e-05, + "loss": 0.8192, + "step": 10873 + }, + { + "epoch": 6.133107727016356, + "grad_norm": 1.19093918800354, + "learning_rate": 1.933728144388043e-05, + "loss": 0.7311, + "step": 10874 + }, + { + "epoch": 6.133671742808799, + "grad_norm": 0.9426616430282593, + "learning_rate": 1.933446136491822e-05, + "loss": 0.7157, + "step": 10875 + }, + { + "epoch": 6.134235758601241, + "grad_norm": 1.4486050605773926, + "learning_rate": 1.9331641285956007e-05, + "loss": 0.8094, + "step": 10876 + }, + { + "epoch": 6.134799774393683, + "grad_norm": 1.3001317977905273, + "learning_rate": 1.9328821206993796e-05, + "loss": 0.7817, + "step": 10877 + }, + { + "epoch": 6.135363790186125, + "grad_norm": 1.0414704084396362, + "learning_rate": 1.9326001128031585e-05, + "loss": 0.8102, + "step": 10878 + }, + { + "epoch": 6.135927805978567, + "grad_norm": 1.0591089725494385, + "learning_rate": 1.9323181049069374e-05, + "loss": 0.7369, + "step": 10879 + }, + { + "epoch": 6.13649182177101, + "grad_norm": 1.0413798093795776, + "learning_rate": 1.9320360970107166e-05, + "loss": 0.6904, + "step": 10880 + }, + { + "epoch": 6.137055837563452, + "grad_norm": 1.2104754447937012, + "learning_rate": 1.931754089114495e-05, + "loss": 0.6462, + "step": 10881 + }, + { + "epoch": 6.137619853355894, + "grad_norm": 1.2367234230041504, + "learning_rate": 1.9314720812182744e-05, + "loss": 0.7511, + "step": 10882 + }, + { + "epoch": 6.138183869148336, + "grad_norm": 1.046377182006836, + "learning_rate": 1.9311900733220532e-05, + "loss": 0.7702, + "step": 10883 + }, + { + "epoch": 6.138747884940778, + "grad_norm": 1.4566642045974731, + "learning_rate": 1.930908065425832e-05, + "loss": 0.8319, + "step": 10884 + }, + { + "epoch": 6.139311900733221, + "grad_norm": 1.3580135107040405, + "learning_rate": 1.930626057529611e-05, + "loss": 0.7346, + "step": 10885 + }, + { + "epoch": 6.1398759165256624, + "grad_norm": 1.1091444492340088, + "learning_rate": 1.93034404963339e-05, + "loss": 0.75, + "step": 10886 + }, + { + "epoch": 6.140439932318105, + "grad_norm": 1.0809426307678223, + "learning_rate": 1.9300620417371688e-05, + "loss": 0.7042, + "step": 10887 + }, + { + "epoch": 6.141003948110547, + "grad_norm": 0.8751786947250366, + "learning_rate": 1.9297800338409476e-05, + "loss": 0.7557, + "step": 10888 + }, + { + "epoch": 6.14156796390299, + "grad_norm": 1.5522228479385376, + "learning_rate": 1.9294980259447265e-05, + "loss": 0.8159, + "step": 10889 + }, + { + "epoch": 6.1421319796954315, + "grad_norm": 1.1523452997207642, + "learning_rate": 1.9292160180485054e-05, + "loss": 0.7838, + "step": 10890 + }, + { + "epoch": 6.142695995487873, + "grad_norm": 0.8802281022071838, + "learning_rate": 1.9289340101522843e-05, + "loss": 0.686, + "step": 10891 + }, + { + "epoch": 6.143260011280316, + "grad_norm": 1.231825590133667, + "learning_rate": 1.9286520022560635e-05, + "loss": 0.7212, + "step": 10892 + }, + { + "epoch": 6.143824027072758, + "grad_norm": 1.2257674932479858, + "learning_rate": 1.928369994359842e-05, + "loss": 0.7529, + "step": 10893 + }, + { + "epoch": 6.1443880428652005, + "grad_norm": 1.4001964330673218, + "learning_rate": 1.9280879864636213e-05, + "loss": 0.8292, + "step": 10894 + }, + { + "epoch": 6.144952058657642, + "grad_norm": 1.1147241592407227, + "learning_rate": 1.9278059785673998e-05, + "loss": 0.8006, + "step": 10895 + }, + { + "epoch": 6.145516074450085, + "grad_norm": 1.1877473592758179, + "learning_rate": 1.927523970671179e-05, + "loss": 0.7885, + "step": 10896 + }, + { + "epoch": 6.146080090242527, + "grad_norm": 1.172204613685608, + "learning_rate": 1.9272419627749576e-05, + "loss": 0.6846, + "step": 10897 + }, + { + "epoch": 6.146644106034969, + "grad_norm": 1.421626091003418, + "learning_rate": 1.9269599548787368e-05, + "loss": 0.8045, + "step": 10898 + }, + { + "epoch": 6.147208121827411, + "grad_norm": 1.1689796447753906, + "learning_rate": 1.9266779469825156e-05, + "loss": 0.7916, + "step": 10899 + }, + { + "epoch": 6.147772137619853, + "grad_norm": 1.2751293182373047, + "learning_rate": 1.9263959390862945e-05, + "loss": 0.6896, + "step": 10900 + }, + { + "epoch": 6.148336153412296, + "grad_norm": 1.14472496509552, + "learning_rate": 1.9261139311900734e-05, + "loss": 0.8575, + "step": 10901 + }, + { + "epoch": 6.148900169204738, + "grad_norm": 1.1403460502624512, + "learning_rate": 1.9258319232938523e-05, + "loss": 0.7378, + "step": 10902 + }, + { + "epoch": 6.14946418499718, + "grad_norm": 1.5820733308792114, + "learning_rate": 1.925549915397631e-05, + "loss": 0.8337, + "step": 10903 + }, + { + "epoch": 6.150028200789622, + "grad_norm": 0.9897407293319702, + "learning_rate": 1.92526790750141e-05, + "loss": 0.7159, + "step": 10904 + }, + { + "epoch": 6.150592216582064, + "grad_norm": 1.0501095056533813, + "learning_rate": 1.924985899605189e-05, + "loss": 0.6817, + "step": 10905 + }, + { + "epoch": 6.151156232374507, + "grad_norm": 1.0467082262039185, + "learning_rate": 1.9247038917089678e-05, + "loss": 0.7313, + "step": 10906 + }, + { + "epoch": 6.151720248166948, + "grad_norm": 1.0241883993148804, + "learning_rate": 1.9244218838127467e-05, + "loss": 0.7025, + "step": 10907 + }, + { + "epoch": 6.152284263959391, + "grad_norm": 1.1540518999099731, + "learning_rate": 1.924139875916526e-05, + "loss": 0.7514, + "step": 10908 + }, + { + "epoch": 6.152848279751833, + "grad_norm": 0.8052049875259399, + "learning_rate": 1.9238578680203044e-05, + "loss": 0.6406, + "step": 10909 + }, + { + "epoch": 6.153412295544276, + "grad_norm": 0.9145747423171997, + "learning_rate": 1.9235758601240837e-05, + "loss": 0.7315, + "step": 10910 + }, + { + "epoch": 6.153976311336717, + "grad_norm": 1.06717050075531, + "learning_rate": 1.9232938522278622e-05, + "loss": 0.783, + "step": 10911 + }, + { + "epoch": 6.154540327129159, + "grad_norm": 0.9613760709762573, + "learning_rate": 1.9230118443316414e-05, + "loss": 0.7092, + "step": 10912 + }, + { + "epoch": 6.155104342921602, + "grad_norm": 1.2529385089874268, + "learning_rate": 1.9227298364354203e-05, + "loss": 0.7624, + "step": 10913 + }, + { + "epoch": 6.155668358714044, + "grad_norm": 1.2813637256622314, + "learning_rate": 1.9224478285391992e-05, + "loss": 0.783, + "step": 10914 + }, + { + "epoch": 6.156232374506486, + "grad_norm": 1.4257696866989136, + "learning_rate": 1.922165820642978e-05, + "loss": 0.7972, + "step": 10915 + }, + { + "epoch": 6.156796390298928, + "grad_norm": 1.019484281539917, + "learning_rate": 1.921883812746757e-05, + "loss": 0.748, + "step": 10916 + }, + { + "epoch": 6.157360406091371, + "grad_norm": 0.857812225818634, + "learning_rate": 1.921601804850536e-05, + "loss": 0.7025, + "step": 10917 + }, + { + "epoch": 6.157924421883813, + "grad_norm": 1.0167208909988403, + "learning_rate": 1.9213197969543147e-05, + "loss": 0.7764, + "step": 10918 + }, + { + "epoch": 6.1584884376762545, + "grad_norm": 1.1248027086257935, + "learning_rate": 1.921037789058094e-05, + "loss": 0.7232, + "step": 10919 + }, + { + "epoch": 6.159052453468697, + "grad_norm": 1.1837366819381714, + "learning_rate": 1.9207557811618725e-05, + "loss": 0.7784, + "step": 10920 + }, + { + "epoch": 6.159616469261139, + "grad_norm": 1.2688230276107788, + "learning_rate": 1.9204737732656517e-05, + "loss": 0.794, + "step": 10921 + }, + { + "epoch": 6.160180485053582, + "grad_norm": 1.5353899002075195, + "learning_rate": 1.9201917653694306e-05, + "loss": 0.7653, + "step": 10922 + }, + { + "epoch": 6.1607445008460235, + "grad_norm": 1.2481131553649902, + "learning_rate": 1.9199097574732094e-05, + "loss": 0.6817, + "step": 10923 + }, + { + "epoch": 6.161308516638466, + "grad_norm": 1.1089850664138794, + "learning_rate": 1.9196277495769883e-05, + "loss": 0.7243, + "step": 10924 + }, + { + "epoch": 6.161872532430908, + "grad_norm": 1.713175654411316, + "learning_rate": 1.9193457416807672e-05, + "loss": 0.7861, + "step": 10925 + }, + { + "epoch": 6.16243654822335, + "grad_norm": 1.2981691360473633, + "learning_rate": 1.919063733784546e-05, + "loss": 0.8772, + "step": 10926 + }, + { + "epoch": 6.1630005640157925, + "grad_norm": 0.9440165162086487, + "learning_rate": 1.918781725888325e-05, + "loss": 0.7015, + "step": 10927 + }, + { + "epoch": 6.163564579808234, + "grad_norm": 1.1396845579147339, + "learning_rate": 1.918499717992104e-05, + "loss": 0.7898, + "step": 10928 + }, + { + "epoch": 6.164128595600677, + "grad_norm": 1.1148595809936523, + "learning_rate": 1.9182177100958827e-05, + "loss": 0.7668, + "step": 10929 + }, + { + "epoch": 6.164692611393119, + "grad_norm": 1.190342903137207, + "learning_rate": 1.9179357021996616e-05, + "loss": 0.7754, + "step": 10930 + }, + { + "epoch": 6.1652566271855616, + "grad_norm": 0.962790846824646, + "learning_rate": 1.9176536943034408e-05, + "loss": 0.6948, + "step": 10931 + }, + { + "epoch": 6.165820642978003, + "grad_norm": 1.4930499792099, + "learning_rate": 1.9173716864072194e-05, + "loss": 0.7255, + "step": 10932 + }, + { + "epoch": 6.166384658770445, + "grad_norm": 1.0811649560928345, + "learning_rate": 1.9170896785109986e-05, + "loss": 0.7226, + "step": 10933 + }, + { + "epoch": 6.166948674562888, + "grad_norm": 1.0419763326644897, + "learning_rate": 1.916807670614777e-05, + "loss": 0.6834, + "step": 10934 + }, + { + "epoch": 6.16751269035533, + "grad_norm": 0.8742290139198303, + "learning_rate": 1.9165256627185563e-05, + "loss": 0.7018, + "step": 10935 + }, + { + "epoch": 6.168076706147772, + "grad_norm": 1.2995272874832153, + "learning_rate": 1.916243654822335e-05, + "loss": 0.7687, + "step": 10936 + }, + { + "epoch": 6.168640721940214, + "grad_norm": 1.1742956638336182, + "learning_rate": 1.915961646926114e-05, + "loss": 0.7687, + "step": 10937 + }, + { + "epoch": 6.169204737732657, + "grad_norm": 1.177295207977295, + "learning_rate": 1.915679639029893e-05, + "loss": 0.7225, + "step": 10938 + }, + { + "epoch": 6.169768753525099, + "grad_norm": 1.3009687662124634, + "learning_rate": 1.915397631133672e-05, + "loss": 0.7679, + "step": 10939 + }, + { + "epoch": 6.1703327693175405, + "grad_norm": 1.3803085088729858, + "learning_rate": 1.9151156232374507e-05, + "loss": 0.7273, + "step": 10940 + }, + { + "epoch": 6.170896785109983, + "grad_norm": 1.1361761093139648, + "learning_rate": 1.9148336153412296e-05, + "loss": 0.646, + "step": 10941 + }, + { + "epoch": 6.171460800902425, + "grad_norm": 1.1041616201400757, + "learning_rate": 1.9145516074450085e-05, + "loss": 0.7166, + "step": 10942 + }, + { + "epoch": 6.172024816694868, + "grad_norm": 0.98837810754776, + "learning_rate": 1.9142695995487874e-05, + "loss": 0.7498, + "step": 10943 + }, + { + "epoch": 6.1725888324873095, + "grad_norm": 1.8832088708877563, + "learning_rate": 1.9139875916525663e-05, + "loss": 0.8581, + "step": 10944 + }, + { + "epoch": 6.173152848279752, + "grad_norm": 1.3455110788345337, + "learning_rate": 1.913705583756345e-05, + "loss": 0.8414, + "step": 10945 + }, + { + "epoch": 6.173716864072194, + "grad_norm": 0.9071149826049805, + "learning_rate": 1.913423575860124e-05, + "loss": 0.7023, + "step": 10946 + }, + { + "epoch": 6.174280879864636, + "grad_norm": 1.481520414352417, + "learning_rate": 1.9131415679639032e-05, + "loss": 0.6362, + "step": 10947 + }, + { + "epoch": 6.1748448956570785, + "grad_norm": 1.0069034099578857, + "learning_rate": 1.9128595600676818e-05, + "loss": 0.7624, + "step": 10948 + }, + { + "epoch": 6.17540891144952, + "grad_norm": 1.0016337633132935, + "learning_rate": 1.912577552171461e-05, + "loss": 0.7426, + "step": 10949 + }, + { + "epoch": 6.175972927241963, + "grad_norm": 1.087093710899353, + "learning_rate": 1.91229554427524e-05, + "loss": 0.8197, + "step": 10950 + }, + { + "epoch": 6.176536943034405, + "grad_norm": 1.0520325899124146, + "learning_rate": 1.9120135363790187e-05, + "loss": 0.7409, + "step": 10951 + }, + { + "epoch": 6.1771009588268475, + "grad_norm": 1.4787821769714355, + "learning_rate": 1.9117315284827976e-05, + "loss": 0.799, + "step": 10952 + }, + { + "epoch": 6.177664974619289, + "grad_norm": 0.939795970916748, + "learning_rate": 1.9114495205865765e-05, + "loss": 0.7557, + "step": 10953 + }, + { + "epoch": 6.178228990411731, + "grad_norm": 1.2931287288665771, + "learning_rate": 1.9111675126903554e-05, + "loss": 0.67, + "step": 10954 + }, + { + "epoch": 6.178793006204174, + "grad_norm": 1.7848269939422607, + "learning_rate": 1.9108855047941343e-05, + "loss": 0.8304, + "step": 10955 + }, + { + "epoch": 6.179357021996616, + "grad_norm": 1.0921977758407593, + "learning_rate": 1.9106034968979135e-05, + "loss": 0.6451, + "step": 10956 + }, + { + "epoch": 6.179921037789058, + "grad_norm": 1.2801880836486816, + "learning_rate": 1.910321489001692e-05, + "loss": 0.7425, + "step": 10957 + }, + { + "epoch": 6.1804850535815, + "grad_norm": 0.8786726593971252, + "learning_rate": 1.9100394811054712e-05, + "loss": 0.6975, + "step": 10958 + }, + { + "epoch": 6.181049069373943, + "grad_norm": 2.310349464416504, + "learning_rate": 1.9097574732092498e-05, + "loss": 0.7988, + "step": 10959 + }, + { + "epoch": 6.181613085166385, + "grad_norm": 1.2042688131332397, + "learning_rate": 1.909475465313029e-05, + "loss": 0.7441, + "step": 10960 + }, + { + "epoch": 6.182177100958826, + "grad_norm": 0.9713948369026184, + "learning_rate": 1.909193457416808e-05, + "loss": 0.7763, + "step": 10961 + }, + { + "epoch": 6.182741116751269, + "grad_norm": 1.3408952951431274, + "learning_rate": 1.9089114495205868e-05, + "loss": 0.7562, + "step": 10962 + }, + { + "epoch": 6.183305132543711, + "grad_norm": 0.8097411394119263, + "learning_rate": 1.9086294416243656e-05, + "loss": 0.6554, + "step": 10963 + }, + { + "epoch": 6.183869148336154, + "grad_norm": 0.9546744227409363, + "learning_rate": 1.9083474337281445e-05, + "loss": 0.61, + "step": 10964 + }, + { + "epoch": 6.1844331641285955, + "grad_norm": 1.514724612236023, + "learning_rate": 1.9080654258319234e-05, + "loss": 0.7032, + "step": 10965 + }, + { + "epoch": 6.184997179921038, + "grad_norm": 1.0948598384857178, + "learning_rate": 1.9077834179357023e-05, + "loss": 0.7429, + "step": 10966 + }, + { + "epoch": 6.18556119571348, + "grad_norm": 1.0685731172561646, + "learning_rate": 1.907501410039481e-05, + "loss": 0.6705, + "step": 10967 + }, + { + "epoch": 6.186125211505922, + "grad_norm": 0.8906263113021851, + "learning_rate": 1.90721940214326e-05, + "loss": 0.7477, + "step": 10968 + }, + { + "epoch": 6.1866892272983645, + "grad_norm": 0.9269543886184692, + "learning_rate": 1.906937394247039e-05, + "loss": 0.6321, + "step": 10969 + }, + { + "epoch": 6.187253243090806, + "grad_norm": 1.0495191812515259, + "learning_rate": 1.906655386350818e-05, + "loss": 0.8271, + "step": 10970 + }, + { + "epoch": 6.187817258883249, + "grad_norm": 1.1682307720184326, + "learning_rate": 1.9063733784545967e-05, + "loss": 0.8803, + "step": 10971 + }, + { + "epoch": 6.188381274675691, + "grad_norm": 1.2607262134552002, + "learning_rate": 1.906091370558376e-05, + "loss": 0.8204, + "step": 10972 + }, + { + "epoch": 6.1889452904681335, + "grad_norm": 1.0937409400939941, + "learning_rate": 1.9058093626621544e-05, + "loss": 0.7245, + "step": 10973 + }, + { + "epoch": 6.189509306260575, + "grad_norm": 0.8380206227302551, + "learning_rate": 1.9055273547659337e-05, + "loss": 0.7188, + "step": 10974 + }, + { + "epoch": 6.190073322053017, + "grad_norm": 1.1125433444976807, + "learning_rate": 1.9052453468697122e-05, + "loss": 0.8443, + "step": 10975 + }, + { + "epoch": 6.19063733784546, + "grad_norm": 1.0434049367904663, + "learning_rate": 1.9049633389734914e-05, + "loss": 0.7278, + "step": 10976 + }, + { + "epoch": 6.191201353637902, + "grad_norm": 1.3092169761657715, + "learning_rate": 1.9046813310772703e-05, + "loss": 0.8219, + "step": 10977 + }, + { + "epoch": 6.191765369430344, + "grad_norm": 1.2028862237930298, + "learning_rate": 1.9043993231810492e-05, + "loss": 0.6753, + "step": 10978 + }, + { + "epoch": 6.192329385222786, + "grad_norm": 0.9097693562507629, + "learning_rate": 1.904117315284828e-05, + "loss": 0.6958, + "step": 10979 + }, + { + "epoch": 6.192893401015229, + "grad_norm": 1.2144691944122314, + "learning_rate": 1.903835307388607e-05, + "loss": 0.7479, + "step": 10980 + }, + { + "epoch": 6.193457416807671, + "grad_norm": 1.5682469606399536, + "learning_rate": 1.9035532994923858e-05, + "loss": 0.7762, + "step": 10981 + }, + { + "epoch": 6.194021432600112, + "grad_norm": 1.3166507482528687, + "learning_rate": 1.9032712915961647e-05, + "loss": 0.7419, + "step": 10982 + }, + { + "epoch": 6.194585448392555, + "grad_norm": 1.0128265619277954, + "learning_rate": 1.9029892836999436e-05, + "loss": 0.7287, + "step": 10983 + }, + { + "epoch": 6.195149464184997, + "grad_norm": 1.8208016157150269, + "learning_rate": 1.9027072758037225e-05, + "loss": 0.8029, + "step": 10984 + }, + { + "epoch": 6.19571347997744, + "grad_norm": 0.856054961681366, + "learning_rate": 1.9024252679075013e-05, + "loss": 0.6612, + "step": 10985 + }, + { + "epoch": 6.196277495769881, + "grad_norm": 2.7761070728302, + "learning_rate": 1.9021432600112806e-05, + "loss": 0.672, + "step": 10986 + }, + { + "epoch": 6.196841511562324, + "grad_norm": 1.1387211084365845, + "learning_rate": 1.9018612521150594e-05, + "loss": 0.7527, + "step": 10987 + }, + { + "epoch": 6.197405527354766, + "grad_norm": 1.0479336977005005, + "learning_rate": 1.9015792442188383e-05, + "loss": 0.7283, + "step": 10988 + }, + { + "epoch": 6.197969543147208, + "grad_norm": 1.304533839225769, + "learning_rate": 1.9012972363226172e-05, + "loss": 0.7691, + "step": 10989 + }, + { + "epoch": 6.19853355893965, + "grad_norm": 1.321195363998413, + "learning_rate": 1.901015228426396e-05, + "loss": 0.8102, + "step": 10990 + }, + { + "epoch": 6.199097574732092, + "grad_norm": 0.9540279507637024, + "learning_rate": 1.900733220530175e-05, + "loss": 0.7921, + "step": 10991 + }, + { + "epoch": 6.199661590524535, + "grad_norm": 0.9084389209747314, + "learning_rate": 1.9004512126339538e-05, + "loss": 0.7317, + "step": 10992 + }, + { + "epoch": 6.200225606316977, + "grad_norm": 1.1746686697006226, + "learning_rate": 1.9001692047377327e-05, + "loss": 0.7257, + "step": 10993 + }, + { + "epoch": 6.200789622109419, + "grad_norm": 1.0715402364730835, + "learning_rate": 1.8998871968415116e-05, + "loss": 0.7257, + "step": 10994 + }, + { + "epoch": 6.201353637901861, + "grad_norm": 0.9593506455421448, + "learning_rate": 1.8996051889452908e-05, + "loss": 0.6095, + "step": 10995 + }, + { + "epoch": 6.201917653694303, + "grad_norm": 1.0549495220184326, + "learning_rate": 1.8993231810490693e-05, + "loss": 0.706, + "step": 10996 + }, + { + "epoch": 6.202481669486746, + "grad_norm": 1.1375339031219482, + "learning_rate": 1.8990411731528486e-05, + "loss": 0.7164, + "step": 10997 + }, + { + "epoch": 6.2030456852791875, + "grad_norm": 1.3822739124298096, + "learning_rate": 1.898759165256627e-05, + "loss": 0.727, + "step": 10998 + }, + { + "epoch": 6.20360970107163, + "grad_norm": 1.5081230401992798, + "learning_rate": 1.8984771573604063e-05, + "loss": 0.787, + "step": 10999 + }, + { + "epoch": 6.204173716864072, + "grad_norm": 1.4294685125350952, + "learning_rate": 1.898195149464185e-05, + "loss": 0.6184, + "step": 11000 + }, + { + "epoch": 6.204737732656515, + "grad_norm": 1.3677213191986084, + "learning_rate": 1.897913141567964e-05, + "loss": 0.7381, + "step": 11001 + }, + { + "epoch": 6.2053017484489565, + "grad_norm": 1.0983606576919556, + "learning_rate": 1.897631133671743e-05, + "loss": 0.7314, + "step": 11002 + }, + { + "epoch": 6.205865764241398, + "grad_norm": 1.2022507190704346, + "learning_rate": 1.897349125775522e-05, + "loss": 0.7391, + "step": 11003 + }, + { + "epoch": 6.206429780033841, + "grad_norm": 1.1087348461151123, + "learning_rate": 1.8970671178793007e-05, + "loss": 0.7086, + "step": 11004 + }, + { + "epoch": 6.206993795826283, + "grad_norm": 1.1987487077713013, + "learning_rate": 1.8967851099830796e-05, + "loss": 0.8226, + "step": 11005 + }, + { + "epoch": 6.2075578116187256, + "grad_norm": 1.542194128036499, + "learning_rate": 1.8965031020868585e-05, + "loss": 0.832, + "step": 11006 + }, + { + "epoch": 6.208121827411167, + "grad_norm": 1.0330448150634766, + "learning_rate": 1.8962210941906374e-05, + "loss": 0.7121, + "step": 11007 + }, + { + "epoch": 6.20868584320361, + "grad_norm": 1.4799530506134033, + "learning_rate": 1.8959390862944162e-05, + "loss": 0.7973, + "step": 11008 + }, + { + "epoch": 6.209249858996052, + "grad_norm": 1.1220378875732422, + "learning_rate": 1.895657078398195e-05, + "loss": 0.7129, + "step": 11009 + }, + { + "epoch": 6.209813874788494, + "grad_norm": 0.9466809034347534, + "learning_rate": 1.895375070501974e-05, + "loss": 0.6184, + "step": 11010 + }, + { + "epoch": 6.210377890580936, + "grad_norm": 1.4249471426010132, + "learning_rate": 1.8950930626057532e-05, + "loss": 0.7725, + "step": 11011 + }, + { + "epoch": 6.210941906373378, + "grad_norm": 0.991500198841095, + "learning_rate": 1.8948110547095318e-05, + "loss": 0.7616, + "step": 11012 + }, + { + "epoch": 6.211505922165821, + "grad_norm": 1.2522462606430054, + "learning_rate": 1.894529046813311e-05, + "loss": 0.8003, + "step": 11013 + }, + { + "epoch": 6.212069937958263, + "grad_norm": 1.145014762878418, + "learning_rate": 1.8942470389170895e-05, + "loss": 0.7427, + "step": 11014 + }, + { + "epoch": 6.212633953750705, + "grad_norm": 1.0148793458938599, + "learning_rate": 1.8939650310208687e-05, + "loss": 0.7143, + "step": 11015 + }, + { + "epoch": 6.213197969543147, + "grad_norm": 1.2465392351150513, + "learning_rate": 1.8936830231246476e-05, + "loss": 0.6873, + "step": 11016 + }, + { + "epoch": 6.213761985335589, + "grad_norm": 0.9371864795684814, + "learning_rate": 1.8934010152284265e-05, + "loss": 0.6041, + "step": 11017 + }, + { + "epoch": 6.214326001128032, + "grad_norm": 1.2761728763580322, + "learning_rate": 1.8931190073322054e-05, + "loss": 0.6905, + "step": 11018 + }, + { + "epoch": 6.2148900169204735, + "grad_norm": 0.9873653650283813, + "learning_rate": 1.8928369994359843e-05, + "loss": 0.7853, + "step": 11019 + }, + { + "epoch": 6.215454032712916, + "grad_norm": 1.3514970541000366, + "learning_rate": 1.892554991539763e-05, + "loss": 0.7474, + "step": 11020 + }, + { + "epoch": 6.216018048505358, + "grad_norm": 0.8688609600067139, + "learning_rate": 1.892272983643542e-05, + "loss": 0.6898, + "step": 11021 + }, + { + "epoch": 6.216582064297801, + "grad_norm": 1.3775300979614258, + "learning_rate": 1.8919909757473212e-05, + "loss": 0.6866, + "step": 11022 + }, + { + "epoch": 6.2171460800902425, + "grad_norm": 1.0781376361846924, + "learning_rate": 1.8917089678510998e-05, + "loss": 0.7141, + "step": 11023 + }, + { + "epoch": 6.217710095882684, + "grad_norm": 1.1607235670089722, + "learning_rate": 1.891426959954879e-05, + "loss": 0.7692, + "step": 11024 + }, + { + "epoch": 6.218274111675127, + "grad_norm": 1.2798876762390137, + "learning_rate": 1.891144952058658e-05, + "loss": 0.7512, + "step": 11025 + }, + { + "epoch": 6.218838127467569, + "grad_norm": 1.023777961730957, + "learning_rate": 1.8908629441624368e-05, + "loss": 0.7336, + "step": 11026 + }, + { + "epoch": 6.2194021432600115, + "grad_norm": 0.9294446706771851, + "learning_rate": 1.8905809362662156e-05, + "loss": 0.6877, + "step": 11027 + }, + { + "epoch": 6.219966159052453, + "grad_norm": 0.9456128478050232, + "learning_rate": 1.8902989283699945e-05, + "loss": 0.6903, + "step": 11028 + }, + { + "epoch": 6.220530174844896, + "grad_norm": 1.1271618604660034, + "learning_rate": 1.8900169204737734e-05, + "loss": 0.7208, + "step": 11029 + }, + { + "epoch": 6.221094190637338, + "grad_norm": 1.4074430465698242, + "learning_rate": 1.8897349125775523e-05, + "loss": 0.7465, + "step": 11030 + }, + { + "epoch": 6.22165820642978, + "grad_norm": 1.5868474245071411, + "learning_rate": 1.889452904681331e-05, + "loss": 0.7761, + "step": 11031 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 1.3527302742004395, + "learning_rate": 1.88917089678511e-05, + "loss": 0.8867, + "step": 11032 + }, + { + "epoch": 6.222786238014664, + "grad_norm": 1.2477421760559082, + "learning_rate": 1.888888888888889e-05, + "loss": 0.7608, + "step": 11033 + }, + { + "epoch": 6.223350253807107, + "grad_norm": 1.2217293977737427, + "learning_rate": 1.888606880992668e-05, + "loss": 0.6508, + "step": 11034 + }, + { + "epoch": 6.223914269599549, + "grad_norm": 1.3626333475112915, + "learning_rate": 1.8883248730964467e-05, + "loss": 0.7262, + "step": 11035 + }, + { + "epoch": 6.224478285391991, + "grad_norm": 1.0816633701324463, + "learning_rate": 1.888042865200226e-05, + "loss": 0.7648, + "step": 11036 + }, + { + "epoch": 6.225042301184433, + "grad_norm": 1.907033085823059, + "learning_rate": 1.8877608573040044e-05, + "loss": 0.7789, + "step": 11037 + }, + { + "epoch": 6.225606316976875, + "grad_norm": 0.9851995706558228, + "learning_rate": 1.8874788494077836e-05, + "loss": 0.6341, + "step": 11038 + }, + { + "epoch": 6.226170332769318, + "grad_norm": 1.0293282270431519, + "learning_rate": 1.8871968415115622e-05, + "loss": 0.7195, + "step": 11039 + }, + { + "epoch": 6.2267343485617594, + "grad_norm": 0.8599963188171387, + "learning_rate": 1.8869148336153414e-05, + "loss": 0.6445, + "step": 11040 + }, + { + "epoch": 6.227298364354202, + "grad_norm": 1.015918254852295, + "learning_rate": 1.8866328257191203e-05, + "loss": 0.7202, + "step": 11041 + }, + { + "epoch": 6.227862380146644, + "grad_norm": 1.0453583002090454, + "learning_rate": 1.886350817822899e-05, + "loss": 0.6819, + "step": 11042 + }, + { + "epoch": 6.228426395939087, + "grad_norm": 1.2633388042449951, + "learning_rate": 1.886068809926678e-05, + "loss": 0.8044, + "step": 11043 + }, + { + "epoch": 6.2289904117315285, + "grad_norm": 1.4483040571212769, + "learning_rate": 1.885786802030457e-05, + "loss": 0.7776, + "step": 11044 + }, + { + "epoch": 6.22955442752397, + "grad_norm": 1.4660025835037231, + "learning_rate": 1.8855047941342358e-05, + "loss": 0.8205, + "step": 11045 + }, + { + "epoch": 6.230118443316413, + "grad_norm": 1.5004571676254272, + "learning_rate": 1.8852227862380147e-05, + "loss": 0.7697, + "step": 11046 + }, + { + "epoch": 6.230682459108855, + "grad_norm": 1.568402647972107, + "learning_rate": 1.8849407783417936e-05, + "loss": 0.7769, + "step": 11047 + }, + { + "epoch": 6.2312464749012975, + "grad_norm": 0.9383906722068787, + "learning_rate": 1.8846587704455724e-05, + "loss": 0.7962, + "step": 11048 + }, + { + "epoch": 6.231810490693739, + "grad_norm": 1.037559151649475, + "learning_rate": 1.8843767625493513e-05, + "loss": 0.7661, + "step": 11049 + }, + { + "epoch": 6.232374506486182, + "grad_norm": 0.7770329117774963, + "learning_rate": 1.8840947546531305e-05, + "loss": 0.6346, + "step": 11050 + }, + { + "epoch": 6.232938522278624, + "grad_norm": 1.1293796300888062, + "learning_rate": 1.883812746756909e-05, + "loss": 0.8094, + "step": 11051 + }, + { + "epoch": 6.233502538071066, + "grad_norm": 1.3363080024719238, + "learning_rate": 1.8835307388606883e-05, + "loss": 0.7482, + "step": 11052 + }, + { + "epoch": 6.234066553863508, + "grad_norm": 1.303639531135559, + "learning_rate": 1.883248730964467e-05, + "loss": 0.7764, + "step": 11053 + }, + { + "epoch": 6.23463056965595, + "grad_norm": 1.1052268743515015, + "learning_rate": 1.882966723068246e-05, + "loss": 0.7474, + "step": 11054 + }, + { + "epoch": 6.235194585448393, + "grad_norm": 0.8770880699157715, + "learning_rate": 1.8826847151720246e-05, + "loss": 0.7417, + "step": 11055 + }, + { + "epoch": 6.235758601240835, + "grad_norm": 1.078316330909729, + "learning_rate": 1.8824027072758038e-05, + "loss": 0.6757, + "step": 11056 + }, + { + "epoch": 6.236322617033277, + "grad_norm": 1.1801793575286865, + "learning_rate": 1.8821206993795827e-05, + "loss": 0.68, + "step": 11057 + }, + { + "epoch": 6.236886632825719, + "grad_norm": 1.4124709367752075, + "learning_rate": 1.8818386914833616e-05, + "loss": 0.893, + "step": 11058 + }, + { + "epoch": 6.237450648618161, + "grad_norm": 1.2978302240371704, + "learning_rate": 1.8815566835871408e-05, + "loss": 0.8008, + "step": 11059 + }, + { + "epoch": 6.238014664410604, + "grad_norm": 1.0285927057266235, + "learning_rate": 1.8812746756909193e-05, + "loss": 0.7324, + "step": 11060 + }, + { + "epoch": 6.238578680203045, + "grad_norm": 0.9243673086166382, + "learning_rate": 1.8809926677946986e-05, + "loss": 0.7499, + "step": 11061 + }, + { + "epoch": 6.239142695995488, + "grad_norm": 0.9569652676582336, + "learning_rate": 1.880710659898477e-05, + "loss": 0.6936, + "step": 11062 + }, + { + "epoch": 6.23970671178793, + "grad_norm": 1.3482979536056519, + "learning_rate": 1.8804286520022563e-05, + "loss": 0.7778, + "step": 11063 + }, + { + "epoch": 6.240270727580373, + "grad_norm": 0.9317815899848938, + "learning_rate": 1.880146644106035e-05, + "loss": 0.7133, + "step": 11064 + }, + { + "epoch": 6.240834743372814, + "grad_norm": 1.1354267597198486, + "learning_rate": 1.879864636209814e-05, + "loss": 0.7268, + "step": 11065 + }, + { + "epoch": 6.241398759165256, + "grad_norm": 1.1359431743621826, + "learning_rate": 1.879582628313593e-05, + "loss": 0.6979, + "step": 11066 + }, + { + "epoch": 6.241962774957699, + "grad_norm": 1.0797505378723145, + "learning_rate": 1.879300620417372e-05, + "loss": 0.5935, + "step": 11067 + }, + { + "epoch": 6.242526790750141, + "grad_norm": 1.2261238098144531, + "learning_rate": 1.8790186125211507e-05, + "loss": 0.6872, + "step": 11068 + }, + { + "epoch": 6.243090806542583, + "grad_norm": 0.8639359474182129, + "learning_rate": 1.8787366046249296e-05, + "loss": 0.5739, + "step": 11069 + }, + { + "epoch": 6.243654822335025, + "grad_norm": 1.1732814311981201, + "learning_rate": 1.8784545967287085e-05, + "loss": 0.7083, + "step": 11070 + }, + { + "epoch": 6.244218838127468, + "grad_norm": 1.376696228981018, + "learning_rate": 1.8781725888324874e-05, + "loss": 0.6864, + "step": 11071 + }, + { + "epoch": 6.24478285391991, + "grad_norm": 1.1874176263809204, + "learning_rate": 1.8778905809362662e-05, + "loss": 0.7704, + "step": 11072 + }, + { + "epoch": 6.2453468697123515, + "grad_norm": 0.8973110318183899, + "learning_rate": 1.877608573040045e-05, + "loss": 0.6974, + "step": 11073 + }, + { + "epoch": 6.245910885504794, + "grad_norm": 1.1999131441116333, + "learning_rate": 1.877326565143824e-05, + "loss": 0.6411, + "step": 11074 + }, + { + "epoch": 6.246474901297236, + "grad_norm": 1.383894681930542, + "learning_rate": 1.8770445572476032e-05, + "loss": 0.7334, + "step": 11075 + }, + { + "epoch": 6.247038917089679, + "grad_norm": 0.8798007369041443, + "learning_rate": 1.8767625493513818e-05, + "loss": 0.7322, + "step": 11076 + }, + { + "epoch": 6.2476029328821205, + "grad_norm": 1.115352749824524, + "learning_rate": 1.876480541455161e-05, + "loss": 0.6892, + "step": 11077 + }, + { + "epoch": 6.248166948674563, + "grad_norm": 0.9392686486244202, + "learning_rate": 1.8761985335589395e-05, + "loss": 0.7966, + "step": 11078 + }, + { + "epoch": 6.248730964467005, + "grad_norm": 1.186701774597168, + "learning_rate": 1.8759165256627187e-05, + "loss": 0.7082, + "step": 11079 + }, + { + "epoch": 6.249294980259447, + "grad_norm": 0.9341787695884705, + "learning_rate": 1.8756345177664976e-05, + "loss": 0.7036, + "step": 11080 + }, + { + "epoch": 6.2498589960518895, + "grad_norm": 1.1575639247894287, + "learning_rate": 1.8753525098702765e-05, + "loss": 0.7913, + "step": 11081 + }, + { + "epoch": 6.250423011844331, + "grad_norm": 1.2923660278320312, + "learning_rate": 1.8750705019740554e-05, + "loss": 0.815, + "step": 11082 + }, + { + "epoch": 6.250987027636774, + "grad_norm": 1.0399709939956665, + "learning_rate": 1.8747884940778342e-05, + "loss": 0.7604, + "step": 11083 + }, + { + "epoch": 6.251551043429216, + "grad_norm": 1.1719764471054077, + "learning_rate": 1.874506486181613e-05, + "loss": 0.7976, + "step": 11084 + }, + { + "epoch": 6.2521150592216586, + "grad_norm": 1.8159006834030151, + "learning_rate": 1.874224478285392e-05, + "loss": 0.8709, + "step": 11085 + }, + { + "epoch": 6.2526790750141, + "grad_norm": 1.5700089931488037, + "learning_rate": 1.873942470389171e-05, + "loss": 0.823, + "step": 11086 + }, + { + "epoch": 6.253243090806542, + "grad_norm": 1.251847743988037, + "learning_rate": 1.8736604624929498e-05, + "loss": 0.7055, + "step": 11087 + }, + { + "epoch": 6.253807106598985, + "grad_norm": 1.009575605392456, + "learning_rate": 1.8733784545967286e-05, + "loss": 0.7484, + "step": 11088 + }, + { + "epoch": 6.254371122391427, + "grad_norm": 0.9005950093269348, + "learning_rate": 1.873096446700508e-05, + "loss": 0.7255, + "step": 11089 + }, + { + "epoch": 6.254935138183869, + "grad_norm": 0.9132816791534424, + "learning_rate": 1.8728144388042864e-05, + "loss": 0.5855, + "step": 11090 + }, + { + "epoch": 6.255499153976311, + "grad_norm": 1.1636942625045776, + "learning_rate": 1.8725324309080656e-05, + "loss": 0.7284, + "step": 11091 + }, + { + "epoch": 6.256063169768754, + "grad_norm": 1.0147569179534912, + "learning_rate": 1.8722504230118445e-05, + "loss": 0.781, + "step": 11092 + }, + { + "epoch": 6.256627185561196, + "grad_norm": 1.3408751487731934, + "learning_rate": 1.8719684151156234e-05, + "loss": 0.7433, + "step": 11093 + }, + { + "epoch": 6.2571912013536375, + "grad_norm": 1.013231635093689, + "learning_rate": 1.8716864072194023e-05, + "loss": 0.6709, + "step": 11094 + }, + { + "epoch": 6.25775521714608, + "grad_norm": 0.7825023531913757, + "learning_rate": 1.871404399323181e-05, + "loss": 0.6255, + "step": 11095 + }, + { + "epoch": 6.258319232938522, + "grad_norm": 1.1647776365280151, + "learning_rate": 1.87112239142696e-05, + "loss": 0.7535, + "step": 11096 + }, + { + "epoch": 6.258883248730965, + "grad_norm": 1.3468390703201294, + "learning_rate": 1.870840383530739e-05, + "loss": 0.7522, + "step": 11097 + }, + { + "epoch": 6.2594472645234065, + "grad_norm": 0.9270802736282349, + "learning_rate": 1.870558375634518e-05, + "loss": 0.6531, + "step": 11098 + }, + { + "epoch": 6.260011280315849, + "grad_norm": 0.8971842527389526, + "learning_rate": 1.8702763677382967e-05, + "loss": 0.597, + "step": 11099 + }, + { + "epoch": 6.260575296108291, + "grad_norm": 1.59319269657135, + "learning_rate": 1.869994359842076e-05, + "loss": 0.7696, + "step": 11100 + }, + { + "epoch": 6.261139311900733, + "grad_norm": 1.3820444345474243, + "learning_rate": 1.8697123519458544e-05, + "loss": 0.7747, + "step": 11101 + }, + { + "epoch": 6.2617033276931755, + "grad_norm": 1.206054449081421, + "learning_rate": 1.8694303440496336e-05, + "loss": 0.7963, + "step": 11102 + }, + { + "epoch": 6.262267343485617, + "grad_norm": 1.3823461532592773, + "learning_rate": 1.8691483361534122e-05, + "loss": 0.8418, + "step": 11103 + }, + { + "epoch": 6.26283135927806, + "grad_norm": 1.4224505424499512, + "learning_rate": 1.8688663282571914e-05, + "loss": 0.7776, + "step": 11104 + }, + { + "epoch": 6.263395375070502, + "grad_norm": 1.510184407234192, + "learning_rate": 1.8685843203609703e-05, + "loss": 0.7829, + "step": 11105 + }, + { + "epoch": 6.2639593908629445, + "grad_norm": 2.10909104347229, + "learning_rate": 1.868302312464749e-05, + "loss": 0.6776, + "step": 11106 + }, + { + "epoch": 6.264523406655386, + "grad_norm": 0.9369859099388123, + "learning_rate": 1.868020304568528e-05, + "loss": 0.7126, + "step": 11107 + }, + { + "epoch": 6.265087422447828, + "grad_norm": 1.5975675582885742, + "learning_rate": 1.867738296672307e-05, + "loss": 0.7915, + "step": 11108 + }, + { + "epoch": 6.265651438240271, + "grad_norm": 0.9756054282188416, + "learning_rate": 1.8674562887760858e-05, + "loss": 0.6597, + "step": 11109 + }, + { + "epoch": 6.266215454032713, + "grad_norm": 0.9112524390220642, + "learning_rate": 1.8671742808798647e-05, + "loss": 0.7088, + "step": 11110 + }, + { + "epoch": 6.266779469825155, + "grad_norm": 0.9656860828399658, + "learning_rate": 1.8668922729836436e-05, + "loss": 0.6887, + "step": 11111 + }, + { + "epoch": 6.267343485617597, + "grad_norm": 1.1737260818481445, + "learning_rate": 1.8666102650874224e-05, + "loss": 0.7505, + "step": 11112 + }, + { + "epoch": 6.26790750141004, + "grad_norm": 1.1115789413452148, + "learning_rate": 1.8663282571912013e-05, + "loss": 0.6725, + "step": 11113 + }, + { + "epoch": 6.268471517202482, + "grad_norm": 1.2235984802246094, + "learning_rate": 1.8660462492949805e-05, + "loss": 0.6873, + "step": 11114 + }, + { + "epoch": 6.269035532994923, + "grad_norm": 1.1714682579040527, + "learning_rate": 1.865764241398759e-05, + "loss": 0.806, + "step": 11115 + }, + { + "epoch": 6.269599548787366, + "grad_norm": 0.9751256108283997, + "learning_rate": 1.8654822335025383e-05, + "loss": 0.6508, + "step": 11116 + }, + { + "epoch": 6.270163564579808, + "grad_norm": 1.005068063735962, + "learning_rate": 1.865200225606317e-05, + "loss": 0.7608, + "step": 11117 + }, + { + "epoch": 6.270727580372251, + "grad_norm": 1.424902319908142, + "learning_rate": 1.864918217710096e-05, + "loss": 0.7493, + "step": 11118 + }, + { + "epoch": 6.2712915961646925, + "grad_norm": 1.3042714595794678, + "learning_rate": 1.864636209813875e-05, + "loss": 0.8217, + "step": 11119 + }, + { + "epoch": 6.271855611957135, + "grad_norm": 1.341097354888916, + "learning_rate": 1.8643542019176538e-05, + "loss": 0.7483, + "step": 11120 + }, + { + "epoch": 6.272419627749577, + "grad_norm": 1.0971269607543945, + "learning_rate": 1.8640721940214327e-05, + "loss": 0.7416, + "step": 11121 + }, + { + "epoch": 6.272983643542019, + "grad_norm": 1.661301612854004, + "learning_rate": 1.8637901861252116e-05, + "loss": 0.8467, + "step": 11122 + }, + { + "epoch": 6.2735476593344615, + "grad_norm": 0.8412769436836243, + "learning_rate": 1.8635081782289905e-05, + "loss": 0.6944, + "step": 11123 + }, + { + "epoch": 6.274111675126903, + "grad_norm": 1.1556003093719482, + "learning_rate": 1.8632261703327693e-05, + "loss": 0.7298, + "step": 11124 + }, + { + "epoch": 6.274675690919346, + "grad_norm": 1.0508239269256592, + "learning_rate": 1.8629441624365482e-05, + "loss": 0.7461, + "step": 11125 + }, + { + "epoch": 6.275239706711788, + "grad_norm": 1.1390042304992676, + "learning_rate": 1.862662154540327e-05, + "loss": 0.6834, + "step": 11126 + }, + { + "epoch": 6.2758037225042305, + "grad_norm": 1.2942227125167847, + "learning_rate": 1.8623801466441063e-05, + "loss": 0.6692, + "step": 11127 + }, + { + "epoch": 6.276367738296672, + "grad_norm": 1.4200128316879272, + "learning_rate": 1.8620981387478852e-05, + "loss": 0.7405, + "step": 11128 + }, + { + "epoch": 6.276931754089114, + "grad_norm": 1.421315312385559, + "learning_rate": 1.861816130851664e-05, + "loss": 0.7649, + "step": 11129 + }, + { + "epoch": 6.277495769881557, + "grad_norm": 1.150639295578003, + "learning_rate": 1.861534122955443e-05, + "loss": 0.6637, + "step": 11130 + }, + { + "epoch": 6.278059785673999, + "grad_norm": 1.3187682628631592, + "learning_rate": 1.8612521150592218e-05, + "loss": 0.8079, + "step": 11131 + }, + { + "epoch": 6.278623801466441, + "grad_norm": 1.521899700164795, + "learning_rate": 1.8609701071630007e-05, + "loss": 0.8714, + "step": 11132 + }, + { + "epoch": 6.279187817258883, + "grad_norm": 0.9682912826538086, + "learning_rate": 1.8606880992667796e-05, + "loss": 0.7791, + "step": 11133 + }, + { + "epoch": 6.279751833051326, + "grad_norm": 1.011084794998169, + "learning_rate": 1.8604060913705585e-05, + "loss": 0.6509, + "step": 11134 + }, + { + "epoch": 6.280315848843768, + "grad_norm": 1.3716254234313965, + "learning_rate": 1.8601240834743373e-05, + "loss": 0.7123, + "step": 11135 + }, + { + "epoch": 6.280879864636209, + "grad_norm": 1.0984405279159546, + "learning_rate": 1.8598420755781162e-05, + "loss": 0.7781, + "step": 11136 + }, + { + "epoch": 6.281443880428652, + "grad_norm": 0.8802071809768677, + "learning_rate": 1.8595600676818954e-05, + "loss": 0.7156, + "step": 11137 + }, + { + "epoch": 6.282007896221094, + "grad_norm": 1.1922907829284668, + "learning_rate": 1.859278059785674e-05, + "loss": 0.7376, + "step": 11138 + }, + { + "epoch": 6.282571912013537, + "grad_norm": 0.9062994718551636, + "learning_rate": 1.8589960518894532e-05, + "loss": 0.7279, + "step": 11139 + }, + { + "epoch": 6.283135927805978, + "grad_norm": 1.3406410217285156, + "learning_rate": 1.8587140439932317e-05, + "loss": 0.7853, + "step": 11140 + }, + { + "epoch": 6.283699943598421, + "grad_norm": 1.2987703084945679, + "learning_rate": 1.858432036097011e-05, + "loss": 0.7254, + "step": 11141 + }, + { + "epoch": 6.284263959390863, + "grad_norm": 1.2510623931884766, + "learning_rate": 1.8581500282007895e-05, + "loss": 0.8194, + "step": 11142 + }, + { + "epoch": 6.284827975183305, + "grad_norm": 1.2367480993270874, + "learning_rate": 1.8578680203045687e-05, + "loss": 0.7204, + "step": 11143 + }, + { + "epoch": 6.285391990975747, + "grad_norm": 1.257564663887024, + "learning_rate": 1.8575860124083476e-05, + "loss": 0.787, + "step": 11144 + }, + { + "epoch": 6.285956006768189, + "grad_norm": 0.9754773378372192, + "learning_rate": 1.8573040045121265e-05, + "loss": 0.6732, + "step": 11145 + }, + { + "epoch": 6.286520022560632, + "grad_norm": 1.194212555885315, + "learning_rate": 1.8570219966159054e-05, + "loss": 0.7608, + "step": 11146 + }, + { + "epoch": 6.287084038353074, + "grad_norm": 1.3141101598739624, + "learning_rate": 1.8567399887196842e-05, + "loss": 0.8862, + "step": 11147 + }, + { + "epoch": 6.287648054145516, + "grad_norm": 1.293831467628479, + "learning_rate": 1.856457980823463e-05, + "loss": 0.788, + "step": 11148 + }, + { + "epoch": 6.288212069937958, + "grad_norm": 0.9186837077140808, + "learning_rate": 1.856175972927242e-05, + "loss": 0.7984, + "step": 11149 + }, + { + "epoch": 6.288776085730401, + "grad_norm": 1.3519190549850464, + "learning_rate": 1.855893965031021e-05, + "loss": 0.7678, + "step": 11150 + }, + { + "epoch": 6.289340101522843, + "grad_norm": 1.6754404306411743, + "learning_rate": 1.8556119571347998e-05, + "loss": 0.9144, + "step": 11151 + }, + { + "epoch": 6.2899041173152845, + "grad_norm": 1.2583283185958862, + "learning_rate": 1.8553299492385786e-05, + "loss": 0.748, + "step": 11152 + }, + { + "epoch": 6.290468133107727, + "grad_norm": 0.9981433749198914, + "learning_rate": 1.855047941342358e-05, + "loss": 0.7436, + "step": 11153 + }, + { + "epoch": 6.291032148900169, + "grad_norm": 0.9063414931297302, + "learning_rate": 1.8547659334461364e-05, + "loss": 0.7314, + "step": 11154 + }, + { + "epoch": 6.291596164692612, + "grad_norm": 0.9874424338340759, + "learning_rate": 1.8544839255499156e-05, + "loss": 0.6641, + "step": 11155 + }, + { + "epoch": 6.2921601804850535, + "grad_norm": 1.0706733465194702, + "learning_rate": 1.854201917653694e-05, + "loss": 0.7569, + "step": 11156 + }, + { + "epoch": 6.292724196277495, + "grad_norm": 1.4476572275161743, + "learning_rate": 1.8539199097574734e-05, + "loss": 0.756, + "step": 11157 + }, + { + "epoch": 6.293288212069938, + "grad_norm": 1.2778717279434204, + "learning_rate": 1.853637901861252e-05, + "loss": 0.6933, + "step": 11158 + }, + { + "epoch": 6.29385222786238, + "grad_norm": 1.5848344564437866, + "learning_rate": 1.853355893965031e-05, + "loss": 0.7869, + "step": 11159 + }, + { + "epoch": 6.2944162436548226, + "grad_norm": 1.320062518119812, + "learning_rate": 1.85307388606881e-05, + "loss": 0.7791, + "step": 11160 + }, + { + "epoch": 6.294980259447264, + "grad_norm": 1.1750632524490356, + "learning_rate": 1.852791878172589e-05, + "loss": 0.7178, + "step": 11161 + }, + { + "epoch": 6.295544275239707, + "grad_norm": 1.2035760879516602, + "learning_rate": 1.852509870276368e-05, + "loss": 0.7788, + "step": 11162 + }, + { + "epoch": 6.296108291032149, + "grad_norm": 1.0135905742645264, + "learning_rate": 1.8522278623801467e-05, + "loss": 0.6881, + "step": 11163 + }, + { + "epoch": 6.296672306824592, + "grad_norm": 0.8723045587539673, + "learning_rate": 1.851945854483926e-05, + "loss": 0.6438, + "step": 11164 + }, + { + "epoch": 6.297236322617033, + "grad_norm": 1.1463497877120972, + "learning_rate": 1.8516638465877044e-05, + "loss": 0.6336, + "step": 11165 + }, + { + "epoch": 6.297800338409475, + "grad_norm": 1.0477336645126343, + "learning_rate": 1.8513818386914836e-05, + "loss": 0.6818, + "step": 11166 + }, + { + "epoch": 6.298364354201918, + "grad_norm": 1.1399062871932983, + "learning_rate": 1.8510998307952622e-05, + "loss": 0.7816, + "step": 11167 + }, + { + "epoch": 6.29892836999436, + "grad_norm": 1.6703972816467285, + "learning_rate": 1.8508178228990414e-05, + "loss": 0.7372, + "step": 11168 + }, + { + "epoch": 6.299492385786802, + "grad_norm": 1.1856180429458618, + "learning_rate": 1.8505358150028203e-05, + "loss": 0.7785, + "step": 11169 + }, + { + "epoch": 6.300056401579244, + "grad_norm": 1.0682710409164429, + "learning_rate": 1.850253807106599e-05, + "loss": 0.712, + "step": 11170 + }, + { + "epoch": 6.300620417371686, + "grad_norm": 1.264165997505188, + "learning_rate": 1.849971799210378e-05, + "loss": 0.7709, + "step": 11171 + }, + { + "epoch": 6.301184433164129, + "grad_norm": 0.9892562627792358, + "learning_rate": 1.849689791314157e-05, + "loss": 0.6845, + "step": 11172 + }, + { + "epoch": 6.3017484489565705, + "grad_norm": 1.3760206699371338, + "learning_rate": 1.8494077834179358e-05, + "loss": 0.6858, + "step": 11173 + }, + { + "epoch": 6.302312464749013, + "grad_norm": 1.2524983882904053, + "learning_rate": 1.8491257755217147e-05, + "loss": 0.707, + "step": 11174 + }, + { + "epoch": 6.302876480541455, + "grad_norm": 1.2817747592926025, + "learning_rate": 1.8488437676254935e-05, + "loss": 0.7936, + "step": 11175 + }, + { + "epoch": 6.303440496333898, + "grad_norm": 1.5679305791854858, + "learning_rate": 1.8485617597292724e-05, + "loss": 0.7083, + "step": 11176 + }, + { + "epoch": 6.3040045121263395, + "grad_norm": 0.9484211802482605, + "learning_rate": 1.8482797518330513e-05, + "loss": 0.66, + "step": 11177 + }, + { + "epoch": 6.304568527918782, + "grad_norm": 1.2954025268554688, + "learning_rate": 1.8479977439368305e-05, + "loss": 0.7117, + "step": 11178 + }, + { + "epoch": 6.305132543711224, + "grad_norm": 1.160304307937622, + "learning_rate": 1.847715736040609e-05, + "loss": 0.8031, + "step": 11179 + }, + { + "epoch": 6.305696559503666, + "grad_norm": 0.8287292122840881, + "learning_rate": 1.8474337281443883e-05, + "loss": 0.6457, + "step": 11180 + }, + { + "epoch": 6.3062605752961085, + "grad_norm": 0.9361201524734497, + "learning_rate": 1.8471517202481668e-05, + "loss": 0.7414, + "step": 11181 + }, + { + "epoch": 6.30682459108855, + "grad_norm": 0.9310320615768433, + "learning_rate": 1.846869712351946e-05, + "loss": 0.7197, + "step": 11182 + }, + { + "epoch": 6.307388606880993, + "grad_norm": 0.853055477142334, + "learning_rate": 1.846587704455725e-05, + "loss": 0.6253, + "step": 11183 + }, + { + "epoch": 6.307952622673435, + "grad_norm": 1.2717514038085938, + "learning_rate": 1.8463056965595038e-05, + "loss": 0.7543, + "step": 11184 + }, + { + "epoch": 6.308516638465877, + "grad_norm": 1.3289119005203247, + "learning_rate": 1.8460236886632827e-05, + "loss": 0.7326, + "step": 11185 + }, + { + "epoch": 6.309080654258319, + "grad_norm": 1.0546631813049316, + "learning_rate": 1.8457416807670616e-05, + "loss": 0.7472, + "step": 11186 + }, + { + "epoch": 6.309644670050761, + "grad_norm": 1.2841614484786987, + "learning_rate": 1.8454596728708404e-05, + "loss": 0.7334, + "step": 11187 + }, + { + "epoch": 6.310208685843204, + "grad_norm": 0.8834025859832764, + "learning_rate": 1.8451776649746193e-05, + "loss": 0.6659, + "step": 11188 + }, + { + "epoch": 6.310772701635646, + "grad_norm": 0.8996708989143372, + "learning_rate": 1.8448956570783982e-05, + "loss": 0.7575, + "step": 11189 + }, + { + "epoch": 6.311336717428088, + "grad_norm": 1.0552542209625244, + "learning_rate": 1.844613649182177e-05, + "loss": 0.7729, + "step": 11190 + }, + { + "epoch": 6.31190073322053, + "grad_norm": 0.9462254047393799, + "learning_rate": 1.844331641285956e-05, + "loss": 0.6999, + "step": 11191 + }, + { + "epoch": 6.312464749012973, + "grad_norm": 1.2280079126358032, + "learning_rate": 1.8440496333897352e-05, + "loss": 0.638, + "step": 11192 + }, + { + "epoch": 6.313028764805415, + "grad_norm": 1.1195824146270752, + "learning_rate": 1.8437676254935137e-05, + "loss": 0.7249, + "step": 11193 + }, + { + "epoch": 6.3135927805978564, + "grad_norm": 1.3908624649047852, + "learning_rate": 1.843485617597293e-05, + "loss": 0.8581, + "step": 11194 + }, + { + "epoch": 6.314156796390299, + "grad_norm": 1.0835363864898682, + "learning_rate": 1.8432036097010715e-05, + "loss": 0.771, + "step": 11195 + }, + { + "epoch": 6.314720812182741, + "grad_norm": 1.134444236755371, + "learning_rate": 1.8429216018048507e-05, + "loss": 0.6691, + "step": 11196 + }, + { + "epoch": 6.315284827975184, + "grad_norm": 1.1666380167007446, + "learning_rate": 1.8426395939086292e-05, + "loss": 0.7764, + "step": 11197 + }, + { + "epoch": 6.3158488437676255, + "grad_norm": 1.2741119861602783, + "learning_rate": 1.8423575860124085e-05, + "loss": 0.7887, + "step": 11198 + }, + { + "epoch": 6.316412859560067, + "grad_norm": 1.034315824508667, + "learning_rate": 1.8420755781161873e-05, + "loss": 0.7341, + "step": 11199 + }, + { + "epoch": 6.31697687535251, + "grad_norm": 1.386246681213379, + "learning_rate": 1.8417935702199662e-05, + "loss": 0.6996, + "step": 11200 + }, + { + "epoch": 6.317540891144952, + "grad_norm": 1.2462676763534546, + "learning_rate": 1.8415115623237454e-05, + "loss": 0.9072, + "step": 11201 + }, + { + "epoch": 6.3181049069373945, + "grad_norm": 1.4497298002243042, + "learning_rate": 1.841229554427524e-05, + "loss": 0.808, + "step": 11202 + }, + { + "epoch": 6.318668922729836, + "grad_norm": 1.059128999710083, + "learning_rate": 1.8409475465313032e-05, + "loss": 0.6083, + "step": 11203 + }, + { + "epoch": 6.319232938522279, + "grad_norm": 1.2242168188095093, + "learning_rate": 1.8406655386350817e-05, + "loss": 0.6798, + "step": 11204 + }, + { + "epoch": 6.319796954314721, + "grad_norm": 1.1842528581619263, + "learning_rate": 1.840383530738861e-05, + "loss": 0.687, + "step": 11205 + }, + { + "epoch": 6.3203609701071635, + "grad_norm": 1.1736791133880615, + "learning_rate": 1.8401015228426395e-05, + "loss": 0.7154, + "step": 11206 + }, + { + "epoch": 6.320924985899605, + "grad_norm": 1.0821980237960815, + "learning_rate": 1.8398195149464187e-05, + "loss": 0.7717, + "step": 11207 + }, + { + "epoch": 6.321489001692047, + "grad_norm": 1.111345887184143, + "learning_rate": 1.8395375070501976e-05, + "loss": 0.6348, + "step": 11208 + }, + { + "epoch": 6.32205301748449, + "grad_norm": 0.935358464717865, + "learning_rate": 1.8392554991539765e-05, + "loss": 0.7508, + "step": 11209 + }, + { + "epoch": 6.322617033276932, + "grad_norm": 0.8856709003448486, + "learning_rate": 1.8389734912577554e-05, + "loss": 0.6569, + "step": 11210 + }, + { + "epoch": 6.323181049069374, + "grad_norm": 1.2770830392837524, + "learning_rate": 1.8386914833615342e-05, + "loss": 0.7854, + "step": 11211 + }, + { + "epoch": 6.323745064861816, + "grad_norm": 0.9372140765190125, + "learning_rate": 1.838409475465313e-05, + "loss": 0.6527, + "step": 11212 + }, + { + "epoch": 6.324309080654258, + "grad_norm": 1.315789818763733, + "learning_rate": 1.838127467569092e-05, + "loss": 0.7993, + "step": 11213 + }, + { + "epoch": 6.324873096446701, + "grad_norm": 1.0414528846740723, + "learning_rate": 1.837845459672871e-05, + "loss": 0.6884, + "step": 11214 + }, + { + "epoch": 6.325437112239142, + "grad_norm": 0.9458356499671936, + "learning_rate": 1.8375634517766498e-05, + "loss": 0.6738, + "step": 11215 + }, + { + "epoch": 6.326001128031585, + "grad_norm": 1.1214323043823242, + "learning_rate": 1.8372814438804286e-05, + "loss": 0.6498, + "step": 11216 + }, + { + "epoch": 6.326565143824027, + "grad_norm": 0.9355196952819824, + "learning_rate": 1.836999435984208e-05, + "loss": 0.7659, + "step": 11217 + }, + { + "epoch": 6.32712915961647, + "grad_norm": 1.0653319358825684, + "learning_rate": 1.8367174280879864e-05, + "loss": 0.7385, + "step": 11218 + }, + { + "epoch": 6.327693175408911, + "grad_norm": 0.8798925280570984, + "learning_rate": 1.8364354201917656e-05, + "loss": 0.6708, + "step": 11219 + }, + { + "epoch": 6.328257191201354, + "grad_norm": 1.3485745191574097, + "learning_rate": 1.836153412295544e-05, + "loss": 0.6941, + "step": 11220 + }, + { + "epoch": 6.328821206993796, + "grad_norm": 1.0507965087890625, + "learning_rate": 1.8358714043993234e-05, + "loss": 0.7808, + "step": 11221 + }, + { + "epoch": 6.329385222786238, + "grad_norm": 1.1963249444961548, + "learning_rate": 1.835589396503102e-05, + "loss": 0.7414, + "step": 11222 + }, + { + "epoch": 6.32994923857868, + "grad_norm": 1.5021562576293945, + "learning_rate": 1.835307388606881e-05, + "loss": 0.874, + "step": 11223 + }, + { + "epoch": 6.330513254371122, + "grad_norm": 1.120905876159668, + "learning_rate": 1.83502538071066e-05, + "loss": 0.6591, + "step": 11224 + }, + { + "epoch": 6.331077270163565, + "grad_norm": 0.9429426193237305, + "learning_rate": 1.834743372814439e-05, + "loss": 0.6501, + "step": 11225 + }, + { + "epoch": 6.331641285956007, + "grad_norm": 0.8040872812271118, + "learning_rate": 1.8344613649182178e-05, + "loss": 0.6704, + "step": 11226 + }, + { + "epoch": 6.3322053017484485, + "grad_norm": 1.018932580947876, + "learning_rate": 1.8341793570219966e-05, + "loss": 0.6852, + "step": 11227 + }, + { + "epoch": 6.332769317540891, + "grad_norm": 0.8793386220932007, + "learning_rate": 1.8338973491257755e-05, + "loss": 0.7379, + "step": 11228 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 0.7971097826957703, + "learning_rate": 1.8336153412295544e-05, + "loss": 0.6325, + "step": 11229 + }, + { + "epoch": 6.333897349125776, + "grad_norm": 1.0132005214691162, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.6983, + "step": 11230 + }, + { + "epoch": 6.3344613649182175, + "grad_norm": 0.9740996956825256, + "learning_rate": 1.833051325437112e-05, + "loss": 0.6586, + "step": 11231 + }, + { + "epoch": 6.33502538071066, + "grad_norm": 1.1211637258529663, + "learning_rate": 1.832769317540891e-05, + "loss": 0.8053, + "step": 11232 + }, + { + "epoch": 6.335589396503102, + "grad_norm": 1.6966606378555298, + "learning_rate": 1.8324873096446703e-05, + "loss": 0.7638, + "step": 11233 + }, + { + "epoch": 6.336153412295545, + "grad_norm": 1.1819274425506592, + "learning_rate": 1.832205301748449e-05, + "loss": 0.7333, + "step": 11234 + }, + { + "epoch": 6.3367174280879865, + "grad_norm": 1.1422703266143799, + "learning_rate": 1.831923293852228e-05, + "loss": 0.6148, + "step": 11235 + }, + { + "epoch": 6.337281443880428, + "grad_norm": 7.155543804168701, + "learning_rate": 1.831641285956007e-05, + "loss": 0.8909, + "step": 11236 + }, + { + "epoch": 6.337845459672871, + "grad_norm": 1.0878239870071411, + "learning_rate": 1.8313592780597858e-05, + "loss": 0.6623, + "step": 11237 + }, + { + "epoch": 6.338409475465313, + "grad_norm": 1.2424201965332031, + "learning_rate": 1.8310772701635647e-05, + "loss": 0.7321, + "step": 11238 + }, + { + "epoch": 6.3389734912577556, + "grad_norm": 1.2457631826400757, + "learning_rate": 1.8307952622673435e-05, + "loss": 0.8161, + "step": 11239 + }, + { + "epoch": 6.339537507050197, + "grad_norm": 1.2625325918197632, + "learning_rate": 1.8305132543711224e-05, + "loss": 0.8268, + "step": 11240 + }, + { + "epoch": 6.340101522842639, + "grad_norm": 0.9347015023231506, + "learning_rate": 1.8302312464749013e-05, + "loss": 0.767, + "step": 11241 + }, + { + "epoch": 6.340665538635082, + "grad_norm": 1.3908723592758179, + "learning_rate": 1.8299492385786805e-05, + "loss": 0.8291, + "step": 11242 + }, + { + "epoch": 6.341229554427524, + "grad_norm": 0.972091555595398, + "learning_rate": 1.829667230682459e-05, + "loss": 0.7134, + "step": 11243 + }, + { + "epoch": 6.341793570219966, + "grad_norm": 1.034291386604309, + "learning_rate": 1.8293852227862383e-05, + "loss": 0.6354, + "step": 11244 + }, + { + "epoch": 6.342357586012408, + "grad_norm": 0.8685824275016785, + "learning_rate": 1.8291032148900168e-05, + "loss": 0.6975, + "step": 11245 + }, + { + "epoch": 6.342921601804851, + "grad_norm": 0.737778902053833, + "learning_rate": 1.828821206993796e-05, + "loss": 0.6199, + "step": 11246 + }, + { + "epoch": 6.343485617597293, + "grad_norm": 1.2426457405090332, + "learning_rate": 1.828539199097575e-05, + "loss": 0.8587, + "step": 11247 + }, + { + "epoch": 6.344049633389735, + "grad_norm": 1.1563016176223755, + "learning_rate": 1.8282571912013538e-05, + "loss": 0.7711, + "step": 11248 + }, + { + "epoch": 6.344613649182177, + "grad_norm": 1.3048536777496338, + "learning_rate": 1.8279751833051327e-05, + "loss": 0.8218, + "step": 11249 + }, + { + "epoch": 6.345177664974619, + "grad_norm": 1.006488561630249, + "learning_rate": 1.8276931754089116e-05, + "loss": 0.7461, + "step": 11250 + }, + { + "epoch": 6.345741680767062, + "grad_norm": 1.0074316263198853, + "learning_rate": 1.8274111675126904e-05, + "loss": 0.8028, + "step": 11251 + }, + { + "epoch": 6.3463056965595035, + "grad_norm": 0.9216534495353699, + "learning_rate": 1.8271291596164693e-05, + "loss": 0.7354, + "step": 11252 + }, + { + "epoch": 6.346869712351946, + "grad_norm": 0.9892426133155823, + "learning_rate": 1.8268471517202482e-05, + "loss": 0.6975, + "step": 11253 + }, + { + "epoch": 6.347433728144388, + "grad_norm": 1.3499077558517456, + "learning_rate": 1.826565143824027e-05, + "loss": 0.7921, + "step": 11254 + }, + { + "epoch": 6.34799774393683, + "grad_norm": 1.2864482402801514, + "learning_rate": 1.826283135927806e-05, + "loss": 0.6974, + "step": 11255 + }, + { + "epoch": 6.3485617597292725, + "grad_norm": 0.9387160539627075, + "learning_rate": 1.8260011280315852e-05, + "loss": 0.7688, + "step": 11256 + }, + { + "epoch": 6.349125775521714, + "grad_norm": 0.7517344951629639, + "learning_rate": 1.8257191201353637e-05, + "loss": 0.6291, + "step": 11257 + }, + { + "epoch": 6.349689791314157, + "grad_norm": 0.9541428685188293, + "learning_rate": 1.825437112239143e-05, + "loss": 0.7987, + "step": 11258 + }, + { + "epoch": 6.350253807106599, + "grad_norm": 1.1475790739059448, + "learning_rate": 1.8251551043429215e-05, + "loss": 0.7406, + "step": 11259 + }, + { + "epoch": 6.3508178228990415, + "grad_norm": 1.5241367816925049, + "learning_rate": 1.8248730964467007e-05, + "loss": 0.8084, + "step": 11260 + }, + { + "epoch": 6.351381838691483, + "grad_norm": 1.394127607345581, + "learning_rate": 1.8245910885504792e-05, + "loss": 0.8663, + "step": 11261 + }, + { + "epoch": 6.351945854483926, + "grad_norm": 1.3431316614151, + "learning_rate": 1.8243090806542585e-05, + "loss": 0.7239, + "step": 11262 + }, + { + "epoch": 6.352509870276368, + "grad_norm": 1.1548618078231812, + "learning_rate": 1.8240270727580373e-05, + "loss": 0.7075, + "step": 11263 + }, + { + "epoch": 6.35307388606881, + "grad_norm": 1.713118076324463, + "learning_rate": 1.8237450648618162e-05, + "loss": 0.799, + "step": 11264 + }, + { + "epoch": 6.353637901861252, + "grad_norm": 1.8408926725387573, + "learning_rate": 1.823463056965595e-05, + "loss": 0.8624, + "step": 11265 + }, + { + "epoch": 6.354201917653694, + "grad_norm": 1.0158584117889404, + "learning_rate": 1.823181049069374e-05, + "loss": 0.7679, + "step": 11266 + }, + { + "epoch": 6.354765933446137, + "grad_norm": 1.2034032344818115, + "learning_rate": 1.822899041173153e-05, + "loss": 0.7463, + "step": 11267 + }, + { + "epoch": 6.355329949238579, + "grad_norm": 0.8593761324882507, + "learning_rate": 1.8226170332769317e-05, + "loss": 0.6759, + "step": 11268 + }, + { + "epoch": 6.35589396503102, + "grad_norm": 1.070250391960144, + "learning_rate": 1.822335025380711e-05, + "loss": 0.6922, + "step": 11269 + }, + { + "epoch": 6.356457980823463, + "grad_norm": 0.8605260848999023, + "learning_rate": 1.8220530174844895e-05, + "loss": 0.6153, + "step": 11270 + }, + { + "epoch": 6.357021996615905, + "grad_norm": 1.4335697889328003, + "learning_rate": 1.8217710095882687e-05, + "loss": 0.7206, + "step": 11271 + }, + { + "epoch": 6.357586012408348, + "grad_norm": 1.0626319646835327, + "learning_rate": 1.8214890016920476e-05, + "loss": 0.7242, + "step": 11272 + }, + { + "epoch": 6.3581500282007894, + "grad_norm": 1.9592983722686768, + "learning_rate": 1.8212069937958265e-05, + "loss": 0.7877, + "step": 11273 + }, + { + "epoch": 6.358714043993232, + "grad_norm": 1.0466361045837402, + "learning_rate": 1.8209249858996053e-05, + "loss": 0.7271, + "step": 11274 + }, + { + "epoch": 6.359278059785674, + "grad_norm": 1.0422563552856445, + "learning_rate": 1.8206429780033842e-05, + "loss": 0.684, + "step": 11275 + }, + { + "epoch": 6.359842075578117, + "grad_norm": 0.84504634141922, + "learning_rate": 1.820360970107163e-05, + "loss": 0.6285, + "step": 11276 + }, + { + "epoch": 6.3604060913705585, + "grad_norm": 1.3834854364395142, + "learning_rate": 1.820078962210942e-05, + "loss": 0.7541, + "step": 11277 + }, + { + "epoch": 6.360970107163, + "grad_norm": 0.9975838661193848, + "learning_rate": 1.819796954314721e-05, + "loss": 0.7861, + "step": 11278 + }, + { + "epoch": 6.361534122955443, + "grad_norm": 0.9208999276161194, + "learning_rate": 1.8195149464184997e-05, + "loss": 0.7652, + "step": 11279 + }, + { + "epoch": 6.362098138747885, + "grad_norm": 1.4911391735076904, + "learning_rate": 1.8192329385222786e-05, + "loss": 0.7133, + "step": 11280 + }, + { + "epoch": 6.3626621545403275, + "grad_norm": 1.1573470830917358, + "learning_rate": 1.818950930626058e-05, + "loss": 0.7097, + "step": 11281 + }, + { + "epoch": 6.363226170332769, + "grad_norm": 1.156683087348938, + "learning_rate": 1.8186689227298364e-05, + "loss": 0.7422, + "step": 11282 + }, + { + "epoch": 6.363790186125211, + "grad_norm": 1.4215378761291504, + "learning_rate": 1.8183869148336156e-05, + "loss": 0.6648, + "step": 11283 + }, + { + "epoch": 6.364354201917654, + "grad_norm": 0.9434122443199158, + "learning_rate": 1.818104906937394e-05, + "loss": 0.6656, + "step": 11284 + }, + { + "epoch": 6.364918217710096, + "grad_norm": 0.7654576301574707, + "learning_rate": 1.8178228990411734e-05, + "loss": 0.6626, + "step": 11285 + }, + { + "epoch": 6.365482233502538, + "grad_norm": 1.1130397319793701, + "learning_rate": 1.8175408911449522e-05, + "loss": 0.7847, + "step": 11286 + }, + { + "epoch": 6.36604624929498, + "grad_norm": 1.1706557273864746, + "learning_rate": 1.817258883248731e-05, + "loss": 0.7395, + "step": 11287 + }, + { + "epoch": 6.366610265087423, + "grad_norm": 1.0253511667251587, + "learning_rate": 1.81697687535251e-05, + "loss": 0.8493, + "step": 11288 + }, + { + "epoch": 6.367174280879865, + "grad_norm": 1.2636332511901855, + "learning_rate": 1.816694867456289e-05, + "loss": 0.796, + "step": 11289 + }, + { + "epoch": 6.367738296672307, + "grad_norm": 1.1265567541122437, + "learning_rate": 1.8164128595600678e-05, + "loss": 0.7607, + "step": 11290 + }, + { + "epoch": 6.368302312464749, + "grad_norm": 0.9262017607688904, + "learning_rate": 1.8161308516638466e-05, + "loss": 0.6514, + "step": 11291 + }, + { + "epoch": 6.368866328257191, + "grad_norm": 1.3698816299438477, + "learning_rate": 1.8158488437676255e-05, + "loss": 0.818, + "step": 11292 + }, + { + "epoch": 6.369430344049634, + "grad_norm": 1.0449045896530151, + "learning_rate": 1.8155668358714044e-05, + "loss": 0.7708, + "step": 11293 + }, + { + "epoch": 6.369994359842075, + "grad_norm": 1.1440131664276123, + "learning_rate": 1.8152848279751833e-05, + "loss": 0.7869, + "step": 11294 + }, + { + "epoch": 6.370558375634518, + "grad_norm": 1.0492939949035645, + "learning_rate": 1.8150028200789625e-05, + "loss": 0.7061, + "step": 11295 + }, + { + "epoch": 6.37112239142696, + "grad_norm": 1.327984094619751, + "learning_rate": 1.814720812182741e-05, + "loss": 0.7497, + "step": 11296 + }, + { + "epoch": 6.371686407219402, + "grad_norm": 1.2009598016738892, + "learning_rate": 1.8144388042865203e-05, + "loss": 0.7074, + "step": 11297 + }, + { + "epoch": 6.372250423011844, + "grad_norm": 0.9568209648132324, + "learning_rate": 1.8141567963902988e-05, + "loss": 0.7462, + "step": 11298 + }, + { + "epoch": 6.372814438804286, + "grad_norm": 0.8958033323287964, + "learning_rate": 1.813874788494078e-05, + "loss": 0.6626, + "step": 11299 + }, + { + "epoch": 6.373378454596729, + "grad_norm": 1.1286934614181519, + "learning_rate": 1.8135927805978566e-05, + "loss": 0.8213, + "step": 11300 + }, + { + "epoch": 6.373942470389171, + "grad_norm": 1.0369073152542114, + "learning_rate": 1.8133107727016358e-05, + "loss": 0.694, + "step": 11301 + }, + { + "epoch": 6.374506486181613, + "grad_norm": 0.7972751259803772, + "learning_rate": 1.8130287648054147e-05, + "loss": 0.6823, + "step": 11302 + }, + { + "epoch": 6.375070501974055, + "grad_norm": 0.9023282527923584, + "learning_rate": 1.8127467569091935e-05, + "loss": 0.7362, + "step": 11303 + }, + { + "epoch": 6.375634517766498, + "grad_norm": 1.2346457242965698, + "learning_rate": 1.8124647490129728e-05, + "loss": 0.7829, + "step": 11304 + }, + { + "epoch": 6.37619853355894, + "grad_norm": 0.8777521252632141, + "learning_rate": 1.8121827411167513e-05, + "loss": 0.7216, + "step": 11305 + }, + { + "epoch": 6.3767625493513815, + "grad_norm": 1.4850093126296997, + "learning_rate": 1.8119007332205305e-05, + "loss": 0.7821, + "step": 11306 + }, + { + "epoch": 6.377326565143824, + "grad_norm": 1.0472331047058105, + "learning_rate": 1.811618725324309e-05, + "loss": 0.6317, + "step": 11307 + }, + { + "epoch": 6.377890580936266, + "grad_norm": 1.2241551876068115, + "learning_rate": 1.8113367174280883e-05, + "loss": 0.7452, + "step": 11308 + }, + { + "epoch": 6.378454596728709, + "grad_norm": 1.0484418869018555, + "learning_rate": 1.8110547095318668e-05, + "loss": 0.6667, + "step": 11309 + }, + { + "epoch": 6.3790186125211505, + "grad_norm": 0.9805039763450623, + "learning_rate": 1.810772701635646e-05, + "loss": 0.7544, + "step": 11310 + }, + { + "epoch": 6.379582628313592, + "grad_norm": 1.392948031425476, + "learning_rate": 1.810490693739425e-05, + "loss": 0.7789, + "step": 11311 + }, + { + "epoch": 6.380146644106035, + "grad_norm": 0.8720724582672119, + "learning_rate": 1.8102086858432038e-05, + "loss": 0.6791, + "step": 11312 + }, + { + "epoch": 6.380710659898477, + "grad_norm": 1.0465384721755981, + "learning_rate": 1.8099266779469827e-05, + "loss": 0.7353, + "step": 11313 + }, + { + "epoch": 6.3812746756909196, + "grad_norm": 1.0902646780014038, + "learning_rate": 1.8096446700507615e-05, + "loss": 0.8223, + "step": 11314 + }, + { + "epoch": 6.381838691483361, + "grad_norm": 1.309880256652832, + "learning_rate": 1.8093626621545404e-05, + "loss": 0.8705, + "step": 11315 + }, + { + "epoch": 6.382402707275804, + "grad_norm": 0.85999995470047, + "learning_rate": 1.8090806542583193e-05, + "loss": 0.6832, + "step": 11316 + }, + { + "epoch": 6.382966723068246, + "grad_norm": 0.7547513842582703, + "learning_rate": 1.8087986463620982e-05, + "loss": 0.5954, + "step": 11317 + }, + { + "epoch": 6.383530738860689, + "grad_norm": 0.8836095333099365, + "learning_rate": 1.808516638465877e-05, + "loss": 0.7128, + "step": 11318 + }, + { + "epoch": 6.38409475465313, + "grad_norm": 1.2331633567810059, + "learning_rate": 1.808234630569656e-05, + "loss": 0.7348, + "step": 11319 + }, + { + "epoch": 6.384658770445572, + "grad_norm": 1.0138297080993652, + "learning_rate": 1.807952622673435e-05, + "loss": 0.6909, + "step": 11320 + }, + { + "epoch": 6.385222786238015, + "grad_norm": 1.1705825328826904, + "learning_rate": 1.8076706147772137e-05, + "loss": 0.9506, + "step": 11321 + }, + { + "epoch": 6.385786802030457, + "grad_norm": 0.919615626335144, + "learning_rate": 1.807388606880993e-05, + "loss": 0.7095, + "step": 11322 + }, + { + "epoch": 6.386350817822899, + "grad_norm": 1.345691442489624, + "learning_rate": 1.8071065989847715e-05, + "loss": 0.8585, + "step": 11323 + }, + { + "epoch": 6.386914833615341, + "grad_norm": 1.0110572576522827, + "learning_rate": 1.8068245910885507e-05, + "loss": 0.6791, + "step": 11324 + }, + { + "epoch": 6.387478849407783, + "grad_norm": 1.4315176010131836, + "learning_rate": 1.8065425831923292e-05, + "loss": 0.7223, + "step": 11325 + }, + { + "epoch": 6.388042865200226, + "grad_norm": 1.1776955127716064, + "learning_rate": 1.8062605752961084e-05, + "loss": 0.7652, + "step": 11326 + }, + { + "epoch": 6.3886068809926675, + "grad_norm": 1.1956591606140137, + "learning_rate": 1.8059785673998873e-05, + "loss": 0.6814, + "step": 11327 + }, + { + "epoch": 6.38917089678511, + "grad_norm": 1.2066136598587036, + "learning_rate": 1.8056965595036662e-05, + "loss": 0.7671, + "step": 11328 + }, + { + "epoch": 6.389734912577552, + "grad_norm": 0.9045932292938232, + "learning_rate": 1.805414551607445e-05, + "loss": 0.6579, + "step": 11329 + }, + { + "epoch": 6.390298928369995, + "grad_norm": 0.8614168763160706, + "learning_rate": 1.805132543711224e-05, + "loss": 0.7106, + "step": 11330 + }, + { + "epoch": 6.3908629441624365, + "grad_norm": 1.4606986045837402, + "learning_rate": 1.804850535815003e-05, + "loss": 0.7728, + "step": 11331 + }, + { + "epoch": 6.391426959954879, + "grad_norm": 1.2388503551483154, + "learning_rate": 1.8045685279187817e-05, + "loss": 0.7241, + "step": 11332 + }, + { + "epoch": 6.391990975747321, + "grad_norm": 1.0450559854507446, + "learning_rate": 1.8042865200225606e-05, + "loss": 0.7343, + "step": 11333 + }, + { + "epoch": 6.392554991539763, + "grad_norm": 1.1886157989501953, + "learning_rate": 1.8040045121263395e-05, + "loss": 0.8267, + "step": 11334 + }, + { + "epoch": 6.3931190073322055, + "grad_norm": 0.9601023197174072, + "learning_rate": 1.8037225042301184e-05, + "loss": 0.6359, + "step": 11335 + }, + { + "epoch": 6.393683023124647, + "grad_norm": 0.9339039921760559, + "learning_rate": 1.8034404963338976e-05, + "loss": 0.652, + "step": 11336 + }, + { + "epoch": 6.39424703891709, + "grad_norm": 1.5466573238372803, + "learning_rate": 1.803158488437676e-05, + "loss": 0.7366, + "step": 11337 + }, + { + "epoch": 6.394811054709532, + "grad_norm": 1.3004595041275024, + "learning_rate": 1.8028764805414553e-05, + "loss": 0.7843, + "step": 11338 + }, + { + "epoch": 6.395375070501974, + "grad_norm": 1.226999282836914, + "learning_rate": 1.8025944726452342e-05, + "loss": 0.7484, + "step": 11339 + }, + { + "epoch": 6.395939086294416, + "grad_norm": 0.9002104997634888, + "learning_rate": 1.802312464749013e-05, + "loss": 0.6287, + "step": 11340 + }, + { + "epoch": 6.396503102086858, + "grad_norm": 0.9779609441757202, + "learning_rate": 1.802030456852792e-05, + "loss": 0.7111, + "step": 11341 + }, + { + "epoch": 6.397067117879301, + "grad_norm": 1.227537751197815, + "learning_rate": 1.801748448956571e-05, + "loss": 0.8787, + "step": 11342 + }, + { + "epoch": 6.397631133671743, + "grad_norm": 1.4203547239303589, + "learning_rate": 1.8014664410603497e-05, + "loss": 0.7596, + "step": 11343 + }, + { + "epoch": 6.398195149464185, + "grad_norm": 0.8724333047866821, + "learning_rate": 1.8011844331641286e-05, + "loss": 0.5804, + "step": 11344 + }, + { + "epoch": 6.398759165256627, + "grad_norm": 1.2066410779953003, + "learning_rate": 1.800902425267908e-05, + "loss": 0.832, + "step": 11345 + }, + { + "epoch": 6.39932318104907, + "grad_norm": 1.0521142482757568, + "learning_rate": 1.8006204173716864e-05, + "loss": 0.7344, + "step": 11346 + }, + { + "epoch": 6.399887196841512, + "grad_norm": 1.0226383209228516, + "learning_rate": 1.8003384094754656e-05, + "loss": 0.7562, + "step": 11347 + }, + { + "epoch": 6.4004512126339534, + "grad_norm": 0.8969743251800537, + "learning_rate": 1.800056401579244e-05, + "loss": 0.6693, + "step": 11348 + }, + { + "epoch": 6.401015228426396, + "grad_norm": 0.9998248219490051, + "learning_rate": 1.7997743936830234e-05, + "loss": 0.7409, + "step": 11349 + }, + { + "epoch": 6.401579244218838, + "grad_norm": 0.940938413143158, + "learning_rate": 1.7994923857868022e-05, + "loss": 0.6836, + "step": 11350 + }, + { + "epoch": 6.402143260011281, + "grad_norm": 1.3168785572052002, + "learning_rate": 1.799210377890581e-05, + "loss": 0.9165, + "step": 11351 + }, + { + "epoch": 6.4027072758037225, + "grad_norm": 0.7456610798835754, + "learning_rate": 1.79892836999436e-05, + "loss": 0.6329, + "step": 11352 + }, + { + "epoch": 6.403271291596164, + "grad_norm": 1.2196818590164185, + "learning_rate": 1.798646362098139e-05, + "loss": 0.8329, + "step": 11353 + }, + { + "epoch": 6.403835307388607, + "grad_norm": 1.031862497329712, + "learning_rate": 1.7983643542019178e-05, + "loss": 0.6848, + "step": 11354 + }, + { + "epoch": 6.404399323181049, + "grad_norm": 1.0984278917312622, + "learning_rate": 1.7980823463056966e-05, + "loss": 0.8482, + "step": 11355 + }, + { + "epoch": 6.4049633389734915, + "grad_norm": 1.2186527252197266, + "learning_rate": 1.7978003384094755e-05, + "loss": 0.7931, + "step": 11356 + }, + { + "epoch": 6.405527354765933, + "grad_norm": 1.3260729312896729, + "learning_rate": 1.7975183305132544e-05, + "loss": 0.7609, + "step": 11357 + }, + { + "epoch": 6.406091370558376, + "grad_norm": 0.9936665892601013, + "learning_rate": 1.7972363226170333e-05, + "loss": 0.7497, + "step": 11358 + }, + { + "epoch": 6.406655386350818, + "grad_norm": 1.258534550666809, + "learning_rate": 1.7969543147208125e-05, + "loss": 0.7325, + "step": 11359 + }, + { + "epoch": 6.4072194021432605, + "grad_norm": 1.0052410364151, + "learning_rate": 1.796672306824591e-05, + "loss": 0.7658, + "step": 11360 + }, + { + "epoch": 6.407783417935702, + "grad_norm": 1.0151399374008179, + "learning_rate": 1.7963902989283702e-05, + "loss": 0.6841, + "step": 11361 + }, + { + "epoch": 6.408347433728144, + "grad_norm": 1.0287957191467285, + "learning_rate": 1.7961082910321488e-05, + "loss": 0.7092, + "step": 11362 + }, + { + "epoch": 6.408911449520587, + "grad_norm": 1.2091540098190308, + "learning_rate": 1.795826283135928e-05, + "loss": 0.7404, + "step": 11363 + }, + { + "epoch": 6.409475465313029, + "grad_norm": 1.1472861766815186, + "learning_rate": 1.7955442752397065e-05, + "loss": 0.6992, + "step": 11364 + }, + { + "epoch": 6.410039481105471, + "grad_norm": 1.2814345359802246, + "learning_rate": 1.7952622673434858e-05, + "loss": 0.8223, + "step": 11365 + }, + { + "epoch": 6.410603496897913, + "grad_norm": 1.1136568784713745, + "learning_rate": 1.7949802594472646e-05, + "loss": 0.7272, + "step": 11366 + }, + { + "epoch": 6.411167512690355, + "grad_norm": 1.0087083578109741, + "learning_rate": 1.7946982515510435e-05, + "loss": 0.8226, + "step": 11367 + }, + { + "epoch": 6.411731528482798, + "grad_norm": 0.8932173848152161, + "learning_rate": 1.7944162436548224e-05, + "loss": 0.6596, + "step": 11368 + }, + { + "epoch": 6.412295544275239, + "grad_norm": 1.1408904790878296, + "learning_rate": 1.7941342357586013e-05, + "loss": 0.6726, + "step": 11369 + }, + { + "epoch": 6.412859560067682, + "grad_norm": 1.4155068397521973, + "learning_rate": 1.79385222786238e-05, + "loss": 0.8495, + "step": 11370 + }, + { + "epoch": 6.413423575860124, + "grad_norm": 1.7791941165924072, + "learning_rate": 1.793570219966159e-05, + "loss": 0.8534, + "step": 11371 + }, + { + "epoch": 6.413987591652567, + "grad_norm": 1.136340856552124, + "learning_rate": 1.793288212069938e-05, + "loss": 0.626, + "step": 11372 + }, + { + "epoch": 6.414551607445008, + "grad_norm": 1.0670219659805298, + "learning_rate": 1.7930062041737168e-05, + "loss": 0.901, + "step": 11373 + }, + { + "epoch": 6.415115623237451, + "grad_norm": 1.1925642490386963, + "learning_rate": 1.792724196277496e-05, + "loss": 0.7663, + "step": 11374 + }, + { + "epoch": 6.415679639029893, + "grad_norm": 1.0506480932235718, + "learning_rate": 1.792442188381275e-05, + "loss": 0.693, + "step": 11375 + }, + { + "epoch": 6.416243654822335, + "grad_norm": 1.215370535850525, + "learning_rate": 1.7921601804850538e-05, + "loss": 0.7593, + "step": 11376 + }, + { + "epoch": 6.416807670614777, + "grad_norm": 1.1456736326217651, + "learning_rate": 1.7918781725888327e-05, + "loss": 0.7833, + "step": 11377 + }, + { + "epoch": 6.417371686407219, + "grad_norm": 1.5017048120498657, + "learning_rate": 1.7915961646926115e-05, + "loss": 0.8059, + "step": 11378 + }, + { + "epoch": 6.417935702199662, + "grad_norm": 0.8633458614349365, + "learning_rate": 1.7913141567963904e-05, + "loss": 0.6621, + "step": 11379 + }, + { + "epoch": 6.418499717992104, + "grad_norm": 1.2409170866012573, + "learning_rate": 1.7910321489001693e-05, + "loss": 0.7138, + "step": 11380 + }, + { + "epoch": 6.4190637337845455, + "grad_norm": 1.0911363363265991, + "learning_rate": 1.7907501410039482e-05, + "loss": 0.7719, + "step": 11381 + }, + { + "epoch": 6.419627749576988, + "grad_norm": 1.271601915359497, + "learning_rate": 1.790468133107727e-05, + "loss": 0.7551, + "step": 11382 + }, + { + "epoch": 6.42019176536943, + "grad_norm": 0.8961488604545593, + "learning_rate": 1.790186125211506e-05, + "loss": 0.6368, + "step": 11383 + }, + { + "epoch": 6.420755781161873, + "grad_norm": 1.4645144939422607, + "learning_rate": 1.789904117315285e-05, + "loss": 0.7578, + "step": 11384 + }, + { + "epoch": 6.4213197969543145, + "grad_norm": 0.998669445514679, + "learning_rate": 1.7896221094190637e-05, + "loss": 0.6932, + "step": 11385 + }, + { + "epoch": 6.421883812746757, + "grad_norm": 1.0862691402435303, + "learning_rate": 1.789340101522843e-05, + "loss": 0.7923, + "step": 11386 + }, + { + "epoch": 6.422447828539199, + "grad_norm": 1.173288345336914, + "learning_rate": 1.7890580936266215e-05, + "loss": 0.7233, + "step": 11387 + }, + { + "epoch": 6.423011844331642, + "grad_norm": 0.987878680229187, + "learning_rate": 1.7887760857304007e-05, + "loss": 0.6097, + "step": 11388 + }, + { + "epoch": 6.4235758601240835, + "grad_norm": 0.8813624978065491, + "learning_rate": 1.7884940778341792e-05, + "loss": 0.6697, + "step": 11389 + }, + { + "epoch": 6.424139875916525, + "grad_norm": 1.472568392753601, + "learning_rate": 1.7882120699379584e-05, + "loss": 0.7072, + "step": 11390 + }, + { + "epoch": 6.424703891708968, + "grad_norm": 1.1208537817001343, + "learning_rate": 1.7879300620417373e-05, + "loss": 0.6422, + "step": 11391 + }, + { + "epoch": 6.42526790750141, + "grad_norm": 1.15188467502594, + "learning_rate": 1.7876480541455162e-05, + "loss": 0.7611, + "step": 11392 + }, + { + "epoch": 6.4258319232938526, + "grad_norm": 1.3234823942184448, + "learning_rate": 1.787366046249295e-05, + "loss": 0.704, + "step": 11393 + }, + { + "epoch": 6.426395939086294, + "grad_norm": 0.8731600046157837, + "learning_rate": 1.787084038353074e-05, + "loss": 0.67, + "step": 11394 + }, + { + "epoch": 6.426959954878736, + "grad_norm": 1.356169581413269, + "learning_rate": 1.786802030456853e-05, + "loss": 0.6983, + "step": 11395 + }, + { + "epoch": 6.427523970671179, + "grad_norm": 0.8798912167549133, + "learning_rate": 1.7865200225606317e-05, + "loss": 0.6045, + "step": 11396 + }, + { + "epoch": 6.428087986463621, + "grad_norm": 1.1385140419006348, + "learning_rate": 1.7862380146644106e-05, + "loss": 0.7118, + "step": 11397 + }, + { + "epoch": 6.428652002256063, + "grad_norm": 0.9220947027206421, + "learning_rate": 1.7859560067681895e-05, + "loss": 0.6198, + "step": 11398 + }, + { + "epoch": 6.429216018048505, + "grad_norm": 1.2104318141937256, + "learning_rate": 1.7856739988719684e-05, + "loss": 0.7882, + "step": 11399 + }, + { + "epoch": 6.429780033840948, + "grad_norm": 1.1201468706130981, + "learning_rate": 1.7853919909757476e-05, + "loss": 0.7922, + "step": 11400 + }, + { + "epoch": 6.43034404963339, + "grad_norm": 0.9711065292358398, + "learning_rate": 1.785109983079526e-05, + "loss": 0.7896, + "step": 11401 + }, + { + "epoch": 6.430908065425832, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.7848279751833053e-05, + "loss": 0.7258, + "step": 11402 + }, + { + "epoch": 6.431472081218274, + "grad_norm": 1.5751101970672607, + "learning_rate": 1.784545967287084e-05, + "loss": 0.7552, + "step": 11403 + }, + { + "epoch": 6.432036097010716, + "grad_norm": 1.1882076263427734, + "learning_rate": 1.784263959390863e-05, + "loss": 0.7104, + "step": 11404 + }, + { + "epoch": 6.432600112803159, + "grad_norm": 1.3201184272766113, + "learning_rate": 1.783981951494642e-05, + "loss": 0.84, + "step": 11405 + }, + { + "epoch": 6.4331641285956005, + "grad_norm": 0.8822938799858093, + "learning_rate": 1.783699943598421e-05, + "loss": 0.6821, + "step": 11406 + }, + { + "epoch": 6.433728144388043, + "grad_norm": 1.109986662864685, + "learning_rate": 1.7834179357021997e-05, + "loss": 0.6845, + "step": 11407 + }, + { + "epoch": 6.434292160180485, + "grad_norm": 1.3514453172683716, + "learning_rate": 1.7831359278059786e-05, + "loss": 0.6107, + "step": 11408 + }, + { + "epoch": 6.434856175972927, + "grad_norm": 0.8725614547729492, + "learning_rate": 1.7828539199097578e-05, + "loss": 0.7047, + "step": 11409 + }, + { + "epoch": 6.4354201917653695, + "grad_norm": 0.9833052754402161, + "learning_rate": 1.7825719120135364e-05, + "loss": 0.7004, + "step": 11410 + }, + { + "epoch": 6.435984207557811, + "grad_norm": 1.153868556022644, + "learning_rate": 1.7822899041173156e-05, + "loss": 0.7055, + "step": 11411 + }, + { + "epoch": 6.436548223350254, + "grad_norm": 0.8900769948959351, + "learning_rate": 1.782007896221094e-05, + "loss": 0.6754, + "step": 11412 + }, + { + "epoch": 6.437112239142696, + "grad_norm": 1.0769089460372925, + "learning_rate": 1.7817258883248733e-05, + "loss": 0.6608, + "step": 11413 + }, + { + "epoch": 6.4376762549351385, + "grad_norm": 1.068122386932373, + "learning_rate": 1.7814438804286522e-05, + "loss": 0.6832, + "step": 11414 + }, + { + "epoch": 6.43824027072758, + "grad_norm": 0.9880443215370178, + "learning_rate": 1.781161872532431e-05, + "loss": 0.6654, + "step": 11415 + }, + { + "epoch": 6.438804286520023, + "grad_norm": 0.9677198529243469, + "learning_rate": 1.78087986463621e-05, + "loss": 0.7099, + "step": 11416 + }, + { + "epoch": 6.439368302312465, + "grad_norm": 0.9178690314292908, + "learning_rate": 1.780597856739989e-05, + "loss": 0.7031, + "step": 11417 + }, + { + "epoch": 6.439932318104907, + "grad_norm": 0.984250545501709, + "learning_rate": 1.7803158488437677e-05, + "loss": 0.6681, + "step": 11418 + }, + { + "epoch": 6.440496333897349, + "grad_norm": 0.9450239539146423, + "learning_rate": 1.7800338409475466e-05, + "loss": 0.8002, + "step": 11419 + }, + { + "epoch": 6.441060349689791, + "grad_norm": 1.212814450263977, + "learning_rate": 1.7797518330513255e-05, + "loss": 0.7545, + "step": 11420 + }, + { + "epoch": 6.441624365482234, + "grad_norm": 1.2430753707885742, + "learning_rate": 1.7794698251551044e-05, + "loss": 0.8113, + "step": 11421 + }, + { + "epoch": 6.442188381274676, + "grad_norm": 0.8811373114585876, + "learning_rate": 1.7791878172588833e-05, + "loss": 0.7375, + "step": 11422 + }, + { + "epoch": 6.442752397067117, + "grad_norm": 0.8143208622932434, + "learning_rate": 1.7789058093626625e-05, + "loss": 0.6577, + "step": 11423 + }, + { + "epoch": 6.44331641285956, + "grad_norm": 1.5077457427978516, + "learning_rate": 1.778623801466441e-05, + "loss": 0.7255, + "step": 11424 + }, + { + "epoch": 6.443880428652002, + "grad_norm": 1.0612092018127441, + "learning_rate": 1.7783417935702202e-05, + "loss": 0.684, + "step": 11425 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 0.8639861345291138, + "learning_rate": 1.7780597856739988e-05, + "loss": 0.7173, + "step": 11426 + }, + { + "epoch": 6.4450084602368864, + "grad_norm": 0.9688913822174072, + "learning_rate": 1.777777777777778e-05, + "loss": 0.7315, + "step": 11427 + }, + { + "epoch": 6.445572476029329, + "grad_norm": 1.550399899482727, + "learning_rate": 1.7774957698815565e-05, + "loss": 0.7693, + "step": 11428 + }, + { + "epoch": 6.446136491821771, + "grad_norm": 1.2488024234771729, + "learning_rate": 1.7772137619853358e-05, + "loss": 0.7163, + "step": 11429 + }, + { + "epoch": 6.446700507614214, + "grad_norm": 1.1152859926223755, + "learning_rate": 1.7769317540891146e-05, + "loss": 0.7446, + "step": 11430 + }, + { + "epoch": 6.4472645234066555, + "grad_norm": 0.8698285222053528, + "learning_rate": 1.7766497461928935e-05, + "loss": 0.6693, + "step": 11431 + }, + { + "epoch": 6.447828539199097, + "grad_norm": 1.5541472434997559, + "learning_rate": 1.7763677382966724e-05, + "loss": 0.8245, + "step": 11432 + }, + { + "epoch": 6.44839255499154, + "grad_norm": 1.053213357925415, + "learning_rate": 1.7760857304004513e-05, + "loss": 0.8125, + "step": 11433 + }, + { + "epoch": 6.448956570783982, + "grad_norm": 1.0270707607269287, + "learning_rate": 1.77580372250423e-05, + "loss": 0.7833, + "step": 11434 + }, + { + "epoch": 6.4495205865764245, + "grad_norm": 0.8798156976699829, + "learning_rate": 1.775521714608009e-05, + "loss": 0.7243, + "step": 11435 + }, + { + "epoch": 6.450084602368866, + "grad_norm": 1.4524528980255127, + "learning_rate": 1.775239706711788e-05, + "loss": 0.7343, + "step": 11436 + }, + { + "epoch": 6.450648618161308, + "grad_norm": 1.1145578622817993, + "learning_rate": 1.7749576988155668e-05, + "loss": 0.7534, + "step": 11437 + }, + { + "epoch": 6.451212633953751, + "grad_norm": 0.9664437770843506, + "learning_rate": 1.7746756909193457e-05, + "loss": 0.7393, + "step": 11438 + }, + { + "epoch": 6.451776649746193, + "grad_norm": 1.0106427669525146, + "learning_rate": 1.774393683023125e-05, + "loss": 0.7922, + "step": 11439 + }, + { + "epoch": 6.452340665538635, + "grad_norm": 1.0083757638931274, + "learning_rate": 1.7741116751269034e-05, + "loss": 0.7192, + "step": 11440 + }, + { + "epoch": 6.452904681331077, + "grad_norm": 1.0414685010910034, + "learning_rate": 1.7738296672306827e-05, + "loss": 0.7069, + "step": 11441 + }, + { + "epoch": 6.45346869712352, + "grad_norm": 1.267220377922058, + "learning_rate": 1.7735476593344612e-05, + "loss": 0.7952, + "step": 11442 + }, + { + "epoch": 6.454032712915962, + "grad_norm": 0.9736956357955933, + "learning_rate": 1.7732656514382404e-05, + "loss": 0.6346, + "step": 11443 + }, + { + "epoch": 6.454596728708404, + "grad_norm": 1.4044742584228516, + "learning_rate": 1.7729836435420193e-05, + "loss": 0.7528, + "step": 11444 + }, + { + "epoch": 6.455160744500846, + "grad_norm": 1.2226319313049316, + "learning_rate": 1.7727016356457982e-05, + "loss": 0.8311, + "step": 11445 + }, + { + "epoch": 6.455724760293288, + "grad_norm": 0.9666396379470825, + "learning_rate": 1.772419627749577e-05, + "loss": 0.7497, + "step": 11446 + }, + { + "epoch": 6.456288776085731, + "grad_norm": 1.1427226066589355, + "learning_rate": 1.772137619853356e-05, + "loss": 0.7398, + "step": 11447 + }, + { + "epoch": 6.456852791878172, + "grad_norm": 1.3043365478515625, + "learning_rate": 1.771855611957135e-05, + "loss": 0.8303, + "step": 11448 + }, + { + "epoch": 6.457416807670615, + "grad_norm": 1.149402379989624, + "learning_rate": 1.7715736040609137e-05, + "loss": 0.7523, + "step": 11449 + }, + { + "epoch": 6.457980823463057, + "grad_norm": 1.5773786306381226, + "learning_rate": 1.771291596164693e-05, + "loss": 0.8019, + "step": 11450 + }, + { + "epoch": 6.458544839255499, + "grad_norm": 1.006974458694458, + "learning_rate": 1.7710095882684714e-05, + "loss": 0.737, + "step": 11451 + }, + { + "epoch": 6.459108855047941, + "grad_norm": 1.0575344562530518, + "learning_rate": 1.7707275803722507e-05, + "loss": 0.7991, + "step": 11452 + }, + { + "epoch": 6.459672870840383, + "grad_norm": 0.8887671828269958, + "learning_rate": 1.7704455724760295e-05, + "loss": 0.6685, + "step": 11453 + }, + { + "epoch": 6.460236886632826, + "grad_norm": 1.2076395750045776, + "learning_rate": 1.7701635645798084e-05, + "loss": 0.8268, + "step": 11454 + }, + { + "epoch": 6.460800902425268, + "grad_norm": 1.118390679359436, + "learning_rate": 1.7698815566835873e-05, + "loss": 0.7451, + "step": 11455 + }, + { + "epoch": 6.46136491821771, + "grad_norm": 1.4645934104919434, + "learning_rate": 1.7695995487873662e-05, + "loss": 0.7222, + "step": 11456 + }, + { + "epoch": 6.461928934010152, + "grad_norm": 0.9727759957313538, + "learning_rate": 1.769317540891145e-05, + "loss": 0.669, + "step": 11457 + }, + { + "epoch": 6.462492949802595, + "grad_norm": 1.0316532850265503, + "learning_rate": 1.769035532994924e-05, + "loss": 0.8758, + "step": 11458 + }, + { + "epoch": 6.463056965595037, + "grad_norm": 1.2456450462341309, + "learning_rate": 1.7687535250987028e-05, + "loss": 0.6973, + "step": 11459 + }, + { + "epoch": 6.4636209813874785, + "grad_norm": 1.3240993022918701, + "learning_rate": 1.7684715172024817e-05, + "loss": 0.767, + "step": 11460 + }, + { + "epoch": 6.464184997179921, + "grad_norm": 0.9030250906944275, + "learning_rate": 1.7681895093062606e-05, + "loss": 0.646, + "step": 11461 + }, + { + "epoch": 6.464749012972363, + "grad_norm": 1.4449679851531982, + "learning_rate": 1.7679075014100398e-05, + "loss": 0.7128, + "step": 11462 + }, + { + "epoch": 6.465313028764806, + "grad_norm": 1.3029632568359375, + "learning_rate": 1.7676254935138183e-05, + "loss": 0.7303, + "step": 11463 + }, + { + "epoch": 6.4658770445572475, + "grad_norm": 1.325110912322998, + "learning_rate": 1.7673434856175976e-05, + "loss": 0.7298, + "step": 11464 + }, + { + "epoch": 6.466441060349689, + "grad_norm": 1.2547619342803955, + "learning_rate": 1.767061477721376e-05, + "loss": 0.7513, + "step": 11465 + }, + { + "epoch": 6.467005076142132, + "grad_norm": 1.1583629846572876, + "learning_rate": 1.7667794698251553e-05, + "loss": 0.7892, + "step": 11466 + }, + { + "epoch": 6.467569091934574, + "grad_norm": 1.1329773664474487, + "learning_rate": 1.766497461928934e-05, + "loss": 0.7455, + "step": 11467 + }, + { + "epoch": 6.4681331077270166, + "grad_norm": 0.991214394569397, + "learning_rate": 1.766215454032713e-05, + "loss": 0.6157, + "step": 11468 + }, + { + "epoch": 6.468697123519458, + "grad_norm": 1.2443844079971313, + "learning_rate": 1.765933446136492e-05, + "loss": 0.7566, + "step": 11469 + }, + { + "epoch": 6.469261139311901, + "grad_norm": 1.0939018726348877, + "learning_rate": 1.765651438240271e-05, + "loss": 0.6879, + "step": 11470 + }, + { + "epoch": 6.469825155104343, + "grad_norm": 1.2701890468597412, + "learning_rate": 1.7653694303440497e-05, + "loss": 0.6438, + "step": 11471 + }, + { + "epoch": 6.470389170896786, + "grad_norm": 1.1782169342041016, + "learning_rate": 1.7650874224478286e-05, + "loss": 0.7959, + "step": 11472 + }, + { + "epoch": 6.470953186689227, + "grad_norm": 1.108536958694458, + "learning_rate": 1.7648054145516075e-05, + "loss": 0.7611, + "step": 11473 + }, + { + "epoch": 6.471517202481669, + "grad_norm": 1.0291699171066284, + "learning_rate": 1.7645234066553864e-05, + "loss": 0.74, + "step": 11474 + }, + { + "epoch": 6.472081218274112, + "grad_norm": 1.0689152479171753, + "learning_rate": 1.7642413987591652e-05, + "loss": 0.7484, + "step": 11475 + }, + { + "epoch": 6.472645234066554, + "grad_norm": 0.9684939384460449, + "learning_rate": 1.763959390862944e-05, + "loss": 0.7026, + "step": 11476 + }, + { + "epoch": 6.473209249858996, + "grad_norm": 1.0777369737625122, + "learning_rate": 1.763677382966723e-05, + "loss": 0.749, + "step": 11477 + }, + { + "epoch": 6.473773265651438, + "grad_norm": 0.8832300901412964, + "learning_rate": 1.7633953750705022e-05, + "loss": 0.6894, + "step": 11478 + }, + { + "epoch": 6.47433728144388, + "grad_norm": 1.1120517253875732, + "learning_rate": 1.7631133671742808e-05, + "loss": 0.7064, + "step": 11479 + }, + { + "epoch": 6.474901297236323, + "grad_norm": 1.135316014289856, + "learning_rate": 1.76283135927806e-05, + "loss": 0.7911, + "step": 11480 + }, + { + "epoch": 6.4754653130287645, + "grad_norm": 1.206515908241272, + "learning_rate": 1.762549351381839e-05, + "loss": 0.6834, + "step": 11481 + }, + { + "epoch": 6.476029328821207, + "grad_norm": 1.0895355939865112, + "learning_rate": 1.7622673434856177e-05, + "loss": 0.7798, + "step": 11482 + }, + { + "epoch": 6.476593344613649, + "grad_norm": 1.4107924699783325, + "learning_rate": 1.7619853355893966e-05, + "loss": 0.7752, + "step": 11483 + }, + { + "epoch": 6.477157360406092, + "grad_norm": 1.0369755029678345, + "learning_rate": 1.7617033276931755e-05, + "loss": 0.7585, + "step": 11484 + }, + { + "epoch": 6.4777213761985335, + "grad_norm": 1.495094895362854, + "learning_rate": 1.7614213197969544e-05, + "loss": 0.8672, + "step": 11485 + }, + { + "epoch": 6.478285391990976, + "grad_norm": 0.919265866279602, + "learning_rate": 1.7611393119007333e-05, + "loss": 0.6783, + "step": 11486 + }, + { + "epoch": 6.478849407783418, + "grad_norm": 1.4688643217086792, + "learning_rate": 1.7608573040045125e-05, + "loss": 0.8479, + "step": 11487 + }, + { + "epoch": 6.47941342357586, + "grad_norm": 1.2359026670455933, + "learning_rate": 1.760575296108291e-05, + "loss": 0.8017, + "step": 11488 + }, + { + "epoch": 6.4799774393683025, + "grad_norm": 1.0550312995910645, + "learning_rate": 1.7602932882120702e-05, + "loss": 0.7314, + "step": 11489 + }, + { + "epoch": 6.480541455160744, + "grad_norm": 0.9707639217376709, + "learning_rate": 1.7600112803158488e-05, + "loss": 0.7924, + "step": 11490 + }, + { + "epoch": 6.481105470953187, + "grad_norm": 1.54131019115448, + "learning_rate": 1.759729272419628e-05, + "loss": 0.8628, + "step": 11491 + }, + { + "epoch": 6.481669486745629, + "grad_norm": 0.8267257809638977, + "learning_rate": 1.7594472645234065e-05, + "loss": 0.6998, + "step": 11492 + }, + { + "epoch": 6.482233502538071, + "grad_norm": 1.349870204925537, + "learning_rate": 1.7591652566271857e-05, + "loss": 0.7157, + "step": 11493 + }, + { + "epoch": 6.482797518330513, + "grad_norm": 0.9914360642433167, + "learning_rate": 1.7588832487309646e-05, + "loss": 0.7646, + "step": 11494 + }, + { + "epoch": 6.483361534122955, + "grad_norm": 1.1268258094787598, + "learning_rate": 1.7586012408347435e-05, + "loss": 0.7384, + "step": 11495 + }, + { + "epoch": 6.483925549915398, + "grad_norm": 1.0640848875045776, + "learning_rate": 1.7583192329385224e-05, + "loss": 0.7233, + "step": 11496 + }, + { + "epoch": 6.48448956570784, + "grad_norm": 0.9017088413238525, + "learning_rate": 1.7580372250423013e-05, + "loss": 0.748, + "step": 11497 + }, + { + "epoch": 6.485053581500282, + "grad_norm": 1.3863003253936768, + "learning_rate": 1.75775521714608e-05, + "loss": 0.8067, + "step": 11498 + }, + { + "epoch": 6.485617597292724, + "grad_norm": 1.220504641532898, + "learning_rate": 1.757473209249859e-05, + "loss": 0.8015, + "step": 11499 + }, + { + "epoch": 6.486181613085167, + "grad_norm": 1.3715598583221436, + "learning_rate": 1.757191201353638e-05, + "loss": 0.7564, + "step": 11500 + }, + { + "epoch": 6.486745628877609, + "grad_norm": 0.9935766458511353, + "learning_rate": 1.7569091934574168e-05, + "loss": 0.8117, + "step": 11501 + }, + { + "epoch": 6.4873096446700504, + "grad_norm": 1.1493778228759766, + "learning_rate": 1.7566271855611957e-05, + "loss": 0.8068, + "step": 11502 + }, + { + "epoch": 6.487873660462493, + "grad_norm": 1.124389410018921, + "learning_rate": 1.756345177664975e-05, + "loss": 0.7911, + "step": 11503 + }, + { + "epoch": 6.488437676254935, + "grad_norm": 1.2214114665985107, + "learning_rate": 1.7560631697687534e-05, + "loss": 0.6914, + "step": 11504 + }, + { + "epoch": 6.489001692047378, + "grad_norm": 1.4286749362945557, + "learning_rate": 1.7557811618725326e-05, + "loss": 0.783, + "step": 11505 + }, + { + "epoch": 6.4895657078398195, + "grad_norm": 0.8426365256309509, + "learning_rate": 1.7554991539763112e-05, + "loss": 0.6238, + "step": 11506 + }, + { + "epoch": 6.490129723632261, + "grad_norm": 1.0919499397277832, + "learning_rate": 1.7552171460800904e-05, + "loss": 0.7467, + "step": 11507 + }, + { + "epoch": 6.490693739424704, + "grad_norm": 1.233593463897705, + "learning_rate": 1.7549351381838693e-05, + "loss": 0.6695, + "step": 11508 + }, + { + "epoch": 6.491257755217146, + "grad_norm": 1.1864179372787476, + "learning_rate": 1.754653130287648e-05, + "loss": 0.7188, + "step": 11509 + }, + { + "epoch": 6.4918217710095885, + "grad_norm": 1.1970114707946777, + "learning_rate": 1.754371122391427e-05, + "loss": 0.7063, + "step": 11510 + }, + { + "epoch": 6.49238578680203, + "grad_norm": 0.9471473097801208, + "learning_rate": 1.754089114495206e-05, + "loss": 0.6865, + "step": 11511 + }, + { + "epoch": 6.492949802594473, + "grad_norm": 0.8049744963645935, + "learning_rate": 1.7538071065989848e-05, + "loss": 0.597, + "step": 11512 + }, + { + "epoch": 6.493513818386915, + "grad_norm": 1.1628514528274536, + "learning_rate": 1.7535250987027637e-05, + "loss": 0.7891, + "step": 11513 + }, + { + "epoch": 6.4940778341793575, + "grad_norm": 1.395434021949768, + "learning_rate": 1.7532430908065426e-05, + "loss": 0.7965, + "step": 11514 + }, + { + "epoch": 6.494641849971799, + "grad_norm": 1.017478346824646, + "learning_rate": 1.7529610829103214e-05, + "loss": 0.5791, + "step": 11515 + }, + { + "epoch": 6.495205865764241, + "grad_norm": 1.1613273620605469, + "learning_rate": 1.7526790750141007e-05, + "loss": 0.7366, + "step": 11516 + }, + { + "epoch": 6.495769881556684, + "grad_norm": 1.6525789499282837, + "learning_rate": 1.7523970671178795e-05, + "loss": 0.8739, + "step": 11517 + }, + { + "epoch": 6.496333897349126, + "grad_norm": 1.0197629928588867, + "learning_rate": 1.7521150592216584e-05, + "loss": 0.7855, + "step": 11518 + }, + { + "epoch": 6.496897913141568, + "grad_norm": 1.1722575426101685, + "learning_rate": 1.7518330513254373e-05, + "loss": 0.8055, + "step": 11519 + }, + { + "epoch": 6.49746192893401, + "grad_norm": 1.3377127647399902, + "learning_rate": 1.7515510434292162e-05, + "loss": 0.768, + "step": 11520 + }, + { + "epoch": 6.498025944726452, + "grad_norm": 1.2867631912231445, + "learning_rate": 1.751269035532995e-05, + "loss": 0.7756, + "step": 11521 + }, + { + "epoch": 6.498589960518895, + "grad_norm": 1.561493158340454, + "learning_rate": 1.750987027636774e-05, + "loss": 0.7832, + "step": 11522 + }, + { + "epoch": 6.499153976311336, + "grad_norm": 1.5129910707473755, + "learning_rate": 1.7507050197405528e-05, + "loss": 0.7672, + "step": 11523 + }, + { + "epoch": 6.499717992103779, + "grad_norm": 0.8582625985145569, + "learning_rate": 1.7504230118443317e-05, + "loss": 0.5632, + "step": 11524 + }, + { + "epoch": 6.500282007896221, + "grad_norm": 0.9366123676300049, + "learning_rate": 1.7501410039481106e-05, + "loss": 0.742, + "step": 11525 + }, + { + "epoch": 6.500846023688664, + "grad_norm": 0.9288321733474731, + "learning_rate": 1.7498589960518898e-05, + "loss": 0.6903, + "step": 11526 + }, + { + "epoch": 6.501410039481105, + "grad_norm": 1.5895929336547852, + "learning_rate": 1.7495769881556683e-05, + "loss": 0.7173, + "step": 11527 + }, + { + "epoch": 6.501974055273548, + "grad_norm": 1.4706381559371948, + "learning_rate": 1.7492949802594476e-05, + "loss": 0.8252, + "step": 11528 + }, + { + "epoch": 6.50253807106599, + "grad_norm": 1.1135883331298828, + "learning_rate": 1.749012972363226e-05, + "loss": 0.7269, + "step": 11529 + }, + { + "epoch": 6.503102086858432, + "grad_norm": 1.1505627632141113, + "learning_rate": 1.7487309644670053e-05, + "loss": 0.6341, + "step": 11530 + }, + { + "epoch": 6.503666102650874, + "grad_norm": 0.9436073303222656, + "learning_rate": 1.748448956570784e-05, + "loss": 0.6787, + "step": 11531 + }, + { + "epoch": 6.504230118443316, + "grad_norm": 1.2411547899246216, + "learning_rate": 1.748166948674563e-05, + "loss": 0.6948, + "step": 11532 + }, + { + "epoch": 6.504794134235759, + "grad_norm": 0.8574250340461731, + "learning_rate": 1.747884940778342e-05, + "loss": 0.7103, + "step": 11533 + }, + { + "epoch": 6.505358150028201, + "grad_norm": 1.3905673027038574, + "learning_rate": 1.747602932882121e-05, + "loss": 0.7067, + "step": 11534 + }, + { + "epoch": 6.5059221658206425, + "grad_norm": 1.4829760789871216, + "learning_rate": 1.7473209249858997e-05, + "loss": 0.7904, + "step": 11535 + }, + { + "epoch": 6.506486181613085, + "grad_norm": 0.8914266228675842, + "learning_rate": 1.7470389170896786e-05, + "loss": 0.6517, + "step": 11536 + }, + { + "epoch": 6.507050197405527, + "grad_norm": 1.625553846359253, + "learning_rate": 1.7467569091934575e-05, + "loss": 0.8375, + "step": 11537 + }, + { + "epoch": 6.50761421319797, + "grad_norm": 1.3778289556503296, + "learning_rate": 1.7464749012972364e-05, + "loss": 0.7863, + "step": 11538 + }, + { + "epoch": 6.5081782289904115, + "grad_norm": 1.4964722394943237, + "learning_rate": 1.7461928934010152e-05, + "loss": 0.8064, + "step": 11539 + }, + { + "epoch": 6.508742244782854, + "grad_norm": 1.4010001420974731, + "learning_rate": 1.745910885504794e-05, + "loss": 0.7302, + "step": 11540 + }, + { + "epoch": 6.509306260575296, + "grad_norm": 0.8882005214691162, + "learning_rate": 1.745628877608573e-05, + "loss": 0.6677, + "step": 11541 + }, + { + "epoch": 6.509870276367739, + "grad_norm": 1.1970652341842651, + "learning_rate": 1.7453468697123522e-05, + "loss": 0.7306, + "step": 11542 + }, + { + "epoch": 6.5104342921601805, + "grad_norm": 1.0866690874099731, + "learning_rate": 1.7450648618161307e-05, + "loss": 0.7388, + "step": 11543 + }, + { + "epoch": 6.510998307952622, + "grad_norm": 1.0401638746261597, + "learning_rate": 1.74478285391991e-05, + "loss": 0.7215, + "step": 11544 + }, + { + "epoch": 6.511562323745065, + "grad_norm": 0.8368956446647644, + "learning_rate": 1.7445008460236885e-05, + "loss": 0.6221, + "step": 11545 + }, + { + "epoch": 6.512126339537507, + "grad_norm": 1.043847680091858, + "learning_rate": 1.7442188381274677e-05, + "loss": 0.789, + "step": 11546 + }, + { + "epoch": 6.5126903553299496, + "grad_norm": 1.6238515377044678, + "learning_rate": 1.7439368302312463e-05, + "loss": 0.8522, + "step": 11547 + }, + { + "epoch": 6.513254371122391, + "grad_norm": 0.9051365852355957, + "learning_rate": 1.7436548223350255e-05, + "loss": 0.6155, + "step": 11548 + }, + { + "epoch": 6.513818386914833, + "grad_norm": 0.937477171421051, + "learning_rate": 1.7433728144388044e-05, + "loss": 0.701, + "step": 11549 + }, + { + "epoch": 6.514382402707276, + "grad_norm": 1.2313604354858398, + "learning_rate": 1.7430908065425832e-05, + "loss": 0.8018, + "step": 11550 + }, + { + "epoch": 6.514946418499718, + "grad_norm": 1.0578038692474365, + "learning_rate": 1.7428087986463625e-05, + "loss": 0.6974, + "step": 11551 + }, + { + "epoch": 6.51551043429216, + "grad_norm": 1.069061040878296, + "learning_rate": 1.742526790750141e-05, + "loss": 0.7022, + "step": 11552 + }, + { + "epoch": 6.516074450084602, + "grad_norm": 1.1570450067520142, + "learning_rate": 1.7422447828539202e-05, + "loss": 0.8268, + "step": 11553 + }, + { + "epoch": 6.516638465877045, + "grad_norm": 1.3440089225769043, + "learning_rate": 1.7419627749576988e-05, + "loss": 0.7716, + "step": 11554 + }, + { + "epoch": 6.517202481669487, + "grad_norm": 1.504372000694275, + "learning_rate": 1.741680767061478e-05, + "loss": 0.7408, + "step": 11555 + }, + { + "epoch": 6.517766497461929, + "grad_norm": 1.535266399383545, + "learning_rate": 1.7413987591652565e-05, + "loss": 0.7617, + "step": 11556 + }, + { + "epoch": 6.518330513254371, + "grad_norm": 1.3249591588974, + "learning_rate": 1.7411167512690357e-05, + "loss": 0.7059, + "step": 11557 + }, + { + "epoch": 6.518894529046813, + "grad_norm": 1.2775229215621948, + "learning_rate": 1.7408347433728146e-05, + "loss": 0.7418, + "step": 11558 + }, + { + "epoch": 6.519458544839256, + "grad_norm": 1.1850438117980957, + "learning_rate": 1.7405527354765935e-05, + "loss": 0.6865, + "step": 11559 + }, + { + "epoch": 6.5200225606316975, + "grad_norm": 1.3331961631774902, + "learning_rate": 1.7402707275803724e-05, + "loss": 0.746, + "step": 11560 + }, + { + "epoch": 6.52058657642414, + "grad_norm": 0.9444637298583984, + "learning_rate": 1.7399887196841513e-05, + "loss": 0.6685, + "step": 11561 + }, + { + "epoch": 6.521150592216582, + "grad_norm": 1.0698107481002808, + "learning_rate": 1.73970671178793e-05, + "loss": 0.689, + "step": 11562 + }, + { + "epoch": 6.521714608009024, + "grad_norm": 1.5880438089370728, + "learning_rate": 1.739424703891709e-05, + "loss": 0.7286, + "step": 11563 + }, + { + "epoch": 6.5222786238014665, + "grad_norm": 1.0811798572540283, + "learning_rate": 1.739142695995488e-05, + "loss": 0.7398, + "step": 11564 + }, + { + "epoch": 6.522842639593908, + "grad_norm": 1.4481699466705322, + "learning_rate": 1.7388606880992668e-05, + "loss": 0.8177, + "step": 11565 + }, + { + "epoch": 6.523406655386351, + "grad_norm": 1.381555438041687, + "learning_rate": 1.7385786802030457e-05, + "loss": 0.7176, + "step": 11566 + }, + { + "epoch": 6.523970671178793, + "grad_norm": 1.2158184051513672, + "learning_rate": 1.738296672306825e-05, + "loss": 0.7529, + "step": 11567 + }, + { + "epoch": 6.5245346869712355, + "grad_norm": 1.5214954614639282, + "learning_rate": 1.7380146644106034e-05, + "loss": 0.7263, + "step": 11568 + }, + { + "epoch": 6.525098702763677, + "grad_norm": 1.2651245594024658, + "learning_rate": 1.7377326565143826e-05, + "loss": 0.8128, + "step": 11569 + }, + { + "epoch": 6.52566271855612, + "grad_norm": 1.711988091468811, + "learning_rate": 1.7374506486181612e-05, + "loss": 0.828, + "step": 11570 + }, + { + "epoch": 6.526226734348562, + "grad_norm": 1.1640385389328003, + "learning_rate": 1.7371686407219404e-05, + "loss": 0.7604, + "step": 11571 + }, + { + "epoch": 6.526790750141004, + "grad_norm": 1.2591999769210815, + "learning_rate": 1.7368866328257193e-05, + "loss": 0.6893, + "step": 11572 + }, + { + "epoch": 6.527354765933446, + "grad_norm": 1.1993858814239502, + "learning_rate": 1.736604624929498e-05, + "loss": 0.649, + "step": 11573 + }, + { + "epoch": 6.527918781725888, + "grad_norm": 0.9206973910331726, + "learning_rate": 1.736322617033277e-05, + "loss": 0.6965, + "step": 11574 + }, + { + "epoch": 6.528482797518331, + "grad_norm": 0.8872495293617249, + "learning_rate": 1.736040609137056e-05, + "loss": 0.6644, + "step": 11575 + }, + { + "epoch": 6.529046813310773, + "grad_norm": 1.1804507970809937, + "learning_rate": 1.7357586012408348e-05, + "loss": 0.8499, + "step": 11576 + }, + { + "epoch": 6.529610829103214, + "grad_norm": 0.8118526935577393, + "learning_rate": 1.7354765933446137e-05, + "loss": 0.671, + "step": 11577 + }, + { + "epoch": 6.530174844895657, + "grad_norm": 1.263682246208191, + "learning_rate": 1.7351945854483926e-05, + "loss": 0.7037, + "step": 11578 + }, + { + "epoch": 6.530738860688099, + "grad_norm": 1.0580018758773804, + "learning_rate": 1.7349125775521714e-05, + "loss": 0.6775, + "step": 11579 + }, + { + "epoch": 6.531302876480542, + "grad_norm": 0.9817524552345276, + "learning_rate": 1.7346305696559503e-05, + "loss": 0.7685, + "step": 11580 + }, + { + "epoch": 6.5318668922729834, + "grad_norm": 1.2383676767349243, + "learning_rate": 1.7343485617597295e-05, + "loss": 0.6396, + "step": 11581 + }, + { + "epoch": 6.532430908065426, + "grad_norm": 1.3001093864440918, + "learning_rate": 1.734066553863508e-05, + "loss": 0.8433, + "step": 11582 + }, + { + "epoch": 6.532994923857868, + "grad_norm": 1.110290765762329, + "learning_rate": 1.7337845459672873e-05, + "loss": 0.7728, + "step": 11583 + }, + { + "epoch": 6.533558939650311, + "grad_norm": 1.2568262815475464, + "learning_rate": 1.733502538071066e-05, + "loss": 0.7734, + "step": 11584 + }, + { + "epoch": 6.5341229554427525, + "grad_norm": 1.2624244689941406, + "learning_rate": 1.733220530174845e-05, + "loss": 0.6878, + "step": 11585 + }, + { + "epoch": 6.534686971235194, + "grad_norm": 1.3698720932006836, + "learning_rate": 1.732938522278624e-05, + "loss": 0.7352, + "step": 11586 + }, + { + "epoch": 6.535250987027637, + "grad_norm": 0.8726199269294739, + "learning_rate": 1.7326565143824028e-05, + "loss": 0.7641, + "step": 11587 + }, + { + "epoch": 6.535815002820079, + "grad_norm": 1.4980822801589966, + "learning_rate": 1.7323745064861817e-05, + "loss": 0.7026, + "step": 11588 + }, + { + "epoch": 6.5363790186125215, + "grad_norm": 1.2283929586410522, + "learning_rate": 1.7320924985899606e-05, + "loss": 0.7503, + "step": 11589 + }, + { + "epoch": 6.536943034404963, + "grad_norm": 0.9813231825828552, + "learning_rate": 1.7318104906937398e-05, + "loss": 0.6799, + "step": 11590 + }, + { + "epoch": 6.537507050197405, + "grad_norm": 0.8859469890594482, + "learning_rate": 1.7315284827975183e-05, + "loss": 0.6823, + "step": 11591 + }, + { + "epoch": 6.538071065989848, + "grad_norm": 1.1540213823318481, + "learning_rate": 1.7312464749012975e-05, + "loss": 0.8117, + "step": 11592 + }, + { + "epoch": 6.53863508178229, + "grad_norm": 0.7832059264183044, + "learning_rate": 1.730964467005076e-05, + "loss": 0.5923, + "step": 11593 + }, + { + "epoch": 6.539199097574732, + "grad_norm": 1.0143311023712158, + "learning_rate": 1.7306824591088553e-05, + "loss": 0.6659, + "step": 11594 + }, + { + "epoch": 6.539763113367174, + "grad_norm": 1.1651113033294678, + "learning_rate": 1.730400451212634e-05, + "loss": 0.7464, + "step": 11595 + }, + { + "epoch": 6.540327129159617, + "grad_norm": 1.1708085536956787, + "learning_rate": 1.730118443316413e-05, + "loss": 0.7797, + "step": 11596 + }, + { + "epoch": 6.540891144952059, + "grad_norm": 1.1141756772994995, + "learning_rate": 1.729836435420192e-05, + "loss": 0.8947, + "step": 11597 + }, + { + "epoch": 6.541455160744501, + "grad_norm": 1.0938901901245117, + "learning_rate": 1.7295544275239708e-05, + "loss": 0.6619, + "step": 11598 + }, + { + "epoch": 6.542019176536943, + "grad_norm": 1.0672767162322998, + "learning_rate": 1.7292724196277497e-05, + "loss": 0.6551, + "step": 11599 + }, + { + "epoch": 6.542583192329385, + "grad_norm": 1.0410706996917725, + "learning_rate": 1.7289904117315286e-05, + "loss": 0.7235, + "step": 11600 + }, + { + "epoch": 6.543147208121828, + "grad_norm": 1.2556904554367065, + "learning_rate": 1.7287084038353075e-05, + "loss": 0.7446, + "step": 11601 + }, + { + "epoch": 6.543711223914269, + "grad_norm": 1.2274316549301147, + "learning_rate": 1.7284263959390863e-05, + "loss": 0.783, + "step": 11602 + }, + { + "epoch": 6.544275239706712, + "grad_norm": 1.0591776371002197, + "learning_rate": 1.7281443880428652e-05, + "loss": 0.6508, + "step": 11603 + }, + { + "epoch": 6.544839255499154, + "grad_norm": 0.827948272228241, + "learning_rate": 1.727862380146644e-05, + "loss": 0.6858, + "step": 11604 + }, + { + "epoch": 6.545403271291596, + "grad_norm": 0.9580187797546387, + "learning_rate": 1.727580372250423e-05, + "loss": 0.6734, + "step": 11605 + }, + { + "epoch": 6.545967287084038, + "grad_norm": 0.8355687260627747, + "learning_rate": 1.7272983643542022e-05, + "loss": 0.6467, + "step": 11606 + }, + { + "epoch": 6.54653130287648, + "grad_norm": 1.2671631574630737, + "learning_rate": 1.7270163564579807e-05, + "loss": 0.6489, + "step": 11607 + }, + { + "epoch": 6.547095318668923, + "grad_norm": 1.0881315469741821, + "learning_rate": 1.72673434856176e-05, + "loss": 0.7894, + "step": 11608 + }, + { + "epoch": 6.547659334461365, + "grad_norm": 1.3419172763824463, + "learning_rate": 1.7264523406655385e-05, + "loss": 0.7733, + "step": 11609 + }, + { + "epoch": 6.548223350253807, + "grad_norm": 0.9963918924331665, + "learning_rate": 1.7261703327693177e-05, + "loss": 0.6812, + "step": 11610 + }, + { + "epoch": 6.548787366046249, + "grad_norm": 0.8369222283363342, + "learning_rate": 1.7258883248730966e-05, + "loss": 0.7303, + "step": 11611 + }, + { + "epoch": 6.549351381838692, + "grad_norm": 1.0209901332855225, + "learning_rate": 1.7256063169768755e-05, + "loss": 0.5791, + "step": 11612 + }, + { + "epoch": 6.549915397631134, + "grad_norm": 1.4389116764068604, + "learning_rate": 1.7253243090806544e-05, + "loss": 0.7292, + "step": 11613 + }, + { + "epoch": 6.5504794134235755, + "grad_norm": 1.1160757541656494, + "learning_rate": 1.7250423011844332e-05, + "loss": 0.6415, + "step": 11614 + }, + { + "epoch": 6.551043429216018, + "grad_norm": 1.0900737047195435, + "learning_rate": 1.724760293288212e-05, + "loss": 0.7837, + "step": 11615 + }, + { + "epoch": 6.55160744500846, + "grad_norm": 1.4528828859329224, + "learning_rate": 1.724478285391991e-05, + "loss": 0.7688, + "step": 11616 + }, + { + "epoch": 6.552171460800903, + "grad_norm": 0.8627212643623352, + "learning_rate": 1.72419627749577e-05, + "loss": 0.7331, + "step": 11617 + }, + { + "epoch": 6.5527354765933445, + "grad_norm": 1.0989011526107788, + "learning_rate": 1.7239142695995488e-05, + "loss": 0.8178, + "step": 11618 + }, + { + "epoch": 6.553299492385786, + "grad_norm": 0.9520096778869629, + "learning_rate": 1.7236322617033276e-05, + "loss": 0.6728, + "step": 11619 + }, + { + "epoch": 6.553863508178229, + "grad_norm": 1.1782225370407104, + "learning_rate": 1.723350253807107e-05, + "loss": 0.8036, + "step": 11620 + }, + { + "epoch": 6.554427523970671, + "grad_norm": 0.753696620464325, + "learning_rate": 1.7230682459108857e-05, + "loss": 0.6294, + "step": 11621 + }, + { + "epoch": 6.5549915397631136, + "grad_norm": 1.1443084478378296, + "learning_rate": 1.7227862380146646e-05, + "loss": 0.6916, + "step": 11622 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 1.1295092105865479, + "learning_rate": 1.7225042301184435e-05, + "loss": 0.82, + "step": 11623 + }, + { + "epoch": 6.556119571347998, + "grad_norm": 1.0928659439086914, + "learning_rate": 1.7222222222222224e-05, + "loss": 0.7191, + "step": 11624 + }, + { + "epoch": 6.55668358714044, + "grad_norm": 1.179091453552246, + "learning_rate": 1.7219402143260013e-05, + "loss": 0.7147, + "step": 11625 + }, + { + "epoch": 6.557247602932883, + "grad_norm": 1.3595993518829346, + "learning_rate": 1.72165820642978e-05, + "loss": 0.7065, + "step": 11626 + }, + { + "epoch": 6.557811618725324, + "grad_norm": 0.9795125126838684, + "learning_rate": 1.721376198533559e-05, + "loss": 0.736, + "step": 11627 + }, + { + "epoch": 6.558375634517766, + "grad_norm": 1.0247173309326172, + "learning_rate": 1.721094190637338e-05, + "loss": 0.6003, + "step": 11628 + }, + { + "epoch": 6.558939650310209, + "grad_norm": 1.0649930238723755, + "learning_rate": 1.720812182741117e-05, + "loss": 0.7188, + "step": 11629 + }, + { + "epoch": 6.559503666102651, + "grad_norm": 1.310260534286499, + "learning_rate": 1.7205301748448957e-05, + "loss": 0.7243, + "step": 11630 + }, + { + "epoch": 6.560067681895093, + "grad_norm": 1.1834155321121216, + "learning_rate": 1.720248166948675e-05, + "loss": 0.8143, + "step": 11631 + }, + { + "epoch": 6.560631697687535, + "grad_norm": 1.0536963939666748, + "learning_rate": 1.7199661590524534e-05, + "loss": 0.7122, + "step": 11632 + }, + { + "epoch": 6.561195713479977, + "grad_norm": 1.160517692565918, + "learning_rate": 1.7196841511562326e-05, + "loss": 0.7902, + "step": 11633 + }, + { + "epoch": 6.56175972927242, + "grad_norm": 1.2925288677215576, + "learning_rate": 1.719402143260011e-05, + "loss": 0.717, + "step": 11634 + }, + { + "epoch": 6.5623237450648615, + "grad_norm": 1.4985883235931396, + "learning_rate": 1.7191201353637904e-05, + "loss": 0.794, + "step": 11635 + }, + { + "epoch": 6.562887760857304, + "grad_norm": 1.456594467163086, + "learning_rate": 1.7188381274675693e-05, + "loss": 0.7563, + "step": 11636 + }, + { + "epoch": 6.563451776649746, + "grad_norm": 1.3559149503707886, + "learning_rate": 1.718556119571348e-05, + "loss": 0.7589, + "step": 11637 + }, + { + "epoch": 6.564015792442189, + "grad_norm": 1.0912669897079468, + "learning_rate": 1.718274111675127e-05, + "loss": 0.7754, + "step": 11638 + }, + { + "epoch": 6.5645798082346305, + "grad_norm": 1.0245534181594849, + "learning_rate": 1.717992103778906e-05, + "loss": 0.6755, + "step": 11639 + }, + { + "epoch": 6.565143824027073, + "grad_norm": 1.0343244075775146, + "learning_rate": 1.7177100958826848e-05, + "loss": 0.686, + "step": 11640 + }, + { + "epoch": 6.565707839819515, + "grad_norm": 1.4009655714035034, + "learning_rate": 1.7174280879864637e-05, + "loss": 0.7073, + "step": 11641 + }, + { + "epoch": 6.566271855611957, + "grad_norm": 1.4670555591583252, + "learning_rate": 1.7171460800902425e-05, + "loss": 0.7271, + "step": 11642 + }, + { + "epoch": 6.5668358714043995, + "grad_norm": 1.0240229368209839, + "learning_rate": 1.7168640721940214e-05, + "loss": 0.6685, + "step": 11643 + }, + { + "epoch": 6.567399887196841, + "grad_norm": 1.1932915449142456, + "learning_rate": 1.7165820642978003e-05, + "loss": 0.7125, + "step": 11644 + }, + { + "epoch": 6.567963902989284, + "grad_norm": 2.462494373321533, + "learning_rate": 1.7163000564015795e-05, + "loss": 0.7406, + "step": 11645 + }, + { + "epoch": 6.568527918781726, + "grad_norm": 0.9879742860794067, + "learning_rate": 1.716018048505358e-05, + "loss": 0.795, + "step": 11646 + }, + { + "epoch": 6.569091934574168, + "grad_norm": 0.9536924958229065, + "learning_rate": 1.7157360406091373e-05, + "loss": 0.7279, + "step": 11647 + }, + { + "epoch": 6.56965595036661, + "grad_norm": 1.17770254611969, + "learning_rate": 1.7154540327129158e-05, + "loss": 0.7329, + "step": 11648 + }, + { + "epoch": 6.570219966159052, + "grad_norm": 1.2704380750656128, + "learning_rate": 1.715172024816695e-05, + "loss": 0.709, + "step": 11649 + }, + { + "epoch": 6.570783981951495, + "grad_norm": 1.051457166671753, + "learning_rate": 1.7148900169204736e-05, + "loss": 0.5924, + "step": 11650 + }, + { + "epoch": 6.571347997743937, + "grad_norm": 1.2682862281799316, + "learning_rate": 1.7146080090242528e-05, + "loss": 0.8177, + "step": 11651 + }, + { + "epoch": 6.571912013536379, + "grad_norm": 1.2408359050750732, + "learning_rate": 1.7143260011280317e-05, + "loss": 0.8072, + "step": 11652 + }, + { + "epoch": 6.572476029328821, + "grad_norm": 1.4679805040359497, + "learning_rate": 1.7140439932318106e-05, + "loss": 0.8091, + "step": 11653 + }, + { + "epoch": 6.573040045121264, + "grad_norm": 1.2788735628128052, + "learning_rate": 1.7137619853355894e-05, + "loss": 0.7534, + "step": 11654 + }, + { + "epoch": 6.573604060913706, + "grad_norm": 1.6482511758804321, + "learning_rate": 1.7134799774393683e-05, + "loss": 0.8748, + "step": 11655 + }, + { + "epoch": 6.5741680767061474, + "grad_norm": 0.9388530850410461, + "learning_rate": 1.7131979695431472e-05, + "loss": 0.7664, + "step": 11656 + }, + { + "epoch": 6.57473209249859, + "grad_norm": 0.8829855918884277, + "learning_rate": 1.712915961646926e-05, + "loss": 0.6359, + "step": 11657 + }, + { + "epoch": 6.575296108291032, + "grad_norm": 1.2722688913345337, + "learning_rate": 1.7126339537507053e-05, + "loss": 0.7979, + "step": 11658 + }, + { + "epoch": 6.575860124083475, + "grad_norm": 1.370389699935913, + "learning_rate": 1.712351945854484e-05, + "loss": 0.6887, + "step": 11659 + }, + { + "epoch": 6.5764241398759165, + "grad_norm": 1.330715298652649, + "learning_rate": 1.712069937958263e-05, + "loss": 0.8385, + "step": 11660 + }, + { + "epoch": 6.576988155668358, + "grad_norm": 1.4384691715240479, + "learning_rate": 1.711787930062042e-05, + "loss": 0.7471, + "step": 11661 + }, + { + "epoch": 6.577552171460801, + "grad_norm": 1.4664489030838013, + "learning_rate": 1.7115059221658208e-05, + "loss": 0.7341, + "step": 11662 + }, + { + "epoch": 6.578116187253243, + "grad_norm": 1.4287923574447632, + "learning_rate": 1.7112239142695997e-05, + "loss": 0.7594, + "step": 11663 + }, + { + "epoch": 6.5786802030456855, + "grad_norm": 1.3987939357757568, + "learning_rate": 1.7109419063733786e-05, + "loss": 0.8477, + "step": 11664 + }, + { + "epoch": 6.579244218838127, + "grad_norm": 1.436587929725647, + "learning_rate": 1.7106598984771575e-05, + "loss": 0.7378, + "step": 11665 + }, + { + "epoch": 6.57980823463057, + "grad_norm": 0.9340691566467285, + "learning_rate": 1.7103778905809363e-05, + "loss": 0.7548, + "step": 11666 + }, + { + "epoch": 6.580372250423012, + "grad_norm": 0.9764484167098999, + "learning_rate": 1.7100958826847152e-05, + "loss": 0.7855, + "step": 11667 + }, + { + "epoch": 6.5809362662154545, + "grad_norm": 1.3298221826553345, + "learning_rate": 1.709813874788494e-05, + "loss": 0.8054, + "step": 11668 + }, + { + "epoch": 6.581500282007896, + "grad_norm": 1.1424517631530762, + "learning_rate": 1.709531866892273e-05, + "loss": 0.7733, + "step": 11669 + }, + { + "epoch": 6.582064297800338, + "grad_norm": 1.4138766527175903, + "learning_rate": 1.7092498589960522e-05, + "loss": 0.8135, + "step": 11670 + }, + { + "epoch": 6.582628313592781, + "grad_norm": 1.145842432975769, + "learning_rate": 1.7089678510998307e-05, + "loss": 0.8029, + "step": 11671 + }, + { + "epoch": 6.583192329385223, + "grad_norm": 1.2331180572509766, + "learning_rate": 1.70868584320361e-05, + "loss": 0.7662, + "step": 11672 + }, + { + "epoch": 6.583756345177665, + "grad_norm": 0.8883332014083862, + "learning_rate": 1.7084038353073885e-05, + "loss": 0.6293, + "step": 11673 + }, + { + "epoch": 6.584320360970107, + "grad_norm": 1.2208579778671265, + "learning_rate": 1.7081218274111677e-05, + "loss": 0.7171, + "step": 11674 + }, + { + "epoch": 6.584884376762549, + "grad_norm": 0.9427213072776794, + "learning_rate": 1.7078398195149466e-05, + "loss": 0.5575, + "step": 11675 + }, + { + "epoch": 6.585448392554992, + "grad_norm": 1.7716354131698608, + "learning_rate": 1.7075578116187255e-05, + "loss": 0.8441, + "step": 11676 + }, + { + "epoch": 6.586012408347433, + "grad_norm": 1.0764460563659668, + "learning_rate": 1.7072758037225043e-05, + "loss": 0.6643, + "step": 11677 + }, + { + "epoch": 6.586576424139876, + "grad_norm": 0.9627671241760254, + "learning_rate": 1.7069937958262832e-05, + "loss": 0.8358, + "step": 11678 + }, + { + "epoch": 6.587140439932318, + "grad_norm": 1.0076364278793335, + "learning_rate": 1.706711787930062e-05, + "loss": 0.6771, + "step": 11679 + }, + { + "epoch": 6.587704455724761, + "grad_norm": 1.004461646080017, + "learning_rate": 1.706429780033841e-05, + "loss": 0.8098, + "step": 11680 + }, + { + "epoch": 6.588268471517202, + "grad_norm": 0.993886411190033, + "learning_rate": 1.70614777213762e-05, + "loss": 0.7179, + "step": 11681 + }, + { + "epoch": 6.588832487309645, + "grad_norm": 1.4133501052856445, + "learning_rate": 1.7058657642413987e-05, + "loss": 0.6619, + "step": 11682 + }, + { + "epoch": 6.589396503102087, + "grad_norm": 1.0660514831542969, + "learning_rate": 1.7055837563451776e-05, + "loss": 0.6738, + "step": 11683 + }, + { + "epoch": 6.589960518894529, + "grad_norm": 0.95234215259552, + "learning_rate": 1.705301748448957e-05, + "loss": 0.7517, + "step": 11684 + }, + { + "epoch": 6.590524534686971, + "grad_norm": 1.4407355785369873, + "learning_rate": 1.7050197405527354e-05, + "loss": 0.7369, + "step": 11685 + }, + { + "epoch": 6.591088550479413, + "grad_norm": 0.9827592372894287, + "learning_rate": 1.7047377326565146e-05, + "loss": 0.678, + "step": 11686 + }, + { + "epoch": 6.591652566271856, + "grad_norm": 1.238520860671997, + "learning_rate": 1.704455724760293e-05, + "loss": 0.627, + "step": 11687 + }, + { + "epoch": 6.592216582064298, + "grad_norm": 1.5363913774490356, + "learning_rate": 1.7041737168640724e-05, + "loss": 0.8292, + "step": 11688 + }, + { + "epoch": 6.5927805978567395, + "grad_norm": 1.2258418798446655, + "learning_rate": 1.703891708967851e-05, + "loss": 0.779, + "step": 11689 + }, + { + "epoch": 6.593344613649182, + "grad_norm": 1.4602538347244263, + "learning_rate": 1.70360970107163e-05, + "loss": 0.9304, + "step": 11690 + }, + { + "epoch": 6.593908629441624, + "grad_norm": 1.3408774137496948, + "learning_rate": 1.703327693175409e-05, + "loss": 0.7156, + "step": 11691 + }, + { + "epoch": 6.594472645234067, + "grad_norm": 1.4345512390136719, + "learning_rate": 1.703045685279188e-05, + "loss": 0.8156, + "step": 11692 + }, + { + "epoch": 6.5950366610265085, + "grad_norm": 1.240736484527588, + "learning_rate": 1.702763677382967e-05, + "loss": 0.8466, + "step": 11693 + }, + { + "epoch": 6.595600676818951, + "grad_norm": 0.9983288049697876, + "learning_rate": 1.7024816694867456e-05, + "loss": 0.7692, + "step": 11694 + }, + { + "epoch": 6.596164692611393, + "grad_norm": 0.8928210139274597, + "learning_rate": 1.702199661590525e-05, + "loss": 0.7024, + "step": 11695 + }, + { + "epoch": 6.596728708403836, + "grad_norm": 0.9558966159820557, + "learning_rate": 1.7019176536943034e-05, + "loss": 0.6774, + "step": 11696 + }, + { + "epoch": 6.5972927241962775, + "grad_norm": 1.5935105085372925, + "learning_rate": 1.7016356457980826e-05, + "loss": 0.7586, + "step": 11697 + }, + { + "epoch": 6.597856739988719, + "grad_norm": 1.3858423233032227, + "learning_rate": 1.701353637901861e-05, + "loss": 0.7518, + "step": 11698 + }, + { + "epoch": 6.598420755781162, + "grad_norm": 1.052266001701355, + "learning_rate": 1.7010716300056404e-05, + "loss": 0.8075, + "step": 11699 + }, + { + "epoch": 6.598984771573604, + "grad_norm": 1.12832772731781, + "learning_rate": 1.7007896221094193e-05, + "loss": 0.7405, + "step": 11700 + }, + { + "epoch": 6.5995487873660466, + "grad_norm": 0.9984036087989807, + "learning_rate": 1.700507614213198e-05, + "loss": 0.6587, + "step": 11701 + }, + { + "epoch": 6.600112803158488, + "grad_norm": 1.1443434953689575, + "learning_rate": 1.700225606316977e-05, + "loss": 0.855, + "step": 11702 + }, + { + "epoch": 6.60067681895093, + "grad_norm": 0.96019446849823, + "learning_rate": 1.699943598420756e-05, + "loss": 0.6885, + "step": 11703 + }, + { + "epoch": 6.601240834743373, + "grad_norm": 1.2184854745864868, + "learning_rate": 1.6996615905245348e-05, + "loss": 0.7221, + "step": 11704 + }, + { + "epoch": 6.601804850535815, + "grad_norm": 1.514756441116333, + "learning_rate": 1.6993795826283137e-05, + "loss": 0.7683, + "step": 11705 + }, + { + "epoch": 6.602368866328257, + "grad_norm": 1.3468201160430908, + "learning_rate": 1.6990975747320925e-05, + "loss": 0.7408, + "step": 11706 + }, + { + "epoch": 6.602932882120699, + "grad_norm": 1.206035852432251, + "learning_rate": 1.6988155668358714e-05, + "loss": 0.6928, + "step": 11707 + }, + { + "epoch": 6.603496897913142, + "grad_norm": 1.1852117776870728, + "learning_rate": 1.6985335589396503e-05, + "loss": 0.6884, + "step": 11708 + }, + { + "epoch": 6.604060913705584, + "grad_norm": 0.8246819972991943, + "learning_rate": 1.6982515510434295e-05, + "loss": 0.5777, + "step": 11709 + }, + { + "epoch": 6.604624929498026, + "grad_norm": 0.862443745136261, + "learning_rate": 1.697969543147208e-05, + "loss": 0.748, + "step": 11710 + }, + { + "epoch": 6.605188945290468, + "grad_norm": 1.226538896560669, + "learning_rate": 1.6976875352509873e-05, + "loss": 0.803, + "step": 11711 + }, + { + "epoch": 6.60575296108291, + "grad_norm": 1.1645673513412476, + "learning_rate": 1.6974055273547658e-05, + "loss": 0.7126, + "step": 11712 + }, + { + "epoch": 6.606316976875353, + "grad_norm": 1.422776460647583, + "learning_rate": 1.697123519458545e-05, + "loss": 0.7977, + "step": 11713 + }, + { + "epoch": 6.6068809926677945, + "grad_norm": 1.4480869770050049, + "learning_rate": 1.6968415115623236e-05, + "loss": 0.7843, + "step": 11714 + }, + { + "epoch": 6.607445008460237, + "grad_norm": 1.184696912765503, + "learning_rate": 1.6965595036661028e-05, + "loss": 0.7736, + "step": 11715 + }, + { + "epoch": 6.608009024252679, + "grad_norm": 1.094364881515503, + "learning_rate": 1.6962774957698817e-05, + "loss": 0.688, + "step": 11716 + }, + { + "epoch": 6.608573040045121, + "grad_norm": 1.077919602394104, + "learning_rate": 1.6959954878736606e-05, + "loss": 0.7228, + "step": 11717 + }, + { + "epoch": 6.6091370558375635, + "grad_norm": 1.086483120918274, + "learning_rate": 1.6957134799774394e-05, + "loss": 0.7138, + "step": 11718 + }, + { + "epoch": 6.609701071630005, + "grad_norm": 0.9025209546089172, + "learning_rate": 1.6954314720812183e-05, + "loss": 0.6345, + "step": 11719 + }, + { + "epoch": 6.610265087422448, + "grad_norm": 0.9713317155838013, + "learning_rate": 1.6951494641849972e-05, + "loss": 0.7746, + "step": 11720 + }, + { + "epoch": 6.61082910321489, + "grad_norm": 1.0555212497711182, + "learning_rate": 1.694867456288776e-05, + "loss": 0.6993, + "step": 11721 + }, + { + "epoch": 6.6113931190073325, + "grad_norm": 1.1437407732009888, + "learning_rate": 1.694585448392555e-05, + "loss": 0.701, + "step": 11722 + }, + { + "epoch": 6.611957134799774, + "grad_norm": 1.0894231796264648, + "learning_rate": 1.6943034404963338e-05, + "loss": 0.7371, + "step": 11723 + }, + { + "epoch": 6.612521150592217, + "grad_norm": 1.0495775938034058, + "learning_rate": 1.6940214326001127e-05, + "loss": 0.704, + "step": 11724 + }, + { + "epoch": 6.613085166384659, + "grad_norm": 0.8434204459190369, + "learning_rate": 1.693739424703892e-05, + "loss": 0.7052, + "step": 11725 + }, + { + "epoch": 6.613649182177101, + "grad_norm": 1.2861590385437012, + "learning_rate": 1.6934574168076705e-05, + "loss": 0.8097, + "step": 11726 + }, + { + "epoch": 6.614213197969543, + "grad_norm": 1.263717770576477, + "learning_rate": 1.6931754089114497e-05, + "loss": 0.6308, + "step": 11727 + }, + { + "epoch": 6.614777213761985, + "grad_norm": 0.8891022205352783, + "learning_rate": 1.6928934010152286e-05, + "loss": 0.6259, + "step": 11728 + }, + { + "epoch": 6.615341229554428, + "grad_norm": 1.0597072839736938, + "learning_rate": 1.6926113931190074e-05, + "loss": 0.7422, + "step": 11729 + }, + { + "epoch": 6.61590524534687, + "grad_norm": 0.8857246041297913, + "learning_rate": 1.6923293852227863e-05, + "loss": 0.7201, + "step": 11730 + }, + { + "epoch": 6.616469261139311, + "grad_norm": 1.0322154760360718, + "learning_rate": 1.6920473773265652e-05, + "loss": 0.7128, + "step": 11731 + }, + { + "epoch": 6.617033276931754, + "grad_norm": 1.2368800640106201, + "learning_rate": 1.691765369430344e-05, + "loss": 0.7308, + "step": 11732 + }, + { + "epoch": 6.617597292724196, + "grad_norm": 1.0891664028167725, + "learning_rate": 1.691483361534123e-05, + "loss": 0.8813, + "step": 11733 + }, + { + "epoch": 6.618161308516639, + "grad_norm": 1.3138484954833984, + "learning_rate": 1.6912013536379022e-05, + "loss": 0.7823, + "step": 11734 + }, + { + "epoch": 6.6187253243090804, + "grad_norm": 0.8797222375869751, + "learning_rate": 1.6909193457416807e-05, + "loss": 0.6128, + "step": 11735 + }, + { + "epoch": 6.619289340101523, + "grad_norm": 1.075474739074707, + "learning_rate": 1.69063733784546e-05, + "loss": 0.6754, + "step": 11736 + }, + { + "epoch": 6.619853355893965, + "grad_norm": 1.240526556968689, + "learning_rate": 1.6903553299492385e-05, + "loss": 0.748, + "step": 11737 + }, + { + "epoch": 6.620417371686408, + "grad_norm": 1.091849684715271, + "learning_rate": 1.6900733220530177e-05, + "loss": 0.8224, + "step": 11738 + }, + { + "epoch": 6.6209813874788495, + "grad_norm": 1.091629147529602, + "learning_rate": 1.6897913141567966e-05, + "loss": 0.7466, + "step": 11739 + }, + { + "epoch": 6.621545403271291, + "grad_norm": 1.4484195709228516, + "learning_rate": 1.6895093062605755e-05, + "loss": 0.7127, + "step": 11740 + }, + { + "epoch": 6.622109419063734, + "grad_norm": 1.240852952003479, + "learning_rate": 1.6892272983643543e-05, + "loss": 0.7018, + "step": 11741 + }, + { + "epoch": 6.622673434856176, + "grad_norm": 1.2582087516784668, + "learning_rate": 1.6889452904681332e-05, + "loss": 0.7496, + "step": 11742 + }, + { + "epoch": 6.6232374506486185, + "grad_norm": 1.0667473077774048, + "learning_rate": 1.688663282571912e-05, + "loss": 0.6237, + "step": 11743 + }, + { + "epoch": 6.62380146644106, + "grad_norm": 1.1655818223953247, + "learning_rate": 1.688381274675691e-05, + "loss": 0.7705, + "step": 11744 + }, + { + "epoch": 6.624365482233502, + "grad_norm": 2.2282426357269287, + "learning_rate": 1.68809926677947e-05, + "loss": 0.8523, + "step": 11745 + }, + { + "epoch": 6.624929498025945, + "grad_norm": 1.004740595817566, + "learning_rate": 1.6878172588832487e-05, + "loss": 0.6735, + "step": 11746 + }, + { + "epoch": 6.625493513818387, + "grad_norm": 0.9403308629989624, + "learning_rate": 1.6875352509870276e-05, + "loss": 0.6719, + "step": 11747 + }, + { + "epoch": 6.626057529610829, + "grad_norm": 1.665797472000122, + "learning_rate": 1.687253243090807e-05, + "loss": 0.8565, + "step": 11748 + }, + { + "epoch": 6.626621545403271, + "grad_norm": 1.2578465938568115, + "learning_rate": 1.6869712351945854e-05, + "loss": 0.6906, + "step": 11749 + }, + { + "epoch": 6.627185561195714, + "grad_norm": 1.2828081846237183, + "learning_rate": 1.6866892272983646e-05, + "loss": 0.8107, + "step": 11750 + }, + { + "epoch": 6.627749576988156, + "grad_norm": 1.1974927186965942, + "learning_rate": 1.686407219402143e-05, + "loss": 0.7665, + "step": 11751 + }, + { + "epoch": 6.628313592780598, + "grad_norm": 1.5289145708084106, + "learning_rate": 1.6861252115059224e-05, + "loss": 0.8529, + "step": 11752 + }, + { + "epoch": 6.62887760857304, + "grad_norm": 1.0394079685211182, + "learning_rate": 1.685843203609701e-05, + "loss": 0.8588, + "step": 11753 + }, + { + "epoch": 6.629441624365482, + "grad_norm": 1.006235957145691, + "learning_rate": 1.68556119571348e-05, + "loss": 0.7752, + "step": 11754 + }, + { + "epoch": 6.630005640157925, + "grad_norm": 0.9011165499687195, + "learning_rate": 1.685279187817259e-05, + "loss": 0.6542, + "step": 11755 + }, + { + "epoch": 6.630569655950366, + "grad_norm": 1.054163932800293, + "learning_rate": 1.684997179921038e-05, + "loss": 0.6522, + "step": 11756 + }, + { + "epoch": 6.631133671742809, + "grad_norm": 1.2652350664138794, + "learning_rate": 1.6847151720248168e-05, + "loss": 0.775, + "step": 11757 + }, + { + "epoch": 6.631697687535251, + "grad_norm": 1.2480170726776123, + "learning_rate": 1.6844331641285956e-05, + "loss": 0.8725, + "step": 11758 + }, + { + "epoch": 6.632261703327693, + "grad_norm": 1.2332496643066406, + "learning_rate": 1.6841511562323745e-05, + "loss": 0.8304, + "step": 11759 + }, + { + "epoch": 6.632825719120135, + "grad_norm": 1.0627444982528687, + "learning_rate": 1.6838691483361534e-05, + "loss": 0.7556, + "step": 11760 + }, + { + "epoch": 6.633389734912577, + "grad_norm": 1.135066032409668, + "learning_rate": 1.6835871404399323e-05, + "loss": 0.8234, + "step": 11761 + }, + { + "epoch": 6.63395375070502, + "grad_norm": 1.1012377738952637, + "learning_rate": 1.683305132543711e-05, + "loss": 0.725, + "step": 11762 + }, + { + "epoch": 6.634517766497462, + "grad_norm": 1.0348577499389648, + "learning_rate": 1.6830231246474904e-05, + "loss": 0.8174, + "step": 11763 + }, + { + "epoch": 6.635081782289904, + "grad_norm": 1.4856054782867432, + "learning_rate": 1.6827411167512693e-05, + "loss": 0.7783, + "step": 11764 + }, + { + "epoch": 6.635645798082346, + "grad_norm": 0.9017959833145142, + "learning_rate": 1.682459108855048e-05, + "loss": 0.6808, + "step": 11765 + }, + { + "epoch": 6.636209813874789, + "grad_norm": 1.0857878923416138, + "learning_rate": 1.682177100958827e-05, + "loss": 0.7917, + "step": 11766 + }, + { + "epoch": 6.636773829667231, + "grad_norm": 1.228956937789917, + "learning_rate": 1.681895093062606e-05, + "loss": 0.7103, + "step": 11767 + }, + { + "epoch": 6.6373378454596725, + "grad_norm": 1.2059898376464844, + "learning_rate": 1.6816130851663848e-05, + "loss": 0.6966, + "step": 11768 + }, + { + "epoch": 6.637901861252115, + "grad_norm": 1.110062599182129, + "learning_rate": 1.6813310772701636e-05, + "loss": 0.6922, + "step": 11769 + }, + { + "epoch": 6.638465877044557, + "grad_norm": 1.0371633768081665, + "learning_rate": 1.6810490693739425e-05, + "loss": 0.7884, + "step": 11770 + }, + { + "epoch": 6.639029892837, + "grad_norm": 1.8619391918182373, + "learning_rate": 1.6807670614777214e-05, + "loss": 0.8253, + "step": 11771 + }, + { + "epoch": 6.6395939086294415, + "grad_norm": 1.3169634342193604, + "learning_rate": 1.6804850535815003e-05, + "loss": 0.7435, + "step": 11772 + }, + { + "epoch": 6.640157924421883, + "grad_norm": 0.8236050009727478, + "learning_rate": 1.6802030456852795e-05, + "loss": 0.6685, + "step": 11773 + }, + { + "epoch": 6.640721940214326, + "grad_norm": 1.1673450469970703, + "learning_rate": 1.679921037789058e-05, + "loss": 0.751, + "step": 11774 + }, + { + "epoch": 6.641285956006768, + "grad_norm": 0.9296943545341492, + "learning_rate": 1.6796390298928373e-05, + "loss": 0.6753, + "step": 11775 + }, + { + "epoch": 6.6418499717992106, + "grad_norm": 1.1996450424194336, + "learning_rate": 1.6793570219966158e-05, + "loss": 0.7993, + "step": 11776 + }, + { + "epoch": 6.642413987591652, + "grad_norm": 1.046977162361145, + "learning_rate": 1.679075014100395e-05, + "loss": 0.774, + "step": 11777 + }, + { + "epoch": 6.642978003384095, + "grad_norm": 1.144750952720642, + "learning_rate": 1.6787930062041736e-05, + "loss": 0.6104, + "step": 11778 + }, + { + "epoch": 6.643542019176537, + "grad_norm": 1.6218856573104858, + "learning_rate": 1.6785109983079528e-05, + "loss": 0.8543, + "step": 11779 + }, + { + "epoch": 6.64410603496898, + "grad_norm": 0.9035986661911011, + "learning_rate": 1.6782289904117317e-05, + "loss": 0.7296, + "step": 11780 + }, + { + "epoch": 6.644670050761421, + "grad_norm": 1.018458366394043, + "learning_rate": 1.6779469825155105e-05, + "loss": 0.6803, + "step": 11781 + }, + { + "epoch": 6.645234066553863, + "grad_norm": 1.1811786890029907, + "learning_rate": 1.6776649746192894e-05, + "loss": 0.867, + "step": 11782 + }, + { + "epoch": 6.645798082346306, + "grad_norm": 1.4134066104888916, + "learning_rate": 1.6773829667230683e-05, + "loss": 0.7226, + "step": 11783 + }, + { + "epoch": 6.646362098138748, + "grad_norm": 1.0826178789138794, + "learning_rate": 1.6771009588268472e-05, + "loss": 0.6426, + "step": 11784 + }, + { + "epoch": 6.64692611393119, + "grad_norm": 0.9440646171569824, + "learning_rate": 1.676818950930626e-05, + "loss": 0.7604, + "step": 11785 + }, + { + "epoch": 6.647490129723632, + "grad_norm": 1.168129801750183, + "learning_rate": 1.676536943034405e-05, + "loss": 0.786, + "step": 11786 + }, + { + "epoch": 6.648054145516074, + "grad_norm": 1.1100575923919678, + "learning_rate": 1.6762549351381838e-05, + "loss": 0.6439, + "step": 11787 + }, + { + "epoch": 6.648618161308517, + "grad_norm": 0.8063020706176758, + "learning_rate": 1.6759729272419627e-05, + "loss": 0.659, + "step": 11788 + }, + { + "epoch": 6.6491821771009585, + "grad_norm": 1.2389706373214722, + "learning_rate": 1.675690919345742e-05, + "loss": 0.7949, + "step": 11789 + }, + { + "epoch": 6.649746192893401, + "grad_norm": 1.1054346561431885, + "learning_rate": 1.6754089114495205e-05, + "loss": 0.7585, + "step": 11790 + }, + { + "epoch": 6.650310208685843, + "grad_norm": 1.3628766536712646, + "learning_rate": 1.6751269035532997e-05, + "loss": 0.7745, + "step": 11791 + }, + { + "epoch": 6.650874224478286, + "grad_norm": 1.0113754272460938, + "learning_rate": 1.6748448956570782e-05, + "loss": 0.6154, + "step": 11792 + }, + { + "epoch": 6.6514382402707275, + "grad_norm": 1.3002893924713135, + "learning_rate": 1.6745628877608574e-05, + "loss": 0.7795, + "step": 11793 + }, + { + "epoch": 6.65200225606317, + "grad_norm": 1.1050136089324951, + "learning_rate": 1.6742808798646363e-05, + "loss": 0.8069, + "step": 11794 + }, + { + "epoch": 6.652566271855612, + "grad_norm": 1.0484873056411743, + "learning_rate": 1.6739988719684152e-05, + "loss": 0.7868, + "step": 11795 + }, + { + "epoch": 6.653130287648054, + "grad_norm": 1.581910490989685, + "learning_rate": 1.673716864072194e-05, + "loss": 0.8093, + "step": 11796 + }, + { + "epoch": 6.6536943034404965, + "grad_norm": 1.2282921075820923, + "learning_rate": 1.673434856175973e-05, + "loss": 0.6456, + "step": 11797 + }, + { + "epoch": 6.654258319232938, + "grad_norm": 1.0640246868133545, + "learning_rate": 1.6731528482797522e-05, + "loss": 0.7424, + "step": 11798 + }, + { + "epoch": 6.654822335025381, + "grad_norm": 1.4032953977584839, + "learning_rate": 1.6728708403835307e-05, + "loss": 0.7527, + "step": 11799 + }, + { + "epoch": 6.655386350817823, + "grad_norm": 0.9533419013023376, + "learning_rate": 1.67258883248731e-05, + "loss": 0.6521, + "step": 11800 + }, + { + "epoch": 6.655950366610265, + "grad_norm": 0.9996126890182495, + "learning_rate": 1.6723068245910885e-05, + "loss": 0.7568, + "step": 11801 + }, + { + "epoch": 6.656514382402707, + "grad_norm": 0.930286705493927, + "learning_rate": 1.6720248166948677e-05, + "loss": 0.6599, + "step": 11802 + }, + { + "epoch": 6.657078398195149, + "grad_norm": 1.054212212562561, + "learning_rate": 1.6717428087986466e-05, + "loss": 0.8328, + "step": 11803 + }, + { + "epoch": 6.657642413987592, + "grad_norm": 1.1632660627365112, + "learning_rate": 1.6714608009024255e-05, + "loss": 0.6584, + "step": 11804 + }, + { + "epoch": 6.658206429780034, + "grad_norm": 0.8103170394897461, + "learning_rate": 1.6711787930062043e-05, + "loss": 0.7373, + "step": 11805 + }, + { + "epoch": 6.658770445572476, + "grad_norm": 1.0514938831329346, + "learning_rate": 1.6708967851099832e-05, + "loss": 0.6557, + "step": 11806 + }, + { + "epoch": 6.659334461364918, + "grad_norm": 0.7936477065086365, + "learning_rate": 1.670614777213762e-05, + "loss": 0.6742, + "step": 11807 + }, + { + "epoch": 6.659898477157361, + "grad_norm": 1.2752323150634766, + "learning_rate": 1.670332769317541e-05, + "loss": 0.7064, + "step": 11808 + }, + { + "epoch": 6.660462492949803, + "grad_norm": 1.1281402111053467, + "learning_rate": 1.67005076142132e-05, + "loss": 0.7667, + "step": 11809 + }, + { + "epoch": 6.6610265087422444, + "grad_norm": 1.6654255390167236, + "learning_rate": 1.6697687535250987e-05, + "loss": 0.7404, + "step": 11810 + }, + { + "epoch": 6.661590524534687, + "grad_norm": 1.363050103187561, + "learning_rate": 1.6694867456288776e-05, + "loss": 0.6946, + "step": 11811 + }, + { + "epoch": 6.662154540327129, + "grad_norm": 1.2700726985931396, + "learning_rate": 1.6692047377326568e-05, + "loss": 0.6935, + "step": 11812 + }, + { + "epoch": 6.662718556119572, + "grad_norm": 1.1700515747070312, + "learning_rate": 1.6689227298364354e-05, + "loss": 0.8213, + "step": 11813 + }, + { + "epoch": 6.6632825719120135, + "grad_norm": 1.4593487977981567, + "learning_rate": 1.6686407219402146e-05, + "loss": 0.7519, + "step": 11814 + }, + { + "epoch": 6.663846587704455, + "grad_norm": 0.9667856097221375, + "learning_rate": 1.668358714043993e-05, + "loss": 0.6982, + "step": 11815 + }, + { + "epoch": 6.664410603496898, + "grad_norm": 1.0281329154968262, + "learning_rate": 1.6680767061477723e-05, + "loss": 0.7331, + "step": 11816 + }, + { + "epoch": 6.66497461928934, + "grad_norm": 0.9681075811386108, + "learning_rate": 1.667794698251551e-05, + "loss": 0.7393, + "step": 11817 + }, + { + "epoch": 6.6655386350817825, + "grad_norm": 1.0061053037643433, + "learning_rate": 1.66751269035533e-05, + "loss": 0.7991, + "step": 11818 + }, + { + "epoch": 6.666102650874224, + "grad_norm": 1.145425796508789, + "learning_rate": 1.667230682459109e-05, + "loss": 0.7576, + "step": 11819 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0008091926574707, + "learning_rate": 1.666948674562888e-05, + "loss": 0.7673, + "step": 11820 + }, + { + "epoch": 6.667230682459109, + "grad_norm": 1.033789038658142, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.8073, + "step": 11821 + }, + { + "epoch": 6.6677946982515515, + "grad_norm": 2.001570463180542, + "learning_rate": 1.6663846587704456e-05, + "loss": 0.8472, + "step": 11822 + }, + { + "epoch": 6.668358714043993, + "grad_norm": 1.1447398662567139, + "learning_rate": 1.6661026508742245e-05, + "loss": 0.8146, + "step": 11823 + }, + { + "epoch": 6.668922729836435, + "grad_norm": 1.2614809274673462, + "learning_rate": 1.6658206429780034e-05, + "loss": 0.7306, + "step": 11824 + }, + { + "epoch": 6.669486745628878, + "grad_norm": 1.1053944826126099, + "learning_rate": 1.6655386350817823e-05, + "loss": 0.7408, + "step": 11825 + }, + { + "epoch": 6.67005076142132, + "grad_norm": 1.2858333587646484, + "learning_rate": 1.665256627185561e-05, + "loss": 0.7969, + "step": 11826 + }, + { + "epoch": 6.670614777213762, + "grad_norm": 1.447189211845398, + "learning_rate": 1.66497461928934e-05, + "loss": 0.8627, + "step": 11827 + }, + { + "epoch": 6.671178793006204, + "grad_norm": 1.1392217874526978, + "learning_rate": 1.6646926113931192e-05, + "loss": 0.7027, + "step": 11828 + }, + { + "epoch": 6.671742808798646, + "grad_norm": 2.0496366024017334, + "learning_rate": 1.6644106034968978e-05, + "loss": 0.8879, + "step": 11829 + }, + { + "epoch": 6.672306824591089, + "grad_norm": 1.15285325050354, + "learning_rate": 1.664128595600677e-05, + "loss": 0.7618, + "step": 11830 + }, + { + "epoch": 6.67287084038353, + "grad_norm": 1.904531478881836, + "learning_rate": 1.6638465877044555e-05, + "loss": 0.76, + "step": 11831 + }, + { + "epoch": 6.673434856175973, + "grad_norm": 1.0056710243225098, + "learning_rate": 1.6635645798082348e-05, + "loss": 0.7184, + "step": 11832 + }, + { + "epoch": 6.673998871968415, + "grad_norm": 1.144544243812561, + "learning_rate": 1.6632825719120136e-05, + "loss": 0.6259, + "step": 11833 + }, + { + "epoch": 6.674562887760858, + "grad_norm": 1.0497398376464844, + "learning_rate": 1.6630005640157925e-05, + "loss": 0.6656, + "step": 11834 + }, + { + "epoch": 6.675126903553299, + "grad_norm": 0.856687068939209, + "learning_rate": 1.6627185561195714e-05, + "loss": 0.6918, + "step": 11835 + }, + { + "epoch": 6.675690919345742, + "grad_norm": 0.9192733764648438, + "learning_rate": 1.6624365482233503e-05, + "loss": 0.6423, + "step": 11836 + }, + { + "epoch": 6.676254935138184, + "grad_norm": 1.3060230016708374, + "learning_rate": 1.6621545403271295e-05, + "loss": 0.735, + "step": 11837 + }, + { + "epoch": 6.676818950930626, + "grad_norm": 0.9620720148086548, + "learning_rate": 1.661872532430908e-05, + "loss": 0.7399, + "step": 11838 + }, + { + "epoch": 6.677382966723068, + "grad_norm": 1.3061689138412476, + "learning_rate": 1.6615905245346873e-05, + "loss": 0.8278, + "step": 11839 + }, + { + "epoch": 6.67794698251551, + "grad_norm": 1.3112024068832397, + "learning_rate": 1.6613085166384658e-05, + "loss": 0.8454, + "step": 11840 + }, + { + "epoch": 6.678510998307953, + "grad_norm": 1.2737445831298828, + "learning_rate": 1.661026508742245e-05, + "loss": 0.7474, + "step": 11841 + }, + { + "epoch": 6.679075014100395, + "grad_norm": 0.9368888139724731, + "learning_rate": 1.660744500846024e-05, + "loss": 0.7123, + "step": 11842 + }, + { + "epoch": 6.6796390298928365, + "grad_norm": 1.3229705095291138, + "learning_rate": 1.6604624929498028e-05, + "loss": 0.8228, + "step": 11843 + }, + { + "epoch": 6.680203045685279, + "grad_norm": 1.3128095865249634, + "learning_rate": 1.6601804850535817e-05, + "loss": 0.7715, + "step": 11844 + }, + { + "epoch": 6.680767061477721, + "grad_norm": 1.1347700357437134, + "learning_rate": 1.6598984771573605e-05, + "loss": 0.8095, + "step": 11845 + }, + { + "epoch": 6.681331077270164, + "grad_norm": 1.1338787078857422, + "learning_rate": 1.6596164692611394e-05, + "loss": 0.6928, + "step": 11846 + }, + { + "epoch": 6.6818950930626055, + "grad_norm": 1.0692081451416016, + "learning_rate": 1.6593344613649183e-05, + "loss": 0.6683, + "step": 11847 + }, + { + "epoch": 6.682459108855048, + "grad_norm": 1.2499215602874756, + "learning_rate": 1.6590524534686972e-05, + "loss": 0.7602, + "step": 11848 + }, + { + "epoch": 6.68302312464749, + "grad_norm": 1.4954506158828735, + "learning_rate": 1.658770445572476e-05, + "loss": 0.8824, + "step": 11849 + }, + { + "epoch": 6.683587140439933, + "grad_norm": 0.9808197021484375, + "learning_rate": 1.658488437676255e-05, + "loss": 0.7033, + "step": 11850 + }, + { + "epoch": 6.6841511562323745, + "grad_norm": 0.920637309551239, + "learning_rate": 1.658206429780034e-05, + "loss": 0.6047, + "step": 11851 + }, + { + "epoch": 6.684715172024816, + "grad_norm": 1.0171732902526855, + "learning_rate": 1.6579244218838127e-05, + "loss": 0.6698, + "step": 11852 + }, + { + "epoch": 6.685279187817259, + "grad_norm": 1.0925617218017578, + "learning_rate": 1.657642413987592e-05, + "loss": 0.7026, + "step": 11853 + }, + { + "epoch": 6.685843203609701, + "grad_norm": 1.16087806224823, + "learning_rate": 1.6573604060913705e-05, + "loss": 0.8256, + "step": 11854 + }, + { + "epoch": 6.6864072194021436, + "grad_norm": 1.7638483047485352, + "learning_rate": 1.6570783981951497e-05, + "loss": 0.8707, + "step": 11855 + }, + { + "epoch": 6.686971235194585, + "grad_norm": 1.297455906867981, + "learning_rate": 1.6567963902989282e-05, + "loss": 0.7567, + "step": 11856 + }, + { + "epoch": 6.687535250987027, + "grad_norm": 0.9805709719657898, + "learning_rate": 1.6565143824027074e-05, + "loss": 0.7345, + "step": 11857 + }, + { + "epoch": 6.68809926677947, + "grad_norm": 1.0571184158325195, + "learning_rate": 1.6562323745064863e-05, + "loss": 0.7013, + "step": 11858 + }, + { + "epoch": 6.688663282571912, + "grad_norm": 1.0704827308654785, + "learning_rate": 1.6559503666102652e-05, + "loss": 0.7327, + "step": 11859 + }, + { + "epoch": 6.689227298364354, + "grad_norm": 1.267427682876587, + "learning_rate": 1.655668358714044e-05, + "loss": 0.7324, + "step": 11860 + }, + { + "epoch": 6.689791314156796, + "grad_norm": 1.1646655797958374, + "learning_rate": 1.655386350817823e-05, + "loss": 0.8444, + "step": 11861 + }, + { + "epoch": 6.690355329949239, + "grad_norm": 1.032322645187378, + "learning_rate": 1.6551043429216018e-05, + "loss": 0.7657, + "step": 11862 + }, + { + "epoch": 6.690919345741681, + "grad_norm": 0.9903132319450378, + "learning_rate": 1.6548223350253807e-05, + "loss": 0.612, + "step": 11863 + }, + { + "epoch": 6.691483361534123, + "grad_norm": 1.0906884670257568, + "learning_rate": 1.6545403271291596e-05, + "loss": 0.6784, + "step": 11864 + }, + { + "epoch": 6.692047377326565, + "grad_norm": 1.4697703123092651, + "learning_rate": 1.6542583192329385e-05, + "loss": 0.8038, + "step": 11865 + }, + { + "epoch": 6.692611393119007, + "grad_norm": 1.1735763549804688, + "learning_rate": 1.6539763113367173e-05, + "loss": 0.7624, + "step": 11866 + }, + { + "epoch": 6.69317540891145, + "grad_norm": 1.887709617614746, + "learning_rate": 1.6536943034404966e-05, + "loss": 0.83, + "step": 11867 + }, + { + "epoch": 6.6937394247038915, + "grad_norm": 1.0858913660049438, + "learning_rate": 1.653412295544275e-05, + "loss": 0.7426, + "step": 11868 + }, + { + "epoch": 6.694303440496334, + "grad_norm": 1.1864426136016846, + "learning_rate": 1.6531302876480543e-05, + "loss": 0.7115, + "step": 11869 + }, + { + "epoch": 6.694867456288776, + "grad_norm": 1.1070575714111328, + "learning_rate": 1.6528482797518332e-05, + "loss": 0.7535, + "step": 11870 + }, + { + "epoch": 6.695431472081218, + "grad_norm": 1.1681350469589233, + "learning_rate": 1.652566271855612e-05, + "loss": 0.7487, + "step": 11871 + }, + { + "epoch": 6.6959954878736605, + "grad_norm": 1.3532156944274902, + "learning_rate": 1.652284263959391e-05, + "loss": 0.7988, + "step": 11872 + }, + { + "epoch": 6.696559503666102, + "grad_norm": 1.153429388999939, + "learning_rate": 1.65200225606317e-05, + "loss": 0.6311, + "step": 11873 + }, + { + "epoch": 6.697123519458545, + "grad_norm": 1.1897891759872437, + "learning_rate": 1.6517202481669487e-05, + "loss": 0.7536, + "step": 11874 + }, + { + "epoch": 6.697687535250987, + "grad_norm": 0.9949657320976257, + "learning_rate": 1.6514382402707276e-05, + "loss": 0.681, + "step": 11875 + }, + { + "epoch": 6.6982515510434295, + "grad_norm": 1.0410997867584229, + "learning_rate": 1.6511562323745068e-05, + "loss": 0.7115, + "step": 11876 + }, + { + "epoch": 6.698815566835871, + "grad_norm": 1.570593237876892, + "learning_rate": 1.6508742244782854e-05, + "loss": 0.8113, + "step": 11877 + }, + { + "epoch": 6.699379582628314, + "grad_norm": 0.9846529960632324, + "learning_rate": 1.6505922165820646e-05, + "loss": 0.6709, + "step": 11878 + }, + { + "epoch": 6.699943598420756, + "grad_norm": 1.132629156112671, + "learning_rate": 1.650310208685843e-05, + "loss": 0.633, + "step": 11879 + }, + { + "epoch": 6.700507614213198, + "grad_norm": 1.2873625755310059, + "learning_rate": 1.6500282007896223e-05, + "loss": 0.7298, + "step": 11880 + }, + { + "epoch": 6.70107163000564, + "grad_norm": 2.1183133125305176, + "learning_rate": 1.649746192893401e-05, + "loss": 0.7325, + "step": 11881 + }, + { + "epoch": 6.701635645798082, + "grad_norm": 1.4509153366088867, + "learning_rate": 1.64946418499718e-05, + "loss": 0.8525, + "step": 11882 + }, + { + "epoch": 6.702199661590525, + "grad_norm": 1.0997072458267212, + "learning_rate": 1.649182177100959e-05, + "loss": 0.8377, + "step": 11883 + }, + { + "epoch": 6.702763677382967, + "grad_norm": 1.1358948945999146, + "learning_rate": 1.648900169204738e-05, + "loss": 0.7243, + "step": 11884 + }, + { + "epoch": 6.703327693175408, + "grad_norm": 1.3969355821609497, + "learning_rate": 1.6486181613085167e-05, + "loss": 0.8295, + "step": 11885 + }, + { + "epoch": 6.703891708967851, + "grad_norm": 0.9919103980064392, + "learning_rate": 1.6483361534122956e-05, + "loss": 0.695, + "step": 11886 + }, + { + "epoch": 6.704455724760293, + "grad_norm": 1.4958637952804565, + "learning_rate": 1.6480541455160745e-05, + "loss": 0.7143, + "step": 11887 + }, + { + "epoch": 6.705019740552736, + "grad_norm": 1.3266067504882812, + "learning_rate": 1.6477721376198534e-05, + "loss": 0.7991, + "step": 11888 + }, + { + "epoch": 6.7055837563451774, + "grad_norm": 1.1922659873962402, + "learning_rate": 1.6474901297236323e-05, + "loss": 0.7547, + "step": 11889 + }, + { + "epoch": 6.70614777213762, + "grad_norm": 1.1615495681762695, + "learning_rate": 1.647208121827411e-05, + "loss": 0.7356, + "step": 11890 + }, + { + "epoch": 6.706711787930062, + "grad_norm": 1.073805570602417, + "learning_rate": 1.64692611393119e-05, + "loss": 0.755, + "step": 11891 + }, + { + "epoch": 6.707275803722505, + "grad_norm": 2.08866286277771, + "learning_rate": 1.6466441060349692e-05, + "loss": 0.8638, + "step": 11892 + }, + { + "epoch": 6.7078398195149465, + "grad_norm": 1.381706714630127, + "learning_rate": 1.6463620981387478e-05, + "loss": 0.822, + "step": 11893 + }, + { + "epoch": 6.708403835307388, + "grad_norm": 1.3307867050170898, + "learning_rate": 1.646080090242527e-05, + "loss": 0.7323, + "step": 11894 + }, + { + "epoch": 6.708967851099831, + "grad_norm": 1.3518778085708618, + "learning_rate": 1.6457980823463055e-05, + "loss": 0.7106, + "step": 11895 + }, + { + "epoch": 6.709531866892273, + "grad_norm": 1.0441864728927612, + "learning_rate": 1.6455160744500848e-05, + "loss": 0.7952, + "step": 11896 + }, + { + "epoch": 6.7100958826847155, + "grad_norm": 1.7001174688339233, + "learning_rate": 1.6452340665538636e-05, + "loss": 0.7122, + "step": 11897 + }, + { + "epoch": 6.710659898477157, + "grad_norm": 1.2991830110549927, + "learning_rate": 1.6449520586576425e-05, + "loss": 0.758, + "step": 11898 + }, + { + "epoch": 6.711223914269599, + "grad_norm": 0.9317671656608582, + "learning_rate": 1.6446700507614214e-05, + "loss": 0.6344, + "step": 11899 + }, + { + "epoch": 6.711787930062042, + "grad_norm": 0.9558156132698059, + "learning_rate": 1.6443880428652003e-05, + "loss": 0.696, + "step": 11900 + }, + { + "epoch": 6.712351945854484, + "grad_norm": 1.2398123741149902, + "learning_rate": 1.644106034968979e-05, + "loss": 0.835, + "step": 11901 + }, + { + "epoch": 6.712915961646926, + "grad_norm": 1.538633942604065, + "learning_rate": 1.643824027072758e-05, + "loss": 0.8074, + "step": 11902 + }, + { + "epoch": 6.713479977439368, + "grad_norm": 1.2961865663528442, + "learning_rate": 1.643542019176537e-05, + "loss": 0.7617, + "step": 11903 + }, + { + "epoch": 6.714043993231811, + "grad_norm": 0.8547909259796143, + "learning_rate": 1.6432600112803158e-05, + "loss": 0.7311, + "step": 11904 + }, + { + "epoch": 6.714608009024253, + "grad_norm": 1.7119001150131226, + "learning_rate": 1.642978003384095e-05, + "loss": 0.8428, + "step": 11905 + }, + { + "epoch": 6.715172024816695, + "grad_norm": 1.2271900177001953, + "learning_rate": 1.642695995487874e-05, + "loss": 0.8641, + "step": 11906 + }, + { + "epoch": 6.715736040609137, + "grad_norm": 1.2397290468215942, + "learning_rate": 1.6424139875916528e-05, + "loss": 0.7552, + "step": 11907 + }, + { + "epoch": 6.716300056401579, + "grad_norm": 1.2712554931640625, + "learning_rate": 1.6421319796954316e-05, + "loss": 0.7149, + "step": 11908 + }, + { + "epoch": 6.716864072194022, + "grad_norm": 1.029270887374878, + "learning_rate": 1.6418499717992105e-05, + "loss": 0.6732, + "step": 11909 + }, + { + "epoch": 6.717428087986463, + "grad_norm": 0.9642612934112549, + "learning_rate": 1.6415679639029894e-05, + "loss": 0.8025, + "step": 11910 + }, + { + "epoch": 6.717992103778906, + "grad_norm": 1.6624085903167725, + "learning_rate": 1.6412859560067683e-05, + "loss": 0.7913, + "step": 11911 + }, + { + "epoch": 6.718556119571348, + "grad_norm": 0.8896328210830688, + "learning_rate": 1.641003948110547e-05, + "loss": 0.6272, + "step": 11912 + }, + { + "epoch": 6.71912013536379, + "grad_norm": 1.1621168851852417, + "learning_rate": 1.640721940214326e-05, + "loss": 0.7012, + "step": 11913 + }, + { + "epoch": 6.719684151156232, + "grad_norm": 1.4153332710266113, + "learning_rate": 1.640439932318105e-05, + "loss": 0.7666, + "step": 11914 + }, + { + "epoch": 6.720248166948674, + "grad_norm": 1.6301649808883667, + "learning_rate": 1.640157924421884e-05, + "loss": 0.7935, + "step": 11915 + }, + { + "epoch": 6.720812182741117, + "grad_norm": 1.064192295074463, + "learning_rate": 1.6398759165256627e-05, + "loss": 0.7105, + "step": 11916 + }, + { + "epoch": 6.721376198533559, + "grad_norm": 1.2445721626281738, + "learning_rate": 1.639593908629442e-05, + "loss": 0.7348, + "step": 11917 + }, + { + "epoch": 6.721940214326001, + "grad_norm": 0.9534291625022888, + "learning_rate": 1.6393119007332204e-05, + "loss": 0.7721, + "step": 11918 + }, + { + "epoch": 6.722504230118443, + "grad_norm": 1.3562809228897095, + "learning_rate": 1.6390298928369997e-05, + "loss": 0.7279, + "step": 11919 + }, + { + "epoch": 6.723068245910886, + "grad_norm": 1.1317567825317383, + "learning_rate": 1.6387478849407782e-05, + "loss": 0.7957, + "step": 11920 + }, + { + "epoch": 6.723632261703328, + "grad_norm": 1.2607790231704712, + "learning_rate": 1.6384658770445574e-05, + "loss": 0.6719, + "step": 11921 + }, + { + "epoch": 6.7241962774957695, + "grad_norm": 1.3293875455856323, + "learning_rate": 1.6381838691483363e-05, + "loss": 0.7381, + "step": 11922 + }, + { + "epoch": 6.724760293288212, + "grad_norm": 1.4775558710098267, + "learning_rate": 1.6379018612521152e-05, + "loss": 0.6684, + "step": 11923 + }, + { + "epoch": 6.725324309080654, + "grad_norm": 0.9544207453727722, + "learning_rate": 1.637619853355894e-05, + "loss": 0.77, + "step": 11924 + }, + { + "epoch": 6.725888324873097, + "grad_norm": 1.3344465494155884, + "learning_rate": 1.637337845459673e-05, + "loss": 0.7845, + "step": 11925 + }, + { + "epoch": 6.7264523406655385, + "grad_norm": 1.1106256246566772, + "learning_rate": 1.6370558375634518e-05, + "loss": 0.7491, + "step": 11926 + }, + { + "epoch": 6.72701635645798, + "grad_norm": 1.2290700674057007, + "learning_rate": 1.6367738296672307e-05, + "loss": 0.7147, + "step": 11927 + }, + { + "epoch": 6.727580372250423, + "grad_norm": 1.2170634269714355, + "learning_rate": 1.6364918217710096e-05, + "loss": 0.7795, + "step": 11928 + }, + { + "epoch": 6.728144388042865, + "grad_norm": 1.063915491104126, + "learning_rate": 1.6362098138747885e-05, + "loss": 0.6526, + "step": 11929 + }, + { + "epoch": 6.7287084038353075, + "grad_norm": 0.8424531817436218, + "learning_rate": 1.6359278059785673e-05, + "loss": 0.7246, + "step": 11930 + }, + { + "epoch": 6.729272419627749, + "grad_norm": 1.224645733833313, + "learning_rate": 1.6356457980823466e-05, + "loss": 0.786, + "step": 11931 + }, + { + "epoch": 6.729836435420192, + "grad_norm": 1.4296495914459229, + "learning_rate": 1.635363790186125e-05, + "loss": 0.7526, + "step": 11932 + }, + { + "epoch": 6.730400451212634, + "grad_norm": 1.2208889722824097, + "learning_rate": 1.6350817822899043e-05, + "loss": 0.7673, + "step": 11933 + }, + { + "epoch": 6.730964467005077, + "grad_norm": 1.5302573442459106, + "learning_rate": 1.634799774393683e-05, + "loss": 0.6921, + "step": 11934 + }, + { + "epoch": 6.731528482797518, + "grad_norm": 1.1658515930175781, + "learning_rate": 1.634517766497462e-05, + "loss": 0.7116, + "step": 11935 + }, + { + "epoch": 6.73209249858996, + "grad_norm": 1.2863284349441528, + "learning_rate": 1.6342357586012406e-05, + "loss": 0.6947, + "step": 11936 + }, + { + "epoch": 6.732656514382403, + "grad_norm": 0.880917489528656, + "learning_rate": 1.63395375070502e-05, + "loss": 0.6255, + "step": 11937 + }, + { + "epoch": 6.733220530174845, + "grad_norm": 0.9846112132072449, + "learning_rate": 1.6336717428087987e-05, + "loss": 0.7106, + "step": 11938 + }, + { + "epoch": 6.733784545967287, + "grad_norm": 1.0488346815109253, + "learning_rate": 1.6333897349125776e-05, + "loss": 0.7075, + "step": 11939 + }, + { + "epoch": 6.734348561759729, + "grad_norm": 1.0990612506866455, + "learning_rate": 1.6331077270163568e-05, + "loss": 0.6889, + "step": 11940 + }, + { + "epoch": 6.734912577552171, + "grad_norm": 1.0968385934829712, + "learning_rate": 1.6328257191201354e-05, + "loss": 0.7508, + "step": 11941 + }, + { + "epoch": 6.735476593344614, + "grad_norm": 1.3104088306427002, + "learning_rate": 1.6325437112239146e-05, + "loss": 0.7611, + "step": 11942 + }, + { + "epoch": 6.7360406091370555, + "grad_norm": 1.4971377849578857, + "learning_rate": 1.632261703327693e-05, + "loss": 0.8235, + "step": 11943 + }, + { + "epoch": 6.736604624929498, + "grad_norm": 1.1680655479431152, + "learning_rate": 1.6319796954314723e-05, + "loss": 0.6959, + "step": 11944 + }, + { + "epoch": 6.73716864072194, + "grad_norm": 1.0579555034637451, + "learning_rate": 1.631697687535251e-05, + "loss": 0.7267, + "step": 11945 + }, + { + "epoch": 6.737732656514383, + "grad_norm": 1.2176947593688965, + "learning_rate": 1.63141567963903e-05, + "loss": 0.8072, + "step": 11946 + }, + { + "epoch": 6.7382966723068245, + "grad_norm": 1.1745167970657349, + "learning_rate": 1.631133671742809e-05, + "loss": 0.7528, + "step": 11947 + }, + { + "epoch": 6.738860688099267, + "grad_norm": 0.8802444934844971, + "learning_rate": 1.630851663846588e-05, + "loss": 0.7622, + "step": 11948 + }, + { + "epoch": 6.739424703891709, + "grad_norm": 1.0620955228805542, + "learning_rate": 1.6305696559503667e-05, + "loss": 0.706, + "step": 11949 + }, + { + "epoch": 6.739988719684151, + "grad_norm": 1.013607144355774, + "learning_rate": 1.6302876480541456e-05, + "loss": 0.7537, + "step": 11950 + }, + { + "epoch": 6.7405527354765935, + "grad_norm": 1.1354527473449707, + "learning_rate": 1.6300056401579245e-05, + "loss": 0.7189, + "step": 11951 + }, + { + "epoch": 6.741116751269035, + "grad_norm": 1.3139739036560059, + "learning_rate": 1.6297236322617034e-05, + "loss": 0.8465, + "step": 11952 + }, + { + "epoch": 6.741680767061478, + "grad_norm": 1.0232094526290894, + "learning_rate": 1.6294416243654822e-05, + "loss": 0.7078, + "step": 11953 + }, + { + "epoch": 6.74224478285392, + "grad_norm": 1.3244889974594116, + "learning_rate": 1.629159616469261e-05, + "loss": 0.7823, + "step": 11954 + }, + { + "epoch": 6.742808798646362, + "grad_norm": 1.415625810623169, + "learning_rate": 1.62887760857304e-05, + "loss": 0.8225, + "step": 11955 + }, + { + "epoch": 6.743372814438804, + "grad_norm": 1.0001029968261719, + "learning_rate": 1.6285956006768192e-05, + "loss": 0.8661, + "step": 11956 + }, + { + "epoch": 6.743936830231246, + "grad_norm": 1.3931629657745361, + "learning_rate": 1.6283135927805978e-05, + "loss": 0.8367, + "step": 11957 + }, + { + "epoch": 6.744500846023689, + "grad_norm": 0.9347873330116272, + "learning_rate": 1.628031584884377e-05, + "loss": 0.6737, + "step": 11958 + }, + { + "epoch": 6.745064861816131, + "grad_norm": 1.208046555519104, + "learning_rate": 1.6277495769881555e-05, + "loss": 0.7006, + "step": 11959 + }, + { + "epoch": 6.745628877608573, + "grad_norm": 1.189727783203125, + "learning_rate": 1.6274675690919347e-05, + "loss": 0.8173, + "step": 11960 + }, + { + "epoch": 6.746192893401015, + "grad_norm": 1.2106339931488037, + "learning_rate": 1.6271855611957136e-05, + "loss": 0.6522, + "step": 11961 + }, + { + "epoch": 6.746756909193458, + "grad_norm": 1.0447925329208374, + "learning_rate": 1.6269035532994925e-05, + "loss": 0.6778, + "step": 11962 + }, + { + "epoch": 6.7473209249859, + "grad_norm": 1.1685550212860107, + "learning_rate": 1.6266215454032714e-05, + "loss": 0.7262, + "step": 11963 + }, + { + "epoch": 6.7478849407783414, + "grad_norm": 1.1924995183944702, + "learning_rate": 1.6263395375070503e-05, + "loss": 0.7271, + "step": 11964 + }, + { + "epoch": 6.748448956570784, + "grad_norm": 1.4084715843200684, + "learning_rate": 1.626057529610829e-05, + "loss": 0.7651, + "step": 11965 + }, + { + "epoch": 6.749012972363226, + "grad_norm": 1.2188911437988281, + "learning_rate": 1.625775521714608e-05, + "loss": 0.773, + "step": 11966 + }, + { + "epoch": 6.749576988155669, + "grad_norm": 1.3925652503967285, + "learning_rate": 1.625493513818387e-05, + "loss": 0.7585, + "step": 11967 + }, + { + "epoch": 6.7501410039481105, + "grad_norm": 1.2235108613967896, + "learning_rate": 1.6252115059221658e-05, + "loss": 0.7289, + "step": 11968 + }, + { + "epoch": 6.750705019740552, + "grad_norm": 1.0063049793243408, + "learning_rate": 1.6249294980259447e-05, + "loss": 0.7453, + "step": 11969 + }, + { + "epoch": 6.751269035532995, + "grad_norm": 1.216225028038025, + "learning_rate": 1.624647490129724e-05, + "loss": 0.7905, + "step": 11970 + }, + { + "epoch": 6.751833051325437, + "grad_norm": 1.0372101068496704, + "learning_rate": 1.6243654822335024e-05, + "loss": 0.6144, + "step": 11971 + }, + { + "epoch": 6.7523970671178795, + "grad_norm": 1.1113446950912476, + "learning_rate": 1.6240834743372816e-05, + "loss": 0.7059, + "step": 11972 + }, + { + "epoch": 6.752961082910321, + "grad_norm": 1.2252154350280762, + "learning_rate": 1.6238014664410602e-05, + "loss": 0.8042, + "step": 11973 + }, + { + "epoch": 6.753525098702764, + "grad_norm": 1.0624722242355347, + "learning_rate": 1.6235194585448394e-05, + "loss": 0.6938, + "step": 11974 + }, + { + "epoch": 6.754089114495206, + "grad_norm": 0.8781360983848572, + "learning_rate": 1.6232374506486183e-05, + "loss": 0.7393, + "step": 11975 + }, + { + "epoch": 6.7546531302876485, + "grad_norm": 1.3118209838867188, + "learning_rate": 1.622955442752397e-05, + "loss": 0.7405, + "step": 11976 + }, + { + "epoch": 6.75521714608009, + "grad_norm": 1.1582539081573486, + "learning_rate": 1.622673434856176e-05, + "loss": 0.6651, + "step": 11977 + }, + { + "epoch": 6.755781161872532, + "grad_norm": 1.1586025953292847, + "learning_rate": 1.622391426959955e-05, + "loss": 0.7633, + "step": 11978 + }, + { + "epoch": 6.756345177664975, + "grad_norm": 1.0837033987045288, + "learning_rate": 1.622109419063734e-05, + "loss": 0.8102, + "step": 11979 + }, + { + "epoch": 6.756909193457417, + "grad_norm": 1.0783071517944336, + "learning_rate": 1.6218274111675127e-05, + "loss": 0.7858, + "step": 11980 + }, + { + "epoch": 6.757473209249859, + "grad_norm": 1.2190114259719849, + "learning_rate": 1.621545403271292e-05, + "loss": 0.7665, + "step": 11981 + }, + { + "epoch": 6.758037225042301, + "grad_norm": 1.1589040756225586, + "learning_rate": 1.6212633953750704e-05, + "loss": 0.7736, + "step": 11982 + }, + { + "epoch": 6.758601240834743, + "grad_norm": 1.233109474182129, + "learning_rate": 1.6209813874788497e-05, + "loss": 0.7324, + "step": 11983 + }, + { + "epoch": 6.759165256627186, + "grad_norm": 0.8542524576187134, + "learning_rate": 1.6206993795826282e-05, + "loss": 0.7124, + "step": 11984 + }, + { + "epoch": 6.759729272419627, + "grad_norm": 0.8744105100631714, + "learning_rate": 1.6204173716864074e-05, + "loss": 0.6142, + "step": 11985 + }, + { + "epoch": 6.76029328821207, + "grad_norm": 0.9923286437988281, + "learning_rate": 1.6201353637901863e-05, + "loss": 0.803, + "step": 11986 + }, + { + "epoch": 6.760857304004512, + "grad_norm": 1.3793610334396362, + "learning_rate": 1.6198533558939652e-05, + "loss": 0.8574, + "step": 11987 + }, + { + "epoch": 6.761421319796955, + "grad_norm": 0.9975877404212952, + "learning_rate": 1.619571347997744e-05, + "loss": 0.7376, + "step": 11988 + }, + { + "epoch": 6.761985335589396, + "grad_norm": 9.043712615966797, + "learning_rate": 1.619289340101523e-05, + "loss": 0.7632, + "step": 11989 + }, + { + "epoch": 6.762549351381839, + "grad_norm": 1.1322485208511353, + "learning_rate": 1.6190073322053018e-05, + "loss": 0.8181, + "step": 11990 + }, + { + "epoch": 6.763113367174281, + "grad_norm": 1.0214297771453857, + "learning_rate": 1.6187253243090807e-05, + "loss": 0.6188, + "step": 11991 + }, + { + "epoch": 6.763677382966723, + "grad_norm": 1.013706922531128, + "learning_rate": 1.6184433164128596e-05, + "loss": 0.7261, + "step": 11992 + }, + { + "epoch": 6.764241398759165, + "grad_norm": 0.9764316082000732, + "learning_rate": 1.6181613085166385e-05, + "loss": 0.7098, + "step": 11993 + }, + { + "epoch": 6.764805414551607, + "grad_norm": 1.1651637554168701, + "learning_rate": 1.6178793006204173e-05, + "loss": 0.7639, + "step": 11994 + }, + { + "epoch": 6.76536943034405, + "grad_norm": 1.3844326734542847, + "learning_rate": 1.6175972927241965e-05, + "loss": 0.7415, + "step": 11995 + }, + { + "epoch": 6.765933446136492, + "grad_norm": 1.5154534578323364, + "learning_rate": 1.617315284827975e-05, + "loss": 0.8063, + "step": 11996 + }, + { + "epoch": 6.7664974619289335, + "grad_norm": 1.2961345911026, + "learning_rate": 1.6170332769317543e-05, + "loss": 0.8309, + "step": 11997 + }, + { + "epoch": 6.767061477721376, + "grad_norm": 1.286914348602295, + "learning_rate": 1.616751269035533e-05, + "loss": 0.6989, + "step": 11998 + }, + { + "epoch": 6.767625493513818, + "grad_norm": 1.364166498184204, + "learning_rate": 1.616469261139312e-05, + "loss": 0.8299, + "step": 11999 + }, + { + "epoch": 6.768189509306261, + "grad_norm": 1.5648399591445923, + "learning_rate": 1.616187253243091e-05, + "loss": 0.8192, + "step": 12000 + }, + { + "epoch": 6.7687535250987025, + "grad_norm": 1.0456267595291138, + "learning_rate": 1.6159052453468698e-05, + "loss": 0.7412, + "step": 12001 + }, + { + "epoch": 6.769317540891145, + "grad_norm": 1.19896399974823, + "learning_rate": 1.6156232374506487e-05, + "loss": 0.6951, + "step": 12002 + }, + { + "epoch": 6.769881556683587, + "grad_norm": 2.067171096801758, + "learning_rate": 1.6153412295544276e-05, + "loss": 0.7823, + "step": 12003 + }, + { + "epoch": 6.77044557247603, + "grad_norm": 0.9635501503944397, + "learning_rate": 1.6150592216582065e-05, + "loss": 0.6827, + "step": 12004 + }, + { + "epoch": 6.7710095882684715, + "grad_norm": 0.9634432196617126, + "learning_rate": 1.6147772137619853e-05, + "loss": 0.781, + "step": 12005 + }, + { + "epoch": 6.771573604060913, + "grad_norm": 0.9921138286590576, + "learning_rate": 1.6144952058657642e-05, + "loss": 0.7177, + "step": 12006 + }, + { + "epoch": 6.772137619853356, + "grad_norm": 1.2116791009902954, + "learning_rate": 1.614213197969543e-05, + "loss": 0.8161, + "step": 12007 + }, + { + "epoch": 6.772701635645798, + "grad_norm": 0.9494927525520325, + "learning_rate": 1.613931190073322e-05, + "loss": 0.7307, + "step": 12008 + }, + { + "epoch": 6.7732656514382406, + "grad_norm": 2.6439225673675537, + "learning_rate": 1.6136491821771012e-05, + "loss": 0.7955, + "step": 12009 + }, + { + "epoch": 6.773829667230682, + "grad_norm": 1.4441100358963013, + "learning_rate": 1.61336717428088e-05, + "loss": 0.7902, + "step": 12010 + }, + { + "epoch": 6.774393683023124, + "grad_norm": 0.915610134601593, + "learning_rate": 1.613085166384659e-05, + "loss": 0.7003, + "step": 12011 + }, + { + "epoch": 6.774957698815567, + "grad_norm": 1.258841872215271, + "learning_rate": 1.612803158488438e-05, + "loss": 0.6779, + "step": 12012 + }, + { + "epoch": 6.775521714608009, + "grad_norm": 1.3111250400543213, + "learning_rate": 1.6125211505922167e-05, + "loss": 0.9065, + "step": 12013 + }, + { + "epoch": 6.776085730400451, + "grad_norm": 0.9395071864128113, + "learning_rate": 1.6122391426959956e-05, + "loss": 0.7557, + "step": 12014 + }, + { + "epoch": 6.776649746192893, + "grad_norm": 1.4024980068206787, + "learning_rate": 1.6119571347997745e-05, + "loss": 0.7293, + "step": 12015 + }, + { + "epoch": 6.777213761985336, + "grad_norm": 1.1266486644744873, + "learning_rate": 1.6116751269035534e-05, + "loss": 0.7175, + "step": 12016 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 1.0908452272415161, + "learning_rate": 1.6113931190073322e-05, + "loss": 0.7104, + "step": 12017 + }, + { + "epoch": 6.77834179357022, + "grad_norm": 0.9414927363395691, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.6657, + "step": 12018 + }, + { + "epoch": 6.778905809362662, + "grad_norm": 1.7977542877197266, + "learning_rate": 1.61082910321489e-05, + "loss": 0.8524, + "step": 12019 + }, + { + "epoch": 6.779469825155104, + "grad_norm": 1.1446200609207153, + "learning_rate": 1.6105470953186692e-05, + "loss": 0.7064, + "step": 12020 + }, + { + "epoch": 6.780033840947547, + "grad_norm": 0.8649377226829529, + "learning_rate": 1.6102650874224478e-05, + "loss": 0.694, + "step": 12021 + }, + { + "epoch": 6.7805978567399885, + "grad_norm": 1.1044071912765503, + "learning_rate": 1.609983079526227e-05, + "loss": 0.7969, + "step": 12022 + }, + { + "epoch": 6.781161872532431, + "grad_norm": 0.9269590973854065, + "learning_rate": 1.6097010716300055e-05, + "loss": 0.6309, + "step": 12023 + }, + { + "epoch": 6.781725888324873, + "grad_norm": 1.641717553138733, + "learning_rate": 1.6094190637337847e-05, + "loss": 0.8107, + "step": 12024 + }, + { + "epoch": 6.782289904117315, + "grad_norm": 1.3916932344436646, + "learning_rate": 1.6091370558375636e-05, + "loss": 0.8267, + "step": 12025 + }, + { + "epoch": 6.7828539199097575, + "grad_norm": 1.7484040260314941, + "learning_rate": 1.6088550479413425e-05, + "loss": 0.7901, + "step": 12026 + }, + { + "epoch": 6.783417935702199, + "grad_norm": 1.8418627977371216, + "learning_rate": 1.6085730400451214e-05, + "loss": 0.8093, + "step": 12027 + }, + { + "epoch": 6.783981951494642, + "grad_norm": 1.0795351266860962, + "learning_rate": 1.6082910321489003e-05, + "loss": 0.72, + "step": 12028 + }, + { + "epoch": 6.784545967287084, + "grad_norm": 1.2995606660842896, + "learning_rate": 1.608009024252679e-05, + "loss": 0.7525, + "step": 12029 + }, + { + "epoch": 6.7851099830795265, + "grad_norm": 1.2018096446990967, + "learning_rate": 1.607727016356458e-05, + "loss": 0.8083, + "step": 12030 + }, + { + "epoch": 6.785673998871968, + "grad_norm": 1.051324486732483, + "learning_rate": 1.607445008460237e-05, + "loss": 0.7207, + "step": 12031 + }, + { + "epoch": 6.786238014664411, + "grad_norm": 1.1704143285751343, + "learning_rate": 1.6071630005640158e-05, + "loss": 0.7504, + "step": 12032 + }, + { + "epoch": 6.786802030456853, + "grad_norm": 1.5240975618362427, + "learning_rate": 1.6068809926677947e-05, + "loss": 0.6902, + "step": 12033 + }, + { + "epoch": 6.787366046249295, + "grad_norm": 1.4747394323349, + "learning_rate": 1.606598984771574e-05, + "loss": 0.808, + "step": 12034 + }, + { + "epoch": 6.787930062041737, + "grad_norm": 1.1227062940597534, + "learning_rate": 1.6063169768753524e-05, + "loss": 0.7806, + "step": 12035 + }, + { + "epoch": 6.788494077834179, + "grad_norm": 1.2200614213943481, + "learning_rate": 1.6060349689791316e-05, + "loss": 0.7413, + "step": 12036 + }, + { + "epoch": 6.789058093626622, + "grad_norm": 0.761772096157074, + "learning_rate": 1.6057529610829102e-05, + "loss": 0.6619, + "step": 12037 + }, + { + "epoch": 6.789622109419064, + "grad_norm": 0.8537749648094177, + "learning_rate": 1.6054709531866894e-05, + "loss": 0.6891, + "step": 12038 + }, + { + "epoch": 6.790186125211505, + "grad_norm": 1.1695774793624878, + "learning_rate": 1.605188945290468e-05, + "loss": 0.7595, + "step": 12039 + }, + { + "epoch": 6.790750141003948, + "grad_norm": 1.2866666316986084, + "learning_rate": 1.604906937394247e-05, + "loss": 0.7749, + "step": 12040 + }, + { + "epoch": 6.79131415679639, + "grad_norm": 0.8166520595550537, + "learning_rate": 1.604624929498026e-05, + "loss": 0.6557, + "step": 12041 + }, + { + "epoch": 6.791878172588833, + "grad_norm": 1.4129036664962769, + "learning_rate": 1.604342921601805e-05, + "loss": 0.7588, + "step": 12042 + }, + { + "epoch": 6.7924421883812744, + "grad_norm": 1.2101483345031738, + "learning_rate": 1.6040609137055838e-05, + "loss": 0.7556, + "step": 12043 + }, + { + "epoch": 6.793006204173717, + "grad_norm": 1.0158226490020752, + "learning_rate": 1.6037789058093627e-05, + "loss": 0.7355, + "step": 12044 + }, + { + "epoch": 6.793570219966159, + "grad_norm": 1.26964271068573, + "learning_rate": 1.603496897913142e-05, + "loss": 0.8451, + "step": 12045 + }, + { + "epoch": 6.794134235758602, + "grad_norm": 1.4309790134429932, + "learning_rate": 1.6032148900169204e-05, + "loss": 0.7635, + "step": 12046 + }, + { + "epoch": 6.7946982515510435, + "grad_norm": 1.1323765516281128, + "learning_rate": 1.6029328821206996e-05, + "loss": 0.7521, + "step": 12047 + }, + { + "epoch": 6.795262267343485, + "grad_norm": 1.1336684226989746, + "learning_rate": 1.6026508742244782e-05, + "loss": 0.8257, + "step": 12048 + }, + { + "epoch": 6.795826283135928, + "grad_norm": 1.4156049489974976, + "learning_rate": 1.6023688663282574e-05, + "loss": 0.7307, + "step": 12049 + }, + { + "epoch": 6.79639029892837, + "grad_norm": 0.9043057560920715, + "learning_rate": 1.6020868584320363e-05, + "loss": 0.7342, + "step": 12050 + }, + { + "epoch": 6.7969543147208125, + "grad_norm": 1.0268958806991577, + "learning_rate": 1.601804850535815e-05, + "loss": 0.6866, + "step": 12051 + }, + { + "epoch": 6.797518330513254, + "grad_norm": 0.7228097915649414, + "learning_rate": 1.601522842639594e-05, + "loss": 0.6114, + "step": 12052 + }, + { + "epoch": 6.798082346305696, + "grad_norm": 1.627173900604248, + "learning_rate": 1.601240834743373e-05, + "loss": 0.8773, + "step": 12053 + }, + { + "epoch": 6.798646362098139, + "grad_norm": 1.4999698400497437, + "learning_rate": 1.6009588268471518e-05, + "loss": 0.7471, + "step": 12054 + }, + { + "epoch": 6.799210377890581, + "grad_norm": 1.552716851234436, + "learning_rate": 1.6006768189509307e-05, + "loss": 0.9367, + "step": 12055 + }, + { + "epoch": 6.799774393683023, + "grad_norm": 1.4698048830032349, + "learning_rate": 1.6003948110547096e-05, + "loss": 0.7637, + "step": 12056 + }, + { + "epoch": 6.800338409475465, + "grad_norm": 2.2949631214141846, + "learning_rate": 1.6001128031584884e-05, + "loss": 0.7736, + "step": 12057 + }, + { + "epoch": 6.800902425267908, + "grad_norm": 1.1044715642929077, + "learning_rate": 1.5998307952622673e-05, + "loss": 0.7547, + "step": 12058 + }, + { + "epoch": 6.80146644106035, + "grad_norm": 1.099547266960144, + "learning_rate": 1.5995487873660465e-05, + "loss": 0.6987, + "step": 12059 + }, + { + "epoch": 6.802030456852792, + "grad_norm": 0.9322925209999084, + "learning_rate": 1.599266779469825e-05, + "loss": 0.7018, + "step": 12060 + }, + { + "epoch": 6.802594472645234, + "grad_norm": 1.3586056232452393, + "learning_rate": 1.5989847715736043e-05, + "loss": 0.6129, + "step": 12061 + }, + { + "epoch": 6.803158488437676, + "grad_norm": 1.1915289163589478, + "learning_rate": 1.598702763677383e-05, + "loss": 0.7265, + "step": 12062 + }, + { + "epoch": 6.803722504230119, + "grad_norm": 1.0521442890167236, + "learning_rate": 1.598420755781162e-05, + "loss": 0.6751, + "step": 12063 + }, + { + "epoch": 6.80428652002256, + "grad_norm": 1.2619035243988037, + "learning_rate": 1.598138747884941e-05, + "loss": 0.7714, + "step": 12064 + }, + { + "epoch": 6.804850535815003, + "grad_norm": 1.3898850679397583, + "learning_rate": 1.5978567399887198e-05, + "loss": 0.7472, + "step": 12065 + }, + { + "epoch": 6.805414551607445, + "grad_norm": 1.0419784784317017, + "learning_rate": 1.5975747320924987e-05, + "loss": 0.6921, + "step": 12066 + }, + { + "epoch": 6.805978567399887, + "grad_norm": 1.1099433898925781, + "learning_rate": 1.5972927241962776e-05, + "loss": 0.7284, + "step": 12067 + }, + { + "epoch": 6.806542583192329, + "grad_norm": 1.0323047637939453, + "learning_rate": 1.5970107163000565e-05, + "loss": 0.6838, + "step": 12068 + }, + { + "epoch": 6.807106598984771, + "grad_norm": 1.0740641355514526, + "learning_rate": 1.5967287084038353e-05, + "loss": 0.6255, + "step": 12069 + }, + { + "epoch": 6.807670614777214, + "grad_norm": 1.2435932159423828, + "learning_rate": 1.5964467005076142e-05, + "loss": 0.9055, + "step": 12070 + }, + { + "epoch": 6.808234630569656, + "grad_norm": 0.8682910799980164, + "learning_rate": 1.596164692611393e-05, + "loss": 0.7447, + "step": 12071 + }, + { + "epoch": 6.808798646362098, + "grad_norm": 1.105215311050415, + "learning_rate": 1.595882684715172e-05, + "loss": 0.8016, + "step": 12072 + }, + { + "epoch": 6.80936266215454, + "grad_norm": 0.9605927467346191, + "learning_rate": 1.5956006768189512e-05, + "loss": 0.7825, + "step": 12073 + }, + { + "epoch": 6.809926677946983, + "grad_norm": 1.1178650856018066, + "learning_rate": 1.5953186689227297e-05, + "loss": 0.7559, + "step": 12074 + }, + { + "epoch": 6.810490693739425, + "grad_norm": 1.0733721256256104, + "learning_rate": 1.595036661026509e-05, + "loss": 0.7067, + "step": 12075 + }, + { + "epoch": 6.8110547095318665, + "grad_norm": 1.0194048881530762, + "learning_rate": 1.5947546531302875e-05, + "loss": 0.7289, + "step": 12076 + }, + { + "epoch": 6.811618725324309, + "grad_norm": 1.2136807441711426, + "learning_rate": 1.5944726452340667e-05, + "loss": 0.704, + "step": 12077 + }, + { + "epoch": 6.812182741116751, + "grad_norm": 1.3465763330459595, + "learning_rate": 1.5941906373378453e-05, + "loss": 0.6812, + "step": 12078 + }, + { + "epoch": 6.812746756909194, + "grad_norm": 0.988160252571106, + "learning_rate": 1.5939086294416245e-05, + "loss": 0.7658, + "step": 12079 + }, + { + "epoch": 6.8133107727016355, + "grad_norm": 1.4429454803466797, + "learning_rate": 1.5936266215454034e-05, + "loss": 0.8197, + "step": 12080 + }, + { + "epoch": 6.813874788494077, + "grad_norm": 1.4531077146530151, + "learning_rate": 1.5933446136491822e-05, + "loss": 0.7938, + "step": 12081 + }, + { + "epoch": 6.81443880428652, + "grad_norm": 1.6316996812820435, + "learning_rate": 1.5930626057529615e-05, + "loss": 0.8391, + "step": 12082 + }, + { + "epoch": 6.815002820078962, + "grad_norm": 1.5822468996047974, + "learning_rate": 1.59278059785674e-05, + "loss": 0.8724, + "step": 12083 + }, + { + "epoch": 6.8155668358714045, + "grad_norm": 1.1147202253341675, + "learning_rate": 1.5924985899605192e-05, + "loss": 0.7344, + "step": 12084 + }, + { + "epoch": 6.816130851663846, + "grad_norm": 1.2197446823120117, + "learning_rate": 1.5922165820642978e-05, + "loss": 0.8572, + "step": 12085 + }, + { + "epoch": 6.816694867456289, + "grad_norm": 1.230939507484436, + "learning_rate": 1.591934574168077e-05, + "loss": 0.8161, + "step": 12086 + }, + { + "epoch": 6.817258883248731, + "grad_norm": 0.9122169613838196, + "learning_rate": 1.5916525662718555e-05, + "loss": 0.6929, + "step": 12087 + }, + { + "epoch": 6.817822899041174, + "grad_norm": 1.1308848857879639, + "learning_rate": 1.5913705583756347e-05, + "loss": 0.7746, + "step": 12088 + }, + { + "epoch": 6.818386914833615, + "grad_norm": 1.1083885431289673, + "learning_rate": 1.5910885504794136e-05, + "loss": 0.735, + "step": 12089 + }, + { + "epoch": 6.818950930626057, + "grad_norm": 1.0019201040267944, + "learning_rate": 1.5908065425831925e-05, + "loss": 0.6967, + "step": 12090 + }, + { + "epoch": 6.8195149464185, + "grad_norm": 1.2886688709259033, + "learning_rate": 1.5905245346869714e-05, + "loss": 0.762, + "step": 12091 + }, + { + "epoch": 6.820078962210942, + "grad_norm": 1.1407853364944458, + "learning_rate": 1.5902425267907502e-05, + "loss": 0.6913, + "step": 12092 + }, + { + "epoch": 6.820642978003384, + "grad_norm": 1.220723032951355, + "learning_rate": 1.589960518894529e-05, + "loss": 0.7867, + "step": 12093 + }, + { + "epoch": 6.821206993795826, + "grad_norm": 1.009348750114441, + "learning_rate": 1.589678510998308e-05, + "loss": 0.7918, + "step": 12094 + }, + { + "epoch": 6.821771009588268, + "grad_norm": 1.3878297805786133, + "learning_rate": 1.589396503102087e-05, + "loss": 0.863, + "step": 12095 + }, + { + "epoch": 6.822335025380711, + "grad_norm": 1.3866273164749146, + "learning_rate": 1.5891144952058658e-05, + "loss": 0.8231, + "step": 12096 + }, + { + "epoch": 6.8228990411731525, + "grad_norm": 1.277520775794983, + "learning_rate": 1.5888324873096446e-05, + "loss": 0.7542, + "step": 12097 + }, + { + "epoch": 6.823463056965595, + "grad_norm": 1.1213291883468628, + "learning_rate": 1.588550479413424e-05, + "loss": 0.6192, + "step": 12098 + }, + { + "epoch": 6.824027072758037, + "grad_norm": 1.3158866167068481, + "learning_rate": 1.5882684715172024e-05, + "loss": 0.859, + "step": 12099 + }, + { + "epoch": 6.82459108855048, + "grad_norm": 1.250327467918396, + "learning_rate": 1.5879864636209816e-05, + "loss": 0.7612, + "step": 12100 + }, + { + "epoch": 6.8251551043429215, + "grad_norm": 1.2095463275909424, + "learning_rate": 1.58770445572476e-05, + "loss": 0.7219, + "step": 12101 + }, + { + "epoch": 6.825719120135364, + "grad_norm": 1.1158812046051025, + "learning_rate": 1.5874224478285394e-05, + "loss": 0.8032, + "step": 12102 + }, + { + "epoch": 6.826283135927806, + "grad_norm": 0.9281573295593262, + "learning_rate": 1.587140439932318e-05, + "loss": 0.6856, + "step": 12103 + }, + { + "epoch": 6.826847151720248, + "grad_norm": 1.2131984233856201, + "learning_rate": 1.586858432036097e-05, + "loss": 0.8595, + "step": 12104 + }, + { + "epoch": 6.8274111675126905, + "grad_norm": 1.0385563373565674, + "learning_rate": 1.586576424139876e-05, + "loss": 0.8299, + "step": 12105 + }, + { + "epoch": 6.827975183305132, + "grad_norm": 1.185437798500061, + "learning_rate": 1.586294416243655e-05, + "loss": 0.7151, + "step": 12106 + }, + { + "epoch": 6.828539199097575, + "grad_norm": 1.0254478454589844, + "learning_rate": 1.5860124083474338e-05, + "loss": 0.75, + "step": 12107 + }, + { + "epoch": 6.829103214890017, + "grad_norm": 0.705250084400177, + "learning_rate": 1.5857304004512127e-05, + "loss": 0.5976, + "step": 12108 + }, + { + "epoch": 6.829667230682459, + "grad_norm": 1.0277806520462036, + "learning_rate": 1.5854483925549915e-05, + "loss": 0.7698, + "step": 12109 + }, + { + "epoch": 6.830231246474901, + "grad_norm": 1.0059525966644287, + "learning_rate": 1.5851663846587704e-05, + "loss": 0.7302, + "step": 12110 + }, + { + "epoch": 6.830795262267343, + "grad_norm": 1.137864351272583, + "learning_rate": 1.5848843767625493e-05, + "loss": 0.6884, + "step": 12111 + }, + { + "epoch": 6.831359278059786, + "grad_norm": 1.2766501903533936, + "learning_rate": 1.5846023688663282e-05, + "loss": 0.7302, + "step": 12112 + }, + { + "epoch": 6.831923293852228, + "grad_norm": 0.8867117166519165, + "learning_rate": 1.584320360970107e-05, + "loss": 0.7349, + "step": 12113 + }, + { + "epoch": 6.83248730964467, + "grad_norm": 1.14566969871521, + "learning_rate": 1.5840383530738863e-05, + "loss": 0.7211, + "step": 12114 + }, + { + "epoch": 6.833051325437112, + "grad_norm": 0.8875351548194885, + "learning_rate": 1.5837563451776648e-05, + "loss": 0.6239, + "step": 12115 + }, + { + "epoch": 6.833615341229555, + "grad_norm": 0.9386813640594482, + "learning_rate": 1.583474337281444e-05, + "loss": 0.8258, + "step": 12116 + }, + { + "epoch": 6.834179357021997, + "grad_norm": 1.3048312664031982, + "learning_rate": 1.583192329385223e-05, + "loss": 0.7724, + "step": 12117 + }, + { + "epoch": 6.8347433728144384, + "grad_norm": 1.4212875366210938, + "learning_rate": 1.5829103214890018e-05, + "loss": 0.8175, + "step": 12118 + }, + { + "epoch": 6.835307388606881, + "grad_norm": 1.2275582551956177, + "learning_rate": 1.5826283135927807e-05, + "loss": 0.8608, + "step": 12119 + }, + { + "epoch": 6.835871404399323, + "grad_norm": 0.906607985496521, + "learning_rate": 1.5823463056965596e-05, + "loss": 0.738, + "step": 12120 + }, + { + "epoch": 6.836435420191766, + "grad_norm": 1.11175537109375, + "learning_rate": 1.5820642978003384e-05, + "loss": 0.7509, + "step": 12121 + }, + { + "epoch": 6.8369994359842075, + "grad_norm": 1.0476912260055542, + "learning_rate": 1.5817822899041173e-05, + "loss": 0.6565, + "step": 12122 + }, + { + "epoch": 6.837563451776649, + "grad_norm": 0.8602321743965149, + "learning_rate": 1.5815002820078965e-05, + "loss": 0.7245, + "step": 12123 + }, + { + "epoch": 6.838127467569092, + "grad_norm": 1.4921982288360596, + "learning_rate": 1.581218274111675e-05, + "loss": 0.8228, + "step": 12124 + }, + { + "epoch": 6.838691483361534, + "grad_norm": 1.3067995309829712, + "learning_rate": 1.5809362662154543e-05, + "loss": 0.6716, + "step": 12125 + }, + { + "epoch": 6.8392554991539765, + "grad_norm": 1.1752562522888184, + "learning_rate": 1.580654258319233e-05, + "loss": 0.6716, + "step": 12126 + }, + { + "epoch": 6.839819514946418, + "grad_norm": 1.055769920349121, + "learning_rate": 1.580372250423012e-05, + "loss": 0.7449, + "step": 12127 + }, + { + "epoch": 6.840383530738861, + "grad_norm": 0.8720740675926208, + "learning_rate": 1.580090242526791e-05, + "loss": 0.7248, + "step": 12128 + }, + { + "epoch": 6.840947546531303, + "grad_norm": 1.2506515979766846, + "learning_rate": 1.5798082346305698e-05, + "loss": 0.6387, + "step": 12129 + }, + { + "epoch": 6.8415115623237455, + "grad_norm": 0.9882411956787109, + "learning_rate": 1.5795262267343487e-05, + "loss": 0.681, + "step": 12130 + }, + { + "epoch": 6.842075578116187, + "grad_norm": 1.5743666887283325, + "learning_rate": 1.5792442188381276e-05, + "loss": 0.8786, + "step": 12131 + }, + { + "epoch": 6.842639593908629, + "grad_norm": 1.1333060264587402, + "learning_rate": 1.5789622109419065e-05, + "loss": 0.7632, + "step": 12132 + }, + { + "epoch": 6.843203609701072, + "grad_norm": 1.1439166069030762, + "learning_rate": 1.5786802030456853e-05, + "loss": 0.7062, + "step": 12133 + }, + { + "epoch": 6.843767625493514, + "grad_norm": 1.2170647382736206, + "learning_rate": 1.5783981951494642e-05, + "loss": 0.6604, + "step": 12134 + }, + { + "epoch": 6.844331641285956, + "grad_norm": 1.0562463998794556, + "learning_rate": 1.578116187253243e-05, + "loss": 0.692, + "step": 12135 + }, + { + "epoch": 6.844895657078398, + "grad_norm": 1.1696301698684692, + "learning_rate": 1.577834179357022e-05, + "loss": 0.8021, + "step": 12136 + }, + { + "epoch": 6.84545967287084, + "grad_norm": 0.8927245736122131, + "learning_rate": 1.5775521714608012e-05, + "loss": 0.7439, + "step": 12137 + }, + { + "epoch": 6.846023688663283, + "grad_norm": 1.084938406944275, + "learning_rate": 1.5772701635645797e-05, + "loss": 0.7387, + "step": 12138 + }, + { + "epoch": 6.846587704455724, + "grad_norm": 1.3067578077316284, + "learning_rate": 1.576988155668359e-05, + "loss": 0.7517, + "step": 12139 + }, + { + "epoch": 6.847151720248167, + "grad_norm": 0.7939077615737915, + "learning_rate": 1.5767061477721375e-05, + "loss": 0.6405, + "step": 12140 + }, + { + "epoch": 6.847715736040609, + "grad_norm": 1.1561311483383179, + "learning_rate": 1.5764241398759167e-05, + "loss": 0.8005, + "step": 12141 + }, + { + "epoch": 6.848279751833052, + "grad_norm": 1.1801658868789673, + "learning_rate": 1.5761421319796952e-05, + "loss": 0.7565, + "step": 12142 + }, + { + "epoch": 6.848843767625493, + "grad_norm": 1.0325775146484375, + "learning_rate": 1.5758601240834745e-05, + "loss": 0.7026, + "step": 12143 + }, + { + "epoch": 6.849407783417936, + "grad_norm": 1.0269370079040527, + "learning_rate": 1.5755781161872533e-05, + "loss": 0.7411, + "step": 12144 + }, + { + "epoch": 6.849971799210378, + "grad_norm": 1.4095909595489502, + "learning_rate": 1.5752961082910322e-05, + "loss": 0.7682, + "step": 12145 + }, + { + "epoch": 6.85053581500282, + "grad_norm": 1.193212866783142, + "learning_rate": 1.575014100394811e-05, + "loss": 0.7676, + "step": 12146 + }, + { + "epoch": 6.851099830795262, + "grad_norm": 1.7237969636917114, + "learning_rate": 1.57473209249859e-05, + "loss": 0.7512, + "step": 12147 + }, + { + "epoch": 6.851663846587704, + "grad_norm": 1.1824653148651123, + "learning_rate": 1.574450084602369e-05, + "loss": 0.691, + "step": 12148 + }, + { + "epoch": 6.852227862380147, + "grad_norm": 1.222619652748108, + "learning_rate": 1.5741680767061477e-05, + "loss": 0.7353, + "step": 12149 + }, + { + "epoch": 6.852791878172589, + "grad_norm": 1.3757532835006714, + "learning_rate": 1.5738860688099266e-05, + "loss": 0.7246, + "step": 12150 + }, + { + "epoch": 6.8533558939650305, + "grad_norm": 0.980056643486023, + "learning_rate": 1.5736040609137055e-05, + "loss": 0.7793, + "step": 12151 + }, + { + "epoch": 6.853919909757473, + "grad_norm": 0.8622355461120605, + "learning_rate": 1.5733220530174847e-05, + "loss": 0.7163, + "step": 12152 + }, + { + "epoch": 6.854483925549915, + "grad_norm": 1.6506214141845703, + "learning_rate": 1.5730400451212636e-05, + "loss": 0.8262, + "step": 12153 + }, + { + "epoch": 6.855047941342358, + "grad_norm": 0.878420352935791, + "learning_rate": 1.5727580372250425e-05, + "loss": 0.7021, + "step": 12154 + }, + { + "epoch": 6.8556119571347995, + "grad_norm": 0.7650535106658936, + "learning_rate": 1.5724760293288214e-05, + "loss": 0.6606, + "step": 12155 + }, + { + "epoch": 6.856175972927242, + "grad_norm": 0.8452345728874207, + "learning_rate": 1.5721940214326002e-05, + "loss": 0.7042, + "step": 12156 + }, + { + "epoch": 6.856739988719684, + "grad_norm": 1.19624924659729, + "learning_rate": 1.571912013536379e-05, + "loss": 0.7647, + "step": 12157 + }, + { + "epoch": 6.857304004512127, + "grad_norm": 1.4280444383621216, + "learning_rate": 1.571630005640158e-05, + "loss": 0.7386, + "step": 12158 + }, + { + "epoch": 6.8578680203045685, + "grad_norm": 1.428678035736084, + "learning_rate": 1.571347997743937e-05, + "loss": 0.7845, + "step": 12159 + }, + { + "epoch": 6.85843203609701, + "grad_norm": 1.0567375421524048, + "learning_rate": 1.5710659898477158e-05, + "loss": 0.7156, + "step": 12160 + }, + { + "epoch": 6.858996051889453, + "grad_norm": 1.2322505712509155, + "learning_rate": 1.5707839819514946e-05, + "loss": 0.6894, + "step": 12161 + }, + { + "epoch": 6.859560067681895, + "grad_norm": 1.6254652738571167, + "learning_rate": 1.570501974055274e-05, + "loss": 0.8487, + "step": 12162 + }, + { + "epoch": 6.8601240834743376, + "grad_norm": 1.0416280031204224, + "learning_rate": 1.5702199661590524e-05, + "loss": 0.7405, + "step": 12163 + }, + { + "epoch": 6.860688099266779, + "grad_norm": 1.4111664295196533, + "learning_rate": 1.5699379582628316e-05, + "loss": 0.8422, + "step": 12164 + }, + { + "epoch": 6.861252115059221, + "grad_norm": 1.6377243995666504, + "learning_rate": 1.56965595036661e-05, + "loss": 0.7342, + "step": 12165 + }, + { + "epoch": 6.861816130851664, + "grad_norm": 1.2043700218200684, + "learning_rate": 1.5693739424703894e-05, + "loss": 0.7283, + "step": 12166 + }, + { + "epoch": 6.862380146644106, + "grad_norm": 1.294485092163086, + "learning_rate": 1.5690919345741683e-05, + "loss": 0.7143, + "step": 12167 + }, + { + "epoch": 6.862944162436548, + "grad_norm": 1.1887420415878296, + "learning_rate": 1.568809926677947e-05, + "loss": 0.6867, + "step": 12168 + }, + { + "epoch": 6.86350817822899, + "grad_norm": 0.9552145004272461, + "learning_rate": 1.568527918781726e-05, + "loss": 0.8144, + "step": 12169 + }, + { + "epoch": 6.864072194021433, + "grad_norm": 1.1259446144104004, + "learning_rate": 1.568245910885505e-05, + "loss": 0.7293, + "step": 12170 + }, + { + "epoch": 6.864636209813875, + "grad_norm": 0.953292727470398, + "learning_rate": 1.5679639029892838e-05, + "loss": 0.6933, + "step": 12171 + }, + { + "epoch": 6.865200225606317, + "grad_norm": 1.600394606590271, + "learning_rate": 1.5676818950930627e-05, + "loss": 0.6917, + "step": 12172 + }, + { + "epoch": 6.865764241398759, + "grad_norm": 1.1951346397399902, + "learning_rate": 1.5673998871968415e-05, + "loss": 0.6429, + "step": 12173 + }, + { + "epoch": 6.866328257191201, + "grad_norm": 0.9430506825447083, + "learning_rate": 1.5671178793006204e-05, + "loss": 0.7083, + "step": 12174 + }, + { + "epoch": 6.866892272983644, + "grad_norm": 1.055996060371399, + "learning_rate": 1.5668358714043993e-05, + "loss": 0.7276, + "step": 12175 + }, + { + "epoch": 6.8674562887760855, + "grad_norm": 1.5668764114379883, + "learning_rate": 1.5665538635081785e-05, + "loss": 0.8026, + "step": 12176 + }, + { + "epoch": 6.868020304568528, + "grad_norm": 1.2934317588806152, + "learning_rate": 1.566271855611957e-05, + "loss": 0.7046, + "step": 12177 + }, + { + "epoch": 6.86858432036097, + "grad_norm": 1.7377488613128662, + "learning_rate": 1.5659898477157363e-05, + "loss": 0.8419, + "step": 12178 + }, + { + "epoch": 6.869148336153412, + "grad_norm": 1.3985024690628052, + "learning_rate": 1.5657078398195148e-05, + "loss": 0.6836, + "step": 12179 + }, + { + "epoch": 6.8697123519458545, + "grad_norm": 0.8976640105247498, + "learning_rate": 1.565425831923294e-05, + "loss": 0.6877, + "step": 12180 + }, + { + "epoch": 6.870276367738296, + "grad_norm": 1.00959312915802, + "learning_rate": 1.5651438240270726e-05, + "loss": 0.7221, + "step": 12181 + }, + { + "epoch": 6.870840383530739, + "grad_norm": 1.6192387342453003, + "learning_rate": 1.5648618161308518e-05, + "loss": 0.8345, + "step": 12182 + }, + { + "epoch": 6.871404399323181, + "grad_norm": 1.0881561040878296, + "learning_rate": 1.5645798082346307e-05, + "loss": 0.7604, + "step": 12183 + }, + { + "epoch": 6.8719684151156235, + "grad_norm": 0.9068067073822021, + "learning_rate": 1.5642978003384095e-05, + "loss": 0.7127, + "step": 12184 + }, + { + "epoch": 6.872532430908065, + "grad_norm": 1.2519536018371582, + "learning_rate": 1.5640157924421884e-05, + "loss": 0.7539, + "step": 12185 + }, + { + "epoch": 6.873096446700508, + "grad_norm": 0.9088394641876221, + "learning_rate": 1.5637337845459673e-05, + "loss": 0.6784, + "step": 12186 + }, + { + "epoch": 6.87366046249295, + "grad_norm": 1.0371125936508179, + "learning_rate": 1.5634517766497465e-05, + "loss": 0.8096, + "step": 12187 + }, + { + "epoch": 6.874224478285392, + "grad_norm": 0.8978805541992188, + "learning_rate": 1.563169768753525e-05, + "loss": 0.7529, + "step": 12188 + }, + { + "epoch": 6.874788494077834, + "grad_norm": 1.2320038080215454, + "learning_rate": 1.5628877608573043e-05, + "loss": 0.8811, + "step": 12189 + }, + { + "epoch": 6.875352509870276, + "grad_norm": 1.2459043264389038, + "learning_rate": 1.5626057529610828e-05, + "loss": 0.6675, + "step": 12190 + }, + { + "epoch": 6.875916525662719, + "grad_norm": 1.3726308345794678, + "learning_rate": 1.562323745064862e-05, + "loss": 0.7149, + "step": 12191 + }, + { + "epoch": 6.876480541455161, + "grad_norm": 1.3527086973190308, + "learning_rate": 1.562041737168641e-05, + "loss": 0.7015, + "step": 12192 + }, + { + "epoch": 6.877044557247602, + "grad_norm": 0.9732446670532227, + "learning_rate": 1.5617597292724198e-05, + "loss": 0.8229, + "step": 12193 + }, + { + "epoch": 6.877608573040045, + "grad_norm": 1.0623514652252197, + "learning_rate": 1.5614777213761987e-05, + "loss": 0.7083, + "step": 12194 + }, + { + "epoch": 6.878172588832487, + "grad_norm": 1.1311495304107666, + "learning_rate": 1.5611957134799776e-05, + "loss": 0.638, + "step": 12195 + }, + { + "epoch": 6.87873660462493, + "grad_norm": 1.2973524332046509, + "learning_rate": 1.5609137055837564e-05, + "loss": 0.7379, + "step": 12196 + }, + { + "epoch": 6.8793006204173714, + "grad_norm": 1.168189525604248, + "learning_rate": 1.5606316976875353e-05, + "loss": 0.8686, + "step": 12197 + }, + { + "epoch": 6.879864636209814, + "grad_norm": 1.2548191547393799, + "learning_rate": 1.5603496897913142e-05, + "loss": 0.8165, + "step": 12198 + }, + { + "epoch": 6.880428652002256, + "grad_norm": 1.232563853263855, + "learning_rate": 1.560067681895093e-05, + "loss": 0.7606, + "step": 12199 + }, + { + "epoch": 6.880992667794699, + "grad_norm": 1.4013755321502686, + "learning_rate": 1.559785673998872e-05, + "loss": 0.7033, + "step": 12200 + }, + { + "epoch": 6.8815566835871405, + "grad_norm": 1.2607249021530151, + "learning_rate": 1.5595036661026512e-05, + "loss": 0.8413, + "step": 12201 + }, + { + "epoch": 6.882120699379582, + "grad_norm": 1.0369579792022705, + "learning_rate": 1.5592216582064297e-05, + "loss": 0.7119, + "step": 12202 + }, + { + "epoch": 6.882684715172025, + "grad_norm": 1.1262983083724976, + "learning_rate": 1.558939650310209e-05, + "loss": 0.8115, + "step": 12203 + }, + { + "epoch": 6.883248730964467, + "grad_norm": 1.016692042350769, + "learning_rate": 1.5586576424139875e-05, + "loss": 0.7461, + "step": 12204 + }, + { + "epoch": 6.8838127467569095, + "grad_norm": 1.2567055225372314, + "learning_rate": 1.5583756345177667e-05, + "loss": 0.7472, + "step": 12205 + }, + { + "epoch": 6.884376762549351, + "grad_norm": 1.2450385093688965, + "learning_rate": 1.5580936266215452e-05, + "loss": 0.7595, + "step": 12206 + }, + { + "epoch": 6.884940778341793, + "grad_norm": 1.2452493906021118, + "learning_rate": 1.5578116187253245e-05, + "loss": 0.705, + "step": 12207 + }, + { + "epoch": 6.885504794134236, + "grad_norm": 1.4742306470870972, + "learning_rate": 1.5575296108291033e-05, + "loss": 0.7634, + "step": 12208 + }, + { + "epoch": 6.886068809926678, + "grad_norm": 1.3610490560531616, + "learning_rate": 1.5572476029328822e-05, + "loss": 0.7057, + "step": 12209 + }, + { + "epoch": 6.88663282571912, + "grad_norm": 1.3949804306030273, + "learning_rate": 1.556965595036661e-05, + "loss": 0.7879, + "step": 12210 + }, + { + "epoch": 6.887196841511562, + "grad_norm": 0.9717573523521423, + "learning_rate": 1.55668358714044e-05, + "loss": 0.6611, + "step": 12211 + }, + { + "epoch": 6.887760857304005, + "grad_norm": 1.4165191650390625, + "learning_rate": 1.556401579244219e-05, + "loss": 0.7429, + "step": 12212 + }, + { + "epoch": 6.888324873096447, + "grad_norm": 1.0367436408996582, + "learning_rate": 1.5561195713479977e-05, + "loss": 0.841, + "step": 12213 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.4728318452835083, + "learning_rate": 1.5558375634517766e-05, + "loss": 0.7962, + "step": 12214 + }, + { + "epoch": 6.889452904681331, + "grad_norm": 0.8707613348960876, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.6454, + "step": 12215 + }, + { + "epoch": 6.890016920473773, + "grad_norm": 0.9514728784561157, + "learning_rate": 1.5552735476593344e-05, + "loss": 0.6662, + "step": 12216 + }, + { + "epoch": 6.890580936266216, + "grad_norm": 1.2602282762527466, + "learning_rate": 1.5549915397631136e-05, + "loss": 0.7735, + "step": 12217 + }, + { + "epoch": 6.891144952058657, + "grad_norm": 0.8678683042526245, + "learning_rate": 1.554709531866892e-05, + "loss": 0.647, + "step": 12218 + }, + { + "epoch": 6.8917089678511, + "grad_norm": 1.275284767150879, + "learning_rate": 1.5544275239706714e-05, + "loss": 0.7935, + "step": 12219 + }, + { + "epoch": 6.892272983643542, + "grad_norm": 1.2598658800125122, + "learning_rate": 1.55414551607445e-05, + "loss": 0.6715, + "step": 12220 + }, + { + "epoch": 6.892836999435984, + "grad_norm": 1.131442904472351, + "learning_rate": 1.553863508178229e-05, + "loss": 0.7481, + "step": 12221 + }, + { + "epoch": 6.893401015228426, + "grad_norm": 1.1330649852752686, + "learning_rate": 1.553581500282008e-05, + "loss": 0.8423, + "step": 12222 + }, + { + "epoch": 6.893965031020868, + "grad_norm": 1.5228614807128906, + "learning_rate": 1.553299492385787e-05, + "loss": 0.8203, + "step": 12223 + }, + { + "epoch": 6.894529046813311, + "grad_norm": 1.2071776390075684, + "learning_rate": 1.5530174844895658e-05, + "loss": 0.761, + "step": 12224 + }, + { + "epoch": 6.895093062605753, + "grad_norm": 0.9679938554763794, + "learning_rate": 1.5527354765933446e-05, + "loss": 0.7518, + "step": 12225 + }, + { + "epoch": 6.895657078398195, + "grad_norm": 0.942290186882019, + "learning_rate": 1.552453468697124e-05, + "loss": 0.7329, + "step": 12226 + }, + { + "epoch": 6.896221094190637, + "grad_norm": 1.2106679677963257, + "learning_rate": 1.5521714608009024e-05, + "loss": 0.6575, + "step": 12227 + }, + { + "epoch": 6.89678510998308, + "grad_norm": 1.0960204601287842, + "learning_rate": 1.5518894529046816e-05, + "loss": 0.7725, + "step": 12228 + }, + { + "epoch": 6.897349125775522, + "grad_norm": 1.13601553440094, + "learning_rate": 1.55160744500846e-05, + "loss": 0.7231, + "step": 12229 + }, + { + "epoch": 6.8979131415679635, + "grad_norm": 0.8837463855743408, + "learning_rate": 1.5513254371122394e-05, + "loss": 0.7502, + "step": 12230 + }, + { + "epoch": 6.898477157360406, + "grad_norm": 1.4192854166030884, + "learning_rate": 1.5510434292160182e-05, + "loss": 0.7839, + "step": 12231 + }, + { + "epoch": 6.899041173152848, + "grad_norm": 0.9268853664398193, + "learning_rate": 1.550761421319797e-05, + "loss": 0.7328, + "step": 12232 + }, + { + "epoch": 6.899605188945291, + "grad_norm": 1.2965881824493408, + "learning_rate": 1.550479413423576e-05, + "loss": 0.8614, + "step": 12233 + }, + { + "epoch": 6.9001692047377325, + "grad_norm": 1.4326672554016113, + "learning_rate": 1.550197405527355e-05, + "loss": 0.8419, + "step": 12234 + }, + { + "epoch": 6.900733220530174, + "grad_norm": 0.9377231597900391, + "learning_rate": 1.5499153976311338e-05, + "loss": 0.7449, + "step": 12235 + }, + { + "epoch": 6.901297236322617, + "grad_norm": 1.4138544797897339, + "learning_rate": 1.5496333897349126e-05, + "loss": 0.8438, + "step": 12236 + }, + { + "epoch": 6.901861252115059, + "grad_norm": 1.0755475759506226, + "learning_rate": 1.5493513818386915e-05, + "loss": 0.7863, + "step": 12237 + }, + { + "epoch": 6.9024252679075015, + "grad_norm": 1.670078158378601, + "learning_rate": 1.5490693739424704e-05, + "loss": 0.8115, + "step": 12238 + }, + { + "epoch": 6.902989283699943, + "grad_norm": 0.9683701395988464, + "learning_rate": 1.5487873660462493e-05, + "loss": 0.7495, + "step": 12239 + }, + { + "epoch": 6.903553299492386, + "grad_norm": 0.9168999195098877, + "learning_rate": 1.5485053581500285e-05, + "loss": 0.7364, + "step": 12240 + }, + { + "epoch": 6.904117315284828, + "grad_norm": 1.19391930103302, + "learning_rate": 1.548223350253807e-05, + "loss": 0.7133, + "step": 12241 + }, + { + "epoch": 6.904681331077271, + "grad_norm": 0.9328383803367615, + "learning_rate": 1.5479413423575863e-05, + "loss": 0.6738, + "step": 12242 + }, + { + "epoch": 6.905245346869712, + "grad_norm": 1.0205820798873901, + "learning_rate": 1.5476593344613648e-05, + "loss": 0.6227, + "step": 12243 + }, + { + "epoch": 6.905809362662154, + "grad_norm": 1.0881997346878052, + "learning_rate": 1.547377326565144e-05, + "loss": 0.7216, + "step": 12244 + }, + { + "epoch": 6.906373378454597, + "grad_norm": 1.4139732122421265, + "learning_rate": 1.5470953186689226e-05, + "loss": 0.7896, + "step": 12245 + }, + { + "epoch": 6.906937394247039, + "grad_norm": 1.1154041290283203, + "learning_rate": 1.5468133107727018e-05, + "loss": 0.743, + "step": 12246 + }, + { + "epoch": 6.907501410039481, + "grad_norm": 1.6297829151153564, + "learning_rate": 1.5465313028764807e-05, + "loss": 0.7762, + "step": 12247 + }, + { + "epoch": 6.908065425831923, + "grad_norm": 1.3834106922149658, + "learning_rate": 1.5462492949802595e-05, + "loss": 0.7477, + "step": 12248 + }, + { + "epoch": 6.908629441624365, + "grad_norm": 1.044568419456482, + "learning_rate": 1.5459672870840384e-05, + "loss": 0.7213, + "step": 12249 + }, + { + "epoch": 6.909193457416808, + "grad_norm": 1.2954109907150269, + "learning_rate": 1.5456852791878173e-05, + "loss": 0.7542, + "step": 12250 + }, + { + "epoch": 6.9097574732092495, + "grad_norm": 0.8105816841125488, + "learning_rate": 1.5454032712915962e-05, + "loss": 0.7576, + "step": 12251 + }, + { + "epoch": 6.910321489001692, + "grad_norm": 0.9582816362380981, + "learning_rate": 1.545121263395375e-05, + "loss": 0.6811, + "step": 12252 + }, + { + "epoch": 6.910885504794134, + "grad_norm": 1.6399402618408203, + "learning_rate": 1.544839255499154e-05, + "loss": 0.7582, + "step": 12253 + }, + { + "epoch": 6.911449520586577, + "grad_norm": 1.072389006614685, + "learning_rate": 1.5445572476029328e-05, + "loss": 0.8101, + "step": 12254 + }, + { + "epoch": 6.9120135363790185, + "grad_norm": 1.296623706817627, + "learning_rate": 1.5442752397067117e-05, + "loss": 0.67, + "step": 12255 + }, + { + "epoch": 6.912577552171461, + "grad_norm": 1.1414637565612793, + "learning_rate": 1.543993231810491e-05, + "loss": 0.6813, + "step": 12256 + }, + { + "epoch": 6.913141567963903, + "grad_norm": 1.1104364395141602, + "learning_rate": 1.5437112239142698e-05, + "loss": 0.7059, + "step": 12257 + }, + { + "epoch": 6.913705583756345, + "grad_norm": 1.1345832347869873, + "learning_rate": 1.5434292160180487e-05, + "loss": 0.6705, + "step": 12258 + }, + { + "epoch": 6.9142695995487875, + "grad_norm": 1.1080068349838257, + "learning_rate": 1.5431472081218276e-05, + "loss": 0.7156, + "step": 12259 + }, + { + "epoch": 6.914833615341229, + "grad_norm": 1.2266854047775269, + "learning_rate": 1.5428652002256064e-05, + "loss": 0.732, + "step": 12260 + }, + { + "epoch": 6.915397631133672, + "grad_norm": 1.2341233491897583, + "learning_rate": 1.5425831923293853e-05, + "loss": 0.7313, + "step": 12261 + }, + { + "epoch": 6.915961646926114, + "grad_norm": 1.0727483034133911, + "learning_rate": 1.5423011844331642e-05, + "loss": 0.5862, + "step": 12262 + }, + { + "epoch": 6.916525662718556, + "grad_norm": 1.1225260496139526, + "learning_rate": 1.542019176536943e-05, + "loss": 0.7358, + "step": 12263 + }, + { + "epoch": 6.917089678510998, + "grad_norm": 1.4711291790008545, + "learning_rate": 1.541737168640722e-05, + "loss": 0.7641, + "step": 12264 + }, + { + "epoch": 6.91765369430344, + "grad_norm": 0.9851372241973877, + "learning_rate": 1.5414551607445012e-05, + "loss": 0.6997, + "step": 12265 + }, + { + "epoch": 6.918217710095883, + "grad_norm": 1.4694056510925293, + "learning_rate": 1.5411731528482797e-05, + "loss": 0.8, + "step": 12266 + }, + { + "epoch": 6.918781725888325, + "grad_norm": 1.076521635055542, + "learning_rate": 1.540891144952059e-05, + "loss": 0.7437, + "step": 12267 + }, + { + "epoch": 6.919345741680767, + "grad_norm": 0.9614657759666443, + "learning_rate": 1.5406091370558375e-05, + "loss": 0.6799, + "step": 12268 + }, + { + "epoch": 6.919909757473209, + "grad_norm": 0.8955855965614319, + "learning_rate": 1.5403271291596167e-05, + "loss": 0.7603, + "step": 12269 + }, + { + "epoch": 6.920473773265652, + "grad_norm": 0.9974464774131775, + "learning_rate": 1.5400451212633952e-05, + "loss": 0.7501, + "step": 12270 + }, + { + "epoch": 6.921037789058094, + "grad_norm": 1.2558903694152832, + "learning_rate": 1.5397631133671744e-05, + "loss": 0.8587, + "step": 12271 + }, + { + "epoch": 6.9216018048505354, + "grad_norm": 1.1520001888275146, + "learning_rate": 1.5394811054709533e-05, + "loss": 0.7806, + "step": 12272 + }, + { + "epoch": 6.922165820642978, + "grad_norm": 1.051769733428955, + "learning_rate": 1.5391990975747322e-05, + "loss": 0.7369, + "step": 12273 + }, + { + "epoch": 6.92272983643542, + "grad_norm": 1.1470518112182617, + "learning_rate": 1.538917089678511e-05, + "loss": 0.712, + "step": 12274 + }, + { + "epoch": 6.923293852227863, + "grad_norm": 0.8681237101554871, + "learning_rate": 1.53863508178229e-05, + "loss": 0.7272, + "step": 12275 + }, + { + "epoch": 6.9238578680203045, + "grad_norm": 1.0276895761489868, + "learning_rate": 1.538353073886069e-05, + "loss": 0.7425, + "step": 12276 + }, + { + "epoch": 6.924421883812746, + "grad_norm": 1.072133183479309, + "learning_rate": 1.5380710659898477e-05, + "loss": 0.8206, + "step": 12277 + }, + { + "epoch": 6.924985899605189, + "grad_norm": 3.550786256790161, + "learning_rate": 1.5377890580936266e-05, + "loss": 0.7634, + "step": 12278 + }, + { + "epoch": 6.925549915397631, + "grad_norm": 1.2405688762664795, + "learning_rate": 1.5375070501974055e-05, + "loss": 0.8579, + "step": 12279 + }, + { + "epoch": 6.9261139311900735, + "grad_norm": 1.5327874422073364, + "learning_rate": 1.5372250423011844e-05, + "loss": 0.7622, + "step": 12280 + }, + { + "epoch": 6.926677946982515, + "grad_norm": 1.1348580121994019, + "learning_rate": 1.5369430344049636e-05, + "loss": 0.7246, + "step": 12281 + }, + { + "epoch": 6.927241962774958, + "grad_norm": 1.100230097770691, + "learning_rate": 1.536661026508742e-05, + "loss": 0.5958, + "step": 12282 + }, + { + "epoch": 6.9278059785674, + "grad_norm": 0.884606122970581, + "learning_rate": 1.5363790186125213e-05, + "loss": 0.717, + "step": 12283 + }, + { + "epoch": 6.9283699943598425, + "grad_norm": 1.2359650135040283, + "learning_rate": 1.5360970107163e-05, + "loss": 0.7999, + "step": 12284 + }, + { + "epoch": 6.928934010152284, + "grad_norm": 1.1017358303070068, + "learning_rate": 1.535815002820079e-05, + "loss": 0.6902, + "step": 12285 + }, + { + "epoch": 6.929498025944726, + "grad_norm": 1.261636734008789, + "learning_rate": 1.535532994923858e-05, + "loss": 0.7097, + "step": 12286 + }, + { + "epoch": 6.930062041737169, + "grad_norm": 1.0299385786056519, + "learning_rate": 1.535250987027637e-05, + "loss": 0.7438, + "step": 12287 + }, + { + "epoch": 6.930626057529611, + "grad_norm": 1.2070567607879639, + "learning_rate": 1.5349689791314157e-05, + "loss": 0.6534, + "step": 12288 + }, + { + "epoch": 6.931190073322053, + "grad_norm": 1.063517689704895, + "learning_rate": 1.5346869712351946e-05, + "loss": 0.6509, + "step": 12289 + }, + { + "epoch": 6.931754089114495, + "grad_norm": 0.9013498425483704, + "learning_rate": 1.5344049633389735e-05, + "loss": 0.7214, + "step": 12290 + }, + { + "epoch": 6.932318104906937, + "grad_norm": 1.0352470874786377, + "learning_rate": 1.5341229554427524e-05, + "loss": 0.7466, + "step": 12291 + }, + { + "epoch": 6.93288212069938, + "grad_norm": 1.1171493530273438, + "learning_rate": 1.5338409475465316e-05, + "loss": 0.7198, + "step": 12292 + }, + { + "epoch": 6.933446136491821, + "grad_norm": 1.0015110969543457, + "learning_rate": 1.53355893965031e-05, + "loss": 0.6832, + "step": 12293 + }, + { + "epoch": 6.934010152284264, + "grad_norm": 0.8518908619880676, + "learning_rate": 1.5332769317540894e-05, + "loss": 0.7256, + "step": 12294 + }, + { + "epoch": 6.934574168076706, + "grad_norm": 1.1233559846878052, + "learning_rate": 1.5329949238578682e-05, + "loss": 0.738, + "step": 12295 + }, + { + "epoch": 6.935138183869149, + "grad_norm": 0.7994348406791687, + "learning_rate": 1.532712915961647e-05, + "loss": 0.6766, + "step": 12296 + }, + { + "epoch": 6.93570219966159, + "grad_norm": 1.2592486143112183, + "learning_rate": 1.532430908065426e-05, + "loss": 0.7, + "step": 12297 + }, + { + "epoch": 6.936266215454033, + "grad_norm": 1.3920867443084717, + "learning_rate": 1.532148900169205e-05, + "loss": 0.8045, + "step": 12298 + }, + { + "epoch": 6.936830231246475, + "grad_norm": 1.1822515726089478, + "learning_rate": 1.5318668922729838e-05, + "loss": 0.7498, + "step": 12299 + }, + { + "epoch": 6.937394247038917, + "grad_norm": 1.0435411930084229, + "learning_rate": 1.5315848843767626e-05, + "loss": 0.7911, + "step": 12300 + }, + { + "epoch": 6.937958262831359, + "grad_norm": 1.4811915159225464, + "learning_rate": 1.5313028764805415e-05, + "loss": 0.8461, + "step": 12301 + }, + { + "epoch": 6.938522278623801, + "grad_norm": 1.2941855192184448, + "learning_rate": 1.5310208685843204e-05, + "loss": 0.7153, + "step": 12302 + }, + { + "epoch": 6.939086294416244, + "grad_norm": 1.0565143823623657, + "learning_rate": 1.5307388606880993e-05, + "loss": 0.7309, + "step": 12303 + }, + { + "epoch": 6.939650310208686, + "grad_norm": 1.1003832817077637, + "learning_rate": 1.5304568527918785e-05, + "loss": 0.6784, + "step": 12304 + }, + { + "epoch": 6.940214326001128, + "grad_norm": 0.9939821362495422, + "learning_rate": 1.530174844895657e-05, + "loss": 0.6657, + "step": 12305 + }, + { + "epoch": 6.94077834179357, + "grad_norm": 0.9829081296920776, + "learning_rate": 1.5298928369994363e-05, + "loss": 0.7657, + "step": 12306 + }, + { + "epoch": 6.941342357586012, + "grad_norm": 1.2356051206588745, + "learning_rate": 1.5296108291032148e-05, + "loss": 0.7764, + "step": 12307 + }, + { + "epoch": 6.941906373378455, + "grad_norm": 1.0928033590316772, + "learning_rate": 1.529328821206994e-05, + "loss": 0.7182, + "step": 12308 + }, + { + "epoch": 6.9424703891708965, + "grad_norm": 1.2058416604995728, + "learning_rate": 1.5290468133107726e-05, + "loss": 0.7303, + "step": 12309 + }, + { + "epoch": 6.943034404963339, + "grad_norm": 1.0394601821899414, + "learning_rate": 1.5287648054145518e-05, + "loss": 0.7684, + "step": 12310 + }, + { + "epoch": 6.943598420755781, + "grad_norm": 1.3552823066711426, + "learning_rate": 1.5284827975183307e-05, + "loss": 0.7733, + "step": 12311 + }, + { + "epoch": 6.944162436548224, + "grad_norm": 0.9474915862083435, + "learning_rate": 1.5282007896221095e-05, + "loss": 0.6458, + "step": 12312 + }, + { + "epoch": 6.9447264523406655, + "grad_norm": 1.1371426582336426, + "learning_rate": 1.5279187817258884e-05, + "loss": 0.6906, + "step": 12313 + }, + { + "epoch": 6.945290468133107, + "grad_norm": 1.5708417892456055, + "learning_rate": 1.5276367738296673e-05, + "loss": 0.7472, + "step": 12314 + }, + { + "epoch": 6.94585448392555, + "grad_norm": 0.9191315174102783, + "learning_rate": 1.5273547659334462e-05, + "loss": 0.7414, + "step": 12315 + }, + { + "epoch": 6.946418499717992, + "grad_norm": 0.9315519332885742, + "learning_rate": 1.527072758037225e-05, + "loss": 0.6995, + "step": 12316 + }, + { + "epoch": 6.9469825155104346, + "grad_norm": 0.8715046048164368, + "learning_rate": 1.526790750141004e-05, + "loss": 0.6511, + "step": 12317 + }, + { + "epoch": 6.947546531302876, + "grad_norm": 1.1896886825561523, + "learning_rate": 1.5265087422447828e-05, + "loss": 0.7399, + "step": 12318 + }, + { + "epoch": 6.948110547095319, + "grad_norm": 1.214483618736267, + "learning_rate": 1.5262267343485617e-05, + "loss": 0.7516, + "step": 12319 + }, + { + "epoch": 6.948674562887761, + "grad_norm": 1.2931554317474365, + "learning_rate": 1.525944726452341e-05, + "loss": 0.8235, + "step": 12320 + }, + { + "epoch": 6.949238578680203, + "grad_norm": 0.9061561226844788, + "learning_rate": 1.5256627185561194e-05, + "loss": 0.6778, + "step": 12321 + }, + { + "epoch": 6.949802594472645, + "grad_norm": 1.463975429534912, + "learning_rate": 1.5253807106598985e-05, + "loss": 0.8382, + "step": 12322 + }, + { + "epoch": 6.950366610265087, + "grad_norm": 1.0196887254714966, + "learning_rate": 1.5250987027636774e-05, + "loss": 0.6764, + "step": 12323 + }, + { + "epoch": 6.95093062605753, + "grad_norm": 1.6702885627746582, + "learning_rate": 1.5248166948674564e-05, + "loss": 0.8261, + "step": 12324 + }, + { + "epoch": 6.951494641849972, + "grad_norm": 1.4788267612457275, + "learning_rate": 1.5245346869712351e-05, + "loss": 0.8022, + "step": 12325 + }, + { + "epoch": 6.952058657642414, + "grad_norm": 0.8813068270683289, + "learning_rate": 1.5242526790750142e-05, + "loss": 0.7029, + "step": 12326 + }, + { + "epoch": 6.952622673434856, + "grad_norm": 0.8219097852706909, + "learning_rate": 1.5239706711787929e-05, + "loss": 0.7253, + "step": 12327 + }, + { + "epoch": 6.953186689227298, + "grad_norm": 0.8657955527305603, + "learning_rate": 1.523688663282572e-05, + "loss": 0.6631, + "step": 12328 + }, + { + "epoch": 6.953750705019741, + "grad_norm": 0.9404909014701843, + "learning_rate": 1.523406655386351e-05, + "loss": 0.7081, + "step": 12329 + }, + { + "epoch": 6.9543147208121825, + "grad_norm": 1.0656819343566895, + "learning_rate": 1.5231246474901297e-05, + "loss": 0.762, + "step": 12330 + }, + { + "epoch": 6.954878736604625, + "grad_norm": 0.7410922646522522, + "learning_rate": 1.5228426395939088e-05, + "loss": 0.606, + "step": 12331 + }, + { + "epoch": 6.955442752397067, + "grad_norm": 1.3737143278121948, + "learning_rate": 1.5225606316976876e-05, + "loss": 0.7547, + "step": 12332 + }, + { + "epoch": 6.95600676818951, + "grad_norm": 0.9877793192863464, + "learning_rate": 1.5222786238014667e-05, + "loss": 0.6645, + "step": 12333 + }, + { + "epoch": 6.9565707839819515, + "grad_norm": 0.9370765686035156, + "learning_rate": 1.5219966159052454e-05, + "loss": 0.7178, + "step": 12334 + }, + { + "epoch": 6.957134799774393, + "grad_norm": 1.015158772468567, + "learning_rate": 1.5217146080090244e-05, + "loss": 0.7602, + "step": 12335 + }, + { + "epoch": 6.957698815566836, + "grad_norm": 2.207054376602173, + "learning_rate": 1.5214326001128032e-05, + "loss": 0.7367, + "step": 12336 + }, + { + "epoch": 6.958262831359278, + "grad_norm": 1.579013705253601, + "learning_rate": 1.5211505922165822e-05, + "loss": 0.7739, + "step": 12337 + }, + { + "epoch": 6.9588268471517205, + "grad_norm": 0.8955034017562866, + "learning_rate": 1.520868584320361e-05, + "loss": 0.7234, + "step": 12338 + }, + { + "epoch": 6.959390862944162, + "grad_norm": 1.045892357826233, + "learning_rate": 1.52058657642414e-05, + "loss": 0.6519, + "step": 12339 + }, + { + "epoch": 6.959954878736605, + "grad_norm": 1.0284473896026611, + "learning_rate": 1.5203045685279188e-05, + "loss": 0.8242, + "step": 12340 + }, + { + "epoch": 6.960518894529047, + "grad_norm": 1.1126528978347778, + "learning_rate": 1.5200225606316979e-05, + "loss": 0.687, + "step": 12341 + }, + { + "epoch": 6.961082910321489, + "grad_norm": 1.1298288106918335, + "learning_rate": 1.5197405527354766e-05, + "loss": 0.772, + "step": 12342 + }, + { + "epoch": 6.961646926113931, + "grad_norm": 1.0063908100128174, + "learning_rate": 1.5194585448392556e-05, + "loss": 0.6515, + "step": 12343 + }, + { + "epoch": 6.962210941906373, + "grad_norm": 1.2463912963867188, + "learning_rate": 1.5191765369430344e-05, + "loss": 0.7688, + "step": 12344 + }, + { + "epoch": 6.962774957698816, + "grad_norm": 1.2238434553146362, + "learning_rate": 1.5188945290468134e-05, + "loss": 0.7574, + "step": 12345 + }, + { + "epoch": 6.963338973491258, + "grad_norm": 1.2538541555404663, + "learning_rate": 1.5186125211505923e-05, + "loss": 0.778, + "step": 12346 + }, + { + "epoch": 6.9639029892837, + "grad_norm": 0.8865394592285156, + "learning_rate": 1.5183305132543713e-05, + "loss": 0.6918, + "step": 12347 + }, + { + "epoch": 6.964467005076142, + "grad_norm": 1.1176890134811401, + "learning_rate": 1.51804850535815e-05, + "loss": 0.8942, + "step": 12348 + }, + { + "epoch": 6.965031020868584, + "grad_norm": 1.301488995552063, + "learning_rate": 1.5177664974619291e-05, + "loss": 0.8235, + "step": 12349 + }, + { + "epoch": 6.965595036661027, + "grad_norm": 1.321045994758606, + "learning_rate": 1.5174844895657078e-05, + "loss": 0.7618, + "step": 12350 + }, + { + "epoch": 6.9661590524534684, + "grad_norm": 0.8867380619049072, + "learning_rate": 1.5172024816694869e-05, + "loss": 0.7144, + "step": 12351 + }, + { + "epoch": 6.966723068245911, + "grad_norm": 0.9571365714073181, + "learning_rate": 1.5169204737732656e-05, + "loss": 0.6799, + "step": 12352 + }, + { + "epoch": 6.967287084038353, + "grad_norm": 1.0436512231826782, + "learning_rate": 1.5166384658770446e-05, + "loss": 0.6389, + "step": 12353 + }, + { + "epoch": 6.967851099830796, + "grad_norm": 1.2178186178207397, + "learning_rate": 1.5163564579808235e-05, + "loss": 0.7679, + "step": 12354 + }, + { + "epoch": 6.9684151156232375, + "grad_norm": 1.1554298400878906, + "learning_rate": 1.5160744500846025e-05, + "loss": 0.7179, + "step": 12355 + }, + { + "epoch": 6.968979131415679, + "grad_norm": 1.2909560203552246, + "learning_rate": 1.5157924421883813e-05, + "loss": 0.6985, + "step": 12356 + }, + { + "epoch": 6.969543147208122, + "grad_norm": 1.06522536277771, + "learning_rate": 1.5155104342921603e-05, + "loss": 0.7103, + "step": 12357 + }, + { + "epoch": 6.970107163000564, + "grad_norm": 1.050885796546936, + "learning_rate": 1.515228426395939e-05, + "loss": 0.7509, + "step": 12358 + }, + { + "epoch": 6.9706711787930065, + "grad_norm": 1.1689976453781128, + "learning_rate": 1.514946418499718e-05, + "loss": 0.725, + "step": 12359 + }, + { + "epoch": 6.971235194585448, + "grad_norm": 1.2715688943862915, + "learning_rate": 1.5146644106034968e-05, + "loss": 0.7651, + "step": 12360 + }, + { + "epoch": 6.971799210377891, + "grad_norm": 1.8077563047409058, + "learning_rate": 1.5143824027072758e-05, + "loss": 0.7438, + "step": 12361 + }, + { + "epoch": 6.972363226170333, + "grad_norm": 1.0707216262817383, + "learning_rate": 1.5141003948110547e-05, + "loss": 0.7374, + "step": 12362 + }, + { + "epoch": 6.972927241962775, + "grad_norm": 0.9881689548492432, + "learning_rate": 1.5138183869148337e-05, + "loss": 0.7055, + "step": 12363 + }, + { + "epoch": 6.973491257755217, + "grad_norm": 0.9296417832374573, + "learning_rate": 1.5135363790186128e-05, + "loss": 0.6813, + "step": 12364 + }, + { + "epoch": 6.974055273547659, + "grad_norm": 1.1834912300109863, + "learning_rate": 1.5132543711223915e-05, + "loss": 0.7289, + "step": 12365 + }, + { + "epoch": 6.974619289340102, + "grad_norm": 1.547882080078125, + "learning_rate": 1.5129723632261706e-05, + "loss": 0.8225, + "step": 12366 + }, + { + "epoch": 6.975183305132544, + "grad_norm": 1.0873852968215942, + "learning_rate": 1.5126903553299493e-05, + "loss": 0.7596, + "step": 12367 + }, + { + "epoch": 6.975747320924986, + "grad_norm": 0.9350820779800415, + "learning_rate": 1.5124083474337283e-05, + "loss": 0.6145, + "step": 12368 + }, + { + "epoch": 6.976311336717428, + "grad_norm": 1.1047701835632324, + "learning_rate": 1.512126339537507e-05, + "loss": 0.7367, + "step": 12369 + }, + { + "epoch": 6.97687535250987, + "grad_norm": 0.8795268535614014, + "learning_rate": 1.511844331641286e-05, + "loss": 0.6767, + "step": 12370 + }, + { + "epoch": 6.977439368302313, + "grad_norm": 0.9148810505867004, + "learning_rate": 1.511562323745065e-05, + "loss": 0.6852, + "step": 12371 + }, + { + "epoch": 6.978003384094754, + "grad_norm": 1.321043848991394, + "learning_rate": 1.511280315848844e-05, + "loss": 0.7811, + "step": 12372 + }, + { + "epoch": 6.978567399887197, + "grad_norm": 1.1323877573013306, + "learning_rate": 1.5109983079526227e-05, + "loss": 0.6779, + "step": 12373 + }, + { + "epoch": 6.979131415679639, + "grad_norm": 0.8966406583786011, + "learning_rate": 1.5107163000564018e-05, + "loss": 0.7198, + "step": 12374 + }, + { + "epoch": 6.979695431472082, + "grad_norm": 0.7952114939689636, + "learning_rate": 1.5104342921601805e-05, + "loss": 0.6536, + "step": 12375 + }, + { + "epoch": 6.980259447264523, + "grad_norm": 1.336209774017334, + "learning_rate": 1.5101522842639595e-05, + "loss": 0.7525, + "step": 12376 + }, + { + "epoch": 6.980823463056965, + "grad_norm": 1.4126468896865845, + "learning_rate": 1.5098702763677382e-05, + "loss": 0.76, + "step": 12377 + }, + { + "epoch": 6.981387478849408, + "grad_norm": 0.9568546414375305, + "learning_rate": 1.5095882684715173e-05, + "loss": 0.7724, + "step": 12378 + }, + { + "epoch": 6.98195149464185, + "grad_norm": 1.4051440954208374, + "learning_rate": 1.5093062605752962e-05, + "loss": 0.7724, + "step": 12379 + }, + { + "epoch": 6.982515510434292, + "grad_norm": 0.9323943853378296, + "learning_rate": 1.5090242526790752e-05, + "loss": 0.7216, + "step": 12380 + }, + { + "epoch": 6.983079526226734, + "grad_norm": 1.1642212867736816, + "learning_rate": 1.508742244782854e-05, + "loss": 0.7994, + "step": 12381 + }, + { + "epoch": 6.983643542019177, + "grad_norm": 1.2115107774734497, + "learning_rate": 1.508460236886633e-05, + "loss": 0.7863, + "step": 12382 + }, + { + "epoch": 6.984207557811619, + "grad_norm": 1.0860956907272339, + "learning_rate": 1.5081782289904117e-05, + "loss": 0.7591, + "step": 12383 + }, + { + "epoch": 6.9847715736040605, + "grad_norm": 0.943770170211792, + "learning_rate": 1.5078962210941907e-05, + "loss": 0.7303, + "step": 12384 + }, + { + "epoch": 6.985335589396503, + "grad_norm": 1.213411808013916, + "learning_rate": 1.5076142131979696e-05, + "loss": 0.7848, + "step": 12385 + }, + { + "epoch": 6.985899605188945, + "grad_norm": 1.2630927562713623, + "learning_rate": 1.5073322053017485e-05, + "loss": 0.7712, + "step": 12386 + }, + { + "epoch": 6.986463620981388, + "grad_norm": 1.2097554206848145, + "learning_rate": 1.5070501974055274e-05, + "loss": 0.7463, + "step": 12387 + }, + { + "epoch": 6.9870276367738295, + "grad_norm": 1.049067735671997, + "learning_rate": 1.5067681895093064e-05, + "loss": 0.6473, + "step": 12388 + }, + { + "epoch": 6.987591652566272, + "grad_norm": 1.341256022453308, + "learning_rate": 1.5064861816130851e-05, + "loss": 0.7881, + "step": 12389 + }, + { + "epoch": 6.988155668358714, + "grad_norm": 1.463669776916504, + "learning_rate": 1.5062041737168642e-05, + "loss": 0.7504, + "step": 12390 + }, + { + "epoch": 6.988719684151156, + "grad_norm": 0.9241267442703247, + "learning_rate": 1.5059221658206429e-05, + "loss": 0.6963, + "step": 12391 + }, + { + "epoch": 6.9892836999435985, + "grad_norm": 0.8699859380722046, + "learning_rate": 1.505640157924422e-05, + "loss": 0.6217, + "step": 12392 + }, + { + "epoch": 6.98984771573604, + "grad_norm": 1.062442660331726, + "learning_rate": 1.5053581500282008e-05, + "loss": 0.756, + "step": 12393 + }, + { + "epoch": 6.990411731528483, + "grad_norm": 1.3071280717849731, + "learning_rate": 1.5050761421319799e-05, + "loss": 0.6631, + "step": 12394 + }, + { + "epoch": 6.990975747320925, + "grad_norm": 1.479596495628357, + "learning_rate": 1.5047941342357586e-05, + "loss": 0.7468, + "step": 12395 + }, + { + "epoch": 6.991539763113368, + "grad_norm": 1.4622441530227661, + "learning_rate": 1.5045121263395376e-05, + "loss": 0.7078, + "step": 12396 + }, + { + "epoch": 6.992103778905809, + "grad_norm": 1.0557703971862793, + "learning_rate": 1.5042301184433163e-05, + "loss": 0.675, + "step": 12397 + }, + { + "epoch": 6.992667794698251, + "grad_norm": 1.6512168645858765, + "learning_rate": 1.5039481105470954e-05, + "loss": 0.8779, + "step": 12398 + }, + { + "epoch": 6.993231810490694, + "grad_norm": 0.8553173542022705, + "learning_rate": 1.5036661026508744e-05, + "loss": 0.7149, + "step": 12399 + }, + { + "epoch": 6.993795826283136, + "grad_norm": 1.1006847620010376, + "learning_rate": 1.5033840947546531e-05, + "loss": 0.7855, + "step": 12400 + }, + { + "epoch": 6.994359842075578, + "grad_norm": 1.1448545455932617, + "learning_rate": 1.5031020868584322e-05, + "loss": 0.8532, + "step": 12401 + }, + { + "epoch": 6.99492385786802, + "grad_norm": 1.1632254123687744, + "learning_rate": 1.502820078962211e-05, + "loss": 0.746, + "step": 12402 + }, + { + "epoch": 6.995487873660463, + "grad_norm": 1.0646315813064575, + "learning_rate": 1.5025380710659901e-05, + "loss": 0.8103, + "step": 12403 + }, + { + "epoch": 6.996051889452905, + "grad_norm": 1.122871994972229, + "learning_rate": 1.5022560631697688e-05, + "loss": 0.7657, + "step": 12404 + }, + { + "epoch": 6.9966159052453465, + "grad_norm": 1.2194528579711914, + "learning_rate": 1.5019740552735479e-05, + "loss": 0.7245, + "step": 12405 + }, + { + "epoch": 6.997179921037789, + "grad_norm": 1.0850391387939453, + "learning_rate": 1.5016920473773266e-05, + "loss": 0.8225, + "step": 12406 + }, + { + "epoch": 6.997743936830231, + "grad_norm": 1.6698435544967651, + "learning_rate": 1.5014100394811056e-05, + "loss": 0.8163, + "step": 12407 + }, + { + "epoch": 6.998307952622674, + "grad_norm": 1.0121419429779053, + "learning_rate": 1.5011280315848843e-05, + "loss": 0.798, + "step": 12408 + }, + { + "epoch": 6.9988719684151155, + "grad_norm": 1.2210696935653687, + "learning_rate": 1.5008460236886634e-05, + "loss": 0.7833, + "step": 12409 + }, + { + "epoch": 6.999435984207558, + "grad_norm": 1.4437581300735474, + "learning_rate": 1.5005640157924423e-05, + "loss": 0.6989, + "step": 12410 + }, + { + "epoch": 7.0, + "grad_norm": 1.8949822187423706, + "learning_rate": 1.5002820078962213e-05, + "loss": 0.8554, + "step": 12411 + }, + { + "epoch": 7.000564015792442, + "grad_norm": 1.308295488357544, + "learning_rate": 1.5e-05, + "loss": 0.7553, + "step": 12412 + }, + { + "epoch": 7.0011280315848845, + "grad_norm": 1.0280810594558716, + "learning_rate": 1.4997179921037791e-05, + "loss": 0.7337, + "step": 12413 + }, + { + "epoch": 7.001692047377326, + "grad_norm": 1.4087920188903809, + "learning_rate": 1.4994359842075578e-05, + "loss": 0.794, + "step": 12414 + }, + { + "epoch": 7.002256063169769, + "grad_norm": 0.8730578422546387, + "learning_rate": 1.4991539763113368e-05, + "loss": 0.6346, + "step": 12415 + }, + { + "epoch": 7.002820078962211, + "grad_norm": 1.180494785308838, + "learning_rate": 1.4988719684151156e-05, + "loss": 0.8116, + "step": 12416 + }, + { + "epoch": 7.0033840947546535, + "grad_norm": 0.9381999969482422, + "learning_rate": 1.4985899605188946e-05, + "loss": 0.7519, + "step": 12417 + }, + { + "epoch": 7.003948110547095, + "grad_norm": 1.0113173723220825, + "learning_rate": 1.4983079526226735e-05, + "loss": 0.6817, + "step": 12418 + }, + { + "epoch": 7.004512126339537, + "grad_norm": 1.15796959400177, + "learning_rate": 1.4980259447264525e-05, + "loss": 0.705, + "step": 12419 + }, + { + "epoch": 7.00507614213198, + "grad_norm": 0.9769168496131897, + "learning_rate": 1.4977439368302312e-05, + "loss": 0.7545, + "step": 12420 + }, + { + "epoch": 7.005640157924422, + "grad_norm": 1.1932202577590942, + "learning_rate": 1.4974619289340103e-05, + "loss": 0.7836, + "step": 12421 + }, + { + "epoch": 7.006204173716864, + "grad_norm": 1.5130027532577515, + "learning_rate": 1.497179921037789e-05, + "loss": 0.7626, + "step": 12422 + }, + { + "epoch": 7.006768189509306, + "grad_norm": 1.2914634943008423, + "learning_rate": 1.496897913141568e-05, + "loss": 0.7218, + "step": 12423 + }, + { + "epoch": 7.007332205301749, + "grad_norm": 1.1897770166397095, + "learning_rate": 1.4966159052453468e-05, + "loss": 0.744, + "step": 12424 + }, + { + "epoch": 7.007896221094191, + "grad_norm": 0.8447132110595703, + "learning_rate": 1.4963338973491258e-05, + "loss": 0.6113, + "step": 12425 + }, + { + "epoch": 7.008460236886632, + "grad_norm": 0.9989084005355835, + "learning_rate": 1.4960518894529047e-05, + "loss": 0.7278, + "step": 12426 + }, + { + "epoch": 7.009024252679075, + "grad_norm": 1.5730347633361816, + "learning_rate": 1.4957698815566837e-05, + "loss": 0.8734, + "step": 12427 + }, + { + "epoch": 7.009588268471517, + "grad_norm": 1.204284906387329, + "learning_rate": 1.4954878736604625e-05, + "loss": 0.7112, + "step": 12428 + }, + { + "epoch": 7.01015228426396, + "grad_norm": 1.2503321170806885, + "learning_rate": 1.4952058657642415e-05, + "loss": 0.7803, + "step": 12429 + }, + { + "epoch": 7.0107163000564015, + "grad_norm": 0.9000890254974365, + "learning_rate": 1.4949238578680202e-05, + "loss": 0.656, + "step": 12430 + }, + { + "epoch": 7.011280315848844, + "grad_norm": 1.0215290784835815, + "learning_rate": 1.4946418499717993e-05, + "loss": 0.7165, + "step": 12431 + }, + { + "epoch": 7.011844331641286, + "grad_norm": 1.018207311630249, + "learning_rate": 1.494359842075578e-05, + "loss": 0.7308, + "step": 12432 + }, + { + "epoch": 7.012408347433728, + "grad_norm": 1.0966196060180664, + "learning_rate": 1.494077834179357e-05, + "loss": 0.8404, + "step": 12433 + }, + { + "epoch": 7.0129723632261705, + "grad_norm": 1.1377787590026855, + "learning_rate": 1.493795826283136e-05, + "loss": 0.7759, + "step": 12434 + }, + { + "epoch": 7.013536379018612, + "grad_norm": 1.2847143411636353, + "learning_rate": 1.493513818386915e-05, + "loss": 0.8154, + "step": 12435 + }, + { + "epoch": 7.014100394811055, + "grad_norm": 0.9906629920005798, + "learning_rate": 1.493231810490694e-05, + "loss": 0.7249, + "step": 12436 + }, + { + "epoch": 7.014664410603497, + "grad_norm": 1.055110216140747, + "learning_rate": 1.4929498025944727e-05, + "loss": 0.708, + "step": 12437 + }, + { + "epoch": 7.0152284263959395, + "grad_norm": 1.0405081510543823, + "learning_rate": 1.4926677946982518e-05, + "loss": 0.8367, + "step": 12438 + }, + { + "epoch": 7.015792442188381, + "grad_norm": 1.2654931545257568, + "learning_rate": 1.4923857868020305e-05, + "loss": 0.7978, + "step": 12439 + }, + { + "epoch": 7.016356457980823, + "grad_norm": 1.0418025255203247, + "learning_rate": 1.4921037789058095e-05, + "loss": 0.762, + "step": 12440 + }, + { + "epoch": 7.016920473773266, + "grad_norm": 1.0344789028167725, + "learning_rate": 1.4918217710095882e-05, + "loss": 0.7157, + "step": 12441 + }, + { + "epoch": 7.017484489565708, + "grad_norm": 1.093848466873169, + "learning_rate": 1.4915397631133673e-05, + "loss": 0.767, + "step": 12442 + }, + { + "epoch": 7.01804850535815, + "grad_norm": 1.1512582302093506, + "learning_rate": 1.4912577552171462e-05, + "loss": 0.7731, + "step": 12443 + }, + { + "epoch": 7.018612521150592, + "grad_norm": 0.9393943548202515, + "learning_rate": 1.4909757473209252e-05, + "loss": 0.7495, + "step": 12444 + }, + { + "epoch": 7.019176536943035, + "grad_norm": 1.0952011346817017, + "learning_rate": 1.4906937394247039e-05, + "loss": 0.7235, + "step": 12445 + }, + { + "epoch": 7.019740552735477, + "grad_norm": 1.1307328939437866, + "learning_rate": 1.490411731528483e-05, + "loss": 0.6468, + "step": 12446 + }, + { + "epoch": 7.020304568527918, + "grad_norm": 1.0692046880722046, + "learning_rate": 1.4901297236322617e-05, + "loss": 0.8603, + "step": 12447 + }, + { + "epoch": 7.020868584320361, + "grad_norm": 3.053016185760498, + "learning_rate": 1.4898477157360407e-05, + "loss": 0.6206, + "step": 12448 + }, + { + "epoch": 7.021432600112803, + "grad_norm": 1.4609559774398804, + "learning_rate": 1.4895657078398196e-05, + "loss": 0.7973, + "step": 12449 + }, + { + "epoch": 7.021996615905246, + "grad_norm": 1.2892451286315918, + "learning_rate": 1.4892836999435985e-05, + "loss": 0.7705, + "step": 12450 + }, + { + "epoch": 7.022560631697687, + "grad_norm": 1.5535682439804077, + "learning_rate": 1.4890016920473774e-05, + "loss": 0.8079, + "step": 12451 + }, + { + "epoch": 7.02312464749013, + "grad_norm": 1.176255702972412, + "learning_rate": 1.4887196841511564e-05, + "loss": 0.7432, + "step": 12452 + }, + { + "epoch": 7.023688663282572, + "grad_norm": 1.5806641578674316, + "learning_rate": 1.4884376762549351e-05, + "loss": 0.8136, + "step": 12453 + }, + { + "epoch": 7.024252679075014, + "grad_norm": 1.290452480316162, + "learning_rate": 1.4881556683587142e-05, + "loss": 0.7116, + "step": 12454 + }, + { + "epoch": 7.024816694867456, + "grad_norm": 0.8575352430343628, + "learning_rate": 1.4878736604624929e-05, + "loss": 0.6123, + "step": 12455 + }, + { + "epoch": 7.025380710659898, + "grad_norm": 2.0498220920562744, + "learning_rate": 1.487591652566272e-05, + "loss": 0.7618, + "step": 12456 + }, + { + "epoch": 7.025944726452341, + "grad_norm": 0.9940595030784607, + "learning_rate": 1.4873096446700508e-05, + "loss": 0.7139, + "step": 12457 + }, + { + "epoch": 7.026508742244783, + "grad_norm": 1.3228806257247925, + "learning_rate": 1.4870276367738299e-05, + "loss": 0.7775, + "step": 12458 + }, + { + "epoch": 7.027072758037225, + "grad_norm": 0.8135009407997131, + "learning_rate": 1.4867456288776086e-05, + "loss": 0.6783, + "step": 12459 + }, + { + "epoch": 7.027636773829667, + "grad_norm": 1.0693752765655518, + "learning_rate": 1.4864636209813876e-05, + "loss": 0.7875, + "step": 12460 + }, + { + "epoch": 7.028200789622109, + "grad_norm": 1.0267823934555054, + "learning_rate": 1.4861816130851663e-05, + "loss": 0.6198, + "step": 12461 + }, + { + "epoch": 7.028764805414552, + "grad_norm": 1.2414538860321045, + "learning_rate": 1.4858996051889454e-05, + "loss": 0.7463, + "step": 12462 + }, + { + "epoch": 7.0293288212069935, + "grad_norm": 1.1869436502456665, + "learning_rate": 1.4856175972927241e-05, + "loss": 0.7622, + "step": 12463 + }, + { + "epoch": 7.029892836999436, + "grad_norm": 1.1137429475784302, + "learning_rate": 1.4853355893965031e-05, + "loss": 0.6952, + "step": 12464 + }, + { + "epoch": 7.030456852791878, + "grad_norm": 1.1679571866989136, + "learning_rate": 1.485053581500282e-05, + "loss": 0.6981, + "step": 12465 + }, + { + "epoch": 7.031020868584321, + "grad_norm": 2.3140015602111816, + "learning_rate": 1.484771573604061e-05, + "loss": 0.7519, + "step": 12466 + }, + { + "epoch": 7.0315848843767625, + "grad_norm": 1.0060691833496094, + "learning_rate": 1.4844895657078398e-05, + "loss": 0.868, + "step": 12467 + }, + { + "epoch": 7.032148900169204, + "grad_norm": 1.5612672567367554, + "learning_rate": 1.4842075578116188e-05, + "loss": 0.7674, + "step": 12468 + }, + { + "epoch": 7.032712915961647, + "grad_norm": 0.8491168022155762, + "learning_rate": 1.4839255499153979e-05, + "loss": 0.6841, + "step": 12469 + }, + { + "epoch": 7.033276931754089, + "grad_norm": 1.2539373636245728, + "learning_rate": 1.4836435420191766e-05, + "loss": 0.781, + "step": 12470 + }, + { + "epoch": 7.0338409475465316, + "grad_norm": 1.1309155225753784, + "learning_rate": 1.4833615341229556e-05, + "loss": 0.7153, + "step": 12471 + }, + { + "epoch": 7.034404963338973, + "grad_norm": 0.9482673406600952, + "learning_rate": 1.4830795262267343e-05, + "loss": 0.7342, + "step": 12472 + }, + { + "epoch": 7.034968979131416, + "grad_norm": 0.906524658203125, + "learning_rate": 1.4827975183305134e-05, + "loss": 0.6896, + "step": 12473 + }, + { + "epoch": 7.035532994923858, + "grad_norm": 1.5253244638442993, + "learning_rate": 1.4825155104342923e-05, + "loss": 0.8338, + "step": 12474 + }, + { + "epoch": 7.0360970107163, + "grad_norm": 1.0923564434051514, + "learning_rate": 1.4822335025380713e-05, + "loss": 0.782, + "step": 12475 + }, + { + "epoch": 7.036661026508742, + "grad_norm": 1.0789133310317993, + "learning_rate": 1.48195149464185e-05, + "loss": 0.7939, + "step": 12476 + }, + { + "epoch": 7.037225042301184, + "grad_norm": 1.3238945007324219, + "learning_rate": 1.481669486745629e-05, + "loss": 0.7049, + "step": 12477 + }, + { + "epoch": 7.037789058093627, + "grad_norm": 1.2504827976226807, + "learning_rate": 1.4813874788494078e-05, + "loss": 0.6881, + "step": 12478 + }, + { + "epoch": 7.038353073886069, + "grad_norm": 1.255505084991455, + "learning_rate": 1.4811054709531868e-05, + "loss": 0.7036, + "step": 12479 + }, + { + "epoch": 7.038917089678511, + "grad_norm": 1.313533067703247, + "learning_rate": 1.4808234630569655e-05, + "loss": 0.7752, + "step": 12480 + }, + { + "epoch": 7.039481105470953, + "grad_norm": 0.780770480632782, + "learning_rate": 1.4805414551607446e-05, + "loss": 0.6545, + "step": 12481 + }, + { + "epoch": 7.040045121263395, + "grad_norm": 1.2951997518539429, + "learning_rate": 1.4802594472645235e-05, + "loss": 0.7067, + "step": 12482 + }, + { + "epoch": 7.040609137055838, + "grad_norm": 1.1743658781051636, + "learning_rate": 1.4799774393683025e-05, + "loss": 0.726, + "step": 12483 + }, + { + "epoch": 7.0411731528482795, + "grad_norm": 1.199122667312622, + "learning_rate": 1.4796954314720812e-05, + "loss": 0.7527, + "step": 12484 + }, + { + "epoch": 7.041737168640722, + "grad_norm": 1.2380306720733643, + "learning_rate": 1.4794134235758603e-05, + "loss": 0.6819, + "step": 12485 + }, + { + "epoch": 7.042301184433164, + "grad_norm": 0.9076663851737976, + "learning_rate": 1.479131415679639e-05, + "loss": 0.6842, + "step": 12486 + }, + { + "epoch": 7.042865200225607, + "grad_norm": 1.127320647239685, + "learning_rate": 1.478849407783418e-05, + "loss": 0.7531, + "step": 12487 + }, + { + "epoch": 7.0434292160180485, + "grad_norm": 0.9500805735588074, + "learning_rate": 1.4785673998871968e-05, + "loss": 0.7568, + "step": 12488 + }, + { + "epoch": 7.04399323181049, + "grad_norm": 0.9974868893623352, + "learning_rate": 1.4782853919909758e-05, + "loss": 0.791, + "step": 12489 + }, + { + "epoch": 7.044557247602933, + "grad_norm": 0.9216572642326355, + "learning_rate": 1.4780033840947547e-05, + "loss": 0.7415, + "step": 12490 + }, + { + "epoch": 7.045121263395375, + "grad_norm": 0.8443543910980225, + "learning_rate": 1.4777213761985337e-05, + "loss": 0.7115, + "step": 12491 + }, + { + "epoch": 7.0456852791878175, + "grad_norm": 1.364777684211731, + "learning_rate": 1.4774393683023124e-05, + "loss": 0.7539, + "step": 12492 + }, + { + "epoch": 7.046249294980259, + "grad_norm": 0.7719613909721375, + "learning_rate": 1.4771573604060915e-05, + "loss": 0.5517, + "step": 12493 + }, + { + "epoch": 7.046813310772702, + "grad_norm": 0.9703431129455566, + "learning_rate": 1.4768753525098702e-05, + "loss": 0.6755, + "step": 12494 + }, + { + "epoch": 7.047377326565144, + "grad_norm": 1.1826083660125732, + "learning_rate": 1.4765933446136493e-05, + "loss": 0.847, + "step": 12495 + }, + { + "epoch": 7.047941342357586, + "grad_norm": 1.0482511520385742, + "learning_rate": 1.4763113367174281e-05, + "loss": 0.7084, + "step": 12496 + }, + { + "epoch": 7.048505358150028, + "grad_norm": 1.277687668800354, + "learning_rate": 1.476029328821207e-05, + "loss": 0.7153, + "step": 12497 + }, + { + "epoch": 7.04906937394247, + "grad_norm": 0.8739993572235107, + "learning_rate": 1.4757473209249859e-05, + "loss": 0.6871, + "step": 12498 + }, + { + "epoch": 7.049633389734913, + "grad_norm": 1.1874799728393555, + "learning_rate": 1.475465313028765e-05, + "loss": 0.8037, + "step": 12499 + }, + { + "epoch": 7.050197405527355, + "grad_norm": 1.476470708847046, + "learning_rate": 1.4751833051325436e-05, + "loss": 0.7854, + "step": 12500 + }, + { + "epoch": 7.050761421319797, + "grad_norm": 1.0815000534057617, + "learning_rate": 1.4749012972363227e-05, + "loss": 0.7903, + "step": 12501 + }, + { + "epoch": 7.051325437112239, + "grad_norm": 1.0069966316223145, + "learning_rate": 1.4746192893401014e-05, + "loss": 0.7665, + "step": 12502 + }, + { + "epoch": 7.051889452904681, + "grad_norm": 0.9649519324302673, + "learning_rate": 1.4743372814438805e-05, + "loss": 0.7225, + "step": 12503 + }, + { + "epoch": 7.052453468697124, + "grad_norm": 1.2580232620239258, + "learning_rate": 1.4740552735476595e-05, + "loss": 0.795, + "step": 12504 + }, + { + "epoch": 7.0530174844895654, + "grad_norm": 1.1287808418273926, + "learning_rate": 1.4737732656514384e-05, + "loss": 0.7491, + "step": 12505 + }, + { + "epoch": 7.053581500282008, + "grad_norm": 1.718454360961914, + "learning_rate": 1.4734912577552173e-05, + "loss": 0.7947, + "step": 12506 + }, + { + "epoch": 7.05414551607445, + "grad_norm": 1.089264154434204, + "learning_rate": 1.4732092498589961e-05, + "loss": 0.6681, + "step": 12507 + }, + { + "epoch": 7.054709531866893, + "grad_norm": 1.1681793928146362, + "learning_rate": 1.4729272419627752e-05, + "loss": 0.8194, + "step": 12508 + }, + { + "epoch": 7.0552735476593345, + "grad_norm": 1.5933421850204468, + "learning_rate": 1.4726452340665539e-05, + "loss": 0.7671, + "step": 12509 + }, + { + "epoch": 7.055837563451776, + "grad_norm": 0.9686843752861023, + "learning_rate": 1.472363226170333e-05, + "loss": 0.7498, + "step": 12510 + }, + { + "epoch": 7.056401579244219, + "grad_norm": 1.2687585353851318, + "learning_rate": 1.4720812182741117e-05, + "loss": 0.7871, + "step": 12511 + }, + { + "epoch": 7.056965595036661, + "grad_norm": 1.4454331398010254, + "learning_rate": 1.4717992103778907e-05, + "loss": 0.7917, + "step": 12512 + }, + { + "epoch": 7.0575296108291035, + "grad_norm": 0.8546806573867798, + "learning_rate": 1.4715172024816696e-05, + "loss": 0.6119, + "step": 12513 + }, + { + "epoch": 7.058093626621545, + "grad_norm": 1.005630373954773, + "learning_rate": 1.4712351945854486e-05, + "loss": 0.7536, + "step": 12514 + }, + { + "epoch": 7.058657642413988, + "grad_norm": 0.9173685312271118, + "learning_rate": 1.4709531866892274e-05, + "loss": 0.6114, + "step": 12515 + }, + { + "epoch": 7.05922165820643, + "grad_norm": 0.9844632148742676, + "learning_rate": 1.4706711787930064e-05, + "loss": 0.7112, + "step": 12516 + }, + { + "epoch": 7.059785673998872, + "grad_norm": 1.252695083618164, + "learning_rate": 1.4703891708967851e-05, + "loss": 0.6471, + "step": 12517 + }, + { + "epoch": 7.060349689791314, + "grad_norm": 1.0494028329849243, + "learning_rate": 1.4701071630005642e-05, + "loss": 0.7491, + "step": 12518 + }, + { + "epoch": 7.060913705583756, + "grad_norm": 0.8861351609230042, + "learning_rate": 1.4698251551043429e-05, + "loss": 0.7097, + "step": 12519 + }, + { + "epoch": 7.061477721376199, + "grad_norm": 1.1026530265808105, + "learning_rate": 1.469543147208122e-05, + "loss": 0.7605, + "step": 12520 + }, + { + "epoch": 7.062041737168641, + "grad_norm": 1.0652976036071777, + "learning_rate": 1.4692611393119008e-05, + "loss": 0.7035, + "step": 12521 + }, + { + "epoch": 7.062605752961083, + "grad_norm": 0.8879209160804749, + "learning_rate": 1.4689791314156798e-05, + "loss": 0.7525, + "step": 12522 + }, + { + "epoch": 7.063169768753525, + "grad_norm": 1.2101638317108154, + "learning_rate": 1.4686971235194586e-05, + "loss": 0.7764, + "step": 12523 + }, + { + "epoch": 7.063733784545967, + "grad_norm": 0.9217450618743896, + "learning_rate": 1.4684151156232376e-05, + "loss": 0.7696, + "step": 12524 + }, + { + "epoch": 7.06429780033841, + "grad_norm": 1.1742427349090576, + "learning_rate": 1.4681331077270163e-05, + "loss": 0.725, + "step": 12525 + }, + { + "epoch": 7.064861816130851, + "grad_norm": 1.38254976272583, + "learning_rate": 1.4678510998307954e-05, + "loss": 0.7603, + "step": 12526 + }, + { + "epoch": 7.065425831923294, + "grad_norm": 1.2748175859451294, + "learning_rate": 1.467569091934574e-05, + "loss": 0.7041, + "step": 12527 + }, + { + "epoch": 7.065989847715736, + "grad_norm": 1.1561279296875, + "learning_rate": 1.4672870840383531e-05, + "loss": 0.7903, + "step": 12528 + }, + { + "epoch": 7.066553863508179, + "grad_norm": 1.2865517139434814, + "learning_rate": 1.467005076142132e-05, + "loss": 0.7854, + "step": 12529 + }, + { + "epoch": 7.06711787930062, + "grad_norm": 1.4170405864715576, + "learning_rate": 1.466723068245911e-05, + "loss": 0.8401, + "step": 12530 + }, + { + "epoch": 7.067681895093062, + "grad_norm": 1.1283284425735474, + "learning_rate": 1.4664410603496898e-05, + "loss": 0.6297, + "step": 12531 + }, + { + "epoch": 7.068245910885505, + "grad_norm": 1.0253740549087524, + "learning_rate": 1.4661590524534688e-05, + "loss": 0.6058, + "step": 12532 + }, + { + "epoch": 7.068809926677947, + "grad_norm": 1.4122792482376099, + "learning_rate": 1.4658770445572475e-05, + "loss": 0.7451, + "step": 12533 + }, + { + "epoch": 7.069373942470389, + "grad_norm": 1.3987631797790527, + "learning_rate": 1.4655950366610266e-05, + "loss": 0.7638, + "step": 12534 + }, + { + "epoch": 7.069937958262831, + "grad_norm": 1.4052350521087646, + "learning_rate": 1.4653130287648053e-05, + "loss": 0.5846, + "step": 12535 + }, + { + "epoch": 7.070501974055274, + "grad_norm": 1.2006199359893799, + "learning_rate": 1.4650310208685843e-05, + "loss": 0.8126, + "step": 12536 + }, + { + "epoch": 7.071065989847716, + "grad_norm": 1.2029461860656738, + "learning_rate": 1.4647490129723632e-05, + "loss": 0.7608, + "step": 12537 + }, + { + "epoch": 7.0716300056401575, + "grad_norm": 0.9386877417564392, + "learning_rate": 1.4644670050761423e-05, + "loss": 0.7421, + "step": 12538 + }, + { + "epoch": 7.0721940214326, + "grad_norm": 0.9365391731262207, + "learning_rate": 1.464184997179921e-05, + "loss": 0.6613, + "step": 12539 + }, + { + "epoch": 7.072758037225042, + "grad_norm": 1.1345340013504028, + "learning_rate": 1.4639029892837e-05, + "loss": 0.6845, + "step": 12540 + }, + { + "epoch": 7.073322053017485, + "grad_norm": 1.3518736362457275, + "learning_rate": 1.463620981387479e-05, + "loss": 0.7488, + "step": 12541 + }, + { + "epoch": 7.0738860688099265, + "grad_norm": 0.80569988489151, + "learning_rate": 1.4633389734912578e-05, + "loss": 0.6334, + "step": 12542 + }, + { + "epoch": 7.074450084602369, + "grad_norm": 1.0422989130020142, + "learning_rate": 1.4630569655950368e-05, + "loss": 0.7553, + "step": 12543 + }, + { + "epoch": 7.075014100394811, + "grad_norm": 1.0088489055633545, + "learning_rate": 1.4627749576988155e-05, + "loss": 0.8101, + "step": 12544 + }, + { + "epoch": 7.075578116187253, + "grad_norm": 0.8674643039703369, + "learning_rate": 1.4624929498025946e-05, + "loss": 0.6677, + "step": 12545 + }, + { + "epoch": 7.0761421319796955, + "grad_norm": 0.9638376832008362, + "learning_rate": 1.4622109419063735e-05, + "loss": 0.7275, + "step": 12546 + }, + { + "epoch": 7.076706147772137, + "grad_norm": 0.9111378192901611, + "learning_rate": 1.4619289340101525e-05, + "loss": 0.6438, + "step": 12547 + }, + { + "epoch": 7.07727016356458, + "grad_norm": 1.0464093685150146, + "learning_rate": 1.4616469261139312e-05, + "loss": 0.758, + "step": 12548 + }, + { + "epoch": 7.077834179357022, + "grad_norm": 1.084357500076294, + "learning_rate": 1.4613649182177103e-05, + "loss": 0.7571, + "step": 12549 + }, + { + "epoch": 7.0783981951494646, + "grad_norm": 1.2897547483444214, + "learning_rate": 1.461082910321489e-05, + "loss": 0.6997, + "step": 12550 + }, + { + "epoch": 7.078962210941906, + "grad_norm": 1.0094870328903198, + "learning_rate": 1.460800902425268e-05, + "loss": 0.6894, + "step": 12551 + }, + { + "epoch": 7.079526226734348, + "grad_norm": 0.7875709533691406, + "learning_rate": 1.460518894529047e-05, + "loss": 0.6676, + "step": 12552 + }, + { + "epoch": 7.080090242526791, + "grad_norm": 1.4365880489349365, + "learning_rate": 1.4602368866328258e-05, + "loss": 0.8546, + "step": 12553 + }, + { + "epoch": 7.080654258319233, + "grad_norm": 0.9829257726669312, + "learning_rate": 1.4599548787366047e-05, + "loss": 0.638, + "step": 12554 + }, + { + "epoch": 7.081218274111675, + "grad_norm": 1.3630369901657104, + "learning_rate": 1.4596728708403837e-05, + "loss": 0.7736, + "step": 12555 + }, + { + "epoch": 7.081782289904117, + "grad_norm": 1.0923571586608887, + "learning_rate": 1.4593908629441624e-05, + "loss": 0.6409, + "step": 12556 + }, + { + "epoch": 7.08234630569656, + "grad_norm": 1.4480712413787842, + "learning_rate": 1.4591088550479415e-05, + "loss": 0.6837, + "step": 12557 + }, + { + "epoch": 7.082910321489002, + "grad_norm": 1.1640890836715698, + "learning_rate": 1.4588268471517202e-05, + "loss": 0.6834, + "step": 12558 + }, + { + "epoch": 7.0834743372814435, + "grad_norm": 1.0118769407272339, + "learning_rate": 1.4585448392554992e-05, + "loss": 0.8102, + "step": 12559 + }, + { + "epoch": 7.084038353073886, + "grad_norm": 1.2168034315109253, + "learning_rate": 1.4582628313592781e-05, + "loss": 0.7603, + "step": 12560 + }, + { + "epoch": 7.084602368866328, + "grad_norm": 1.339118242263794, + "learning_rate": 1.4579808234630572e-05, + "loss": 0.7601, + "step": 12561 + }, + { + "epoch": 7.085166384658771, + "grad_norm": 1.1961654424667358, + "learning_rate": 1.4576988155668359e-05, + "loss": 0.7841, + "step": 12562 + }, + { + "epoch": 7.0857304004512125, + "grad_norm": 1.0924060344696045, + "learning_rate": 1.457416807670615e-05, + "loss": 0.7073, + "step": 12563 + }, + { + "epoch": 7.086294416243655, + "grad_norm": 1.138907551765442, + "learning_rate": 1.4571347997743936e-05, + "loss": 0.7276, + "step": 12564 + }, + { + "epoch": 7.086858432036097, + "grad_norm": 1.0314613580703735, + "learning_rate": 1.4568527918781727e-05, + "loss": 0.7279, + "step": 12565 + }, + { + "epoch": 7.087422447828539, + "grad_norm": 1.3913463354110718, + "learning_rate": 1.4565707839819514e-05, + "loss": 0.7971, + "step": 12566 + }, + { + "epoch": 7.0879864636209815, + "grad_norm": 0.8230644464492798, + "learning_rate": 1.4562887760857305e-05, + "loss": 0.6252, + "step": 12567 + }, + { + "epoch": 7.088550479413423, + "grad_norm": 1.2367690801620483, + "learning_rate": 1.4560067681895093e-05, + "loss": 0.6528, + "step": 12568 + }, + { + "epoch": 7.089114495205866, + "grad_norm": 1.0723189115524292, + "learning_rate": 1.4557247602932884e-05, + "loss": 0.7734, + "step": 12569 + }, + { + "epoch": 7.089678510998308, + "grad_norm": 1.1703389883041382, + "learning_rate": 1.4554427523970671e-05, + "loss": 0.7621, + "step": 12570 + }, + { + "epoch": 7.0902425267907505, + "grad_norm": 1.047075867652893, + "learning_rate": 1.4551607445008461e-05, + "loss": 0.8725, + "step": 12571 + }, + { + "epoch": 7.090806542583192, + "grad_norm": 1.014736294746399, + "learning_rate": 1.4548787366046248e-05, + "loss": 0.6737, + "step": 12572 + }, + { + "epoch": 7.091370558375634, + "grad_norm": 1.3863458633422852, + "learning_rate": 1.4545967287084039e-05, + "loss": 0.8181, + "step": 12573 + }, + { + "epoch": 7.091934574168077, + "grad_norm": 0.9614561796188354, + "learning_rate": 1.4543147208121826e-05, + "loss": 0.7147, + "step": 12574 + }, + { + "epoch": 7.092498589960519, + "grad_norm": 1.913243293762207, + "learning_rate": 1.4540327129159617e-05, + "loss": 0.8975, + "step": 12575 + }, + { + "epoch": 7.093062605752961, + "grad_norm": 0.9799350500106812, + "learning_rate": 1.4537507050197407e-05, + "loss": 0.6886, + "step": 12576 + }, + { + "epoch": 7.093626621545403, + "grad_norm": 1.177548885345459, + "learning_rate": 1.4534686971235196e-05, + "loss": 0.7822, + "step": 12577 + }, + { + "epoch": 7.094190637337846, + "grad_norm": 0.7404084801673889, + "learning_rate": 1.4531866892272986e-05, + "loss": 0.5964, + "step": 12578 + }, + { + "epoch": 7.094754653130288, + "grad_norm": 1.4525433778762817, + "learning_rate": 1.4529046813310773e-05, + "loss": 0.7756, + "step": 12579 + }, + { + "epoch": 7.095318668922729, + "grad_norm": 1.1838555335998535, + "learning_rate": 1.4526226734348564e-05, + "loss": 0.8241, + "step": 12580 + }, + { + "epoch": 7.095882684715172, + "grad_norm": 1.248809576034546, + "learning_rate": 1.4523406655386351e-05, + "loss": 0.7857, + "step": 12581 + }, + { + "epoch": 7.096446700507614, + "grad_norm": 1.0081720352172852, + "learning_rate": 1.4520586576424142e-05, + "loss": 0.7544, + "step": 12582 + }, + { + "epoch": 7.097010716300057, + "grad_norm": 1.196969747543335, + "learning_rate": 1.4517766497461929e-05, + "loss": 0.7545, + "step": 12583 + }, + { + "epoch": 7.0975747320924985, + "grad_norm": 1.1131534576416016, + "learning_rate": 1.4514946418499719e-05, + "loss": 0.6765, + "step": 12584 + }, + { + "epoch": 7.098138747884941, + "grad_norm": 1.2820725440979004, + "learning_rate": 1.4512126339537508e-05, + "loss": 0.8427, + "step": 12585 + }, + { + "epoch": 7.098702763677383, + "grad_norm": 1.3831900358200073, + "learning_rate": 1.4509306260575298e-05, + "loss": 0.8812, + "step": 12586 + }, + { + "epoch": 7.099266779469825, + "grad_norm": 0.9836101531982422, + "learning_rate": 1.4506486181613086e-05, + "loss": 0.7388, + "step": 12587 + }, + { + "epoch": 7.0998307952622675, + "grad_norm": 1.5863687992095947, + "learning_rate": 1.4503666102650876e-05, + "loss": 0.8139, + "step": 12588 + }, + { + "epoch": 7.100394811054709, + "grad_norm": 1.5763827562332153, + "learning_rate": 1.4500846023688663e-05, + "loss": 0.6807, + "step": 12589 + }, + { + "epoch": 7.100958826847152, + "grad_norm": 1.5882383584976196, + "learning_rate": 1.4498025944726454e-05, + "loss": 0.7267, + "step": 12590 + }, + { + "epoch": 7.101522842639594, + "grad_norm": 0.9624414443969727, + "learning_rate": 1.449520586576424e-05, + "loss": 0.715, + "step": 12591 + }, + { + "epoch": 7.1020868584320365, + "grad_norm": 1.1619110107421875, + "learning_rate": 1.4492385786802031e-05, + "loss": 0.7343, + "step": 12592 + }, + { + "epoch": 7.102650874224478, + "grad_norm": 0.8814444541931152, + "learning_rate": 1.448956570783982e-05, + "loss": 0.6529, + "step": 12593 + }, + { + "epoch": 7.10321489001692, + "grad_norm": 0.8558862209320068, + "learning_rate": 1.448674562887761e-05, + "loss": 0.7033, + "step": 12594 + }, + { + "epoch": 7.103778905809363, + "grad_norm": 1.2753223180770874, + "learning_rate": 1.4483925549915398e-05, + "loss": 0.8279, + "step": 12595 + }, + { + "epoch": 7.104342921601805, + "grad_norm": 1.1635538339614868, + "learning_rate": 1.4481105470953188e-05, + "loss": 0.6307, + "step": 12596 + }, + { + "epoch": 7.104906937394247, + "grad_norm": 0.7963916063308716, + "learning_rate": 1.4478285391990975e-05, + "loss": 0.7046, + "step": 12597 + }, + { + "epoch": 7.105470953186689, + "grad_norm": 0.9345372915267944, + "learning_rate": 1.4475465313028766e-05, + "loss": 0.767, + "step": 12598 + }, + { + "epoch": 7.106034968979132, + "grad_norm": 1.421312928199768, + "learning_rate": 1.4472645234066553e-05, + "loss": 0.7853, + "step": 12599 + }, + { + "epoch": 7.106598984771574, + "grad_norm": 1.4196690320968628, + "learning_rate": 1.4469825155104343e-05, + "loss": 0.7803, + "step": 12600 + }, + { + "epoch": 7.107163000564015, + "grad_norm": 0.8194848895072937, + "learning_rate": 1.4467005076142132e-05, + "loss": 0.6849, + "step": 12601 + }, + { + "epoch": 7.107727016356458, + "grad_norm": 1.3536585569381714, + "learning_rate": 1.4464184997179923e-05, + "loss": 0.7363, + "step": 12602 + }, + { + "epoch": 7.1082910321489, + "grad_norm": 1.2891767024993896, + "learning_rate": 1.446136491821771e-05, + "loss": 0.7034, + "step": 12603 + }, + { + "epoch": 7.108855047941343, + "grad_norm": 1.0306949615478516, + "learning_rate": 1.44585448392555e-05, + "loss": 0.7466, + "step": 12604 + }, + { + "epoch": 7.109419063733784, + "grad_norm": 1.170852541923523, + "learning_rate": 1.4455724760293287e-05, + "loss": 0.7005, + "step": 12605 + }, + { + "epoch": 7.109983079526227, + "grad_norm": 1.6070609092712402, + "learning_rate": 1.4452904681331078e-05, + "loss": 0.8057, + "step": 12606 + }, + { + "epoch": 7.110547095318669, + "grad_norm": 1.0349793434143066, + "learning_rate": 1.4450084602368867e-05, + "loss": 0.7769, + "step": 12607 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 1.0373327732086182, + "learning_rate": 1.4447264523406655e-05, + "loss": 0.6782, + "step": 12608 + }, + { + "epoch": 7.111675126903553, + "grad_norm": 1.2869642972946167, + "learning_rate": 1.4444444444444444e-05, + "loss": 0.8049, + "step": 12609 + }, + { + "epoch": 7.112239142695995, + "grad_norm": 1.1130679845809937, + "learning_rate": 1.4441624365482235e-05, + "loss": 0.7814, + "step": 12610 + }, + { + "epoch": 7.112803158488438, + "grad_norm": 1.5892338752746582, + "learning_rate": 1.4438804286520025e-05, + "loss": 0.8151, + "step": 12611 + }, + { + "epoch": 7.11336717428088, + "grad_norm": 1.044042706489563, + "learning_rate": 1.4435984207557812e-05, + "loss": 0.6883, + "step": 12612 + }, + { + "epoch": 7.113931190073322, + "grad_norm": 1.306272029876709, + "learning_rate": 1.4433164128595603e-05, + "loss": 0.8087, + "step": 12613 + }, + { + "epoch": 7.114495205865764, + "grad_norm": 1.2314167022705078, + "learning_rate": 1.443034404963339e-05, + "loss": 0.7427, + "step": 12614 + }, + { + "epoch": 7.115059221658206, + "grad_norm": 1.3619593381881714, + "learning_rate": 1.442752397067118e-05, + "loss": 0.7535, + "step": 12615 + }, + { + "epoch": 7.115623237450649, + "grad_norm": 0.7894978523254395, + "learning_rate": 1.4424703891708969e-05, + "loss": 0.647, + "step": 12616 + }, + { + "epoch": 7.1161872532430905, + "grad_norm": 1.4734606742858887, + "learning_rate": 1.4421883812746758e-05, + "loss": 0.8167, + "step": 12617 + }, + { + "epoch": 7.116751269035533, + "grad_norm": 0.9926421046257019, + "learning_rate": 1.4419063733784547e-05, + "loss": 0.7451, + "step": 12618 + }, + { + "epoch": 7.117315284827975, + "grad_norm": 1.224073886871338, + "learning_rate": 1.4416243654822337e-05, + "loss": 0.754, + "step": 12619 + }, + { + "epoch": 7.117879300620418, + "grad_norm": 1.1523423194885254, + "learning_rate": 1.4413423575860124e-05, + "loss": 0.7015, + "step": 12620 + }, + { + "epoch": 7.1184433164128595, + "grad_norm": 1.32709801197052, + "learning_rate": 1.4410603496897915e-05, + "loss": 0.7916, + "step": 12621 + }, + { + "epoch": 7.119007332205301, + "grad_norm": 1.1220484972000122, + "learning_rate": 1.4407783417935702e-05, + "loss": 0.777, + "step": 12622 + }, + { + "epoch": 7.119571347997744, + "grad_norm": 1.1919010877609253, + "learning_rate": 1.4404963338973492e-05, + "loss": 0.7663, + "step": 12623 + }, + { + "epoch": 7.120135363790186, + "grad_norm": 0.8528133034706116, + "learning_rate": 1.4402143260011281e-05, + "loss": 0.6627, + "step": 12624 + }, + { + "epoch": 7.1206993795826286, + "grad_norm": 1.0005401372909546, + "learning_rate": 1.4399323181049072e-05, + "loss": 0.6789, + "step": 12625 + }, + { + "epoch": 7.12126339537507, + "grad_norm": 1.1358296871185303, + "learning_rate": 1.4396503102086859e-05, + "loss": 0.7066, + "step": 12626 + }, + { + "epoch": 7.121827411167513, + "grad_norm": 1.1939876079559326, + "learning_rate": 1.439368302312465e-05, + "loss": 0.6708, + "step": 12627 + }, + { + "epoch": 7.122391426959955, + "grad_norm": 0.9698951840400696, + "learning_rate": 1.4390862944162436e-05, + "loss": 0.7302, + "step": 12628 + }, + { + "epoch": 7.122955442752397, + "grad_norm": 1.2179266214370728, + "learning_rate": 1.4388042865200227e-05, + "loss": 0.7304, + "step": 12629 + }, + { + "epoch": 7.123519458544839, + "grad_norm": 1.4355226755142212, + "learning_rate": 1.4385222786238014e-05, + "loss": 0.7823, + "step": 12630 + }, + { + "epoch": 7.124083474337281, + "grad_norm": 1.1264500617980957, + "learning_rate": 1.4382402707275804e-05, + "loss": 0.7141, + "step": 12631 + }, + { + "epoch": 7.124647490129724, + "grad_norm": 0.7627832293510437, + "learning_rate": 1.4379582628313593e-05, + "loss": 0.6242, + "step": 12632 + }, + { + "epoch": 7.125211505922166, + "grad_norm": 0.9831207990646362, + "learning_rate": 1.4376762549351384e-05, + "loss": 0.7256, + "step": 12633 + }, + { + "epoch": 7.125775521714608, + "grad_norm": 0.8493204116821289, + "learning_rate": 1.437394247038917e-05, + "loss": 0.6788, + "step": 12634 + }, + { + "epoch": 7.12633953750705, + "grad_norm": 1.5563647747039795, + "learning_rate": 1.4371122391426961e-05, + "loss": 0.6777, + "step": 12635 + }, + { + "epoch": 7.126903553299492, + "grad_norm": 0.9097020626068115, + "learning_rate": 1.4368302312464748e-05, + "loss": 0.5965, + "step": 12636 + }, + { + "epoch": 7.127467569091935, + "grad_norm": 1.59683358669281, + "learning_rate": 1.4365482233502539e-05, + "loss": 0.8111, + "step": 12637 + }, + { + "epoch": 7.1280315848843765, + "grad_norm": 1.167089581489563, + "learning_rate": 1.4362662154540326e-05, + "loss": 0.7238, + "step": 12638 + }, + { + "epoch": 7.128595600676819, + "grad_norm": 1.457876205444336, + "learning_rate": 1.4359842075578116e-05, + "loss": 0.8441, + "step": 12639 + }, + { + "epoch": 7.129159616469261, + "grad_norm": 1.2267587184906006, + "learning_rate": 1.4357021996615905e-05, + "loss": 0.7571, + "step": 12640 + }, + { + "epoch": 7.129723632261704, + "grad_norm": 1.1426591873168945, + "learning_rate": 1.4354201917653696e-05, + "loss": 0.7182, + "step": 12641 + }, + { + "epoch": 7.1302876480541455, + "grad_norm": 1.0056103467941284, + "learning_rate": 1.4351381838691483e-05, + "loss": 0.5955, + "step": 12642 + }, + { + "epoch": 7.130851663846587, + "grad_norm": 1.1962637901306152, + "learning_rate": 1.4348561759729273e-05, + "loss": 0.7742, + "step": 12643 + }, + { + "epoch": 7.13141567963903, + "grad_norm": 1.2111643552780151, + "learning_rate": 1.434574168076706e-05, + "loss": 0.7588, + "step": 12644 + }, + { + "epoch": 7.131979695431472, + "grad_norm": 1.571656346321106, + "learning_rate": 1.4342921601804851e-05, + "loss": 0.7549, + "step": 12645 + }, + { + "epoch": 7.1325437112239145, + "grad_norm": 0.8946714401245117, + "learning_rate": 1.4340101522842641e-05, + "loss": 0.6238, + "step": 12646 + }, + { + "epoch": 7.133107727016356, + "grad_norm": 1.0204721689224243, + "learning_rate": 1.4337281443880429e-05, + "loss": 0.6964, + "step": 12647 + }, + { + "epoch": 7.133671742808799, + "grad_norm": 1.0045192241668701, + "learning_rate": 1.4334461364918219e-05, + "loss": 0.6295, + "step": 12648 + }, + { + "epoch": 7.134235758601241, + "grad_norm": 1.2870376110076904, + "learning_rate": 1.4331641285956008e-05, + "loss": 0.7952, + "step": 12649 + }, + { + "epoch": 7.134799774393683, + "grad_norm": 1.0053164958953857, + "learning_rate": 1.4328821206993798e-05, + "loss": 0.6111, + "step": 12650 + }, + { + "epoch": 7.135363790186125, + "grad_norm": 1.1008987426757812, + "learning_rate": 1.4326001128031585e-05, + "loss": 0.7557, + "step": 12651 + }, + { + "epoch": 7.135927805978567, + "grad_norm": 1.1540343761444092, + "learning_rate": 1.4323181049069376e-05, + "loss": 0.772, + "step": 12652 + }, + { + "epoch": 7.13649182177101, + "grad_norm": 1.1220673322677612, + "learning_rate": 1.4320360970107163e-05, + "loss": 0.7463, + "step": 12653 + }, + { + "epoch": 7.137055837563452, + "grad_norm": 1.1484615802764893, + "learning_rate": 1.4317540891144954e-05, + "loss": 0.7105, + "step": 12654 + }, + { + "epoch": 7.137619853355894, + "grad_norm": 1.4083746671676636, + "learning_rate": 1.431472081218274e-05, + "loss": 0.8374, + "step": 12655 + }, + { + "epoch": 7.138183869148336, + "grad_norm": 1.0554354190826416, + "learning_rate": 1.4311900733220531e-05, + "loss": 0.7353, + "step": 12656 + }, + { + "epoch": 7.138747884940778, + "grad_norm": 1.1993780136108398, + "learning_rate": 1.430908065425832e-05, + "loss": 0.7379, + "step": 12657 + }, + { + "epoch": 7.139311900733221, + "grad_norm": 1.311021327972412, + "learning_rate": 1.430626057529611e-05, + "loss": 0.7504, + "step": 12658 + }, + { + "epoch": 7.1398759165256624, + "grad_norm": 1.1840921640396118, + "learning_rate": 1.4303440496333897e-05, + "loss": 0.6856, + "step": 12659 + }, + { + "epoch": 7.140439932318105, + "grad_norm": 1.040117859840393, + "learning_rate": 1.4300620417371688e-05, + "loss": 0.7581, + "step": 12660 + }, + { + "epoch": 7.141003948110547, + "grad_norm": 0.9369787573814392, + "learning_rate": 1.4297800338409475e-05, + "loss": 0.7631, + "step": 12661 + }, + { + "epoch": 7.14156796390299, + "grad_norm": 1.1139341592788696, + "learning_rate": 1.4294980259447266e-05, + "loss": 0.8081, + "step": 12662 + }, + { + "epoch": 7.1421319796954315, + "grad_norm": 1.3002123832702637, + "learning_rate": 1.4292160180485054e-05, + "loss": 0.7429, + "step": 12663 + }, + { + "epoch": 7.142695995487873, + "grad_norm": 1.186822772026062, + "learning_rate": 1.4289340101522843e-05, + "loss": 0.7292, + "step": 12664 + }, + { + "epoch": 7.143260011280316, + "grad_norm": 0.7476882934570312, + "learning_rate": 1.4286520022560632e-05, + "loss": 0.6413, + "step": 12665 + }, + { + "epoch": 7.143824027072758, + "grad_norm": 1.2374579906463623, + "learning_rate": 1.4283699943598422e-05, + "loss": 0.7226, + "step": 12666 + }, + { + "epoch": 7.1443880428652005, + "grad_norm": 1.2614331245422363, + "learning_rate": 1.428087986463621e-05, + "loss": 0.7009, + "step": 12667 + }, + { + "epoch": 7.144952058657642, + "grad_norm": 0.8784431219100952, + "learning_rate": 1.4278059785674e-05, + "loss": 0.7111, + "step": 12668 + }, + { + "epoch": 7.145516074450085, + "grad_norm": 0.9829002022743225, + "learning_rate": 1.4275239706711787e-05, + "loss": 0.6797, + "step": 12669 + }, + { + "epoch": 7.146080090242527, + "grad_norm": 1.194248080253601, + "learning_rate": 1.4272419627749578e-05, + "loss": 0.7557, + "step": 12670 + }, + { + "epoch": 7.146644106034969, + "grad_norm": 1.5534026622772217, + "learning_rate": 1.4269599548787366e-05, + "loss": 0.8271, + "step": 12671 + }, + { + "epoch": 7.147208121827411, + "grad_norm": 2.360133409500122, + "learning_rate": 1.4266779469825157e-05, + "loss": 0.7197, + "step": 12672 + }, + { + "epoch": 7.147772137619853, + "grad_norm": 1.674665927886963, + "learning_rate": 1.4263959390862944e-05, + "loss": 0.7766, + "step": 12673 + }, + { + "epoch": 7.148336153412296, + "grad_norm": 1.0568193197250366, + "learning_rate": 1.4261139311900735e-05, + "loss": 0.7051, + "step": 12674 + }, + { + "epoch": 7.148900169204738, + "grad_norm": 0.8639178276062012, + "learning_rate": 1.4258319232938522e-05, + "loss": 0.6336, + "step": 12675 + }, + { + "epoch": 7.14946418499718, + "grad_norm": 1.3708847761154175, + "learning_rate": 1.4255499153976312e-05, + "loss": 0.7235, + "step": 12676 + }, + { + "epoch": 7.150028200789622, + "grad_norm": 1.1474573612213135, + "learning_rate": 1.42526790750141e-05, + "loss": 0.7249, + "step": 12677 + }, + { + "epoch": 7.150592216582064, + "grad_norm": 0.7873499393463135, + "learning_rate": 1.424985899605189e-05, + "loss": 0.6176, + "step": 12678 + }, + { + "epoch": 7.151156232374507, + "grad_norm": 1.3369210958480835, + "learning_rate": 1.4247038917089679e-05, + "loss": 0.6948, + "step": 12679 + }, + { + "epoch": 7.151720248166948, + "grad_norm": 0.962020218372345, + "learning_rate": 1.4244218838127469e-05, + "loss": 0.7277, + "step": 12680 + }, + { + "epoch": 7.152284263959391, + "grad_norm": 0.9445446133613586, + "learning_rate": 1.424139875916526e-05, + "loss": 0.7696, + "step": 12681 + }, + { + "epoch": 7.152848279751833, + "grad_norm": 1.4147084951400757, + "learning_rate": 1.4238578680203047e-05, + "loss": 0.893, + "step": 12682 + }, + { + "epoch": 7.153412295544276, + "grad_norm": 1.0866106748580933, + "learning_rate": 1.4235758601240837e-05, + "loss": 0.6743, + "step": 12683 + }, + { + "epoch": 7.153976311336717, + "grad_norm": 1.0703030824661255, + "learning_rate": 1.4232938522278624e-05, + "loss": 0.7431, + "step": 12684 + }, + { + "epoch": 7.154540327129159, + "grad_norm": 1.3523802757263184, + "learning_rate": 1.4230118443316415e-05, + "loss": 0.7742, + "step": 12685 + }, + { + "epoch": 7.155104342921602, + "grad_norm": 0.9585387110710144, + "learning_rate": 1.4227298364354202e-05, + "loss": 0.7286, + "step": 12686 + }, + { + "epoch": 7.155668358714044, + "grad_norm": 1.4135152101516724, + "learning_rate": 1.4224478285391992e-05, + "loss": 0.695, + "step": 12687 + }, + { + "epoch": 7.156232374506486, + "grad_norm": 1.3860832452774048, + "learning_rate": 1.4221658206429781e-05, + "loss": 0.7514, + "step": 12688 + }, + { + "epoch": 7.156796390298928, + "grad_norm": 0.9258489608764648, + "learning_rate": 1.4218838127467572e-05, + "loss": 0.7934, + "step": 12689 + }, + { + "epoch": 7.157360406091371, + "grad_norm": 1.4811991453170776, + "learning_rate": 1.4216018048505359e-05, + "loss": 0.8556, + "step": 12690 + }, + { + "epoch": 7.157924421883813, + "grad_norm": 1.1232854127883911, + "learning_rate": 1.421319796954315e-05, + "loss": 0.7002, + "step": 12691 + }, + { + "epoch": 7.1584884376762545, + "grad_norm": 1.0626399517059326, + "learning_rate": 1.4210377890580936e-05, + "loss": 0.7397, + "step": 12692 + }, + { + "epoch": 7.159052453468697, + "grad_norm": 0.8041501641273499, + "learning_rate": 1.4207557811618727e-05, + "loss": 0.6688, + "step": 12693 + }, + { + "epoch": 7.159616469261139, + "grad_norm": 0.9410261511802673, + "learning_rate": 1.4204737732656514e-05, + "loss": 0.6158, + "step": 12694 + }, + { + "epoch": 7.160180485053582, + "grad_norm": 0.9059900641441345, + "learning_rate": 1.4201917653694304e-05, + "loss": 0.693, + "step": 12695 + }, + { + "epoch": 7.1607445008460235, + "grad_norm": 0.9660104513168335, + "learning_rate": 1.4199097574732093e-05, + "loss": 0.8188, + "step": 12696 + }, + { + "epoch": 7.161308516638466, + "grad_norm": 1.0974277257919312, + "learning_rate": 1.4196277495769884e-05, + "loss": 0.7534, + "step": 12697 + }, + { + "epoch": 7.161872532430908, + "grad_norm": 0.9339850544929504, + "learning_rate": 1.419345741680767e-05, + "loss": 0.7239, + "step": 12698 + }, + { + "epoch": 7.16243654822335, + "grad_norm": 1.1700491905212402, + "learning_rate": 1.4190637337845461e-05, + "loss": 0.6466, + "step": 12699 + }, + { + "epoch": 7.1630005640157925, + "grad_norm": 1.5696866512298584, + "learning_rate": 1.4187817258883248e-05, + "loss": 0.7641, + "step": 12700 + }, + { + "epoch": 7.163564579808234, + "grad_norm": 0.9481701850891113, + "learning_rate": 1.4184997179921039e-05, + "loss": 0.7812, + "step": 12701 + }, + { + "epoch": 7.164128595600677, + "grad_norm": 1.320637822151184, + "learning_rate": 1.4182177100958826e-05, + "loss": 0.662, + "step": 12702 + }, + { + "epoch": 7.164692611393119, + "grad_norm": 0.9295559525489807, + "learning_rate": 1.4179357021996616e-05, + "loss": 0.6449, + "step": 12703 + }, + { + "epoch": 7.1652566271855616, + "grad_norm": 1.0644100904464722, + "learning_rate": 1.4176536943034405e-05, + "loss": 0.8838, + "step": 12704 + }, + { + "epoch": 7.165820642978003, + "grad_norm": 1.1577214002609253, + "learning_rate": 1.4173716864072196e-05, + "loss": 0.5839, + "step": 12705 + }, + { + "epoch": 7.166384658770445, + "grad_norm": 4.089418411254883, + "learning_rate": 1.4170896785109983e-05, + "loss": 0.7774, + "step": 12706 + }, + { + "epoch": 7.166948674562888, + "grad_norm": 1.05019211769104, + "learning_rate": 1.4168076706147773e-05, + "loss": 0.6757, + "step": 12707 + }, + { + "epoch": 7.16751269035533, + "grad_norm": 1.1136856079101562, + "learning_rate": 1.416525662718556e-05, + "loss": 0.6917, + "step": 12708 + }, + { + "epoch": 7.168076706147772, + "grad_norm": 1.2550464868545532, + "learning_rate": 1.4162436548223351e-05, + "loss": 0.7221, + "step": 12709 + }, + { + "epoch": 7.168640721940214, + "grad_norm": 1.0244874954223633, + "learning_rate": 1.4159616469261138e-05, + "loss": 0.7428, + "step": 12710 + }, + { + "epoch": 7.169204737732657, + "grad_norm": 1.4039644002914429, + "learning_rate": 1.4156796390298928e-05, + "loss": 0.7107, + "step": 12711 + }, + { + "epoch": 7.169768753525099, + "grad_norm": 1.2034437656402588, + "learning_rate": 1.4153976311336717e-05, + "loss": 0.6815, + "step": 12712 + }, + { + "epoch": 7.1703327693175405, + "grad_norm": 1.2244389057159424, + "learning_rate": 1.4151156232374508e-05, + "loss": 0.7009, + "step": 12713 + }, + { + "epoch": 7.170896785109983, + "grad_norm": 0.8723857998847961, + "learning_rate": 1.4148336153412295e-05, + "loss": 0.6961, + "step": 12714 + }, + { + "epoch": 7.171460800902425, + "grad_norm": 0.947414219379425, + "learning_rate": 1.4145516074450085e-05, + "loss": 0.6794, + "step": 12715 + }, + { + "epoch": 7.172024816694868, + "grad_norm": 1.4575269222259521, + "learning_rate": 1.4142695995487876e-05, + "loss": 0.8918, + "step": 12716 + }, + { + "epoch": 7.1725888324873095, + "grad_norm": 1.2820518016815186, + "learning_rate": 1.4139875916525663e-05, + "loss": 0.7521, + "step": 12717 + }, + { + "epoch": 7.173152848279752, + "grad_norm": 1.2155035734176636, + "learning_rate": 1.4137055837563453e-05, + "loss": 0.7968, + "step": 12718 + }, + { + "epoch": 7.173716864072194, + "grad_norm": 1.1816468238830566, + "learning_rate": 1.413423575860124e-05, + "loss": 0.8248, + "step": 12719 + }, + { + "epoch": 7.174280879864636, + "grad_norm": 0.7165026664733887, + "learning_rate": 1.4131415679639031e-05, + "loss": 0.6346, + "step": 12720 + }, + { + "epoch": 7.1748448956570785, + "grad_norm": 1.375710368156433, + "learning_rate": 1.412859560067682e-05, + "loss": 0.6745, + "step": 12721 + }, + { + "epoch": 7.17540891144952, + "grad_norm": 1.0026865005493164, + "learning_rate": 1.412577552171461e-05, + "loss": 0.7063, + "step": 12722 + }, + { + "epoch": 7.175972927241963, + "grad_norm": 1.0040651559829712, + "learning_rate": 1.4122955442752397e-05, + "loss": 0.7537, + "step": 12723 + }, + { + "epoch": 7.176536943034405, + "grad_norm": 1.4222146272659302, + "learning_rate": 1.4120135363790188e-05, + "loss": 0.8221, + "step": 12724 + }, + { + "epoch": 7.1771009588268475, + "grad_norm": 0.9221699237823486, + "learning_rate": 1.4117315284827975e-05, + "loss": 0.7033, + "step": 12725 + }, + { + "epoch": 7.177664974619289, + "grad_norm": 1.2641531229019165, + "learning_rate": 1.4114495205865766e-05, + "loss": 0.7726, + "step": 12726 + }, + { + "epoch": 7.178228990411731, + "grad_norm": 1.1687675714492798, + "learning_rate": 1.4111675126903554e-05, + "loss": 0.7527, + "step": 12727 + }, + { + "epoch": 7.178793006204174, + "grad_norm": 1.167651891708374, + "learning_rate": 1.4108855047941343e-05, + "loss": 0.7172, + "step": 12728 + }, + { + "epoch": 7.179357021996616, + "grad_norm": 1.181254267692566, + "learning_rate": 1.4106034968979132e-05, + "loss": 0.859, + "step": 12729 + }, + { + "epoch": 7.179921037789058, + "grad_norm": 1.0932233333587646, + "learning_rate": 1.4103214890016922e-05, + "loss": 0.8162, + "step": 12730 + }, + { + "epoch": 7.1804850535815, + "grad_norm": 1.039055347442627, + "learning_rate": 1.410039481105471e-05, + "loss": 0.7258, + "step": 12731 + }, + { + "epoch": 7.181049069373943, + "grad_norm": 0.9431255459785461, + "learning_rate": 1.40975747320925e-05, + "loss": 0.6696, + "step": 12732 + }, + { + "epoch": 7.181613085166385, + "grad_norm": 1.0648887157440186, + "learning_rate": 1.4094754653130287e-05, + "loss": 0.8136, + "step": 12733 + }, + { + "epoch": 7.182177100958826, + "grad_norm": 0.8207390308380127, + "learning_rate": 1.4091934574168078e-05, + "loss": 0.6923, + "step": 12734 + }, + { + "epoch": 7.182741116751269, + "grad_norm": 1.2208746671676636, + "learning_rate": 1.4089114495205866e-05, + "loss": 0.6772, + "step": 12735 + }, + { + "epoch": 7.183305132543711, + "grad_norm": 1.896144151687622, + "learning_rate": 1.4086294416243657e-05, + "loss": 0.8109, + "step": 12736 + }, + { + "epoch": 7.183869148336154, + "grad_norm": 1.3452203273773193, + "learning_rate": 1.4083474337281444e-05, + "loss": 0.7613, + "step": 12737 + }, + { + "epoch": 7.1844331641285955, + "grad_norm": 1.2935407161712646, + "learning_rate": 1.4080654258319234e-05, + "loss": 0.7516, + "step": 12738 + }, + { + "epoch": 7.184997179921038, + "grad_norm": 1.2681665420532227, + "learning_rate": 1.4077834179357022e-05, + "loss": 0.6755, + "step": 12739 + }, + { + "epoch": 7.18556119571348, + "grad_norm": 0.9702036380767822, + "learning_rate": 1.4075014100394812e-05, + "loss": 0.74, + "step": 12740 + }, + { + "epoch": 7.186125211505922, + "grad_norm": 1.5241252183914185, + "learning_rate": 1.4072194021432599e-05, + "loss": 0.883, + "step": 12741 + }, + { + "epoch": 7.1866892272983645, + "grad_norm": 1.0513970851898193, + "learning_rate": 1.406937394247039e-05, + "loss": 0.8183, + "step": 12742 + }, + { + "epoch": 7.187253243090806, + "grad_norm": 1.4664500951766968, + "learning_rate": 1.4066553863508178e-05, + "loss": 0.795, + "step": 12743 + }, + { + "epoch": 7.187817258883249, + "grad_norm": 1.332922101020813, + "learning_rate": 1.4063733784545969e-05, + "loss": 0.7859, + "step": 12744 + }, + { + "epoch": 7.188381274675691, + "grad_norm": 1.2444826364517212, + "learning_rate": 1.4060913705583756e-05, + "loss": 0.7676, + "step": 12745 + }, + { + "epoch": 7.1889452904681335, + "grad_norm": 1.251089334487915, + "learning_rate": 1.4058093626621547e-05, + "loss": 0.6654, + "step": 12746 + }, + { + "epoch": 7.189509306260575, + "grad_norm": 1.1850908994674683, + "learning_rate": 1.4055273547659334e-05, + "loss": 0.7055, + "step": 12747 + }, + { + "epoch": 7.190073322053017, + "grad_norm": 1.2273164987564087, + "learning_rate": 1.4052453468697124e-05, + "loss": 0.7775, + "step": 12748 + }, + { + "epoch": 7.19063733784546, + "grad_norm": 1.25630521774292, + "learning_rate": 1.4049633389734911e-05, + "loss": 0.763, + "step": 12749 + }, + { + "epoch": 7.191201353637902, + "grad_norm": 1.0099239349365234, + "learning_rate": 1.4046813310772702e-05, + "loss": 0.7391, + "step": 12750 + }, + { + "epoch": 7.191765369430344, + "grad_norm": 1.1621524095535278, + "learning_rate": 1.404399323181049e-05, + "loss": 0.7366, + "step": 12751 + }, + { + "epoch": 7.192329385222786, + "grad_norm": 1.3265228271484375, + "learning_rate": 1.4041173152848281e-05, + "loss": 0.7735, + "step": 12752 + }, + { + "epoch": 7.192893401015229, + "grad_norm": 0.9678058624267578, + "learning_rate": 1.4038353073886071e-05, + "loss": 0.7532, + "step": 12753 + }, + { + "epoch": 7.193457416807671, + "grad_norm": 1.1261624097824097, + "learning_rate": 1.4035532994923859e-05, + "loss": 0.7358, + "step": 12754 + }, + { + "epoch": 7.194021432600112, + "grad_norm": 1.3635109663009644, + "learning_rate": 1.4032712915961649e-05, + "loss": 0.606, + "step": 12755 + }, + { + "epoch": 7.194585448392555, + "grad_norm": 0.9547290205955505, + "learning_rate": 1.4029892836999436e-05, + "loss": 0.6926, + "step": 12756 + }, + { + "epoch": 7.195149464184997, + "grad_norm": 1.3446670770645142, + "learning_rate": 1.4027072758037227e-05, + "loss": 0.7315, + "step": 12757 + }, + { + "epoch": 7.19571347997744, + "grad_norm": 1.044897198677063, + "learning_rate": 1.4024252679075014e-05, + "loss": 0.8119, + "step": 12758 + }, + { + "epoch": 7.196277495769881, + "grad_norm": 0.8914640545845032, + "learning_rate": 1.4021432600112804e-05, + "loss": 0.6788, + "step": 12759 + }, + { + "epoch": 7.196841511562324, + "grad_norm": 1.2406812906265259, + "learning_rate": 1.4018612521150593e-05, + "loss": 0.6996, + "step": 12760 + }, + { + "epoch": 7.197405527354766, + "grad_norm": 1.3336269855499268, + "learning_rate": 1.4015792442188384e-05, + "loss": 0.6927, + "step": 12761 + }, + { + "epoch": 7.197969543147208, + "grad_norm": 0.9501215815544128, + "learning_rate": 1.401297236322617e-05, + "loss": 0.6736, + "step": 12762 + }, + { + "epoch": 7.19853355893965, + "grad_norm": 1.1028584241867065, + "learning_rate": 1.4010152284263961e-05, + "loss": 0.7933, + "step": 12763 + }, + { + "epoch": 7.199097574732092, + "grad_norm": 1.0575264692306519, + "learning_rate": 1.4007332205301748e-05, + "loss": 0.7453, + "step": 12764 + }, + { + "epoch": 7.199661590524535, + "grad_norm": 1.0190428495407104, + "learning_rate": 1.4004512126339539e-05, + "loss": 0.7394, + "step": 12765 + }, + { + "epoch": 7.200225606316977, + "grad_norm": 1.425191879272461, + "learning_rate": 1.4001692047377326e-05, + "loss": 0.8143, + "step": 12766 + }, + { + "epoch": 7.200789622109419, + "grad_norm": 1.8370082378387451, + "learning_rate": 1.3998871968415116e-05, + "loss": 0.8381, + "step": 12767 + }, + { + "epoch": 7.201353637901861, + "grad_norm": 1.1444491147994995, + "learning_rate": 1.3996051889452905e-05, + "loss": 0.7689, + "step": 12768 + }, + { + "epoch": 7.201917653694303, + "grad_norm": 1.2121846675872803, + "learning_rate": 1.3993231810490696e-05, + "loss": 0.6815, + "step": 12769 + }, + { + "epoch": 7.202481669486746, + "grad_norm": 1.1490103006362915, + "learning_rate": 1.3990411731528483e-05, + "loss": 0.7146, + "step": 12770 + }, + { + "epoch": 7.2030456852791875, + "grad_norm": 1.1278396844863892, + "learning_rate": 1.3987591652566273e-05, + "loss": 0.6073, + "step": 12771 + }, + { + "epoch": 7.20360970107163, + "grad_norm": 1.7832733392715454, + "learning_rate": 1.398477157360406e-05, + "loss": 0.6978, + "step": 12772 + }, + { + "epoch": 7.204173716864072, + "grad_norm": 1.0360530614852905, + "learning_rate": 1.398195149464185e-05, + "loss": 0.6404, + "step": 12773 + }, + { + "epoch": 7.204737732656515, + "grad_norm": 1.2686057090759277, + "learning_rate": 1.397913141567964e-05, + "loss": 0.7586, + "step": 12774 + }, + { + "epoch": 7.2053017484489565, + "grad_norm": 1.467653512954712, + "learning_rate": 1.3976311336717428e-05, + "loss": 0.7593, + "step": 12775 + }, + { + "epoch": 7.205865764241398, + "grad_norm": 1.1332703828811646, + "learning_rate": 1.3973491257755217e-05, + "loss": 0.8231, + "step": 12776 + }, + { + "epoch": 7.206429780033841, + "grad_norm": 0.9784940481185913, + "learning_rate": 1.3970671178793008e-05, + "loss": 0.7494, + "step": 12777 + }, + { + "epoch": 7.206993795826283, + "grad_norm": 0.9160412549972534, + "learning_rate": 1.3967851099830795e-05, + "loss": 0.7007, + "step": 12778 + }, + { + "epoch": 7.2075578116187256, + "grad_norm": 1.6200060844421387, + "learning_rate": 1.3965031020868585e-05, + "loss": 0.8398, + "step": 12779 + }, + { + "epoch": 7.208121827411167, + "grad_norm": 1.0702356100082397, + "learning_rate": 1.3962210941906372e-05, + "loss": 0.8442, + "step": 12780 + }, + { + "epoch": 7.20868584320361, + "grad_norm": 1.3440786600112915, + "learning_rate": 1.3959390862944163e-05, + "loss": 0.6412, + "step": 12781 + }, + { + "epoch": 7.209249858996052, + "grad_norm": 1.349217176437378, + "learning_rate": 1.3956570783981952e-05, + "loss": 0.7379, + "step": 12782 + }, + { + "epoch": 7.209813874788494, + "grad_norm": 0.9128397703170776, + "learning_rate": 1.3953750705019742e-05, + "loss": 0.7338, + "step": 12783 + }, + { + "epoch": 7.210377890580936, + "grad_norm": 1.1185263395309448, + "learning_rate": 1.395093062605753e-05, + "loss": 0.8189, + "step": 12784 + }, + { + "epoch": 7.210941906373378, + "grad_norm": 0.9962294101715088, + "learning_rate": 1.394811054709532e-05, + "loss": 0.7135, + "step": 12785 + }, + { + "epoch": 7.211505922165821, + "grad_norm": 1.1719062328338623, + "learning_rate": 1.3945290468133107e-05, + "loss": 0.6807, + "step": 12786 + }, + { + "epoch": 7.212069937958263, + "grad_norm": 1.038213849067688, + "learning_rate": 1.3942470389170897e-05, + "loss": 0.6496, + "step": 12787 + }, + { + "epoch": 7.212633953750705, + "grad_norm": 1.331235647201538, + "learning_rate": 1.3939650310208688e-05, + "loss": 0.8337, + "step": 12788 + }, + { + "epoch": 7.213197969543147, + "grad_norm": 1.0672245025634766, + "learning_rate": 1.3936830231246475e-05, + "loss": 0.7091, + "step": 12789 + }, + { + "epoch": 7.213761985335589, + "grad_norm": 1.0591466426849365, + "learning_rate": 1.3934010152284265e-05, + "loss": 0.7195, + "step": 12790 + }, + { + "epoch": 7.214326001128032, + "grad_norm": 1.3760697841644287, + "learning_rate": 1.3931190073322054e-05, + "loss": 0.8741, + "step": 12791 + }, + { + "epoch": 7.2148900169204735, + "grad_norm": 1.28533136844635, + "learning_rate": 1.3928369994359845e-05, + "loss": 0.6941, + "step": 12792 + }, + { + "epoch": 7.215454032712916, + "grad_norm": 0.8171401619911194, + "learning_rate": 1.3925549915397632e-05, + "loss": 0.6704, + "step": 12793 + }, + { + "epoch": 7.216018048505358, + "grad_norm": 1.2155076265335083, + "learning_rate": 1.3922729836435422e-05, + "loss": 0.8197, + "step": 12794 + }, + { + "epoch": 7.216582064297801, + "grad_norm": 1.1597063541412354, + "learning_rate": 1.391990975747321e-05, + "loss": 0.7611, + "step": 12795 + }, + { + "epoch": 7.2171460800902425, + "grad_norm": 1.644256591796875, + "learning_rate": 1.3917089678511e-05, + "loss": 0.838, + "step": 12796 + }, + { + "epoch": 7.217710095882684, + "grad_norm": 1.1829462051391602, + "learning_rate": 1.3914269599548787e-05, + "loss": 0.7651, + "step": 12797 + }, + { + "epoch": 7.218274111675127, + "grad_norm": 1.268670678138733, + "learning_rate": 1.3911449520586577e-05, + "loss": 0.7035, + "step": 12798 + }, + { + "epoch": 7.218838127467569, + "grad_norm": 0.9415582418441772, + "learning_rate": 1.3908629441624366e-05, + "loss": 0.704, + "step": 12799 + }, + { + "epoch": 7.2194021432600115, + "grad_norm": 1.1944586038589478, + "learning_rate": 1.3905809362662157e-05, + "loss": 0.7729, + "step": 12800 + }, + { + "epoch": 7.219966159052453, + "grad_norm": 1.2383314371109009, + "learning_rate": 1.3902989283699944e-05, + "loss": 0.6694, + "step": 12801 + }, + { + "epoch": 7.220530174844896, + "grad_norm": 0.7599455118179321, + "learning_rate": 1.3900169204737734e-05, + "loss": 0.6476, + "step": 12802 + }, + { + "epoch": 7.221094190637338, + "grad_norm": 1.054945945739746, + "learning_rate": 1.3897349125775521e-05, + "loss": 0.6515, + "step": 12803 + }, + { + "epoch": 7.22165820642978, + "grad_norm": 0.9464331865310669, + "learning_rate": 1.3894529046813312e-05, + "loss": 0.6469, + "step": 12804 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.8269473910331726, + "learning_rate": 1.3891708967851099e-05, + "loss": 0.6462, + "step": 12805 + }, + { + "epoch": 7.222786238014664, + "grad_norm": 1.241490364074707, + "learning_rate": 1.388888888888889e-05, + "loss": 0.8503, + "step": 12806 + }, + { + "epoch": 7.223350253807107, + "grad_norm": 1.1778844594955444, + "learning_rate": 1.3886068809926678e-05, + "loss": 0.7237, + "step": 12807 + }, + { + "epoch": 7.223914269599549, + "grad_norm": 1.1724286079406738, + "learning_rate": 1.3883248730964469e-05, + "loss": 0.7708, + "step": 12808 + }, + { + "epoch": 7.224478285391991, + "grad_norm": 1.0056736469268799, + "learning_rate": 1.3880428652002256e-05, + "loss": 0.6606, + "step": 12809 + }, + { + "epoch": 7.225042301184433, + "grad_norm": 1.704931378364563, + "learning_rate": 1.3877608573040046e-05, + "loss": 0.8633, + "step": 12810 + }, + { + "epoch": 7.225606316976875, + "grad_norm": 1.2625576257705688, + "learning_rate": 1.3874788494077834e-05, + "loss": 0.7262, + "step": 12811 + }, + { + "epoch": 7.226170332769318, + "grad_norm": 1.1610517501831055, + "learning_rate": 1.3871968415115624e-05, + "loss": 0.6472, + "step": 12812 + }, + { + "epoch": 7.2267343485617594, + "grad_norm": 1.2757381200790405, + "learning_rate": 1.3869148336153411e-05, + "loss": 0.8132, + "step": 12813 + }, + { + "epoch": 7.227298364354202, + "grad_norm": 1.0739467144012451, + "learning_rate": 1.3866328257191202e-05, + "loss": 0.6647, + "step": 12814 + }, + { + "epoch": 7.227862380146644, + "grad_norm": 1.6823617219924927, + "learning_rate": 1.386350817822899e-05, + "loss": 0.7369, + "step": 12815 + }, + { + "epoch": 7.228426395939087, + "grad_norm": 0.8261311650276184, + "learning_rate": 1.3860688099266781e-05, + "loss": 0.6263, + "step": 12816 + }, + { + "epoch": 7.2289904117315285, + "grad_norm": 0.9875473976135254, + "learning_rate": 1.3857868020304568e-05, + "loss": 0.6548, + "step": 12817 + }, + { + "epoch": 7.22955442752397, + "grad_norm": 1.3431278467178345, + "learning_rate": 1.3855047941342359e-05, + "loss": 0.7665, + "step": 12818 + }, + { + "epoch": 7.230118443316413, + "grad_norm": 0.9709897041320801, + "learning_rate": 1.3852227862380146e-05, + "loss": 0.6152, + "step": 12819 + }, + { + "epoch": 7.230682459108855, + "grad_norm": 1.271140456199646, + "learning_rate": 1.3849407783417936e-05, + "loss": 0.6849, + "step": 12820 + }, + { + "epoch": 7.2312464749012975, + "grad_norm": 0.9815000891685486, + "learning_rate": 1.3846587704455725e-05, + "loss": 0.7549, + "step": 12821 + }, + { + "epoch": 7.231810490693739, + "grad_norm": 1.0233222246170044, + "learning_rate": 1.3843767625493514e-05, + "loss": 0.6458, + "step": 12822 + }, + { + "epoch": 7.232374506486182, + "grad_norm": 1.1369304656982422, + "learning_rate": 1.3840947546531304e-05, + "loss": 0.7593, + "step": 12823 + }, + { + "epoch": 7.232938522278624, + "grad_norm": 1.655411720275879, + "learning_rate": 1.3838127467569093e-05, + "loss": 0.7815, + "step": 12824 + }, + { + "epoch": 7.233502538071066, + "grad_norm": 1.0084413290023804, + "learning_rate": 1.3835307388606883e-05, + "loss": 0.6518, + "step": 12825 + }, + { + "epoch": 7.234066553863508, + "grad_norm": 1.432975172996521, + "learning_rate": 1.383248730964467e-05, + "loss": 0.798, + "step": 12826 + }, + { + "epoch": 7.23463056965595, + "grad_norm": 0.8066166043281555, + "learning_rate": 1.3829667230682461e-05, + "loss": 0.604, + "step": 12827 + }, + { + "epoch": 7.235194585448393, + "grad_norm": 1.156788945198059, + "learning_rate": 1.3826847151720248e-05, + "loss": 0.7661, + "step": 12828 + }, + { + "epoch": 7.235758601240835, + "grad_norm": 1.171623945236206, + "learning_rate": 1.3824027072758039e-05, + "loss": 0.7184, + "step": 12829 + }, + { + "epoch": 7.236322617033277, + "grad_norm": 1.2534642219543457, + "learning_rate": 1.3821206993795827e-05, + "loss": 0.7813, + "step": 12830 + }, + { + "epoch": 7.236886632825719, + "grad_norm": 1.3068218231201172, + "learning_rate": 1.3818386914833616e-05, + "loss": 0.7529, + "step": 12831 + }, + { + "epoch": 7.237450648618161, + "grad_norm": 1.3093619346618652, + "learning_rate": 1.3815566835871405e-05, + "loss": 0.7404, + "step": 12832 + }, + { + "epoch": 7.238014664410604, + "grad_norm": 0.8684159517288208, + "learning_rate": 1.3812746756909196e-05, + "loss": 0.7319, + "step": 12833 + }, + { + "epoch": 7.238578680203045, + "grad_norm": 0.7644657492637634, + "learning_rate": 1.3809926677946983e-05, + "loss": 0.6117, + "step": 12834 + }, + { + "epoch": 7.239142695995488, + "grad_norm": 1.181040644645691, + "learning_rate": 1.3807106598984773e-05, + "loss": 0.6558, + "step": 12835 + }, + { + "epoch": 7.23970671178793, + "grad_norm": 1.1636250019073486, + "learning_rate": 1.380428652002256e-05, + "loss": 0.7572, + "step": 12836 + }, + { + "epoch": 7.240270727580373, + "grad_norm": 0.9466129541397095, + "learning_rate": 1.380146644106035e-05, + "loss": 0.6875, + "step": 12837 + }, + { + "epoch": 7.240834743372814, + "grad_norm": 1.2095483541488647, + "learning_rate": 1.379864636209814e-05, + "loss": 0.7683, + "step": 12838 + }, + { + "epoch": 7.241398759165256, + "grad_norm": 0.9651422500610352, + "learning_rate": 1.379582628313593e-05, + "loss": 0.7151, + "step": 12839 + }, + { + "epoch": 7.241962774957699, + "grad_norm": 1.3382840156555176, + "learning_rate": 1.3793006204173717e-05, + "loss": 0.8223, + "step": 12840 + }, + { + "epoch": 7.242526790750141, + "grad_norm": 1.03338623046875, + "learning_rate": 1.3790186125211508e-05, + "loss": 0.7442, + "step": 12841 + }, + { + "epoch": 7.243090806542583, + "grad_norm": 1.2269293069839478, + "learning_rate": 1.3787366046249295e-05, + "loss": 0.6584, + "step": 12842 + }, + { + "epoch": 7.243654822335025, + "grad_norm": 1.3081505298614502, + "learning_rate": 1.3784545967287085e-05, + "loss": 0.6796, + "step": 12843 + }, + { + "epoch": 7.244218838127468, + "grad_norm": 1.353606104850769, + "learning_rate": 1.3781725888324872e-05, + "loss": 0.81, + "step": 12844 + }, + { + "epoch": 7.24478285391991, + "grad_norm": 1.1820824146270752, + "learning_rate": 1.3778905809362663e-05, + "loss": 0.7022, + "step": 12845 + }, + { + "epoch": 7.2453468697123515, + "grad_norm": 1.2430474758148193, + "learning_rate": 1.3776085730400452e-05, + "loss": 0.7539, + "step": 12846 + }, + { + "epoch": 7.245910885504794, + "grad_norm": 0.930336594581604, + "learning_rate": 1.3773265651438242e-05, + "loss": 0.7271, + "step": 12847 + }, + { + "epoch": 7.246474901297236, + "grad_norm": 1.3661794662475586, + "learning_rate": 1.377044557247603e-05, + "loss": 0.7688, + "step": 12848 + }, + { + "epoch": 7.247038917089679, + "grad_norm": 1.213212013244629, + "learning_rate": 1.376762549351382e-05, + "loss": 0.7227, + "step": 12849 + }, + { + "epoch": 7.2476029328821205, + "grad_norm": 1.2047210931777954, + "learning_rate": 1.3764805414551607e-05, + "loss": 0.7488, + "step": 12850 + }, + { + "epoch": 7.248166948674563, + "grad_norm": 1.1765846014022827, + "learning_rate": 1.3761985335589397e-05, + "loss": 0.7255, + "step": 12851 + }, + { + "epoch": 7.248730964467005, + "grad_norm": 1.0786503553390503, + "learning_rate": 1.3759165256627184e-05, + "loss": 0.7751, + "step": 12852 + }, + { + "epoch": 7.249294980259447, + "grad_norm": 1.0083400011062622, + "learning_rate": 1.3756345177664975e-05, + "loss": 0.6747, + "step": 12853 + }, + { + "epoch": 7.2498589960518895, + "grad_norm": 1.0228312015533447, + "learning_rate": 1.3753525098702764e-05, + "loss": 0.7245, + "step": 12854 + }, + { + "epoch": 7.250423011844331, + "grad_norm": 3.321502208709717, + "learning_rate": 1.3750705019740554e-05, + "loss": 0.7237, + "step": 12855 + }, + { + "epoch": 7.250987027636774, + "grad_norm": 1.1596121788024902, + "learning_rate": 1.3747884940778341e-05, + "loss": 0.7579, + "step": 12856 + }, + { + "epoch": 7.251551043429216, + "grad_norm": 1.0391615629196167, + "learning_rate": 1.3745064861816132e-05, + "loss": 0.7477, + "step": 12857 + }, + { + "epoch": 7.2521150592216586, + "grad_norm": 1.2783921957015991, + "learning_rate": 1.3742244782853922e-05, + "loss": 0.7275, + "step": 12858 + }, + { + "epoch": 7.2526790750141, + "grad_norm": 1.0155233144760132, + "learning_rate": 1.373942470389171e-05, + "loss": 0.7611, + "step": 12859 + }, + { + "epoch": 7.253243090806542, + "grad_norm": 0.9329730868339539, + "learning_rate": 1.37366046249295e-05, + "loss": 0.8082, + "step": 12860 + }, + { + "epoch": 7.253807106598985, + "grad_norm": 1.4760785102844238, + "learning_rate": 1.3733784545967287e-05, + "loss": 0.7752, + "step": 12861 + }, + { + "epoch": 7.254371122391427, + "grad_norm": 1.200742483139038, + "learning_rate": 1.3730964467005077e-05, + "loss": 0.7473, + "step": 12862 + }, + { + "epoch": 7.254935138183869, + "grad_norm": 1.2568106651306152, + "learning_rate": 1.3728144388042866e-05, + "loss": 0.8864, + "step": 12863 + }, + { + "epoch": 7.255499153976311, + "grad_norm": 0.9832430481910706, + "learning_rate": 1.3725324309080657e-05, + "loss": 0.6853, + "step": 12864 + }, + { + "epoch": 7.256063169768754, + "grad_norm": 1.2159067392349243, + "learning_rate": 1.3722504230118444e-05, + "loss": 0.7633, + "step": 12865 + }, + { + "epoch": 7.256627185561196, + "grad_norm": 1.0031987428665161, + "learning_rate": 1.3719684151156234e-05, + "loss": 0.6691, + "step": 12866 + }, + { + "epoch": 7.2571912013536375, + "grad_norm": 1.2634044885635376, + "learning_rate": 1.3716864072194021e-05, + "loss": 0.6634, + "step": 12867 + }, + { + "epoch": 7.25775521714608, + "grad_norm": 1.1051123142242432, + "learning_rate": 1.3714043993231812e-05, + "loss": 0.6589, + "step": 12868 + }, + { + "epoch": 7.258319232938522, + "grad_norm": 0.8843507766723633, + "learning_rate": 1.3711223914269599e-05, + "loss": 0.6929, + "step": 12869 + }, + { + "epoch": 7.258883248730965, + "grad_norm": 1.2374855279922485, + "learning_rate": 1.370840383530739e-05, + "loss": 0.7416, + "step": 12870 + }, + { + "epoch": 7.2594472645234065, + "grad_norm": 1.068827509880066, + "learning_rate": 1.3705583756345178e-05, + "loss": 0.7217, + "step": 12871 + }, + { + "epoch": 7.260011280315849, + "grad_norm": 1.1910480260849, + "learning_rate": 1.3702763677382969e-05, + "loss": 0.7322, + "step": 12872 + }, + { + "epoch": 7.260575296108291, + "grad_norm": 1.501662254333496, + "learning_rate": 1.3699943598420756e-05, + "loss": 0.808, + "step": 12873 + }, + { + "epoch": 7.261139311900733, + "grad_norm": 1.134765625, + "learning_rate": 1.3697123519458546e-05, + "loss": 0.7508, + "step": 12874 + }, + { + "epoch": 7.2617033276931755, + "grad_norm": 1.141241431236267, + "learning_rate": 1.3694303440496333e-05, + "loss": 0.725, + "step": 12875 + }, + { + "epoch": 7.262267343485617, + "grad_norm": 1.4585965871810913, + "learning_rate": 1.3691483361534124e-05, + "loss": 0.8002, + "step": 12876 + }, + { + "epoch": 7.26283135927806, + "grad_norm": 1.3375070095062256, + "learning_rate": 1.3688663282571911e-05, + "loss": 0.7008, + "step": 12877 + }, + { + "epoch": 7.263395375070502, + "grad_norm": 1.1142858266830444, + "learning_rate": 1.3685843203609702e-05, + "loss": 0.7453, + "step": 12878 + }, + { + "epoch": 7.2639593908629445, + "grad_norm": 1.9373037815093994, + "learning_rate": 1.368302312464749e-05, + "loss": 0.811, + "step": 12879 + }, + { + "epoch": 7.264523406655386, + "grad_norm": 1.066938042640686, + "learning_rate": 1.368020304568528e-05, + "loss": 0.7797, + "step": 12880 + }, + { + "epoch": 7.265087422447828, + "grad_norm": 0.9015576839447021, + "learning_rate": 1.3677382966723068e-05, + "loss": 0.7256, + "step": 12881 + }, + { + "epoch": 7.265651438240271, + "grad_norm": 1.297222375869751, + "learning_rate": 1.3674562887760858e-05, + "loss": 0.6491, + "step": 12882 + }, + { + "epoch": 7.266215454032713, + "grad_norm": 1.5587122440338135, + "learning_rate": 1.3671742808798646e-05, + "loss": 0.7815, + "step": 12883 + }, + { + "epoch": 7.266779469825155, + "grad_norm": 0.9139974117279053, + "learning_rate": 1.3668922729836436e-05, + "loss": 0.7429, + "step": 12884 + }, + { + "epoch": 7.267343485617597, + "grad_norm": 1.0931682586669922, + "learning_rate": 1.3666102650874225e-05, + "loss": 0.6048, + "step": 12885 + }, + { + "epoch": 7.26790750141004, + "grad_norm": 0.8316499590873718, + "learning_rate": 1.3663282571912014e-05, + "loss": 0.6948, + "step": 12886 + }, + { + "epoch": 7.268471517202482, + "grad_norm": 1.1159683465957642, + "learning_rate": 1.3660462492949802e-05, + "loss": 0.7868, + "step": 12887 + }, + { + "epoch": 7.269035532994923, + "grad_norm": 1.2674044370651245, + "learning_rate": 1.3657642413987593e-05, + "loss": 0.7355, + "step": 12888 + }, + { + "epoch": 7.269599548787366, + "grad_norm": 1.5052582025527954, + "learning_rate": 1.365482233502538e-05, + "loss": 0.7428, + "step": 12889 + }, + { + "epoch": 7.270163564579808, + "grad_norm": 0.953239381313324, + "learning_rate": 1.365200225606317e-05, + "loss": 0.7361, + "step": 12890 + }, + { + "epoch": 7.270727580372251, + "grad_norm": 1.0636948347091675, + "learning_rate": 1.3649182177100958e-05, + "loss": 0.8299, + "step": 12891 + }, + { + "epoch": 7.2712915961646925, + "grad_norm": 1.2508398294448853, + "learning_rate": 1.3646362098138748e-05, + "loss": 0.7571, + "step": 12892 + }, + { + "epoch": 7.271855611957135, + "grad_norm": 1.0599108934402466, + "learning_rate": 1.3643542019176539e-05, + "loss": 0.7405, + "step": 12893 + }, + { + "epoch": 7.272419627749577, + "grad_norm": 1.2770776748657227, + "learning_rate": 1.3640721940214327e-05, + "loss": 0.7561, + "step": 12894 + }, + { + "epoch": 7.272983643542019, + "grad_norm": 1.9113316535949707, + "learning_rate": 1.3637901861252116e-05, + "loss": 0.9519, + "step": 12895 + }, + { + "epoch": 7.2735476593344615, + "grad_norm": 1.246008276939392, + "learning_rate": 1.3635081782289905e-05, + "loss": 0.7453, + "step": 12896 + }, + { + "epoch": 7.274111675126903, + "grad_norm": 1.0447384119033813, + "learning_rate": 1.3632261703327695e-05, + "loss": 0.7283, + "step": 12897 + }, + { + "epoch": 7.274675690919346, + "grad_norm": 1.4999542236328125, + "learning_rate": 1.3629441624365483e-05, + "loss": 0.8128, + "step": 12898 + }, + { + "epoch": 7.275239706711788, + "grad_norm": 2.858595848083496, + "learning_rate": 1.3626621545403273e-05, + "loss": 0.6779, + "step": 12899 + }, + { + "epoch": 7.2758037225042305, + "grad_norm": 0.9866513013839722, + "learning_rate": 1.362380146644106e-05, + "loss": 0.6535, + "step": 12900 + }, + { + "epoch": 7.276367738296672, + "grad_norm": 1.0323022603988647, + "learning_rate": 1.362098138747885e-05, + "loss": 0.6925, + "step": 12901 + }, + { + "epoch": 7.276931754089114, + "grad_norm": 1.1696239709854126, + "learning_rate": 1.361816130851664e-05, + "loss": 0.6592, + "step": 12902 + }, + { + "epoch": 7.277495769881557, + "grad_norm": 1.0865702629089355, + "learning_rate": 1.361534122955443e-05, + "loss": 0.7842, + "step": 12903 + }, + { + "epoch": 7.278059785673999, + "grad_norm": 0.9431591629981995, + "learning_rate": 1.3612521150592217e-05, + "loss": 0.8296, + "step": 12904 + }, + { + "epoch": 7.278623801466441, + "grad_norm": 1.1291007995605469, + "learning_rate": 1.3609701071630008e-05, + "loss": 0.8487, + "step": 12905 + }, + { + "epoch": 7.279187817258883, + "grad_norm": 0.9359367489814758, + "learning_rate": 1.3606880992667795e-05, + "loss": 0.581, + "step": 12906 + }, + { + "epoch": 7.279751833051326, + "grad_norm": 1.3543709516525269, + "learning_rate": 1.3604060913705585e-05, + "loss": 0.7069, + "step": 12907 + }, + { + "epoch": 7.280315848843768, + "grad_norm": 1.112746238708496, + "learning_rate": 1.3601240834743372e-05, + "loss": 0.8043, + "step": 12908 + }, + { + "epoch": 7.280879864636209, + "grad_norm": 0.9643915295600891, + "learning_rate": 1.3598420755781163e-05, + "loss": 0.7228, + "step": 12909 + }, + { + "epoch": 7.281443880428652, + "grad_norm": 0.9304347634315491, + "learning_rate": 1.3595600676818951e-05, + "loss": 0.728, + "step": 12910 + }, + { + "epoch": 7.282007896221094, + "grad_norm": 1.092449426651001, + "learning_rate": 1.3592780597856742e-05, + "loss": 0.7322, + "step": 12911 + }, + { + "epoch": 7.282571912013537, + "grad_norm": 1.0412907600402832, + "learning_rate": 1.3589960518894529e-05, + "loss": 0.7045, + "step": 12912 + }, + { + "epoch": 7.283135927805978, + "grad_norm": 1.0284243822097778, + "learning_rate": 1.358714043993232e-05, + "loss": 0.7735, + "step": 12913 + }, + { + "epoch": 7.283699943598421, + "grad_norm": 1.4327999353408813, + "learning_rate": 1.3584320360970107e-05, + "loss": 0.7556, + "step": 12914 + }, + { + "epoch": 7.284263959390863, + "grad_norm": 0.9387492537498474, + "learning_rate": 1.3581500282007897e-05, + "loss": 0.6289, + "step": 12915 + }, + { + "epoch": 7.284827975183305, + "grad_norm": 0.8996789455413818, + "learning_rate": 1.3578680203045684e-05, + "loss": 0.6821, + "step": 12916 + }, + { + "epoch": 7.285391990975747, + "grad_norm": 1.0676565170288086, + "learning_rate": 1.3575860124083475e-05, + "loss": 0.7647, + "step": 12917 + }, + { + "epoch": 7.285956006768189, + "grad_norm": 1.3899372816085815, + "learning_rate": 1.3573040045121264e-05, + "loss": 0.8849, + "step": 12918 + }, + { + "epoch": 7.286520022560632, + "grad_norm": 0.9231827259063721, + "learning_rate": 1.3570219966159054e-05, + "loss": 0.6507, + "step": 12919 + }, + { + "epoch": 7.287084038353074, + "grad_norm": 1.3501487970352173, + "learning_rate": 1.3567399887196841e-05, + "loss": 0.8134, + "step": 12920 + }, + { + "epoch": 7.287648054145516, + "grad_norm": 1.1659247875213623, + "learning_rate": 1.3564579808234632e-05, + "loss": 0.7374, + "step": 12921 + }, + { + "epoch": 7.288212069937958, + "grad_norm": 1.324929118156433, + "learning_rate": 1.3561759729272419e-05, + "loss": 0.8222, + "step": 12922 + }, + { + "epoch": 7.288776085730401, + "grad_norm": 1.3988882303237915, + "learning_rate": 1.355893965031021e-05, + "loss": 0.7536, + "step": 12923 + }, + { + "epoch": 7.289340101522843, + "grad_norm": 1.2552098035812378, + "learning_rate": 1.3556119571347996e-05, + "loss": 0.7721, + "step": 12924 + }, + { + "epoch": 7.2899041173152845, + "grad_norm": 1.0636528730392456, + "learning_rate": 1.3553299492385787e-05, + "loss": 0.7545, + "step": 12925 + }, + { + "epoch": 7.290468133107727, + "grad_norm": 1.0278080701828003, + "learning_rate": 1.3550479413423576e-05, + "loss": 0.7031, + "step": 12926 + }, + { + "epoch": 7.291032148900169, + "grad_norm": 1.0674768686294556, + "learning_rate": 1.3547659334461366e-05, + "loss": 0.6738, + "step": 12927 + }, + { + "epoch": 7.291596164692612, + "grad_norm": 1.0209589004516602, + "learning_rate": 1.3544839255499157e-05, + "loss": 0.6239, + "step": 12928 + }, + { + "epoch": 7.2921601804850535, + "grad_norm": 1.072161316871643, + "learning_rate": 1.3542019176536944e-05, + "loss": 0.7005, + "step": 12929 + }, + { + "epoch": 7.292724196277495, + "grad_norm": 1.1728277206420898, + "learning_rate": 1.3539199097574734e-05, + "loss": 0.8384, + "step": 12930 + }, + { + "epoch": 7.293288212069938, + "grad_norm": 1.3893349170684814, + "learning_rate": 1.3536379018612521e-05, + "loss": 0.7456, + "step": 12931 + }, + { + "epoch": 7.29385222786238, + "grad_norm": 0.9923695921897888, + "learning_rate": 1.3533558939650312e-05, + "loss": 0.6877, + "step": 12932 + }, + { + "epoch": 7.2944162436548226, + "grad_norm": 1.0548471212387085, + "learning_rate": 1.3530738860688099e-05, + "loss": 0.7164, + "step": 12933 + }, + { + "epoch": 7.294980259447264, + "grad_norm": 1.5754328966140747, + "learning_rate": 1.352791878172589e-05, + "loss": 0.7975, + "step": 12934 + }, + { + "epoch": 7.295544275239707, + "grad_norm": 1.2222615480422974, + "learning_rate": 1.3525098702763678e-05, + "loss": 0.6529, + "step": 12935 + }, + { + "epoch": 7.296108291032149, + "grad_norm": 1.0843242406845093, + "learning_rate": 1.3522278623801469e-05, + "loss": 0.6805, + "step": 12936 + }, + { + "epoch": 7.296672306824592, + "grad_norm": 0.9974583983421326, + "learning_rate": 1.3519458544839256e-05, + "loss": 0.7188, + "step": 12937 + }, + { + "epoch": 7.297236322617033, + "grad_norm": 1.1371639966964722, + "learning_rate": 1.3516638465877046e-05, + "loss": 0.7664, + "step": 12938 + }, + { + "epoch": 7.297800338409475, + "grad_norm": 1.5233837366104126, + "learning_rate": 1.3513818386914833e-05, + "loss": 0.8211, + "step": 12939 + }, + { + "epoch": 7.298364354201918, + "grad_norm": 1.3870248794555664, + "learning_rate": 1.3510998307952624e-05, + "loss": 0.8139, + "step": 12940 + }, + { + "epoch": 7.29892836999436, + "grad_norm": 0.8481972217559814, + "learning_rate": 1.3508178228990413e-05, + "loss": 0.7145, + "step": 12941 + }, + { + "epoch": 7.299492385786802, + "grad_norm": 1.0248503684997559, + "learning_rate": 1.3505358150028201e-05, + "loss": 0.6608, + "step": 12942 + }, + { + "epoch": 7.300056401579244, + "grad_norm": 1.3405576944351196, + "learning_rate": 1.350253807106599e-05, + "loss": 0.7945, + "step": 12943 + }, + { + "epoch": 7.300620417371686, + "grad_norm": 1.174859881401062, + "learning_rate": 1.349971799210378e-05, + "loss": 0.7052, + "step": 12944 + }, + { + "epoch": 7.301184433164129, + "grad_norm": 1.2788594961166382, + "learning_rate": 1.3496897913141568e-05, + "loss": 0.7962, + "step": 12945 + }, + { + "epoch": 7.3017484489565705, + "grad_norm": 1.0888457298278809, + "learning_rate": 1.3494077834179358e-05, + "loss": 0.798, + "step": 12946 + }, + { + "epoch": 7.302312464749013, + "grad_norm": 1.0917335748672485, + "learning_rate": 1.3491257755217145e-05, + "loss": 0.6169, + "step": 12947 + }, + { + "epoch": 7.302876480541455, + "grad_norm": 1.1274341344833374, + "learning_rate": 1.3488437676254936e-05, + "loss": 0.7108, + "step": 12948 + }, + { + "epoch": 7.303440496333898, + "grad_norm": 1.1963114738464355, + "learning_rate": 1.3485617597292725e-05, + "loss": 0.7323, + "step": 12949 + }, + { + "epoch": 7.3040045121263395, + "grad_norm": 0.9453120231628418, + "learning_rate": 1.3482797518330515e-05, + "loss": 0.6668, + "step": 12950 + }, + { + "epoch": 7.304568527918782, + "grad_norm": 0.936318576335907, + "learning_rate": 1.3479977439368302e-05, + "loss": 0.7458, + "step": 12951 + }, + { + "epoch": 7.305132543711224, + "grad_norm": 0.9924915432929993, + "learning_rate": 1.3477157360406093e-05, + "loss": 0.8142, + "step": 12952 + }, + { + "epoch": 7.305696559503666, + "grad_norm": 1.1340770721435547, + "learning_rate": 1.347433728144388e-05, + "loss": 0.7394, + "step": 12953 + }, + { + "epoch": 7.3062605752961085, + "grad_norm": 1.1677052974700928, + "learning_rate": 1.347151720248167e-05, + "loss": 0.7969, + "step": 12954 + }, + { + "epoch": 7.30682459108855, + "grad_norm": 1.1553044319152832, + "learning_rate": 1.3468697123519458e-05, + "loss": 0.6952, + "step": 12955 + }, + { + "epoch": 7.307388606880993, + "grad_norm": 1.4482874870300293, + "learning_rate": 1.3465877044557248e-05, + "loss": 0.898, + "step": 12956 + }, + { + "epoch": 7.307952622673435, + "grad_norm": 1.3878448009490967, + "learning_rate": 1.3463056965595037e-05, + "loss": 0.7618, + "step": 12957 + }, + { + "epoch": 7.308516638465877, + "grad_norm": 1.4187681674957275, + "learning_rate": 1.3460236886632827e-05, + "loss": 0.7946, + "step": 12958 + }, + { + "epoch": 7.309080654258319, + "grad_norm": 1.1964025497436523, + "learning_rate": 1.3457416807670614e-05, + "loss": 0.7757, + "step": 12959 + }, + { + "epoch": 7.309644670050761, + "grad_norm": 1.051560401916504, + "learning_rate": 1.3454596728708405e-05, + "loss": 0.5999, + "step": 12960 + }, + { + "epoch": 7.310208685843204, + "grad_norm": 1.5216559171676636, + "learning_rate": 1.3451776649746192e-05, + "loss": 0.7371, + "step": 12961 + }, + { + "epoch": 7.310772701635646, + "grad_norm": 1.0727596282958984, + "learning_rate": 1.3448956570783982e-05, + "loss": 0.6177, + "step": 12962 + }, + { + "epoch": 7.311336717428088, + "grad_norm": 0.8787218928337097, + "learning_rate": 1.3446136491821773e-05, + "loss": 0.6961, + "step": 12963 + }, + { + "epoch": 7.31190073322053, + "grad_norm": 1.0368813276290894, + "learning_rate": 1.344331641285956e-05, + "loss": 0.6608, + "step": 12964 + }, + { + "epoch": 7.312464749012973, + "grad_norm": 1.207017421722412, + "learning_rate": 1.344049633389735e-05, + "loss": 0.7393, + "step": 12965 + }, + { + "epoch": 7.313028764805415, + "grad_norm": 1.1123614311218262, + "learning_rate": 1.343767625493514e-05, + "loss": 0.7708, + "step": 12966 + }, + { + "epoch": 7.3135927805978564, + "grad_norm": 1.1887660026550293, + "learning_rate": 1.343485617597293e-05, + "loss": 0.8372, + "step": 12967 + }, + { + "epoch": 7.314156796390299, + "grad_norm": 1.1709684133529663, + "learning_rate": 1.3432036097010717e-05, + "loss": 0.7459, + "step": 12968 + }, + { + "epoch": 7.314720812182741, + "grad_norm": 0.8759309649467468, + "learning_rate": 1.3429216018048507e-05, + "loss": 0.7389, + "step": 12969 + }, + { + "epoch": 7.315284827975184, + "grad_norm": 0.8766718506813049, + "learning_rate": 1.3426395939086295e-05, + "loss": 0.6975, + "step": 12970 + }, + { + "epoch": 7.3158488437676255, + "grad_norm": 1.098523736000061, + "learning_rate": 1.3423575860124085e-05, + "loss": 0.7283, + "step": 12971 + }, + { + "epoch": 7.316412859560067, + "grad_norm": 1.0742367506027222, + "learning_rate": 1.3420755781161872e-05, + "loss": 0.6736, + "step": 12972 + }, + { + "epoch": 7.31697687535251, + "grad_norm": 1.2199561595916748, + "learning_rate": 1.3417935702199663e-05, + "loss": 0.7498, + "step": 12973 + }, + { + "epoch": 7.317540891144952, + "grad_norm": 1.2119076251983643, + "learning_rate": 1.3415115623237451e-05, + "loss": 0.78, + "step": 12974 + }, + { + "epoch": 7.3181049069373945, + "grad_norm": 1.0273728370666504, + "learning_rate": 1.3412295544275242e-05, + "loss": 0.8158, + "step": 12975 + }, + { + "epoch": 7.318668922729836, + "grad_norm": 1.0364826917648315, + "learning_rate": 1.3409475465313029e-05, + "loss": 0.6902, + "step": 12976 + }, + { + "epoch": 7.319232938522279, + "grad_norm": 0.9087433218955994, + "learning_rate": 1.340665538635082e-05, + "loss": 0.622, + "step": 12977 + }, + { + "epoch": 7.319796954314721, + "grad_norm": 1.2556006908416748, + "learning_rate": 1.3403835307388607e-05, + "loss": 0.8234, + "step": 12978 + }, + { + "epoch": 7.3203609701071635, + "grad_norm": 1.011400818824768, + "learning_rate": 1.3401015228426397e-05, + "loss": 0.7519, + "step": 12979 + }, + { + "epoch": 7.320924985899605, + "grad_norm": 1.379001259803772, + "learning_rate": 1.3398195149464184e-05, + "loss": 0.7994, + "step": 12980 + }, + { + "epoch": 7.321489001692047, + "grad_norm": 0.9507874250411987, + "learning_rate": 1.3395375070501975e-05, + "loss": 0.7144, + "step": 12981 + }, + { + "epoch": 7.32205301748449, + "grad_norm": 1.6055655479431152, + "learning_rate": 1.3392554991539763e-05, + "loss": 0.7273, + "step": 12982 + }, + { + "epoch": 7.322617033276932, + "grad_norm": 1.1631133556365967, + "learning_rate": 1.3389734912577554e-05, + "loss": 0.7681, + "step": 12983 + }, + { + "epoch": 7.323181049069374, + "grad_norm": 1.780295968055725, + "learning_rate": 1.3386914833615341e-05, + "loss": 0.6828, + "step": 12984 + }, + { + "epoch": 7.323745064861816, + "grad_norm": 1.3178176879882812, + "learning_rate": 1.3384094754653132e-05, + "loss": 0.7623, + "step": 12985 + }, + { + "epoch": 7.324309080654258, + "grad_norm": 0.9748982787132263, + "learning_rate": 1.3381274675690919e-05, + "loss": 0.6884, + "step": 12986 + }, + { + "epoch": 7.324873096446701, + "grad_norm": 1.3586207628250122, + "learning_rate": 1.337845459672871e-05, + "loss": 0.7382, + "step": 12987 + }, + { + "epoch": 7.325437112239142, + "grad_norm": 0.8994705677032471, + "learning_rate": 1.3375634517766498e-05, + "loss": 0.6396, + "step": 12988 + }, + { + "epoch": 7.326001128031585, + "grad_norm": 1.0928773880004883, + "learning_rate": 1.3372814438804287e-05, + "loss": 0.6056, + "step": 12989 + }, + { + "epoch": 7.326565143824027, + "grad_norm": 0.8605398535728455, + "learning_rate": 1.3369994359842076e-05, + "loss": 0.661, + "step": 12990 + }, + { + "epoch": 7.32712915961647, + "grad_norm": 0.9672868847846985, + "learning_rate": 1.3367174280879866e-05, + "loss": 0.683, + "step": 12991 + }, + { + "epoch": 7.327693175408911, + "grad_norm": 1.4099615812301636, + "learning_rate": 1.3364354201917653e-05, + "loss": 0.7833, + "step": 12992 + }, + { + "epoch": 7.328257191201354, + "grad_norm": 1.3904438018798828, + "learning_rate": 1.3361534122955444e-05, + "loss": 0.7129, + "step": 12993 + }, + { + "epoch": 7.328821206993796, + "grad_norm": 1.2430990934371948, + "learning_rate": 1.335871404399323e-05, + "loss": 0.7773, + "step": 12994 + }, + { + "epoch": 7.329385222786238, + "grad_norm": 0.9305856823921204, + "learning_rate": 1.3355893965031021e-05, + "loss": 0.6747, + "step": 12995 + }, + { + "epoch": 7.32994923857868, + "grad_norm": 1.243175983428955, + "learning_rate": 1.335307388606881e-05, + "loss": 0.7161, + "step": 12996 + }, + { + "epoch": 7.330513254371122, + "grad_norm": 0.9397953152656555, + "learning_rate": 1.33502538071066e-05, + "loss": 0.8063, + "step": 12997 + }, + { + "epoch": 7.331077270163565, + "grad_norm": 0.9048129916191101, + "learning_rate": 1.3347433728144388e-05, + "loss": 0.6143, + "step": 12998 + }, + { + "epoch": 7.331641285956007, + "grad_norm": 1.1825493574142456, + "learning_rate": 1.3344613649182178e-05, + "loss": 0.6768, + "step": 12999 + }, + { + "epoch": 7.3322053017484485, + "grad_norm": 1.129331350326538, + "learning_rate": 1.3341793570219969e-05, + "loss": 0.7042, + "step": 13000 + }, + { + "epoch": 7.332769317540891, + "grad_norm": 1.3752912282943726, + "learning_rate": 1.3338973491257756e-05, + "loss": 0.7414, + "step": 13001 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 1.0280210971832275, + "learning_rate": 1.3336153412295546e-05, + "loss": 0.7882, + "step": 13002 + }, + { + "epoch": 7.333897349125776, + "grad_norm": 1.517985463142395, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.7915, + "step": 13003 + }, + { + "epoch": 7.3344613649182175, + "grad_norm": 0.8667292594909668, + "learning_rate": 1.3330513254371124e-05, + "loss": 0.712, + "step": 13004 + }, + { + "epoch": 7.33502538071066, + "grad_norm": 1.5067250728607178, + "learning_rate": 1.3327693175408913e-05, + "loss": 0.668, + "step": 13005 + }, + { + "epoch": 7.335589396503102, + "grad_norm": 1.242521047592163, + "learning_rate": 1.3324873096446703e-05, + "loss": 0.7337, + "step": 13006 + }, + { + "epoch": 7.336153412295545, + "grad_norm": 1.0968477725982666, + "learning_rate": 1.332205301748449e-05, + "loss": 0.866, + "step": 13007 + }, + { + "epoch": 7.3367174280879865, + "grad_norm": 1.2365602254867554, + "learning_rate": 1.331923293852228e-05, + "loss": 0.6885, + "step": 13008 + }, + { + "epoch": 7.337281443880428, + "grad_norm": 1.0079103708267212, + "learning_rate": 1.3316412859560068e-05, + "loss": 0.6957, + "step": 13009 + }, + { + "epoch": 7.337845459672871, + "grad_norm": 1.106360912322998, + "learning_rate": 1.3313592780597858e-05, + "loss": 0.749, + "step": 13010 + }, + { + "epoch": 7.338409475465313, + "grad_norm": 0.9532142281532288, + "learning_rate": 1.3310772701635645e-05, + "loss": 0.7256, + "step": 13011 + }, + { + "epoch": 7.3389734912577556, + "grad_norm": 1.1117945909500122, + "learning_rate": 1.3307952622673436e-05, + "loss": 0.6997, + "step": 13012 + }, + { + "epoch": 7.339537507050197, + "grad_norm": 0.8109918236732483, + "learning_rate": 1.3305132543711225e-05, + "loss": 0.6409, + "step": 13013 + }, + { + "epoch": 7.340101522842639, + "grad_norm": 1.0641767978668213, + "learning_rate": 1.3302312464749015e-05, + "loss": 0.7106, + "step": 13014 + }, + { + "epoch": 7.340665538635082, + "grad_norm": 0.8740127086639404, + "learning_rate": 1.3299492385786802e-05, + "loss": 0.7357, + "step": 13015 + }, + { + "epoch": 7.341229554427524, + "grad_norm": 1.216629981994629, + "learning_rate": 1.3296672306824593e-05, + "loss": 0.8241, + "step": 13016 + }, + { + "epoch": 7.341793570219966, + "grad_norm": 1.130780577659607, + "learning_rate": 1.329385222786238e-05, + "loss": 0.7542, + "step": 13017 + }, + { + "epoch": 7.342357586012408, + "grad_norm": 1.0169247388839722, + "learning_rate": 1.329103214890017e-05, + "loss": 0.7427, + "step": 13018 + }, + { + "epoch": 7.342921601804851, + "grad_norm": 1.3268609046936035, + "learning_rate": 1.3288212069937957e-05, + "loss": 0.7627, + "step": 13019 + }, + { + "epoch": 7.343485617597293, + "grad_norm": 0.8272603154182434, + "learning_rate": 1.3285391990975748e-05, + "loss": 0.512, + "step": 13020 + }, + { + "epoch": 7.344049633389735, + "grad_norm": 1.173985481262207, + "learning_rate": 1.3282571912013537e-05, + "loss": 0.8243, + "step": 13021 + }, + { + "epoch": 7.344613649182177, + "grad_norm": 1.0786441564559937, + "learning_rate": 1.3279751833051327e-05, + "loss": 0.7589, + "step": 13022 + }, + { + "epoch": 7.345177664974619, + "grad_norm": 1.1456540822982788, + "learning_rate": 1.3276931754089114e-05, + "loss": 0.7233, + "step": 13023 + }, + { + "epoch": 7.345741680767062, + "grad_norm": 1.1931310892105103, + "learning_rate": 1.3274111675126905e-05, + "loss": 0.6398, + "step": 13024 + }, + { + "epoch": 7.3463056965595035, + "grad_norm": 1.2091697454452515, + "learning_rate": 1.3271291596164692e-05, + "loss": 0.6054, + "step": 13025 + }, + { + "epoch": 7.346869712351946, + "grad_norm": 1.0844422578811646, + "learning_rate": 1.3268471517202482e-05, + "loss": 0.7142, + "step": 13026 + }, + { + "epoch": 7.347433728144388, + "grad_norm": 1.1275960206985474, + "learning_rate": 1.326565143824027e-05, + "loss": 0.8443, + "step": 13027 + }, + { + "epoch": 7.34799774393683, + "grad_norm": 1.4539401531219482, + "learning_rate": 1.326283135927806e-05, + "loss": 0.8529, + "step": 13028 + }, + { + "epoch": 7.3485617597292725, + "grad_norm": 1.3150016069412231, + "learning_rate": 1.3260011280315849e-05, + "loss": 0.7871, + "step": 13029 + }, + { + "epoch": 7.349125775521714, + "grad_norm": 1.0082114934921265, + "learning_rate": 1.325719120135364e-05, + "loss": 0.7052, + "step": 13030 + }, + { + "epoch": 7.349689791314157, + "grad_norm": 1.0374116897583008, + "learning_rate": 1.3254371122391426e-05, + "loss": 0.7015, + "step": 13031 + }, + { + "epoch": 7.350253807106599, + "grad_norm": 0.6842848062515259, + "learning_rate": 1.3251551043429217e-05, + "loss": 0.5954, + "step": 13032 + }, + { + "epoch": 7.3508178228990415, + "grad_norm": 1.1316825151443481, + "learning_rate": 1.3248730964467004e-05, + "loss": 0.7846, + "step": 13033 + }, + { + "epoch": 7.351381838691483, + "grad_norm": 1.1517306566238403, + "learning_rate": 1.3245910885504794e-05, + "loss": 0.6646, + "step": 13034 + }, + { + "epoch": 7.351945854483926, + "grad_norm": 1.0459662675857544, + "learning_rate": 1.3243090806542585e-05, + "loss": 0.7572, + "step": 13035 + }, + { + "epoch": 7.352509870276368, + "grad_norm": 1.106874942779541, + "learning_rate": 1.3240270727580372e-05, + "loss": 0.7405, + "step": 13036 + }, + { + "epoch": 7.35307388606881, + "grad_norm": 1.2284046411514282, + "learning_rate": 1.3237450648618163e-05, + "loss": 0.8052, + "step": 13037 + }, + { + "epoch": 7.353637901861252, + "grad_norm": 1.1849085092544556, + "learning_rate": 1.3234630569655951e-05, + "loss": 0.7728, + "step": 13038 + }, + { + "epoch": 7.354201917653694, + "grad_norm": 1.0701743364334106, + "learning_rate": 1.3231810490693742e-05, + "loss": 0.6735, + "step": 13039 + }, + { + "epoch": 7.354765933446137, + "grad_norm": 1.2919056415557861, + "learning_rate": 1.3228990411731529e-05, + "loss": 0.8184, + "step": 13040 + }, + { + "epoch": 7.355329949238579, + "grad_norm": 1.0671463012695312, + "learning_rate": 1.322617033276932e-05, + "loss": 0.6693, + "step": 13041 + }, + { + "epoch": 7.35589396503102, + "grad_norm": 1.0491235256195068, + "learning_rate": 1.3223350253807107e-05, + "loss": 0.632, + "step": 13042 + }, + { + "epoch": 7.356457980823463, + "grad_norm": 1.3681936264038086, + "learning_rate": 1.3220530174844897e-05, + "loss": 0.6834, + "step": 13043 + }, + { + "epoch": 7.357021996615905, + "grad_norm": 1.058639407157898, + "learning_rate": 1.3217710095882684e-05, + "loss": 0.6967, + "step": 13044 + }, + { + "epoch": 7.357586012408348, + "grad_norm": 1.0334532260894775, + "learning_rate": 1.3214890016920475e-05, + "loss": 0.6845, + "step": 13045 + }, + { + "epoch": 7.3581500282007894, + "grad_norm": 1.2634615898132324, + "learning_rate": 1.3212069937958263e-05, + "loss": 0.7287, + "step": 13046 + }, + { + "epoch": 7.358714043993232, + "grad_norm": 0.829089343547821, + "learning_rate": 1.3209249858996054e-05, + "loss": 0.6794, + "step": 13047 + }, + { + "epoch": 7.359278059785674, + "grad_norm": 1.0015192031860352, + "learning_rate": 1.3206429780033841e-05, + "loss": 0.7145, + "step": 13048 + }, + { + "epoch": 7.359842075578117, + "grad_norm": 0.763897716999054, + "learning_rate": 1.3203609701071631e-05, + "loss": 0.6401, + "step": 13049 + }, + { + "epoch": 7.3604060913705585, + "grad_norm": 1.027462124824524, + "learning_rate": 1.3200789622109419e-05, + "loss": 0.7525, + "step": 13050 + }, + { + "epoch": 7.360970107163, + "grad_norm": 0.9552854895591736, + "learning_rate": 1.3197969543147209e-05, + "loss": 0.6877, + "step": 13051 + }, + { + "epoch": 7.361534122955443, + "grad_norm": 1.0274707078933716, + "learning_rate": 1.3195149464184998e-05, + "loss": 0.7356, + "step": 13052 + }, + { + "epoch": 7.362098138747885, + "grad_norm": 1.1734904050827026, + "learning_rate": 1.3192329385222787e-05, + "loss": 0.8307, + "step": 13053 + }, + { + "epoch": 7.3626621545403275, + "grad_norm": 0.8193031549453735, + "learning_rate": 1.3189509306260575e-05, + "loss": 0.6663, + "step": 13054 + }, + { + "epoch": 7.363226170332769, + "grad_norm": 0.9035179615020752, + "learning_rate": 1.3186689227298366e-05, + "loss": 0.6713, + "step": 13055 + }, + { + "epoch": 7.363790186125211, + "grad_norm": 1.2632977962493896, + "learning_rate": 1.3183869148336153e-05, + "loss": 0.6634, + "step": 13056 + }, + { + "epoch": 7.364354201917654, + "grad_norm": 1.1793794631958008, + "learning_rate": 1.3181049069373944e-05, + "loss": 0.7948, + "step": 13057 + }, + { + "epoch": 7.364918217710096, + "grad_norm": 1.225845217704773, + "learning_rate": 1.317822899041173e-05, + "loss": 0.7614, + "step": 13058 + }, + { + "epoch": 7.365482233502538, + "grad_norm": 1.309173583984375, + "learning_rate": 1.3175408911449521e-05, + "loss": 0.6348, + "step": 13059 + }, + { + "epoch": 7.36604624929498, + "grad_norm": 1.3510156869888306, + "learning_rate": 1.317258883248731e-05, + "loss": 0.6202, + "step": 13060 + }, + { + "epoch": 7.366610265087423, + "grad_norm": 1.411059856414795, + "learning_rate": 1.31697687535251e-05, + "loss": 0.6864, + "step": 13061 + }, + { + "epoch": 7.367174280879865, + "grad_norm": 1.0791692733764648, + "learning_rate": 1.3166948674562888e-05, + "loss": 0.8032, + "step": 13062 + }, + { + "epoch": 7.367738296672307, + "grad_norm": 1.2270694971084595, + "learning_rate": 1.3164128595600678e-05, + "loss": 0.6732, + "step": 13063 + }, + { + "epoch": 7.368302312464749, + "grad_norm": 1.1806720495224, + "learning_rate": 1.3161308516638465e-05, + "loss": 0.687, + "step": 13064 + }, + { + "epoch": 7.368866328257191, + "grad_norm": 1.1801191568374634, + "learning_rate": 1.3158488437676256e-05, + "loss": 0.7558, + "step": 13065 + }, + { + "epoch": 7.369430344049634, + "grad_norm": 1.0975604057312012, + "learning_rate": 1.3155668358714043e-05, + "loss": 0.6928, + "step": 13066 + }, + { + "epoch": 7.369994359842075, + "grad_norm": 0.9338303804397583, + "learning_rate": 1.3152848279751833e-05, + "loss": 0.6623, + "step": 13067 + }, + { + "epoch": 7.370558375634518, + "grad_norm": 1.2828954458236694, + "learning_rate": 1.3150028200789622e-05, + "loss": 0.8381, + "step": 13068 + }, + { + "epoch": 7.37112239142696, + "grad_norm": 1.7077313661575317, + "learning_rate": 1.3147208121827413e-05, + "loss": 0.8318, + "step": 13069 + }, + { + "epoch": 7.371686407219402, + "grad_norm": 1.3303762674331665, + "learning_rate": 1.3144388042865203e-05, + "loss": 0.6978, + "step": 13070 + }, + { + "epoch": 7.372250423011844, + "grad_norm": 1.0749249458312988, + "learning_rate": 1.314156796390299e-05, + "loss": 0.625, + "step": 13071 + }, + { + "epoch": 7.372814438804286, + "grad_norm": 0.8690874576568604, + "learning_rate": 1.313874788494078e-05, + "loss": 0.6425, + "step": 13072 + }, + { + "epoch": 7.373378454596729, + "grad_norm": 1.1671980619430542, + "learning_rate": 1.3135927805978568e-05, + "loss": 0.746, + "step": 13073 + }, + { + "epoch": 7.373942470389171, + "grad_norm": 1.2085034847259521, + "learning_rate": 1.3133107727016358e-05, + "loss": 0.7346, + "step": 13074 + }, + { + "epoch": 7.374506486181613, + "grad_norm": 0.9289335012435913, + "learning_rate": 1.3130287648054145e-05, + "loss": 0.7784, + "step": 13075 + }, + { + "epoch": 7.375070501974055, + "grad_norm": 1.0550286769866943, + "learning_rate": 1.3127467569091936e-05, + "loss": 0.7332, + "step": 13076 + }, + { + "epoch": 7.375634517766498, + "grad_norm": 1.6106373071670532, + "learning_rate": 1.3124647490129725e-05, + "loss": 0.8808, + "step": 13077 + }, + { + "epoch": 7.37619853355894, + "grad_norm": 1.272853136062622, + "learning_rate": 1.3121827411167515e-05, + "loss": 0.6972, + "step": 13078 + }, + { + "epoch": 7.3767625493513815, + "grad_norm": 1.1662179231643677, + "learning_rate": 1.3119007332205302e-05, + "loss": 0.7086, + "step": 13079 + }, + { + "epoch": 7.377326565143824, + "grad_norm": 0.8840844035148621, + "learning_rate": 1.3116187253243093e-05, + "loss": 0.7074, + "step": 13080 + }, + { + "epoch": 7.377890580936266, + "grad_norm": 0.951898455619812, + "learning_rate": 1.311336717428088e-05, + "loss": 0.7294, + "step": 13081 + }, + { + "epoch": 7.378454596728709, + "grad_norm": 1.3657397031784058, + "learning_rate": 1.311054709531867e-05, + "loss": 0.7632, + "step": 13082 + }, + { + "epoch": 7.3790186125211505, + "grad_norm": 1.3146274089813232, + "learning_rate": 1.3107727016356457e-05, + "loss": 0.775, + "step": 13083 + }, + { + "epoch": 7.379582628313592, + "grad_norm": 1.0878689289093018, + "learning_rate": 1.3104906937394248e-05, + "loss": 0.7107, + "step": 13084 + }, + { + "epoch": 7.380146644106035, + "grad_norm": 1.3724621534347534, + "learning_rate": 1.3102086858432037e-05, + "loss": 0.7618, + "step": 13085 + }, + { + "epoch": 7.380710659898477, + "grad_norm": 1.4402310848236084, + "learning_rate": 1.3099266779469827e-05, + "loss": 0.7732, + "step": 13086 + }, + { + "epoch": 7.3812746756909196, + "grad_norm": 1.4118578433990479, + "learning_rate": 1.3096446700507614e-05, + "loss": 0.7747, + "step": 13087 + }, + { + "epoch": 7.381838691483361, + "grad_norm": 0.939154863357544, + "learning_rate": 1.3093626621545405e-05, + "loss": 0.7658, + "step": 13088 + }, + { + "epoch": 7.382402707275804, + "grad_norm": 1.251112937927246, + "learning_rate": 1.3090806542583192e-05, + "loss": 0.7456, + "step": 13089 + }, + { + "epoch": 7.382966723068246, + "grad_norm": 1.478233814239502, + "learning_rate": 1.3087986463620982e-05, + "loss": 0.7814, + "step": 13090 + }, + { + "epoch": 7.383530738860689, + "grad_norm": 1.5646671056747437, + "learning_rate": 1.308516638465877e-05, + "loss": 0.8673, + "step": 13091 + }, + { + "epoch": 7.38409475465313, + "grad_norm": 0.9621566534042358, + "learning_rate": 1.308234630569656e-05, + "loss": 0.5888, + "step": 13092 + }, + { + "epoch": 7.384658770445572, + "grad_norm": 0.9193017482757568, + "learning_rate": 1.3079526226734349e-05, + "loss": 0.7122, + "step": 13093 + }, + { + "epoch": 7.385222786238015, + "grad_norm": 1.376639485359192, + "learning_rate": 1.307670614777214e-05, + "loss": 0.6973, + "step": 13094 + }, + { + "epoch": 7.385786802030457, + "grad_norm": 1.3512849807739258, + "learning_rate": 1.3073886068809926e-05, + "loss": 0.7692, + "step": 13095 + }, + { + "epoch": 7.386350817822899, + "grad_norm": 0.8807494044303894, + "learning_rate": 1.3071065989847717e-05, + "loss": 0.7481, + "step": 13096 + }, + { + "epoch": 7.386914833615341, + "grad_norm": 1.0660841464996338, + "learning_rate": 1.3068245910885504e-05, + "loss": 0.7011, + "step": 13097 + }, + { + "epoch": 7.387478849407783, + "grad_norm": 1.1397861242294312, + "learning_rate": 1.3065425831923294e-05, + "loss": 0.6883, + "step": 13098 + }, + { + "epoch": 7.388042865200226, + "grad_norm": 1.0624314546585083, + "learning_rate": 1.3062605752961083e-05, + "loss": 0.693, + "step": 13099 + }, + { + "epoch": 7.3886068809926675, + "grad_norm": 1.4811190366744995, + "learning_rate": 1.3059785673998872e-05, + "loss": 0.8417, + "step": 13100 + }, + { + "epoch": 7.38917089678511, + "grad_norm": 1.1013405323028564, + "learning_rate": 1.305696559503666e-05, + "loss": 0.6675, + "step": 13101 + }, + { + "epoch": 7.389734912577552, + "grad_norm": 1.1668702363967896, + "learning_rate": 1.3054145516074451e-05, + "loss": 0.7767, + "step": 13102 + }, + { + "epoch": 7.390298928369995, + "grad_norm": 0.999872624874115, + "learning_rate": 1.3051325437112238e-05, + "loss": 0.6454, + "step": 13103 + }, + { + "epoch": 7.3908629441624365, + "grad_norm": 1.3489261865615845, + "learning_rate": 1.3048505358150029e-05, + "loss": 0.737, + "step": 13104 + }, + { + "epoch": 7.391426959954879, + "grad_norm": 1.1995514631271362, + "learning_rate": 1.304568527918782e-05, + "loss": 0.7513, + "step": 13105 + }, + { + "epoch": 7.391990975747321, + "grad_norm": 1.5790718793869019, + "learning_rate": 1.3042865200225606e-05, + "loss": 0.7863, + "step": 13106 + }, + { + "epoch": 7.392554991539763, + "grad_norm": 1.1867153644561768, + "learning_rate": 1.3040045121263397e-05, + "loss": 0.703, + "step": 13107 + }, + { + "epoch": 7.3931190073322055, + "grad_norm": 0.946077287197113, + "learning_rate": 1.3037225042301186e-05, + "loss": 0.789, + "step": 13108 + }, + { + "epoch": 7.393683023124647, + "grad_norm": 1.290393590927124, + "learning_rate": 1.3034404963338975e-05, + "loss": 0.8273, + "step": 13109 + }, + { + "epoch": 7.39424703891709, + "grad_norm": 7.558187484741211, + "learning_rate": 1.3031584884376763e-05, + "loss": 0.8448, + "step": 13110 + }, + { + "epoch": 7.394811054709532, + "grad_norm": 0.9431838393211365, + "learning_rate": 1.3028764805414554e-05, + "loss": 0.6313, + "step": 13111 + }, + { + "epoch": 7.395375070501974, + "grad_norm": 1.3310531377792358, + "learning_rate": 1.3025944726452341e-05, + "loss": 0.7432, + "step": 13112 + }, + { + "epoch": 7.395939086294416, + "grad_norm": 0.9737228155136108, + "learning_rate": 1.3023124647490131e-05, + "loss": 0.7251, + "step": 13113 + }, + { + "epoch": 7.396503102086858, + "grad_norm": 1.2002251148223877, + "learning_rate": 1.3020304568527919e-05, + "loss": 0.7186, + "step": 13114 + }, + { + "epoch": 7.397067117879301, + "grad_norm": 1.193594217300415, + "learning_rate": 1.3017484489565709e-05, + "loss": 0.7712, + "step": 13115 + }, + { + "epoch": 7.397631133671743, + "grad_norm": 1.2249091863632202, + "learning_rate": 1.3014664410603498e-05, + "loss": 0.8141, + "step": 13116 + }, + { + "epoch": 7.398195149464185, + "grad_norm": 0.8157815933227539, + "learning_rate": 1.3011844331641288e-05, + "loss": 0.666, + "step": 13117 + }, + { + "epoch": 7.398759165256627, + "grad_norm": 1.192206859588623, + "learning_rate": 1.3009024252679075e-05, + "loss": 0.7719, + "step": 13118 + }, + { + "epoch": 7.39932318104907, + "grad_norm": 0.9382479786872864, + "learning_rate": 1.3006204173716866e-05, + "loss": 0.7516, + "step": 13119 + }, + { + "epoch": 7.399887196841512, + "grad_norm": 0.9446640014648438, + "learning_rate": 1.3003384094754653e-05, + "loss": 0.7977, + "step": 13120 + }, + { + "epoch": 7.4004512126339534, + "grad_norm": 1.1291619539260864, + "learning_rate": 1.3000564015792443e-05, + "loss": 0.7592, + "step": 13121 + }, + { + "epoch": 7.401015228426396, + "grad_norm": 1.2603265047073364, + "learning_rate": 1.299774393683023e-05, + "loss": 0.8294, + "step": 13122 + }, + { + "epoch": 7.401579244218838, + "grad_norm": 0.9264773726463318, + "learning_rate": 1.2994923857868021e-05, + "loss": 0.6814, + "step": 13123 + }, + { + "epoch": 7.402143260011281, + "grad_norm": 1.544134497642517, + "learning_rate": 1.299210377890581e-05, + "loss": 0.8921, + "step": 13124 + }, + { + "epoch": 7.4027072758037225, + "grad_norm": 0.8999110460281372, + "learning_rate": 1.29892836999436e-05, + "loss": 0.6717, + "step": 13125 + }, + { + "epoch": 7.403271291596164, + "grad_norm": 0.865032434463501, + "learning_rate": 1.2986463620981387e-05, + "loss": 0.6654, + "step": 13126 + }, + { + "epoch": 7.403835307388607, + "grad_norm": 1.2014013528823853, + "learning_rate": 1.2983643542019178e-05, + "loss": 0.7012, + "step": 13127 + }, + { + "epoch": 7.404399323181049, + "grad_norm": 1.1757789850234985, + "learning_rate": 1.2980823463056965e-05, + "loss": 0.7609, + "step": 13128 + }, + { + "epoch": 7.4049633389734915, + "grad_norm": 1.1918140649795532, + "learning_rate": 1.2978003384094756e-05, + "loss": 0.7472, + "step": 13129 + }, + { + "epoch": 7.405527354765933, + "grad_norm": 0.9582391381263733, + "learning_rate": 1.2975183305132543e-05, + "loss": 0.7342, + "step": 13130 + }, + { + "epoch": 7.406091370558376, + "grad_norm": 1.1106754541397095, + "learning_rate": 1.2972363226170333e-05, + "loss": 0.717, + "step": 13131 + }, + { + "epoch": 7.406655386350818, + "grad_norm": 1.0750819444656372, + "learning_rate": 1.2969543147208122e-05, + "loss": 0.7794, + "step": 13132 + }, + { + "epoch": 7.4072194021432605, + "grad_norm": 0.9617186188697815, + "learning_rate": 1.2966723068245912e-05, + "loss": 0.7825, + "step": 13133 + }, + { + "epoch": 7.407783417935702, + "grad_norm": 0.955285370349884, + "learning_rate": 1.29639029892837e-05, + "loss": 0.6946, + "step": 13134 + }, + { + "epoch": 7.408347433728144, + "grad_norm": 1.3749812841415405, + "learning_rate": 1.296108291032149e-05, + "loss": 0.7637, + "step": 13135 + }, + { + "epoch": 7.408911449520587, + "grad_norm": 1.1506203413009644, + "learning_rate": 1.2958262831359277e-05, + "loss": 0.7124, + "step": 13136 + }, + { + "epoch": 7.409475465313029, + "grad_norm": 1.179105281829834, + "learning_rate": 1.2955442752397068e-05, + "loss": 0.6478, + "step": 13137 + }, + { + "epoch": 7.410039481105471, + "grad_norm": 0.7385964393615723, + "learning_rate": 1.2952622673434855e-05, + "loss": 0.6306, + "step": 13138 + }, + { + "epoch": 7.410603496897913, + "grad_norm": 0.8199608325958252, + "learning_rate": 1.2949802594472645e-05, + "loss": 0.5822, + "step": 13139 + }, + { + "epoch": 7.411167512690355, + "grad_norm": 1.2095305919647217, + "learning_rate": 1.2946982515510436e-05, + "loss": 0.718, + "step": 13140 + }, + { + "epoch": 7.411731528482798, + "grad_norm": 1.6208417415618896, + "learning_rate": 1.2944162436548224e-05, + "loss": 0.8092, + "step": 13141 + }, + { + "epoch": 7.412295544275239, + "grad_norm": 0.9809584021568298, + "learning_rate": 1.2941342357586015e-05, + "loss": 0.6459, + "step": 13142 + }, + { + "epoch": 7.412859560067682, + "grad_norm": 1.4814307689666748, + "learning_rate": 1.2938522278623802e-05, + "loss": 0.7982, + "step": 13143 + }, + { + "epoch": 7.413423575860124, + "grad_norm": 1.0657330751419067, + "learning_rate": 1.2935702199661593e-05, + "loss": 0.7421, + "step": 13144 + }, + { + "epoch": 7.413987591652567, + "grad_norm": 1.0357985496520996, + "learning_rate": 1.293288212069938e-05, + "loss": 0.721, + "step": 13145 + }, + { + "epoch": 7.414551607445008, + "grad_norm": 1.300341010093689, + "learning_rate": 1.293006204173717e-05, + "loss": 0.7777, + "step": 13146 + }, + { + "epoch": 7.415115623237451, + "grad_norm": 1.625883936882019, + "learning_rate": 1.2927241962774957e-05, + "loss": 0.8118, + "step": 13147 + }, + { + "epoch": 7.415679639029893, + "grad_norm": 0.9422422647476196, + "learning_rate": 1.2924421883812748e-05, + "loss": 0.696, + "step": 13148 + }, + { + "epoch": 7.416243654822335, + "grad_norm": 1.123699426651001, + "learning_rate": 1.2921601804850537e-05, + "loss": 0.7502, + "step": 13149 + }, + { + "epoch": 7.416807670614777, + "grad_norm": 1.2534940242767334, + "learning_rate": 1.2918781725888327e-05, + "loss": 0.7915, + "step": 13150 + }, + { + "epoch": 7.417371686407219, + "grad_norm": 1.0448886156082153, + "learning_rate": 1.2915961646926114e-05, + "loss": 0.7488, + "step": 13151 + }, + { + "epoch": 7.417935702199662, + "grad_norm": 1.251857042312622, + "learning_rate": 1.2913141567963905e-05, + "loss": 0.7749, + "step": 13152 + }, + { + "epoch": 7.418499717992104, + "grad_norm": 1.3392099142074585, + "learning_rate": 1.2910321489001692e-05, + "loss": 0.7515, + "step": 13153 + }, + { + "epoch": 7.4190637337845455, + "grad_norm": 1.1762851476669312, + "learning_rate": 1.2907501410039482e-05, + "loss": 0.7195, + "step": 13154 + }, + { + "epoch": 7.419627749576988, + "grad_norm": 1.5611356496810913, + "learning_rate": 1.290468133107727e-05, + "loss": 0.788, + "step": 13155 + }, + { + "epoch": 7.42019176536943, + "grad_norm": 1.826703667640686, + "learning_rate": 1.290186125211506e-05, + "loss": 0.7897, + "step": 13156 + }, + { + "epoch": 7.420755781161873, + "grad_norm": 1.2641373872756958, + "learning_rate": 1.2899041173152849e-05, + "loss": 0.6945, + "step": 13157 + }, + { + "epoch": 7.4213197969543145, + "grad_norm": 1.2485712766647339, + "learning_rate": 1.2896221094190639e-05, + "loss": 0.6997, + "step": 13158 + }, + { + "epoch": 7.421883812746757, + "grad_norm": 1.138526439666748, + "learning_rate": 1.2893401015228426e-05, + "loss": 0.7957, + "step": 13159 + }, + { + "epoch": 7.422447828539199, + "grad_norm": 1.156752347946167, + "learning_rate": 1.2890580936266217e-05, + "loss": 0.8023, + "step": 13160 + }, + { + "epoch": 7.423011844331642, + "grad_norm": 1.1598031520843506, + "learning_rate": 1.2887760857304004e-05, + "loss": 0.8213, + "step": 13161 + }, + { + "epoch": 7.4235758601240835, + "grad_norm": 1.5059802532196045, + "learning_rate": 1.2884940778341794e-05, + "loss": 0.7028, + "step": 13162 + }, + { + "epoch": 7.424139875916525, + "grad_norm": 1.0346755981445312, + "learning_rate": 1.2882120699379583e-05, + "loss": 0.6057, + "step": 13163 + }, + { + "epoch": 7.424703891708968, + "grad_norm": 0.9525287747383118, + "learning_rate": 1.2879300620417372e-05, + "loss": 0.6524, + "step": 13164 + }, + { + "epoch": 7.42526790750141, + "grad_norm": 1.1903104782104492, + "learning_rate": 1.287648054145516e-05, + "loss": 0.7818, + "step": 13165 + }, + { + "epoch": 7.4258319232938526, + "grad_norm": 0.8836792707443237, + "learning_rate": 1.2873660462492951e-05, + "loss": 0.6275, + "step": 13166 + }, + { + "epoch": 7.426395939086294, + "grad_norm": 0.8597697615623474, + "learning_rate": 1.2870840383530738e-05, + "loss": 0.7418, + "step": 13167 + }, + { + "epoch": 7.426959954878736, + "grad_norm": 1.0238484144210815, + "learning_rate": 1.2868020304568529e-05, + "loss": 0.8177, + "step": 13168 + }, + { + "epoch": 7.427523970671179, + "grad_norm": 1.2607643604278564, + "learning_rate": 1.2865200225606316e-05, + "loss": 0.7783, + "step": 13169 + }, + { + "epoch": 7.428087986463621, + "grad_norm": 0.9772449731826782, + "learning_rate": 1.2862380146644106e-05, + "loss": 0.6859, + "step": 13170 + }, + { + "epoch": 7.428652002256063, + "grad_norm": 1.0733681917190552, + "learning_rate": 1.2859560067681895e-05, + "loss": 0.7815, + "step": 13171 + }, + { + "epoch": 7.429216018048505, + "grad_norm": 1.154114842414856, + "learning_rate": 1.2856739988719686e-05, + "loss": 0.7762, + "step": 13172 + }, + { + "epoch": 7.429780033840948, + "grad_norm": 1.1464824676513672, + "learning_rate": 1.2853919909757473e-05, + "loss": 0.7138, + "step": 13173 + }, + { + "epoch": 7.43034404963339, + "grad_norm": 1.14662504196167, + "learning_rate": 1.2851099830795263e-05, + "loss": 0.7332, + "step": 13174 + }, + { + "epoch": 7.430908065425832, + "grad_norm": 1.0636184215545654, + "learning_rate": 1.2848279751833054e-05, + "loss": 0.7405, + "step": 13175 + }, + { + "epoch": 7.431472081218274, + "grad_norm": 1.1423653364181519, + "learning_rate": 1.284545967287084e-05, + "loss": 0.8017, + "step": 13176 + }, + { + "epoch": 7.432036097010716, + "grad_norm": 1.4134808778762817, + "learning_rate": 1.2842639593908631e-05, + "loss": 0.7527, + "step": 13177 + }, + { + "epoch": 7.432600112803159, + "grad_norm": 1.121872067451477, + "learning_rate": 1.2839819514946418e-05, + "loss": 0.7712, + "step": 13178 + }, + { + "epoch": 7.4331641285956005, + "grad_norm": 1.2848949432373047, + "learning_rate": 1.2836999435984209e-05, + "loss": 0.7871, + "step": 13179 + }, + { + "epoch": 7.433728144388043, + "grad_norm": 1.1743501424789429, + "learning_rate": 1.2834179357021998e-05, + "loss": 0.6961, + "step": 13180 + }, + { + "epoch": 7.434292160180485, + "grad_norm": 0.8981717824935913, + "learning_rate": 1.2831359278059788e-05, + "loss": 0.7316, + "step": 13181 + }, + { + "epoch": 7.434856175972927, + "grad_norm": 1.216780424118042, + "learning_rate": 1.2828539199097575e-05, + "loss": 0.8679, + "step": 13182 + }, + { + "epoch": 7.4354201917653695, + "grad_norm": 1.00355863571167, + "learning_rate": 1.2825719120135366e-05, + "loss": 0.8079, + "step": 13183 + }, + { + "epoch": 7.435984207557811, + "grad_norm": 1.1833751201629639, + "learning_rate": 1.2822899041173153e-05, + "loss": 0.7174, + "step": 13184 + }, + { + "epoch": 7.436548223350254, + "grad_norm": 1.0872645378112793, + "learning_rate": 1.2820078962210943e-05, + "loss": 0.7423, + "step": 13185 + }, + { + "epoch": 7.437112239142696, + "grad_norm": 0.763755202293396, + "learning_rate": 1.281725888324873e-05, + "loss": 0.6297, + "step": 13186 + }, + { + "epoch": 7.4376762549351385, + "grad_norm": 1.0313209295272827, + "learning_rate": 1.2814438804286521e-05, + "loss": 0.7136, + "step": 13187 + }, + { + "epoch": 7.43824027072758, + "grad_norm": 1.112869381904602, + "learning_rate": 1.281161872532431e-05, + "loss": 0.8498, + "step": 13188 + }, + { + "epoch": 7.438804286520023, + "grad_norm": 1.092954397201538, + "learning_rate": 1.28087986463621e-05, + "loss": 0.6976, + "step": 13189 + }, + { + "epoch": 7.439368302312465, + "grad_norm": 0.958717405796051, + "learning_rate": 1.2805978567399887e-05, + "loss": 0.6532, + "step": 13190 + }, + { + "epoch": 7.439932318104907, + "grad_norm": 1.1048907041549683, + "learning_rate": 1.2803158488437678e-05, + "loss": 0.8761, + "step": 13191 + }, + { + "epoch": 7.440496333897349, + "grad_norm": 1.4738823175430298, + "learning_rate": 1.2800338409475465e-05, + "loss": 0.7563, + "step": 13192 + }, + { + "epoch": 7.441060349689791, + "grad_norm": 1.1344164609909058, + "learning_rate": 1.2797518330513255e-05, + "loss": 0.831, + "step": 13193 + }, + { + "epoch": 7.441624365482234, + "grad_norm": 1.2312581539154053, + "learning_rate": 1.2794698251551043e-05, + "loss": 0.7659, + "step": 13194 + }, + { + "epoch": 7.442188381274676, + "grad_norm": 1.2203114032745361, + "learning_rate": 1.2791878172588833e-05, + "loss": 0.7005, + "step": 13195 + }, + { + "epoch": 7.442752397067117, + "grad_norm": 0.9168539047241211, + "learning_rate": 1.2789058093626622e-05, + "loss": 0.7191, + "step": 13196 + }, + { + "epoch": 7.44331641285956, + "grad_norm": 1.0517314672470093, + "learning_rate": 1.2786238014664412e-05, + "loss": 0.6928, + "step": 13197 + }, + { + "epoch": 7.443880428652002, + "grad_norm": 1.309628963470459, + "learning_rate": 1.27834179357022e-05, + "loss": 0.7597, + "step": 13198 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 1.2632341384887695, + "learning_rate": 1.278059785673999e-05, + "loss": 0.7114, + "step": 13199 + }, + { + "epoch": 7.4450084602368864, + "grad_norm": 0.8777384757995605, + "learning_rate": 1.2777777777777777e-05, + "loss": 0.5919, + "step": 13200 + }, + { + "epoch": 7.445572476029329, + "grad_norm": 1.50380277633667, + "learning_rate": 1.2774957698815568e-05, + "loss": 0.6443, + "step": 13201 + }, + { + "epoch": 7.446136491821771, + "grad_norm": 0.9937084913253784, + "learning_rate": 1.2772137619853355e-05, + "loss": 0.6662, + "step": 13202 + }, + { + "epoch": 7.446700507614214, + "grad_norm": 1.3279242515563965, + "learning_rate": 1.2769317540891145e-05, + "loss": 0.81, + "step": 13203 + }, + { + "epoch": 7.4472645234066555, + "grad_norm": 0.7950617671012878, + "learning_rate": 1.2766497461928934e-05, + "loss": 0.6537, + "step": 13204 + }, + { + "epoch": 7.447828539199097, + "grad_norm": 1.234841227531433, + "learning_rate": 1.2763677382966724e-05, + "loss": 0.6839, + "step": 13205 + }, + { + "epoch": 7.44839255499154, + "grad_norm": 0.9719467759132385, + "learning_rate": 1.2760857304004512e-05, + "loss": 0.7516, + "step": 13206 + }, + { + "epoch": 7.448956570783982, + "grad_norm": 1.1857060194015503, + "learning_rate": 1.2758037225042302e-05, + "loss": 0.7552, + "step": 13207 + }, + { + "epoch": 7.4495205865764245, + "grad_norm": 1.060304045677185, + "learning_rate": 1.2755217146080089e-05, + "loss": 0.6804, + "step": 13208 + }, + { + "epoch": 7.450084602368866, + "grad_norm": 0.8277948498725891, + "learning_rate": 1.275239706711788e-05, + "loss": 0.6741, + "step": 13209 + }, + { + "epoch": 7.450648618161308, + "grad_norm": 1.4400691986083984, + "learning_rate": 1.2749576988155668e-05, + "loss": 0.8825, + "step": 13210 + }, + { + "epoch": 7.451212633953751, + "grad_norm": 1.2154206037521362, + "learning_rate": 1.2746756909193457e-05, + "loss": 0.7608, + "step": 13211 + }, + { + "epoch": 7.451776649746193, + "grad_norm": 0.9018325209617615, + "learning_rate": 1.2743936830231248e-05, + "loss": 0.6927, + "step": 13212 + }, + { + "epoch": 7.452340665538635, + "grad_norm": 1.4006894826889038, + "learning_rate": 1.2741116751269036e-05, + "loss": 0.8259, + "step": 13213 + }, + { + "epoch": 7.452904681331077, + "grad_norm": 1.0587760210037231, + "learning_rate": 1.2738296672306827e-05, + "loss": 0.7331, + "step": 13214 + }, + { + "epoch": 7.45346869712352, + "grad_norm": 1.1032726764678955, + "learning_rate": 1.2735476593344614e-05, + "loss": 0.7496, + "step": 13215 + }, + { + "epoch": 7.454032712915962, + "grad_norm": 1.1901416778564453, + "learning_rate": 1.2732656514382405e-05, + "loss": 0.8359, + "step": 13216 + }, + { + "epoch": 7.454596728708404, + "grad_norm": 1.1539381742477417, + "learning_rate": 1.2729836435420192e-05, + "loss": 0.7701, + "step": 13217 + }, + { + "epoch": 7.455160744500846, + "grad_norm": 0.9414248466491699, + "learning_rate": 1.2727016356457982e-05, + "loss": 0.7438, + "step": 13218 + }, + { + "epoch": 7.455724760293288, + "grad_norm": 1.1237659454345703, + "learning_rate": 1.2724196277495771e-05, + "loss": 0.7078, + "step": 13219 + }, + { + "epoch": 7.456288776085731, + "grad_norm": 1.3255647420883179, + "learning_rate": 1.272137619853356e-05, + "loss": 0.644, + "step": 13220 + }, + { + "epoch": 7.456852791878172, + "grad_norm": 0.8591626882553101, + "learning_rate": 1.2718556119571349e-05, + "loss": 0.6346, + "step": 13221 + }, + { + "epoch": 7.457416807670615, + "grad_norm": 1.2048983573913574, + "learning_rate": 1.2715736040609139e-05, + "loss": 0.7858, + "step": 13222 + }, + { + "epoch": 7.457980823463057, + "grad_norm": 1.1794415712356567, + "learning_rate": 1.2712915961646926e-05, + "loss": 0.788, + "step": 13223 + }, + { + "epoch": 7.458544839255499, + "grad_norm": 0.7098117470741272, + "learning_rate": 1.2710095882684717e-05, + "loss": 0.6146, + "step": 13224 + }, + { + "epoch": 7.459108855047941, + "grad_norm": 1.1983280181884766, + "learning_rate": 1.2707275803722504e-05, + "loss": 0.6664, + "step": 13225 + }, + { + "epoch": 7.459672870840383, + "grad_norm": 1.0749603509902954, + "learning_rate": 1.2704455724760294e-05, + "loss": 0.7947, + "step": 13226 + }, + { + "epoch": 7.460236886632826, + "grad_norm": 0.9786069989204407, + "learning_rate": 1.2701635645798083e-05, + "loss": 0.7051, + "step": 13227 + }, + { + "epoch": 7.460800902425268, + "grad_norm": 1.0509065389633179, + "learning_rate": 1.2698815566835874e-05, + "loss": 0.6679, + "step": 13228 + }, + { + "epoch": 7.46136491821771, + "grad_norm": 1.2478073835372925, + "learning_rate": 1.269599548787366e-05, + "loss": 0.8174, + "step": 13229 + }, + { + "epoch": 7.461928934010152, + "grad_norm": 0.8413200378417969, + "learning_rate": 1.2693175408911451e-05, + "loss": 0.7033, + "step": 13230 + }, + { + "epoch": 7.462492949802595, + "grad_norm": 1.3432377576828003, + "learning_rate": 1.2690355329949238e-05, + "loss": 0.822, + "step": 13231 + }, + { + "epoch": 7.463056965595037, + "grad_norm": 0.8783000707626343, + "learning_rate": 1.2687535250987029e-05, + "loss": 0.638, + "step": 13232 + }, + { + "epoch": 7.4636209813874785, + "grad_norm": 1.2845463752746582, + "learning_rate": 1.2684715172024816e-05, + "loss": 0.7833, + "step": 13233 + }, + { + "epoch": 7.464184997179921, + "grad_norm": 1.2513877153396606, + "learning_rate": 1.2681895093062606e-05, + "loss": 0.7966, + "step": 13234 + }, + { + "epoch": 7.464749012972363, + "grad_norm": 0.894888699054718, + "learning_rate": 1.2679075014100395e-05, + "loss": 0.6915, + "step": 13235 + }, + { + "epoch": 7.465313028764806, + "grad_norm": 0.9025793671607971, + "learning_rate": 1.2676254935138186e-05, + "loss": 0.6533, + "step": 13236 + }, + { + "epoch": 7.4658770445572475, + "grad_norm": 0.7494328022003174, + "learning_rate": 1.2673434856175973e-05, + "loss": 0.6291, + "step": 13237 + }, + { + "epoch": 7.466441060349689, + "grad_norm": 1.8663737773895264, + "learning_rate": 1.2670614777213763e-05, + "loss": 0.7422, + "step": 13238 + }, + { + "epoch": 7.467005076142132, + "grad_norm": 1.2722969055175781, + "learning_rate": 1.266779469825155e-05, + "loss": 0.8135, + "step": 13239 + }, + { + "epoch": 7.467569091934574, + "grad_norm": 1.00520658493042, + "learning_rate": 1.266497461928934e-05, + "loss": 0.658, + "step": 13240 + }, + { + "epoch": 7.4681331077270166, + "grad_norm": 1.384185791015625, + "learning_rate": 1.2662154540327128e-05, + "loss": 0.816, + "step": 13241 + }, + { + "epoch": 7.468697123519458, + "grad_norm": 1.3844492435455322, + "learning_rate": 1.2659334461364918e-05, + "loss": 0.8379, + "step": 13242 + }, + { + "epoch": 7.469261139311901, + "grad_norm": 1.4699710607528687, + "learning_rate": 1.2656514382402707e-05, + "loss": 0.6683, + "step": 13243 + }, + { + "epoch": 7.469825155104343, + "grad_norm": 1.0877792835235596, + "learning_rate": 1.2653694303440498e-05, + "loss": 0.7861, + "step": 13244 + }, + { + "epoch": 7.470389170896786, + "grad_norm": 1.1186527013778687, + "learning_rate": 1.2650874224478285e-05, + "loss": 0.721, + "step": 13245 + }, + { + "epoch": 7.470953186689227, + "grad_norm": 1.2385880947113037, + "learning_rate": 1.2648054145516075e-05, + "loss": 0.8526, + "step": 13246 + }, + { + "epoch": 7.471517202481669, + "grad_norm": 1.0720041990280151, + "learning_rate": 1.2645234066553866e-05, + "loss": 0.6411, + "step": 13247 + }, + { + "epoch": 7.472081218274112, + "grad_norm": 1.2902816534042358, + "learning_rate": 1.2642413987591653e-05, + "loss": 0.804, + "step": 13248 + }, + { + "epoch": 7.472645234066554, + "grad_norm": 1.9513349533081055, + "learning_rate": 1.2639593908629443e-05, + "loss": 0.7828, + "step": 13249 + }, + { + "epoch": 7.473209249858996, + "grad_norm": 1.0323292016983032, + "learning_rate": 1.263677382966723e-05, + "loss": 0.7147, + "step": 13250 + }, + { + "epoch": 7.473773265651438, + "grad_norm": 1.2381525039672852, + "learning_rate": 1.2633953750705021e-05, + "loss": 0.7865, + "step": 13251 + }, + { + "epoch": 7.47433728144388, + "grad_norm": 1.7332756519317627, + "learning_rate": 1.263113367174281e-05, + "loss": 0.8026, + "step": 13252 + }, + { + "epoch": 7.474901297236323, + "grad_norm": 0.8793673515319824, + "learning_rate": 1.26283135927806e-05, + "loss": 0.7033, + "step": 13253 + }, + { + "epoch": 7.4754653130287645, + "grad_norm": 1.0378950834274292, + "learning_rate": 1.2625493513818387e-05, + "loss": 0.6161, + "step": 13254 + }, + { + "epoch": 7.476029328821207, + "grad_norm": 1.065009355545044, + "learning_rate": 1.2622673434856178e-05, + "loss": 0.7486, + "step": 13255 + }, + { + "epoch": 7.476593344613649, + "grad_norm": 1.177228569984436, + "learning_rate": 1.2619853355893965e-05, + "loss": 0.701, + "step": 13256 + }, + { + "epoch": 7.477157360406092, + "grad_norm": 1.1983978748321533, + "learning_rate": 1.2617033276931755e-05, + "loss": 0.838, + "step": 13257 + }, + { + "epoch": 7.4777213761985335, + "grad_norm": 1.2225433588027954, + "learning_rate": 1.2614213197969542e-05, + "loss": 0.7308, + "step": 13258 + }, + { + "epoch": 7.478285391990976, + "grad_norm": 1.1486011743545532, + "learning_rate": 1.2611393119007333e-05, + "loss": 0.6772, + "step": 13259 + }, + { + "epoch": 7.478849407783418, + "grad_norm": 1.2151648998260498, + "learning_rate": 1.2608573040045122e-05, + "loss": 0.6879, + "step": 13260 + }, + { + "epoch": 7.47941342357586, + "grad_norm": 0.9940645694732666, + "learning_rate": 1.2605752961082912e-05, + "loss": 0.819, + "step": 13261 + }, + { + "epoch": 7.4799774393683025, + "grad_norm": 1.023062825202942, + "learning_rate": 1.26029328821207e-05, + "loss": 0.8042, + "step": 13262 + }, + { + "epoch": 7.480541455160744, + "grad_norm": 1.149954915046692, + "learning_rate": 1.260011280315849e-05, + "loss": 0.7096, + "step": 13263 + }, + { + "epoch": 7.481105470953187, + "grad_norm": 1.0788317918777466, + "learning_rate": 1.2597292724196277e-05, + "loss": 0.693, + "step": 13264 + }, + { + "epoch": 7.481669486745629, + "grad_norm": 1.262171745300293, + "learning_rate": 1.2594472645234067e-05, + "loss": 0.8331, + "step": 13265 + }, + { + "epoch": 7.482233502538071, + "grad_norm": 1.0139236450195312, + "learning_rate": 1.2591652566271856e-05, + "loss": 0.6954, + "step": 13266 + }, + { + "epoch": 7.482797518330513, + "grad_norm": 1.2192409038543701, + "learning_rate": 1.2588832487309645e-05, + "loss": 0.8141, + "step": 13267 + }, + { + "epoch": 7.483361534122955, + "grad_norm": 1.2864642143249512, + "learning_rate": 1.2586012408347434e-05, + "loss": 0.7218, + "step": 13268 + }, + { + "epoch": 7.483925549915398, + "grad_norm": 1.2791720628738403, + "learning_rate": 1.2583192329385224e-05, + "loss": 0.7347, + "step": 13269 + }, + { + "epoch": 7.48448956570784, + "grad_norm": 0.8941598534584045, + "learning_rate": 1.2580372250423011e-05, + "loss": 0.6756, + "step": 13270 + }, + { + "epoch": 7.485053581500282, + "grad_norm": 0.9982949495315552, + "learning_rate": 1.2577552171460802e-05, + "loss": 0.6675, + "step": 13271 + }, + { + "epoch": 7.485617597292724, + "grad_norm": 1.1644171476364136, + "learning_rate": 1.2574732092498589e-05, + "loss": 0.749, + "step": 13272 + }, + { + "epoch": 7.486181613085167, + "grad_norm": 0.8650705218315125, + "learning_rate": 1.257191201353638e-05, + "loss": 0.6384, + "step": 13273 + }, + { + "epoch": 7.486745628877609, + "grad_norm": 1.4706027507781982, + "learning_rate": 1.2569091934574168e-05, + "loss": 0.8133, + "step": 13274 + }, + { + "epoch": 7.4873096446700504, + "grad_norm": 0.7938990592956543, + "learning_rate": 1.2566271855611959e-05, + "loss": 0.6606, + "step": 13275 + }, + { + "epoch": 7.487873660462493, + "grad_norm": 0.8964648842811584, + "learning_rate": 1.2563451776649746e-05, + "loss": 0.6041, + "step": 13276 + }, + { + "epoch": 7.488437676254935, + "grad_norm": 1.4835689067840576, + "learning_rate": 1.2560631697687536e-05, + "loss": 0.8129, + "step": 13277 + }, + { + "epoch": 7.489001692047378, + "grad_norm": 1.175620198249817, + "learning_rate": 1.2557811618725323e-05, + "loss": 0.6643, + "step": 13278 + }, + { + "epoch": 7.4895657078398195, + "grad_norm": 1.4110853672027588, + "learning_rate": 1.2554991539763114e-05, + "loss": 0.8616, + "step": 13279 + }, + { + "epoch": 7.490129723632261, + "grad_norm": 0.9353614449501038, + "learning_rate": 1.2552171460800901e-05, + "loss": 0.6864, + "step": 13280 + }, + { + "epoch": 7.490693739424704, + "grad_norm": 1.586500644683838, + "learning_rate": 1.2549351381838692e-05, + "loss": 0.7437, + "step": 13281 + }, + { + "epoch": 7.491257755217146, + "grad_norm": 1.3721882104873657, + "learning_rate": 1.2546531302876482e-05, + "loss": 0.7922, + "step": 13282 + }, + { + "epoch": 7.4918217710095885, + "grad_norm": 1.1068024635314941, + "learning_rate": 1.2543711223914271e-05, + "loss": 0.7165, + "step": 13283 + }, + { + "epoch": 7.49238578680203, + "grad_norm": 0.9032384753227234, + "learning_rate": 1.2540891144952061e-05, + "loss": 0.7017, + "step": 13284 + }, + { + "epoch": 7.492949802594473, + "grad_norm": 1.1009819507598877, + "learning_rate": 1.2538071065989848e-05, + "loss": 0.8426, + "step": 13285 + }, + { + "epoch": 7.493513818386915, + "grad_norm": 1.1404454708099365, + "learning_rate": 1.2535250987027639e-05, + "loss": 0.6917, + "step": 13286 + }, + { + "epoch": 7.4940778341793575, + "grad_norm": 0.9945648312568665, + "learning_rate": 1.2532430908065426e-05, + "loss": 0.6822, + "step": 13287 + }, + { + "epoch": 7.494641849971799, + "grad_norm": 0.7986001968383789, + "learning_rate": 1.2529610829103217e-05, + "loss": 0.5954, + "step": 13288 + }, + { + "epoch": 7.495205865764241, + "grad_norm": 1.0275813341140747, + "learning_rate": 1.2526790750141004e-05, + "loss": 0.739, + "step": 13289 + }, + { + "epoch": 7.495769881556684, + "grad_norm": 1.152844786643982, + "learning_rate": 1.2523970671178794e-05, + "loss": 0.7337, + "step": 13290 + }, + { + "epoch": 7.496333897349126, + "grad_norm": 1.1994603872299194, + "learning_rate": 1.2521150592216583e-05, + "loss": 0.8535, + "step": 13291 + }, + { + "epoch": 7.496897913141568, + "grad_norm": 1.2275015115737915, + "learning_rate": 1.2518330513254373e-05, + "loss": 0.6836, + "step": 13292 + }, + { + "epoch": 7.49746192893401, + "grad_norm": 1.5621047019958496, + "learning_rate": 1.251551043429216e-05, + "loss": 0.7573, + "step": 13293 + }, + { + "epoch": 7.498025944726452, + "grad_norm": 1.2851029634475708, + "learning_rate": 1.2512690355329951e-05, + "loss": 0.7694, + "step": 13294 + }, + { + "epoch": 7.498589960518895, + "grad_norm": 1.5690810680389404, + "learning_rate": 1.2509870276367738e-05, + "loss": 0.7285, + "step": 13295 + }, + { + "epoch": 7.499153976311336, + "grad_norm": 1.0886739492416382, + "learning_rate": 1.2507050197405529e-05, + "loss": 0.8289, + "step": 13296 + }, + { + "epoch": 7.499717992103779, + "grad_norm": 0.9752559661865234, + "learning_rate": 1.2504230118443316e-05, + "loss": 0.6891, + "step": 13297 + }, + { + "epoch": 7.500282007896221, + "grad_norm": 1.1196706295013428, + "learning_rate": 1.2501410039481106e-05, + "loss": 0.5984, + "step": 13298 + }, + { + "epoch": 7.500846023688664, + "grad_norm": 1.085620403289795, + "learning_rate": 1.2498589960518895e-05, + "loss": 0.7053, + "step": 13299 + }, + { + "epoch": 7.501410039481105, + "grad_norm": 1.5332022905349731, + "learning_rate": 1.2495769881556685e-05, + "loss": 0.8088, + "step": 13300 + }, + { + "epoch": 7.501974055273548, + "grad_norm": 1.0643125772476196, + "learning_rate": 1.2492949802594474e-05, + "loss": 0.7482, + "step": 13301 + }, + { + "epoch": 7.50253807106599, + "grad_norm": 1.4955353736877441, + "learning_rate": 1.2490129723632263e-05, + "loss": 0.8867, + "step": 13302 + }, + { + "epoch": 7.503102086858432, + "grad_norm": 1.249145746231079, + "learning_rate": 1.2487309644670052e-05, + "loss": 0.7086, + "step": 13303 + }, + { + "epoch": 7.503666102650874, + "grad_norm": 1.3599493503570557, + "learning_rate": 1.248448956570784e-05, + "loss": 0.6921, + "step": 13304 + }, + { + "epoch": 7.504230118443316, + "grad_norm": 1.5265085697174072, + "learning_rate": 1.248166948674563e-05, + "loss": 0.7759, + "step": 13305 + }, + { + "epoch": 7.504794134235759, + "grad_norm": 1.157604694366455, + "learning_rate": 1.2478849407783418e-05, + "loss": 0.6144, + "step": 13306 + }, + { + "epoch": 7.505358150028201, + "grad_norm": 1.1201553344726562, + "learning_rate": 1.2476029328821209e-05, + "loss": 0.6618, + "step": 13307 + }, + { + "epoch": 7.5059221658206425, + "grad_norm": 0.9959499835968018, + "learning_rate": 1.2473209249858998e-05, + "loss": 0.7359, + "step": 13308 + }, + { + "epoch": 7.506486181613085, + "grad_norm": 1.2275488376617432, + "learning_rate": 1.2470389170896786e-05, + "loss": 0.704, + "step": 13309 + }, + { + "epoch": 7.507050197405527, + "grad_norm": 1.0794817209243774, + "learning_rate": 1.2467569091934575e-05, + "loss": 0.7081, + "step": 13310 + }, + { + "epoch": 7.50761421319797, + "grad_norm": 0.9433355331420898, + "learning_rate": 1.2464749012972364e-05, + "loss": 0.7762, + "step": 13311 + }, + { + "epoch": 7.5081782289904115, + "grad_norm": 1.2130022048950195, + "learning_rate": 1.2461928934010153e-05, + "loss": 0.6987, + "step": 13312 + }, + { + "epoch": 7.508742244782854, + "grad_norm": 1.4178848266601562, + "learning_rate": 1.2459108855047942e-05, + "loss": 0.7899, + "step": 13313 + }, + { + "epoch": 7.509306260575296, + "grad_norm": 1.2304264307022095, + "learning_rate": 1.245628877608573e-05, + "loss": 0.7573, + "step": 13314 + }, + { + "epoch": 7.509870276367739, + "grad_norm": 1.084240436553955, + "learning_rate": 1.245346869712352e-05, + "loss": 0.685, + "step": 13315 + }, + { + "epoch": 7.5104342921601805, + "grad_norm": 1.2152683734893799, + "learning_rate": 1.245064861816131e-05, + "loss": 0.8213, + "step": 13316 + }, + { + "epoch": 7.510998307952622, + "grad_norm": 1.0164124965667725, + "learning_rate": 1.2447828539199098e-05, + "loss": 0.6743, + "step": 13317 + }, + { + "epoch": 7.511562323745065, + "grad_norm": 1.3177872896194458, + "learning_rate": 1.2445008460236887e-05, + "loss": 0.7163, + "step": 13318 + }, + { + "epoch": 7.512126339537507, + "grad_norm": 1.1091892719268799, + "learning_rate": 1.2442188381274676e-05, + "loss": 0.7231, + "step": 13319 + }, + { + "epoch": 7.5126903553299496, + "grad_norm": 1.0551085472106934, + "learning_rate": 1.2439368302312465e-05, + "loss": 0.645, + "step": 13320 + }, + { + "epoch": 7.513254371122391, + "grad_norm": 1.120760202407837, + "learning_rate": 1.2436548223350254e-05, + "loss": 0.7304, + "step": 13321 + }, + { + "epoch": 7.513818386914833, + "grad_norm": 0.9404204487800598, + "learning_rate": 1.2433728144388042e-05, + "loss": 0.7981, + "step": 13322 + }, + { + "epoch": 7.514382402707276, + "grad_norm": 1.1947537660598755, + "learning_rate": 1.2430908065425833e-05, + "loss": 0.7568, + "step": 13323 + }, + { + "epoch": 7.514946418499718, + "grad_norm": 0.8343985676765442, + "learning_rate": 1.2428087986463622e-05, + "loss": 0.6669, + "step": 13324 + }, + { + "epoch": 7.51551043429216, + "grad_norm": 1.198580026626587, + "learning_rate": 1.242526790750141e-05, + "loss": 0.692, + "step": 13325 + }, + { + "epoch": 7.516074450084602, + "grad_norm": 1.863755464553833, + "learning_rate": 1.24224478285392e-05, + "loss": 0.8862, + "step": 13326 + }, + { + "epoch": 7.516638465877045, + "grad_norm": 1.004461646080017, + "learning_rate": 1.2419627749576988e-05, + "loss": 0.7268, + "step": 13327 + }, + { + "epoch": 7.517202481669487, + "grad_norm": 1.3333640098571777, + "learning_rate": 1.2416807670614777e-05, + "loss": 0.7163, + "step": 13328 + }, + { + "epoch": 7.517766497461929, + "grad_norm": 1.2382264137268066, + "learning_rate": 1.2413987591652566e-05, + "loss": 0.7108, + "step": 13329 + }, + { + "epoch": 7.518330513254371, + "grad_norm": 0.9994521141052246, + "learning_rate": 1.2411167512690356e-05, + "loss": 0.6871, + "step": 13330 + }, + { + "epoch": 7.518894529046813, + "grad_norm": 1.2299408912658691, + "learning_rate": 1.2408347433728145e-05, + "loss": 0.8072, + "step": 13331 + }, + { + "epoch": 7.519458544839256, + "grad_norm": 1.1687933206558228, + "learning_rate": 1.2405527354765934e-05, + "loss": 0.6727, + "step": 13332 + }, + { + "epoch": 7.5200225606316975, + "grad_norm": 1.1461387872695923, + "learning_rate": 1.2402707275803723e-05, + "loss": 0.8918, + "step": 13333 + }, + { + "epoch": 7.52058657642414, + "grad_norm": 1.1321402788162231, + "learning_rate": 1.2399887196841513e-05, + "loss": 0.7333, + "step": 13334 + }, + { + "epoch": 7.521150592216582, + "grad_norm": 1.0611282587051392, + "learning_rate": 1.2397067117879302e-05, + "loss": 0.8083, + "step": 13335 + }, + { + "epoch": 7.521714608009024, + "grad_norm": 1.2700930833816528, + "learning_rate": 1.239424703891709e-05, + "loss": 0.7392, + "step": 13336 + }, + { + "epoch": 7.5222786238014665, + "grad_norm": 1.2028175592422485, + "learning_rate": 1.239142695995488e-05, + "loss": 0.8079, + "step": 13337 + }, + { + "epoch": 7.522842639593908, + "grad_norm": 1.3681820631027222, + "learning_rate": 1.2388606880992668e-05, + "loss": 0.8247, + "step": 13338 + }, + { + "epoch": 7.523406655386351, + "grad_norm": 0.8238645195960999, + "learning_rate": 1.2385786802030459e-05, + "loss": 0.6414, + "step": 13339 + }, + { + "epoch": 7.523970671178793, + "grad_norm": 1.0946892499923706, + "learning_rate": 1.2382966723068248e-05, + "loss": 0.7733, + "step": 13340 + }, + { + "epoch": 7.5245346869712355, + "grad_norm": 0.8400049209594727, + "learning_rate": 1.2380146644106036e-05, + "loss": 0.7227, + "step": 13341 + }, + { + "epoch": 7.525098702763677, + "grad_norm": 1.0357496738433838, + "learning_rate": 1.2377326565143825e-05, + "loss": 0.7813, + "step": 13342 + }, + { + "epoch": 7.52566271855612, + "grad_norm": 1.0072628259658813, + "learning_rate": 1.2374506486181614e-05, + "loss": 0.6398, + "step": 13343 + }, + { + "epoch": 7.526226734348562, + "grad_norm": 0.9880034923553467, + "learning_rate": 1.2371686407219403e-05, + "loss": 0.7598, + "step": 13344 + }, + { + "epoch": 7.526790750141004, + "grad_norm": 1.3248443603515625, + "learning_rate": 1.2368866328257191e-05, + "loss": 0.7338, + "step": 13345 + }, + { + "epoch": 7.527354765933446, + "grad_norm": 0.9793855547904968, + "learning_rate": 1.236604624929498e-05, + "loss": 0.7068, + "step": 13346 + }, + { + "epoch": 7.527918781725888, + "grad_norm": 1.2219741344451904, + "learning_rate": 1.236322617033277e-05, + "loss": 0.8287, + "step": 13347 + }, + { + "epoch": 7.528482797518331, + "grad_norm": 1.0830625295639038, + "learning_rate": 1.236040609137056e-05, + "loss": 0.7011, + "step": 13348 + }, + { + "epoch": 7.529046813310773, + "grad_norm": 1.3970786333084106, + "learning_rate": 1.2357586012408348e-05, + "loss": 0.8225, + "step": 13349 + }, + { + "epoch": 7.529610829103214, + "grad_norm": 1.382968783378601, + "learning_rate": 1.2354765933446137e-05, + "loss": 0.8113, + "step": 13350 + }, + { + "epoch": 7.530174844895657, + "grad_norm": 1.2687441110610962, + "learning_rate": 1.2351945854483926e-05, + "loss": 0.778, + "step": 13351 + }, + { + "epoch": 7.530738860688099, + "grad_norm": 1.1987186670303345, + "learning_rate": 1.2349125775521715e-05, + "loss": 0.7738, + "step": 13352 + }, + { + "epoch": 7.531302876480542, + "grad_norm": 1.0261948108673096, + "learning_rate": 1.2346305696559504e-05, + "loss": 0.7532, + "step": 13353 + }, + { + "epoch": 7.5318668922729834, + "grad_norm": 1.1375731229782104, + "learning_rate": 1.2343485617597294e-05, + "loss": 0.7648, + "step": 13354 + }, + { + "epoch": 7.532430908065426, + "grad_norm": 1.193454623222351, + "learning_rate": 1.2340665538635083e-05, + "loss": 0.7823, + "step": 13355 + }, + { + "epoch": 7.532994923857868, + "grad_norm": 1.6465563774108887, + "learning_rate": 1.2337845459672872e-05, + "loss": 0.8396, + "step": 13356 + }, + { + "epoch": 7.533558939650311, + "grad_norm": 0.8453050851821899, + "learning_rate": 1.233502538071066e-05, + "loss": 0.6304, + "step": 13357 + }, + { + "epoch": 7.5341229554427525, + "grad_norm": 0.9016390442848206, + "learning_rate": 1.233220530174845e-05, + "loss": 0.7576, + "step": 13358 + }, + { + "epoch": 7.534686971235194, + "grad_norm": 0.9104501008987427, + "learning_rate": 1.2329385222786238e-05, + "loss": 0.6789, + "step": 13359 + }, + { + "epoch": 7.535250987027637, + "grad_norm": 1.292044758796692, + "learning_rate": 1.2326565143824027e-05, + "loss": 0.7478, + "step": 13360 + }, + { + "epoch": 7.535815002820079, + "grad_norm": 1.1198437213897705, + "learning_rate": 1.2323745064861816e-05, + "loss": 0.6964, + "step": 13361 + }, + { + "epoch": 7.5363790186125215, + "grad_norm": 1.3203288316726685, + "learning_rate": 1.2320924985899606e-05, + "loss": 0.7245, + "step": 13362 + }, + { + "epoch": 7.536943034404963, + "grad_norm": 1.557139277458191, + "learning_rate": 1.2318104906937395e-05, + "loss": 0.8311, + "step": 13363 + }, + { + "epoch": 7.537507050197405, + "grad_norm": 1.2623518705368042, + "learning_rate": 1.2315284827975184e-05, + "loss": 0.7026, + "step": 13364 + }, + { + "epoch": 7.538071065989848, + "grad_norm": 1.156834363937378, + "learning_rate": 1.2312464749012973e-05, + "loss": 0.6921, + "step": 13365 + }, + { + "epoch": 7.53863508178229, + "grad_norm": 0.800491452217102, + "learning_rate": 1.2309644670050761e-05, + "loss": 0.6682, + "step": 13366 + }, + { + "epoch": 7.539199097574732, + "grad_norm": 1.083186388015747, + "learning_rate": 1.230682459108855e-05, + "loss": 0.8561, + "step": 13367 + }, + { + "epoch": 7.539763113367174, + "grad_norm": 1.409746766090393, + "learning_rate": 1.2304004512126339e-05, + "loss": 0.7952, + "step": 13368 + }, + { + "epoch": 7.540327129159617, + "grad_norm": 1.087438941001892, + "learning_rate": 1.2301184433164128e-05, + "loss": 0.686, + "step": 13369 + }, + { + "epoch": 7.540891144952059, + "grad_norm": 0.8981974124908447, + "learning_rate": 1.2298364354201918e-05, + "loss": 0.736, + "step": 13370 + }, + { + "epoch": 7.541455160744501, + "grad_norm": 1.4716190099716187, + "learning_rate": 1.2295544275239709e-05, + "loss": 0.8196, + "step": 13371 + }, + { + "epoch": 7.542019176536943, + "grad_norm": 1.1126822233200073, + "learning_rate": 1.2292724196277497e-05, + "loss": 0.7682, + "step": 13372 + }, + { + "epoch": 7.542583192329385, + "grad_norm": 1.2605948448181152, + "learning_rate": 1.2289904117315286e-05, + "loss": 0.7995, + "step": 13373 + }, + { + "epoch": 7.543147208121828, + "grad_norm": 0.9393181204795837, + "learning_rate": 1.2287084038353075e-05, + "loss": 0.7622, + "step": 13374 + }, + { + "epoch": 7.543711223914269, + "grad_norm": 1.0544376373291016, + "learning_rate": 1.2284263959390864e-05, + "loss": 0.8158, + "step": 13375 + }, + { + "epoch": 7.544275239706712, + "grad_norm": 1.2624859809875488, + "learning_rate": 1.2281443880428653e-05, + "loss": 0.8201, + "step": 13376 + }, + { + "epoch": 7.544839255499154, + "grad_norm": 0.9005662798881531, + "learning_rate": 1.2278623801466441e-05, + "loss": 0.6827, + "step": 13377 + }, + { + "epoch": 7.545403271291596, + "grad_norm": 1.4532592296600342, + "learning_rate": 1.227580372250423e-05, + "loss": 0.7947, + "step": 13378 + }, + { + "epoch": 7.545967287084038, + "grad_norm": 1.3717018365859985, + "learning_rate": 1.227298364354202e-05, + "loss": 0.8033, + "step": 13379 + }, + { + "epoch": 7.54653130287648, + "grad_norm": 2.061267137527466, + "learning_rate": 1.227016356457981e-05, + "loss": 0.7304, + "step": 13380 + }, + { + "epoch": 7.547095318668923, + "grad_norm": 1.251373052597046, + "learning_rate": 1.2267343485617598e-05, + "loss": 0.7721, + "step": 13381 + }, + { + "epoch": 7.547659334461365, + "grad_norm": 1.411458969116211, + "learning_rate": 1.2264523406655387e-05, + "loss": 0.7823, + "step": 13382 + }, + { + "epoch": 7.548223350253807, + "grad_norm": 1.192658543586731, + "learning_rate": 1.2261703327693176e-05, + "loss": 0.7178, + "step": 13383 + }, + { + "epoch": 7.548787366046249, + "grad_norm": 1.4429454803466797, + "learning_rate": 1.2258883248730965e-05, + "loss": 0.8292, + "step": 13384 + }, + { + "epoch": 7.549351381838692, + "grad_norm": 1.233953595161438, + "learning_rate": 1.2256063169768754e-05, + "loss": 0.7885, + "step": 13385 + }, + { + "epoch": 7.549915397631134, + "grad_norm": 1.025954246520996, + "learning_rate": 1.2253243090806544e-05, + "loss": 0.7147, + "step": 13386 + }, + { + "epoch": 7.5504794134235755, + "grad_norm": 1.5494389533996582, + "learning_rate": 1.2250423011844333e-05, + "loss": 0.7065, + "step": 13387 + }, + { + "epoch": 7.551043429216018, + "grad_norm": 1.0709832906723022, + "learning_rate": 1.2247602932882122e-05, + "loss": 0.6834, + "step": 13388 + }, + { + "epoch": 7.55160744500846, + "grad_norm": 1.1639697551727295, + "learning_rate": 1.224478285391991e-05, + "loss": 0.7856, + "step": 13389 + }, + { + "epoch": 7.552171460800903, + "grad_norm": 1.2576478719711304, + "learning_rate": 1.22419627749577e-05, + "loss": 0.7189, + "step": 13390 + }, + { + "epoch": 7.5527354765933445, + "grad_norm": 1.2126917839050293, + "learning_rate": 1.2239142695995488e-05, + "loss": 0.7784, + "step": 13391 + }, + { + "epoch": 7.553299492385786, + "grad_norm": 1.336808204650879, + "learning_rate": 1.2236322617033277e-05, + "loss": 0.7825, + "step": 13392 + }, + { + "epoch": 7.553863508178229, + "grad_norm": 1.0229401588439941, + "learning_rate": 1.2233502538071066e-05, + "loss": 0.7838, + "step": 13393 + }, + { + "epoch": 7.554427523970671, + "grad_norm": 1.0978050231933594, + "learning_rate": 1.2230682459108856e-05, + "loss": 0.6005, + "step": 13394 + }, + { + "epoch": 7.5549915397631136, + "grad_norm": 1.0787280797958374, + "learning_rate": 1.2227862380146645e-05, + "loss": 0.7119, + "step": 13395 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 1.1939640045166016, + "learning_rate": 1.2225042301184434e-05, + "loss": 0.7064, + "step": 13396 + }, + { + "epoch": 7.556119571347998, + "grad_norm": 0.8872738480567932, + "learning_rate": 1.2222222222222222e-05, + "loss": 0.7298, + "step": 13397 + }, + { + "epoch": 7.55668358714044, + "grad_norm": 1.0490896701812744, + "learning_rate": 1.2219402143260011e-05, + "loss": 0.7597, + "step": 13398 + }, + { + "epoch": 7.557247602932883, + "grad_norm": 0.9552010893821716, + "learning_rate": 1.22165820642978e-05, + "loss": 0.7296, + "step": 13399 + }, + { + "epoch": 7.557811618725324, + "grad_norm": 1.054522156715393, + "learning_rate": 1.2213761985335589e-05, + "loss": 0.6816, + "step": 13400 + }, + { + "epoch": 7.558375634517766, + "grad_norm": 1.1627482175827026, + "learning_rate": 1.2210941906373378e-05, + "loss": 0.7643, + "step": 13401 + }, + { + "epoch": 7.558939650310209, + "grad_norm": 1.098281979560852, + "learning_rate": 1.2208121827411168e-05, + "loss": 0.7408, + "step": 13402 + }, + { + "epoch": 7.559503666102651, + "grad_norm": 1.185247540473938, + "learning_rate": 1.2205301748448957e-05, + "loss": 0.8272, + "step": 13403 + }, + { + "epoch": 7.560067681895093, + "grad_norm": 0.855276346206665, + "learning_rate": 1.2202481669486746e-05, + "loss": 0.7126, + "step": 13404 + }, + { + "epoch": 7.560631697687535, + "grad_norm": 1.2931995391845703, + "learning_rate": 1.2199661590524536e-05, + "loss": 0.7453, + "step": 13405 + }, + { + "epoch": 7.561195713479977, + "grad_norm": 1.0925229787826538, + "learning_rate": 1.2196841511562325e-05, + "loss": 0.6972, + "step": 13406 + }, + { + "epoch": 7.56175972927242, + "grad_norm": 1.2886584997177124, + "learning_rate": 1.2194021432600114e-05, + "loss": 0.7719, + "step": 13407 + }, + { + "epoch": 7.5623237450648615, + "grad_norm": 1.046086072921753, + "learning_rate": 1.2191201353637903e-05, + "loss": 0.688, + "step": 13408 + }, + { + "epoch": 7.562887760857304, + "grad_norm": 1.1184066534042358, + "learning_rate": 1.2188381274675691e-05, + "loss": 0.7972, + "step": 13409 + }, + { + "epoch": 7.563451776649746, + "grad_norm": 1.055094838142395, + "learning_rate": 1.218556119571348e-05, + "loss": 0.7365, + "step": 13410 + }, + { + "epoch": 7.564015792442189, + "grad_norm": 1.0059285163879395, + "learning_rate": 1.218274111675127e-05, + "loss": 0.6642, + "step": 13411 + }, + { + "epoch": 7.5645798082346305, + "grad_norm": 1.1653308868408203, + "learning_rate": 1.217992103778906e-05, + "loss": 0.7049, + "step": 13412 + }, + { + "epoch": 7.565143824027073, + "grad_norm": 1.2765883207321167, + "learning_rate": 1.2177100958826848e-05, + "loss": 0.7803, + "step": 13413 + }, + { + "epoch": 7.565707839819515, + "grad_norm": 0.8046526312828064, + "learning_rate": 1.2174280879864637e-05, + "loss": 0.708, + "step": 13414 + }, + { + "epoch": 7.566271855611957, + "grad_norm": 0.8300604820251465, + "learning_rate": 1.2171460800902426e-05, + "loss": 0.6638, + "step": 13415 + }, + { + "epoch": 7.5668358714043995, + "grad_norm": 1.5410813093185425, + "learning_rate": 1.2168640721940215e-05, + "loss": 0.804, + "step": 13416 + }, + { + "epoch": 7.567399887196841, + "grad_norm": 1.370536208152771, + "learning_rate": 1.2165820642978003e-05, + "loss": 0.7397, + "step": 13417 + }, + { + "epoch": 7.567963902989284, + "grad_norm": 1.0731325149536133, + "learning_rate": 1.2163000564015794e-05, + "loss": 0.7299, + "step": 13418 + }, + { + "epoch": 7.568527918781726, + "grad_norm": 1.069061279296875, + "learning_rate": 1.2160180485053583e-05, + "loss": 0.8147, + "step": 13419 + }, + { + "epoch": 7.569091934574168, + "grad_norm": 1.2678375244140625, + "learning_rate": 1.2157360406091372e-05, + "loss": 0.676, + "step": 13420 + }, + { + "epoch": 7.56965595036661, + "grad_norm": 0.9873997569084167, + "learning_rate": 1.215454032712916e-05, + "loss": 0.7665, + "step": 13421 + }, + { + "epoch": 7.570219966159052, + "grad_norm": 1.0231467485427856, + "learning_rate": 1.215172024816695e-05, + "loss": 0.6319, + "step": 13422 + }, + { + "epoch": 7.570783981951495, + "grad_norm": 1.3870149850845337, + "learning_rate": 1.2148900169204738e-05, + "loss": 0.9156, + "step": 13423 + }, + { + "epoch": 7.571347997743937, + "grad_norm": 1.1646955013275146, + "learning_rate": 1.2146080090242527e-05, + "loss": 0.6252, + "step": 13424 + }, + { + "epoch": 7.571912013536379, + "grad_norm": 1.4912567138671875, + "learning_rate": 1.2143260011280316e-05, + "loss": 0.7748, + "step": 13425 + }, + { + "epoch": 7.572476029328821, + "grad_norm": 1.0635216236114502, + "learning_rate": 1.2140439932318106e-05, + "loss": 0.703, + "step": 13426 + }, + { + "epoch": 7.573040045121264, + "grad_norm": 1.0621100664138794, + "learning_rate": 1.2137619853355895e-05, + "loss": 0.6511, + "step": 13427 + }, + { + "epoch": 7.573604060913706, + "grad_norm": 1.0746703147888184, + "learning_rate": 1.2134799774393684e-05, + "loss": 0.7052, + "step": 13428 + }, + { + "epoch": 7.5741680767061474, + "grad_norm": 1.4169042110443115, + "learning_rate": 1.2131979695431472e-05, + "loss": 0.8019, + "step": 13429 + }, + { + "epoch": 7.57473209249859, + "grad_norm": 0.8553583025932312, + "learning_rate": 1.2129159616469261e-05, + "loss": 0.6846, + "step": 13430 + }, + { + "epoch": 7.575296108291032, + "grad_norm": 1.1834598779678345, + "learning_rate": 1.212633953750705e-05, + "loss": 0.7721, + "step": 13431 + }, + { + "epoch": 7.575860124083475, + "grad_norm": 0.9381166100502014, + "learning_rate": 1.2123519458544839e-05, + "loss": 0.7598, + "step": 13432 + }, + { + "epoch": 7.5764241398759165, + "grad_norm": 0.8907508850097656, + "learning_rate": 1.212069937958263e-05, + "loss": 0.7232, + "step": 13433 + }, + { + "epoch": 7.576988155668358, + "grad_norm": 1.0897173881530762, + "learning_rate": 1.2117879300620418e-05, + "loss": 0.7078, + "step": 13434 + }, + { + "epoch": 7.577552171460801, + "grad_norm": 0.9215393662452698, + "learning_rate": 1.2115059221658207e-05, + "loss": 0.7551, + "step": 13435 + }, + { + "epoch": 7.578116187253243, + "grad_norm": 1.0902491807937622, + "learning_rate": 1.2112239142695996e-05, + "loss": 0.7665, + "step": 13436 + }, + { + "epoch": 7.5786802030456855, + "grad_norm": 1.325312614440918, + "learning_rate": 1.2109419063733784e-05, + "loss": 0.7124, + "step": 13437 + }, + { + "epoch": 7.579244218838127, + "grad_norm": 1.1259570121765137, + "learning_rate": 1.2106598984771573e-05, + "loss": 0.7676, + "step": 13438 + }, + { + "epoch": 7.57980823463057, + "grad_norm": 1.0658698081970215, + "learning_rate": 1.2103778905809362e-05, + "loss": 0.8266, + "step": 13439 + }, + { + "epoch": 7.580372250423012, + "grad_norm": 1.6316686868667603, + "learning_rate": 1.2100958826847153e-05, + "loss": 0.742, + "step": 13440 + }, + { + "epoch": 7.5809362662154545, + "grad_norm": 1.1329448223114014, + "learning_rate": 1.2098138747884941e-05, + "loss": 0.768, + "step": 13441 + }, + { + "epoch": 7.581500282007896, + "grad_norm": 0.9989105463027954, + "learning_rate": 1.2095318668922732e-05, + "loss": 0.7348, + "step": 13442 + }, + { + "epoch": 7.582064297800338, + "grad_norm": 1.366527795791626, + "learning_rate": 1.209249858996052e-05, + "loss": 0.6645, + "step": 13443 + }, + { + "epoch": 7.582628313592781, + "grad_norm": 1.2179089784622192, + "learning_rate": 1.208967851099831e-05, + "loss": 0.7568, + "step": 13444 + }, + { + "epoch": 7.583192329385223, + "grad_norm": 1.2411699295043945, + "learning_rate": 1.2086858432036098e-05, + "loss": 0.7599, + "step": 13445 + }, + { + "epoch": 7.583756345177665, + "grad_norm": 1.186671257019043, + "learning_rate": 1.2084038353073887e-05, + "loss": 0.7455, + "step": 13446 + }, + { + "epoch": 7.584320360970107, + "grad_norm": 1.0723676681518555, + "learning_rate": 1.2081218274111676e-05, + "loss": 0.7781, + "step": 13447 + }, + { + "epoch": 7.584884376762549, + "grad_norm": 1.4751447439193726, + "learning_rate": 1.2078398195149465e-05, + "loss": 0.9864, + "step": 13448 + }, + { + "epoch": 7.585448392554992, + "grad_norm": 1.1869789361953735, + "learning_rate": 1.2075578116187253e-05, + "loss": 0.7467, + "step": 13449 + }, + { + "epoch": 7.586012408347433, + "grad_norm": 1.1687979698181152, + "learning_rate": 1.2072758037225044e-05, + "loss": 0.7035, + "step": 13450 + }, + { + "epoch": 7.586576424139876, + "grad_norm": 0.9663916826248169, + "learning_rate": 1.2069937958262833e-05, + "loss": 0.6348, + "step": 13451 + }, + { + "epoch": 7.587140439932318, + "grad_norm": 1.447433590888977, + "learning_rate": 1.2067117879300622e-05, + "loss": 0.6929, + "step": 13452 + }, + { + "epoch": 7.587704455724761, + "grad_norm": 1.314131259918213, + "learning_rate": 1.206429780033841e-05, + "loss": 0.8084, + "step": 13453 + }, + { + "epoch": 7.588268471517202, + "grad_norm": 1.0709600448608398, + "learning_rate": 1.2061477721376199e-05, + "loss": 0.7037, + "step": 13454 + }, + { + "epoch": 7.588832487309645, + "grad_norm": 1.103563904762268, + "learning_rate": 1.2058657642413988e-05, + "loss": 0.768, + "step": 13455 + }, + { + "epoch": 7.589396503102087, + "grad_norm": 1.4002052545547485, + "learning_rate": 1.2055837563451777e-05, + "loss": 0.8358, + "step": 13456 + }, + { + "epoch": 7.589960518894529, + "grad_norm": 1.196589469909668, + "learning_rate": 1.2053017484489566e-05, + "loss": 0.6516, + "step": 13457 + }, + { + "epoch": 7.590524534686971, + "grad_norm": 1.197852373123169, + "learning_rate": 1.2050197405527356e-05, + "loss": 0.839, + "step": 13458 + }, + { + "epoch": 7.591088550479413, + "grad_norm": 1.5992562770843506, + "learning_rate": 1.2047377326565145e-05, + "loss": 0.8914, + "step": 13459 + }, + { + "epoch": 7.591652566271856, + "grad_norm": 0.9069504737854004, + "learning_rate": 1.2044557247602934e-05, + "loss": 0.6447, + "step": 13460 + }, + { + "epoch": 7.592216582064298, + "grad_norm": 1.019014596939087, + "learning_rate": 1.2041737168640722e-05, + "loss": 0.8325, + "step": 13461 + }, + { + "epoch": 7.5927805978567395, + "grad_norm": 1.077813982963562, + "learning_rate": 1.2038917089678511e-05, + "loss": 0.7488, + "step": 13462 + }, + { + "epoch": 7.593344613649182, + "grad_norm": 1.1887903213500977, + "learning_rate": 1.20360970107163e-05, + "loss": 0.7465, + "step": 13463 + }, + { + "epoch": 7.593908629441624, + "grad_norm": 1.4179973602294922, + "learning_rate": 1.2033276931754089e-05, + "loss": 0.7609, + "step": 13464 + }, + { + "epoch": 7.594472645234067, + "grad_norm": 1.0379323959350586, + "learning_rate": 1.203045685279188e-05, + "loss": 0.814, + "step": 13465 + }, + { + "epoch": 7.5950366610265085, + "grad_norm": 1.3364815711975098, + "learning_rate": 1.2027636773829668e-05, + "loss": 0.7244, + "step": 13466 + }, + { + "epoch": 7.595600676818951, + "grad_norm": 1.5506174564361572, + "learning_rate": 1.2024816694867457e-05, + "loss": 0.8093, + "step": 13467 + }, + { + "epoch": 7.596164692611393, + "grad_norm": 0.9550436735153198, + "learning_rate": 1.2021996615905246e-05, + "loss": 0.6782, + "step": 13468 + }, + { + "epoch": 7.596728708403836, + "grad_norm": 1.0024093389511108, + "learning_rate": 1.2019176536943034e-05, + "loss": 0.6849, + "step": 13469 + }, + { + "epoch": 7.5972927241962775, + "grad_norm": 1.077596664428711, + "learning_rate": 1.2016356457980823e-05, + "loss": 0.6916, + "step": 13470 + }, + { + "epoch": 7.597856739988719, + "grad_norm": 1.0731121301651, + "learning_rate": 1.2013536379018612e-05, + "loss": 0.7006, + "step": 13471 + }, + { + "epoch": 7.598420755781162, + "grad_norm": 1.566319227218628, + "learning_rate": 1.20107163000564e-05, + "loss": 0.7789, + "step": 13472 + }, + { + "epoch": 7.598984771573604, + "grad_norm": 1.2082237005233765, + "learning_rate": 1.2007896221094191e-05, + "loss": 0.7178, + "step": 13473 + }, + { + "epoch": 7.5995487873660466, + "grad_norm": 0.9832833409309387, + "learning_rate": 1.200507614213198e-05, + "loss": 0.6571, + "step": 13474 + }, + { + "epoch": 7.600112803158488, + "grad_norm": 1.453317642211914, + "learning_rate": 1.2002256063169769e-05, + "loss": 0.8235, + "step": 13475 + }, + { + "epoch": 7.60067681895093, + "grad_norm": 1.1533482074737549, + "learning_rate": 1.199943598420756e-05, + "loss": 0.7131, + "step": 13476 + }, + { + "epoch": 7.601240834743373, + "grad_norm": 0.82097989320755, + "learning_rate": 1.1996615905245348e-05, + "loss": 0.6475, + "step": 13477 + }, + { + "epoch": 7.601804850535815, + "grad_norm": 1.0848854780197144, + "learning_rate": 1.1993795826283137e-05, + "loss": 0.7556, + "step": 13478 + }, + { + "epoch": 7.602368866328257, + "grad_norm": 1.0953593254089355, + "learning_rate": 1.1990975747320926e-05, + "loss": 0.7425, + "step": 13479 + }, + { + "epoch": 7.602932882120699, + "grad_norm": 0.9879669547080994, + "learning_rate": 1.1988155668358715e-05, + "loss": 0.6919, + "step": 13480 + }, + { + "epoch": 7.603496897913142, + "grad_norm": 1.2736960649490356, + "learning_rate": 1.1985335589396503e-05, + "loss": 0.7565, + "step": 13481 + }, + { + "epoch": 7.604060913705584, + "grad_norm": 1.2210124731063843, + "learning_rate": 1.1982515510434294e-05, + "loss": 0.844, + "step": 13482 + }, + { + "epoch": 7.604624929498026, + "grad_norm": 1.0410133600234985, + "learning_rate": 1.1979695431472083e-05, + "loss": 0.7242, + "step": 13483 + }, + { + "epoch": 7.605188945290468, + "grad_norm": 1.019896388053894, + "learning_rate": 1.1976875352509871e-05, + "loss": 0.7143, + "step": 13484 + }, + { + "epoch": 7.60575296108291, + "grad_norm": 0.9406927227973938, + "learning_rate": 1.197405527354766e-05, + "loss": 0.7856, + "step": 13485 + }, + { + "epoch": 7.606316976875353, + "grad_norm": 0.9297866225242615, + "learning_rate": 1.1971235194585449e-05, + "loss": 0.6762, + "step": 13486 + }, + { + "epoch": 7.6068809926677945, + "grad_norm": 1.8674181699752808, + "learning_rate": 1.1968415115623238e-05, + "loss": 0.8633, + "step": 13487 + }, + { + "epoch": 7.607445008460237, + "grad_norm": 1.2536756992340088, + "learning_rate": 1.1965595036661027e-05, + "loss": 0.7726, + "step": 13488 + }, + { + "epoch": 7.608009024252679, + "grad_norm": 1.2008496522903442, + "learning_rate": 1.1962774957698815e-05, + "loss": 0.7062, + "step": 13489 + }, + { + "epoch": 7.608573040045121, + "grad_norm": 0.839030921459198, + "learning_rate": 1.1959954878736606e-05, + "loss": 0.6352, + "step": 13490 + }, + { + "epoch": 7.6091370558375635, + "grad_norm": 1.2117043733596802, + "learning_rate": 1.1957134799774395e-05, + "loss": 0.7125, + "step": 13491 + }, + { + "epoch": 7.609701071630005, + "grad_norm": 1.1468440294265747, + "learning_rate": 1.1954314720812184e-05, + "loss": 0.76, + "step": 13492 + }, + { + "epoch": 7.610265087422448, + "grad_norm": 1.0437580347061157, + "learning_rate": 1.1951494641849972e-05, + "loss": 0.749, + "step": 13493 + }, + { + "epoch": 7.61082910321489, + "grad_norm": 1.0352373123168945, + "learning_rate": 1.1948674562887761e-05, + "loss": 0.7198, + "step": 13494 + }, + { + "epoch": 7.6113931190073325, + "grad_norm": 0.8887649178504944, + "learning_rate": 1.194585448392555e-05, + "loss": 0.6736, + "step": 13495 + }, + { + "epoch": 7.611957134799774, + "grad_norm": 1.2198431491851807, + "learning_rate": 1.1943034404963339e-05, + "loss": 0.8106, + "step": 13496 + }, + { + "epoch": 7.612521150592217, + "grad_norm": 0.9760954976081848, + "learning_rate": 1.194021432600113e-05, + "loss": 0.7785, + "step": 13497 + }, + { + "epoch": 7.613085166384659, + "grad_norm": 1.1671699285507202, + "learning_rate": 1.1937394247038918e-05, + "loss": 0.7626, + "step": 13498 + }, + { + "epoch": 7.613649182177101, + "grad_norm": 1.1655514240264893, + "learning_rate": 1.1934574168076707e-05, + "loss": 0.6741, + "step": 13499 + }, + { + "epoch": 7.614213197969543, + "grad_norm": 0.8859889507293701, + "learning_rate": 1.1931754089114496e-05, + "loss": 0.6528, + "step": 13500 + }, + { + "epoch": 7.614777213761985, + "grad_norm": 1.4014736413955688, + "learning_rate": 1.1928934010152284e-05, + "loss": 0.7191, + "step": 13501 + }, + { + "epoch": 7.615341229554428, + "grad_norm": 1.138163685798645, + "learning_rate": 1.1926113931190073e-05, + "loss": 0.7226, + "step": 13502 + }, + { + "epoch": 7.61590524534687, + "grad_norm": 1.2484480142593384, + "learning_rate": 1.1923293852227862e-05, + "loss": 0.7563, + "step": 13503 + }, + { + "epoch": 7.616469261139311, + "grad_norm": 1.0100747346878052, + "learning_rate": 1.192047377326565e-05, + "loss": 0.6985, + "step": 13504 + }, + { + "epoch": 7.617033276931754, + "grad_norm": 1.0072131156921387, + "learning_rate": 1.1917653694303441e-05, + "loss": 0.7499, + "step": 13505 + }, + { + "epoch": 7.617597292724196, + "grad_norm": 1.2592322826385498, + "learning_rate": 1.191483361534123e-05, + "loss": 0.6446, + "step": 13506 + }, + { + "epoch": 7.618161308516639, + "grad_norm": 0.917455792427063, + "learning_rate": 1.1912013536379019e-05, + "loss": 0.6594, + "step": 13507 + }, + { + "epoch": 7.6187253243090804, + "grad_norm": 1.2883501052856445, + "learning_rate": 1.1909193457416808e-05, + "loss": 0.7177, + "step": 13508 + }, + { + "epoch": 7.619289340101523, + "grad_norm": 1.580651879310608, + "learning_rate": 1.1906373378454596e-05, + "loss": 0.6882, + "step": 13509 + }, + { + "epoch": 7.619853355893965, + "grad_norm": 1.136240005493164, + "learning_rate": 1.1903553299492385e-05, + "loss": 0.6551, + "step": 13510 + }, + { + "epoch": 7.620417371686408, + "grad_norm": 1.006110429763794, + "learning_rate": 1.1900733220530176e-05, + "loss": 0.6437, + "step": 13511 + }, + { + "epoch": 7.6209813874788495, + "grad_norm": 1.3845444917678833, + "learning_rate": 1.1897913141567965e-05, + "loss": 0.7806, + "step": 13512 + }, + { + "epoch": 7.621545403271291, + "grad_norm": 0.9514762759208679, + "learning_rate": 1.1895093062605753e-05, + "loss": 0.6616, + "step": 13513 + }, + { + "epoch": 7.622109419063734, + "grad_norm": 1.0639824867248535, + "learning_rate": 1.1892272983643544e-05, + "loss": 0.7772, + "step": 13514 + }, + { + "epoch": 7.622673434856176, + "grad_norm": 0.9930744171142578, + "learning_rate": 1.1889452904681333e-05, + "loss": 0.7165, + "step": 13515 + }, + { + "epoch": 7.6232374506486185, + "grad_norm": 1.1275089979171753, + "learning_rate": 1.1886632825719121e-05, + "loss": 0.6968, + "step": 13516 + }, + { + "epoch": 7.62380146644106, + "grad_norm": 1.1906859874725342, + "learning_rate": 1.188381274675691e-05, + "loss": 0.7661, + "step": 13517 + }, + { + "epoch": 7.624365482233502, + "grad_norm": 0.9514314532279968, + "learning_rate": 1.1880992667794699e-05, + "loss": 0.6242, + "step": 13518 + }, + { + "epoch": 7.624929498025945, + "grad_norm": 1.0568416118621826, + "learning_rate": 1.1878172588832488e-05, + "loss": 0.7616, + "step": 13519 + }, + { + "epoch": 7.625493513818387, + "grad_norm": 1.0414332151412964, + "learning_rate": 1.1875352509870277e-05, + "loss": 0.6749, + "step": 13520 + }, + { + "epoch": 7.626057529610829, + "grad_norm": 0.9990115761756897, + "learning_rate": 1.1872532430908065e-05, + "loss": 0.7446, + "step": 13521 + }, + { + "epoch": 7.626621545403271, + "grad_norm": 1.2641912698745728, + "learning_rate": 1.1869712351945856e-05, + "loss": 0.6973, + "step": 13522 + }, + { + "epoch": 7.627185561195714, + "grad_norm": 0.9090347290039062, + "learning_rate": 1.1866892272983645e-05, + "loss": 0.6333, + "step": 13523 + }, + { + "epoch": 7.627749576988156, + "grad_norm": 1.1445043087005615, + "learning_rate": 1.1864072194021434e-05, + "loss": 0.7704, + "step": 13524 + }, + { + "epoch": 7.628313592780598, + "grad_norm": 0.8951002955436707, + "learning_rate": 1.1861252115059222e-05, + "loss": 0.6516, + "step": 13525 + }, + { + "epoch": 7.62887760857304, + "grad_norm": 1.0498396158218384, + "learning_rate": 1.1858432036097011e-05, + "loss": 0.8052, + "step": 13526 + }, + { + "epoch": 7.629441624365482, + "grad_norm": 1.0845236778259277, + "learning_rate": 1.18556119571348e-05, + "loss": 0.7783, + "step": 13527 + }, + { + "epoch": 7.630005640157925, + "grad_norm": 1.3126351833343506, + "learning_rate": 1.1852791878172589e-05, + "loss": 0.8192, + "step": 13528 + }, + { + "epoch": 7.630569655950366, + "grad_norm": 0.8709771037101746, + "learning_rate": 1.184997179921038e-05, + "loss": 0.6867, + "step": 13529 + }, + { + "epoch": 7.631133671742809, + "grad_norm": 1.2420176267623901, + "learning_rate": 1.1847151720248168e-05, + "loss": 0.7501, + "step": 13530 + }, + { + "epoch": 7.631697687535251, + "grad_norm": 1.3492603302001953, + "learning_rate": 1.1844331641285957e-05, + "loss": 0.7476, + "step": 13531 + }, + { + "epoch": 7.632261703327693, + "grad_norm": 1.2262307405471802, + "learning_rate": 1.1841511562323746e-05, + "loss": 0.7548, + "step": 13532 + }, + { + "epoch": 7.632825719120135, + "grad_norm": 0.9855289459228516, + "learning_rate": 1.1838691483361534e-05, + "loss": 0.8344, + "step": 13533 + }, + { + "epoch": 7.633389734912577, + "grad_norm": 1.591766119003296, + "learning_rate": 1.1835871404399323e-05, + "loss": 0.7495, + "step": 13534 + }, + { + "epoch": 7.63395375070502, + "grad_norm": 1.5642313957214355, + "learning_rate": 1.1833051325437112e-05, + "loss": 0.878, + "step": 13535 + }, + { + "epoch": 7.634517766497462, + "grad_norm": 1.2018427848815918, + "learning_rate": 1.18302312464749e-05, + "loss": 0.7315, + "step": 13536 + }, + { + "epoch": 7.635081782289904, + "grad_norm": 1.2239878177642822, + "learning_rate": 1.1827411167512691e-05, + "loss": 0.6904, + "step": 13537 + }, + { + "epoch": 7.635645798082346, + "grad_norm": 1.0865801572799683, + "learning_rate": 1.182459108855048e-05, + "loss": 0.7268, + "step": 13538 + }, + { + "epoch": 7.636209813874789, + "grad_norm": 1.1454308032989502, + "learning_rate": 1.1821771009588269e-05, + "loss": 0.7051, + "step": 13539 + }, + { + "epoch": 7.636773829667231, + "grad_norm": 0.924612283706665, + "learning_rate": 1.1818950930626058e-05, + "loss": 0.6208, + "step": 13540 + }, + { + "epoch": 7.6373378454596725, + "grad_norm": 1.0974845886230469, + "learning_rate": 1.1816130851663846e-05, + "loss": 0.6756, + "step": 13541 + }, + { + "epoch": 7.637901861252115, + "grad_norm": 1.2483891248703003, + "learning_rate": 1.1813310772701635e-05, + "loss": 0.7147, + "step": 13542 + }, + { + "epoch": 7.638465877044557, + "grad_norm": 1.706210732460022, + "learning_rate": 1.1810490693739424e-05, + "loss": 0.8418, + "step": 13543 + }, + { + "epoch": 7.639029892837, + "grad_norm": 0.9227699041366577, + "learning_rate": 1.1807670614777215e-05, + "loss": 0.6271, + "step": 13544 + }, + { + "epoch": 7.6395939086294415, + "grad_norm": 1.1191428899765015, + "learning_rate": 1.1804850535815003e-05, + "loss": 0.7522, + "step": 13545 + }, + { + "epoch": 7.640157924421883, + "grad_norm": 1.4010287523269653, + "learning_rate": 1.1802030456852794e-05, + "loss": 0.7443, + "step": 13546 + }, + { + "epoch": 7.640721940214326, + "grad_norm": 0.8759185075759888, + "learning_rate": 1.1799210377890583e-05, + "loss": 0.6071, + "step": 13547 + }, + { + "epoch": 7.641285956006768, + "grad_norm": 1.4597197771072388, + "learning_rate": 1.1796390298928371e-05, + "loss": 0.7679, + "step": 13548 + }, + { + "epoch": 7.6418499717992106, + "grad_norm": 1.1693453788757324, + "learning_rate": 1.179357021996616e-05, + "loss": 0.7498, + "step": 13549 + }, + { + "epoch": 7.642413987591652, + "grad_norm": 1.0401877164840698, + "learning_rate": 1.1790750141003949e-05, + "loss": 0.8269, + "step": 13550 + }, + { + "epoch": 7.642978003384095, + "grad_norm": 1.2822284698486328, + "learning_rate": 1.1787930062041738e-05, + "loss": 0.7609, + "step": 13551 + }, + { + "epoch": 7.643542019176537, + "grad_norm": 1.1114088296890259, + "learning_rate": 1.1785109983079527e-05, + "loss": 0.6595, + "step": 13552 + }, + { + "epoch": 7.64410603496898, + "grad_norm": 1.1975733041763306, + "learning_rate": 1.1782289904117317e-05, + "loss": 0.6843, + "step": 13553 + }, + { + "epoch": 7.644670050761421, + "grad_norm": 0.9959655404090881, + "learning_rate": 1.1779469825155106e-05, + "loss": 0.784, + "step": 13554 + }, + { + "epoch": 7.645234066553863, + "grad_norm": 1.135927677154541, + "learning_rate": 1.1776649746192895e-05, + "loss": 0.7971, + "step": 13555 + }, + { + "epoch": 7.645798082346306, + "grad_norm": 1.21028470993042, + "learning_rate": 1.1773829667230683e-05, + "loss": 0.8202, + "step": 13556 + }, + { + "epoch": 7.646362098138748, + "grad_norm": 0.982040524482727, + "learning_rate": 1.1771009588268472e-05, + "loss": 0.7118, + "step": 13557 + }, + { + "epoch": 7.64692611393119, + "grad_norm": 0.9571792483329773, + "learning_rate": 1.1768189509306261e-05, + "loss": 0.7153, + "step": 13558 + }, + { + "epoch": 7.647490129723632, + "grad_norm": 1.0574746131896973, + "learning_rate": 1.176536943034405e-05, + "loss": 0.7697, + "step": 13559 + }, + { + "epoch": 7.648054145516074, + "grad_norm": 1.0064287185668945, + "learning_rate": 1.1762549351381839e-05, + "loss": 0.718, + "step": 13560 + }, + { + "epoch": 7.648618161308517, + "grad_norm": 1.1557462215423584, + "learning_rate": 1.1759729272419629e-05, + "loss": 0.7439, + "step": 13561 + }, + { + "epoch": 7.6491821771009585, + "grad_norm": 1.0656203031539917, + "learning_rate": 1.1756909193457418e-05, + "loss": 0.6872, + "step": 13562 + }, + { + "epoch": 7.649746192893401, + "grad_norm": 1.1632943153381348, + "learning_rate": 1.1754089114495207e-05, + "loss": 0.7822, + "step": 13563 + }, + { + "epoch": 7.650310208685843, + "grad_norm": 1.2977044582366943, + "learning_rate": 1.1751269035532996e-05, + "loss": 0.64, + "step": 13564 + }, + { + "epoch": 7.650874224478286, + "grad_norm": 0.8467106819152832, + "learning_rate": 1.1748448956570784e-05, + "loss": 0.6252, + "step": 13565 + }, + { + "epoch": 7.6514382402707275, + "grad_norm": 1.3260021209716797, + "learning_rate": 1.1745628877608573e-05, + "loss": 0.7085, + "step": 13566 + }, + { + "epoch": 7.65200225606317, + "grad_norm": 0.941424548625946, + "learning_rate": 1.1742808798646362e-05, + "loss": 0.7726, + "step": 13567 + }, + { + "epoch": 7.652566271855612, + "grad_norm": 0.9972519278526306, + "learning_rate": 1.173998871968415e-05, + "loss": 0.6769, + "step": 13568 + }, + { + "epoch": 7.653130287648054, + "grad_norm": 1.1202483177185059, + "learning_rate": 1.1737168640721941e-05, + "loss": 0.6491, + "step": 13569 + }, + { + "epoch": 7.6536943034404965, + "grad_norm": 1.1878982782363892, + "learning_rate": 1.173434856175973e-05, + "loss": 0.716, + "step": 13570 + }, + { + "epoch": 7.654258319232938, + "grad_norm": 0.9239189028739929, + "learning_rate": 1.1731528482797519e-05, + "loss": 0.6544, + "step": 13571 + }, + { + "epoch": 7.654822335025381, + "grad_norm": 1.3684684038162231, + "learning_rate": 1.1728708403835308e-05, + "loss": 0.755, + "step": 13572 + }, + { + "epoch": 7.655386350817823, + "grad_norm": 1.1714357137680054, + "learning_rate": 1.1725888324873096e-05, + "loss": 0.6419, + "step": 13573 + }, + { + "epoch": 7.655950366610265, + "grad_norm": 1.1095054149627686, + "learning_rate": 1.1723068245910885e-05, + "loss": 0.6769, + "step": 13574 + }, + { + "epoch": 7.656514382402707, + "grad_norm": 1.1217044591903687, + "learning_rate": 1.1720248166948674e-05, + "loss": 0.7337, + "step": 13575 + }, + { + "epoch": 7.657078398195149, + "grad_norm": 1.5349838733673096, + "learning_rate": 1.1717428087986464e-05, + "loss": 0.7974, + "step": 13576 + }, + { + "epoch": 7.657642413987592, + "grad_norm": 1.142672061920166, + "learning_rate": 1.1714608009024253e-05, + "loss": 0.7581, + "step": 13577 + }, + { + "epoch": 7.658206429780034, + "grad_norm": 1.060327172279358, + "learning_rate": 1.1711787930062042e-05, + "loss": 0.7137, + "step": 13578 + }, + { + "epoch": 7.658770445572476, + "grad_norm": 1.2106480598449707, + "learning_rate": 1.1708967851099831e-05, + "loss": 0.7768, + "step": 13579 + }, + { + "epoch": 7.659334461364918, + "grad_norm": 1.1641405820846558, + "learning_rate": 1.170614777213762e-05, + "loss": 0.7133, + "step": 13580 + }, + { + "epoch": 7.659898477157361, + "grad_norm": 1.2894971370697021, + "learning_rate": 1.1703327693175408e-05, + "loss": 0.7234, + "step": 13581 + }, + { + "epoch": 7.660462492949803, + "grad_norm": 1.9396538734436035, + "learning_rate": 1.1700507614213199e-05, + "loss": 0.8036, + "step": 13582 + }, + { + "epoch": 7.6610265087422444, + "grad_norm": 0.9458690881729126, + "learning_rate": 1.1697687535250988e-05, + "loss": 0.7847, + "step": 13583 + }, + { + "epoch": 7.661590524534687, + "grad_norm": 0.9939272403717041, + "learning_rate": 1.1694867456288777e-05, + "loss": 0.7298, + "step": 13584 + }, + { + "epoch": 7.662154540327129, + "grad_norm": 0.9392428994178772, + "learning_rate": 1.1692047377326567e-05, + "loss": 0.7518, + "step": 13585 + }, + { + "epoch": 7.662718556119572, + "grad_norm": 1.2706671953201294, + "learning_rate": 1.1689227298364356e-05, + "loss": 0.7844, + "step": 13586 + }, + { + "epoch": 7.6632825719120135, + "grad_norm": 1.019668698310852, + "learning_rate": 1.1686407219402145e-05, + "loss": 0.6196, + "step": 13587 + }, + { + "epoch": 7.663846587704455, + "grad_norm": 1.2169280052185059, + "learning_rate": 1.1683587140439933e-05, + "loss": 0.6848, + "step": 13588 + }, + { + "epoch": 7.664410603496898, + "grad_norm": 0.9962421655654907, + "learning_rate": 1.1680767061477722e-05, + "loss": 0.7784, + "step": 13589 + }, + { + "epoch": 7.66497461928934, + "grad_norm": 1.2320363521575928, + "learning_rate": 1.1677946982515511e-05, + "loss": 0.9072, + "step": 13590 + }, + { + "epoch": 7.6655386350817825, + "grad_norm": 1.3344841003417969, + "learning_rate": 1.16751269035533e-05, + "loss": 0.7673, + "step": 13591 + }, + { + "epoch": 7.666102650874224, + "grad_norm": 1.1818790435791016, + "learning_rate": 1.1672306824591089e-05, + "loss": 0.647, + "step": 13592 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 0.9477207064628601, + "learning_rate": 1.1669486745628879e-05, + "loss": 0.7203, + "step": 13593 + }, + { + "epoch": 7.667230682459109, + "grad_norm": 1.5381062030792236, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.757, + "step": 13594 + }, + { + "epoch": 7.6677946982515515, + "grad_norm": 0.9649113416671753, + "learning_rate": 1.1663846587704457e-05, + "loss": 0.6257, + "step": 13595 + }, + { + "epoch": 7.668358714043993, + "grad_norm": 1.3849762678146362, + "learning_rate": 1.1661026508742245e-05, + "loss": 0.7322, + "step": 13596 + }, + { + "epoch": 7.668922729836435, + "grad_norm": 1.165380835533142, + "learning_rate": 1.1658206429780034e-05, + "loss": 0.7883, + "step": 13597 + }, + { + "epoch": 7.669486745628878, + "grad_norm": 1.0382301807403564, + "learning_rate": 1.1655386350817823e-05, + "loss": 0.639, + "step": 13598 + }, + { + "epoch": 7.67005076142132, + "grad_norm": 1.2861483097076416, + "learning_rate": 1.1652566271855612e-05, + "loss": 0.7008, + "step": 13599 + }, + { + "epoch": 7.670614777213762, + "grad_norm": 1.2278733253479004, + "learning_rate": 1.16497461928934e-05, + "loss": 0.7715, + "step": 13600 + }, + { + "epoch": 7.671178793006204, + "grad_norm": 1.2278485298156738, + "learning_rate": 1.1646926113931191e-05, + "loss": 0.68, + "step": 13601 + }, + { + "epoch": 7.671742808798646, + "grad_norm": 1.2298893928527832, + "learning_rate": 1.164410603496898e-05, + "loss": 0.8556, + "step": 13602 + }, + { + "epoch": 7.672306824591089, + "grad_norm": 0.9408788681030273, + "learning_rate": 1.1641285956006769e-05, + "loss": 0.6544, + "step": 13603 + }, + { + "epoch": 7.67287084038353, + "grad_norm": 0.9155644774436951, + "learning_rate": 1.1638465877044558e-05, + "loss": 0.75, + "step": 13604 + }, + { + "epoch": 7.673434856175973, + "grad_norm": 1.2045791149139404, + "learning_rate": 1.1635645798082346e-05, + "loss": 0.7681, + "step": 13605 + }, + { + "epoch": 7.673998871968415, + "grad_norm": 1.6410564184188843, + "learning_rate": 1.1632825719120135e-05, + "loss": 0.7785, + "step": 13606 + }, + { + "epoch": 7.674562887760858, + "grad_norm": 1.2790697813034058, + "learning_rate": 1.1630005640157924e-05, + "loss": 0.8634, + "step": 13607 + }, + { + "epoch": 7.675126903553299, + "grad_norm": 1.2310433387756348, + "learning_rate": 1.1627185561195714e-05, + "loss": 0.7139, + "step": 13608 + }, + { + "epoch": 7.675690919345742, + "grad_norm": 1.3094065189361572, + "learning_rate": 1.1624365482233503e-05, + "loss": 0.8241, + "step": 13609 + }, + { + "epoch": 7.676254935138184, + "grad_norm": 1.1170573234558105, + "learning_rate": 1.1621545403271292e-05, + "loss": 0.7642, + "step": 13610 + }, + { + "epoch": 7.676818950930626, + "grad_norm": 1.4740217924118042, + "learning_rate": 1.161872532430908e-05, + "loss": 0.8662, + "step": 13611 + }, + { + "epoch": 7.677382966723068, + "grad_norm": 1.002384066581726, + "learning_rate": 1.161590524534687e-05, + "loss": 0.7352, + "step": 13612 + }, + { + "epoch": 7.67794698251551, + "grad_norm": 1.0180646181106567, + "learning_rate": 1.1613085166384658e-05, + "loss": 0.7216, + "step": 13613 + }, + { + "epoch": 7.678510998307953, + "grad_norm": 1.4193916320800781, + "learning_rate": 1.1610265087422447e-05, + "loss": 0.8272, + "step": 13614 + }, + { + "epoch": 7.679075014100395, + "grad_norm": 1.1289180517196655, + "learning_rate": 1.1607445008460236e-05, + "loss": 0.6617, + "step": 13615 + }, + { + "epoch": 7.6796390298928365, + "grad_norm": 1.2373939752578735, + "learning_rate": 1.1604624929498027e-05, + "loss": 0.7667, + "step": 13616 + }, + { + "epoch": 7.680203045685279, + "grad_norm": 1.2665375471115112, + "learning_rate": 1.1601804850535817e-05, + "loss": 0.8181, + "step": 13617 + }, + { + "epoch": 7.680767061477721, + "grad_norm": 0.7691184282302856, + "learning_rate": 1.1598984771573606e-05, + "loss": 0.6402, + "step": 13618 + }, + { + "epoch": 7.681331077270164, + "grad_norm": 0.9906105399131775, + "learning_rate": 1.1596164692611395e-05, + "loss": 0.6997, + "step": 13619 + }, + { + "epoch": 7.6818950930626055, + "grad_norm": 0.9941384196281433, + "learning_rate": 1.1593344613649183e-05, + "loss": 0.7132, + "step": 13620 + }, + { + "epoch": 7.682459108855048, + "grad_norm": 0.832711935043335, + "learning_rate": 1.1590524534686972e-05, + "loss": 0.6984, + "step": 13621 + }, + { + "epoch": 7.68302312464749, + "grad_norm": 1.210459589958191, + "learning_rate": 1.1587704455724761e-05, + "loss": 0.7321, + "step": 13622 + }, + { + "epoch": 7.683587140439933, + "grad_norm": 0.9947899580001831, + "learning_rate": 1.158488437676255e-05, + "loss": 0.6862, + "step": 13623 + }, + { + "epoch": 7.6841511562323745, + "grad_norm": 1.342125654220581, + "learning_rate": 1.1582064297800339e-05, + "loss": 0.7842, + "step": 13624 + }, + { + "epoch": 7.684715172024816, + "grad_norm": 1.3781330585479736, + "learning_rate": 1.1579244218838129e-05, + "loss": 0.7238, + "step": 13625 + }, + { + "epoch": 7.685279187817259, + "grad_norm": 0.969734787940979, + "learning_rate": 1.1576424139875918e-05, + "loss": 0.6595, + "step": 13626 + }, + { + "epoch": 7.685843203609701, + "grad_norm": 1.2980672121047974, + "learning_rate": 1.1573604060913707e-05, + "loss": 0.754, + "step": 13627 + }, + { + "epoch": 7.6864072194021436, + "grad_norm": 1.0404307842254639, + "learning_rate": 1.1570783981951495e-05, + "loss": 0.7067, + "step": 13628 + }, + { + "epoch": 7.686971235194585, + "grad_norm": 1.5092347860336304, + "learning_rate": 1.1567963902989284e-05, + "loss": 0.7855, + "step": 13629 + }, + { + "epoch": 7.687535250987027, + "grad_norm": 1.2232017517089844, + "learning_rate": 1.1565143824027073e-05, + "loss": 0.7491, + "step": 13630 + }, + { + "epoch": 7.68809926677947, + "grad_norm": 1.2269463539123535, + "learning_rate": 1.1562323745064862e-05, + "loss": 0.7469, + "step": 13631 + }, + { + "epoch": 7.688663282571912, + "grad_norm": 1.326890230178833, + "learning_rate": 1.1559503666102652e-05, + "loss": 0.8053, + "step": 13632 + }, + { + "epoch": 7.689227298364354, + "grad_norm": 0.9529855251312256, + "learning_rate": 1.1556683587140441e-05, + "loss": 0.7269, + "step": 13633 + }, + { + "epoch": 7.689791314156796, + "grad_norm": 1.1975009441375732, + "learning_rate": 1.155386350817823e-05, + "loss": 0.8477, + "step": 13634 + }, + { + "epoch": 7.690355329949239, + "grad_norm": 1.3155615329742432, + "learning_rate": 1.1551043429216019e-05, + "loss": 0.696, + "step": 13635 + }, + { + "epoch": 7.690919345741681, + "grad_norm": 1.389776587486267, + "learning_rate": 1.1548223350253808e-05, + "loss": 0.7016, + "step": 13636 + }, + { + "epoch": 7.691483361534123, + "grad_norm": 0.9327318072319031, + "learning_rate": 1.1545403271291596e-05, + "loss": 0.7617, + "step": 13637 + }, + { + "epoch": 7.692047377326565, + "grad_norm": 1.0190588235855103, + "learning_rate": 1.1542583192329385e-05, + "loss": 0.7565, + "step": 13638 + }, + { + "epoch": 7.692611393119007, + "grad_norm": 0.8675119280815125, + "learning_rate": 1.1539763113367174e-05, + "loss": 0.6758, + "step": 13639 + }, + { + "epoch": 7.69317540891145, + "grad_norm": 1.3731179237365723, + "learning_rate": 1.1536943034404964e-05, + "loss": 0.7403, + "step": 13640 + }, + { + "epoch": 7.6937394247038915, + "grad_norm": 1.087396264076233, + "learning_rate": 1.1534122955442753e-05, + "loss": 0.8127, + "step": 13641 + }, + { + "epoch": 7.694303440496334, + "grad_norm": 1.6038721799850464, + "learning_rate": 1.1531302876480542e-05, + "loss": 0.7471, + "step": 13642 + }, + { + "epoch": 7.694867456288776, + "grad_norm": 0.800441324710846, + "learning_rate": 1.152848279751833e-05, + "loss": 0.61, + "step": 13643 + }, + { + "epoch": 7.695431472081218, + "grad_norm": 1.0627919435501099, + "learning_rate": 1.152566271855612e-05, + "loss": 0.7902, + "step": 13644 + }, + { + "epoch": 7.6959954878736605, + "grad_norm": 1.4907912015914917, + "learning_rate": 1.1522842639593908e-05, + "loss": 0.8818, + "step": 13645 + }, + { + "epoch": 7.696559503666102, + "grad_norm": 1.404374599456787, + "learning_rate": 1.1520022560631697e-05, + "loss": 0.7237, + "step": 13646 + }, + { + "epoch": 7.697123519458545, + "grad_norm": 0.9720190763473511, + "learning_rate": 1.1517202481669486e-05, + "loss": 0.7006, + "step": 13647 + }, + { + "epoch": 7.697687535250987, + "grad_norm": 1.4084770679473877, + "learning_rate": 1.1514382402707276e-05, + "loss": 0.7553, + "step": 13648 + }, + { + "epoch": 7.6982515510434295, + "grad_norm": 1.6010676622390747, + "learning_rate": 1.1511562323745065e-05, + "loss": 0.67, + "step": 13649 + }, + { + "epoch": 7.698815566835871, + "grad_norm": 1.2554099559783936, + "learning_rate": 1.1508742244782854e-05, + "loss": 0.7073, + "step": 13650 + }, + { + "epoch": 7.699379582628314, + "grad_norm": 1.1161181926727295, + "learning_rate": 1.1505922165820643e-05, + "loss": 0.7254, + "step": 13651 + }, + { + "epoch": 7.699943598420756, + "grad_norm": 1.1895065307617188, + "learning_rate": 1.1503102086858433e-05, + "loss": 0.7036, + "step": 13652 + }, + { + "epoch": 7.700507614213198, + "grad_norm": 1.721167802810669, + "learning_rate": 1.1500282007896222e-05, + "loss": 0.8891, + "step": 13653 + }, + { + "epoch": 7.70107163000564, + "grad_norm": 0.8683729767799377, + "learning_rate": 1.1497461928934011e-05, + "loss": 0.6782, + "step": 13654 + }, + { + "epoch": 7.701635645798082, + "grad_norm": 1.088220238685608, + "learning_rate": 1.14946418499718e-05, + "loss": 0.7241, + "step": 13655 + }, + { + "epoch": 7.702199661590525, + "grad_norm": 1.0816949605941772, + "learning_rate": 1.1491821771009589e-05, + "loss": 0.6722, + "step": 13656 + }, + { + "epoch": 7.702763677382967, + "grad_norm": 1.3939549922943115, + "learning_rate": 1.1489001692047379e-05, + "loss": 0.7217, + "step": 13657 + }, + { + "epoch": 7.703327693175408, + "grad_norm": 0.9457477927207947, + "learning_rate": 1.1486181613085168e-05, + "loss": 0.6712, + "step": 13658 + }, + { + "epoch": 7.703891708967851, + "grad_norm": 1.2048993110656738, + "learning_rate": 1.1483361534122957e-05, + "loss": 0.822, + "step": 13659 + }, + { + "epoch": 7.704455724760293, + "grad_norm": 1.3587332963943481, + "learning_rate": 1.1480541455160745e-05, + "loss": 0.7606, + "step": 13660 + }, + { + "epoch": 7.705019740552736, + "grad_norm": 0.8695059418678284, + "learning_rate": 1.1477721376198534e-05, + "loss": 0.6883, + "step": 13661 + }, + { + "epoch": 7.7055837563451774, + "grad_norm": 1.4788058996200562, + "learning_rate": 1.1474901297236323e-05, + "loss": 0.7795, + "step": 13662 + }, + { + "epoch": 7.70614777213762, + "grad_norm": 1.1986029148101807, + "learning_rate": 1.1472081218274112e-05, + "loss": 0.6848, + "step": 13663 + }, + { + "epoch": 7.706711787930062, + "grad_norm": 0.9241847395896912, + "learning_rate": 1.1469261139311902e-05, + "loss": 0.7319, + "step": 13664 + }, + { + "epoch": 7.707275803722505, + "grad_norm": 1.4919512271881104, + "learning_rate": 1.1466441060349691e-05, + "loss": 0.7434, + "step": 13665 + }, + { + "epoch": 7.7078398195149465, + "grad_norm": 1.2548680305480957, + "learning_rate": 1.146362098138748e-05, + "loss": 0.7135, + "step": 13666 + }, + { + "epoch": 7.708403835307388, + "grad_norm": 1.3003932237625122, + "learning_rate": 1.1460800902425269e-05, + "loss": 0.7962, + "step": 13667 + }, + { + "epoch": 7.708967851099831, + "grad_norm": 1.037803053855896, + "learning_rate": 1.1457980823463057e-05, + "loss": 0.6672, + "step": 13668 + }, + { + "epoch": 7.709531866892273, + "grad_norm": 1.052949070930481, + "learning_rate": 1.1455160744500846e-05, + "loss": 0.7184, + "step": 13669 + }, + { + "epoch": 7.7100958826847155, + "grad_norm": 1.1968706846237183, + "learning_rate": 1.1452340665538635e-05, + "loss": 0.833, + "step": 13670 + }, + { + "epoch": 7.710659898477157, + "grad_norm": 0.9723986387252808, + "learning_rate": 1.1449520586576424e-05, + "loss": 0.6551, + "step": 13671 + }, + { + "epoch": 7.711223914269599, + "grad_norm": 1.0350710153579712, + "learning_rate": 1.1446700507614214e-05, + "loss": 0.7445, + "step": 13672 + }, + { + "epoch": 7.711787930062042, + "grad_norm": 1.0113410949707031, + "learning_rate": 1.1443880428652003e-05, + "loss": 0.737, + "step": 13673 + }, + { + "epoch": 7.712351945854484, + "grad_norm": 1.356825351715088, + "learning_rate": 1.1441060349689792e-05, + "loss": 0.8652, + "step": 13674 + }, + { + "epoch": 7.712915961646926, + "grad_norm": 1.2823442220687866, + "learning_rate": 1.143824027072758e-05, + "loss": 0.7268, + "step": 13675 + }, + { + "epoch": 7.713479977439368, + "grad_norm": 1.446251392364502, + "learning_rate": 1.143542019176537e-05, + "loss": 0.8038, + "step": 13676 + }, + { + "epoch": 7.714043993231811, + "grad_norm": 1.2808403968811035, + "learning_rate": 1.1432600112803158e-05, + "loss": 0.7614, + "step": 13677 + }, + { + "epoch": 7.714608009024253, + "grad_norm": 1.27597975730896, + "learning_rate": 1.1429780033840947e-05, + "loss": 0.8087, + "step": 13678 + }, + { + "epoch": 7.715172024816695, + "grad_norm": 1.1386356353759766, + "learning_rate": 1.1426959954878736e-05, + "loss": 0.692, + "step": 13679 + }, + { + "epoch": 7.715736040609137, + "grad_norm": 1.3177019357681274, + "learning_rate": 1.1424139875916526e-05, + "loss": 0.6673, + "step": 13680 + }, + { + "epoch": 7.716300056401579, + "grad_norm": 0.7399276494979858, + "learning_rate": 1.1421319796954315e-05, + "loss": 0.6079, + "step": 13681 + }, + { + "epoch": 7.716864072194022, + "grad_norm": 1.4578726291656494, + "learning_rate": 1.1418499717992104e-05, + "loss": 0.6726, + "step": 13682 + }, + { + "epoch": 7.717428087986463, + "grad_norm": 1.195835828781128, + "learning_rate": 1.1415679639029893e-05, + "loss": 0.7431, + "step": 13683 + }, + { + "epoch": 7.717992103778906, + "grad_norm": 1.0202293395996094, + "learning_rate": 1.1412859560067682e-05, + "loss": 0.7423, + "step": 13684 + }, + { + "epoch": 7.718556119571348, + "grad_norm": 1.1577829122543335, + "learning_rate": 1.141003948110547e-05, + "loss": 0.7658, + "step": 13685 + }, + { + "epoch": 7.71912013536379, + "grad_norm": 1.298365592956543, + "learning_rate": 1.140721940214326e-05, + "loss": 0.767, + "step": 13686 + }, + { + "epoch": 7.719684151156232, + "grad_norm": 1.167816162109375, + "learning_rate": 1.140439932318105e-05, + "loss": 0.7438, + "step": 13687 + }, + { + "epoch": 7.720248166948674, + "grad_norm": 1.4943400621414185, + "learning_rate": 1.1401579244218838e-05, + "loss": 0.7102, + "step": 13688 + }, + { + "epoch": 7.720812182741117, + "grad_norm": 0.9587531685829163, + "learning_rate": 1.1398759165256629e-05, + "loss": 0.7178, + "step": 13689 + }, + { + "epoch": 7.721376198533559, + "grad_norm": 0.8563487529754639, + "learning_rate": 1.1395939086294418e-05, + "loss": 0.6618, + "step": 13690 + }, + { + "epoch": 7.721940214326001, + "grad_norm": 1.6641619205474854, + "learning_rate": 1.1393119007332207e-05, + "loss": 0.7562, + "step": 13691 + }, + { + "epoch": 7.722504230118443, + "grad_norm": 1.1465673446655273, + "learning_rate": 1.1390298928369995e-05, + "loss": 0.7585, + "step": 13692 + }, + { + "epoch": 7.723068245910886, + "grad_norm": 1.2209807634353638, + "learning_rate": 1.1387478849407784e-05, + "loss": 0.7146, + "step": 13693 + }, + { + "epoch": 7.723632261703328, + "grad_norm": 0.84864342212677, + "learning_rate": 1.1384658770445573e-05, + "loss": 0.6449, + "step": 13694 + }, + { + "epoch": 7.7241962774957695, + "grad_norm": 1.2370083332061768, + "learning_rate": 1.1381838691483362e-05, + "loss": 0.8149, + "step": 13695 + }, + { + "epoch": 7.724760293288212, + "grad_norm": 0.9676856994628906, + "learning_rate": 1.1379018612521152e-05, + "loss": 0.6066, + "step": 13696 + }, + { + "epoch": 7.725324309080654, + "grad_norm": 1.1466825008392334, + "learning_rate": 1.1376198533558941e-05, + "loss": 0.7094, + "step": 13697 + }, + { + "epoch": 7.725888324873097, + "grad_norm": 1.4484678506851196, + "learning_rate": 1.137337845459673e-05, + "loss": 0.7326, + "step": 13698 + }, + { + "epoch": 7.7264523406655385, + "grad_norm": 1.335866928100586, + "learning_rate": 1.1370558375634519e-05, + "loss": 0.7112, + "step": 13699 + }, + { + "epoch": 7.72701635645798, + "grad_norm": 1.2345526218414307, + "learning_rate": 1.1367738296672307e-05, + "loss": 0.7258, + "step": 13700 + }, + { + "epoch": 7.727580372250423, + "grad_norm": 1.3477940559387207, + "learning_rate": 1.1364918217710096e-05, + "loss": 0.7627, + "step": 13701 + }, + { + "epoch": 7.728144388042865, + "grad_norm": 1.227122187614441, + "learning_rate": 1.1362098138747885e-05, + "loss": 0.8102, + "step": 13702 + }, + { + "epoch": 7.7287084038353075, + "grad_norm": 1.1207267045974731, + "learning_rate": 1.1359278059785674e-05, + "loss": 0.747, + "step": 13703 + }, + { + "epoch": 7.729272419627749, + "grad_norm": 1.255981206893921, + "learning_rate": 1.1356457980823464e-05, + "loss": 0.834, + "step": 13704 + }, + { + "epoch": 7.729836435420192, + "grad_norm": 1.1868687868118286, + "learning_rate": 1.1353637901861253e-05, + "loss": 0.704, + "step": 13705 + }, + { + "epoch": 7.730400451212634, + "grad_norm": 1.0680028200149536, + "learning_rate": 1.1350817822899042e-05, + "loss": 0.6664, + "step": 13706 + }, + { + "epoch": 7.730964467005077, + "grad_norm": 1.2184717655181885, + "learning_rate": 1.134799774393683e-05, + "loss": 0.8522, + "step": 13707 + }, + { + "epoch": 7.731528482797518, + "grad_norm": 1.2977581024169922, + "learning_rate": 1.134517766497462e-05, + "loss": 0.7744, + "step": 13708 + }, + { + "epoch": 7.73209249858996, + "grad_norm": 1.0294140577316284, + "learning_rate": 1.1342357586012408e-05, + "loss": 0.6842, + "step": 13709 + }, + { + "epoch": 7.732656514382403, + "grad_norm": 1.2417441606521606, + "learning_rate": 1.1339537507050197e-05, + "loss": 0.7283, + "step": 13710 + }, + { + "epoch": 7.733220530174845, + "grad_norm": 0.9712237119674683, + "learning_rate": 1.1336717428087988e-05, + "loss": 0.6761, + "step": 13711 + }, + { + "epoch": 7.733784545967287, + "grad_norm": 1.0382397174835205, + "learning_rate": 1.1333897349125776e-05, + "loss": 0.775, + "step": 13712 + }, + { + "epoch": 7.734348561759729, + "grad_norm": 1.3750702142715454, + "learning_rate": 1.1331077270163565e-05, + "loss": 0.7306, + "step": 13713 + }, + { + "epoch": 7.734912577552171, + "grad_norm": 1.1490201950073242, + "learning_rate": 1.1328257191201354e-05, + "loss": 0.7124, + "step": 13714 + }, + { + "epoch": 7.735476593344614, + "grad_norm": 1.0556720495224, + "learning_rate": 1.1325437112239143e-05, + "loss": 0.6531, + "step": 13715 + }, + { + "epoch": 7.7360406091370555, + "grad_norm": 1.0388433933258057, + "learning_rate": 1.1322617033276932e-05, + "loss": 0.7355, + "step": 13716 + }, + { + "epoch": 7.736604624929498, + "grad_norm": 1.029855489730835, + "learning_rate": 1.131979695431472e-05, + "loss": 0.7488, + "step": 13717 + }, + { + "epoch": 7.73716864072194, + "grad_norm": 1.1219239234924316, + "learning_rate": 1.131697687535251e-05, + "loss": 0.7258, + "step": 13718 + }, + { + "epoch": 7.737732656514383, + "grad_norm": 1.4336671829223633, + "learning_rate": 1.13141567963903e-05, + "loss": 0.7604, + "step": 13719 + }, + { + "epoch": 7.7382966723068245, + "grad_norm": 1.3367385864257812, + "learning_rate": 1.1311336717428088e-05, + "loss": 0.6778, + "step": 13720 + }, + { + "epoch": 7.738860688099267, + "grad_norm": 1.1257997751235962, + "learning_rate": 1.1308516638465877e-05, + "loss": 0.8248, + "step": 13721 + }, + { + "epoch": 7.739424703891709, + "grad_norm": 1.210922122001648, + "learning_rate": 1.1305696559503666e-05, + "loss": 0.7191, + "step": 13722 + }, + { + "epoch": 7.739988719684151, + "grad_norm": 0.8634220361709595, + "learning_rate": 1.1302876480541457e-05, + "loss": 0.7012, + "step": 13723 + }, + { + "epoch": 7.7405527354765935, + "grad_norm": 1.0493189096450806, + "learning_rate": 1.1300056401579245e-05, + "loss": 0.7941, + "step": 13724 + }, + { + "epoch": 7.741116751269035, + "grad_norm": 0.975142776966095, + "learning_rate": 1.1297236322617034e-05, + "loss": 0.6508, + "step": 13725 + }, + { + "epoch": 7.741680767061478, + "grad_norm": 1.0203263759613037, + "learning_rate": 1.1294416243654823e-05, + "loss": 0.6324, + "step": 13726 + }, + { + "epoch": 7.74224478285392, + "grad_norm": 1.055882453918457, + "learning_rate": 1.1291596164692612e-05, + "loss": 0.7718, + "step": 13727 + }, + { + "epoch": 7.742808798646362, + "grad_norm": 1.098766565322876, + "learning_rate": 1.1288776085730402e-05, + "loss": 0.8046, + "step": 13728 + }, + { + "epoch": 7.743372814438804, + "grad_norm": 1.080084204673767, + "learning_rate": 1.1285956006768191e-05, + "loss": 0.7409, + "step": 13729 + }, + { + "epoch": 7.743936830231246, + "grad_norm": 0.9604128003120422, + "learning_rate": 1.128313592780598e-05, + "loss": 0.6605, + "step": 13730 + }, + { + "epoch": 7.744500846023689, + "grad_norm": 1.08537757396698, + "learning_rate": 1.1280315848843769e-05, + "loss": 0.7373, + "step": 13731 + }, + { + "epoch": 7.745064861816131, + "grad_norm": 1.175888180732727, + "learning_rate": 1.1277495769881557e-05, + "loss": 0.7561, + "step": 13732 + }, + { + "epoch": 7.745628877608573, + "grad_norm": 1.4289295673370361, + "learning_rate": 1.1274675690919346e-05, + "loss": 0.7694, + "step": 13733 + }, + { + "epoch": 7.746192893401015, + "grad_norm": 1.0456267595291138, + "learning_rate": 1.1271855611957135e-05, + "loss": 0.7011, + "step": 13734 + }, + { + "epoch": 7.746756909193458, + "grad_norm": 0.9430054426193237, + "learning_rate": 1.1269035532994924e-05, + "loss": 0.7694, + "step": 13735 + }, + { + "epoch": 7.7473209249859, + "grad_norm": 1.1533071994781494, + "learning_rate": 1.1266215454032714e-05, + "loss": 0.6731, + "step": 13736 + }, + { + "epoch": 7.7478849407783414, + "grad_norm": 0.8938435912132263, + "learning_rate": 1.1263395375070503e-05, + "loss": 0.692, + "step": 13737 + }, + { + "epoch": 7.748448956570784, + "grad_norm": 1.12239408493042, + "learning_rate": 1.1260575296108292e-05, + "loss": 0.7205, + "step": 13738 + }, + { + "epoch": 7.749012972363226, + "grad_norm": 1.027445912361145, + "learning_rate": 1.125775521714608e-05, + "loss": 0.8203, + "step": 13739 + }, + { + "epoch": 7.749576988155669, + "grad_norm": 1.0590455532073975, + "learning_rate": 1.125493513818387e-05, + "loss": 0.7501, + "step": 13740 + }, + { + "epoch": 7.7501410039481105, + "grad_norm": 1.3369708061218262, + "learning_rate": 1.1252115059221658e-05, + "loss": 0.6843, + "step": 13741 + }, + { + "epoch": 7.750705019740552, + "grad_norm": 1.1416579484939575, + "learning_rate": 1.1249294980259447e-05, + "loss": 0.6736, + "step": 13742 + }, + { + "epoch": 7.751269035532995, + "grad_norm": 1.2358258962631226, + "learning_rate": 1.1246474901297238e-05, + "loss": 0.8151, + "step": 13743 + }, + { + "epoch": 7.751833051325437, + "grad_norm": 1.5620819330215454, + "learning_rate": 1.1243654822335026e-05, + "loss": 0.9162, + "step": 13744 + }, + { + "epoch": 7.7523970671178795, + "grad_norm": 1.1173371076583862, + "learning_rate": 1.1240834743372815e-05, + "loss": 0.6919, + "step": 13745 + }, + { + "epoch": 7.752961082910321, + "grad_norm": 1.001317024230957, + "learning_rate": 1.1238014664410604e-05, + "loss": 0.6995, + "step": 13746 + }, + { + "epoch": 7.753525098702764, + "grad_norm": 1.13509202003479, + "learning_rate": 1.1235194585448393e-05, + "loss": 0.7488, + "step": 13747 + }, + { + "epoch": 7.754089114495206, + "grad_norm": 1.2174737453460693, + "learning_rate": 1.1232374506486182e-05, + "loss": 0.7625, + "step": 13748 + }, + { + "epoch": 7.7546531302876485, + "grad_norm": 1.102100133895874, + "learning_rate": 1.122955442752397e-05, + "loss": 0.6991, + "step": 13749 + }, + { + "epoch": 7.75521714608009, + "grad_norm": 1.2135673761367798, + "learning_rate": 1.1226734348561759e-05, + "loss": 0.7773, + "step": 13750 + }, + { + "epoch": 7.755781161872532, + "grad_norm": 0.997021496295929, + "learning_rate": 1.122391426959955e-05, + "loss": 0.782, + "step": 13751 + }, + { + "epoch": 7.756345177664975, + "grad_norm": 1.1001285314559937, + "learning_rate": 1.1221094190637338e-05, + "loss": 0.7341, + "step": 13752 + }, + { + "epoch": 7.756909193457417, + "grad_norm": 0.8940239548683167, + "learning_rate": 1.1218274111675127e-05, + "loss": 0.7142, + "step": 13753 + }, + { + "epoch": 7.757473209249859, + "grad_norm": 1.4758250713348389, + "learning_rate": 1.1215454032712916e-05, + "loss": 0.7062, + "step": 13754 + }, + { + "epoch": 7.758037225042301, + "grad_norm": 1.0509289503097534, + "learning_rate": 1.1212633953750705e-05, + "loss": 0.7172, + "step": 13755 + }, + { + "epoch": 7.758601240834743, + "grad_norm": 1.05046546459198, + "learning_rate": 1.1209813874788494e-05, + "loss": 0.7231, + "step": 13756 + }, + { + "epoch": 7.759165256627186, + "grad_norm": 0.9685630202293396, + "learning_rate": 1.1206993795826282e-05, + "loss": 0.7348, + "step": 13757 + }, + { + "epoch": 7.759729272419627, + "grad_norm": 1.1346080303192139, + "learning_rate": 1.1204173716864073e-05, + "loss": 0.7857, + "step": 13758 + }, + { + "epoch": 7.76029328821207, + "grad_norm": 1.0548925399780273, + "learning_rate": 1.1201353637901862e-05, + "loss": 0.709, + "step": 13759 + }, + { + "epoch": 7.760857304004512, + "grad_norm": 0.845125138759613, + "learning_rate": 1.1198533558939652e-05, + "loss": 0.6792, + "step": 13760 + }, + { + "epoch": 7.761421319796955, + "grad_norm": 1.0526789426803589, + "learning_rate": 1.1195713479977441e-05, + "loss": 0.6943, + "step": 13761 + }, + { + "epoch": 7.761985335589396, + "grad_norm": 0.9494205713272095, + "learning_rate": 1.119289340101523e-05, + "loss": 0.7222, + "step": 13762 + }, + { + "epoch": 7.762549351381839, + "grad_norm": 1.2508277893066406, + "learning_rate": 1.1190073322053019e-05, + "loss": 0.7044, + "step": 13763 + }, + { + "epoch": 7.763113367174281, + "grad_norm": 0.9483262300491333, + "learning_rate": 1.1187253243090807e-05, + "loss": 0.7315, + "step": 13764 + }, + { + "epoch": 7.763677382966723, + "grad_norm": 0.9242522120475769, + "learning_rate": 1.1184433164128596e-05, + "loss": 0.6843, + "step": 13765 + }, + { + "epoch": 7.764241398759165, + "grad_norm": 0.9847538471221924, + "learning_rate": 1.1181613085166385e-05, + "loss": 0.6769, + "step": 13766 + }, + { + "epoch": 7.764805414551607, + "grad_norm": 0.9546516537666321, + "learning_rate": 1.1178793006204174e-05, + "loss": 0.6432, + "step": 13767 + }, + { + "epoch": 7.76536943034405, + "grad_norm": 1.380154013633728, + "learning_rate": 1.1175972927241964e-05, + "loss": 0.7547, + "step": 13768 + }, + { + "epoch": 7.765933446136492, + "grad_norm": 1.0653573274612427, + "learning_rate": 1.1173152848279753e-05, + "loss": 0.7003, + "step": 13769 + }, + { + "epoch": 7.7664974619289335, + "grad_norm": 1.148391842842102, + "learning_rate": 1.1170332769317542e-05, + "loss": 0.6857, + "step": 13770 + }, + { + "epoch": 7.767061477721376, + "grad_norm": 1.3139787912368774, + "learning_rate": 1.116751269035533e-05, + "loss": 0.7476, + "step": 13771 + }, + { + "epoch": 7.767625493513818, + "grad_norm": 1.2444113492965698, + "learning_rate": 1.116469261139312e-05, + "loss": 0.672, + "step": 13772 + }, + { + "epoch": 7.768189509306261, + "grad_norm": 1.3217029571533203, + "learning_rate": 1.1161872532430908e-05, + "loss": 0.7769, + "step": 13773 + }, + { + "epoch": 7.7687535250987025, + "grad_norm": 1.163419246673584, + "learning_rate": 1.1159052453468697e-05, + "loss": 0.7729, + "step": 13774 + }, + { + "epoch": 7.769317540891145, + "grad_norm": 1.9848370552062988, + "learning_rate": 1.1156232374506488e-05, + "loss": 0.8168, + "step": 13775 + }, + { + "epoch": 7.769881556683587, + "grad_norm": 1.1967852115631104, + "learning_rate": 1.1153412295544276e-05, + "loss": 0.6957, + "step": 13776 + }, + { + "epoch": 7.77044557247603, + "grad_norm": 1.3663418292999268, + "learning_rate": 1.1150592216582065e-05, + "loss": 0.776, + "step": 13777 + }, + { + "epoch": 7.7710095882684715, + "grad_norm": 1.003600001335144, + "learning_rate": 1.1147772137619854e-05, + "loss": 0.7301, + "step": 13778 + }, + { + "epoch": 7.771573604060913, + "grad_norm": 1.5623654127120972, + "learning_rate": 1.1144952058657643e-05, + "loss": 0.773, + "step": 13779 + }, + { + "epoch": 7.772137619853356, + "grad_norm": 1.1127992868423462, + "learning_rate": 1.1142131979695431e-05, + "loss": 0.6774, + "step": 13780 + }, + { + "epoch": 7.772701635645798, + "grad_norm": 1.2208739519119263, + "learning_rate": 1.113931190073322e-05, + "loss": 0.6601, + "step": 13781 + }, + { + "epoch": 7.7732656514382406, + "grad_norm": 1.1226959228515625, + "learning_rate": 1.1136491821771009e-05, + "loss": 0.7735, + "step": 13782 + }, + { + "epoch": 7.773829667230682, + "grad_norm": 0.7755024433135986, + "learning_rate": 1.11336717428088e-05, + "loss": 0.6357, + "step": 13783 + }, + { + "epoch": 7.774393683023124, + "grad_norm": 1.2179045677185059, + "learning_rate": 1.1130851663846588e-05, + "loss": 0.7542, + "step": 13784 + }, + { + "epoch": 7.774957698815567, + "grad_norm": 1.1029011011123657, + "learning_rate": 1.1128031584884377e-05, + "loss": 0.7504, + "step": 13785 + }, + { + "epoch": 7.775521714608009, + "grad_norm": 1.2559040784835815, + "learning_rate": 1.1125211505922166e-05, + "loss": 0.6891, + "step": 13786 + }, + { + "epoch": 7.776085730400451, + "grad_norm": 1.152976393699646, + "learning_rate": 1.1122391426959955e-05, + "loss": 0.6699, + "step": 13787 + }, + { + "epoch": 7.776649746192893, + "grad_norm": 1.0729016065597534, + "learning_rate": 1.1119571347997744e-05, + "loss": 0.6478, + "step": 13788 + }, + { + "epoch": 7.777213761985336, + "grad_norm": 1.1375652551651, + "learning_rate": 1.1116751269035532e-05, + "loss": 0.6198, + "step": 13789 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 1.1095036268234253, + "learning_rate": 1.1113931190073323e-05, + "loss": 0.7998, + "step": 13790 + }, + { + "epoch": 7.77834179357022, + "grad_norm": 1.4686942100524902, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.819, + "step": 13791 + }, + { + "epoch": 7.778905809362662, + "grad_norm": 1.1136023998260498, + "learning_rate": 1.11082910321489e-05, + "loss": 0.7681, + "step": 13792 + }, + { + "epoch": 7.779469825155104, + "grad_norm": 1.1285618543624878, + "learning_rate": 1.1105470953186691e-05, + "loss": 0.7055, + "step": 13793 + }, + { + "epoch": 7.780033840947547, + "grad_norm": 1.2199987173080444, + "learning_rate": 1.110265087422448e-05, + "loss": 0.6812, + "step": 13794 + }, + { + "epoch": 7.7805978567399885, + "grad_norm": 1.0073379278182983, + "learning_rate": 1.1099830795262269e-05, + "loss": 0.6838, + "step": 13795 + }, + { + "epoch": 7.781161872532431, + "grad_norm": 1.4355995655059814, + "learning_rate": 1.1097010716300057e-05, + "loss": 0.8069, + "step": 13796 + }, + { + "epoch": 7.781725888324873, + "grad_norm": 1.0951091051101685, + "learning_rate": 1.1094190637337846e-05, + "loss": 0.6615, + "step": 13797 + }, + { + "epoch": 7.782289904117315, + "grad_norm": 1.2469877004623413, + "learning_rate": 1.1091370558375635e-05, + "loss": 0.7811, + "step": 13798 + }, + { + "epoch": 7.7828539199097575, + "grad_norm": 1.153822898864746, + "learning_rate": 1.1088550479413425e-05, + "loss": 0.7836, + "step": 13799 + }, + { + "epoch": 7.783417935702199, + "grad_norm": 1.0053354501724243, + "learning_rate": 1.1085730400451214e-05, + "loss": 0.6826, + "step": 13800 + }, + { + "epoch": 7.783981951494642, + "grad_norm": 1.1149680614471436, + "learning_rate": 1.1082910321489003e-05, + "loss": 0.7452, + "step": 13801 + }, + { + "epoch": 7.784545967287084, + "grad_norm": 0.961659848690033, + "learning_rate": 1.1080090242526792e-05, + "loss": 0.7047, + "step": 13802 + }, + { + "epoch": 7.7851099830795265, + "grad_norm": 1.3035639524459839, + "learning_rate": 1.107727016356458e-05, + "loss": 0.6993, + "step": 13803 + }, + { + "epoch": 7.785673998871968, + "grad_norm": 1.0821409225463867, + "learning_rate": 1.107445008460237e-05, + "loss": 0.7767, + "step": 13804 + }, + { + "epoch": 7.786238014664411, + "grad_norm": 1.0749868154525757, + "learning_rate": 1.1071630005640158e-05, + "loss": 0.7552, + "step": 13805 + }, + { + "epoch": 7.786802030456853, + "grad_norm": 0.8224740028381348, + "learning_rate": 1.1068809926677947e-05, + "loss": 0.6533, + "step": 13806 + }, + { + "epoch": 7.787366046249295, + "grad_norm": 0.8751980066299438, + "learning_rate": 1.1065989847715737e-05, + "loss": 0.6174, + "step": 13807 + }, + { + "epoch": 7.787930062041737, + "grad_norm": 0.9998223781585693, + "learning_rate": 1.1063169768753526e-05, + "loss": 0.6859, + "step": 13808 + }, + { + "epoch": 7.788494077834179, + "grad_norm": 0.8960963487625122, + "learning_rate": 1.1060349689791315e-05, + "loss": 0.6254, + "step": 13809 + }, + { + "epoch": 7.789058093626622, + "grad_norm": 1.0086578130722046, + "learning_rate": 1.1057529610829104e-05, + "loss": 0.712, + "step": 13810 + }, + { + "epoch": 7.789622109419064, + "grad_norm": 1.2699581384658813, + "learning_rate": 1.1054709531866893e-05, + "loss": 0.7394, + "step": 13811 + }, + { + "epoch": 7.790186125211505, + "grad_norm": 1.5767242908477783, + "learning_rate": 1.1051889452904681e-05, + "loss": 0.7181, + "step": 13812 + }, + { + "epoch": 7.790750141003948, + "grad_norm": 1.0659770965576172, + "learning_rate": 1.104906937394247e-05, + "loss": 0.7867, + "step": 13813 + }, + { + "epoch": 7.79131415679639, + "grad_norm": 1.0242949724197388, + "learning_rate": 1.1046249294980259e-05, + "loss": 0.7975, + "step": 13814 + }, + { + "epoch": 7.791878172588833, + "grad_norm": 1.2223668098449707, + "learning_rate": 1.104342921601805e-05, + "loss": 0.8222, + "step": 13815 + }, + { + "epoch": 7.7924421883812744, + "grad_norm": 1.1306970119476318, + "learning_rate": 1.1040609137055838e-05, + "loss": 0.749, + "step": 13816 + }, + { + "epoch": 7.793006204173717, + "grad_norm": 1.4561203718185425, + "learning_rate": 1.1037789058093627e-05, + "loss": 0.7689, + "step": 13817 + }, + { + "epoch": 7.793570219966159, + "grad_norm": 1.3102048635482788, + "learning_rate": 1.1034968979131416e-05, + "loss": 0.7271, + "step": 13818 + }, + { + "epoch": 7.794134235758602, + "grad_norm": 1.051486849784851, + "learning_rate": 1.1032148900169205e-05, + "loss": 0.7537, + "step": 13819 + }, + { + "epoch": 7.7946982515510435, + "grad_norm": 1.358124852180481, + "learning_rate": 1.1029328821206994e-05, + "loss": 0.727, + "step": 13820 + }, + { + "epoch": 7.795262267343485, + "grad_norm": 0.968558669090271, + "learning_rate": 1.1026508742244782e-05, + "loss": 0.7311, + "step": 13821 + }, + { + "epoch": 7.795826283135928, + "grad_norm": 1.173577070236206, + "learning_rate": 1.1023688663282573e-05, + "loss": 0.6832, + "step": 13822 + }, + { + "epoch": 7.79639029892837, + "grad_norm": 1.0668327808380127, + "learning_rate": 1.1020868584320362e-05, + "loss": 0.7155, + "step": 13823 + }, + { + "epoch": 7.7969543147208125, + "grad_norm": 1.078688383102417, + "learning_rate": 1.101804850535815e-05, + "loss": 0.6351, + "step": 13824 + }, + { + "epoch": 7.797518330513254, + "grad_norm": 1.949091911315918, + "learning_rate": 1.101522842639594e-05, + "loss": 0.8289, + "step": 13825 + }, + { + "epoch": 7.798082346305696, + "grad_norm": 1.130125880241394, + "learning_rate": 1.1012408347433728e-05, + "loss": 0.6889, + "step": 13826 + }, + { + "epoch": 7.798646362098139, + "grad_norm": 1.0160735845565796, + "learning_rate": 1.1009588268471517e-05, + "loss": 0.7322, + "step": 13827 + }, + { + "epoch": 7.799210377890581, + "grad_norm": 1.1319477558135986, + "learning_rate": 1.1006768189509306e-05, + "loss": 0.6434, + "step": 13828 + }, + { + "epoch": 7.799774393683023, + "grad_norm": 1.1569844484329224, + "learning_rate": 1.1003948110547096e-05, + "loss": 0.7525, + "step": 13829 + }, + { + "epoch": 7.800338409475465, + "grad_norm": 1.3609062433242798, + "learning_rate": 1.1001128031584885e-05, + "loss": 0.8391, + "step": 13830 + }, + { + "epoch": 7.800902425267908, + "grad_norm": 1.0813463926315308, + "learning_rate": 1.0998307952622675e-05, + "loss": 0.7514, + "step": 13831 + }, + { + "epoch": 7.80146644106035, + "grad_norm": 0.9800832867622375, + "learning_rate": 1.0995487873660464e-05, + "loss": 0.704, + "step": 13832 + }, + { + "epoch": 7.802030456852792, + "grad_norm": 1.2347944974899292, + "learning_rate": 1.0992667794698253e-05, + "loss": 0.6985, + "step": 13833 + }, + { + "epoch": 7.802594472645234, + "grad_norm": 1.308095932006836, + "learning_rate": 1.0989847715736042e-05, + "loss": 0.7411, + "step": 13834 + }, + { + "epoch": 7.803158488437676, + "grad_norm": 0.8335132598876953, + "learning_rate": 1.098702763677383e-05, + "loss": 0.6813, + "step": 13835 + }, + { + "epoch": 7.803722504230119, + "grad_norm": 0.9631423354148865, + "learning_rate": 1.098420755781162e-05, + "loss": 0.7348, + "step": 13836 + }, + { + "epoch": 7.80428652002256, + "grad_norm": 0.9026042819023132, + "learning_rate": 1.0981387478849408e-05, + "loss": 0.689, + "step": 13837 + }, + { + "epoch": 7.804850535815003, + "grad_norm": 1.3282356262207031, + "learning_rate": 1.0978567399887197e-05, + "loss": 0.7378, + "step": 13838 + }, + { + "epoch": 7.805414551607445, + "grad_norm": 1.0084091424942017, + "learning_rate": 1.0975747320924987e-05, + "loss": 0.6748, + "step": 13839 + }, + { + "epoch": 7.805978567399887, + "grad_norm": 1.0226174592971802, + "learning_rate": 1.0972927241962776e-05, + "loss": 0.7236, + "step": 13840 + }, + { + "epoch": 7.806542583192329, + "grad_norm": 0.9933704733848572, + "learning_rate": 1.0970107163000565e-05, + "loss": 0.6752, + "step": 13841 + }, + { + "epoch": 7.807106598984771, + "grad_norm": 1.041385293006897, + "learning_rate": 1.0967287084038354e-05, + "loss": 0.7657, + "step": 13842 + }, + { + "epoch": 7.807670614777214, + "grad_norm": 1.2454432249069214, + "learning_rate": 1.0964467005076143e-05, + "loss": 0.7402, + "step": 13843 + }, + { + "epoch": 7.808234630569656, + "grad_norm": 1.473794937133789, + "learning_rate": 1.0961646926113931e-05, + "loss": 0.8128, + "step": 13844 + }, + { + "epoch": 7.808798646362098, + "grad_norm": 1.0804296731948853, + "learning_rate": 1.095882684715172e-05, + "loss": 0.6766, + "step": 13845 + }, + { + "epoch": 7.80936266215454, + "grad_norm": 1.1748404502868652, + "learning_rate": 1.0956006768189509e-05, + "loss": 0.7691, + "step": 13846 + }, + { + "epoch": 7.809926677946983, + "grad_norm": 1.0160943269729614, + "learning_rate": 1.09531866892273e-05, + "loss": 0.7079, + "step": 13847 + }, + { + "epoch": 7.810490693739425, + "grad_norm": 0.8938132524490356, + "learning_rate": 1.0950366610265088e-05, + "loss": 0.7221, + "step": 13848 + }, + { + "epoch": 7.8110547095318665, + "grad_norm": 0.8222218155860901, + "learning_rate": 1.0947546531302877e-05, + "loss": 0.6656, + "step": 13849 + }, + { + "epoch": 7.811618725324309, + "grad_norm": 1.355974555015564, + "learning_rate": 1.0944726452340666e-05, + "loss": 0.7658, + "step": 13850 + }, + { + "epoch": 7.812182741116751, + "grad_norm": 1.2716115713119507, + "learning_rate": 1.0941906373378455e-05, + "loss": 0.7311, + "step": 13851 + }, + { + "epoch": 7.812746756909194, + "grad_norm": 1.0059475898742676, + "learning_rate": 1.0939086294416243e-05, + "loss": 0.6784, + "step": 13852 + }, + { + "epoch": 7.8133107727016355, + "grad_norm": 0.8437081575393677, + "learning_rate": 1.0936266215454032e-05, + "loss": 0.5611, + "step": 13853 + }, + { + "epoch": 7.813874788494077, + "grad_norm": 1.3278508186340332, + "learning_rate": 1.0933446136491823e-05, + "loss": 0.7791, + "step": 13854 + }, + { + "epoch": 7.81443880428652, + "grad_norm": 1.0381015539169312, + "learning_rate": 1.0930626057529612e-05, + "loss": 0.7144, + "step": 13855 + }, + { + "epoch": 7.815002820078962, + "grad_norm": 1.0555355548858643, + "learning_rate": 1.09278059785674e-05, + "loss": 0.7316, + "step": 13856 + }, + { + "epoch": 7.8155668358714045, + "grad_norm": 1.3690413236618042, + "learning_rate": 1.092498589960519e-05, + "loss": 0.814, + "step": 13857 + }, + { + "epoch": 7.816130851663846, + "grad_norm": 1.1403669118881226, + "learning_rate": 1.0922165820642978e-05, + "loss": 0.8011, + "step": 13858 + }, + { + "epoch": 7.816694867456289, + "grad_norm": 1.3419828414916992, + "learning_rate": 1.0919345741680767e-05, + "loss": 0.7912, + "step": 13859 + }, + { + "epoch": 7.817258883248731, + "grad_norm": 1.0250413417816162, + "learning_rate": 1.0916525662718556e-05, + "loss": 0.6345, + "step": 13860 + }, + { + "epoch": 7.817822899041174, + "grad_norm": 1.022657036781311, + "learning_rate": 1.0913705583756344e-05, + "loss": 0.6743, + "step": 13861 + }, + { + "epoch": 7.818386914833615, + "grad_norm": 1.1575593948364258, + "learning_rate": 1.0910885504794135e-05, + "loss": 0.82, + "step": 13862 + }, + { + "epoch": 7.818950930626057, + "grad_norm": 1.0959523916244507, + "learning_rate": 1.0908065425831924e-05, + "loss": 0.7787, + "step": 13863 + }, + { + "epoch": 7.8195149464185, + "grad_norm": 0.9445610046386719, + "learning_rate": 1.0905245346869714e-05, + "loss": 0.6965, + "step": 13864 + }, + { + "epoch": 7.820078962210942, + "grad_norm": 1.1358002424240112, + "learning_rate": 1.0902425267907503e-05, + "loss": 0.7426, + "step": 13865 + }, + { + "epoch": 7.820642978003384, + "grad_norm": 1.3370434045791626, + "learning_rate": 1.0899605188945292e-05, + "loss": 0.7643, + "step": 13866 + }, + { + "epoch": 7.821206993795826, + "grad_norm": 1.3332453966140747, + "learning_rate": 1.089678510998308e-05, + "loss": 0.6887, + "step": 13867 + }, + { + "epoch": 7.821771009588268, + "grad_norm": 0.9707873463630676, + "learning_rate": 1.089396503102087e-05, + "loss": 0.6988, + "step": 13868 + }, + { + "epoch": 7.822335025380711, + "grad_norm": 1.2696434259414673, + "learning_rate": 1.0891144952058658e-05, + "loss": 0.7126, + "step": 13869 + }, + { + "epoch": 7.8228990411731525, + "grad_norm": 1.0514171123504639, + "learning_rate": 1.0888324873096447e-05, + "loss": 0.8013, + "step": 13870 + }, + { + "epoch": 7.823463056965595, + "grad_norm": 1.0876315832138062, + "learning_rate": 1.0885504794134237e-05, + "loss": 0.7434, + "step": 13871 + }, + { + "epoch": 7.824027072758037, + "grad_norm": 1.3885936737060547, + "learning_rate": 1.0882684715172026e-05, + "loss": 0.6716, + "step": 13872 + }, + { + "epoch": 7.82459108855048, + "grad_norm": 1.1148165464401245, + "learning_rate": 1.0879864636209815e-05, + "loss": 0.7186, + "step": 13873 + }, + { + "epoch": 7.8251551043429215, + "grad_norm": 1.0009440183639526, + "learning_rate": 1.0877044557247604e-05, + "loss": 0.7666, + "step": 13874 + }, + { + "epoch": 7.825719120135364, + "grad_norm": 0.8401767611503601, + "learning_rate": 1.0874224478285393e-05, + "loss": 0.6054, + "step": 13875 + }, + { + "epoch": 7.826283135927806, + "grad_norm": 1.1880251169204712, + "learning_rate": 1.0871404399323181e-05, + "loss": 0.7151, + "step": 13876 + }, + { + "epoch": 7.826847151720248, + "grad_norm": 1.3632583618164062, + "learning_rate": 1.086858432036097e-05, + "loss": 0.7039, + "step": 13877 + }, + { + "epoch": 7.8274111675126905, + "grad_norm": 0.8907787203788757, + "learning_rate": 1.086576424139876e-05, + "loss": 0.6764, + "step": 13878 + }, + { + "epoch": 7.827975183305132, + "grad_norm": 1.0090677738189697, + "learning_rate": 1.086294416243655e-05, + "loss": 0.8434, + "step": 13879 + }, + { + "epoch": 7.828539199097575, + "grad_norm": 1.0712978839874268, + "learning_rate": 1.0860124083474338e-05, + "loss": 0.7087, + "step": 13880 + }, + { + "epoch": 7.829103214890017, + "grad_norm": 1.037053108215332, + "learning_rate": 1.0857304004512127e-05, + "loss": 0.6967, + "step": 13881 + }, + { + "epoch": 7.829667230682459, + "grad_norm": 1.0835405588150024, + "learning_rate": 1.0854483925549916e-05, + "loss": 0.7353, + "step": 13882 + }, + { + "epoch": 7.830231246474901, + "grad_norm": 1.34237802028656, + "learning_rate": 1.0851663846587705e-05, + "loss": 0.929, + "step": 13883 + }, + { + "epoch": 7.830795262267343, + "grad_norm": 1.161277174949646, + "learning_rate": 1.0848843767625493e-05, + "loss": 0.7182, + "step": 13884 + }, + { + "epoch": 7.831359278059786, + "grad_norm": 1.1018379926681519, + "learning_rate": 1.0846023688663282e-05, + "loss": 0.7808, + "step": 13885 + }, + { + "epoch": 7.831923293852228, + "grad_norm": 1.0083327293395996, + "learning_rate": 1.0843203609701073e-05, + "loss": 0.7908, + "step": 13886 + }, + { + "epoch": 7.83248730964467, + "grad_norm": 1.6534597873687744, + "learning_rate": 1.0840383530738862e-05, + "loss": 0.6767, + "step": 13887 + }, + { + "epoch": 7.833051325437112, + "grad_norm": 1.1063438653945923, + "learning_rate": 1.083756345177665e-05, + "loss": 0.6412, + "step": 13888 + }, + { + "epoch": 7.833615341229555, + "grad_norm": 1.3986209630966187, + "learning_rate": 1.0834743372814439e-05, + "loss": 0.7572, + "step": 13889 + }, + { + "epoch": 7.834179357021997, + "grad_norm": 1.4945054054260254, + "learning_rate": 1.0831923293852228e-05, + "loss": 0.7964, + "step": 13890 + }, + { + "epoch": 7.8347433728144384, + "grad_norm": 1.049100399017334, + "learning_rate": 1.0829103214890017e-05, + "loss": 0.6594, + "step": 13891 + }, + { + "epoch": 7.835307388606881, + "grad_norm": 1.1963043212890625, + "learning_rate": 1.0826283135927806e-05, + "loss": 0.7837, + "step": 13892 + }, + { + "epoch": 7.835871404399323, + "grad_norm": 1.2176998853683472, + "learning_rate": 1.0823463056965594e-05, + "loss": 0.7677, + "step": 13893 + }, + { + "epoch": 7.836435420191766, + "grad_norm": 1.43440580368042, + "learning_rate": 1.0820642978003385e-05, + "loss": 0.7662, + "step": 13894 + }, + { + "epoch": 7.8369994359842075, + "grad_norm": 1.1343022584915161, + "learning_rate": 1.0817822899041174e-05, + "loss": 0.7184, + "step": 13895 + }, + { + "epoch": 7.837563451776649, + "grad_norm": 1.1229465007781982, + "learning_rate": 1.0815002820078962e-05, + "loss": 0.7307, + "step": 13896 + }, + { + "epoch": 7.838127467569092, + "grad_norm": 0.9402995705604553, + "learning_rate": 1.0812182741116751e-05, + "loss": 0.7823, + "step": 13897 + }, + { + "epoch": 7.838691483361534, + "grad_norm": 1.004928469657898, + "learning_rate": 1.080936266215454e-05, + "loss": 0.6324, + "step": 13898 + }, + { + "epoch": 7.8392554991539765, + "grad_norm": 1.0571893453598022, + "learning_rate": 1.080654258319233e-05, + "loss": 0.8013, + "step": 13899 + }, + { + "epoch": 7.839819514946418, + "grad_norm": 1.0319931507110596, + "learning_rate": 1.080372250423012e-05, + "loss": 0.6655, + "step": 13900 + }, + { + "epoch": 7.840383530738861, + "grad_norm": 1.188608169555664, + "learning_rate": 1.0800902425267908e-05, + "loss": 0.8112, + "step": 13901 + }, + { + "epoch": 7.840947546531303, + "grad_norm": 1.0898116827011108, + "learning_rate": 1.0798082346305697e-05, + "loss": 0.726, + "step": 13902 + }, + { + "epoch": 7.8415115623237455, + "grad_norm": 1.432056188583374, + "learning_rate": 1.0795262267343487e-05, + "loss": 0.7796, + "step": 13903 + }, + { + "epoch": 7.842075578116187, + "grad_norm": 1.4274227619171143, + "learning_rate": 1.0792442188381276e-05, + "loss": 0.7344, + "step": 13904 + }, + { + "epoch": 7.842639593908629, + "grad_norm": 0.8750420212745667, + "learning_rate": 1.0789622109419065e-05, + "loss": 0.6768, + "step": 13905 + }, + { + "epoch": 7.843203609701072, + "grad_norm": 1.421208143234253, + "learning_rate": 1.0786802030456854e-05, + "loss": 0.7541, + "step": 13906 + }, + { + "epoch": 7.843767625493514, + "grad_norm": 1.0756663084030151, + "learning_rate": 1.0783981951494643e-05, + "loss": 0.7088, + "step": 13907 + }, + { + "epoch": 7.844331641285956, + "grad_norm": 1.0464969873428345, + "learning_rate": 1.0781161872532431e-05, + "loss": 0.7678, + "step": 13908 + }, + { + "epoch": 7.844895657078398, + "grad_norm": 1.4321194887161255, + "learning_rate": 1.077834179357022e-05, + "loss": 0.7859, + "step": 13909 + }, + { + "epoch": 7.84545967287084, + "grad_norm": 1.1490296125411987, + "learning_rate": 1.077552171460801e-05, + "loss": 0.765, + "step": 13910 + }, + { + "epoch": 7.846023688663283, + "grad_norm": 1.0076521635055542, + "learning_rate": 1.07727016356458e-05, + "loss": 0.6718, + "step": 13911 + }, + { + "epoch": 7.846587704455724, + "grad_norm": 0.9408526420593262, + "learning_rate": 1.0769881556683588e-05, + "loss": 0.7602, + "step": 13912 + }, + { + "epoch": 7.847151720248167, + "grad_norm": 1.2174633741378784, + "learning_rate": 1.0767061477721377e-05, + "loss": 0.6765, + "step": 13913 + }, + { + "epoch": 7.847715736040609, + "grad_norm": 1.2595471143722534, + "learning_rate": 1.0764241398759166e-05, + "loss": 0.6875, + "step": 13914 + }, + { + "epoch": 7.848279751833052, + "grad_norm": 1.1793113946914673, + "learning_rate": 1.0761421319796955e-05, + "loss": 0.7746, + "step": 13915 + }, + { + "epoch": 7.848843767625493, + "grad_norm": 0.9931444525718689, + "learning_rate": 1.0758601240834743e-05, + "loss": 0.8091, + "step": 13916 + }, + { + "epoch": 7.849407783417936, + "grad_norm": 1.1312187910079956, + "learning_rate": 1.0755781161872532e-05, + "loss": 0.7845, + "step": 13917 + }, + { + "epoch": 7.849971799210378, + "grad_norm": 1.0907154083251953, + "learning_rate": 1.0752961082910323e-05, + "loss": 0.794, + "step": 13918 + }, + { + "epoch": 7.85053581500282, + "grad_norm": 1.4370745420455933, + "learning_rate": 1.0750141003948111e-05, + "loss": 0.6797, + "step": 13919 + }, + { + "epoch": 7.851099830795262, + "grad_norm": 1.0423192977905273, + "learning_rate": 1.07473209249859e-05, + "loss": 0.7532, + "step": 13920 + }, + { + "epoch": 7.851663846587704, + "grad_norm": 1.1396883726119995, + "learning_rate": 1.0744500846023689e-05, + "loss": 0.7455, + "step": 13921 + }, + { + "epoch": 7.852227862380147, + "grad_norm": 1.1990982294082642, + "learning_rate": 1.0741680767061478e-05, + "loss": 0.7724, + "step": 13922 + }, + { + "epoch": 7.852791878172589, + "grad_norm": 1.1521179676055908, + "learning_rate": 1.0738860688099267e-05, + "loss": 0.724, + "step": 13923 + }, + { + "epoch": 7.8533558939650305, + "grad_norm": 1.0752606391906738, + "learning_rate": 1.0736040609137055e-05, + "loss": 0.7901, + "step": 13924 + }, + { + "epoch": 7.853919909757473, + "grad_norm": 1.119064211845398, + "learning_rate": 1.0733220530174844e-05, + "loss": 0.6295, + "step": 13925 + }, + { + "epoch": 7.854483925549915, + "grad_norm": 1.3108187913894653, + "learning_rate": 1.0730400451212635e-05, + "loss": 0.7478, + "step": 13926 + }, + { + "epoch": 7.855047941342358, + "grad_norm": 0.9151445627212524, + "learning_rate": 1.0727580372250424e-05, + "loss": 0.6532, + "step": 13927 + }, + { + "epoch": 7.8556119571347995, + "grad_norm": 1.3621482849121094, + "learning_rate": 1.0724760293288212e-05, + "loss": 0.8286, + "step": 13928 + }, + { + "epoch": 7.856175972927242, + "grad_norm": 1.0273813009262085, + "learning_rate": 1.0721940214326001e-05, + "loss": 0.6116, + "step": 13929 + }, + { + "epoch": 7.856739988719684, + "grad_norm": 1.5351861715316772, + "learning_rate": 1.071912013536379e-05, + "loss": 0.8091, + "step": 13930 + }, + { + "epoch": 7.857304004512127, + "grad_norm": 1.0806541442871094, + "learning_rate": 1.0716300056401579e-05, + "loss": 0.762, + "step": 13931 + }, + { + "epoch": 7.8578680203045685, + "grad_norm": 1.615968942642212, + "learning_rate": 1.0713479977439368e-05, + "loss": 0.9073, + "step": 13932 + }, + { + "epoch": 7.85843203609701, + "grad_norm": 1.5986868143081665, + "learning_rate": 1.0710659898477158e-05, + "loss": 0.7106, + "step": 13933 + }, + { + "epoch": 7.858996051889453, + "grad_norm": 1.2271392345428467, + "learning_rate": 1.0707839819514947e-05, + "loss": 0.8027, + "step": 13934 + }, + { + "epoch": 7.859560067681895, + "grad_norm": 1.1032737493515015, + "learning_rate": 1.0705019740552737e-05, + "loss": 0.7171, + "step": 13935 + }, + { + "epoch": 7.8601240834743376, + "grad_norm": 1.3109381198883057, + "learning_rate": 1.0702199661590526e-05, + "loss": 0.7333, + "step": 13936 + }, + { + "epoch": 7.860688099266779, + "grad_norm": 1.1354259252548218, + "learning_rate": 1.0699379582628315e-05, + "loss": 0.5972, + "step": 13937 + }, + { + "epoch": 7.861252115059221, + "grad_norm": 1.0064417123794556, + "learning_rate": 1.0696559503666104e-05, + "loss": 0.8488, + "step": 13938 + }, + { + "epoch": 7.861816130851664, + "grad_norm": 1.0069273710250854, + "learning_rate": 1.0693739424703892e-05, + "loss": 0.7037, + "step": 13939 + }, + { + "epoch": 7.862380146644106, + "grad_norm": 1.3104798793792725, + "learning_rate": 1.0690919345741681e-05, + "loss": 0.702, + "step": 13940 + }, + { + "epoch": 7.862944162436548, + "grad_norm": 1.6593513488769531, + "learning_rate": 1.068809926677947e-05, + "loss": 0.7989, + "step": 13941 + }, + { + "epoch": 7.86350817822899, + "grad_norm": 0.9897142052650452, + "learning_rate": 1.068527918781726e-05, + "loss": 0.8326, + "step": 13942 + }, + { + "epoch": 7.864072194021433, + "grad_norm": 1.2812987565994263, + "learning_rate": 1.068245910885505e-05, + "loss": 0.7936, + "step": 13943 + }, + { + "epoch": 7.864636209813875, + "grad_norm": 1.1800868511199951, + "learning_rate": 1.0679639029892838e-05, + "loss": 0.7857, + "step": 13944 + }, + { + "epoch": 7.865200225606317, + "grad_norm": 1.3800328969955444, + "learning_rate": 1.0676818950930627e-05, + "loss": 0.718, + "step": 13945 + }, + { + "epoch": 7.865764241398759, + "grad_norm": 1.1178337335586548, + "learning_rate": 1.0673998871968416e-05, + "loss": 0.7509, + "step": 13946 + }, + { + "epoch": 7.866328257191201, + "grad_norm": 1.1307110786437988, + "learning_rate": 1.0671178793006205e-05, + "loss": 0.7248, + "step": 13947 + }, + { + "epoch": 7.866892272983644, + "grad_norm": 1.019721508026123, + "learning_rate": 1.0668358714043993e-05, + "loss": 0.7748, + "step": 13948 + }, + { + "epoch": 7.8674562887760855, + "grad_norm": 0.8302255868911743, + "learning_rate": 1.0665538635081782e-05, + "loss": 0.6689, + "step": 13949 + }, + { + "epoch": 7.868020304568528, + "grad_norm": 1.2560851573944092, + "learning_rate": 1.0662718556119573e-05, + "loss": 0.6923, + "step": 13950 + }, + { + "epoch": 7.86858432036097, + "grad_norm": 1.5231544971466064, + "learning_rate": 1.0659898477157361e-05, + "loss": 0.7314, + "step": 13951 + }, + { + "epoch": 7.869148336153412, + "grad_norm": 1.334028720855713, + "learning_rate": 1.065707839819515e-05, + "loss": 0.7147, + "step": 13952 + }, + { + "epoch": 7.8697123519458545, + "grad_norm": 0.9950805902481079, + "learning_rate": 1.0654258319232939e-05, + "loss": 0.7464, + "step": 13953 + }, + { + "epoch": 7.870276367738296, + "grad_norm": 1.4819648265838623, + "learning_rate": 1.0651438240270728e-05, + "loss": 0.7099, + "step": 13954 + }, + { + "epoch": 7.870840383530739, + "grad_norm": 1.018908977508545, + "learning_rate": 1.0648618161308517e-05, + "loss": 0.6828, + "step": 13955 + }, + { + "epoch": 7.871404399323181, + "grad_norm": 0.8982934951782227, + "learning_rate": 1.0645798082346305e-05, + "loss": 0.7349, + "step": 13956 + }, + { + "epoch": 7.8719684151156235, + "grad_norm": 1.1518197059631348, + "learning_rate": 1.0642978003384094e-05, + "loss": 0.8178, + "step": 13957 + }, + { + "epoch": 7.872532430908065, + "grad_norm": 1.1952993869781494, + "learning_rate": 1.0640157924421885e-05, + "loss": 0.7972, + "step": 13958 + }, + { + "epoch": 7.873096446700508, + "grad_norm": 1.3279021978378296, + "learning_rate": 1.0637337845459674e-05, + "loss": 0.8113, + "step": 13959 + }, + { + "epoch": 7.87366046249295, + "grad_norm": 1.089448094367981, + "learning_rate": 1.0634517766497462e-05, + "loss": 0.7789, + "step": 13960 + }, + { + "epoch": 7.874224478285392, + "grad_norm": 1.2591781616210938, + "learning_rate": 1.0631697687535251e-05, + "loss": 0.7831, + "step": 13961 + }, + { + "epoch": 7.874788494077834, + "grad_norm": 1.227932095527649, + "learning_rate": 1.062887760857304e-05, + "loss": 0.6939, + "step": 13962 + }, + { + "epoch": 7.875352509870276, + "grad_norm": 1.1257412433624268, + "learning_rate": 1.0626057529610829e-05, + "loss": 0.7228, + "step": 13963 + }, + { + "epoch": 7.875916525662719, + "grad_norm": 0.9178603291511536, + "learning_rate": 1.0623237450648617e-05, + "loss": 0.8101, + "step": 13964 + }, + { + "epoch": 7.876480541455161, + "grad_norm": 1.2010685205459595, + "learning_rate": 1.0620417371686408e-05, + "loss": 0.8282, + "step": 13965 + }, + { + "epoch": 7.877044557247602, + "grad_norm": 1.1630665063858032, + "learning_rate": 1.0617597292724197e-05, + "loss": 0.7977, + "step": 13966 + }, + { + "epoch": 7.877608573040045, + "grad_norm": 0.9463736414909363, + "learning_rate": 1.0614777213761986e-05, + "loss": 0.6509, + "step": 13967 + }, + { + "epoch": 7.878172588832487, + "grad_norm": 1.099184274673462, + "learning_rate": 1.0611957134799774e-05, + "loss": 0.7107, + "step": 13968 + }, + { + "epoch": 7.87873660462493, + "grad_norm": 1.2159252166748047, + "learning_rate": 1.0609137055837563e-05, + "loss": 0.7033, + "step": 13969 + }, + { + "epoch": 7.8793006204173714, + "grad_norm": 0.9693794846534729, + "learning_rate": 1.0606316976875354e-05, + "loss": 0.8062, + "step": 13970 + }, + { + "epoch": 7.879864636209814, + "grad_norm": 1.7277607917785645, + "learning_rate": 1.0603496897913142e-05, + "loss": 0.8441, + "step": 13971 + }, + { + "epoch": 7.880428652002256, + "grad_norm": 2.5101664066314697, + "learning_rate": 1.0600676818950931e-05, + "loss": 0.8533, + "step": 13972 + }, + { + "epoch": 7.880992667794699, + "grad_norm": 0.999215304851532, + "learning_rate": 1.059785673998872e-05, + "loss": 0.8207, + "step": 13973 + }, + { + "epoch": 7.8815566835871405, + "grad_norm": 0.9629529714584351, + "learning_rate": 1.059503666102651e-05, + "loss": 0.6927, + "step": 13974 + }, + { + "epoch": 7.882120699379582, + "grad_norm": 1.781927227973938, + "learning_rate": 1.05922165820643e-05, + "loss": 0.7462, + "step": 13975 + }, + { + "epoch": 7.882684715172025, + "grad_norm": 1.352475881576538, + "learning_rate": 1.0589396503102088e-05, + "loss": 0.7359, + "step": 13976 + }, + { + "epoch": 7.883248730964467, + "grad_norm": 1.2244826555252075, + "learning_rate": 1.0586576424139877e-05, + "loss": 0.7753, + "step": 13977 + }, + { + "epoch": 7.8838127467569095, + "grad_norm": 1.3265644311904907, + "learning_rate": 1.0583756345177666e-05, + "loss": 0.7776, + "step": 13978 + }, + { + "epoch": 7.884376762549351, + "grad_norm": 1.0415191650390625, + "learning_rate": 1.0580936266215455e-05, + "loss": 0.5638, + "step": 13979 + }, + { + "epoch": 7.884940778341793, + "grad_norm": 1.2348284721374512, + "learning_rate": 1.0578116187253243e-05, + "loss": 0.8225, + "step": 13980 + }, + { + "epoch": 7.885504794134236, + "grad_norm": 1.0442218780517578, + "learning_rate": 1.0575296108291032e-05, + "loss": 0.7217, + "step": 13981 + }, + { + "epoch": 7.886068809926678, + "grad_norm": 0.9922716021537781, + "learning_rate": 1.0572476029328823e-05, + "loss": 0.72, + "step": 13982 + }, + { + "epoch": 7.88663282571912, + "grad_norm": 1.4303467273712158, + "learning_rate": 1.0569655950366611e-05, + "loss": 0.7608, + "step": 13983 + }, + { + "epoch": 7.887196841511562, + "grad_norm": 0.9353770017623901, + "learning_rate": 1.05668358714044e-05, + "loss": 0.738, + "step": 13984 + }, + { + "epoch": 7.887760857304005, + "grad_norm": 0.8204127550125122, + "learning_rate": 1.0564015792442189e-05, + "loss": 0.6695, + "step": 13985 + }, + { + "epoch": 7.888324873096447, + "grad_norm": 1.227027416229248, + "learning_rate": 1.0561195713479978e-05, + "loss": 0.7485, + "step": 13986 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 1.0093026161193848, + "learning_rate": 1.0558375634517767e-05, + "loss": 0.7517, + "step": 13987 + }, + { + "epoch": 7.889452904681331, + "grad_norm": 1.1827441453933716, + "learning_rate": 1.0555555555555555e-05, + "loss": 0.7723, + "step": 13988 + }, + { + "epoch": 7.890016920473773, + "grad_norm": 1.3160701990127563, + "learning_rate": 1.0552735476593346e-05, + "loss": 0.6593, + "step": 13989 + }, + { + "epoch": 7.890580936266216, + "grad_norm": 1.0521717071533203, + "learning_rate": 1.0549915397631135e-05, + "loss": 0.7201, + "step": 13990 + }, + { + "epoch": 7.891144952058657, + "grad_norm": 1.1912670135498047, + "learning_rate": 1.0547095318668923e-05, + "loss": 0.7152, + "step": 13991 + }, + { + "epoch": 7.8917089678511, + "grad_norm": 1.2011431455612183, + "learning_rate": 1.0544275239706712e-05, + "loss": 0.8125, + "step": 13992 + }, + { + "epoch": 7.892272983643542, + "grad_norm": 1.4304406642913818, + "learning_rate": 1.0541455160744501e-05, + "loss": 0.7552, + "step": 13993 + }, + { + "epoch": 7.892836999435984, + "grad_norm": 0.9806925058364868, + "learning_rate": 1.053863508178229e-05, + "loss": 0.7748, + "step": 13994 + }, + { + "epoch": 7.893401015228426, + "grad_norm": 1.4163142442703247, + "learning_rate": 1.0535815002820079e-05, + "loss": 0.7485, + "step": 13995 + }, + { + "epoch": 7.893965031020868, + "grad_norm": 1.3808938264846802, + "learning_rate": 1.0532994923857867e-05, + "loss": 0.7698, + "step": 13996 + }, + { + "epoch": 7.894529046813311, + "grad_norm": 0.8120247721672058, + "learning_rate": 1.0530174844895658e-05, + "loss": 0.6829, + "step": 13997 + }, + { + "epoch": 7.895093062605753, + "grad_norm": 1.2551078796386719, + "learning_rate": 1.0527354765933447e-05, + "loss": 0.7506, + "step": 13998 + }, + { + "epoch": 7.895657078398195, + "grad_norm": 1.2072703838348389, + "learning_rate": 1.0524534686971236e-05, + "loss": 0.749, + "step": 13999 + }, + { + "epoch": 7.896221094190637, + "grad_norm": 1.0288515090942383, + "learning_rate": 1.0521714608009024e-05, + "loss": 0.7309, + "step": 14000 + }, + { + "epoch": 7.89678510998308, + "grad_norm": 0.9961109757423401, + "learning_rate": 1.0518894529046813e-05, + "loss": 0.7223, + "step": 14001 + }, + { + "epoch": 7.897349125775522, + "grad_norm": 1.0328370332717896, + "learning_rate": 1.0516074450084602e-05, + "loss": 0.8014, + "step": 14002 + }, + { + "epoch": 7.8979131415679635, + "grad_norm": 0.9351440072059631, + "learning_rate": 1.051325437112239e-05, + "loss": 0.758, + "step": 14003 + }, + { + "epoch": 7.898477157360406, + "grad_norm": 1.1811014413833618, + "learning_rate": 1.051043429216018e-05, + "loss": 0.7206, + "step": 14004 + }, + { + "epoch": 7.899041173152848, + "grad_norm": 0.993813157081604, + "learning_rate": 1.050761421319797e-05, + "loss": 0.7001, + "step": 14005 + }, + { + "epoch": 7.899605188945291, + "grad_norm": 1.135050654411316, + "learning_rate": 1.050479413423576e-05, + "loss": 0.7379, + "step": 14006 + }, + { + "epoch": 7.9001692047377325, + "grad_norm": 1.0926660299301147, + "learning_rate": 1.050197405527355e-05, + "loss": 0.6562, + "step": 14007 + }, + { + "epoch": 7.900733220530174, + "grad_norm": 1.273051381111145, + "learning_rate": 1.0499153976311338e-05, + "loss": 0.7828, + "step": 14008 + }, + { + "epoch": 7.901297236322617, + "grad_norm": 0.9852173328399658, + "learning_rate": 1.0496333897349127e-05, + "loss": 0.7103, + "step": 14009 + }, + { + "epoch": 7.901861252115059, + "grad_norm": 1.1269757747650146, + "learning_rate": 1.0493513818386916e-05, + "loss": 0.7935, + "step": 14010 + }, + { + "epoch": 7.9024252679075015, + "grad_norm": 1.5600154399871826, + "learning_rate": 1.0490693739424704e-05, + "loss": 0.8802, + "step": 14011 + }, + { + "epoch": 7.902989283699943, + "grad_norm": 1.1089075803756714, + "learning_rate": 1.0487873660462493e-05, + "loss": 0.7726, + "step": 14012 + }, + { + "epoch": 7.903553299492386, + "grad_norm": 0.8877044320106506, + "learning_rate": 1.0485053581500282e-05, + "loss": 0.6509, + "step": 14013 + }, + { + "epoch": 7.904117315284828, + "grad_norm": 1.1496756076812744, + "learning_rate": 1.0482233502538073e-05, + "loss": 0.7311, + "step": 14014 + }, + { + "epoch": 7.904681331077271, + "grad_norm": 1.0657532215118408, + "learning_rate": 1.0479413423575861e-05, + "loss": 0.8625, + "step": 14015 + }, + { + "epoch": 7.905245346869712, + "grad_norm": 0.8865318298339844, + "learning_rate": 1.047659334461365e-05, + "loss": 0.7433, + "step": 14016 + }, + { + "epoch": 7.905809362662154, + "grad_norm": 1.1982059478759766, + "learning_rate": 1.0473773265651439e-05, + "loss": 0.7218, + "step": 14017 + }, + { + "epoch": 7.906373378454597, + "grad_norm": 1.2887541055679321, + "learning_rate": 1.0470953186689228e-05, + "loss": 0.8364, + "step": 14018 + }, + { + "epoch": 7.906937394247039, + "grad_norm": 1.2271625995635986, + "learning_rate": 1.0468133107727017e-05, + "loss": 0.7093, + "step": 14019 + }, + { + "epoch": 7.907501410039481, + "grad_norm": 1.1503336429595947, + "learning_rate": 1.0465313028764805e-05, + "loss": 0.712, + "step": 14020 + }, + { + "epoch": 7.908065425831923, + "grad_norm": 0.972335696220398, + "learning_rate": 1.0462492949802596e-05, + "loss": 0.7438, + "step": 14021 + }, + { + "epoch": 7.908629441624365, + "grad_norm": 1.0235894918441772, + "learning_rate": 1.0459672870840385e-05, + "loss": 0.6604, + "step": 14022 + }, + { + "epoch": 7.909193457416808, + "grad_norm": 1.1738139390945435, + "learning_rate": 1.0456852791878173e-05, + "loss": 0.7008, + "step": 14023 + }, + { + "epoch": 7.9097574732092495, + "grad_norm": 1.1631882190704346, + "learning_rate": 1.0454032712915962e-05, + "loss": 0.6328, + "step": 14024 + }, + { + "epoch": 7.910321489001692, + "grad_norm": 1.2527875900268555, + "learning_rate": 1.0451212633953751e-05, + "loss": 0.7758, + "step": 14025 + }, + { + "epoch": 7.910885504794134, + "grad_norm": 0.9895687103271484, + "learning_rate": 1.044839255499154e-05, + "loss": 0.7747, + "step": 14026 + }, + { + "epoch": 7.911449520586577, + "grad_norm": 1.2982374429702759, + "learning_rate": 1.0445572476029329e-05, + "loss": 0.8711, + "step": 14027 + }, + { + "epoch": 7.9120135363790185, + "grad_norm": 1.05192232131958, + "learning_rate": 1.0442752397067117e-05, + "loss": 0.7999, + "step": 14028 + }, + { + "epoch": 7.912577552171461, + "grad_norm": 1.1256897449493408, + "learning_rate": 1.0439932318104908e-05, + "loss": 0.8618, + "step": 14029 + }, + { + "epoch": 7.913141567963903, + "grad_norm": 1.175299048423767, + "learning_rate": 1.0437112239142697e-05, + "loss": 0.7974, + "step": 14030 + }, + { + "epoch": 7.913705583756345, + "grad_norm": 1.3342301845550537, + "learning_rate": 1.0434292160180485e-05, + "loss": 0.7608, + "step": 14031 + }, + { + "epoch": 7.9142695995487875, + "grad_norm": 1.1817294359207153, + "learning_rate": 1.0431472081218274e-05, + "loss": 0.7345, + "step": 14032 + }, + { + "epoch": 7.914833615341229, + "grad_norm": 0.9383736252784729, + "learning_rate": 1.0428652002256063e-05, + "loss": 0.7993, + "step": 14033 + }, + { + "epoch": 7.915397631133672, + "grad_norm": 1.2194472551345825, + "learning_rate": 1.0425831923293852e-05, + "loss": 0.7534, + "step": 14034 + }, + { + "epoch": 7.915961646926114, + "grad_norm": 1.1782498359680176, + "learning_rate": 1.042301184433164e-05, + "loss": 0.7739, + "step": 14035 + }, + { + "epoch": 7.916525662718556, + "grad_norm": 0.9482442140579224, + "learning_rate": 1.042019176536943e-05, + "loss": 0.6733, + "step": 14036 + }, + { + "epoch": 7.917089678510998, + "grad_norm": 1.078311800956726, + "learning_rate": 1.041737168640722e-05, + "loss": 0.7376, + "step": 14037 + }, + { + "epoch": 7.91765369430344, + "grad_norm": 1.0666924715042114, + "learning_rate": 1.0414551607445009e-05, + "loss": 0.6629, + "step": 14038 + }, + { + "epoch": 7.918217710095883, + "grad_norm": 0.9671581387519836, + "learning_rate": 1.0411731528482798e-05, + "loss": 0.6973, + "step": 14039 + }, + { + "epoch": 7.918781725888325, + "grad_norm": 0.7959293127059937, + "learning_rate": 1.0408911449520586e-05, + "loss": 0.6718, + "step": 14040 + }, + { + "epoch": 7.919345741680767, + "grad_norm": 1.2912598848342896, + "learning_rate": 1.0406091370558377e-05, + "loss": 0.8685, + "step": 14041 + }, + { + "epoch": 7.919909757473209, + "grad_norm": 1.0888398885726929, + "learning_rate": 1.0403271291596166e-05, + "loss": 0.8328, + "step": 14042 + }, + { + "epoch": 7.920473773265652, + "grad_norm": 1.0683202743530273, + "learning_rate": 1.0400451212633954e-05, + "loss": 0.6626, + "step": 14043 + }, + { + "epoch": 7.921037789058094, + "grad_norm": 1.2440061569213867, + "learning_rate": 1.0397631133671743e-05, + "loss": 0.6472, + "step": 14044 + }, + { + "epoch": 7.9216018048505354, + "grad_norm": 1.372941017150879, + "learning_rate": 1.0394811054709532e-05, + "loss": 0.8519, + "step": 14045 + }, + { + "epoch": 7.922165820642978, + "grad_norm": 0.8740423321723938, + "learning_rate": 1.0391990975747323e-05, + "loss": 0.746, + "step": 14046 + }, + { + "epoch": 7.92272983643542, + "grad_norm": 1.095963478088379, + "learning_rate": 1.0389170896785111e-05, + "loss": 0.6843, + "step": 14047 + }, + { + "epoch": 7.923293852227863, + "grad_norm": 1.0383880138397217, + "learning_rate": 1.03863508178229e-05, + "loss": 0.7325, + "step": 14048 + }, + { + "epoch": 7.9238578680203045, + "grad_norm": 1.5318758487701416, + "learning_rate": 1.0383530738860689e-05, + "loss": 0.7626, + "step": 14049 + }, + { + "epoch": 7.924421883812746, + "grad_norm": 1.5276055335998535, + "learning_rate": 1.0380710659898478e-05, + "loss": 0.797, + "step": 14050 + }, + { + "epoch": 7.924985899605189, + "grad_norm": 1.370964765548706, + "learning_rate": 1.0377890580936267e-05, + "loss": 0.7868, + "step": 14051 + }, + { + "epoch": 7.925549915397631, + "grad_norm": 1.0286798477172852, + "learning_rate": 1.0375070501974055e-05, + "loss": 0.7234, + "step": 14052 + }, + { + "epoch": 7.9261139311900735, + "grad_norm": 1.1476508378982544, + "learning_rate": 1.0372250423011846e-05, + "loss": 0.7372, + "step": 14053 + }, + { + "epoch": 7.926677946982515, + "grad_norm": 1.1697182655334473, + "learning_rate": 1.0369430344049635e-05, + "loss": 0.774, + "step": 14054 + }, + { + "epoch": 7.927241962774958, + "grad_norm": 1.1706368923187256, + "learning_rate": 1.0366610265087423e-05, + "loss": 0.7452, + "step": 14055 + }, + { + "epoch": 7.9278059785674, + "grad_norm": 1.690914511680603, + "learning_rate": 1.0363790186125212e-05, + "loss": 0.7709, + "step": 14056 + }, + { + "epoch": 7.9283699943598425, + "grad_norm": 0.9758642911911011, + "learning_rate": 1.0360970107163001e-05, + "loss": 0.6862, + "step": 14057 + }, + { + "epoch": 7.928934010152284, + "grad_norm": 0.8072892427444458, + "learning_rate": 1.035815002820079e-05, + "loss": 0.6731, + "step": 14058 + }, + { + "epoch": 7.929498025944726, + "grad_norm": 0.8437314629554749, + "learning_rate": 1.0355329949238579e-05, + "loss": 0.6223, + "step": 14059 + }, + { + "epoch": 7.930062041737169, + "grad_norm": 0.9143855571746826, + "learning_rate": 1.0352509870276367e-05, + "loss": 0.679, + "step": 14060 + }, + { + "epoch": 7.930626057529611, + "grad_norm": 1.0611810684204102, + "learning_rate": 1.0349689791314158e-05, + "loss": 0.7947, + "step": 14061 + }, + { + "epoch": 7.931190073322053, + "grad_norm": 1.1464613676071167, + "learning_rate": 1.0346869712351947e-05, + "loss": 0.7084, + "step": 14062 + }, + { + "epoch": 7.931754089114495, + "grad_norm": 1.160826563835144, + "learning_rate": 1.0344049633389735e-05, + "loss": 0.8072, + "step": 14063 + }, + { + "epoch": 7.932318104906937, + "grad_norm": 1.0920497179031372, + "learning_rate": 1.0341229554427524e-05, + "loss": 0.7439, + "step": 14064 + }, + { + "epoch": 7.93288212069938, + "grad_norm": 1.464218258857727, + "learning_rate": 1.0338409475465313e-05, + "loss": 0.6657, + "step": 14065 + }, + { + "epoch": 7.933446136491821, + "grad_norm": 1.3728899955749512, + "learning_rate": 1.0335589396503102e-05, + "loss": 0.7368, + "step": 14066 + }, + { + "epoch": 7.934010152284264, + "grad_norm": 1.2092972993850708, + "learning_rate": 1.033276931754089e-05, + "loss": 0.6763, + "step": 14067 + }, + { + "epoch": 7.934574168076706, + "grad_norm": 0.9976261854171753, + "learning_rate": 1.0329949238578681e-05, + "loss": 0.713, + "step": 14068 + }, + { + "epoch": 7.935138183869149, + "grad_norm": 1.3353867530822754, + "learning_rate": 1.032712915961647e-05, + "loss": 0.69, + "step": 14069 + }, + { + "epoch": 7.93570219966159, + "grad_norm": 1.1060751676559448, + "learning_rate": 1.0324309080654259e-05, + "loss": 0.7196, + "step": 14070 + }, + { + "epoch": 7.936266215454033, + "grad_norm": 0.9001251459121704, + "learning_rate": 1.0321489001692048e-05, + "loss": 0.6847, + "step": 14071 + }, + { + "epoch": 7.936830231246475, + "grad_norm": 0.9190165996551514, + "learning_rate": 1.0318668922729836e-05, + "loss": 0.7107, + "step": 14072 + }, + { + "epoch": 7.937394247038917, + "grad_norm": 1.0265647172927856, + "learning_rate": 1.0315848843767625e-05, + "loss": 0.6946, + "step": 14073 + }, + { + "epoch": 7.937958262831359, + "grad_norm": 1.467876672744751, + "learning_rate": 1.0313028764805414e-05, + "loss": 0.7028, + "step": 14074 + }, + { + "epoch": 7.938522278623801, + "grad_norm": 1.2646746635437012, + "learning_rate": 1.0310208685843203e-05, + "loss": 0.7909, + "step": 14075 + }, + { + "epoch": 7.939086294416244, + "grad_norm": 1.1376399993896484, + "learning_rate": 1.0307388606880993e-05, + "loss": 0.6745, + "step": 14076 + }, + { + "epoch": 7.939650310208686, + "grad_norm": 1.2182611227035522, + "learning_rate": 1.0304568527918784e-05, + "loss": 0.8361, + "step": 14077 + }, + { + "epoch": 7.940214326001128, + "grad_norm": 0.8953420519828796, + "learning_rate": 1.0301748448956572e-05, + "loss": 0.6851, + "step": 14078 + }, + { + "epoch": 7.94077834179357, + "grad_norm": 1.1786398887634277, + "learning_rate": 1.0298928369994361e-05, + "loss": 0.7098, + "step": 14079 + }, + { + "epoch": 7.941342357586012, + "grad_norm": 1.4391270875930786, + "learning_rate": 1.029610829103215e-05, + "loss": 0.6165, + "step": 14080 + }, + { + "epoch": 7.941906373378455, + "grad_norm": 1.08609139919281, + "learning_rate": 1.0293288212069939e-05, + "loss": 0.6898, + "step": 14081 + }, + { + "epoch": 7.9424703891708965, + "grad_norm": 1.4606574773788452, + "learning_rate": 1.0290468133107728e-05, + "loss": 0.7798, + "step": 14082 + }, + { + "epoch": 7.943034404963339, + "grad_norm": 1.2489396333694458, + "learning_rate": 1.0287648054145516e-05, + "loss": 0.7084, + "step": 14083 + }, + { + "epoch": 7.943598420755781, + "grad_norm": 1.500008225440979, + "learning_rate": 1.0284827975183305e-05, + "loss": 0.8213, + "step": 14084 + }, + { + "epoch": 7.944162436548224, + "grad_norm": 1.145708441734314, + "learning_rate": 1.0282007896221096e-05, + "loss": 0.7399, + "step": 14085 + }, + { + "epoch": 7.9447264523406655, + "grad_norm": 1.216222882270813, + "learning_rate": 1.0279187817258885e-05, + "loss": 0.6623, + "step": 14086 + }, + { + "epoch": 7.945290468133107, + "grad_norm": 0.9878848791122437, + "learning_rate": 1.0276367738296673e-05, + "loss": 0.6844, + "step": 14087 + }, + { + "epoch": 7.94585448392555, + "grad_norm": 1.3246362209320068, + "learning_rate": 1.0273547659334462e-05, + "loss": 0.812, + "step": 14088 + }, + { + "epoch": 7.946418499717992, + "grad_norm": 1.1134519577026367, + "learning_rate": 1.0270727580372251e-05, + "loss": 0.7071, + "step": 14089 + }, + { + "epoch": 7.9469825155104346, + "grad_norm": 0.8499265313148499, + "learning_rate": 1.026790750141004e-05, + "loss": 0.7071, + "step": 14090 + }, + { + "epoch": 7.947546531302876, + "grad_norm": 1.1451358795166016, + "learning_rate": 1.0265087422447829e-05, + "loss": 0.785, + "step": 14091 + }, + { + "epoch": 7.948110547095319, + "grad_norm": 0.9826745986938477, + "learning_rate": 1.0262267343485617e-05, + "loss": 0.7495, + "step": 14092 + }, + { + "epoch": 7.948674562887761, + "grad_norm": 0.8495224714279175, + "learning_rate": 1.0259447264523408e-05, + "loss": 0.6616, + "step": 14093 + }, + { + "epoch": 7.949238578680203, + "grad_norm": 0.8908209800720215, + "learning_rate": 1.0256627185561197e-05, + "loss": 0.7428, + "step": 14094 + }, + { + "epoch": 7.949802594472645, + "grad_norm": 1.3322341442108154, + "learning_rate": 1.0253807106598985e-05, + "loss": 0.779, + "step": 14095 + }, + { + "epoch": 7.950366610265087, + "grad_norm": 0.999028742313385, + "learning_rate": 1.0250987027636774e-05, + "loss": 0.7026, + "step": 14096 + }, + { + "epoch": 7.95093062605753, + "grad_norm": 1.109712839126587, + "learning_rate": 1.0248166948674563e-05, + "loss": 0.8057, + "step": 14097 + }, + { + "epoch": 7.951494641849972, + "grad_norm": 1.0768442153930664, + "learning_rate": 1.0245346869712352e-05, + "loss": 0.6453, + "step": 14098 + }, + { + "epoch": 7.952058657642414, + "grad_norm": 0.9450891613960266, + "learning_rate": 1.024252679075014e-05, + "loss": 0.7679, + "step": 14099 + }, + { + "epoch": 7.952622673434856, + "grad_norm": 1.1579209566116333, + "learning_rate": 1.0239706711787931e-05, + "loss": 0.7609, + "step": 14100 + }, + { + "epoch": 7.953186689227298, + "grad_norm": 1.6201213598251343, + "learning_rate": 1.023688663282572e-05, + "loss": 0.929, + "step": 14101 + }, + { + "epoch": 7.953750705019741, + "grad_norm": 1.1910278797149658, + "learning_rate": 1.0234066553863509e-05, + "loss": 0.7533, + "step": 14102 + }, + { + "epoch": 7.9543147208121825, + "grad_norm": 0.9469682574272156, + "learning_rate": 1.0231246474901297e-05, + "loss": 0.7117, + "step": 14103 + }, + { + "epoch": 7.954878736604625, + "grad_norm": 1.3499200344085693, + "learning_rate": 1.0228426395939086e-05, + "loss": 0.7935, + "step": 14104 + }, + { + "epoch": 7.955442752397067, + "grad_norm": 0.9179076552391052, + "learning_rate": 1.0225606316976875e-05, + "loss": 0.7511, + "step": 14105 + }, + { + "epoch": 7.95600676818951, + "grad_norm": 1.1133838891983032, + "learning_rate": 1.0222786238014664e-05, + "loss": 0.7047, + "step": 14106 + }, + { + "epoch": 7.9565707839819515, + "grad_norm": 1.027016282081604, + "learning_rate": 1.0219966159052453e-05, + "loss": 0.7022, + "step": 14107 + }, + { + "epoch": 7.957134799774393, + "grad_norm": 0.9351163506507874, + "learning_rate": 1.0217146080090243e-05, + "loss": 0.7074, + "step": 14108 + }, + { + "epoch": 7.957698815566836, + "grad_norm": 1.3169490098953247, + "learning_rate": 1.0214326001128032e-05, + "loss": 0.7416, + "step": 14109 + }, + { + "epoch": 7.958262831359278, + "grad_norm": 1.111566424369812, + "learning_rate": 1.021150592216582e-05, + "loss": 0.7488, + "step": 14110 + }, + { + "epoch": 7.9588268471517205, + "grad_norm": 0.9171789288520813, + "learning_rate": 1.0208685843203611e-05, + "loss": 0.6765, + "step": 14111 + }, + { + "epoch": 7.959390862944162, + "grad_norm": 1.2375670671463013, + "learning_rate": 1.02058657642414e-05, + "loss": 0.754, + "step": 14112 + }, + { + "epoch": 7.959954878736605, + "grad_norm": 0.9830402135848999, + "learning_rate": 1.0203045685279189e-05, + "loss": 0.6617, + "step": 14113 + }, + { + "epoch": 7.960518894529047, + "grad_norm": 0.9417779445648193, + "learning_rate": 1.0200225606316978e-05, + "loss": 0.7196, + "step": 14114 + }, + { + "epoch": 7.961082910321489, + "grad_norm": 0.8670427799224854, + "learning_rate": 1.0197405527354766e-05, + "loss": 0.7136, + "step": 14115 + }, + { + "epoch": 7.961646926113931, + "grad_norm": 1.4521293640136719, + "learning_rate": 1.0194585448392555e-05, + "loss": 0.8452, + "step": 14116 + }, + { + "epoch": 7.962210941906373, + "grad_norm": 0.8318537473678589, + "learning_rate": 1.0191765369430346e-05, + "loss": 0.6736, + "step": 14117 + }, + { + "epoch": 7.962774957698816, + "grad_norm": 0.8767427206039429, + "learning_rate": 1.0188945290468135e-05, + "loss": 0.672, + "step": 14118 + }, + { + "epoch": 7.963338973491258, + "grad_norm": 1.2113288640975952, + "learning_rate": 1.0186125211505923e-05, + "loss": 0.8626, + "step": 14119 + }, + { + "epoch": 7.9639029892837, + "grad_norm": 1.059755802154541, + "learning_rate": 1.0183305132543712e-05, + "loss": 0.7333, + "step": 14120 + }, + { + "epoch": 7.964467005076142, + "grad_norm": 1.1271562576293945, + "learning_rate": 1.0180485053581501e-05, + "loss": 0.8448, + "step": 14121 + }, + { + "epoch": 7.965031020868584, + "grad_norm": 0.9494895935058594, + "learning_rate": 1.017766497461929e-05, + "loss": 0.7443, + "step": 14122 + }, + { + "epoch": 7.965595036661027, + "grad_norm": 1.4162489175796509, + "learning_rate": 1.0174844895657078e-05, + "loss": 0.8222, + "step": 14123 + }, + { + "epoch": 7.9661590524534684, + "grad_norm": 0.8497731685638428, + "learning_rate": 1.0172024816694867e-05, + "loss": 0.6331, + "step": 14124 + }, + { + "epoch": 7.966723068245911, + "grad_norm": 1.4739309549331665, + "learning_rate": 1.0169204737732658e-05, + "loss": 0.7258, + "step": 14125 + }, + { + "epoch": 7.967287084038353, + "grad_norm": 1.0019272565841675, + "learning_rate": 1.0166384658770447e-05, + "loss": 0.6329, + "step": 14126 + }, + { + "epoch": 7.967851099830796, + "grad_norm": 1.0189310312271118, + "learning_rate": 1.0163564579808235e-05, + "loss": 0.7648, + "step": 14127 + }, + { + "epoch": 7.9684151156232375, + "grad_norm": 1.2859675884246826, + "learning_rate": 1.0160744500846024e-05, + "loss": 0.8068, + "step": 14128 + }, + { + "epoch": 7.968979131415679, + "grad_norm": 1.039481282234192, + "learning_rate": 1.0157924421883813e-05, + "loss": 0.5747, + "step": 14129 + }, + { + "epoch": 7.969543147208122, + "grad_norm": 1.1321009397506714, + "learning_rate": 1.0155104342921602e-05, + "loss": 0.7468, + "step": 14130 + }, + { + "epoch": 7.970107163000564, + "grad_norm": 1.564864993095398, + "learning_rate": 1.015228426395939e-05, + "loss": 0.72, + "step": 14131 + }, + { + "epoch": 7.9706711787930065, + "grad_norm": 1.2158915996551514, + "learning_rate": 1.0149464184997181e-05, + "loss": 0.7857, + "step": 14132 + }, + { + "epoch": 7.971235194585448, + "grad_norm": 1.0529825687408447, + "learning_rate": 1.014664410603497e-05, + "loss": 0.661, + "step": 14133 + }, + { + "epoch": 7.971799210377891, + "grad_norm": 1.0870639085769653, + "learning_rate": 1.0143824027072759e-05, + "loss": 0.7254, + "step": 14134 + }, + { + "epoch": 7.972363226170333, + "grad_norm": 1.4267090559005737, + "learning_rate": 1.0141003948110547e-05, + "loss": 0.7962, + "step": 14135 + }, + { + "epoch": 7.972927241962775, + "grad_norm": 1.1275129318237305, + "learning_rate": 1.0138183869148336e-05, + "loss": 0.7241, + "step": 14136 + }, + { + "epoch": 7.973491257755217, + "grad_norm": 1.4183677434921265, + "learning_rate": 1.0135363790186125e-05, + "loss": 0.7555, + "step": 14137 + }, + { + "epoch": 7.974055273547659, + "grad_norm": 0.9634289145469666, + "learning_rate": 1.0132543711223914e-05, + "loss": 0.7572, + "step": 14138 + }, + { + "epoch": 7.974619289340102, + "grad_norm": 1.190148949623108, + "learning_rate": 1.0129723632261703e-05, + "loss": 0.8256, + "step": 14139 + }, + { + "epoch": 7.975183305132544, + "grad_norm": 0.9740539789199829, + "learning_rate": 1.0126903553299493e-05, + "loss": 0.7263, + "step": 14140 + }, + { + "epoch": 7.975747320924986, + "grad_norm": 0.9100780487060547, + "learning_rate": 1.0124083474337282e-05, + "loss": 0.7257, + "step": 14141 + }, + { + "epoch": 7.976311336717428, + "grad_norm": 1.0924663543701172, + "learning_rate": 1.012126339537507e-05, + "loss": 0.8185, + "step": 14142 + }, + { + "epoch": 7.97687535250987, + "grad_norm": 1.0016683340072632, + "learning_rate": 1.011844331641286e-05, + "loss": 0.6961, + "step": 14143 + }, + { + "epoch": 7.977439368302313, + "grad_norm": 1.1211742162704468, + "learning_rate": 1.0115623237450648e-05, + "loss": 0.6597, + "step": 14144 + }, + { + "epoch": 7.978003384094754, + "grad_norm": 1.3037501573562622, + "learning_rate": 1.0112803158488437e-05, + "loss": 0.6213, + "step": 14145 + }, + { + "epoch": 7.978567399887197, + "grad_norm": 1.1427329778671265, + "learning_rate": 1.0109983079526226e-05, + "loss": 0.7265, + "step": 14146 + }, + { + "epoch": 7.979131415679639, + "grad_norm": 1.3720002174377441, + "learning_rate": 1.0107163000564016e-05, + "loss": 0.7916, + "step": 14147 + }, + { + "epoch": 7.979695431472082, + "grad_norm": 1.02837073802948, + "learning_rate": 1.0104342921601805e-05, + "loss": 0.699, + "step": 14148 + }, + { + "epoch": 7.980259447264523, + "grad_norm": 1.0964548587799072, + "learning_rate": 1.0101522842639596e-05, + "loss": 0.6687, + "step": 14149 + }, + { + "epoch": 7.980823463056965, + "grad_norm": 0.8998172283172607, + "learning_rate": 1.0098702763677384e-05, + "loss": 0.7248, + "step": 14150 + }, + { + "epoch": 7.981387478849408, + "grad_norm": 0.8364894986152649, + "learning_rate": 1.0095882684715173e-05, + "loss": 0.5885, + "step": 14151 + }, + { + "epoch": 7.98195149464185, + "grad_norm": 1.0207229852676392, + "learning_rate": 1.0093062605752962e-05, + "loss": 0.7044, + "step": 14152 + }, + { + "epoch": 7.982515510434292, + "grad_norm": 0.8851233124732971, + "learning_rate": 1.0090242526790751e-05, + "loss": 0.6449, + "step": 14153 + }, + { + "epoch": 7.983079526226734, + "grad_norm": 0.9393771290779114, + "learning_rate": 1.008742244782854e-05, + "loss": 0.6455, + "step": 14154 + }, + { + "epoch": 7.983643542019177, + "grad_norm": 0.9932124018669128, + "learning_rate": 1.0084602368866328e-05, + "loss": 0.6818, + "step": 14155 + }, + { + "epoch": 7.984207557811619, + "grad_norm": 1.4426765441894531, + "learning_rate": 1.0081782289904119e-05, + "loss": 0.7603, + "step": 14156 + }, + { + "epoch": 7.9847715736040605, + "grad_norm": 0.9620354175567627, + "learning_rate": 1.0078962210941908e-05, + "loss": 0.6212, + "step": 14157 + }, + { + "epoch": 7.985335589396503, + "grad_norm": 0.8246600031852722, + "learning_rate": 1.0076142131979697e-05, + "loss": 0.6557, + "step": 14158 + }, + { + "epoch": 7.985899605188945, + "grad_norm": 1.0146175622940063, + "learning_rate": 1.0073322053017485e-05, + "loss": 0.6469, + "step": 14159 + }, + { + "epoch": 7.986463620981388, + "grad_norm": 1.2406765222549438, + "learning_rate": 1.0070501974055274e-05, + "loss": 0.7849, + "step": 14160 + }, + { + "epoch": 7.9870276367738295, + "grad_norm": 1.145145297050476, + "learning_rate": 1.0067681895093063e-05, + "loss": 0.796, + "step": 14161 + }, + { + "epoch": 7.987591652566272, + "grad_norm": 1.140358567237854, + "learning_rate": 1.0064861816130852e-05, + "loss": 0.7661, + "step": 14162 + }, + { + "epoch": 7.988155668358714, + "grad_norm": 1.5002061128616333, + "learning_rate": 1.006204173716864e-05, + "loss": 0.7603, + "step": 14163 + }, + { + "epoch": 7.988719684151156, + "grad_norm": 1.2236781120300293, + "learning_rate": 1.0059221658206431e-05, + "loss": 0.7765, + "step": 14164 + }, + { + "epoch": 7.9892836999435985, + "grad_norm": 1.3222246170043945, + "learning_rate": 1.005640157924422e-05, + "loss": 0.834, + "step": 14165 + }, + { + "epoch": 7.98984771573604, + "grad_norm": 1.0058525800704956, + "learning_rate": 1.0053581500282009e-05, + "loss": 0.5948, + "step": 14166 + }, + { + "epoch": 7.990411731528483, + "grad_norm": 1.3591108322143555, + "learning_rate": 1.0050761421319797e-05, + "loss": 0.7926, + "step": 14167 + }, + { + "epoch": 7.990975747320925, + "grad_norm": 0.902584969997406, + "learning_rate": 1.0047941342357586e-05, + "loss": 0.6786, + "step": 14168 + }, + { + "epoch": 7.991539763113368, + "grad_norm": 1.21331787109375, + "learning_rate": 1.0045121263395375e-05, + "loss": 0.6866, + "step": 14169 + }, + { + "epoch": 7.992103778905809, + "grad_norm": 1.0816552639007568, + "learning_rate": 1.0042301184433164e-05, + "loss": 0.6425, + "step": 14170 + }, + { + "epoch": 7.992667794698251, + "grad_norm": 0.8907977938652039, + "learning_rate": 1.0039481105470953e-05, + "loss": 0.7533, + "step": 14171 + }, + { + "epoch": 7.993231810490694, + "grad_norm": 0.8137229681015015, + "learning_rate": 1.0036661026508743e-05, + "loss": 0.6843, + "step": 14172 + }, + { + "epoch": 7.993795826283136, + "grad_norm": 1.103701114654541, + "learning_rate": 1.0033840947546532e-05, + "loss": 0.7412, + "step": 14173 + }, + { + "epoch": 7.994359842075578, + "grad_norm": 1.177544116973877, + "learning_rate": 1.003102086858432e-05, + "loss": 0.7583, + "step": 14174 + }, + { + "epoch": 7.99492385786802, + "grad_norm": 0.8940643072128296, + "learning_rate": 1.002820078962211e-05, + "loss": 0.7349, + "step": 14175 + }, + { + "epoch": 7.995487873660463, + "grad_norm": 1.0813591480255127, + "learning_rate": 1.0025380710659898e-05, + "loss": 0.728, + "step": 14176 + }, + { + "epoch": 7.996051889452905, + "grad_norm": 1.402542233467102, + "learning_rate": 1.0022560631697687e-05, + "loss": 0.7194, + "step": 14177 + }, + { + "epoch": 7.9966159052453465, + "grad_norm": 0.883377194404602, + "learning_rate": 1.0019740552735476e-05, + "loss": 0.7461, + "step": 14178 + }, + { + "epoch": 7.997179921037789, + "grad_norm": 0.9868687987327576, + "learning_rate": 1.0016920473773266e-05, + "loss": 0.7595, + "step": 14179 + }, + { + "epoch": 7.997743936830231, + "grad_norm": 1.273874044418335, + "learning_rate": 1.0014100394811055e-05, + "loss": 0.829, + "step": 14180 + }, + { + "epoch": 7.998307952622674, + "grad_norm": 1.0102514028549194, + "learning_rate": 1.0011280315848844e-05, + "loss": 0.6954, + "step": 14181 + }, + { + "epoch": 7.9988719684151155, + "grad_norm": 1.460245132446289, + "learning_rate": 1.0008460236886634e-05, + "loss": 0.7372, + "step": 14182 + }, + { + "epoch": 7.999435984207558, + "grad_norm": 0.871195375919342, + "learning_rate": 1.0005640157924423e-05, + "loss": 0.7309, + "step": 14183 + }, + { + "epoch": 8.0, + "grad_norm": 1.6548125743865967, + "learning_rate": 1.0002820078962212e-05, + "loss": 0.7794, + "step": 14184 + }, + { + "epoch": 8.000564015792442, + "grad_norm": 1.1098926067352295, + "learning_rate": 1e-05, + "loss": 0.7147, + "step": 14185 + }, + { + "epoch": 8.001128031584884, + "grad_norm": 1.1702321767807007, + "learning_rate": 9.99717992103779e-06, + "loss": 0.7903, + "step": 14186 + }, + { + "epoch": 8.001692047377327, + "grad_norm": 1.5242257118225098, + "learning_rate": 9.994359842075578e-06, + "loss": 0.7213, + "step": 14187 + }, + { + "epoch": 8.002256063169769, + "grad_norm": 1.1132314205169678, + "learning_rate": 9.991539763113369e-06, + "loss": 0.7473, + "step": 14188 + }, + { + "epoch": 8.00282007896221, + "grad_norm": 1.307416319847107, + "learning_rate": 9.988719684151158e-06, + "loss": 0.8297, + "step": 14189 + }, + { + "epoch": 8.003384094754653, + "grad_norm": 1.3258416652679443, + "learning_rate": 9.985899605188946e-06, + "loss": 0.7286, + "step": 14190 + }, + { + "epoch": 8.003948110547096, + "grad_norm": 1.0309913158416748, + "learning_rate": 9.983079526226735e-06, + "loss": 0.6689, + "step": 14191 + }, + { + "epoch": 8.004512126339538, + "grad_norm": 0.9060167074203491, + "learning_rate": 9.980259447264524e-06, + "loss": 0.6692, + "step": 14192 + }, + { + "epoch": 8.00507614213198, + "grad_norm": 1.1011656522750854, + "learning_rate": 9.977439368302313e-06, + "loss": 0.754, + "step": 14193 + }, + { + "epoch": 8.005640157924422, + "grad_norm": 0.9411531090736389, + "learning_rate": 9.974619289340102e-06, + "loss": 0.6112, + "step": 14194 + }, + { + "epoch": 8.006204173716863, + "grad_norm": 0.9809553623199463, + "learning_rate": 9.97179921037789e-06, + "loss": 0.7756, + "step": 14195 + }, + { + "epoch": 8.006768189509307, + "grad_norm": 0.9183862209320068, + "learning_rate": 9.968979131415681e-06, + "loss": 0.6834, + "step": 14196 + }, + { + "epoch": 8.007332205301749, + "grad_norm": 0.9766194224357605, + "learning_rate": 9.96615905245347e-06, + "loss": 0.7934, + "step": 14197 + }, + { + "epoch": 8.00789622109419, + "grad_norm": 1.0992504358291626, + "learning_rate": 9.963338973491259e-06, + "loss": 0.6858, + "step": 14198 + }, + { + "epoch": 8.008460236886632, + "grad_norm": 1.3304443359375, + "learning_rate": 9.960518894529047e-06, + "loss": 0.7685, + "step": 14199 + }, + { + "epoch": 8.009024252679074, + "grad_norm": 1.0141288042068481, + "learning_rate": 9.957698815566836e-06, + "loss": 0.6225, + "step": 14200 + }, + { + "epoch": 8.009588268471518, + "grad_norm": 0.9281772971153259, + "learning_rate": 9.954878736604625e-06, + "loss": 0.6841, + "step": 14201 + }, + { + "epoch": 8.01015228426396, + "grad_norm": 1.1707488298416138, + "learning_rate": 9.952058657642414e-06, + "loss": 0.8062, + "step": 14202 + }, + { + "epoch": 8.010716300056401, + "grad_norm": 1.6118992567062378, + "learning_rate": 9.949238578680203e-06, + "loss": 0.8043, + "step": 14203 + }, + { + "epoch": 8.011280315848843, + "grad_norm": 0.8615686297416687, + "learning_rate": 9.946418499717993e-06, + "loss": 0.7051, + "step": 14204 + }, + { + "epoch": 8.011844331641287, + "grad_norm": 1.333656907081604, + "learning_rate": 9.943598420755782e-06, + "loss": 0.7652, + "step": 14205 + }, + { + "epoch": 8.012408347433729, + "grad_norm": 0.9604176878929138, + "learning_rate": 9.94077834179357e-06, + "loss": 0.7123, + "step": 14206 + }, + { + "epoch": 8.01297236322617, + "grad_norm": 1.1237238645553589, + "learning_rate": 9.93795826283136e-06, + "loss": 0.7313, + "step": 14207 + }, + { + "epoch": 8.013536379018612, + "grad_norm": 1.4710065126419067, + "learning_rate": 9.935138183869148e-06, + "loss": 0.7343, + "step": 14208 + }, + { + "epoch": 8.014100394811054, + "grad_norm": 1.2038780450820923, + "learning_rate": 9.932318104906937e-06, + "loss": 0.7795, + "step": 14209 + }, + { + "epoch": 8.014664410603498, + "grad_norm": 1.2187583446502686, + "learning_rate": 9.929498025944726e-06, + "loss": 0.7738, + "step": 14210 + }, + { + "epoch": 8.01522842639594, + "grad_norm": 1.0734070539474487, + "learning_rate": 9.926677946982516e-06, + "loss": 0.7624, + "step": 14211 + }, + { + "epoch": 8.015792442188381, + "grad_norm": 1.0778182744979858, + "learning_rate": 9.923857868020305e-06, + "loss": 0.7303, + "step": 14212 + }, + { + "epoch": 8.016356457980823, + "grad_norm": 1.492691159248352, + "learning_rate": 9.921037789058094e-06, + "loss": 0.798, + "step": 14213 + }, + { + "epoch": 8.016920473773265, + "grad_norm": 1.2207119464874268, + "learning_rate": 9.918217710095883e-06, + "loss": 0.7286, + "step": 14214 + }, + { + "epoch": 8.017484489565708, + "grad_norm": 0.9925811290740967, + "learning_rate": 9.915397631133671e-06, + "loss": 0.7119, + "step": 14215 + }, + { + "epoch": 8.01804850535815, + "grad_norm": 0.9203447103500366, + "learning_rate": 9.91257755217146e-06, + "loss": 0.683, + "step": 14216 + }, + { + "epoch": 8.018612521150592, + "grad_norm": 0.8461692333221436, + "learning_rate": 9.90975747320925e-06, + "loss": 0.6477, + "step": 14217 + }, + { + "epoch": 8.019176536943034, + "grad_norm": 1.0888551473617554, + "learning_rate": 9.90693739424704e-06, + "loss": 0.7361, + "step": 14218 + }, + { + "epoch": 8.019740552735477, + "grad_norm": 1.1025842428207397, + "learning_rate": 9.904117315284828e-06, + "loss": 0.7979, + "step": 14219 + }, + { + "epoch": 8.02030456852792, + "grad_norm": 1.0340757369995117, + "learning_rate": 9.901297236322619e-06, + "loss": 0.8022, + "step": 14220 + }, + { + "epoch": 8.020868584320361, + "grad_norm": 1.257277250289917, + "learning_rate": 9.898477157360408e-06, + "loss": 0.8389, + "step": 14221 + }, + { + "epoch": 8.021432600112803, + "grad_norm": 1.167569637298584, + "learning_rate": 9.895657078398196e-06, + "loss": 0.7389, + "step": 14222 + }, + { + "epoch": 8.021996615905245, + "grad_norm": 1.2085115909576416, + "learning_rate": 9.892836999435985e-06, + "loss": 0.792, + "step": 14223 + }, + { + "epoch": 8.022560631697688, + "grad_norm": 0.9034778475761414, + "learning_rate": 9.890016920473774e-06, + "loss": 0.6641, + "step": 14224 + }, + { + "epoch": 8.02312464749013, + "grad_norm": 1.2006407976150513, + "learning_rate": 9.887196841511563e-06, + "loss": 0.8154, + "step": 14225 + }, + { + "epoch": 8.023688663282572, + "grad_norm": 0.9659071564674377, + "learning_rate": 9.884376762549352e-06, + "loss": 0.7115, + "step": 14226 + }, + { + "epoch": 8.024252679075014, + "grad_norm": 1.131486177444458, + "learning_rate": 9.88155668358714e-06, + "loss": 0.6612, + "step": 14227 + }, + { + "epoch": 8.024816694867456, + "grad_norm": 1.0551031827926636, + "learning_rate": 9.878736604624931e-06, + "loss": 0.7538, + "step": 14228 + }, + { + "epoch": 8.025380710659899, + "grad_norm": 0.877170979976654, + "learning_rate": 9.87591652566272e-06, + "loss": 0.6175, + "step": 14229 + }, + { + "epoch": 8.025944726452341, + "grad_norm": 1.1868903636932373, + "learning_rate": 9.873096446700509e-06, + "loss": 0.6472, + "step": 14230 + }, + { + "epoch": 8.026508742244783, + "grad_norm": 0.8767823576927185, + "learning_rate": 9.870276367738297e-06, + "loss": 0.7034, + "step": 14231 + }, + { + "epoch": 8.027072758037225, + "grad_norm": 1.408550500869751, + "learning_rate": 9.867456288776086e-06, + "loss": 0.7739, + "step": 14232 + }, + { + "epoch": 8.027636773829668, + "grad_norm": 0.9479654431343079, + "learning_rate": 9.864636209813875e-06, + "loss": 0.719, + "step": 14233 + }, + { + "epoch": 8.02820078962211, + "grad_norm": 1.1178020238876343, + "learning_rate": 9.861816130851664e-06, + "loss": 0.711, + "step": 14234 + }, + { + "epoch": 8.028764805414552, + "grad_norm": 1.4133059978485107, + "learning_rate": 9.858996051889454e-06, + "loss": 0.7277, + "step": 14235 + }, + { + "epoch": 8.029328821206994, + "grad_norm": 1.5595406293869019, + "learning_rate": 9.856175972927243e-06, + "loss": 0.8395, + "step": 14236 + }, + { + "epoch": 8.029892836999435, + "grad_norm": 0.9470071196556091, + "learning_rate": 9.853355893965032e-06, + "loss": 0.8438, + "step": 14237 + }, + { + "epoch": 8.030456852791879, + "grad_norm": 1.3055188655853271, + "learning_rate": 9.85053581500282e-06, + "loss": 0.6517, + "step": 14238 + }, + { + "epoch": 8.03102086858432, + "grad_norm": 1.0245940685272217, + "learning_rate": 9.84771573604061e-06, + "loss": 0.8064, + "step": 14239 + }, + { + "epoch": 8.031584884376763, + "grad_norm": 1.0706340074539185, + "learning_rate": 9.844895657078398e-06, + "loss": 0.7413, + "step": 14240 + }, + { + "epoch": 8.032148900169204, + "grad_norm": 1.0895885229110718, + "learning_rate": 9.842075578116187e-06, + "loss": 0.724, + "step": 14241 + }, + { + "epoch": 8.032712915961646, + "grad_norm": 0.75787752866745, + "learning_rate": 9.839255499153976e-06, + "loss": 0.6783, + "step": 14242 + }, + { + "epoch": 8.03327693175409, + "grad_norm": 0.9189192056655884, + "learning_rate": 9.836435420191766e-06, + "loss": 0.6822, + "step": 14243 + }, + { + "epoch": 8.033840947546532, + "grad_norm": 1.0682368278503418, + "learning_rate": 9.833615341229555e-06, + "loss": 0.7209, + "step": 14244 + }, + { + "epoch": 8.034404963338973, + "grad_norm": 1.598212480545044, + "learning_rate": 9.830795262267344e-06, + "loss": 0.8741, + "step": 14245 + }, + { + "epoch": 8.034968979131415, + "grad_norm": 0.9949703812599182, + "learning_rate": 9.827975183305133e-06, + "loss": 0.804, + "step": 14246 + }, + { + "epoch": 8.035532994923859, + "grad_norm": 1.0432226657867432, + "learning_rate": 9.825155104342921e-06, + "loss": 0.7197, + "step": 14247 + }, + { + "epoch": 8.0360970107163, + "grad_norm": 1.1949131488800049, + "learning_rate": 9.82233502538071e-06, + "loss": 0.6469, + "step": 14248 + }, + { + "epoch": 8.036661026508742, + "grad_norm": 1.3638182878494263, + "learning_rate": 9.819514946418499e-06, + "loss": 0.7415, + "step": 14249 + }, + { + "epoch": 8.037225042301184, + "grad_norm": 1.498835563659668, + "learning_rate": 9.816694867456288e-06, + "loss": 0.7757, + "step": 14250 + }, + { + "epoch": 8.037789058093626, + "grad_norm": 1.0751245021820068, + "learning_rate": 9.813874788494078e-06, + "loss": 0.7134, + "step": 14251 + }, + { + "epoch": 8.03835307388607, + "grad_norm": 0.9840787053108215, + "learning_rate": 9.811054709531867e-06, + "loss": 0.7208, + "step": 14252 + }, + { + "epoch": 8.038917089678511, + "grad_norm": 1.2427408695220947, + "learning_rate": 9.808234630569658e-06, + "loss": 0.6288, + "step": 14253 + }, + { + "epoch": 8.039481105470953, + "grad_norm": 0.9400123357772827, + "learning_rate": 9.805414551607446e-06, + "loss": 0.7552, + "step": 14254 + }, + { + "epoch": 8.040045121263395, + "grad_norm": 0.9896731972694397, + "learning_rate": 9.802594472645235e-06, + "loss": 0.6655, + "step": 14255 + }, + { + "epoch": 8.040609137055837, + "grad_norm": 1.2586594820022583, + "learning_rate": 9.799774393683024e-06, + "loss": 0.7526, + "step": 14256 + }, + { + "epoch": 8.04117315284828, + "grad_norm": 1.3734976053237915, + "learning_rate": 9.796954314720813e-06, + "loss": 0.8185, + "step": 14257 + }, + { + "epoch": 8.041737168640722, + "grad_norm": 1.0637168884277344, + "learning_rate": 9.794134235758602e-06, + "loss": 0.7599, + "step": 14258 + }, + { + "epoch": 8.042301184433164, + "grad_norm": 1.294805645942688, + "learning_rate": 9.79131415679639e-06, + "loss": 0.7195, + "step": 14259 + }, + { + "epoch": 8.042865200225606, + "grad_norm": 0.8425814509391785, + "learning_rate": 9.788494077834181e-06, + "loss": 0.7652, + "step": 14260 + }, + { + "epoch": 8.04342921601805, + "grad_norm": 1.6442673206329346, + "learning_rate": 9.78567399887197e-06, + "loss": 0.7816, + "step": 14261 + }, + { + "epoch": 8.043993231810491, + "grad_norm": 1.4796062707901, + "learning_rate": 9.782853919909758e-06, + "loss": 0.8027, + "step": 14262 + }, + { + "epoch": 8.044557247602933, + "grad_norm": 0.9141642451286316, + "learning_rate": 9.780033840947547e-06, + "loss": 0.6523, + "step": 14263 + }, + { + "epoch": 8.045121263395375, + "grad_norm": 1.1376142501831055, + "learning_rate": 9.777213761985336e-06, + "loss": 0.7897, + "step": 14264 + }, + { + "epoch": 8.045685279187817, + "grad_norm": 1.1726487874984741, + "learning_rate": 9.774393683023125e-06, + "loss": 0.692, + "step": 14265 + }, + { + "epoch": 8.04624929498026, + "grad_norm": 0.9822580814361572, + "learning_rate": 9.771573604060914e-06, + "loss": 0.6653, + "step": 14266 + }, + { + "epoch": 8.046813310772702, + "grad_norm": 1.0042914152145386, + "learning_rate": 9.768753525098704e-06, + "loss": 0.6674, + "step": 14267 + }, + { + "epoch": 8.047377326565144, + "grad_norm": 1.0660508871078491, + "learning_rate": 9.765933446136493e-06, + "loss": 0.6852, + "step": 14268 + }, + { + "epoch": 8.047941342357586, + "grad_norm": 1.4473799467086792, + "learning_rate": 9.763113367174282e-06, + "loss": 0.8389, + "step": 14269 + }, + { + "epoch": 8.048505358150027, + "grad_norm": 1.2326825857162476, + "learning_rate": 9.76029328821207e-06, + "loss": 0.7605, + "step": 14270 + }, + { + "epoch": 8.049069373942471, + "grad_norm": 1.0013097524642944, + "learning_rate": 9.75747320924986e-06, + "loss": 0.7761, + "step": 14271 + }, + { + "epoch": 8.049633389734913, + "grad_norm": 1.2447909116744995, + "learning_rate": 9.754653130287648e-06, + "loss": 0.7139, + "step": 14272 + }, + { + "epoch": 8.050197405527355, + "grad_norm": 1.1429544687271118, + "learning_rate": 9.751833051325437e-06, + "loss": 0.8395, + "step": 14273 + }, + { + "epoch": 8.050761421319796, + "grad_norm": 1.2514228820800781, + "learning_rate": 9.749012972363226e-06, + "loss": 0.7305, + "step": 14274 + }, + { + "epoch": 8.05132543711224, + "grad_norm": 1.1992865800857544, + "learning_rate": 9.746192893401016e-06, + "loss": 0.6998, + "step": 14275 + }, + { + "epoch": 8.051889452904682, + "grad_norm": 1.2087342739105225, + "learning_rate": 9.743372814438805e-06, + "loss": 0.6836, + "step": 14276 + }, + { + "epoch": 8.052453468697124, + "grad_norm": 1.3555033206939697, + "learning_rate": 9.740552735476594e-06, + "loss": 0.7902, + "step": 14277 + }, + { + "epoch": 8.053017484489565, + "grad_norm": 1.0117624998092651, + "learning_rate": 9.737732656514383e-06, + "loss": 0.7297, + "step": 14278 + }, + { + "epoch": 8.053581500282007, + "grad_norm": 0.8910059928894043, + "learning_rate": 9.734912577552171e-06, + "loss": 0.6295, + "step": 14279 + }, + { + "epoch": 8.05414551607445, + "grad_norm": 1.0086843967437744, + "learning_rate": 9.73209249858996e-06, + "loss": 0.6755, + "step": 14280 + }, + { + "epoch": 8.054709531866893, + "grad_norm": 1.1306536197662354, + "learning_rate": 9.729272419627749e-06, + "loss": 0.6952, + "step": 14281 + }, + { + "epoch": 8.055273547659334, + "grad_norm": 1.1001386642456055, + "learning_rate": 9.726452340665538e-06, + "loss": 0.6495, + "step": 14282 + }, + { + "epoch": 8.055837563451776, + "grad_norm": 1.294746994972229, + "learning_rate": 9.723632261703328e-06, + "loss": 0.7702, + "step": 14283 + }, + { + "epoch": 8.056401579244218, + "grad_norm": 1.077854037284851, + "learning_rate": 9.720812182741117e-06, + "loss": 0.6913, + "step": 14284 + }, + { + "epoch": 8.056965595036662, + "grad_norm": 1.0297675132751465, + "learning_rate": 9.717992103778906e-06, + "loss": 0.7096, + "step": 14285 + }, + { + "epoch": 8.057529610829103, + "grad_norm": 1.0027406215667725, + "learning_rate": 9.715172024816695e-06, + "loss": 0.7148, + "step": 14286 + }, + { + "epoch": 8.058093626621545, + "grad_norm": 0.9303333759307861, + "learning_rate": 9.712351945854483e-06, + "loss": 0.6604, + "step": 14287 + }, + { + "epoch": 8.058657642413987, + "grad_norm": 1.3499675989151, + "learning_rate": 9.709531866892274e-06, + "loss": 0.7419, + "step": 14288 + }, + { + "epoch": 8.05922165820643, + "grad_norm": 1.516823410987854, + "learning_rate": 9.706711787930063e-06, + "loss": 0.6991, + "step": 14289 + }, + { + "epoch": 8.059785673998872, + "grad_norm": 1.2625535726547241, + "learning_rate": 9.703891708967852e-06, + "loss": 0.7167, + "step": 14290 + }, + { + "epoch": 8.060349689791314, + "grad_norm": 1.244421124458313, + "learning_rate": 9.70107163000564e-06, + "loss": 0.7279, + "step": 14291 + }, + { + "epoch": 8.060913705583756, + "grad_norm": 1.2037662267684937, + "learning_rate": 9.69825155104343e-06, + "loss": 0.6971, + "step": 14292 + }, + { + "epoch": 8.061477721376198, + "grad_norm": 1.3234225511550903, + "learning_rate": 9.69543147208122e-06, + "loss": 0.7486, + "step": 14293 + }, + { + "epoch": 8.062041737168641, + "grad_norm": 1.3814738988876343, + "learning_rate": 9.692611393119008e-06, + "loss": 0.7452, + "step": 14294 + }, + { + "epoch": 8.062605752961083, + "grad_norm": 0.9230150580406189, + "learning_rate": 9.689791314156797e-06, + "loss": 0.7379, + "step": 14295 + }, + { + "epoch": 8.063169768753525, + "grad_norm": 1.1742905378341675, + "learning_rate": 9.686971235194586e-06, + "loss": 0.6452, + "step": 14296 + }, + { + "epoch": 8.063733784545967, + "grad_norm": 1.0717554092407227, + "learning_rate": 9.684151156232375e-06, + "loss": 0.7781, + "step": 14297 + }, + { + "epoch": 8.064297800338409, + "grad_norm": 1.3770813941955566, + "learning_rate": 9.681331077270164e-06, + "loss": 0.8037, + "step": 14298 + }, + { + "epoch": 8.064861816130852, + "grad_norm": 1.4909679889678955, + "learning_rate": 9.678510998307954e-06, + "loss": 0.7824, + "step": 14299 + }, + { + "epoch": 8.065425831923294, + "grad_norm": 0.897986888885498, + "learning_rate": 9.675690919345743e-06, + "loss": 0.7054, + "step": 14300 + }, + { + "epoch": 8.065989847715736, + "grad_norm": 1.399709701538086, + "learning_rate": 9.672870840383532e-06, + "loss": 0.7396, + "step": 14301 + }, + { + "epoch": 8.066553863508178, + "grad_norm": 1.0376423597335815, + "learning_rate": 9.67005076142132e-06, + "loss": 0.7199, + "step": 14302 + }, + { + "epoch": 8.067117879300621, + "grad_norm": 1.0718512535095215, + "learning_rate": 9.66723068245911e-06, + "loss": 0.6513, + "step": 14303 + }, + { + "epoch": 8.067681895093063, + "grad_norm": 1.0383026599884033, + "learning_rate": 9.664410603496898e-06, + "loss": 0.6443, + "step": 14304 + }, + { + "epoch": 8.068245910885505, + "grad_norm": 1.1672409772872925, + "learning_rate": 9.661590524534687e-06, + "loss": 0.7915, + "step": 14305 + }, + { + "epoch": 8.068809926677947, + "grad_norm": 1.5278390645980835, + "learning_rate": 9.658770445572476e-06, + "loss": 0.7667, + "step": 14306 + }, + { + "epoch": 8.069373942470389, + "grad_norm": 1.22074294090271, + "learning_rate": 9.655950366610266e-06, + "loss": 0.7056, + "step": 14307 + }, + { + "epoch": 8.069937958262832, + "grad_norm": 1.3091906309127808, + "learning_rate": 9.653130287648055e-06, + "loss": 0.9331, + "step": 14308 + }, + { + "epoch": 8.070501974055274, + "grad_norm": 0.8601928353309631, + "learning_rate": 9.650310208685844e-06, + "loss": 0.7054, + "step": 14309 + }, + { + "epoch": 8.071065989847716, + "grad_norm": 1.2680549621582031, + "learning_rate": 9.647490129723633e-06, + "loss": 0.7645, + "step": 14310 + }, + { + "epoch": 8.071630005640158, + "grad_norm": 1.3408596515655518, + "learning_rate": 9.644670050761421e-06, + "loss": 0.7354, + "step": 14311 + }, + { + "epoch": 8.0721940214326, + "grad_norm": 1.010263442993164, + "learning_rate": 9.64184997179921e-06, + "loss": 0.6781, + "step": 14312 + }, + { + "epoch": 8.072758037225043, + "grad_norm": 1.5333690643310547, + "learning_rate": 9.639029892836999e-06, + "loss": 0.6618, + "step": 14313 + }, + { + "epoch": 8.073322053017485, + "grad_norm": 1.4586461782455444, + "learning_rate": 9.636209813874788e-06, + "loss": 0.747, + "step": 14314 + }, + { + "epoch": 8.073886068809927, + "grad_norm": 0.9446694850921631, + "learning_rate": 9.633389734912578e-06, + "loss": 0.7067, + "step": 14315 + }, + { + "epoch": 8.074450084602368, + "grad_norm": 1.2924498319625854, + "learning_rate": 9.630569655950367e-06, + "loss": 0.8213, + "step": 14316 + }, + { + "epoch": 8.075014100394812, + "grad_norm": 1.2035373449325562, + "learning_rate": 9.627749576988156e-06, + "loss": 0.7611, + "step": 14317 + }, + { + "epoch": 8.075578116187254, + "grad_norm": 1.2503184080123901, + "learning_rate": 9.624929498025945e-06, + "loss": 0.7009, + "step": 14318 + }, + { + "epoch": 8.076142131979696, + "grad_norm": 0.9528659582138062, + "learning_rate": 9.622109419063733e-06, + "loss": 0.7147, + "step": 14319 + }, + { + "epoch": 8.076706147772137, + "grad_norm": 1.0995482206344604, + "learning_rate": 9.619289340101522e-06, + "loss": 0.7458, + "step": 14320 + }, + { + "epoch": 8.07727016356458, + "grad_norm": 1.1991617679595947, + "learning_rate": 9.616469261139311e-06, + "loss": 0.6913, + "step": 14321 + }, + { + "epoch": 8.077834179357023, + "grad_norm": 0.8516755104064941, + "learning_rate": 9.613649182177102e-06, + "loss": 0.7153, + "step": 14322 + }, + { + "epoch": 8.078398195149465, + "grad_norm": 0.9035592079162598, + "learning_rate": 9.61082910321489e-06, + "loss": 0.6252, + "step": 14323 + }, + { + "epoch": 8.078962210941906, + "grad_norm": 0.8962945342063904, + "learning_rate": 9.60800902425268e-06, + "loss": 0.6455, + "step": 14324 + }, + { + "epoch": 8.079526226734348, + "grad_norm": 1.288313388824463, + "learning_rate": 9.60518894529047e-06, + "loss": 0.6649, + "step": 14325 + }, + { + "epoch": 8.08009024252679, + "grad_norm": 0.8425770401954651, + "learning_rate": 9.602368866328258e-06, + "loss": 0.6391, + "step": 14326 + }, + { + "epoch": 8.080654258319234, + "grad_norm": 1.300345778465271, + "learning_rate": 9.599548787366047e-06, + "loss": 0.7621, + "step": 14327 + }, + { + "epoch": 8.081218274111675, + "grad_norm": 0.9145359992980957, + "learning_rate": 9.596728708403836e-06, + "loss": 0.7279, + "step": 14328 + }, + { + "epoch": 8.081782289904117, + "grad_norm": 1.2132513523101807, + "learning_rate": 9.593908629441625e-06, + "loss": 0.7532, + "step": 14329 + }, + { + "epoch": 8.082346305696559, + "grad_norm": 1.2346423864364624, + "learning_rate": 9.591088550479414e-06, + "loss": 0.7363, + "step": 14330 + }, + { + "epoch": 8.082910321489003, + "grad_norm": 1.450692892074585, + "learning_rate": 9.588268471517204e-06, + "loss": 0.7144, + "step": 14331 + }, + { + "epoch": 8.083474337281444, + "grad_norm": 1.2910199165344238, + "learning_rate": 9.585448392554993e-06, + "loss": 0.8297, + "step": 14332 + }, + { + "epoch": 8.084038353073886, + "grad_norm": 0.9999661445617676, + "learning_rate": 9.582628313592782e-06, + "loss": 0.6282, + "step": 14333 + }, + { + "epoch": 8.084602368866328, + "grad_norm": 1.3477510213851929, + "learning_rate": 9.57980823463057e-06, + "loss": 0.7479, + "step": 14334 + }, + { + "epoch": 8.08516638465877, + "grad_norm": 1.1576876640319824, + "learning_rate": 9.57698815566836e-06, + "loss": 0.8246, + "step": 14335 + }, + { + "epoch": 8.085730400451213, + "grad_norm": 1.064862608909607, + "learning_rate": 9.574168076706148e-06, + "loss": 0.7218, + "step": 14336 + }, + { + "epoch": 8.086294416243655, + "grad_norm": 1.060806393623352, + "learning_rate": 9.571347997743937e-06, + "loss": 0.6458, + "step": 14337 + }, + { + "epoch": 8.086858432036097, + "grad_norm": 0.9516388773918152, + "learning_rate": 9.568527918781726e-06, + "loss": 0.7104, + "step": 14338 + }, + { + "epoch": 8.087422447828539, + "grad_norm": 1.0577778816223145, + "learning_rate": 9.565707839819516e-06, + "loss": 0.7499, + "step": 14339 + }, + { + "epoch": 8.08798646362098, + "grad_norm": 1.205278754234314, + "learning_rate": 9.562887760857305e-06, + "loss": 0.695, + "step": 14340 + }, + { + "epoch": 8.088550479413424, + "grad_norm": 0.8861058354377747, + "learning_rate": 9.560067681895094e-06, + "loss": 0.6324, + "step": 14341 + }, + { + "epoch": 8.089114495205866, + "grad_norm": 1.3115142583847046, + "learning_rate": 9.557247602932883e-06, + "loss": 0.7649, + "step": 14342 + }, + { + "epoch": 8.089678510998308, + "grad_norm": 0.9126742482185364, + "learning_rate": 9.554427523970671e-06, + "loss": 0.593, + "step": 14343 + }, + { + "epoch": 8.09024252679075, + "grad_norm": 1.2711505889892578, + "learning_rate": 9.55160744500846e-06, + "loss": 0.7723, + "step": 14344 + }, + { + "epoch": 8.090806542583193, + "grad_norm": 1.3119615316390991, + "learning_rate": 9.548787366046249e-06, + "loss": 0.684, + "step": 14345 + }, + { + "epoch": 8.091370558375635, + "grad_norm": 1.4463781118392944, + "learning_rate": 9.54596728708404e-06, + "loss": 0.81, + "step": 14346 + }, + { + "epoch": 8.091934574168077, + "grad_norm": 0.9824456572532654, + "learning_rate": 9.543147208121828e-06, + "loss": 0.6927, + "step": 14347 + }, + { + "epoch": 8.092498589960519, + "grad_norm": 1.256579041481018, + "learning_rate": 9.540327129159617e-06, + "loss": 0.6752, + "step": 14348 + }, + { + "epoch": 8.09306260575296, + "grad_norm": 1.473055362701416, + "learning_rate": 9.537507050197406e-06, + "loss": 0.8071, + "step": 14349 + }, + { + "epoch": 8.093626621545404, + "grad_norm": 1.2597378492355347, + "learning_rate": 9.534686971235195e-06, + "loss": 0.8589, + "step": 14350 + }, + { + "epoch": 8.094190637337846, + "grad_norm": 1.1515523195266724, + "learning_rate": 9.531866892272983e-06, + "loss": 0.8082, + "step": 14351 + }, + { + "epoch": 8.094754653130288, + "grad_norm": 1.1346064805984497, + "learning_rate": 9.529046813310772e-06, + "loss": 0.7497, + "step": 14352 + }, + { + "epoch": 8.09531866892273, + "grad_norm": 1.0324907302856445, + "learning_rate": 9.526226734348561e-06, + "loss": 0.6073, + "step": 14353 + }, + { + "epoch": 8.095882684715171, + "grad_norm": 1.3812110424041748, + "learning_rate": 9.523406655386351e-06, + "loss": 0.7082, + "step": 14354 + }, + { + "epoch": 8.096446700507615, + "grad_norm": 1.1086400747299194, + "learning_rate": 9.52058657642414e-06, + "loss": 0.6462, + "step": 14355 + }, + { + "epoch": 8.097010716300057, + "grad_norm": 1.2850251197814941, + "learning_rate": 9.517766497461929e-06, + "loss": 0.7743, + "step": 14356 + }, + { + "epoch": 8.097574732092498, + "grad_norm": 1.338844656944275, + "learning_rate": 9.514946418499718e-06, + "loss": 0.6438, + "step": 14357 + }, + { + "epoch": 8.09813874788494, + "grad_norm": 1.2572797536849976, + "learning_rate": 9.512126339537507e-06, + "loss": 0.7156, + "step": 14358 + }, + { + "epoch": 8.098702763677384, + "grad_norm": 0.9918246269226074, + "learning_rate": 9.509306260575297e-06, + "loss": 0.7652, + "step": 14359 + }, + { + "epoch": 8.099266779469826, + "grad_norm": 1.279115915298462, + "learning_rate": 9.506486181613086e-06, + "loss": 0.7274, + "step": 14360 + }, + { + "epoch": 8.099830795262267, + "grad_norm": 1.2397836446762085, + "learning_rate": 9.503666102650875e-06, + "loss": 0.8344, + "step": 14361 + }, + { + "epoch": 8.10039481105471, + "grad_norm": 1.1292253732681274, + "learning_rate": 9.500846023688664e-06, + "loss": 0.6829, + "step": 14362 + }, + { + "epoch": 8.100958826847151, + "grad_norm": 1.149261236190796, + "learning_rate": 9.498025944726454e-06, + "loss": 0.7633, + "step": 14363 + }, + { + "epoch": 8.101522842639595, + "grad_norm": 0.8696141242980957, + "learning_rate": 9.495205865764243e-06, + "loss": 0.7248, + "step": 14364 + }, + { + "epoch": 8.102086858432036, + "grad_norm": 1.2767157554626465, + "learning_rate": 9.492385786802032e-06, + "loss": 0.7929, + "step": 14365 + }, + { + "epoch": 8.102650874224478, + "grad_norm": 0.9176170229911804, + "learning_rate": 9.48956570783982e-06, + "loss": 0.6804, + "step": 14366 + }, + { + "epoch": 8.10321489001692, + "grad_norm": 1.2085245847702026, + "learning_rate": 9.48674562887761e-06, + "loss": 0.7176, + "step": 14367 + }, + { + "epoch": 8.103778905809362, + "grad_norm": 1.1954805850982666, + "learning_rate": 9.483925549915398e-06, + "loss": 0.8025, + "step": 14368 + }, + { + "epoch": 8.104342921601805, + "grad_norm": 1.0798332691192627, + "learning_rate": 9.481105470953187e-06, + "loss": 0.7662, + "step": 14369 + }, + { + "epoch": 8.104906937394247, + "grad_norm": 1.3590821027755737, + "learning_rate": 9.478285391990976e-06, + "loss": 0.8453, + "step": 14370 + }, + { + "epoch": 8.105470953186689, + "grad_norm": 1.3411953449249268, + "learning_rate": 9.475465313028766e-06, + "loss": 0.7386, + "step": 14371 + }, + { + "epoch": 8.106034968979131, + "grad_norm": 1.0634633302688599, + "learning_rate": 9.472645234066555e-06, + "loss": 0.6939, + "step": 14372 + }, + { + "epoch": 8.106598984771574, + "grad_norm": 1.2620216608047485, + "learning_rate": 9.469825155104344e-06, + "loss": 0.7523, + "step": 14373 + }, + { + "epoch": 8.107163000564016, + "grad_norm": 1.495119571685791, + "learning_rate": 9.467005076142132e-06, + "loss": 0.8342, + "step": 14374 + }, + { + "epoch": 8.107727016356458, + "grad_norm": 1.0752618312835693, + "learning_rate": 9.464184997179921e-06, + "loss": 0.7422, + "step": 14375 + }, + { + "epoch": 8.1082910321489, + "grad_norm": 1.135589361190796, + "learning_rate": 9.46136491821771e-06, + "loss": 0.786, + "step": 14376 + }, + { + "epoch": 8.108855047941342, + "grad_norm": 0.8733665943145752, + "learning_rate": 9.458544839255499e-06, + "loss": 0.6575, + "step": 14377 + }, + { + "epoch": 8.109419063733785, + "grad_norm": 1.1183604001998901, + "learning_rate": 9.45572476029329e-06, + "loss": 0.7697, + "step": 14378 + }, + { + "epoch": 8.109983079526227, + "grad_norm": 1.276534914970398, + "learning_rate": 9.452904681331078e-06, + "loss": 0.743, + "step": 14379 + }, + { + "epoch": 8.110547095318669, + "grad_norm": 1.065873384475708, + "learning_rate": 9.450084602368867e-06, + "loss": 0.7475, + "step": 14380 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 1.7156435251235962, + "learning_rate": 9.447264523406656e-06, + "loss": 0.8953, + "step": 14381 + }, + { + "epoch": 8.111675126903553, + "grad_norm": 2.14090633392334, + "learning_rate": 9.444444444444445e-06, + "loss": 0.8615, + "step": 14382 + }, + { + "epoch": 8.112239142695996, + "grad_norm": 1.1534411907196045, + "learning_rate": 9.441624365482233e-06, + "loss": 0.6923, + "step": 14383 + }, + { + "epoch": 8.112803158488438, + "grad_norm": 0.9985365867614746, + "learning_rate": 9.438804286520022e-06, + "loss": 0.6969, + "step": 14384 + }, + { + "epoch": 8.11336717428088, + "grad_norm": 1.0222938060760498, + "learning_rate": 9.435984207557811e-06, + "loss": 0.6645, + "step": 14385 + }, + { + "epoch": 8.113931190073322, + "grad_norm": 1.2139062881469727, + "learning_rate": 9.433164128595601e-06, + "loss": 0.8269, + "step": 14386 + }, + { + "epoch": 8.114495205865765, + "grad_norm": 1.0066494941711426, + "learning_rate": 9.43034404963339e-06, + "loss": 0.6969, + "step": 14387 + }, + { + "epoch": 8.115059221658207, + "grad_norm": 1.0195627212524414, + "learning_rate": 9.427523970671179e-06, + "loss": 0.7291, + "step": 14388 + }, + { + "epoch": 8.115623237450649, + "grad_norm": 1.3565921783447266, + "learning_rate": 9.424703891708968e-06, + "loss": 0.695, + "step": 14389 + }, + { + "epoch": 8.11618725324309, + "grad_norm": 3.1826541423797607, + "learning_rate": 9.421883812746757e-06, + "loss": 0.8385, + "step": 14390 + }, + { + "epoch": 8.116751269035532, + "grad_norm": 1.2576913833618164, + "learning_rate": 9.419063733784545e-06, + "loss": 0.7202, + "step": 14391 + }, + { + "epoch": 8.117315284827976, + "grad_norm": 1.135554313659668, + "learning_rate": 9.416243654822334e-06, + "loss": 0.7108, + "step": 14392 + }, + { + "epoch": 8.117879300620418, + "grad_norm": 0.7940707802772522, + "learning_rate": 9.413423575860123e-06, + "loss": 0.6593, + "step": 14393 + }, + { + "epoch": 8.11844331641286, + "grad_norm": 1.451002597808838, + "learning_rate": 9.410603496897914e-06, + "loss": 0.8553, + "step": 14394 + }, + { + "epoch": 8.119007332205301, + "grad_norm": 1.0415593385696411, + "learning_rate": 9.407783417935704e-06, + "loss": 0.7946, + "step": 14395 + }, + { + "epoch": 8.119571347997743, + "grad_norm": 0.787516713142395, + "learning_rate": 9.404963338973493e-06, + "loss": 0.6436, + "step": 14396 + }, + { + "epoch": 8.120135363790187, + "grad_norm": 1.1227158308029175, + "learning_rate": 9.402143260011282e-06, + "loss": 0.7534, + "step": 14397 + }, + { + "epoch": 8.120699379582629, + "grad_norm": 1.253782033920288, + "learning_rate": 9.39932318104907e-06, + "loss": 0.7526, + "step": 14398 + }, + { + "epoch": 8.12126339537507, + "grad_norm": 0.9269724488258362, + "learning_rate": 9.39650310208686e-06, + "loss": 0.7213, + "step": 14399 + }, + { + "epoch": 8.121827411167512, + "grad_norm": 1.040869116783142, + "learning_rate": 9.393683023124648e-06, + "loss": 0.6111, + "step": 14400 + }, + { + "epoch": 8.122391426959956, + "grad_norm": 0.8837354183197021, + "learning_rate": 9.390862944162437e-06, + "loss": 0.7374, + "step": 14401 + }, + { + "epoch": 8.122955442752398, + "grad_norm": 1.6767657995224, + "learning_rate": 9.388042865200226e-06, + "loss": 0.8885, + "step": 14402 + }, + { + "epoch": 8.12351945854484, + "grad_norm": 1.2452187538146973, + "learning_rate": 9.385222786238016e-06, + "loss": 0.8747, + "step": 14403 + }, + { + "epoch": 8.124083474337281, + "grad_norm": 0.8425039052963257, + "learning_rate": 9.382402707275805e-06, + "loss": 0.6729, + "step": 14404 + }, + { + "epoch": 8.124647490129723, + "grad_norm": 0.9656307101249695, + "learning_rate": 9.379582628313594e-06, + "loss": 0.6281, + "step": 14405 + }, + { + "epoch": 8.125211505922167, + "grad_norm": 0.9625075459480286, + "learning_rate": 9.376762549351382e-06, + "loss": 0.7355, + "step": 14406 + }, + { + "epoch": 8.125775521714608, + "grad_norm": 0.8466848731040955, + "learning_rate": 9.373942470389171e-06, + "loss": 0.6966, + "step": 14407 + }, + { + "epoch": 8.12633953750705, + "grad_norm": 0.9924237728118896, + "learning_rate": 9.37112239142696e-06, + "loss": 0.6593, + "step": 14408 + }, + { + "epoch": 8.126903553299492, + "grad_norm": 1.2866929769515991, + "learning_rate": 9.368302312464749e-06, + "loss": 0.8924, + "step": 14409 + }, + { + "epoch": 8.127467569091934, + "grad_norm": 1.4214226007461548, + "learning_rate": 9.36548223350254e-06, + "loss": 0.8344, + "step": 14410 + }, + { + "epoch": 8.128031584884377, + "grad_norm": 1.0224926471710205, + "learning_rate": 9.362662154540328e-06, + "loss": 0.634, + "step": 14411 + }, + { + "epoch": 8.12859560067682, + "grad_norm": 1.019377589225769, + "learning_rate": 9.359842075578117e-06, + "loss": 0.7125, + "step": 14412 + }, + { + "epoch": 8.129159616469261, + "grad_norm": 1.502671718597412, + "learning_rate": 9.357021996615906e-06, + "loss": 0.7393, + "step": 14413 + }, + { + "epoch": 8.129723632261703, + "grad_norm": 1.0600816011428833, + "learning_rate": 9.354201917653695e-06, + "loss": 0.6843, + "step": 14414 + }, + { + "epoch": 8.130287648054146, + "grad_norm": 1.2030692100524902, + "learning_rate": 9.351381838691483e-06, + "loss": 0.7028, + "step": 14415 + }, + { + "epoch": 8.130851663846588, + "grad_norm": 1.2480859756469727, + "learning_rate": 9.348561759729272e-06, + "loss": 0.7394, + "step": 14416 + }, + { + "epoch": 8.13141567963903, + "grad_norm": 1.0415593385696411, + "learning_rate": 9.345741680767061e-06, + "loss": 0.7992, + "step": 14417 + }, + { + "epoch": 8.131979695431472, + "grad_norm": 0.9489843845367432, + "learning_rate": 9.342921601804851e-06, + "loss": 0.757, + "step": 14418 + }, + { + "epoch": 8.132543711223914, + "grad_norm": 1.0723320245742798, + "learning_rate": 9.34010152284264e-06, + "loss": 0.7795, + "step": 14419 + }, + { + "epoch": 8.133107727016357, + "grad_norm": 0.8871840238571167, + "learning_rate": 9.337281443880429e-06, + "loss": 0.7472, + "step": 14420 + }, + { + "epoch": 8.133671742808799, + "grad_norm": 1.1173099279403687, + "learning_rate": 9.334461364918218e-06, + "loss": 0.7212, + "step": 14421 + }, + { + "epoch": 8.13423575860124, + "grad_norm": 1.03821861743927, + "learning_rate": 9.331641285956007e-06, + "loss": 0.6745, + "step": 14422 + }, + { + "epoch": 8.134799774393683, + "grad_norm": 1.0089356899261475, + "learning_rate": 9.328821206993795e-06, + "loss": 0.8063, + "step": 14423 + }, + { + "epoch": 8.135363790186124, + "grad_norm": 1.0772149562835693, + "learning_rate": 9.326001128031584e-06, + "loss": 0.6779, + "step": 14424 + }, + { + "epoch": 8.135927805978568, + "grad_norm": 1.3908389806747437, + "learning_rate": 9.323181049069375e-06, + "loss": 0.8339, + "step": 14425 + }, + { + "epoch": 8.13649182177101, + "grad_norm": 1.1820238828659058, + "learning_rate": 9.320360970107163e-06, + "loss": 0.6956, + "step": 14426 + }, + { + "epoch": 8.137055837563452, + "grad_norm": 1.4695907831192017, + "learning_rate": 9.317540891144952e-06, + "loss": 0.7881, + "step": 14427 + }, + { + "epoch": 8.137619853355893, + "grad_norm": 1.2730904817581177, + "learning_rate": 9.314720812182741e-06, + "loss": 0.6591, + "step": 14428 + }, + { + "epoch": 8.138183869148337, + "grad_norm": 1.2049039602279663, + "learning_rate": 9.311900733220532e-06, + "loss": 0.7806, + "step": 14429 + }, + { + "epoch": 8.138747884940779, + "grad_norm": 1.558215618133545, + "learning_rate": 9.30908065425832e-06, + "loss": 0.7511, + "step": 14430 + }, + { + "epoch": 8.13931190073322, + "grad_norm": 1.0332945585250854, + "learning_rate": 9.306260575296109e-06, + "loss": 0.7424, + "step": 14431 + }, + { + "epoch": 8.139875916525662, + "grad_norm": 1.236380696296692, + "learning_rate": 9.303440496333898e-06, + "loss": 0.834, + "step": 14432 + }, + { + "epoch": 8.140439932318104, + "grad_norm": 1.4139904975891113, + "learning_rate": 9.300620417371687e-06, + "loss": 0.79, + "step": 14433 + }, + { + "epoch": 8.141003948110548, + "grad_norm": 1.188970923423767, + "learning_rate": 9.297800338409477e-06, + "loss": 0.6993, + "step": 14434 + }, + { + "epoch": 8.14156796390299, + "grad_norm": 1.0486892461776733, + "learning_rate": 9.294980259447266e-06, + "loss": 0.7091, + "step": 14435 + }, + { + "epoch": 8.142131979695431, + "grad_norm": 1.1726360321044922, + "learning_rate": 9.292160180485055e-06, + "loss": 0.7334, + "step": 14436 + }, + { + "epoch": 8.142695995487873, + "grad_norm": 1.468489408493042, + "learning_rate": 9.289340101522844e-06, + "loss": 0.7534, + "step": 14437 + }, + { + "epoch": 8.143260011280315, + "grad_norm": 1.4113250970840454, + "learning_rate": 9.286520022560632e-06, + "loss": 0.7472, + "step": 14438 + }, + { + "epoch": 8.143824027072759, + "grad_norm": 1.057701826095581, + "learning_rate": 9.283699943598421e-06, + "loss": 0.692, + "step": 14439 + }, + { + "epoch": 8.1443880428652, + "grad_norm": 1.2179354429244995, + "learning_rate": 9.28087986463621e-06, + "loss": 0.7288, + "step": 14440 + }, + { + "epoch": 8.144952058657642, + "grad_norm": 0.7293260097503662, + "learning_rate": 9.278059785673999e-06, + "loss": 0.6078, + "step": 14441 + }, + { + "epoch": 8.145516074450084, + "grad_norm": 0.9962444305419922, + "learning_rate": 9.27523970671179e-06, + "loss": 0.6452, + "step": 14442 + }, + { + "epoch": 8.146080090242528, + "grad_norm": 1.5658854246139526, + "learning_rate": 9.272419627749578e-06, + "loss": 0.7642, + "step": 14443 + }, + { + "epoch": 8.14664410603497, + "grad_norm": 1.0189299583435059, + "learning_rate": 9.269599548787367e-06, + "loss": 0.6838, + "step": 14444 + }, + { + "epoch": 8.147208121827411, + "grad_norm": 0.8725425601005554, + "learning_rate": 9.266779469825156e-06, + "loss": 0.6113, + "step": 14445 + }, + { + "epoch": 8.147772137619853, + "grad_norm": 1.1360886096954346, + "learning_rate": 9.263959390862944e-06, + "loss": 0.7497, + "step": 14446 + }, + { + "epoch": 8.148336153412295, + "grad_norm": 1.4091243743896484, + "learning_rate": 9.261139311900733e-06, + "loss": 0.7457, + "step": 14447 + }, + { + "epoch": 8.148900169204738, + "grad_norm": 1.2588448524475098, + "learning_rate": 9.258319232938522e-06, + "loss": 0.7817, + "step": 14448 + }, + { + "epoch": 8.14946418499718, + "grad_norm": 1.1171753406524658, + "learning_rate": 9.255499153976311e-06, + "loss": 0.5967, + "step": 14449 + }, + { + "epoch": 8.150028200789622, + "grad_norm": 1.4519481658935547, + "learning_rate": 9.252679075014101e-06, + "loss": 0.7389, + "step": 14450 + }, + { + "epoch": 8.150592216582064, + "grad_norm": 0.983766496181488, + "learning_rate": 9.24985899605189e-06, + "loss": 0.7181, + "step": 14451 + }, + { + "epoch": 8.151156232374506, + "grad_norm": 0.9022543430328369, + "learning_rate": 9.247038917089679e-06, + "loss": 0.7402, + "step": 14452 + }, + { + "epoch": 8.15172024816695, + "grad_norm": 0.8045946359634399, + "learning_rate": 9.244218838127468e-06, + "loss": 0.6156, + "step": 14453 + }, + { + "epoch": 8.152284263959391, + "grad_norm": 1.1548348665237427, + "learning_rate": 9.241398759165257e-06, + "loss": 0.7, + "step": 14454 + }, + { + "epoch": 8.152848279751833, + "grad_norm": 1.0362070798873901, + "learning_rate": 9.238578680203045e-06, + "loss": 0.7353, + "step": 14455 + }, + { + "epoch": 8.153412295544275, + "grad_norm": 1.2222089767456055, + "learning_rate": 9.235758601240834e-06, + "loss": 0.7812, + "step": 14456 + }, + { + "epoch": 8.153976311336718, + "grad_norm": 0.9890627861022949, + "learning_rate": 9.232938522278625e-06, + "loss": 0.6376, + "step": 14457 + }, + { + "epoch": 8.15454032712916, + "grad_norm": 0.9391911029815674, + "learning_rate": 9.230118443316413e-06, + "loss": 0.6607, + "step": 14458 + }, + { + "epoch": 8.155104342921602, + "grad_norm": 1.1795610189437866, + "learning_rate": 9.227298364354202e-06, + "loss": 0.7105, + "step": 14459 + }, + { + "epoch": 8.155668358714044, + "grad_norm": 1.120810866355896, + "learning_rate": 9.224478285391991e-06, + "loss": 0.7821, + "step": 14460 + }, + { + "epoch": 8.156232374506486, + "grad_norm": 0.9840031266212463, + "learning_rate": 9.22165820642978e-06, + "loss": 0.6373, + "step": 14461 + }, + { + "epoch": 8.156796390298929, + "grad_norm": 1.2032127380371094, + "learning_rate": 9.218838127467569e-06, + "loss": 0.734, + "step": 14462 + }, + { + "epoch": 8.157360406091371, + "grad_norm": 1.15204918384552, + "learning_rate": 9.216018048505357e-06, + "loss": 0.7242, + "step": 14463 + }, + { + "epoch": 8.157924421883813, + "grad_norm": 1.0131936073303223, + "learning_rate": 9.213197969543146e-06, + "loss": 0.6793, + "step": 14464 + }, + { + "epoch": 8.158488437676255, + "grad_norm": 0.8162087202072144, + "learning_rate": 9.210377890580937e-06, + "loss": 0.6812, + "step": 14465 + }, + { + "epoch": 8.159052453468696, + "grad_norm": 1.1669573783874512, + "learning_rate": 9.207557811618727e-06, + "loss": 0.7077, + "step": 14466 + }, + { + "epoch": 8.15961646926114, + "grad_norm": 1.4326956272125244, + "learning_rate": 9.204737732656516e-06, + "loss": 0.8686, + "step": 14467 + }, + { + "epoch": 8.160180485053582, + "grad_norm": 1.4051951169967651, + "learning_rate": 9.201917653694305e-06, + "loss": 0.7527, + "step": 14468 + }, + { + "epoch": 8.160744500846024, + "grad_norm": 0.9978289008140564, + "learning_rate": 9.199097574732094e-06, + "loss": 0.7154, + "step": 14469 + }, + { + "epoch": 8.161308516638465, + "grad_norm": 1.5813435316085815, + "learning_rate": 9.196277495769882e-06, + "loss": 0.7209, + "step": 14470 + }, + { + "epoch": 8.161872532430909, + "grad_norm": 1.000268578529358, + "learning_rate": 9.193457416807671e-06, + "loss": 0.7697, + "step": 14471 + }, + { + "epoch": 8.16243654822335, + "grad_norm": 0.9144911170005798, + "learning_rate": 9.19063733784546e-06, + "loss": 0.7194, + "step": 14472 + }, + { + "epoch": 8.163000564015793, + "grad_norm": 1.1803258657455444, + "learning_rate": 9.187817258883249e-06, + "loss": 0.6087, + "step": 14473 + }, + { + "epoch": 8.163564579808234, + "grad_norm": 1.1454517841339111, + "learning_rate": 9.18499717992104e-06, + "loss": 0.7763, + "step": 14474 + }, + { + "epoch": 8.164128595600676, + "grad_norm": 0.9941298365592957, + "learning_rate": 9.182177100958828e-06, + "loss": 0.6336, + "step": 14475 + }, + { + "epoch": 8.16469261139312, + "grad_norm": 1.1467715501785278, + "learning_rate": 9.179357021996617e-06, + "loss": 0.7228, + "step": 14476 + }, + { + "epoch": 8.165256627185562, + "grad_norm": 1.2099394798278809, + "learning_rate": 9.176536943034406e-06, + "loss": 0.7863, + "step": 14477 + }, + { + "epoch": 8.165820642978003, + "grad_norm": 1.1793229579925537, + "learning_rate": 9.173716864072194e-06, + "loss": 0.6916, + "step": 14478 + }, + { + "epoch": 8.166384658770445, + "grad_norm": 1.2802321910858154, + "learning_rate": 9.170896785109983e-06, + "loss": 0.744, + "step": 14479 + }, + { + "epoch": 8.166948674562887, + "grad_norm": 1.1823370456695557, + "learning_rate": 9.168076706147772e-06, + "loss": 0.6229, + "step": 14480 + }, + { + "epoch": 8.16751269035533, + "grad_norm": 1.3055925369262695, + "learning_rate": 9.16525662718556e-06, + "loss": 0.8267, + "step": 14481 + }, + { + "epoch": 8.168076706147772, + "grad_norm": 0.8951098918914795, + "learning_rate": 9.162436548223351e-06, + "loss": 0.7015, + "step": 14482 + }, + { + "epoch": 8.168640721940214, + "grad_norm": 0.8757317662239075, + "learning_rate": 9.15961646926114e-06, + "loss": 0.6994, + "step": 14483 + }, + { + "epoch": 8.169204737732656, + "grad_norm": 0.9282346963882446, + "learning_rate": 9.156796390298929e-06, + "loss": 0.7751, + "step": 14484 + }, + { + "epoch": 8.1697687535251, + "grad_norm": 1.12261962890625, + "learning_rate": 9.153976311336718e-06, + "loss": 0.7862, + "step": 14485 + }, + { + "epoch": 8.170332769317541, + "grad_norm": 1.0346664190292358, + "learning_rate": 9.151156232374507e-06, + "loss": 0.7515, + "step": 14486 + }, + { + "epoch": 8.170896785109983, + "grad_norm": 1.256689429283142, + "learning_rate": 9.148336153412295e-06, + "loss": 0.6606, + "step": 14487 + }, + { + "epoch": 8.171460800902425, + "grad_norm": 1.0535070896148682, + "learning_rate": 9.145516074450084e-06, + "loss": 0.7199, + "step": 14488 + }, + { + "epoch": 8.172024816694867, + "grad_norm": 1.2306766510009766, + "learning_rate": 9.142695995487875e-06, + "loss": 0.7174, + "step": 14489 + }, + { + "epoch": 8.17258883248731, + "grad_norm": 1.2283440828323364, + "learning_rate": 9.139875916525663e-06, + "loss": 0.7562, + "step": 14490 + }, + { + "epoch": 8.173152848279752, + "grad_norm": 1.130145788192749, + "learning_rate": 9.137055837563452e-06, + "loss": 0.8047, + "step": 14491 + }, + { + "epoch": 8.173716864072194, + "grad_norm": 1.3036819696426392, + "learning_rate": 9.134235758601241e-06, + "loss": 0.7339, + "step": 14492 + }, + { + "epoch": 8.174280879864636, + "grad_norm": 0.9220923185348511, + "learning_rate": 9.13141567963903e-06, + "loss": 0.7239, + "step": 14493 + }, + { + "epoch": 8.174844895657078, + "grad_norm": 1.232478380203247, + "learning_rate": 9.128595600676819e-06, + "loss": 0.5963, + "step": 14494 + }, + { + "epoch": 8.175408911449521, + "grad_norm": 1.3776642084121704, + "learning_rate": 9.125775521714607e-06, + "loss": 0.725, + "step": 14495 + }, + { + "epoch": 8.175972927241963, + "grad_norm": 1.0539498329162598, + "learning_rate": 9.122955442752396e-06, + "loss": 0.7434, + "step": 14496 + }, + { + "epoch": 8.176536943034405, + "grad_norm": 1.1005568504333496, + "learning_rate": 9.120135363790187e-06, + "loss": 0.6879, + "step": 14497 + }, + { + "epoch": 8.177100958826847, + "grad_norm": 1.0793017148971558, + "learning_rate": 9.117315284827975e-06, + "loss": 0.724, + "step": 14498 + }, + { + "epoch": 8.17766497461929, + "grad_norm": 1.161312460899353, + "learning_rate": 9.114495205865764e-06, + "loss": 0.8012, + "step": 14499 + }, + { + "epoch": 8.178228990411732, + "grad_norm": 1.0951282978057861, + "learning_rate": 9.111675126903555e-06, + "loss": 0.6458, + "step": 14500 + }, + { + "epoch": 8.178793006204174, + "grad_norm": 1.0486024618148804, + "learning_rate": 9.108855047941344e-06, + "loss": 0.7538, + "step": 14501 + }, + { + "epoch": 8.179357021996616, + "grad_norm": 0.8872196674346924, + "learning_rate": 9.106034968979132e-06, + "loss": 0.6864, + "step": 14502 + }, + { + "epoch": 8.179921037789057, + "grad_norm": 1.0986984968185425, + "learning_rate": 9.103214890016921e-06, + "loss": 0.699, + "step": 14503 + }, + { + "epoch": 8.180485053581501, + "grad_norm": 1.4410074949264526, + "learning_rate": 9.10039481105471e-06, + "loss": 0.7844, + "step": 14504 + }, + { + "epoch": 8.181049069373943, + "grad_norm": 1.6054778099060059, + "learning_rate": 9.097574732092499e-06, + "loss": 0.7791, + "step": 14505 + }, + { + "epoch": 8.181613085166385, + "grad_norm": 1.1787739992141724, + "learning_rate": 9.09475465313029e-06, + "loss": 0.6696, + "step": 14506 + }, + { + "epoch": 8.182177100958826, + "grad_norm": 0.9695673584938049, + "learning_rate": 9.091934574168078e-06, + "loss": 0.7511, + "step": 14507 + }, + { + "epoch": 8.182741116751268, + "grad_norm": 1.0514551401138306, + "learning_rate": 9.089114495205867e-06, + "loss": 0.6591, + "step": 14508 + }, + { + "epoch": 8.183305132543712, + "grad_norm": 1.00788414478302, + "learning_rate": 9.086294416243656e-06, + "loss": 0.676, + "step": 14509 + }, + { + "epoch": 8.183869148336154, + "grad_norm": 1.2525041103363037, + "learning_rate": 9.083474337281444e-06, + "loss": 0.742, + "step": 14510 + }, + { + "epoch": 8.184433164128595, + "grad_norm": 1.3369183540344238, + "learning_rate": 9.080654258319233e-06, + "loss": 0.7053, + "step": 14511 + }, + { + "epoch": 8.184997179921037, + "grad_norm": 1.3740296363830566, + "learning_rate": 9.077834179357022e-06, + "loss": 0.7996, + "step": 14512 + }, + { + "epoch": 8.18556119571348, + "grad_norm": 1.5001057386398315, + "learning_rate": 9.075014100394812e-06, + "loss": 0.8294, + "step": 14513 + }, + { + "epoch": 8.186125211505923, + "grad_norm": 1.1605360507965088, + "learning_rate": 9.072194021432601e-06, + "loss": 0.7611, + "step": 14514 + }, + { + "epoch": 8.186689227298364, + "grad_norm": 1.0403602123260498, + "learning_rate": 9.06937394247039e-06, + "loss": 0.6551, + "step": 14515 + }, + { + "epoch": 8.187253243090806, + "grad_norm": 1.064236044883728, + "learning_rate": 9.066553863508179e-06, + "loss": 0.73, + "step": 14516 + }, + { + "epoch": 8.187817258883248, + "grad_norm": 1.023679494857788, + "learning_rate": 9.063733784545968e-06, + "loss": 0.7737, + "step": 14517 + }, + { + "epoch": 8.188381274675692, + "grad_norm": 0.80314701795578, + "learning_rate": 9.060913705583756e-06, + "loss": 0.5921, + "step": 14518 + }, + { + "epoch": 8.188945290468133, + "grad_norm": 1.464506983757019, + "learning_rate": 9.058093626621545e-06, + "loss": 0.6934, + "step": 14519 + }, + { + "epoch": 8.189509306260575, + "grad_norm": 1.0159425735473633, + "learning_rate": 9.055273547659334e-06, + "loss": 0.6666, + "step": 14520 + }, + { + "epoch": 8.190073322053017, + "grad_norm": 1.007538080215454, + "learning_rate": 9.052453468697125e-06, + "loss": 0.8154, + "step": 14521 + }, + { + "epoch": 8.190637337845459, + "grad_norm": 1.076686143875122, + "learning_rate": 9.049633389734913e-06, + "loss": 0.7656, + "step": 14522 + }, + { + "epoch": 8.191201353637902, + "grad_norm": 1.1120582818984985, + "learning_rate": 9.046813310772702e-06, + "loss": 0.7074, + "step": 14523 + }, + { + "epoch": 8.191765369430344, + "grad_norm": 1.0995800495147705, + "learning_rate": 9.043993231810491e-06, + "loss": 0.7091, + "step": 14524 + }, + { + "epoch": 8.192329385222786, + "grad_norm": 1.318210482597351, + "learning_rate": 9.04117315284828e-06, + "loss": 0.8121, + "step": 14525 + }, + { + "epoch": 8.192893401015228, + "grad_norm": 1.421613097190857, + "learning_rate": 9.038353073886069e-06, + "loss": 0.6973, + "step": 14526 + }, + { + "epoch": 8.193457416807671, + "grad_norm": 1.423136830329895, + "learning_rate": 9.035532994923857e-06, + "loss": 0.8327, + "step": 14527 + }, + { + "epoch": 8.194021432600113, + "grad_norm": 1.0390676259994507, + "learning_rate": 9.032712915961646e-06, + "loss": 0.7215, + "step": 14528 + }, + { + "epoch": 8.194585448392555, + "grad_norm": 1.024012804031372, + "learning_rate": 9.029892836999437e-06, + "loss": 0.7175, + "step": 14529 + }, + { + "epoch": 8.195149464184997, + "grad_norm": 1.2640389204025269, + "learning_rate": 9.027072758037225e-06, + "loss": 0.7469, + "step": 14530 + }, + { + "epoch": 8.195713479977439, + "grad_norm": 1.0143561363220215, + "learning_rate": 9.024252679075014e-06, + "loss": 0.6318, + "step": 14531 + }, + { + "epoch": 8.196277495769882, + "grad_norm": 1.2200579643249512, + "learning_rate": 9.021432600112803e-06, + "loss": 0.6561, + "step": 14532 + }, + { + "epoch": 8.196841511562324, + "grad_norm": 1.210723876953125, + "learning_rate": 9.018612521150592e-06, + "loss": 0.7621, + "step": 14533 + }, + { + "epoch": 8.197405527354766, + "grad_norm": 1.1190528869628906, + "learning_rate": 9.01579244218838e-06, + "loss": 0.7201, + "step": 14534 + }, + { + "epoch": 8.197969543147208, + "grad_norm": 1.4964931011199951, + "learning_rate": 9.012972363226171e-06, + "loss": 0.7585, + "step": 14535 + }, + { + "epoch": 8.19853355893965, + "grad_norm": 1.3503841161727905, + "learning_rate": 9.01015228426396e-06, + "loss": 0.7909, + "step": 14536 + }, + { + "epoch": 8.199097574732093, + "grad_norm": 0.9985091686248779, + "learning_rate": 9.007332205301749e-06, + "loss": 0.7229, + "step": 14537 + }, + { + "epoch": 8.199661590524535, + "grad_norm": 1.3000737428665161, + "learning_rate": 9.00451212633954e-06, + "loss": 0.7835, + "step": 14538 + }, + { + "epoch": 8.200225606316977, + "grad_norm": 1.260412573814392, + "learning_rate": 9.001692047377328e-06, + "loss": 0.802, + "step": 14539 + }, + { + "epoch": 8.200789622109419, + "grad_norm": 1.1200952529907227, + "learning_rate": 8.998871968415117e-06, + "loss": 0.7225, + "step": 14540 + }, + { + "epoch": 8.201353637901862, + "grad_norm": 1.2292529344558716, + "learning_rate": 8.996051889452906e-06, + "loss": 0.7809, + "step": 14541 + }, + { + "epoch": 8.201917653694304, + "grad_norm": 1.591900110244751, + "learning_rate": 8.993231810490694e-06, + "loss": 0.7568, + "step": 14542 + }, + { + "epoch": 8.202481669486746, + "grad_norm": 1.1498708724975586, + "learning_rate": 8.990411731528483e-06, + "loss": 0.7434, + "step": 14543 + }, + { + "epoch": 8.203045685279188, + "grad_norm": 0.6963046789169312, + "learning_rate": 8.987591652566272e-06, + "loss": 0.6059, + "step": 14544 + }, + { + "epoch": 8.20360970107163, + "grad_norm": 0.9652628302574158, + "learning_rate": 8.984771573604062e-06, + "loss": 0.676, + "step": 14545 + }, + { + "epoch": 8.204173716864073, + "grad_norm": 1.4077666997909546, + "learning_rate": 8.981951494641851e-06, + "loss": 0.733, + "step": 14546 + }, + { + "epoch": 8.204737732656515, + "grad_norm": 0.9733876585960388, + "learning_rate": 8.97913141567964e-06, + "loss": 0.7696, + "step": 14547 + }, + { + "epoch": 8.205301748448957, + "grad_norm": 1.2326594591140747, + "learning_rate": 8.976311336717429e-06, + "loss": 0.7347, + "step": 14548 + }, + { + "epoch": 8.205865764241398, + "grad_norm": 1.0359238386154175, + "learning_rate": 8.973491257755218e-06, + "loss": 0.6574, + "step": 14549 + }, + { + "epoch": 8.20642978003384, + "grad_norm": 1.2587931156158447, + "learning_rate": 8.970671178793006e-06, + "loss": 0.7194, + "step": 14550 + }, + { + "epoch": 8.206993795826284, + "grad_norm": 1.2270123958587646, + "learning_rate": 8.967851099830795e-06, + "loss": 0.7696, + "step": 14551 + }, + { + "epoch": 8.207557811618726, + "grad_norm": 1.0668878555297852, + "learning_rate": 8.965031020868584e-06, + "loss": 0.6567, + "step": 14552 + }, + { + "epoch": 8.208121827411167, + "grad_norm": 1.1145371198654175, + "learning_rate": 8.962210941906375e-06, + "loss": 0.7601, + "step": 14553 + }, + { + "epoch": 8.20868584320361, + "grad_norm": 1.221010446548462, + "learning_rate": 8.959390862944163e-06, + "loss": 0.8191, + "step": 14554 + }, + { + "epoch": 8.209249858996053, + "grad_norm": 1.4683619737625122, + "learning_rate": 8.956570783981952e-06, + "loss": 0.7973, + "step": 14555 + }, + { + "epoch": 8.209813874788495, + "grad_norm": 1.0379488468170166, + "learning_rate": 8.953750705019741e-06, + "loss": 0.7261, + "step": 14556 + }, + { + "epoch": 8.210377890580936, + "grad_norm": 1.0216857194900513, + "learning_rate": 8.95093062605753e-06, + "loss": 0.6876, + "step": 14557 + }, + { + "epoch": 8.210941906373378, + "grad_norm": 1.1742438077926636, + "learning_rate": 8.948110547095318e-06, + "loss": 0.849, + "step": 14558 + }, + { + "epoch": 8.21150592216582, + "grad_norm": 1.2293751239776611, + "learning_rate": 8.945290468133107e-06, + "loss": 0.7577, + "step": 14559 + }, + { + "epoch": 8.212069937958264, + "grad_norm": 0.9511170983314514, + "learning_rate": 8.942470389170896e-06, + "loss": 0.679, + "step": 14560 + }, + { + "epoch": 8.212633953750705, + "grad_norm": 1.1939181089401245, + "learning_rate": 8.939650310208687e-06, + "loss": 0.7234, + "step": 14561 + }, + { + "epoch": 8.213197969543147, + "grad_norm": 1.2177066802978516, + "learning_rate": 8.936830231246475e-06, + "loss": 0.7309, + "step": 14562 + }, + { + "epoch": 8.213761985335589, + "grad_norm": 1.0052486658096313, + "learning_rate": 8.934010152284264e-06, + "loss": 0.7231, + "step": 14563 + }, + { + "epoch": 8.21432600112803, + "grad_norm": 1.340712070465088, + "learning_rate": 8.931190073322053e-06, + "loss": 0.7957, + "step": 14564 + }, + { + "epoch": 8.214890016920474, + "grad_norm": 1.0295439958572388, + "learning_rate": 8.928369994359842e-06, + "loss": 0.6555, + "step": 14565 + }, + { + "epoch": 8.215454032712916, + "grad_norm": 0.9230982065200806, + "learning_rate": 8.92554991539763e-06, + "loss": 0.6313, + "step": 14566 + }, + { + "epoch": 8.216018048505358, + "grad_norm": 1.5549213886260986, + "learning_rate": 8.92272983643542e-06, + "loss": 0.7234, + "step": 14567 + }, + { + "epoch": 8.2165820642978, + "grad_norm": 1.5701167583465576, + "learning_rate": 8.91990975747321e-06, + "loss": 0.7922, + "step": 14568 + }, + { + "epoch": 8.217146080090243, + "grad_norm": 1.0113165378570557, + "learning_rate": 8.917089678510999e-06, + "loss": 0.6922, + "step": 14569 + }, + { + "epoch": 8.217710095882685, + "grad_norm": 1.3087027072906494, + "learning_rate": 8.914269599548789e-06, + "loss": 0.84, + "step": 14570 + }, + { + "epoch": 8.218274111675127, + "grad_norm": 1.0647599697113037, + "learning_rate": 8.911449520586578e-06, + "loss": 0.717, + "step": 14571 + }, + { + "epoch": 8.218838127467569, + "grad_norm": 0.9714478254318237, + "learning_rate": 8.908629441624367e-06, + "loss": 0.6853, + "step": 14572 + }, + { + "epoch": 8.21940214326001, + "grad_norm": 0.8611709475517273, + "learning_rate": 8.905809362662156e-06, + "loss": 0.7279, + "step": 14573 + }, + { + "epoch": 8.219966159052454, + "grad_norm": 0.8847202062606812, + "learning_rate": 8.902989283699944e-06, + "loss": 0.6052, + "step": 14574 + }, + { + "epoch": 8.220530174844896, + "grad_norm": 1.2072010040283203, + "learning_rate": 8.900169204737733e-06, + "loss": 0.7525, + "step": 14575 + }, + { + "epoch": 8.221094190637338, + "grad_norm": 0.9777812957763672, + "learning_rate": 8.897349125775522e-06, + "loss": 0.7678, + "step": 14576 + }, + { + "epoch": 8.22165820642978, + "grad_norm": 1.2315512895584106, + "learning_rate": 8.894529046813312e-06, + "loss": 0.7535, + "step": 14577 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 1.1365898847579956, + "learning_rate": 8.891708967851101e-06, + "loss": 0.7851, + "step": 14578 + }, + { + "epoch": 8.222786238014665, + "grad_norm": 1.0974072217941284, + "learning_rate": 8.88888888888889e-06, + "loss": 0.65, + "step": 14579 + }, + { + "epoch": 8.223350253807107, + "grad_norm": 1.2202292680740356, + "learning_rate": 8.886068809926679e-06, + "loss": 0.7905, + "step": 14580 + }, + { + "epoch": 8.223914269599549, + "grad_norm": 0.8584040999412537, + "learning_rate": 8.883248730964468e-06, + "loss": 0.729, + "step": 14581 + }, + { + "epoch": 8.22447828539199, + "grad_norm": 1.0608493089675903, + "learning_rate": 8.880428652002256e-06, + "loss": 0.6568, + "step": 14582 + }, + { + "epoch": 8.225042301184434, + "grad_norm": 0.9372578263282776, + "learning_rate": 8.877608573040045e-06, + "loss": 0.6664, + "step": 14583 + }, + { + "epoch": 8.225606316976876, + "grad_norm": 1.0850821733474731, + "learning_rate": 8.874788494077834e-06, + "loss": 0.7683, + "step": 14584 + }, + { + "epoch": 8.226170332769318, + "grad_norm": 1.012911081314087, + "learning_rate": 8.871968415115624e-06, + "loss": 0.5886, + "step": 14585 + }, + { + "epoch": 8.22673434856176, + "grad_norm": 1.003897786140442, + "learning_rate": 8.869148336153413e-06, + "loss": 0.7335, + "step": 14586 + }, + { + "epoch": 8.227298364354201, + "grad_norm": 1.163629174232483, + "learning_rate": 8.866328257191202e-06, + "loss": 0.7206, + "step": 14587 + }, + { + "epoch": 8.227862380146645, + "grad_norm": 1.175919771194458, + "learning_rate": 8.863508178228991e-06, + "loss": 0.7421, + "step": 14588 + }, + { + "epoch": 8.228426395939087, + "grad_norm": 1.654258370399475, + "learning_rate": 8.86068809926678e-06, + "loss": 0.7303, + "step": 14589 + }, + { + "epoch": 8.228990411731528, + "grad_norm": 1.04506254196167, + "learning_rate": 8.857868020304568e-06, + "loss": 0.7292, + "step": 14590 + }, + { + "epoch": 8.22955442752397, + "grad_norm": 1.2144761085510254, + "learning_rate": 8.855047941342357e-06, + "loss": 0.666, + "step": 14591 + }, + { + "epoch": 8.230118443316412, + "grad_norm": 1.0487085580825806, + "learning_rate": 8.852227862380148e-06, + "loss": 0.6689, + "step": 14592 + }, + { + "epoch": 8.230682459108856, + "grad_norm": 1.0822980403900146, + "learning_rate": 8.849407783417937e-06, + "loss": 0.752, + "step": 14593 + }, + { + "epoch": 8.231246474901297, + "grad_norm": 1.1542257070541382, + "learning_rate": 8.846587704455725e-06, + "loss": 0.6619, + "step": 14594 + }, + { + "epoch": 8.23181049069374, + "grad_norm": 1.8076547384262085, + "learning_rate": 8.843767625493514e-06, + "loss": 0.8928, + "step": 14595 + }, + { + "epoch": 8.232374506486181, + "grad_norm": 1.5449323654174805, + "learning_rate": 8.840947546531303e-06, + "loss": 0.7612, + "step": 14596 + }, + { + "epoch": 8.232938522278625, + "grad_norm": 0.7532070875167847, + "learning_rate": 8.838127467569092e-06, + "loss": 0.614, + "step": 14597 + }, + { + "epoch": 8.233502538071066, + "grad_norm": 1.4425475597381592, + "learning_rate": 8.83530738860688e-06, + "loss": 0.8266, + "step": 14598 + }, + { + "epoch": 8.234066553863508, + "grad_norm": 1.077536702156067, + "learning_rate": 8.83248730964467e-06, + "loss": 0.7713, + "step": 14599 + }, + { + "epoch": 8.23463056965595, + "grad_norm": 0.9258852601051331, + "learning_rate": 8.82966723068246e-06, + "loss": 0.6031, + "step": 14600 + }, + { + "epoch": 8.235194585448392, + "grad_norm": 0.8607993721961975, + "learning_rate": 8.826847151720249e-06, + "loss": 0.5768, + "step": 14601 + }, + { + "epoch": 8.235758601240835, + "grad_norm": 1.1145713329315186, + "learning_rate": 8.824027072758037e-06, + "loss": 0.7868, + "step": 14602 + }, + { + "epoch": 8.236322617033277, + "grad_norm": 1.0468510389328003, + "learning_rate": 8.821206993795826e-06, + "loss": 0.5782, + "step": 14603 + }, + { + "epoch": 8.236886632825719, + "grad_norm": 1.0652267932891846, + "learning_rate": 8.818386914833615e-06, + "loss": 0.6973, + "step": 14604 + }, + { + "epoch": 8.237450648618161, + "grad_norm": 1.068056344985962, + "learning_rate": 8.815566835871404e-06, + "loss": 0.739, + "step": 14605 + }, + { + "epoch": 8.238014664410603, + "grad_norm": 0.9237123727798462, + "learning_rate": 8.812746756909194e-06, + "loss": 0.6841, + "step": 14606 + }, + { + "epoch": 8.238578680203046, + "grad_norm": 0.9887650012969971, + "learning_rate": 8.809926677946983e-06, + "loss": 0.7943, + "step": 14607 + }, + { + "epoch": 8.239142695995488, + "grad_norm": 0.957483172416687, + "learning_rate": 8.807106598984772e-06, + "loss": 0.6673, + "step": 14608 + }, + { + "epoch": 8.23970671178793, + "grad_norm": 1.1213150024414062, + "learning_rate": 8.804286520022562e-06, + "loss": 0.7501, + "step": 14609 + }, + { + "epoch": 8.240270727580372, + "grad_norm": 1.0523966550827026, + "learning_rate": 8.801466441060351e-06, + "loss": 0.8593, + "step": 14610 + }, + { + "epoch": 8.240834743372815, + "grad_norm": 1.1795929670333862, + "learning_rate": 8.79864636209814e-06, + "loss": 0.805, + "step": 14611 + }, + { + "epoch": 8.241398759165257, + "grad_norm": 1.2499878406524658, + "learning_rate": 8.795826283135929e-06, + "loss": 0.7922, + "step": 14612 + }, + { + "epoch": 8.241962774957699, + "grad_norm": 1.8644452095031738, + "learning_rate": 8.793006204173718e-06, + "loss": 0.8349, + "step": 14613 + }, + { + "epoch": 8.24252679075014, + "grad_norm": 1.0842218399047852, + "learning_rate": 8.790186125211506e-06, + "loss": 0.7742, + "step": 14614 + }, + { + "epoch": 8.243090806542583, + "grad_norm": 1.2547293901443481, + "learning_rate": 8.787366046249295e-06, + "loss": 0.7349, + "step": 14615 + }, + { + "epoch": 8.243654822335026, + "grad_norm": 1.3571970462799072, + "learning_rate": 8.784545967287084e-06, + "loss": 0.789, + "step": 14616 + }, + { + "epoch": 8.244218838127468, + "grad_norm": 0.9709070920944214, + "learning_rate": 8.781725888324874e-06, + "loss": 0.6755, + "step": 14617 + }, + { + "epoch": 8.24478285391991, + "grad_norm": 0.9766169786453247, + "learning_rate": 8.778905809362663e-06, + "loss": 0.6733, + "step": 14618 + }, + { + "epoch": 8.245346869712352, + "grad_norm": 0.8627756237983704, + "learning_rate": 8.776085730400452e-06, + "loss": 0.6441, + "step": 14619 + }, + { + "epoch": 8.245910885504793, + "grad_norm": 1.1164277791976929, + "learning_rate": 8.77326565143824e-06, + "loss": 0.6864, + "step": 14620 + }, + { + "epoch": 8.246474901297237, + "grad_norm": 1.4212604761123657, + "learning_rate": 8.77044557247603e-06, + "loss": 0.7387, + "step": 14621 + }, + { + "epoch": 8.247038917089679, + "grad_norm": 0.9292868971824646, + "learning_rate": 8.767625493513818e-06, + "loss": 0.7449, + "step": 14622 + }, + { + "epoch": 8.24760293288212, + "grad_norm": 1.1052851676940918, + "learning_rate": 8.764805414551607e-06, + "loss": 0.8088, + "step": 14623 + }, + { + "epoch": 8.248166948674562, + "grad_norm": 1.0908461809158325, + "learning_rate": 8.761985335589398e-06, + "loss": 0.7037, + "step": 14624 + }, + { + "epoch": 8.248730964467006, + "grad_norm": 1.266261339187622, + "learning_rate": 8.759165256627186e-06, + "loss": 0.7657, + "step": 14625 + }, + { + "epoch": 8.249294980259448, + "grad_norm": 1.2176237106323242, + "learning_rate": 8.756345177664975e-06, + "loss": 0.7519, + "step": 14626 + }, + { + "epoch": 8.24985899605189, + "grad_norm": 1.0151807069778442, + "learning_rate": 8.753525098702764e-06, + "loss": 0.7742, + "step": 14627 + }, + { + "epoch": 8.250423011844331, + "grad_norm": 1.366382360458374, + "learning_rate": 8.750705019740553e-06, + "loss": 0.7121, + "step": 14628 + }, + { + "epoch": 8.250987027636773, + "grad_norm": 1.2539377212524414, + "learning_rate": 8.747884940778342e-06, + "loss": 0.7113, + "step": 14629 + }, + { + "epoch": 8.251551043429217, + "grad_norm": 1.5371347665786743, + "learning_rate": 8.74506486181613e-06, + "loss": 0.7602, + "step": 14630 + }, + { + "epoch": 8.252115059221659, + "grad_norm": 1.4505422115325928, + "learning_rate": 8.74224478285392e-06, + "loss": 0.7778, + "step": 14631 + }, + { + "epoch": 8.2526790750141, + "grad_norm": 1.1464776992797852, + "learning_rate": 8.73942470389171e-06, + "loss": 0.782, + "step": 14632 + }, + { + "epoch": 8.253243090806542, + "grad_norm": 1.19456946849823, + "learning_rate": 8.736604624929499e-06, + "loss": 0.775, + "step": 14633 + }, + { + "epoch": 8.253807106598984, + "grad_norm": 0.9087913036346436, + "learning_rate": 8.733784545967287e-06, + "loss": 0.7655, + "step": 14634 + }, + { + "epoch": 8.254371122391428, + "grad_norm": 0.9865677952766418, + "learning_rate": 8.730964467005076e-06, + "loss": 0.7573, + "step": 14635 + }, + { + "epoch": 8.25493513818387, + "grad_norm": 0.9738414883613586, + "learning_rate": 8.728144388042865e-06, + "loss": 0.7734, + "step": 14636 + }, + { + "epoch": 8.255499153976311, + "grad_norm": 1.510877013206482, + "learning_rate": 8.725324309080654e-06, + "loss": 0.7936, + "step": 14637 + }, + { + "epoch": 8.256063169768753, + "grad_norm": 1.1205382347106934, + "learning_rate": 8.722504230118443e-06, + "loss": 0.8392, + "step": 14638 + }, + { + "epoch": 8.256627185561197, + "grad_norm": 1.1919710636138916, + "learning_rate": 8.719684151156231e-06, + "loss": 0.7671, + "step": 14639 + }, + { + "epoch": 8.257191201353638, + "grad_norm": 0.8537715077400208, + "learning_rate": 8.716864072194022e-06, + "loss": 0.648, + "step": 14640 + }, + { + "epoch": 8.25775521714608, + "grad_norm": 1.6444059610366821, + "learning_rate": 8.714043993231812e-06, + "loss": 0.823, + "step": 14641 + }, + { + "epoch": 8.258319232938522, + "grad_norm": 1.5086302757263184, + "learning_rate": 8.711223914269601e-06, + "loss": 0.7749, + "step": 14642 + }, + { + "epoch": 8.258883248730964, + "grad_norm": 0.9597691297531128, + "learning_rate": 8.70840383530739e-06, + "loss": 0.6342, + "step": 14643 + }, + { + "epoch": 8.259447264523407, + "grad_norm": 1.0851012468338013, + "learning_rate": 8.705583756345179e-06, + "loss": 0.7569, + "step": 14644 + }, + { + "epoch": 8.26001128031585, + "grad_norm": 1.181555986404419, + "learning_rate": 8.702763677382968e-06, + "loss": 0.7605, + "step": 14645 + }, + { + "epoch": 8.260575296108291, + "grad_norm": 1.1406795978546143, + "learning_rate": 8.699943598420756e-06, + "loss": 0.7651, + "step": 14646 + }, + { + "epoch": 8.261139311900733, + "grad_norm": 1.0625202655792236, + "learning_rate": 8.697123519458545e-06, + "loss": 0.7435, + "step": 14647 + }, + { + "epoch": 8.261703327693175, + "grad_norm": 1.3735157251358032, + "learning_rate": 8.694303440496334e-06, + "loss": 0.8406, + "step": 14648 + }, + { + "epoch": 8.262267343485618, + "grad_norm": 1.1814147233963013, + "learning_rate": 8.691483361534124e-06, + "loss": 0.7729, + "step": 14649 + }, + { + "epoch": 8.26283135927806, + "grad_norm": 1.2511297464370728, + "learning_rate": 8.688663282571913e-06, + "loss": 0.6923, + "step": 14650 + }, + { + "epoch": 8.263395375070502, + "grad_norm": 1.0682945251464844, + "learning_rate": 8.685843203609702e-06, + "loss": 0.6856, + "step": 14651 + }, + { + "epoch": 8.263959390862944, + "grad_norm": 1.2224971055984497, + "learning_rate": 8.68302312464749e-06, + "loss": 0.6744, + "step": 14652 + }, + { + "epoch": 8.264523406655387, + "grad_norm": 1.0486689805984497, + "learning_rate": 8.68020304568528e-06, + "loss": 0.7855, + "step": 14653 + }, + { + "epoch": 8.265087422447829, + "grad_norm": 1.1969276666641235, + "learning_rate": 8.677382966723068e-06, + "loss": 0.7698, + "step": 14654 + }, + { + "epoch": 8.26565143824027, + "grad_norm": 1.4655786752700806, + "learning_rate": 8.674562887760857e-06, + "loss": 0.6837, + "step": 14655 + }, + { + "epoch": 8.266215454032713, + "grad_norm": 0.9706430435180664, + "learning_rate": 8.671742808798648e-06, + "loss": 0.7879, + "step": 14656 + }, + { + "epoch": 8.266779469825154, + "grad_norm": 1.1026111841201782, + "learning_rate": 8.668922729836436e-06, + "loss": 0.7559, + "step": 14657 + }, + { + "epoch": 8.267343485617598, + "grad_norm": 1.4272387027740479, + "learning_rate": 8.666102650874225e-06, + "loss": 0.6841, + "step": 14658 + }, + { + "epoch": 8.26790750141004, + "grad_norm": 1.1684800386428833, + "learning_rate": 8.663282571912014e-06, + "loss": 0.7288, + "step": 14659 + }, + { + "epoch": 8.268471517202482, + "grad_norm": 1.1423407793045044, + "learning_rate": 8.660462492949803e-06, + "loss": 0.7923, + "step": 14660 + }, + { + "epoch": 8.269035532994923, + "grad_norm": 0.8197610974311829, + "learning_rate": 8.657642413987592e-06, + "loss": 0.6219, + "step": 14661 + }, + { + "epoch": 8.269599548787365, + "grad_norm": 1.1951980590820312, + "learning_rate": 8.65482233502538e-06, + "loss": 0.6921, + "step": 14662 + }, + { + "epoch": 8.270163564579809, + "grad_norm": 1.0825872421264648, + "learning_rate": 8.65200225606317e-06, + "loss": 0.6995, + "step": 14663 + }, + { + "epoch": 8.27072758037225, + "grad_norm": 1.281596302986145, + "learning_rate": 8.64918217710096e-06, + "loss": 0.7713, + "step": 14664 + }, + { + "epoch": 8.271291596164692, + "grad_norm": 1.2100082635879517, + "learning_rate": 8.646362098138749e-06, + "loss": 0.7813, + "step": 14665 + }, + { + "epoch": 8.271855611957134, + "grad_norm": 0.9265011548995972, + "learning_rate": 8.643542019176537e-06, + "loss": 0.6629, + "step": 14666 + }, + { + "epoch": 8.272419627749578, + "grad_norm": 1.1526641845703125, + "learning_rate": 8.640721940214326e-06, + "loss": 0.7247, + "step": 14667 + }, + { + "epoch": 8.27298364354202, + "grad_norm": 1.0266640186309814, + "learning_rate": 8.637901861252115e-06, + "loss": 0.7559, + "step": 14668 + }, + { + "epoch": 8.273547659334461, + "grad_norm": 1.2393205165863037, + "learning_rate": 8.635081782289904e-06, + "loss": 0.7265, + "step": 14669 + }, + { + "epoch": 8.274111675126903, + "grad_norm": 1.0361411571502686, + "learning_rate": 8.632261703327693e-06, + "loss": 0.7421, + "step": 14670 + }, + { + "epoch": 8.274675690919345, + "grad_norm": 1.1590887308120728, + "learning_rate": 8.629441624365483e-06, + "loss": 0.6992, + "step": 14671 + }, + { + "epoch": 8.275239706711789, + "grad_norm": 1.3456183671951294, + "learning_rate": 8.626621545403272e-06, + "loss": 0.716, + "step": 14672 + }, + { + "epoch": 8.27580372250423, + "grad_norm": 0.908362090587616, + "learning_rate": 8.62380146644106e-06, + "loss": 0.5866, + "step": 14673 + }, + { + "epoch": 8.276367738296672, + "grad_norm": 0.9740831851959229, + "learning_rate": 8.62098138747885e-06, + "loss": 0.7139, + "step": 14674 + }, + { + "epoch": 8.276931754089114, + "grad_norm": 0.9935624003410339, + "learning_rate": 8.618161308516638e-06, + "loss": 0.7089, + "step": 14675 + }, + { + "epoch": 8.277495769881556, + "grad_norm": 1.166931390762329, + "learning_rate": 8.615341229554429e-06, + "loss": 0.6525, + "step": 14676 + }, + { + "epoch": 8.278059785674, + "grad_norm": 1.2995439767837524, + "learning_rate": 8.612521150592217e-06, + "loss": 0.8026, + "step": 14677 + }, + { + "epoch": 8.278623801466441, + "grad_norm": 1.192671298980713, + "learning_rate": 8.609701071630006e-06, + "loss": 0.7566, + "step": 14678 + }, + { + "epoch": 8.279187817258883, + "grad_norm": 1.0996854305267334, + "learning_rate": 8.606880992667795e-06, + "loss": 0.7108, + "step": 14679 + }, + { + "epoch": 8.279751833051325, + "grad_norm": 1.5432840585708618, + "learning_rate": 8.604060913705586e-06, + "loss": 0.7868, + "step": 14680 + }, + { + "epoch": 8.280315848843768, + "grad_norm": 1.404897689819336, + "learning_rate": 8.601240834743374e-06, + "loss": 0.676, + "step": 14681 + }, + { + "epoch": 8.28087986463621, + "grad_norm": 0.8994016647338867, + "learning_rate": 8.598420755781163e-06, + "loss": 0.7498, + "step": 14682 + }, + { + "epoch": 8.281443880428652, + "grad_norm": 1.165785789489746, + "learning_rate": 8.595600676818952e-06, + "loss": 0.8404, + "step": 14683 + }, + { + "epoch": 8.282007896221094, + "grad_norm": 1.0104374885559082, + "learning_rate": 8.59278059785674e-06, + "loss": 0.7125, + "step": 14684 + }, + { + "epoch": 8.282571912013536, + "grad_norm": 1.389040231704712, + "learning_rate": 8.58996051889453e-06, + "loss": 0.8418, + "step": 14685 + }, + { + "epoch": 8.28313592780598, + "grad_norm": 1.078653335571289, + "learning_rate": 8.587140439932318e-06, + "loss": 0.668, + "step": 14686 + }, + { + "epoch": 8.283699943598421, + "grad_norm": 1.2362616062164307, + "learning_rate": 8.584320360970107e-06, + "loss": 0.7547, + "step": 14687 + }, + { + "epoch": 8.284263959390863, + "grad_norm": 1.5244636535644531, + "learning_rate": 8.581500282007898e-06, + "loss": 0.7642, + "step": 14688 + }, + { + "epoch": 8.284827975183305, + "grad_norm": 1.067031741142273, + "learning_rate": 8.578680203045686e-06, + "loss": 0.7886, + "step": 14689 + }, + { + "epoch": 8.285391990975747, + "grad_norm": 1.7582390308380127, + "learning_rate": 8.575860124083475e-06, + "loss": 0.8414, + "step": 14690 + }, + { + "epoch": 8.28595600676819, + "grad_norm": 0.9625334739685059, + "learning_rate": 8.573040045121264e-06, + "loss": 0.6659, + "step": 14691 + }, + { + "epoch": 8.286520022560632, + "grad_norm": 1.1483986377716064, + "learning_rate": 8.570219966159053e-06, + "loss": 0.843, + "step": 14692 + }, + { + "epoch": 8.287084038353074, + "grad_norm": 1.2669472694396973, + "learning_rate": 8.567399887196842e-06, + "loss": 0.7434, + "step": 14693 + }, + { + "epoch": 8.287648054145516, + "grad_norm": 1.142396092414856, + "learning_rate": 8.56457980823463e-06, + "loss": 0.7396, + "step": 14694 + }, + { + "epoch": 8.28821206993796, + "grad_norm": 1.6333544254302979, + "learning_rate": 8.56175972927242e-06, + "loss": 0.8502, + "step": 14695 + }, + { + "epoch": 8.288776085730401, + "grad_norm": 1.2918436527252197, + "learning_rate": 8.55893965031021e-06, + "loss": 0.7204, + "step": 14696 + }, + { + "epoch": 8.289340101522843, + "grad_norm": 1.2339420318603516, + "learning_rate": 8.556119571347998e-06, + "loss": 0.6667, + "step": 14697 + }, + { + "epoch": 8.289904117315285, + "grad_norm": 1.2909948825836182, + "learning_rate": 8.553299492385787e-06, + "loss": 0.7056, + "step": 14698 + }, + { + "epoch": 8.290468133107726, + "grad_norm": 1.4249544143676758, + "learning_rate": 8.550479413423576e-06, + "loss": 0.7974, + "step": 14699 + }, + { + "epoch": 8.29103214890017, + "grad_norm": 1.094882607460022, + "learning_rate": 8.547659334461365e-06, + "loss": 0.6912, + "step": 14700 + }, + { + "epoch": 8.291596164692612, + "grad_norm": 1.1252371072769165, + "learning_rate": 8.544839255499154e-06, + "loss": 0.7477, + "step": 14701 + }, + { + "epoch": 8.292160180485054, + "grad_norm": 1.33502197265625, + "learning_rate": 8.542019176536942e-06, + "loss": 0.7293, + "step": 14702 + }, + { + "epoch": 8.292724196277495, + "grad_norm": 1.0323537588119507, + "learning_rate": 8.539199097574733e-06, + "loss": 0.667, + "step": 14703 + }, + { + "epoch": 8.293288212069937, + "grad_norm": 1.0219985246658325, + "learning_rate": 8.536379018612522e-06, + "loss": 0.655, + "step": 14704 + }, + { + "epoch": 8.29385222786238, + "grad_norm": 1.2530819177627563, + "learning_rate": 8.53355893965031e-06, + "loss": 0.761, + "step": 14705 + }, + { + "epoch": 8.294416243654823, + "grad_norm": 1.5229976177215576, + "learning_rate": 8.5307388606881e-06, + "loss": 0.7809, + "step": 14706 + }, + { + "epoch": 8.294980259447264, + "grad_norm": 1.0109893083572388, + "learning_rate": 8.527918781725888e-06, + "loss": 0.7264, + "step": 14707 + }, + { + "epoch": 8.295544275239706, + "grad_norm": 1.228587031364441, + "learning_rate": 8.525098702763677e-06, + "loss": 0.7793, + "step": 14708 + }, + { + "epoch": 8.29610829103215, + "grad_norm": 1.1946799755096436, + "learning_rate": 8.522278623801466e-06, + "loss": 0.6394, + "step": 14709 + }, + { + "epoch": 8.296672306824592, + "grad_norm": 1.3250318765640259, + "learning_rate": 8.519458544839255e-06, + "loss": 0.6958, + "step": 14710 + }, + { + "epoch": 8.297236322617033, + "grad_norm": 0.9943655729293823, + "learning_rate": 8.516638465877045e-06, + "loss": 0.6905, + "step": 14711 + }, + { + "epoch": 8.297800338409475, + "grad_norm": 1.0372731685638428, + "learning_rate": 8.513818386914836e-06, + "loss": 0.7277, + "step": 14712 + }, + { + "epoch": 8.298364354201917, + "grad_norm": 1.1895555257797241, + "learning_rate": 8.510998307952624e-06, + "loss": 0.7501, + "step": 14713 + }, + { + "epoch": 8.29892836999436, + "grad_norm": 0.9331670999526978, + "learning_rate": 8.508178228990413e-06, + "loss": 0.7586, + "step": 14714 + }, + { + "epoch": 8.299492385786802, + "grad_norm": 1.1107500791549683, + "learning_rate": 8.505358150028202e-06, + "loss": 0.6787, + "step": 14715 + }, + { + "epoch": 8.300056401579244, + "grad_norm": 1.0695676803588867, + "learning_rate": 8.50253807106599e-06, + "loss": 0.685, + "step": 14716 + }, + { + "epoch": 8.300620417371686, + "grad_norm": 1.1155340671539307, + "learning_rate": 8.49971799210378e-06, + "loss": 0.6691, + "step": 14717 + }, + { + "epoch": 8.301184433164128, + "grad_norm": 1.0725128650665283, + "learning_rate": 8.496897913141568e-06, + "loss": 0.7861, + "step": 14718 + }, + { + "epoch": 8.301748448956571, + "grad_norm": 1.2644516229629517, + "learning_rate": 8.494077834179357e-06, + "loss": 0.7033, + "step": 14719 + }, + { + "epoch": 8.302312464749013, + "grad_norm": 1.1033669710159302, + "learning_rate": 8.491257755217148e-06, + "loss": 0.8304, + "step": 14720 + }, + { + "epoch": 8.302876480541455, + "grad_norm": 0.8305072784423828, + "learning_rate": 8.488437676254936e-06, + "loss": 0.7175, + "step": 14721 + }, + { + "epoch": 8.303440496333897, + "grad_norm": 1.1762703657150269, + "learning_rate": 8.485617597292725e-06, + "loss": 0.6748, + "step": 14722 + }, + { + "epoch": 8.30400451212634, + "grad_norm": 0.9852115511894226, + "learning_rate": 8.482797518330514e-06, + "loss": 0.7228, + "step": 14723 + }, + { + "epoch": 8.304568527918782, + "grad_norm": 1.1315381526947021, + "learning_rate": 8.479977439368303e-06, + "loss": 0.653, + "step": 14724 + }, + { + "epoch": 8.305132543711224, + "grad_norm": 1.3449383974075317, + "learning_rate": 8.477157360406092e-06, + "loss": 0.8091, + "step": 14725 + }, + { + "epoch": 8.305696559503666, + "grad_norm": 1.1042473316192627, + "learning_rate": 8.47433728144388e-06, + "loss": 0.759, + "step": 14726 + }, + { + "epoch": 8.306260575296108, + "grad_norm": 1.218883752822876, + "learning_rate": 8.471517202481669e-06, + "loss": 0.7399, + "step": 14727 + }, + { + "epoch": 8.306824591088551, + "grad_norm": 1.355048418045044, + "learning_rate": 8.46869712351946e-06, + "loss": 0.8039, + "step": 14728 + }, + { + "epoch": 8.307388606880993, + "grad_norm": 1.5267404317855835, + "learning_rate": 8.465877044557248e-06, + "loss": 0.8217, + "step": 14729 + }, + { + "epoch": 8.307952622673435, + "grad_norm": 1.1894537210464478, + "learning_rate": 8.463056965595037e-06, + "loss": 0.758, + "step": 14730 + }, + { + "epoch": 8.308516638465877, + "grad_norm": 0.8684662580490112, + "learning_rate": 8.460236886632826e-06, + "loss": 0.6753, + "step": 14731 + }, + { + "epoch": 8.309080654258318, + "grad_norm": 1.0910850763320923, + "learning_rate": 8.457416807670615e-06, + "loss": 0.7613, + "step": 14732 + }, + { + "epoch": 8.309644670050762, + "grad_norm": 1.2043997049331665, + "learning_rate": 8.454596728708404e-06, + "loss": 0.7446, + "step": 14733 + }, + { + "epoch": 8.310208685843204, + "grad_norm": 0.8384119868278503, + "learning_rate": 8.451776649746192e-06, + "loss": 0.6681, + "step": 14734 + }, + { + "epoch": 8.310772701635646, + "grad_norm": 0.8924356698989868, + "learning_rate": 8.448956570783983e-06, + "loss": 0.5983, + "step": 14735 + }, + { + "epoch": 8.311336717428087, + "grad_norm": 1.6302480697631836, + "learning_rate": 8.446136491821772e-06, + "loss": 0.7454, + "step": 14736 + }, + { + "epoch": 8.311900733220531, + "grad_norm": 0.9157452583312988, + "learning_rate": 8.44331641285956e-06, + "loss": 0.6743, + "step": 14737 + }, + { + "epoch": 8.312464749012973, + "grad_norm": 1.020167589187622, + "learning_rate": 8.44049633389735e-06, + "loss": 0.6646, + "step": 14738 + }, + { + "epoch": 8.313028764805415, + "grad_norm": 1.3978224992752075, + "learning_rate": 8.437676254935138e-06, + "loss": 0.83, + "step": 14739 + }, + { + "epoch": 8.313592780597856, + "grad_norm": 1.2133339643478394, + "learning_rate": 8.434856175972927e-06, + "loss": 0.7708, + "step": 14740 + }, + { + "epoch": 8.314156796390298, + "grad_norm": 1.2240325212478638, + "learning_rate": 8.432036097010716e-06, + "loss": 0.6973, + "step": 14741 + }, + { + "epoch": 8.314720812182742, + "grad_norm": 1.097545862197876, + "learning_rate": 8.429216018048504e-06, + "loss": 0.8769, + "step": 14742 + }, + { + "epoch": 8.315284827975184, + "grad_norm": 0.8542583584785461, + "learning_rate": 8.426395939086295e-06, + "loss": 0.7092, + "step": 14743 + }, + { + "epoch": 8.315848843767625, + "grad_norm": 1.1390092372894287, + "learning_rate": 8.423575860124084e-06, + "loss": 0.7665, + "step": 14744 + }, + { + "epoch": 8.316412859560067, + "grad_norm": 1.4143565893173218, + "learning_rate": 8.420755781161873e-06, + "loss": 0.76, + "step": 14745 + }, + { + "epoch": 8.316976875352509, + "grad_norm": 1.5299433469772339, + "learning_rate": 8.417935702199661e-06, + "loss": 0.6327, + "step": 14746 + }, + { + "epoch": 8.317540891144953, + "grad_norm": 1.0789426565170288, + "learning_rate": 8.415115623237452e-06, + "loss": 0.7092, + "step": 14747 + }, + { + "epoch": 8.318104906937394, + "grad_norm": 0.9210767149925232, + "learning_rate": 8.41229554427524e-06, + "loss": 0.753, + "step": 14748 + }, + { + "epoch": 8.318668922729836, + "grad_norm": 1.4307912588119507, + "learning_rate": 8.40947546531303e-06, + "loss": 0.8171, + "step": 14749 + }, + { + "epoch": 8.319232938522278, + "grad_norm": 1.2945258617401123, + "learning_rate": 8.406655386350818e-06, + "loss": 0.8115, + "step": 14750 + }, + { + "epoch": 8.319796954314722, + "grad_norm": 1.1626778841018677, + "learning_rate": 8.403835307388607e-06, + "loss": 0.7045, + "step": 14751 + }, + { + "epoch": 8.320360970107163, + "grad_norm": 0.8202970623970032, + "learning_rate": 8.401015228426398e-06, + "loss": 0.5965, + "step": 14752 + }, + { + "epoch": 8.320924985899605, + "grad_norm": 1.0372717380523682, + "learning_rate": 8.398195149464186e-06, + "loss": 0.7641, + "step": 14753 + }, + { + "epoch": 8.321489001692047, + "grad_norm": 1.2182202339172363, + "learning_rate": 8.395375070501975e-06, + "loss": 0.7932, + "step": 14754 + }, + { + "epoch": 8.322053017484489, + "grad_norm": 1.6578710079193115, + "learning_rate": 8.392554991539764e-06, + "loss": 0.8421, + "step": 14755 + }, + { + "epoch": 8.322617033276932, + "grad_norm": 0.998447060585022, + "learning_rate": 8.389734912577553e-06, + "loss": 0.7986, + "step": 14756 + }, + { + "epoch": 8.323181049069374, + "grad_norm": 1.8367743492126465, + "learning_rate": 8.386914833615342e-06, + "loss": 0.8389, + "step": 14757 + }, + { + "epoch": 8.323745064861816, + "grad_norm": 0.8003693222999573, + "learning_rate": 8.38409475465313e-06, + "loss": 0.6845, + "step": 14758 + }, + { + "epoch": 8.324309080654258, + "grad_norm": 1.0811527967453003, + "learning_rate": 8.381274675690919e-06, + "loss": 0.7503, + "step": 14759 + }, + { + "epoch": 8.3248730964467, + "grad_norm": 1.4887644052505493, + "learning_rate": 8.37845459672871e-06, + "loss": 0.8086, + "step": 14760 + }, + { + "epoch": 8.325437112239143, + "grad_norm": 1.465421438217163, + "learning_rate": 8.375634517766498e-06, + "loss": 0.8352, + "step": 14761 + }, + { + "epoch": 8.326001128031585, + "grad_norm": 0.9966151118278503, + "learning_rate": 8.372814438804287e-06, + "loss": 0.7575, + "step": 14762 + }, + { + "epoch": 8.326565143824027, + "grad_norm": 1.2816431522369385, + "learning_rate": 8.369994359842076e-06, + "loss": 0.759, + "step": 14763 + }, + { + "epoch": 8.327129159616469, + "grad_norm": 1.0052363872528076, + "learning_rate": 8.367174280879865e-06, + "loss": 0.7213, + "step": 14764 + }, + { + "epoch": 8.327693175408912, + "grad_norm": 1.077474594116211, + "learning_rate": 8.364354201917654e-06, + "loss": 0.6067, + "step": 14765 + }, + { + "epoch": 8.328257191201354, + "grad_norm": 1.3090780973434448, + "learning_rate": 8.361534122955442e-06, + "loss": 0.6823, + "step": 14766 + }, + { + "epoch": 8.328821206993796, + "grad_norm": 0.7935914993286133, + "learning_rate": 8.358714043993233e-06, + "loss": 0.6164, + "step": 14767 + }, + { + "epoch": 8.329385222786238, + "grad_norm": 1.366105079650879, + "learning_rate": 8.355893965031022e-06, + "loss": 0.7712, + "step": 14768 + }, + { + "epoch": 8.32994923857868, + "grad_norm": 1.218195915222168, + "learning_rate": 8.35307388606881e-06, + "loss": 0.8247, + "step": 14769 + }, + { + "epoch": 8.330513254371123, + "grad_norm": 1.2643954753875732, + "learning_rate": 8.3502538071066e-06, + "loss": 0.711, + "step": 14770 + }, + { + "epoch": 8.331077270163565, + "grad_norm": 1.1849902868270874, + "learning_rate": 8.347433728144388e-06, + "loss": 0.6737, + "step": 14771 + }, + { + "epoch": 8.331641285956007, + "grad_norm": 0.9148752093315125, + "learning_rate": 8.344613649182177e-06, + "loss": 0.6401, + "step": 14772 + }, + { + "epoch": 8.332205301748449, + "grad_norm": 1.0048407316207886, + "learning_rate": 8.341793570219966e-06, + "loss": 0.6966, + "step": 14773 + }, + { + "epoch": 8.33276931754089, + "grad_norm": 1.216493844985962, + "learning_rate": 8.338973491257754e-06, + "loss": 0.7749, + "step": 14774 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 1.2276554107666016, + "learning_rate": 8.336153412295545e-06, + "loss": 0.848, + "step": 14775 + }, + { + "epoch": 8.333897349125776, + "grad_norm": 1.0446677207946777, + "learning_rate": 8.333333333333334e-06, + "loss": 0.764, + "step": 14776 + }, + { + "epoch": 8.334461364918218, + "grad_norm": 0.965808093547821, + "learning_rate": 8.330513254371123e-06, + "loss": 0.7908, + "step": 14777 + }, + { + "epoch": 8.33502538071066, + "grad_norm": 1.1444776058197021, + "learning_rate": 8.327693175408911e-06, + "loss": 0.8098, + "step": 14778 + }, + { + "epoch": 8.335589396503103, + "grad_norm": 1.1518056392669678, + "learning_rate": 8.3248730964467e-06, + "loss": 0.7458, + "step": 14779 + }, + { + "epoch": 8.336153412295545, + "grad_norm": 1.1347407102584839, + "learning_rate": 8.322053017484489e-06, + "loss": 0.7284, + "step": 14780 + }, + { + "epoch": 8.336717428087987, + "grad_norm": 1.188822627067566, + "learning_rate": 8.319232938522278e-06, + "loss": 0.8419, + "step": 14781 + }, + { + "epoch": 8.337281443880428, + "grad_norm": 0.9359663128852844, + "learning_rate": 8.316412859560068e-06, + "loss": 0.7016, + "step": 14782 + }, + { + "epoch": 8.33784545967287, + "grad_norm": 1.413138508796692, + "learning_rate": 8.313592780597857e-06, + "loss": 0.7519, + "step": 14783 + }, + { + "epoch": 8.338409475465314, + "grad_norm": 1.040068507194519, + "learning_rate": 8.310772701635647e-06, + "loss": 0.7041, + "step": 14784 + }, + { + "epoch": 8.338973491257756, + "grad_norm": 1.026668667793274, + "learning_rate": 8.307952622673436e-06, + "loss": 0.7395, + "step": 14785 + }, + { + "epoch": 8.339537507050197, + "grad_norm": 1.1061224937438965, + "learning_rate": 8.305132543711225e-06, + "loss": 0.7134, + "step": 14786 + }, + { + "epoch": 8.34010152284264, + "grad_norm": 0.8845131993293762, + "learning_rate": 8.302312464749014e-06, + "loss": 0.7174, + "step": 14787 + }, + { + "epoch": 8.340665538635081, + "grad_norm": 1.1152594089508057, + "learning_rate": 8.299492385786803e-06, + "loss": 0.7164, + "step": 14788 + }, + { + "epoch": 8.341229554427525, + "grad_norm": 1.2388935089111328, + "learning_rate": 8.296672306824591e-06, + "loss": 0.7401, + "step": 14789 + }, + { + "epoch": 8.341793570219966, + "grad_norm": 0.9050916433334351, + "learning_rate": 8.29385222786238e-06, + "loss": 0.6451, + "step": 14790 + }, + { + "epoch": 8.342357586012408, + "grad_norm": 1.0844405889511108, + "learning_rate": 8.29103214890017e-06, + "loss": 0.6751, + "step": 14791 + }, + { + "epoch": 8.34292160180485, + "grad_norm": 1.1345009803771973, + "learning_rate": 8.28821206993796e-06, + "loss": 0.6986, + "step": 14792 + }, + { + "epoch": 8.343485617597294, + "grad_norm": 0.8092815279960632, + "learning_rate": 8.285391990975748e-06, + "loss": 0.6896, + "step": 14793 + }, + { + "epoch": 8.344049633389735, + "grad_norm": 1.0377577543258667, + "learning_rate": 8.282571912013537e-06, + "loss": 0.7375, + "step": 14794 + }, + { + "epoch": 8.344613649182177, + "grad_norm": 1.209452509880066, + "learning_rate": 8.279751833051326e-06, + "loss": 0.693, + "step": 14795 + }, + { + "epoch": 8.345177664974619, + "grad_norm": 1.0528860092163086, + "learning_rate": 8.276931754089115e-06, + "loss": 0.7322, + "step": 14796 + }, + { + "epoch": 8.34574168076706, + "grad_norm": 1.0340399742126465, + "learning_rate": 8.274111675126904e-06, + "loss": 0.7284, + "step": 14797 + }, + { + "epoch": 8.346305696559504, + "grad_norm": 1.0488417148590088, + "learning_rate": 8.271291596164692e-06, + "loss": 0.7575, + "step": 14798 + }, + { + "epoch": 8.346869712351946, + "grad_norm": 1.1983200311660767, + "learning_rate": 8.268471517202483e-06, + "loss": 0.7154, + "step": 14799 + }, + { + "epoch": 8.347433728144388, + "grad_norm": 1.3463603258132935, + "learning_rate": 8.265651438240272e-06, + "loss": 0.7013, + "step": 14800 + }, + { + "epoch": 8.34799774393683, + "grad_norm": 1.0312585830688477, + "learning_rate": 8.26283135927806e-06, + "loss": 0.5654, + "step": 14801 + }, + { + "epoch": 8.348561759729272, + "grad_norm": 1.1589910984039307, + "learning_rate": 8.26001128031585e-06, + "loss": 0.7274, + "step": 14802 + }, + { + "epoch": 8.349125775521715, + "grad_norm": 1.2404459714889526, + "learning_rate": 8.257191201353638e-06, + "loss": 0.7306, + "step": 14803 + }, + { + "epoch": 8.349689791314157, + "grad_norm": 0.8957176208496094, + "learning_rate": 8.254371122391427e-06, + "loss": 0.5652, + "step": 14804 + }, + { + "epoch": 8.350253807106599, + "grad_norm": 0.992577850818634, + "learning_rate": 8.251551043429216e-06, + "loss": 0.6198, + "step": 14805 + }, + { + "epoch": 8.35081782289904, + "grad_norm": 1.1736679077148438, + "learning_rate": 8.248730964467004e-06, + "loss": 0.7337, + "step": 14806 + }, + { + "epoch": 8.351381838691484, + "grad_norm": 1.0146602392196655, + "learning_rate": 8.245910885504795e-06, + "loss": 0.7645, + "step": 14807 + }, + { + "epoch": 8.351945854483926, + "grad_norm": 1.5467175245285034, + "learning_rate": 8.243090806542584e-06, + "loss": 0.8321, + "step": 14808 + }, + { + "epoch": 8.352509870276368, + "grad_norm": 0.9412670135498047, + "learning_rate": 8.240270727580372e-06, + "loss": 0.657, + "step": 14809 + }, + { + "epoch": 8.35307388606881, + "grad_norm": 1.0333486795425415, + "learning_rate": 8.237450648618161e-06, + "loss": 0.6895, + "step": 14810 + }, + { + "epoch": 8.353637901861251, + "grad_norm": 1.0445640087127686, + "learning_rate": 8.23463056965595e-06, + "loss": 0.7776, + "step": 14811 + }, + { + "epoch": 8.354201917653695, + "grad_norm": 0.8818947672843933, + "learning_rate": 8.231810490693739e-06, + "loss": 0.6836, + "step": 14812 + }, + { + "epoch": 8.354765933446137, + "grad_norm": 1.0464718341827393, + "learning_rate": 8.228990411731528e-06, + "loss": 0.7809, + "step": 14813 + }, + { + "epoch": 8.355329949238579, + "grad_norm": 1.192651629447937, + "learning_rate": 8.226170332769318e-06, + "loss": 0.6388, + "step": 14814 + }, + { + "epoch": 8.35589396503102, + "grad_norm": 1.214300513267517, + "learning_rate": 8.223350253807107e-06, + "loss": 0.8015, + "step": 14815 + }, + { + "epoch": 8.356457980823462, + "grad_norm": 1.0189135074615479, + "learning_rate": 8.220530174844896e-06, + "loss": 0.5829, + "step": 14816 + }, + { + "epoch": 8.357021996615906, + "grad_norm": 1.188607931137085, + "learning_rate": 8.217710095882685e-06, + "loss": 0.7302, + "step": 14817 + }, + { + "epoch": 8.357586012408348, + "grad_norm": 1.3026949167251587, + "learning_rate": 8.214890016920475e-06, + "loss": 0.7898, + "step": 14818 + }, + { + "epoch": 8.35815002820079, + "grad_norm": 1.3174656629562378, + "learning_rate": 8.212069937958264e-06, + "loss": 0.751, + "step": 14819 + }, + { + "epoch": 8.358714043993231, + "grad_norm": 1.3024004697799683, + "learning_rate": 8.209249858996053e-06, + "loss": 0.7078, + "step": 14820 + }, + { + "epoch": 8.359278059785675, + "grad_norm": 1.1291760206222534, + "learning_rate": 8.206429780033841e-06, + "loss": 0.7213, + "step": 14821 + }, + { + "epoch": 8.359842075578117, + "grad_norm": 0.9099069237709045, + "learning_rate": 8.20360970107163e-06, + "loss": 0.699, + "step": 14822 + }, + { + "epoch": 8.360406091370558, + "grad_norm": 1.331843614578247, + "learning_rate": 8.20078962210942e-06, + "loss": 0.7679, + "step": 14823 + }, + { + "epoch": 8.360970107163, + "grad_norm": 1.1917837858200073, + "learning_rate": 8.19796954314721e-06, + "loss": 0.7617, + "step": 14824 + }, + { + "epoch": 8.361534122955442, + "grad_norm": 0.9815728664398193, + "learning_rate": 8.195149464184998e-06, + "loss": 0.7568, + "step": 14825 + }, + { + "epoch": 8.362098138747886, + "grad_norm": 1.3929675817489624, + "learning_rate": 8.192329385222787e-06, + "loss": 0.6909, + "step": 14826 + }, + { + "epoch": 8.362662154540327, + "grad_norm": 1.1022471189498901, + "learning_rate": 8.189509306260576e-06, + "loss": 0.7676, + "step": 14827 + }, + { + "epoch": 8.36322617033277, + "grad_norm": 1.0886650085449219, + "learning_rate": 8.186689227298365e-06, + "loss": 0.781, + "step": 14828 + }, + { + "epoch": 8.363790186125211, + "grad_norm": 1.0543650388717651, + "learning_rate": 8.183869148336154e-06, + "loss": 0.7325, + "step": 14829 + }, + { + "epoch": 8.364354201917653, + "grad_norm": 1.286658763885498, + "learning_rate": 8.181049069373942e-06, + "loss": 0.7201, + "step": 14830 + }, + { + "epoch": 8.364918217710096, + "grad_norm": 0.9785083532333374, + "learning_rate": 8.178228990411733e-06, + "loss": 0.6858, + "step": 14831 + }, + { + "epoch": 8.365482233502538, + "grad_norm": 0.9919059872627258, + "learning_rate": 8.175408911449522e-06, + "loss": 0.6122, + "step": 14832 + }, + { + "epoch": 8.36604624929498, + "grad_norm": 1.1045076847076416, + "learning_rate": 8.17258883248731e-06, + "loss": 0.7227, + "step": 14833 + }, + { + "epoch": 8.366610265087422, + "grad_norm": 1.1214078664779663, + "learning_rate": 8.1697687535251e-06, + "loss": 0.7072, + "step": 14834 + }, + { + "epoch": 8.367174280879865, + "grad_norm": 1.0703446865081787, + "learning_rate": 8.166948674562888e-06, + "loss": 0.6769, + "step": 14835 + }, + { + "epoch": 8.367738296672307, + "grad_norm": 1.4124350547790527, + "learning_rate": 8.164128595600677e-06, + "loss": 0.6971, + "step": 14836 + }, + { + "epoch": 8.368302312464749, + "grad_norm": 0.9632684588432312, + "learning_rate": 8.161308516638466e-06, + "loss": 0.7855, + "step": 14837 + }, + { + "epoch": 8.368866328257191, + "grad_norm": 1.2223870754241943, + "learning_rate": 8.158488437676254e-06, + "loss": 0.7789, + "step": 14838 + }, + { + "epoch": 8.369430344049633, + "grad_norm": 0.7211498022079468, + "learning_rate": 8.155668358714045e-06, + "loss": 0.5958, + "step": 14839 + }, + { + "epoch": 8.369994359842076, + "grad_norm": 1.0453917980194092, + "learning_rate": 8.152848279751834e-06, + "loss": 0.7226, + "step": 14840 + }, + { + "epoch": 8.370558375634518, + "grad_norm": 1.5130912065505981, + "learning_rate": 8.150028200789622e-06, + "loss": 0.7552, + "step": 14841 + }, + { + "epoch": 8.37112239142696, + "grad_norm": 1.4445985555648804, + "learning_rate": 8.147208121827411e-06, + "loss": 0.9036, + "step": 14842 + }, + { + "epoch": 8.371686407219402, + "grad_norm": 0.854344367980957, + "learning_rate": 8.1443880428652e-06, + "loss": 0.6652, + "step": 14843 + }, + { + "epoch": 8.372250423011844, + "grad_norm": 0.8924941420555115, + "learning_rate": 8.141567963902989e-06, + "loss": 0.6713, + "step": 14844 + }, + { + "epoch": 8.372814438804287, + "grad_norm": 0.925161600112915, + "learning_rate": 8.138747884940778e-06, + "loss": 0.7371, + "step": 14845 + }, + { + "epoch": 8.373378454596729, + "grad_norm": 0.8519854545593262, + "learning_rate": 8.135927805978568e-06, + "loss": 0.6059, + "step": 14846 + }, + { + "epoch": 8.37394247038917, + "grad_norm": 1.3397693634033203, + "learning_rate": 8.133107727016357e-06, + "loss": 0.8171, + "step": 14847 + }, + { + "epoch": 8.374506486181613, + "grad_norm": 1.2100509405136108, + "learning_rate": 8.130287648054146e-06, + "loss": 0.7448, + "step": 14848 + }, + { + "epoch": 8.375070501974056, + "grad_norm": 1.247794270515442, + "learning_rate": 8.127467569091935e-06, + "loss": 0.7392, + "step": 14849 + }, + { + "epoch": 8.375634517766498, + "grad_norm": 1.0564240217208862, + "learning_rate": 8.124647490129723e-06, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 8.37619853355894, + "grad_norm": 1.0817304849624634, + "learning_rate": 8.121827411167512e-06, + "loss": 0.8249, + "step": 14851 + }, + { + "epoch": 8.376762549351382, + "grad_norm": 1.3554819822311401, + "learning_rate": 8.119007332205301e-06, + "loss": 0.6839, + "step": 14852 + }, + { + "epoch": 8.377326565143823, + "grad_norm": 1.3223354816436768, + "learning_rate": 8.116187253243091e-06, + "loss": 0.745, + "step": 14853 + }, + { + "epoch": 8.377890580936267, + "grad_norm": 1.3630825281143188, + "learning_rate": 8.11336717428088e-06, + "loss": 0.7462, + "step": 14854 + }, + { + "epoch": 8.378454596728709, + "grad_norm": 1.4267535209655762, + "learning_rate": 8.11054709531867e-06, + "loss": 0.7703, + "step": 14855 + }, + { + "epoch": 8.37901861252115, + "grad_norm": 1.4715306758880615, + "learning_rate": 8.10772701635646e-06, + "loss": 0.7748, + "step": 14856 + }, + { + "epoch": 8.379582628313592, + "grad_norm": 1.3941630125045776, + "learning_rate": 8.104906937394248e-06, + "loss": 0.8133, + "step": 14857 + }, + { + "epoch": 8.380146644106034, + "grad_norm": 1.2871387004852295, + "learning_rate": 8.102086858432037e-06, + "loss": 0.7476, + "step": 14858 + }, + { + "epoch": 8.380710659898478, + "grad_norm": 1.1262444257736206, + "learning_rate": 8.099266779469826e-06, + "loss": 0.6738, + "step": 14859 + }, + { + "epoch": 8.38127467569092, + "grad_norm": 1.1983343362808228, + "learning_rate": 8.096446700507615e-06, + "loss": 0.6461, + "step": 14860 + }, + { + "epoch": 8.381838691483361, + "grad_norm": 1.0568450689315796, + "learning_rate": 8.093626621545403e-06, + "loss": 0.6761, + "step": 14861 + }, + { + "epoch": 8.382402707275803, + "grad_norm": 1.0042394399642944, + "learning_rate": 8.090806542583192e-06, + "loss": 0.7026, + "step": 14862 + }, + { + "epoch": 8.382966723068247, + "grad_norm": 1.329291820526123, + "learning_rate": 8.087986463620983e-06, + "loss": 0.7726, + "step": 14863 + }, + { + "epoch": 8.383530738860689, + "grad_norm": 1.078692078590393, + "learning_rate": 8.085166384658772e-06, + "loss": 0.6831, + "step": 14864 + }, + { + "epoch": 8.38409475465313, + "grad_norm": 1.4808740615844727, + "learning_rate": 8.08234630569656e-06, + "loss": 0.6964, + "step": 14865 + }, + { + "epoch": 8.384658770445572, + "grad_norm": 1.1421031951904297, + "learning_rate": 8.079526226734349e-06, + "loss": 0.7405, + "step": 14866 + }, + { + "epoch": 8.385222786238014, + "grad_norm": 1.9792863130569458, + "learning_rate": 8.076706147772138e-06, + "loss": 0.8031, + "step": 14867 + }, + { + "epoch": 8.385786802030458, + "grad_norm": 1.1213303804397583, + "learning_rate": 8.073886068809927e-06, + "loss": 0.6935, + "step": 14868 + }, + { + "epoch": 8.3863508178229, + "grad_norm": 1.4756401777267456, + "learning_rate": 8.071065989847716e-06, + "loss": 0.7449, + "step": 14869 + }, + { + "epoch": 8.386914833615341, + "grad_norm": 1.2166117429733276, + "learning_rate": 8.068245910885506e-06, + "loss": 0.7509, + "step": 14870 + }, + { + "epoch": 8.387478849407783, + "grad_norm": 1.0651090145111084, + "learning_rate": 8.065425831923295e-06, + "loss": 0.6597, + "step": 14871 + }, + { + "epoch": 8.388042865200225, + "grad_norm": 1.2776036262512207, + "learning_rate": 8.062605752961084e-06, + "loss": 0.7414, + "step": 14872 + }, + { + "epoch": 8.388606880992668, + "grad_norm": 1.2027548551559448, + "learning_rate": 8.059785673998872e-06, + "loss": 0.6643, + "step": 14873 + }, + { + "epoch": 8.38917089678511, + "grad_norm": 1.2607258558273315, + "learning_rate": 8.056965595036661e-06, + "loss": 0.7642, + "step": 14874 + }, + { + "epoch": 8.389734912577552, + "grad_norm": 1.1039398908615112, + "learning_rate": 8.05414551607445e-06, + "loss": 0.7483, + "step": 14875 + }, + { + "epoch": 8.390298928369994, + "grad_norm": 1.0198602676391602, + "learning_rate": 8.051325437112239e-06, + "loss": 0.6824, + "step": 14876 + }, + { + "epoch": 8.390862944162437, + "grad_norm": 1.0681679248809814, + "learning_rate": 8.048505358150028e-06, + "loss": 0.7809, + "step": 14877 + }, + { + "epoch": 8.39142695995488, + "grad_norm": 1.3124432563781738, + "learning_rate": 8.045685279187818e-06, + "loss": 0.765, + "step": 14878 + }, + { + "epoch": 8.391990975747321, + "grad_norm": 1.0148530006408691, + "learning_rate": 8.042865200225607e-06, + "loss": 0.7955, + "step": 14879 + }, + { + "epoch": 8.392554991539763, + "grad_norm": 1.180572509765625, + "learning_rate": 8.040045121263396e-06, + "loss": 0.8103, + "step": 14880 + }, + { + "epoch": 8.393119007332205, + "grad_norm": 0.8119643926620483, + "learning_rate": 8.037225042301184e-06, + "loss": 0.6811, + "step": 14881 + }, + { + "epoch": 8.393683023124648, + "grad_norm": 0.8296633362770081, + "learning_rate": 8.034404963338973e-06, + "loss": 0.6489, + "step": 14882 + }, + { + "epoch": 8.39424703891709, + "grad_norm": 1.0064482688903809, + "learning_rate": 8.031584884376762e-06, + "loss": 0.689, + "step": 14883 + }, + { + "epoch": 8.394811054709532, + "grad_norm": 0.9958735108375549, + "learning_rate": 8.028764805414551e-06, + "loss": 0.6827, + "step": 14884 + }, + { + "epoch": 8.395375070501974, + "grad_norm": 1.2529873847961426, + "learning_rate": 8.02594472645234e-06, + "loss": 0.7719, + "step": 14885 + }, + { + "epoch": 8.395939086294415, + "grad_norm": 0.9372895956039429, + "learning_rate": 8.02312464749013e-06, + "loss": 0.7846, + "step": 14886 + }, + { + "epoch": 8.396503102086859, + "grad_norm": 1.2735443115234375, + "learning_rate": 8.020304568527919e-06, + "loss": 0.6529, + "step": 14887 + }, + { + "epoch": 8.3970671178793, + "grad_norm": 0.9084073901176453, + "learning_rate": 8.01748448956571e-06, + "loss": 0.741, + "step": 14888 + }, + { + "epoch": 8.397631133671743, + "grad_norm": 0.9710484743118286, + "learning_rate": 8.014664410603498e-06, + "loss": 0.7809, + "step": 14889 + }, + { + "epoch": 8.398195149464184, + "grad_norm": 1.1997123956680298, + "learning_rate": 8.011844331641287e-06, + "loss": 0.8291, + "step": 14890 + }, + { + "epoch": 8.398759165256628, + "grad_norm": 0.8331245183944702, + "learning_rate": 8.009024252679076e-06, + "loss": 0.7221, + "step": 14891 + }, + { + "epoch": 8.39932318104907, + "grad_norm": 0.8879786729812622, + "learning_rate": 8.006204173716865e-06, + "loss": 0.6055, + "step": 14892 + }, + { + "epoch": 8.399887196841512, + "grad_norm": 1.713029146194458, + "learning_rate": 8.003384094754653e-06, + "loss": 0.7398, + "step": 14893 + }, + { + "epoch": 8.400451212633953, + "grad_norm": 1.010624885559082, + "learning_rate": 8.000564015792442e-06, + "loss": 0.6101, + "step": 14894 + }, + { + "epoch": 8.401015228426395, + "grad_norm": 0.7909660935401917, + "learning_rate": 7.997743936830233e-06, + "loss": 0.6405, + "step": 14895 + }, + { + "epoch": 8.401579244218839, + "grad_norm": 0.9703121781349182, + "learning_rate": 7.994923857868022e-06, + "loss": 0.8133, + "step": 14896 + }, + { + "epoch": 8.40214326001128, + "grad_norm": 1.1050735712051392, + "learning_rate": 7.99210377890581e-06, + "loss": 0.7225, + "step": 14897 + }, + { + "epoch": 8.402707275803722, + "grad_norm": 1.0670698881149292, + "learning_rate": 7.989283699943599e-06, + "loss": 0.7553, + "step": 14898 + }, + { + "epoch": 8.403271291596164, + "grad_norm": 1.1727728843688965, + "learning_rate": 7.986463620981388e-06, + "loss": 0.7092, + "step": 14899 + }, + { + "epoch": 8.403835307388606, + "grad_norm": 1.6756969690322876, + "learning_rate": 7.983643542019177e-06, + "loss": 0.8778, + "step": 14900 + }, + { + "epoch": 8.40439932318105, + "grad_norm": 1.0106443166732788, + "learning_rate": 7.980823463056965e-06, + "loss": 0.7404, + "step": 14901 + }, + { + "epoch": 8.404963338973491, + "grad_norm": 0.9911119937896729, + "learning_rate": 7.978003384094756e-06, + "loss": 0.6983, + "step": 14902 + }, + { + "epoch": 8.405527354765933, + "grad_norm": 1.1414073705673218, + "learning_rate": 7.975183305132545e-06, + "loss": 0.73, + "step": 14903 + }, + { + "epoch": 8.406091370558375, + "grad_norm": 1.4740593433380127, + "learning_rate": 7.972363226170334e-06, + "loss": 0.699, + "step": 14904 + }, + { + "epoch": 8.406655386350819, + "grad_norm": 0.9021777510643005, + "learning_rate": 7.969543147208122e-06, + "loss": 0.6644, + "step": 14905 + }, + { + "epoch": 8.40721940214326, + "grad_norm": 0.9842178821563721, + "learning_rate": 7.966723068245911e-06, + "loss": 0.784, + "step": 14906 + }, + { + "epoch": 8.407783417935702, + "grad_norm": 1.050978660583496, + "learning_rate": 7.9639029892837e-06, + "loss": 0.6809, + "step": 14907 + }, + { + "epoch": 8.408347433728144, + "grad_norm": 1.0294240713119507, + "learning_rate": 7.961082910321489e-06, + "loss": 0.7419, + "step": 14908 + }, + { + "epoch": 8.408911449520586, + "grad_norm": 1.6057556867599487, + "learning_rate": 7.958262831359278e-06, + "loss": 0.6508, + "step": 14909 + }, + { + "epoch": 8.40947546531303, + "grad_norm": 1.3474377393722534, + "learning_rate": 7.955442752397068e-06, + "loss": 0.7795, + "step": 14910 + }, + { + "epoch": 8.410039481105471, + "grad_norm": 0.9504514336585999, + "learning_rate": 7.952622673434857e-06, + "loss": 0.6707, + "step": 14911 + }, + { + "epoch": 8.410603496897913, + "grad_norm": 1.358525276184082, + "learning_rate": 7.949802594472646e-06, + "loss": 0.6602, + "step": 14912 + }, + { + "epoch": 8.411167512690355, + "grad_norm": 1.247259497642517, + "learning_rate": 7.946982515510434e-06, + "loss": 0.7711, + "step": 14913 + }, + { + "epoch": 8.411731528482797, + "grad_norm": 1.0731215476989746, + "learning_rate": 7.944162436548223e-06, + "loss": 0.663, + "step": 14914 + }, + { + "epoch": 8.41229554427524, + "grad_norm": 1.3280155658721924, + "learning_rate": 7.941342357586012e-06, + "loss": 0.7276, + "step": 14915 + }, + { + "epoch": 8.412859560067682, + "grad_norm": 0.8425941467285156, + "learning_rate": 7.9385222786238e-06, + "loss": 0.7034, + "step": 14916 + }, + { + "epoch": 8.413423575860124, + "grad_norm": 1.1978697776794434, + "learning_rate": 7.93570219966159e-06, + "loss": 0.7866, + "step": 14917 + }, + { + "epoch": 8.413987591652566, + "grad_norm": 0.9989462494850159, + "learning_rate": 7.93288212069938e-06, + "loss": 0.7459, + "step": 14918 + }, + { + "epoch": 8.41455160744501, + "grad_norm": 1.0837154388427734, + "learning_rate": 7.930062041737169e-06, + "loss": 0.784, + "step": 14919 + }, + { + "epoch": 8.415115623237451, + "grad_norm": 1.1532952785491943, + "learning_rate": 7.927241962774958e-06, + "loss": 0.7328, + "step": 14920 + }, + { + "epoch": 8.415679639029893, + "grad_norm": 0.8777796626091003, + "learning_rate": 7.924421883812747e-06, + "loss": 0.6002, + "step": 14921 + }, + { + "epoch": 8.416243654822335, + "grad_norm": 1.2093801498413086, + "learning_rate": 7.921601804850535e-06, + "loss": 0.8018, + "step": 14922 + }, + { + "epoch": 8.416807670614777, + "grad_norm": 0.9380366802215576, + "learning_rate": 7.918781725888324e-06, + "loss": 0.7624, + "step": 14923 + }, + { + "epoch": 8.41737168640722, + "grad_norm": 1.1976631879806519, + "learning_rate": 7.915961646926115e-06, + "loss": 0.7353, + "step": 14924 + }, + { + "epoch": 8.417935702199662, + "grad_norm": 0.8591684699058533, + "learning_rate": 7.913141567963903e-06, + "loss": 0.6328, + "step": 14925 + }, + { + "epoch": 8.418499717992104, + "grad_norm": 1.0391805171966553, + "learning_rate": 7.910321489001692e-06, + "loss": 0.7548, + "step": 14926 + }, + { + "epoch": 8.419063733784546, + "grad_norm": 0.9126459956169128, + "learning_rate": 7.907501410039483e-06, + "loss": 0.6358, + "step": 14927 + }, + { + "epoch": 8.419627749576987, + "grad_norm": 1.1571255922317505, + "learning_rate": 7.904681331077271e-06, + "loss": 0.8847, + "step": 14928 + }, + { + "epoch": 8.420191765369431, + "grad_norm": 1.0009803771972656, + "learning_rate": 7.90186125211506e-06, + "loss": 0.7427, + "step": 14929 + }, + { + "epoch": 8.420755781161873, + "grad_norm": 1.002811074256897, + "learning_rate": 7.899041173152849e-06, + "loss": 0.7295, + "step": 14930 + }, + { + "epoch": 8.421319796954315, + "grad_norm": 1.2934373617172241, + "learning_rate": 7.896221094190638e-06, + "loss": 0.7686, + "step": 14931 + }, + { + "epoch": 8.421883812746756, + "grad_norm": 0.9789605140686035, + "learning_rate": 7.893401015228427e-06, + "loss": 0.7436, + "step": 14932 + }, + { + "epoch": 8.4224478285392, + "grad_norm": 1.18302583694458, + "learning_rate": 7.890580936266215e-06, + "loss": 0.8098, + "step": 14933 + }, + { + "epoch": 8.423011844331642, + "grad_norm": 1.6706980466842651, + "learning_rate": 7.887760857304006e-06, + "loss": 0.7712, + "step": 14934 + }, + { + "epoch": 8.423575860124084, + "grad_norm": 1.0056219100952148, + "learning_rate": 7.884940778341795e-06, + "loss": 0.7687, + "step": 14935 + }, + { + "epoch": 8.424139875916525, + "grad_norm": 1.0048973560333252, + "learning_rate": 7.882120699379584e-06, + "loss": 0.6923, + "step": 14936 + }, + { + "epoch": 8.424703891708967, + "grad_norm": 1.2230726480484009, + "learning_rate": 7.879300620417372e-06, + "loss": 0.7719, + "step": 14937 + }, + { + "epoch": 8.42526790750141, + "grad_norm": 1.0260028839111328, + "learning_rate": 7.876480541455161e-06, + "loss": 0.6574, + "step": 14938 + }, + { + "epoch": 8.425831923293853, + "grad_norm": 1.1220024824142456, + "learning_rate": 7.87366046249295e-06, + "loss": 0.7915, + "step": 14939 + }, + { + "epoch": 8.426395939086294, + "grad_norm": 1.2645471096038818, + "learning_rate": 7.870840383530739e-06, + "loss": 0.8835, + "step": 14940 + }, + { + "epoch": 8.426959954878736, + "grad_norm": 1.47337007522583, + "learning_rate": 7.868020304568528e-06, + "loss": 0.6708, + "step": 14941 + }, + { + "epoch": 8.427523970671178, + "grad_norm": 1.096015214920044, + "learning_rate": 7.865200225606318e-06, + "loss": 0.7043, + "step": 14942 + }, + { + "epoch": 8.428087986463622, + "grad_norm": 1.3364489078521729, + "learning_rate": 7.862380146644107e-06, + "loss": 0.7971, + "step": 14943 + }, + { + "epoch": 8.428652002256063, + "grad_norm": 0.9860238432884216, + "learning_rate": 7.859560067681896e-06, + "loss": 0.7172, + "step": 14944 + }, + { + "epoch": 8.429216018048505, + "grad_norm": 0.9061923623085022, + "learning_rate": 7.856739988719684e-06, + "loss": 0.7696, + "step": 14945 + }, + { + "epoch": 8.429780033840947, + "grad_norm": 0.9679127931594849, + "learning_rate": 7.853919909757473e-06, + "loss": 0.761, + "step": 14946 + }, + { + "epoch": 8.43034404963339, + "grad_norm": 1.0371006727218628, + "learning_rate": 7.851099830795262e-06, + "loss": 0.7122, + "step": 14947 + }, + { + "epoch": 8.430908065425832, + "grad_norm": 1.5122026205062866, + "learning_rate": 7.84827975183305e-06, + "loss": 0.8023, + "step": 14948 + }, + { + "epoch": 8.431472081218274, + "grad_norm": 1.1405292749404907, + "learning_rate": 7.845459672870841e-06, + "loss": 0.7391, + "step": 14949 + }, + { + "epoch": 8.432036097010716, + "grad_norm": 1.0531507730484009, + "learning_rate": 7.84263959390863e-06, + "loss": 0.5982, + "step": 14950 + }, + { + "epoch": 8.432600112803158, + "grad_norm": 1.3185335397720337, + "learning_rate": 7.839819514946419e-06, + "loss": 0.8165, + "step": 14951 + }, + { + "epoch": 8.433164128595601, + "grad_norm": 0.9955393075942993, + "learning_rate": 7.836999435984208e-06, + "loss": 0.6478, + "step": 14952 + }, + { + "epoch": 8.433728144388043, + "grad_norm": 0.9424549341201782, + "learning_rate": 7.834179357021996e-06, + "loss": 0.7216, + "step": 14953 + }, + { + "epoch": 8.434292160180485, + "grad_norm": 1.088079571723938, + "learning_rate": 7.831359278059785e-06, + "loss": 0.73, + "step": 14954 + }, + { + "epoch": 8.434856175972927, + "grad_norm": 1.329513430595398, + "learning_rate": 7.828539199097574e-06, + "loss": 0.7369, + "step": 14955 + }, + { + "epoch": 8.435420191765369, + "grad_norm": 0.9059813022613525, + "learning_rate": 7.825719120135363e-06, + "loss": 0.6787, + "step": 14956 + }, + { + "epoch": 8.435984207557812, + "grad_norm": 1.0688778162002563, + "learning_rate": 7.822899041173153e-06, + "loss": 0.7596, + "step": 14957 + }, + { + "epoch": 8.436548223350254, + "grad_norm": 1.180153250694275, + "learning_rate": 7.820078962210942e-06, + "loss": 0.7112, + "step": 14958 + }, + { + "epoch": 8.437112239142696, + "grad_norm": 1.1912717819213867, + "learning_rate": 7.817258883248733e-06, + "loss": 0.7569, + "step": 14959 + }, + { + "epoch": 8.437676254935138, + "grad_norm": 1.5776715278625488, + "learning_rate": 7.814438804286521e-06, + "loss": 0.7395, + "step": 14960 + }, + { + "epoch": 8.438240270727581, + "grad_norm": 1.2959620952606201, + "learning_rate": 7.81161872532431e-06, + "loss": 0.6991, + "step": 14961 + }, + { + "epoch": 8.438804286520023, + "grad_norm": 0.9843363761901855, + "learning_rate": 7.808798646362099e-06, + "loss": 0.701, + "step": 14962 + }, + { + "epoch": 8.439368302312465, + "grad_norm": 1.0567364692687988, + "learning_rate": 7.805978567399888e-06, + "loss": 0.6829, + "step": 14963 + }, + { + "epoch": 8.439932318104907, + "grad_norm": 0.8616993427276611, + "learning_rate": 7.803158488437677e-06, + "loss": 0.7075, + "step": 14964 + }, + { + "epoch": 8.440496333897348, + "grad_norm": 0.9013767242431641, + "learning_rate": 7.800338409475465e-06, + "loss": 0.7232, + "step": 14965 + }, + { + "epoch": 8.441060349689792, + "grad_norm": 1.3386013507843018, + "learning_rate": 7.797518330513256e-06, + "loss": 0.7303, + "step": 14966 + }, + { + "epoch": 8.441624365482234, + "grad_norm": 1.033847451210022, + "learning_rate": 7.794698251551045e-06, + "loss": 0.756, + "step": 14967 + }, + { + "epoch": 8.442188381274676, + "grad_norm": 1.358120083808899, + "learning_rate": 7.791878172588833e-06, + "loss": 0.8719, + "step": 14968 + }, + { + "epoch": 8.442752397067117, + "grad_norm": 1.185025691986084, + "learning_rate": 7.789058093626622e-06, + "loss": 0.7112, + "step": 14969 + }, + { + "epoch": 8.44331641285956, + "grad_norm": 1.2600386142730713, + "learning_rate": 7.786238014664411e-06, + "loss": 0.8157, + "step": 14970 + }, + { + "epoch": 8.443880428652003, + "grad_norm": 1.3030850887298584, + "learning_rate": 7.7834179357022e-06, + "loss": 0.7611, + "step": 14971 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 1.0873974561691284, + "learning_rate": 7.780597856739989e-06, + "loss": 0.7122, + "step": 14972 + }, + { + "epoch": 8.445008460236886, + "grad_norm": 1.5035772323608398, + "learning_rate": 7.777777777777777e-06, + "loss": 0.7934, + "step": 14973 + }, + { + "epoch": 8.445572476029328, + "grad_norm": 1.0091071128845215, + "learning_rate": 7.774957698815568e-06, + "loss": 0.793, + "step": 14974 + }, + { + "epoch": 8.446136491821772, + "grad_norm": 1.0316674709320068, + "learning_rate": 7.772137619853357e-06, + "loss": 0.6824, + "step": 14975 + }, + { + "epoch": 8.446700507614214, + "grad_norm": 0.9475125670433044, + "learning_rate": 7.769317540891146e-06, + "loss": 0.7892, + "step": 14976 + }, + { + "epoch": 8.447264523406655, + "grad_norm": 1.0940755605697632, + "learning_rate": 7.766497461928934e-06, + "loss": 0.6764, + "step": 14977 + }, + { + "epoch": 8.447828539199097, + "grad_norm": 1.3441095352172852, + "learning_rate": 7.763677382966723e-06, + "loss": 0.6891, + "step": 14978 + }, + { + "epoch": 8.448392554991539, + "grad_norm": 1.5407845973968506, + "learning_rate": 7.760857304004512e-06, + "loss": 0.7285, + "step": 14979 + }, + { + "epoch": 8.448956570783983, + "grad_norm": 1.0767629146575928, + "learning_rate": 7.7580372250423e-06, + "loss": 0.8531, + "step": 14980 + }, + { + "epoch": 8.449520586576424, + "grad_norm": 1.0241498947143555, + "learning_rate": 7.755217146080091e-06, + "loss": 0.7172, + "step": 14981 + }, + { + "epoch": 8.450084602368866, + "grad_norm": 1.103499174118042, + "learning_rate": 7.75239706711788e-06, + "loss": 0.7137, + "step": 14982 + }, + { + "epoch": 8.450648618161308, + "grad_norm": 1.2992866039276123, + "learning_rate": 7.749576988155669e-06, + "loss": 0.6824, + "step": 14983 + }, + { + "epoch": 8.45121263395375, + "grad_norm": 0.9624170660972595, + "learning_rate": 7.746756909193458e-06, + "loss": 0.6652, + "step": 14984 + }, + { + "epoch": 8.451776649746193, + "grad_norm": 1.4642291069030762, + "learning_rate": 7.743936830231246e-06, + "loss": 0.6975, + "step": 14985 + }, + { + "epoch": 8.452340665538635, + "grad_norm": 0.8672617077827454, + "learning_rate": 7.741116751269035e-06, + "loss": 0.6665, + "step": 14986 + }, + { + "epoch": 8.452904681331077, + "grad_norm": 2.948418378829956, + "learning_rate": 7.738296672306824e-06, + "loss": 0.6968, + "step": 14987 + }, + { + "epoch": 8.453468697123519, + "grad_norm": 1.483472228050232, + "learning_rate": 7.735476593344613e-06, + "loss": 0.8, + "step": 14988 + }, + { + "epoch": 8.454032712915962, + "grad_norm": 0.9111480116844177, + "learning_rate": 7.732656514382403e-06, + "loss": 0.6434, + "step": 14989 + }, + { + "epoch": 8.454596728708404, + "grad_norm": 1.0602643489837646, + "learning_rate": 7.729836435420192e-06, + "loss": 0.6701, + "step": 14990 + }, + { + "epoch": 8.455160744500846, + "grad_norm": 1.0268350839614868, + "learning_rate": 7.727016356457981e-06, + "loss": 0.6764, + "step": 14991 + }, + { + "epoch": 8.455724760293288, + "grad_norm": 0.963785707950592, + "learning_rate": 7.72419627749577e-06, + "loss": 0.7629, + "step": 14992 + }, + { + "epoch": 8.45628877608573, + "grad_norm": 1.0524944067001343, + "learning_rate": 7.721376198533558e-06, + "loss": 0.6971, + "step": 14993 + }, + { + "epoch": 8.456852791878173, + "grad_norm": 0.9926384687423706, + "learning_rate": 7.718556119571349e-06, + "loss": 0.7148, + "step": 14994 + }, + { + "epoch": 8.457416807670615, + "grad_norm": 0.8195834755897522, + "learning_rate": 7.715736040609138e-06, + "loss": 0.6838, + "step": 14995 + }, + { + "epoch": 8.457980823463057, + "grad_norm": 1.4161304235458374, + "learning_rate": 7.712915961646927e-06, + "loss": 0.7874, + "step": 14996 + }, + { + "epoch": 8.458544839255499, + "grad_norm": 1.2136950492858887, + "learning_rate": 7.710095882684715e-06, + "loss": 0.7419, + "step": 14997 + }, + { + "epoch": 8.45910885504794, + "grad_norm": 1.0875271558761597, + "learning_rate": 7.707275803722506e-06, + "loss": 0.6316, + "step": 14998 + }, + { + "epoch": 8.459672870840384, + "grad_norm": 1.2251660823822021, + "learning_rate": 7.704455724760295e-06, + "loss": 0.739, + "step": 14999 + }, + { + "epoch": 8.460236886632826, + "grad_norm": 1.2987197637557983, + "learning_rate": 7.701635645798083e-06, + "loss": 0.7811, + "step": 15000 + }, + { + "epoch": 8.460800902425268, + "grad_norm": 1.3454500436782837, + "learning_rate": 7.698815566835872e-06, + "loss": 0.8475, + "step": 15001 + }, + { + "epoch": 8.46136491821771, + "grad_norm": 0.9557936787605286, + "learning_rate": 7.695995487873661e-06, + "loss": 0.7639, + "step": 15002 + }, + { + "epoch": 8.461928934010153, + "grad_norm": 1.3074063062667847, + "learning_rate": 7.69317540891145e-06, + "loss": 0.6922, + "step": 15003 + }, + { + "epoch": 8.462492949802595, + "grad_norm": 1.0488160848617554, + "learning_rate": 7.690355329949239e-06, + "loss": 0.7772, + "step": 15004 + }, + { + "epoch": 8.463056965595037, + "grad_norm": 0.9458208084106445, + "learning_rate": 7.687535250987027e-06, + "loss": 0.7096, + "step": 15005 + }, + { + "epoch": 8.463620981387479, + "grad_norm": 0.8864594101905823, + "learning_rate": 7.684715172024818e-06, + "loss": 0.6319, + "step": 15006 + }, + { + "epoch": 8.46418499717992, + "grad_norm": 0.9436306357383728, + "learning_rate": 7.681895093062607e-06, + "loss": 0.7783, + "step": 15007 + }, + { + "epoch": 8.464749012972364, + "grad_norm": 0.7593825459480286, + "learning_rate": 7.679075014100396e-06, + "loss": 0.6459, + "step": 15008 + }, + { + "epoch": 8.465313028764806, + "grad_norm": 1.0517736673355103, + "learning_rate": 7.676254935138184e-06, + "loss": 0.7542, + "step": 15009 + }, + { + "epoch": 8.465877044557248, + "grad_norm": 1.3843797445297241, + "learning_rate": 7.673434856175973e-06, + "loss": 0.7715, + "step": 15010 + }, + { + "epoch": 8.46644106034969, + "grad_norm": 0.972857654094696, + "learning_rate": 7.670614777213762e-06, + "loss": 0.6985, + "step": 15011 + }, + { + "epoch": 8.467005076142131, + "grad_norm": 1.0607064962387085, + "learning_rate": 7.66779469825155e-06, + "loss": 0.8021, + "step": 15012 + }, + { + "epoch": 8.467569091934575, + "grad_norm": 0.9745750427246094, + "learning_rate": 7.664974619289341e-06, + "loss": 0.7305, + "step": 15013 + }, + { + "epoch": 8.468133107727017, + "grad_norm": 1.0327194929122925, + "learning_rate": 7.66215454032713e-06, + "loss": 0.7081, + "step": 15014 + }, + { + "epoch": 8.468697123519458, + "grad_norm": 1.1559077501296997, + "learning_rate": 7.659334461364919e-06, + "loss": 0.7213, + "step": 15015 + }, + { + "epoch": 8.4692611393119, + "grad_norm": 1.3429977893829346, + "learning_rate": 7.656514382402708e-06, + "loss": 0.7071, + "step": 15016 + }, + { + "epoch": 8.469825155104344, + "grad_norm": 0.984678328037262, + "learning_rate": 7.653694303440496e-06, + "loss": 0.7918, + "step": 15017 + }, + { + "epoch": 8.470389170896786, + "grad_norm": 1.3265153169631958, + "learning_rate": 7.650874224478285e-06, + "loss": 0.769, + "step": 15018 + }, + { + "epoch": 8.470953186689227, + "grad_norm": 1.0942813158035278, + "learning_rate": 7.648054145516074e-06, + "loss": 0.7368, + "step": 15019 + }, + { + "epoch": 8.47151720248167, + "grad_norm": 1.174698829650879, + "learning_rate": 7.645234066553863e-06, + "loss": 0.6771, + "step": 15020 + }, + { + "epoch": 8.472081218274111, + "grad_norm": 1.1351511478424072, + "learning_rate": 7.642413987591653e-06, + "loss": 0.6824, + "step": 15021 + }, + { + "epoch": 8.472645234066555, + "grad_norm": 1.1840460300445557, + "learning_rate": 7.639593908629442e-06, + "loss": 0.6906, + "step": 15022 + }, + { + "epoch": 8.473209249858996, + "grad_norm": 0.9757822155952454, + "learning_rate": 7.636773829667231e-06, + "loss": 0.75, + "step": 15023 + }, + { + "epoch": 8.473773265651438, + "grad_norm": 0.8901917338371277, + "learning_rate": 7.63395375070502e-06, + "loss": 0.6484, + "step": 15024 + }, + { + "epoch": 8.47433728144388, + "grad_norm": 1.4648821353912354, + "learning_rate": 7.631133671742808e-06, + "loss": 0.766, + "step": 15025 + }, + { + "epoch": 8.474901297236322, + "grad_norm": 0.8206763863563538, + "learning_rate": 7.628313592780597e-06, + "loss": 0.6727, + "step": 15026 + }, + { + "epoch": 8.475465313028765, + "grad_norm": 1.085073709487915, + "learning_rate": 7.625493513818387e-06, + "loss": 0.6615, + "step": 15027 + }, + { + "epoch": 8.476029328821207, + "grad_norm": 1.2487804889678955, + "learning_rate": 7.622673434856176e-06, + "loss": 0.765, + "step": 15028 + }, + { + "epoch": 8.476593344613649, + "grad_norm": 1.3121616840362549, + "learning_rate": 7.6198533558939645e-06, + "loss": 0.7481, + "step": 15029 + }, + { + "epoch": 8.47715736040609, + "grad_norm": 1.1783647537231445, + "learning_rate": 7.617033276931755e-06, + "loss": 0.6744, + "step": 15030 + }, + { + "epoch": 8.477721376198534, + "grad_norm": 0.8510147929191589, + "learning_rate": 7.614213197969544e-06, + "loss": 0.7137, + "step": 15031 + }, + { + "epoch": 8.478285391990976, + "grad_norm": 1.0658727884292603, + "learning_rate": 7.611393119007333e-06, + "loss": 0.7127, + "step": 15032 + }, + { + "epoch": 8.478849407783418, + "grad_norm": 1.000308871269226, + "learning_rate": 7.608573040045122e-06, + "loss": 0.7774, + "step": 15033 + }, + { + "epoch": 8.47941342357586, + "grad_norm": 0.8721605539321899, + "learning_rate": 7.605752961082911e-06, + "loss": 0.6847, + "step": 15034 + }, + { + "epoch": 8.479977439368302, + "grad_norm": 0.9843941926956177, + "learning_rate": 7.6029328821207e-06, + "loss": 0.6737, + "step": 15035 + }, + { + "epoch": 8.480541455160745, + "grad_norm": 1.099317193031311, + "learning_rate": 7.6001128031584894e-06, + "loss": 0.7681, + "step": 15036 + }, + { + "epoch": 8.481105470953187, + "grad_norm": 0.9297100901603699, + "learning_rate": 7.597292724196278e-06, + "loss": 0.6696, + "step": 15037 + }, + { + "epoch": 8.481669486745629, + "grad_norm": 1.194954752922058, + "learning_rate": 7.594472645234067e-06, + "loss": 0.7547, + "step": 15038 + }, + { + "epoch": 8.48223350253807, + "grad_norm": 0.9223700165748596, + "learning_rate": 7.591652566271857e-06, + "loss": 0.7452, + "step": 15039 + }, + { + "epoch": 8.482797518330512, + "grad_norm": 1.0678393840789795, + "learning_rate": 7.5888324873096455e-06, + "loss": 0.689, + "step": 15040 + }, + { + "epoch": 8.483361534122956, + "grad_norm": 1.571291446685791, + "learning_rate": 7.586012408347434e-06, + "loss": 0.8182, + "step": 15041 + }, + { + "epoch": 8.483925549915398, + "grad_norm": 1.1440399885177612, + "learning_rate": 7.583192329385223e-06, + "loss": 0.7471, + "step": 15042 + }, + { + "epoch": 8.48448956570784, + "grad_norm": 1.2423659563064575, + "learning_rate": 7.580372250423013e-06, + "loss": 0.8198, + "step": 15043 + }, + { + "epoch": 8.485053581500281, + "grad_norm": 1.2552623748779297, + "learning_rate": 7.5775521714608015e-06, + "loss": 0.7366, + "step": 15044 + }, + { + "epoch": 8.485617597292725, + "grad_norm": 0.9864591956138611, + "learning_rate": 7.57473209249859e-06, + "loss": 0.7546, + "step": 15045 + }, + { + "epoch": 8.486181613085167, + "grad_norm": 1.301412582397461, + "learning_rate": 7.571912013536379e-06, + "loss": 0.7892, + "step": 15046 + }, + { + "epoch": 8.486745628877609, + "grad_norm": 0.968721866607666, + "learning_rate": 7.569091934574169e-06, + "loss": 0.6823, + "step": 15047 + }, + { + "epoch": 8.48730964467005, + "grad_norm": 0.9290567636489868, + "learning_rate": 7.5662718556119575e-06, + "loss": 0.6348, + "step": 15048 + }, + { + "epoch": 8.487873660462492, + "grad_norm": 1.2687859535217285, + "learning_rate": 7.563451776649746e-06, + "loss": 0.8001, + "step": 15049 + }, + { + "epoch": 8.488437676254936, + "grad_norm": 1.491166353225708, + "learning_rate": 7.560631697687535e-06, + "loss": 0.707, + "step": 15050 + }, + { + "epoch": 8.489001692047378, + "grad_norm": 1.0414966344833374, + "learning_rate": 7.557811618725325e-06, + "loss": 0.6758, + "step": 15051 + }, + { + "epoch": 8.48956570783982, + "grad_norm": 1.2387478351593018, + "learning_rate": 7.5549915397631136e-06, + "loss": 0.6963, + "step": 15052 + }, + { + "epoch": 8.490129723632261, + "grad_norm": 1.0192700624465942, + "learning_rate": 7.552171460800902e-06, + "loss": 0.7319, + "step": 15053 + }, + { + "epoch": 8.490693739424703, + "grad_norm": 1.057407021522522, + "learning_rate": 7.549351381838691e-06, + "loss": 0.7702, + "step": 15054 + }, + { + "epoch": 8.491257755217147, + "grad_norm": 1.452993631362915, + "learning_rate": 7.546531302876481e-06, + "loss": 0.8012, + "step": 15055 + }, + { + "epoch": 8.491821771009588, + "grad_norm": 1.2649613618850708, + "learning_rate": 7.54371122391427e-06, + "loss": 0.7892, + "step": 15056 + }, + { + "epoch": 8.49238578680203, + "grad_norm": 1.4271585941314697, + "learning_rate": 7.540891144952058e-06, + "loss": 0.7562, + "step": 15057 + }, + { + "epoch": 8.492949802594472, + "grad_norm": 0.9514934420585632, + "learning_rate": 7.538071065989848e-06, + "loss": 0.7374, + "step": 15058 + }, + { + "epoch": 8.493513818386916, + "grad_norm": 0.9849633574485779, + "learning_rate": 7.535250987027637e-06, + "loss": 0.6567, + "step": 15059 + }, + { + "epoch": 8.494077834179357, + "grad_norm": 1.209430456161499, + "learning_rate": 7.532430908065426e-06, + "loss": 0.6802, + "step": 15060 + }, + { + "epoch": 8.4946418499718, + "grad_norm": 1.6451332569122314, + "learning_rate": 7.5296108291032144e-06, + "loss": 0.7592, + "step": 15061 + }, + { + "epoch": 8.495205865764241, + "grad_norm": 1.0619696378707886, + "learning_rate": 7.526790750141004e-06, + "loss": 0.7223, + "step": 15062 + }, + { + "epoch": 8.495769881556683, + "grad_norm": 1.0014559030532837, + "learning_rate": 7.523970671178793e-06, + "loss": 0.7215, + "step": 15063 + }, + { + "epoch": 8.496333897349126, + "grad_norm": 1.005547285079956, + "learning_rate": 7.521150592216582e-06, + "loss": 0.6542, + "step": 15064 + }, + { + "epoch": 8.496897913141568, + "grad_norm": 1.0576857328414917, + "learning_rate": 7.518330513254372e-06, + "loss": 0.7618, + "step": 15065 + }, + { + "epoch": 8.49746192893401, + "grad_norm": 0.9427942633628845, + "learning_rate": 7.515510434292161e-06, + "loss": 0.6557, + "step": 15066 + }, + { + "epoch": 8.498025944726452, + "grad_norm": 0.9539085626602173, + "learning_rate": 7.512690355329951e-06, + "loss": 0.613, + "step": 15067 + }, + { + "epoch": 8.498589960518894, + "grad_norm": 1.2347713708877563, + "learning_rate": 7.509870276367739e-06, + "loss": 0.7427, + "step": 15068 + }, + { + "epoch": 8.499153976311337, + "grad_norm": 1.2285445928573608, + "learning_rate": 7.507050197405528e-06, + "loss": 0.772, + "step": 15069 + }, + { + "epoch": 8.499717992103779, + "grad_norm": 0.9951485395431519, + "learning_rate": 7.504230118443317e-06, + "loss": 0.7219, + "step": 15070 + }, + { + "epoch": 8.500282007896221, + "grad_norm": 1.1914209127426147, + "learning_rate": 7.501410039481107e-06, + "loss": 0.7182, + "step": 15071 + }, + { + "epoch": 8.500846023688663, + "grad_norm": 1.3411368131637573, + "learning_rate": 7.4985899605188954e-06, + "loss": 0.7842, + "step": 15072 + }, + { + "epoch": 8.501410039481106, + "grad_norm": 1.2831891775131226, + "learning_rate": 7.495769881556684e-06, + "loss": 0.7647, + "step": 15073 + }, + { + "epoch": 8.501974055273548, + "grad_norm": 0.8909532427787781, + "learning_rate": 7.492949802594473e-06, + "loss": 0.7174, + "step": 15074 + }, + { + "epoch": 8.50253807106599, + "grad_norm": 1.3049360513687134, + "learning_rate": 7.490129723632263e-06, + "loss": 0.6966, + "step": 15075 + }, + { + "epoch": 8.503102086858432, + "grad_norm": 1.412278652191162, + "learning_rate": 7.4873096446700515e-06, + "loss": 0.7719, + "step": 15076 + }, + { + "epoch": 8.503666102650874, + "grad_norm": 1.2755563259124756, + "learning_rate": 7.48448956570784e-06, + "loss": 0.7347, + "step": 15077 + }, + { + "epoch": 8.504230118443317, + "grad_norm": 1.3162107467651367, + "learning_rate": 7.481669486745629e-06, + "loss": 0.717, + "step": 15078 + }, + { + "epoch": 8.504794134235759, + "grad_norm": 1.2792129516601562, + "learning_rate": 7.478849407783419e-06, + "loss": 0.7299, + "step": 15079 + }, + { + "epoch": 8.5053581500282, + "grad_norm": 0.947776734828949, + "learning_rate": 7.4760293288212075e-06, + "loss": 0.5884, + "step": 15080 + }, + { + "epoch": 8.505922165820643, + "grad_norm": 1.221564531326294, + "learning_rate": 7.473209249858996e-06, + "loss": 0.8528, + "step": 15081 + }, + { + "epoch": 8.506486181613084, + "grad_norm": 1.2020912170410156, + "learning_rate": 7.470389170896785e-06, + "loss": 0.7947, + "step": 15082 + }, + { + "epoch": 8.507050197405528, + "grad_norm": 1.1408647298812866, + "learning_rate": 7.467569091934575e-06, + "loss": 0.7456, + "step": 15083 + }, + { + "epoch": 8.50761421319797, + "grad_norm": 1.426596999168396, + "learning_rate": 7.4647490129723635e-06, + "loss": 0.775, + "step": 15084 + }, + { + "epoch": 8.508178228990412, + "grad_norm": 1.1576511859893799, + "learning_rate": 7.461928934010152e-06, + "loss": 0.7403, + "step": 15085 + }, + { + "epoch": 8.508742244782853, + "grad_norm": 1.1532018184661865, + "learning_rate": 7.459108855047941e-06, + "loss": 0.7956, + "step": 15086 + }, + { + "epoch": 8.509306260575297, + "grad_norm": 1.7106486558914185, + "learning_rate": 7.456288776085731e-06, + "loss": 0.8654, + "step": 15087 + }, + { + "epoch": 8.509870276367739, + "grad_norm": 1.3459895849227905, + "learning_rate": 7.4534686971235196e-06, + "loss": 0.8028, + "step": 15088 + }, + { + "epoch": 8.51043429216018, + "grad_norm": 1.1952837705612183, + "learning_rate": 7.450648618161308e-06, + "loss": 0.7489, + "step": 15089 + }, + { + "epoch": 8.510998307952622, + "grad_norm": 1.1852314472198486, + "learning_rate": 7.447828539199098e-06, + "loss": 0.7039, + "step": 15090 + }, + { + "epoch": 8.511562323745064, + "grad_norm": 1.332240104675293, + "learning_rate": 7.445008460236887e-06, + "loss": 0.7979, + "step": 15091 + }, + { + "epoch": 8.512126339537508, + "grad_norm": 1.295633316040039, + "learning_rate": 7.442188381274676e-06, + "loss": 0.7641, + "step": 15092 + }, + { + "epoch": 8.51269035532995, + "grad_norm": 1.2338340282440186, + "learning_rate": 7.439368302312464e-06, + "loss": 0.7679, + "step": 15093 + }, + { + "epoch": 8.513254371122391, + "grad_norm": 1.2664657831192017, + "learning_rate": 7.436548223350254e-06, + "loss": 0.8617, + "step": 15094 + }, + { + "epoch": 8.513818386914833, + "grad_norm": 0.8468585014343262, + "learning_rate": 7.433728144388043e-06, + "loss": 0.7144, + "step": 15095 + }, + { + "epoch": 8.514382402707275, + "grad_norm": 1.0507224798202515, + "learning_rate": 7.430908065425832e-06, + "loss": 0.7701, + "step": 15096 + }, + { + "epoch": 8.514946418499719, + "grad_norm": 1.3229146003723145, + "learning_rate": 7.4280879864636204e-06, + "loss": 0.7548, + "step": 15097 + }, + { + "epoch": 8.51551043429216, + "grad_norm": 1.0580941438674927, + "learning_rate": 7.42526790750141e-06, + "loss": 0.7784, + "step": 15098 + }, + { + "epoch": 8.516074450084602, + "grad_norm": 1.1572388410568237, + "learning_rate": 7.422447828539199e-06, + "loss": 0.6787, + "step": 15099 + }, + { + "epoch": 8.516638465877044, + "grad_norm": 0.9883232116699219, + "learning_rate": 7.419627749576989e-06, + "loss": 0.6918, + "step": 15100 + }, + { + "epoch": 8.517202481669488, + "grad_norm": 1.1012508869171143, + "learning_rate": 7.416807670614778e-06, + "loss": 0.7077, + "step": 15101 + }, + { + "epoch": 8.51776649746193, + "grad_norm": 0.8669304847717285, + "learning_rate": 7.413987591652567e-06, + "loss": 0.7101, + "step": 15102 + }, + { + "epoch": 8.518330513254371, + "grad_norm": 1.0250515937805176, + "learning_rate": 7.411167512690357e-06, + "loss": 0.6124, + "step": 15103 + }, + { + "epoch": 8.518894529046813, + "grad_norm": 1.1222193241119385, + "learning_rate": 7.408347433728145e-06, + "loss": 0.7365, + "step": 15104 + }, + { + "epoch": 8.519458544839255, + "grad_norm": 1.1477006673812866, + "learning_rate": 7.405527354765934e-06, + "loss": 0.7057, + "step": 15105 + }, + { + "epoch": 8.520022560631698, + "grad_norm": 0.7858308553695679, + "learning_rate": 7.402707275803723e-06, + "loss": 0.6433, + "step": 15106 + }, + { + "epoch": 8.52058657642414, + "grad_norm": 1.3657116889953613, + "learning_rate": 7.399887196841513e-06, + "loss": 0.7058, + "step": 15107 + }, + { + "epoch": 8.521150592216582, + "grad_norm": 1.030246376991272, + "learning_rate": 7.3970671178793014e-06, + "loss": 0.6349, + "step": 15108 + }, + { + "epoch": 8.521714608009024, + "grad_norm": 1.1437181234359741, + "learning_rate": 7.39424703891709e-06, + "loss": 0.7554, + "step": 15109 + }, + { + "epoch": 8.522278623801466, + "grad_norm": 1.8174529075622559, + "learning_rate": 7.391426959954879e-06, + "loss": 0.7049, + "step": 15110 + }, + { + "epoch": 8.52284263959391, + "grad_norm": 1.6279679536819458, + "learning_rate": 7.388606880992669e-06, + "loss": 0.8647, + "step": 15111 + }, + { + "epoch": 8.523406655386351, + "grad_norm": 1.1401135921478271, + "learning_rate": 7.3857868020304575e-06, + "loss": 0.6758, + "step": 15112 + }, + { + "epoch": 8.523970671178793, + "grad_norm": 1.0064069032669067, + "learning_rate": 7.382966723068246e-06, + "loss": 0.6816, + "step": 15113 + }, + { + "epoch": 8.524534686971235, + "grad_norm": 1.2896983623504639, + "learning_rate": 7.380146644106035e-06, + "loss": 0.7782, + "step": 15114 + }, + { + "epoch": 8.525098702763678, + "grad_norm": 1.029151201248169, + "learning_rate": 7.377326565143825e-06, + "loss": 0.742, + "step": 15115 + }, + { + "epoch": 8.52566271855612, + "grad_norm": 1.1235960721969604, + "learning_rate": 7.3745064861816135e-06, + "loss": 0.7322, + "step": 15116 + }, + { + "epoch": 8.526226734348562, + "grad_norm": 0.9689204692840576, + "learning_rate": 7.371686407219402e-06, + "loss": 0.6172, + "step": 15117 + }, + { + "epoch": 8.526790750141004, + "grad_norm": 1.105353593826294, + "learning_rate": 7.368866328257192e-06, + "loss": 0.75, + "step": 15118 + }, + { + "epoch": 8.527354765933445, + "grad_norm": 0.9836883544921875, + "learning_rate": 7.366046249294981e-06, + "loss": 0.7166, + "step": 15119 + }, + { + "epoch": 8.527918781725889, + "grad_norm": 1.1746699810028076, + "learning_rate": 7.3632261703327695e-06, + "loss": 0.7208, + "step": 15120 + }, + { + "epoch": 8.52848279751833, + "grad_norm": 1.3170769214630127, + "learning_rate": 7.360406091370558e-06, + "loss": 0.7825, + "step": 15121 + }, + { + "epoch": 8.529046813310773, + "grad_norm": 1.0907320976257324, + "learning_rate": 7.357586012408348e-06, + "loss": 0.7224, + "step": 15122 + }, + { + "epoch": 8.529610829103214, + "grad_norm": 1.1680041551589966, + "learning_rate": 7.354765933446137e-06, + "loss": 0.7459, + "step": 15123 + }, + { + "epoch": 8.530174844895656, + "grad_norm": 1.0280462503433228, + "learning_rate": 7.3519458544839256e-06, + "loss": 0.7367, + "step": 15124 + }, + { + "epoch": 8.5307388606881, + "grad_norm": 1.3119710683822632, + "learning_rate": 7.349125775521714e-06, + "loss": 0.811, + "step": 15125 + }, + { + "epoch": 8.531302876480542, + "grad_norm": 0.9633259773254395, + "learning_rate": 7.346305696559504e-06, + "loss": 0.6797, + "step": 15126 + }, + { + "epoch": 8.531866892272983, + "grad_norm": 1.2816147804260254, + "learning_rate": 7.343485617597293e-06, + "loss": 0.7723, + "step": 15127 + }, + { + "epoch": 8.532430908065425, + "grad_norm": 0.946269154548645, + "learning_rate": 7.340665538635082e-06, + "loss": 0.6476, + "step": 15128 + }, + { + "epoch": 8.532994923857869, + "grad_norm": 0.9205780029296875, + "learning_rate": 7.33784545967287e-06, + "loss": 0.638, + "step": 15129 + }, + { + "epoch": 8.53355893965031, + "grad_norm": 1.0405505895614624, + "learning_rate": 7.33502538071066e-06, + "loss": 0.7086, + "step": 15130 + }, + { + "epoch": 8.534122955442752, + "grad_norm": 1.0170912742614746, + "learning_rate": 7.332205301748449e-06, + "loss": 0.6847, + "step": 15131 + }, + { + "epoch": 8.534686971235194, + "grad_norm": 1.0456801652908325, + "learning_rate": 7.329385222786238e-06, + "loss": 0.6627, + "step": 15132 + }, + { + "epoch": 8.535250987027636, + "grad_norm": 1.0178802013397217, + "learning_rate": 7.3265651438240264e-06, + "loss": 0.751, + "step": 15133 + }, + { + "epoch": 8.53581500282008, + "grad_norm": 0.9875189661979675, + "learning_rate": 7.323745064861816e-06, + "loss": 0.7085, + "step": 15134 + }, + { + "epoch": 8.536379018612521, + "grad_norm": 1.4220050573349, + "learning_rate": 7.320924985899605e-06, + "loss": 0.8546, + "step": 15135 + }, + { + "epoch": 8.536943034404963, + "grad_norm": 0.9418342709541321, + "learning_rate": 7.318104906937395e-06, + "loss": 0.641, + "step": 15136 + }, + { + "epoch": 8.537507050197405, + "grad_norm": 0.9452613592147827, + "learning_rate": 7.315284827975184e-06, + "loss": 0.6758, + "step": 15137 + }, + { + "epoch": 8.538071065989847, + "grad_norm": 0.9116767644882202, + "learning_rate": 7.312464749012973e-06, + "loss": 0.755, + "step": 15138 + }, + { + "epoch": 8.53863508178229, + "grad_norm": 1.1533137559890747, + "learning_rate": 7.309644670050763e-06, + "loss": 0.7396, + "step": 15139 + }, + { + "epoch": 8.539199097574732, + "grad_norm": 1.364820957183838, + "learning_rate": 7.306824591088551e-06, + "loss": 0.749, + "step": 15140 + }, + { + "epoch": 8.539763113367174, + "grad_norm": 1.0950853824615479, + "learning_rate": 7.30400451212634e-06, + "loss": 0.6262, + "step": 15141 + }, + { + "epoch": 8.540327129159616, + "grad_norm": 1.031322956085205, + "learning_rate": 7.301184433164129e-06, + "loss": 0.7178, + "step": 15142 + }, + { + "epoch": 8.54089114495206, + "grad_norm": 0.8723858594894409, + "learning_rate": 7.298364354201919e-06, + "loss": 0.6079, + "step": 15143 + }, + { + "epoch": 8.541455160744501, + "grad_norm": 0.9238008856773376, + "learning_rate": 7.295544275239707e-06, + "loss": 0.6968, + "step": 15144 + }, + { + "epoch": 8.542019176536943, + "grad_norm": 0.9409024119377136, + "learning_rate": 7.292724196277496e-06, + "loss": 0.7563, + "step": 15145 + }, + { + "epoch": 8.542583192329385, + "grad_norm": 1.2169263362884521, + "learning_rate": 7.289904117315286e-06, + "loss": 0.7921, + "step": 15146 + }, + { + "epoch": 8.543147208121827, + "grad_norm": 1.3865070343017578, + "learning_rate": 7.287084038353075e-06, + "loss": 0.7719, + "step": 15147 + }, + { + "epoch": 8.54371122391427, + "grad_norm": 1.0855891704559326, + "learning_rate": 7.2842639593908635e-06, + "loss": 0.6918, + "step": 15148 + }, + { + "epoch": 8.544275239706712, + "grad_norm": 1.197250485420227, + "learning_rate": 7.281443880428652e-06, + "loss": 0.7017, + "step": 15149 + }, + { + "epoch": 8.544839255499154, + "grad_norm": 0.9891507625579834, + "learning_rate": 7.278623801466442e-06, + "loss": 0.7333, + "step": 15150 + }, + { + "epoch": 8.545403271291596, + "grad_norm": 1.0623739957809448, + "learning_rate": 7.275803722504231e-06, + "loss": 0.7455, + "step": 15151 + }, + { + "epoch": 8.545967287084038, + "grad_norm": 0.9582376480102539, + "learning_rate": 7.2729836435420195e-06, + "loss": 0.7065, + "step": 15152 + }, + { + "epoch": 8.546531302876481, + "grad_norm": 0.9112138152122498, + "learning_rate": 7.270163564579808e-06, + "loss": 0.7155, + "step": 15153 + }, + { + "epoch": 8.547095318668923, + "grad_norm": 1.241019606590271, + "learning_rate": 7.267343485617598e-06, + "loss": 0.7539, + "step": 15154 + }, + { + "epoch": 8.547659334461365, + "grad_norm": 1.1446647644042969, + "learning_rate": 7.264523406655387e-06, + "loss": 0.6988, + "step": 15155 + }, + { + "epoch": 8.548223350253807, + "grad_norm": 0.9058417081832886, + "learning_rate": 7.2617033276931755e-06, + "loss": 0.7008, + "step": 15156 + }, + { + "epoch": 8.54878736604625, + "grad_norm": 1.1546220779418945, + "learning_rate": 7.258883248730964e-06, + "loss": 0.7386, + "step": 15157 + }, + { + "epoch": 8.549351381838692, + "grad_norm": 1.069836139678955, + "learning_rate": 7.256063169768754e-06, + "loss": 0.7344, + "step": 15158 + }, + { + "epoch": 8.549915397631134, + "grad_norm": 1.305080771446228, + "learning_rate": 7.253243090806543e-06, + "loss": 0.7768, + "step": 15159 + }, + { + "epoch": 8.550479413423576, + "grad_norm": 1.265473484992981, + "learning_rate": 7.2504230118443316e-06, + "loss": 0.7667, + "step": 15160 + }, + { + "epoch": 8.551043429216017, + "grad_norm": 1.0672712326049805, + "learning_rate": 7.24760293288212e-06, + "loss": 0.7318, + "step": 15161 + }, + { + "epoch": 8.551607445008461, + "grad_norm": 1.0369902849197388, + "learning_rate": 7.24478285391991e-06, + "loss": 0.7477, + "step": 15162 + }, + { + "epoch": 8.552171460800903, + "grad_norm": 0.925971508026123, + "learning_rate": 7.241962774957699e-06, + "loss": 0.6127, + "step": 15163 + }, + { + "epoch": 8.552735476593345, + "grad_norm": 0.8692816495895386, + "learning_rate": 7.239142695995488e-06, + "loss": 0.7741, + "step": 15164 + }, + { + "epoch": 8.553299492385786, + "grad_norm": 0.9106912016868591, + "learning_rate": 7.236322617033276e-06, + "loss": 0.7386, + "step": 15165 + }, + { + "epoch": 8.553863508178228, + "grad_norm": 1.2161394357681274, + "learning_rate": 7.233502538071066e-06, + "loss": 0.6862, + "step": 15166 + }, + { + "epoch": 8.554427523970672, + "grad_norm": 1.1795710325241089, + "learning_rate": 7.230682459108855e-06, + "loss": 0.7867, + "step": 15167 + }, + { + "epoch": 8.554991539763114, + "grad_norm": 1.1193487644195557, + "learning_rate": 7.227862380146644e-06, + "loss": 0.8064, + "step": 15168 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 1.1100234985351562, + "learning_rate": 7.225042301184433e-06, + "loss": 0.7798, + "step": 15169 + }, + { + "epoch": 8.556119571347997, + "grad_norm": 0.7888591289520264, + "learning_rate": 7.222222222222222e-06, + "loss": 0.5449, + "step": 15170 + }, + { + "epoch": 8.55668358714044, + "grad_norm": 1.091304063796997, + "learning_rate": 7.2194021432600126e-06, + "loss": 0.7805, + "step": 15171 + }, + { + "epoch": 8.557247602932883, + "grad_norm": 1.1235806941986084, + "learning_rate": 7.216582064297801e-06, + "loss": 0.7839, + "step": 15172 + }, + { + "epoch": 8.557811618725324, + "grad_norm": 1.345531702041626, + "learning_rate": 7.21376198533559e-06, + "loss": 0.7754, + "step": 15173 + }, + { + "epoch": 8.558375634517766, + "grad_norm": 0.9771006107330322, + "learning_rate": 7.210941906373379e-06, + "loss": 0.7846, + "step": 15174 + }, + { + "epoch": 8.558939650310208, + "grad_norm": 1.1825950145721436, + "learning_rate": 7.208121827411169e-06, + "loss": 0.8195, + "step": 15175 + }, + { + "epoch": 8.559503666102652, + "grad_norm": 1.1138304471969604, + "learning_rate": 7.205301748448957e-06, + "loss": 0.7085, + "step": 15176 + }, + { + "epoch": 8.560067681895093, + "grad_norm": 0.9352763295173645, + "learning_rate": 7.202481669486746e-06, + "loss": 0.7508, + "step": 15177 + }, + { + "epoch": 8.560631697687535, + "grad_norm": 1.2387028932571411, + "learning_rate": 7.199661590524536e-06, + "loss": 0.8167, + "step": 15178 + }, + { + "epoch": 8.561195713479977, + "grad_norm": 0.9029189348220825, + "learning_rate": 7.196841511562325e-06, + "loss": 0.5819, + "step": 15179 + }, + { + "epoch": 8.561759729272419, + "grad_norm": 1.4263947010040283, + "learning_rate": 7.194021432600113e-06, + "loss": 0.8323, + "step": 15180 + }, + { + "epoch": 8.562323745064862, + "grad_norm": 1.4607467651367188, + "learning_rate": 7.191201353637902e-06, + "loss": 0.7422, + "step": 15181 + }, + { + "epoch": 8.562887760857304, + "grad_norm": 1.1068694591522217, + "learning_rate": 7.188381274675692e-06, + "loss": 0.681, + "step": 15182 + }, + { + "epoch": 8.563451776649746, + "grad_norm": 1.280057668685913, + "learning_rate": 7.185561195713481e-06, + "loss": 0.7384, + "step": 15183 + }, + { + "epoch": 8.564015792442188, + "grad_norm": 1.194482445716858, + "learning_rate": 7.1827411167512694e-06, + "loss": 0.7923, + "step": 15184 + }, + { + "epoch": 8.564579808234631, + "grad_norm": 1.1010876893997192, + "learning_rate": 7.179921037789058e-06, + "loss": 0.7426, + "step": 15185 + }, + { + "epoch": 8.565143824027073, + "grad_norm": 1.1584899425506592, + "learning_rate": 7.177100958826848e-06, + "loss": 0.7409, + "step": 15186 + }, + { + "epoch": 8.565707839819515, + "grad_norm": 1.0998334884643555, + "learning_rate": 7.174280879864637e-06, + "loss": 0.6911, + "step": 15187 + }, + { + "epoch": 8.566271855611957, + "grad_norm": 1.032204270362854, + "learning_rate": 7.1714608009024255e-06, + "loss": 0.5722, + "step": 15188 + }, + { + "epoch": 8.566835871404399, + "grad_norm": 1.1258131265640259, + "learning_rate": 7.168640721940214e-06, + "loss": 0.6803, + "step": 15189 + }, + { + "epoch": 8.567399887196842, + "grad_norm": 1.1925173997879028, + "learning_rate": 7.165820642978004e-06, + "loss": 0.7158, + "step": 15190 + }, + { + "epoch": 8.567963902989284, + "grad_norm": 1.0361248254776, + "learning_rate": 7.163000564015793e-06, + "loss": 0.7176, + "step": 15191 + }, + { + "epoch": 8.568527918781726, + "grad_norm": 0.9048416018486023, + "learning_rate": 7.1601804850535815e-06, + "loss": 0.645, + "step": 15192 + }, + { + "epoch": 8.569091934574168, + "grad_norm": 1.6104336977005005, + "learning_rate": 7.15736040609137e-06, + "loss": 0.8346, + "step": 15193 + }, + { + "epoch": 8.56965595036661, + "grad_norm": 1.2122488021850586, + "learning_rate": 7.15454032712916e-06, + "loss": 0.7954, + "step": 15194 + }, + { + "epoch": 8.570219966159053, + "grad_norm": 1.0771865844726562, + "learning_rate": 7.151720248166949e-06, + "loss": 0.8402, + "step": 15195 + }, + { + "epoch": 8.570783981951495, + "grad_norm": 1.1961462497711182, + "learning_rate": 7.1489001692047375e-06, + "loss": 0.7746, + "step": 15196 + }, + { + "epoch": 8.571347997743937, + "grad_norm": 1.1534922122955322, + "learning_rate": 7.146080090242527e-06, + "loss": 0.7545, + "step": 15197 + }, + { + "epoch": 8.571912013536378, + "grad_norm": 1.7133841514587402, + "learning_rate": 7.143260011280316e-06, + "loss": 0.8147, + "step": 15198 + }, + { + "epoch": 8.572476029328822, + "grad_norm": 0.8261776566505432, + "learning_rate": 7.140439932318105e-06, + "loss": 0.6329, + "step": 15199 + }, + { + "epoch": 8.573040045121264, + "grad_norm": 1.0329883098602295, + "learning_rate": 7.137619853355894e-06, + "loss": 0.7486, + "step": 15200 + }, + { + "epoch": 8.573604060913706, + "grad_norm": 1.080061674118042, + "learning_rate": 7.134799774393683e-06, + "loss": 0.7646, + "step": 15201 + }, + { + "epoch": 8.574168076706147, + "grad_norm": 1.1058239936828613, + "learning_rate": 7.131979695431472e-06, + "loss": 0.6497, + "step": 15202 + }, + { + "epoch": 8.57473209249859, + "grad_norm": 1.2703425884246826, + "learning_rate": 7.129159616469261e-06, + "loss": 0.8342, + "step": 15203 + }, + { + "epoch": 8.575296108291033, + "grad_norm": 1.1390254497528076, + "learning_rate": 7.12633953750705e-06, + "loss": 0.7327, + "step": 15204 + }, + { + "epoch": 8.575860124083475, + "grad_norm": 0.9908905029296875, + "learning_rate": 7.123519458544839e-06, + "loss": 0.6262, + "step": 15205 + }, + { + "epoch": 8.576424139875916, + "grad_norm": 1.2281097173690796, + "learning_rate": 7.12069937958263e-06, + "loss": 0.7667, + "step": 15206 + }, + { + "epoch": 8.576988155668358, + "grad_norm": 0.7808961272239685, + "learning_rate": 7.1178793006204185e-06, + "loss": 0.6251, + "step": 15207 + }, + { + "epoch": 8.577552171460802, + "grad_norm": 1.1170179843902588, + "learning_rate": 7.115059221658207e-06, + "loss": 0.7421, + "step": 15208 + }, + { + "epoch": 8.578116187253244, + "grad_norm": 1.0737783908843994, + "learning_rate": 7.112239142695996e-06, + "loss": 0.8171, + "step": 15209 + }, + { + "epoch": 8.578680203045685, + "grad_norm": 1.5979390144348145, + "learning_rate": 7.109419063733786e-06, + "loss": 0.7368, + "step": 15210 + }, + { + "epoch": 8.579244218838127, + "grad_norm": 1.0285542011260986, + "learning_rate": 7.106598984771575e-06, + "loss": 0.7232, + "step": 15211 + }, + { + "epoch": 8.579808234630569, + "grad_norm": 1.6901699304580688, + "learning_rate": 7.103778905809363e-06, + "loss": 0.8187, + "step": 15212 + }, + { + "epoch": 8.580372250423013, + "grad_norm": 1.2677736282348633, + "learning_rate": 7.100958826847152e-06, + "loss": 0.7933, + "step": 15213 + }, + { + "epoch": 8.580936266215454, + "grad_norm": 0.8965944647789001, + "learning_rate": 7.098138747884942e-06, + "loss": 0.7552, + "step": 15214 + }, + { + "epoch": 8.581500282007896, + "grad_norm": 1.290641188621521, + "learning_rate": 7.095318668922731e-06, + "loss": 0.7425, + "step": 15215 + }, + { + "epoch": 8.582064297800338, + "grad_norm": 1.3030457496643066, + "learning_rate": 7.092498589960519e-06, + "loss": 0.7016, + "step": 15216 + }, + { + "epoch": 8.58262831359278, + "grad_norm": 1.624037265777588, + "learning_rate": 7.089678510998308e-06, + "loss": 0.7682, + "step": 15217 + }, + { + "epoch": 8.583192329385223, + "grad_norm": 1.1462260484695435, + "learning_rate": 7.086858432036098e-06, + "loss": 0.7043, + "step": 15218 + }, + { + "epoch": 8.583756345177665, + "grad_norm": 1.204193353652954, + "learning_rate": 7.084038353073887e-06, + "loss": 0.671, + "step": 15219 + }, + { + "epoch": 8.584320360970107, + "grad_norm": 1.3734790086746216, + "learning_rate": 7.0812182741116754e-06, + "loss": 0.7534, + "step": 15220 + }, + { + "epoch": 8.584884376762549, + "grad_norm": 1.4698320627212524, + "learning_rate": 7.078398195149464e-06, + "loss": 0.6586, + "step": 15221 + }, + { + "epoch": 8.58544839255499, + "grad_norm": 1.1866796016693115, + "learning_rate": 7.075578116187254e-06, + "loss": 0.6688, + "step": 15222 + }, + { + "epoch": 8.586012408347434, + "grad_norm": 1.3216067552566528, + "learning_rate": 7.072758037225043e-06, + "loss": 0.6489, + "step": 15223 + }, + { + "epoch": 8.586576424139876, + "grad_norm": 1.0538830757141113, + "learning_rate": 7.0699379582628315e-06, + "loss": 0.7066, + "step": 15224 + }, + { + "epoch": 8.587140439932318, + "grad_norm": 1.3939263820648193, + "learning_rate": 7.06711787930062e-06, + "loss": 0.7364, + "step": 15225 + }, + { + "epoch": 8.58770445572476, + "grad_norm": 1.5449801683425903, + "learning_rate": 7.06429780033841e-06, + "loss": 0.8381, + "step": 15226 + }, + { + "epoch": 8.588268471517203, + "grad_norm": 1.2519714832305908, + "learning_rate": 7.061477721376199e-06, + "loss": 0.6108, + "step": 15227 + }, + { + "epoch": 8.588832487309645, + "grad_norm": 1.5235308408737183, + "learning_rate": 7.0586576424139875e-06, + "loss": 0.7739, + "step": 15228 + }, + { + "epoch": 8.589396503102087, + "grad_norm": 0.8440749049186707, + "learning_rate": 7.055837563451777e-06, + "loss": 0.6545, + "step": 15229 + }, + { + "epoch": 8.589960518894529, + "grad_norm": 1.004091501235962, + "learning_rate": 7.053017484489566e-06, + "loss": 0.7199, + "step": 15230 + }, + { + "epoch": 8.59052453468697, + "grad_norm": 1.0118687152862549, + "learning_rate": 7.050197405527355e-06, + "loss": 0.8247, + "step": 15231 + }, + { + "epoch": 8.591088550479414, + "grad_norm": 0.9232345223426819, + "learning_rate": 7.0473773265651435e-06, + "loss": 0.7736, + "step": 15232 + }, + { + "epoch": 8.591652566271856, + "grad_norm": 0.85239577293396, + "learning_rate": 7.044557247602933e-06, + "loss": 0.71, + "step": 15233 + }, + { + "epoch": 8.592216582064298, + "grad_norm": 1.0429221391677856, + "learning_rate": 7.041737168640722e-06, + "loss": 0.735, + "step": 15234 + }, + { + "epoch": 8.59278059785674, + "grad_norm": 1.1068484783172607, + "learning_rate": 7.038917089678511e-06, + "loss": 0.702, + "step": 15235 + }, + { + "epoch": 8.593344613649183, + "grad_norm": 1.2645529508590698, + "learning_rate": 7.0360970107162996e-06, + "loss": 0.7545, + "step": 15236 + }, + { + "epoch": 8.593908629441625, + "grad_norm": 1.4123586416244507, + "learning_rate": 7.033276931754089e-06, + "loss": 0.7488, + "step": 15237 + }, + { + "epoch": 8.594472645234067, + "grad_norm": 1.1435656547546387, + "learning_rate": 7.030456852791878e-06, + "loss": 0.7283, + "step": 15238 + }, + { + "epoch": 8.595036661026509, + "grad_norm": 1.1279054880142212, + "learning_rate": 7.027636773829667e-06, + "loss": 0.738, + "step": 15239 + }, + { + "epoch": 8.59560067681895, + "grad_norm": 1.046100378036499, + "learning_rate": 7.024816694867456e-06, + "loss": 0.698, + "step": 15240 + }, + { + "epoch": 8.596164692611394, + "grad_norm": 1.1571097373962402, + "learning_rate": 7.021996615905245e-06, + "loss": 0.662, + "step": 15241 + }, + { + "epoch": 8.596728708403836, + "grad_norm": 0.9867609739303589, + "learning_rate": 7.019176536943036e-06, + "loss": 0.724, + "step": 15242 + }, + { + "epoch": 8.597292724196278, + "grad_norm": 1.3864481449127197, + "learning_rate": 7.0163564579808245e-06, + "loss": 0.8694, + "step": 15243 + }, + { + "epoch": 8.59785673998872, + "grad_norm": 1.2762099504470825, + "learning_rate": 7.013536379018613e-06, + "loss": 0.7392, + "step": 15244 + }, + { + "epoch": 8.598420755781161, + "grad_norm": 0.9116499423980713, + "learning_rate": 7.010716300056402e-06, + "loss": 0.6881, + "step": 15245 + }, + { + "epoch": 8.598984771573605, + "grad_norm": 0.9553113579750061, + "learning_rate": 7.007896221094192e-06, + "loss": 0.686, + "step": 15246 + }, + { + "epoch": 8.599548787366047, + "grad_norm": 1.154005765914917, + "learning_rate": 7.0050761421319806e-06, + "loss": 0.6387, + "step": 15247 + }, + { + "epoch": 8.600112803158488, + "grad_norm": 1.10969078540802, + "learning_rate": 7.002256063169769e-06, + "loss": 0.7483, + "step": 15248 + }, + { + "epoch": 8.60067681895093, + "grad_norm": 0.9237793684005737, + "learning_rate": 6.999435984207558e-06, + "loss": 0.7966, + "step": 15249 + }, + { + "epoch": 8.601240834743372, + "grad_norm": 1.2075401544570923, + "learning_rate": 6.996615905245348e-06, + "loss": 0.7003, + "step": 15250 + }, + { + "epoch": 8.601804850535816, + "grad_norm": 0.8055686354637146, + "learning_rate": 6.993795826283137e-06, + "loss": 0.655, + "step": 15251 + }, + { + "epoch": 8.602368866328257, + "grad_norm": 0.8745337128639221, + "learning_rate": 6.990975747320925e-06, + "loss": 0.7223, + "step": 15252 + }, + { + "epoch": 8.6029328821207, + "grad_norm": 1.3635637760162354, + "learning_rate": 6.988155668358714e-06, + "loss": 0.6766, + "step": 15253 + }, + { + "epoch": 8.603496897913141, + "grad_norm": 0.996279239654541, + "learning_rate": 6.985335589396504e-06, + "loss": 0.7408, + "step": 15254 + }, + { + "epoch": 8.604060913705585, + "grad_norm": 1.1280596256256104, + "learning_rate": 6.982515510434293e-06, + "loss": 0.6506, + "step": 15255 + }, + { + "epoch": 8.604624929498026, + "grad_norm": 1.4129924774169922, + "learning_rate": 6.9796954314720814e-06, + "loss": 0.8863, + "step": 15256 + }, + { + "epoch": 8.605188945290468, + "grad_norm": 1.1689152717590332, + "learning_rate": 6.976875352509871e-06, + "loss": 0.7152, + "step": 15257 + }, + { + "epoch": 8.60575296108291, + "grad_norm": 1.5045921802520752, + "learning_rate": 6.97405527354766e-06, + "loss": 0.8952, + "step": 15258 + }, + { + "epoch": 8.606316976875352, + "grad_norm": 1.3635894060134888, + "learning_rate": 6.971235194585449e-06, + "loss": 0.7165, + "step": 15259 + }, + { + "epoch": 8.606880992667795, + "grad_norm": 1.0930750370025635, + "learning_rate": 6.9684151156232375e-06, + "loss": 0.7238, + "step": 15260 + }, + { + "epoch": 8.607445008460237, + "grad_norm": 1.0966802835464478, + "learning_rate": 6.965595036661027e-06, + "loss": 0.6835, + "step": 15261 + }, + { + "epoch": 8.608009024252679, + "grad_norm": 0.9149605631828308, + "learning_rate": 6.962774957698816e-06, + "loss": 0.734, + "step": 15262 + }, + { + "epoch": 8.60857304004512, + "grad_norm": 1.0981372594833374, + "learning_rate": 6.959954878736605e-06, + "loss": 0.7049, + "step": 15263 + }, + { + "epoch": 8.609137055837564, + "grad_norm": 0.9078680872917175, + "learning_rate": 6.9571347997743935e-06, + "loss": 0.7269, + "step": 15264 + }, + { + "epoch": 8.609701071630006, + "grad_norm": 1.0861910581588745, + "learning_rate": 6.954314720812183e-06, + "loss": 0.6468, + "step": 15265 + }, + { + "epoch": 8.610265087422448, + "grad_norm": 0.8261767625808716, + "learning_rate": 6.951494641849972e-06, + "loss": 0.6618, + "step": 15266 + }, + { + "epoch": 8.61082910321489, + "grad_norm": 0.919592022895813, + "learning_rate": 6.948674562887761e-06, + "loss": 0.7387, + "step": 15267 + }, + { + "epoch": 8.611393119007332, + "grad_norm": 1.4321471452713013, + "learning_rate": 6.9458544839255495e-06, + "loss": 0.7706, + "step": 15268 + }, + { + "epoch": 8.611957134799775, + "grad_norm": 1.3274884223937988, + "learning_rate": 6.943034404963339e-06, + "loss": 0.8129, + "step": 15269 + }, + { + "epoch": 8.612521150592217, + "grad_norm": 1.304203748703003, + "learning_rate": 6.940214326001128e-06, + "loss": 0.6733, + "step": 15270 + }, + { + "epoch": 8.613085166384659, + "grad_norm": 1.1030627489089966, + "learning_rate": 6.937394247038917e-06, + "loss": 0.7764, + "step": 15271 + }, + { + "epoch": 8.6136491821771, + "grad_norm": 1.2206757068634033, + "learning_rate": 6.9345741680767056e-06, + "loss": 0.7444, + "step": 15272 + }, + { + "epoch": 8.614213197969542, + "grad_norm": 1.2820465564727783, + "learning_rate": 6.931754089114495e-06, + "loss": 0.808, + "step": 15273 + }, + { + "epoch": 8.614777213761986, + "grad_norm": 1.2552598714828491, + "learning_rate": 6.928934010152284e-06, + "loss": 0.7983, + "step": 15274 + }, + { + "epoch": 8.615341229554428, + "grad_norm": 1.3359458446502686, + "learning_rate": 6.926113931190073e-06, + "loss": 0.6809, + "step": 15275 + }, + { + "epoch": 8.61590524534687, + "grad_norm": 1.2496039867401123, + "learning_rate": 6.9232938522278624e-06, + "loss": 0.7501, + "step": 15276 + }, + { + "epoch": 8.616469261139311, + "grad_norm": 1.299379825592041, + "learning_rate": 6.920473773265652e-06, + "loss": 0.6794, + "step": 15277 + }, + { + "epoch": 8.617033276931753, + "grad_norm": 1.3815112113952637, + "learning_rate": 6.917653694303442e-06, + "loss": 0.788, + "step": 15278 + }, + { + "epoch": 8.617597292724197, + "grad_norm": 0.9997470378875732, + "learning_rate": 6.9148336153412305e-06, + "loss": 0.7253, + "step": 15279 + }, + { + "epoch": 8.618161308516639, + "grad_norm": 1.1217093467712402, + "learning_rate": 6.912013536379019e-06, + "loss": 0.8044, + "step": 15280 + }, + { + "epoch": 8.61872532430908, + "grad_norm": 0.9505613446235657, + "learning_rate": 6.909193457416808e-06, + "loss": 0.7198, + "step": 15281 + }, + { + "epoch": 8.619289340101522, + "grad_norm": 1.0371266603469849, + "learning_rate": 6.906373378454598e-06, + "loss": 0.7386, + "step": 15282 + }, + { + "epoch": 8.619853355893966, + "grad_norm": 1.4017987251281738, + "learning_rate": 6.9035532994923866e-06, + "loss": 0.8014, + "step": 15283 + }, + { + "epoch": 8.620417371686408, + "grad_norm": 0.902422308921814, + "learning_rate": 6.900733220530175e-06, + "loss": 0.6438, + "step": 15284 + }, + { + "epoch": 8.62098138747885, + "grad_norm": 1.2299630641937256, + "learning_rate": 6.897913141567965e-06, + "loss": 0.7537, + "step": 15285 + }, + { + "epoch": 8.621545403271291, + "grad_norm": 1.0026118755340576, + "learning_rate": 6.895093062605754e-06, + "loss": 0.7129, + "step": 15286 + }, + { + "epoch": 8.622109419063733, + "grad_norm": 1.3354735374450684, + "learning_rate": 6.892272983643543e-06, + "loss": 0.7662, + "step": 15287 + }, + { + "epoch": 8.622673434856177, + "grad_norm": 1.1093130111694336, + "learning_rate": 6.889452904681331e-06, + "loss": 0.7458, + "step": 15288 + }, + { + "epoch": 8.623237450648618, + "grad_norm": 1.1020361185073853, + "learning_rate": 6.886632825719121e-06, + "loss": 0.6182, + "step": 15289 + }, + { + "epoch": 8.62380146644106, + "grad_norm": 1.0171172618865967, + "learning_rate": 6.88381274675691e-06, + "loss": 0.6504, + "step": 15290 + }, + { + "epoch": 8.624365482233502, + "grad_norm": 1.433475375175476, + "learning_rate": 6.880992667794699e-06, + "loss": 0.8618, + "step": 15291 + }, + { + "epoch": 8.624929498025946, + "grad_norm": 0.7713577747344971, + "learning_rate": 6.8781725888324874e-06, + "loss": 0.6556, + "step": 15292 + }, + { + "epoch": 8.625493513818387, + "grad_norm": 1.5445500612258911, + "learning_rate": 6.875352509870277e-06, + "loss": 0.7957, + "step": 15293 + }, + { + "epoch": 8.62605752961083, + "grad_norm": 1.0559998750686646, + "learning_rate": 6.872532430908066e-06, + "loss": 0.6932, + "step": 15294 + }, + { + "epoch": 8.626621545403271, + "grad_norm": 1.2484246492385864, + "learning_rate": 6.869712351945855e-06, + "loss": 0.7796, + "step": 15295 + }, + { + "epoch": 8.627185561195713, + "grad_norm": 1.1300193071365356, + "learning_rate": 6.8668922729836435e-06, + "loss": 0.7179, + "step": 15296 + }, + { + "epoch": 8.627749576988156, + "grad_norm": 0.9424248337745667, + "learning_rate": 6.864072194021433e-06, + "loss": 0.6759, + "step": 15297 + }, + { + "epoch": 8.628313592780598, + "grad_norm": 1.0928163528442383, + "learning_rate": 6.861252115059222e-06, + "loss": 0.6819, + "step": 15298 + }, + { + "epoch": 8.62887760857304, + "grad_norm": 0.9775147438049316, + "learning_rate": 6.858432036097011e-06, + "loss": 0.7515, + "step": 15299 + }, + { + "epoch": 8.629441624365482, + "grad_norm": 1.0672898292541504, + "learning_rate": 6.8556119571347995e-06, + "loss": 0.7052, + "step": 15300 + }, + { + "epoch": 8.630005640157924, + "grad_norm": 0.9861478805541992, + "learning_rate": 6.852791878172589e-06, + "loss": 0.7868, + "step": 15301 + }, + { + "epoch": 8.630569655950367, + "grad_norm": 1.5561131238937378, + "learning_rate": 6.849971799210378e-06, + "loss": 0.8326, + "step": 15302 + }, + { + "epoch": 8.631133671742809, + "grad_norm": 1.094545602798462, + "learning_rate": 6.847151720248167e-06, + "loss": 0.7291, + "step": 15303 + }, + { + "epoch": 8.631697687535251, + "grad_norm": 1.4640530347824097, + "learning_rate": 6.8443316412859555e-06, + "loss": 0.8875, + "step": 15304 + }, + { + "epoch": 8.632261703327693, + "grad_norm": 1.109947681427002, + "learning_rate": 6.841511562323745e-06, + "loss": 0.6811, + "step": 15305 + }, + { + "epoch": 8.632825719120135, + "grad_norm": 1.564441204071045, + "learning_rate": 6.838691483361534e-06, + "loss": 0.8467, + "step": 15306 + }, + { + "epoch": 8.633389734912578, + "grad_norm": 1.1922132968902588, + "learning_rate": 6.835871404399323e-06, + "loss": 0.741, + "step": 15307 + }, + { + "epoch": 8.63395375070502, + "grad_norm": 0.8751668930053711, + "learning_rate": 6.833051325437112e-06, + "loss": 0.672, + "step": 15308 + }, + { + "epoch": 8.634517766497462, + "grad_norm": 1.1376062631607056, + "learning_rate": 6.830231246474901e-06, + "loss": 0.7406, + "step": 15309 + }, + { + "epoch": 8.635081782289904, + "grad_norm": 1.502536416053772, + "learning_rate": 6.82741116751269e-06, + "loss": 0.7798, + "step": 15310 + }, + { + "epoch": 8.635645798082347, + "grad_norm": 1.3258482217788696, + "learning_rate": 6.824591088550479e-06, + "loss": 0.6857, + "step": 15311 + }, + { + "epoch": 8.636209813874789, + "grad_norm": 1.1042613983154297, + "learning_rate": 6.821771009588269e-06, + "loss": 0.7547, + "step": 15312 + }, + { + "epoch": 8.63677382966723, + "grad_norm": 0.995097279548645, + "learning_rate": 6.818950930626058e-06, + "loss": 0.6432, + "step": 15313 + }, + { + "epoch": 8.637337845459673, + "grad_norm": 0.8915089964866638, + "learning_rate": 6.816130851663848e-06, + "loss": 0.6734, + "step": 15314 + }, + { + "epoch": 8.637901861252114, + "grad_norm": 0.9066778421401978, + "learning_rate": 6.8133107727016365e-06, + "loss": 0.6536, + "step": 15315 + }, + { + "epoch": 8.638465877044558, + "grad_norm": 1.4599462747573853, + "learning_rate": 6.810490693739425e-06, + "loss": 0.726, + "step": 15316 + }, + { + "epoch": 8.639029892837, + "grad_norm": 1.263175368309021, + "learning_rate": 6.807670614777215e-06, + "loss": 0.6392, + "step": 15317 + }, + { + "epoch": 8.639593908629442, + "grad_norm": 1.0230991840362549, + "learning_rate": 6.804850535815004e-06, + "loss": 0.6954, + "step": 15318 + }, + { + "epoch": 8.640157924421883, + "grad_norm": 1.0123714208602905, + "learning_rate": 6.8020304568527926e-06, + "loss": 0.6859, + "step": 15319 + }, + { + "epoch": 8.640721940214327, + "grad_norm": 1.206356406211853, + "learning_rate": 6.799210377890581e-06, + "loss": 0.7623, + "step": 15320 + }, + { + "epoch": 8.641285956006769, + "grad_norm": 1.2148866653442383, + "learning_rate": 6.796390298928371e-06, + "loss": 0.8913, + "step": 15321 + }, + { + "epoch": 8.64184997179921, + "grad_norm": 0.9810484647750854, + "learning_rate": 6.79357021996616e-06, + "loss": 0.564, + "step": 15322 + }, + { + "epoch": 8.642413987591652, + "grad_norm": 1.0931731462478638, + "learning_rate": 6.790750141003949e-06, + "loss": 0.7724, + "step": 15323 + }, + { + "epoch": 8.642978003384094, + "grad_norm": 1.112351655960083, + "learning_rate": 6.787930062041737e-06, + "loss": 0.7795, + "step": 15324 + }, + { + "epoch": 8.643542019176538, + "grad_norm": 1.0682170391082764, + "learning_rate": 6.785109983079527e-06, + "loss": 0.7034, + "step": 15325 + }, + { + "epoch": 8.64410603496898, + "grad_norm": 1.2764641046524048, + "learning_rate": 6.782289904117316e-06, + "loss": 0.7662, + "step": 15326 + }, + { + "epoch": 8.644670050761421, + "grad_norm": 1.6452339887619019, + "learning_rate": 6.779469825155105e-06, + "loss": 0.7266, + "step": 15327 + }, + { + "epoch": 8.645234066553863, + "grad_norm": 1.1058543920516968, + "learning_rate": 6.776649746192893e-06, + "loss": 0.8029, + "step": 15328 + }, + { + "epoch": 8.645798082346305, + "grad_norm": 1.496788740158081, + "learning_rate": 6.773829667230683e-06, + "loss": 0.6307, + "step": 15329 + }, + { + "epoch": 8.646362098138749, + "grad_norm": 1.3787345886230469, + "learning_rate": 6.771009588268472e-06, + "loss": 0.8066, + "step": 15330 + }, + { + "epoch": 8.64692611393119, + "grad_norm": 1.0322413444519043, + "learning_rate": 6.768189509306261e-06, + "loss": 0.6661, + "step": 15331 + }, + { + "epoch": 8.647490129723632, + "grad_norm": 1.1207503080368042, + "learning_rate": 6.7653694303440495e-06, + "loss": 0.6385, + "step": 15332 + }, + { + "epoch": 8.648054145516074, + "grad_norm": 1.5400331020355225, + "learning_rate": 6.762549351381839e-06, + "loss": 0.7645, + "step": 15333 + }, + { + "epoch": 8.648618161308516, + "grad_norm": 1.1709932088851929, + "learning_rate": 6.759729272419628e-06, + "loss": 0.7682, + "step": 15334 + }, + { + "epoch": 8.64918217710096, + "grad_norm": 1.3646169900894165, + "learning_rate": 6.756909193457417e-06, + "loss": 0.7994, + "step": 15335 + }, + { + "epoch": 8.649746192893401, + "grad_norm": 1.1832456588745117, + "learning_rate": 6.754089114495206e-06, + "loss": 0.8192, + "step": 15336 + }, + { + "epoch": 8.650310208685843, + "grad_norm": 0.8583177328109741, + "learning_rate": 6.751269035532995e-06, + "loss": 0.6136, + "step": 15337 + }, + { + "epoch": 8.650874224478285, + "grad_norm": 1.4503241777420044, + "learning_rate": 6.748448956570784e-06, + "loss": 0.7824, + "step": 15338 + }, + { + "epoch": 8.651438240270728, + "grad_norm": 0.9725977182388306, + "learning_rate": 6.745628877608573e-06, + "loss": 0.7201, + "step": 15339 + }, + { + "epoch": 8.65200225606317, + "grad_norm": 1.109695315361023, + "learning_rate": 6.742808798646362e-06, + "loss": 0.7371, + "step": 15340 + }, + { + "epoch": 8.652566271855612, + "grad_norm": 1.220137357711792, + "learning_rate": 6.739988719684151e-06, + "loss": 0.7226, + "step": 15341 + }, + { + "epoch": 8.653130287648054, + "grad_norm": 0.9039838910102844, + "learning_rate": 6.73716864072194e-06, + "loss": 0.7308, + "step": 15342 + }, + { + "epoch": 8.653694303440496, + "grad_norm": 1.1285400390625, + "learning_rate": 6.734348561759729e-06, + "loss": 0.6596, + "step": 15343 + }, + { + "epoch": 8.65425831923294, + "grad_norm": 1.2721023559570312, + "learning_rate": 6.731528482797518e-06, + "loss": 0.9057, + "step": 15344 + }, + { + "epoch": 8.654822335025381, + "grad_norm": 1.2536232471466064, + "learning_rate": 6.728708403835307e-06, + "loss": 0.6353, + "step": 15345 + }, + { + "epoch": 8.655386350817823, + "grad_norm": 0.9101911187171936, + "learning_rate": 6.725888324873096e-06, + "loss": 0.7521, + "step": 15346 + }, + { + "epoch": 8.655950366610265, + "grad_norm": 1.9570059776306152, + "learning_rate": 6.7230682459108865e-06, + "loss": 0.9589, + "step": 15347 + }, + { + "epoch": 8.656514382402708, + "grad_norm": 1.1281927824020386, + "learning_rate": 6.720248166948675e-06, + "loss": 0.7431, + "step": 15348 + }, + { + "epoch": 8.65707839819515, + "grad_norm": 0.9882855415344238, + "learning_rate": 6.717428087986465e-06, + "loss": 0.6966, + "step": 15349 + }, + { + "epoch": 8.657642413987592, + "grad_norm": 1.1257790327072144, + "learning_rate": 6.714608009024254e-06, + "loss": 0.7022, + "step": 15350 + }, + { + "epoch": 8.658206429780034, + "grad_norm": 1.2056330442428589, + "learning_rate": 6.7117879300620425e-06, + "loss": 0.8042, + "step": 15351 + }, + { + "epoch": 8.658770445572475, + "grad_norm": 1.1620615720748901, + "learning_rate": 6.708967851099831e-06, + "loss": 0.7714, + "step": 15352 + }, + { + "epoch": 8.659334461364919, + "grad_norm": 1.0938656330108643, + "learning_rate": 6.706147772137621e-06, + "loss": 0.6501, + "step": 15353 + }, + { + "epoch": 8.65989847715736, + "grad_norm": 0.8316680788993835, + "learning_rate": 6.70332769317541e-06, + "loss": 0.6764, + "step": 15354 + }, + { + "epoch": 8.660462492949803, + "grad_norm": 1.0284446477890015, + "learning_rate": 6.7005076142131985e-06, + "loss": 0.7931, + "step": 15355 + }, + { + "epoch": 8.661026508742244, + "grad_norm": 1.3753877878189087, + "learning_rate": 6.697687535250987e-06, + "loss": 0.7332, + "step": 15356 + }, + { + "epoch": 8.661590524534686, + "grad_norm": 1.0367250442504883, + "learning_rate": 6.694867456288777e-06, + "loss": 0.6635, + "step": 15357 + }, + { + "epoch": 8.66215454032713, + "grad_norm": 1.339915156364441, + "learning_rate": 6.692047377326566e-06, + "loss": 0.7535, + "step": 15358 + }, + { + "epoch": 8.662718556119572, + "grad_norm": 1.3254127502441406, + "learning_rate": 6.689227298364355e-06, + "loss": 0.7406, + "step": 15359 + }, + { + "epoch": 8.663282571912013, + "grad_norm": 1.1648155450820923, + "learning_rate": 6.686407219402143e-06, + "loss": 0.838, + "step": 15360 + }, + { + "epoch": 8.663846587704455, + "grad_norm": 1.1628994941711426, + "learning_rate": 6.683587140439933e-06, + "loss": 0.7707, + "step": 15361 + }, + { + "epoch": 8.664410603496897, + "grad_norm": 1.134117603302002, + "learning_rate": 6.680767061477722e-06, + "loss": 0.7699, + "step": 15362 + }, + { + "epoch": 8.66497461928934, + "grad_norm": 1.1481170654296875, + "learning_rate": 6.677946982515511e-06, + "loss": 0.6569, + "step": 15363 + }, + { + "epoch": 8.665538635081782, + "grad_norm": 1.1297211647033691, + "learning_rate": 6.6751269035533e-06, + "loss": 0.6494, + "step": 15364 + }, + { + "epoch": 8.666102650874224, + "grad_norm": 1.2918386459350586, + "learning_rate": 6.672306824591089e-06, + "loss": 0.8003, + "step": 15365 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 1.4451130628585815, + "learning_rate": 6.669486745628878e-06, + "loss": 0.9078, + "step": 15366 + }, + { + "epoch": 8.66723068245911, + "grad_norm": 1.371841549873352, + "learning_rate": 6.666666666666667e-06, + "loss": 0.7456, + "step": 15367 + }, + { + "epoch": 8.667794698251551, + "grad_norm": 1.0603669881820679, + "learning_rate": 6.663846587704456e-06, + "loss": 0.6813, + "step": 15368 + }, + { + "epoch": 8.668358714043993, + "grad_norm": 1.1656566858291626, + "learning_rate": 6.661026508742245e-06, + "loss": 0.7448, + "step": 15369 + }, + { + "epoch": 8.668922729836435, + "grad_norm": 1.1613578796386719, + "learning_rate": 6.658206429780034e-06, + "loss": 0.7312, + "step": 15370 + }, + { + "epoch": 8.669486745628877, + "grad_norm": 1.046484351158142, + "learning_rate": 6.655386350817823e-06, + "loss": 0.6547, + "step": 15371 + }, + { + "epoch": 8.67005076142132, + "grad_norm": 0.9836048483848572, + "learning_rate": 6.652566271855612e-06, + "loss": 0.7505, + "step": 15372 + }, + { + "epoch": 8.670614777213762, + "grad_norm": 1.1737200021743774, + "learning_rate": 6.649746192893401e-06, + "loss": 0.7219, + "step": 15373 + }, + { + "epoch": 8.671178793006204, + "grad_norm": 1.178311824798584, + "learning_rate": 6.64692611393119e-06, + "loss": 0.7587, + "step": 15374 + }, + { + "epoch": 8.671742808798646, + "grad_norm": 1.1610702276229858, + "learning_rate": 6.644106034968979e-06, + "loss": 0.7331, + "step": 15375 + }, + { + "epoch": 8.67230682459109, + "grad_norm": 1.816522240638733, + "learning_rate": 6.641285956006768e-06, + "loss": 0.7607, + "step": 15376 + }, + { + "epoch": 8.672870840383531, + "grad_norm": 0.8972865343093872, + "learning_rate": 6.638465877044557e-06, + "loss": 0.6664, + "step": 15377 + }, + { + "epoch": 8.673434856175973, + "grad_norm": 0.7637702822685242, + "learning_rate": 6.635645798082346e-06, + "loss": 0.5862, + "step": 15378 + }, + { + "epoch": 8.673998871968415, + "grad_norm": 0.9927362203598022, + "learning_rate": 6.632825719120135e-06, + "loss": 0.7359, + "step": 15379 + }, + { + "epoch": 8.674562887760857, + "grad_norm": 1.1747636795043945, + "learning_rate": 6.630005640157924e-06, + "loss": 0.7662, + "step": 15380 + }, + { + "epoch": 8.6751269035533, + "grad_norm": 1.1374385356903076, + "learning_rate": 6.627185561195713e-06, + "loss": 0.7277, + "step": 15381 + }, + { + "epoch": 8.675690919345742, + "grad_norm": 1.1549381017684937, + "learning_rate": 6.624365482233502e-06, + "loss": 0.7571, + "step": 15382 + }, + { + "epoch": 8.676254935138184, + "grad_norm": 1.0567164421081543, + "learning_rate": 6.6215454032712925e-06, + "loss": 0.682, + "step": 15383 + }, + { + "epoch": 8.676818950930626, + "grad_norm": 0.8901659250259399, + "learning_rate": 6.618725324309081e-06, + "loss": 0.6783, + "step": 15384 + }, + { + "epoch": 8.677382966723068, + "grad_norm": 1.3357014656066895, + "learning_rate": 6.615905245346871e-06, + "loss": 0.751, + "step": 15385 + }, + { + "epoch": 8.677946982515511, + "grad_norm": 1.0616923570632935, + "learning_rate": 6.61308516638466e-06, + "loss": 0.713, + "step": 15386 + }, + { + "epoch": 8.678510998307953, + "grad_norm": 1.2247058153152466, + "learning_rate": 6.6102650874224485e-06, + "loss": 0.7579, + "step": 15387 + }, + { + "epoch": 8.679075014100395, + "grad_norm": 0.8936659693717957, + "learning_rate": 6.607445008460237e-06, + "loss": 0.6621, + "step": 15388 + }, + { + "epoch": 8.679639029892837, + "grad_norm": 1.074418306350708, + "learning_rate": 6.604624929498027e-06, + "loss": 0.7287, + "step": 15389 + }, + { + "epoch": 8.680203045685278, + "grad_norm": 1.2258051633834839, + "learning_rate": 6.601804850535816e-06, + "loss": 0.8004, + "step": 15390 + }, + { + "epoch": 8.680767061477722, + "grad_norm": 1.1570817232131958, + "learning_rate": 6.5989847715736045e-06, + "loss": 0.7616, + "step": 15391 + }, + { + "epoch": 8.681331077270164, + "grad_norm": 1.105161428451538, + "learning_rate": 6.596164692611393e-06, + "loss": 0.7541, + "step": 15392 + }, + { + "epoch": 8.681895093062606, + "grad_norm": 1.1302506923675537, + "learning_rate": 6.593344613649183e-06, + "loss": 0.5872, + "step": 15393 + }, + { + "epoch": 8.682459108855047, + "grad_norm": 1.0987153053283691, + "learning_rate": 6.590524534686972e-06, + "loss": 0.7293, + "step": 15394 + }, + { + "epoch": 8.683023124647491, + "grad_norm": 1.7234615087509155, + "learning_rate": 6.5877044557247606e-06, + "loss": 0.8403, + "step": 15395 + }, + { + "epoch": 8.683587140439933, + "grad_norm": 1.1349525451660156, + "learning_rate": 6.58488437676255e-06, + "loss": 0.7409, + "step": 15396 + }, + { + "epoch": 8.684151156232375, + "grad_norm": 1.4439619779586792, + "learning_rate": 6.582064297800339e-06, + "loss": 0.7896, + "step": 15397 + }, + { + "epoch": 8.684715172024816, + "grad_norm": 1.4205210208892822, + "learning_rate": 6.579244218838128e-06, + "loss": 0.7661, + "step": 15398 + }, + { + "epoch": 8.685279187817258, + "grad_norm": 1.2269872426986694, + "learning_rate": 6.576424139875917e-06, + "loss": 0.8672, + "step": 15399 + }, + { + "epoch": 8.685843203609702, + "grad_norm": 1.2503567934036255, + "learning_rate": 6.573604060913706e-06, + "loss": 0.772, + "step": 15400 + }, + { + "epoch": 8.686407219402144, + "grad_norm": 1.228306770324707, + "learning_rate": 6.570783981951495e-06, + "loss": 0.7609, + "step": 15401 + }, + { + "epoch": 8.686971235194585, + "grad_norm": 1.1069109439849854, + "learning_rate": 6.567963902989284e-06, + "loss": 0.7631, + "step": 15402 + }, + { + "epoch": 8.687535250987027, + "grad_norm": 0.8677695989608765, + "learning_rate": 6.565143824027073e-06, + "loss": 0.6964, + "step": 15403 + }, + { + "epoch": 8.68809926677947, + "grad_norm": 1.0479416847229004, + "learning_rate": 6.562323745064862e-06, + "loss": 0.8127, + "step": 15404 + }, + { + "epoch": 8.688663282571913, + "grad_norm": 5.24631929397583, + "learning_rate": 6.559503666102651e-06, + "loss": 0.8172, + "step": 15405 + }, + { + "epoch": 8.689227298364354, + "grad_norm": 1.1604382991790771, + "learning_rate": 6.55668358714044e-06, + "loss": 0.7041, + "step": 15406 + }, + { + "epoch": 8.689791314156796, + "grad_norm": 0.8835191130638123, + "learning_rate": 6.553863508178229e-06, + "loss": 0.5905, + "step": 15407 + }, + { + "epoch": 8.690355329949238, + "grad_norm": 1.1715302467346191, + "learning_rate": 6.551043429216018e-06, + "loss": 0.7538, + "step": 15408 + }, + { + "epoch": 8.690919345741682, + "grad_norm": 0.9852918982505798, + "learning_rate": 6.548223350253807e-06, + "loss": 0.7561, + "step": 15409 + }, + { + "epoch": 8.691483361534123, + "grad_norm": 1.12043297290802, + "learning_rate": 6.545403271291596e-06, + "loss": 0.7175, + "step": 15410 + }, + { + "epoch": 8.692047377326565, + "grad_norm": 1.3216108083724976, + "learning_rate": 6.542583192329385e-06, + "loss": 0.7042, + "step": 15411 + }, + { + "epoch": 8.692611393119007, + "grad_norm": 0.9448184967041016, + "learning_rate": 6.539763113367174e-06, + "loss": 0.7634, + "step": 15412 + }, + { + "epoch": 8.693175408911449, + "grad_norm": 1.0661946535110474, + "learning_rate": 6.536943034404963e-06, + "loss": 0.7543, + "step": 15413 + }, + { + "epoch": 8.693739424703892, + "grad_norm": 1.322628378868103, + "learning_rate": 6.534122955442752e-06, + "loss": 0.7806, + "step": 15414 + }, + { + "epoch": 8.694303440496334, + "grad_norm": 1.031729817390442, + "learning_rate": 6.531302876480542e-06, + "loss": 0.723, + "step": 15415 + }, + { + "epoch": 8.694867456288776, + "grad_norm": 1.6692633628845215, + "learning_rate": 6.52848279751833e-06, + "loss": 0.7097, + "step": 15416 + }, + { + "epoch": 8.695431472081218, + "grad_norm": 1.4683862924575806, + "learning_rate": 6.525662718556119e-06, + "loss": 0.8056, + "step": 15417 + }, + { + "epoch": 8.69599548787366, + "grad_norm": 0.9794811606407166, + "learning_rate": 6.52284263959391e-06, + "loss": 0.7334, + "step": 15418 + }, + { + "epoch": 8.696559503666103, + "grad_norm": 1.1047577857971191, + "learning_rate": 6.5200225606316985e-06, + "loss": 0.7157, + "step": 15419 + }, + { + "epoch": 8.697123519458545, + "grad_norm": 0.982990026473999, + "learning_rate": 6.517202481669487e-06, + "loss": 0.8382, + "step": 15420 + }, + { + "epoch": 8.697687535250987, + "grad_norm": 1.699403166770935, + "learning_rate": 6.514382402707277e-06, + "loss": 0.7669, + "step": 15421 + }, + { + "epoch": 8.698251551043429, + "grad_norm": 1.3081954717636108, + "learning_rate": 6.511562323745066e-06, + "loss": 0.6878, + "step": 15422 + }, + { + "epoch": 8.698815566835872, + "grad_norm": 1.3966948986053467, + "learning_rate": 6.5087422447828545e-06, + "loss": 0.7793, + "step": 15423 + }, + { + "epoch": 8.699379582628314, + "grad_norm": 1.199329137802124, + "learning_rate": 6.505922165820644e-06, + "loss": 0.738, + "step": 15424 + }, + { + "epoch": 8.699943598420756, + "grad_norm": 1.2227466106414795, + "learning_rate": 6.503102086858433e-06, + "loss": 0.7838, + "step": 15425 + }, + { + "epoch": 8.700507614213198, + "grad_norm": 1.2226587533950806, + "learning_rate": 6.500282007896222e-06, + "loss": 0.7666, + "step": 15426 + }, + { + "epoch": 8.70107163000564, + "grad_norm": 0.9000455141067505, + "learning_rate": 6.4974619289340105e-06, + "loss": 0.7281, + "step": 15427 + }, + { + "epoch": 8.701635645798083, + "grad_norm": 1.4049789905548096, + "learning_rate": 6.4946418499718e-06, + "loss": 0.7111, + "step": 15428 + }, + { + "epoch": 8.702199661590525, + "grad_norm": 1.482905626296997, + "learning_rate": 6.491821771009589e-06, + "loss": 0.7667, + "step": 15429 + }, + { + "epoch": 8.702763677382967, + "grad_norm": 1.2099708318710327, + "learning_rate": 6.489001692047378e-06, + "loss": 0.8551, + "step": 15430 + }, + { + "epoch": 8.703327693175408, + "grad_norm": 1.2875268459320068, + "learning_rate": 6.4861816130851666e-06, + "loss": 0.7989, + "step": 15431 + }, + { + "epoch": 8.703891708967852, + "grad_norm": 1.2142198085784912, + "learning_rate": 6.483361534122956e-06, + "loss": 0.6721, + "step": 15432 + }, + { + "epoch": 8.704455724760294, + "grad_norm": 0.8984055519104004, + "learning_rate": 6.480541455160745e-06, + "loss": 0.701, + "step": 15433 + }, + { + "epoch": 8.705019740552736, + "grad_norm": 1.1432745456695557, + "learning_rate": 6.477721376198534e-06, + "loss": 0.6829, + "step": 15434 + }, + { + "epoch": 8.705583756345177, + "grad_norm": 1.306002140045166, + "learning_rate": 6.474901297236323e-06, + "loss": 0.8086, + "step": 15435 + }, + { + "epoch": 8.70614777213762, + "grad_norm": 1.0662920475006104, + "learning_rate": 6.472081218274112e-06, + "loss": 0.6447, + "step": 15436 + }, + { + "epoch": 8.706711787930063, + "grad_norm": 3.5944676399230957, + "learning_rate": 6.469261139311901e-06, + "loss": 0.7838, + "step": 15437 + }, + { + "epoch": 8.707275803722505, + "grad_norm": 1.0981237888336182, + "learning_rate": 6.46644106034969e-06, + "loss": 0.7603, + "step": 15438 + }, + { + "epoch": 8.707839819514946, + "grad_norm": 0.8724241852760315, + "learning_rate": 6.463620981387479e-06, + "loss": 0.6751, + "step": 15439 + }, + { + "epoch": 8.708403835307388, + "grad_norm": 0.8484669327735901, + "learning_rate": 6.460800902425268e-06, + "loss": 0.6189, + "step": 15440 + }, + { + "epoch": 8.70896785109983, + "grad_norm": 1.1495970487594604, + "learning_rate": 6.457980823463057e-06, + "loss": 0.6873, + "step": 15441 + }, + { + "epoch": 8.709531866892274, + "grad_norm": 1.2144182920455933, + "learning_rate": 6.455160744500846e-06, + "loss": 0.7145, + "step": 15442 + }, + { + "epoch": 8.710095882684715, + "grad_norm": 1.111000895500183, + "learning_rate": 6.452340665538635e-06, + "loss": 0.6601, + "step": 15443 + }, + { + "epoch": 8.710659898477157, + "grad_norm": 0.8591135740280151, + "learning_rate": 6.449520586576424e-06, + "loss": 0.676, + "step": 15444 + }, + { + "epoch": 8.711223914269599, + "grad_norm": 1.3954252004623413, + "learning_rate": 6.446700507614213e-06, + "loss": 0.705, + "step": 15445 + }, + { + "epoch": 8.71178793006204, + "grad_norm": 1.1353248357772827, + "learning_rate": 6.443880428652002e-06, + "loss": 0.8408, + "step": 15446 + }, + { + "epoch": 8.712351945854484, + "grad_norm": 0.8923770785331726, + "learning_rate": 6.4410603496897915e-06, + "loss": 0.7357, + "step": 15447 + }, + { + "epoch": 8.712915961646926, + "grad_norm": 1.1578763723373413, + "learning_rate": 6.43824027072758e-06, + "loss": 0.6668, + "step": 15448 + }, + { + "epoch": 8.713479977439368, + "grad_norm": 1.374727725982666, + "learning_rate": 6.435420191765369e-06, + "loss": 0.7333, + "step": 15449 + }, + { + "epoch": 8.71404399323181, + "grad_norm": 1.3736212253570557, + "learning_rate": 6.432600112803158e-06, + "loss": 0.7256, + "step": 15450 + }, + { + "epoch": 8.714608009024253, + "grad_norm": 1.0902634859085083, + "learning_rate": 6.429780033840948e-06, + "loss": 0.7909, + "step": 15451 + }, + { + "epoch": 8.715172024816695, + "grad_norm": 1.427219271659851, + "learning_rate": 6.426959954878736e-06, + "loss": 0.8545, + "step": 15452 + }, + { + "epoch": 8.715736040609137, + "grad_norm": 0.9603972434997559, + "learning_rate": 6.424139875916527e-06, + "loss": 0.6686, + "step": 15453 + }, + { + "epoch": 8.716300056401579, + "grad_norm": 1.5420094728469849, + "learning_rate": 6.421319796954316e-06, + "loss": 0.8582, + "step": 15454 + }, + { + "epoch": 8.71686407219402, + "grad_norm": 1.29427969455719, + "learning_rate": 6.4184997179921045e-06, + "loss": 0.8659, + "step": 15455 + }, + { + "epoch": 8.717428087986464, + "grad_norm": 1.2379268407821655, + "learning_rate": 6.415679639029894e-06, + "loss": 0.7992, + "step": 15456 + }, + { + "epoch": 8.717992103778906, + "grad_norm": 0.975213885307312, + "learning_rate": 6.412859560067683e-06, + "loss": 0.6789, + "step": 15457 + }, + { + "epoch": 8.718556119571348, + "grad_norm": 1.327671766281128, + "learning_rate": 6.410039481105472e-06, + "loss": 0.8518, + "step": 15458 + }, + { + "epoch": 8.71912013536379, + "grad_norm": 0.9536583423614502, + "learning_rate": 6.4072194021432605e-06, + "loss": 0.6865, + "step": 15459 + }, + { + "epoch": 8.719684151156233, + "grad_norm": 1.462229609489441, + "learning_rate": 6.40439932318105e-06, + "loss": 0.7987, + "step": 15460 + }, + { + "epoch": 8.720248166948675, + "grad_norm": 0.9187049269676208, + "learning_rate": 6.401579244218839e-06, + "loss": 0.6924, + "step": 15461 + }, + { + "epoch": 8.720812182741117, + "grad_norm": 1.2096037864685059, + "learning_rate": 6.398759165256628e-06, + "loss": 0.76, + "step": 15462 + }, + { + "epoch": 8.721376198533559, + "grad_norm": 1.7989064455032349, + "learning_rate": 6.3959390862944165e-06, + "loss": 0.8056, + "step": 15463 + }, + { + "epoch": 8.721940214326, + "grad_norm": 1.2397273778915405, + "learning_rate": 6.393119007332206e-06, + "loss": 0.7762, + "step": 15464 + }, + { + "epoch": 8.722504230118444, + "grad_norm": 1.3805557489395142, + "learning_rate": 6.390298928369995e-06, + "loss": 0.8073, + "step": 15465 + }, + { + "epoch": 8.723068245910886, + "grad_norm": 1.0532846450805664, + "learning_rate": 6.387478849407784e-06, + "loss": 0.6146, + "step": 15466 + }, + { + "epoch": 8.723632261703328, + "grad_norm": 1.039189338684082, + "learning_rate": 6.3846587704455726e-06, + "loss": 0.6829, + "step": 15467 + }, + { + "epoch": 8.72419627749577, + "grad_norm": 1.119734525680542, + "learning_rate": 6.381838691483362e-06, + "loss": 0.7009, + "step": 15468 + }, + { + "epoch": 8.724760293288211, + "grad_norm": 1.1235712766647339, + "learning_rate": 6.379018612521151e-06, + "loss": 0.7187, + "step": 15469 + }, + { + "epoch": 8.725324309080655, + "grad_norm": 1.127616047859192, + "learning_rate": 6.37619853355894e-06, + "loss": 0.6943, + "step": 15470 + }, + { + "epoch": 8.725888324873097, + "grad_norm": 1.2915619611740112, + "learning_rate": 6.373378454596729e-06, + "loss": 0.6689, + "step": 15471 + }, + { + "epoch": 8.726452340665539, + "grad_norm": 1.2489266395568848, + "learning_rate": 6.370558375634518e-06, + "loss": 0.6959, + "step": 15472 + }, + { + "epoch": 8.72701635645798, + "grad_norm": 1.0618795156478882, + "learning_rate": 6.367738296672307e-06, + "loss": 0.6879, + "step": 15473 + }, + { + "epoch": 8.727580372250422, + "grad_norm": 0.9089023470878601, + "learning_rate": 6.364918217710096e-06, + "loss": 0.7054, + "step": 15474 + }, + { + "epoch": 8.728144388042866, + "grad_norm": 1.07749605178833, + "learning_rate": 6.3620981387478855e-06, + "loss": 0.7697, + "step": 15475 + }, + { + "epoch": 8.728708403835308, + "grad_norm": 1.0908616781234741, + "learning_rate": 6.359278059785674e-06, + "loss": 0.6766, + "step": 15476 + }, + { + "epoch": 8.72927241962775, + "grad_norm": 1.0819088220596313, + "learning_rate": 6.356457980823463e-06, + "loss": 0.6199, + "step": 15477 + }, + { + "epoch": 8.729836435420191, + "grad_norm": 1.604052186012268, + "learning_rate": 6.353637901861252e-06, + "loss": 0.8119, + "step": 15478 + }, + { + "epoch": 8.730400451212635, + "grad_norm": 1.287759780883789, + "learning_rate": 6.3508178228990415e-06, + "loss": 0.7657, + "step": 15479 + }, + { + "epoch": 8.730964467005077, + "grad_norm": 1.1557908058166504, + "learning_rate": 6.34799774393683e-06, + "loss": 0.673, + "step": 15480 + }, + { + "epoch": 8.731528482797518, + "grad_norm": 1.0581538677215576, + "learning_rate": 6.345177664974619e-06, + "loss": 0.8275, + "step": 15481 + }, + { + "epoch": 8.73209249858996, + "grad_norm": 1.0756120681762695, + "learning_rate": 6.342357586012408e-06, + "loss": 0.6987, + "step": 15482 + }, + { + "epoch": 8.732656514382402, + "grad_norm": 0.9823361039161682, + "learning_rate": 6.3395375070501975e-06, + "loss": 0.7605, + "step": 15483 + }, + { + "epoch": 8.733220530174846, + "grad_norm": 1.2914754152297974, + "learning_rate": 6.336717428087986e-06, + "loss": 0.8151, + "step": 15484 + }, + { + "epoch": 8.733784545967287, + "grad_norm": 1.1443654298782349, + "learning_rate": 6.333897349125775e-06, + "loss": 0.6568, + "step": 15485 + }, + { + "epoch": 8.73434856175973, + "grad_norm": 0.8231571912765503, + "learning_rate": 6.331077270163564e-06, + "loss": 0.6779, + "step": 15486 + }, + { + "epoch": 8.734912577552171, + "grad_norm": 1.0352234840393066, + "learning_rate": 6.3282571912013536e-06, + "loss": 0.7401, + "step": 15487 + }, + { + "epoch": 8.735476593344615, + "grad_norm": 0.8658595681190491, + "learning_rate": 6.325437112239142e-06, + "loss": 0.6615, + "step": 15488 + }, + { + "epoch": 8.736040609137056, + "grad_norm": 0.9380359649658203, + "learning_rate": 6.322617033276933e-06, + "loss": 0.7205, + "step": 15489 + }, + { + "epoch": 8.736604624929498, + "grad_norm": 0.8712440729141235, + "learning_rate": 6.319796954314722e-06, + "loss": 0.6694, + "step": 15490 + }, + { + "epoch": 8.73716864072194, + "grad_norm": 1.4647296667099, + "learning_rate": 6.3169768753525105e-06, + "loss": 0.8233, + "step": 15491 + }, + { + "epoch": 8.737732656514382, + "grad_norm": 1.328002691268921, + "learning_rate": 6.3141567963903e-06, + "loss": 0.8089, + "step": 15492 + }, + { + "epoch": 8.738296672306825, + "grad_norm": 1.0921446084976196, + "learning_rate": 6.311336717428089e-06, + "loss": 0.7045, + "step": 15493 + }, + { + "epoch": 8.738860688099267, + "grad_norm": 1.1310778856277466, + "learning_rate": 6.308516638465878e-06, + "loss": 0.7293, + "step": 15494 + }, + { + "epoch": 8.739424703891709, + "grad_norm": 1.3427226543426514, + "learning_rate": 6.3056965595036665e-06, + "loss": 0.8246, + "step": 15495 + }, + { + "epoch": 8.73998871968415, + "grad_norm": 1.2247834205627441, + "learning_rate": 6.302876480541456e-06, + "loss": 0.7719, + "step": 15496 + }, + { + "epoch": 8.740552735476593, + "grad_norm": 1.1024537086486816, + "learning_rate": 6.300056401579245e-06, + "loss": 0.7465, + "step": 15497 + }, + { + "epoch": 8.741116751269036, + "grad_norm": 0.8955037593841553, + "learning_rate": 6.297236322617034e-06, + "loss": 0.7359, + "step": 15498 + }, + { + "epoch": 8.741680767061478, + "grad_norm": 1.0002938508987427, + "learning_rate": 6.2944162436548225e-06, + "loss": 0.6317, + "step": 15499 + }, + { + "epoch": 8.74224478285392, + "grad_norm": 1.4340003728866577, + "learning_rate": 6.291596164692612e-06, + "loss": 0.8577, + "step": 15500 + }, + { + "epoch": 8.742808798646362, + "grad_norm": 1.0606027841567993, + "learning_rate": 6.288776085730401e-06, + "loss": 0.6764, + "step": 15501 + }, + { + "epoch": 8.743372814438803, + "grad_norm": 1.3036659955978394, + "learning_rate": 6.28595600676819e-06, + "loss": 0.7189, + "step": 15502 + }, + { + "epoch": 8.743936830231247, + "grad_norm": 1.2097845077514648, + "learning_rate": 6.283135927805979e-06, + "loss": 0.6686, + "step": 15503 + }, + { + "epoch": 8.744500846023689, + "grad_norm": 1.2927864789962769, + "learning_rate": 6.280315848843768e-06, + "loss": 0.8071, + "step": 15504 + }, + { + "epoch": 8.74506486181613, + "grad_norm": 1.2143995761871338, + "learning_rate": 6.277495769881557e-06, + "loss": 0.7329, + "step": 15505 + }, + { + "epoch": 8.745628877608572, + "grad_norm": 0.9531599283218384, + "learning_rate": 6.274675690919346e-06, + "loss": 0.83, + "step": 15506 + }, + { + "epoch": 8.746192893401016, + "grad_norm": 1.1057904958724976, + "learning_rate": 6.2718556119571354e-06, + "loss": 0.6789, + "step": 15507 + }, + { + "epoch": 8.746756909193458, + "grad_norm": 0.8429037928581238, + "learning_rate": 6.269035532994924e-06, + "loss": 0.6446, + "step": 15508 + }, + { + "epoch": 8.7473209249859, + "grad_norm": 1.090570330619812, + "learning_rate": 6.266215454032713e-06, + "loss": 0.7638, + "step": 15509 + }, + { + "epoch": 8.747884940778341, + "grad_norm": 1.2970267534255981, + "learning_rate": 6.263395375070502e-06, + "loss": 0.6701, + "step": 15510 + }, + { + "epoch": 8.748448956570783, + "grad_norm": 0.9433238506317139, + "learning_rate": 6.2605752961082915e-06, + "loss": 0.6676, + "step": 15511 + }, + { + "epoch": 8.749012972363227, + "grad_norm": 1.0087308883666992, + "learning_rate": 6.25775521714608e-06, + "loss": 0.6534, + "step": 15512 + }, + { + "epoch": 8.749576988155669, + "grad_norm": 1.0812126398086548, + "learning_rate": 6.254935138183869e-06, + "loss": 0.7756, + "step": 15513 + }, + { + "epoch": 8.75014100394811, + "grad_norm": 0.9889826774597168, + "learning_rate": 6.252115059221658e-06, + "loss": 0.6612, + "step": 15514 + }, + { + "epoch": 8.750705019740552, + "grad_norm": 1.5265731811523438, + "learning_rate": 6.2492949802594475e-06, + "loss": 0.8034, + "step": 15515 + }, + { + "epoch": 8.751269035532996, + "grad_norm": 1.141449213027954, + "learning_rate": 6.246474901297237e-06, + "loss": 0.6755, + "step": 15516 + }, + { + "epoch": 8.751833051325438, + "grad_norm": 0.9791549444198608, + "learning_rate": 6.243654822335026e-06, + "loss": 0.7625, + "step": 15517 + }, + { + "epoch": 8.75239706711788, + "grad_norm": 1.1894326210021973, + "learning_rate": 6.240834743372815e-06, + "loss": 0.8462, + "step": 15518 + }, + { + "epoch": 8.752961082910321, + "grad_norm": 1.0851012468338013, + "learning_rate": 6.238014664410604e-06, + "loss": 0.6946, + "step": 15519 + }, + { + "epoch": 8.753525098702763, + "grad_norm": 1.251665472984314, + "learning_rate": 6.235194585448393e-06, + "loss": 0.686, + "step": 15520 + }, + { + "epoch": 8.754089114495207, + "grad_norm": 1.5009726285934448, + "learning_rate": 6.232374506486182e-06, + "loss": 0.8398, + "step": 15521 + }, + { + "epoch": 8.754653130287648, + "grad_norm": 0.9636143445968628, + "learning_rate": 6.229554427523971e-06, + "loss": 0.6741, + "step": 15522 + }, + { + "epoch": 8.75521714608009, + "grad_norm": 1.2854325771331787, + "learning_rate": 6.22673434856176e-06, + "loss": 0.6637, + "step": 15523 + }, + { + "epoch": 8.755781161872532, + "grad_norm": 1.1567490100860596, + "learning_rate": 6.223914269599549e-06, + "loss": 0.7644, + "step": 15524 + }, + { + "epoch": 8.756345177664974, + "grad_norm": 1.2864997386932373, + "learning_rate": 6.221094190637338e-06, + "loss": 0.7693, + "step": 15525 + }, + { + "epoch": 8.756909193457417, + "grad_norm": 1.1223338842391968, + "learning_rate": 6.218274111675127e-06, + "loss": 0.8155, + "step": 15526 + }, + { + "epoch": 8.75747320924986, + "grad_norm": 1.3593497276306152, + "learning_rate": 6.2154540327129164e-06, + "loss": 0.7132, + "step": 15527 + }, + { + "epoch": 8.758037225042301, + "grad_norm": 1.6199170351028442, + "learning_rate": 6.212633953750705e-06, + "loss": 0.7685, + "step": 15528 + }, + { + "epoch": 8.758601240834743, + "grad_norm": 1.1197850704193115, + "learning_rate": 6.209813874788494e-06, + "loss": 0.7562, + "step": 15529 + }, + { + "epoch": 8.759165256627185, + "grad_norm": 1.132891297340393, + "learning_rate": 6.206993795826283e-06, + "loss": 0.7282, + "step": 15530 + }, + { + "epoch": 8.759729272419628, + "grad_norm": 1.522104263305664, + "learning_rate": 6.2041737168640725e-06, + "loss": 0.7219, + "step": 15531 + }, + { + "epoch": 8.76029328821207, + "grad_norm": 1.0966304540634155, + "learning_rate": 6.201353637901861e-06, + "loss": 0.7744, + "step": 15532 + }, + { + "epoch": 8.760857304004512, + "grad_norm": 1.074599266052246, + "learning_rate": 6.198533558939651e-06, + "loss": 0.7332, + "step": 15533 + }, + { + "epoch": 8.761421319796954, + "grad_norm": 0.9620651006698608, + "learning_rate": 6.19571347997744e-06, + "loss": 0.8002, + "step": 15534 + }, + { + "epoch": 8.761985335589397, + "grad_norm": 1.1505537033081055, + "learning_rate": 6.192893401015229e-06, + "loss": 0.6838, + "step": 15535 + }, + { + "epoch": 8.762549351381839, + "grad_norm": 1.2832986116409302, + "learning_rate": 6.190073322053018e-06, + "loss": 0.7424, + "step": 15536 + }, + { + "epoch": 8.763113367174281, + "grad_norm": 1.3579455614089966, + "learning_rate": 6.187253243090807e-06, + "loss": 0.742, + "step": 15537 + }, + { + "epoch": 8.763677382966723, + "grad_norm": 0.9753795266151428, + "learning_rate": 6.184433164128596e-06, + "loss": 0.6778, + "step": 15538 + }, + { + "epoch": 8.764241398759165, + "grad_norm": 1.1941770315170288, + "learning_rate": 6.181613085166385e-06, + "loss": 0.7922, + "step": 15539 + }, + { + "epoch": 8.764805414551608, + "grad_norm": 0.9866235256195068, + "learning_rate": 6.178793006204174e-06, + "loss": 0.7356, + "step": 15540 + }, + { + "epoch": 8.76536943034405, + "grad_norm": 1.4840527772903442, + "learning_rate": 6.175972927241963e-06, + "loss": 0.7772, + "step": 15541 + }, + { + "epoch": 8.765933446136492, + "grad_norm": 1.1368982791900635, + "learning_rate": 6.173152848279752e-06, + "loss": 0.7675, + "step": 15542 + }, + { + "epoch": 8.766497461928934, + "grad_norm": 1.081639051437378, + "learning_rate": 6.1703327693175414e-06, + "loss": 0.7745, + "step": 15543 + }, + { + "epoch": 8.767061477721377, + "grad_norm": 0.9950189590454102, + "learning_rate": 6.16751269035533e-06, + "loss": 0.7584, + "step": 15544 + }, + { + "epoch": 8.767625493513819, + "grad_norm": 1.2526136636734009, + "learning_rate": 6.164692611393119e-06, + "loss": 0.7424, + "step": 15545 + }, + { + "epoch": 8.76818950930626, + "grad_norm": 1.0134758949279785, + "learning_rate": 6.161872532430908e-06, + "loss": 0.6294, + "step": 15546 + }, + { + "epoch": 8.768753525098703, + "grad_norm": 1.2265914678573608, + "learning_rate": 6.1590524534686975e-06, + "loss": 0.7304, + "step": 15547 + }, + { + "epoch": 8.769317540891144, + "grad_norm": 1.0943092107772827, + "learning_rate": 6.156232374506486e-06, + "loss": 0.7553, + "step": 15548 + }, + { + "epoch": 8.769881556683588, + "grad_norm": 1.274154782295227, + "learning_rate": 6.153412295544275e-06, + "loss": 0.7104, + "step": 15549 + }, + { + "epoch": 8.77044557247603, + "grad_norm": 1.4010734558105469, + "learning_rate": 6.150592216582064e-06, + "loss": 0.8008, + "step": 15550 + }, + { + "epoch": 8.771009588268472, + "grad_norm": 1.1088474988937378, + "learning_rate": 6.147772137619854e-06, + "loss": 0.6633, + "step": 15551 + }, + { + "epoch": 8.771573604060913, + "grad_norm": 1.039989948272705, + "learning_rate": 6.144952058657643e-06, + "loss": 0.713, + "step": 15552 + }, + { + "epoch": 8.772137619853355, + "grad_norm": 1.1231533288955688, + "learning_rate": 6.142131979695432e-06, + "loss": 0.7772, + "step": 15553 + }, + { + "epoch": 8.772701635645799, + "grad_norm": 0.9565939903259277, + "learning_rate": 6.139311900733221e-06, + "loss": 0.6537, + "step": 15554 + }, + { + "epoch": 8.77326565143824, + "grad_norm": 1.5761942863464355, + "learning_rate": 6.13649182177101e-06, + "loss": 0.8081, + "step": 15555 + }, + { + "epoch": 8.773829667230682, + "grad_norm": 1.1248325109481812, + "learning_rate": 6.133671742808799e-06, + "loss": 0.7214, + "step": 15556 + }, + { + "epoch": 8.774393683023124, + "grad_norm": 1.643825888633728, + "learning_rate": 6.130851663846588e-06, + "loss": 0.7902, + "step": 15557 + }, + { + "epoch": 8.774957698815566, + "grad_norm": 1.8981904983520508, + "learning_rate": 6.128031584884377e-06, + "loss": 0.7903, + "step": 15558 + }, + { + "epoch": 8.77552171460801, + "grad_norm": 1.1845028400421143, + "learning_rate": 6.125211505922166e-06, + "loss": 0.8054, + "step": 15559 + }, + { + "epoch": 8.776085730400451, + "grad_norm": 1.091767430305481, + "learning_rate": 6.122391426959955e-06, + "loss": 0.7588, + "step": 15560 + }, + { + "epoch": 8.776649746192893, + "grad_norm": 1.2349642515182495, + "learning_rate": 6.119571347997744e-06, + "loss": 0.7792, + "step": 15561 + }, + { + "epoch": 8.777213761985335, + "grad_norm": 0.9029604196548462, + "learning_rate": 6.116751269035533e-06, + "loss": 0.7832, + "step": 15562 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 1.1902233362197876, + "learning_rate": 6.1139311900733224e-06, + "loss": 0.6565, + "step": 15563 + }, + { + "epoch": 8.77834179357022, + "grad_norm": 1.3734434843063354, + "learning_rate": 6.111111111111111e-06, + "loss": 0.72, + "step": 15564 + }, + { + "epoch": 8.778905809362662, + "grad_norm": 1.4183632135391235, + "learning_rate": 6.1082910321489e-06, + "loss": 0.7977, + "step": 15565 + }, + { + "epoch": 8.779469825155104, + "grad_norm": 1.0488208532333374, + "learning_rate": 6.105470953186689e-06, + "loss": 0.7273, + "step": 15566 + }, + { + "epoch": 8.780033840947546, + "grad_norm": 0.8601890802383423, + "learning_rate": 6.1026508742244785e-06, + "loss": 0.666, + "step": 15567 + }, + { + "epoch": 8.78059785673999, + "grad_norm": 1.1309901475906372, + "learning_rate": 6.099830795262268e-06, + "loss": 0.8202, + "step": 15568 + }, + { + "epoch": 8.781161872532431, + "grad_norm": 1.7256306409835815, + "learning_rate": 6.097010716300057e-06, + "loss": 0.8778, + "step": 15569 + }, + { + "epoch": 8.781725888324873, + "grad_norm": 1.1116325855255127, + "learning_rate": 6.094190637337846e-06, + "loss": 0.7602, + "step": 15570 + }, + { + "epoch": 8.782289904117315, + "grad_norm": 1.3879235982894897, + "learning_rate": 6.091370558375635e-06, + "loss": 0.7453, + "step": 15571 + }, + { + "epoch": 8.782853919909758, + "grad_norm": 1.1323702335357666, + "learning_rate": 6.088550479413424e-06, + "loss": 0.7731, + "step": 15572 + }, + { + "epoch": 8.7834179357022, + "grad_norm": 1.0165343284606934, + "learning_rate": 6.085730400451213e-06, + "loss": 0.6819, + "step": 15573 + }, + { + "epoch": 8.783981951494642, + "grad_norm": 1.1582896709442139, + "learning_rate": 6.082910321489002e-06, + "loss": 0.6416, + "step": 15574 + }, + { + "epoch": 8.784545967287084, + "grad_norm": 1.1973322629928589, + "learning_rate": 6.080090242526791e-06, + "loss": 0.7246, + "step": 15575 + }, + { + "epoch": 8.785109983079526, + "grad_norm": 0.972099244594574, + "learning_rate": 6.07727016356458e-06, + "loss": 0.6998, + "step": 15576 + }, + { + "epoch": 8.78567399887197, + "grad_norm": 1.3087419271469116, + "learning_rate": 6.074450084602369e-06, + "loss": 0.7203, + "step": 15577 + }, + { + "epoch": 8.786238014664411, + "grad_norm": 1.0730199813842773, + "learning_rate": 6.071630005640158e-06, + "loss": 0.6724, + "step": 15578 + }, + { + "epoch": 8.786802030456853, + "grad_norm": 0.7856429219245911, + "learning_rate": 6.068809926677947e-06, + "loss": 0.6245, + "step": 15579 + }, + { + "epoch": 8.787366046249295, + "grad_norm": 1.1397223472595215, + "learning_rate": 6.065989847715736e-06, + "loss": 0.6366, + "step": 15580 + }, + { + "epoch": 8.787930062041736, + "grad_norm": 3.4923670291900635, + "learning_rate": 6.063169768753525e-06, + "loss": 0.6589, + "step": 15581 + }, + { + "epoch": 8.78849407783418, + "grad_norm": 0.8885844945907593, + "learning_rate": 6.060349689791315e-06, + "loss": 0.7562, + "step": 15582 + }, + { + "epoch": 8.789058093626622, + "grad_norm": 1.187247395515442, + "learning_rate": 6.0575296108291035e-06, + "loss": 0.8263, + "step": 15583 + }, + { + "epoch": 8.789622109419064, + "grad_norm": 1.3775355815887451, + "learning_rate": 6.054709531866892e-06, + "loss": 0.8188, + "step": 15584 + }, + { + "epoch": 8.790186125211505, + "grad_norm": 1.2643077373504639, + "learning_rate": 6.051889452904681e-06, + "loss": 0.7538, + "step": 15585 + }, + { + "epoch": 8.790750141003947, + "grad_norm": 1.0932230949401855, + "learning_rate": 6.049069373942471e-06, + "loss": 0.6805, + "step": 15586 + }, + { + "epoch": 8.79131415679639, + "grad_norm": 0.8353531956672668, + "learning_rate": 6.04624929498026e-06, + "loss": 0.6542, + "step": 15587 + }, + { + "epoch": 8.791878172588833, + "grad_norm": 1.0382499694824219, + "learning_rate": 6.043429216018049e-06, + "loss": 0.6866, + "step": 15588 + }, + { + "epoch": 8.792442188381274, + "grad_norm": 0.8515567779541016, + "learning_rate": 6.040609137055838e-06, + "loss": 0.73, + "step": 15589 + }, + { + "epoch": 8.793006204173716, + "grad_norm": 1.2737370729446411, + "learning_rate": 6.037789058093627e-06, + "loss": 0.776, + "step": 15590 + }, + { + "epoch": 8.79357021996616, + "grad_norm": 1.2184597253799438, + "learning_rate": 6.034968979131416e-06, + "loss": 0.7346, + "step": 15591 + }, + { + "epoch": 8.794134235758602, + "grad_norm": 1.181344985961914, + "learning_rate": 6.032148900169205e-06, + "loss": 0.7711, + "step": 15592 + }, + { + "epoch": 8.794698251551043, + "grad_norm": 0.8684912919998169, + "learning_rate": 6.029328821206994e-06, + "loss": 0.7468, + "step": 15593 + }, + { + "epoch": 8.795262267343485, + "grad_norm": 1.3354825973510742, + "learning_rate": 6.026508742244783e-06, + "loss": 0.764, + "step": 15594 + }, + { + "epoch": 8.795826283135927, + "grad_norm": 1.2298897504806519, + "learning_rate": 6.023688663282572e-06, + "loss": 0.7707, + "step": 15595 + }, + { + "epoch": 8.79639029892837, + "grad_norm": 1.173187494277954, + "learning_rate": 6.020868584320361e-06, + "loss": 0.794, + "step": 15596 + }, + { + "epoch": 8.796954314720812, + "grad_norm": 1.0805641412734985, + "learning_rate": 6.01804850535815e-06, + "loss": 0.7992, + "step": 15597 + }, + { + "epoch": 8.797518330513254, + "grad_norm": 0.9117469191551208, + "learning_rate": 6.01522842639594e-06, + "loss": 0.6624, + "step": 15598 + }, + { + "epoch": 8.798082346305696, + "grad_norm": 3.162827968597412, + "learning_rate": 6.0124083474337284e-06, + "loss": 0.779, + "step": 15599 + }, + { + "epoch": 8.79864636209814, + "grad_norm": 1.124650239944458, + "learning_rate": 6.009588268471517e-06, + "loss": 0.6857, + "step": 15600 + }, + { + "epoch": 8.799210377890581, + "grad_norm": 1.7683964967727661, + "learning_rate": 6.006768189509306e-06, + "loss": 0.6636, + "step": 15601 + }, + { + "epoch": 8.799774393683023, + "grad_norm": 0.9235720634460449, + "learning_rate": 6.003948110547096e-06, + "loss": 0.6421, + "step": 15602 + }, + { + "epoch": 8.800338409475465, + "grad_norm": 1.6238961219787598, + "learning_rate": 6.0011280315848845e-06, + "loss": 0.7375, + "step": 15603 + }, + { + "epoch": 8.800902425267907, + "grad_norm": 1.3714300394058228, + "learning_rate": 5.998307952622674e-06, + "loss": 0.8093, + "step": 15604 + }, + { + "epoch": 8.80146644106035, + "grad_norm": 1.1223558187484741, + "learning_rate": 5.995487873660463e-06, + "loss": 0.681, + "step": 15605 + }, + { + "epoch": 8.802030456852792, + "grad_norm": 0.8962498903274536, + "learning_rate": 5.992667794698252e-06, + "loss": 0.6646, + "step": 15606 + }, + { + "epoch": 8.802594472645234, + "grad_norm": 1.178114652633667, + "learning_rate": 5.989847715736041e-06, + "loss": 0.6752, + "step": 15607 + }, + { + "epoch": 8.803158488437676, + "grad_norm": 1.4982410669326782, + "learning_rate": 5.98702763677383e-06, + "loss": 0.7728, + "step": 15608 + }, + { + "epoch": 8.803722504230118, + "grad_norm": 1.3325992822647095, + "learning_rate": 5.984207557811619e-06, + "loss": 0.7825, + "step": 15609 + }, + { + "epoch": 8.804286520022561, + "grad_norm": 1.2819204330444336, + "learning_rate": 5.981387478849408e-06, + "loss": 0.6439, + "step": 15610 + }, + { + "epoch": 8.804850535815003, + "grad_norm": 1.010475754737854, + "learning_rate": 5.978567399887197e-06, + "loss": 0.6073, + "step": 15611 + }, + { + "epoch": 8.805414551607445, + "grad_norm": 1.2399208545684814, + "learning_rate": 5.975747320924986e-06, + "loss": 0.642, + "step": 15612 + }, + { + "epoch": 8.805978567399887, + "grad_norm": 1.2574782371520996, + "learning_rate": 5.972927241962775e-06, + "loss": 0.7994, + "step": 15613 + }, + { + "epoch": 8.806542583192329, + "grad_norm": 1.3710798025131226, + "learning_rate": 5.970107163000565e-06, + "loss": 0.7367, + "step": 15614 + }, + { + "epoch": 8.807106598984772, + "grad_norm": 1.3298286199569702, + "learning_rate": 5.967287084038353e-06, + "loss": 0.784, + "step": 15615 + }, + { + "epoch": 8.807670614777214, + "grad_norm": 0.922718346118927, + "learning_rate": 5.964467005076142e-06, + "loss": 0.7145, + "step": 15616 + }, + { + "epoch": 8.808234630569656, + "grad_norm": 0.8811307549476624, + "learning_rate": 5.961646926113931e-06, + "loss": 0.7156, + "step": 15617 + }, + { + "epoch": 8.808798646362098, + "grad_norm": 1.179840087890625, + "learning_rate": 5.958826847151721e-06, + "loss": 0.8235, + "step": 15618 + }, + { + "epoch": 8.809362662154541, + "grad_norm": 0.9190409183502197, + "learning_rate": 5.9560067681895094e-06, + "loss": 0.608, + "step": 15619 + }, + { + "epoch": 8.809926677946983, + "grad_norm": 1.6388593912124634, + "learning_rate": 5.953186689227298e-06, + "loss": 0.7363, + "step": 15620 + }, + { + "epoch": 8.810490693739425, + "grad_norm": 0.929120659828186, + "learning_rate": 5.950366610265088e-06, + "loss": 0.599, + "step": 15621 + }, + { + "epoch": 8.811054709531867, + "grad_norm": 0.941989541053772, + "learning_rate": 5.947546531302877e-06, + "loss": 0.6844, + "step": 15622 + }, + { + "epoch": 8.811618725324308, + "grad_norm": 1.1240545511245728, + "learning_rate": 5.944726452340666e-06, + "loss": 0.565, + "step": 15623 + }, + { + "epoch": 8.812182741116752, + "grad_norm": 1.091033697128296, + "learning_rate": 5.941906373378455e-06, + "loss": 0.6834, + "step": 15624 + }, + { + "epoch": 8.812746756909194, + "grad_norm": 1.2313772439956665, + "learning_rate": 5.939086294416244e-06, + "loss": 0.6981, + "step": 15625 + }, + { + "epoch": 8.813310772701636, + "grad_norm": 1.1712548732757568, + "learning_rate": 5.936266215454033e-06, + "loss": 0.7264, + "step": 15626 + }, + { + "epoch": 8.813874788494077, + "grad_norm": 1.0581414699554443, + "learning_rate": 5.933446136491822e-06, + "loss": 0.7585, + "step": 15627 + }, + { + "epoch": 8.814438804286521, + "grad_norm": 1.1387128829956055, + "learning_rate": 5.930626057529611e-06, + "loss": 0.7076, + "step": 15628 + }, + { + "epoch": 8.815002820078963, + "grad_norm": 1.019397497177124, + "learning_rate": 5.9278059785674e-06, + "loss": 0.7317, + "step": 15629 + }, + { + "epoch": 8.815566835871405, + "grad_norm": 1.7505066394805908, + "learning_rate": 5.92498589960519e-06, + "loss": 0.9215, + "step": 15630 + }, + { + "epoch": 8.816130851663846, + "grad_norm": 1.1035609245300293, + "learning_rate": 5.922165820642978e-06, + "loss": 0.8204, + "step": 15631 + }, + { + "epoch": 8.816694867456288, + "grad_norm": 1.4110445976257324, + "learning_rate": 5.919345741680767e-06, + "loss": 0.7129, + "step": 15632 + }, + { + "epoch": 8.817258883248732, + "grad_norm": 1.5037713050842285, + "learning_rate": 5.916525662718556e-06, + "loss": 0.7929, + "step": 15633 + }, + { + "epoch": 8.817822899041174, + "grad_norm": 1.0779588222503662, + "learning_rate": 5.913705583756346e-06, + "loss": 0.6509, + "step": 15634 + }, + { + "epoch": 8.818386914833615, + "grad_norm": 1.269577980041504, + "learning_rate": 5.9108855047941344e-06, + "loss": 0.7068, + "step": 15635 + }, + { + "epoch": 8.818950930626057, + "grad_norm": 0.9411147832870483, + "learning_rate": 5.908065425831923e-06, + "loss": 0.7495, + "step": 15636 + }, + { + "epoch": 8.819514946418499, + "grad_norm": 1.20859956741333, + "learning_rate": 5.905245346869712e-06, + "loss": 0.7495, + "step": 15637 + }, + { + "epoch": 8.820078962210943, + "grad_norm": 1.2856906652450562, + "learning_rate": 5.902425267907502e-06, + "loss": 0.7483, + "step": 15638 + }, + { + "epoch": 8.820642978003384, + "grad_norm": 1.1414729356765747, + "learning_rate": 5.899605188945291e-06, + "loss": 0.7555, + "step": 15639 + }, + { + "epoch": 8.821206993795826, + "grad_norm": 1.3891990184783936, + "learning_rate": 5.89678510998308e-06, + "loss": 0.7594, + "step": 15640 + }, + { + "epoch": 8.821771009588268, + "grad_norm": 1.0065819025039673, + "learning_rate": 5.893965031020869e-06, + "loss": 0.7743, + "step": 15641 + }, + { + "epoch": 8.82233502538071, + "grad_norm": 1.1780575513839722, + "learning_rate": 5.8911449520586585e-06, + "loss": 0.7379, + "step": 15642 + }, + { + "epoch": 8.822899041173153, + "grad_norm": 1.046801209449768, + "learning_rate": 5.888324873096447e-06, + "loss": 0.7696, + "step": 15643 + }, + { + "epoch": 8.823463056965595, + "grad_norm": 1.123863935470581, + "learning_rate": 5.885504794134236e-06, + "loss": 0.6898, + "step": 15644 + }, + { + "epoch": 8.824027072758037, + "grad_norm": 1.3523814678192139, + "learning_rate": 5.882684715172025e-06, + "loss": 0.8542, + "step": 15645 + }, + { + "epoch": 8.824591088550479, + "grad_norm": 0.9896530508995056, + "learning_rate": 5.8798646362098146e-06, + "loss": 0.794, + "step": 15646 + }, + { + "epoch": 8.825155104342922, + "grad_norm": 1.2922645807266235, + "learning_rate": 5.877044557247603e-06, + "loss": 0.6871, + "step": 15647 + }, + { + "epoch": 8.825719120135364, + "grad_norm": 1.2725235223770142, + "learning_rate": 5.874224478285392e-06, + "loss": 0.6878, + "step": 15648 + }, + { + "epoch": 8.826283135927806, + "grad_norm": 1.2262216806411743, + "learning_rate": 5.871404399323181e-06, + "loss": 0.8343, + "step": 15649 + }, + { + "epoch": 8.826847151720248, + "grad_norm": 1.07465660572052, + "learning_rate": 5.868584320360971e-06, + "loss": 0.7258, + "step": 15650 + }, + { + "epoch": 8.82741116751269, + "grad_norm": 1.2994170188903809, + "learning_rate": 5.865764241398759e-06, + "loss": 0.7053, + "step": 15651 + }, + { + "epoch": 8.827975183305133, + "grad_norm": 1.817016363143921, + "learning_rate": 5.862944162436548e-06, + "loss": 0.8659, + "step": 15652 + }, + { + "epoch": 8.828539199097575, + "grad_norm": 1.2600477933883667, + "learning_rate": 5.860124083474337e-06, + "loss": 0.8029, + "step": 15653 + }, + { + "epoch": 8.829103214890017, + "grad_norm": 0.9805848598480225, + "learning_rate": 5.857304004512127e-06, + "loss": 0.7489, + "step": 15654 + }, + { + "epoch": 8.829667230682459, + "grad_norm": 1.3755483627319336, + "learning_rate": 5.8544839255499154e-06, + "loss": 0.8144, + "step": 15655 + }, + { + "epoch": 8.830231246474902, + "grad_norm": 0.9888287782669067, + "learning_rate": 5.851663846587704e-06, + "loss": 0.6399, + "step": 15656 + }, + { + "epoch": 8.830795262267344, + "grad_norm": 1.192792296409607, + "learning_rate": 5.848843767625494e-06, + "loss": 0.7766, + "step": 15657 + }, + { + "epoch": 8.831359278059786, + "grad_norm": 0.8978944420814514, + "learning_rate": 5.8460236886632835e-06, + "loss": 0.6356, + "step": 15658 + }, + { + "epoch": 8.831923293852228, + "grad_norm": 0.9847171902656555, + "learning_rate": 5.843203609701072e-06, + "loss": 0.6481, + "step": 15659 + }, + { + "epoch": 8.83248730964467, + "grad_norm": 0.9284185171127319, + "learning_rate": 5.840383530738861e-06, + "loss": 0.6913, + "step": 15660 + }, + { + "epoch": 8.833051325437113, + "grad_norm": 1.2115224599838257, + "learning_rate": 5.83756345177665e-06, + "loss": 0.8022, + "step": 15661 + }, + { + "epoch": 8.833615341229555, + "grad_norm": 0.9643070697784424, + "learning_rate": 5.8347433728144396e-06, + "loss": 0.7589, + "step": 15662 + }, + { + "epoch": 8.834179357021997, + "grad_norm": 0.9055867195129395, + "learning_rate": 5.831923293852228e-06, + "loss": 0.7519, + "step": 15663 + }, + { + "epoch": 8.834743372814438, + "grad_norm": 0.8887812495231628, + "learning_rate": 5.829103214890017e-06, + "loss": 0.6465, + "step": 15664 + }, + { + "epoch": 8.83530738860688, + "grad_norm": 1.1144975423812866, + "learning_rate": 5.826283135927806e-06, + "loss": 0.7075, + "step": 15665 + }, + { + "epoch": 8.835871404399324, + "grad_norm": 1.0702053308486938, + "learning_rate": 5.823463056965596e-06, + "loss": 0.6635, + "step": 15666 + }, + { + "epoch": 8.836435420191766, + "grad_norm": 0.980436384677887, + "learning_rate": 5.820642978003384e-06, + "loss": 0.5446, + "step": 15667 + }, + { + "epoch": 8.836999435984207, + "grad_norm": 1.322812557220459, + "learning_rate": 5.817822899041173e-06, + "loss": 0.7716, + "step": 15668 + }, + { + "epoch": 8.83756345177665, + "grad_norm": 1.3686426877975464, + "learning_rate": 5.815002820078962e-06, + "loss": 0.7306, + "step": 15669 + }, + { + "epoch": 8.838127467569091, + "grad_norm": 1.0878245830535889, + "learning_rate": 5.812182741116752e-06, + "loss": 0.7684, + "step": 15670 + }, + { + "epoch": 8.838691483361535, + "grad_norm": 1.3213614225387573, + "learning_rate": 5.80936266215454e-06, + "loss": 0.7061, + "step": 15671 + }, + { + "epoch": 8.839255499153976, + "grad_norm": 1.2673777341842651, + "learning_rate": 5.806542583192329e-06, + "loss": 0.7619, + "step": 15672 + }, + { + "epoch": 8.839819514946418, + "grad_norm": 1.0701059103012085, + "learning_rate": 5.803722504230118e-06, + "loss": 0.7964, + "step": 15673 + }, + { + "epoch": 8.84038353073886, + "grad_norm": 1.3040567636489868, + "learning_rate": 5.8009024252679085e-06, + "loss": 0.6841, + "step": 15674 + }, + { + "epoch": 8.840947546531304, + "grad_norm": 0.8800081610679626, + "learning_rate": 5.798082346305697e-06, + "loss": 0.6697, + "step": 15675 + }, + { + "epoch": 8.841511562323745, + "grad_norm": 1.0676779747009277, + "learning_rate": 5.795262267343486e-06, + "loss": 0.7241, + "step": 15676 + }, + { + "epoch": 8.842075578116187, + "grad_norm": 1.2600401639938354, + "learning_rate": 5.792442188381275e-06, + "loss": 0.7287, + "step": 15677 + }, + { + "epoch": 8.842639593908629, + "grad_norm": 1.297631859779358, + "learning_rate": 5.7896221094190645e-06, + "loss": 0.8228, + "step": 15678 + }, + { + "epoch": 8.843203609701071, + "grad_norm": 0.9326059818267822, + "learning_rate": 5.786802030456853e-06, + "loss": 0.68, + "step": 15679 + }, + { + "epoch": 8.843767625493514, + "grad_norm": 1.0351128578186035, + "learning_rate": 5.783981951494642e-06, + "loss": 0.5724, + "step": 15680 + }, + { + "epoch": 8.844331641285956, + "grad_norm": 1.1093522310256958, + "learning_rate": 5.781161872532431e-06, + "loss": 0.6512, + "step": 15681 + }, + { + "epoch": 8.844895657078398, + "grad_norm": 1.01486337184906, + "learning_rate": 5.7783417935702206e-06, + "loss": 0.7009, + "step": 15682 + }, + { + "epoch": 8.84545967287084, + "grad_norm": 1.023905873298645, + "learning_rate": 5.775521714608009e-06, + "loss": 0.7616, + "step": 15683 + }, + { + "epoch": 8.846023688663283, + "grad_norm": 0.9236940145492554, + "learning_rate": 5.772701635645798e-06, + "loss": 0.7746, + "step": 15684 + }, + { + "epoch": 8.846587704455725, + "grad_norm": 1.1203362941741943, + "learning_rate": 5.769881556683587e-06, + "loss": 0.832, + "step": 15685 + }, + { + "epoch": 8.847151720248167, + "grad_norm": 1.0735952854156494, + "learning_rate": 5.767061477721377e-06, + "loss": 0.687, + "step": 15686 + }, + { + "epoch": 8.847715736040609, + "grad_norm": 1.230911135673523, + "learning_rate": 5.764241398759165e-06, + "loss": 0.8258, + "step": 15687 + }, + { + "epoch": 8.84827975183305, + "grad_norm": 0.7918270826339722, + "learning_rate": 5.761421319796954e-06, + "loss": 0.6033, + "step": 15688 + }, + { + "epoch": 8.848843767625494, + "grad_norm": 1.0827267169952393, + "learning_rate": 5.758601240834743e-06, + "loss": 0.6753, + "step": 15689 + }, + { + "epoch": 8.849407783417936, + "grad_norm": 1.373448133468628, + "learning_rate": 5.755781161872533e-06, + "loss": 0.6902, + "step": 15690 + }, + { + "epoch": 8.849971799210378, + "grad_norm": 1.2351467609405518, + "learning_rate": 5.7529610829103214e-06, + "loss": 0.8017, + "step": 15691 + }, + { + "epoch": 8.85053581500282, + "grad_norm": 1.0031163692474365, + "learning_rate": 5.750141003948111e-06, + "loss": 0.6814, + "step": 15692 + }, + { + "epoch": 8.851099830795262, + "grad_norm": 0.8948768973350525, + "learning_rate": 5.7473209249859e-06, + "loss": 0.621, + "step": 15693 + }, + { + "epoch": 8.851663846587705, + "grad_norm": 0.8991069793701172, + "learning_rate": 5.7445008460236895e-06, + "loss": 0.6419, + "step": 15694 + }, + { + "epoch": 8.852227862380147, + "grad_norm": 1.1699590682983398, + "learning_rate": 5.741680767061478e-06, + "loss": 0.6995, + "step": 15695 + }, + { + "epoch": 8.852791878172589, + "grad_norm": 0.9730979800224304, + "learning_rate": 5.738860688099267e-06, + "loss": 0.7462, + "step": 15696 + }, + { + "epoch": 8.85335589396503, + "grad_norm": 1.4148118495941162, + "learning_rate": 5.736040609137056e-06, + "loss": 0.8393, + "step": 15697 + }, + { + "epoch": 8.853919909757472, + "grad_norm": 0.972818911075592, + "learning_rate": 5.7332205301748455e-06, + "loss": 0.6721, + "step": 15698 + }, + { + "epoch": 8.854483925549916, + "grad_norm": 1.388077735900879, + "learning_rate": 5.730400451212634e-06, + "loss": 0.7169, + "step": 15699 + }, + { + "epoch": 8.855047941342358, + "grad_norm": 1.0251038074493408, + "learning_rate": 5.727580372250423e-06, + "loss": 0.6565, + "step": 15700 + }, + { + "epoch": 8.8556119571348, + "grad_norm": 0.8757269978523254, + "learning_rate": 5.724760293288212e-06, + "loss": 0.6848, + "step": 15701 + }, + { + "epoch": 8.856175972927241, + "grad_norm": 1.489950180053711, + "learning_rate": 5.721940214326002e-06, + "loss": 0.8614, + "step": 15702 + }, + { + "epoch": 8.856739988719685, + "grad_norm": 1.2343307733535767, + "learning_rate": 5.71912013536379e-06, + "loss": 0.7154, + "step": 15703 + }, + { + "epoch": 8.857304004512127, + "grad_norm": 1.20963454246521, + "learning_rate": 5.716300056401579e-06, + "loss": 0.7494, + "step": 15704 + }, + { + "epoch": 8.857868020304569, + "grad_norm": 1.2071659564971924, + "learning_rate": 5.713479977439368e-06, + "loss": 0.7509, + "step": 15705 + }, + { + "epoch": 8.85843203609701, + "grad_norm": 1.0318934917449951, + "learning_rate": 5.710659898477158e-06, + "loss": 0.7597, + "step": 15706 + }, + { + "epoch": 8.858996051889452, + "grad_norm": 1.1348581314086914, + "learning_rate": 5.707839819514946e-06, + "loss": 0.8272, + "step": 15707 + }, + { + "epoch": 8.859560067681896, + "grad_norm": 1.0110337734222412, + "learning_rate": 5.705019740552735e-06, + "loss": 0.7495, + "step": 15708 + }, + { + "epoch": 8.860124083474338, + "grad_norm": 1.2825908660888672, + "learning_rate": 5.702199661590525e-06, + "loss": 0.7326, + "step": 15709 + }, + { + "epoch": 8.86068809926678, + "grad_norm": 1.8588130474090576, + "learning_rate": 5.6993795826283145e-06, + "loss": 0.7962, + "step": 15710 + }, + { + "epoch": 8.861252115059221, + "grad_norm": 1.3623991012573242, + "learning_rate": 5.696559503666103e-06, + "loss": 0.6988, + "step": 15711 + }, + { + "epoch": 8.861816130851665, + "grad_norm": 1.0488650798797607, + "learning_rate": 5.693739424703892e-06, + "loss": 0.6928, + "step": 15712 + }, + { + "epoch": 8.862380146644107, + "grad_norm": 1.5607597827911377, + "learning_rate": 5.690919345741681e-06, + "loss": 0.7795, + "step": 15713 + }, + { + "epoch": 8.862944162436548, + "grad_norm": 1.005859375, + "learning_rate": 5.6880992667794705e-06, + "loss": 0.6975, + "step": 15714 + }, + { + "epoch": 8.86350817822899, + "grad_norm": 1.756171464920044, + "learning_rate": 5.685279187817259e-06, + "loss": 0.6702, + "step": 15715 + }, + { + "epoch": 8.864072194021432, + "grad_norm": 0.8208812475204468, + "learning_rate": 5.682459108855048e-06, + "loss": 0.6633, + "step": 15716 + }, + { + "epoch": 8.864636209813876, + "grad_norm": 1.6820091009140015, + "learning_rate": 5.679639029892837e-06, + "loss": 0.7212, + "step": 15717 + }, + { + "epoch": 8.865200225606317, + "grad_norm": 1.3103634119033813, + "learning_rate": 5.6768189509306266e-06, + "loss": 0.672, + "step": 15718 + }, + { + "epoch": 8.86576424139876, + "grad_norm": 1.191770076751709, + "learning_rate": 5.673998871968415e-06, + "loss": 0.742, + "step": 15719 + }, + { + "epoch": 8.866328257191201, + "grad_norm": 1.3711180686950684, + "learning_rate": 5.671178793006204e-06, + "loss": 0.8026, + "step": 15720 + }, + { + "epoch": 8.866892272983643, + "grad_norm": 1.1000151634216309, + "learning_rate": 5.668358714043994e-06, + "loss": 0.7184, + "step": 15721 + }, + { + "epoch": 8.867456288776086, + "grad_norm": 1.2739207744598389, + "learning_rate": 5.665538635081783e-06, + "loss": 0.7454, + "step": 15722 + }, + { + "epoch": 8.868020304568528, + "grad_norm": 1.2608009576797485, + "learning_rate": 5.662718556119571e-06, + "loss": 0.7991, + "step": 15723 + }, + { + "epoch": 8.86858432036097, + "grad_norm": 1.2203834056854248, + "learning_rate": 5.65989847715736e-06, + "loss": 0.8424, + "step": 15724 + }, + { + "epoch": 8.869148336153412, + "grad_norm": 1.3660273551940918, + "learning_rate": 5.65707839819515e-06, + "loss": 0.7874, + "step": 15725 + }, + { + "epoch": 8.869712351945854, + "grad_norm": 1.622586965560913, + "learning_rate": 5.654258319232939e-06, + "loss": 0.928, + "step": 15726 + }, + { + "epoch": 8.870276367738297, + "grad_norm": 1.0482491254806519, + "learning_rate": 5.651438240270728e-06, + "loss": 0.5939, + "step": 15727 + }, + { + "epoch": 8.870840383530739, + "grad_norm": 1.3662999868392944, + "learning_rate": 5.648618161308517e-06, + "loss": 0.7534, + "step": 15728 + }, + { + "epoch": 8.87140439932318, + "grad_norm": 1.172473430633545, + "learning_rate": 5.645798082346306e-06, + "loss": 0.6625, + "step": 15729 + }, + { + "epoch": 8.871968415115623, + "grad_norm": 0.9664345383644104, + "learning_rate": 5.6429780033840955e-06, + "loss": 0.7075, + "step": 15730 + }, + { + "epoch": 8.872532430908066, + "grad_norm": 0.8102058172225952, + "learning_rate": 5.640157924421884e-06, + "loss": 0.5783, + "step": 15731 + }, + { + "epoch": 8.873096446700508, + "grad_norm": 1.2564972639083862, + "learning_rate": 5.637337845459673e-06, + "loss": 0.8427, + "step": 15732 + }, + { + "epoch": 8.87366046249295, + "grad_norm": 1.4216090440750122, + "learning_rate": 5.634517766497462e-06, + "loss": 0.7502, + "step": 15733 + }, + { + "epoch": 8.874224478285392, + "grad_norm": 0.9237523674964905, + "learning_rate": 5.6316976875352515e-06, + "loss": 0.6053, + "step": 15734 + }, + { + "epoch": 8.874788494077833, + "grad_norm": 0.8269522190093994, + "learning_rate": 5.62887760857304e-06, + "loss": 0.6348, + "step": 15735 + }, + { + "epoch": 8.875352509870277, + "grad_norm": 0.9311215281486511, + "learning_rate": 5.626057529610829e-06, + "loss": 0.7082, + "step": 15736 + }, + { + "epoch": 8.875916525662719, + "grad_norm": 1.0754636526107788, + "learning_rate": 5.623237450648619e-06, + "loss": 0.7733, + "step": 15737 + }, + { + "epoch": 8.87648054145516, + "grad_norm": 1.262937307357788, + "learning_rate": 5.6204173716864076e-06, + "loss": 0.7521, + "step": 15738 + }, + { + "epoch": 8.877044557247602, + "grad_norm": 1.278791904449463, + "learning_rate": 5.617597292724196e-06, + "loss": 0.8705, + "step": 15739 + }, + { + "epoch": 8.877608573040046, + "grad_norm": 0.9948554635047913, + "learning_rate": 5.614777213761985e-06, + "loss": 0.727, + "step": 15740 + }, + { + "epoch": 8.878172588832488, + "grad_norm": 1.0560808181762695, + "learning_rate": 5.611957134799775e-06, + "loss": 0.7837, + "step": 15741 + }, + { + "epoch": 8.87873660462493, + "grad_norm": 1.248939037322998, + "learning_rate": 5.609137055837564e-06, + "loss": 0.8122, + "step": 15742 + }, + { + "epoch": 8.879300620417371, + "grad_norm": 1.123515248298645, + "learning_rate": 5.606316976875352e-06, + "loss": 0.7221, + "step": 15743 + }, + { + "epoch": 8.879864636209813, + "grad_norm": 1.1616442203521729, + "learning_rate": 5.603496897913141e-06, + "loss": 0.657, + "step": 15744 + }, + { + "epoch": 8.880428652002257, + "grad_norm": 1.3480210304260254, + "learning_rate": 5.600676818950931e-06, + "loss": 0.8539, + "step": 15745 + }, + { + "epoch": 8.880992667794699, + "grad_norm": 1.071242332458496, + "learning_rate": 5.5978567399887205e-06, + "loss": 0.8844, + "step": 15746 + }, + { + "epoch": 8.88155668358714, + "grad_norm": 1.4100210666656494, + "learning_rate": 5.595036661026509e-06, + "loss": 0.7894, + "step": 15747 + }, + { + "epoch": 8.882120699379582, + "grad_norm": 1.4230810403823853, + "learning_rate": 5.592216582064298e-06, + "loss": 0.8031, + "step": 15748 + }, + { + "epoch": 8.882684715172024, + "grad_norm": 1.1534876823425293, + "learning_rate": 5.589396503102087e-06, + "loss": 0.6514, + "step": 15749 + }, + { + "epoch": 8.883248730964468, + "grad_norm": 0.8556134700775146, + "learning_rate": 5.5865764241398765e-06, + "loss": 0.7267, + "step": 15750 + }, + { + "epoch": 8.88381274675691, + "grad_norm": 1.1266390085220337, + "learning_rate": 5.583756345177665e-06, + "loss": 0.652, + "step": 15751 + }, + { + "epoch": 8.884376762549351, + "grad_norm": 1.2896419763565063, + "learning_rate": 5.580936266215454e-06, + "loss": 0.7082, + "step": 15752 + }, + { + "epoch": 8.884940778341793, + "grad_norm": 0.8820551633834839, + "learning_rate": 5.578116187253244e-06, + "loss": 0.7172, + "step": 15753 + }, + { + "epoch": 8.885504794134235, + "grad_norm": 1.049302339553833, + "learning_rate": 5.5752961082910326e-06, + "loss": 0.7651, + "step": 15754 + }, + { + "epoch": 8.886068809926678, + "grad_norm": 1.3554673194885254, + "learning_rate": 5.572476029328821e-06, + "loss": 0.7319, + "step": 15755 + }, + { + "epoch": 8.88663282571912, + "grad_norm": 1.4989933967590332, + "learning_rate": 5.56965595036661e-06, + "loss": 0.7382, + "step": 15756 + }, + { + "epoch": 8.887196841511562, + "grad_norm": 0.9341689944267273, + "learning_rate": 5.5668358714044e-06, + "loss": 0.6659, + "step": 15757 + }, + { + "epoch": 8.887760857304004, + "grad_norm": 1.2227275371551514, + "learning_rate": 5.564015792442189e-06, + "loss": 0.7094, + "step": 15758 + }, + { + "epoch": 8.888324873096447, + "grad_norm": 1.3713475465774536, + "learning_rate": 5.561195713479977e-06, + "loss": 0.802, + "step": 15759 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 1.0989142656326294, + "learning_rate": 5.558375634517766e-06, + "loss": 0.7624, + "step": 15760 + }, + { + "epoch": 8.889452904681331, + "grad_norm": 1.058530569076538, + "learning_rate": 5.555555555555556e-06, + "loss": 0.6735, + "step": 15761 + }, + { + "epoch": 8.890016920473773, + "grad_norm": 1.0357707738876343, + "learning_rate": 5.5527354765933455e-06, + "loss": 0.7096, + "step": 15762 + }, + { + "epoch": 8.890580936266215, + "grad_norm": 1.3981021642684937, + "learning_rate": 5.549915397631134e-06, + "loss": 0.7668, + "step": 15763 + }, + { + "epoch": 8.891144952058658, + "grad_norm": 1.0735515356063843, + "learning_rate": 5.547095318668923e-06, + "loss": 0.8125, + "step": 15764 + }, + { + "epoch": 8.8917089678511, + "grad_norm": 1.3582524061203003, + "learning_rate": 5.544275239706713e-06, + "loss": 0.784, + "step": 15765 + }, + { + "epoch": 8.892272983643542, + "grad_norm": 1.0622209310531616, + "learning_rate": 5.5414551607445015e-06, + "loss": 0.6401, + "step": 15766 + }, + { + "epoch": 8.892836999435984, + "grad_norm": 0.916308581829071, + "learning_rate": 5.53863508178229e-06, + "loss": 0.7524, + "step": 15767 + }, + { + "epoch": 8.893401015228427, + "grad_norm": 0.9756428003311157, + "learning_rate": 5.535815002820079e-06, + "loss": 0.6725, + "step": 15768 + }, + { + "epoch": 8.893965031020869, + "grad_norm": 1.6206532716751099, + "learning_rate": 5.532994923857869e-06, + "loss": 0.7451, + "step": 15769 + }, + { + "epoch": 8.894529046813311, + "grad_norm": 1.0901280641555786, + "learning_rate": 5.5301748448956575e-06, + "loss": 0.7598, + "step": 15770 + }, + { + "epoch": 8.895093062605753, + "grad_norm": 1.1709078550338745, + "learning_rate": 5.527354765933446e-06, + "loss": 0.717, + "step": 15771 + }, + { + "epoch": 8.895657078398195, + "grad_norm": 1.0966469049453735, + "learning_rate": 5.524534686971235e-06, + "loss": 0.7209, + "step": 15772 + }, + { + "epoch": 8.896221094190638, + "grad_norm": 1.372986912727356, + "learning_rate": 5.521714608009025e-06, + "loss": 0.7974, + "step": 15773 + }, + { + "epoch": 8.89678510998308, + "grad_norm": 1.2141896486282349, + "learning_rate": 5.5188945290468136e-06, + "loss": 0.7037, + "step": 15774 + }, + { + "epoch": 8.897349125775522, + "grad_norm": 0.9688345789909363, + "learning_rate": 5.516074450084602e-06, + "loss": 0.7345, + "step": 15775 + }, + { + "epoch": 8.897913141567964, + "grad_norm": 1.1612064838409424, + "learning_rate": 5.513254371122391e-06, + "loss": 0.76, + "step": 15776 + }, + { + "epoch": 8.898477157360405, + "grad_norm": 0.9274187088012695, + "learning_rate": 5.510434292160181e-06, + "loss": 0.6538, + "step": 15777 + }, + { + "epoch": 8.899041173152849, + "grad_norm": 1.198647379875183, + "learning_rate": 5.50761421319797e-06, + "loss": 0.7819, + "step": 15778 + }, + { + "epoch": 8.89960518894529, + "grad_norm": 1.145241141319275, + "learning_rate": 5.504794134235758e-06, + "loss": 0.789, + "step": 15779 + }, + { + "epoch": 8.900169204737733, + "grad_norm": 1.0762748718261719, + "learning_rate": 5.501974055273548e-06, + "loss": 0.7064, + "step": 15780 + }, + { + "epoch": 8.900733220530174, + "grad_norm": 1.0879936218261719, + "learning_rate": 5.499153976311338e-06, + "loss": 0.8215, + "step": 15781 + }, + { + "epoch": 8.901297236322616, + "grad_norm": 1.2765438556671143, + "learning_rate": 5.4963338973491265e-06, + "loss": 0.8339, + "step": 15782 + }, + { + "epoch": 8.90186125211506, + "grad_norm": 1.588844656944275, + "learning_rate": 5.493513818386915e-06, + "loss": 0.801, + "step": 15783 + }, + { + "epoch": 8.902425267907502, + "grad_norm": 0.9832957983016968, + "learning_rate": 5.490693739424704e-06, + "loss": 0.6577, + "step": 15784 + }, + { + "epoch": 8.902989283699943, + "grad_norm": 38.768646240234375, + "learning_rate": 5.487873660462494e-06, + "loss": 0.7575, + "step": 15785 + }, + { + "epoch": 8.903553299492385, + "grad_norm": 1.082545518875122, + "learning_rate": 5.4850535815002825e-06, + "loss": 0.6807, + "step": 15786 + }, + { + "epoch": 8.904117315284829, + "grad_norm": 1.0032862424850464, + "learning_rate": 5.482233502538071e-06, + "loss": 0.7516, + "step": 15787 + }, + { + "epoch": 8.90468133107727, + "grad_norm": 1.3084895610809326, + "learning_rate": 5.47941342357586e-06, + "loss": 0.7691, + "step": 15788 + }, + { + "epoch": 8.905245346869712, + "grad_norm": 1.1091649532318115, + "learning_rate": 5.47659334461365e-06, + "loss": 0.8003, + "step": 15789 + }, + { + "epoch": 8.905809362662154, + "grad_norm": 1.0567142963409424, + "learning_rate": 5.4737732656514385e-06, + "loss": 0.6839, + "step": 15790 + }, + { + "epoch": 8.906373378454596, + "grad_norm": 1.10641348361969, + "learning_rate": 5.470953186689227e-06, + "loss": 0.6831, + "step": 15791 + }, + { + "epoch": 8.90693739424704, + "grad_norm": 1.18779718875885, + "learning_rate": 5.468133107727016e-06, + "loss": 0.821, + "step": 15792 + }, + { + "epoch": 8.907501410039481, + "grad_norm": 1.1446330547332764, + "learning_rate": 5.465313028764806e-06, + "loss": 0.6849, + "step": 15793 + }, + { + "epoch": 8.908065425831923, + "grad_norm": 1.2544018030166626, + "learning_rate": 5.462492949802595e-06, + "loss": 0.7568, + "step": 15794 + }, + { + "epoch": 8.908629441624365, + "grad_norm": 1.428969383239746, + "learning_rate": 5.459672870840383e-06, + "loss": 0.7777, + "step": 15795 + }, + { + "epoch": 8.909193457416809, + "grad_norm": 1.236256718635559, + "learning_rate": 5.456852791878172e-06, + "loss": 0.689, + "step": 15796 + }, + { + "epoch": 8.90975747320925, + "grad_norm": 0.7884833812713623, + "learning_rate": 5.454032712915962e-06, + "loss": 0.6581, + "step": 15797 + }, + { + "epoch": 8.910321489001692, + "grad_norm": 0.9530991911888123, + "learning_rate": 5.4512126339537515e-06, + "loss": 0.8105, + "step": 15798 + }, + { + "epoch": 8.910885504794134, + "grad_norm": 1.2930959463119507, + "learning_rate": 5.44839255499154e-06, + "loss": 0.7456, + "step": 15799 + }, + { + "epoch": 8.911449520586576, + "grad_norm": 0.9321538209915161, + "learning_rate": 5.445572476029329e-06, + "loss": 0.6997, + "step": 15800 + }, + { + "epoch": 8.91201353637902, + "grad_norm": 1.080620288848877, + "learning_rate": 5.442752397067119e-06, + "loss": 0.7016, + "step": 15801 + }, + { + "epoch": 8.912577552171461, + "grad_norm": 1.2217285633087158, + "learning_rate": 5.4399323181049075e-06, + "loss": 0.675, + "step": 15802 + }, + { + "epoch": 8.913141567963903, + "grad_norm": 0.9145601987838745, + "learning_rate": 5.437112239142696e-06, + "loss": 0.7532, + "step": 15803 + }, + { + "epoch": 8.913705583756345, + "grad_norm": 1.0888193845748901, + "learning_rate": 5.434292160180485e-06, + "loss": 0.7451, + "step": 15804 + }, + { + "epoch": 8.914269599548787, + "grad_norm": 1.1162375211715698, + "learning_rate": 5.431472081218275e-06, + "loss": 0.8463, + "step": 15805 + }, + { + "epoch": 8.91483361534123, + "grad_norm": 1.2149909734725952, + "learning_rate": 5.4286520022560635e-06, + "loss": 0.8949, + "step": 15806 + }, + { + "epoch": 8.915397631133672, + "grad_norm": 0.8092107176780701, + "learning_rate": 5.425831923293852e-06, + "loss": 0.6189, + "step": 15807 + }, + { + "epoch": 8.915961646926114, + "grad_norm": 0.8696866035461426, + "learning_rate": 5.423011844331641e-06, + "loss": 0.767, + "step": 15808 + }, + { + "epoch": 8.916525662718556, + "grad_norm": 1.0503185987472534, + "learning_rate": 5.420191765369431e-06, + "loss": 0.6299, + "step": 15809 + }, + { + "epoch": 8.917089678510997, + "grad_norm": 1.1305928230285645, + "learning_rate": 5.4173716864072196e-06, + "loss": 0.709, + "step": 15810 + }, + { + "epoch": 8.917653694303441, + "grad_norm": 0.8414058685302734, + "learning_rate": 5.414551607445008e-06, + "loss": 0.6953, + "step": 15811 + }, + { + "epoch": 8.918217710095883, + "grad_norm": 1.122900366783142, + "learning_rate": 5.411731528482797e-06, + "loss": 0.7001, + "step": 15812 + }, + { + "epoch": 8.918781725888325, + "grad_norm": 1.074803352355957, + "learning_rate": 5.408911449520587e-06, + "loss": 0.7521, + "step": 15813 + }, + { + "epoch": 8.919345741680766, + "grad_norm": 0.9712690114974976, + "learning_rate": 5.406091370558376e-06, + "loss": 0.7783, + "step": 15814 + }, + { + "epoch": 8.91990975747321, + "grad_norm": 1.319130778312683, + "learning_rate": 5.403271291596165e-06, + "loss": 0.7686, + "step": 15815 + }, + { + "epoch": 8.920473773265652, + "grad_norm": 1.1539115905761719, + "learning_rate": 5.400451212633954e-06, + "loss": 0.8211, + "step": 15816 + }, + { + "epoch": 8.921037789058094, + "grad_norm": 1.0287261009216309, + "learning_rate": 5.397631133671744e-06, + "loss": 0.7932, + "step": 15817 + }, + { + "epoch": 8.921601804850535, + "grad_norm": 1.062155842781067, + "learning_rate": 5.3948110547095325e-06, + "loss": 0.721, + "step": 15818 + }, + { + "epoch": 8.922165820642977, + "grad_norm": 1.30418860912323, + "learning_rate": 5.391990975747321e-06, + "loss": 0.7492, + "step": 15819 + }, + { + "epoch": 8.92272983643542, + "grad_norm": 1.5048186779022217, + "learning_rate": 5.38917089678511e-06, + "loss": 0.8561, + "step": 15820 + }, + { + "epoch": 8.923293852227863, + "grad_norm": 1.24424409866333, + "learning_rate": 5.3863508178229e-06, + "loss": 0.7424, + "step": 15821 + }, + { + "epoch": 8.923857868020304, + "grad_norm": 0.9977489113807678, + "learning_rate": 5.3835307388606885e-06, + "loss": 0.6546, + "step": 15822 + }, + { + "epoch": 8.924421883812746, + "grad_norm": 0.783354640007019, + "learning_rate": 5.380710659898477e-06, + "loss": 0.6284, + "step": 15823 + }, + { + "epoch": 8.92498589960519, + "grad_norm": 1.168539047241211, + "learning_rate": 5.377890580936266e-06, + "loss": 0.735, + "step": 15824 + }, + { + "epoch": 8.925549915397632, + "grad_norm": 1.275687336921692, + "learning_rate": 5.375070501974056e-06, + "loss": 0.8313, + "step": 15825 + }, + { + "epoch": 8.926113931190073, + "grad_norm": 1.1648240089416504, + "learning_rate": 5.3722504230118445e-06, + "loss": 0.6437, + "step": 15826 + }, + { + "epoch": 8.926677946982515, + "grad_norm": 1.8380441665649414, + "learning_rate": 5.369430344049633e-06, + "loss": 0.8257, + "step": 15827 + }, + { + "epoch": 8.927241962774957, + "grad_norm": 1.295216679573059, + "learning_rate": 5.366610265087422e-06, + "loss": 0.6105, + "step": 15828 + }, + { + "epoch": 8.9278059785674, + "grad_norm": 0.8163246512413025, + "learning_rate": 5.363790186125212e-06, + "loss": 0.6863, + "step": 15829 + }, + { + "epoch": 8.928369994359842, + "grad_norm": 1.4860239028930664, + "learning_rate": 5.3609701071630006e-06, + "loss": 0.754, + "step": 15830 + }, + { + "epoch": 8.928934010152284, + "grad_norm": 1.2355467081069946, + "learning_rate": 5.358150028200789e-06, + "loss": 0.7729, + "step": 15831 + }, + { + "epoch": 8.929498025944726, + "grad_norm": 1.5916576385498047, + "learning_rate": 5.355329949238579e-06, + "loss": 0.699, + "step": 15832 + }, + { + "epoch": 8.930062041737168, + "grad_norm": 0.9231787323951721, + "learning_rate": 5.352509870276369e-06, + "loss": 0.7674, + "step": 15833 + }, + { + "epoch": 8.930626057529611, + "grad_norm": 1.0148537158966064, + "learning_rate": 5.3496897913141575e-06, + "loss": 0.7086, + "step": 15834 + }, + { + "epoch": 8.931190073322053, + "grad_norm": 1.035001277923584, + "learning_rate": 5.346869712351946e-06, + "loss": 0.7354, + "step": 15835 + }, + { + "epoch": 8.931754089114495, + "grad_norm": 1.0425517559051514, + "learning_rate": 5.344049633389735e-06, + "loss": 0.7706, + "step": 15836 + }, + { + "epoch": 8.932318104906937, + "grad_norm": 0.9780281186103821, + "learning_rate": 5.341229554427525e-06, + "loss": 0.6106, + "step": 15837 + }, + { + "epoch": 8.932882120699379, + "grad_norm": 1.2757850885391235, + "learning_rate": 5.3384094754653135e-06, + "loss": 0.7032, + "step": 15838 + }, + { + "epoch": 8.933446136491822, + "grad_norm": 1.0296671390533447, + "learning_rate": 5.335589396503102e-06, + "loss": 0.5989, + "step": 15839 + }, + { + "epoch": 8.934010152284264, + "grad_norm": 1.1179358959197998, + "learning_rate": 5.332769317540891e-06, + "loss": 0.656, + "step": 15840 + }, + { + "epoch": 8.934574168076706, + "grad_norm": 1.4389020204544067, + "learning_rate": 5.329949238578681e-06, + "loss": 0.7097, + "step": 15841 + }, + { + "epoch": 8.935138183869148, + "grad_norm": 0.9603685736656189, + "learning_rate": 5.3271291596164695e-06, + "loss": 0.752, + "step": 15842 + }, + { + "epoch": 8.935702199661591, + "grad_norm": 1.3887977600097656, + "learning_rate": 5.324309080654258e-06, + "loss": 0.8624, + "step": 15843 + }, + { + "epoch": 8.936266215454033, + "grad_norm": 1.3158464431762695, + "learning_rate": 5.321489001692047e-06, + "loss": 0.671, + "step": 15844 + }, + { + "epoch": 8.936830231246475, + "grad_norm": 1.1264548301696777, + "learning_rate": 5.318668922729837e-06, + "loss": 0.785, + "step": 15845 + }, + { + "epoch": 8.937394247038917, + "grad_norm": 1.250641942024231, + "learning_rate": 5.3158488437676256e-06, + "loss": 0.7317, + "step": 15846 + }, + { + "epoch": 8.937958262831359, + "grad_norm": 0.9813650846481323, + "learning_rate": 5.313028764805414e-06, + "loss": 0.6462, + "step": 15847 + }, + { + "epoch": 8.938522278623802, + "grad_norm": 0.9153981804847717, + "learning_rate": 5.310208685843204e-06, + "loss": 0.7575, + "step": 15848 + }, + { + "epoch": 8.939086294416244, + "grad_norm": 1.0090031623840332, + "learning_rate": 5.307388606880993e-06, + "loss": 0.7105, + "step": 15849 + }, + { + "epoch": 8.939650310208686, + "grad_norm": 1.160786747932434, + "learning_rate": 5.304568527918782e-06, + "loss": 0.7486, + "step": 15850 + }, + { + "epoch": 8.940214326001128, + "grad_norm": 1.0782432556152344, + "learning_rate": 5.301748448956571e-06, + "loss": 0.6543, + "step": 15851 + }, + { + "epoch": 8.940778341793571, + "grad_norm": 1.521697998046875, + "learning_rate": 5.29892836999436e-06, + "loss": 0.7296, + "step": 15852 + }, + { + "epoch": 8.941342357586013, + "grad_norm": 1.0428308248519897, + "learning_rate": 5.29610829103215e-06, + "loss": 0.7706, + "step": 15853 + }, + { + "epoch": 8.941906373378455, + "grad_norm": 1.045055866241455, + "learning_rate": 5.2932882120699385e-06, + "loss": 0.67, + "step": 15854 + }, + { + "epoch": 8.942470389170897, + "grad_norm": 1.3147435188293457, + "learning_rate": 5.290468133107727e-06, + "loss": 0.6987, + "step": 15855 + }, + { + "epoch": 8.943034404963338, + "grad_norm": 0.9530452489852905, + "learning_rate": 5.287648054145516e-06, + "loss": 0.7247, + "step": 15856 + }, + { + "epoch": 8.943598420755782, + "grad_norm": 0.8685605525970459, + "learning_rate": 5.284827975183306e-06, + "loss": 0.7042, + "step": 15857 + }, + { + "epoch": 8.944162436548224, + "grad_norm": 1.0515016317367554, + "learning_rate": 5.2820078962210945e-06, + "loss": 0.6497, + "step": 15858 + }, + { + "epoch": 8.944726452340666, + "grad_norm": 1.2457458972930908, + "learning_rate": 5.279187817258883e-06, + "loss": 0.7541, + "step": 15859 + }, + { + "epoch": 8.945290468133107, + "grad_norm": 1.384328007698059, + "learning_rate": 5.276367738296673e-06, + "loss": 0.8123, + "step": 15860 + }, + { + "epoch": 8.94585448392555, + "grad_norm": 0.7887617349624634, + "learning_rate": 5.273547659334462e-06, + "loss": 0.6481, + "step": 15861 + }, + { + "epoch": 8.946418499717993, + "grad_norm": 0.9260929226875305, + "learning_rate": 5.2707275803722505e-06, + "loss": 0.6232, + "step": 15862 + }, + { + "epoch": 8.946982515510435, + "grad_norm": 1.0644629001617432, + "learning_rate": 5.267907501410039e-06, + "loss": 0.771, + "step": 15863 + }, + { + "epoch": 8.947546531302876, + "grad_norm": 1.073157548904419, + "learning_rate": 5.265087422447829e-06, + "loss": 0.6986, + "step": 15864 + }, + { + "epoch": 8.948110547095318, + "grad_norm": 1.2839953899383545, + "learning_rate": 5.262267343485618e-06, + "loss": 0.7507, + "step": 15865 + }, + { + "epoch": 8.94867456288776, + "grad_norm": 1.338239073753357, + "learning_rate": 5.2594472645234066e-06, + "loss": 0.6643, + "step": 15866 + }, + { + "epoch": 8.949238578680204, + "grad_norm": 0.8513134717941284, + "learning_rate": 5.256627185561195e-06, + "loss": 0.7211, + "step": 15867 + }, + { + "epoch": 8.949802594472645, + "grad_norm": 1.0552146434783936, + "learning_rate": 5.253807106598985e-06, + "loss": 0.6773, + "step": 15868 + }, + { + "epoch": 8.950366610265087, + "grad_norm": 1.276821255683899, + "learning_rate": 5.250987027636775e-06, + "loss": 0.878, + "step": 15869 + }, + { + "epoch": 8.950930626057529, + "grad_norm": 0.9401856064796448, + "learning_rate": 5.2481669486745634e-06, + "loss": 0.7532, + "step": 15870 + }, + { + "epoch": 8.951494641849973, + "grad_norm": 1.0312224626541138, + "learning_rate": 5.245346869712352e-06, + "loss": 0.7191, + "step": 15871 + }, + { + "epoch": 8.952058657642414, + "grad_norm": 1.0635826587677002, + "learning_rate": 5.242526790750141e-06, + "loss": 0.7384, + "step": 15872 + }, + { + "epoch": 8.952622673434856, + "grad_norm": 1.1784917116165161, + "learning_rate": 5.239706711787931e-06, + "loss": 0.7034, + "step": 15873 + }, + { + "epoch": 8.953186689227298, + "grad_norm": 1.3946559429168701, + "learning_rate": 5.2368866328257195e-06, + "loss": 0.8269, + "step": 15874 + }, + { + "epoch": 8.95375070501974, + "grad_norm": 1.331864356994629, + "learning_rate": 5.234066553863508e-06, + "loss": 0.7132, + "step": 15875 + }, + { + "epoch": 8.954314720812183, + "grad_norm": 1.2032052278518677, + "learning_rate": 5.231246474901298e-06, + "loss": 0.7223, + "step": 15876 + }, + { + "epoch": 8.954878736604625, + "grad_norm": 1.197270154953003, + "learning_rate": 5.228426395939087e-06, + "loss": 0.7212, + "step": 15877 + }, + { + "epoch": 8.955442752397067, + "grad_norm": 0.9056366682052612, + "learning_rate": 5.2256063169768755e-06, + "loss": 0.6898, + "step": 15878 + }, + { + "epoch": 8.956006768189509, + "grad_norm": 1.0238271951675415, + "learning_rate": 5.222786238014664e-06, + "loss": 0.703, + "step": 15879 + }, + { + "epoch": 8.956570783981952, + "grad_norm": 0.8508138656616211, + "learning_rate": 5.219966159052454e-06, + "loss": 0.7232, + "step": 15880 + }, + { + "epoch": 8.957134799774394, + "grad_norm": 0.9308162927627563, + "learning_rate": 5.217146080090243e-06, + "loss": 0.6595, + "step": 15881 + }, + { + "epoch": 8.957698815566836, + "grad_norm": 1.3479732275009155, + "learning_rate": 5.2143260011280315e-06, + "loss": 0.6328, + "step": 15882 + }, + { + "epoch": 8.958262831359278, + "grad_norm": 1.3475415706634521, + "learning_rate": 5.21150592216582e-06, + "loss": 0.8495, + "step": 15883 + }, + { + "epoch": 8.95882684715172, + "grad_norm": 1.203624963760376, + "learning_rate": 5.20868584320361e-06, + "loss": 0.8456, + "step": 15884 + }, + { + "epoch": 8.959390862944163, + "grad_norm": 1.1656550168991089, + "learning_rate": 5.205865764241399e-06, + "loss": 0.68, + "step": 15885 + }, + { + "epoch": 8.959954878736605, + "grad_norm": 0.8074144124984741, + "learning_rate": 5.203045685279188e-06, + "loss": 0.6652, + "step": 15886 + }, + { + "epoch": 8.960518894529047, + "grad_norm": 1.0262455940246582, + "learning_rate": 5.200225606316977e-06, + "loss": 0.6312, + "step": 15887 + }, + { + "epoch": 8.961082910321489, + "grad_norm": 1.1063846349716187, + "learning_rate": 5.197405527354766e-06, + "loss": 0.7423, + "step": 15888 + }, + { + "epoch": 8.96164692611393, + "grad_norm": 1.0691596269607544, + "learning_rate": 5.194585448392556e-06, + "loss": 0.677, + "step": 15889 + }, + { + "epoch": 8.962210941906374, + "grad_norm": 1.0212454795837402, + "learning_rate": 5.1917653694303445e-06, + "loss": 0.6922, + "step": 15890 + }, + { + "epoch": 8.962774957698816, + "grad_norm": 0.9919488430023193, + "learning_rate": 5.188945290468133e-06, + "loss": 0.6481, + "step": 15891 + }, + { + "epoch": 8.963338973491258, + "grad_norm": 1.4514470100402832, + "learning_rate": 5.186125211505923e-06, + "loss": 0.8002, + "step": 15892 + }, + { + "epoch": 8.9639029892837, + "grad_norm": 0.9248488545417786, + "learning_rate": 5.183305132543712e-06, + "loss": 0.7312, + "step": 15893 + }, + { + "epoch": 8.964467005076141, + "grad_norm": 1.1202380657196045, + "learning_rate": 5.1804850535815005e-06, + "loss": 0.7822, + "step": 15894 + }, + { + "epoch": 8.965031020868585, + "grad_norm": 1.1502293348312378, + "learning_rate": 5.177664974619289e-06, + "loss": 0.6505, + "step": 15895 + }, + { + "epoch": 8.965595036661027, + "grad_norm": 1.0049749612808228, + "learning_rate": 5.174844895657079e-06, + "loss": 0.641, + "step": 15896 + }, + { + "epoch": 8.966159052453468, + "grad_norm": 1.3751606941223145, + "learning_rate": 5.172024816694868e-06, + "loss": 0.7512, + "step": 15897 + }, + { + "epoch": 8.96672306824591, + "grad_norm": 1.248694896697998, + "learning_rate": 5.1692047377326565e-06, + "loss": 0.7453, + "step": 15898 + }, + { + "epoch": 8.967287084038354, + "grad_norm": 0.9460358619689941, + "learning_rate": 5.166384658770445e-06, + "loss": 0.7302, + "step": 15899 + }, + { + "epoch": 8.967851099830796, + "grad_norm": 0.7934116721153259, + "learning_rate": 5.163564579808235e-06, + "loss": 0.6574, + "step": 15900 + }, + { + "epoch": 8.968415115623237, + "grad_norm": 1.0906742811203003, + "learning_rate": 5.160744500846024e-06, + "loss": 0.7523, + "step": 15901 + }, + { + "epoch": 8.96897913141568, + "grad_norm": 1.4507838487625122, + "learning_rate": 5.1579244218838126e-06, + "loss": 0.721, + "step": 15902 + }, + { + "epoch": 8.969543147208121, + "grad_norm": 1.7749367952346802, + "learning_rate": 5.155104342921601e-06, + "loss": 0.8387, + "step": 15903 + }, + { + "epoch": 8.970107163000565, + "grad_norm": 1.2755976915359497, + "learning_rate": 5.152284263959392e-06, + "loss": 0.7234, + "step": 15904 + }, + { + "epoch": 8.970671178793006, + "grad_norm": 0.8468562364578247, + "learning_rate": 5.149464184997181e-06, + "loss": 0.624, + "step": 15905 + }, + { + "epoch": 8.971235194585448, + "grad_norm": 0.9373912811279297, + "learning_rate": 5.1466441060349694e-06, + "loss": 0.7603, + "step": 15906 + }, + { + "epoch": 8.97179921037789, + "grad_norm": 0.8173861503601074, + "learning_rate": 5.143824027072758e-06, + "loss": 0.622, + "step": 15907 + }, + { + "epoch": 8.972363226170334, + "grad_norm": 1.1034513711929321, + "learning_rate": 5.141003948110548e-06, + "loss": 0.767, + "step": 15908 + }, + { + "epoch": 8.972927241962775, + "grad_norm": 1.4558260440826416, + "learning_rate": 5.138183869148337e-06, + "loss": 0.7487, + "step": 15909 + }, + { + "epoch": 8.973491257755217, + "grad_norm": 1.0929731130599976, + "learning_rate": 5.1353637901861255e-06, + "loss": 0.6793, + "step": 15910 + }, + { + "epoch": 8.974055273547659, + "grad_norm": 1.2151786088943481, + "learning_rate": 5.132543711223914e-06, + "loss": 0.8267, + "step": 15911 + }, + { + "epoch": 8.974619289340101, + "grad_norm": 1.151726484298706, + "learning_rate": 5.129723632261704e-06, + "loss": 0.8109, + "step": 15912 + }, + { + "epoch": 8.975183305132544, + "grad_norm": 1.100723147392273, + "learning_rate": 5.126903553299493e-06, + "loss": 0.6897, + "step": 15913 + }, + { + "epoch": 8.975747320924986, + "grad_norm": 1.2665903568267822, + "learning_rate": 5.1240834743372815e-06, + "loss": 0.682, + "step": 15914 + }, + { + "epoch": 8.976311336717428, + "grad_norm": 1.1217104196548462, + "learning_rate": 5.12126339537507e-06, + "loss": 0.7958, + "step": 15915 + }, + { + "epoch": 8.97687535250987, + "grad_norm": 0.913301944732666, + "learning_rate": 5.11844331641286e-06, + "loss": 0.6842, + "step": 15916 + }, + { + "epoch": 8.977439368302312, + "grad_norm": 0.9488143920898438, + "learning_rate": 5.115623237450649e-06, + "loss": 0.6934, + "step": 15917 + }, + { + "epoch": 8.978003384094755, + "grad_norm": 1.0718958377838135, + "learning_rate": 5.1128031584884375e-06, + "loss": 0.7449, + "step": 15918 + }, + { + "epoch": 8.978567399887197, + "grad_norm": 0.9817270040512085, + "learning_rate": 5.109983079526226e-06, + "loss": 0.7582, + "step": 15919 + }, + { + "epoch": 8.979131415679639, + "grad_norm": 1.2222950458526611, + "learning_rate": 5.107163000564016e-06, + "loss": 0.772, + "step": 15920 + }, + { + "epoch": 8.97969543147208, + "grad_norm": 1.2320572137832642, + "learning_rate": 5.104342921601806e-06, + "loss": 0.672, + "step": 15921 + }, + { + "epoch": 8.980259447264523, + "grad_norm": 1.2315428256988525, + "learning_rate": 5.101522842639594e-06, + "loss": 0.6998, + "step": 15922 + }, + { + "epoch": 8.980823463056966, + "grad_norm": 1.2007957696914673, + "learning_rate": 5.098702763677383e-06, + "loss": 0.738, + "step": 15923 + }, + { + "epoch": 8.981387478849408, + "grad_norm": 1.4036494493484497, + "learning_rate": 5.095882684715173e-06, + "loss": 0.8101, + "step": 15924 + }, + { + "epoch": 8.98195149464185, + "grad_norm": 0.9528080224990845, + "learning_rate": 5.093062605752962e-06, + "loss": 0.7679, + "step": 15925 + }, + { + "epoch": 8.982515510434292, + "grad_norm": 1.3837624788284302, + "learning_rate": 5.0902425267907504e-06, + "loss": 0.8619, + "step": 15926 + }, + { + "epoch": 8.983079526226735, + "grad_norm": 1.6269915103912354, + "learning_rate": 5.087422447828539e-06, + "loss": 0.7956, + "step": 15927 + }, + { + "epoch": 8.983643542019177, + "grad_norm": 0.9345769286155701, + "learning_rate": 5.084602368866329e-06, + "loss": 0.675, + "step": 15928 + }, + { + "epoch": 8.984207557811619, + "grad_norm": 1.4430878162384033, + "learning_rate": 5.081782289904118e-06, + "loss": 0.7793, + "step": 15929 + }, + { + "epoch": 8.98477157360406, + "grad_norm": 1.2255343198776245, + "learning_rate": 5.0789622109419065e-06, + "loss": 0.8073, + "step": 15930 + }, + { + "epoch": 8.985335589396502, + "grad_norm": 1.1657794713974, + "learning_rate": 5.076142131979695e-06, + "loss": 0.7242, + "step": 15931 + }, + { + "epoch": 8.985899605188946, + "grad_norm": 0.970984935760498, + "learning_rate": 5.073322053017485e-06, + "loss": 0.7223, + "step": 15932 + }, + { + "epoch": 8.986463620981388, + "grad_norm": 1.2341653108596802, + "learning_rate": 5.070501974055274e-06, + "loss": 0.7806, + "step": 15933 + }, + { + "epoch": 8.98702763677383, + "grad_norm": 1.1332703828811646, + "learning_rate": 5.0676818950930625e-06, + "loss": 0.7592, + "step": 15934 + }, + { + "epoch": 8.987591652566271, + "grad_norm": 1.2320103645324707, + "learning_rate": 5.064861816130851e-06, + "loss": 0.6998, + "step": 15935 + }, + { + "epoch": 8.988155668358715, + "grad_norm": 1.3513600826263428, + "learning_rate": 5.062041737168641e-06, + "loss": 0.8228, + "step": 15936 + }, + { + "epoch": 8.988719684151157, + "grad_norm": 1.0335019826889038, + "learning_rate": 5.05922165820643e-06, + "loss": 0.7413, + "step": 15937 + }, + { + "epoch": 8.989283699943599, + "grad_norm": 1.2017652988433838, + "learning_rate": 5.0564015792442185e-06, + "loss": 0.7038, + "step": 15938 + }, + { + "epoch": 8.98984771573604, + "grad_norm": 1.1877362728118896, + "learning_rate": 5.053581500282008e-06, + "loss": 0.583, + "step": 15939 + }, + { + "epoch": 8.990411731528482, + "grad_norm": 1.1380499601364136, + "learning_rate": 5.050761421319798e-06, + "loss": 0.8463, + "step": 15940 + }, + { + "epoch": 8.990975747320926, + "grad_norm": 1.2859827280044556, + "learning_rate": 5.047941342357587e-06, + "loss": 0.7633, + "step": 15941 + }, + { + "epoch": 8.991539763113368, + "grad_norm": 1.4764797687530518, + "learning_rate": 5.0451212633953754e-06, + "loss": 0.7612, + "step": 15942 + }, + { + "epoch": 8.99210377890581, + "grad_norm": 1.1686817407608032, + "learning_rate": 5.042301184433164e-06, + "loss": 0.7543, + "step": 15943 + }, + { + "epoch": 8.992667794698251, + "grad_norm": 1.3224800825119019, + "learning_rate": 5.039481105470954e-06, + "loss": 0.7551, + "step": 15944 + }, + { + "epoch": 8.993231810490693, + "grad_norm": 1.189420461654663, + "learning_rate": 5.036661026508743e-06, + "loss": 0.6806, + "step": 15945 + }, + { + "epoch": 8.993795826283137, + "grad_norm": 1.1304861307144165, + "learning_rate": 5.0338409475465315e-06, + "loss": 0.7746, + "step": 15946 + }, + { + "epoch": 8.994359842075578, + "grad_norm": 1.0631506443023682, + "learning_rate": 5.03102086858432e-06, + "loss": 0.7288, + "step": 15947 + }, + { + "epoch": 8.99492385786802, + "grad_norm": 0.9707735776901245, + "learning_rate": 5.02820078962211e-06, + "loss": 0.7522, + "step": 15948 + }, + { + "epoch": 8.995487873660462, + "grad_norm": 1.4708589315414429, + "learning_rate": 5.025380710659899e-06, + "loss": 0.7978, + "step": 15949 + }, + { + "epoch": 8.996051889452904, + "grad_norm": 1.0420600175857544, + "learning_rate": 5.0225606316976875e-06, + "loss": 0.669, + "step": 15950 + }, + { + "epoch": 8.996615905245347, + "grad_norm": 1.1830830574035645, + "learning_rate": 5.019740552735476e-06, + "loss": 0.6648, + "step": 15951 + }, + { + "epoch": 8.99717992103779, + "grad_norm": 1.2839174270629883, + "learning_rate": 5.016920473773266e-06, + "loss": 0.7905, + "step": 15952 + }, + { + "epoch": 8.997743936830231, + "grad_norm": 1.2240755558013916, + "learning_rate": 5.014100394811055e-06, + "loss": 0.7677, + "step": 15953 + }, + { + "epoch": 8.998307952622673, + "grad_norm": 0.8475838303565979, + "learning_rate": 5.0112803158488435e-06, + "loss": 0.6806, + "step": 15954 + }, + { + "epoch": 8.998871968415116, + "grad_norm": 1.0159705877304077, + "learning_rate": 5.008460236886633e-06, + "loss": 0.7245, + "step": 15955 + }, + { + "epoch": 8.999435984207558, + "grad_norm": 1.1016842126846313, + "learning_rate": 5.005640157924422e-06, + "loss": 0.7742, + "step": 15956 + }, + { + "epoch": 9.0, + "grad_norm": 1.7572476863861084, + "learning_rate": 5.002820078962212e-06, + "loss": 0.7351, + "step": 15957 + }, + { + "epoch": 9.000564015792442, + "grad_norm": 1.331146001815796, + "learning_rate": 5e-06, + "loss": 0.7107, + "step": 15958 + }, + { + "epoch": 9.001128031584884, + "grad_norm": 1.0346821546554565, + "learning_rate": 4.997179921037789e-06, + "loss": 0.7001, + "step": 15959 + }, + { + "epoch": 9.001692047377327, + "grad_norm": 0.9397339224815369, + "learning_rate": 4.994359842075579e-06, + "loss": 0.704, + "step": 15960 + }, + { + "epoch": 9.002256063169769, + "grad_norm": 0.874614953994751, + "learning_rate": 4.991539763113368e-06, + "loss": 0.6384, + "step": 15961 + }, + { + "epoch": 9.00282007896221, + "grad_norm": 0.8957715630531311, + "learning_rate": 4.9887196841511564e-06, + "loss": 0.6663, + "step": 15962 + }, + { + "epoch": 9.003384094754653, + "grad_norm": 1.0235298871994019, + "learning_rate": 4.985899605188945e-06, + "loss": 0.6378, + "step": 15963 + }, + { + "epoch": 9.003948110547096, + "grad_norm": 1.2796112298965454, + "learning_rate": 4.983079526226735e-06, + "loss": 0.8061, + "step": 15964 + }, + { + "epoch": 9.004512126339538, + "grad_norm": 1.4773025512695312, + "learning_rate": 4.980259447264524e-06, + "loss": 0.7021, + "step": 15965 + }, + { + "epoch": 9.00507614213198, + "grad_norm": 1.1506446599960327, + "learning_rate": 4.9774393683023125e-06, + "loss": 0.6755, + "step": 15966 + }, + { + "epoch": 9.005640157924422, + "grad_norm": 1.4199743270874023, + "learning_rate": 4.974619289340101e-06, + "loss": 0.6756, + "step": 15967 + }, + { + "epoch": 9.006204173716863, + "grad_norm": 1.441837191581726, + "learning_rate": 4.971799210377891e-06, + "loss": 0.7725, + "step": 15968 + }, + { + "epoch": 9.006768189509307, + "grad_norm": 1.5660793781280518, + "learning_rate": 4.96897913141568e-06, + "loss": 0.7768, + "step": 15969 + }, + { + "epoch": 9.007332205301749, + "grad_norm": 1.2938315868377686, + "learning_rate": 4.9661590524534685e-06, + "loss": 0.7493, + "step": 15970 + }, + { + "epoch": 9.00789622109419, + "grad_norm": 1.120928168296814, + "learning_rate": 4.963338973491258e-06, + "loss": 0.7088, + "step": 15971 + }, + { + "epoch": 9.008460236886632, + "grad_norm": 1.3895457983016968, + "learning_rate": 4.960518894529047e-06, + "loss": 0.7796, + "step": 15972 + }, + { + "epoch": 9.009024252679074, + "grad_norm": 0.8622786402702332, + "learning_rate": 4.957698815566836e-06, + "loss": 0.729, + "step": 15973 + }, + { + "epoch": 9.009588268471518, + "grad_norm": 1.1012152433395386, + "learning_rate": 4.954878736604625e-06, + "loss": 0.7006, + "step": 15974 + }, + { + "epoch": 9.01015228426396, + "grad_norm": 1.3750544786453247, + "learning_rate": 4.952058657642414e-06, + "loss": 0.7269, + "step": 15975 + }, + { + "epoch": 9.010716300056401, + "grad_norm": 1.122191309928894, + "learning_rate": 4.949238578680204e-06, + "loss": 0.7112, + "step": 15976 + }, + { + "epoch": 9.011280315848843, + "grad_norm": 1.089470386505127, + "learning_rate": 4.946418499717993e-06, + "loss": 0.7159, + "step": 15977 + }, + { + "epoch": 9.011844331641287, + "grad_norm": 1.2051920890808105, + "learning_rate": 4.943598420755781e-06, + "loss": 0.807, + "step": 15978 + }, + { + "epoch": 9.012408347433729, + "grad_norm": 0.8679171204566956, + "learning_rate": 4.94077834179357e-06, + "loss": 0.654, + "step": 15979 + }, + { + "epoch": 9.01297236322617, + "grad_norm": 1.052694320678711, + "learning_rate": 4.93795826283136e-06, + "loss": 0.6836, + "step": 15980 + }, + { + "epoch": 9.013536379018612, + "grad_norm": 1.0331541299819946, + "learning_rate": 4.935138183869149e-06, + "loss": 0.7681, + "step": 15981 + }, + { + "epoch": 9.014100394811054, + "grad_norm": 1.1566786766052246, + "learning_rate": 4.9323181049069375e-06, + "loss": 0.7549, + "step": 15982 + }, + { + "epoch": 9.014664410603498, + "grad_norm": 1.1111778020858765, + "learning_rate": 4.929498025944727e-06, + "loss": 0.7445, + "step": 15983 + }, + { + "epoch": 9.01522842639594, + "grad_norm": 1.2955868244171143, + "learning_rate": 4.926677946982516e-06, + "loss": 0.6984, + "step": 15984 + }, + { + "epoch": 9.015792442188381, + "grad_norm": 1.1261435747146606, + "learning_rate": 4.923857868020305e-06, + "loss": 0.7066, + "step": 15985 + }, + { + "epoch": 9.016356457980823, + "grad_norm": 1.74954092502594, + "learning_rate": 4.9210377890580935e-06, + "loss": 0.8668, + "step": 15986 + }, + { + "epoch": 9.016920473773265, + "grad_norm": 3.313535213470459, + "learning_rate": 4.918217710095883e-06, + "loss": 0.7859, + "step": 15987 + }, + { + "epoch": 9.017484489565708, + "grad_norm": 1.3805365562438965, + "learning_rate": 4.915397631133672e-06, + "loss": 0.7628, + "step": 15988 + }, + { + "epoch": 9.01804850535815, + "grad_norm": 0.9634362459182739, + "learning_rate": 4.912577552171461e-06, + "loss": 0.7582, + "step": 15989 + }, + { + "epoch": 9.018612521150592, + "grad_norm": 0.877607524394989, + "learning_rate": 4.9097574732092495e-06, + "loss": 0.714, + "step": 15990 + }, + { + "epoch": 9.019176536943034, + "grad_norm": 0.8795639872550964, + "learning_rate": 4.906937394247039e-06, + "loss": 0.627, + "step": 15991 + }, + { + "epoch": 9.019740552735477, + "grad_norm": 0.9152646660804749, + "learning_rate": 4.904117315284829e-06, + "loss": 0.7016, + "step": 15992 + }, + { + "epoch": 9.02030456852792, + "grad_norm": 0.9639829993247986, + "learning_rate": 4.901297236322618e-06, + "loss": 0.6955, + "step": 15993 + }, + { + "epoch": 9.020868584320361, + "grad_norm": 1.1164250373840332, + "learning_rate": 4.898477157360406e-06, + "loss": 0.6877, + "step": 15994 + }, + { + "epoch": 9.021432600112803, + "grad_norm": 1.0709789991378784, + "learning_rate": 4.895657078398195e-06, + "loss": 0.8022, + "step": 15995 + }, + { + "epoch": 9.021996615905245, + "grad_norm": 1.0841516256332397, + "learning_rate": 4.892836999435985e-06, + "loss": 0.7128, + "step": 15996 + }, + { + "epoch": 9.022560631697688, + "grad_norm": 1.0332112312316895, + "learning_rate": 4.890016920473774e-06, + "loss": 0.6938, + "step": 15997 + }, + { + "epoch": 9.02312464749013, + "grad_norm": 1.3435001373291016, + "learning_rate": 4.8871968415115624e-06, + "loss": 0.6611, + "step": 15998 + }, + { + "epoch": 9.023688663282572, + "grad_norm": 1.127380132675171, + "learning_rate": 4.884376762549352e-06, + "loss": 0.8681, + "step": 15999 + }, + { + "epoch": 9.024252679075014, + "grad_norm": 1.0129574537277222, + "learning_rate": 4.881556683587141e-06, + "loss": 0.6204, + "step": 16000 + }, + { + "epoch": 9.024816694867456, + "grad_norm": 1.2058113813400269, + "learning_rate": 4.87873660462493e-06, + "loss": 0.6757, + "step": 16001 + }, + { + "epoch": 9.025380710659899, + "grad_norm": 1.1192001104354858, + "learning_rate": 4.8759165256627185e-06, + "loss": 0.6957, + "step": 16002 + }, + { + "epoch": 9.025944726452341, + "grad_norm": 1.4110193252563477, + "learning_rate": 4.873096446700508e-06, + "loss": 0.7691, + "step": 16003 + }, + { + "epoch": 9.026508742244783, + "grad_norm": 1.3408186435699463, + "learning_rate": 4.870276367738297e-06, + "loss": 0.7629, + "step": 16004 + }, + { + "epoch": 9.027072758037225, + "grad_norm": 1.5567564964294434, + "learning_rate": 4.867456288776086e-06, + "loss": 0.8241, + "step": 16005 + }, + { + "epoch": 9.027636773829668, + "grad_norm": 0.904026985168457, + "learning_rate": 4.8646362098138745e-06, + "loss": 0.6997, + "step": 16006 + }, + { + "epoch": 9.02820078962211, + "grad_norm": 0.9785336256027222, + "learning_rate": 4.861816130851664e-06, + "loss": 0.6479, + "step": 16007 + }, + { + "epoch": 9.028764805414552, + "grad_norm": 1.6322990655899048, + "learning_rate": 4.858996051889453e-06, + "loss": 0.6452, + "step": 16008 + }, + { + "epoch": 9.029328821206994, + "grad_norm": 0.9876791834831238, + "learning_rate": 4.856175972927242e-06, + "loss": 0.7603, + "step": 16009 + }, + { + "epoch": 9.029892836999435, + "grad_norm": 1.0825579166412354, + "learning_rate": 4.853355893965031e-06, + "loss": 0.7243, + "step": 16010 + }, + { + "epoch": 9.030456852791879, + "grad_norm": 1.0345646142959595, + "learning_rate": 4.85053581500282e-06, + "loss": 0.6342, + "step": 16011 + }, + { + "epoch": 9.03102086858432, + "grad_norm": 1.1030175685882568, + "learning_rate": 4.84771573604061e-06, + "loss": 0.7514, + "step": 16012 + }, + { + "epoch": 9.031584884376763, + "grad_norm": 1.5007236003875732, + "learning_rate": 4.844895657078399e-06, + "loss": 0.7834, + "step": 16013 + }, + { + "epoch": 9.032148900169204, + "grad_norm": 1.1768471002578735, + "learning_rate": 4.842075578116187e-06, + "loss": 0.6872, + "step": 16014 + }, + { + "epoch": 9.032712915961646, + "grad_norm": 1.0567306280136108, + "learning_rate": 4.839255499153977e-06, + "loss": 0.7122, + "step": 16015 + }, + { + "epoch": 9.03327693175409, + "grad_norm": 1.1083251237869263, + "learning_rate": 4.836435420191766e-06, + "loss": 0.8261, + "step": 16016 + }, + { + "epoch": 9.033840947546532, + "grad_norm": 1.277868390083313, + "learning_rate": 4.833615341229555e-06, + "loss": 0.7485, + "step": 16017 + }, + { + "epoch": 9.034404963338973, + "grad_norm": 1.1421161890029907, + "learning_rate": 4.8307952622673434e-06, + "loss": 0.6061, + "step": 16018 + }, + { + "epoch": 9.034968979131415, + "grad_norm": 1.5363094806671143, + "learning_rate": 4.827975183305133e-06, + "loss": 0.8042, + "step": 16019 + }, + { + "epoch": 9.035532994923859, + "grad_norm": 1.1198104619979858, + "learning_rate": 4.825155104342922e-06, + "loss": 0.749, + "step": 16020 + }, + { + "epoch": 9.0360970107163, + "grad_norm": 0.9240202903747559, + "learning_rate": 4.822335025380711e-06, + "loss": 0.6906, + "step": 16021 + }, + { + "epoch": 9.036661026508742, + "grad_norm": 1.0692270994186401, + "learning_rate": 4.8195149464184995e-06, + "loss": 0.7485, + "step": 16022 + }, + { + "epoch": 9.037225042301184, + "grad_norm": 1.5881887674331665, + "learning_rate": 4.816694867456289e-06, + "loss": 0.8253, + "step": 16023 + }, + { + "epoch": 9.037789058093626, + "grad_norm": 1.193845510482788, + "learning_rate": 4.813874788494078e-06, + "loss": 0.6716, + "step": 16024 + }, + { + "epoch": 9.03835307388607, + "grad_norm": 1.2481471300125122, + "learning_rate": 4.811054709531867e-06, + "loss": 0.7389, + "step": 16025 + }, + { + "epoch": 9.038917089678511, + "grad_norm": 1.3627279996871948, + "learning_rate": 4.8082346305696555e-06, + "loss": 0.7277, + "step": 16026 + }, + { + "epoch": 9.039481105470953, + "grad_norm": 1.2216287851333618, + "learning_rate": 4.805414551607445e-06, + "loss": 0.6808, + "step": 16027 + }, + { + "epoch": 9.040045121263395, + "grad_norm": 1.6564923524856567, + "learning_rate": 4.802594472645235e-06, + "loss": 0.826, + "step": 16028 + }, + { + "epoch": 9.040609137055837, + "grad_norm": 0.9360911250114441, + "learning_rate": 4.799774393683024e-06, + "loss": 0.6617, + "step": 16029 + }, + { + "epoch": 9.04117315284828, + "grad_norm": 1.170788049697876, + "learning_rate": 4.796954314720812e-06, + "loss": 0.763, + "step": 16030 + }, + { + "epoch": 9.041737168640722, + "grad_norm": 1.1935275793075562, + "learning_rate": 4.794134235758602e-06, + "loss": 0.6484, + "step": 16031 + }, + { + "epoch": 9.042301184433164, + "grad_norm": 1.1985949277877808, + "learning_rate": 4.791314156796391e-06, + "loss": 0.7592, + "step": 16032 + }, + { + "epoch": 9.042865200225606, + "grad_norm": 1.0412176847457886, + "learning_rate": 4.78849407783418e-06, + "loss": 0.7272, + "step": 16033 + }, + { + "epoch": 9.04342921601805, + "grad_norm": 1.0883506536483765, + "learning_rate": 4.7856739988719684e-06, + "loss": 0.6611, + "step": 16034 + }, + { + "epoch": 9.043993231810491, + "grad_norm": 1.2086009979248047, + "learning_rate": 4.782853919909758e-06, + "loss": 0.7182, + "step": 16035 + }, + { + "epoch": 9.044557247602933, + "grad_norm": 1.0734044313430786, + "learning_rate": 4.780033840947547e-06, + "loss": 0.6862, + "step": 16036 + }, + { + "epoch": 9.045121263395375, + "grad_norm": 0.8517735600471497, + "learning_rate": 4.777213761985336e-06, + "loss": 0.629, + "step": 16037 + }, + { + "epoch": 9.045685279187817, + "grad_norm": 1.3632506132125854, + "learning_rate": 4.7743936830231245e-06, + "loss": 0.8288, + "step": 16038 + }, + { + "epoch": 9.04624929498026, + "grad_norm": 1.3856253623962402, + "learning_rate": 4.771573604060914e-06, + "loss": 0.7322, + "step": 16039 + }, + { + "epoch": 9.046813310772702, + "grad_norm": 1.0410246849060059, + "learning_rate": 4.768753525098703e-06, + "loss": 0.6525, + "step": 16040 + }, + { + "epoch": 9.047377326565144, + "grad_norm": 1.2768962383270264, + "learning_rate": 4.765933446136492e-06, + "loss": 0.6944, + "step": 16041 + }, + { + "epoch": 9.047941342357586, + "grad_norm": 0.8956677913665771, + "learning_rate": 4.7631133671742805e-06, + "loss": 0.6935, + "step": 16042 + }, + { + "epoch": 9.048505358150027, + "grad_norm": 1.4260854721069336, + "learning_rate": 4.76029328821207e-06, + "loss": 0.7418, + "step": 16043 + }, + { + "epoch": 9.049069373942471, + "grad_norm": 1.2384930849075317, + "learning_rate": 4.757473209249859e-06, + "loss": 0.7611, + "step": 16044 + }, + { + "epoch": 9.049633389734913, + "grad_norm": 1.0148632526397705, + "learning_rate": 4.754653130287649e-06, + "loss": 0.7478, + "step": 16045 + }, + { + "epoch": 9.050197405527355, + "grad_norm": 1.0389338731765747, + "learning_rate": 4.751833051325437e-06, + "loss": 0.7267, + "step": 16046 + }, + { + "epoch": 9.050761421319796, + "grad_norm": 1.2166215181350708, + "learning_rate": 4.749012972363227e-06, + "loss": 0.728, + "step": 16047 + }, + { + "epoch": 9.05132543711224, + "grad_norm": 1.0272762775421143, + "learning_rate": 4.746192893401016e-06, + "loss": 0.6995, + "step": 16048 + }, + { + "epoch": 9.051889452904682, + "grad_norm": 1.0998260974884033, + "learning_rate": 4.743372814438805e-06, + "loss": 0.7444, + "step": 16049 + }, + { + "epoch": 9.052453468697124, + "grad_norm": 1.0225896835327148, + "learning_rate": 4.740552735476593e-06, + "loss": 0.6523, + "step": 16050 + }, + { + "epoch": 9.053017484489565, + "grad_norm": 1.0796284675598145, + "learning_rate": 4.737732656514383e-06, + "loss": 0.6687, + "step": 16051 + }, + { + "epoch": 9.053581500282007, + "grad_norm": 1.034570574760437, + "learning_rate": 4.734912577552172e-06, + "loss": 0.5892, + "step": 16052 + }, + { + "epoch": 9.05414551607445, + "grad_norm": 1.0572808980941772, + "learning_rate": 4.732092498589961e-06, + "loss": 0.8083, + "step": 16053 + }, + { + "epoch": 9.054709531866893, + "grad_norm": 1.233532428741455, + "learning_rate": 4.7292724196277494e-06, + "loss": 0.8191, + "step": 16054 + }, + { + "epoch": 9.055273547659334, + "grad_norm": 1.1021130084991455, + "learning_rate": 4.726452340665539e-06, + "loss": 0.7905, + "step": 16055 + }, + { + "epoch": 9.055837563451776, + "grad_norm": 1.247498869895935, + "learning_rate": 4.723632261703328e-06, + "loss": 0.7782, + "step": 16056 + }, + { + "epoch": 9.056401579244218, + "grad_norm": 1.0180128812789917, + "learning_rate": 4.720812182741117e-06, + "loss": 0.7834, + "step": 16057 + }, + { + "epoch": 9.056965595036662, + "grad_norm": 0.9744764566421509, + "learning_rate": 4.7179921037789055e-06, + "loss": 0.7269, + "step": 16058 + }, + { + "epoch": 9.057529610829103, + "grad_norm": 0.8956677317619324, + "learning_rate": 4.715172024816695e-06, + "loss": 0.6844, + "step": 16059 + }, + { + "epoch": 9.058093626621545, + "grad_norm": 1.5919603109359741, + "learning_rate": 4.712351945854484e-06, + "loss": 0.8405, + "step": 16060 + }, + { + "epoch": 9.058657642413987, + "grad_norm": 1.1904010772705078, + "learning_rate": 4.709531866892273e-06, + "loss": 0.7008, + "step": 16061 + }, + { + "epoch": 9.05922165820643, + "grad_norm": 1.2010782957077026, + "learning_rate": 4.7067117879300615e-06, + "loss": 0.7398, + "step": 16062 + }, + { + "epoch": 9.059785673998872, + "grad_norm": 1.2893022298812866, + "learning_rate": 4.703891708967852e-06, + "loss": 0.747, + "step": 16063 + }, + { + "epoch": 9.060349689791314, + "grad_norm": 0.9754787087440491, + "learning_rate": 4.701071630005641e-06, + "loss": 0.6659, + "step": 16064 + }, + { + "epoch": 9.060913705583756, + "grad_norm": 1.0420042276382446, + "learning_rate": 4.69825155104343e-06, + "loss": 0.6755, + "step": 16065 + }, + { + "epoch": 9.061477721376198, + "grad_norm": 0.9472784996032715, + "learning_rate": 4.695431472081218e-06, + "loss": 0.6345, + "step": 16066 + }, + { + "epoch": 9.062041737168641, + "grad_norm": 1.0364463329315186, + "learning_rate": 4.692611393119008e-06, + "loss": 0.7503, + "step": 16067 + }, + { + "epoch": 9.062605752961083, + "grad_norm": 1.1325113773345947, + "learning_rate": 4.689791314156797e-06, + "loss": 0.7108, + "step": 16068 + }, + { + "epoch": 9.063169768753525, + "grad_norm": 0.9275105595588684, + "learning_rate": 4.686971235194586e-06, + "loss": 0.7648, + "step": 16069 + }, + { + "epoch": 9.063733784545967, + "grad_norm": 0.9919751882553101, + "learning_rate": 4.684151156232374e-06, + "loss": 0.6429, + "step": 16070 + }, + { + "epoch": 9.064297800338409, + "grad_norm": 1.1647865772247314, + "learning_rate": 4.681331077270164e-06, + "loss": 0.7371, + "step": 16071 + }, + { + "epoch": 9.064861816130852, + "grad_norm": 1.2984435558319092, + "learning_rate": 4.678510998307953e-06, + "loss": 0.8341, + "step": 16072 + }, + { + "epoch": 9.065425831923294, + "grad_norm": 1.2736107110977173, + "learning_rate": 4.675690919345742e-06, + "loss": 0.6831, + "step": 16073 + }, + { + "epoch": 9.065989847715736, + "grad_norm": 1.0120819807052612, + "learning_rate": 4.6728708403835305e-06, + "loss": 0.7634, + "step": 16074 + }, + { + "epoch": 9.066553863508178, + "grad_norm": 1.0204700231552124, + "learning_rate": 4.67005076142132e-06, + "loss": 0.7869, + "step": 16075 + }, + { + "epoch": 9.067117879300621, + "grad_norm": 1.4855326414108276, + "learning_rate": 4.667230682459109e-06, + "loss": 0.8046, + "step": 16076 + }, + { + "epoch": 9.067681895093063, + "grad_norm": 0.9269580841064453, + "learning_rate": 4.664410603496898e-06, + "loss": 0.71, + "step": 16077 + }, + { + "epoch": 9.068245910885505, + "grad_norm": 1.1221604347229004, + "learning_rate": 4.661590524534687e-06, + "loss": 0.7128, + "step": 16078 + }, + { + "epoch": 9.068809926677947, + "grad_norm": 1.1103686094284058, + "learning_rate": 4.658770445572476e-06, + "loss": 0.6722, + "step": 16079 + }, + { + "epoch": 9.069373942470389, + "grad_norm": 1.1430978775024414, + "learning_rate": 4.655950366610266e-06, + "loss": 0.6552, + "step": 16080 + }, + { + "epoch": 9.069937958262832, + "grad_norm": 1.1496353149414062, + "learning_rate": 4.6531302876480546e-06, + "loss": 0.7093, + "step": 16081 + }, + { + "epoch": 9.070501974055274, + "grad_norm": 1.0828956365585327, + "learning_rate": 4.650310208685843e-06, + "loss": 0.7127, + "step": 16082 + }, + { + "epoch": 9.071065989847716, + "grad_norm": 0.9311686158180237, + "learning_rate": 4.647490129723633e-06, + "loss": 0.7142, + "step": 16083 + }, + { + "epoch": 9.071630005640158, + "grad_norm": 1.2978299856185913, + "learning_rate": 4.644670050761422e-06, + "loss": 0.7541, + "step": 16084 + }, + { + "epoch": 9.0721940214326, + "grad_norm": 1.096793532371521, + "learning_rate": 4.641849971799211e-06, + "loss": 0.7555, + "step": 16085 + }, + { + "epoch": 9.072758037225043, + "grad_norm": 0.962598443031311, + "learning_rate": 4.639029892836999e-06, + "loss": 0.6391, + "step": 16086 + }, + { + "epoch": 9.073322053017485, + "grad_norm": 1.1176179647445679, + "learning_rate": 4.636209813874789e-06, + "loss": 0.7189, + "step": 16087 + }, + { + "epoch": 9.073886068809927, + "grad_norm": 1.1882266998291016, + "learning_rate": 4.633389734912578e-06, + "loss": 0.6768, + "step": 16088 + }, + { + "epoch": 9.074450084602368, + "grad_norm": 1.3127882480621338, + "learning_rate": 4.630569655950367e-06, + "loss": 0.7788, + "step": 16089 + }, + { + "epoch": 9.075014100394812, + "grad_norm": 1.4028688669204712, + "learning_rate": 4.6277495769881554e-06, + "loss": 0.7655, + "step": 16090 + }, + { + "epoch": 9.075578116187254, + "grad_norm": 1.0268715620040894, + "learning_rate": 4.624929498025945e-06, + "loss": 0.6477, + "step": 16091 + }, + { + "epoch": 9.076142131979696, + "grad_norm": 0.9289050698280334, + "learning_rate": 4.622109419063734e-06, + "loss": 0.5925, + "step": 16092 + }, + { + "epoch": 9.076706147772137, + "grad_norm": 0.7782682776451111, + "learning_rate": 4.619289340101523e-06, + "loss": 0.6459, + "step": 16093 + }, + { + "epoch": 9.07727016356458, + "grad_norm": 6.060657978057861, + "learning_rate": 4.616469261139312e-06, + "loss": 0.7497, + "step": 16094 + }, + { + "epoch": 9.077834179357023, + "grad_norm": 0.9178531169891357, + "learning_rate": 4.613649182177101e-06, + "loss": 0.7206, + "step": 16095 + }, + { + "epoch": 9.078398195149465, + "grad_norm": 1.185835361480713, + "learning_rate": 4.61082910321489e-06, + "loss": 0.6842, + "step": 16096 + }, + { + "epoch": 9.078962210941906, + "grad_norm": 1.1462972164154053, + "learning_rate": 4.608009024252679e-06, + "loss": 0.7046, + "step": 16097 + }, + { + "epoch": 9.079526226734348, + "grad_norm": 1.522416591644287, + "learning_rate": 4.605188945290468e-06, + "loss": 0.7078, + "step": 16098 + }, + { + "epoch": 9.08009024252679, + "grad_norm": 1.3578976392745972, + "learning_rate": 4.602368866328258e-06, + "loss": 0.7537, + "step": 16099 + }, + { + "epoch": 9.080654258319234, + "grad_norm": 1.0975170135498047, + "learning_rate": 4.599548787366047e-06, + "loss": 0.6682, + "step": 16100 + }, + { + "epoch": 9.081218274111675, + "grad_norm": 1.3777201175689697, + "learning_rate": 4.596728708403836e-06, + "loss": 0.7803, + "step": 16101 + }, + { + "epoch": 9.081782289904117, + "grad_norm": 1.1617611646652222, + "learning_rate": 4.593908629441624e-06, + "loss": 0.7997, + "step": 16102 + }, + { + "epoch": 9.082346305696559, + "grad_norm": 1.3660231828689575, + "learning_rate": 4.591088550479414e-06, + "loss": 0.7637, + "step": 16103 + }, + { + "epoch": 9.082910321489003, + "grad_norm": 1.053105115890503, + "learning_rate": 4.588268471517203e-06, + "loss": 0.7099, + "step": 16104 + }, + { + "epoch": 9.083474337281444, + "grad_norm": 1.246742606163025, + "learning_rate": 4.585448392554992e-06, + "loss": 0.7466, + "step": 16105 + }, + { + "epoch": 9.084038353073886, + "grad_norm": 1.414999008178711, + "learning_rate": 4.58262831359278e-06, + "loss": 0.7534, + "step": 16106 + }, + { + "epoch": 9.084602368866328, + "grad_norm": 1.191598653793335, + "learning_rate": 4.57980823463057e-06, + "loss": 0.6993, + "step": 16107 + }, + { + "epoch": 9.08516638465877, + "grad_norm": 1.0413438081741333, + "learning_rate": 4.576988155668359e-06, + "loss": 0.7713, + "step": 16108 + }, + { + "epoch": 9.085730400451213, + "grad_norm": 1.0777570009231567, + "learning_rate": 4.574168076706148e-06, + "loss": 0.6803, + "step": 16109 + }, + { + "epoch": 9.086294416243655, + "grad_norm": 1.0583102703094482, + "learning_rate": 4.571347997743937e-06, + "loss": 0.8666, + "step": 16110 + }, + { + "epoch": 9.086858432036097, + "grad_norm": 1.610270380973816, + "learning_rate": 4.568527918781726e-06, + "loss": 0.7291, + "step": 16111 + }, + { + "epoch": 9.087422447828539, + "grad_norm": 1.2236613035202026, + "learning_rate": 4.565707839819515e-06, + "loss": 0.7855, + "step": 16112 + }, + { + "epoch": 9.08798646362098, + "grad_norm": 0.9823960661888123, + "learning_rate": 4.562887760857304e-06, + "loss": 0.7494, + "step": 16113 + }, + { + "epoch": 9.088550479413424, + "grad_norm": 1.3944199085235596, + "learning_rate": 4.560067681895093e-06, + "loss": 0.7185, + "step": 16114 + }, + { + "epoch": 9.089114495205866, + "grad_norm": 1.0476011037826538, + "learning_rate": 4.557247602932882e-06, + "loss": 0.7287, + "step": 16115 + }, + { + "epoch": 9.089678510998308, + "grad_norm": 0.9022760391235352, + "learning_rate": 4.554427523970672e-06, + "loss": 0.7261, + "step": 16116 + }, + { + "epoch": 9.09024252679075, + "grad_norm": 1.065961480140686, + "learning_rate": 4.5516074450084606e-06, + "loss": 0.7845, + "step": 16117 + }, + { + "epoch": 9.090806542583193, + "grad_norm": 1.1234095096588135, + "learning_rate": 4.548787366046249e-06, + "loss": 0.7361, + "step": 16118 + }, + { + "epoch": 9.091370558375635, + "grad_norm": 1.2683006525039673, + "learning_rate": 4.545967287084039e-06, + "loss": 0.7816, + "step": 16119 + }, + { + "epoch": 9.091934574168077, + "grad_norm": 1.1081433296203613, + "learning_rate": 4.543147208121828e-06, + "loss": 0.6588, + "step": 16120 + }, + { + "epoch": 9.092498589960519, + "grad_norm": 1.7631394863128662, + "learning_rate": 4.540327129159617e-06, + "loss": 0.8113, + "step": 16121 + }, + { + "epoch": 9.09306260575296, + "grad_norm": 1.4281777143478394, + "learning_rate": 4.537507050197406e-06, + "loss": 0.7681, + "step": 16122 + }, + { + "epoch": 9.093626621545404, + "grad_norm": 1.0229394435882568, + "learning_rate": 4.534686971235195e-06, + "loss": 0.6918, + "step": 16123 + }, + { + "epoch": 9.094190637337846, + "grad_norm": 1.1410080194473267, + "learning_rate": 4.531866892272984e-06, + "loss": 0.75, + "step": 16124 + }, + { + "epoch": 9.094754653130288, + "grad_norm": 0.8399147987365723, + "learning_rate": 4.529046813310773e-06, + "loss": 0.6701, + "step": 16125 + }, + { + "epoch": 9.09531866892273, + "grad_norm": 1.2211812734603882, + "learning_rate": 4.526226734348562e-06, + "loss": 0.7465, + "step": 16126 + }, + { + "epoch": 9.095882684715171, + "grad_norm": 1.2039499282836914, + "learning_rate": 4.523406655386351e-06, + "loss": 0.7967, + "step": 16127 + }, + { + "epoch": 9.096446700507615, + "grad_norm": 1.1658422946929932, + "learning_rate": 4.52058657642414e-06, + "loss": 0.7731, + "step": 16128 + }, + { + "epoch": 9.097010716300057, + "grad_norm": 1.3508890867233276, + "learning_rate": 4.517766497461929e-06, + "loss": 0.7685, + "step": 16129 + }, + { + "epoch": 9.097574732092498, + "grad_norm": 1.004562258720398, + "learning_rate": 4.514946418499718e-06, + "loss": 0.679, + "step": 16130 + }, + { + "epoch": 9.09813874788494, + "grad_norm": 1.0317918062210083, + "learning_rate": 4.512126339537507e-06, + "loss": 0.6519, + "step": 16131 + }, + { + "epoch": 9.098702763677384, + "grad_norm": 1.2100838422775269, + "learning_rate": 4.509306260575296e-06, + "loss": 0.8215, + "step": 16132 + }, + { + "epoch": 9.099266779469826, + "grad_norm": 1.1527905464172363, + "learning_rate": 4.5064861816130855e-06, + "loss": 0.8057, + "step": 16133 + }, + { + "epoch": 9.099830795262267, + "grad_norm": 1.4418792724609375, + "learning_rate": 4.503666102650874e-06, + "loss": 0.8544, + "step": 16134 + }, + { + "epoch": 9.10039481105471, + "grad_norm": 0.872087836265564, + "learning_rate": 4.500846023688664e-06, + "loss": 0.7111, + "step": 16135 + }, + { + "epoch": 9.100958826847151, + "grad_norm": 0.9522708654403687, + "learning_rate": 4.498025944726453e-06, + "loss": 0.6836, + "step": 16136 + }, + { + "epoch": 9.101522842639595, + "grad_norm": 1.2446537017822266, + "learning_rate": 4.4952058657642416e-06, + "loss": 0.7699, + "step": 16137 + }, + { + "epoch": 9.102086858432036, + "grad_norm": 1.0636556148529053, + "learning_rate": 4.492385786802031e-06, + "loss": 0.7381, + "step": 16138 + }, + { + "epoch": 9.102650874224478, + "grad_norm": 1.1489278078079224, + "learning_rate": 4.48956570783982e-06, + "loss": 0.8183, + "step": 16139 + }, + { + "epoch": 9.10321489001692, + "grad_norm": 1.1305546760559082, + "learning_rate": 4.486745628877609e-06, + "loss": 0.7599, + "step": 16140 + }, + { + "epoch": 9.103778905809362, + "grad_norm": 1.0552092790603638, + "learning_rate": 4.483925549915398e-06, + "loss": 0.7364, + "step": 16141 + }, + { + "epoch": 9.104342921601805, + "grad_norm": 1.0071816444396973, + "learning_rate": 4.481105470953187e-06, + "loss": 0.715, + "step": 16142 + }, + { + "epoch": 9.104906937394247, + "grad_norm": 1.080299735069275, + "learning_rate": 4.478285391990976e-06, + "loss": 0.7322, + "step": 16143 + }, + { + "epoch": 9.105470953186689, + "grad_norm": 1.4481704235076904, + "learning_rate": 4.475465313028765e-06, + "loss": 0.748, + "step": 16144 + }, + { + "epoch": 9.106034968979131, + "grad_norm": 1.0257490873336792, + "learning_rate": 4.472645234066554e-06, + "loss": 0.7819, + "step": 16145 + }, + { + "epoch": 9.106598984771574, + "grad_norm": 1.138546347618103, + "learning_rate": 4.469825155104343e-06, + "loss": 0.708, + "step": 16146 + }, + { + "epoch": 9.107163000564016, + "grad_norm": 1.1100693941116333, + "learning_rate": 4.467005076142132e-06, + "loss": 0.7128, + "step": 16147 + }, + { + "epoch": 9.107727016356458, + "grad_norm": 1.0376390218734741, + "learning_rate": 4.464184997179921e-06, + "loss": 0.73, + "step": 16148 + }, + { + "epoch": 9.1082910321489, + "grad_norm": 1.0429298877716064, + "learning_rate": 4.46136491821771e-06, + "loss": 0.7259, + "step": 16149 + }, + { + "epoch": 9.108855047941342, + "grad_norm": 0.9901449084281921, + "learning_rate": 4.458544839255499e-06, + "loss": 0.7402, + "step": 16150 + }, + { + "epoch": 9.109419063733785, + "grad_norm": 1.3132340908050537, + "learning_rate": 4.455724760293289e-06, + "loss": 0.6263, + "step": 16151 + }, + { + "epoch": 9.109983079526227, + "grad_norm": 0.8644490242004395, + "learning_rate": 4.452904681331078e-06, + "loss": 0.7339, + "step": 16152 + }, + { + "epoch": 9.110547095318669, + "grad_norm": 1.108721137046814, + "learning_rate": 4.4500846023688666e-06, + "loss": 0.7534, + "step": 16153 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 1.3228652477264404, + "learning_rate": 4.447264523406656e-06, + "loss": 0.6868, + "step": 16154 + }, + { + "epoch": 9.111675126903553, + "grad_norm": 1.4110759496688843, + "learning_rate": 4.444444444444445e-06, + "loss": 0.8182, + "step": 16155 + }, + { + "epoch": 9.112239142695996, + "grad_norm": 1.0533279180526733, + "learning_rate": 4.441624365482234e-06, + "loss": 0.6673, + "step": 16156 + }, + { + "epoch": 9.112803158488438, + "grad_norm": 1.1314057111740112, + "learning_rate": 4.438804286520023e-06, + "loss": 0.6561, + "step": 16157 + }, + { + "epoch": 9.11336717428088, + "grad_norm": 1.0933759212493896, + "learning_rate": 4.435984207557812e-06, + "loss": 0.8267, + "step": 16158 + }, + { + "epoch": 9.113931190073322, + "grad_norm": 0.9434036612510681, + "learning_rate": 4.433164128595601e-06, + "loss": 0.6329, + "step": 16159 + }, + { + "epoch": 9.114495205865765, + "grad_norm": 1.1425267457962036, + "learning_rate": 4.43034404963339e-06, + "loss": 0.7379, + "step": 16160 + }, + { + "epoch": 9.115059221658207, + "grad_norm": 1.2401926517486572, + "learning_rate": 4.427523970671179e-06, + "loss": 0.7383, + "step": 16161 + }, + { + "epoch": 9.115623237450649, + "grad_norm": 1.1675130128860474, + "learning_rate": 4.424703891708968e-06, + "loss": 0.6612, + "step": 16162 + }, + { + "epoch": 9.11618725324309, + "grad_norm": 0.9214248657226562, + "learning_rate": 4.421883812746757e-06, + "loss": 0.7273, + "step": 16163 + }, + { + "epoch": 9.116751269035532, + "grad_norm": 1.1455014944076538, + "learning_rate": 4.419063733784546e-06, + "loss": 0.7177, + "step": 16164 + }, + { + "epoch": 9.117315284827976, + "grad_norm": 0.9182458519935608, + "learning_rate": 4.416243654822335e-06, + "loss": 0.7558, + "step": 16165 + }, + { + "epoch": 9.117879300620418, + "grad_norm": 1.032861590385437, + "learning_rate": 4.413423575860124e-06, + "loss": 0.791, + "step": 16166 + }, + { + "epoch": 9.11844331641286, + "grad_norm": 1.2692387104034424, + "learning_rate": 4.410603496897913e-06, + "loss": 0.7784, + "step": 16167 + }, + { + "epoch": 9.119007332205301, + "grad_norm": 0.808616042137146, + "learning_rate": 4.407783417935702e-06, + "loss": 0.6394, + "step": 16168 + }, + { + "epoch": 9.119571347997743, + "grad_norm": 1.2250127792358398, + "learning_rate": 4.4049633389734915e-06, + "loss": 0.7425, + "step": 16169 + }, + { + "epoch": 9.120135363790187, + "grad_norm": 0.9961995482444763, + "learning_rate": 4.402143260011281e-06, + "loss": 0.7375, + "step": 16170 + }, + { + "epoch": 9.120699379582629, + "grad_norm": 1.4625091552734375, + "learning_rate": 4.39932318104907e-06, + "loss": 0.7932, + "step": 16171 + }, + { + "epoch": 9.12126339537507, + "grad_norm": 1.015218734741211, + "learning_rate": 4.396503102086859e-06, + "loss": 0.7777, + "step": 16172 + }, + { + "epoch": 9.121827411167512, + "grad_norm": 1.2886968851089478, + "learning_rate": 4.3936830231246476e-06, + "loss": 0.7013, + "step": 16173 + }, + { + "epoch": 9.122391426959956, + "grad_norm": 1.1412982940673828, + "learning_rate": 4.390862944162437e-06, + "loss": 0.68, + "step": 16174 + }, + { + "epoch": 9.122955442752398, + "grad_norm": 1.0603257417678833, + "learning_rate": 4.388042865200226e-06, + "loss": 0.7036, + "step": 16175 + }, + { + "epoch": 9.12351945854484, + "grad_norm": 0.8965171575546265, + "learning_rate": 4.385222786238015e-06, + "loss": 0.7348, + "step": 16176 + }, + { + "epoch": 9.124083474337281, + "grad_norm": 0.954842746257782, + "learning_rate": 4.382402707275804e-06, + "loss": 0.738, + "step": 16177 + }, + { + "epoch": 9.124647490129723, + "grad_norm": 1.037748098373413, + "learning_rate": 4.379582628313593e-06, + "loss": 0.6472, + "step": 16178 + }, + { + "epoch": 9.125211505922167, + "grad_norm": 1.2443764209747314, + "learning_rate": 4.376762549351382e-06, + "loss": 0.7242, + "step": 16179 + }, + { + "epoch": 9.125775521714608, + "grad_norm": 1.0923717021942139, + "learning_rate": 4.373942470389171e-06, + "loss": 0.6823, + "step": 16180 + }, + { + "epoch": 9.12633953750705, + "grad_norm": 1.199257254600525, + "learning_rate": 4.37112239142696e-06, + "loss": 0.722, + "step": 16181 + }, + { + "epoch": 9.126903553299492, + "grad_norm": 1.1705117225646973, + "learning_rate": 4.368302312464749e-06, + "loss": 0.8159, + "step": 16182 + }, + { + "epoch": 9.127467569091934, + "grad_norm": 1.0152450799942017, + "learning_rate": 4.365482233502538e-06, + "loss": 0.7675, + "step": 16183 + }, + { + "epoch": 9.128031584884377, + "grad_norm": 1.2268179655075073, + "learning_rate": 4.362662154540327e-06, + "loss": 0.776, + "step": 16184 + }, + { + "epoch": 9.12859560067682, + "grad_norm": 1.0451217889785767, + "learning_rate": 4.359842075578116e-06, + "loss": 0.7193, + "step": 16185 + }, + { + "epoch": 9.129159616469261, + "grad_norm": 1.4649015665054321, + "learning_rate": 4.357021996615906e-06, + "loss": 0.6476, + "step": 16186 + }, + { + "epoch": 9.129723632261703, + "grad_norm": 1.0989917516708374, + "learning_rate": 4.354201917653695e-06, + "loss": 0.7011, + "step": 16187 + }, + { + "epoch": 9.130287648054146, + "grad_norm": 1.2926746606826782, + "learning_rate": 4.351381838691484e-06, + "loss": 0.7264, + "step": 16188 + }, + { + "epoch": 9.130851663846588, + "grad_norm": 1.2408719062805176, + "learning_rate": 4.3485617597292725e-06, + "loss": 0.7157, + "step": 16189 + }, + { + "epoch": 9.13141567963903, + "grad_norm": 1.1262847185134888, + "learning_rate": 4.345741680767062e-06, + "loss": 0.7695, + "step": 16190 + }, + { + "epoch": 9.131979695431472, + "grad_norm": 0.9239082932472229, + "learning_rate": 4.342921601804851e-06, + "loss": 0.6941, + "step": 16191 + }, + { + "epoch": 9.132543711223914, + "grad_norm": 1.1613889932632446, + "learning_rate": 4.34010152284264e-06, + "loss": 0.8454, + "step": 16192 + }, + { + "epoch": 9.133107727016357, + "grad_norm": 1.3228230476379395, + "learning_rate": 4.337281443880429e-06, + "loss": 0.7326, + "step": 16193 + }, + { + "epoch": 9.133671742808799, + "grad_norm": 1.3938850164413452, + "learning_rate": 4.334461364918218e-06, + "loss": 0.7549, + "step": 16194 + }, + { + "epoch": 9.13423575860124, + "grad_norm": 0.9386385679244995, + "learning_rate": 4.331641285956007e-06, + "loss": 0.6864, + "step": 16195 + }, + { + "epoch": 9.134799774393683, + "grad_norm": 1.5524202585220337, + "learning_rate": 4.328821206993796e-06, + "loss": 0.8157, + "step": 16196 + }, + { + "epoch": 9.135363790186124, + "grad_norm": 0.9794740676879883, + "learning_rate": 4.326001128031585e-06, + "loss": 0.7407, + "step": 16197 + }, + { + "epoch": 9.135927805978568, + "grad_norm": 1.0772299766540527, + "learning_rate": 4.323181049069374e-06, + "loss": 0.6835, + "step": 16198 + }, + { + "epoch": 9.13649182177101, + "grad_norm": 0.9936906695365906, + "learning_rate": 4.320360970107163e-06, + "loss": 0.8468, + "step": 16199 + }, + { + "epoch": 9.137055837563452, + "grad_norm": 1.1385432481765747, + "learning_rate": 4.317540891144952e-06, + "loss": 0.7391, + "step": 16200 + }, + { + "epoch": 9.137619853355893, + "grad_norm": 1.239545226097107, + "learning_rate": 4.3147208121827415e-06, + "loss": 0.7935, + "step": 16201 + }, + { + "epoch": 9.138183869148337, + "grad_norm": 1.1543071269989014, + "learning_rate": 4.31190073322053e-06, + "loss": 0.7026, + "step": 16202 + }, + { + "epoch": 9.138747884940779, + "grad_norm": 0.9887071847915649, + "learning_rate": 4.309080654258319e-06, + "loss": 0.7301, + "step": 16203 + }, + { + "epoch": 9.13931190073322, + "grad_norm": 1.3183302879333496, + "learning_rate": 4.306260575296109e-06, + "loss": 0.799, + "step": 16204 + }, + { + "epoch": 9.139875916525662, + "grad_norm": 1.166011929512024, + "learning_rate": 4.3034404963338975e-06, + "loss": 0.7029, + "step": 16205 + }, + { + "epoch": 9.140439932318104, + "grad_norm": 1.0245563983917236, + "learning_rate": 4.300620417371687e-06, + "loss": 0.6704, + "step": 16206 + }, + { + "epoch": 9.141003948110548, + "grad_norm": 1.3167126178741455, + "learning_rate": 4.297800338409476e-06, + "loss": 0.778, + "step": 16207 + }, + { + "epoch": 9.14156796390299, + "grad_norm": 1.0158451795578003, + "learning_rate": 4.294980259447265e-06, + "loss": 0.8703, + "step": 16208 + }, + { + "epoch": 9.142131979695431, + "grad_norm": 1.103230595588684, + "learning_rate": 4.2921601804850536e-06, + "loss": 0.6581, + "step": 16209 + }, + { + "epoch": 9.142695995487873, + "grad_norm": 1.0641635656356812, + "learning_rate": 4.289340101522843e-06, + "loss": 0.7282, + "step": 16210 + }, + { + "epoch": 9.143260011280315, + "grad_norm": 1.159733533859253, + "learning_rate": 4.286520022560632e-06, + "loss": 0.7268, + "step": 16211 + }, + { + "epoch": 9.143824027072759, + "grad_norm": 1.140592336654663, + "learning_rate": 4.283699943598421e-06, + "loss": 0.8352, + "step": 16212 + }, + { + "epoch": 9.1443880428652, + "grad_norm": 1.3258304595947266, + "learning_rate": 4.28087986463621e-06, + "loss": 0.741, + "step": 16213 + }, + { + "epoch": 9.144952058657642, + "grad_norm": 1.0909216403961182, + "learning_rate": 4.278059785673999e-06, + "loss": 0.6952, + "step": 16214 + }, + { + "epoch": 9.145516074450084, + "grad_norm": 1.115855097770691, + "learning_rate": 4.275239706711788e-06, + "loss": 0.7544, + "step": 16215 + }, + { + "epoch": 9.146080090242528, + "grad_norm": 0.7990935444831848, + "learning_rate": 4.272419627749577e-06, + "loss": 0.5988, + "step": 16216 + }, + { + "epoch": 9.14664410603497, + "grad_norm": 0.8566650152206421, + "learning_rate": 4.2695995487873665e-06, + "loss": 0.7078, + "step": 16217 + }, + { + "epoch": 9.147208121827411, + "grad_norm": 1.241239070892334, + "learning_rate": 4.266779469825155e-06, + "loss": 0.929, + "step": 16218 + }, + { + "epoch": 9.147772137619853, + "grad_norm": 1.0693343877792358, + "learning_rate": 4.263959390862944e-06, + "loss": 0.7473, + "step": 16219 + }, + { + "epoch": 9.148336153412295, + "grad_norm": 0.7507848739624023, + "learning_rate": 4.261139311900733e-06, + "loss": 0.6142, + "step": 16220 + }, + { + "epoch": 9.148900169204738, + "grad_norm": 1.3160749673843384, + "learning_rate": 4.2583192329385225e-06, + "loss": 0.7908, + "step": 16221 + }, + { + "epoch": 9.14946418499718, + "grad_norm": 0.977007269859314, + "learning_rate": 4.255499153976312e-06, + "loss": 0.7872, + "step": 16222 + }, + { + "epoch": 9.150028200789622, + "grad_norm": 1.1566091775894165, + "learning_rate": 4.252679075014101e-06, + "loss": 0.6762, + "step": 16223 + }, + { + "epoch": 9.150592216582064, + "grad_norm": 1.160731554031372, + "learning_rate": 4.24985899605189e-06, + "loss": 0.7213, + "step": 16224 + }, + { + "epoch": 9.151156232374506, + "grad_norm": 1.2820221185684204, + "learning_rate": 4.2470389170896785e-06, + "loss": 0.7365, + "step": 16225 + }, + { + "epoch": 9.15172024816695, + "grad_norm": 0.9196208119392395, + "learning_rate": 4.244218838127468e-06, + "loss": 0.607, + "step": 16226 + }, + { + "epoch": 9.152284263959391, + "grad_norm": 0.7803400754928589, + "learning_rate": 4.241398759165257e-06, + "loss": 0.6465, + "step": 16227 + }, + { + "epoch": 9.152848279751833, + "grad_norm": 0.9460358619689941, + "learning_rate": 4.238578680203046e-06, + "loss": 0.6591, + "step": 16228 + }, + { + "epoch": 9.153412295544275, + "grad_norm": 1.1653975248336792, + "learning_rate": 4.2357586012408346e-06, + "loss": 0.7613, + "step": 16229 + }, + { + "epoch": 9.153976311336718, + "grad_norm": 1.0602221488952637, + "learning_rate": 4.232938522278624e-06, + "loss": 0.7086, + "step": 16230 + }, + { + "epoch": 9.15454032712916, + "grad_norm": 1.3423354625701904, + "learning_rate": 4.230118443316413e-06, + "loss": 0.7205, + "step": 16231 + }, + { + "epoch": 9.155104342921602, + "grad_norm": 1.2735742330551147, + "learning_rate": 4.227298364354202e-06, + "loss": 0.7625, + "step": 16232 + }, + { + "epoch": 9.155668358714044, + "grad_norm": 0.9813812971115112, + "learning_rate": 4.2244782853919915e-06, + "loss": 0.7976, + "step": 16233 + }, + { + "epoch": 9.156232374506486, + "grad_norm": 1.7177094221115112, + "learning_rate": 4.22165820642978e-06, + "loss": 0.8302, + "step": 16234 + }, + { + "epoch": 9.156796390298929, + "grad_norm": 1.4608129262924194, + "learning_rate": 4.218838127467569e-06, + "loss": 0.7644, + "step": 16235 + }, + { + "epoch": 9.157360406091371, + "grad_norm": 1.1926722526550293, + "learning_rate": 4.216018048505358e-06, + "loss": 0.7068, + "step": 16236 + }, + { + "epoch": 9.157924421883813, + "grad_norm": 0.8508747816085815, + "learning_rate": 4.2131979695431475e-06, + "loss": 0.6001, + "step": 16237 + }, + { + "epoch": 9.158488437676255, + "grad_norm": 1.2279667854309082, + "learning_rate": 4.210377890580936e-06, + "loss": 0.7268, + "step": 16238 + }, + { + "epoch": 9.159052453468696, + "grad_norm": 1.2388124465942383, + "learning_rate": 4.207557811618726e-06, + "loss": 0.8042, + "step": 16239 + }, + { + "epoch": 9.15961646926114, + "grad_norm": 1.3446416854858398, + "learning_rate": 4.204737732656515e-06, + "loss": 0.7848, + "step": 16240 + }, + { + "epoch": 9.160180485053582, + "grad_norm": 0.8814458250999451, + "learning_rate": 4.2019176536943035e-06, + "loss": 0.7023, + "step": 16241 + }, + { + "epoch": 9.160744500846024, + "grad_norm": 0.9517921805381775, + "learning_rate": 4.199097574732093e-06, + "loss": 0.7117, + "step": 16242 + }, + { + "epoch": 9.161308516638465, + "grad_norm": 1.0717854499816895, + "learning_rate": 4.196277495769882e-06, + "loss": 0.6729, + "step": 16243 + }, + { + "epoch": 9.161872532430909, + "grad_norm": 0.9981174468994141, + "learning_rate": 4.193457416807671e-06, + "loss": 0.8021, + "step": 16244 + }, + { + "epoch": 9.16243654822335, + "grad_norm": 1.3794025182724, + "learning_rate": 4.1906373378454596e-06, + "loss": 0.7189, + "step": 16245 + }, + { + "epoch": 9.163000564015793, + "grad_norm": 1.3087093830108643, + "learning_rate": 4.187817258883249e-06, + "loss": 0.7699, + "step": 16246 + }, + { + "epoch": 9.163564579808234, + "grad_norm": 1.0363695621490479, + "learning_rate": 4.184997179921038e-06, + "loss": 0.6791, + "step": 16247 + }, + { + "epoch": 9.164128595600676, + "grad_norm": 0.9268625378608704, + "learning_rate": 4.182177100958827e-06, + "loss": 0.6975, + "step": 16248 + }, + { + "epoch": 9.16469261139312, + "grad_norm": 0.8144223690032959, + "learning_rate": 4.1793570219966164e-06, + "loss": 0.6394, + "step": 16249 + }, + { + "epoch": 9.165256627185562, + "grad_norm": 0.96278315782547, + "learning_rate": 4.176536943034405e-06, + "loss": 0.6715, + "step": 16250 + }, + { + "epoch": 9.165820642978003, + "grad_norm": 1.2092636823654175, + "learning_rate": 4.173716864072194e-06, + "loss": 0.6539, + "step": 16251 + }, + { + "epoch": 9.166384658770445, + "grad_norm": 0.9558839201927185, + "learning_rate": 4.170896785109983e-06, + "loss": 0.6991, + "step": 16252 + }, + { + "epoch": 9.166948674562887, + "grad_norm": 1.4217491149902344, + "learning_rate": 4.1680767061477725e-06, + "loss": 0.8063, + "step": 16253 + }, + { + "epoch": 9.16751269035533, + "grad_norm": 0.9972169995307922, + "learning_rate": 4.165256627185561e-06, + "loss": 0.718, + "step": 16254 + }, + { + "epoch": 9.168076706147772, + "grad_norm": 1.1399407386779785, + "learning_rate": 4.16243654822335e-06, + "loss": 0.761, + "step": 16255 + }, + { + "epoch": 9.168640721940214, + "grad_norm": 0.9951194524765015, + "learning_rate": 4.159616469261139e-06, + "loss": 0.6416, + "step": 16256 + }, + { + "epoch": 9.169204737732656, + "grad_norm": 1.2594294548034668, + "learning_rate": 4.1567963902989285e-06, + "loss": 0.5521, + "step": 16257 + }, + { + "epoch": 9.1697687535251, + "grad_norm": 1.069734811782837, + "learning_rate": 4.153976311336718e-06, + "loss": 0.7658, + "step": 16258 + }, + { + "epoch": 9.170332769317541, + "grad_norm": 0.8085337281227112, + "learning_rate": 4.151156232374507e-06, + "loss": 0.626, + "step": 16259 + }, + { + "epoch": 9.170896785109983, + "grad_norm": 1.4131510257720947, + "learning_rate": 4.148336153412296e-06, + "loss": 0.7649, + "step": 16260 + }, + { + "epoch": 9.171460800902425, + "grad_norm": 1.1323330402374268, + "learning_rate": 4.145516074450085e-06, + "loss": 0.8137, + "step": 16261 + }, + { + "epoch": 9.172024816694867, + "grad_norm": 1.0935124158859253, + "learning_rate": 4.142695995487874e-06, + "loss": 0.754, + "step": 16262 + }, + { + "epoch": 9.17258883248731, + "grad_norm": 1.1644401550292969, + "learning_rate": 4.139875916525663e-06, + "loss": 0.6251, + "step": 16263 + }, + { + "epoch": 9.173152848279752, + "grad_norm": 1.4396113157272339, + "learning_rate": 4.137055837563452e-06, + "loss": 0.7538, + "step": 16264 + }, + { + "epoch": 9.173716864072194, + "grad_norm": 1.2009528875350952, + "learning_rate": 4.134235758601241e-06, + "loss": 0.7238, + "step": 16265 + }, + { + "epoch": 9.174280879864636, + "grad_norm": 0.8626651763916016, + "learning_rate": 4.13141567963903e-06, + "loss": 0.6678, + "step": 16266 + }, + { + "epoch": 9.174844895657078, + "grad_norm": 1.2956900596618652, + "learning_rate": 4.128595600676819e-06, + "loss": 0.7251, + "step": 16267 + }, + { + "epoch": 9.175408911449521, + "grad_norm": 1.0113346576690674, + "learning_rate": 4.125775521714608e-06, + "loss": 0.7096, + "step": 16268 + }, + { + "epoch": 9.175972927241963, + "grad_norm": 1.1280697584152222, + "learning_rate": 4.1229554427523974e-06, + "loss": 0.8126, + "step": 16269 + }, + { + "epoch": 9.176536943034405, + "grad_norm": 1.2035424709320068, + "learning_rate": 4.120135363790186e-06, + "loss": 0.7547, + "step": 16270 + }, + { + "epoch": 9.177100958826847, + "grad_norm": 0.9778417348861694, + "learning_rate": 4.117315284827975e-06, + "loss": 0.7118, + "step": 16271 + }, + { + "epoch": 9.17766497461929, + "grad_norm": 1.0680824518203735, + "learning_rate": 4.114495205865764e-06, + "loss": 0.7089, + "step": 16272 + }, + { + "epoch": 9.178228990411732, + "grad_norm": 1.2401032447814941, + "learning_rate": 4.1116751269035535e-06, + "loss": 0.7381, + "step": 16273 + }, + { + "epoch": 9.178793006204174, + "grad_norm": 1.0978517532348633, + "learning_rate": 4.108855047941342e-06, + "loss": 0.721, + "step": 16274 + }, + { + "epoch": 9.179357021996616, + "grad_norm": 1.1368603706359863, + "learning_rate": 4.106034968979132e-06, + "loss": 0.6816, + "step": 16275 + }, + { + "epoch": 9.179921037789057, + "grad_norm": 1.2958694696426392, + "learning_rate": 4.103214890016921e-06, + "loss": 0.6929, + "step": 16276 + }, + { + "epoch": 9.180485053581501, + "grad_norm": 1.3167595863342285, + "learning_rate": 4.10039481105471e-06, + "loss": 0.7655, + "step": 16277 + }, + { + "epoch": 9.181049069373943, + "grad_norm": 1.4576207399368286, + "learning_rate": 4.097574732092499e-06, + "loss": 0.806, + "step": 16278 + }, + { + "epoch": 9.181613085166385, + "grad_norm": 1.106299638748169, + "learning_rate": 4.094754653130288e-06, + "loss": 0.7225, + "step": 16279 + }, + { + "epoch": 9.182177100958826, + "grad_norm": 1.1525267362594604, + "learning_rate": 4.091934574168077e-06, + "loss": 0.7424, + "step": 16280 + }, + { + "epoch": 9.182741116751268, + "grad_norm": 1.058426022529602, + "learning_rate": 4.089114495205866e-06, + "loss": 0.6753, + "step": 16281 + }, + { + "epoch": 9.183305132543712, + "grad_norm": 1.4010460376739502, + "learning_rate": 4.086294416243655e-06, + "loss": 0.8239, + "step": 16282 + }, + { + "epoch": 9.183869148336154, + "grad_norm": 2.2168710231781006, + "learning_rate": 4.083474337281444e-06, + "loss": 0.6045, + "step": 16283 + }, + { + "epoch": 9.184433164128595, + "grad_norm": 1.2393990755081177, + "learning_rate": 4.080654258319233e-06, + "loss": 0.7766, + "step": 16284 + }, + { + "epoch": 9.184997179921037, + "grad_norm": 1.3038748502731323, + "learning_rate": 4.0778341793570224e-06, + "loss": 0.7805, + "step": 16285 + }, + { + "epoch": 9.18556119571348, + "grad_norm": 1.2315683364868164, + "learning_rate": 4.075014100394811e-06, + "loss": 0.725, + "step": 16286 + }, + { + "epoch": 9.186125211505923, + "grad_norm": 1.2287400960922241, + "learning_rate": 4.0721940214326e-06, + "loss": 0.7629, + "step": 16287 + }, + { + "epoch": 9.186689227298364, + "grad_norm": 1.1322404146194458, + "learning_rate": 4.069373942470389e-06, + "loss": 0.6349, + "step": 16288 + }, + { + "epoch": 9.187253243090806, + "grad_norm": 1.629786729812622, + "learning_rate": 4.0665538635081785e-06, + "loss": 0.8786, + "step": 16289 + }, + { + "epoch": 9.187817258883248, + "grad_norm": 0.9108105301856995, + "learning_rate": 4.063733784545967e-06, + "loss": 0.6979, + "step": 16290 + }, + { + "epoch": 9.188381274675692, + "grad_norm": 1.291988492012024, + "learning_rate": 4.060913705583756e-06, + "loss": 0.8245, + "step": 16291 + }, + { + "epoch": 9.188945290468133, + "grad_norm": 1.1476794481277466, + "learning_rate": 4.058093626621546e-06, + "loss": 0.6833, + "step": 16292 + }, + { + "epoch": 9.189509306260575, + "grad_norm": 1.4970275163650513, + "learning_rate": 4.055273547659335e-06, + "loss": 0.7389, + "step": 16293 + }, + { + "epoch": 9.190073322053017, + "grad_norm": 1.1272962093353271, + "learning_rate": 4.052453468697124e-06, + "loss": 0.7204, + "step": 16294 + }, + { + "epoch": 9.190637337845459, + "grad_norm": 1.1552706956863403, + "learning_rate": 4.049633389734913e-06, + "loss": 0.779, + "step": 16295 + }, + { + "epoch": 9.191201353637902, + "grad_norm": 1.0496457815170288, + "learning_rate": 4.046813310772702e-06, + "loss": 0.7319, + "step": 16296 + }, + { + "epoch": 9.191765369430344, + "grad_norm": 1.0892667770385742, + "learning_rate": 4.043993231810491e-06, + "loss": 0.7452, + "step": 16297 + }, + { + "epoch": 9.192329385222786, + "grad_norm": 1.2486457824707031, + "learning_rate": 4.04117315284828e-06, + "loss": 0.7746, + "step": 16298 + }, + { + "epoch": 9.192893401015228, + "grad_norm": 1.3185995817184448, + "learning_rate": 4.038353073886069e-06, + "loss": 0.6841, + "step": 16299 + }, + { + "epoch": 9.193457416807671, + "grad_norm": 1.6664685010910034, + "learning_rate": 4.035532994923858e-06, + "loss": 0.8163, + "step": 16300 + }, + { + "epoch": 9.194021432600113, + "grad_norm": 1.1730254888534546, + "learning_rate": 4.032712915961647e-06, + "loss": 0.7691, + "step": 16301 + }, + { + "epoch": 9.194585448392555, + "grad_norm": 1.1897668838500977, + "learning_rate": 4.029892836999436e-06, + "loss": 0.7405, + "step": 16302 + }, + { + "epoch": 9.195149464184997, + "grad_norm": 1.105602741241455, + "learning_rate": 4.027072758037225e-06, + "loss": 0.7045, + "step": 16303 + }, + { + "epoch": 9.195713479977439, + "grad_norm": 1.036719799041748, + "learning_rate": 4.024252679075014e-06, + "loss": 0.7675, + "step": 16304 + }, + { + "epoch": 9.196277495769882, + "grad_norm": 1.1770117282867432, + "learning_rate": 4.0214326001128034e-06, + "loss": 0.7612, + "step": 16305 + }, + { + "epoch": 9.196841511562324, + "grad_norm": 1.4637141227722168, + "learning_rate": 4.018612521150592e-06, + "loss": 0.7631, + "step": 16306 + }, + { + "epoch": 9.197405527354766, + "grad_norm": 1.398167610168457, + "learning_rate": 4.015792442188381e-06, + "loss": 0.7721, + "step": 16307 + }, + { + "epoch": 9.197969543147208, + "grad_norm": 1.2308521270751953, + "learning_rate": 4.01297236322617e-06, + "loss": 0.8088, + "step": 16308 + }, + { + "epoch": 9.19853355893965, + "grad_norm": 0.8527022004127502, + "learning_rate": 4.0101522842639595e-06, + "loss": 0.7062, + "step": 16309 + }, + { + "epoch": 9.199097574732093, + "grad_norm": 1.1194403171539307, + "learning_rate": 4.007332205301749e-06, + "loss": 0.7553, + "step": 16310 + }, + { + "epoch": 9.199661590524535, + "grad_norm": 1.2688621282577515, + "learning_rate": 4.004512126339538e-06, + "loss": 0.7073, + "step": 16311 + }, + { + "epoch": 9.200225606316977, + "grad_norm": 1.4591350555419922, + "learning_rate": 4.001692047377327e-06, + "loss": 0.8068, + "step": 16312 + }, + { + "epoch": 9.200789622109419, + "grad_norm": 1.2634943723678589, + "learning_rate": 3.998871968415116e-06, + "loss": 0.7853, + "step": 16313 + }, + { + "epoch": 9.201353637901862, + "grad_norm": 1.3813681602478027, + "learning_rate": 3.996051889452905e-06, + "loss": 0.7068, + "step": 16314 + }, + { + "epoch": 9.201917653694304, + "grad_norm": 1.020168423652649, + "learning_rate": 3.993231810490694e-06, + "loss": 0.8259, + "step": 16315 + }, + { + "epoch": 9.202481669486746, + "grad_norm": 1.1678391695022583, + "learning_rate": 3.990411731528483e-06, + "loss": 0.8118, + "step": 16316 + }, + { + "epoch": 9.203045685279188, + "grad_norm": 0.9077108502388, + "learning_rate": 3.987591652566272e-06, + "loss": 0.6249, + "step": 16317 + }, + { + "epoch": 9.20360970107163, + "grad_norm": 1.010437250137329, + "learning_rate": 3.984771573604061e-06, + "loss": 0.7189, + "step": 16318 + }, + { + "epoch": 9.204173716864073, + "grad_norm": 1.3909317255020142, + "learning_rate": 3.98195149464185e-06, + "loss": 0.687, + "step": 16319 + }, + { + "epoch": 9.204737732656515, + "grad_norm": 1.2365823984146118, + "learning_rate": 3.979131415679639e-06, + "loss": 0.7922, + "step": 16320 + }, + { + "epoch": 9.205301748448957, + "grad_norm": 0.9465535283088684, + "learning_rate": 3.976311336717428e-06, + "loss": 0.711, + "step": 16321 + }, + { + "epoch": 9.205865764241398, + "grad_norm": 1.1060515642166138, + "learning_rate": 3.973491257755217e-06, + "loss": 0.7621, + "step": 16322 + }, + { + "epoch": 9.20642978003384, + "grad_norm": 5.942829608917236, + "learning_rate": 3.970671178793006e-06, + "loss": 0.8056, + "step": 16323 + }, + { + "epoch": 9.206993795826284, + "grad_norm": 0.9000225067138672, + "learning_rate": 3.967851099830795e-06, + "loss": 0.6248, + "step": 16324 + }, + { + "epoch": 9.207557811618726, + "grad_norm": 1.1845101118087769, + "learning_rate": 3.9650310208685845e-06, + "loss": 0.7169, + "step": 16325 + }, + { + "epoch": 9.208121827411167, + "grad_norm": 1.394002914428711, + "learning_rate": 3.962210941906373e-06, + "loss": 0.8174, + "step": 16326 + }, + { + "epoch": 9.20868584320361, + "grad_norm": 1.1399543285369873, + "learning_rate": 3.959390862944162e-06, + "loss": 0.627, + "step": 16327 + }, + { + "epoch": 9.209249858996053, + "grad_norm": 1.2804772853851318, + "learning_rate": 3.956570783981952e-06, + "loss": 0.6841, + "step": 16328 + }, + { + "epoch": 9.209813874788495, + "grad_norm": 1.41414213180542, + "learning_rate": 3.953750705019741e-06, + "loss": 0.7497, + "step": 16329 + }, + { + "epoch": 9.210377890580936, + "grad_norm": 1.0924936532974243, + "learning_rate": 3.95093062605753e-06, + "loss": 0.7348, + "step": 16330 + }, + { + "epoch": 9.210941906373378, + "grad_norm": 1.0301837921142578, + "learning_rate": 3.948110547095319e-06, + "loss": 0.7272, + "step": 16331 + }, + { + "epoch": 9.21150592216582, + "grad_norm": 0.929201602935791, + "learning_rate": 3.945290468133108e-06, + "loss": 0.7069, + "step": 16332 + }, + { + "epoch": 9.212069937958264, + "grad_norm": 0.877187967300415, + "learning_rate": 3.942470389170897e-06, + "loss": 0.7352, + "step": 16333 + }, + { + "epoch": 9.212633953750705, + "grad_norm": 1.2407708168029785, + "learning_rate": 3.939650310208686e-06, + "loss": 0.7645, + "step": 16334 + }, + { + "epoch": 9.213197969543147, + "grad_norm": 0.8515196442604065, + "learning_rate": 3.936830231246475e-06, + "loss": 0.7073, + "step": 16335 + }, + { + "epoch": 9.213761985335589, + "grad_norm": 0.9799866676330566, + "learning_rate": 3.934010152284264e-06, + "loss": 0.7905, + "step": 16336 + }, + { + "epoch": 9.21432600112803, + "grad_norm": 1.1374096870422363, + "learning_rate": 3.931190073322053e-06, + "loss": 0.68, + "step": 16337 + }, + { + "epoch": 9.214890016920474, + "grad_norm": 1.2316594123840332, + "learning_rate": 3.928369994359842e-06, + "loss": 0.8606, + "step": 16338 + }, + { + "epoch": 9.215454032712916, + "grad_norm": 0.9678153991699219, + "learning_rate": 3.925549915397631e-06, + "loss": 0.7637, + "step": 16339 + }, + { + "epoch": 9.216018048505358, + "grad_norm": 1.4272730350494385, + "learning_rate": 3.922729836435421e-06, + "loss": 0.798, + "step": 16340 + }, + { + "epoch": 9.2165820642978, + "grad_norm": 1.2008320093154907, + "learning_rate": 3.9199097574732094e-06, + "loss": 0.7146, + "step": 16341 + }, + { + "epoch": 9.217146080090243, + "grad_norm": 1.0293601751327515, + "learning_rate": 3.917089678510998e-06, + "loss": 0.7441, + "step": 16342 + }, + { + "epoch": 9.217710095882685, + "grad_norm": 1.3819866180419922, + "learning_rate": 3.914269599548787e-06, + "loss": 0.8097, + "step": 16343 + }, + { + "epoch": 9.218274111675127, + "grad_norm": 1.2056517601013184, + "learning_rate": 3.911449520586577e-06, + "loss": 0.7534, + "step": 16344 + }, + { + "epoch": 9.218838127467569, + "grad_norm": 1.281254529953003, + "learning_rate": 3.908629441624366e-06, + "loss": 0.6819, + "step": 16345 + }, + { + "epoch": 9.21940214326001, + "grad_norm": 1.0248936414718628, + "learning_rate": 3.905809362662155e-06, + "loss": 0.7315, + "step": 16346 + }, + { + "epoch": 9.219966159052454, + "grad_norm": 1.2345322370529175, + "learning_rate": 3.902989283699944e-06, + "loss": 0.7271, + "step": 16347 + }, + { + "epoch": 9.220530174844896, + "grad_norm": 0.9296325445175171, + "learning_rate": 3.900169204737733e-06, + "loss": 0.6497, + "step": 16348 + }, + { + "epoch": 9.221094190637338, + "grad_norm": 1.124742031097412, + "learning_rate": 3.897349125775522e-06, + "loss": 0.6214, + "step": 16349 + }, + { + "epoch": 9.22165820642978, + "grad_norm": 1.3152949810028076, + "learning_rate": 3.894529046813311e-06, + "loss": 0.6746, + "step": 16350 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 1.091979742050171, + "learning_rate": 3.8917089678511e-06, + "loss": 0.7204, + "step": 16351 + }, + { + "epoch": 9.222786238014665, + "grad_norm": 1.2263096570968628, + "learning_rate": 3.888888888888889e-06, + "loss": 0.7806, + "step": 16352 + }, + { + "epoch": 9.223350253807107, + "grad_norm": 1.39168119430542, + "learning_rate": 3.886068809926678e-06, + "loss": 0.6917, + "step": 16353 + }, + { + "epoch": 9.223914269599549, + "grad_norm": 1.1728994846343994, + "learning_rate": 3.883248730964467e-06, + "loss": 0.8163, + "step": 16354 + }, + { + "epoch": 9.22447828539199, + "grad_norm": 1.0781868696212769, + "learning_rate": 3.880428652002256e-06, + "loss": 0.6688, + "step": 16355 + }, + { + "epoch": 9.225042301184434, + "grad_norm": 1.0750380754470825, + "learning_rate": 3.877608573040046e-06, + "loss": 0.7912, + "step": 16356 + }, + { + "epoch": 9.225606316976876, + "grad_norm": 1.539908528327942, + "learning_rate": 3.874788494077834e-06, + "loss": 0.7881, + "step": 16357 + }, + { + "epoch": 9.226170332769318, + "grad_norm": 1.1102142333984375, + "learning_rate": 3.871968415115623e-06, + "loss": 0.7096, + "step": 16358 + }, + { + "epoch": 9.22673434856176, + "grad_norm": 1.4165425300598145, + "learning_rate": 3.869148336153412e-06, + "loss": 0.7151, + "step": 16359 + }, + { + "epoch": 9.227298364354201, + "grad_norm": 0.8795508742332458, + "learning_rate": 3.866328257191202e-06, + "loss": 0.652, + "step": 16360 + }, + { + "epoch": 9.227862380146645, + "grad_norm": 0.923186719417572, + "learning_rate": 3.8635081782289904e-06, + "loss": 0.7259, + "step": 16361 + }, + { + "epoch": 9.228426395939087, + "grad_norm": 1.0470130443572998, + "learning_rate": 3.860688099266779e-06, + "loss": 0.7693, + "step": 16362 + }, + { + "epoch": 9.228990411731528, + "grad_norm": 1.4420853853225708, + "learning_rate": 3.857868020304569e-06, + "loss": 0.7301, + "step": 16363 + }, + { + "epoch": 9.22955442752397, + "grad_norm": 0.9386510848999023, + "learning_rate": 3.855047941342358e-06, + "loss": 0.7905, + "step": 16364 + }, + { + "epoch": 9.230118443316412, + "grad_norm": 1.4532979726791382, + "learning_rate": 3.852227862380147e-06, + "loss": 0.7349, + "step": 16365 + }, + { + "epoch": 9.230682459108856, + "grad_norm": 0.8756967186927795, + "learning_rate": 3.849407783417936e-06, + "loss": 0.6428, + "step": 16366 + }, + { + "epoch": 9.231246474901297, + "grad_norm": 1.2149410247802734, + "learning_rate": 3.846587704455725e-06, + "loss": 0.6607, + "step": 16367 + }, + { + "epoch": 9.23181049069374, + "grad_norm": 1.1220312118530273, + "learning_rate": 3.843767625493514e-06, + "loss": 0.7298, + "step": 16368 + }, + { + "epoch": 9.232374506486181, + "grad_norm": 0.9511142373085022, + "learning_rate": 3.840947546531303e-06, + "loss": 0.7316, + "step": 16369 + }, + { + "epoch": 9.232938522278625, + "grad_norm": 0.9670475721359253, + "learning_rate": 3.838127467569092e-06, + "loss": 0.6951, + "step": 16370 + }, + { + "epoch": 9.233502538071066, + "grad_norm": 1.575133204460144, + "learning_rate": 3.835307388606881e-06, + "loss": 0.8264, + "step": 16371 + }, + { + "epoch": 9.234066553863508, + "grad_norm": 0.8753059506416321, + "learning_rate": 3.832487309644671e-06, + "loss": 0.6534, + "step": 16372 + }, + { + "epoch": 9.23463056965595, + "grad_norm": 1.304291844367981, + "learning_rate": 3.829667230682459e-06, + "loss": 0.8239, + "step": 16373 + }, + { + "epoch": 9.235194585448392, + "grad_norm": 1.042163610458374, + "learning_rate": 3.826847151720248e-06, + "loss": 0.6718, + "step": 16374 + }, + { + "epoch": 9.235758601240835, + "grad_norm": 0.9488685131072998, + "learning_rate": 3.824027072758037e-06, + "loss": 0.7627, + "step": 16375 + }, + { + "epoch": 9.236322617033277, + "grad_norm": 0.9594658017158508, + "learning_rate": 3.821206993795827e-06, + "loss": 0.6562, + "step": 16376 + }, + { + "epoch": 9.236886632825719, + "grad_norm": 0.8872804045677185, + "learning_rate": 3.8183869148336154e-06, + "loss": 0.6833, + "step": 16377 + }, + { + "epoch": 9.237450648618161, + "grad_norm": 0.8673959970474243, + "learning_rate": 3.815566835871404e-06, + "loss": 0.7145, + "step": 16378 + }, + { + "epoch": 9.238014664410603, + "grad_norm": 1.0264875888824463, + "learning_rate": 3.8127467569091934e-06, + "loss": 0.6381, + "step": 16379 + }, + { + "epoch": 9.238578680203046, + "grad_norm": 1.1855684518814087, + "learning_rate": 3.8099266779469822e-06, + "loss": 0.7277, + "step": 16380 + }, + { + "epoch": 9.239142695995488, + "grad_norm": 1.8306342363357544, + "learning_rate": 3.807106598984772e-06, + "loss": 0.8437, + "step": 16381 + }, + { + "epoch": 9.23970671178793, + "grad_norm": 1.3586798906326294, + "learning_rate": 3.804286520022561e-06, + "loss": 0.755, + "step": 16382 + }, + { + "epoch": 9.240270727580372, + "grad_norm": 1.4058552980422974, + "learning_rate": 3.80146644106035e-06, + "loss": 0.8055, + "step": 16383 + }, + { + "epoch": 9.240834743372815, + "grad_norm": 0.8106296062469482, + "learning_rate": 3.798646362098139e-06, + "loss": 0.6812, + "step": 16384 + }, + { + "epoch": 9.241398759165257, + "grad_norm": 0.8867407441139221, + "learning_rate": 3.7958262831359283e-06, + "loss": 0.6512, + "step": 16385 + }, + { + "epoch": 9.241962774957699, + "grad_norm": 1.1249812841415405, + "learning_rate": 3.793006204173717e-06, + "loss": 0.7235, + "step": 16386 + }, + { + "epoch": 9.24252679075014, + "grad_norm": 1.0722837448120117, + "learning_rate": 3.7901861252115064e-06, + "loss": 0.6487, + "step": 16387 + }, + { + "epoch": 9.243090806542583, + "grad_norm": 1.4122790098190308, + "learning_rate": 3.787366046249295e-06, + "loss": 0.8133, + "step": 16388 + }, + { + "epoch": 9.243654822335026, + "grad_norm": 1.007272720336914, + "learning_rate": 3.7845459672870844e-06, + "loss": 0.6393, + "step": 16389 + }, + { + "epoch": 9.244218838127468, + "grad_norm": 1.18426513671875, + "learning_rate": 3.781725888324873e-06, + "loss": 0.6921, + "step": 16390 + }, + { + "epoch": 9.24478285391991, + "grad_norm": 0.9851979613304138, + "learning_rate": 3.7789058093626624e-06, + "loss": 0.6339, + "step": 16391 + }, + { + "epoch": 9.245346869712352, + "grad_norm": 1.132017731666565, + "learning_rate": 3.776085730400451e-06, + "loss": 0.7835, + "step": 16392 + }, + { + "epoch": 9.245910885504793, + "grad_norm": 1.2655640840530396, + "learning_rate": 3.7732656514382404e-06, + "loss": 0.7044, + "step": 16393 + }, + { + "epoch": 9.246474901297237, + "grad_norm": 1.1246557235717773, + "learning_rate": 3.770445572476029e-06, + "loss": 0.6585, + "step": 16394 + }, + { + "epoch": 9.247038917089679, + "grad_norm": 1.1935596466064453, + "learning_rate": 3.7676254935138184e-06, + "loss": 0.7234, + "step": 16395 + }, + { + "epoch": 9.24760293288212, + "grad_norm": 1.0913729667663574, + "learning_rate": 3.7648054145516072e-06, + "loss": 0.6992, + "step": 16396 + }, + { + "epoch": 9.248166948674562, + "grad_norm": 1.5249501466751099, + "learning_rate": 3.7619853355893964e-06, + "loss": 0.7102, + "step": 16397 + }, + { + "epoch": 9.248730964467006, + "grad_norm": 1.1176624298095703, + "learning_rate": 3.759165256627186e-06, + "loss": 0.7644, + "step": 16398 + }, + { + "epoch": 9.249294980259448, + "grad_norm": 0.8384261727333069, + "learning_rate": 3.7563451776649753e-06, + "loss": 0.6413, + "step": 16399 + }, + { + "epoch": 9.24985899605189, + "grad_norm": 1.4488011598587036, + "learning_rate": 3.753525098702764e-06, + "loss": 0.7436, + "step": 16400 + }, + { + "epoch": 9.250423011844331, + "grad_norm": 1.2197239398956299, + "learning_rate": 3.7507050197405533e-06, + "loss": 0.739, + "step": 16401 + }, + { + "epoch": 9.250987027636773, + "grad_norm": 1.0071941614151, + "learning_rate": 3.747884940778342e-06, + "loss": 0.7749, + "step": 16402 + }, + { + "epoch": 9.251551043429217, + "grad_norm": 1.4101998805999756, + "learning_rate": 3.7450648618161313e-06, + "loss": 0.7542, + "step": 16403 + }, + { + "epoch": 9.252115059221659, + "grad_norm": 1.5109493732452393, + "learning_rate": 3.74224478285392e-06, + "loss": 0.761, + "step": 16404 + }, + { + "epoch": 9.2526790750141, + "grad_norm": 1.0144349336624146, + "learning_rate": 3.7394247038917094e-06, + "loss": 0.7182, + "step": 16405 + }, + { + "epoch": 9.253243090806542, + "grad_norm": 0.7757428288459778, + "learning_rate": 3.736604624929498e-06, + "loss": 0.6254, + "step": 16406 + }, + { + "epoch": 9.253807106598984, + "grad_norm": 0.9152683019638062, + "learning_rate": 3.7337845459672874e-06, + "loss": 0.6564, + "step": 16407 + }, + { + "epoch": 9.254371122391428, + "grad_norm": 1.7274845838546753, + "learning_rate": 3.730964467005076e-06, + "loss": 0.8552, + "step": 16408 + }, + { + "epoch": 9.25493513818387, + "grad_norm": 0.8925145268440247, + "learning_rate": 3.7281443880428654e-06, + "loss": 0.7295, + "step": 16409 + }, + { + "epoch": 9.255499153976311, + "grad_norm": 1.1113619804382324, + "learning_rate": 3.725324309080654e-06, + "loss": 0.5837, + "step": 16410 + }, + { + "epoch": 9.256063169768753, + "grad_norm": 0.906386137008667, + "learning_rate": 3.7225042301184434e-06, + "loss": 0.7441, + "step": 16411 + }, + { + "epoch": 9.256627185561197, + "grad_norm": 1.4307911396026611, + "learning_rate": 3.719684151156232e-06, + "loss": 0.6889, + "step": 16412 + }, + { + "epoch": 9.257191201353638, + "grad_norm": 1.2138416767120361, + "learning_rate": 3.7168640721940214e-06, + "loss": 0.7658, + "step": 16413 + }, + { + "epoch": 9.25775521714608, + "grad_norm": 1.3559819459915161, + "learning_rate": 3.7140439932318102e-06, + "loss": 0.7638, + "step": 16414 + }, + { + "epoch": 9.258319232938522, + "grad_norm": 0.949389636516571, + "learning_rate": 3.7112239142695994e-06, + "loss": 0.6337, + "step": 16415 + }, + { + "epoch": 9.258883248730964, + "grad_norm": 1.2861040830612183, + "learning_rate": 3.708403835307389e-06, + "loss": 0.7679, + "step": 16416 + }, + { + "epoch": 9.259447264523407, + "grad_norm": 0.9065354466438293, + "learning_rate": 3.7055837563451783e-06, + "loss": 0.6604, + "step": 16417 + }, + { + "epoch": 9.26001128031585, + "grad_norm": 0.8991524577140808, + "learning_rate": 3.702763677382967e-06, + "loss": 0.7404, + "step": 16418 + }, + { + "epoch": 9.260575296108291, + "grad_norm": 1.147009015083313, + "learning_rate": 3.6999435984207563e-06, + "loss": 0.6609, + "step": 16419 + }, + { + "epoch": 9.261139311900733, + "grad_norm": 1.1979022026062012, + "learning_rate": 3.697123519458545e-06, + "loss": 0.7187, + "step": 16420 + }, + { + "epoch": 9.261703327693175, + "grad_norm": 0.9631653428077698, + "learning_rate": 3.6943034404963343e-06, + "loss": 0.6736, + "step": 16421 + }, + { + "epoch": 9.262267343485618, + "grad_norm": 1.0886479616165161, + "learning_rate": 3.691483361534123e-06, + "loss": 0.7887, + "step": 16422 + }, + { + "epoch": 9.26283135927806, + "grad_norm": 1.1131082773208618, + "learning_rate": 3.6886632825719123e-06, + "loss": 0.7643, + "step": 16423 + }, + { + "epoch": 9.263395375070502, + "grad_norm": 1.6161986589431763, + "learning_rate": 3.685843203609701e-06, + "loss": 0.7892, + "step": 16424 + }, + { + "epoch": 9.263959390862944, + "grad_norm": 1.1295520067214966, + "learning_rate": 3.6830231246474904e-06, + "loss": 0.7841, + "step": 16425 + }, + { + "epoch": 9.264523406655387, + "grad_norm": 0.8409298658370972, + "learning_rate": 3.680203045685279e-06, + "loss": 0.7009, + "step": 16426 + }, + { + "epoch": 9.265087422447829, + "grad_norm": 1.2283210754394531, + "learning_rate": 3.6773829667230684e-06, + "loss": 0.7323, + "step": 16427 + }, + { + "epoch": 9.26565143824027, + "grad_norm": 0.9847985506057739, + "learning_rate": 3.674562887760857e-06, + "loss": 0.668, + "step": 16428 + }, + { + "epoch": 9.266215454032713, + "grad_norm": 1.449109673500061, + "learning_rate": 3.6717428087986464e-06, + "loss": 0.6631, + "step": 16429 + }, + { + "epoch": 9.266779469825154, + "grad_norm": 1.074863314628601, + "learning_rate": 3.668922729836435e-06, + "loss": 0.7034, + "step": 16430 + }, + { + "epoch": 9.267343485617598, + "grad_norm": 1.1922204494476318, + "learning_rate": 3.6661026508742244e-06, + "loss": 0.8057, + "step": 16431 + }, + { + "epoch": 9.26790750141004, + "grad_norm": 1.0452444553375244, + "learning_rate": 3.6632825719120132e-06, + "loss": 0.7414, + "step": 16432 + }, + { + "epoch": 9.268471517202482, + "grad_norm": 1.3434141874313354, + "learning_rate": 3.6604624929498024e-06, + "loss": 0.8481, + "step": 16433 + }, + { + "epoch": 9.269035532994923, + "grad_norm": 1.014459490776062, + "learning_rate": 3.657642413987592e-06, + "loss": 0.6372, + "step": 16434 + }, + { + "epoch": 9.269599548787365, + "grad_norm": 1.0262850522994995, + "learning_rate": 3.6548223350253813e-06, + "loss": 0.7229, + "step": 16435 + }, + { + "epoch": 9.270163564579809, + "grad_norm": 1.245778203010559, + "learning_rate": 3.65200225606317e-06, + "loss": 0.8358, + "step": 16436 + }, + { + "epoch": 9.27072758037225, + "grad_norm": 1.1421664953231812, + "learning_rate": 3.6491821771009593e-06, + "loss": 0.7617, + "step": 16437 + }, + { + "epoch": 9.271291596164692, + "grad_norm": 1.1073672771453857, + "learning_rate": 3.646362098138748e-06, + "loss": 0.7171, + "step": 16438 + }, + { + "epoch": 9.271855611957134, + "grad_norm": 1.1652320623397827, + "learning_rate": 3.6435420191765373e-06, + "loss": 0.8072, + "step": 16439 + }, + { + "epoch": 9.272419627749578, + "grad_norm": 0.9918177127838135, + "learning_rate": 3.640721940214326e-06, + "loss": 0.6787, + "step": 16440 + }, + { + "epoch": 9.27298364354202, + "grad_norm": 1.090867280960083, + "learning_rate": 3.6379018612521153e-06, + "loss": 0.6336, + "step": 16441 + }, + { + "epoch": 9.273547659334461, + "grad_norm": 1.0593253374099731, + "learning_rate": 3.635081782289904e-06, + "loss": 0.8603, + "step": 16442 + }, + { + "epoch": 9.274111675126903, + "grad_norm": 1.1066784858703613, + "learning_rate": 3.6322617033276934e-06, + "loss": 0.7555, + "step": 16443 + }, + { + "epoch": 9.274675690919345, + "grad_norm": 1.5276596546173096, + "learning_rate": 3.629441624365482e-06, + "loss": 0.7791, + "step": 16444 + }, + { + "epoch": 9.275239706711789, + "grad_norm": 1.0579782724380493, + "learning_rate": 3.6266215454032714e-06, + "loss": 0.756, + "step": 16445 + }, + { + "epoch": 9.27580372250423, + "grad_norm": 1.3068437576293945, + "learning_rate": 3.62380146644106e-06, + "loss": 0.7259, + "step": 16446 + }, + { + "epoch": 9.276367738296672, + "grad_norm": 1.3601034879684448, + "learning_rate": 3.6209813874788494e-06, + "loss": 0.7447, + "step": 16447 + }, + { + "epoch": 9.276931754089114, + "grad_norm": 1.0105695724487305, + "learning_rate": 3.618161308516638e-06, + "loss": 0.7635, + "step": 16448 + }, + { + "epoch": 9.277495769881556, + "grad_norm": 1.0441616773605347, + "learning_rate": 3.6153412295544274e-06, + "loss": 0.7216, + "step": 16449 + }, + { + "epoch": 9.278059785674, + "grad_norm": 1.1937531232833862, + "learning_rate": 3.6125211505922166e-06, + "loss": 0.659, + "step": 16450 + }, + { + "epoch": 9.278623801466441, + "grad_norm": 1.1117618083953857, + "learning_rate": 3.6097010716300063e-06, + "loss": 0.7102, + "step": 16451 + }, + { + "epoch": 9.279187817258883, + "grad_norm": 0.9539058804512024, + "learning_rate": 3.606880992667795e-06, + "loss": 0.7908, + "step": 16452 + }, + { + "epoch": 9.279751833051325, + "grad_norm": 0.9517561197280884, + "learning_rate": 3.6040609137055843e-06, + "loss": 0.6454, + "step": 16453 + }, + { + "epoch": 9.280315848843768, + "grad_norm": 1.3464738130569458, + "learning_rate": 3.601240834743373e-06, + "loss": 0.7456, + "step": 16454 + }, + { + "epoch": 9.28087986463621, + "grad_norm": 1.1512951850891113, + "learning_rate": 3.5984207557811623e-06, + "loss": 0.8327, + "step": 16455 + }, + { + "epoch": 9.281443880428652, + "grad_norm": 1.0664267539978027, + "learning_rate": 3.595600676818951e-06, + "loss": 0.6896, + "step": 16456 + }, + { + "epoch": 9.282007896221094, + "grad_norm": 1.3499622344970703, + "learning_rate": 3.5927805978567403e-06, + "loss": 0.7205, + "step": 16457 + }, + { + "epoch": 9.282571912013536, + "grad_norm": 1.4003639221191406, + "learning_rate": 3.589960518894529e-06, + "loss": 0.7472, + "step": 16458 + }, + { + "epoch": 9.28313592780598, + "grad_norm": 1.267612099647522, + "learning_rate": 3.5871404399323183e-06, + "loss": 0.6755, + "step": 16459 + }, + { + "epoch": 9.283699943598421, + "grad_norm": 0.9734606146812439, + "learning_rate": 3.584320360970107e-06, + "loss": 0.7992, + "step": 16460 + }, + { + "epoch": 9.284263959390863, + "grad_norm": 1.136357307434082, + "learning_rate": 3.5815002820078964e-06, + "loss": 0.8005, + "step": 16461 + }, + { + "epoch": 9.284827975183305, + "grad_norm": 1.0804251432418823, + "learning_rate": 3.578680203045685e-06, + "loss": 0.8093, + "step": 16462 + }, + { + "epoch": 9.285391990975747, + "grad_norm": 1.1486786603927612, + "learning_rate": 3.5758601240834744e-06, + "loss": 0.6465, + "step": 16463 + }, + { + "epoch": 9.28595600676819, + "grad_norm": 1.0917366743087769, + "learning_rate": 3.5730400451212636e-06, + "loss": 0.6995, + "step": 16464 + }, + { + "epoch": 9.286520022560632, + "grad_norm": 1.0399529933929443, + "learning_rate": 3.5702199661590524e-06, + "loss": 0.7391, + "step": 16465 + }, + { + "epoch": 9.287084038353074, + "grad_norm": 1.2323880195617676, + "learning_rate": 3.5673998871968416e-06, + "loss": 0.8179, + "step": 16466 + }, + { + "epoch": 9.287648054145516, + "grad_norm": 1.341059684753418, + "learning_rate": 3.5645798082346304e-06, + "loss": 0.7698, + "step": 16467 + }, + { + "epoch": 9.28821206993796, + "grad_norm": 1.2727906703948975, + "learning_rate": 3.5617597292724196e-06, + "loss": 0.7143, + "step": 16468 + }, + { + "epoch": 9.288776085730401, + "grad_norm": 1.3998112678527832, + "learning_rate": 3.5589396503102093e-06, + "loss": 0.6262, + "step": 16469 + }, + { + "epoch": 9.289340101522843, + "grad_norm": 1.3615704774856567, + "learning_rate": 3.556119571347998e-06, + "loss": 0.7359, + "step": 16470 + }, + { + "epoch": 9.289904117315285, + "grad_norm": 0.8815415501594543, + "learning_rate": 3.5532994923857873e-06, + "loss": 0.6003, + "step": 16471 + }, + { + "epoch": 9.290468133107726, + "grad_norm": 1.155030608177185, + "learning_rate": 3.550479413423576e-06, + "loss": 0.6666, + "step": 16472 + }, + { + "epoch": 9.29103214890017, + "grad_norm": 1.2976983785629272, + "learning_rate": 3.5476593344613653e-06, + "loss": 0.7665, + "step": 16473 + }, + { + "epoch": 9.291596164692612, + "grad_norm": 1.1728723049163818, + "learning_rate": 3.544839255499154e-06, + "loss": 0.7197, + "step": 16474 + }, + { + "epoch": 9.292160180485054, + "grad_norm": 0.9556059241294861, + "learning_rate": 3.5420191765369433e-06, + "loss": 0.7517, + "step": 16475 + }, + { + "epoch": 9.292724196277495, + "grad_norm": 0.9437292814254761, + "learning_rate": 3.539199097574732e-06, + "loss": 0.6954, + "step": 16476 + }, + { + "epoch": 9.293288212069937, + "grad_norm": 1.407457947731018, + "learning_rate": 3.5363790186125213e-06, + "loss": 0.8997, + "step": 16477 + }, + { + "epoch": 9.29385222786238, + "grad_norm": 1.1240015029907227, + "learning_rate": 3.53355893965031e-06, + "loss": 0.7116, + "step": 16478 + }, + { + "epoch": 9.294416243654823, + "grad_norm": 1.6304765939712524, + "learning_rate": 3.5307388606880994e-06, + "loss": 0.7868, + "step": 16479 + }, + { + "epoch": 9.294980259447264, + "grad_norm": 1.0289937257766724, + "learning_rate": 3.5279187817258886e-06, + "loss": 0.7284, + "step": 16480 + }, + { + "epoch": 9.295544275239706, + "grad_norm": 1.2204591035842896, + "learning_rate": 3.5250987027636774e-06, + "loss": 0.675, + "step": 16481 + }, + { + "epoch": 9.29610829103215, + "grad_norm": 1.1667450666427612, + "learning_rate": 3.5222786238014666e-06, + "loss": 0.7449, + "step": 16482 + }, + { + "epoch": 9.296672306824592, + "grad_norm": 1.2177375555038452, + "learning_rate": 3.5194585448392554e-06, + "loss": 0.7978, + "step": 16483 + }, + { + "epoch": 9.297236322617033, + "grad_norm": 1.2106996774673462, + "learning_rate": 3.5166384658770446e-06, + "loss": 0.7464, + "step": 16484 + }, + { + "epoch": 9.297800338409475, + "grad_norm": 1.4360897541046143, + "learning_rate": 3.5138183869148334e-06, + "loss": 0.753, + "step": 16485 + }, + { + "epoch": 9.298364354201917, + "grad_norm": 1.0538443326950073, + "learning_rate": 3.5109983079526226e-06, + "loss": 0.7191, + "step": 16486 + }, + { + "epoch": 9.29892836999436, + "grad_norm": 0.9887361526489258, + "learning_rate": 3.5081782289904123e-06, + "loss": 0.718, + "step": 16487 + }, + { + "epoch": 9.299492385786802, + "grad_norm": 1.4421221017837524, + "learning_rate": 3.505358150028201e-06, + "loss": 0.8277, + "step": 16488 + }, + { + "epoch": 9.300056401579244, + "grad_norm": 1.4844828844070435, + "learning_rate": 3.5025380710659903e-06, + "loss": 0.6858, + "step": 16489 + }, + { + "epoch": 9.300620417371686, + "grad_norm": 1.1845767498016357, + "learning_rate": 3.499717992103779e-06, + "loss": 0.7358, + "step": 16490 + }, + { + "epoch": 9.301184433164128, + "grad_norm": 1.2004963159561157, + "learning_rate": 3.4968979131415683e-06, + "loss": 0.7571, + "step": 16491 + }, + { + "epoch": 9.301748448956571, + "grad_norm": 1.3404057025909424, + "learning_rate": 3.494077834179357e-06, + "loss": 0.871, + "step": 16492 + }, + { + "epoch": 9.302312464749013, + "grad_norm": 1.006629467010498, + "learning_rate": 3.4912577552171463e-06, + "loss": 0.7785, + "step": 16493 + }, + { + "epoch": 9.302876480541455, + "grad_norm": 0.8998542428016663, + "learning_rate": 3.4884376762549355e-06, + "loss": 0.6186, + "step": 16494 + }, + { + "epoch": 9.303440496333897, + "grad_norm": 1.3028407096862793, + "learning_rate": 3.4856175972927243e-06, + "loss": 0.7562, + "step": 16495 + }, + { + "epoch": 9.30400451212634, + "grad_norm": 0.9920302629470825, + "learning_rate": 3.4827975183305136e-06, + "loss": 0.7329, + "step": 16496 + }, + { + "epoch": 9.304568527918782, + "grad_norm": 1.151939868927002, + "learning_rate": 3.4799774393683024e-06, + "loss": 0.7673, + "step": 16497 + }, + { + "epoch": 9.305132543711224, + "grad_norm": 1.1293411254882812, + "learning_rate": 3.4771573604060916e-06, + "loss": 0.7703, + "step": 16498 + }, + { + "epoch": 9.305696559503666, + "grad_norm": 1.2062007188796997, + "learning_rate": 3.4743372814438804e-06, + "loss": 0.6706, + "step": 16499 + }, + { + "epoch": 9.306260575296108, + "grad_norm": 1.2104545831680298, + "learning_rate": 3.4715172024816696e-06, + "loss": 0.6666, + "step": 16500 + }, + { + "epoch": 9.306824591088551, + "grad_norm": 0.8663122057914734, + "learning_rate": 3.4686971235194584e-06, + "loss": 0.6976, + "step": 16501 + }, + { + "epoch": 9.307388606880993, + "grad_norm": 1.1955935955047607, + "learning_rate": 3.4658770445572476e-06, + "loss": 0.6758, + "step": 16502 + }, + { + "epoch": 9.307952622673435, + "grad_norm": 1.1505755186080933, + "learning_rate": 3.4630569655950364e-06, + "loss": 0.8311, + "step": 16503 + }, + { + "epoch": 9.308516638465877, + "grad_norm": 1.112964391708374, + "learning_rate": 3.460236886632826e-06, + "loss": 0.7557, + "step": 16504 + }, + { + "epoch": 9.309080654258318, + "grad_norm": 0.9214155077934265, + "learning_rate": 3.4574168076706153e-06, + "loss": 0.7273, + "step": 16505 + }, + { + "epoch": 9.309644670050762, + "grad_norm": 1.2613630294799805, + "learning_rate": 3.454596728708404e-06, + "loss": 0.703, + "step": 16506 + }, + { + "epoch": 9.310208685843204, + "grad_norm": 1.1074460744857788, + "learning_rate": 3.4517766497461933e-06, + "loss": 0.6933, + "step": 16507 + }, + { + "epoch": 9.310772701635646, + "grad_norm": 1.2477415800094604, + "learning_rate": 3.4489565707839825e-06, + "loss": 0.7251, + "step": 16508 + }, + { + "epoch": 9.311336717428087, + "grad_norm": 1.0939055681228638, + "learning_rate": 3.4461364918217713e-06, + "loss": 0.6983, + "step": 16509 + }, + { + "epoch": 9.311900733220531, + "grad_norm": 1.0647469758987427, + "learning_rate": 3.4433164128595605e-06, + "loss": 0.7045, + "step": 16510 + }, + { + "epoch": 9.312464749012973, + "grad_norm": 0.9466874599456787, + "learning_rate": 3.4404963338973493e-06, + "loss": 0.7215, + "step": 16511 + }, + { + "epoch": 9.313028764805415, + "grad_norm": 0.8201346397399902, + "learning_rate": 3.4376762549351385e-06, + "loss": 0.651, + "step": 16512 + }, + { + "epoch": 9.313592780597856, + "grad_norm": 0.9461120367050171, + "learning_rate": 3.4348561759729273e-06, + "loss": 0.7223, + "step": 16513 + }, + { + "epoch": 9.314156796390298, + "grad_norm": 1.2067070007324219, + "learning_rate": 3.4320360970107166e-06, + "loss": 0.68, + "step": 16514 + }, + { + "epoch": 9.314720812182742, + "grad_norm": 1.5006030797958374, + "learning_rate": 3.4292160180485053e-06, + "loss": 0.7033, + "step": 16515 + }, + { + "epoch": 9.315284827975184, + "grad_norm": 1.0839699506759644, + "learning_rate": 3.4263959390862946e-06, + "loss": 0.6834, + "step": 16516 + }, + { + "epoch": 9.315848843767625, + "grad_norm": 1.1173810958862305, + "learning_rate": 3.4235758601240834e-06, + "loss": 0.7152, + "step": 16517 + }, + { + "epoch": 9.316412859560067, + "grad_norm": 1.0295932292938232, + "learning_rate": 3.4207557811618726e-06, + "loss": 0.6398, + "step": 16518 + }, + { + "epoch": 9.316976875352509, + "grad_norm": 1.0739258527755737, + "learning_rate": 3.4179357021996614e-06, + "loss": 0.7358, + "step": 16519 + }, + { + "epoch": 9.317540891144953, + "grad_norm": 1.2729374170303345, + "learning_rate": 3.4151156232374506e-06, + "loss": 0.7492, + "step": 16520 + }, + { + "epoch": 9.318104906937394, + "grad_norm": 1.1524943113327026, + "learning_rate": 3.4122955442752394e-06, + "loss": 0.8115, + "step": 16521 + }, + { + "epoch": 9.318668922729836, + "grad_norm": 0.9205895662307739, + "learning_rate": 3.409475465313029e-06, + "loss": 0.733, + "step": 16522 + }, + { + "epoch": 9.319232938522278, + "grad_norm": 1.60706627368927, + "learning_rate": 3.4066553863508183e-06, + "loss": 0.7923, + "step": 16523 + }, + { + "epoch": 9.319796954314722, + "grad_norm": 0.9933270215988159, + "learning_rate": 3.4038353073886075e-06, + "loss": 0.6971, + "step": 16524 + }, + { + "epoch": 9.320360970107163, + "grad_norm": 0.9633849263191223, + "learning_rate": 3.4010152284263963e-06, + "loss": 0.7301, + "step": 16525 + }, + { + "epoch": 9.320924985899605, + "grad_norm": 1.304128885269165, + "learning_rate": 3.3981951494641855e-06, + "loss": 0.7175, + "step": 16526 + }, + { + "epoch": 9.321489001692047, + "grad_norm": 1.2032274007797241, + "learning_rate": 3.3953750705019743e-06, + "loss": 0.722, + "step": 16527 + }, + { + "epoch": 9.322053017484489, + "grad_norm": 1.4419864416122437, + "learning_rate": 3.3925549915397635e-06, + "loss": 0.7604, + "step": 16528 + }, + { + "epoch": 9.322617033276932, + "grad_norm": 1.3006367683410645, + "learning_rate": 3.3897349125775523e-06, + "loss": 0.76, + "step": 16529 + }, + { + "epoch": 9.323181049069374, + "grad_norm": 1.0416475534439087, + "learning_rate": 3.3869148336153415e-06, + "loss": 0.7907, + "step": 16530 + }, + { + "epoch": 9.323745064861816, + "grad_norm": 0.9530289769172668, + "learning_rate": 3.3840947546531303e-06, + "loss": 0.774, + "step": 16531 + }, + { + "epoch": 9.324309080654258, + "grad_norm": 1.0401359796524048, + "learning_rate": 3.3812746756909195e-06, + "loss": 0.6996, + "step": 16532 + }, + { + "epoch": 9.3248730964467, + "grad_norm": 1.2409985065460205, + "learning_rate": 3.3784545967287083e-06, + "loss": 0.7785, + "step": 16533 + }, + { + "epoch": 9.325437112239143, + "grad_norm": 1.1846033334732056, + "learning_rate": 3.3756345177664976e-06, + "loss": 0.754, + "step": 16534 + }, + { + "epoch": 9.326001128031585, + "grad_norm": 0.6804112792015076, + "learning_rate": 3.3728144388042864e-06, + "loss": 0.5612, + "step": 16535 + }, + { + "epoch": 9.326565143824027, + "grad_norm": 1.1418089866638184, + "learning_rate": 3.3699943598420756e-06, + "loss": 0.699, + "step": 16536 + }, + { + "epoch": 9.327129159616469, + "grad_norm": 1.3662128448486328, + "learning_rate": 3.3671742808798644e-06, + "loss": 0.6615, + "step": 16537 + }, + { + "epoch": 9.327693175408912, + "grad_norm": 1.095058798789978, + "learning_rate": 3.3643542019176536e-06, + "loss": 0.6961, + "step": 16538 + }, + { + "epoch": 9.328257191201354, + "grad_norm": 1.511971116065979, + "learning_rate": 3.3615341229554432e-06, + "loss": 0.7188, + "step": 16539 + }, + { + "epoch": 9.328821206993796, + "grad_norm": 1.198433518409729, + "learning_rate": 3.3587140439932325e-06, + "loss": 0.7085, + "step": 16540 + }, + { + "epoch": 9.329385222786238, + "grad_norm": 1.0739766359329224, + "learning_rate": 3.3558939650310213e-06, + "loss": 0.7506, + "step": 16541 + }, + { + "epoch": 9.32994923857868, + "grad_norm": 1.0130585432052612, + "learning_rate": 3.3530738860688105e-06, + "loss": 0.689, + "step": 16542 + }, + { + "epoch": 9.330513254371123, + "grad_norm": 1.3540053367614746, + "learning_rate": 3.3502538071065993e-06, + "loss": 0.7386, + "step": 16543 + }, + { + "epoch": 9.331077270163565, + "grad_norm": 1.303062915802002, + "learning_rate": 3.3474337281443885e-06, + "loss": 0.7606, + "step": 16544 + }, + { + "epoch": 9.331641285956007, + "grad_norm": 1.2810580730438232, + "learning_rate": 3.3446136491821773e-06, + "loss": 0.7068, + "step": 16545 + }, + { + "epoch": 9.332205301748449, + "grad_norm": 0.9497186541557312, + "learning_rate": 3.3417935702199665e-06, + "loss": 0.612, + "step": 16546 + }, + { + "epoch": 9.33276931754089, + "grad_norm": 1.2332504987716675, + "learning_rate": 3.3389734912577553e-06, + "loss": 0.7069, + "step": 16547 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 1.3536901473999023, + "learning_rate": 3.3361534122955445e-06, + "loss": 0.7441, + "step": 16548 + }, + { + "epoch": 9.333897349125776, + "grad_norm": 1.3842014074325562, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.7805, + "step": 16549 + }, + { + "epoch": 9.334461364918218, + "grad_norm": 0.8761115670204163, + "learning_rate": 3.3305132543711225e-06, + "loss": 0.6879, + "step": 16550 + }, + { + "epoch": 9.33502538071066, + "grad_norm": 1.3711817264556885, + "learning_rate": 3.3276931754089113e-06, + "loss": 0.7573, + "step": 16551 + }, + { + "epoch": 9.335589396503103, + "grad_norm": 1.2727144956588745, + "learning_rate": 3.3248730964467006e-06, + "loss": 0.8065, + "step": 16552 + }, + { + "epoch": 9.336153412295545, + "grad_norm": 1.4860931634902954, + "learning_rate": 3.3220530174844894e-06, + "loss": 0.7102, + "step": 16553 + }, + { + "epoch": 9.336717428087987, + "grad_norm": 0.9103001356124878, + "learning_rate": 3.3192329385222786e-06, + "loss": 0.6336, + "step": 16554 + }, + { + "epoch": 9.337281443880428, + "grad_norm": 1.562077522277832, + "learning_rate": 3.3164128595600674e-06, + "loss": 0.8117, + "step": 16555 + }, + { + "epoch": 9.33784545967287, + "grad_norm": 1.3585854768753052, + "learning_rate": 3.3135927805978566e-06, + "loss": 0.744, + "step": 16556 + }, + { + "epoch": 9.338409475465314, + "grad_norm": 1.0590927600860596, + "learning_rate": 3.3107727016356462e-06, + "loss": 0.7402, + "step": 16557 + }, + { + "epoch": 9.338973491257756, + "grad_norm": 0.8730952739715576, + "learning_rate": 3.3079526226734355e-06, + "loss": 0.6818, + "step": 16558 + }, + { + "epoch": 9.339537507050197, + "grad_norm": 1.1234557628631592, + "learning_rate": 3.3051325437112243e-06, + "loss": 0.7868, + "step": 16559 + }, + { + "epoch": 9.34010152284264, + "grad_norm": 1.0694090127944946, + "learning_rate": 3.3023124647490135e-06, + "loss": 0.8715, + "step": 16560 + }, + { + "epoch": 9.340665538635081, + "grad_norm": 0.894550621509552, + "learning_rate": 3.2994923857868023e-06, + "loss": 0.734, + "step": 16561 + }, + { + "epoch": 9.341229554427525, + "grad_norm": 1.1073795557022095, + "learning_rate": 3.2966723068245915e-06, + "loss": 0.6925, + "step": 16562 + }, + { + "epoch": 9.341793570219966, + "grad_norm": 1.1735429763793945, + "learning_rate": 3.2938522278623803e-06, + "loss": 0.8119, + "step": 16563 + }, + { + "epoch": 9.342357586012408, + "grad_norm": 0.9554508328437805, + "learning_rate": 3.2910321489001695e-06, + "loss": 0.6817, + "step": 16564 + }, + { + "epoch": 9.34292160180485, + "grad_norm": 1.387319564819336, + "learning_rate": 3.2882120699379583e-06, + "loss": 0.7961, + "step": 16565 + }, + { + "epoch": 9.343485617597294, + "grad_norm": 1.027084231376648, + "learning_rate": 3.2853919909757475e-06, + "loss": 0.725, + "step": 16566 + }, + { + "epoch": 9.344049633389735, + "grad_norm": 1.1149450540542603, + "learning_rate": 3.2825719120135363e-06, + "loss": 0.676, + "step": 16567 + }, + { + "epoch": 9.344613649182177, + "grad_norm": 1.421710729598999, + "learning_rate": 3.2797518330513255e-06, + "loss": 0.6999, + "step": 16568 + }, + { + "epoch": 9.345177664974619, + "grad_norm": 1.3093209266662598, + "learning_rate": 3.2769317540891143e-06, + "loss": 0.7207, + "step": 16569 + }, + { + "epoch": 9.34574168076706, + "grad_norm": 1.0960965156555176, + "learning_rate": 3.2741116751269036e-06, + "loss": 0.7209, + "step": 16570 + }, + { + "epoch": 9.346305696559504, + "grad_norm": 1.0659468173980713, + "learning_rate": 3.2712915961646924e-06, + "loss": 0.6996, + "step": 16571 + }, + { + "epoch": 9.346869712351946, + "grad_norm": 1.1388370990753174, + "learning_rate": 3.2684715172024816e-06, + "loss": 0.6019, + "step": 16572 + }, + { + "epoch": 9.347433728144388, + "grad_norm": 1.0439950227737427, + "learning_rate": 3.265651438240271e-06, + "loss": 0.6868, + "step": 16573 + }, + { + "epoch": 9.34799774393683, + "grad_norm": 0.814814031124115, + "learning_rate": 3.2628313592780596e-06, + "loss": 0.6326, + "step": 16574 + }, + { + "epoch": 9.348561759729272, + "grad_norm": 1.0245881080627441, + "learning_rate": 3.2600112803158492e-06, + "loss": 0.8069, + "step": 16575 + }, + { + "epoch": 9.349125775521715, + "grad_norm": 1.337105631828308, + "learning_rate": 3.2571912013536385e-06, + "loss": 0.7307, + "step": 16576 + }, + { + "epoch": 9.349689791314157, + "grad_norm": 1.3506011962890625, + "learning_rate": 3.2543711223914273e-06, + "loss": 0.7584, + "step": 16577 + }, + { + "epoch": 9.350253807106599, + "grad_norm": 1.2488855123519897, + "learning_rate": 3.2515510434292165e-06, + "loss": 0.5736, + "step": 16578 + }, + { + "epoch": 9.35081782289904, + "grad_norm": 0.9711638689041138, + "learning_rate": 3.2487309644670053e-06, + "loss": 0.6685, + "step": 16579 + }, + { + "epoch": 9.351381838691484, + "grad_norm": 1.2914947271347046, + "learning_rate": 3.2459108855047945e-06, + "loss": 0.8169, + "step": 16580 + }, + { + "epoch": 9.351945854483926, + "grad_norm": 0.889773428440094, + "learning_rate": 3.2430908065425833e-06, + "loss": 0.7052, + "step": 16581 + }, + { + "epoch": 9.352509870276368, + "grad_norm": 0.8724265694618225, + "learning_rate": 3.2402707275803725e-06, + "loss": 0.7044, + "step": 16582 + }, + { + "epoch": 9.35307388606881, + "grad_norm": 1.067966103553772, + "learning_rate": 3.2374506486181613e-06, + "loss": 0.7352, + "step": 16583 + }, + { + "epoch": 9.353637901861251, + "grad_norm": 1.1087759733200073, + "learning_rate": 3.2346305696559505e-06, + "loss": 0.7123, + "step": 16584 + }, + { + "epoch": 9.354201917653695, + "grad_norm": 1.167306661605835, + "learning_rate": 3.2318104906937393e-06, + "loss": 0.697, + "step": 16585 + }, + { + "epoch": 9.354765933446137, + "grad_norm": 1.1575907468795776, + "learning_rate": 3.2289904117315285e-06, + "loss": 0.7426, + "step": 16586 + }, + { + "epoch": 9.355329949238579, + "grad_norm": 1.4257128238677979, + "learning_rate": 3.2261703327693173e-06, + "loss": 0.6788, + "step": 16587 + }, + { + "epoch": 9.35589396503102, + "grad_norm": 1.1460195779800415, + "learning_rate": 3.2233502538071066e-06, + "loss": 0.7938, + "step": 16588 + }, + { + "epoch": 9.356457980823462, + "grad_norm": 1.1601982116699219, + "learning_rate": 3.2205301748448958e-06, + "loss": 0.6186, + "step": 16589 + }, + { + "epoch": 9.357021996615906, + "grad_norm": 1.117168664932251, + "learning_rate": 3.2177100958826846e-06, + "loss": 0.7678, + "step": 16590 + }, + { + "epoch": 9.357586012408348, + "grad_norm": 1.1663063764572144, + "learning_rate": 3.214890016920474e-06, + "loss": 0.6657, + "step": 16591 + }, + { + "epoch": 9.35815002820079, + "grad_norm": 1.238419532775879, + "learning_rate": 3.2120699379582634e-06, + "loss": 0.6826, + "step": 16592 + }, + { + "epoch": 9.358714043993231, + "grad_norm": 1.1534401178359985, + "learning_rate": 3.2092498589960522e-06, + "loss": 0.8033, + "step": 16593 + }, + { + "epoch": 9.359278059785675, + "grad_norm": 1.2551339864730835, + "learning_rate": 3.2064297800338414e-06, + "loss": 0.7405, + "step": 16594 + }, + { + "epoch": 9.359842075578117, + "grad_norm": 0.9843132495880127, + "learning_rate": 3.2036097010716302e-06, + "loss": 0.7379, + "step": 16595 + }, + { + "epoch": 9.360406091370558, + "grad_norm": 1.34474778175354, + "learning_rate": 3.2007896221094195e-06, + "loss": 0.7682, + "step": 16596 + }, + { + "epoch": 9.360970107163, + "grad_norm": 1.3228421211242676, + "learning_rate": 3.1979695431472083e-06, + "loss": 0.6782, + "step": 16597 + }, + { + "epoch": 9.361534122955442, + "grad_norm": 1.112168788909912, + "learning_rate": 3.1951494641849975e-06, + "loss": 0.772, + "step": 16598 + }, + { + "epoch": 9.362098138747886, + "grad_norm": 1.3182783126831055, + "learning_rate": 3.1923293852227863e-06, + "loss": 0.84, + "step": 16599 + }, + { + "epoch": 9.362662154540327, + "grad_norm": 1.36008620262146, + "learning_rate": 3.1895093062605755e-06, + "loss": 0.7873, + "step": 16600 + }, + { + "epoch": 9.36322617033277, + "grad_norm": 1.2689043283462524, + "learning_rate": 3.1866892272983643e-06, + "loss": 0.7456, + "step": 16601 + }, + { + "epoch": 9.363790186125211, + "grad_norm": 1.0811244249343872, + "learning_rate": 3.1838691483361535e-06, + "loss": 0.6622, + "step": 16602 + }, + { + "epoch": 9.364354201917653, + "grad_norm": 1.001526117324829, + "learning_rate": 3.1810490693739427e-06, + "loss": 0.6694, + "step": 16603 + }, + { + "epoch": 9.364918217710096, + "grad_norm": 0.8131436705589294, + "learning_rate": 3.1782289904117315e-06, + "loss": 0.5464, + "step": 16604 + }, + { + "epoch": 9.365482233502538, + "grad_norm": 0.9107757210731506, + "learning_rate": 3.1754089114495208e-06, + "loss": 0.74, + "step": 16605 + }, + { + "epoch": 9.36604624929498, + "grad_norm": 1.3604120016098022, + "learning_rate": 3.1725888324873095e-06, + "loss": 0.7215, + "step": 16606 + }, + { + "epoch": 9.366610265087422, + "grad_norm": 1.1646041870117188, + "learning_rate": 3.1697687535250988e-06, + "loss": 0.7654, + "step": 16607 + }, + { + "epoch": 9.367174280879865, + "grad_norm": 0.9697343111038208, + "learning_rate": 3.1669486745628876e-06, + "loss": 0.6824, + "step": 16608 + }, + { + "epoch": 9.367738296672307, + "grad_norm": 1.5547080039978027, + "learning_rate": 3.1641285956006768e-06, + "loss": 0.737, + "step": 16609 + }, + { + "epoch": 9.368302312464749, + "grad_norm": 1.2453370094299316, + "learning_rate": 3.1613085166384664e-06, + "loss": 0.741, + "step": 16610 + }, + { + "epoch": 9.368866328257191, + "grad_norm": 1.5490858554840088, + "learning_rate": 3.1584884376762552e-06, + "loss": 0.7429, + "step": 16611 + }, + { + "epoch": 9.369430344049633, + "grad_norm": 1.3226193189620972, + "learning_rate": 3.1556683587140444e-06, + "loss": 0.7769, + "step": 16612 + }, + { + "epoch": 9.369994359842076, + "grad_norm": 1.4058074951171875, + "learning_rate": 3.1528482797518332e-06, + "loss": 0.8472, + "step": 16613 + }, + { + "epoch": 9.370558375634518, + "grad_norm": 1.6174252033233643, + "learning_rate": 3.1500282007896225e-06, + "loss": 0.688, + "step": 16614 + }, + { + "epoch": 9.37112239142696, + "grad_norm": 1.4707554578781128, + "learning_rate": 3.1472081218274113e-06, + "loss": 0.7665, + "step": 16615 + }, + { + "epoch": 9.371686407219402, + "grad_norm": 1.1165308952331543, + "learning_rate": 3.1443880428652005e-06, + "loss": 0.7118, + "step": 16616 + }, + { + "epoch": 9.372250423011844, + "grad_norm": 1.4813741445541382, + "learning_rate": 3.1415679639029897e-06, + "loss": 0.7724, + "step": 16617 + }, + { + "epoch": 9.372814438804287, + "grad_norm": 0.9620729088783264, + "learning_rate": 3.1387478849407785e-06, + "loss": 0.69, + "step": 16618 + }, + { + "epoch": 9.373378454596729, + "grad_norm": 1.1315196752548218, + "learning_rate": 3.1359278059785677e-06, + "loss": 0.7308, + "step": 16619 + }, + { + "epoch": 9.37394247038917, + "grad_norm": 1.2842700481414795, + "learning_rate": 3.1331077270163565e-06, + "loss": 0.7387, + "step": 16620 + }, + { + "epoch": 9.374506486181613, + "grad_norm": 0.987734854221344, + "learning_rate": 3.1302876480541457e-06, + "loss": 0.6336, + "step": 16621 + }, + { + "epoch": 9.375070501974056, + "grad_norm": 0.9759315848350525, + "learning_rate": 3.1274675690919345e-06, + "loss": 0.7369, + "step": 16622 + }, + { + "epoch": 9.375634517766498, + "grad_norm": 1.1729505062103271, + "learning_rate": 3.1246474901297237e-06, + "loss": 0.7521, + "step": 16623 + }, + { + "epoch": 9.37619853355894, + "grad_norm": 1.2763733863830566, + "learning_rate": 3.121827411167513e-06, + "loss": 0.6895, + "step": 16624 + }, + { + "epoch": 9.376762549351382, + "grad_norm": 1.2172881364822388, + "learning_rate": 3.119007332205302e-06, + "loss": 0.7489, + "step": 16625 + }, + { + "epoch": 9.377326565143823, + "grad_norm": 1.0222371816635132, + "learning_rate": 3.116187253243091e-06, + "loss": 0.6828, + "step": 16626 + }, + { + "epoch": 9.377890580936267, + "grad_norm": 1.064335584640503, + "learning_rate": 3.11336717428088e-06, + "loss": 0.884, + "step": 16627 + }, + { + "epoch": 9.378454596728709, + "grad_norm": 0.9215359091758728, + "learning_rate": 3.110547095318669e-06, + "loss": 0.6022, + "step": 16628 + }, + { + "epoch": 9.37901861252115, + "grad_norm": 1.0036571025848389, + "learning_rate": 3.1077270163564582e-06, + "loss": 0.7034, + "step": 16629 + }, + { + "epoch": 9.379582628313592, + "grad_norm": 0.9445674419403076, + "learning_rate": 3.104906937394247e-06, + "loss": 0.6468, + "step": 16630 + }, + { + "epoch": 9.380146644106034, + "grad_norm": 1.4008793830871582, + "learning_rate": 3.1020868584320362e-06, + "loss": 0.8048, + "step": 16631 + }, + { + "epoch": 9.380710659898478, + "grad_norm": 1.0776515007019043, + "learning_rate": 3.0992667794698255e-06, + "loss": 0.6301, + "step": 16632 + }, + { + "epoch": 9.38127467569092, + "grad_norm": 1.0065001249313354, + "learning_rate": 3.0964467005076147e-06, + "loss": 0.745, + "step": 16633 + }, + { + "epoch": 9.381838691483361, + "grad_norm": 1.1274728775024414, + "learning_rate": 3.0936266215454035e-06, + "loss": 0.7308, + "step": 16634 + }, + { + "epoch": 9.382402707275803, + "grad_norm": 0.9770369529724121, + "learning_rate": 3.0908065425831927e-06, + "loss": 0.6731, + "step": 16635 + }, + { + "epoch": 9.382966723068247, + "grad_norm": 1.2529120445251465, + "learning_rate": 3.0879864636209815e-06, + "loss": 0.7775, + "step": 16636 + }, + { + "epoch": 9.383530738860689, + "grad_norm": 1.2303893566131592, + "learning_rate": 3.0851663846587707e-06, + "loss": 0.7042, + "step": 16637 + }, + { + "epoch": 9.38409475465313, + "grad_norm": 1.0195869207382202, + "learning_rate": 3.0823463056965595e-06, + "loss": 0.8332, + "step": 16638 + }, + { + "epoch": 9.384658770445572, + "grad_norm": 0.8309354782104492, + "learning_rate": 3.0795262267343487e-06, + "loss": 0.6595, + "step": 16639 + }, + { + "epoch": 9.385222786238014, + "grad_norm": 1.0647491216659546, + "learning_rate": 3.0767061477721375e-06, + "loss": 0.6837, + "step": 16640 + }, + { + "epoch": 9.385786802030458, + "grad_norm": 1.0803806781768799, + "learning_rate": 3.073886068809927e-06, + "loss": 0.6888, + "step": 16641 + }, + { + "epoch": 9.3863508178229, + "grad_norm": 1.1996583938598633, + "learning_rate": 3.071065989847716e-06, + "loss": 0.7436, + "step": 16642 + }, + { + "epoch": 9.386914833615341, + "grad_norm": 1.304529070854187, + "learning_rate": 3.068245910885505e-06, + "loss": 0.7744, + "step": 16643 + }, + { + "epoch": 9.387478849407783, + "grad_norm": 1.4937655925750732, + "learning_rate": 3.065425831923294e-06, + "loss": 0.7539, + "step": 16644 + }, + { + "epoch": 9.388042865200225, + "grad_norm": 1.1419843435287476, + "learning_rate": 3.062605752961083e-06, + "loss": 0.7929, + "step": 16645 + }, + { + "epoch": 9.388606880992668, + "grad_norm": 1.2742058038711548, + "learning_rate": 3.059785673998872e-06, + "loss": 0.6999, + "step": 16646 + }, + { + "epoch": 9.38917089678511, + "grad_norm": 2.067535638809204, + "learning_rate": 3.0569655950366612e-06, + "loss": 0.8401, + "step": 16647 + }, + { + "epoch": 9.389734912577552, + "grad_norm": 1.899698257446289, + "learning_rate": 3.05414551607445e-06, + "loss": 0.8049, + "step": 16648 + }, + { + "epoch": 9.390298928369994, + "grad_norm": 1.3094737529754639, + "learning_rate": 3.0513254371122392e-06, + "loss": 0.7241, + "step": 16649 + }, + { + "epoch": 9.390862944162437, + "grad_norm": 0.8531742691993713, + "learning_rate": 3.0485053581500285e-06, + "loss": 0.708, + "step": 16650 + }, + { + "epoch": 9.39142695995488, + "grad_norm": 1.3763548135757446, + "learning_rate": 3.0456852791878177e-06, + "loss": 0.8406, + "step": 16651 + }, + { + "epoch": 9.391990975747321, + "grad_norm": 1.4048757553100586, + "learning_rate": 3.0428652002256065e-06, + "loss": 0.7763, + "step": 16652 + }, + { + "epoch": 9.392554991539763, + "grad_norm": 0.9941235184669495, + "learning_rate": 3.0400451212633957e-06, + "loss": 0.6232, + "step": 16653 + }, + { + "epoch": 9.393119007332205, + "grad_norm": 0.988383948802948, + "learning_rate": 3.0372250423011845e-06, + "loss": 0.6515, + "step": 16654 + }, + { + "epoch": 9.393683023124648, + "grad_norm": 1.2090483903884888, + "learning_rate": 3.0344049633389737e-06, + "loss": 0.6944, + "step": 16655 + }, + { + "epoch": 9.39424703891709, + "grad_norm": 0.9868251085281372, + "learning_rate": 3.0315848843767625e-06, + "loss": 0.6892, + "step": 16656 + }, + { + "epoch": 9.394811054709532, + "grad_norm": 1.4585535526275635, + "learning_rate": 3.0287648054145517e-06, + "loss": 0.7067, + "step": 16657 + }, + { + "epoch": 9.395375070501974, + "grad_norm": 0.8508133292198181, + "learning_rate": 3.0259447264523405e-06, + "loss": 0.6823, + "step": 16658 + }, + { + "epoch": 9.395939086294415, + "grad_norm": 0.901139497756958, + "learning_rate": 3.02312464749013e-06, + "loss": 0.7209, + "step": 16659 + }, + { + "epoch": 9.396503102086859, + "grad_norm": 1.4500855207443237, + "learning_rate": 3.020304568527919e-06, + "loss": 0.7192, + "step": 16660 + }, + { + "epoch": 9.3970671178793, + "grad_norm": 1.1796382665634155, + "learning_rate": 3.017484489565708e-06, + "loss": 0.7686, + "step": 16661 + }, + { + "epoch": 9.397631133671743, + "grad_norm": 0.9471594095230103, + "learning_rate": 3.014664410603497e-06, + "loss": 0.6626, + "step": 16662 + }, + { + "epoch": 9.398195149464184, + "grad_norm": 1.0302491188049316, + "learning_rate": 3.011844331641286e-06, + "loss": 0.7295, + "step": 16663 + }, + { + "epoch": 9.398759165256628, + "grad_norm": 1.0805795192718506, + "learning_rate": 3.009024252679075e-06, + "loss": 0.594, + "step": 16664 + }, + { + "epoch": 9.39932318104907, + "grad_norm": 1.0603877305984497, + "learning_rate": 3.0062041737168642e-06, + "loss": 0.6675, + "step": 16665 + }, + { + "epoch": 9.399887196841512, + "grad_norm": 1.0718001127243042, + "learning_rate": 3.003384094754653e-06, + "loss": 0.6663, + "step": 16666 + }, + { + "epoch": 9.400451212633953, + "grad_norm": 0.9504727721214294, + "learning_rate": 3.0005640157924422e-06, + "loss": 0.7248, + "step": 16667 + }, + { + "epoch": 9.401015228426395, + "grad_norm": 1.1042039394378662, + "learning_rate": 2.9977439368302315e-06, + "loss": 0.7597, + "step": 16668 + }, + { + "epoch": 9.401579244218839, + "grad_norm": 1.3233604431152344, + "learning_rate": 2.9949238578680207e-06, + "loss": 0.705, + "step": 16669 + }, + { + "epoch": 9.40214326001128, + "grad_norm": 1.0691808462142944, + "learning_rate": 2.9921037789058095e-06, + "loss": 0.6999, + "step": 16670 + }, + { + "epoch": 9.402707275803722, + "grad_norm": 1.1588561534881592, + "learning_rate": 2.9892836999435987e-06, + "loss": 0.7803, + "step": 16671 + }, + { + "epoch": 9.403271291596164, + "grad_norm": 1.2514506578445435, + "learning_rate": 2.9864636209813875e-06, + "loss": 0.7372, + "step": 16672 + }, + { + "epoch": 9.403835307388606, + "grad_norm": 1.346883773803711, + "learning_rate": 2.9836435420191767e-06, + "loss": 0.6877, + "step": 16673 + }, + { + "epoch": 9.40439932318105, + "grad_norm": 1.0937175750732422, + "learning_rate": 2.9808234630569655e-06, + "loss": 0.7892, + "step": 16674 + }, + { + "epoch": 9.404963338973491, + "grad_norm": 0.9270522594451904, + "learning_rate": 2.9780033840947547e-06, + "loss": 0.6766, + "step": 16675 + }, + { + "epoch": 9.405527354765933, + "grad_norm": 1.0285855531692505, + "learning_rate": 2.975183305132544e-06, + "loss": 0.7257, + "step": 16676 + }, + { + "epoch": 9.406091370558375, + "grad_norm": 1.0577521324157715, + "learning_rate": 2.972363226170333e-06, + "loss": 0.7076, + "step": 16677 + }, + { + "epoch": 9.406655386350819, + "grad_norm": 1.108322024345398, + "learning_rate": 2.969543147208122e-06, + "loss": 0.6651, + "step": 16678 + }, + { + "epoch": 9.40721940214326, + "grad_norm": 1.3857742547988892, + "learning_rate": 2.966723068245911e-06, + "loss": 0.8413, + "step": 16679 + }, + { + "epoch": 9.407783417935702, + "grad_norm": 1.1547939777374268, + "learning_rate": 2.9639029892837e-06, + "loss": 0.6439, + "step": 16680 + }, + { + "epoch": 9.408347433728144, + "grad_norm": 1.2589627504348755, + "learning_rate": 2.961082910321489e-06, + "loss": 0.7347, + "step": 16681 + }, + { + "epoch": 9.408911449520586, + "grad_norm": 1.148451566696167, + "learning_rate": 2.958262831359278e-06, + "loss": 0.7848, + "step": 16682 + }, + { + "epoch": 9.40947546531303, + "grad_norm": 1.0202194452285767, + "learning_rate": 2.9554427523970672e-06, + "loss": 0.5927, + "step": 16683 + }, + { + "epoch": 9.410039481105471, + "grad_norm": 1.3472871780395508, + "learning_rate": 2.952622673434856e-06, + "loss": 0.6928, + "step": 16684 + }, + { + "epoch": 9.410603496897913, + "grad_norm": 0.9117048382759094, + "learning_rate": 2.9498025944726457e-06, + "loss": 0.6225, + "step": 16685 + }, + { + "epoch": 9.411167512690355, + "grad_norm": 1.375108003616333, + "learning_rate": 2.9469825155104344e-06, + "loss": 0.713, + "step": 16686 + }, + { + "epoch": 9.411731528482797, + "grad_norm": 0.9759188294410706, + "learning_rate": 2.9441624365482237e-06, + "loss": 0.652, + "step": 16687 + }, + { + "epoch": 9.41229554427524, + "grad_norm": 1.178274393081665, + "learning_rate": 2.9413423575860125e-06, + "loss": 0.8055, + "step": 16688 + }, + { + "epoch": 9.412859560067682, + "grad_norm": 1.2151225805282593, + "learning_rate": 2.9385222786238017e-06, + "loss": 0.6172, + "step": 16689 + }, + { + "epoch": 9.413423575860124, + "grad_norm": 1.4598246812820435, + "learning_rate": 2.9357021996615905e-06, + "loss": 0.8058, + "step": 16690 + }, + { + "epoch": 9.413987591652566, + "grad_norm": 1.5375882387161255, + "learning_rate": 2.9328821206993797e-06, + "loss": 0.7173, + "step": 16691 + }, + { + "epoch": 9.41455160744501, + "grad_norm": 1.1798789501190186, + "learning_rate": 2.9300620417371685e-06, + "loss": 0.7691, + "step": 16692 + }, + { + "epoch": 9.415115623237451, + "grad_norm": 0.9440761208534241, + "learning_rate": 2.9272419627749577e-06, + "loss": 0.7149, + "step": 16693 + }, + { + "epoch": 9.415679639029893, + "grad_norm": 1.0349845886230469, + "learning_rate": 2.924421883812747e-06, + "loss": 0.6603, + "step": 16694 + }, + { + "epoch": 9.416243654822335, + "grad_norm": 1.2857481241226196, + "learning_rate": 2.921601804850536e-06, + "loss": 0.6698, + "step": 16695 + }, + { + "epoch": 9.416807670614777, + "grad_norm": 1.0961592197418213, + "learning_rate": 2.918781725888325e-06, + "loss": 0.6778, + "step": 16696 + }, + { + "epoch": 9.41737168640722, + "grad_norm": 0.8641760945320129, + "learning_rate": 2.915961646926114e-06, + "loss": 0.691, + "step": 16697 + }, + { + "epoch": 9.417935702199662, + "grad_norm": 0.9668837189674377, + "learning_rate": 2.913141567963903e-06, + "loss": 0.6357, + "step": 16698 + }, + { + "epoch": 9.418499717992104, + "grad_norm": 1.0515565872192383, + "learning_rate": 2.910321489001692e-06, + "loss": 0.6273, + "step": 16699 + }, + { + "epoch": 9.419063733784546, + "grad_norm": 1.5493991374969482, + "learning_rate": 2.907501410039481e-06, + "loss": 0.9418, + "step": 16700 + }, + { + "epoch": 9.419627749576987, + "grad_norm": 0.9109415411949158, + "learning_rate": 2.90468133107727e-06, + "loss": 0.6701, + "step": 16701 + }, + { + "epoch": 9.420191765369431, + "grad_norm": 1.0574884414672852, + "learning_rate": 2.901861252115059e-06, + "loss": 0.7087, + "step": 16702 + }, + { + "epoch": 9.420755781161873, + "grad_norm": 0.7367934584617615, + "learning_rate": 2.8990411731528486e-06, + "loss": 0.5872, + "step": 16703 + }, + { + "epoch": 9.421319796954315, + "grad_norm": 1.2078900337219238, + "learning_rate": 2.8962210941906374e-06, + "loss": 0.6085, + "step": 16704 + }, + { + "epoch": 9.421883812746756, + "grad_norm": 1.0100791454315186, + "learning_rate": 2.8934010152284267e-06, + "loss": 0.8015, + "step": 16705 + }, + { + "epoch": 9.4224478285392, + "grad_norm": 1.076796054840088, + "learning_rate": 2.8905809362662155e-06, + "loss": 0.6235, + "step": 16706 + }, + { + "epoch": 9.423011844331642, + "grad_norm": 1.289921522140503, + "learning_rate": 2.8877608573040047e-06, + "loss": 0.6411, + "step": 16707 + }, + { + "epoch": 9.423575860124084, + "grad_norm": 1.4197766780853271, + "learning_rate": 2.8849407783417935e-06, + "loss": 0.7643, + "step": 16708 + }, + { + "epoch": 9.424139875916525, + "grad_norm": 1.0258642435073853, + "learning_rate": 2.8821206993795827e-06, + "loss": 0.7235, + "step": 16709 + }, + { + "epoch": 9.424703891708967, + "grad_norm": 0.9239495396614075, + "learning_rate": 2.8793006204173715e-06, + "loss": 0.7181, + "step": 16710 + }, + { + "epoch": 9.42526790750141, + "grad_norm": 1.0994333028793335, + "learning_rate": 2.8764805414551607e-06, + "loss": 0.8051, + "step": 16711 + }, + { + "epoch": 9.425831923293853, + "grad_norm": 0.9622085690498352, + "learning_rate": 2.87366046249295e-06, + "loss": 0.7404, + "step": 16712 + }, + { + "epoch": 9.426395939086294, + "grad_norm": 1.0781304836273193, + "learning_rate": 2.870840383530739e-06, + "loss": 0.8119, + "step": 16713 + }, + { + "epoch": 9.426959954878736, + "grad_norm": 1.0079731941223145, + "learning_rate": 2.868020304568528e-06, + "loss": 0.8199, + "step": 16714 + }, + { + "epoch": 9.427523970671178, + "grad_norm": 1.1994268894195557, + "learning_rate": 2.865200225606317e-06, + "loss": 0.6811, + "step": 16715 + }, + { + "epoch": 9.428087986463622, + "grad_norm": 1.1032726764678955, + "learning_rate": 2.862380146644106e-06, + "loss": 0.7495, + "step": 16716 + }, + { + "epoch": 9.428652002256063, + "grad_norm": 0.916800856590271, + "learning_rate": 2.859560067681895e-06, + "loss": 0.6963, + "step": 16717 + }, + { + "epoch": 9.429216018048505, + "grad_norm": 1.3977855443954468, + "learning_rate": 2.856739988719684e-06, + "loss": 0.743, + "step": 16718 + }, + { + "epoch": 9.429780033840947, + "grad_norm": 1.1978356838226318, + "learning_rate": 2.853919909757473e-06, + "loss": 0.7179, + "step": 16719 + }, + { + "epoch": 9.43034404963339, + "grad_norm": 1.243802785873413, + "learning_rate": 2.8510998307952624e-06, + "loss": 0.696, + "step": 16720 + }, + { + "epoch": 9.430908065425832, + "grad_norm": 1.1370813846588135, + "learning_rate": 2.8482797518330516e-06, + "loss": 0.8104, + "step": 16721 + }, + { + "epoch": 9.431472081218274, + "grad_norm": 1.379589557647705, + "learning_rate": 2.8454596728708404e-06, + "loss": 0.6678, + "step": 16722 + }, + { + "epoch": 9.432036097010716, + "grad_norm": 1.4218965768814087, + "learning_rate": 2.8426395939086297e-06, + "loss": 0.7917, + "step": 16723 + }, + { + "epoch": 9.432600112803158, + "grad_norm": 0.9917271733283997, + "learning_rate": 2.8398195149464185e-06, + "loss": 0.7515, + "step": 16724 + }, + { + "epoch": 9.433164128595601, + "grad_norm": 1.2454438209533691, + "learning_rate": 2.8369994359842077e-06, + "loss": 0.8199, + "step": 16725 + }, + { + "epoch": 9.433728144388043, + "grad_norm": 1.1641814708709717, + "learning_rate": 2.834179357021997e-06, + "loss": 0.6896, + "step": 16726 + }, + { + "epoch": 9.434292160180485, + "grad_norm": 1.1557296514511108, + "learning_rate": 2.8313592780597857e-06, + "loss": 0.7148, + "step": 16727 + }, + { + "epoch": 9.434856175972927, + "grad_norm": 0.8267213106155396, + "learning_rate": 2.828539199097575e-06, + "loss": 0.7063, + "step": 16728 + }, + { + "epoch": 9.435420191765369, + "grad_norm": 0.8091318011283875, + "learning_rate": 2.825719120135364e-06, + "loss": 0.6488, + "step": 16729 + }, + { + "epoch": 9.435984207557812, + "grad_norm": 0.9142667651176453, + "learning_rate": 2.822899041173153e-06, + "loss": 0.6961, + "step": 16730 + }, + { + "epoch": 9.436548223350254, + "grad_norm": 0.8772026896476746, + "learning_rate": 2.820078962210942e-06, + "loss": 0.699, + "step": 16731 + }, + { + "epoch": 9.437112239142696, + "grad_norm": 1.1518193483352661, + "learning_rate": 2.817258883248731e-06, + "loss": 0.6586, + "step": 16732 + }, + { + "epoch": 9.437676254935138, + "grad_norm": 1.3499454259872437, + "learning_rate": 2.81443880428652e-06, + "loss": 0.6508, + "step": 16733 + }, + { + "epoch": 9.438240270727581, + "grad_norm": 1.0132962465286255, + "learning_rate": 2.8116187253243094e-06, + "loss": 0.7821, + "step": 16734 + }, + { + "epoch": 9.438804286520023, + "grad_norm": 1.0337690114974976, + "learning_rate": 2.808798646362098e-06, + "loss": 0.7152, + "step": 16735 + }, + { + "epoch": 9.439368302312465, + "grad_norm": 1.0938459634780884, + "learning_rate": 2.8059785673998874e-06, + "loss": 0.7037, + "step": 16736 + }, + { + "epoch": 9.439932318104907, + "grad_norm": 0.9915792942047119, + "learning_rate": 2.803158488437676e-06, + "loss": 0.632, + "step": 16737 + }, + { + "epoch": 9.440496333897348, + "grad_norm": 1.206157922744751, + "learning_rate": 2.8003384094754654e-06, + "loss": 0.6791, + "step": 16738 + }, + { + "epoch": 9.441060349689792, + "grad_norm": 1.06992769241333, + "learning_rate": 2.7975183305132546e-06, + "loss": 0.6713, + "step": 16739 + }, + { + "epoch": 9.441624365482234, + "grad_norm": 1.0915344953536987, + "learning_rate": 2.7946982515510434e-06, + "loss": 0.6648, + "step": 16740 + }, + { + "epoch": 9.442188381274676, + "grad_norm": 1.3664631843566895, + "learning_rate": 2.7918781725888327e-06, + "loss": 0.6866, + "step": 16741 + }, + { + "epoch": 9.442752397067117, + "grad_norm": 1.5588710308074951, + "learning_rate": 2.789058093626622e-06, + "loss": 0.7742, + "step": 16742 + }, + { + "epoch": 9.44331641285956, + "grad_norm": 1.2238761186599731, + "learning_rate": 2.7862380146644107e-06, + "loss": 0.834, + "step": 16743 + }, + { + "epoch": 9.443880428652003, + "grad_norm": 1.4626612663269043, + "learning_rate": 2.7834179357022e-06, + "loss": 0.8258, + "step": 16744 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.8520432710647583, + "learning_rate": 2.7805978567399887e-06, + "loss": 0.7414, + "step": 16745 + }, + { + "epoch": 9.445008460236886, + "grad_norm": 1.1972029209136963, + "learning_rate": 2.777777777777778e-06, + "loss": 0.7534, + "step": 16746 + }, + { + "epoch": 9.445572476029328, + "grad_norm": 1.2521939277648926, + "learning_rate": 2.774957698815567e-06, + "loss": 0.7406, + "step": 16747 + }, + { + "epoch": 9.446136491821772, + "grad_norm": 1.1170048713684082, + "learning_rate": 2.7721376198533564e-06, + "loss": 0.7119, + "step": 16748 + }, + { + "epoch": 9.446700507614214, + "grad_norm": 1.139299750328064, + "learning_rate": 2.769317540891145e-06, + "loss": 0.6469, + "step": 16749 + }, + { + "epoch": 9.447264523406655, + "grad_norm": 1.0496729612350464, + "learning_rate": 2.7664974619289344e-06, + "loss": 0.6983, + "step": 16750 + }, + { + "epoch": 9.447828539199097, + "grad_norm": 1.0867719650268555, + "learning_rate": 2.763677382966723e-06, + "loss": 0.7039, + "step": 16751 + }, + { + "epoch": 9.448392554991539, + "grad_norm": 1.1695042848587036, + "learning_rate": 2.7608573040045124e-06, + "loss": 0.6142, + "step": 16752 + }, + { + "epoch": 9.448956570783983, + "grad_norm": 1.0296430587768555, + "learning_rate": 2.758037225042301e-06, + "loss": 0.6472, + "step": 16753 + }, + { + "epoch": 9.449520586576424, + "grad_norm": 1.0591751337051392, + "learning_rate": 2.7552171460800904e-06, + "loss": 0.747, + "step": 16754 + }, + { + "epoch": 9.450084602368866, + "grad_norm": 1.268384337425232, + "learning_rate": 2.752397067117879e-06, + "loss": 0.8912, + "step": 16755 + }, + { + "epoch": 9.450648618161308, + "grad_norm": 0.8668540716171265, + "learning_rate": 2.749576988155669e-06, + "loss": 0.6448, + "step": 16756 + }, + { + "epoch": 9.45121263395375, + "grad_norm": 1.285248041152954, + "learning_rate": 2.7467569091934576e-06, + "loss": 0.6845, + "step": 16757 + }, + { + "epoch": 9.451776649746193, + "grad_norm": 0.9016695618629456, + "learning_rate": 2.743936830231247e-06, + "loss": 0.6545, + "step": 16758 + }, + { + "epoch": 9.452340665538635, + "grad_norm": 1.116599440574646, + "learning_rate": 2.7411167512690357e-06, + "loss": 0.6972, + "step": 16759 + }, + { + "epoch": 9.452904681331077, + "grad_norm": 1.4413926601409912, + "learning_rate": 2.738296672306825e-06, + "loss": 0.7381, + "step": 16760 + }, + { + "epoch": 9.453468697123519, + "grad_norm": 1.2414857149124146, + "learning_rate": 2.7354765933446137e-06, + "loss": 0.6731, + "step": 16761 + }, + { + "epoch": 9.454032712915962, + "grad_norm": 1.0076911449432373, + "learning_rate": 2.732656514382403e-06, + "loss": 0.7172, + "step": 16762 + }, + { + "epoch": 9.454596728708404, + "grad_norm": 0.9524306654930115, + "learning_rate": 2.7298364354201917e-06, + "loss": 0.6517, + "step": 16763 + }, + { + "epoch": 9.455160744500846, + "grad_norm": 1.1845577955245972, + "learning_rate": 2.727016356457981e-06, + "loss": 0.7401, + "step": 16764 + }, + { + "epoch": 9.455724760293288, + "grad_norm": 0.8148297667503357, + "learning_rate": 2.72419627749577e-06, + "loss": 0.6753, + "step": 16765 + }, + { + "epoch": 9.45628877608573, + "grad_norm": 1.8892176151275635, + "learning_rate": 2.7213761985335593e-06, + "loss": 0.782, + "step": 16766 + }, + { + "epoch": 9.456852791878173, + "grad_norm": 1.3791385889053345, + "learning_rate": 2.718556119571348e-06, + "loss": 0.7184, + "step": 16767 + }, + { + "epoch": 9.457416807670615, + "grad_norm": 1.0799156427383423, + "learning_rate": 2.7157360406091374e-06, + "loss": 0.707, + "step": 16768 + }, + { + "epoch": 9.457980823463057, + "grad_norm": 1.04583740234375, + "learning_rate": 2.712915961646926e-06, + "loss": 0.6787, + "step": 16769 + }, + { + "epoch": 9.458544839255499, + "grad_norm": 1.427718997001648, + "learning_rate": 2.7100958826847154e-06, + "loss": 0.7087, + "step": 16770 + }, + { + "epoch": 9.45910885504794, + "grad_norm": 1.175147294998169, + "learning_rate": 2.707275803722504e-06, + "loss": 0.7034, + "step": 16771 + }, + { + "epoch": 9.459672870840384, + "grad_norm": 0.9751408100128174, + "learning_rate": 2.7044557247602934e-06, + "loss": 0.6946, + "step": 16772 + }, + { + "epoch": 9.460236886632826, + "grad_norm": 1.275699257850647, + "learning_rate": 2.7016356457980826e-06, + "loss": 0.6798, + "step": 16773 + }, + { + "epoch": 9.460800902425268, + "grad_norm": 0.9224823117256165, + "learning_rate": 2.698815566835872e-06, + "loss": 0.7563, + "step": 16774 + }, + { + "epoch": 9.46136491821771, + "grad_norm": 1.1667732000350952, + "learning_rate": 2.6959954878736606e-06, + "loss": 0.6132, + "step": 16775 + }, + { + "epoch": 9.461928934010153, + "grad_norm": 0.8293353319168091, + "learning_rate": 2.69317540891145e-06, + "loss": 0.6361, + "step": 16776 + }, + { + "epoch": 9.462492949802595, + "grad_norm": 1.4277219772338867, + "learning_rate": 2.6903553299492387e-06, + "loss": 0.6671, + "step": 16777 + }, + { + "epoch": 9.463056965595037, + "grad_norm": 1.0217760801315308, + "learning_rate": 2.687535250987028e-06, + "loss": 0.5885, + "step": 16778 + }, + { + "epoch": 9.463620981387479, + "grad_norm": 1.3540992736816406, + "learning_rate": 2.6847151720248167e-06, + "loss": 0.7027, + "step": 16779 + }, + { + "epoch": 9.46418499717992, + "grad_norm": 1.1396735906600952, + "learning_rate": 2.681895093062606e-06, + "loss": 0.7602, + "step": 16780 + }, + { + "epoch": 9.464749012972364, + "grad_norm": 1.2163857221603394, + "learning_rate": 2.6790750141003947e-06, + "loss": 0.75, + "step": 16781 + }, + { + "epoch": 9.465313028764806, + "grad_norm": 1.5484745502471924, + "learning_rate": 2.6762549351381843e-06, + "loss": 0.7809, + "step": 16782 + }, + { + "epoch": 9.465877044557248, + "grad_norm": 0.9709252715110779, + "learning_rate": 2.673434856175973e-06, + "loss": 0.6904, + "step": 16783 + }, + { + "epoch": 9.46644106034969, + "grad_norm": 1.1045101881027222, + "learning_rate": 2.6706147772137623e-06, + "loss": 0.789, + "step": 16784 + }, + { + "epoch": 9.467005076142131, + "grad_norm": 1.0772666931152344, + "learning_rate": 2.667794698251551e-06, + "loss": 0.6974, + "step": 16785 + }, + { + "epoch": 9.467569091934575, + "grad_norm": 1.5338369607925415, + "learning_rate": 2.6649746192893404e-06, + "loss": 0.7933, + "step": 16786 + }, + { + "epoch": 9.468133107727017, + "grad_norm": 0.8553886413574219, + "learning_rate": 2.662154540327129e-06, + "loss": 0.7153, + "step": 16787 + }, + { + "epoch": 9.468697123519458, + "grad_norm": 1.245405912399292, + "learning_rate": 2.6593344613649184e-06, + "loss": 0.7626, + "step": 16788 + }, + { + "epoch": 9.4692611393119, + "grad_norm": 1.4286949634552002, + "learning_rate": 2.656514382402707e-06, + "loss": 0.7274, + "step": 16789 + }, + { + "epoch": 9.469825155104344, + "grad_norm": 1.319513201713562, + "learning_rate": 2.6536943034404964e-06, + "loss": 0.7166, + "step": 16790 + }, + { + "epoch": 9.470389170896786, + "grad_norm": 1.0983299016952515, + "learning_rate": 2.6508742244782856e-06, + "loss": 0.6769, + "step": 16791 + }, + { + "epoch": 9.470953186689227, + "grad_norm": 1.7689422369003296, + "learning_rate": 2.648054145516075e-06, + "loss": 0.854, + "step": 16792 + }, + { + "epoch": 9.47151720248167, + "grad_norm": 1.3285305500030518, + "learning_rate": 2.6452340665538636e-06, + "loss": 0.6994, + "step": 16793 + }, + { + "epoch": 9.472081218274111, + "grad_norm": 1.4788942337036133, + "learning_rate": 2.642413987591653e-06, + "loss": 0.7899, + "step": 16794 + }, + { + "epoch": 9.472645234066555, + "grad_norm": 1.3562495708465576, + "learning_rate": 2.6395939086294416e-06, + "loss": 0.8038, + "step": 16795 + }, + { + "epoch": 9.473209249858996, + "grad_norm": 1.3115154504776, + "learning_rate": 2.636773829667231e-06, + "loss": 0.7257, + "step": 16796 + }, + { + "epoch": 9.473773265651438, + "grad_norm": 1.138838529586792, + "learning_rate": 2.6339537507050197e-06, + "loss": 0.7243, + "step": 16797 + }, + { + "epoch": 9.47433728144388, + "grad_norm": 0.8856690526008606, + "learning_rate": 2.631133671742809e-06, + "loss": 0.7384, + "step": 16798 + }, + { + "epoch": 9.474901297236322, + "grad_norm": 1.3230966329574585, + "learning_rate": 2.6283135927805977e-06, + "loss": 0.723, + "step": 16799 + }, + { + "epoch": 9.475465313028765, + "grad_norm": 1.1135969161987305, + "learning_rate": 2.6254935138183873e-06, + "loss": 0.7042, + "step": 16800 + }, + { + "epoch": 9.476029328821207, + "grad_norm": 1.346746802330017, + "learning_rate": 2.622673434856176e-06, + "loss": 0.7716, + "step": 16801 + }, + { + "epoch": 9.476593344613649, + "grad_norm": 1.0959421396255493, + "learning_rate": 2.6198533558939653e-06, + "loss": 0.6139, + "step": 16802 + }, + { + "epoch": 9.47715736040609, + "grad_norm": 1.1642656326293945, + "learning_rate": 2.617033276931754e-06, + "loss": 0.7805, + "step": 16803 + }, + { + "epoch": 9.477721376198534, + "grad_norm": 1.6383278369903564, + "learning_rate": 2.6142131979695434e-06, + "loss": 0.6164, + "step": 16804 + }, + { + "epoch": 9.478285391990976, + "grad_norm": 1.0356941223144531, + "learning_rate": 2.611393119007332e-06, + "loss": 0.7775, + "step": 16805 + }, + { + "epoch": 9.478849407783418, + "grad_norm": 1.073109745979309, + "learning_rate": 2.6085730400451214e-06, + "loss": 0.7094, + "step": 16806 + }, + { + "epoch": 9.47941342357586, + "grad_norm": 1.2429546117782593, + "learning_rate": 2.60575296108291e-06, + "loss": 0.7628, + "step": 16807 + }, + { + "epoch": 9.479977439368302, + "grad_norm": 1.0116045475006104, + "learning_rate": 2.6029328821206994e-06, + "loss": 0.6453, + "step": 16808 + }, + { + "epoch": 9.480541455160745, + "grad_norm": 1.1997016668319702, + "learning_rate": 2.6001128031584886e-06, + "loss": 0.7262, + "step": 16809 + }, + { + "epoch": 9.481105470953187, + "grad_norm": 0.9903463125228882, + "learning_rate": 2.597292724196278e-06, + "loss": 0.7377, + "step": 16810 + }, + { + "epoch": 9.481669486745629, + "grad_norm": 1.049180269241333, + "learning_rate": 2.5944726452340666e-06, + "loss": 0.7999, + "step": 16811 + }, + { + "epoch": 9.48223350253807, + "grad_norm": 1.199329137802124, + "learning_rate": 2.591652566271856e-06, + "loss": 0.716, + "step": 16812 + }, + { + "epoch": 9.482797518330512, + "grad_norm": 0.8825774192810059, + "learning_rate": 2.5888324873096446e-06, + "loss": 0.6277, + "step": 16813 + }, + { + "epoch": 9.483361534122956, + "grad_norm": 1.217737078666687, + "learning_rate": 2.586012408347434e-06, + "loss": 0.6472, + "step": 16814 + }, + { + "epoch": 9.483925549915398, + "grad_norm": 1.3826907873153687, + "learning_rate": 2.5831923293852227e-06, + "loss": 0.7472, + "step": 16815 + }, + { + "epoch": 9.48448956570784, + "grad_norm": 1.3673409223556519, + "learning_rate": 2.580372250423012e-06, + "loss": 0.7484, + "step": 16816 + }, + { + "epoch": 9.485053581500281, + "grad_norm": 1.0874269008636475, + "learning_rate": 2.5775521714608007e-06, + "loss": 0.7272, + "step": 16817 + }, + { + "epoch": 9.485617597292725, + "grad_norm": 1.3957538604736328, + "learning_rate": 2.5747320924985903e-06, + "loss": 0.7369, + "step": 16818 + }, + { + "epoch": 9.486181613085167, + "grad_norm": 1.0836257934570312, + "learning_rate": 2.571912013536379e-06, + "loss": 0.6495, + "step": 16819 + }, + { + "epoch": 9.486745628877609, + "grad_norm": 1.2216020822525024, + "learning_rate": 2.5690919345741683e-06, + "loss": 0.7955, + "step": 16820 + }, + { + "epoch": 9.48730964467005, + "grad_norm": 1.1428550481796265, + "learning_rate": 2.566271855611957e-06, + "loss": 0.7785, + "step": 16821 + }, + { + "epoch": 9.487873660462492, + "grad_norm": 1.2285621166229248, + "learning_rate": 2.5634517766497464e-06, + "loss": 0.9106, + "step": 16822 + }, + { + "epoch": 9.488437676254936, + "grad_norm": 0.854004979133606, + "learning_rate": 2.560631697687535e-06, + "loss": 0.6634, + "step": 16823 + }, + { + "epoch": 9.489001692047378, + "grad_norm": 1.0547822713851929, + "learning_rate": 2.5578116187253244e-06, + "loss": 0.6475, + "step": 16824 + }, + { + "epoch": 9.48956570783982, + "grad_norm": 1.13764488697052, + "learning_rate": 2.554991539763113e-06, + "loss": 0.7144, + "step": 16825 + }, + { + "epoch": 9.490129723632261, + "grad_norm": 1.1231420040130615, + "learning_rate": 2.552171460800903e-06, + "loss": 0.6735, + "step": 16826 + }, + { + "epoch": 9.490693739424703, + "grad_norm": 1.092463731765747, + "learning_rate": 2.5493513818386916e-06, + "loss": 0.7212, + "step": 16827 + }, + { + "epoch": 9.491257755217147, + "grad_norm": 1.1727746725082397, + "learning_rate": 2.546531302876481e-06, + "loss": 0.7354, + "step": 16828 + }, + { + "epoch": 9.491821771009588, + "grad_norm": 1.0317836999893188, + "learning_rate": 2.5437112239142696e-06, + "loss": 0.7389, + "step": 16829 + }, + { + "epoch": 9.49238578680203, + "grad_norm": 1.1837196350097656, + "learning_rate": 2.540891144952059e-06, + "loss": 0.7489, + "step": 16830 + }, + { + "epoch": 9.492949802594472, + "grad_norm": 1.3922094106674194, + "learning_rate": 2.5380710659898476e-06, + "loss": 0.7987, + "step": 16831 + }, + { + "epoch": 9.493513818386916, + "grad_norm": 0.9166086912155151, + "learning_rate": 2.535250987027637e-06, + "loss": 0.7031, + "step": 16832 + }, + { + "epoch": 9.494077834179357, + "grad_norm": 1.309356689453125, + "learning_rate": 2.5324309080654257e-06, + "loss": 0.7344, + "step": 16833 + }, + { + "epoch": 9.4946418499718, + "grad_norm": 1.3599884510040283, + "learning_rate": 2.529610829103215e-06, + "loss": 0.8627, + "step": 16834 + }, + { + "epoch": 9.495205865764241, + "grad_norm": 0.9805280566215515, + "learning_rate": 2.526790750141004e-06, + "loss": 0.7622, + "step": 16835 + }, + { + "epoch": 9.495769881556683, + "grad_norm": 0.9546520709991455, + "learning_rate": 2.5239706711787933e-06, + "loss": 0.6781, + "step": 16836 + }, + { + "epoch": 9.496333897349126, + "grad_norm": 0.9784214496612549, + "learning_rate": 2.521150592216582e-06, + "loss": 0.7017, + "step": 16837 + }, + { + "epoch": 9.496897913141568, + "grad_norm": 1.6470484733581543, + "learning_rate": 2.5183305132543713e-06, + "loss": 0.8092, + "step": 16838 + }, + { + "epoch": 9.49746192893401, + "grad_norm": 1.1511729955673218, + "learning_rate": 2.51551043429216e-06, + "loss": 0.7745, + "step": 16839 + }, + { + "epoch": 9.498025944726452, + "grad_norm": 1.301607370376587, + "learning_rate": 2.5126903553299493e-06, + "loss": 0.7739, + "step": 16840 + }, + { + "epoch": 9.498589960518894, + "grad_norm": 1.0057276487350464, + "learning_rate": 2.509870276367738e-06, + "loss": 0.5693, + "step": 16841 + }, + { + "epoch": 9.499153976311337, + "grad_norm": 0.8213812708854675, + "learning_rate": 2.5070501974055274e-06, + "loss": 0.6155, + "step": 16842 + }, + { + "epoch": 9.499717992103779, + "grad_norm": 1.0985300540924072, + "learning_rate": 2.5042301184433166e-06, + "loss": 0.6737, + "step": 16843 + }, + { + "epoch": 9.500282007896221, + "grad_norm": 1.1064624786376953, + "learning_rate": 2.501410039481106e-06, + "loss": 0.7198, + "step": 16844 + }, + { + "epoch": 9.500846023688663, + "grad_norm": 1.045055866241455, + "learning_rate": 2.4985899605188946e-06, + "loss": 0.6864, + "step": 16845 + }, + { + "epoch": 9.501410039481106, + "grad_norm": 1.1249345541000366, + "learning_rate": 2.495769881556684e-06, + "loss": 0.7721, + "step": 16846 + }, + { + "epoch": 9.501974055273548, + "grad_norm": 1.5830382108688354, + "learning_rate": 2.4929498025944726e-06, + "loss": 0.8926, + "step": 16847 + }, + { + "epoch": 9.50253807106599, + "grad_norm": 0.9290525913238525, + "learning_rate": 2.490129723632262e-06, + "loss": 0.6856, + "step": 16848 + }, + { + "epoch": 9.503102086858432, + "grad_norm": 1.2384454011917114, + "learning_rate": 2.4873096446700506e-06, + "loss": 0.7395, + "step": 16849 + }, + { + "epoch": 9.503666102650874, + "grad_norm": 0.8410322666168213, + "learning_rate": 2.48448956570784e-06, + "loss": 0.6313, + "step": 16850 + }, + { + "epoch": 9.504230118443317, + "grad_norm": 1.046583890914917, + "learning_rate": 2.481669486745629e-06, + "loss": 0.6945, + "step": 16851 + }, + { + "epoch": 9.504794134235759, + "grad_norm": 0.9072194695472717, + "learning_rate": 2.478849407783418e-06, + "loss": 0.7171, + "step": 16852 + }, + { + "epoch": 9.5053581500282, + "grad_norm": 1.2133338451385498, + "learning_rate": 2.476029328821207e-06, + "loss": 0.734, + "step": 16853 + }, + { + "epoch": 9.505922165820643, + "grad_norm": 0.9039990901947021, + "learning_rate": 2.4732092498589963e-06, + "loss": 0.76, + "step": 16854 + }, + { + "epoch": 9.506486181613084, + "grad_norm": 1.2998639345169067, + "learning_rate": 2.470389170896785e-06, + "loss": 0.7905, + "step": 16855 + }, + { + "epoch": 9.507050197405528, + "grad_norm": 0.8218722343444824, + "learning_rate": 2.4675690919345743e-06, + "loss": 0.6456, + "step": 16856 + }, + { + "epoch": 9.50761421319797, + "grad_norm": 1.28633713722229, + "learning_rate": 2.4647490129723635e-06, + "loss": 0.79, + "step": 16857 + }, + { + "epoch": 9.508178228990412, + "grad_norm": 1.4276793003082275, + "learning_rate": 2.4619289340101523e-06, + "loss": 0.6815, + "step": 16858 + }, + { + "epoch": 9.508742244782853, + "grad_norm": 1.2110813856124878, + "learning_rate": 2.4591088550479416e-06, + "loss": 0.7578, + "step": 16859 + }, + { + "epoch": 9.509306260575297, + "grad_norm": 1.0233737230300903, + "learning_rate": 2.4562887760857304e-06, + "loss": 0.7434, + "step": 16860 + }, + { + "epoch": 9.509870276367739, + "grad_norm": 1.2127430438995361, + "learning_rate": 2.4534686971235196e-06, + "loss": 0.8115, + "step": 16861 + }, + { + "epoch": 9.51043429216018, + "grad_norm": 1.3175206184387207, + "learning_rate": 2.450648618161309e-06, + "loss": 0.7299, + "step": 16862 + }, + { + "epoch": 9.510998307952622, + "grad_norm": 1.2476248741149902, + "learning_rate": 2.4478285391990976e-06, + "loss": 0.669, + "step": 16863 + }, + { + "epoch": 9.511562323745064, + "grad_norm": 0.9769840836524963, + "learning_rate": 2.445008460236887e-06, + "loss": 0.6999, + "step": 16864 + }, + { + "epoch": 9.512126339537508, + "grad_norm": 1.0446110963821411, + "learning_rate": 2.442188381274676e-06, + "loss": 0.6433, + "step": 16865 + }, + { + "epoch": 9.51269035532995, + "grad_norm": 1.5136970281600952, + "learning_rate": 2.439368302312465e-06, + "loss": 0.7232, + "step": 16866 + }, + { + "epoch": 9.513254371122391, + "grad_norm": 1.3308202028274536, + "learning_rate": 2.436548223350254e-06, + "loss": 0.8098, + "step": 16867 + }, + { + "epoch": 9.513818386914833, + "grad_norm": 0.990633487701416, + "learning_rate": 2.433728144388043e-06, + "loss": 0.7387, + "step": 16868 + }, + { + "epoch": 9.514382402707275, + "grad_norm": 1.3053234815597534, + "learning_rate": 2.430908065425832e-06, + "loss": 0.7785, + "step": 16869 + }, + { + "epoch": 9.514946418499719, + "grad_norm": 1.137553334236145, + "learning_rate": 2.428087986463621e-06, + "loss": 0.7263, + "step": 16870 + }, + { + "epoch": 9.51551043429216, + "grad_norm": 1.0661903619766235, + "learning_rate": 2.42526790750141e-06, + "loss": 0.6517, + "step": 16871 + }, + { + "epoch": 9.516074450084602, + "grad_norm": 1.0346733331680298, + "learning_rate": 2.4224478285391993e-06, + "loss": 0.6717, + "step": 16872 + }, + { + "epoch": 9.516638465877044, + "grad_norm": 1.0660433769226074, + "learning_rate": 2.4196277495769885e-06, + "loss": 0.7139, + "step": 16873 + }, + { + "epoch": 9.517202481669488, + "grad_norm": 1.0200765132904053, + "learning_rate": 2.4168076706147773e-06, + "loss": 0.6912, + "step": 16874 + }, + { + "epoch": 9.51776649746193, + "grad_norm": 1.2194498777389526, + "learning_rate": 2.4139875916525665e-06, + "loss": 0.7489, + "step": 16875 + }, + { + "epoch": 9.518330513254371, + "grad_norm": 0.9576532244682312, + "learning_rate": 2.4111675126903553e-06, + "loss": 0.8061, + "step": 16876 + }, + { + "epoch": 9.518894529046813, + "grad_norm": 1.4674650430679321, + "learning_rate": 2.4083474337281446e-06, + "loss": 0.8114, + "step": 16877 + }, + { + "epoch": 9.519458544839255, + "grad_norm": 1.3614734411239624, + "learning_rate": 2.4055273547659334e-06, + "loss": 0.7412, + "step": 16878 + }, + { + "epoch": 9.520022560631698, + "grad_norm": 0.9895516633987427, + "learning_rate": 2.4027072758037226e-06, + "loss": 0.7508, + "step": 16879 + }, + { + "epoch": 9.52058657642414, + "grad_norm": 1.1113406419754028, + "learning_rate": 2.399887196841512e-06, + "loss": 0.7069, + "step": 16880 + }, + { + "epoch": 9.521150592216582, + "grad_norm": 1.2249815464019775, + "learning_rate": 2.397067117879301e-06, + "loss": 0.8573, + "step": 16881 + }, + { + "epoch": 9.521714608009024, + "grad_norm": 0.9151893258094788, + "learning_rate": 2.39424703891709e-06, + "loss": 0.7151, + "step": 16882 + }, + { + "epoch": 9.522278623801466, + "grad_norm": 1.6028741598129272, + "learning_rate": 2.391426959954879e-06, + "loss": 0.8098, + "step": 16883 + }, + { + "epoch": 9.52284263959391, + "grad_norm": 0.881813645362854, + "learning_rate": 2.388606880992668e-06, + "loss": 0.7223, + "step": 16884 + }, + { + "epoch": 9.523406655386351, + "grad_norm": 1.2153464555740356, + "learning_rate": 2.385786802030457e-06, + "loss": 0.7671, + "step": 16885 + }, + { + "epoch": 9.523970671178793, + "grad_norm": 1.0198243856430054, + "learning_rate": 2.382966723068246e-06, + "loss": 0.6554, + "step": 16886 + }, + { + "epoch": 9.524534686971235, + "grad_norm": 1.2584881782531738, + "learning_rate": 2.380146644106035e-06, + "loss": 0.7556, + "step": 16887 + }, + { + "epoch": 9.525098702763678, + "grad_norm": 1.353541374206543, + "learning_rate": 2.3773265651438243e-06, + "loss": 0.749, + "step": 16888 + }, + { + "epoch": 9.52566271855612, + "grad_norm": 0.8439225554466248, + "learning_rate": 2.3745064861816135e-06, + "loss": 0.6464, + "step": 16889 + }, + { + "epoch": 9.526226734348562, + "grad_norm": 1.351564884185791, + "learning_rate": 2.3716864072194023e-06, + "loss": 0.8133, + "step": 16890 + }, + { + "epoch": 9.526790750141004, + "grad_norm": 1.0377165079116821, + "learning_rate": 2.3688663282571915e-06, + "loss": 0.7527, + "step": 16891 + }, + { + "epoch": 9.527354765933445, + "grad_norm": 0.9945024847984314, + "learning_rate": 2.3660462492949803e-06, + "loss": 0.6876, + "step": 16892 + }, + { + "epoch": 9.527918781725889, + "grad_norm": 1.1036460399627686, + "learning_rate": 2.3632261703327695e-06, + "loss": 0.8133, + "step": 16893 + }, + { + "epoch": 9.52848279751833, + "grad_norm": 1.2730382680892944, + "learning_rate": 2.3604060913705583e-06, + "loss": 0.7618, + "step": 16894 + }, + { + "epoch": 9.529046813310773, + "grad_norm": 1.3379673957824707, + "learning_rate": 2.3575860124083476e-06, + "loss": 0.7968, + "step": 16895 + }, + { + "epoch": 9.529610829103214, + "grad_norm": 1.5548434257507324, + "learning_rate": 2.3547659334461364e-06, + "loss": 0.7929, + "step": 16896 + }, + { + "epoch": 9.530174844895656, + "grad_norm": 1.3492038249969482, + "learning_rate": 2.351945854483926e-06, + "loss": 0.7731, + "step": 16897 + }, + { + "epoch": 9.5307388606881, + "grad_norm": 0.9583719372749329, + "learning_rate": 2.349125775521715e-06, + "loss": 0.7059, + "step": 16898 + }, + { + "epoch": 9.531302876480542, + "grad_norm": 1.1302826404571533, + "learning_rate": 2.346305696559504e-06, + "loss": 0.7084, + "step": 16899 + }, + { + "epoch": 9.531866892272983, + "grad_norm": 1.0633132457733154, + "learning_rate": 2.343485617597293e-06, + "loss": 0.7659, + "step": 16900 + }, + { + "epoch": 9.532430908065425, + "grad_norm": 1.3578367233276367, + "learning_rate": 2.340665538635082e-06, + "loss": 0.6896, + "step": 16901 + }, + { + "epoch": 9.532994923857869, + "grad_norm": 1.0355346202850342, + "learning_rate": 2.337845459672871e-06, + "loss": 0.7229, + "step": 16902 + }, + { + "epoch": 9.53355893965031, + "grad_norm": 1.0572973489761353, + "learning_rate": 2.33502538071066e-06, + "loss": 0.7004, + "step": 16903 + }, + { + "epoch": 9.534122955442752, + "grad_norm": 0.9354488849639893, + "learning_rate": 2.332205301748449e-06, + "loss": 0.6858, + "step": 16904 + }, + { + "epoch": 9.534686971235194, + "grad_norm": 1.059337854385376, + "learning_rate": 2.329385222786238e-06, + "loss": 0.8094, + "step": 16905 + }, + { + "epoch": 9.535250987027636, + "grad_norm": 1.0763673782348633, + "learning_rate": 2.3265651438240273e-06, + "loss": 0.7938, + "step": 16906 + }, + { + "epoch": 9.53581500282008, + "grad_norm": 1.2227603197097778, + "learning_rate": 2.3237450648618165e-06, + "loss": 0.7485, + "step": 16907 + }, + { + "epoch": 9.536379018612521, + "grad_norm": 1.2251399755477905, + "learning_rate": 2.3209249858996053e-06, + "loss": 0.7369, + "step": 16908 + }, + { + "epoch": 9.536943034404963, + "grad_norm": 1.310640573501587, + "learning_rate": 2.3181049069373945e-06, + "loss": 0.7153, + "step": 16909 + }, + { + "epoch": 9.537507050197405, + "grad_norm": 1.3333741426467896, + "learning_rate": 2.3152848279751833e-06, + "loss": 0.8585, + "step": 16910 + }, + { + "epoch": 9.538071065989847, + "grad_norm": 1.0112119913101196, + "learning_rate": 2.3124647490129725e-06, + "loss": 0.673, + "step": 16911 + }, + { + "epoch": 9.53863508178229, + "grad_norm": 1.2998223304748535, + "learning_rate": 2.3096446700507613e-06, + "loss": 0.7966, + "step": 16912 + }, + { + "epoch": 9.539199097574732, + "grad_norm": 1.2720422744750977, + "learning_rate": 2.3068245910885506e-06, + "loss": 0.601, + "step": 16913 + }, + { + "epoch": 9.539763113367174, + "grad_norm": 1.1530388593673706, + "learning_rate": 2.3040045121263394e-06, + "loss": 0.7492, + "step": 16914 + }, + { + "epoch": 9.540327129159616, + "grad_norm": 1.2572230100631714, + "learning_rate": 2.301184433164129e-06, + "loss": 0.7555, + "step": 16915 + }, + { + "epoch": 9.54089114495206, + "grad_norm": 1.3417688608169556, + "learning_rate": 2.298364354201918e-06, + "loss": 0.7913, + "step": 16916 + }, + { + "epoch": 9.541455160744501, + "grad_norm": 1.5259850025177002, + "learning_rate": 2.295544275239707e-06, + "loss": 0.7574, + "step": 16917 + }, + { + "epoch": 9.542019176536943, + "grad_norm": 1.0686371326446533, + "learning_rate": 2.292724196277496e-06, + "loss": 0.7722, + "step": 16918 + }, + { + "epoch": 9.542583192329385, + "grad_norm": 1.3901236057281494, + "learning_rate": 2.289904117315285e-06, + "loss": 0.6891, + "step": 16919 + }, + { + "epoch": 9.543147208121827, + "grad_norm": 0.9142773747444153, + "learning_rate": 2.287084038353074e-06, + "loss": 0.6443, + "step": 16920 + }, + { + "epoch": 9.54371122391427, + "grad_norm": 1.0246797800064087, + "learning_rate": 2.284263959390863e-06, + "loss": 0.6683, + "step": 16921 + }, + { + "epoch": 9.544275239706712, + "grad_norm": 1.275346040725708, + "learning_rate": 2.281443880428652e-06, + "loss": 0.7531, + "step": 16922 + }, + { + "epoch": 9.544839255499154, + "grad_norm": 1.203822374343872, + "learning_rate": 2.278623801466441e-06, + "loss": 0.8188, + "step": 16923 + }, + { + "epoch": 9.545403271291596, + "grad_norm": 1.2168691158294678, + "learning_rate": 2.2758037225042303e-06, + "loss": 0.6968, + "step": 16924 + }, + { + "epoch": 9.545967287084038, + "grad_norm": 1.189871072769165, + "learning_rate": 2.2729836435420195e-06, + "loss": 0.6776, + "step": 16925 + }, + { + "epoch": 9.546531302876481, + "grad_norm": 0.9345690011978149, + "learning_rate": 2.2701635645798083e-06, + "loss": 0.6929, + "step": 16926 + }, + { + "epoch": 9.547095318668923, + "grad_norm": 0.9677059650421143, + "learning_rate": 2.2673434856175975e-06, + "loss": 0.7375, + "step": 16927 + }, + { + "epoch": 9.547659334461365, + "grad_norm": 1.0292912721633911, + "learning_rate": 2.2645234066553863e-06, + "loss": 0.7198, + "step": 16928 + }, + { + "epoch": 9.548223350253807, + "grad_norm": 1.1676077842712402, + "learning_rate": 2.2617033276931755e-06, + "loss": 0.6187, + "step": 16929 + }, + { + "epoch": 9.54878736604625, + "grad_norm": 0.753789484500885, + "learning_rate": 2.2588832487309643e-06, + "loss": 0.6354, + "step": 16930 + }, + { + "epoch": 9.549351381838692, + "grad_norm": 1.4546314477920532, + "learning_rate": 2.2560631697687536e-06, + "loss": 0.7742, + "step": 16931 + }, + { + "epoch": 9.549915397631134, + "grad_norm": 1.4960649013519287, + "learning_rate": 2.2532430908065428e-06, + "loss": 0.665, + "step": 16932 + }, + { + "epoch": 9.550479413423576, + "grad_norm": 1.292235016822815, + "learning_rate": 2.250423011844332e-06, + "loss": 0.8166, + "step": 16933 + }, + { + "epoch": 9.551043429216017, + "grad_norm": 1.671505093574524, + "learning_rate": 2.2476029328821208e-06, + "loss": 0.7934, + "step": 16934 + }, + { + "epoch": 9.551607445008461, + "grad_norm": 1.0827726125717163, + "learning_rate": 2.24478285391991e-06, + "loss": 0.6647, + "step": 16935 + }, + { + "epoch": 9.552171460800903, + "grad_norm": 1.0549770593643188, + "learning_rate": 2.241962774957699e-06, + "loss": 0.8343, + "step": 16936 + }, + { + "epoch": 9.552735476593345, + "grad_norm": 1.2941584587097168, + "learning_rate": 2.239142695995488e-06, + "loss": 0.7822, + "step": 16937 + }, + { + "epoch": 9.553299492385786, + "grad_norm": 1.147049903869629, + "learning_rate": 2.236322617033277e-06, + "loss": 0.728, + "step": 16938 + }, + { + "epoch": 9.553863508178228, + "grad_norm": 1.1190423965454102, + "learning_rate": 2.233502538071066e-06, + "loss": 0.8405, + "step": 16939 + }, + { + "epoch": 9.554427523970672, + "grad_norm": 1.3161035776138306, + "learning_rate": 2.230682459108855e-06, + "loss": 0.7002, + "step": 16940 + }, + { + "epoch": 9.554991539763114, + "grad_norm": 1.0634171962738037, + "learning_rate": 2.2278623801466445e-06, + "loss": 0.6889, + "step": 16941 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 1.3257852792739868, + "learning_rate": 2.2250423011844333e-06, + "loss": 0.7369, + "step": 16942 + }, + { + "epoch": 9.556119571347997, + "grad_norm": 1.4156644344329834, + "learning_rate": 2.2222222222222225e-06, + "loss": 0.766, + "step": 16943 + }, + { + "epoch": 9.55668358714044, + "grad_norm": 1.141638159751892, + "learning_rate": 2.2194021432600113e-06, + "loss": 0.8064, + "step": 16944 + }, + { + "epoch": 9.557247602932883, + "grad_norm": 1.074483871459961, + "learning_rate": 2.2165820642978005e-06, + "loss": 0.7986, + "step": 16945 + }, + { + "epoch": 9.557811618725324, + "grad_norm": 1.1612013578414917, + "learning_rate": 2.2137619853355893e-06, + "loss": 0.6797, + "step": 16946 + }, + { + "epoch": 9.558375634517766, + "grad_norm": 1.242236852645874, + "learning_rate": 2.2109419063733785e-06, + "loss": 0.674, + "step": 16947 + }, + { + "epoch": 9.558939650310208, + "grad_norm": 1.868303656578064, + "learning_rate": 2.2081218274111673e-06, + "loss": 0.881, + "step": 16948 + }, + { + "epoch": 9.559503666102652, + "grad_norm": 1.042689323425293, + "learning_rate": 2.2053017484489565e-06, + "loss": 0.7336, + "step": 16949 + }, + { + "epoch": 9.560067681895093, + "grad_norm": 1.2705318927764893, + "learning_rate": 2.2024816694867458e-06, + "loss": 0.8106, + "step": 16950 + }, + { + "epoch": 9.560631697687535, + "grad_norm": 1.3453962802886963, + "learning_rate": 2.199661590524535e-06, + "loss": 0.7987, + "step": 16951 + }, + { + "epoch": 9.561195713479977, + "grad_norm": 1.0573081970214844, + "learning_rate": 2.1968415115623238e-06, + "loss": 0.774, + "step": 16952 + }, + { + "epoch": 9.561759729272419, + "grad_norm": 1.401362419128418, + "learning_rate": 2.194021432600113e-06, + "loss": 0.7154, + "step": 16953 + }, + { + "epoch": 9.562323745064862, + "grad_norm": 1.5754235982894897, + "learning_rate": 2.191201353637902e-06, + "loss": 0.7048, + "step": 16954 + }, + { + "epoch": 9.562887760857304, + "grad_norm": 1.6127065420150757, + "learning_rate": 2.188381274675691e-06, + "loss": 0.7697, + "step": 16955 + }, + { + "epoch": 9.563451776649746, + "grad_norm": 1.350279688835144, + "learning_rate": 2.18556119571348e-06, + "loss": 0.7776, + "step": 16956 + }, + { + "epoch": 9.564015792442188, + "grad_norm": 1.0780162811279297, + "learning_rate": 2.182741116751269e-06, + "loss": 0.8085, + "step": 16957 + }, + { + "epoch": 9.564579808234631, + "grad_norm": 0.9586175680160522, + "learning_rate": 2.179921037789058e-06, + "loss": 0.7135, + "step": 16958 + }, + { + "epoch": 9.565143824027073, + "grad_norm": 1.4749336242675781, + "learning_rate": 2.1771009588268475e-06, + "loss": 0.7241, + "step": 16959 + }, + { + "epoch": 9.565707839819515, + "grad_norm": 1.0095642805099487, + "learning_rate": 2.1742808798646363e-06, + "loss": 0.6591, + "step": 16960 + }, + { + "epoch": 9.566271855611957, + "grad_norm": 1.1827921867370605, + "learning_rate": 2.1714608009024255e-06, + "loss": 0.5339, + "step": 16961 + }, + { + "epoch": 9.566835871404399, + "grad_norm": 1.18414306640625, + "learning_rate": 2.1686407219402143e-06, + "loss": 0.7889, + "step": 16962 + }, + { + "epoch": 9.567399887196842, + "grad_norm": 1.2238308191299438, + "learning_rate": 2.1658206429780035e-06, + "loss": 0.7134, + "step": 16963 + }, + { + "epoch": 9.567963902989284, + "grad_norm": 1.03020179271698, + "learning_rate": 2.1630005640157923e-06, + "loss": 0.747, + "step": 16964 + }, + { + "epoch": 9.568527918781726, + "grad_norm": 1.20469069480896, + "learning_rate": 2.1601804850535815e-06, + "loss": 0.7554, + "step": 16965 + }, + { + "epoch": 9.569091934574168, + "grad_norm": 0.919558584690094, + "learning_rate": 2.1573604060913707e-06, + "loss": 0.6782, + "step": 16966 + }, + { + "epoch": 9.56965595036661, + "grad_norm": 1.10629403591156, + "learning_rate": 2.1545403271291595e-06, + "loss": 0.6953, + "step": 16967 + }, + { + "epoch": 9.570219966159053, + "grad_norm": 1.0453587770462036, + "learning_rate": 2.1517202481669488e-06, + "loss": 0.7089, + "step": 16968 + }, + { + "epoch": 9.570783981951495, + "grad_norm": 1.1198384761810303, + "learning_rate": 2.148900169204738e-06, + "loss": 0.7687, + "step": 16969 + }, + { + "epoch": 9.571347997743937, + "grad_norm": 1.1971545219421387, + "learning_rate": 2.1460800902425268e-06, + "loss": 0.8444, + "step": 16970 + }, + { + "epoch": 9.571912013536378, + "grad_norm": 1.1683170795440674, + "learning_rate": 2.143260011280316e-06, + "loss": 0.8347, + "step": 16971 + }, + { + "epoch": 9.572476029328822, + "grad_norm": 1.3359637260437012, + "learning_rate": 2.140439932318105e-06, + "loss": 0.7603, + "step": 16972 + }, + { + "epoch": 9.573040045121264, + "grad_norm": 1.0415440797805786, + "learning_rate": 2.137619853355894e-06, + "loss": 0.7392, + "step": 16973 + }, + { + "epoch": 9.573604060913706, + "grad_norm": 0.894939661026001, + "learning_rate": 2.1347997743936832e-06, + "loss": 0.6684, + "step": 16974 + }, + { + "epoch": 9.574168076706147, + "grad_norm": 1.4691919088363647, + "learning_rate": 2.131979695431472e-06, + "loss": 0.7284, + "step": 16975 + }, + { + "epoch": 9.57473209249859, + "grad_norm": 0.9448394775390625, + "learning_rate": 2.1291596164692613e-06, + "loss": 0.7242, + "step": 16976 + }, + { + "epoch": 9.575296108291033, + "grad_norm": 1.1194419860839844, + "learning_rate": 2.1263395375070505e-06, + "loss": 0.8405, + "step": 16977 + }, + { + "epoch": 9.575860124083475, + "grad_norm": 1.4276572465896606, + "learning_rate": 2.1235194585448393e-06, + "loss": 0.7809, + "step": 16978 + }, + { + "epoch": 9.576424139875916, + "grad_norm": 0.9379543662071228, + "learning_rate": 2.1206993795826285e-06, + "loss": 0.7397, + "step": 16979 + }, + { + "epoch": 9.576988155668358, + "grad_norm": 1.2429707050323486, + "learning_rate": 2.1178793006204173e-06, + "loss": 0.7112, + "step": 16980 + }, + { + "epoch": 9.577552171460802, + "grad_norm": 1.0694615840911865, + "learning_rate": 2.1150592216582065e-06, + "loss": 0.7381, + "step": 16981 + }, + { + "epoch": 9.578116187253244, + "grad_norm": 1.1834312677383423, + "learning_rate": 2.1122391426959957e-06, + "loss": 0.6947, + "step": 16982 + }, + { + "epoch": 9.578680203045685, + "grad_norm": 1.037592887878418, + "learning_rate": 2.1094190637337845e-06, + "loss": 0.6681, + "step": 16983 + }, + { + "epoch": 9.579244218838127, + "grad_norm": 1.2175322771072388, + "learning_rate": 2.1065989847715737e-06, + "loss": 0.802, + "step": 16984 + }, + { + "epoch": 9.579808234630569, + "grad_norm": 0.9289380311965942, + "learning_rate": 2.103778905809363e-06, + "loss": 0.7543, + "step": 16985 + }, + { + "epoch": 9.580372250423013, + "grad_norm": 1.082656741142273, + "learning_rate": 2.1009588268471518e-06, + "loss": 0.7877, + "step": 16986 + }, + { + "epoch": 9.580936266215454, + "grad_norm": 0.9071274399757385, + "learning_rate": 2.098138747884941e-06, + "loss": 0.6745, + "step": 16987 + }, + { + "epoch": 9.581500282007896, + "grad_norm": 1.392248272895813, + "learning_rate": 2.0953186689227298e-06, + "loss": 0.8239, + "step": 16988 + }, + { + "epoch": 9.582064297800338, + "grad_norm": 1.3974261283874512, + "learning_rate": 2.092498589960519e-06, + "loss": 0.7072, + "step": 16989 + }, + { + "epoch": 9.58262831359278, + "grad_norm": 1.2241984605789185, + "learning_rate": 2.0896785109983082e-06, + "loss": 0.7138, + "step": 16990 + }, + { + "epoch": 9.583192329385223, + "grad_norm": 1.0923044681549072, + "learning_rate": 2.086858432036097e-06, + "loss": 0.7469, + "step": 16991 + }, + { + "epoch": 9.583756345177665, + "grad_norm": 1.0394879579544067, + "learning_rate": 2.0840383530738862e-06, + "loss": 0.6658, + "step": 16992 + }, + { + "epoch": 9.584320360970107, + "grad_norm": 0.9872029423713684, + "learning_rate": 2.081218274111675e-06, + "loss": 0.6161, + "step": 16993 + }, + { + "epoch": 9.584884376762549, + "grad_norm": 1.2476648092269897, + "learning_rate": 2.0783981951494643e-06, + "loss": 0.7694, + "step": 16994 + }, + { + "epoch": 9.58544839255499, + "grad_norm": 1.2070256471633911, + "learning_rate": 2.0755781161872535e-06, + "loss": 0.8928, + "step": 16995 + }, + { + "epoch": 9.586012408347434, + "grad_norm": 1.5473096370697021, + "learning_rate": 2.0727580372250427e-06, + "loss": 0.7285, + "step": 16996 + }, + { + "epoch": 9.586576424139876, + "grad_norm": 0.788927435874939, + "learning_rate": 2.0699379582628315e-06, + "loss": 0.6661, + "step": 16997 + }, + { + "epoch": 9.587140439932318, + "grad_norm": 1.23869788646698, + "learning_rate": 2.0671178793006207e-06, + "loss": 0.743, + "step": 16998 + }, + { + "epoch": 9.58770445572476, + "grad_norm": 1.008231520652771, + "learning_rate": 2.0642978003384095e-06, + "loss": 0.7081, + "step": 16999 + }, + { + "epoch": 9.588268471517203, + "grad_norm": 1.2366175651550293, + "learning_rate": 2.0614777213761987e-06, + "loss": 0.7386, + "step": 17000 + }, + { + "epoch": 9.588832487309645, + "grad_norm": 1.0507636070251465, + "learning_rate": 2.0586576424139875e-06, + "loss": 0.759, + "step": 17001 + }, + { + "epoch": 9.589396503102087, + "grad_norm": 1.344072937965393, + "learning_rate": 2.0558375634517767e-06, + "loss": 0.7706, + "step": 17002 + }, + { + "epoch": 9.589960518894529, + "grad_norm": 0.970289945602417, + "learning_rate": 2.053017484489566e-06, + "loss": 0.7368, + "step": 17003 + }, + { + "epoch": 9.59052453468697, + "grad_norm": 1.136814832687378, + "learning_rate": 2.050197405527355e-06, + "loss": 0.6718, + "step": 17004 + }, + { + "epoch": 9.591088550479414, + "grad_norm": 1.1506251096725464, + "learning_rate": 2.047377326565144e-06, + "loss": 0.8032, + "step": 17005 + }, + { + "epoch": 9.591652566271856, + "grad_norm": 1.0027772188186646, + "learning_rate": 2.044557247602933e-06, + "loss": 0.7226, + "step": 17006 + }, + { + "epoch": 9.592216582064298, + "grad_norm": 1.1187790632247925, + "learning_rate": 2.041737168640722e-06, + "loss": 0.7949, + "step": 17007 + }, + { + "epoch": 9.59278059785674, + "grad_norm": 1.467378854751587, + "learning_rate": 2.0389170896785112e-06, + "loss": 0.7452, + "step": 17008 + }, + { + "epoch": 9.593344613649183, + "grad_norm": 1.0892645120620728, + "learning_rate": 2.0360970107163e-06, + "loss": 0.7321, + "step": 17009 + }, + { + "epoch": 9.593908629441625, + "grad_norm": 1.225731372833252, + "learning_rate": 2.0332769317540892e-06, + "loss": 0.7142, + "step": 17010 + }, + { + "epoch": 9.594472645234067, + "grad_norm": 1.3018513917922974, + "learning_rate": 2.030456852791878e-06, + "loss": 0.7601, + "step": 17011 + }, + { + "epoch": 9.595036661026509, + "grad_norm": 1.1204867362976074, + "learning_rate": 2.0276367738296677e-06, + "loss": 0.7193, + "step": 17012 + }, + { + "epoch": 9.59560067681895, + "grad_norm": 1.1103566884994507, + "learning_rate": 2.0248166948674565e-06, + "loss": 0.6979, + "step": 17013 + }, + { + "epoch": 9.596164692611394, + "grad_norm": 1.2043836116790771, + "learning_rate": 2.0219966159052457e-06, + "loss": 0.7471, + "step": 17014 + }, + { + "epoch": 9.596728708403836, + "grad_norm": 1.1065911054611206, + "learning_rate": 2.0191765369430345e-06, + "loss": 0.8943, + "step": 17015 + }, + { + "epoch": 9.597292724196278, + "grad_norm": 1.2550158500671387, + "learning_rate": 2.0163564579808237e-06, + "loss": 0.7613, + "step": 17016 + }, + { + "epoch": 9.59785673998872, + "grad_norm": 1.6353307962417603, + "learning_rate": 2.0135363790186125e-06, + "loss": 0.7485, + "step": 17017 + }, + { + "epoch": 9.598420755781161, + "grad_norm": 0.9880104660987854, + "learning_rate": 2.0107163000564017e-06, + "loss": 0.6944, + "step": 17018 + }, + { + "epoch": 9.598984771573605, + "grad_norm": 1.088002324104309, + "learning_rate": 2.0078962210941905e-06, + "loss": 0.724, + "step": 17019 + }, + { + "epoch": 9.599548787366047, + "grad_norm": 0.959341824054718, + "learning_rate": 2.0050761421319797e-06, + "loss": 0.6871, + "step": 17020 + }, + { + "epoch": 9.600112803158488, + "grad_norm": 1.1161733865737915, + "learning_rate": 2.002256063169769e-06, + "loss": 0.6579, + "step": 17021 + }, + { + "epoch": 9.60067681895093, + "grad_norm": 1.3617855310440063, + "learning_rate": 1.999435984207558e-06, + "loss": 0.8112, + "step": 17022 + }, + { + "epoch": 9.601240834743372, + "grad_norm": 1.4666613340377808, + "learning_rate": 1.996615905245347e-06, + "loss": 0.7091, + "step": 17023 + }, + { + "epoch": 9.601804850535816, + "grad_norm": 1.28895103931427, + "learning_rate": 1.993795826283136e-06, + "loss": 0.7698, + "step": 17024 + }, + { + "epoch": 9.602368866328257, + "grad_norm": 1.2193403244018555, + "learning_rate": 1.990975747320925e-06, + "loss": 0.6646, + "step": 17025 + }, + { + "epoch": 9.6029328821207, + "grad_norm": 1.0618962049484253, + "learning_rate": 1.988155668358714e-06, + "loss": 0.72, + "step": 17026 + }, + { + "epoch": 9.603496897913141, + "grad_norm": 1.185981035232544, + "learning_rate": 1.985335589396503e-06, + "loss": 0.789, + "step": 17027 + }, + { + "epoch": 9.604060913705585, + "grad_norm": 1.0561110973358154, + "learning_rate": 1.9825155104342922e-06, + "loss": 0.6156, + "step": 17028 + }, + { + "epoch": 9.604624929498026, + "grad_norm": 1.0297341346740723, + "learning_rate": 1.979695431472081e-06, + "loss": 0.6413, + "step": 17029 + }, + { + "epoch": 9.605188945290468, + "grad_norm": 1.1940345764160156, + "learning_rate": 1.9768753525098707e-06, + "loss": 0.7205, + "step": 17030 + }, + { + "epoch": 9.60575296108291, + "grad_norm": 1.5868008136749268, + "learning_rate": 1.9740552735476595e-06, + "loss": 0.7611, + "step": 17031 + }, + { + "epoch": 9.606316976875352, + "grad_norm": 1.0094953775405884, + "learning_rate": 1.9712351945854487e-06, + "loss": 0.6435, + "step": 17032 + }, + { + "epoch": 9.606880992667795, + "grad_norm": 0.9480701088905334, + "learning_rate": 1.9684151156232375e-06, + "loss": 0.6823, + "step": 17033 + }, + { + "epoch": 9.607445008460237, + "grad_norm": 1.2434896230697632, + "learning_rate": 1.9655950366610267e-06, + "loss": 0.6821, + "step": 17034 + }, + { + "epoch": 9.608009024252679, + "grad_norm": 1.2887037992477417, + "learning_rate": 1.9627749576988155e-06, + "loss": 0.8002, + "step": 17035 + }, + { + "epoch": 9.60857304004512, + "grad_norm": 1.4880955219268799, + "learning_rate": 1.9599548787366047e-06, + "loss": 0.7716, + "step": 17036 + }, + { + "epoch": 9.609137055837564, + "grad_norm": 1.5378352403640747, + "learning_rate": 1.9571347997743935e-06, + "loss": 0.7595, + "step": 17037 + }, + { + "epoch": 9.609701071630006, + "grad_norm": 1.0678677558898926, + "learning_rate": 1.954314720812183e-06, + "loss": 0.7267, + "step": 17038 + }, + { + "epoch": 9.610265087422448, + "grad_norm": 1.3078974485397339, + "learning_rate": 1.951494641849972e-06, + "loss": 0.7949, + "step": 17039 + }, + { + "epoch": 9.61082910321489, + "grad_norm": 1.1615840196609497, + "learning_rate": 1.948674562887761e-06, + "loss": 0.8724, + "step": 17040 + }, + { + "epoch": 9.611393119007332, + "grad_norm": 1.247825026512146, + "learning_rate": 1.94585448392555e-06, + "loss": 0.739, + "step": 17041 + }, + { + "epoch": 9.611957134799775, + "grad_norm": 1.5032392740249634, + "learning_rate": 1.943034404963339e-06, + "loss": 0.7193, + "step": 17042 + }, + { + "epoch": 9.612521150592217, + "grad_norm": 0.8085206151008606, + "learning_rate": 1.940214326001128e-06, + "loss": 0.6354, + "step": 17043 + }, + { + "epoch": 9.613085166384659, + "grad_norm": 1.030497670173645, + "learning_rate": 1.937394247038917e-06, + "loss": 0.6977, + "step": 17044 + }, + { + "epoch": 9.6136491821771, + "grad_norm": 1.0508135557174683, + "learning_rate": 1.934574168076706e-06, + "loss": 0.7066, + "step": 17045 + }, + { + "epoch": 9.614213197969542, + "grad_norm": 1.0356595516204834, + "learning_rate": 1.9317540891144952e-06, + "loss": 0.6601, + "step": 17046 + }, + { + "epoch": 9.614777213761986, + "grad_norm": 1.1123136281967163, + "learning_rate": 1.9289340101522844e-06, + "loss": 0.6739, + "step": 17047 + }, + { + "epoch": 9.615341229554428, + "grad_norm": 1.0076724290847778, + "learning_rate": 1.9261139311900737e-06, + "loss": 0.7377, + "step": 17048 + }, + { + "epoch": 9.61590524534687, + "grad_norm": 0.7987669110298157, + "learning_rate": 1.9232938522278625e-06, + "loss": 0.6571, + "step": 17049 + }, + { + "epoch": 9.616469261139311, + "grad_norm": 1.3133527040481567, + "learning_rate": 1.9204737732656517e-06, + "loss": 0.7105, + "step": 17050 + }, + { + "epoch": 9.617033276931753, + "grad_norm": 1.0434304475784302, + "learning_rate": 1.9176536943034405e-06, + "loss": 0.7117, + "step": 17051 + }, + { + "epoch": 9.617597292724197, + "grad_norm": 0.8479599356651306, + "learning_rate": 1.9148336153412297e-06, + "loss": 0.6742, + "step": 17052 + }, + { + "epoch": 9.618161308516639, + "grad_norm": 1.1636124849319458, + "learning_rate": 1.9120135363790185e-06, + "loss": 0.7103, + "step": 17053 + }, + { + "epoch": 9.61872532430908, + "grad_norm": 0.7291487455368042, + "learning_rate": 1.9091934574168077e-06, + "loss": 0.5822, + "step": 17054 + }, + { + "epoch": 9.619289340101522, + "grad_norm": 0.947270929813385, + "learning_rate": 1.9063733784545967e-06, + "loss": 0.725, + "step": 17055 + }, + { + "epoch": 9.619853355893966, + "grad_norm": 1.2126834392547607, + "learning_rate": 1.903553299492386e-06, + "loss": 0.7181, + "step": 17056 + }, + { + "epoch": 9.620417371686408, + "grad_norm": 1.2254606485366821, + "learning_rate": 1.900733220530175e-06, + "loss": 0.7401, + "step": 17057 + }, + { + "epoch": 9.62098138747885, + "grad_norm": 1.0583642721176147, + "learning_rate": 1.8979131415679642e-06, + "loss": 0.7136, + "step": 17058 + }, + { + "epoch": 9.621545403271291, + "grad_norm": 0.9080269932746887, + "learning_rate": 1.8950930626057532e-06, + "loss": 0.7671, + "step": 17059 + }, + { + "epoch": 9.622109419063733, + "grad_norm": 0.8295275568962097, + "learning_rate": 1.8922729836435422e-06, + "loss": 0.688, + "step": 17060 + }, + { + "epoch": 9.622673434856177, + "grad_norm": 1.0400865077972412, + "learning_rate": 1.8894529046813312e-06, + "loss": 0.7178, + "step": 17061 + }, + { + "epoch": 9.623237450648618, + "grad_norm": 1.2049628496170044, + "learning_rate": 1.8866328257191202e-06, + "loss": 0.7023, + "step": 17062 + }, + { + "epoch": 9.62380146644106, + "grad_norm": 0.9880895018577576, + "learning_rate": 1.8838127467569092e-06, + "loss": 0.751, + "step": 17063 + }, + { + "epoch": 9.624365482233502, + "grad_norm": 1.2704379558563232, + "learning_rate": 1.8809926677946982e-06, + "loss": 0.7224, + "step": 17064 + }, + { + "epoch": 9.624929498025946, + "grad_norm": 1.0853028297424316, + "learning_rate": 1.8781725888324877e-06, + "loss": 0.6553, + "step": 17065 + }, + { + "epoch": 9.625493513818387, + "grad_norm": 1.2867422103881836, + "learning_rate": 1.8753525098702767e-06, + "loss": 0.7477, + "step": 17066 + }, + { + "epoch": 9.62605752961083, + "grad_norm": 0.9018807411193848, + "learning_rate": 1.8725324309080657e-06, + "loss": 0.6914, + "step": 17067 + }, + { + "epoch": 9.626621545403271, + "grad_norm": 1.508981466293335, + "learning_rate": 1.8697123519458547e-06, + "loss": 0.7526, + "step": 17068 + }, + { + "epoch": 9.627185561195713, + "grad_norm": 0.9818080067634583, + "learning_rate": 1.8668922729836437e-06, + "loss": 0.6675, + "step": 17069 + }, + { + "epoch": 9.627749576988156, + "grad_norm": 1.1313999891281128, + "learning_rate": 1.8640721940214327e-06, + "loss": 0.7755, + "step": 17070 + }, + { + "epoch": 9.628313592780598, + "grad_norm": 0.8729102611541748, + "learning_rate": 1.8612521150592217e-06, + "loss": 0.659, + "step": 17071 + }, + { + "epoch": 9.62887760857304, + "grad_norm": 1.126357078552246, + "learning_rate": 1.8584320360970107e-06, + "loss": 0.8304, + "step": 17072 + }, + { + "epoch": 9.629441624365482, + "grad_norm": 0.9092158675193787, + "learning_rate": 1.8556119571347997e-06, + "loss": 0.6551, + "step": 17073 + }, + { + "epoch": 9.630005640157924, + "grad_norm": 1.029772400856018, + "learning_rate": 1.8527918781725891e-06, + "loss": 0.6989, + "step": 17074 + }, + { + "epoch": 9.630569655950367, + "grad_norm": 1.194417953491211, + "learning_rate": 1.8499717992103782e-06, + "loss": 0.7333, + "step": 17075 + }, + { + "epoch": 9.631133671742809, + "grad_norm": 1.0666968822479248, + "learning_rate": 1.8471517202481672e-06, + "loss": 0.7473, + "step": 17076 + }, + { + "epoch": 9.631697687535251, + "grad_norm": 1.579750895500183, + "learning_rate": 1.8443316412859562e-06, + "loss": 0.7657, + "step": 17077 + }, + { + "epoch": 9.632261703327693, + "grad_norm": 1.121142864227295, + "learning_rate": 1.8415115623237452e-06, + "loss": 0.6613, + "step": 17078 + }, + { + "epoch": 9.632825719120135, + "grad_norm": 1.5177606344223022, + "learning_rate": 1.8386914833615342e-06, + "loss": 0.8282, + "step": 17079 + }, + { + "epoch": 9.633389734912578, + "grad_norm": 1.4770838022232056, + "learning_rate": 1.8358714043993232e-06, + "loss": 0.7291, + "step": 17080 + }, + { + "epoch": 9.63395375070502, + "grad_norm": 1.0142195224761963, + "learning_rate": 1.8330513254371122e-06, + "loss": 0.7463, + "step": 17081 + }, + { + "epoch": 9.634517766497462, + "grad_norm": 1.1499226093292236, + "learning_rate": 1.8302312464749012e-06, + "loss": 0.7826, + "step": 17082 + }, + { + "epoch": 9.635081782289904, + "grad_norm": 0.9645760655403137, + "learning_rate": 1.8274111675126906e-06, + "loss": 0.6168, + "step": 17083 + }, + { + "epoch": 9.635645798082347, + "grad_norm": 1.1212290525436401, + "learning_rate": 1.8245910885504797e-06, + "loss": 0.7593, + "step": 17084 + }, + { + "epoch": 9.636209813874789, + "grad_norm": 1.1744403839111328, + "learning_rate": 1.8217710095882687e-06, + "loss": 0.6869, + "step": 17085 + }, + { + "epoch": 9.63677382966723, + "grad_norm": 1.440670132637024, + "learning_rate": 1.8189509306260577e-06, + "loss": 0.781, + "step": 17086 + }, + { + "epoch": 9.637337845459673, + "grad_norm": 1.0382839441299438, + "learning_rate": 1.8161308516638467e-06, + "loss": 0.6411, + "step": 17087 + }, + { + "epoch": 9.637901861252114, + "grad_norm": 0.9696965217590332, + "learning_rate": 1.8133107727016357e-06, + "loss": 0.6573, + "step": 17088 + }, + { + "epoch": 9.638465877044558, + "grad_norm": 1.0013011693954468, + "learning_rate": 1.8104906937394247e-06, + "loss": 0.6925, + "step": 17089 + }, + { + "epoch": 9.639029892837, + "grad_norm": 1.1389713287353516, + "learning_rate": 1.8076706147772137e-06, + "loss": 0.7756, + "step": 17090 + }, + { + "epoch": 9.639593908629442, + "grad_norm": 0.9189877510070801, + "learning_rate": 1.8048505358150031e-06, + "loss": 0.5506, + "step": 17091 + }, + { + "epoch": 9.640157924421883, + "grad_norm": 1.1271659135818481, + "learning_rate": 1.8020304568527921e-06, + "loss": 0.6668, + "step": 17092 + }, + { + "epoch": 9.640721940214327, + "grad_norm": 1.3582489490509033, + "learning_rate": 1.7992103778905812e-06, + "loss": 0.6973, + "step": 17093 + }, + { + "epoch": 9.641285956006769, + "grad_norm": 1.2752244472503662, + "learning_rate": 1.7963902989283702e-06, + "loss": 0.6344, + "step": 17094 + }, + { + "epoch": 9.64184997179921, + "grad_norm": 0.9786638617515564, + "learning_rate": 1.7935702199661592e-06, + "loss": 0.6995, + "step": 17095 + }, + { + "epoch": 9.642413987591652, + "grad_norm": 1.50727117061615, + "learning_rate": 1.7907501410039482e-06, + "loss": 0.709, + "step": 17096 + }, + { + "epoch": 9.642978003384094, + "grad_norm": 1.2290369272232056, + "learning_rate": 1.7879300620417372e-06, + "loss": 0.8076, + "step": 17097 + }, + { + "epoch": 9.643542019176538, + "grad_norm": 1.0045329332351685, + "learning_rate": 1.7851099830795262e-06, + "loss": 0.7517, + "step": 17098 + }, + { + "epoch": 9.64410603496898, + "grad_norm": 1.4159257411956787, + "learning_rate": 1.7822899041173152e-06, + "loss": 0.7411, + "step": 17099 + }, + { + "epoch": 9.644670050761421, + "grad_norm": 2.433058023452759, + "learning_rate": 1.7794698251551046e-06, + "loss": 0.7047, + "step": 17100 + }, + { + "epoch": 9.645234066553863, + "grad_norm": 1.1372871398925781, + "learning_rate": 1.7766497461928936e-06, + "loss": 0.7357, + "step": 17101 + }, + { + "epoch": 9.645798082346305, + "grad_norm": 1.1561036109924316, + "learning_rate": 1.7738296672306827e-06, + "loss": 0.6688, + "step": 17102 + }, + { + "epoch": 9.646362098138749, + "grad_norm": 1.3122326135635376, + "learning_rate": 1.7710095882684717e-06, + "loss": 0.8465, + "step": 17103 + }, + { + "epoch": 9.64692611393119, + "grad_norm": 1.3750468492507935, + "learning_rate": 1.7681895093062607e-06, + "loss": 0.7745, + "step": 17104 + }, + { + "epoch": 9.647490129723632, + "grad_norm": 1.2628233432769775, + "learning_rate": 1.7653694303440497e-06, + "loss": 0.7018, + "step": 17105 + }, + { + "epoch": 9.648054145516074, + "grad_norm": 1.0881608724594116, + "learning_rate": 1.7625493513818387e-06, + "loss": 0.7069, + "step": 17106 + }, + { + "epoch": 9.648618161308516, + "grad_norm": 0.9805597066879272, + "learning_rate": 1.7597292724196277e-06, + "loss": 0.7971, + "step": 17107 + }, + { + "epoch": 9.64918217710096, + "grad_norm": 0.9591033458709717, + "learning_rate": 1.7569091934574167e-06, + "loss": 0.6965, + "step": 17108 + }, + { + "epoch": 9.649746192893401, + "grad_norm": 1.2409782409667969, + "learning_rate": 1.7540891144952061e-06, + "loss": 0.7323, + "step": 17109 + }, + { + "epoch": 9.650310208685843, + "grad_norm": 1.205665111541748, + "learning_rate": 1.7512690355329951e-06, + "loss": 0.678, + "step": 17110 + }, + { + "epoch": 9.650874224478285, + "grad_norm": 0.8867470622062683, + "learning_rate": 1.7484489565707842e-06, + "loss": 0.7033, + "step": 17111 + }, + { + "epoch": 9.651438240270728, + "grad_norm": 1.677999496459961, + "learning_rate": 1.7456288776085732e-06, + "loss": 0.8077, + "step": 17112 + }, + { + "epoch": 9.65200225606317, + "grad_norm": 1.339018702507019, + "learning_rate": 1.7428087986463622e-06, + "loss": 0.7452, + "step": 17113 + }, + { + "epoch": 9.652566271855612, + "grad_norm": 1.4058743715286255, + "learning_rate": 1.7399887196841512e-06, + "loss": 0.7673, + "step": 17114 + }, + { + "epoch": 9.653130287648054, + "grad_norm": 0.9827184081077576, + "learning_rate": 1.7371686407219402e-06, + "loss": 0.6543, + "step": 17115 + }, + { + "epoch": 9.653694303440496, + "grad_norm": 1.0264630317687988, + "learning_rate": 1.7343485617597292e-06, + "loss": 0.7739, + "step": 17116 + }, + { + "epoch": 9.65425831923294, + "grad_norm": 1.4346110820770264, + "learning_rate": 1.7315284827975182e-06, + "loss": 0.868, + "step": 17117 + }, + { + "epoch": 9.654822335025381, + "grad_norm": 1.3915427923202515, + "learning_rate": 1.7287084038353076e-06, + "loss": 0.6911, + "step": 17118 + }, + { + "epoch": 9.655386350817823, + "grad_norm": 0.9284894466400146, + "learning_rate": 1.7258883248730966e-06, + "loss": 0.7639, + "step": 17119 + }, + { + "epoch": 9.655950366610265, + "grad_norm": 1.2438420057296753, + "learning_rate": 1.7230682459108856e-06, + "loss": 0.7613, + "step": 17120 + }, + { + "epoch": 9.656514382402708, + "grad_norm": 1.4405137300491333, + "learning_rate": 1.7202481669486747e-06, + "loss": 0.7578, + "step": 17121 + }, + { + "epoch": 9.65707839819515, + "grad_norm": 1.1148227453231812, + "learning_rate": 1.7174280879864637e-06, + "loss": 0.725, + "step": 17122 + }, + { + "epoch": 9.657642413987592, + "grad_norm": 0.8830407857894897, + "learning_rate": 1.7146080090242527e-06, + "loss": 0.7189, + "step": 17123 + }, + { + "epoch": 9.658206429780034, + "grad_norm": 1.740195631980896, + "learning_rate": 1.7117879300620417e-06, + "loss": 0.7511, + "step": 17124 + }, + { + "epoch": 9.658770445572475, + "grad_norm": 1.0396708250045776, + "learning_rate": 1.7089678510998307e-06, + "loss": 0.8399, + "step": 17125 + }, + { + "epoch": 9.659334461364919, + "grad_norm": 1.2593684196472168, + "learning_rate": 1.7061477721376197e-06, + "loss": 0.6619, + "step": 17126 + }, + { + "epoch": 9.65989847715736, + "grad_norm": 1.3949205875396729, + "learning_rate": 1.7033276931754091e-06, + "loss": 0.7994, + "step": 17127 + }, + { + "epoch": 9.660462492949803, + "grad_norm": 1.143883466720581, + "learning_rate": 1.7005076142131981e-06, + "loss": 0.6363, + "step": 17128 + }, + { + "epoch": 9.661026508742244, + "grad_norm": 0.9947854280471802, + "learning_rate": 1.6976875352509871e-06, + "loss": 0.6801, + "step": 17129 + }, + { + "epoch": 9.661590524534686, + "grad_norm": 0.9202463030815125, + "learning_rate": 1.6948674562887762e-06, + "loss": 0.7381, + "step": 17130 + }, + { + "epoch": 9.66215454032713, + "grad_norm": 1.104881763458252, + "learning_rate": 1.6920473773265652e-06, + "loss": 0.7845, + "step": 17131 + }, + { + "epoch": 9.662718556119572, + "grad_norm": 0.9220927357673645, + "learning_rate": 1.6892272983643542e-06, + "loss": 0.7031, + "step": 17132 + }, + { + "epoch": 9.663282571912013, + "grad_norm": 1.2769761085510254, + "learning_rate": 1.6864072194021432e-06, + "loss": 0.7237, + "step": 17133 + }, + { + "epoch": 9.663846587704455, + "grad_norm": 1.2350331544876099, + "learning_rate": 1.6835871404399322e-06, + "loss": 0.6588, + "step": 17134 + }, + { + "epoch": 9.664410603496897, + "grad_norm": 0.8751922249794006, + "learning_rate": 1.6807670614777216e-06, + "loss": 0.6516, + "step": 17135 + }, + { + "epoch": 9.66497461928934, + "grad_norm": 1.1992093324661255, + "learning_rate": 1.6779469825155106e-06, + "loss": 0.7602, + "step": 17136 + }, + { + "epoch": 9.665538635081782, + "grad_norm": 1.129764199256897, + "learning_rate": 1.6751269035532996e-06, + "loss": 0.7912, + "step": 17137 + }, + { + "epoch": 9.666102650874224, + "grad_norm": 1.314218521118164, + "learning_rate": 1.6723068245910886e-06, + "loss": 0.7285, + "step": 17138 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 0.8398863077163696, + "learning_rate": 1.6694867456288777e-06, + "loss": 0.7088, + "step": 17139 + }, + { + "epoch": 9.66723068245911, + "grad_norm": 1.376514196395874, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.7315, + "step": 17140 + }, + { + "epoch": 9.667794698251551, + "grad_norm": 1.1191059350967407, + "learning_rate": 1.6638465877044557e-06, + "loss": 0.7544, + "step": 17141 + }, + { + "epoch": 9.668358714043993, + "grad_norm": 1.1972191333770752, + "learning_rate": 1.6610265087422447e-06, + "loss": 0.6829, + "step": 17142 + }, + { + "epoch": 9.668922729836435, + "grad_norm": 1.1529672145843506, + "learning_rate": 1.6582064297800337e-06, + "loss": 0.6575, + "step": 17143 + }, + { + "epoch": 9.669486745628877, + "grad_norm": 1.1024034023284912, + "learning_rate": 1.6553863508178231e-06, + "loss": 0.7629, + "step": 17144 + }, + { + "epoch": 9.67005076142132, + "grad_norm": 1.22067129611969, + "learning_rate": 1.6525662718556121e-06, + "loss": 0.8018, + "step": 17145 + }, + { + "epoch": 9.670614777213762, + "grad_norm": 1.4046828746795654, + "learning_rate": 1.6497461928934011e-06, + "loss": 0.8157, + "step": 17146 + }, + { + "epoch": 9.671178793006204, + "grad_norm": 1.2571109533309937, + "learning_rate": 1.6469261139311901e-06, + "loss": 0.6622, + "step": 17147 + }, + { + "epoch": 9.671742808798646, + "grad_norm": 1.267261028289795, + "learning_rate": 1.6441060349689792e-06, + "loss": 0.7393, + "step": 17148 + }, + { + "epoch": 9.67230682459109, + "grad_norm": 0.9140281677246094, + "learning_rate": 1.6412859560067682e-06, + "loss": 0.7257, + "step": 17149 + }, + { + "epoch": 9.672870840383531, + "grad_norm": 1.5856585502624512, + "learning_rate": 1.6384658770445572e-06, + "loss": 0.8462, + "step": 17150 + }, + { + "epoch": 9.673434856175973, + "grad_norm": 1.1370375156402588, + "learning_rate": 1.6356457980823462e-06, + "loss": 0.6425, + "step": 17151 + }, + { + "epoch": 9.673998871968415, + "grad_norm": 0.9969896078109741, + "learning_rate": 1.6328257191201354e-06, + "loss": 0.7994, + "step": 17152 + }, + { + "epoch": 9.674562887760857, + "grad_norm": 1.3180311918258667, + "learning_rate": 1.6300056401579246e-06, + "loss": 0.8658, + "step": 17153 + }, + { + "epoch": 9.6751269035533, + "grad_norm": 1.1209052801132202, + "learning_rate": 1.6271855611957136e-06, + "loss": 0.7052, + "step": 17154 + }, + { + "epoch": 9.675690919345742, + "grad_norm": 1.0340520143508911, + "learning_rate": 1.6243654822335026e-06, + "loss": 0.6912, + "step": 17155 + }, + { + "epoch": 9.676254935138184, + "grad_norm": 1.1623409986495972, + "learning_rate": 1.6215454032712916e-06, + "loss": 0.7088, + "step": 17156 + }, + { + "epoch": 9.676818950930626, + "grad_norm": 0.8661175966262817, + "learning_rate": 1.6187253243090806e-06, + "loss": 0.6784, + "step": 17157 + }, + { + "epoch": 9.677382966723068, + "grad_norm": 1.2032603025436401, + "learning_rate": 1.6159052453468697e-06, + "loss": 0.7146, + "step": 17158 + }, + { + "epoch": 9.677946982515511, + "grad_norm": 1.1830025911331177, + "learning_rate": 1.6130851663846587e-06, + "loss": 0.7798, + "step": 17159 + }, + { + "epoch": 9.678510998307953, + "grad_norm": 1.2027077674865723, + "learning_rate": 1.6102650874224479e-06, + "loss": 0.7069, + "step": 17160 + }, + { + "epoch": 9.679075014100395, + "grad_norm": 1.2893528938293457, + "learning_rate": 1.607445008460237e-06, + "loss": 0.5798, + "step": 17161 + }, + { + "epoch": 9.679639029892837, + "grad_norm": 1.0123878717422485, + "learning_rate": 1.6046249294980261e-06, + "loss": 0.7094, + "step": 17162 + }, + { + "epoch": 9.680203045685278, + "grad_norm": 1.3202497959136963, + "learning_rate": 1.6018048505358151e-06, + "loss": 0.8096, + "step": 17163 + }, + { + "epoch": 9.680767061477722, + "grad_norm": 1.719221591949463, + "learning_rate": 1.5989847715736041e-06, + "loss": 0.759, + "step": 17164 + }, + { + "epoch": 9.681331077270164, + "grad_norm": 1.045297384262085, + "learning_rate": 1.5961646926113931e-06, + "loss": 0.7943, + "step": 17165 + }, + { + "epoch": 9.681895093062606, + "grad_norm": 0.983683168888092, + "learning_rate": 1.5933446136491821e-06, + "loss": 0.7225, + "step": 17166 + }, + { + "epoch": 9.682459108855047, + "grad_norm": 1.5494239330291748, + "learning_rate": 1.5905245346869714e-06, + "loss": 0.7373, + "step": 17167 + }, + { + "epoch": 9.683023124647491, + "grad_norm": 0.9419671893119812, + "learning_rate": 1.5877044557247604e-06, + "loss": 0.7301, + "step": 17168 + }, + { + "epoch": 9.683587140439933, + "grad_norm": 1.3261141777038574, + "learning_rate": 1.5848843767625494e-06, + "loss": 0.8715, + "step": 17169 + }, + { + "epoch": 9.684151156232375, + "grad_norm": 1.0616248846054077, + "learning_rate": 1.5820642978003384e-06, + "loss": 0.6286, + "step": 17170 + }, + { + "epoch": 9.684715172024816, + "grad_norm": 1.025311827659607, + "learning_rate": 1.5792442188381276e-06, + "loss": 0.7019, + "step": 17171 + }, + { + "epoch": 9.685279187817258, + "grad_norm": 1.077523946762085, + "learning_rate": 1.5764241398759166e-06, + "loss": 0.5493, + "step": 17172 + }, + { + "epoch": 9.685843203609702, + "grad_norm": 1.01401948928833, + "learning_rate": 1.5736040609137056e-06, + "loss": 0.6774, + "step": 17173 + }, + { + "epoch": 9.686407219402144, + "grad_norm": 1.598657250404358, + "learning_rate": 1.5707839819514948e-06, + "loss": 0.8922, + "step": 17174 + }, + { + "epoch": 9.686971235194585, + "grad_norm": 1.02587890625, + "learning_rate": 1.5679639029892839e-06, + "loss": 0.6248, + "step": 17175 + }, + { + "epoch": 9.687535250987027, + "grad_norm": 1.1166462898254395, + "learning_rate": 1.5651438240270729e-06, + "loss": 0.6464, + "step": 17176 + }, + { + "epoch": 9.68809926677947, + "grad_norm": 1.2859504222869873, + "learning_rate": 1.5623237450648619e-06, + "loss": 0.7198, + "step": 17177 + }, + { + "epoch": 9.688663282571913, + "grad_norm": 1.0150212049484253, + "learning_rate": 1.559503666102651e-06, + "loss": 0.7012, + "step": 17178 + }, + { + "epoch": 9.689227298364354, + "grad_norm": 1.333328127861023, + "learning_rate": 1.55668358714044e-06, + "loss": 0.8091, + "step": 17179 + }, + { + "epoch": 9.689791314156796, + "grad_norm": 1.0060611963272095, + "learning_rate": 1.5538635081782291e-06, + "loss": 0.7821, + "step": 17180 + }, + { + "epoch": 9.690355329949238, + "grad_norm": 1.1167762279510498, + "learning_rate": 1.5510434292160181e-06, + "loss": 0.7794, + "step": 17181 + }, + { + "epoch": 9.690919345741682, + "grad_norm": 1.485939621925354, + "learning_rate": 1.5482233502538073e-06, + "loss": 0.7562, + "step": 17182 + }, + { + "epoch": 9.691483361534123, + "grad_norm": 0.9476321339607239, + "learning_rate": 1.5454032712915963e-06, + "loss": 0.6313, + "step": 17183 + }, + { + "epoch": 9.692047377326565, + "grad_norm": 0.9689860939979553, + "learning_rate": 1.5425831923293854e-06, + "loss": 0.7831, + "step": 17184 + }, + { + "epoch": 9.692611393119007, + "grad_norm": 1.2421956062316895, + "learning_rate": 1.5397631133671744e-06, + "loss": 0.7466, + "step": 17185 + }, + { + "epoch": 9.693175408911449, + "grad_norm": 1.3224892616271973, + "learning_rate": 1.5369430344049636e-06, + "loss": 0.6964, + "step": 17186 + }, + { + "epoch": 9.693739424703892, + "grad_norm": 1.2993650436401367, + "learning_rate": 1.5341229554427526e-06, + "loss": 0.7526, + "step": 17187 + }, + { + "epoch": 9.694303440496334, + "grad_norm": 1.2408117055892944, + "learning_rate": 1.5313028764805416e-06, + "loss": 0.819, + "step": 17188 + }, + { + "epoch": 9.694867456288776, + "grad_norm": 1.340201497077942, + "learning_rate": 1.5284827975183306e-06, + "loss": 0.7615, + "step": 17189 + }, + { + "epoch": 9.695431472081218, + "grad_norm": 1.1550819873809814, + "learning_rate": 1.5256627185561196e-06, + "loss": 0.7613, + "step": 17190 + }, + { + "epoch": 9.69599548787366, + "grad_norm": 0.9010380506515503, + "learning_rate": 1.5228426395939088e-06, + "loss": 0.6472, + "step": 17191 + }, + { + "epoch": 9.696559503666103, + "grad_norm": 1.0283522605895996, + "learning_rate": 1.5200225606316978e-06, + "loss": 0.7782, + "step": 17192 + }, + { + "epoch": 9.697123519458545, + "grad_norm": 1.113653302192688, + "learning_rate": 1.5172024816694869e-06, + "loss": 0.8135, + "step": 17193 + }, + { + "epoch": 9.697687535250987, + "grad_norm": 1.0388797521591187, + "learning_rate": 1.5143824027072759e-06, + "loss": 0.6676, + "step": 17194 + }, + { + "epoch": 9.698251551043429, + "grad_norm": 0.9810493588447571, + "learning_rate": 1.511562323745065e-06, + "loss": 0.7108, + "step": 17195 + }, + { + "epoch": 9.698815566835872, + "grad_norm": 1.5308868885040283, + "learning_rate": 1.508742244782854e-06, + "loss": 0.7844, + "step": 17196 + }, + { + "epoch": 9.699379582628314, + "grad_norm": 1.303504467010498, + "learning_rate": 1.505922165820643e-06, + "loss": 0.7748, + "step": 17197 + }, + { + "epoch": 9.699943598420756, + "grad_norm": 1.2837458848953247, + "learning_rate": 1.5031020868584321e-06, + "loss": 0.7495, + "step": 17198 + }, + { + "epoch": 9.700507614213198, + "grad_norm": 0.850150465965271, + "learning_rate": 1.5002820078962211e-06, + "loss": 0.6039, + "step": 17199 + }, + { + "epoch": 9.70107163000564, + "grad_norm": 0.905622124671936, + "learning_rate": 1.4974619289340103e-06, + "loss": 0.6395, + "step": 17200 + }, + { + "epoch": 9.701635645798083, + "grad_norm": 1.2134531736373901, + "learning_rate": 1.4946418499717993e-06, + "loss": 0.7802, + "step": 17201 + }, + { + "epoch": 9.702199661590525, + "grad_norm": 1.022157073020935, + "learning_rate": 1.4918217710095884e-06, + "loss": 0.6726, + "step": 17202 + }, + { + "epoch": 9.702763677382967, + "grad_norm": 1.3335731029510498, + "learning_rate": 1.4890016920473774e-06, + "loss": 0.7852, + "step": 17203 + }, + { + "epoch": 9.703327693175408, + "grad_norm": 1.2937086820602417, + "learning_rate": 1.4861816130851666e-06, + "loss": 0.7455, + "step": 17204 + }, + { + "epoch": 9.703891708967852, + "grad_norm": 0.9644644260406494, + "learning_rate": 1.4833615341229556e-06, + "loss": 0.8065, + "step": 17205 + }, + { + "epoch": 9.704455724760294, + "grad_norm": 1.47747802734375, + "learning_rate": 1.4805414551607446e-06, + "loss": 0.8317, + "step": 17206 + }, + { + "epoch": 9.705019740552736, + "grad_norm": 0.9944032430648804, + "learning_rate": 1.4777213761985336e-06, + "loss": 0.6028, + "step": 17207 + }, + { + "epoch": 9.705583756345177, + "grad_norm": 1.4087148904800415, + "learning_rate": 1.4749012972363228e-06, + "loss": 0.7124, + "step": 17208 + }, + { + "epoch": 9.70614777213762, + "grad_norm": 1.6703416109085083, + "learning_rate": 1.4720812182741118e-06, + "loss": 0.8333, + "step": 17209 + }, + { + "epoch": 9.706711787930063, + "grad_norm": 1.0062326192855835, + "learning_rate": 1.4692611393119008e-06, + "loss": 0.6562, + "step": 17210 + }, + { + "epoch": 9.707275803722505, + "grad_norm": 1.1708736419677734, + "learning_rate": 1.4664410603496899e-06, + "loss": 0.7489, + "step": 17211 + }, + { + "epoch": 9.707839819514946, + "grad_norm": 1.4310880899429321, + "learning_rate": 1.4636209813874789e-06, + "loss": 0.7437, + "step": 17212 + }, + { + "epoch": 9.708403835307388, + "grad_norm": 1.1699879169464111, + "learning_rate": 1.460800902425268e-06, + "loss": 0.7746, + "step": 17213 + }, + { + "epoch": 9.70896785109983, + "grad_norm": 0.9250511527061462, + "learning_rate": 1.457980823463057e-06, + "loss": 0.722, + "step": 17214 + }, + { + "epoch": 9.709531866892274, + "grad_norm": 0.9584273099899292, + "learning_rate": 1.455160744500846e-06, + "loss": 0.7348, + "step": 17215 + }, + { + "epoch": 9.710095882684715, + "grad_norm": 1.2147116661071777, + "learning_rate": 1.452340665538635e-06, + "loss": 0.7308, + "step": 17216 + }, + { + "epoch": 9.710659898477157, + "grad_norm": 1.0682237148284912, + "learning_rate": 1.4495205865764243e-06, + "loss": 0.7324, + "step": 17217 + }, + { + "epoch": 9.711223914269599, + "grad_norm": 1.2517290115356445, + "learning_rate": 1.4467005076142133e-06, + "loss": 0.7135, + "step": 17218 + }, + { + "epoch": 9.71178793006204, + "grad_norm": 1.1373021602630615, + "learning_rate": 1.4438804286520023e-06, + "loss": 0.7102, + "step": 17219 + }, + { + "epoch": 9.712351945854484, + "grad_norm": 1.285385251045227, + "learning_rate": 1.4410603496897913e-06, + "loss": 0.7723, + "step": 17220 + }, + { + "epoch": 9.712915961646926, + "grad_norm": 1.439632773399353, + "learning_rate": 1.4382402707275804e-06, + "loss": 0.8918, + "step": 17221 + }, + { + "epoch": 9.713479977439368, + "grad_norm": 1.1035033464431763, + "learning_rate": 1.4354201917653696e-06, + "loss": 0.747, + "step": 17222 + }, + { + "epoch": 9.71404399323181, + "grad_norm": 1.2622733116149902, + "learning_rate": 1.4326001128031586e-06, + "loss": 0.7919, + "step": 17223 + }, + { + "epoch": 9.714608009024253, + "grad_norm": 1.4619414806365967, + "learning_rate": 1.4297800338409476e-06, + "loss": 0.7415, + "step": 17224 + }, + { + "epoch": 9.715172024816695, + "grad_norm": 1.0748968124389648, + "learning_rate": 1.4269599548787366e-06, + "loss": 0.744, + "step": 17225 + }, + { + "epoch": 9.715736040609137, + "grad_norm": 1.0149348974227905, + "learning_rate": 1.4241398759165258e-06, + "loss": 0.6621, + "step": 17226 + }, + { + "epoch": 9.716300056401579, + "grad_norm": 1.307675838470459, + "learning_rate": 1.4213197969543148e-06, + "loss": 0.8005, + "step": 17227 + }, + { + "epoch": 9.71686407219402, + "grad_norm": 1.189125418663025, + "learning_rate": 1.4184997179921038e-06, + "loss": 0.7722, + "step": 17228 + }, + { + "epoch": 9.717428087986464, + "grad_norm": 0.8022294640541077, + "learning_rate": 1.4156796390298928e-06, + "loss": 0.6741, + "step": 17229 + }, + { + "epoch": 9.717992103778906, + "grad_norm": 0.924187183380127, + "learning_rate": 1.412859560067682e-06, + "loss": 0.7163, + "step": 17230 + }, + { + "epoch": 9.718556119571348, + "grad_norm": 1.091426968574524, + "learning_rate": 1.410039481105471e-06, + "loss": 0.6801, + "step": 17231 + }, + { + "epoch": 9.71912013536379, + "grad_norm": 1.1142044067382812, + "learning_rate": 1.40721940214326e-06, + "loss": 0.7709, + "step": 17232 + }, + { + "epoch": 9.719684151156233, + "grad_norm": 1.195451021194458, + "learning_rate": 1.404399323181049e-06, + "loss": 0.7837, + "step": 17233 + }, + { + "epoch": 9.720248166948675, + "grad_norm": 1.1285064220428467, + "learning_rate": 1.401579244218838e-06, + "loss": 0.762, + "step": 17234 + }, + { + "epoch": 9.720812182741117, + "grad_norm": 0.9914716482162476, + "learning_rate": 1.3987591652566273e-06, + "loss": 0.7902, + "step": 17235 + }, + { + "epoch": 9.721376198533559, + "grad_norm": 0.851546049118042, + "learning_rate": 1.3959390862944163e-06, + "loss": 0.6674, + "step": 17236 + }, + { + "epoch": 9.721940214326, + "grad_norm": 1.0482652187347412, + "learning_rate": 1.3931190073322053e-06, + "loss": 0.6805, + "step": 17237 + }, + { + "epoch": 9.722504230118444, + "grad_norm": 1.221415638923645, + "learning_rate": 1.3902989283699943e-06, + "loss": 0.7672, + "step": 17238 + }, + { + "epoch": 9.723068245910886, + "grad_norm": 1.244498610496521, + "learning_rate": 1.3874788494077836e-06, + "loss": 0.7726, + "step": 17239 + }, + { + "epoch": 9.723632261703328, + "grad_norm": 1.1077330112457275, + "learning_rate": 1.3846587704455726e-06, + "loss": 0.8378, + "step": 17240 + }, + { + "epoch": 9.72419627749577, + "grad_norm": 0.9514763951301575, + "learning_rate": 1.3818386914833616e-06, + "loss": 0.6954, + "step": 17241 + }, + { + "epoch": 9.724760293288211, + "grad_norm": 1.0221325159072876, + "learning_rate": 1.3790186125211506e-06, + "loss": 0.736, + "step": 17242 + }, + { + "epoch": 9.725324309080655, + "grad_norm": 0.9290971755981445, + "learning_rate": 1.3761985335589396e-06, + "loss": 0.6767, + "step": 17243 + }, + { + "epoch": 9.725888324873097, + "grad_norm": 1.2354673147201538, + "learning_rate": 1.3733784545967288e-06, + "loss": 0.7225, + "step": 17244 + }, + { + "epoch": 9.726452340665539, + "grad_norm": 1.036234974861145, + "learning_rate": 1.3705583756345178e-06, + "loss": 0.7527, + "step": 17245 + }, + { + "epoch": 9.72701635645798, + "grad_norm": 0.9929497838020325, + "learning_rate": 1.3677382966723068e-06, + "loss": 0.7266, + "step": 17246 + }, + { + "epoch": 9.727580372250422, + "grad_norm": 1.468697428703308, + "learning_rate": 1.3649182177100958e-06, + "loss": 0.8066, + "step": 17247 + }, + { + "epoch": 9.728144388042866, + "grad_norm": 1.397308111190796, + "learning_rate": 1.362098138747885e-06, + "loss": 0.7023, + "step": 17248 + }, + { + "epoch": 9.728708403835308, + "grad_norm": 0.8670526742935181, + "learning_rate": 1.359278059785674e-06, + "loss": 0.6779, + "step": 17249 + }, + { + "epoch": 9.72927241962775, + "grad_norm": 1.079200029373169, + "learning_rate": 1.356457980823463e-06, + "loss": 0.6929, + "step": 17250 + }, + { + "epoch": 9.729836435420191, + "grad_norm": 0.9117211103439331, + "learning_rate": 1.353637901861252e-06, + "loss": 0.7273, + "step": 17251 + }, + { + "epoch": 9.730400451212635, + "grad_norm": 1.3066375255584717, + "learning_rate": 1.3508178228990413e-06, + "loss": 0.7845, + "step": 17252 + }, + { + "epoch": 9.730964467005077, + "grad_norm": 0.9291660189628601, + "learning_rate": 1.3479977439368303e-06, + "loss": 0.6471, + "step": 17253 + }, + { + "epoch": 9.731528482797518, + "grad_norm": 1.1582953929901123, + "learning_rate": 1.3451776649746193e-06, + "loss": 0.7688, + "step": 17254 + }, + { + "epoch": 9.73209249858996, + "grad_norm": 1.1587495803833008, + "learning_rate": 1.3423575860124083e-06, + "loss": 0.7574, + "step": 17255 + }, + { + "epoch": 9.732656514382402, + "grad_norm": 1.0506361722946167, + "learning_rate": 1.3395375070501973e-06, + "loss": 0.7199, + "step": 17256 + }, + { + "epoch": 9.733220530174846, + "grad_norm": 1.4637556076049805, + "learning_rate": 1.3367174280879866e-06, + "loss": 0.8667, + "step": 17257 + }, + { + "epoch": 9.733784545967287, + "grad_norm": 1.0930113792419434, + "learning_rate": 1.3338973491257756e-06, + "loss": 0.7563, + "step": 17258 + }, + { + "epoch": 9.73434856175973, + "grad_norm": 0.9572893977165222, + "learning_rate": 1.3310772701635646e-06, + "loss": 0.6156, + "step": 17259 + }, + { + "epoch": 9.734912577552171, + "grad_norm": 1.068151831626892, + "learning_rate": 1.3282571912013536e-06, + "loss": 0.7462, + "step": 17260 + }, + { + "epoch": 9.735476593344615, + "grad_norm": 1.3485878705978394, + "learning_rate": 1.3254371122391428e-06, + "loss": 0.7213, + "step": 17261 + }, + { + "epoch": 9.736040609137056, + "grad_norm": 1.0987964868545532, + "learning_rate": 1.3226170332769318e-06, + "loss": 0.7706, + "step": 17262 + }, + { + "epoch": 9.736604624929498, + "grad_norm": 2.3893442153930664, + "learning_rate": 1.3197969543147208e-06, + "loss": 0.7704, + "step": 17263 + }, + { + "epoch": 9.73716864072194, + "grad_norm": 1.1546530723571777, + "learning_rate": 1.3169768753525098e-06, + "loss": 0.7307, + "step": 17264 + }, + { + "epoch": 9.737732656514382, + "grad_norm": 0.9800902605056763, + "learning_rate": 1.3141567963902988e-06, + "loss": 0.6681, + "step": 17265 + }, + { + "epoch": 9.738296672306825, + "grad_norm": 1.155168056488037, + "learning_rate": 1.311336717428088e-06, + "loss": 0.8084, + "step": 17266 + }, + { + "epoch": 9.738860688099267, + "grad_norm": 1.4173461198806763, + "learning_rate": 1.308516638465877e-06, + "loss": 0.7178, + "step": 17267 + }, + { + "epoch": 9.739424703891709, + "grad_norm": 1.0746508836746216, + "learning_rate": 1.305696559503666e-06, + "loss": 0.6128, + "step": 17268 + }, + { + "epoch": 9.73998871968415, + "grad_norm": 1.1461920738220215, + "learning_rate": 1.302876480541455e-06, + "loss": 0.7496, + "step": 17269 + }, + { + "epoch": 9.740552735476593, + "grad_norm": 1.000115156173706, + "learning_rate": 1.3000564015792443e-06, + "loss": 0.6496, + "step": 17270 + }, + { + "epoch": 9.741116751269036, + "grad_norm": 1.0807297229766846, + "learning_rate": 1.2972363226170333e-06, + "loss": 0.6401, + "step": 17271 + }, + { + "epoch": 9.741680767061478, + "grad_norm": 1.1232620477676392, + "learning_rate": 1.2944162436548223e-06, + "loss": 0.7257, + "step": 17272 + }, + { + "epoch": 9.74224478285392, + "grad_norm": 1.2450659275054932, + "learning_rate": 1.2915961646926113e-06, + "loss": 0.7429, + "step": 17273 + }, + { + "epoch": 9.742808798646362, + "grad_norm": 1.1745219230651855, + "learning_rate": 1.2887760857304003e-06, + "loss": 0.7586, + "step": 17274 + }, + { + "epoch": 9.743372814438803, + "grad_norm": 1.1075317859649658, + "learning_rate": 1.2859560067681896e-06, + "loss": 0.6903, + "step": 17275 + }, + { + "epoch": 9.743936830231247, + "grad_norm": 1.3692890405654907, + "learning_rate": 1.2831359278059786e-06, + "loss": 0.6922, + "step": 17276 + }, + { + "epoch": 9.744500846023689, + "grad_norm": 0.978813111782074, + "learning_rate": 1.2803158488437676e-06, + "loss": 0.6674, + "step": 17277 + }, + { + "epoch": 9.74506486181613, + "grad_norm": 1.0557259321212769, + "learning_rate": 1.2774957698815566e-06, + "loss": 0.6508, + "step": 17278 + }, + { + "epoch": 9.745628877608572, + "grad_norm": 1.4715486764907837, + "learning_rate": 1.2746756909193458e-06, + "loss": 0.731, + "step": 17279 + }, + { + "epoch": 9.746192893401016, + "grad_norm": 1.3502227067947388, + "learning_rate": 1.2718556119571348e-06, + "loss": 0.7201, + "step": 17280 + }, + { + "epoch": 9.746756909193458, + "grad_norm": 1.034761667251587, + "learning_rate": 1.2690355329949238e-06, + "loss": 0.6829, + "step": 17281 + }, + { + "epoch": 9.7473209249859, + "grad_norm": 1.0978426933288574, + "learning_rate": 1.2662154540327128e-06, + "loss": 0.6978, + "step": 17282 + }, + { + "epoch": 9.747884940778341, + "grad_norm": 1.0452656745910645, + "learning_rate": 1.263395375070502e-06, + "loss": 0.684, + "step": 17283 + }, + { + "epoch": 9.748448956570783, + "grad_norm": 1.302400827407837, + "learning_rate": 1.260575296108291e-06, + "loss": 0.747, + "step": 17284 + }, + { + "epoch": 9.749012972363227, + "grad_norm": 1.041455626487732, + "learning_rate": 1.25775521714608e-06, + "loss": 0.7697, + "step": 17285 + }, + { + "epoch": 9.749576988155669, + "grad_norm": 1.1948997974395752, + "learning_rate": 1.254935138183869e-06, + "loss": 0.8856, + "step": 17286 + }, + { + "epoch": 9.75014100394811, + "grad_norm": 1.53201162815094, + "learning_rate": 1.2521150592216583e-06, + "loss": 0.7441, + "step": 17287 + }, + { + "epoch": 9.750705019740552, + "grad_norm": 1.1999077796936035, + "learning_rate": 1.2492949802594473e-06, + "loss": 0.7244, + "step": 17288 + }, + { + "epoch": 9.751269035532996, + "grad_norm": 1.3878737688064575, + "learning_rate": 1.2464749012972363e-06, + "loss": 0.8187, + "step": 17289 + }, + { + "epoch": 9.751833051325438, + "grad_norm": 1.0900689363479614, + "learning_rate": 1.2436548223350253e-06, + "loss": 0.682, + "step": 17290 + }, + { + "epoch": 9.75239706711788, + "grad_norm": 1.2616161108016968, + "learning_rate": 1.2408347433728145e-06, + "loss": 0.8262, + "step": 17291 + }, + { + "epoch": 9.752961082910321, + "grad_norm": 0.7602943778038025, + "learning_rate": 1.2380146644106035e-06, + "loss": 0.6216, + "step": 17292 + }, + { + "epoch": 9.753525098702763, + "grad_norm": 1.5080382823944092, + "learning_rate": 1.2351945854483926e-06, + "loss": 0.798, + "step": 17293 + }, + { + "epoch": 9.754089114495207, + "grad_norm": 1.4849226474761963, + "learning_rate": 1.2323745064861818e-06, + "loss": 0.7631, + "step": 17294 + }, + { + "epoch": 9.754653130287648, + "grad_norm": 1.4794964790344238, + "learning_rate": 1.2295544275239708e-06, + "loss": 0.7389, + "step": 17295 + }, + { + "epoch": 9.75521714608009, + "grad_norm": 1.0420637130737305, + "learning_rate": 1.2267343485617598e-06, + "loss": 0.7481, + "step": 17296 + }, + { + "epoch": 9.755781161872532, + "grad_norm": 1.5383038520812988, + "learning_rate": 1.2239142695995488e-06, + "loss": 0.787, + "step": 17297 + }, + { + "epoch": 9.756345177664974, + "grad_norm": 1.4831299781799316, + "learning_rate": 1.221094190637338e-06, + "loss": 0.7898, + "step": 17298 + }, + { + "epoch": 9.756909193457417, + "grad_norm": 0.8274185657501221, + "learning_rate": 1.218274111675127e-06, + "loss": 0.6811, + "step": 17299 + }, + { + "epoch": 9.75747320924986, + "grad_norm": 1.1715673208236694, + "learning_rate": 1.215454032712916e-06, + "loss": 0.7323, + "step": 17300 + }, + { + "epoch": 9.758037225042301, + "grad_norm": 1.2589625120162964, + "learning_rate": 1.212633953750705e-06, + "loss": 0.8081, + "step": 17301 + }, + { + "epoch": 9.758601240834743, + "grad_norm": 1.038156509399414, + "learning_rate": 1.2098138747884943e-06, + "loss": 0.6613, + "step": 17302 + }, + { + "epoch": 9.759165256627185, + "grad_norm": 1.044348955154419, + "learning_rate": 1.2069937958262833e-06, + "loss": 0.7665, + "step": 17303 + }, + { + "epoch": 9.759729272419628, + "grad_norm": 1.0608711242675781, + "learning_rate": 1.2041737168640723e-06, + "loss": 0.6903, + "step": 17304 + }, + { + "epoch": 9.76029328821207, + "grad_norm": 0.8669581413269043, + "learning_rate": 1.2013536379018613e-06, + "loss": 0.7177, + "step": 17305 + }, + { + "epoch": 9.760857304004512, + "grad_norm": 1.3035835027694702, + "learning_rate": 1.1985335589396505e-06, + "loss": 0.7943, + "step": 17306 + }, + { + "epoch": 9.761421319796954, + "grad_norm": 1.1842483282089233, + "learning_rate": 1.1957134799774395e-06, + "loss": 0.725, + "step": 17307 + }, + { + "epoch": 9.761985335589397, + "grad_norm": 0.9909364581108093, + "learning_rate": 1.1928934010152285e-06, + "loss": 0.717, + "step": 17308 + }, + { + "epoch": 9.762549351381839, + "grad_norm": 1.6474754810333252, + "learning_rate": 1.1900733220530175e-06, + "loss": 0.8106, + "step": 17309 + }, + { + "epoch": 9.763113367174281, + "grad_norm": 1.330762267112732, + "learning_rate": 1.1872532430908068e-06, + "loss": 0.8325, + "step": 17310 + }, + { + "epoch": 9.763677382966723, + "grad_norm": 1.176673173904419, + "learning_rate": 1.1844331641285958e-06, + "loss": 0.7825, + "step": 17311 + }, + { + "epoch": 9.764241398759165, + "grad_norm": 1.2244011163711548, + "learning_rate": 1.1816130851663848e-06, + "loss": 0.7079, + "step": 17312 + }, + { + "epoch": 9.764805414551608, + "grad_norm": 1.1315367221832275, + "learning_rate": 1.1787930062041738e-06, + "loss": 0.7191, + "step": 17313 + }, + { + "epoch": 9.76536943034405, + "grad_norm": 1.3320176601409912, + "learning_rate": 1.175972927241963e-06, + "loss": 0.7324, + "step": 17314 + }, + { + "epoch": 9.765933446136492, + "grad_norm": 1.2341744899749756, + "learning_rate": 1.173152848279752e-06, + "loss": 0.7955, + "step": 17315 + }, + { + "epoch": 9.766497461928934, + "grad_norm": 1.2021784782409668, + "learning_rate": 1.170332769317541e-06, + "loss": 0.7372, + "step": 17316 + }, + { + "epoch": 9.767061477721377, + "grad_norm": 1.2911149263381958, + "learning_rate": 1.16751269035533e-06, + "loss": 0.6842, + "step": 17317 + }, + { + "epoch": 9.767625493513819, + "grad_norm": 1.3680390119552612, + "learning_rate": 1.164692611393119e-06, + "loss": 0.7906, + "step": 17318 + }, + { + "epoch": 9.76818950930626, + "grad_norm": 0.9435783624649048, + "learning_rate": 1.1618725324309083e-06, + "loss": 0.602, + "step": 17319 + }, + { + "epoch": 9.768753525098703, + "grad_norm": 0.9460337162017822, + "learning_rate": 1.1590524534686973e-06, + "loss": 0.7124, + "step": 17320 + }, + { + "epoch": 9.769317540891144, + "grad_norm": 1.2994225025177002, + "learning_rate": 1.1562323745064863e-06, + "loss": 0.7437, + "step": 17321 + }, + { + "epoch": 9.769881556683588, + "grad_norm": 0.8987550139427185, + "learning_rate": 1.1534122955442753e-06, + "loss": 0.6228, + "step": 17322 + }, + { + "epoch": 9.77044557247603, + "grad_norm": 0.8818815350532532, + "learning_rate": 1.1505922165820645e-06, + "loss": 0.5853, + "step": 17323 + }, + { + "epoch": 9.771009588268472, + "grad_norm": 0.8537310361862183, + "learning_rate": 1.1477721376198535e-06, + "loss": 0.6743, + "step": 17324 + }, + { + "epoch": 9.771573604060913, + "grad_norm": 1.2766586542129517, + "learning_rate": 1.1449520586576425e-06, + "loss": 0.8171, + "step": 17325 + }, + { + "epoch": 9.772137619853355, + "grad_norm": 0.9809885025024414, + "learning_rate": 1.1421319796954315e-06, + "loss": 0.6353, + "step": 17326 + }, + { + "epoch": 9.772701635645799, + "grad_norm": 1.5175429582595825, + "learning_rate": 1.1393119007332205e-06, + "loss": 0.8352, + "step": 17327 + }, + { + "epoch": 9.77326565143824, + "grad_norm": 1.3648600578308105, + "learning_rate": 1.1364918217710098e-06, + "loss": 0.7095, + "step": 17328 + }, + { + "epoch": 9.773829667230682, + "grad_norm": 0.914574146270752, + "learning_rate": 1.1336717428087988e-06, + "loss": 0.6639, + "step": 17329 + }, + { + "epoch": 9.774393683023124, + "grad_norm": 1.097282886505127, + "learning_rate": 1.1308516638465878e-06, + "loss": 0.7449, + "step": 17330 + }, + { + "epoch": 9.774957698815566, + "grad_norm": 0.8307239413261414, + "learning_rate": 1.1280315848843768e-06, + "loss": 0.6311, + "step": 17331 + }, + { + "epoch": 9.77552171460801, + "grad_norm": 1.2460079193115234, + "learning_rate": 1.125211505922166e-06, + "loss": 0.707, + "step": 17332 + }, + { + "epoch": 9.776085730400451, + "grad_norm": 1.2716630697250366, + "learning_rate": 1.122391426959955e-06, + "loss": 0.7749, + "step": 17333 + }, + { + "epoch": 9.776649746192893, + "grad_norm": 1.1626123189926147, + "learning_rate": 1.119571347997744e-06, + "loss": 0.6974, + "step": 17334 + }, + { + "epoch": 9.777213761985335, + "grad_norm": 0.9535446763038635, + "learning_rate": 1.116751269035533e-06, + "loss": 0.6588, + "step": 17335 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 1.2403512001037598, + "learning_rate": 1.1139311900733222e-06, + "loss": 0.7095, + "step": 17336 + }, + { + "epoch": 9.77834179357022, + "grad_norm": 1.1144944429397583, + "learning_rate": 1.1111111111111112e-06, + "loss": 0.7194, + "step": 17337 + }, + { + "epoch": 9.778905809362662, + "grad_norm": 1.0890427827835083, + "learning_rate": 1.1082910321489003e-06, + "loss": 0.5869, + "step": 17338 + }, + { + "epoch": 9.779469825155104, + "grad_norm": 1.3565242290496826, + "learning_rate": 1.1054709531866893e-06, + "loss": 0.7598, + "step": 17339 + }, + { + "epoch": 9.780033840947546, + "grad_norm": 1.047999620437622, + "learning_rate": 1.1026508742244783e-06, + "loss": 0.8034, + "step": 17340 + }, + { + "epoch": 9.78059785673999, + "grad_norm": 1.1821168661117554, + "learning_rate": 1.0998307952622675e-06, + "loss": 0.7106, + "step": 17341 + }, + { + "epoch": 9.781161872532431, + "grad_norm": 0.9858113527297974, + "learning_rate": 1.0970107163000565e-06, + "loss": 0.7798, + "step": 17342 + }, + { + "epoch": 9.781725888324873, + "grad_norm": 1.7147148847579956, + "learning_rate": 1.0941906373378455e-06, + "loss": 0.7461, + "step": 17343 + }, + { + "epoch": 9.782289904117315, + "grad_norm": 1.5834819078445435, + "learning_rate": 1.0913705583756345e-06, + "loss": 0.6308, + "step": 17344 + }, + { + "epoch": 9.782853919909758, + "grad_norm": 1.379291296005249, + "learning_rate": 1.0885504794134237e-06, + "loss": 0.791, + "step": 17345 + }, + { + "epoch": 9.7834179357022, + "grad_norm": 1.026865839958191, + "learning_rate": 1.0857304004512127e-06, + "loss": 0.8253, + "step": 17346 + }, + { + "epoch": 9.783981951494642, + "grad_norm": 1.4664804935455322, + "learning_rate": 1.0829103214890018e-06, + "loss": 0.6642, + "step": 17347 + }, + { + "epoch": 9.784545967287084, + "grad_norm": 0.8244261145591736, + "learning_rate": 1.0800902425267908e-06, + "loss": 0.634, + "step": 17348 + }, + { + "epoch": 9.785109983079526, + "grad_norm": 1.081296443939209, + "learning_rate": 1.0772701635645798e-06, + "loss": 0.7083, + "step": 17349 + }, + { + "epoch": 9.78567399887197, + "grad_norm": 1.4792320728302002, + "learning_rate": 1.074450084602369e-06, + "loss": 0.6949, + "step": 17350 + }, + { + "epoch": 9.786238014664411, + "grad_norm": 1.034199595451355, + "learning_rate": 1.071630005640158e-06, + "loss": 0.6556, + "step": 17351 + }, + { + "epoch": 9.786802030456853, + "grad_norm": 1.1446306705474854, + "learning_rate": 1.068809926677947e-06, + "loss": 0.6378, + "step": 17352 + }, + { + "epoch": 9.787366046249295, + "grad_norm": 0.9454776644706726, + "learning_rate": 1.065989847715736e-06, + "loss": 0.609, + "step": 17353 + }, + { + "epoch": 9.787930062041736, + "grad_norm": 1.337180495262146, + "learning_rate": 1.0631697687535252e-06, + "loss": 0.8539, + "step": 17354 + }, + { + "epoch": 9.78849407783418, + "grad_norm": 1.063730001449585, + "learning_rate": 1.0603496897913142e-06, + "loss": 0.6434, + "step": 17355 + }, + { + "epoch": 9.789058093626622, + "grad_norm": 1.1948448419570923, + "learning_rate": 1.0575296108291033e-06, + "loss": 0.7183, + "step": 17356 + }, + { + "epoch": 9.789622109419064, + "grad_norm": 0.9168012738227844, + "learning_rate": 1.0547095318668923e-06, + "loss": 0.6908, + "step": 17357 + }, + { + "epoch": 9.790186125211505, + "grad_norm": 1.3657022714614868, + "learning_rate": 1.0518894529046815e-06, + "loss": 0.7995, + "step": 17358 + }, + { + "epoch": 9.790750141003947, + "grad_norm": 1.1715381145477295, + "learning_rate": 1.0490693739424705e-06, + "loss": 0.8026, + "step": 17359 + }, + { + "epoch": 9.79131415679639, + "grad_norm": 1.1385910511016846, + "learning_rate": 1.0462492949802595e-06, + "loss": 0.7334, + "step": 17360 + }, + { + "epoch": 9.791878172588833, + "grad_norm": 0.9327241778373718, + "learning_rate": 1.0434292160180485e-06, + "loss": 0.74, + "step": 17361 + }, + { + "epoch": 9.792442188381274, + "grad_norm": 0.7185825109481812, + "learning_rate": 1.0406091370558375e-06, + "loss": 0.5908, + "step": 17362 + }, + { + "epoch": 9.793006204173716, + "grad_norm": 1.214795708656311, + "learning_rate": 1.0377890580936267e-06, + "loss": 0.7411, + "step": 17363 + }, + { + "epoch": 9.79357021996616, + "grad_norm": 0.715731680393219, + "learning_rate": 1.0349689791314157e-06, + "loss": 0.5886, + "step": 17364 + }, + { + "epoch": 9.794134235758602, + "grad_norm": 1.0708564519882202, + "learning_rate": 1.0321489001692048e-06, + "loss": 0.818, + "step": 17365 + }, + { + "epoch": 9.794698251551043, + "grad_norm": 1.098395824432373, + "learning_rate": 1.0293288212069938e-06, + "loss": 0.7385, + "step": 17366 + }, + { + "epoch": 9.795262267343485, + "grad_norm": 1.0778664350509644, + "learning_rate": 1.026508742244783e-06, + "loss": 0.7631, + "step": 17367 + }, + { + "epoch": 9.795826283135927, + "grad_norm": 1.246421456336975, + "learning_rate": 1.023688663282572e-06, + "loss": 0.7406, + "step": 17368 + }, + { + "epoch": 9.79639029892837, + "grad_norm": 0.9806740880012512, + "learning_rate": 1.020868584320361e-06, + "loss": 0.5858, + "step": 17369 + }, + { + "epoch": 9.796954314720812, + "grad_norm": 1.0840545892715454, + "learning_rate": 1.01804850535815e-06, + "loss": 0.7328, + "step": 17370 + }, + { + "epoch": 9.797518330513254, + "grad_norm": 1.0119153261184692, + "learning_rate": 1.015228426395939e-06, + "loss": 0.7833, + "step": 17371 + }, + { + "epoch": 9.798082346305696, + "grad_norm": 0.8254488706588745, + "learning_rate": 1.0124083474337282e-06, + "loss": 0.6588, + "step": 17372 + }, + { + "epoch": 9.79864636209814, + "grad_norm": 0.8512617945671082, + "learning_rate": 1.0095882684715172e-06, + "loss": 0.6436, + "step": 17373 + }, + { + "epoch": 9.799210377890581, + "grad_norm": 1.0385946035385132, + "learning_rate": 1.0067681895093063e-06, + "loss": 0.6588, + "step": 17374 + }, + { + "epoch": 9.799774393683023, + "grad_norm": 1.1679491996765137, + "learning_rate": 1.0039481105470953e-06, + "loss": 0.6697, + "step": 17375 + }, + { + "epoch": 9.800338409475465, + "grad_norm": 1.2895041704177856, + "learning_rate": 1.0011280315848845e-06, + "loss": 0.7186, + "step": 17376 + }, + { + "epoch": 9.800902425267907, + "grad_norm": 1.476844072341919, + "learning_rate": 9.983079526226735e-07, + "loss": 0.8501, + "step": 17377 + }, + { + "epoch": 9.80146644106035, + "grad_norm": 1.3683550357818604, + "learning_rate": 9.954878736604625e-07, + "loss": 0.6972, + "step": 17378 + }, + { + "epoch": 9.802030456852792, + "grad_norm": 0.7961894273757935, + "learning_rate": 9.926677946982515e-07, + "loss": 0.57, + "step": 17379 + }, + { + "epoch": 9.802594472645234, + "grad_norm": 1.152233600616455, + "learning_rate": 9.898477157360405e-07, + "loss": 0.7252, + "step": 17380 + }, + { + "epoch": 9.803158488437676, + "grad_norm": 7.263650417327881, + "learning_rate": 9.870276367738297e-07, + "loss": 0.8068, + "step": 17381 + }, + { + "epoch": 9.803722504230118, + "grad_norm": 1.2058957815170288, + "learning_rate": 9.842075578116187e-07, + "loss": 0.8216, + "step": 17382 + }, + { + "epoch": 9.804286520022561, + "grad_norm": 0.9708526134490967, + "learning_rate": 9.813874788494077e-07, + "loss": 0.6372, + "step": 17383 + }, + { + "epoch": 9.804850535815003, + "grad_norm": 2.2045392990112305, + "learning_rate": 9.785673998871968e-07, + "loss": 0.7212, + "step": 17384 + }, + { + "epoch": 9.805414551607445, + "grad_norm": 1.2760804891586304, + "learning_rate": 9.75747320924986e-07, + "loss": 0.7592, + "step": 17385 + }, + { + "epoch": 9.805978567399887, + "grad_norm": 1.336472511291504, + "learning_rate": 9.72927241962775e-07, + "loss": 0.7956, + "step": 17386 + }, + { + "epoch": 9.806542583192329, + "grad_norm": 1.3092082738876343, + "learning_rate": 9.70107163000564e-07, + "loss": 0.6757, + "step": 17387 + }, + { + "epoch": 9.807106598984772, + "grad_norm": 1.147050142288208, + "learning_rate": 9.67287084038353e-07, + "loss": 0.6909, + "step": 17388 + }, + { + "epoch": 9.807670614777214, + "grad_norm": 0.8570289611816406, + "learning_rate": 9.644670050761422e-07, + "loss": 0.647, + "step": 17389 + }, + { + "epoch": 9.808234630569656, + "grad_norm": 1.0272822380065918, + "learning_rate": 9.616469261139312e-07, + "loss": 0.7281, + "step": 17390 + }, + { + "epoch": 9.808798646362098, + "grad_norm": 1.4789777994155884, + "learning_rate": 9.588268471517202e-07, + "loss": 0.6667, + "step": 17391 + }, + { + "epoch": 9.809362662154541, + "grad_norm": 1.1437430381774902, + "learning_rate": 9.560067681895092e-07, + "loss": 0.7113, + "step": 17392 + }, + { + "epoch": 9.809926677946983, + "grad_norm": 1.2118257284164429, + "learning_rate": 9.531866892272984e-07, + "loss": 0.7442, + "step": 17393 + }, + { + "epoch": 9.810490693739425, + "grad_norm": 1.1499062776565552, + "learning_rate": 9.503666102650875e-07, + "loss": 0.7407, + "step": 17394 + }, + { + "epoch": 9.811054709531867, + "grad_norm": 1.0538818836212158, + "learning_rate": 9.475465313028766e-07, + "loss": 0.6314, + "step": 17395 + }, + { + "epoch": 9.811618725324308, + "grad_norm": 1.2353986501693726, + "learning_rate": 9.447264523406656e-07, + "loss": 0.7391, + "step": 17396 + }, + { + "epoch": 9.812182741116752, + "grad_norm": 1.0848474502563477, + "learning_rate": 9.419063733784546e-07, + "loss": 0.7692, + "step": 17397 + }, + { + "epoch": 9.812746756909194, + "grad_norm": 1.0704834461212158, + "learning_rate": 9.390862944162438e-07, + "loss": 0.7016, + "step": 17398 + }, + { + "epoch": 9.813310772701636, + "grad_norm": 0.9627912044525146, + "learning_rate": 9.362662154540328e-07, + "loss": 0.7401, + "step": 17399 + }, + { + "epoch": 9.813874788494077, + "grad_norm": 1.1579172611236572, + "learning_rate": 9.334461364918218e-07, + "loss": 0.7442, + "step": 17400 + }, + { + "epoch": 9.814438804286521, + "grad_norm": 1.3121116161346436, + "learning_rate": 9.306260575296109e-07, + "loss": 0.6769, + "step": 17401 + }, + { + "epoch": 9.815002820078963, + "grad_norm": 1.046138048171997, + "learning_rate": 9.278059785673999e-07, + "loss": 0.6594, + "step": 17402 + }, + { + "epoch": 9.815566835871405, + "grad_norm": 1.0308040380477905, + "learning_rate": 9.249858996051891e-07, + "loss": 0.7027, + "step": 17403 + }, + { + "epoch": 9.816130851663846, + "grad_norm": 0.9984390735626221, + "learning_rate": 9.221658206429781e-07, + "loss": 0.653, + "step": 17404 + }, + { + "epoch": 9.816694867456288, + "grad_norm": 1.033941626548767, + "learning_rate": 9.193457416807671e-07, + "loss": 0.7348, + "step": 17405 + }, + { + "epoch": 9.817258883248732, + "grad_norm": 1.318007230758667, + "learning_rate": 9.165256627185561e-07, + "loss": 0.7412, + "step": 17406 + }, + { + "epoch": 9.817822899041174, + "grad_norm": 1.0227309465408325, + "learning_rate": 9.137055837563453e-07, + "loss": 0.6521, + "step": 17407 + }, + { + "epoch": 9.818386914833615, + "grad_norm": 1.3658723831176758, + "learning_rate": 9.108855047941343e-07, + "loss": 0.7105, + "step": 17408 + }, + { + "epoch": 9.818950930626057, + "grad_norm": 1.31228768825531, + "learning_rate": 9.080654258319233e-07, + "loss": 0.729, + "step": 17409 + }, + { + "epoch": 9.819514946418499, + "grad_norm": 1.1976194381713867, + "learning_rate": 9.052453468697123e-07, + "loss": 0.8105, + "step": 17410 + }, + { + "epoch": 9.820078962210943, + "grad_norm": 1.2952706813812256, + "learning_rate": 9.024252679075016e-07, + "loss": 0.7674, + "step": 17411 + }, + { + "epoch": 9.820642978003384, + "grad_norm": 1.7757292985916138, + "learning_rate": 8.996051889452906e-07, + "loss": 0.6844, + "step": 17412 + }, + { + "epoch": 9.821206993795826, + "grad_norm": 0.9413073658943176, + "learning_rate": 8.967851099830796e-07, + "loss": 0.6162, + "step": 17413 + }, + { + "epoch": 9.821771009588268, + "grad_norm": 1.3662846088409424, + "learning_rate": 8.939650310208686e-07, + "loss": 0.7593, + "step": 17414 + }, + { + "epoch": 9.82233502538071, + "grad_norm": 1.3803443908691406, + "learning_rate": 8.911449520586576e-07, + "loss": 0.7762, + "step": 17415 + }, + { + "epoch": 9.822899041173153, + "grad_norm": 1.5092111825942993, + "learning_rate": 8.883248730964468e-07, + "loss": 0.8603, + "step": 17416 + }, + { + "epoch": 9.823463056965595, + "grad_norm": 1.007668137550354, + "learning_rate": 8.855047941342358e-07, + "loss": 0.7634, + "step": 17417 + }, + { + "epoch": 9.824027072758037, + "grad_norm": 1.0865923166275024, + "learning_rate": 8.826847151720248e-07, + "loss": 0.8206, + "step": 17418 + }, + { + "epoch": 9.824591088550479, + "grad_norm": 1.1771180629730225, + "learning_rate": 8.798646362098138e-07, + "loss": 0.6758, + "step": 17419 + }, + { + "epoch": 9.825155104342922, + "grad_norm": 1.008675217628479, + "learning_rate": 8.770445572476031e-07, + "loss": 0.6216, + "step": 17420 + }, + { + "epoch": 9.825719120135364, + "grad_norm": 1.1897461414337158, + "learning_rate": 8.742244782853921e-07, + "loss": 0.7974, + "step": 17421 + }, + { + "epoch": 9.826283135927806, + "grad_norm": 1.0289397239685059, + "learning_rate": 8.714043993231811e-07, + "loss": 0.7806, + "step": 17422 + }, + { + "epoch": 9.826847151720248, + "grad_norm": 1.046972393989563, + "learning_rate": 8.685843203609701e-07, + "loss": 0.6731, + "step": 17423 + }, + { + "epoch": 9.82741116751269, + "grad_norm": 1.3636795282363892, + "learning_rate": 8.657642413987591e-07, + "loss": 0.7989, + "step": 17424 + }, + { + "epoch": 9.827975183305133, + "grad_norm": 0.8082247972488403, + "learning_rate": 8.629441624365483e-07, + "loss": 0.5764, + "step": 17425 + }, + { + "epoch": 9.828539199097575, + "grad_norm": 1.0806999206542969, + "learning_rate": 8.601240834743373e-07, + "loss": 0.808, + "step": 17426 + }, + { + "epoch": 9.829103214890017, + "grad_norm": 0.7575154900550842, + "learning_rate": 8.573040045121263e-07, + "loss": 0.6187, + "step": 17427 + }, + { + "epoch": 9.829667230682459, + "grad_norm": 1.2110984325408936, + "learning_rate": 8.544839255499153e-07, + "loss": 0.6961, + "step": 17428 + }, + { + "epoch": 9.830231246474902, + "grad_norm": 0.9238287806510925, + "learning_rate": 8.516638465877046e-07, + "loss": 0.7243, + "step": 17429 + }, + { + "epoch": 9.830795262267344, + "grad_norm": 0.7290323972702026, + "learning_rate": 8.488437676254936e-07, + "loss": 0.6207, + "step": 17430 + }, + { + "epoch": 9.831359278059786, + "grad_norm": 1.1390115022659302, + "learning_rate": 8.460236886632826e-07, + "loss": 0.6863, + "step": 17431 + }, + { + "epoch": 9.831923293852228, + "grad_norm": 1.058968424797058, + "learning_rate": 8.432036097010716e-07, + "loss": 0.7024, + "step": 17432 + }, + { + "epoch": 9.83248730964467, + "grad_norm": 1.1154541969299316, + "learning_rate": 8.403835307388608e-07, + "loss": 0.6749, + "step": 17433 + }, + { + "epoch": 9.833051325437113, + "grad_norm": 1.038639783859253, + "learning_rate": 8.375634517766498e-07, + "loss": 0.7116, + "step": 17434 + }, + { + "epoch": 9.833615341229555, + "grad_norm": 1.4292291402816772, + "learning_rate": 8.347433728144388e-07, + "loss": 0.7244, + "step": 17435 + }, + { + "epoch": 9.834179357021997, + "grad_norm": 1.2087548971176147, + "learning_rate": 8.319232938522278e-07, + "loss": 0.7142, + "step": 17436 + }, + { + "epoch": 9.834743372814438, + "grad_norm": 1.1971988677978516, + "learning_rate": 8.291032148900168e-07, + "loss": 0.732, + "step": 17437 + }, + { + "epoch": 9.83530738860688, + "grad_norm": 1.0016748905181885, + "learning_rate": 8.262831359278061e-07, + "loss": 0.7025, + "step": 17438 + }, + { + "epoch": 9.835871404399324, + "grad_norm": 1.1094032526016235, + "learning_rate": 8.234630569655951e-07, + "loss": 0.6674, + "step": 17439 + }, + { + "epoch": 9.836435420191766, + "grad_norm": 1.3857156038284302, + "learning_rate": 8.206429780033841e-07, + "loss": 0.8002, + "step": 17440 + }, + { + "epoch": 9.836999435984207, + "grad_norm": 1.1711641550064087, + "learning_rate": 8.178228990411731e-07, + "loss": 0.6173, + "step": 17441 + }, + { + "epoch": 9.83756345177665, + "grad_norm": 1.0495328903198242, + "learning_rate": 8.150028200789623e-07, + "loss": 0.7353, + "step": 17442 + }, + { + "epoch": 9.838127467569091, + "grad_norm": 1.1245590448379517, + "learning_rate": 8.121827411167513e-07, + "loss": 0.7549, + "step": 17443 + }, + { + "epoch": 9.838691483361535, + "grad_norm": 1.177092432975769, + "learning_rate": 8.093626621545403e-07, + "loss": 0.7354, + "step": 17444 + }, + { + "epoch": 9.839255499153976, + "grad_norm": 0.9202442169189453, + "learning_rate": 8.065425831923293e-07, + "loss": 0.7469, + "step": 17445 + }, + { + "epoch": 9.839819514946418, + "grad_norm": 1.0825430154800415, + "learning_rate": 8.037225042301184e-07, + "loss": 0.6524, + "step": 17446 + }, + { + "epoch": 9.84038353073886, + "grad_norm": 1.0648006200790405, + "learning_rate": 8.009024252679076e-07, + "loss": 0.642, + "step": 17447 + }, + { + "epoch": 9.840947546531304, + "grad_norm": 0.9800079464912415, + "learning_rate": 7.980823463056966e-07, + "loss": 0.7497, + "step": 17448 + }, + { + "epoch": 9.841511562323745, + "grad_norm": 1.237940788269043, + "learning_rate": 7.952622673434857e-07, + "loss": 0.6998, + "step": 17449 + }, + { + "epoch": 9.842075578116187, + "grad_norm": 0.9350874423980713, + "learning_rate": 7.924421883812747e-07, + "loss": 0.6574, + "step": 17450 + }, + { + "epoch": 9.842639593908629, + "grad_norm": 1.2476868629455566, + "learning_rate": 7.896221094190638e-07, + "loss": 0.8445, + "step": 17451 + }, + { + "epoch": 9.843203609701071, + "grad_norm": 1.2392796277999878, + "learning_rate": 7.868020304568528e-07, + "loss": 0.71, + "step": 17452 + }, + { + "epoch": 9.843767625493514, + "grad_norm": 1.1672393083572388, + "learning_rate": 7.839819514946419e-07, + "loss": 0.8274, + "step": 17453 + }, + { + "epoch": 9.844331641285956, + "grad_norm": 1.078926682472229, + "learning_rate": 7.811618725324309e-07, + "loss": 0.6702, + "step": 17454 + }, + { + "epoch": 9.844895657078398, + "grad_norm": 1.0623952150344849, + "learning_rate": 7.7834179357022e-07, + "loss": 0.5867, + "step": 17455 + }, + { + "epoch": 9.84545967287084, + "grad_norm": 1.3073409795761108, + "learning_rate": 7.755217146080091e-07, + "loss": 0.7708, + "step": 17456 + }, + { + "epoch": 9.846023688663283, + "grad_norm": 1.0135247707366943, + "learning_rate": 7.727016356457982e-07, + "loss": 0.7134, + "step": 17457 + }, + { + "epoch": 9.846587704455725, + "grad_norm": 1.124956727027893, + "learning_rate": 7.698815566835872e-07, + "loss": 0.7816, + "step": 17458 + }, + { + "epoch": 9.847151720248167, + "grad_norm": 1.1295243501663208, + "learning_rate": 7.670614777213763e-07, + "loss": 0.7719, + "step": 17459 + }, + { + "epoch": 9.847715736040609, + "grad_norm": 1.3002548217773438, + "learning_rate": 7.642413987591653e-07, + "loss": 0.7836, + "step": 17460 + }, + { + "epoch": 9.84827975183305, + "grad_norm": 1.4106029272079468, + "learning_rate": 7.614213197969544e-07, + "loss": 0.8555, + "step": 17461 + }, + { + "epoch": 9.848843767625494, + "grad_norm": 1.2961879968643188, + "learning_rate": 7.586012408347434e-07, + "loss": 0.831, + "step": 17462 + }, + { + "epoch": 9.849407783417936, + "grad_norm": 1.1642102003097534, + "learning_rate": 7.557811618725325e-07, + "loss": 0.7684, + "step": 17463 + }, + { + "epoch": 9.849971799210378, + "grad_norm": 1.5272533893585205, + "learning_rate": 7.529610829103215e-07, + "loss": 0.6913, + "step": 17464 + }, + { + "epoch": 9.85053581500282, + "grad_norm": 0.9426175355911255, + "learning_rate": 7.501410039481106e-07, + "loss": 0.6959, + "step": 17465 + }, + { + "epoch": 9.851099830795262, + "grad_norm": 0.9944673776626587, + "learning_rate": 7.473209249858997e-07, + "loss": 0.6829, + "step": 17466 + }, + { + "epoch": 9.851663846587705, + "grad_norm": 1.492812156677246, + "learning_rate": 7.445008460236887e-07, + "loss": 0.793, + "step": 17467 + }, + { + "epoch": 9.852227862380147, + "grad_norm": 1.2080790996551514, + "learning_rate": 7.416807670614778e-07, + "loss": 0.8536, + "step": 17468 + }, + { + "epoch": 9.852791878172589, + "grad_norm": 0.8683860898017883, + "learning_rate": 7.388606880992668e-07, + "loss": 0.7367, + "step": 17469 + }, + { + "epoch": 9.85335589396503, + "grad_norm": 0.9951611161231995, + "learning_rate": 7.360406091370559e-07, + "loss": 0.6992, + "step": 17470 + }, + { + "epoch": 9.853919909757472, + "grad_norm": 0.9445409774780273, + "learning_rate": 7.332205301748449e-07, + "loss": 0.7622, + "step": 17471 + }, + { + "epoch": 9.854483925549916, + "grad_norm": 1.4805971384048462, + "learning_rate": 7.30400451212634e-07, + "loss": 0.6731, + "step": 17472 + }, + { + "epoch": 9.855047941342358, + "grad_norm": 0.9829257130622864, + "learning_rate": 7.27580372250423e-07, + "loss": 0.7083, + "step": 17473 + }, + { + "epoch": 9.8556119571348, + "grad_norm": 1.086444616317749, + "learning_rate": 7.247602932882122e-07, + "loss": 0.7115, + "step": 17474 + }, + { + "epoch": 9.856175972927241, + "grad_norm": 1.0760035514831543, + "learning_rate": 7.219402143260012e-07, + "loss": 0.6896, + "step": 17475 + }, + { + "epoch": 9.856739988719685, + "grad_norm": 1.4452720880508423, + "learning_rate": 7.191201353637902e-07, + "loss": 0.7561, + "step": 17476 + }, + { + "epoch": 9.857304004512127, + "grad_norm": 0.9316533207893372, + "learning_rate": 7.163000564015793e-07, + "loss": 0.6791, + "step": 17477 + }, + { + "epoch": 9.857868020304569, + "grad_norm": 1.2798492908477783, + "learning_rate": 7.134799774393683e-07, + "loss": 0.685, + "step": 17478 + }, + { + "epoch": 9.85843203609701, + "grad_norm": 0.9378250241279602, + "learning_rate": 7.106598984771574e-07, + "loss": 0.7262, + "step": 17479 + }, + { + "epoch": 9.858996051889452, + "grad_norm": 1.092014193534851, + "learning_rate": 7.078398195149464e-07, + "loss": 0.8059, + "step": 17480 + }, + { + "epoch": 9.859560067681896, + "grad_norm": 1.1351583003997803, + "learning_rate": 7.050197405527355e-07, + "loss": 0.7272, + "step": 17481 + }, + { + "epoch": 9.860124083474338, + "grad_norm": 1.1330362558364868, + "learning_rate": 7.021996615905245e-07, + "loss": 0.7993, + "step": 17482 + }, + { + "epoch": 9.86068809926678, + "grad_norm": 1.110664963722229, + "learning_rate": 6.993795826283137e-07, + "loss": 0.7333, + "step": 17483 + }, + { + "epoch": 9.861252115059221, + "grad_norm": 1.2250365018844604, + "learning_rate": 6.965595036661027e-07, + "loss": 0.6839, + "step": 17484 + }, + { + "epoch": 9.861816130851665, + "grad_norm": 3.4222264289855957, + "learning_rate": 6.937394247038918e-07, + "loss": 0.8504, + "step": 17485 + }, + { + "epoch": 9.862380146644107, + "grad_norm": 1.213783860206604, + "learning_rate": 6.909193457416808e-07, + "loss": 0.7166, + "step": 17486 + }, + { + "epoch": 9.862944162436548, + "grad_norm": 1.3316596746444702, + "learning_rate": 6.880992667794698e-07, + "loss": 0.695, + "step": 17487 + }, + { + "epoch": 9.86350817822899, + "grad_norm": 1.2173619270324707, + "learning_rate": 6.852791878172589e-07, + "loss": 0.6606, + "step": 17488 + }, + { + "epoch": 9.864072194021432, + "grad_norm": 1.3690290451049805, + "learning_rate": 6.824591088550479e-07, + "loss": 0.7003, + "step": 17489 + }, + { + "epoch": 9.864636209813876, + "grad_norm": 0.8626280426979065, + "learning_rate": 6.79639029892837e-07, + "loss": 0.5953, + "step": 17490 + }, + { + "epoch": 9.865200225606317, + "grad_norm": 1.1044559478759766, + "learning_rate": 6.76818950930626e-07, + "loss": 0.7396, + "step": 17491 + }, + { + "epoch": 9.86576424139876, + "grad_norm": 1.0982956886291504, + "learning_rate": 6.739988719684152e-07, + "loss": 0.7264, + "step": 17492 + }, + { + "epoch": 9.866328257191201, + "grad_norm": 1.00615394115448, + "learning_rate": 6.711787930062042e-07, + "loss": 0.7062, + "step": 17493 + }, + { + "epoch": 9.866892272983643, + "grad_norm": 1.1656345129013062, + "learning_rate": 6.683587140439933e-07, + "loss": 0.7293, + "step": 17494 + }, + { + "epoch": 9.867456288776086, + "grad_norm": 1.3232988119125366, + "learning_rate": 6.655386350817823e-07, + "loss": 0.7066, + "step": 17495 + }, + { + "epoch": 9.868020304568528, + "grad_norm": 1.4310252666473389, + "learning_rate": 6.627185561195714e-07, + "loss": 0.7638, + "step": 17496 + }, + { + "epoch": 9.86858432036097, + "grad_norm": 1.1282968521118164, + "learning_rate": 6.598984771573604e-07, + "loss": 0.7872, + "step": 17497 + }, + { + "epoch": 9.869148336153412, + "grad_norm": 1.2901356220245361, + "learning_rate": 6.570783981951494e-07, + "loss": 0.7726, + "step": 17498 + }, + { + "epoch": 9.869712351945854, + "grad_norm": 1.0490412712097168, + "learning_rate": 6.542583192329385e-07, + "loss": 0.7511, + "step": 17499 + }, + { + "epoch": 9.870276367738297, + "grad_norm": 1.7306162118911743, + "learning_rate": 6.514382402707275e-07, + "loss": 0.8111, + "step": 17500 + } + ], + "logging_steps": 1, + "max_steps": 17730, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 533465448192000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}