{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996328388897048, "eval_steps": 500, "global_step": 636, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004699662211778528, "grad_norm": 0.37387487292289734, "learning_rate": 9.984276729559748e-06, "loss": 14.8177, "step": 1 }, { "epoch": 0.009399324423557056, "grad_norm": 0.6966238021850586, "learning_rate": 9.968553459119497e-06, "loss": 19.1515, "step": 2 }, { "epoch": 0.014098986635335585, "grad_norm": 0.41008296608924866, "learning_rate": 9.952830188679246e-06, "loss": 18.0352, "step": 3 }, { "epoch": 0.018798648847114113, "grad_norm": 0.3127067983150482, "learning_rate": 9.937106918238994e-06, "loss": 16.7313, "step": 4 }, { "epoch": 0.02349831105889264, "grad_norm": 0.3166525959968567, "learning_rate": 9.921383647798743e-06, "loss": 18.1761, "step": 5 }, { "epoch": 0.02819797327067117, "grad_norm": 0.23945656418800354, "learning_rate": 9.905660377358492e-06, "loss": 16.389, "step": 6 }, { "epoch": 0.0328976354824497, "grad_norm": 0.34680044651031494, "learning_rate": 9.88993710691824e-06, "loss": 17.1364, "step": 7 }, { "epoch": 0.037597297694228225, "grad_norm": 0.2910321056842804, "learning_rate": 9.874213836477988e-06, "loss": 15.698, "step": 8 }, { "epoch": 0.04229695990600676, "grad_norm": 0.48074567317962646, "learning_rate": 9.858490566037736e-06, "loss": 15.518, "step": 9 }, { "epoch": 0.04699662211778528, "grad_norm": 0.43127334117889404, "learning_rate": 9.842767295597485e-06, "loss": 15.3071, "step": 10 }, { "epoch": 0.051696284329563816, "grad_norm": 0.40872570872306824, "learning_rate": 9.827044025157232e-06, "loss": 15.8108, "step": 11 }, { "epoch": 0.05639594654134234, "grad_norm": 0.27355578541755676, "learning_rate": 9.811320754716981e-06, "loss": 15.344, "step": 12 }, { "epoch": 0.06109560875312087, "grad_norm": 0.38371846079826355, "learning_rate": 9.79559748427673e-06, "loss": 16.4535, "step": 13 }, { "epoch": 0.0657952709648994, "grad_norm": 0.3085726797580719, "learning_rate": 9.77987421383648e-06, "loss": 16.2245, "step": 14 }, { "epoch": 0.07049493317667793, "grad_norm": 0.41184675693511963, "learning_rate": 9.764150943396227e-06, "loss": 15.9048, "step": 15 }, { "epoch": 0.07519459538845645, "grad_norm": 0.20860107243061066, "learning_rate": 9.748427672955976e-06, "loss": 15.6796, "step": 16 }, { "epoch": 0.07989425760023498, "grad_norm": 0.31924474239349365, "learning_rate": 9.732704402515723e-06, "loss": 15.6624, "step": 17 }, { "epoch": 0.08459391981201352, "grad_norm": 0.2883175015449524, "learning_rate": 9.716981132075472e-06, "loss": 16.3162, "step": 18 }, { "epoch": 0.08929358202379203, "grad_norm": 0.24593952298164368, "learning_rate": 9.70125786163522e-06, "loss": 14.9932, "step": 19 }, { "epoch": 0.09399324423557057, "grad_norm": 0.23556920886039734, "learning_rate": 9.685534591194969e-06, "loss": 15.3128, "step": 20 }, { "epoch": 0.0986929064473491, "grad_norm": 0.24440030753612518, "learning_rate": 9.669811320754718e-06, "loss": 14.5098, "step": 21 }, { "epoch": 0.10339256865912763, "grad_norm": 0.3471313714981079, "learning_rate": 9.654088050314467e-06, "loss": 14.8793, "step": 22 }, { "epoch": 0.10809223087090615, "grad_norm": 0.39680835604667664, "learning_rate": 9.638364779874214e-06, "loss": 14.3111, "step": 23 }, { "epoch": 0.11279189308268468, "grad_norm": 0.220255047082901, "learning_rate": 9.622641509433963e-06, "loss": 14.3893, "step": 24 }, { "epoch": 0.11749155529446322, "grad_norm": 0.23698750138282776, "learning_rate": 9.606918238993711e-06, "loss": 14.2878, "step": 25 }, { "epoch": 0.12219121750624173, "grad_norm": 0.26187270879745483, "learning_rate": 9.59119496855346e-06, "loss": 13.8048, "step": 26 }, { "epoch": 0.12689087971802027, "grad_norm": 0.2389146238565445, "learning_rate": 9.575471698113207e-06, "loss": 14.0085, "step": 27 }, { "epoch": 0.1315905419297988, "grad_norm": 0.16800901293754578, "learning_rate": 9.559748427672956e-06, "loss": 15.1844, "step": 28 }, { "epoch": 0.13629020414157733, "grad_norm": 0.29327359795570374, "learning_rate": 9.544025157232705e-06, "loss": 14.4227, "step": 29 }, { "epoch": 0.14098986635335586, "grad_norm": 0.449453741312027, "learning_rate": 9.528301886792455e-06, "loss": 13.4381, "step": 30 }, { "epoch": 0.14568952856513437, "grad_norm": 0.2172345072031021, "learning_rate": 9.512578616352202e-06, "loss": 14.3078, "step": 31 }, { "epoch": 0.1503891907769129, "grad_norm": 0.2602121829986572, "learning_rate": 9.496855345911951e-06, "loss": 14.1546, "step": 32 }, { "epoch": 0.15508885298869143, "grad_norm": 0.2682882845401764, "learning_rate": 9.481132075471698e-06, "loss": 14.9953, "step": 33 }, { "epoch": 0.15978851520046997, "grad_norm": 0.20965975522994995, "learning_rate": 9.465408805031447e-06, "loss": 12.4998, "step": 34 }, { "epoch": 0.1644881774122485, "grad_norm": 0.18718308210372925, "learning_rate": 9.449685534591195e-06, "loss": 13.6696, "step": 35 }, { "epoch": 0.16918783962402703, "grad_norm": 0.3141671121120453, "learning_rate": 9.433962264150944e-06, "loss": 13.4048, "step": 36 }, { "epoch": 0.17388750183580556, "grad_norm": 0.31229591369628906, "learning_rate": 9.418238993710691e-06, "loss": 14.1609, "step": 37 }, { "epoch": 0.17858716404758407, "grad_norm": 0.32083943486213684, "learning_rate": 9.40251572327044e-06, "loss": 13.3907, "step": 38 }, { "epoch": 0.1832868262593626, "grad_norm": 0.2657932639122009, "learning_rate": 9.38679245283019e-06, "loss": 13.5954, "step": 39 }, { "epoch": 0.18798648847114113, "grad_norm": 0.2124919593334198, "learning_rate": 9.371069182389939e-06, "loss": 12.1275, "step": 40 }, { "epoch": 0.19268615068291967, "grad_norm": 0.2245129644870758, "learning_rate": 9.355345911949686e-06, "loss": 14.4843, "step": 41 }, { "epoch": 0.1973858128946982, "grad_norm": 0.2546398937702179, "learning_rate": 9.339622641509435e-06, "loss": 13.987, "step": 42 }, { "epoch": 0.20208547510647673, "grad_norm": 0.2680453956127167, "learning_rate": 9.323899371069182e-06, "loss": 14.2591, "step": 43 }, { "epoch": 0.20678513731825526, "grad_norm": 0.24811767041683197, "learning_rate": 9.308176100628931e-06, "loss": 12.6381, "step": 44 }, { "epoch": 0.21148479953003377, "grad_norm": 0.3238183259963989, "learning_rate": 9.292452830188679e-06, "loss": 13.2494, "step": 45 }, { "epoch": 0.2161844617418123, "grad_norm": 0.1908600628376007, "learning_rate": 9.276729559748428e-06, "loss": 12.382, "step": 46 }, { "epoch": 0.22088412395359083, "grad_norm": 0.2511428892612457, "learning_rate": 9.261006289308177e-06, "loss": 13.7845, "step": 47 }, { "epoch": 0.22558378616536937, "grad_norm": 0.24330441653728485, "learning_rate": 9.245283018867926e-06, "loss": 14.4374, "step": 48 }, { "epoch": 0.2302834483771479, "grad_norm": 0.32802456617355347, "learning_rate": 9.229559748427674e-06, "loss": 15.1429, "step": 49 }, { "epoch": 0.23498311058892643, "grad_norm": 0.2274816483259201, "learning_rate": 9.213836477987423e-06, "loss": 14.2767, "step": 50 }, { "epoch": 0.23968277280070496, "grad_norm": 0.2556872069835663, "learning_rate": 9.19811320754717e-06, "loss": 12.8074, "step": 51 }, { "epoch": 0.24438243501248347, "grad_norm": 0.23709972202777863, "learning_rate": 9.182389937106919e-06, "loss": 14.2371, "step": 52 }, { "epoch": 0.249082097224262, "grad_norm": 0.2529979646205902, "learning_rate": 9.166666666666666e-06, "loss": 13.4761, "step": 53 }, { "epoch": 0.25378175943604053, "grad_norm": 0.20136186480522156, "learning_rate": 9.150943396226416e-06, "loss": 13.5667, "step": 54 }, { "epoch": 0.25848142164781907, "grad_norm": 0.26017817854881287, "learning_rate": 9.135220125786165e-06, "loss": 12.0966, "step": 55 }, { "epoch": 0.2631810838595976, "grad_norm": 0.229143887758255, "learning_rate": 9.119496855345914e-06, "loss": 11.9974, "step": 56 }, { "epoch": 0.26788074607137613, "grad_norm": 0.21751444041728973, "learning_rate": 9.103773584905661e-06, "loss": 12.672, "step": 57 }, { "epoch": 0.27258040828315466, "grad_norm": 0.33019641041755676, "learning_rate": 9.08805031446541e-06, "loss": 13.1276, "step": 58 }, { "epoch": 0.2772800704949332, "grad_norm": 0.33665406703948975, "learning_rate": 9.072327044025158e-06, "loss": 14.1335, "step": 59 }, { "epoch": 0.2819797327067117, "grad_norm": 0.26073992252349854, "learning_rate": 9.056603773584907e-06, "loss": 12.0254, "step": 60 }, { "epoch": 0.28667939491849026, "grad_norm": 0.19817674160003662, "learning_rate": 9.040880503144654e-06, "loss": 12.1673, "step": 61 }, { "epoch": 0.29137905713026874, "grad_norm": 0.21664577722549438, "learning_rate": 9.025157232704403e-06, "loss": 12.5545, "step": 62 }, { "epoch": 0.29607871934204727, "grad_norm": 0.23701545596122742, "learning_rate": 9.009433962264152e-06, "loss": 14.0941, "step": 63 }, { "epoch": 0.3007783815538258, "grad_norm": 0.3105368912220001, "learning_rate": 8.9937106918239e-06, "loss": 12.8562, "step": 64 }, { "epoch": 0.30547804376560433, "grad_norm": 0.2481117844581604, "learning_rate": 8.977987421383649e-06, "loss": 13.4049, "step": 65 }, { "epoch": 0.31017770597738287, "grad_norm": 0.19058114290237427, "learning_rate": 8.962264150943398e-06, "loss": 12.7632, "step": 66 }, { "epoch": 0.3148773681891614, "grad_norm": 0.19078046083450317, "learning_rate": 8.946540880503145e-06, "loss": 11.4443, "step": 67 }, { "epoch": 0.31957703040093993, "grad_norm": 0.28984832763671875, "learning_rate": 8.930817610062894e-06, "loss": 11.4983, "step": 68 }, { "epoch": 0.32427669261271846, "grad_norm": 0.22708000242710114, "learning_rate": 8.915094339622642e-06, "loss": 11.4445, "step": 69 }, { "epoch": 0.328976354824497, "grad_norm": 0.2923092246055603, "learning_rate": 8.89937106918239e-06, "loss": 11.9092, "step": 70 }, { "epoch": 0.33367601703627553, "grad_norm": 0.2001643031835556, "learning_rate": 8.883647798742138e-06, "loss": 12.4458, "step": 71 }, { "epoch": 0.33837567924805406, "grad_norm": 0.2258598357439041, "learning_rate": 8.867924528301887e-06, "loss": 13.2357, "step": 72 }, { "epoch": 0.3430753414598326, "grad_norm": 0.16833625733852386, "learning_rate": 8.852201257861636e-06, "loss": 11.8965, "step": 73 }, { "epoch": 0.3477750036716111, "grad_norm": 0.21175910532474518, "learning_rate": 8.836477987421385e-06, "loss": 12.3812, "step": 74 }, { "epoch": 0.3524746658833896, "grad_norm": 0.23696675896644592, "learning_rate": 8.820754716981133e-06, "loss": 12.5212, "step": 75 }, { "epoch": 0.35717432809516814, "grad_norm": 0.2253318428993225, "learning_rate": 8.805031446540882e-06, "loss": 13.5576, "step": 76 }, { "epoch": 0.36187399030694667, "grad_norm": 0.4422793984413147, "learning_rate": 8.789308176100629e-06, "loss": 12.2526, "step": 77 }, { "epoch": 0.3665736525187252, "grad_norm": 0.24399343132972717, "learning_rate": 8.773584905660378e-06, "loss": 13.1149, "step": 78 }, { "epoch": 0.37127331473050373, "grad_norm": 0.24371573328971863, "learning_rate": 8.757861635220126e-06, "loss": 13.8228, "step": 79 }, { "epoch": 0.37597297694228227, "grad_norm": 0.34779033064842224, "learning_rate": 8.742138364779875e-06, "loss": 13.9884, "step": 80 }, { "epoch": 0.3806726391540608, "grad_norm": 0.18815316259860992, "learning_rate": 8.726415094339624e-06, "loss": 12.9627, "step": 81 }, { "epoch": 0.38537230136583933, "grad_norm": 0.19260907173156738, "learning_rate": 8.710691823899373e-06, "loss": 12.4689, "step": 82 }, { "epoch": 0.39007196357761786, "grad_norm": 0.2607057988643646, "learning_rate": 8.69496855345912e-06, "loss": 11.9264, "step": 83 }, { "epoch": 0.3947716257893964, "grad_norm": 0.3185189962387085, "learning_rate": 8.67924528301887e-06, "loss": 13.5177, "step": 84 }, { "epoch": 0.39947128800117493, "grad_norm": 0.2083529680967331, "learning_rate": 8.663522012578617e-06, "loss": 12.9385, "step": 85 }, { "epoch": 0.40417095021295346, "grad_norm": 0.31580668687820435, "learning_rate": 8.647798742138366e-06, "loss": 12.07, "step": 86 }, { "epoch": 0.408870612424732, "grad_norm": 0.2299092411994934, "learning_rate": 8.632075471698113e-06, "loss": 12.4621, "step": 87 }, { "epoch": 0.4135702746365105, "grad_norm": 0.17189660668373108, "learning_rate": 8.616352201257862e-06, "loss": 12.0443, "step": 88 }, { "epoch": 0.418269936848289, "grad_norm": 0.20141251385211945, "learning_rate": 8.600628930817611e-06, "loss": 13.3633, "step": 89 }, { "epoch": 0.42296959906006754, "grad_norm": 0.23173026740550995, "learning_rate": 8.58490566037736e-06, "loss": 11.6632, "step": 90 }, { "epoch": 0.42766926127184607, "grad_norm": 0.31551459431648254, "learning_rate": 8.569182389937108e-06, "loss": 11.6973, "step": 91 }, { "epoch": 0.4323689234836246, "grad_norm": 0.21022996306419373, "learning_rate": 8.553459119496857e-06, "loss": 12.2876, "step": 92 }, { "epoch": 0.43706858569540313, "grad_norm": 0.18736988306045532, "learning_rate": 8.537735849056604e-06, "loss": 12.0661, "step": 93 }, { "epoch": 0.44176824790718167, "grad_norm": 0.2048555314540863, "learning_rate": 8.522012578616353e-06, "loss": 12.3136, "step": 94 }, { "epoch": 0.4464679101189602, "grad_norm": 0.2603492736816406, "learning_rate": 8.5062893081761e-06, "loss": 11.6119, "step": 95 }, { "epoch": 0.45116757233073873, "grad_norm": 0.19393539428710938, "learning_rate": 8.49056603773585e-06, "loss": 11.3181, "step": 96 }, { "epoch": 0.45586723454251726, "grad_norm": 0.33143100142478943, "learning_rate": 8.474842767295599e-06, "loss": 12.7137, "step": 97 }, { "epoch": 0.4605668967542958, "grad_norm": 0.2693021893501282, "learning_rate": 8.459119496855346e-06, "loss": 11.7055, "step": 98 }, { "epoch": 0.46526655896607433, "grad_norm": 0.19107303023338318, "learning_rate": 8.443396226415095e-06, "loss": 10.6541, "step": 99 }, { "epoch": 0.46996622117785286, "grad_norm": 0.21745392680168152, "learning_rate": 8.427672955974844e-06, "loss": 12.7379, "step": 100 }, { "epoch": 0.4746658833896314, "grad_norm": 0.2305111140012741, "learning_rate": 8.411949685534592e-06, "loss": 10.6451, "step": 101 }, { "epoch": 0.4793655456014099, "grad_norm": 0.1797025352716446, "learning_rate": 8.39622641509434e-06, "loss": 11.7878, "step": 102 }, { "epoch": 0.4840652078131884, "grad_norm": 0.1907547563314438, "learning_rate": 8.380503144654088e-06, "loss": 12.4672, "step": 103 }, { "epoch": 0.48876487002496694, "grad_norm": 0.19535870850086212, "learning_rate": 8.364779874213837e-06, "loss": 13.9468, "step": 104 }, { "epoch": 0.49346453223674547, "grad_norm": 0.13700228929519653, "learning_rate": 8.349056603773585e-06, "loss": 12.0716, "step": 105 }, { "epoch": 0.498164194448524, "grad_norm": 0.18657416105270386, "learning_rate": 8.333333333333334e-06, "loss": 11.9293, "step": 106 }, { "epoch": 0.5028638566603025, "grad_norm": 0.18295076489448547, "learning_rate": 8.317610062893083e-06, "loss": 12.3987, "step": 107 }, { "epoch": 0.5075635188720811, "grad_norm": 0.14946052432060242, "learning_rate": 8.301886792452832e-06, "loss": 10.5516, "step": 108 }, { "epoch": 0.5122631810838596, "grad_norm": 0.2341127097606659, "learning_rate": 8.28616352201258e-06, "loss": 11.9196, "step": 109 }, { "epoch": 0.5169628432956381, "grad_norm": 0.2202298790216446, "learning_rate": 8.270440251572328e-06, "loss": 11.8898, "step": 110 }, { "epoch": 0.5216625055074167, "grad_norm": 0.35969528555870056, "learning_rate": 8.254716981132076e-06, "loss": 11.5818, "step": 111 }, { "epoch": 0.5263621677191952, "grad_norm": 0.25757789611816406, "learning_rate": 8.238993710691825e-06, "loss": 11.1507, "step": 112 }, { "epoch": 0.5310618299309737, "grad_norm": 0.21967916190624237, "learning_rate": 8.223270440251572e-06, "loss": 11.9559, "step": 113 }, { "epoch": 0.5357614921427523, "grad_norm": 0.2096310555934906, "learning_rate": 8.207547169811321e-06, "loss": 11.5168, "step": 114 }, { "epoch": 0.5404611543545308, "grad_norm": 0.193389430642128, "learning_rate": 8.19182389937107e-06, "loss": 10.2956, "step": 115 }, { "epoch": 0.5451608165663093, "grad_norm": 0.193772092461586, "learning_rate": 8.17610062893082e-06, "loss": 11.9338, "step": 116 }, { "epoch": 0.5498604787780879, "grad_norm": 0.20478712022304535, "learning_rate": 8.160377358490567e-06, "loss": 12.3949, "step": 117 }, { "epoch": 0.5545601409898664, "grad_norm": 0.1741381585597992, "learning_rate": 8.144654088050316e-06, "loss": 11.2504, "step": 118 }, { "epoch": 0.5592598032016449, "grad_norm": 0.25559255480766296, "learning_rate": 8.128930817610063e-06, "loss": 12.338, "step": 119 }, { "epoch": 0.5639594654134235, "grad_norm": 0.31693610548973083, "learning_rate": 8.113207547169812e-06, "loss": 10.3527, "step": 120 }, { "epoch": 0.568659127625202, "grad_norm": 0.192024365067482, "learning_rate": 8.09748427672956e-06, "loss": 10.6996, "step": 121 }, { "epoch": 0.5733587898369805, "grad_norm": 0.2648765742778778, "learning_rate": 8.081761006289309e-06, "loss": 11.7775, "step": 122 }, { "epoch": 0.5780584520487589, "grad_norm": 0.1612204909324646, "learning_rate": 8.066037735849058e-06, "loss": 11.8107, "step": 123 }, { "epoch": 0.5827581142605375, "grad_norm": 0.21173831820487976, "learning_rate": 8.050314465408805e-06, "loss": 10.785, "step": 124 }, { "epoch": 0.587457776472316, "grad_norm": 0.24022893607616425, "learning_rate": 8.034591194968554e-06, "loss": 11.9655, "step": 125 }, { "epoch": 0.5921574386840945, "grad_norm": 0.27667760848999023, "learning_rate": 8.018867924528303e-06, "loss": 11.3075, "step": 126 }, { "epoch": 0.5968571008958731, "grad_norm": 0.23273716866970062, "learning_rate": 8.00314465408805e-06, "loss": 11.512, "step": 127 }, { "epoch": 0.6015567631076516, "grad_norm": 0.15333759784698486, "learning_rate": 7.9874213836478e-06, "loss": 10.6626, "step": 128 }, { "epoch": 0.6062564253194301, "grad_norm": 0.23889288306236267, "learning_rate": 7.971698113207547e-06, "loss": 11.7195, "step": 129 }, { "epoch": 0.6109560875312087, "grad_norm": 0.1967829167842865, "learning_rate": 7.955974842767296e-06, "loss": 11.7576, "step": 130 }, { "epoch": 0.6156557497429872, "grad_norm": 0.2775883674621582, "learning_rate": 7.940251572327044e-06, "loss": 12.1472, "step": 131 }, { "epoch": 0.6203554119547657, "grad_norm": 0.18012608587741852, "learning_rate": 7.924528301886793e-06, "loss": 11.4322, "step": 132 }, { "epoch": 0.6250550741665443, "grad_norm": 0.27960145473480225, "learning_rate": 7.908805031446542e-06, "loss": 11.5231, "step": 133 }, { "epoch": 0.6297547363783228, "grad_norm": 0.2150716334581375, "learning_rate": 7.893081761006291e-06, "loss": 11.7891, "step": 134 }, { "epoch": 0.6344543985901013, "grad_norm": 0.2049262374639511, "learning_rate": 7.877358490566038e-06, "loss": 12.3417, "step": 135 }, { "epoch": 0.6391540608018799, "grad_norm": 0.4161284863948822, "learning_rate": 7.861635220125787e-06, "loss": 11.9387, "step": 136 }, { "epoch": 0.6438537230136584, "grad_norm": 0.20076847076416016, "learning_rate": 7.845911949685535e-06, "loss": 12.3023, "step": 137 }, { "epoch": 0.6485533852254369, "grad_norm": 0.20139706134796143, "learning_rate": 7.830188679245284e-06, "loss": 10.7882, "step": 138 }, { "epoch": 0.6532530474372155, "grad_norm": 0.17897853255271912, "learning_rate": 7.814465408805031e-06, "loss": 11.2138, "step": 139 }, { "epoch": 0.657952709648994, "grad_norm": 0.17108504474163055, "learning_rate": 7.79874213836478e-06, "loss": 11.946, "step": 140 }, { "epoch": 0.6626523718607725, "grad_norm": 0.21145059168338776, "learning_rate": 7.78301886792453e-06, "loss": 10.6362, "step": 141 }, { "epoch": 0.6673520340725511, "grad_norm": 0.23491205275058746, "learning_rate": 7.767295597484279e-06, "loss": 11.6229, "step": 142 }, { "epoch": 0.6720516962843296, "grad_norm": 0.19559265673160553, "learning_rate": 7.751572327044026e-06, "loss": 10.4194, "step": 143 }, { "epoch": 0.6767513584961081, "grad_norm": 0.17953048646450043, "learning_rate": 7.735849056603775e-06, "loss": 11.715, "step": 144 }, { "epoch": 0.6814510207078867, "grad_norm": 0.19390526413917542, "learning_rate": 7.720125786163522e-06, "loss": 11.7313, "step": 145 }, { "epoch": 0.6861506829196652, "grad_norm": 0.2010110318660736, "learning_rate": 7.704402515723271e-06, "loss": 12.0915, "step": 146 }, { "epoch": 0.6908503451314437, "grad_norm": 0.1508968621492386, "learning_rate": 7.688679245283019e-06, "loss": 9.0544, "step": 147 }, { "epoch": 0.6955500073432223, "grad_norm": 0.17742106318473816, "learning_rate": 7.672955974842768e-06, "loss": 10.9338, "step": 148 }, { "epoch": 0.7002496695550008, "grad_norm": 0.1881588250398636, "learning_rate": 7.657232704402517e-06, "loss": 10.7714, "step": 149 }, { "epoch": 0.7049493317667792, "grad_norm": 0.3921796679496765, "learning_rate": 7.641509433962266e-06, "loss": 12.6045, "step": 150 }, { "epoch": 0.7096489939785577, "grad_norm": 0.22628262639045715, "learning_rate": 7.6257861635220135e-06, "loss": 10.6045, "step": 151 }, { "epoch": 0.7143486561903363, "grad_norm": 0.18339803814888, "learning_rate": 7.6100628930817626e-06, "loss": 11.7633, "step": 152 }, { "epoch": 0.7190483184021148, "grad_norm": 0.15982794761657715, "learning_rate": 7.59433962264151e-06, "loss": 10.8146, "step": 153 }, { "epoch": 0.7237479806138933, "grad_norm": 0.18353401124477386, "learning_rate": 7.578616352201259e-06, "loss": 9.7017, "step": 154 }, { "epoch": 0.7284476428256719, "grad_norm": 0.1364990770816803, "learning_rate": 7.562893081761007e-06, "loss": 11.6746, "step": 155 }, { "epoch": 0.7331473050374504, "grad_norm": 0.15395711362361908, "learning_rate": 7.5471698113207555e-06, "loss": 11.2388, "step": 156 }, { "epoch": 0.7378469672492289, "grad_norm": 0.16953708231449127, "learning_rate": 7.531446540880504e-06, "loss": 10.5221, "step": 157 }, { "epoch": 0.7425466294610075, "grad_norm": 0.2511339783668518, "learning_rate": 7.515723270440253e-06, "loss": 10.0649, "step": 158 }, { "epoch": 0.747246291672786, "grad_norm": 0.22080384194850922, "learning_rate": 7.500000000000001e-06, "loss": 11.0268, "step": 159 }, { "epoch": 0.7519459538845645, "grad_norm": 0.2894453704357147, "learning_rate": 7.484276729559748e-06, "loss": 11.1155, "step": 160 }, { "epoch": 0.7566456160963431, "grad_norm": 0.20514556765556335, "learning_rate": 7.4685534591194975e-06, "loss": 12.266, "step": 161 }, { "epoch": 0.7613452783081216, "grad_norm": 0.16512590646743774, "learning_rate": 7.452830188679246e-06, "loss": 12.7621, "step": 162 }, { "epoch": 0.7660449405199001, "grad_norm": 0.20450714230537415, "learning_rate": 7.437106918238994e-06, "loss": 10.2383, "step": 163 }, { "epoch": 0.7707446027316787, "grad_norm": 0.2836724817752838, "learning_rate": 7.421383647798742e-06, "loss": 10.8839, "step": 164 }, { "epoch": 0.7754442649434572, "grad_norm": 0.177320659160614, "learning_rate": 7.405660377358491e-06, "loss": 10.4404, "step": 165 }, { "epoch": 0.7801439271552357, "grad_norm": 0.1881173849105835, "learning_rate": 7.389937106918239e-06, "loss": 10.5858, "step": 166 }, { "epoch": 0.7848435893670143, "grad_norm": 0.18388114869594574, "learning_rate": 7.374213836477988e-06, "loss": 10.0372, "step": 167 }, { "epoch": 0.7895432515787928, "grad_norm": 0.20303378999233246, "learning_rate": 7.358490566037736e-06, "loss": 11.4814, "step": 168 }, { "epoch": 0.7942429137905713, "grad_norm": 0.20170536637306213, "learning_rate": 7.342767295597485e-06, "loss": 10.6799, "step": 169 }, { "epoch": 0.7989425760023499, "grad_norm": 0.1719183623790741, "learning_rate": 7.3270440251572324e-06, "loss": 10.5034, "step": 170 }, { "epoch": 0.8036422382141284, "grad_norm": 0.25152912735939026, "learning_rate": 7.3113207547169815e-06, "loss": 10.7397, "step": 171 }, { "epoch": 0.8083419004259069, "grad_norm": 0.19823051989078522, "learning_rate": 7.29559748427673e-06, "loss": 10.983, "step": 172 }, { "epoch": 0.8130415626376855, "grad_norm": 0.2038576751947403, "learning_rate": 7.279874213836479e-06, "loss": 10.4441, "step": 173 }, { "epoch": 0.817741224849464, "grad_norm": 0.17723773419857025, "learning_rate": 7.264150943396226e-06, "loss": 10.3421, "step": 174 }, { "epoch": 0.8224408870612425, "grad_norm": 0.18553687632083893, "learning_rate": 7.248427672955975e-06, "loss": 12.3211, "step": 175 }, { "epoch": 0.827140549273021, "grad_norm": 0.14637748897075653, "learning_rate": 7.2327044025157235e-06, "loss": 9.8436, "step": 176 }, { "epoch": 0.8318402114847996, "grad_norm": 0.23111458122730255, "learning_rate": 7.216981132075473e-06, "loss": 12.0587, "step": 177 }, { "epoch": 0.836539873696578, "grad_norm": 0.43327322602272034, "learning_rate": 7.20125786163522e-06, "loss": 11.5681, "step": 178 }, { "epoch": 0.8412395359083565, "grad_norm": 0.18507908284664154, "learning_rate": 7.185534591194969e-06, "loss": 12.423, "step": 179 }, { "epoch": 0.8459391981201351, "grad_norm": 0.17362703382968903, "learning_rate": 7.169811320754717e-06, "loss": 10.3329, "step": 180 }, { "epoch": 0.8506388603319136, "grad_norm": 0.23827636241912842, "learning_rate": 7.154088050314466e-06, "loss": 11.9506, "step": 181 }, { "epoch": 0.8553385225436921, "grad_norm": 0.24631419777870178, "learning_rate": 7.138364779874214e-06, "loss": 10.0765, "step": 182 }, { "epoch": 0.8600381847554707, "grad_norm": 0.24519377946853638, "learning_rate": 7.122641509433963e-06, "loss": 10.486, "step": 183 }, { "epoch": 0.8647378469672492, "grad_norm": 0.22264322638511658, "learning_rate": 7.106918238993711e-06, "loss": 11.5366, "step": 184 }, { "epoch": 0.8694375091790277, "grad_norm": 0.2268192321062088, "learning_rate": 7.09119496855346e-06, "loss": 10.432, "step": 185 }, { "epoch": 0.8741371713908063, "grad_norm": 0.20069079101085663, "learning_rate": 7.0754716981132075e-06, "loss": 10.9919, "step": 186 }, { "epoch": 0.8788368336025848, "grad_norm": 0.18285229802131653, "learning_rate": 7.059748427672957e-06, "loss": 10.8613, "step": 187 }, { "epoch": 0.8835364958143633, "grad_norm": 0.30948406457901, "learning_rate": 7.044025157232705e-06, "loss": 10.0019, "step": 188 }, { "epoch": 0.8882361580261419, "grad_norm": 0.15717379748821259, "learning_rate": 7.028301886792454e-06, "loss": 12.0274, "step": 189 }, { "epoch": 0.8929358202379204, "grad_norm": 0.19054202735424042, "learning_rate": 7.012578616352201e-06, "loss": 10.1378, "step": 190 }, { "epoch": 0.8976354824496989, "grad_norm": 0.19569998979568481, "learning_rate": 6.99685534591195e-06, "loss": 11.2031, "step": 191 }, { "epoch": 0.9023351446614775, "grad_norm": 0.47344332933425903, "learning_rate": 6.981132075471699e-06, "loss": 11.4871, "step": 192 }, { "epoch": 0.907034806873256, "grad_norm": 0.21780353784561157, "learning_rate": 6.965408805031447e-06, "loss": 10.3987, "step": 193 }, { "epoch": 0.9117344690850345, "grad_norm": 0.1753062754869461, "learning_rate": 6.949685534591195e-06, "loss": 11.913, "step": 194 }, { "epoch": 0.9164341312968131, "grad_norm": 0.19800685346126556, "learning_rate": 6.933962264150944e-06, "loss": 10.986, "step": 195 }, { "epoch": 0.9211337935085916, "grad_norm": 0.24941404163837433, "learning_rate": 6.9182389937106915e-06, "loss": 11.5825, "step": 196 }, { "epoch": 0.9258334557203701, "grad_norm": 0.21111537516117096, "learning_rate": 6.902515723270441e-06, "loss": 9.8846, "step": 197 }, { "epoch": 0.9305331179321487, "grad_norm": 0.21166007220745087, "learning_rate": 6.886792452830189e-06, "loss": 10.3414, "step": 198 }, { "epoch": 0.9352327801439272, "grad_norm": 0.1425541788339615, "learning_rate": 6.871069182389938e-06, "loss": 10.9606, "step": 199 }, { "epoch": 0.9399324423557057, "grad_norm": 0.1676245927810669, "learning_rate": 6.855345911949685e-06, "loss": 9.8579, "step": 200 }, { "epoch": 0.9446321045674843, "grad_norm": 0.2449372261762619, "learning_rate": 6.839622641509434e-06, "loss": 9.9599, "step": 201 }, { "epoch": 0.9493317667792628, "grad_norm": 0.1561560481786728, "learning_rate": 6.823899371069183e-06, "loss": 11.4323, "step": 202 }, { "epoch": 0.9540314289910413, "grad_norm": 0.1817087084054947, "learning_rate": 6.808176100628932e-06, "loss": 10.023, "step": 203 }, { "epoch": 0.9587310912028199, "grad_norm": 0.21511977910995483, "learning_rate": 6.792452830188679e-06, "loss": 11.2811, "step": 204 }, { "epoch": 0.9634307534145983, "grad_norm": 0.17556354403495789, "learning_rate": 6.776729559748428e-06, "loss": 11.3001, "step": 205 }, { "epoch": 0.9681304156263768, "grad_norm": 0.16658397018909454, "learning_rate": 6.761006289308176e-06, "loss": 10.7399, "step": 206 }, { "epoch": 0.9728300778381553, "grad_norm": 0.16622519493103027, "learning_rate": 6.7452830188679255e-06, "loss": 10.1286, "step": 207 }, { "epoch": 0.9775297400499339, "grad_norm": 0.14768865704536438, "learning_rate": 6.729559748427673e-06, "loss": 10.7119, "step": 208 }, { "epoch": 0.9822294022617124, "grad_norm": 0.142805278301239, "learning_rate": 6.713836477987422e-06, "loss": 11.4976, "step": 209 }, { "epoch": 0.9869290644734909, "grad_norm": 0.15663962066173553, "learning_rate": 6.69811320754717e-06, "loss": 9.6873, "step": 210 }, { "epoch": 0.9916287266852695, "grad_norm": 0.22869648039340973, "learning_rate": 6.682389937106919e-06, "loss": 10.6141, "step": 211 }, { "epoch": 0.996328388897048, "grad_norm": 0.16368289291858673, "learning_rate": 6.666666666666667e-06, "loss": 9.58, "step": 212 }, { "epoch": 1.0046996622117785, "grad_norm": 0.26223883032798767, "learning_rate": 6.650943396226416e-06, "loss": 10.6104, "step": 213 }, { "epoch": 1.009399324423557, "grad_norm": 0.18566545844078064, "learning_rate": 6.635220125786164e-06, "loss": 10.1867, "step": 214 }, { "epoch": 1.0140989866353356, "grad_norm": 0.15616296231746674, "learning_rate": 6.619496855345913e-06, "loss": 9.4169, "step": 215 }, { "epoch": 1.0187986488471141, "grad_norm": 0.2797134816646576, "learning_rate": 6.60377358490566e-06, "loss": 10.5557, "step": 216 }, { "epoch": 1.0234983110588927, "grad_norm": 0.2116226851940155, "learning_rate": 6.5880503144654095e-06, "loss": 10.5718, "step": 217 }, { "epoch": 1.0281979732706712, "grad_norm": 0.2222115844488144, "learning_rate": 6.572327044025158e-06, "loss": 10.2063, "step": 218 }, { "epoch": 1.0328976354824497, "grad_norm": 0.2480054348707199, "learning_rate": 6.556603773584907e-06, "loss": 11.3938, "step": 219 }, { "epoch": 1.0375972976942283, "grad_norm": 0.18250253796577454, "learning_rate": 6.540880503144654e-06, "loss": 9.7526, "step": 220 }, { "epoch": 1.0422969599060068, "grad_norm": 0.1839306354522705, "learning_rate": 6.525157232704403e-06, "loss": 9.7129, "step": 221 }, { "epoch": 1.0469966221177853, "grad_norm": 0.17117497324943542, "learning_rate": 6.5094339622641515e-06, "loss": 11.5024, "step": 222 }, { "epoch": 1.0516962843295639, "grad_norm": 0.12768374383449554, "learning_rate": 6.4937106918239e-06, "loss": 11.2154, "step": 223 }, { "epoch": 1.0563959465413424, "grad_norm": 0.18520669639110565, "learning_rate": 6.477987421383648e-06, "loss": 9.4937, "step": 224 }, { "epoch": 1.061095608753121, "grad_norm": 0.20890559256076813, "learning_rate": 6.462264150943397e-06, "loss": 9.4864, "step": 225 }, { "epoch": 1.0657952709648995, "grad_norm": 0.17774340510368347, "learning_rate": 6.446540880503145e-06, "loss": 10.5667, "step": 226 }, { "epoch": 1.070494933176678, "grad_norm": 0.179142564535141, "learning_rate": 6.4308176100628935e-06, "loss": 9.9923, "step": 227 }, { "epoch": 1.0751945953884565, "grad_norm": 0.18621426820755005, "learning_rate": 6.415094339622642e-06, "loss": 11.4307, "step": 228 }, { "epoch": 1.079894257600235, "grad_norm": 0.20719768106937408, "learning_rate": 6.399371069182391e-06, "loss": 9.7865, "step": 229 }, { "epoch": 1.0845939198120136, "grad_norm": 0.1468082219362259, "learning_rate": 6.383647798742138e-06, "loss": 9.0166, "step": 230 }, { "epoch": 1.0892935820237921, "grad_norm": 0.173754021525383, "learning_rate": 6.367924528301887e-06, "loss": 11.015, "step": 231 }, { "epoch": 1.0939932442355706, "grad_norm": 0.17140689492225647, "learning_rate": 6.3522012578616355e-06, "loss": 11.3881, "step": 232 }, { "epoch": 1.0986929064473492, "grad_norm": 0.19918528199195862, "learning_rate": 6.336477987421385e-06, "loss": 8.8754, "step": 233 }, { "epoch": 1.1033925686591277, "grad_norm": 0.19411884248256683, "learning_rate": 6.320754716981132e-06, "loss": 9.1916, "step": 234 }, { "epoch": 1.1080922308709062, "grad_norm": 0.177324116230011, "learning_rate": 6.305031446540881e-06, "loss": 11.5269, "step": 235 }, { "epoch": 1.1127918930826848, "grad_norm": 0.19485007226467133, "learning_rate": 6.289308176100629e-06, "loss": 9.3556, "step": 236 }, { "epoch": 1.1174915552944633, "grad_norm": 0.15749092400074005, "learning_rate": 6.273584905660378e-06, "loss": 9.8539, "step": 237 }, { "epoch": 1.1221912175062418, "grad_norm": 0.14400354027748108, "learning_rate": 6.257861635220126e-06, "loss": 10.6714, "step": 238 }, { "epoch": 1.1268908797180202, "grad_norm": 0.16621728241443634, "learning_rate": 6.242138364779875e-06, "loss": 10.2598, "step": 239 }, { "epoch": 1.131590541929799, "grad_norm": 0.18759991228580475, "learning_rate": 6.226415094339623e-06, "loss": 10.2747, "step": 240 }, { "epoch": 1.1362902041415772, "grad_norm": 0.28644540905952454, "learning_rate": 6.210691823899372e-06, "loss": 9.7841, "step": 241 }, { "epoch": 1.140989866353356, "grad_norm": 0.28347471356391907, "learning_rate": 6.1949685534591195e-06, "loss": 10.1872, "step": 242 }, { "epoch": 1.1456895285651343, "grad_norm": 0.2755683362483978, "learning_rate": 6.179245283018869e-06, "loss": 9.4194, "step": 243 }, { "epoch": 1.1503891907769128, "grad_norm": 0.16993623971939087, "learning_rate": 6.163522012578617e-06, "loss": 10.3603, "step": 244 }, { "epoch": 1.1550888529886914, "grad_norm": 0.17048682272434235, "learning_rate": 6.147798742138366e-06, "loss": 10.0733, "step": 245 }, { "epoch": 1.1597885152004699, "grad_norm": 0.22276446223258972, "learning_rate": 6.132075471698113e-06, "loss": 10.4929, "step": 246 }, { "epoch": 1.1644881774122484, "grad_norm": 0.22395849227905273, "learning_rate": 6.116352201257862e-06, "loss": 11.7205, "step": 247 }, { "epoch": 1.169187839624027, "grad_norm": 0.16276244819164276, "learning_rate": 6.100628930817611e-06, "loss": 10.9938, "step": 248 }, { "epoch": 1.1738875018358055, "grad_norm": 0.21257179975509644, "learning_rate": 6.08490566037736e-06, "loss": 9.8752, "step": 249 }, { "epoch": 1.178587164047584, "grad_norm": 0.22887296974658966, "learning_rate": 6.069182389937107e-06, "loss": 9.5889, "step": 250 }, { "epoch": 1.1832868262593625, "grad_norm": 0.1974973976612091, "learning_rate": 6.053459119496856e-06, "loss": 9.3645, "step": 251 }, { "epoch": 1.187986488471141, "grad_norm": 0.23462526500225067, "learning_rate": 6.037735849056604e-06, "loss": 9.4658, "step": 252 }, { "epoch": 1.1926861506829196, "grad_norm": 0.16101180016994476, "learning_rate": 6.022012578616353e-06, "loss": 10.2914, "step": 253 }, { "epoch": 1.1973858128946981, "grad_norm": 0.2459559589624405, "learning_rate": 6.006289308176101e-06, "loss": 9.0605, "step": 254 }, { "epoch": 1.2020854751064767, "grad_norm": 0.14153866469860077, "learning_rate": 5.99056603773585e-06, "loss": 10.53, "step": 255 }, { "epoch": 1.2067851373182552, "grad_norm": 0.3186815083026886, "learning_rate": 5.974842767295598e-06, "loss": 11.1498, "step": 256 }, { "epoch": 1.2114847995300337, "grad_norm": 0.37049660086631775, "learning_rate": 5.959119496855346e-06, "loss": 10.4863, "step": 257 }, { "epoch": 1.2161844617418123, "grad_norm": 0.14857573807239532, "learning_rate": 5.943396226415095e-06, "loss": 10.641, "step": 258 }, { "epoch": 1.2208841239535908, "grad_norm": 0.19735755026340485, "learning_rate": 5.927672955974844e-06, "loss": 11.4228, "step": 259 }, { "epoch": 1.2255837861653693, "grad_norm": 0.16926360130310059, "learning_rate": 5.911949685534591e-06, "loss": 9.4661, "step": 260 }, { "epoch": 1.2302834483771479, "grad_norm": 0.13348612189292908, "learning_rate": 5.89622641509434e-06, "loss": 9.5991, "step": 261 }, { "epoch": 1.2349831105889264, "grad_norm": 0.1581430286169052, "learning_rate": 5.880503144654088e-06, "loss": 9.8854, "step": 262 }, { "epoch": 1.239682772800705, "grad_norm": 0.12091105431318283, "learning_rate": 5.8647798742138375e-06, "loss": 9.2929, "step": 263 }, { "epoch": 1.2443824350124835, "grad_norm": 0.18068204820156097, "learning_rate": 5.849056603773585e-06, "loss": 11.9561, "step": 264 }, { "epoch": 1.249082097224262, "grad_norm": 0.16259127855300903, "learning_rate": 5.833333333333334e-06, "loss": 10.1489, "step": 265 }, { "epoch": 1.2537817594360405, "grad_norm": 0.17135359346866608, "learning_rate": 5.817610062893082e-06, "loss": 10.7507, "step": 266 }, { "epoch": 1.258481421647819, "grad_norm": 0.18598735332489014, "learning_rate": 5.801886792452831e-06, "loss": 10.0606, "step": 267 }, { "epoch": 1.2631810838595976, "grad_norm": 0.21258696913719177, "learning_rate": 5.786163522012579e-06, "loss": 8.953, "step": 268 }, { "epoch": 1.2678807460713761, "grad_norm": 0.15013380348682404, "learning_rate": 5.770440251572328e-06, "loss": 10.3239, "step": 269 }, { "epoch": 1.2725804082831547, "grad_norm": 0.14898964762687683, "learning_rate": 5.754716981132076e-06, "loss": 9.2403, "step": 270 }, { "epoch": 1.2772800704949332, "grad_norm": 0.21635949611663818, "learning_rate": 5.738993710691825e-06, "loss": 10.378, "step": 271 }, { "epoch": 1.2819797327067117, "grad_norm": 0.21074920892715454, "learning_rate": 5.723270440251572e-06, "loss": 9.47, "step": 272 }, { "epoch": 1.2866793949184903, "grad_norm": 0.19226451218128204, "learning_rate": 5.7075471698113215e-06, "loss": 9.5547, "step": 273 }, { "epoch": 1.2913790571302688, "grad_norm": 0.20751795172691345, "learning_rate": 5.69182389937107e-06, "loss": 9.7948, "step": 274 }, { "epoch": 1.2960787193420473, "grad_norm": 0.13100376725196838, "learning_rate": 5.676100628930819e-06, "loss": 9.6389, "step": 275 }, { "epoch": 1.3007783815538259, "grad_norm": 0.21564634144306183, "learning_rate": 5.660377358490566e-06, "loss": 9.1505, "step": 276 }, { "epoch": 1.3054780437656044, "grad_norm": 0.14475569128990173, "learning_rate": 5.644654088050315e-06, "loss": 9.8558, "step": 277 }, { "epoch": 1.310177705977383, "grad_norm": 0.1774779111146927, "learning_rate": 5.6289308176100635e-06, "loss": 9.4868, "step": 278 }, { "epoch": 1.3148773681891615, "grad_norm": 0.1645752489566803, "learning_rate": 5.613207547169813e-06, "loss": 9.4504, "step": 279 }, { "epoch": 1.31957703040094, "grad_norm": 0.17156900465488434, "learning_rate": 5.59748427672956e-06, "loss": 9.5404, "step": 280 }, { "epoch": 1.3242766926127185, "grad_norm": 0.16523773968219757, "learning_rate": 5.581761006289309e-06, "loss": 10.0618, "step": 281 }, { "epoch": 1.328976354824497, "grad_norm": 0.1551135629415512, "learning_rate": 5.566037735849057e-06, "loss": 10.3954, "step": 282 }, { "epoch": 1.3336760170362756, "grad_norm": 0.18945059180259705, "learning_rate": 5.5503144654088055e-06, "loss": 9.6077, "step": 283 }, { "epoch": 1.3383756792480541, "grad_norm": 0.19237595796585083, "learning_rate": 5.534591194968554e-06, "loss": 9.277, "step": 284 }, { "epoch": 1.3430753414598327, "grad_norm": 0.1267118752002716, "learning_rate": 5.518867924528303e-06, "loss": 10.9243, "step": 285 }, { "epoch": 1.3477750036716112, "grad_norm": 0.1710660606622696, "learning_rate": 5.503144654088051e-06, "loss": 9.5044, "step": 286 }, { "epoch": 1.3524746658833897, "grad_norm": 0.14960743486881256, "learning_rate": 5.487421383647799e-06, "loss": 9.1807, "step": 287 }, { "epoch": 1.357174328095168, "grad_norm": 0.15505310893058777, "learning_rate": 5.4716981132075475e-06, "loss": 9.5316, "step": 288 }, { "epoch": 1.3618739903069468, "grad_norm": 0.29475870728492737, "learning_rate": 5.455974842767297e-06, "loss": 9.3668, "step": 289 }, { "epoch": 1.366573652518725, "grad_norm": 0.1836201548576355, "learning_rate": 5.440251572327044e-06, "loss": 10.2698, "step": 290 }, { "epoch": 1.3712733147305038, "grad_norm": 0.174160435795784, "learning_rate": 5.424528301886793e-06, "loss": 9.4744, "step": 291 }, { "epoch": 1.3759729769422822, "grad_norm": 0.1628328412771225, "learning_rate": 5.408805031446541e-06, "loss": 10.9402, "step": 292 }, { "epoch": 1.380672639154061, "grad_norm": 0.2673213481903076, "learning_rate": 5.39308176100629e-06, "loss": 9.7072, "step": 293 }, { "epoch": 1.3853723013658392, "grad_norm": 0.14892983436584473, "learning_rate": 5.377358490566038e-06, "loss": 9.0972, "step": 294 }, { "epoch": 1.390071963577618, "grad_norm": 0.15011869370937347, "learning_rate": 5.361635220125787e-06, "loss": 10.2168, "step": 295 }, { "epoch": 1.3947716257893963, "grad_norm": 0.16285346448421478, "learning_rate": 5.345911949685535e-06, "loss": 8.912, "step": 296 }, { "epoch": 1.399471288001175, "grad_norm": 0.19020162522792816, "learning_rate": 5.330188679245284e-06, "loss": 10.4482, "step": 297 }, { "epoch": 1.4041709502129534, "grad_norm": 0.17386487126350403, "learning_rate": 5.3144654088050315e-06, "loss": 8.5551, "step": 298 }, { "epoch": 1.408870612424732, "grad_norm": 0.13091512024402618, "learning_rate": 5.298742138364781e-06, "loss": 9.4529, "step": 299 }, { "epoch": 1.4135702746365104, "grad_norm": 0.15498989820480347, "learning_rate": 5.283018867924529e-06, "loss": 8.6886, "step": 300 }, { "epoch": 1.418269936848289, "grad_norm": 0.17087408900260925, "learning_rate": 5.267295597484278e-06, "loss": 10.7793, "step": 301 }, { "epoch": 1.4229695990600675, "grad_norm": 0.23628047108650208, "learning_rate": 5.251572327044025e-06, "loss": 10.8393, "step": 302 }, { "epoch": 1.427669261271846, "grad_norm": 0.16381074488162994, "learning_rate": 5.235849056603774e-06, "loss": 8.6979, "step": 303 }, { "epoch": 1.4323689234836245, "grad_norm": 0.2537349760532379, "learning_rate": 5.220125786163523e-06, "loss": 10.0461, "step": 304 }, { "epoch": 1.437068585695403, "grad_norm": 0.17883513867855072, "learning_rate": 5.204402515723272e-06, "loss": 10.1509, "step": 305 }, { "epoch": 1.4417682479071816, "grad_norm": 0.22167061269283295, "learning_rate": 5.188679245283019e-06, "loss": 10.6258, "step": 306 }, { "epoch": 1.4464679101189601, "grad_norm": 0.2985490560531616, "learning_rate": 5.172955974842768e-06, "loss": 9.5367, "step": 307 }, { "epoch": 1.4511675723307387, "grad_norm": 0.187641903758049, "learning_rate": 5.157232704402516e-06, "loss": 8.5468, "step": 308 }, { "epoch": 1.4558672345425172, "grad_norm": 0.16621382534503937, "learning_rate": 5.1415094339622655e-06, "loss": 10.7152, "step": 309 }, { "epoch": 1.4605668967542957, "grad_norm": 0.18901924788951874, "learning_rate": 5.125786163522013e-06, "loss": 9.3358, "step": 310 }, { "epoch": 1.4652665589660743, "grad_norm": 0.1986590474843979, "learning_rate": 5.110062893081762e-06, "loss": 9.5057, "step": 311 }, { "epoch": 1.4699662211778528, "grad_norm": 0.19186130166053772, "learning_rate": 5.09433962264151e-06, "loss": 9.0993, "step": 312 }, { "epoch": 1.4746658833896313, "grad_norm": 0.18051902949810028, "learning_rate": 5.078616352201258e-06, "loss": 9.4301, "step": 313 }, { "epoch": 1.4793655456014099, "grad_norm": 0.16041314601898193, "learning_rate": 5.062893081761007e-06, "loss": 9.283, "step": 314 }, { "epoch": 1.4840652078131884, "grad_norm": 0.1544487476348877, "learning_rate": 5.047169811320756e-06, "loss": 10.86, "step": 315 }, { "epoch": 1.488764870024967, "grad_norm": 0.23861843347549438, "learning_rate": 5.031446540880504e-06, "loss": 9.4436, "step": 316 }, { "epoch": 1.4934645322367455, "grad_norm": 0.17076392471790314, "learning_rate": 5.015723270440252e-06, "loss": 9.3497, "step": 317 }, { "epoch": 1.498164194448524, "grad_norm": 0.1647375226020813, "learning_rate": 5e-06, "loss": 9.8566, "step": 318 }, { "epoch": 1.5028638566603025, "grad_norm": 0.1486530900001526, "learning_rate": 4.984276729559749e-06, "loss": 10.0189, "step": 319 }, { "epoch": 1.507563518872081, "grad_norm": 0.21668870747089386, "learning_rate": 4.968553459119497e-06, "loss": 10.2389, "step": 320 }, { "epoch": 1.5122631810838596, "grad_norm": 0.1586449295282364, "learning_rate": 4.952830188679246e-06, "loss": 9.8241, "step": 321 }, { "epoch": 1.5169628432956381, "grad_norm": 0.18685030937194824, "learning_rate": 4.937106918238994e-06, "loss": 11.0133, "step": 322 }, { "epoch": 1.5216625055074167, "grad_norm": 0.14795856177806854, "learning_rate": 4.921383647798742e-06, "loss": 10.9296, "step": 323 }, { "epoch": 1.5263621677191952, "grad_norm": 0.17823056876659393, "learning_rate": 4.905660377358491e-06, "loss": 9.2171, "step": 324 }, { "epoch": 1.5310618299309737, "grad_norm": 0.20636893808841705, "learning_rate": 4.88993710691824e-06, "loss": 9.1884, "step": 325 }, { "epoch": 1.5357614921427523, "grad_norm": 0.19597569108009338, "learning_rate": 4.874213836477988e-06, "loss": 9.8362, "step": 326 }, { "epoch": 1.5404611543545308, "grad_norm": 0.1484641134738922, "learning_rate": 4.858490566037736e-06, "loss": 10.2958, "step": 327 }, { "epoch": 1.5451608165663093, "grad_norm": 0.22989784181118011, "learning_rate": 4.842767295597484e-06, "loss": 10.0049, "step": 328 }, { "epoch": 1.5498604787780879, "grad_norm": 0.1617046743631363, "learning_rate": 4.8270440251572335e-06, "loss": 9.4836, "step": 329 }, { "epoch": 1.5545601409898664, "grad_norm": 0.16669578850269318, "learning_rate": 4.811320754716982e-06, "loss": 9.3237, "step": 330 }, { "epoch": 1.559259803201645, "grad_norm": 0.2456568032503128, "learning_rate": 4.79559748427673e-06, "loss": 9.977, "step": 331 }, { "epoch": 1.5639594654134235, "grad_norm": 0.24188953638076782, "learning_rate": 4.779874213836478e-06, "loss": 9.806, "step": 332 }, { "epoch": 1.568659127625202, "grad_norm": 0.15256938338279724, "learning_rate": 4.764150943396227e-06, "loss": 9.9676, "step": 333 }, { "epoch": 1.5733587898369805, "grad_norm": 0.1987762153148651, "learning_rate": 4.7484276729559755e-06, "loss": 10.2033, "step": 334 }, { "epoch": 1.5780584520487588, "grad_norm": 0.2421812266111374, "learning_rate": 4.732704402515724e-06, "loss": 9.2433, "step": 335 }, { "epoch": 1.5827581142605376, "grad_norm": 0.2165605127811432, "learning_rate": 4.716981132075472e-06, "loss": 9.6531, "step": 336 }, { "epoch": 1.587457776472316, "grad_norm": 0.13957588374614716, "learning_rate": 4.70125786163522e-06, "loss": 9.447, "step": 337 }, { "epoch": 1.5921574386840947, "grad_norm": 0.11904534697532654, "learning_rate": 4.685534591194969e-06, "loss": 9.2246, "step": 338 }, { "epoch": 1.596857100895873, "grad_norm": 0.17070595920085907, "learning_rate": 4.6698113207547175e-06, "loss": 10.0194, "step": 339 }, { "epoch": 1.6015567631076517, "grad_norm": 0.143925279378891, "learning_rate": 4.654088050314466e-06, "loss": 9.3624, "step": 340 }, { "epoch": 1.60625642531943, "grad_norm": 0.16152456402778625, "learning_rate": 4.638364779874214e-06, "loss": 10.4735, "step": 341 }, { "epoch": 1.6109560875312088, "grad_norm": 0.18913985788822174, "learning_rate": 4.622641509433963e-06, "loss": 10.123, "step": 342 }, { "epoch": 1.615655749742987, "grad_norm": 0.13386861979961395, "learning_rate": 4.606918238993711e-06, "loss": 10.4526, "step": 343 }, { "epoch": 1.6203554119547658, "grad_norm": 0.20337802171707153, "learning_rate": 4.5911949685534595e-06, "loss": 9.1595, "step": 344 }, { "epoch": 1.6250550741665442, "grad_norm": 0.1720411479473114, "learning_rate": 4.575471698113208e-06, "loss": 9.6249, "step": 345 }, { "epoch": 1.629754736378323, "grad_norm": 0.1636141985654831, "learning_rate": 4.559748427672957e-06, "loss": 9.2014, "step": 346 }, { "epoch": 1.6344543985901012, "grad_norm": 0.21507582068443298, "learning_rate": 4.544025157232705e-06, "loss": 9.2198, "step": 347 }, { "epoch": 1.63915406080188, "grad_norm": 0.17695969343185425, "learning_rate": 4.528301886792453e-06, "loss": 8.1701, "step": 348 }, { "epoch": 1.6438537230136583, "grad_norm": 0.1695805937051773, "learning_rate": 4.5125786163522015e-06, "loss": 10.5658, "step": 349 }, { "epoch": 1.648553385225437, "grad_norm": 0.2079424113035202, "learning_rate": 4.49685534591195e-06, "loss": 8.6012, "step": 350 }, { "epoch": 1.6532530474372154, "grad_norm": 0.1686774492263794, "learning_rate": 4.481132075471699e-06, "loss": 9.1372, "step": 351 }, { "epoch": 1.657952709648994, "grad_norm": 0.20620812475681305, "learning_rate": 4.465408805031447e-06, "loss": 9.109, "step": 352 }, { "epoch": 1.6626523718607724, "grad_norm": 0.1773538589477539, "learning_rate": 4.449685534591195e-06, "loss": 10.8451, "step": 353 }, { "epoch": 1.6673520340725512, "grad_norm": 0.1696254014968872, "learning_rate": 4.4339622641509435e-06, "loss": 10.1702, "step": 354 }, { "epoch": 1.6720516962843295, "grad_norm": 0.1324741393327713, "learning_rate": 4.418238993710693e-06, "loss": 9.9574, "step": 355 }, { "epoch": 1.6767513584961082, "grad_norm": 0.1837111860513687, "learning_rate": 4.402515723270441e-06, "loss": 9.0295, "step": 356 }, { "epoch": 1.6814510207078865, "grad_norm": 0.20189788937568665, "learning_rate": 4.386792452830189e-06, "loss": 9.3521, "step": 357 }, { "epoch": 1.6861506829196653, "grad_norm": 0.27319082617759705, "learning_rate": 4.371069182389937e-06, "loss": 9.2089, "step": 358 }, { "epoch": 1.6908503451314436, "grad_norm": 0.17554742097854614, "learning_rate": 4.355345911949686e-06, "loss": 9.7537, "step": 359 }, { "epoch": 1.6955500073432224, "grad_norm": 0.18987105786800385, "learning_rate": 4.339622641509435e-06, "loss": 9.4864, "step": 360 }, { "epoch": 1.7002496695550007, "grad_norm": 0.2505553066730499, "learning_rate": 4.323899371069183e-06, "loss": 8.236, "step": 361 }, { "epoch": 1.7049493317667792, "grad_norm": 0.20137320458889008, "learning_rate": 4.308176100628931e-06, "loss": 8.3513, "step": 362 }, { "epoch": 1.7096489939785577, "grad_norm": 0.16769157350063324, "learning_rate": 4.29245283018868e-06, "loss": 8.2796, "step": 363 }, { "epoch": 1.7143486561903363, "grad_norm": 0.1518823355436325, "learning_rate": 4.276729559748428e-06, "loss": 9.6663, "step": 364 }, { "epoch": 1.7190483184021148, "grad_norm": 0.1490839570760727, "learning_rate": 4.261006289308177e-06, "loss": 10.0672, "step": 365 }, { "epoch": 1.7237479806138933, "grad_norm": 0.21474531292915344, "learning_rate": 4.245283018867925e-06, "loss": 9.2953, "step": 366 }, { "epoch": 1.7284476428256719, "grad_norm": 0.15353207290172577, "learning_rate": 4.229559748427673e-06, "loss": 9.0618, "step": 367 }, { "epoch": 1.7331473050374504, "grad_norm": 0.23263074457645416, "learning_rate": 4.213836477987422e-06, "loss": 10.3876, "step": 368 }, { "epoch": 1.737846967249229, "grad_norm": 0.2000337690114975, "learning_rate": 4.19811320754717e-06, "loss": 9.2049, "step": 369 }, { "epoch": 1.7425466294610075, "grad_norm": 0.20854943990707397, "learning_rate": 4.182389937106919e-06, "loss": 8.7975, "step": 370 }, { "epoch": 1.747246291672786, "grad_norm": 0.17304840683937073, "learning_rate": 4.166666666666667e-06, "loss": 9.8699, "step": 371 }, { "epoch": 1.7519459538845645, "grad_norm": 0.19021810591220856, "learning_rate": 4.150943396226416e-06, "loss": 9.3764, "step": 372 }, { "epoch": 1.756645616096343, "grad_norm": 0.1873621791601181, "learning_rate": 4.135220125786164e-06, "loss": 8.9573, "step": 373 }, { "epoch": 1.7613452783081216, "grad_norm": 0.1484212577342987, "learning_rate": 4.119496855345912e-06, "loss": 9.4656, "step": 374 }, { "epoch": 1.7660449405199001, "grad_norm": 0.1981293112039566, "learning_rate": 4.103773584905661e-06, "loss": 9.1688, "step": 375 }, { "epoch": 1.7707446027316787, "grad_norm": 0.19324973225593567, "learning_rate": 4.08805031446541e-06, "loss": 9.0968, "step": 376 }, { "epoch": 1.7754442649434572, "grad_norm": 0.1694146990776062, "learning_rate": 4.072327044025158e-06, "loss": 8.8419, "step": 377 }, { "epoch": 1.7801439271552357, "grad_norm": 0.2026609629392624, "learning_rate": 4.056603773584906e-06, "loss": 9.6672, "step": 378 }, { "epoch": 1.7848435893670143, "grad_norm": 0.1619563102722168, "learning_rate": 4.040880503144654e-06, "loss": 8.845, "step": 379 }, { "epoch": 1.7895432515787928, "grad_norm": 0.1429334133863449, "learning_rate": 4.025157232704403e-06, "loss": 9.3556, "step": 380 }, { "epoch": 1.7942429137905713, "grad_norm": 0.13862460851669312, "learning_rate": 4.009433962264152e-06, "loss": 8.0309, "step": 381 }, { "epoch": 1.7989425760023499, "grad_norm": 0.2269888073205948, "learning_rate": 3.9937106918239e-06, "loss": 9.0674, "step": 382 }, { "epoch": 1.8036422382141284, "grad_norm": 0.19257493317127228, "learning_rate": 3.977987421383648e-06, "loss": 9.5704, "step": 383 }, { "epoch": 1.808341900425907, "grad_norm": 0.17540623247623444, "learning_rate": 3.962264150943396e-06, "loss": 9.412, "step": 384 }, { "epoch": 1.8130415626376855, "grad_norm": 0.1327970027923584, "learning_rate": 3.9465408805031455e-06, "loss": 8.1447, "step": 385 }, { "epoch": 1.817741224849464, "grad_norm": 0.2137085646390915, "learning_rate": 3.930817610062894e-06, "loss": 9.9722, "step": 386 }, { "epoch": 1.8224408870612425, "grad_norm": 0.1811988800764084, "learning_rate": 3.915094339622642e-06, "loss": 9.0096, "step": 387 }, { "epoch": 1.827140549273021, "grad_norm": 0.20717856287956238, "learning_rate": 3.89937106918239e-06, "loss": 9.36, "step": 388 }, { "epoch": 1.8318402114847996, "grad_norm": 0.20474138855934143, "learning_rate": 3.883647798742139e-06, "loss": 10.1551, "step": 389 }, { "epoch": 1.836539873696578, "grad_norm": 0.15711812674999237, "learning_rate": 3.8679245283018875e-06, "loss": 9.0021, "step": 390 }, { "epoch": 1.8412395359083567, "grad_norm": 0.17283909022808075, "learning_rate": 3.852201257861636e-06, "loss": 10.1104, "step": 391 }, { "epoch": 1.845939198120135, "grad_norm": 0.15262113511562347, "learning_rate": 3.836477987421384e-06, "loss": 8.6559, "step": 392 }, { "epoch": 1.8506388603319137, "grad_norm": 0.18552465736865997, "learning_rate": 3.820754716981133e-06, "loss": 10.4646, "step": 393 }, { "epoch": 1.855338522543692, "grad_norm": 0.2289816290140152, "learning_rate": 3.8050314465408813e-06, "loss": 9.278, "step": 394 }, { "epoch": 1.8600381847554708, "grad_norm": 0.20872613787651062, "learning_rate": 3.7893081761006295e-06, "loss": 10.1933, "step": 395 }, { "epoch": 1.864737846967249, "grad_norm": 0.13862136006355286, "learning_rate": 3.7735849056603777e-06, "loss": 10.1102, "step": 396 }, { "epoch": 1.8694375091790278, "grad_norm": 0.2503572106361389, "learning_rate": 3.7578616352201264e-06, "loss": 8.916, "step": 397 }, { "epoch": 1.8741371713908062, "grad_norm": 0.19755487143993378, "learning_rate": 3.742138364779874e-06, "loss": 9.4722, "step": 398 }, { "epoch": 1.878836833602585, "grad_norm": 0.19663922488689423, "learning_rate": 3.726415094339623e-06, "loss": 9.997, "step": 399 }, { "epoch": 1.8835364958143632, "grad_norm": 0.18214724957942963, "learning_rate": 3.710691823899371e-06, "loss": 10.2177, "step": 400 }, { "epoch": 1.888236158026142, "grad_norm": 0.16722829639911652, "learning_rate": 3.6949685534591193e-06, "loss": 9.4884, "step": 401 }, { "epoch": 1.8929358202379203, "grad_norm": 0.18669411540031433, "learning_rate": 3.679245283018868e-06, "loss": 8.6767, "step": 402 }, { "epoch": 1.897635482449699, "grad_norm": 0.18424375355243683, "learning_rate": 3.6635220125786162e-06, "loss": 8.3095, "step": 403 }, { "epoch": 1.9023351446614774, "grad_norm": 0.19896991550922394, "learning_rate": 3.647798742138365e-06, "loss": 9.3901, "step": 404 }, { "epoch": 1.907034806873256, "grad_norm": 0.1650737076997757, "learning_rate": 3.632075471698113e-06, "loss": 8.6404, "step": 405 }, { "epoch": 1.9117344690850344, "grad_norm": 0.2115001082420349, "learning_rate": 3.6163522012578618e-06, "loss": 8.0328, "step": 406 }, { "epoch": 1.9164341312968132, "grad_norm": 0.14102685451507568, "learning_rate": 3.60062893081761e-06, "loss": 8.4303, "step": 407 }, { "epoch": 1.9211337935085915, "grad_norm": 0.15853671729564667, "learning_rate": 3.5849056603773586e-06, "loss": 8.9311, "step": 408 }, { "epoch": 1.9258334557203702, "grad_norm": 0.13820263743400574, "learning_rate": 3.569182389937107e-06, "loss": 8.9898, "step": 409 }, { "epoch": 1.9305331179321485, "grad_norm": 0.18544992804527283, "learning_rate": 3.5534591194968555e-06, "loss": 8.4406, "step": 410 }, { "epoch": 1.9352327801439273, "grad_norm": 0.1638360172510147, "learning_rate": 3.5377358490566038e-06, "loss": 9.1398, "step": 411 }, { "epoch": 1.9399324423557056, "grad_norm": 0.2164238542318344, "learning_rate": 3.5220125786163524e-06, "loss": 9.7534, "step": 412 }, { "epoch": 1.9446321045674844, "grad_norm": 0.12553298473358154, "learning_rate": 3.5062893081761007e-06, "loss": 8.8001, "step": 413 }, { "epoch": 1.9493317667792627, "grad_norm": 0.19941245019435883, "learning_rate": 3.4905660377358493e-06, "loss": 8.3854, "step": 414 }, { "epoch": 1.9540314289910414, "grad_norm": 0.20216481387615204, "learning_rate": 3.4748427672955975e-06, "loss": 9.4938, "step": 415 }, { "epoch": 1.9587310912028197, "grad_norm": 0.1449737548828125, "learning_rate": 3.4591194968553458e-06, "loss": 9.2678, "step": 416 }, { "epoch": 1.9634307534145983, "grad_norm": 0.22720523178577423, "learning_rate": 3.4433962264150944e-06, "loss": 9.1216, "step": 417 }, { "epoch": 1.9681304156263768, "grad_norm": 0.16688017547130585, "learning_rate": 3.4276729559748427e-06, "loss": 8.1686, "step": 418 }, { "epoch": 1.9728300778381553, "grad_norm": 0.19298182427883148, "learning_rate": 3.4119496855345913e-06, "loss": 9.0564, "step": 419 }, { "epoch": 1.9775297400499339, "grad_norm": 0.17087504267692566, "learning_rate": 3.3962264150943395e-06, "loss": 9.1819, "step": 420 }, { "epoch": 1.9822294022617124, "grad_norm": 0.27183791995048523, "learning_rate": 3.380503144654088e-06, "loss": 9.4169, "step": 421 }, { "epoch": 1.986929064473491, "grad_norm": 0.13685189187526703, "learning_rate": 3.3647798742138364e-06, "loss": 9.6564, "step": 422 }, { "epoch": 1.9916287266852695, "grad_norm": 0.2539704144001007, "learning_rate": 3.349056603773585e-06, "loss": 8.5989, "step": 423 }, { "epoch": 1.996328388897048, "grad_norm": 0.14592863619327545, "learning_rate": 3.3333333333333333e-06, "loss": 8.672, "step": 424 }, { "epoch": 2.0046996622117783, "grad_norm": 0.20275884866714478, "learning_rate": 3.317610062893082e-06, "loss": 8.121, "step": 425 }, { "epoch": 2.009399324423557, "grad_norm": 0.14590352773666382, "learning_rate": 3.30188679245283e-06, "loss": 8.3972, "step": 426 }, { "epoch": 2.0140989866353354, "grad_norm": 0.13498660922050476, "learning_rate": 3.286163522012579e-06, "loss": 8.8664, "step": 427 }, { "epoch": 2.018798648847114, "grad_norm": 0.17794980108737946, "learning_rate": 3.270440251572327e-06, "loss": 8.2335, "step": 428 }, { "epoch": 2.0234983110588924, "grad_norm": 0.21292459964752197, "learning_rate": 3.2547169811320758e-06, "loss": 9.1668, "step": 429 }, { "epoch": 2.028197973270671, "grad_norm": 0.2391827553510666, "learning_rate": 3.238993710691824e-06, "loss": 9.2038, "step": 430 }, { "epoch": 2.0328976354824495, "grad_norm": 0.22036492824554443, "learning_rate": 3.2232704402515726e-06, "loss": 8.2135, "step": 431 }, { "epoch": 2.0375972976942283, "grad_norm": 0.18384629487991333, "learning_rate": 3.207547169811321e-06, "loss": 8.4177, "step": 432 }, { "epoch": 2.0422969599060066, "grad_norm": 0.22049392759799957, "learning_rate": 3.191823899371069e-06, "loss": 8.7121, "step": 433 }, { "epoch": 2.0469966221177853, "grad_norm": 0.19101233780384064, "learning_rate": 3.1761006289308178e-06, "loss": 9.0261, "step": 434 }, { "epoch": 2.0516962843295636, "grad_norm": 0.16205710172653198, "learning_rate": 3.160377358490566e-06, "loss": 7.843, "step": 435 }, { "epoch": 2.0563959465413424, "grad_norm": 0.15637215971946716, "learning_rate": 3.1446540880503146e-06, "loss": 8.6742, "step": 436 }, { "epoch": 2.0610956087531207, "grad_norm": 0.20816561579704285, "learning_rate": 3.128930817610063e-06, "loss": 7.677, "step": 437 }, { "epoch": 2.0657952709648995, "grad_norm": 0.13828535377979279, "learning_rate": 3.1132075471698115e-06, "loss": 8.8801, "step": 438 }, { "epoch": 2.0704949331766778, "grad_norm": 0.20928382873535156, "learning_rate": 3.0974842767295598e-06, "loss": 7.8039, "step": 439 }, { "epoch": 2.0751945953884565, "grad_norm": 0.22111253440380096, "learning_rate": 3.0817610062893084e-06, "loss": 9.6013, "step": 440 }, { "epoch": 2.079894257600235, "grad_norm": 0.20437324047088623, "learning_rate": 3.0660377358490567e-06, "loss": 8.4785, "step": 441 }, { "epoch": 2.0845939198120136, "grad_norm": 0.1478756070137024, "learning_rate": 3.0503144654088053e-06, "loss": 9.3812, "step": 442 }, { "epoch": 2.089293582023792, "grad_norm": 0.19669660925865173, "learning_rate": 3.0345911949685535e-06, "loss": 8.9415, "step": 443 }, { "epoch": 2.0939932442355706, "grad_norm": 0.17696648836135864, "learning_rate": 3.018867924528302e-06, "loss": 8.1716, "step": 444 }, { "epoch": 2.098692906447349, "grad_norm": 0.18872305750846863, "learning_rate": 3.0031446540880504e-06, "loss": 8.8441, "step": 445 }, { "epoch": 2.1033925686591277, "grad_norm": 0.16521762311458588, "learning_rate": 2.987421383647799e-06, "loss": 8.9084, "step": 446 }, { "epoch": 2.108092230870906, "grad_norm": 0.1834409087896347, "learning_rate": 2.9716981132075473e-06, "loss": 9.0528, "step": 447 }, { "epoch": 2.112791893082685, "grad_norm": 0.16718453168869019, "learning_rate": 2.9559748427672955e-06, "loss": 9.0606, "step": 448 }, { "epoch": 2.117491555294463, "grad_norm": 0.14127828180789948, "learning_rate": 2.940251572327044e-06, "loss": 8.1169, "step": 449 }, { "epoch": 2.122191217506242, "grad_norm": 0.14984261989593506, "learning_rate": 2.9245283018867924e-06, "loss": 8.8896, "step": 450 }, { "epoch": 2.12689087971802, "grad_norm": 0.1980627477169037, "learning_rate": 2.908805031446541e-06, "loss": 8.5788, "step": 451 }, { "epoch": 2.131590541929799, "grad_norm": 0.15255112946033478, "learning_rate": 2.8930817610062893e-06, "loss": 8.4127, "step": 452 }, { "epoch": 2.136290204141577, "grad_norm": 0.19393834471702576, "learning_rate": 2.877358490566038e-06, "loss": 8.365, "step": 453 }, { "epoch": 2.140989866353356, "grad_norm": 0.15506310760974884, "learning_rate": 2.861635220125786e-06, "loss": 8.8021, "step": 454 }, { "epoch": 2.1456895285651343, "grad_norm": 0.21137897670269012, "learning_rate": 2.845911949685535e-06, "loss": 8.9953, "step": 455 }, { "epoch": 2.150389190776913, "grad_norm": 0.18918299674987793, "learning_rate": 2.830188679245283e-06, "loss": 8.4305, "step": 456 }, { "epoch": 2.1550888529886914, "grad_norm": 0.18130037188529968, "learning_rate": 2.8144654088050318e-06, "loss": 8.0617, "step": 457 }, { "epoch": 2.15978851520047, "grad_norm": 0.1432488113641739, "learning_rate": 2.79874213836478e-06, "loss": 8.2193, "step": 458 }, { "epoch": 2.1644881774122484, "grad_norm": 0.17982621490955353, "learning_rate": 2.7830188679245286e-06, "loss": 8.712, "step": 459 }, { "epoch": 2.169187839624027, "grad_norm": 0.17188197374343872, "learning_rate": 2.767295597484277e-06, "loss": 9.503, "step": 460 }, { "epoch": 2.1738875018358055, "grad_norm": 0.17847636342048645, "learning_rate": 2.7515723270440255e-06, "loss": 9.1732, "step": 461 }, { "epoch": 2.1785871640475842, "grad_norm": 0.16977262496948242, "learning_rate": 2.7358490566037738e-06, "loss": 9.8435, "step": 462 }, { "epoch": 2.1832868262593625, "grad_norm": 0.15752658247947693, "learning_rate": 2.720125786163522e-06, "loss": 8.8716, "step": 463 }, { "epoch": 2.1879864884711413, "grad_norm": 0.19320890307426453, "learning_rate": 2.7044025157232706e-06, "loss": 8.8021, "step": 464 }, { "epoch": 2.1926861506829196, "grad_norm": 0.17799217998981476, "learning_rate": 2.688679245283019e-06, "loss": 8.8318, "step": 465 }, { "epoch": 2.1973858128946984, "grad_norm": 0.17455509305000305, "learning_rate": 2.6729559748427675e-06, "loss": 9.1465, "step": 466 }, { "epoch": 2.2020854751064767, "grad_norm": 0.15405863523483276, "learning_rate": 2.6572327044025158e-06, "loss": 8.5727, "step": 467 }, { "epoch": 2.2067851373182554, "grad_norm": 0.15029039978981018, "learning_rate": 2.6415094339622644e-06, "loss": 7.7094, "step": 468 }, { "epoch": 2.2114847995300337, "grad_norm": 0.16541877388954163, "learning_rate": 2.6257861635220127e-06, "loss": 8.8547, "step": 469 }, { "epoch": 2.2161844617418125, "grad_norm": 0.16449324786663055, "learning_rate": 2.6100628930817613e-06, "loss": 8.5582, "step": 470 }, { "epoch": 2.220884123953591, "grad_norm": 0.21227984130382538, "learning_rate": 2.5943396226415095e-06, "loss": 7.3667, "step": 471 }, { "epoch": 2.2255837861653696, "grad_norm": 0.12738406658172607, "learning_rate": 2.578616352201258e-06, "loss": 7.903, "step": 472 }, { "epoch": 2.230283448377148, "grad_norm": 0.17838919162750244, "learning_rate": 2.5628930817610064e-06, "loss": 10.4478, "step": 473 }, { "epoch": 2.2349831105889266, "grad_norm": 0.20598867535591125, "learning_rate": 2.547169811320755e-06, "loss": 7.8447, "step": 474 }, { "epoch": 2.239682772800705, "grad_norm": 0.15466590225696564, "learning_rate": 2.5314465408805033e-06, "loss": 8.9438, "step": 475 }, { "epoch": 2.2443824350124837, "grad_norm": 0.17884458601474762, "learning_rate": 2.515723270440252e-06, "loss": 7.8261, "step": 476 }, { "epoch": 2.249082097224262, "grad_norm": 0.14316703379154205, "learning_rate": 2.5e-06, "loss": 9.866, "step": 477 }, { "epoch": 2.2537817594360403, "grad_norm": 0.14656466245651245, "learning_rate": 2.4842767295597484e-06, "loss": 8.2903, "step": 478 }, { "epoch": 2.258481421647819, "grad_norm": 0.1635284423828125, "learning_rate": 2.468553459119497e-06, "loss": 8.5256, "step": 479 }, { "epoch": 2.263181083859598, "grad_norm": 0.22854338586330414, "learning_rate": 2.4528301886792453e-06, "loss": 7.917, "step": 480 }, { "epoch": 2.267880746071376, "grad_norm": 0.1849682331085205, "learning_rate": 2.437106918238994e-06, "loss": 8.2091, "step": 481 }, { "epoch": 2.2725804082831544, "grad_norm": 0.15696680545806885, "learning_rate": 2.421383647798742e-06, "loss": 9.6068, "step": 482 }, { "epoch": 2.277280070494933, "grad_norm": 0.1514119952917099, "learning_rate": 2.405660377358491e-06, "loss": 8.8259, "step": 483 }, { "epoch": 2.281979732706712, "grad_norm": 0.2207663506269455, "learning_rate": 2.389937106918239e-06, "loss": 9.1401, "step": 484 }, { "epoch": 2.2866793949184903, "grad_norm": 0.10197854787111282, "learning_rate": 2.3742138364779878e-06, "loss": 9.3325, "step": 485 }, { "epoch": 2.2913790571302686, "grad_norm": 0.15092411637306213, "learning_rate": 2.358490566037736e-06, "loss": 8.1425, "step": 486 }, { "epoch": 2.2960787193420473, "grad_norm": 0.16204456984996796, "learning_rate": 2.3427672955974846e-06, "loss": 7.3509, "step": 487 }, { "epoch": 2.3007783815538256, "grad_norm": 0.22659596800804138, "learning_rate": 2.327044025157233e-06, "loss": 8.9592, "step": 488 }, { "epoch": 2.3054780437656044, "grad_norm": 0.21369831264019012, "learning_rate": 2.3113207547169815e-06, "loss": 8.4935, "step": 489 }, { "epoch": 2.3101777059773827, "grad_norm": 0.16711963713169098, "learning_rate": 2.2955974842767298e-06, "loss": 9.2076, "step": 490 }, { "epoch": 2.3148773681891615, "grad_norm": 0.2575282156467438, "learning_rate": 2.2798742138364784e-06, "loss": 8.7313, "step": 491 }, { "epoch": 2.3195770304009398, "grad_norm": 0.17614226043224335, "learning_rate": 2.2641509433962266e-06, "loss": 8.518, "step": 492 }, { "epoch": 2.3242766926127185, "grad_norm": 0.15815369784832, "learning_rate": 2.248427672955975e-06, "loss": 8.2335, "step": 493 }, { "epoch": 2.328976354824497, "grad_norm": 0.17895972728729248, "learning_rate": 2.2327044025157235e-06, "loss": 7.1084, "step": 494 }, { "epoch": 2.3336760170362756, "grad_norm": 0.20956727862358093, "learning_rate": 2.2169811320754718e-06, "loss": 7.494, "step": 495 }, { "epoch": 2.338375679248054, "grad_norm": 0.1270400881767273, "learning_rate": 2.2012578616352204e-06, "loss": 9.1078, "step": 496 }, { "epoch": 2.3430753414598327, "grad_norm": 0.15311852097511292, "learning_rate": 2.1855345911949687e-06, "loss": 7.6996, "step": 497 }, { "epoch": 2.347775003671611, "grad_norm": 0.2237144559621811, "learning_rate": 2.1698113207547173e-06, "loss": 8.5155, "step": 498 }, { "epoch": 2.3524746658833897, "grad_norm": 0.21806317567825317, "learning_rate": 2.1540880503144655e-06, "loss": 8.4596, "step": 499 }, { "epoch": 2.357174328095168, "grad_norm": 0.21347108483314514, "learning_rate": 2.138364779874214e-06, "loss": 8.1543, "step": 500 }, { "epoch": 2.361873990306947, "grad_norm": 0.2042577564716339, "learning_rate": 2.1226415094339624e-06, "loss": 8.1098, "step": 501 }, { "epoch": 2.366573652518725, "grad_norm": 0.14325296878814697, "learning_rate": 2.106918238993711e-06, "loss": 9.2361, "step": 502 }, { "epoch": 2.371273314730504, "grad_norm": 0.11437965929508209, "learning_rate": 2.0911949685534593e-06, "loss": 9.0062, "step": 503 }, { "epoch": 2.375972976942282, "grad_norm": 0.12539036571979523, "learning_rate": 2.075471698113208e-06, "loss": 7.0536, "step": 504 }, { "epoch": 2.380672639154061, "grad_norm": 0.1560347080230713, "learning_rate": 2.059748427672956e-06, "loss": 8.0084, "step": 505 }, { "epoch": 2.385372301365839, "grad_norm": 0.13887687027454376, "learning_rate": 2.044025157232705e-06, "loss": 8.7966, "step": 506 }, { "epoch": 2.390071963577618, "grad_norm": 0.17969252169132233, "learning_rate": 2.028301886792453e-06, "loss": 8.5925, "step": 507 }, { "epoch": 2.3947716257893963, "grad_norm": 0.18632932007312775, "learning_rate": 2.0125786163522013e-06, "loss": 8.687, "step": 508 }, { "epoch": 2.399471288001175, "grad_norm": 0.15372858941555023, "learning_rate": 1.99685534591195e-06, "loss": 9.1667, "step": 509 }, { "epoch": 2.4041709502129534, "grad_norm": 0.22014212608337402, "learning_rate": 1.981132075471698e-06, "loss": 8.3866, "step": 510 }, { "epoch": 2.408870612424732, "grad_norm": 0.08676121383905411, "learning_rate": 1.965408805031447e-06, "loss": 9.3444, "step": 511 }, { "epoch": 2.4135702746365104, "grad_norm": 0.18776309490203857, "learning_rate": 1.949685534591195e-06, "loss": 7.8951, "step": 512 }, { "epoch": 2.418269936848289, "grad_norm": 0.152197003364563, "learning_rate": 1.9339622641509438e-06, "loss": 9.8167, "step": 513 }, { "epoch": 2.4229695990600675, "grad_norm": 0.16580915451049805, "learning_rate": 1.918238993710692e-06, "loss": 8.8834, "step": 514 }, { "epoch": 2.4276692612718462, "grad_norm": 0.16281157732009888, "learning_rate": 1.9025157232704406e-06, "loss": 7.8295, "step": 515 }, { "epoch": 2.4323689234836245, "grad_norm": 0.15925729274749756, "learning_rate": 1.8867924528301889e-06, "loss": 10.0162, "step": 516 }, { "epoch": 2.4370685856954033, "grad_norm": 0.1319301426410675, "learning_rate": 1.871069182389937e-06, "loss": 7.8179, "step": 517 }, { "epoch": 2.4417682479071816, "grad_norm": 0.18848244845867157, "learning_rate": 1.8553459119496855e-06, "loss": 7.8018, "step": 518 }, { "epoch": 2.4464679101189604, "grad_norm": 0.3406160771846771, "learning_rate": 1.839622641509434e-06, "loss": 8.2953, "step": 519 }, { "epoch": 2.4511675723307387, "grad_norm": 0.14691799879074097, "learning_rate": 1.8238993710691824e-06, "loss": 9.3757, "step": 520 }, { "epoch": 2.4558672345425174, "grad_norm": 0.17265410721302032, "learning_rate": 1.8081761006289309e-06, "loss": 7.8975, "step": 521 }, { "epoch": 2.4605668967542957, "grad_norm": 0.15075534582138062, "learning_rate": 1.7924528301886793e-06, "loss": 8.4369, "step": 522 }, { "epoch": 2.4652665589660745, "grad_norm": 0.168269544839859, "learning_rate": 1.7767295597484278e-06, "loss": 9.0436, "step": 523 }, { "epoch": 2.469966221177853, "grad_norm": 0.25321871042251587, "learning_rate": 1.7610062893081762e-06, "loss": 8.4128, "step": 524 }, { "epoch": 2.4746658833896316, "grad_norm": 0.12230651080608368, "learning_rate": 1.7452830188679247e-06, "loss": 7.5998, "step": 525 }, { "epoch": 2.47936554560141, "grad_norm": 0.16334621608257294, "learning_rate": 1.7295597484276729e-06, "loss": 8.9995, "step": 526 }, { "epoch": 2.484065207813188, "grad_norm": 0.14537131786346436, "learning_rate": 1.7138364779874213e-06, "loss": 9.0791, "step": 527 }, { "epoch": 2.488764870024967, "grad_norm": 0.26995235681533813, "learning_rate": 1.6981132075471698e-06, "loss": 10.0012, "step": 528 }, { "epoch": 2.4934645322367457, "grad_norm": 0.22433443367481232, "learning_rate": 1.6823899371069182e-06, "loss": 8.5953, "step": 529 }, { "epoch": 2.498164194448524, "grad_norm": 0.11027607321739197, "learning_rate": 1.6666666666666667e-06, "loss": 7.4473, "step": 530 }, { "epoch": 2.5028638566603023, "grad_norm": 0.18855731189250946, "learning_rate": 1.650943396226415e-06, "loss": 8.4203, "step": 531 }, { "epoch": 2.507563518872081, "grad_norm": 0.15325301885604858, "learning_rate": 1.6352201257861635e-06, "loss": 8.0624, "step": 532 }, { "epoch": 2.51226318108386, "grad_norm": 0.15549103915691376, "learning_rate": 1.619496855345912e-06, "loss": 8.3803, "step": 533 }, { "epoch": 2.516962843295638, "grad_norm": 0.173740953207016, "learning_rate": 1.6037735849056604e-06, "loss": 8.8591, "step": 534 }, { "epoch": 2.5216625055074164, "grad_norm": 0.2141212821006775, "learning_rate": 1.5880503144654089e-06, "loss": 7.3826, "step": 535 }, { "epoch": 2.526362167719195, "grad_norm": 0.20882973074913025, "learning_rate": 1.5723270440251573e-06, "loss": 9.429, "step": 536 }, { "epoch": 2.531061829930974, "grad_norm": 0.13823898136615753, "learning_rate": 1.5566037735849058e-06, "loss": 8.9627, "step": 537 }, { "epoch": 2.5357614921427523, "grad_norm": 0.1580439805984497, "learning_rate": 1.5408805031446542e-06, "loss": 8.8832, "step": 538 }, { "epoch": 2.5404611543545306, "grad_norm": 0.19053000211715698, "learning_rate": 1.5251572327044027e-06, "loss": 7.6073, "step": 539 }, { "epoch": 2.5451608165663093, "grad_norm": 0.16570697724819183, "learning_rate": 1.509433962264151e-06, "loss": 8.2866, "step": 540 }, { "epoch": 2.549860478778088, "grad_norm": 0.21306215226650238, "learning_rate": 1.4937106918238995e-06, "loss": 8.2945, "step": 541 }, { "epoch": 2.5545601409898664, "grad_norm": 0.20488876104354858, "learning_rate": 1.4779874213836478e-06, "loss": 6.5529, "step": 542 }, { "epoch": 2.5592598032016447, "grad_norm": 0.16557811200618744, "learning_rate": 1.4622641509433962e-06, "loss": 8.678, "step": 543 }, { "epoch": 2.5639594654134235, "grad_norm": 0.25693365931510925, "learning_rate": 1.4465408805031447e-06, "loss": 8.6303, "step": 544 }, { "epoch": 2.568659127625202, "grad_norm": 0.1821936070919037, "learning_rate": 1.430817610062893e-06, "loss": 9.5128, "step": 545 }, { "epoch": 2.5733587898369805, "grad_norm": 0.17231619358062744, "learning_rate": 1.4150943396226415e-06, "loss": 9.2251, "step": 546 }, { "epoch": 2.578058452048759, "grad_norm": 0.15963391959667206, "learning_rate": 1.39937106918239e-06, "loss": 9.2843, "step": 547 }, { "epoch": 2.5827581142605376, "grad_norm": 0.13365676999092102, "learning_rate": 1.3836477987421384e-06, "loss": 8.5598, "step": 548 }, { "epoch": 2.587457776472316, "grad_norm": 0.14855387806892395, "learning_rate": 1.3679245283018869e-06, "loss": 8.0332, "step": 549 }, { "epoch": 2.5921574386840947, "grad_norm": 0.22655688226222992, "learning_rate": 1.3522012578616353e-06, "loss": 9.4581, "step": 550 }, { "epoch": 2.596857100895873, "grad_norm": 0.17062649130821228, "learning_rate": 1.3364779874213838e-06, "loss": 8.4587, "step": 551 }, { "epoch": 2.6015567631076517, "grad_norm": 0.19863267242908478, "learning_rate": 1.3207547169811322e-06, "loss": 9.1046, "step": 552 }, { "epoch": 2.60625642531943, "grad_norm": 0.21137800812721252, "learning_rate": 1.3050314465408807e-06, "loss": 7.1397, "step": 553 }, { "epoch": 2.610956087531209, "grad_norm": 0.16712285578250885, "learning_rate": 1.289308176100629e-06, "loss": 8.2662, "step": 554 }, { "epoch": 2.615655749742987, "grad_norm": 0.1876005232334137, "learning_rate": 1.2735849056603775e-06, "loss": 8.335, "step": 555 }, { "epoch": 2.620355411954766, "grad_norm": 0.1494407057762146, "learning_rate": 1.257861635220126e-06, "loss": 8.6635, "step": 556 }, { "epoch": 2.625055074166544, "grad_norm": 0.25798699259757996, "learning_rate": 1.2421383647798742e-06, "loss": 9.084, "step": 557 }, { "epoch": 2.629754736378323, "grad_norm": 0.16380231082439423, "learning_rate": 1.2264150943396227e-06, "loss": 7.4629, "step": 558 }, { "epoch": 2.634454398590101, "grad_norm": 0.17343473434448242, "learning_rate": 1.210691823899371e-06, "loss": 8.6482, "step": 559 }, { "epoch": 2.63915406080188, "grad_norm": 0.13443821668624878, "learning_rate": 1.1949685534591195e-06, "loss": 8.8155, "step": 560 }, { "epoch": 2.6438537230136583, "grad_norm": 0.14360274374485016, "learning_rate": 1.179245283018868e-06, "loss": 8.8279, "step": 561 }, { "epoch": 2.648553385225437, "grad_norm": 0.14061862230300903, "learning_rate": 1.1635220125786164e-06, "loss": 7.4424, "step": 562 }, { "epoch": 2.6532530474372154, "grad_norm": 0.1606811285018921, "learning_rate": 1.1477987421383649e-06, "loss": 9.038, "step": 563 }, { "epoch": 2.657952709648994, "grad_norm": 0.22029317915439606, "learning_rate": 1.1320754716981133e-06, "loss": 8.2132, "step": 564 }, { "epoch": 2.6626523718607724, "grad_norm": 0.23397786915302277, "learning_rate": 1.1163522012578618e-06, "loss": 9.5489, "step": 565 }, { "epoch": 2.667352034072551, "grad_norm": 0.16674695909023285, "learning_rate": 1.1006289308176102e-06, "loss": 8.4566, "step": 566 }, { "epoch": 2.6720516962843295, "grad_norm": 0.13880182802677155, "learning_rate": 1.0849056603773587e-06, "loss": 8.458, "step": 567 }, { "epoch": 2.6767513584961082, "grad_norm": 0.1874023973941803, "learning_rate": 1.069182389937107e-06, "loss": 7.3215, "step": 568 }, { "epoch": 2.6814510207078865, "grad_norm": 0.21336112916469574, "learning_rate": 1.0534591194968555e-06, "loss": 8.5312, "step": 569 }, { "epoch": 2.6861506829196653, "grad_norm": 0.17397966980934143, "learning_rate": 1.037735849056604e-06, "loss": 9.6743, "step": 570 }, { "epoch": 2.6908503451314436, "grad_norm": 0.1389649212360382, "learning_rate": 1.0220125786163524e-06, "loss": 8.3201, "step": 571 }, { "epoch": 2.6955500073432224, "grad_norm": 0.25726965069770813, "learning_rate": 1.0062893081761007e-06, "loss": 7.3605, "step": 572 }, { "epoch": 2.7002496695550007, "grad_norm": 0.22075869143009186, "learning_rate": 9.90566037735849e-07, "loss": 8.0443, "step": 573 }, { "epoch": 2.7049493317667794, "grad_norm": 0.15426917374134064, "learning_rate": 9.748427672955975e-07, "loss": 7.7999, "step": 574 }, { "epoch": 2.7096489939785577, "grad_norm": 0.1829274594783783, "learning_rate": 9.59119496855346e-07, "loss": 8.0041, "step": 575 }, { "epoch": 2.714348656190336, "grad_norm": 0.16636963188648224, "learning_rate": 9.433962264150944e-07, "loss": 7.5831, "step": 576 }, { "epoch": 2.719048318402115, "grad_norm": 0.2158714234828949, "learning_rate": 9.276729559748428e-07, "loss": 7.7653, "step": 577 }, { "epoch": 2.7237479806138936, "grad_norm": 0.16352398693561554, "learning_rate": 9.119496855345912e-07, "loss": 7.8731, "step": 578 }, { "epoch": 2.728447642825672, "grad_norm": 0.16695372760295868, "learning_rate": 8.962264150943397e-07, "loss": 8.2942, "step": 579 }, { "epoch": 2.73314730503745, "grad_norm": 0.18953217566013336, "learning_rate": 8.805031446540881e-07, "loss": 8.0006, "step": 580 }, { "epoch": 2.737846967249229, "grad_norm": 0.1785098761320114, "learning_rate": 8.647798742138364e-07, "loss": 8.4262, "step": 581 }, { "epoch": 2.7425466294610077, "grad_norm": 0.15528902411460876, "learning_rate": 8.490566037735849e-07, "loss": 7.6721, "step": 582 }, { "epoch": 2.747246291672786, "grad_norm": 0.19808244705200195, "learning_rate": 8.333333333333333e-07, "loss": 9.2401, "step": 583 }, { "epoch": 2.7519459538845643, "grad_norm": 0.1847376823425293, "learning_rate": 8.176100628930818e-07, "loss": 7.4667, "step": 584 }, { "epoch": 2.756645616096343, "grad_norm": 0.1532883197069168, "learning_rate": 8.018867924528302e-07, "loss": 9.105, "step": 585 }, { "epoch": 2.761345278308122, "grad_norm": 0.19109874963760376, "learning_rate": 7.861635220125787e-07, "loss": 6.9919, "step": 586 }, { "epoch": 2.7660449405199, "grad_norm": 0.16163142025470734, "learning_rate": 7.704402515723271e-07, "loss": 8.3757, "step": 587 }, { "epoch": 2.7707446027316784, "grad_norm": 0.19182686507701874, "learning_rate": 7.547169811320755e-07, "loss": 8.0948, "step": 588 }, { "epoch": 2.775444264943457, "grad_norm": 0.18088266253471375, "learning_rate": 7.389937106918239e-07, "loss": 8.25, "step": 589 }, { "epoch": 2.780143927155236, "grad_norm": 0.14317212998867035, "learning_rate": 7.232704402515723e-07, "loss": 9.0643, "step": 590 }, { "epoch": 2.7848435893670143, "grad_norm": 0.19755391776561737, "learning_rate": 7.075471698113208e-07, "loss": 7.6289, "step": 591 }, { "epoch": 2.7895432515787926, "grad_norm": 0.17980189621448517, "learning_rate": 6.918238993710692e-07, "loss": 9.468, "step": 592 }, { "epoch": 2.7942429137905713, "grad_norm": 0.18988114595413208, "learning_rate": 6.761006289308177e-07, "loss": 8.8408, "step": 593 }, { "epoch": 2.79894257600235, "grad_norm": 0.199185311794281, "learning_rate": 6.603773584905661e-07, "loss": 7.489, "step": 594 }, { "epoch": 2.8036422382141284, "grad_norm": 0.21990296244621277, "learning_rate": 6.446540880503145e-07, "loss": 9.0859, "step": 595 }, { "epoch": 2.8083419004259067, "grad_norm": 0.14740413427352905, "learning_rate": 6.28930817610063e-07, "loss": 7.44, "step": 596 }, { "epoch": 2.8130415626376855, "grad_norm": 0.14299529790878296, "learning_rate": 6.132075471698113e-07, "loss": 8.3645, "step": 597 }, { "epoch": 2.817741224849464, "grad_norm": 0.11146759241819382, "learning_rate": 5.974842767295598e-07, "loss": 8.5886, "step": 598 }, { "epoch": 2.8224408870612425, "grad_norm": 0.14299720525741577, "learning_rate": 5.817610062893082e-07, "loss": 6.8773, "step": 599 }, { "epoch": 2.827140549273021, "grad_norm": 0.1588277667760849, "learning_rate": 5.660377358490567e-07, "loss": 7.5042, "step": 600 }, { "epoch": 2.8318402114847996, "grad_norm": 0.20314551889896393, "learning_rate": 5.503144654088051e-07, "loss": 8.6489, "step": 601 }, { "epoch": 2.836539873696578, "grad_norm": 0.21140792965888977, "learning_rate": 5.345911949685535e-07, "loss": 8.3971, "step": 602 }, { "epoch": 2.8412395359083567, "grad_norm": 0.18069565296173096, "learning_rate": 5.18867924528302e-07, "loss": 8.2446, "step": 603 }, { "epoch": 2.845939198120135, "grad_norm": 0.1393459141254425, "learning_rate": 5.031446540880503e-07, "loss": 7.7026, "step": 604 }, { "epoch": 2.8506388603319137, "grad_norm": 0.16935023665428162, "learning_rate": 4.874213836477988e-07, "loss": 8.8161, "step": 605 }, { "epoch": 2.855338522543692, "grad_norm": 0.15383748710155487, "learning_rate": 4.716981132075472e-07, "loss": 7.2609, "step": 606 }, { "epoch": 2.860038184755471, "grad_norm": 0.16811497509479523, "learning_rate": 4.559748427672956e-07, "loss": 8.7077, "step": 607 }, { "epoch": 2.864737846967249, "grad_norm": 0.14312513172626495, "learning_rate": 4.4025157232704405e-07, "loss": 8.4162, "step": 608 }, { "epoch": 2.869437509179028, "grad_norm": 0.19139531254768372, "learning_rate": 4.2452830188679244e-07, "loss": 9.3706, "step": 609 }, { "epoch": 2.874137171390806, "grad_norm": 0.17848049104213715, "learning_rate": 4.088050314465409e-07, "loss": 8.8553, "step": 610 }, { "epoch": 2.878836833602585, "grad_norm": 0.14866691827774048, "learning_rate": 3.9308176100628933e-07, "loss": 8.0507, "step": 611 }, { "epoch": 2.883536495814363, "grad_norm": 0.1754736602306366, "learning_rate": 3.773584905660378e-07, "loss": 8.4659, "step": 612 }, { "epoch": 2.888236158026142, "grad_norm": 0.16565312445163727, "learning_rate": 3.6163522012578617e-07, "loss": 8.1372, "step": 613 }, { "epoch": 2.8929358202379203, "grad_norm": 0.16558490693569183, "learning_rate": 3.459119496855346e-07, "loss": 6.3748, "step": 614 }, { "epoch": 2.897635482449699, "grad_norm": 0.15964233875274658, "learning_rate": 3.3018867924528305e-07, "loss": 8.0529, "step": 615 }, { "epoch": 2.9023351446614774, "grad_norm": 0.14555031061172485, "learning_rate": 3.144654088050315e-07, "loss": 9.1119, "step": 616 }, { "epoch": 2.907034806873256, "grad_norm": 0.15685509145259857, "learning_rate": 2.987421383647799e-07, "loss": 7.8754, "step": 617 }, { "epoch": 2.9117344690850344, "grad_norm": 0.2125602513551712, "learning_rate": 2.8301886792452833e-07, "loss": 7.9557, "step": 618 }, { "epoch": 2.916434131296813, "grad_norm": 0.1280517280101776, "learning_rate": 2.672955974842768e-07, "loss": 7.3978, "step": 619 }, { "epoch": 2.9211337935085915, "grad_norm": 0.1709686517715454, "learning_rate": 2.5157232704402517e-07, "loss": 8.94, "step": 620 }, { "epoch": 2.9258334557203702, "grad_norm": 0.19064606726169586, "learning_rate": 2.358490566037736e-07, "loss": 7.9244, "step": 621 }, { "epoch": 2.9305331179321485, "grad_norm": 0.17508478462696075, "learning_rate": 2.2012578616352203e-07, "loss": 7.2854, "step": 622 }, { "epoch": 2.9352327801439273, "grad_norm": 0.14699864387512207, "learning_rate": 2.0440251572327044e-07, "loss": 8.2576, "step": 623 }, { "epoch": 2.9399324423557056, "grad_norm": 0.14411219954490662, "learning_rate": 1.886792452830189e-07, "loss": 8.9638, "step": 624 }, { "epoch": 2.9446321045674844, "grad_norm": 0.20984366536140442, "learning_rate": 1.729559748427673e-07, "loss": 9.2744, "step": 625 }, { "epoch": 2.9493317667792627, "grad_norm": 0.17715144157409668, "learning_rate": 1.5723270440251575e-07, "loss": 7.7256, "step": 626 }, { "epoch": 2.9540314289910414, "grad_norm": 0.21797651052474976, "learning_rate": 1.4150943396226417e-07, "loss": 7.1143, "step": 627 }, { "epoch": 2.9587310912028197, "grad_norm": 0.13316509127616882, "learning_rate": 1.2578616352201258e-07, "loss": 9.2316, "step": 628 }, { "epoch": 2.963430753414598, "grad_norm": 0.2036881297826767, "learning_rate": 1.1006289308176101e-07, "loss": 7.8184, "step": 629 }, { "epoch": 2.968130415626377, "grad_norm": 0.1443832665681839, "learning_rate": 9.433962264150944e-08, "loss": 8.4576, "step": 630 }, { "epoch": 2.9728300778381556, "grad_norm": 0.139601930975914, "learning_rate": 7.861635220125787e-08, "loss": 7.6434, "step": 631 }, { "epoch": 2.977529740049934, "grad_norm": 0.17927348613739014, "learning_rate": 6.289308176100629e-08, "loss": 8.1955, "step": 632 }, { "epoch": 2.982229402261712, "grad_norm": 0.21196012198925018, "learning_rate": 4.716981132075472e-08, "loss": 8.0864, "step": 633 }, { "epoch": 2.986929064473491, "grad_norm": 0.1846797615289688, "learning_rate": 3.1446540880503146e-08, "loss": 7.8229, "step": 634 }, { "epoch": 2.9916287266852697, "grad_norm": 0.17866931855678558, "learning_rate": 1.5723270440251573e-08, "loss": 7.0628, "step": 635 }, { "epoch": 2.996328388897048, "grad_norm": 0.13222432136535645, "learning_rate": 0.0, "loss": 8.0092, "step": 636 }, { "epoch": 2.996328388897048, "step": 636, "total_flos": 2.967999731877282e+18, "train_loss": 10.161077564617372, "train_runtime": 60566.2542, "train_samples_per_second": 1.349, "train_steps_per_second": 0.011 } ], "logging_steps": 1.0, "max_steps": 636, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.967999731877282e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }