{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50000, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008, "grad_norm": 1.5249121521928364, "learning_rate": 8e-10, "loss": 0.3578, "step": 2 }, { "epoch": 0.0016, "grad_norm": 3.04013114408118, "learning_rate": 1.6e-09, "loss": -0.3638, "step": 4 }, { "epoch": 0.0024, "grad_norm": 14.399566409644445, "learning_rate": 2.4e-09, "loss": -0.5453, "step": 6 }, { "epoch": 0.0032, "grad_norm": 1.058111653988024, "learning_rate": 3.2e-09, "loss": 0.4043, "step": 8 }, { "epoch": 0.004, "grad_norm": 5.8550272702903285, "learning_rate": 4e-09, "loss": 0.2492, "step": 10 }, { "epoch": 0.0048, "grad_norm": 0.8103035684471422, "learning_rate": 4.8e-09, "loss": 0.1155, "step": 12 }, { "epoch": 0.0056, "grad_norm": 1.439010851325566, "learning_rate": 5.6e-09, "loss": 0.4965, "step": 14 }, { "epoch": 0.0064, "grad_norm": 3.279695602874879, "learning_rate": 6.4e-09, "loss": 0.1119, "step": 16 }, { "epoch": 0.0072, "grad_norm": 3.9544253242598595, "learning_rate": 7.199999999999999e-09, "loss": 0.2711, "step": 18 }, { "epoch": 0.008, "grad_norm": 2.303298337789742, "learning_rate": 8e-09, "loss": -0.2384, "step": 20 }, { "epoch": 0.0088, "grad_norm": 0.6956054922056536, "learning_rate": 8.8e-09, "loss": -0.0266, "step": 22 }, { "epoch": 0.0096, "grad_norm": 1.6058336187090627, "learning_rate": 9.6e-09, "loss": 0.2667, "step": 24 }, { "epoch": 0.0104, "grad_norm": 0.5873869381231204, "learning_rate": 1.0399999999999999e-08, "loss": 0.279, "step": 26 }, { "epoch": 0.0112, "grad_norm": 1.3919968190642873, "learning_rate": 1.12e-08, "loss": 0.2859, "step": 28 }, { "epoch": 0.012, "grad_norm": 0.9786688052322894, "learning_rate": 1.1999999999999998e-08, "loss": 0.2605, "step": 30 }, { "epoch": 0.0128, "grad_norm": 0.997140725588297, "learning_rate": 1.28e-08, "loss": -0.0304, "step": 32 }, { "epoch": 0.0136, "grad_norm": 0.712545871939318, "learning_rate": 1.3600000000000001e-08, "loss": 0.1921, "step": 34 }, { "epoch": 0.0144, "grad_norm": 2.3171450622225698, "learning_rate": 1.4399999999999998e-08, "loss": 0.0005, "step": 36 }, { "epoch": 0.0152, "grad_norm": 1.383748393656482, "learning_rate": 1.52e-08, "loss": 0.6168, "step": 38 }, { "epoch": 0.016, "grad_norm": 2.2831084349959063, "learning_rate": 1.6e-08, "loss": -0.2458, "step": 40 }, { "epoch": 0.0168, "grad_norm": 3.7540820685705554, "learning_rate": 1.68e-08, "loss": -0.5973, "step": 42 }, { "epoch": 0.0176, "grad_norm": 1.581263276047286, "learning_rate": 1.76e-08, "loss": 0.2239, "step": 44 }, { "epoch": 0.0184, "grad_norm": 2.233121763402274, "learning_rate": 1.84e-08, "loss": 0.0193, "step": 46 }, { "epoch": 0.0192, "grad_norm": 0.8606914336336169, "learning_rate": 1.92e-08, "loss": -0.253, "step": 48 }, { "epoch": 0.02, "grad_norm": 5.702797171498987, "learning_rate": 2e-08, "loss": 0.2654, "step": 50 }, { "epoch": 0.0208, "grad_norm": 0.4089715021280977, "learning_rate": 2.0799999999999998e-08, "loss": 0.292, "step": 52 }, { "epoch": 0.0216, "grad_norm": 0.8023217053247889, "learning_rate": 2.1599999999999998e-08, "loss": 0.46, "step": 54 }, { "epoch": 0.0224, "grad_norm": 5.4143014554749245, "learning_rate": 2.24e-08, "loss": -0.1526, "step": 56 }, { "epoch": 0.0232, "grad_norm": 0.6957451761861397, "learning_rate": 2.32e-08, "loss": 0.2491, "step": 58 }, { "epoch": 0.024, "grad_norm": 3.6795198695452123, "learning_rate": 2.3999999999999997e-08, "loss": -0.0906, "step": 60 }, { "epoch": 0.0248, "grad_norm": 1.7378271537499541, "learning_rate": 2.4799999999999997e-08, "loss": 0.7429, "step": 62 }, { "epoch": 0.0256, "grad_norm": 3.6225251386805883, "learning_rate": 2.56e-08, "loss": 0.3386, "step": 64 }, { "epoch": 0.0264, "grad_norm": 1.7773145305018176, "learning_rate": 2.64e-08, "loss": -0.2629, "step": 66 }, { "epoch": 0.0272, "grad_norm": 0.7495628843105195, "learning_rate": 2.7200000000000002e-08, "loss": 0.1966, "step": 68 }, { "epoch": 0.028, "grad_norm": 2.885359440654563, "learning_rate": 2.8000000000000003e-08, "loss": 0.0439, "step": 70 }, { "epoch": 0.0288, "grad_norm": 1.015945747749439, "learning_rate": 2.8799999999999996e-08, "loss": -0.2864, "step": 72 }, { "epoch": 0.0296, "grad_norm": 0.5182079375617211, "learning_rate": 2.9599999999999997e-08, "loss": -0.166, "step": 74 }, { "epoch": 0.0304, "grad_norm": 0.9817246812987965, "learning_rate": 3.04e-08, "loss": 0.3014, "step": 76 }, { "epoch": 0.0312, "grad_norm": 0.4557071988963493, "learning_rate": 3.12e-08, "loss": -0.1152, "step": 78 }, { "epoch": 0.032, "grad_norm": 4.763328548185686, "learning_rate": 3.2e-08, "loss": -0.1538, "step": 80 }, { "epoch": 0.0328, "grad_norm": 0.9349584077447443, "learning_rate": 3.28e-08, "loss": -0.0778, "step": 82 }, { "epoch": 0.0336, "grad_norm": 4.258851934530662, "learning_rate": 3.36e-08, "loss": -0.1298, "step": 84 }, { "epoch": 0.0344, "grad_norm": 1.3039223561516173, "learning_rate": 3.44e-08, "loss": 0.5712, "step": 86 }, { "epoch": 0.0352, "grad_norm": 0.8670442729703873, "learning_rate": 3.52e-08, "loss": 0.0303, "step": 88 }, { "epoch": 0.036, "grad_norm": 1.0441412548980031, "learning_rate": 3.6e-08, "loss": -0.1422, "step": 90 }, { "epoch": 0.0368, "grad_norm": 3.5521797457066855, "learning_rate": 3.68e-08, "loss": -0.453, "step": 92 }, { "epoch": 0.0376, "grad_norm": 0.7571538831950513, "learning_rate": 3.76e-08, "loss": 0.4032, "step": 94 }, { "epoch": 0.0384, "grad_norm": 0.9420773865217862, "learning_rate": 3.84e-08, "loss": 0.4746, "step": 96 }, { "epoch": 0.0392, "grad_norm": 0.7161321209562558, "learning_rate": 3.92e-08, "loss": 0.0431, "step": 98 }, { "epoch": 0.04, "grad_norm": 0.7244176997474762, "learning_rate": 4e-08, "loss": 0.0103, "step": 100 }, { "epoch": 0.0408, "grad_norm": 1.2508780434898261, "learning_rate": 4.0799999999999995e-08, "loss": -0.2087, "step": 102 }, { "epoch": 0.0416, "grad_norm": 2.8150494985583787, "learning_rate": 4.1599999999999995e-08, "loss": -0.4386, "step": 104 }, { "epoch": 0.0424, "grad_norm": 5.244508372577221, "learning_rate": 4.2399999999999996e-08, "loss": -0.0776, "step": 106 }, { "epoch": 0.0432, "grad_norm": 1.0118362643085135, "learning_rate": 4.3199999999999996e-08, "loss": 0.2148, "step": 108 }, { "epoch": 0.044, "grad_norm": 2.453561169568128, "learning_rate": 4.4e-08, "loss": -0.28, "step": 110 }, { "epoch": 0.0448, "grad_norm": 5.3747338716221895, "learning_rate": 4.48e-08, "loss": 0.4104, "step": 112 }, { "epoch": 0.0456, "grad_norm": 0.3632823515674558, "learning_rate": 4.56e-08, "loss": 0.1336, "step": 114 }, { "epoch": 0.0464, "grad_norm": 2.2581925885780403, "learning_rate": 4.64e-08, "loss": 0.3285, "step": 116 }, { "epoch": 0.0472, "grad_norm": 1.4956909565408185, "learning_rate": 4.719999999999999e-08, "loss": 0.0122, "step": 118 }, { "epoch": 0.048, "grad_norm": 1.7769183707894964, "learning_rate": 4.799999999999999e-08, "loss": 0.5743, "step": 120 }, { "epoch": 0.0488, "grad_norm": 1.2583349136163662, "learning_rate": 4.8799999999999994e-08, "loss": -0.0307, "step": 122 }, { "epoch": 0.0496, "grad_norm": 0.7008190210050554, "learning_rate": 4.9599999999999994e-08, "loss": 0.0008, "step": 124 }, { "epoch": 0.0504, "grad_norm": 8.020060442286011, "learning_rate": 5.04e-08, "loss": -0.1381, "step": 126 }, { "epoch": 0.0512, "grad_norm": 3.2600734226930705, "learning_rate": 5.12e-08, "loss": -0.152, "step": 128 }, { "epoch": 0.052, "grad_norm": 0.8910507406833589, "learning_rate": 5.2e-08, "loss": -0.4405, "step": 130 }, { "epoch": 0.0528, "grad_norm": 0.7722355780927433, "learning_rate": 5.28e-08, "loss": 0.2391, "step": 132 }, { "epoch": 0.0536, "grad_norm": 1.6289451664213528, "learning_rate": 5.3600000000000004e-08, "loss": -0.1101, "step": 134 }, { "epoch": 0.0544, "grad_norm": 1.0056870022444548, "learning_rate": 5.4400000000000004e-08, "loss": 0.0633, "step": 136 }, { "epoch": 0.0552, "grad_norm": 2.3308622414962508, "learning_rate": 5.5200000000000005e-08, "loss": 0.4368, "step": 138 }, { "epoch": 0.056, "grad_norm": 0.7924323699418531, "learning_rate": 5.6000000000000005e-08, "loss": 0.5237, "step": 140 }, { "epoch": 0.0568, "grad_norm": 3.8386273487570692, "learning_rate": 5.679999999999999e-08, "loss": -0.1094, "step": 142 }, { "epoch": 0.0576, "grad_norm": 1.2380510876659898, "learning_rate": 5.759999999999999e-08, "loss": 0.1722, "step": 144 }, { "epoch": 0.0584, "grad_norm": 1.9382814912362036, "learning_rate": 5.8399999999999994e-08, "loss": -0.1648, "step": 146 }, { "epoch": 0.0592, "grad_norm": 1.3282095077859473, "learning_rate": 5.9199999999999994e-08, "loss": 0.0622, "step": 148 }, { "epoch": 0.06, "grad_norm": 1.555494681939169, "learning_rate": 6e-08, "loss": -0.3911, "step": 150 }, { "epoch": 0.0608, "grad_norm": 2.669943445564743, "learning_rate": 6.08e-08, "loss": -0.4855, "step": 152 }, { "epoch": 0.0616, "grad_norm": 2.8935870289471177, "learning_rate": 6.16e-08, "loss": -0.5626, "step": 154 }, { "epoch": 0.0624, "grad_norm": 4.067814139067205, "learning_rate": 6.24e-08, "loss": -0.6101, "step": 156 }, { "epoch": 0.0632, "grad_norm": 7.45875665988826, "learning_rate": 6.32e-08, "loss": -0.8887, "step": 158 }, { "epoch": 0.064, "grad_norm": 1.113875500199247, "learning_rate": 6.4e-08, "loss": -0.0743, "step": 160 }, { "epoch": 0.0648, "grad_norm": 1.0811766104737213, "learning_rate": 6.48e-08, "loss": 0.4856, "step": 162 }, { "epoch": 0.0656, "grad_norm": 3.6239338057499606, "learning_rate": 6.56e-08, "loss": -0.2057, "step": 164 }, { "epoch": 0.0664, "grad_norm": 1.9376472037589891, "learning_rate": 6.64e-08, "loss": 0.0629, "step": 166 }, { "epoch": 0.0672, "grad_norm": 0.8329527775863854, "learning_rate": 6.72e-08, "loss": 0.1928, "step": 168 }, { "epoch": 0.068, "grad_norm": 3.7893101636338664, "learning_rate": 6.8e-08, "loss": 0.4232, "step": 170 }, { "epoch": 0.0688, "grad_norm": 1.0070374331551073, "learning_rate": 6.88e-08, "loss": 0.1722, "step": 172 }, { "epoch": 0.0696, "grad_norm": 0.9497901793517014, "learning_rate": 6.959999999999999e-08, "loss": -0.2653, "step": 174 }, { "epoch": 0.0704, "grad_norm": 1.650849094611574, "learning_rate": 7.04e-08, "loss": -0.7653, "step": 176 }, { "epoch": 0.0712, "grad_norm": 1.3515156957900079, "learning_rate": 7.119999999999999e-08, "loss": -0.8214, "step": 178 }, { "epoch": 0.072, "grad_norm": 3.1324182478150067, "learning_rate": 7.2e-08, "loss": -0.7823, "step": 180 }, { "epoch": 0.0728, "grad_norm": 1.4174012963850768, "learning_rate": 7.279999999999999e-08, "loss": 0.2644, "step": 182 }, { "epoch": 0.0736, "grad_norm": 1.210359954223697, "learning_rate": 7.36e-08, "loss": 0.2054, "step": 184 }, { "epoch": 0.0744, "grad_norm": 0.8136072967624918, "learning_rate": 7.439999999999999e-08, "loss": 0.2106, "step": 186 }, { "epoch": 0.0752, "grad_norm": 2.585403352286107, "learning_rate": 7.52e-08, "loss": -0.4224, "step": 188 }, { "epoch": 0.076, "grad_norm": 10.231551056457478, "learning_rate": 7.599999999999999e-08, "loss": 0.2142, "step": 190 }, { "epoch": 0.0768, "grad_norm": 3.2173359330709483, "learning_rate": 7.68e-08, "loss": 0.6189, "step": 192 }, { "epoch": 0.0776, "grad_norm": 2.124353538166263, "learning_rate": 7.76e-08, "loss": 0.2982, "step": 194 }, { "epoch": 0.0784, "grad_norm": 0.552359523142279, "learning_rate": 7.84e-08, "loss": 0.143, "step": 196 }, { "epoch": 0.0792, "grad_norm": 0.8478239264393724, "learning_rate": 7.92e-08, "loss": 0.2097, "step": 198 }, { "epoch": 0.08, "grad_norm": 5.107721369380915, "learning_rate": 8e-08, "loss": 0.5133, "step": 200 }, { "epoch": 0.0808, "grad_norm": 0.7042643328899001, "learning_rate": 8.08e-08, "loss": 0.299, "step": 202 }, { "epoch": 0.0816, "grad_norm": 1.8155271031174458, "learning_rate": 8.159999999999999e-08, "loss": -0.2053, "step": 204 }, { "epoch": 0.0824, "grad_norm": 9.910433671130228, "learning_rate": 8.24e-08, "loss": -0.6662, "step": 206 }, { "epoch": 0.0832, "grad_norm": 2.785320281862689, "learning_rate": 8.319999999999999e-08, "loss": -0.1989, "step": 208 }, { "epoch": 0.084, "grad_norm": 8.98969821762216, "learning_rate": 8.4e-08, "loss": 0.0962, "step": 210 }, { "epoch": 0.0848, "grad_norm": 3.886512189538636, "learning_rate": 8.479999999999999e-08, "loss": 0.6847, "step": 212 }, { "epoch": 0.0856, "grad_norm": 1.354000381017739, "learning_rate": 8.56e-08, "loss": 0.0522, "step": 214 }, { "epoch": 0.0864, "grad_norm": 5.066941305120673, "learning_rate": 8.639999999999999e-08, "loss": -0.1425, "step": 216 }, { "epoch": 0.0872, "grad_norm": 3.7051328665717844, "learning_rate": 8.72e-08, "loss": -0.7429, "step": 218 }, { "epoch": 0.088, "grad_norm": 0.45345084267213714, "learning_rate": 8.8e-08, "loss": -0.0116, "step": 220 }, { "epoch": 0.0888, "grad_norm": 3.441731395892135, "learning_rate": 8.88e-08, "loss": -0.448, "step": 222 }, { "epoch": 0.0896, "grad_norm": 1.3984343399803179, "learning_rate": 8.96e-08, "loss": -0.4024, "step": 224 }, { "epoch": 0.0904, "grad_norm": 3.182154434652412, "learning_rate": 9.04e-08, "loss": -0.3474, "step": 226 }, { "epoch": 0.0912, "grad_norm": 6.415250120623118, "learning_rate": 9.12e-08, "loss": 0.4669, "step": 228 }, { "epoch": 0.092, "grad_norm": 1.7120711032079754, "learning_rate": 9.2e-08, "loss": 0.0868, "step": 230 }, { "epoch": 0.0928, "grad_norm": 0.5203104044416641, "learning_rate": 9.28e-08, "loss": 0.0763, "step": 232 }, { "epoch": 0.0936, "grad_norm": 0.9813223999574935, "learning_rate": 9.36e-08, "loss": 0.1502, "step": 234 }, { "epoch": 0.0944, "grad_norm": 1.7901780101296731, "learning_rate": 9.439999999999999e-08, "loss": 0.0934, "step": 236 }, { "epoch": 0.0952, "grad_norm": 0.8817934891230337, "learning_rate": 9.519999999999999e-08, "loss": 0.2892, "step": 238 }, { "epoch": 0.096, "grad_norm": 2.9378020793772595, "learning_rate": 9.599999999999999e-08, "loss": -0.0585, "step": 240 }, { "epoch": 0.0968, "grad_norm": 1.568885492438894, "learning_rate": 9.679999999999999e-08, "loss": 0.0506, "step": 242 }, { "epoch": 0.0976, "grad_norm": 8.732342707439889, "learning_rate": 9.759999999999999e-08, "loss": -0.7635, "step": 244 }, { "epoch": 0.0984, "grad_norm": 6.993836328479647, "learning_rate": 9.84e-08, "loss": -0.6434, "step": 246 }, { "epoch": 0.0992, "grad_norm": 0.7295566781785762, "learning_rate": 9.919999999999999e-08, "loss": 0.232, "step": 248 }, { "epoch": 0.1, "grad_norm": 4.386003779293064, "learning_rate": 1e-07, "loss": -0.0654, "step": 250 }, { "epoch": 0.1008, "grad_norm": 0.7038059549047322, "learning_rate": 9.999980504497802e-08, "loss": 0.6679, "step": 252 }, { "epoch": 0.1016, "grad_norm": 2.9598822379536207, "learning_rate": 9.99992201814324e-08, "loss": 0.131, "step": 254 }, { "epoch": 0.1024, "grad_norm": 0.715967277720422, "learning_rate": 9.999824541392403e-08, "loss": 0.2579, "step": 256 }, { "epoch": 0.1032, "grad_norm": 3.2102671741061517, "learning_rate": 9.999688075005433e-08, "loss": 0.2389, "step": 258 }, { "epoch": 0.104, "grad_norm": 6.929091165178379, "learning_rate": 9.999512620046521e-08, "loss": -0.7848, "step": 260 }, { "epoch": 0.1048, "grad_norm": 2.1099354362146148, "learning_rate": 9.999298177883901e-08, "loss": 0.3355, "step": 262 }, { "epoch": 0.1056, "grad_norm": 1.2644257110280757, "learning_rate": 9.999044750189838e-08, "loss": 0.0575, "step": 264 }, { "epoch": 0.1064, "grad_norm": 1.996930248451988, "learning_rate": 9.99875233894061e-08, "loss": -0.2129, "step": 266 }, { "epoch": 0.1072, "grad_norm": 1.5240927675226754, "learning_rate": 9.9984209464165e-08, "loss": -0.2003, "step": 268 }, { "epoch": 0.108, "grad_norm": 4.774899294709947, "learning_rate": 9.99805057520177e-08, "loss": 0.638, "step": 270 }, { "epoch": 0.1088, "grad_norm": 0.8387248005163368, "learning_rate": 9.997641228184654e-08, "loss": 0.3563, "step": 272 }, { "epoch": 0.1096, "grad_norm": 1.208727180928816, "learning_rate": 9.997192908557321e-08, "loss": 0.069, "step": 274 }, { "epoch": 0.1104, "grad_norm": 5.53379592764042, "learning_rate": 9.996705619815856e-08, "loss": -0.0175, "step": 276 }, { "epoch": 0.1112, "grad_norm": 0.9868503521319727, "learning_rate": 9.996179365760234e-08, "loss": -0.291, "step": 278 }, { "epoch": 0.112, "grad_norm": 1.9505786022160347, "learning_rate": 9.995614150494291e-08, "loss": -0.1812, "step": 280 }, { "epoch": 0.1128, "grad_norm": 1.4087235500925508, "learning_rate": 9.995009978425691e-08, "loss": 0.3919, "step": 282 }, { "epoch": 0.1136, "grad_norm": 0.9739855158507091, "learning_rate": 9.994366854265884e-08, "loss": 0.092, "step": 284 }, { "epoch": 0.1144, "grad_norm": 0.8823957198854653, "learning_rate": 9.993684783030087e-08, "loss": 0.1602, "step": 286 }, { "epoch": 0.1152, "grad_norm": 0.4284628297532619, "learning_rate": 9.992963770037227e-08, "loss": 0.0426, "step": 288 }, { "epoch": 0.116, "grad_norm": 5.36183114688074, "learning_rate": 9.992203820909905e-08, "loss": -0.5808, "step": 290 }, { "epoch": 0.1168, "grad_norm": 1.6702312862565898, "learning_rate": 9.991404941574359e-08, "loss": 0.6036, "step": 292 }, { "epoch": 0.1176, "grad_norm": 4.073720902655535, "learning_rate": 9.990567138260413e-08, "loss": -0.2273, "step": 294 }, { "epoch": 0.1184, "grad_norm": 0.9971658863201419, "learning_rate": 9.989690417501422e-08, "loss": -0.1263, "step": 296 }, { "epoch": 0.1192, "grad_norm": 3.631953518488206, "learning_rate": 9.988774786134233e-08, "loss": 0.4006, "step": 298 }, { "epoch": 0.12, "grad_norm": 0.7820782538072933, "learning_rate": 9.98782025129912e-08, "loss": -0.2066, "step": 300 }, { "epoch": 0.1208, "grad_norm": 0.5878841242648716, "learning_rate": 9.986826820439743e-08, "loss": 0.0478, "step": 302 }, { "epoch": 0.1216, "grad_norm": 4.974449133943707, "learning_rate": 9.985794501303068e-08, "loss": -0.1471, "step": 304 }, { "epoch": 0.1224, "grad_norm": 0.601813755158989, "learning_rate": 9.984723301939335e-08, "loss": 0.7176, "step": 306 }, { "epoch": 0.1232, "grad_norm": 1.730417084082021, "learning_rate": 9.983613230701966e-08, "loss": 0.2672, "step": 308 }, { "epoch": 0.124, "grad_norm": 7.242082158593375, "learning_rate": 9.982464296247522e-08, "loss": -0.2698, "step": 310 }, { "epoch": 0.1248, "grad_norm": 1.0223985096226282, "learning_rate": 9.981276507535624e-08, "loss": -0.0735, "step": 312 }, { "epoch": 0.1256, "grad_norm": 0.8180401255353809, "learning_rate": 9.980049873828886e-08, "loss": 0.1891, "step": 314 }, { "epoch": 0.1264, "grad_norm": 0.30041193539482725, "learning_rate": 9.978784404692845e-08, "loss": 0.2988, "step": 316 }, { "epoch": 0.1272, "grad_norm": 1.1628849074237195, "learning_rate": 9.977480109995883e-08, "loss": 0.3059, "step": 318 }, { "epoch": 0.128, "grad_norm": 0.4521879964000807, "learning_rate": 9.976136999909155e-08, "loss": -0.05, "step": 320 }, { "epoch": 0.1288, "grad_norm": 1.5045100482252924, "learning_rate": 9.974755084906502e-08, "loss": 0.2253, "step": 322 }, { "epoch": 0.1296, "grad_norm": 2.6671918830350028, "learning_rate": 9.97333437576437e-08, "loss": 0.4161, "step": 324 }, { "epoch": 0.1304, "grad_norm": 1.3360264740430006, "learning_rate": 9.97187488356174e-08, "loss": 0.2868, "step": 326 }, { "epoch": 0.1312, "grad_norm": 1.0280162385963292, "learning_rate": 9.970376619680023e-08, "loss": 0.2481, "step": 328 }, { "epoch": 0.132, "grad_norm": 2.0832282668075215, "learning_rate": 9.968839595802981e-08, "loss": 0.0362, "step": 330 }, { "epoch": 0.1328, "grad_norm": 2.0881240693312186, "learning_rate": 9.967263823916638e-08, "loss": 0.2028, "step": 332 }, { "epoch": 0.1336, "grad_norm": 4.159751465077768, "learning_rate": 9.965649316309176e-08, "loss": -0.2774, "step": 334 }, { "epoch": 0.1344, "grad_norm": 0.7533129835190434, "learning_rate": 9.963996085570852e-08, "loss": -0.1283, "step": 336 }, { "epoch": 0.1352, "grad_norm": 2.2959735255682103, "learning_rate": 9.962304144593891e-08, "loss": 0.0126, "step": 338 }, { "epoch": 0.136, "grad_norm": 2.197098452582562, "learning_rate": 9.96057350657239e-08, "loss": 0.4126, "step": 340 }, { "epoch": 0.1368, "grad_norm": 1.58670105786256, "learning_rate": 9.958804185002208e-08, "loss": 0.0979, "step": 342 }, { "epoch": 0.1376, "grad_norm": 0.8455424268179775, "learning_rate": 9.956996193680873e-08, "loss": 0.2987, "step": 344 }, { "epoch": 0.1384, "grad_norm": 2.302950068704004, "learning_rate": 9.955149546707462e-08, "loss": -0.767, "step": 346 }, { "epoch": 0.1392, "grad_norm": 7.511930834971542, "learning_rate": 9.953264258482504e-08, "loss": -0.0085, "step": 348 }, { "epoch": 0.14, "grad_norm": 1.9232817968569562, "learning_rate": 9.951340343707851e-08, "loss": -0.3246, "step": 350 }, { "epoch": 0.1408, "grad_norm": 2.6343239762516686, "learning_rate": 9.949377817386579e-08, "loss": 0.0478, "step": 352 }, { "epoch": 0.1416, "grad_norm": 7.259506557578069, "learning_rate": 9.94737669482286e-08, "loss": 0.2815, "step": 354 }, { "epoch": 0.1424, "grad_norm": 3.994459016722064, "learning_rate": 9.945336991621852e-08, "loss": 0.2718, "step": 356 }, { "epoch": 0.1432, "grad_norm": 4.984507811555781, "learning_rate": 9.94325872368957e-08, "loss": 0.1226, "step": 358 }, { "epoch": 0.144, "grad_norm": 1.79885150464052, "learning_rate": 9.941141907232764e-08, "loss": 0.2574, "step": 360 }, { "epoch": 0.1448, "grad_norm": 1.3244125851707795, "learning_rate": 9.938986558758794e-08, "loss": 0.2669, "step": 362 }, { "epoch": 0.1456, "grad_norm": 1.3154285100930394, "learning_rate": 9.936792695075501e-08, "loss": 0.2291, "step": 364 }, { "epoch": 0.1464, "grad_norm": 6.029684432028888, "learning_rate": 9.934560333291076e-08, "loss": -0.6053, "step": 366 }, { "epoch": 0.1472, "grad_norm": 0.5904631749204958, "learning_rate": 9.932289490813921e-08, "loss": 0.0027, "step": 368 }, { "epoch": 0.148, "grad_norm": 2.339856125696244, "learning_rate": 9.929980185352524e-08, "loss": -0.0744, "step": 370 }, { "epoch": 0.1488, "grad_norm": 0.688355527790139, "learning_rate": 9.927632434915314e-08, "loss": -0.1327, "step": 372 }, { "epoch": 0.1496, "grad_norm": 3.7397017176068985, "learning_rate": 9.925246257810518e-08, "loss": 0.2688, "step": 374 }, { "epoch": 0.1504, "grad_norm": 4.068851885945964, "learning_rate": 9.922821672646027e-08, "loss": 0.2873, "step": 376 }, { "epoch": 0.1512, "grad_norm": 1.5558033071036992, "learning_rate": 9.920358698329241e-08, "loss": 0.3764, "step": 378 }, { "epoch": 0.152, "grad_norm": 4.653760724189732, "learning_rate": 9.91785735406693e-08, "loss": 0.2148, "step": 380 }, { "epoch": 0.1528, "grad_norm": 3.650076624124744, "learning_rate": 9.915317659365076e-08, "loss": 0.5768, "step": 382 }, { "epoch": 0.1536, "grad_norm": 1.9826449037148115, "learning_rate": 9.912739634028733e-08, "loss": 0.222, "step": 384 }, { "epoch": 0.1544, "grad_norm": 3.4468216333543062, "learning_rate": 9.910123298161858e-08, "loss": -0.1868, "step": 386 }, { "epoch": 0.1552, "grad_norm": 3.165093593385751, "learning_rate": 9.907468672167164e-08, "loss": 0.091, "step": 388 }, { "epoch": 0.156, "grad_norm": 1.7541440814162952, "learning_rate": 9.904775776745957e-08, "loss": -0.1529, "step": 390 }, { "epoch": 0.1568, "grad_norm": 2.1449500638726633, "learning_rate": 9.902044632897977e-08, "loss": 0.4718, "step": 392 }, { "epoch": 0.1576, "grad_norm": 1.2436286136089842, "learning_rate": 9.899275261921234e-08, "loss": 0.1954, "step": 394 }, { "epoch": 0.1584, "grad_norm": 0.7997219065764383, "learning_rate": 9.896467685411837e-08, "loss": 0.175, "step": 396 }, { "epoch": 0.1592, "grad_norm": 0.697589771763831, "learning_rate": 9.89362192526383e-08, "loss": 0.3507, "step": 398 }, { "epoch": 0.16, "grad_norm": 3.5679392354742254, "learning_rate": 9.890738003669028e-08, "loss": 0.0395, "step": 400 }, { "epoch": 0.1608, "grad_norm": 1.6853215016570104, "learning_rate": 9.887815943116827e-08, "loss": -0.5082, "step": 402 }, { "epoch": 0.1616, "grad_norm": 6.85324036827588, "learning_rate": 9.88485576639404e-08, "loss": -0.1898, "step": 404 }, { "epoch": 0.1624, "grad_norm": 0.7110369727557027, "learning_rate": 9.881857496584725e-08, "loss": 0.3305, "step": 406 }, { "epoch": 0.1632, "grad_norm": 5.312906782551955, "learning_rate": 9.878821157069988e-08, "loss": -0.6801, "step": 408 }, { "epoch": 0.164, "grad_norm": 1.6548236452861331, "learning_rate": 9.875746771527815e-08, "loss": 0.1536, "step": 410 }, { "epoch": 0.1648, "grad_norm": 0.8245149921627257, "learning_rate": 9.872634363932886e-08, "loss": 0.2232, "step": 412 }, { "epoch": 0.1656, "grad_norm": 3.3934339705566265, "learning_rate": 9.869483958556375e-08, "loss": 0.291, "step": 414 }, { "epoch": 0.1664, "grad_norm": 1.4486896021406688, "learning_rate": 9.86629557996578e-08, "loss": -0.0683, "step": 416 }, { "epoch": 0.1672, "grad_norm": 3.52336675846488, "learning_rate": 9.863069253024718e-08, "loss": -0.0894, "step": 418 }, { "epoch": 0.168, "grad_norm": 1.1828263915820312, "learning_rate": 9.859805002892732e-08, "loss": 0.1527, "step": 420 }, { "epoch": 0.1688, "grad_norm": 1.372569800294142, "learning_rate": 9.856502855025099e-08, "loss": 0.2844, "step": 422 }, { "epoch": 0.1696, "grad_norm": 0.5902723465561974, "learning_rate": 9.853162835172635e-08, "loss": 0.1511, "step": 424 }, { "epoch": 0.1704, "grad_norm": 8.507287499579553, "learning_rate": 9.849784969381486e-08, "loss": -0.3861, "step": 426 }, { "epoch": 0.1712, "grad_norm": 3.9377780888287806, "learning_rate": 9.846369283992926e-08, "loss": -0.4743, "step": 428 }, { "epoch": 0.172, "grad_norm": 6.138594217904337, "learning_rate": 9.842915805643155e-08, "loss": 0.0042, "step": 430 }, { "epoch": 0.1728, "grad_norm": 1.0255687135099294, "learning_rate": 9.839424561263092e-08, "loss": -0.1807, "step": 432 }, { "epoch": 0.1736, "grad_norm": 7.290670829010674, "learning_rate": 9.835895578078164e-08, "loss": -0.2322, "step": 434 }, { "epoch": 0.1744, "grad_norm": 1.6945340966017044, "learning_rate": 9.832328883608088e-08, "loss": 0.2358, "step": 436 }, { "epoch": 0.1752, "grad_norm": 1.1149664885651573, "learning_rate": 9.828724505666664e-08, "loss": 0.1624, "step": 438 }, { "epoch": 0.176, "grad_norm": 0.33285825801011937, "learning_rate": 9.825082472361557e-08, "loss": 0.3141, "step": 440 }, { "epoch": 0.1768, "grad_norm": 22.31453237037553, "learning_rate": 9.821402812094073e-08, "loss": -0.0172, "step": 442 }, { "epoch": 0.1776, "grad_norm": 0.8704346962948754, "learning_rate": 9.817685553558944e-08, "loss": 0.1275, "step": 444 }, { "epoch": 0.1784, "grad_norm": 1.2245015051060388, "learning_rate": 9.813930725744094e-08, "loss": 0.1762, "step": 446 }, { "epoch": 0.1792, "grad_norm": 2.737100038252088, "learning_rate": 9.810138357930428e-08, "loss": 0.5655, "step": 448 }, { "epoch": 0.18, "grad_norm": 1.284138044981843, "learning_rate": 9.806308479691594e-08, "loss": -0.7118, "step": 450 }, { "epoch": 0.1808, "grad_norm": 3.123077229176161, "learning_rate": 9.802441120893748e-08, "loss": -0.2685, "step": 452 }, { "epoch": 0.1816, "grad_norm": 0.875376956117242, "learning_rate": 9.798536311695334e-08, "loss": 0.2294, "step": 454 }, { "epoch": 0.1824, "grad_norm": 7.85609200053166, "learning_rate": 9.794594082546834e-08, "loss": -0.4686, "step": 456 }, { "epoch": 0.1832, "grad_norm": 0.9291930325028112, "learning_rate": 9.790614464190548e-08, "loss": -0.6784, "step": 458 }, { "epoch": 0.184, "grad_norm": 0.9820076685954113, "learning_rate": 9.786597487660335e-08, "loss": -0.2107, "step": 460 }, { "epoch": 0.1848, "grad_norm": 1.460713191289714, "learning_rate": 9.782543184281388e-08, "loss": 0.1244, "step": 462 }, { "epoch": 0.1856, "grad_norm": 0.6179361492687564, "learning_rate": 9.77845158566998e-08, "loss": 0.1714, "step": 464 }, { "epoch": 0.1864, "grad_norm": 0.4346256375122305, "learning_rate": 9.774322723733215e-08, "loss": 0.3512, "step": 466 }, { "epoch": 0.1872, "grad_norm": 1.2715138565567314, "learning_rate": 9.770156630668789e-08, "loss": 0.1171, "step": 468 }, { "epoch": 0.188, "grad_norm": 1.5946034157660203, "learning_rate": 9.765953338964735e-08, "loss": 0.3309, "step": 470 }, { "epoch": 0.1888, "grad_norm": 0.7622483814283684, "learning_rate": 9.761712881399163e-08, "loss": 0.4465, "step": 472 }, { "epoch": 0.1896, "grad_norm": 1.9649939425146115, "learning_rate": 9.757435291040015e-08, "loss": 0.2973, "step": 474 }, { "epoch": 0.1904, "grad_norm": 0.9914397389532802, "learning_rate": 9.7531206012448e-08, "loss": 0.3274, "step": 476 }, { "epoch": 0.1912, "grad_norm": 0.7243228964834183, "learning_rate": 9.748768845660334e-08, "loss": 0.2104, "step": 478 }, { "epoch": 0.192, "grad_norm": 0.6054705552582188, "learning_rate": 9.744380058222482e-08, "loss": 0.0408, "step": 480 }, { "epoch": 0.1928, "grad_norm": 1.1899869470335924, "learning_rate": 9.739954273155891e-08, "loss": 0.1459, "step": 482 }, { "epoch": 0.1936, "grad_norm": 0.808307853178962, "learning_rate": 9.735491524973722e-08, "loss": 0.1125, "step": 484 }, { "epoch": 0.1944, "grad_norm": 1.0815367893705388, "learning_rate": 9.730991848477379e-08, "loss": -0.1682, "step": 486 }, { "epoch": 0.1952, "grad_norm": 1.1920456124546996, "learning_rate": 9.726455278756247e-08, "loss": 0.2205, "step": 488 }, { "epoch": 0.196, "grad_norm": 1.7732089420749002, "learning_rate": 9.721881851187405e-08, "loss": 0.1552, "step": 490 }, { "epoch": 0.1968, "grad_norm": 2.2419073705232715, "learning_rate": 9.717271601435362e-08, "loss": 0.2732, "step": 492 }, { "epoch": 0.1976, "grad_norm": 0.7131704530453624, "learning_rate": 9.71262456545177e-08, "loss": 0.2964, "step": 494 }, { "epoch": 0.1984, "grad_norm": 1.9224161883701323, "learning_rate": 9.70794077947515e-08, "loss": -0.1305, "step": 496 }, { "epoch": 0.1992, "grad_norm": 2.420968387762715, "learning_rate": 9.703220280030607e-08, "loss": -0.1025, "step": 498 }, { "epoch": 0.2, "grad_norm": 0.5917101352557091, "learning_rate": 9.698463103929542e-08, "loss": 0.4162, "step": 500 }, { "epoch": 0.2008, "grad_norm": 0.37261372003965865, "learning_rate": 9.693669288269371e-08, "loss": -0.0685, "step": 502 }, { "epoch": 0.2016, "grad_norm": 0.9143691951148681, "learning_rate": 9.688838870433229e-08, "loss": -0.2294, "step": 504 }, { "epoch": 0.2024, "grad_norm": 0.9193655718296037, "learning_rate": 9.683971888089688e-08, "loss": 0.391, "step": 506 }, { "epoch": 0.2032, "grad_norm": 7.494175919423912, "learning_rate": 9.679068379192454e-08, "loss": -0.3181, "step": 508 }, { "epoch": 0.204, "grad_norm": 19.535752409376773, "learning_rate": 9.67412838198007e-08, "loss": -0.86, "step": 510 }, { "epoch": 0.2048, "grad_norm": 1.1410695459720073, "learning_rate": 9.669151934975633e-08, "loss": 0.2809, "step": 512 }, { "epoch": 0.2056, "grad_norm": 1.377010762791552, "learning_rate": 9.664139076986472e-08, "loss": 0.162, "step": 514 }, { "epoch": 0.2064, "grad_norm": 0.557979664652959, "learning_rate": 9.659089847103862e-08, "loss": 0.33, "step": 516 }, { "epoch": 0.2072, "grad_norm": 1.553002684885157, "learning_rate": 9.654004284702711e-08, "loss": 0.0052, "step": 518 }, { "epoch": 0.208, "grad_norm": 0.889311576739284, "learning_rate": 9.648882429441257e-08, "loss": 0.1826, "step": 520 }, { "epoch": 0.2088, "grad_norm": 3.4858180208508043, "learning_rate": 9.643724321260756e-08, "loss": -0.3282, "step": 522 }, { "epoch": 0.2096, "grad_norm": 2.1719560909307174, "learning_rate": 9.63853000038517e-08, "loss": 0.4663, "step": 524 }, { "epoch": 0.2104, "grad_norm": 1.084770634896945, "learning_rate": 9.63329950732086e-08, "loss": 0.1794, "step": 526 }, { "epoch": 0.2112, "grad_norm": 1.570077846681942, "learning_rate": 9.628032882856261e-08, "loss": 0.4361, "step": 528 }, { "epoch": 0.212, "grad_norm": 3.257604693080942, "learning_rate": 9.622730168061565e-08, "loss": 0.267, "step": 530 }, { "epoch": 0.2128, "grad_norm": 1.2962127715499705, "learning_rate": 9.61739140428841e-08, "loss": -0.5759, "step": 532 }, { "epoch": 0.2136, "grad_norm": 0.4783553609890881, "learning_rate": 9.612016633169549e-08, "loss": -0.4113, "step": 534 }, { "epoch": 0.2144, "grad_norm": 1.1329977586766047, "learning_rate": 9.606605896618526e-08, "loss": -0.6282, "step": 536 }, { "epoch": 0.2152, "grad_norm": 9.092699988961803, "learning_rate": 9.601159236829351e-08, "loss": 0.0028, "step": 538 }, { "epoch": 0.216, "grad_norm": 1.7521340541068868, "learning_rate": 9.595676696276171e-08, "loss": 0.3847, "step": 540 }, { "epoch": 0.2168, "grad_norm": 1.5211299849991022, "learning_rate": 9.59015831771294e-08, "loss": 0.1855, "step": 542 }, { "epoch": 0.2176, "grad_norm": 2.3997915440768085, "learning_rate": 9.584604144173082e-08, "loss": 0.0154, "step": 544 }, { "epoch": 0.2184, "grad_norm": 7.7867544210113655, "learning_rate": 9.579014218969157e-08, "loss": -0.2799, "step": 546 }, { "epoch": 0.2192, "grad_norm": 2.246425515456882, "learning_rate": 9.573388585692524e-08, "loss": 0.0657, "step": 548 }, { "epoch": 0.22, "grad_norm": 0.8920881325520396, "learning_rate": 9.567727288213003e-08, "loss": 0.0662, "step": 550 }, { "epoch": 0.2208, "grad_norm": 2.069608229929687, "learning_rate": 9.562030370678531e-08, "loss": -0.352, "step": 552 }, { "epoch": 0.2216, "grad_norm": 7.019347228888489, "learning_rate": 9.556297877514811e-08, "loss": 0.6677, "step": 554 }, { "epoch": 0.2224, "grad_norm": 0.7140207308779009, "learning_rate": 9.550529853424978e-08, "loss": -0.1234, "step": 556 }, { "epoch": 0.2232, "grad_norm": 1.739597248566285, "learning_rate": 9.544726343389244e-08, "loss": 0.176, "step": 558 }, { "epoch": 0.224, "grad_norm": 3.759867753849374, "learning_rate": 9.538887392664543e-08, "loss": -0.4603, "step": 560 }, { "epoch": 0.2248, "grad_norm": 0.4680240642658474, "learning_rate": 9.533013046784188e-08, "loss": -0.1854, "step": 562 }, { "epoch": 0.2256, "grad_norm": 4.0223246024546455, "learning_rate": 9.527103351557508e-08, "loss": -0.5082, "step": 564 }, { "epoch": 0.2264, "grad_norm": 0.4298188905176273, "learning_rate": 9.521158353069493e-08, "loss": 0.352, "step": 566 }, { "epoch": 0.2272, "grad_norm": 6.510408037187395, "learning_rate": 9.515178097680436e-08, "loss": -0.1873, "step": 568 }, { "epoch": 0.228, "grad_norm": 2.7431324947016487, "learning_rate": 9.509162632025569e-08, "loss": -0.2765, "step": 570 }, { "epoch": 0.2288, "grad_norm": 1.7314378634262093, "learning_rate": 9.503112003014701e-08, "loss": 0.1129, "step": 572 }, { "epoch": 0.2296, "grad_norm": 0.4152358895518038, "learning_rate": 9.497026257831855e-08, "loss": -0.1104, "step": 574 }, { "epoch": 0.2304, "grad_norm": 3.0143102426871207, "learning_rate": 9.490905443934891e-08, "loss": 0.233, "step": 576 }, { "epoch": 0.2312, "grad_norm": 1.254785319884344, "learning_rate": 9.484749609055149e-08, "loss": 0.0741, "step": 578 }, { "epoch": 0.232, "grad_norm": 0.4516039050645454, "learning_rate": 9.478558801197064e-08, "loss": -0.5912, "step": 580 }, { "epoch": 0.2328, "grad_norm": 2.0311852841582443, "learning_rate": 9.4723330686378e-08, "loss": 0.5177, "step": 582 }, { "epoch": 0.2336, "grad_norm": 0.7527146505774129, "learning_rate": 9.466072459926868e-08, "loss": -0.2521, "step": 584 }, { "epoch": 0.2344, "grad_norm": 1.7528339797546952, "learning_rate": 9.459777023885753e-08, "loss": 0.1981, "step": 586 }, { "epoch": 0.2352, "grad_norm": 0.9484429211584349, "learning_rate": 9.453446809607533e-08, "loss": 0.0565, "step": 588 }, { "epoch": 0.236, "grad_norm": 2.576119540140844, "learning_rate": 9.447081866456487e-08, "loss": -0.3846, "step": 590 }, { "epoch": 0.2368, "grad_norm": 1.9661922778166474, "learning_rate": 9.440682244067722e-08, "loss": -0.1543, "step": 592 }, { "epoch": 0.2376, "grad_norm": 1.0393258447068892, "learning_rate": 9.434247992346779e-08, "loss": 0.3713, "step": 594 }, { "epoch": 0.2384, "grad_norm": 0.926942209886049, "learning_rate": 9.427779161469245e-08, "loss": 0.1346, "step": 596 }, { "epoch": 0.2392, "grad_norm": 2.329000993582009, "learning_rate": 9.421275801880362e-08, "loss": -0.126, "step": 598 }, { "epoch": 0.24, "grad_norm": 1.6875574527329094, "learning_rate": 9.414737964294634e-08, "loss": 0.3109, "step": 600 }, { "epoch": 0.2408, "grad_norm": 0.5023669006234125, "learning_rate": 9.408165699695434e-08, "loss": -0.0132, "step": 602 }, { "epoch": 0.2416, "grad_norm": 2.0881231177462456, "learning_rate": 9.4015590593346e-08, "loss": 0.5198, "step": 604 }, { "epoch": 0.2424, "grad_norm": 1.0863869319130528, "learning_rate": 9.394918094732042e-08, "loss": 0.1587, "step": 606 }, { "epoch": 0.2432, "grad_norm": 1.5582534983831973, "learning_rate": 9.388242857675335e-08, "loss": -0.3655, "step": 608 }, { "epoch": 0.244, "grad_norm": 0.5461657587975756, "learning_rate": 9.381533400219317e-08, "loss": 0.0735, "step": 610 }, { "epoch": 0.2448, "grad_norm": 2.4930252888255886, "learning_rate": 9.374789774685689e-08, "loss": -0.9575, "step": 612 }, { "epoch": 0.2456, "grad_norm": 2.0194068377552132, "learning_rate": 9.368012033662593e-08, "loss": -0.2119, "step": 614 }, { "epoch": 0.2464, "grad_norm": 1.3750808155144445, "learning_rate": 9.361200230004218e-08, "loss": -0.0461, "step": 616 }, { "epoch": 0.2472, "grad_norm": 0.8051442888860127, "learning_rate": 9.354354416830377e-08, "loss": -0.2925, "step": 618 }, { "epoch": 0.248, "grad_norm": 0.8432217688207168, "learning_rate": 9.347474647526095e-08, "loss": -0.2154, "step": 620 }, { "epoch": 0.2488, "grad_norm": 0.6853710470578885, "learning_rate": 9.340560975741196e-08, "loss": 0.4846, "step": 622 }, { "epoch": 0.2496, "grad_norm": 1.403223282772839, "learning_rate": 9.333613455389882e-08, "loss": 0.0296, "step": 624 }, { "epoch": 0.2504, "grad_norm": 2.1131053251547693, "learning_rate": 9.32663214065031e-08, "loss": -0.2981, "step": 626 }, { "epoch": 0.2512, "grad_norm": 0.7873380820908549, "learning_rate": 9.319617085964175e-08, "loss": 0.2121, "step": 628 }, { "epoch": 0.252, "grad_norm": 0.9528682959478656, "learning_rate": 9.312568346036287e-08, "loss": 0.4284, "step": 630 }, { "epoch": 0.2528, "grad_norm": 1.097234538369852, "learning_rate": 9.30548597583413e-08, "loss": 0.3954, "step": 632 }, { "epoch": 0.2536, "grad_norm": 1.8851125035672085, "learning_rate": 9.298370030587455e-08, "loss": 0.4441, "step": 634 }, { "epoch": 0.2544, "grad_norm": 2.1853617450331693, "learning_rate": 9.291220565787828e-08, "loss": 0.4437, "step": 636 }, { "epoch": 0.2552, "grad_norm": 2.3936375731578625, "learning_rate": 9.284037637188213e-08, "loss": -0.4182, "step": 638 }, { "epoch": 0.256, "grad_norm": 2.646325229764863, "learning_rate": 9.276821300802533e-08, "loss": 0.119, "step": 640 }, { "epoch": 0.2568, "grad_norm": 0.9756272329795392, "learning_rate": 9.269571612905225e-08, "loss": -0.3106, "step": 642 }, { "epoch": 0.2576, "grad_norm": 1.4379456308252396, "learning_rate": 9.262288630030812e-08, "loss": 0.3465, "step": 644 }, { "epoch": 0.2584, "grad_norm": 0.6377811081598714, "learning_rate": 9.254972408973459e-08, "loss": 0.0878, "step": 646 }, { "epoch": 0.2592, "grad_norm": 1.5771004258451025, "learning_rate": 9.247623006786527e-08, "loss": 0.4204, "step": 648 }, { "epoch": 0.26, "grad_norm": 0.8176119979876094, "learning_rate": 9.240240480782128e-08, "loss": 0.3434, "step": 650 }, { "epoch": 0.2608, "grad_norm": 1.1167330010140615, "learning_rate": 9.232824888530688e-08, "loss": -0.1252, "step": 652 }, { "epoch": 0.2616, "grad_norm": 2.902711665458815, "learning_rate": 9.225376287860483e-08, "loss": 0.1784, "step": 654 }, { "epoch": 0.2624, "grad_norm": 1.1021902043030998, "learning_rate": 9.217894736857194e-08, "loss": 0.077, "step": 656 }, { "epoch": 0.2632, "grad_norm": 1.5828751888652974, "learning_rate": 9.210380293863461e-08, "loss": 0.6164, "step": 658 }, { "epoch": 0.264, "grad_norm": 1.7579085274204935, "learning_rate": 9.20283301747842e-08, "loss": 0.2696, "step": 660 }, { "epoch": 0.2648, "grad_norm": 0.7147277225164167, "learning_rate": 9.19525296655725e-08, "loss": 0.2439, "step": 662 }, { "epoch": 0.2656, "grad_norm": 3.6763274266756794, "learning_rate": 9.187640200210708e-08, "loss": -0.6141, "step": 664 }, { "epoch": 0.2664, "grad_norm": 0.8865442357420804, "learning_rate": 9.179994777804675e-08, "loss": -0.0577, "step": 666 }, { "epoch": 0.2672, "grad_norm": 1.8760630726535639, "learning_rate": 9.172316758959695e-08, "loss": -0.3246, "step": 668 }, { "epoch": 0.268, "grad_norm": 4.0499280919666685, "learning_rate": 9.164606203550497e-08, "loss": -0.3236, "step": 670 }, { "epoch": 0.2688, "grad_norm": 0.2388213957572059, "learning_rate": 9.156863171705543e-08, "loss": 0.3181, "step": 672 }, { "epoch": 0.2696, "grad_norm": 0.7858037287124114, "learning_rate": 9.149087723806548e-08, "loss": -0.3492, "step": 674 }, { "epoch": 0.2704, "grad_norm": 0.7846789738735171, "learning_rate": 9.141279920488021e-08, "loss": -0.0031, "step": 676 }, { "epoch": 0.2712, "grad_norm": 0.6079509641140145, "learning_rate": 9.133439822636777e-08, "loss": 0.5217, "step": 678 }, { "epoch": 0.272, "grad_norm": 3.1157318757855785, "learning_rate": 9.125567491391475e-08, "loss": -0.2582, "step": 680 }, { "epoch": 0.2728, "grad_norm": 2.8344713124178016, "learning_rate": 9.117662988142136e-08, "loss": -0.0037, "step": 682 }, { "epoch": 0.2736, "grad_norm": 1.1048675461441084, "learning_rate": 9.109726374529665e-08, "loss": -0.3684, "step": 684 }, { "epoch": 0.2744, "grad_norm": 0.6734941872876059, "learning_rate": 9.101757712445368e-08, "loss": -0.3566, "step": 686 }, { "epoch": 0.2752, "grad_norm": 5.2809791174675205, "learning_rate": 9.093757064030472e-08, "loss": 0.4808, "step": 688 }, { "epoch": 0.276, "grad_norm": 1.0801262787173656, "learning_rate": 9.085724491675641e-08, "loss": 0.2411, "step": 690 }, { "epoch": 0.2768, "grad_norm": 2.006574190663329, "learning_rate": 9.077660058020491e-08, "loss": 0.4176, "step": 692 }, { "epoch": 0.2776, "grad_norm": 1.2950834174730455, "learning_rate": 9.06956382595309e-08, "loss": 0.1336, "step": 694 }, { "epoch": 0.2784, "grad_norm": 5.230890162016179, "learning_rate": 9.061435858609485e-08, "loss": -0.1621, "step": 696 }, { "epoch": 0.2792, "grad_norm": 12.169821952332974, "learning_rate": 9.053276219373199e-08, "loss": 0.2004, "step": 698 }, { "epoch": 0.28, "grad_norm": 3.660251173933071, "learning_rate": 9.045084971874737e-08, "loss": 0.2771, "step": 700 }, { "epoch": 0.2808, "grad_norm": 1.163297828202275, "learning_rate": 9.036862179991091e-08, "loss": 0.2461, "step": 702 }, { "epoch": 0.2816, "grad_norm": 1.0335361654882531, "learning_rate": 9.028607907845246e-08, "loss": 0.1717, "step": 704 }, { "epoch": 0.2824, "grad_norm": 1.8229718995331785, "learning_rate": 9.020322219805673e-08, "loss": 0.2178, "step": 706 }, { "epoch": 0.2832, "grad_norm": 2.2765609772885416, "learning_rate": 9.012005180485833e-08, "loss": 0.2202, "step": 708 }, { "epoch": 0.284, "grad_norm": 0.4850755241158522, "learning_rate": 9.003656854743666e-08, "loss": 0.2864, "step": 710 }, { "epoch": 0.2848, "grad_norm": 1.4341650344819474, "learning_rate": 8.995277307681098e-08, "loss": -0.6383, "step": 712 }, { "epoch": 0.2856, "grad_norm": 12.279835227706903, "learning_rate": 8.986866604643517e-08, "loss": -0.7269, "step": 714 }, { "epoch": 0.2864, "grad_norm": 3.9952094340159023, "learning_rate": 8.978424811219276e-08, "loss": -0.1366, "step": 716 }, { "epoch": 0.2872, "grad_norm": 0.8124079844550998, "learning_rate": 8.969951993239177e-08, "loss": 0.1862, "step": 718 }, { "epoch": 0.288, "grad_norm": 8.097449745501883, "learning_rate": 8.961448216775953e-08, "loss": -0.3673, "step": 720 }, { "epoch": 0.2888, "grad_norm": 1.1732802960791322, "learning_rate": 8.952913548143764e-08, "loss": 0.4006, "step": 722 }, { "epoch": 0.2896, "grad_norm": 1.5360714076560518, "learning_rate": 8.94434805389767e-08, "loss": 0.1462, "step": 724 }, { "epoch": 0.2904, "grad_norm": 0.5114107508157508, "learning_rate": 8.935751800833116e-08, "loss": 0.0877, "step": 726 }, { "epoch": 0.2912, "grad_norm": 1.427690720685384, "learning_rate": 8.927124855985408e-08, "loss": 0.1363, "step": 728 }, { "epoch": 0.292, "grad_norm": 4.4627972614515565, "learning_rate": 8.918467286629199e-08, "loss": -0.3287, "step": 730 }, { "epoch": 0.2928, "grad_norm": 3.3272363902338182, "learning_rate": 8.90977916027795e-08, "loss": -0.2832, "step": 732 }, { "epoch": 0.2936, "grad_norm": 0.7746604883747309, "learning_rate": 8.901060544683418e-08, "loss": 0.0766, "step": 734 }, { "epoch": 0.2944, "grad_norm": 1.5873072227936746, "learning_rate": 8.892311507835117e-08, "loss": 0.4345, "step": 736 }, { "epoch": 0.2952, "grad_norm": 1.3768473445563956, "learning_rate": 8.883532117959796e-08, "loss": 0.0693, "step": 738 }, { "epoch": 0.296, "grad_norm": 20.81939969162968, "learning_rate": 8.874722443520898e-08, "loss": -0.4809, "step": 740 }, { "epoch": 0.2968, "grad_norm": 14.945778018331922, "learning_rate": 8.865882553218036e-08, "loss": -0.5521, "step": 742 }, { "epoch": 0.2976, "grad_norm": 0.39785231657900016, "learning_rate": 8.857012515986451e-08, "loss": 0.1503, "step": 744 }, { "epoch": 0.2984, "grad_norm": 1.2686422724522106, "learning_rate": 8.848112400996473e-08, "loss": 0.2367, "step": 746 }, { "epoch": 0.2992, "grad_norm": 1.569638918420828, "learning_rate": 8.839182277652987e-08, "loss": -0.4723, "step": 748 }, { "epoch": 0.3, "grad_norm": 1.1198134951928183, "learning_rate": 8.83022221559489e-08, "loss": -0.2171, "step": 750 }, { "epoch": 0.3008, "grad_norm": 0.8104270584597341, "learning_rate": 8.821232284694544e-08, "loss": 0.0921, "step": 752 }, { "epoch": 0.3016, "grad_norm": 2.9028381646787453, "learning_rate": 8.812212555057239e-08, "loss": -0.0545, "step": 754 }, { "epoch": 0.3024, "grad_norm": 0.8899926905526622, "learning_rate": 8.803163097020636e-08, "loss": -0.0558, "step": 756 }, { "epoch": 0.3032, "grad_norm": 1.7828842004216436, "learning_rate": 8.794083981154227e-08, "loss": 0.7208, "step": 758 }, { "epoch": 0.304, "grad_norm": 2.9786430555101795, "learning_rate": 8.784975278258782e-08, "loss": -0.078, "step": 760 }, { "epoch": 0.3048, "grad_norm": 2.8287310614940777, "learning_rate": 8.775837059365795e-08, "loss": 0.088, "step": 762 }, { "epoch": 0.3056, "grad_norm": 0.8845002129698117, "learning_rate": 8.766669395736934e-08, "loss": -0.0491, "step": 764 }, { "epoch": 0.3064, "grad_norm": 3.5268253668779934, "learning_rate": 8.75747235886348e-08, "loss": 0.0034, "step": 766 }, { "epoch": 0.3072, "grad_norm": 1.7452615611959836, "learning_rate": 8.748246020465775e-08, "loss": 0.4788, "step": 768 }, { "epoch": 0.308, "grad_norm": 13.507874280818228, "learning_rate": 8.738990452492659e-08, "loss": -0.1238, "step": 770 }, { "epoch": 0.3088, "grad_norm": 3.4954667832405177, "learning_rate": 8.729705727120911e-08, "loss": 0.1762, "step": 772 }, { "epoch": 0.3096, "grad_norm": 11.084239211030003, "learning_rate": 8.720391916754682e-08, "loss": -0.3814, "step": 774 }, { "epoch": 0.3104, "grad_norm": 1.190008032651553, "learning_rate": 8.711049094024941e-08, "loss": 0.4016, "step": 776 }, { "epoch": 0.3112, "grad_norm": 2.6524719701229373, "learning_rate": 8.70167733178889e-08, "loss": -0.4937, "step": 778 }, { "epoch": 0.312, "grad_norm": 3.6563349073371434, "learning_rate": 8.69227670312942e-08, "loss": 0.2101, "step": 780 }, { "epoch": 0.3128, "grad_norm": 5.589523245640799, "learning_rate": 8.682847281354516e-08, "loss": -0.5319, "step": 782 }, { "epoch": 0.3136, "grad_norm": 0.9937104750252121, "learning_rate": 8.673389139996707e-08, "loss": 0.5273, "step": 784 }, { "epoch": 0.3144, "grad_norm": 1.5490092245622005, "learning_rate": 8.663902352812478e-08, "loss": 0.4851, "step": 786 }, { "epoch": 0.3152, "grad_norm": 0.9247150320866911, "learning_rate": 8.654386993781701e-08, "loss": -0.1605, "step": 788 }, { "epoch": 0.316, "grad_norm": 0.8661553436999507, "learning_rate": 8.644843137107057e-08, "loss": 0.0903, "step": 790 }, { "epoch": 0.3168, "grad_norm": 0.8966245324936362, "learning_rate": 8.635270857213459e-08, "loss": 0.2469, "step": 792 }, { "epoch": 0.3176, "grad_norm": 6.075740058565734, "learning_rate": 8.625670228747466e-08, "loss": 0.3518, "step": 794 }, { "epoch": 0.3184, "grad_norm": 5.768120136868427, "learning_rate": 8.61604132657671e-08, "loss": -0.1788, "step": 796 }, { "epoch": 0.3192, "grad_norm": 8.014889998847247, "learning_rate": 8.606384225789303e-08, "loss": -0.5493, "step": 798 }, { "epoch": 0.32, "grad_norm": 2.732855863760049, "learning_rate": 8.596699001693255e-08, "loss": -0.4401, "step": 800 }, { "epoch": 0.3208, "grad_norm": 5.674762098665864, "learning_rate": 8.586985729815893e-08, "loss": -0.6151, "step": 802 }, { "epoch": 0.3216, "grad_norm": 1.203107745655249, "learning_rate": 8.577244485903259e-08, "loss": 0.3376, "step": 804 }, { "epoch": 0.3224, "grad_norm": 0.7743743600052015, "learning_rate": 8.567475345919531e-08, "loss": 0.2416, "step": 806 }, { "epoch": 0.3232, "grad_norm": 3.2313530818726686, "learning_rate": 8.557678386046428e-08, "loss": -0.2523, "step": 808 }, { "epoch": 0.324, "grad_norm": 3.2135557626127316, "learning_rate": 8.547853682682604e-08, "loss": -0.8818, "step": 810 }, { "epoch": 0.3248, "grad_norm": 2.443067918618587, "learning_rate": 8.538001312443076e-08, "loss": -0.623, "step": 812 }, { "epoch": 0.3256, "grad_norm": 6.5609892559503855, "learning_rate": 8.528121352158604e-08, "loss": -0.4258, "step": 814 }, { "epoch": 0.3264, "grad_norm": 0.7000997613564107, "learning_rate": 8.518213878875102e-08, "loss": 0.1103, "step": 816 }, { "epoch": 0.3272, "grad_norm": 6.1200733738526525, "learning_rate": 8.508278969853036e-08, "loss": 0.1271, "step": 818 }, { "epoch": 0.328, "grad_norm": 2.6047004705005787, "learning_rate": 8.498316702566827e-08, "loss": 0.0993, "step": 820 }, { "epoch": 0.3288, "grad_norm": 0.9677372851549512, "learning_rate": 8.488327154704232e-08, "loss": 0.0594, "step": 822 }, { "epoch": 0.3296, "grad_norm": 3.4430495127951977, "learning_rate": 8.478310404165754e-08, "loss": -0.1936, "step": 824 }, { "epoch": 0.3304, "grad_norm": 0.3601588143401079, "learning_rate": 8.468266529064024e-08, "loss": -0.1365, "step": 826 }, { "epoch": 0.3312, "grad_norm": 1.292385094296757, "learning_rate": 8.4581956077232e-08, "loss": -0.3464, "step": 828 }, { "epoch": 0.332, "grad_norm": 1.9117854973492814, "learning_rate": 8.448097718678348e-08, "loss": 0.4335, "step": 830 }, { "epoch": 0.3328, "grad_norm": 1.8283142200028784, "learning_rate": 8.437972940674837e-08, "loss": -0.0491, "step": 832 }, { "epoch": 0.3336, "grad_norm": 3.8320528125639104, "learning_rate": 8.427821352667717e-08, "loss": -0.4612, "step": 834 }, { "epoch": 0.3344, "grad_norm": 1.8095888332666445, "learning_rate": 8.417643033821113e-08, "loss": 0.4844, "step": 836 }, { "epoch": 0.3352, "grad_norm": 1.5804671191319755, "learning_rate": 8.407438063507599e-08, "loss": 0.1414, "step": 838 }, { "epoch": 0.336, "grad_norm": 2.4402963026274174, "learning_rate": 8.397206521307583e-08, "loss": 0.2388, "step": 840 }, { "epoch": 0.3368, "grad_norm": 1.4966413361633364, "learning_rate": 8.386948487008686e-08, "loss": 0.5727, "step": 842 }, { "epoch": 0.3376, "grad_norm": 1.7284763532645568, "learning_rate": 8.376664040605121e-08, "loss": -0.3214, "step": 844 }, { "epoch": 0.3384, "grad_norm": 0.7112316019800302, "learning_rate": 8.366353262297068e-08, "loss": 0.1026, "step": 846 }, { "epoch": 0.3392, "grad_norm": 10.49080301872087, "learning_rate": 8.356016232490046e-08, "loss": -0.8355, "step": 848 }, { "epoch": 0.34, "grad_norm": 0.7290221856294098, "learning_rate": 8.34565303179429e-08, "loss": -0.3258, "step": 850 }, { "epoch": 0.3408, "grad_norm": 1.1703618452430726, "learning_rate": 8.335263741024122e-08, "loss": 0.6358, "step": 852 }, { "epoch": 0.3416, "grad_norm": 2.850142298515102, "learning_rate": 8.324848441197316e-08, "loss": -0.1372, "step": 854 }, { "epoch": 0.3424, "grad_norm": 0.973368655758617, "learning_rate": 8.314407213534475e-08, "loss": 0.9766, "step": 856 }, { "epoch": 0.3432, "grad_norm": 1.3793864686515331, "learning_rate": 8.303940139458388e-08, "loss": 0.2899, "step": 858 }, { "epoch": 0.344, "grad_norm": 1.6558863887284723, "learning_rate": 8.293447300593401e-08, "loss": -0.6861, "step": 860 }, { "epoch": 0.3448, "grad_norm": 3.229334640432589, "learning_rate": 8.282928778764781e-08, "loss": 0.4103, "step": 862 }, { "epoch": 0.3456, "grad_norm": 1.9022283387070154, "learning_rate": 8.272384655998074e-08, "loss": 0.3683, "step": 864 }, { "epoch": 0.3464, "grad_norm": 3.861434626902648, "learning_rate": 8.261815014518466e-08, "loss": 0.1213, "step": 866 }, { "epoch": 0.3472, "grad_norm": 0.3346123728991653, "learning_rate": 8.251219936750143e-08, "loss": 0.3042, "step": 868 }, { "epoch": 0.348, "grad_norm": 1.804028815127322, "learning_rate": 8.240599505315654e-08, "loss": 0.3678, "step": 870 }, { "epoch": 0.3488, "grad_norm": 2.22654583204003, "learning_rate": 8.229953803035255e-08, "loss": -0.5044, "step": 872 }, { "epoch": 0.3496, "grad_norm": 2.8036340506880593, "learning_rate": 8.219282912926268e-08, "loss": -0.0045, "step": 874 }, { "epoch": 0.3504, "grad_norm": 1.2472980474279702, "learning_rate": 8.208586918202443e-08, "loss": -0.4355, "step": 876 }, { "epoch": 0.3512, "grad_norm": 2.0236633425024078, "learning_rate": 8.19786590227329e-08, "loss": -0.4585, "step": 878 }, { "epoch": 0.352, "grad_norm": 2.553585628725583, "learning_rate": 8.187119948743448e-08, "loss": -0.4744, "step": 880 }, { "epoch": 0.3528, "grad_norm": 4.526772208608427, "learning_rate": 8.176349141412021e-08, "loss": 0.0674, "step": 882 }, { "epoch": 0.3536, "grad_norm": 3.0131476143455584, "learning_rate": 8.165553564271928e-08, "loss": -0.8816, "step": 884 }, { "epoch": 0.3544, "grad_norm": 2.4303141324929434, "learning_rate": 8.154733301509249e-08, "loss": 0.0354, "step": 886 }, { "epoch": 0.3552, "grad_norm": 1.4923649469015177, "learning_rate": 8.143888437502565e-08, "loss": 0.2033, "step": 888 }, { "epoch": 0.356, "grad_norm": 1.0499738094965403, "learning_rate": 8.133019056822303e-08, "loss": -0.084, "step": 890 }, { "epoch": 0.3568, "grad_norm": 2.306429104903375, "learning_rate": 8.122125244230078e-08, "loss": -0.0835, "step": 892 }, { "epoch": 0.3576, "grad_norm": 9.447440348177143, "learning_rate": 8.111207084678033e-08, "loss": -0.1714, "step": 894 }, { "epoch": 0.3584, "grad_norm": 3.3146862784828723, "learning_rate": 8.100264663308162e-08, "loss": 0.3046, "step": 896 }, { "epoch": 0.3592, "grad_norm": 0.5247128917973009, "learning_rate": 8.089298065451672e-08, "loss": 0.0776, "step": 898 }, { "epoch": 0.36, "grad_norm": 2.6116572800466438, "learning_rate": 8.07830737662829e-08, "loss": 0.1196, "step": 900 }, { "epoch": 0.3608, "grad_norm": 1.7532572130157607, "learning_rate": 8.06729268254562e-08, "loss": 0.4049, "step": 902 }, { "epoch": 0.3616, "grad_norm": 0.872383155375364, "learning_rate": 8.056254069098459e-08, "loss": 0.4134, "step": 904 }, { "epoch": 0.3624, "grad_norm": 2.0280592050080077, "learning_rate": 8.045191622368127e-08, "loss": -0.1264, "step": 906 }, { "epoch": 0.3632, "grad_norm": 7.2837279920656295, "learning_rate": 8.034105428621811e-08, "loss": -0.4519, "step": 908 }, { "epoch": 0.364, "grad_norm": 2.010871912862232, "learning_rate": 8.022995574311875e-08, "loss": 0.0267, "step": 910 }, { "epoch": 0.3648, "grad_norm": 2.18845588960694, "learning_rate": 8.011862146075193e-08, "loss": -0.5606, "step": 912 }, { "epoch": 0.3656, "grad_norm": 10.672904965184774, "learning_rate": 8.000705230732477e-08, "loss": -0.3007, "step": 914 }, { "epoch": 0.3664, "grad_norm": 16.229779007192096, "learning_rate": 7.989524915287594e-08, "loss": 0.1867, "step": 916 }, { "epoch": 0.3672, "grad_norm": 1.321736634994464, "learning_rate": 7.978321286926891e-08, "loss": 0.1814, "step": 918 }, { "epoch": 0.368, "grad_norm": 2.137899951816609, "learning_rate": 7.967094433018507e-08, "loss": 0.4335, "step": 920 }, { "epoch": 0.3688, "grad_norm": 0.8963724642161506, "learning_rate": 7.95584444111171e-08, "loss": -0.0275, "step": 922 }, { "epoch": 0.3696, "grad_norm": 0.784399520286717, "learning_rate": 7.944571398936193e-08, "loss": 0.0624, "step": 924 }, { "epoch": 0.3704, "grad_norm": 3.366159505865075, "learning_rate": 7.933275394401407e-08, "loss": -0.1544, "step": 926 }, { "epoch": 0.3712, "grad_norm": 0.8620601529573644, "learning_rate": 7.92195651559586e-08, "loss": -0.1152, "step": 928 }, { "epoch": 0.372, "grad_norm": 4.160557824531761, "learning_rate": 7.910614850786447e-08, "loss": -0.1786, "step": 930 }, { "epoch": 0.3728, "grad_norm": 1.1732726155263975, "learning_rate": 7.899250488417746e-08, "loss": -0.1146, "step": 932 }, { "epoch": 0.3736, "grad_norm": 0.7626058693505505, "learning_rate": 7.887863517111337e-08, "loss": 0.0288, "step": 934 }, { "epoch": 0.3744, "grad_norm": 1.2784721116511524, "learning_rate": 7.876454025665113e-08, "loss": 0.3692, "step": 936 }, { "epoch": 0.3752, "grad_norm": 3.8432163391561636, "learning_rate": 7.865022103052577e-08, "loss": -0.6191, "step": 938 }, { "epoch": 0.376, "grad_norm": 0.41200718458395047, "learning_rate": 7.853567838422159e-08, "loss": -0.2775, "step": 940 }, { "epoch": 0.3768, "grad_norm": 3.210456122992894, "learning_rate": 7.842091321096515e-08, "loss": 0.04, "step": 942 }, { "epoch": 0.3776, "grad_norm": 8.946861643817154, "learning_rate": 7.830592640571832e-08, "loss": 0.0712, "step": 944 }, { "epoch": 0.3784, "grad_norm": 1.1380080448822825, "learning_rate": 7.819071886517134e-08, "loss": 0.2481, "step": 946 }, { "epoch": 0.3792, "grad_norm": 0.903931154067042, "learning_rate": 7.807529148773572e-08, "loss": 0.5167, "step": 948 }, { "epoch": 0.38, "grad_norm": 5.787579761684592, "learning_rate": 7.795964517353733e-08, "loss": -0.2009, "step": 950 }, { "epoch": 0.3808, "grad_norm": 1.23258440991286, "learning_rate": 7.78437808244094e-08, "loss": 0.1332, "step": 952 }, { "epoch": 0.3816, "grad_norm": 8.57781181693697, "learning_rate": 7.772769934388537e-08, "loss": -0.3453, "step": 954 }, { "epoch": 0.3824, "grad_norm": 9.984455229867079, "learning_rate": 7.761140163719193e-08, "loss": -0.9802, "step": 956 }, { "epoch": 0.3832, "grad_norm": 14.58764723753251, "learning_rate": 7.749488861124199e-08, "loss": -0.4665, "step": 958 }, { "epoch": 0.384, "grad_norm": 3.4788077577478496, "learning_rate": 7.737816117462751e-08, "loss": -0.4591, "step": 960 }, { "epoch": 0.3848, "grad_norm": 0.8756186903705511, "learning_rate": 7.72612202376125e-08, "loss": -0.2308, "step": 962 }, { "epoch": 0.3856, "grad_norm": 0.21704219860580345, "learning_rate": 7.714406671212587e-08, "loss": 0.3277, "step": 964 }, { "epoch": 0.3864, "grad_norm": 5.59613367286991, "learning_rate": 7.702670151175435e-08, "loss": -0.5679, "step": 966 }, { "epoch": 0.3872, "grad_norm": 0.84568283496626, "learning_rate": 7.690912555173535e-08, "loss": 0.1284, "step": 968 }, { "epoch": 0.388, "grad_norm": 4.19251355544568, "learning_rate": 7.679133974894983e-08, "loss": -0.4117, "step": 970 }, { "epoch": 0.3888, "grad_norm": 1.087531497704038, "learning_rate": 7.667334502191513e-08, "loss": 0.1098, "step": 972 }, { "epoch": 0.3896, "grad_norm": 1.603047898028564, "learning_rate": 7.655514229077783e-08, "loss": -0.4038, "step": 974 }, { "epoch": 0.3904, "grad_norm": 2.333486581996855, "learning_rate": 7.643673247730658e-08, "loss": 0.0592, "step": 976 }, { "epoch": 0.3912, "grad_norm": 8.69472674858354, "learning_rate": 7.631811650488489e-08, "loss": -0.7602, "step": 978 }, { "epoch": 0.392, "grad_norm": 1.0210334223940398, "learning_rate": 7.619929529850396e-08, "loss": 0.2272, "step": 980 }, { "epoch": 0.3928, "grad_norm": 1.048418990456133, "learning_rate": 7.608026978475539e-08, "loss": -0.4598, "step": 982 }, { "epoch": 0.3936, "grad_norm": 0.9998711691112794, "learning_rate": 7.596104089182406e-08, "loss": 0.1215, "step": 984 }, { "epoch": 0.3944, "grad_norm": 3.0306983649369843, "learning_rate": 7.584160954948084e-08, "loss": 0.0192, "step": 986 }, { "epoch": 0.3952, "grad_norm": 10.037202598057146, "learning_rate": 7.572197668907532e-08, "loss": 0.0057, "step": 988 }, { "epoch": 0.396, "grad_norm": 0.5591265397866508, "learning_rate": 7.560214324352857e-08, "loss": 0.1889, "step": 990 }, { "epoch": 0.3968, "grad_norm": 2.1451327721737683, "learning_rate": 7.548211014732589e-08, "loss": -0.0682, "step": 992 }, { "epoch": 0.3976, "grad_norm": 1.4947596521706652, "learning_rate": 7.536187833650945e-08, "loss": 0.0717, "step": 994 }, { "epoch": 0.3984, "grad_norm": 0.7487011304939436, "learning_rate": 7.524144874867109e-08, "loss": -0.041, "step": 996 }, { "epoch": 0.3992, "grad_norm": 2.8470129286226644, "learning_rate": 7.51208223229449e-08, "loss": -0.685, "step": 998 }, { "epoch": 0.4, "grad_norm": 1.1509305577734978, "learning_rate": 7.5e-08, "loss": 0.4244, "step": 1000 }, { "epoch": 0.4008, "grad_norm": 2.2471149932142804, "learning_rate": 7.487898272203312e-08, "loss": -0.2415, "step": 1002 }, { "epoch": 0.4016, "grad_norm": 2.3336181887777396, "learning_rate": 7.475777143276131e-08, "loss": -0.1276, "step": 1004 }, { "epoch": 0.4024, "grad_norm": 3.451215377583405, "learning_rate": 7.463636707741458e-08, "loss": 0.186, "step": 1006 }, { "epoch": 0.4032, "grad_norm": 4.432546794380947, "learning_rate": 7.451477060272842e-08, "loss": -0.4754, "step": 1008 }, { "epoch": 0.404, "grad_norm": 1.2726178274814892, "learning_rate": 7.439298295693663e-08, "loss": -0.1686, "step": 1010 }, { "epoch": 0.4048, "grad_norm": 1.6595160080860332, "learning_rate": 7.427100508976369e-08, "loss": 0.4122, "step": 1012 }, { "epoch": 0.4056, "grad_norm": 0.9527094988783708, "learning_rate": 7.414883795241753e-08, "loss": 0.0897, "step": 1014 }, { "epoch": 0.4064, "grad_norm": 0.8197073391125492, "learning_rate": 7.402648249758203e-08, "loss": 0.0477, "step": 1016 }, { "epoch": 0.4072, "grad_norm": 0.4730719086957203, "learning_rate": 7.390393967940962e-08, "loss": 0.0388, "step": 1018 }, { "epoch": 0.408, "grad_norm": 0.6938425628800243, "learning_rate": 7.378121045351376e-08, "loss": -0.0765, "step": 1020 }, { "epoch": 0.4088, "grad_norm": 2.4855748056225404, "learning_rate": 7.365829577696165e-08, "loss": -0.4727, "step": 1022 }, { "epoch": 0.4096, "grad_norm": 1.2525228424015165, "learning_rate": 7.353519660826664e-08, "loss": -0.1318, "step": 1024 }, { "epoch": 0.4104, "grad_norm": 0.6828130325148093, "learning_rate": 7.341191390738072e-08, "loss": 0.0043, "step": 1026 }, { "epoch": 0.4112, "grad_norm": 2.016735168065313, "learning_rate": 7.32884486356872e-08, "loss": 0.3202, "step": 1028 }, { "epoch": 0.412, "grad_norm": 4.271680320972975, "learning_rate": 7.316480175599309e-08, "loss": -0.2884, "step": 1030 }, { "epoch": 0.4128, "grad_norm": 0.44940231823968296, "learning_rate": 7.304097423252155e-08, "loss": 0.279, "step": 1032 }, { "epoch": 0.4136, "grad_norm": 1.7416239543161751, "learning_rate": 7.291696703090449e-08, "loss": 0.1598, "step": 1034 }, { "epoch": 0.4144, "grad_norm": 8.770657113289268, "learning_rate": 7.2792781118175e-08, "loss": -0.5416, "step": 1036 }, { "epoch": 0.4152, "grad_norm": 0.9351697205394859, "learning_rate": 7.266841746275975e-08, "loss": 0.1555, "step": 1038 }, { "epoch": 0.416, "grad_norm": 2.5220066205160485, "learning_rate": 7.254387703447154e-08, "loss": 0.1865, "step": 1040 }, { "epoch": 0.4168, "grad_norm": 2.209916897967486, "learning_rate": 7.241916080450162e-08, "loss": -0.263, "step": 1042 }, { "epoch": 0.4176, "grad_norm": 5.469961942467383, "learning_rate": 7.22942697454122e-08, "loss": -0.2177, "step": 1044 }, { "epoch": 0.4184, "grad_norm": 0.5272933331368399, "learning_rate": 7.216920483112885e-08, "loss": 0.0983, "step": 1046 }, { "epoch": 0.4192, "grad_norm": 1.4143301998010451, "learning_rate": 7.204396703693293e-08, "loss": -0.2537, "step": 1048 }, { "epoch": 0.42, "grad_norm": 0.4246102208293558, "learning_rate": 7.191855733945387e-08, "loss": 0.3528, "step": 1050 }, { "epoch": 0.4208, "grad_norm": 0.5561666198760952, "learning_rate": 7.17929767166617e-08, "loss": 0.0547, "step": 1052 }, { "epoch": 0.4216, "grad_norm": 0.8856453777963561, "learning_rate": 7.166722614785936e-08, "loss": -0.4613, "step": 1054 }, { "epoch": 0.4224, "grad_norm": 3.8778489840855306, "learning_rate": 7.154130661367503e-08, "loss": -0.3592, "step": 1056 }, { "epoch": 0.4232, "grad_norm": 1.111848344560476, "learning_rate": 7.141521909605451e-08, "loss": 0.3647, "step": 1058 }, { "epoch": 0.424, "grad_norm": 0.8976496692377764, "learning_rate": 7.128896457825363e-08, "loss": -0.3394, "step": 1060 }, { "epoch": 0.4248, "grad_norm": 1.9854864333684115, "learning_rate": 7.116254404483048e-08, "loss": 0.5498, "step": 1062 }, { "epoch": 0.4256, "grad_norm": 0.518945106869338, "learning_rate": 7.103595848163774e-08, "loss": 0.1052, "step": 1064 }, { "epoch": 0.4264, "grad_norm": 0.81312188171614, "learning_rate": 7.090920887581506e-08, "loss": 0.2085, "step": 1066 }, { "epoch": 0.4272, "grad_norm": 2.542255401078519, "learning_rate": 7.078229621578139e-08, "loss": -0.401, "step": 1068 }, { "epoch": 0.428, "grad_norm": 0.7969133711740632, "learning_rate": 7.06552214912271e-08, "loss": 0.2313, "step": 1070 }, { "epoch": 0.4288, "grad_norm": 2.013951635316577, "learning_rate": 7.05279856931064e-08, "loss": -0.0206, "step": 1072 }, { "epoch": 0.4296, "grad_norm": 3.233439286834546, "learning_rate": 7.040058981362963e-08, "loss": 0.0567, "step": 1074 }, { "epoch": 0.4304, "grad_norm": 2.038925823472598, "learning_rate": 7.027303484625546e-08, "loss": 0.4609, "step": 1076 }, { "epoch": 0.4312, "grad_norm": 5.3939877710558495, "learning_rate": 7.014532178568313e-08, "loss": -0.2891, "step": 1078 }, { "epoch": 0.432, "grad_norm": 5.083293406395563, "learning_rate": 7.001745162784475e-08, "loss": -0.7984, "step": 1080 }, { "epoch": 0.4328, "grad_norm": 1.6561122980559624, "learning_rate": 6.988942536989749e-08, "loss": -0.3287, "step": 1082 }, { "epoch": 0.4336, "grad_norm": 1.4731754331723663, "learning_rate": 6.976124401021582e-08, "loss": -0.3449, "step": 1084 }, { "epoch": 0.4344, "grad_norm": 2.130459655887093, "learning_rate": 6.963290854838375e-08, "loss": 0.1084, "step": 1086 }, { "epoch": 0.4352, "grad_norm": 0.7379825159584281, "learning_rate": 6.950441998518698e-08, "loss": 0.1926, "step": 1088 }, { "epoch": 0.436, "grad_norm": 5.9866066131944375, "learning_rate": 6.937577932260514e-08, "loss": -0.2981, "step": 1090 }, { "epoch": 0.4368, "grad_norm": 1.6941594895354701, "learning_rate": 6.924698756380397e-08, "loss": -0.04, "step": 1092 }, { "epoch": 0.4376, "grad_norm": 17.324712344427475, "learning_rate": 6.911804571312744e-08, "loss": 0.631, "step": 1094 }, { "epoch": 0.4384, "grad_norm": 5.676574825229265, "learning_rate": 6.898895477609006e-08, "loss": 0.4486, "step": 1096 }, { "epoch": 0.4392, "grad_norm": 2.188959065969351, "learning_rate": 6.885971575936883e-08, "loss": 0.1522, "step": 1098 }, { "epoch": 0.44, "grad_norm": 1.205606460513578, "learning_rate": 6.87303296707956e-08, "loss": -0.1985, "step": 1100 }, { "epoch": 0.4408, "grad_norm": 2.860447532152119, "learning_rate": 6.860079751934908e-08, "loss": 0.4336, "step": 1102 }, { "epoch": 0.4416, "grad_norm": 0.6325136441386785, "learning_rate": 6.847112031514696e-08, "loss": 0.0757, "step": 1104 }, { "epoch": 0.4424, "grad_norm": 1.2112086018815402, "learning_rate": 6.83412990694382e-08, "loss": -0.3603, "step": 1106 }, { "epoch": 0.4432, "grad_norm": 5.835113794965243, "learning_rate": 6.82113347945949e-08, "loss": 0.031, "step": 1108 }, { "epoch": 0.444, "grad_norm": 1.414086162153916, "learning_rate": 6.808122850410461e-08, "loss": 0.6686, "step": 1110 }, { "epoch": 0.4448, "grad_norm": 1.9404063883884395, "learning_rate": 6.79509812125623e-08, "loss": -0.039, "step": 1112 }, { "epoch": 0.4456, "grad_norm": 3.1144430060760655, "learning_rate": 6.782059393566253e-08, "loss": 0.1551, "step": 1114 }, { "epoch": 0.4464, "grad_norm": 0.954886541069574, "learning_rate": 6.769006769019147e-08, "loss": 0.4238, "step": 1116 }, { "epoch": 0.4472, "grad_norm": 2.417988928194323, "learning_rate": 6.755940349401899e-08, "loss": 0.4811, "step": 1118 }, { "epoch": 0.448, "grad_norm": 0.7492639665364688, "learning_rate": 6.742860236609076e-08, "loss": 0.1234, "step": 1120 }, { "epoch": 0.4488, "grad_norm": 1.3267895869858666, "learning_rate": 6.729766532642024e-08, "loss": -0.0093, "step": 1122 }, { "epoch": 0.4496, "grad_norm": 3.1523703905692986, "learning_rate": 6.716659339608076e-08, "loss": -1.1553, "step": 1124 }, { "epoch": 0.4504, "grad_norm": 6.3271911129401595, "learning_rate": 6.70353875971976e-08, "loss": 0.4817, "step": 1126 }, { "epoch": 0.4512, "grad_norm": 3.458276951838705, "learning_rate": 6.690404895293986e-08, "loss": -0.106, "step": 1128 }, { "epoch": 0.452, "grad_norm": 1.1983521344982837, "learning_rate": 6.677257848751275e-08, "loss": -0.1368, "step": 1130 }, { "epoch": 0.4528, "grad_norm": 2.562911563697214, "learning_rate": 6.664097722614933e-08, "loss": -0.6246, "step": 1132 }, { "epoch": 0.4536, "grad_norm": 2.292976959310392, "learning_rate": 6.650924619510268e-08, "loss": -0.3059, "step": 1134 }, { "epoch": 0.4544, "grad_norm": 1.9146997642645225, "learning_rate": 6.637738642163784e-08, "loss": -0.4552, "step": 1136 }, { "epoch": 0.4552, "grad_norm": 2.303506195238013, "learning_rate": 6.624539893402382e-08, "loss": -0.0411, "step": 1138 }, { "epoch": 0.456, "grad_norm": 2.429304538352616, "learning_rate": 6.611328476152556e-08, "loss": -0.09, "step": 1140 }, { "epoch": 0.4568, "grad_norm": 1.5901142036458538, "learning_rate": 6.598104493439589e-08, "loss": 0.366, "step": 1142 }, { "epoch": 0.4576, "grad_norm": 2.2501212390984793, "learning_rate": 6.58486804838676e-08, "loss": -0.2237, "step": 1144 }, { "epoch": 0.4584, "grad_norm": 3.1938054664306015, "learning_rate": 6.57161924421452e-08, "loss": -0.2891, "step": 1146 }, { "epoch": 0.4592, "grad_norm": 0.8017365109031118, "learning_rate": 6.558358184239709e-08, "loss": -0.1112, "step": 1148 }, { "epoch": 0.46, "grad_norm": 2.0186522257754107, "learning_rate": 6.545084971874738e-08, "loss": 0.1282, "step": 1150 }, { "epoch": 0.4608, "grad_norm": 1.474938378888324, "learning_rate": 6.531799710626778e-08, "loss": 0.1798, "step": 1152 }, { "epoch": 0.4616, "grad_norm": 7.871129967536662, "learning_rate": 6.518502504096971e-08, "loss": -0.8227, "step": 1154 }, { "epoch": 0.4624, "grad_norm": 2.8902307300787933, "learning_rate": 6.505193455979603e-08, "loss": 0.2635, "step": 1156 }, { "epoch": 0.4632, "grad_norm": 1.143186288398571, "learning_rate": 6.491872670061302e-08, "loss": -0.0355, "step": 1158 }, { "epoch": 0.464, "grad_norm": 3.7075989676791417, "learning_rate": 6.478540250220234e-08, "loss": -0.0786, "step": 1160 }, { "epoch": 0.4648, "grad_norm": 4.792785138183704, "learning_rate": 6.465196300425286e-08, "loss": -0.4437, "step": 1162 }, { "epoch": 0.4656, "grad_norm": 0.8922724462770598, "learning_rate": 6.451840924735264e-08, "loss": -0.2213, "step": 1164 }, { "epoch": 0.4664, "grad_norm": 2.491727325910274, "learning_rate": 6.438474227298065e-08, "loss": 0.0669, "step": 1166 }, { "epoch": 0.4672, "grad_norm": 1.4732480640539267, "learning_rate": 6.42509631234988e-08, "loss": 0.2351, "step": 1168 }, { "epoch": 0.468, "grad_norm": 8.137988516764139, "learning_rate": 6.411707284214382e-08, "loss": -0.3911, "step": 1170 }, { "epoch": 0.4688, "grad_norm": 8.004252108406288, "learning_rate": 6.398307247301899e-08, "loss": -0.0938, "step": 1172 }, { "epoch": 0.4696, "grad_norm": 1.9959364399427888, "learning_rate": 6.384896306108611e-08, "loss": -0.1873, "step": 1174 }, { "epoch": 0.4704, "grad_norm": 10.098214183461678, "learning_rate": 6.371474565215733e-08, "loss": 0.0064, "step": 1176 }, { "epoch": 0.4712, "grad_norm": 0.6662802639277847, "learning_rate": 6.358042129288693e-08, "loss": -0.1318, "step": 1178 }, { "epoch": 0.472, "grad_norm": 3.1607119261285224, "learning_rate": 6.344599103076328e-08, "loss": 0.7315, "step": 1180 }, { "epoch": 0.4728, "grad_norm": 2.020466390354618, "learning_rate": 6.331145591410057e-08, "loss": 0.2858, "step": 1182 }, { "epoch": 0.4736, "grad_norm": 0.4928769684178318, "learning_rate": 6.317681699203063e-08, "loss": 0.2666, "step": 1184 }, { "epoch": 0.4744, "grad_norm": 9.872920047662523, "learning_rate": 6.304207531449485e-08, "loss": 0.0105, "step": 1186 }, { "epoch": 0.4752, "grad_norm": 8.16277185549274, "learning_rate": 6.290723193223589e-08, "loss": -0.5512, "step": 1188 }, { "epoch": 0.476, "grad_norm": 2.8766477891050917, "learning_rate": 6.277228789678953e-08, "loss": 0.6091, "step": 1190 }, { "epoch": 0.4768, "grad_norm": 0.6120893860982233, "learning_rate": 6.263724426047647e-08, "loss": 0.1113, "step": 1192 }, { "epoch": 0.4776, "grad_norm": 2.867371220131585, "learning_rate": 6.25021020763941e-08, "loss": -0.1485, "step": 1194 }, { "epoch": 0.4784, "grad_norm": 2.3805466052582953, "learning_rate": 6.236686239840835e-08, "loss": 0.1272, "step": 1196 }, { "epoch": 0.4792, "grad_norm": 0.8751211655303213, "learning_rate": 6.223152628114536e-08, "loss": 0.1164, "step": 1198 }, { "epoch": 0.48, "grad_norm": 3.7721597694707953, "learning_rate": 6.209609477998338e-08, "loss": 0.7313, "step": 1200 }, { "epoch": 0.4808, "grad_norm": 3.419798472878702, "learning_rate": 6.196056895104447e-08, "loss": 0.013, "step": 1202 }, { "epoch": 0.4816, "grad_norm": 4.687774153819853, "learning_rate": 6.182494985118624e-08, "loss": 0.23, "step": 1204 }, { "epoch": 0.4824, "grad_norm": 2.125081697473343, "learning_rate": 6.168923853799368e-08, "loss": -0.0661, "step": 1206 }, { "epoch": 0.4832, "grad_norm": 2.556521965745637, "learning_rate": 6.15534360697709e-08, "loss": -0.1983, "step": 1208 }, { "epoch": 0.484, "grad_norm": 1.2329813510099201, "learning_rate": 6.141754350553279e-08, "loss": 0.2845, "step": 1210 }, { "epoch": 0.4848, "grad_norm": 1.3362598649725015, "learning_rate": 6.128156190499687e-08, "loss": -0.2324, "step": 1212 }, { "epoch": 0.4856, "grad_norm": 6.79709632065024, "learning_rate": 6.114549232857502e-08, "loss": -0.3551, "step": 1214 }, { "epoch": 0.4864, "grad_norm": 1.2638697254758269, "learning_rate": 6.100933583736507e-08, "loss": 0.255, "step": 1216 }, { "epoch": 0.4872, "grad_norm": 5.7754492883128385, "learning_rate": 6.087309349314274e-08, "loss": -0.6309, "step": 1218 }, { "epoch": 0.488, "grad_norm": 0.9571850975922288, "learning_rate": 6.073676635835316e-08, "loss": -0.2826, "step": 1220 }, { "epoch": 0.4888, "grad_norm": 1.4309851718621818, "learning_rate": 6.060035549610274e-08, "loss": 0.2769, "step": 1222 }, { "epoch": 0.4896, "grad_norm": 1.0145754210048563, "learning_rate": 6.046386197015075e-08, "loss": -0.9362, "step": 1224 }, { "epoch": 0.4904, "grad_norm": 6.626223220436085, "learning_rate": 6.032728684490118e-08, "loss": 0.4322, "step": 1226 }, { "epoch": 0.4912, "grad_norm": 1.3984038449961043, "learning_rate": 6.019063118539424e-08, "loss": -0.4794, "step": 1228 }, { "epoch": 0.492, "grad_norm": 0.5800974015428817, "learning_rate": 6.005389605729824e-08, "loss": -0.2262, "step": 1230 }, { "epoch": 0.4928, "grad_norm": 1.6940574308328136, "learning_rate": 5.991708252690116e-08, "loss": 0.1601, "step": 1232 }, { "epoch": 0.4936, "grad_norm": 1.18887607438921, "learning_rate": 5.978019166110241e-08, "loss": 0.3557, "step": 1234 }, { "epoch": 0.4944, "grad_norm": 3.2297465206800453, "learning_rate": 5.964322452740445e-08, "loss": 0.4208, "step": 1236 }, { "epoch": 0.4952, "grad_norm": 2.763317691111721, "learning_rate": 5.950618219390451e-08, "loss": -0.101, "step": 1238 }, { "epoch": 0.496, "grad_norm": 1.9082022213348315, "learning_rate": 5.936906572928624e-08, "loss": 0.0438, "step": 1240 }, { "epoch": 0.4968, "grad_norm": 0.7433758520431626, "learning_rate": 5.923187620281135e-08, "loss": 0.3283, "step": 1242 }, { "epoch": 0.4976, "grad_norm": 2.7729120367779854, "learning_rate": 5.909461468431134e-08, "loss": -0.0354, "step": 1244 }, { "epoch": 0.4984, "grad_norm": 20.0112850631017, "learning_rate": 5.895728224417912e-08, "loss": -1.2127, "step": 1246 }, { "epoch": 0.4992, "grad_norm": 1.0654897055122667, "learning_rate": 5.881987995336062e-08, "loss": -0.0584, "step": 1248 }, { "epoch": 0.5, "grad_norm": 2.7229608606775404, "learning_rate": 5.868240888334653e-08, "loss": -0.6298, "step": 1250 }, { "epoch": 0.5008, "grad_norm": 4.006337091885257, "learning_rate": 5.854487010616384e-08, "loss": -0.3494, "step": 1252 }, { "epoch": 0.5016, "grad_norm": 5.495686782120867, "learning_rate": 5.840726469436757e-08, "loss": 0.2171, "step": 1254 }, { "epoch": 0.5024, "grad_norm": 1.1311488028436698, "learning_rate": 5.826959372103239e-08, "loss": 0.3292, "step": 1256 }, { "epoch": 0.5032, "grad_norm": 3.065820295521943, "learning_rate": 5.8131858259744184e-08, "loss": -0.4639, "step": 1258 }, { "epoch": 0.504, "grad_norm": 1.6251303612715686, "learning_rate": 5.799405938459174e-08, "loss": -0.1503, "step": 1260 }, { "epoch": 0.5048, "grad_norm": 0.863499018143362, "learning_rate": 5.7856198170158386e-08, "loss": 0.2304, "step": 1262 }, { "epoch": 0.5056, "grad_norm": 1.1532321430846115, "learning_rate": 5.771827569151356e-08, "loss": 0.459, "step": 1264 }, { "epoch": 0.5064, "grad_norm": 1.7764896164807287, "learning_rate": 5.758029302420445e-08, "loss": -0.0018, "step": 1266 }, { "epoch": 0.5072, "grad_norm": 2.650509930359115, "learning_rate": 5.744225124424761e-08, "loss": 0.4307, "step": 1268 }, { "epoch": 0.508, "grad_norm": 18.546073788989784, "learning_rate": 5.730415142812058e-08, "loss": -0.3913, "step": 1270 }, { "epoch": 0.5088, "grad_norm": 0.9501942078311948, "learning_rate": 5.716599465275347e-08, "loss": -0.1711, "step": 1272 }, { "epoch": 0.5096, "grad_norm": 1.138324851030599, "learning_rate": 5.702778199552054e-08, "loss": 0.2208, "step": 1274 }, { "epoch": 0.5104, "grad_norm": 6.125933607526262, "learning_rate": 5.68895145342319e-08, "loss": -0.6492, "step": 1276 }, { "epoch": 0.5112, "grad_norm": 1.656529407336565, "learning_rate": 5.6751193347124956e-08, "loss": -0.855, "step": 1278 }, { "epoch": 0.512, "grad_norm": 4.206513837985695, "learning_rate": 5.6612819512856126e-08, "loss": -0.3752, "step": 1280 }, { "epoch": 0.5128, "grad_norm": 2.233239265114843, "learning_rate": 5.647439411049234e-08, "loss": -0.0278, "step": 1282 }, { "epoch": 0.5136, "grad_norm": 6.244695059810562, "learning_rate": 5.6335918219502735e-08, "loss": -0.6849, "step": 1284 }, { "epoch": 0.5144, "grad_norm": 5.213764846569251, "learning_rate": 5.6197392919750087e-08, "loss": 0.1645, "step": 1286 }, { "epoch": 0.5152, "grad_norm": 1.6884839847809912, "learning_rate": 5.605881929148253e-08, "loss": 0.1449, "step": 1288 }, { "epoch": 0.516, "grad_norm": 4.038680184005945, "learning_rate": 5.592019841532506e-08, "loss": -0.6065, "step": 1290 }, { "epoch": 0.5168, "grad_norm": 0.9467192350608314, "learning_rate": 5.578153137227108e-08, "loss": 0.0454, "step": 1292 }, { "epoch": 0.5176, "grad_norm": 1.9828901866201523, "learning_rate": 5.564281924367408e-08, "loss": 0.1477, "step": 1294 }, { "epoch": 0.5184, "grad_norm": 0.7766424541672694, "learning_rate": 5.5504063111239105e-08, "loss": -0.1194, "step": 1296 }, { "epoch": 0.5192, "grad_norm": 2.870243408385356, "learning_rate": 5.536526405701433e-08, "loss": -0.0621, "step": 1298 }, { "epoch": 0.52, "grad_norm": 4.470988805451735, "learning_rate": 5.5226423163382677e-08, "loss": -0.2258, "step": 1300 }, { "epoch": 0.5208, "grad_norm": 1.6523567908201209, "learning_rate": 5.5087541513053315e-08, "loss": 0.468, "step": 1302 }, { "epoch": 0.5216, "grad_norm": 8.354761935132903, "learning_rate": 5.494862018905325e-08, "loss": -0.6695, "step": 1304 }, { "epoch": 0.5224, "grad_norm": 1.0600003214166593, "learning_rate": 5.480966027471888e-08, "loss": 0.1322, "step": 1306 }, { "epoch": 0.5232, "grad_norm": 1.447189011032088, "learning_rate": 5.4670662853687534e-08, "loss": 0.3698, "step": 1308 }, { "epoch": 0.524, "grad_norm": 24.626486600138083, "learning_rate": 5.4531629009889016e-08, "loss": -0.9646, "step": 1310 }, { "epoch": 0.5248, "grad_norm": 1.347665993514397, "learning_rate": 5.4392559827537165e-08, "loss": -0.3711, "step": 1312 }, { "epoch": 0.5256, "grad_norm": 1.4919349101226638, "learning_rate": 5.42534563911214e-08, "loss": 0.0689, "step": 1314 }, { "epoch": 0.5264, "grad_norm": 2.758769622371056, "learning_rate": 5.4114319785398274e-08, "loss": -0.8156, "step": 1316 }, { "epoch": 0.5272, "grad_norm": 1.6889051720367219, "learning_rate": 5.397515109538299e-08, "loss": -0.0759, "step": 1318 }, { "epoch": 0.528, "grad_norm": 1.1002825695875742, "learning_rate": 5.383595140634093e-08, "loss": -0.2669, "step": 1320 }, { "epoch": 0.5288, "grad_norm": 2.659963420284105, "learning_rate": 5.369672180377926e-08, "loss": 0.5772, "step": 1322 }, { "epoch": 0.5296, "grad_norm": 1.2855668675869305, "learning_rate": 5.355746337343835e-08, "loss": -0.5896, "step": 1324 }, { "epoch": 0.5304, "grad_norm": 0.5835664795498647, "learning_rate": 5.341817720128343e-08, "loss": 0.0908, "step": 1326 }, { "epoch": 0.5312, "grad_norm": 3.3942164266943045, "learning_rate": 5.327886437349608e-08, "loss": -0.2154, "step": 1328 }, { "epoch": 0.532, "grad_norm": 3.1474290472029947, "learning_rate": 5.313952597646567e-08, "loss": -0.2349, "step": 1330 }, { "epoch": 0.5328, "grad_norm": 2.2807680960100885, "learning_rate": 5.300016309678104e-08, "loss": -0.295, "step": 1332 }, { "epoch": 0.5336, "grad_norm": 16.907562511619666, "learning_rate": 5.286077682122191e-08, "loss": -0.3385, "step": 1334 }, { "epoch": 0.5344, "grad_norm": 1.5533490919001942, "learning_rate": 5.272136823675045e-08, "loss": 0.1332, "step": 1336 }, { "epoch": 0.5352, "grad_norm": 8.807183927730819, "learning_rate": 5.258193843050283e-08, "loss": -0.1528, "step": 1338 }, { "epoch": 0.536, "grad_norm": 1.4862400548123054, "learning_rate": 5.2442488489780666e-08, "loss": 0.1324, "step": 1340 }, { "epoch": 0.5368, "grad_norm": 2.753596760613398, "learning_rate": 5.230301950204261e-08, "loss": 0.2668, "step": 1342 }, { "epoch": 0.5376, "grad_norm": 1.3967208213678357, "learning_rate": 5.216353255489585e-08, "loss": 0.1049, "step": 1344 }, { "epoch": 0.5384, "grad_norm": 3.5485496578024294, "learning_rate": 5.2024028736087624e-08, "loss": 0.3956, "step": 1346 }, { "epoch": 0.5392, "grad_norm": 10.383666207219719, "learning_rate": 5.188450913349674e-08, "loss": -0.5953, "step": 1348 }, { "epoch": 0.54, "grad_norm": 1.361423830248357, "learning_rate": 5.1744974835125056e-08, "loss": 0.3575, "step": 1350 }, { "epoch": 0.5408, "grad_norm": 1.3762037764186998, "learning_rate": 5.160542692908908e-08, "loss": 0.0923, "step": 1352 }, { "epoch": 0.5416, "grad_norm": 0.7036483104762083, "learning_rate": 5.146586650361142e-08, "loss": 0.0543, "step": 1354 }, { "epoch": 0.5424, "grad_norm": 1.7035046490079102, "learning_rate": 5.132629464701229e-08, "loss": 0.594, "step": 1356 }, { "epoch": 0.5432, "grad_norm": 0.6577494292524566, "learning_rate": 5.11867124477011e-08, "loss": -0.4076, "step": 1358 }, { "epoch": 0.544, "grad_norm": 1.705082611585673, "learning_rate": 5.104712099416785e-08, "loss": 0.0251, "step": 1360 }, { "epoch": 0.5448, "grad_norm": 3.689442768358194, "learning_rate": 5.090752137497474e-08, "loss": 0.3629, "step": 1362 }, { "epoch": 0.5456, "grad_norm": 2.0580858641655766, "learning_rate": 5.0767914678747645e-08, "loss": -0.6431, "step": 1364 }, { "epoch": 0.5464, "grad_norm": 4.358761098082887, "learning_rate": 5.0628301994167634e-08, "loss": -0.1779, "step": 1366 }, { "epoch": 0.5472, "grad_norm": 2.246896501191125, "learning_rate": 5.048868440996246e-08, "loss": 0.1894, "step": 1368 }, { "epoch": 0.548, "grad_norm": 2.3402721079385125, "learning_rate": 5.034906301489807e-08, "loss": 0.2495, "step": 1370 }, { "epoch": 0.5488, "grad_norm": 1.6112023177078798, "learning_rate": 5.02094388977702e-08, "loss": 0.4752, "step": 1372 }, { "epoch": 0.5496, "grad_norm": 1.0857915819905197, "learning_rate": 5.0069813147395725e-08, "loss": 0.0583, "step": 1374 }, { "epoch": 0.5504, "grad_norm": 1.3169352802028227, "learning_rate": 4.993018685260428e-08, "loss": 0.1941, "step": 1376 }, { "epoch": 0.5512, "grad_norm": 8.45210897957043, "learning_rate": 4.979056110222981e-08, "loss": 0.2704, "step": 1378 }, { "epoch": 0.552, "grad_norm": 2.195741227779714, "learning_rate": 4.9650936985101923e-08, "loss": -0.2134, "step": 1380 }, { "epoch": 0.5528, "grad_norm": 6.579583911683551, "learning_rate": 4.9511315590037556e-08, "loss": 0.4378, "step": 1382 }, { "epoch": 0.5536, "grad_norm": 5.730838024543505, "learning_rate": 4.937169800583236e-08, "loss": -0.9895, "step": 1384 }, { "epoch": 0.5544, "grad_norm": 2.261790288927768, "learning_rate": 4.923208532125235e-08, "loss": 0.3107, "step": 1386 }, { "epoch": 0.5552, "grad_norm": 3.5203869215668147, "learning_rate": 4.909247862502526e-08, "loss": -0.0657, "step": 1388 }, { "epoch": 0.556, "grad_norm": 1.1765877795788182, "learning_rate": 4.895287900583215e-08, "loss": 0.3294, "step": 1390 }, { "epoch": 0.5568, "grad_norm": 6.610541172365496, "learning_rate": 4.881328755229891e-08, "loss": 0.8219, "step": 1392 }, { "epoch": 0.5576, "grad_norm": 2.047741288861338, "learning_rate": 4.867370535298769e-08, "loss": -0.1988, "step": 1394 }, { "epoch": 0.5584, "grad_norm": 1.5984989104657576, "learning_rate": 4.8534133496388587e-08, "loss": 0.1535, "step": 1396 }, { "epoch": 0.5592, "grad_norm": 2.9232487652209476, "learning_rate": 4.839457307091092e-08, "loss": -0.6327, "step": 1398 }, { "epoch": 0.56, "grad_norm": 6.6685003828240825, "learning_rate": 4.8255025164874966e-08, "loss": -0.2491, "step": 1400 }, { "epoch": 0.5608, "grad_norm": 2.122107355750566, "learning_rate": 4.8115490866503265e-08, "loss": 0.3453, "step": 1402 }, { "epoch": 0.5616, "grad_norm": 1.9370580784349283, "learning_rate": 4.797597126391238e-08, "loss": -0.1718, "step": 1404 }, { "epoch": 0.5624, "grad_norm": 1.1675280554254117, "learning_rate": 4.783646744510415e-08, "loss": 0.3293, "step": 1406 }, { "epoch": 0.5632, "grad_norm": 1.3803962338075701, "learning_rate": 4.7696980497957376e-08, "loss": 0.0989, "step": 1408 }, { "epoch": 0.564, "grad_norm": 4.01537644874263, "learning_rate": 4.7557511510219336e-08, "loss": -0.3572, "step": 1410 }, { "epoch": 0.5648, "grad_norm": 2.09156888701562, "learning_rate": 4.741806156949717e-08, "loss": -0.5134, "step": 1412 }, { "epoch": 0.5656, "grad_norm": 1.1090034858220563, "learning_rate": 4.727863176324955e-08, "loss": -0.4258, "step": 1414 }, { "epoch": 0.5664, "grad_norm": 1.561513006214317, "learning_rate": 4.7139223178778094e-08, "loss": 1.0116, "step": 1416 }, { "epoch": 0.5672, "grad_norm": 2.3494087850012355, "learning_rate": 4.6999836903218977e-08, "loss": -0.3086, "step": 1418 }, { "epoch": 0.568, "grad_norm": 3.018769501552223, "learning_rate": 4.686047402353433e-08, "loss": -0.2078, "step": 1420 }, { "epoch": 0.5688, "grad_norm": 1.7120820897257356, "learning_rate": 4.6721135626503934e-08, "loss": -0.2402, "step": 1422 }, { "epoch": 0.5696, "grad_norm": 3.19271086146731, "learning_rate": 4.658182279871657e-08, "loss": 0.1192, "step": 1424 }, { "epoch": 0.5704, "grad_norm": 1.1452136953708933, "learning_rate": 4.6442536626561665e-08, "loss": -0.2687, "step": 1426 }, { "epoch": 0.5712, "grad_norm": 0.8475452145849302, "learning_rate": 4.630327819622075e-08, "loss": 0.3452, "step": 1428 }, { "epoch": 0.572, "grad_norm": 0.8234581334530525, "learning_rate": 4.6164048593659066e-08, "loss": -0.2424, "step": 1430 }, { "epoch": 0.5728, "grad_norm": 2.4890263153905052, "learning_rate": 4.6024848904617014e-08, "loss": -0.9922, "step": 1432 }, { "epoch": 0.5736, "grad_norm": 1.9063590684348184, "learning_rate": 4.5885680214601715e-08, "loss": 0.1718, "step": 1434 }, { "epoch": 0.5744, "grad_norm": 1.8826789109483555, "learning_rate": 4.57465436088786e-08, "loss": 0.4459, "step": 1436 }, { "epoch": 0.5752, "grad_norm": 2.2514983487036946, "learning_rate": 4.5607440172462844e-08, "loss": 0.0877, "step": 1438 }, { "epoch": 0.576, "grad_norm": 4.746528942441705, "learning_rate": 4.5468370990111e-08, "loss": -0.3161, "step": 1440 }, { "epoch": 0.5768, "grad_norm": 3.997914923728025, "learning_rate": 4.5329337146312474e-08, "loss": -0.3476, "step": 1442 }, { "epoch": 0.5776, "grad_norm": 1.5223776751107094, "learning_rate": 4.5190339725281135e-08, "loss": 0.212, "step": 1444 }, { "epoch": 0.5784, "grad_norm": 12.004095787528229, "learning_rate": 4.505137981094675e-08, "loss": 0.1348, "step": 1446 }, { "epoch": 0.5792, "grad_norm": 2.6311737832063007, "learning_rate": 4.491245848694669e-08, "loss": -0.2547, "step": 1448 }, { "epoch": 0.58, "grad_norm": 3.5026859584821617, "learning_rate": 4.477357683661733e-08, "loss": 0.4626, "step": 1450 }, { "epoch": 0.5808, "grad_norm": 2.096013846352325, "learning_rate": 4.463473594298566e-08, "loss": 0.5435, "step": 1452 }, { "epoch": 0.5816, "grad_norm": 0.870773168164334, "learning_rate": 4.44959368887609e-08, "loss": 0.1898, "step": 1454 }, { "epoch": 0.5824, "grad_norm": 1.555370027932697, "learning_rate": 4.435718075632591e-08, "loss": 0.4965, "step": 1456 }, { "epoch": 0.5832, "grad_norm": 1.3573063856986074, "learning_rate": 4.421846862772893e-08, "loss": 0.4763, "step": 1458 }, { "epoch": 0.584, "grad_norm": 1.1211818311912025, "learning_rate": 4.407980158467495e-08, "loss": -0.1748, "step": 1460 }, { "epoch": 0.5848, "grad_norm": 6.912907432379036, "learning_rate": 4.3941180708517485e-08, "loss": -0.269, "step": 1462 }, { "epoch": 0.5856, "grad_norm": 8.017186323455453, "learning_rate": 4.380260708024991e-08, "loss": -0.0244, "step": 1464 }, { "epoch": 0.5864, "grad_norm": 1.0752198808722395, "learning_rate": 4.3664081780497274e-08, "loss": 0.0872, "step": 1466 }, { "epoch": 0.5872, "grad_norm": 2.4634723955812987, "learning_rate": 4.352560588950766e-08, "loss": 0.077, "step": 1468 }, { "epoch": 0.588, "grad_norm": 2.981216805268154, "learning_rate": 4.338718048714387e-08, "loss": -0.8672, "step": 1470 }, { "epoch": 0.5888, "grad_norm": 1.4875870951817871, "learning_rate": 4.324880665287504e-08, "loss": -0.6377, "step": 1472 }, { "epoch": 0.5896, "grad_norm": 1.782623092796258, "learning_rate": 4.31104854657681e-08, "loss": -0.0933, "step": 1474 }, { "epoch": 0.5904, "grad_norm": 0.8071293257332645, "learning_rate": 4.2972218004479454e-08, "loss": 0.352, "step": 1476 }, { "epoch": 0.5912, "grad_norm": 0.769833398090707, "learning_rate": 4.2834005347246527e-08, "loss": 0.3886, "step": 1478 }, { "epoch": 0.592, "grad_norm": 2.9776410320929094, "learning_rate": 4.269584857187942e-08, "loss": -0.2999, "step": 1480 }, { "epoch": 0.5928, "grad_norm": 2.39213087733995, "learning_rate": 4.2557748755752384e-08, "loss": -0.0361, "step": 1482 }, { "epoch": 0.5936, "grad_norm": 2.819987283948475, "learning_rate": 4.2419706975795565e-08, "loss": -0.4916, "step": 1484 }, { "epoch": 0.5944, "grad_norm": 6.92777825106192, "learning_rate": 4.228172430848644e-08, "loss": -0.0633, "step": 1486 }, { "epoch": 0.5952, "grad_norm": 1.4052822764386514, "learning_rate": 4.214380182984163e-08, "loss": 0.018, "step": 1488 }, { "epoch": 0.596, "grad_norm": 3.8090615419941787, "learning_rate": 4.200594061540826e-08, "loss": 0.0335, "step": 1490 }, { "epoch": 0.5968, "grad_norm": 1.6846374599762706, "learning_rate": 4.186814174025582e-08, "loss": -0.1758, "step": 1492 }, { "epoch": 0.5976, "grad_norm": 1.072240401366043, "learning_rate": 4.1730406278967615e-08, "loss": 0.4449, "step": 1494 }, { "epoch": 0.5984, "grad_norm": 10.451556251883762, "learning_rate": 4.159273530563242e-08, "loss": -0.4106, "step": 1496 }, { "epoch": 0.5992, "grad_norm": 11.313841249734088, "learning_rate": 4.145512989383617e-08, "loss": -0.8206, "step": 1498 }, { "epoch": 0.6, "grad_norm": 2.4207080559333343, "learning_rate": 4.131759111665348e-08, "loss": -0.2507, "step": 1500 }, { "epoch": 0.6008, "grad_norm": 1.2087387002841612, "learning_rate": 4.118012004663939e-08, "loss": 0.1111, "step": 1502 }, { "epoch": 0.6016, "grad_norm": 11.267790806667549, "learning_rate": 4.1042717755820884e-08, "loss": -0.74, "step": 1504 }, { "epoch": 0.6024, "grad_norm": 1.578751129473022, "learning_rate": 4.0905385315688665e-08, "loss": -0.04, "step": 1506 }, { "epoch": 0.6032, "grad_norm": 3.0840269712264425, "learning_rate": 4.076812379718866e-08, "loss": -0.031, "step": 1508 }, { "epoch": 0.604, "grad_norm": 0.8881811712106842, "learning_rate": 4.0630934270713755e-08, "loss": 0.2177, "step": 1510 }, { "epoch": 0.6048, "grad_norm": 0.4207527868210718, "learning_rate": 4.04938178060955e-08, "loss": -0.1827, "step": 1512 }, { "epoch": 0.6056, "grad_norm": 2.082799290232294, "learning_rate": 4.035677547259554e-08, "loss": 0.3833, "step": 1514 }, { "epoch": 0.6064, "grad_norm": 1.824044182709251, "learning_rate": 4.0219808338897595e-08, "loss": -0.3403, "step": 1516 }, { "epoch": 0.6072, "grad_norm": 4.001405593088844, "learning_rate": 4.008291747309884e-08, "loss": -0.1168, "step": 1518 }, { "epoch": 0.608, "grad_norm": 7.045037320579325, "learning_rate": 3.9946103942701774e-08, "loss": -0.7949, "step": 1520 }, { "epoch": 0.6088, "grad_norm": 2.254675863007796, "learning_rate": 3.980936881460576e-08, "loss": 0.1668, "step": 1522 }, { "epoch": 0.6096, "grad_norm": 1.212683615035007, "learning_rate": 3.967271315509884e-08, "loss": 0.3422, "step": 1524 }, { "epoch": 0.6104, "grad_norm": 0.724005345416449, "learning_rate": 3.9536138029849244e-08, "loss": -0.3961, "step": 1526 }, { "epoch": 0.6112, "grad_norm": 0.8678246145177999, "learning_rate": 3.939964450389728e-08, "loss": -0.1928, "step": 1528 }, { "epoch": 0.612, "grad_norm": 2.191880510252536, "learning_rate": 3.926323364164684e-08, "loss": 0.6699, "step": 1530 }, { "epoch": 0.6128, "grad_norm": 2.551242846475273, "learning_rate": 3.912690650685726e-08, "loss": 0.614, "step": 1532 }, { "epoch": 0.6136, "grad_norm": 3.3125167161611127, "learning_rate": 3.8990664162634925e-08, "loss": 0.0522, "step": 1534 }, { "epoch": 0.6144, "grad_norm": 1.2654840033977905, "learning_rate": 3.8854507671424976e-08, "loss": 0.2405, "step": 1536 }, { "epoch": 0.6152, "grad_norm": 21.026384912783755, "learning_rate": 3.8718438095003126e-08, "loss": -0.8664, "step": 1538 }, { "epoch": 0.616, "grad_norm": 2.0778940857004526, "learning_rate": 3.858245649446721e-08, "loss": 0.4077, "step": 1540 }, { "epoch": 0.6168, "grad_norm": 1.6499639662053487, "learning_rate": 3.8446563930229115e-08, "loss": 0.5043, "step": 1542 }, { "epoch": 0.6176, "grad_norm": 3.6786502697072185, "learning_rate": 3.831076146200632e-08, "loss": 0.498, "step": 1544 }, { "epoch": 0.6184, "grad_norm": 5.321778159467506, "learning_rate": 3.8175050148813775e-08, "loss": -1.2537, "step": 1546 }, { "epoch": 0.6192, "grad_norm": 2.1309052496847487, "learning_rate": 3.8039431048955536e-08, "loss": -0.221, "step": 1548 }, { "epoch": 0.62, "grad_norm": 7.567379404853537, "learning_rate": 3.7903905220016615e-08, "loss": -0.3417, "step": 1550 }, { "epoch": 0.6208, "grad_norm": 1.7798868128143455, "learning_rate": 3.776847371885464e-08, "loss": 0.0114, "step": 1552 }, { "epoch": 0.6216, "grad_norm": 1.9507417381620744, "learning_rate": 3.763313760159164e-08, "loss": 0.3459, "step": 1554 }, { "epoch": 0.6224, "grad_norm": 5.572740486405863, "learning_rate": 3.749789792360589e-08, "loss": -0.2116, "step": 1556 }, { "epoch": 0.6232, "grad_norm": 2.7566473475035354, "learning_rate": 3.7362755739523535e-08, "loss": 0.1338, "step": 1558 }, { "epoch": 0.624, "grad_norm": 1.580365481427382, "learning_rate": 3.7227712103210477e-08, "loss": 0.2273, "step": 1560 }, { "epoch": 0.6248, "grad_norm": 1.3510627818996688, "learning_rate": 3.709276806776412e-08, "loss": 0.129, "step": 1562 }, { "epoch": 0.6256, "grad_norm": 1.7556788740153348, "learning_rate": 3.695792468550516e-08, "loss": 0.1901, "step": 1564 }, { "epoch": 0.6264, "grad_norm": 1.6324801438453431, "learning_rate": 3.682318300796937e-08, "loss": 0.1479, "step": 1566 }, { "epoch": 0.6272, "grad_norm": 0.9547955178731168, "learning_rate": 3.668854408589944e-08, "loss": -0.7838, "step": 1568 }, { "epoch": 0.628, "grad_norm": 2.8978051303938144, "learning_rate": 3.6554008969236713e-08, "loss": 0.6166, "step": 1570 }, { "epoch": 0.6288, "grad_norm": 2.8743673901784264, "learning_rate": 3.641957870711305e-08, "loss": -0.436, "step": 1572 }, { "epoch": 0.6296, "grad_norm": 2.503466521405713, "learning_rate": 3.6285254347842674e-08, "loss": 0.1542, "step": 1574 }, { "epoch": 0.6304, "grad_norm": 3.2381022094414145, "learning_rate": 3.615103693891388e-08, "loss": 0.0577, "step": 1576 }, { "epoch": 0.6312, "grad_norm": 1.8671448154056591, "learning_rate": 3.601692752698101e-08, "loss": 0.2242, "step": 1578 }, { "epoch": 0.632, "grad_norm": 2.0522855223453926, "learning_rate": 3.588292715785617e-08, "loss": 0.4403, "step": 1580 }, { "epoch": 0.6328, "grad_norm": 3.1330172236269536, "learning_rate": 3.574903687650119e-08, "loss": 0.0543, "step": 1582 }, { "epoch": 0.6336, "grad_norm": 1.3225626787576295, "learning_rate": 3.561525772701937e-08, "loss": 0.4842, "step": 1584 }, { "epoch": 0.6344, "grad_norm": 2.6414224390304817, "learning_rate": 3.548159075264738e-08, "loss": 0.2436, "step": 1586 }, { "epoch": 0.6352, "grad_norm": 1.7396874661159625, "learning_rate": 3.5348036995747135e-08, "loss": 0.0321, "step": 1588 }, { "epoch": 0.636, "grad_norm": 13.844859220247393, "learning_rate": 3.5214597497797685e-08, "loss": -0.244, "step": 1590 }, { "epoch": 0.6368, "grad_norm": 4.157937629367746, "learning_rate": 3.508127329938699e-08, "loss": -0.6609, "step": 1592 }, { "epoch": 0.6376, "grad_norm": 1.3790702379267534, "learning_rate": 3.4948065440203976e-08, "loss": -0.2822, "step": 1594 }, { "epoch": 0.6384, "grad_norm": 1.5374452328347417, "learning_rate": 3.481497495903029e-08, "loss": 0.1825, "step": 1596 }, { "epoch": 0.6392, "grad_norm": 3.5241161870744406, "learning_rate": 3.4682002893732196e-08, "loss": -0.2878, "step": 1598 }, { "epoch": 0.64, "grad_norm": 3.8388338807250255, "learning_rate": 3.4549150281252633e-08, "loss": -0.2252, "step": 1600 }, { "epoch": 0.6408, "grad_norm": 1.3127426977863277, "learning_rate": 3.4416418157602904e-08, "loss": 0.4729, "step": 1602 }, { "epoch": 0.6416, "grad_norm": 2.3638705643343987, "learning_rate": 3.428380755785481e-08, "loss": -0.1709, "step": 1604 }, { "epoch": 0.6424, "grad_norm": 1.8145811980021247, "learning_rate": 3.415131951613241e-08, "loss": -0.2825, "step": 1606 }, { "epoch": 0.6432, "grad_norm": 2.4403191054850026, "learning_rate": 3.40189550656041e-08, "loss": -0.955, "step": 1608 }, { "epoch": 0.644, "grad_norm": 2.397240952582479, "learning_rate": 3.388671523847445e-08, "loss": 0.099, "step": 1610 }, { "epoch": 0.6448, "grad_norm": 1.6254363718545937, "learning_rate": 3.3754601065976186e-08, "loss": -0.12, "step": 1612 }, { "epoch": 0.6456, "grad_norm": 2.786169171848472, "learning_rate": 3.362261357836216e-08, "loss": 0.0492, "step": 1614 }, { "epoch": 0.6464, "grad_norm": 6.170904881710663, "learning_rate": 3.349075380489731e-08, "loss": -0.1465, "step": 1616 }, { "epoch": 0.6472, "grad_norm": 1.0165447336675035, "learning_rate": 3.335902277385067e-08, "loss": -0.452, "step": 1618 }, { "epoch": 0.648, "grad_norm": 1.7328957888882133, "learning_rate": 3.3227421512487255e-08, "loss": 0.0252, "step": 1620 }, { "epoch": 0.6488, "grad_norm": 1.6122988397073823, "learning_rate": 3.3095951047060146e-08, "loss": 0.3178, "step": 1622 }, { "epoch": 0.6496, "grad_norm": 3.0074140803632896, "learning_rate": 3.2964612402802415e-08, "loss": -0.3556, "step": 1624 }, { "epoch": 0.6504, "grad_norm": 3.4523497524991344, "learning_rate": 3.283340660391924e-08, "loss": -0.1988, "step": 1626 }, { "epoch": 0.6512, "grad_norm": 6.280464925298079, "learning_rate": 3.270233467357976e-08, "loss": -0.2276, "step": 1628 }, { "epoch": 0.652, "grad_norm": 5.443046138761929, "learning_rate": 3.2571397633909245e-08, "loss": -0.2363, "step": 1630 }, { "epoch": 0.6528, "grad_norm": 6.722447896835382, "learning_rate": 3.2440596505981e-08, "loss": -0.1321, "step": 1632 }, { "epoch": 0.6536, "grad_norm": 1.7144203731513765, "learning_rate": 3.230993230980852e-08, "loss": 0.3665, "step": 1634 }, { "epoch": 0.6544, "grad_norm": 4.593149677505712, "learning_rate": 3.217940606433747e-08, "loss": -0.4483, "step": 1636 }, { "epoch": 0.6552, "grad_norm": 3.078404716919227, "learning_rate": 3.2049018787437685e-08, "loss": 0.2563, "step": 1638 }, { "epoch": 0.656, "grad_norm": 0.4005810169418313, "learning_rate": 3.191877149589539e-08, "loss": -1.0674, "step": 1640 }, { "epoch": 0.6568, "grad_norm": 3.253000576185386, "learning_rate": 3.178866520540508e-08, "loss": 0.2512, "step": 1642 }, { "epoch": 0.6576, "grad_norm": 1.3405373696289142, "learning_rate": 3.16587009305618e-08, "loss": 0.2423, "step": 1644 }, { "epoch": 0.6584, "grad_norm": 3.0107159967656805, "learning_rate": 3.1528879684853026e-08, "loss": 0.3202, "step": 1646 }, { "epoch": 0.6592, "grad_norm": 1.133978959437793, "learning_rate": 3.1399202480650944e-08, "loss": -0.2006, "step": 1648 }, { "epoch": 0.66, "grad_norm": 0.8674875540781807, "learning_rate": 3.126967032920439e-08, "loss": 0.2478, "step": 1650 }, { "epoch": 0.6608, "grad_norm": 4.877486225676917, "learning_rate": 3.114028424063118e-08, "loss": 0.8045, "step": 1652 }, { "epoch": 0.6616, "grad_norm": 6.668255903811672, "learning_rate": 3.101104522390995e-08, "loss": 0.0588, "step": 1654 }, { "epoch": 0.6624, "grad_norm": 4.225599344016223, "learning_rate": 3.088195428687253e-08, "loss": -0.4538, "step": 1656 }, { "epoch": 0.6632, "grad_norm": 3.2904442215649268, "learning_rate": 3.075301243619603e-08, "loss": -0.8012, "step": 1658 }, { "epoch": 0.664, "grad_norm": 1.2097594409637307, "learning_rate": 3.0624220677394854e-08, "loss": 0.2129, "step": 1660 }, { "epoch": 0.6648, "grad_norm": 1.2181710897816944, "learning_rate": 3.0495580014813014e-08, "loss": 0.1647, "step": 1662 }, { "epoch": 0.6656, "grad_norm": 2.1392759816544125, "learning_rate": 3.036709145161625e-08, "loss": 0.0828, "step": 1664 }, { "epoch": 0.6664, "grad_norm": 2.1928200746586746, "learning_rate": 3.0238755989784186e-08, "loss": 0.337, "step": 1666 }, { "epoch": 0.6672, "grad_norm": 2.839822192787894, "learning_rate": 3.0110574630102516e-08, "loss": 0.2746, "step": 1668 }, { "epoch": 0.668, "grad_norm": 2.3747566081007414, "learning_rate": 2.998254837215526e-08, "loss": 0.2913, "step": 1670 }, { "epoch": 0.6688, "grad_norm": 1.0576318241526304, "learning_rate": 2.985467821431687e-08, "loss": 0.445, "step": 1672 }, { "epoch": 0.6696, "grad_norm": 5.321277022454464, "learning_rate": 2.9726965153744547e-08, "loss": -0.7301, "step": 1674 }, { "epoch": 0.6704, "grad_norm": 1.6420958574076474, "learning_rate": 2.959941018637036e-08, "loss": 0.3301, "step": 1676 }, { "epoch": 0.6712, "grad_norm": 6.762985212922795, "learning_rate": 2.94720143068936e-08, "loss": -0.5514, "step": 1678 }, { "epoch": 0.672, "grad_norm": 5.343972437244011, "learning_rate": 2.9344778508772915e-08, "loss": -0.7603, "step": 1680 }, { "epoch": 0.6728, "grad_norm": 1.4173868345663152, "learning_rate": 2.9217703784218607e-08, "loss": -0.2702, "step": 1682 }, { "epoch": 0.6736, "grad_norm": 2.408297080132785, "learning_rate": 2.909079112418493e-08, "loss": 0.239, "step": 1684 }, { "epoch": 0.6744, "grad_norm": 2.526003439412088, "learning_rate": 2.896404151836227e-08, "loss": -0.1545, "step": 1686 }, { "epoch": 0.6752, "grad_norm": 2.152154481783791, "learning_rate": 2.8837455955169542e-08, "loss": 0.2247, "step": 1688 }, { "epoch": 0.676, "grad_norm": 4.841731380516718, "learning_rate": 2.8711035421746366e-08, "loss": -0.2435, "step": 1690 }, { "epoch": 0.6768, "grad_norm": 1.4440262452483728, "learning_rate": 2.8584780903945488e-08, "loss": 0.1137, "step": 1692 }, { "epoch": 0.6776, "grad_norm": 1.058358316472121, "learning_rate": 2.8458693386324996e-08, "loss": -0.0944, "step": 1694 }, { "epoch": 0.6784, "grad_norm": 9.090390939902395, "learning_rate": 2.833277385214064e-08, "loss": 0.6503, "step": 1696 }, { "epoch": 0.6792, "grad_norm": 7.788906709654453, "learning_rate": 2.8207023283338298e-08, "loss": 0.483, "step": 1698 }, { "epoch": 0.68, "grad_norm": 2.497229006444577, "learning_rate": 2.8081442660546124e-08, "loss": -0.3163, "step": 1700 }, { "epoch": 0.6808, "grad_norm": 1.245945203078446, "learning_rate": 2.7956032963067077e-08, "loss": 0.1814, "step": 1702 }, { "epoch": 0.6816, "grad_norm": 1.5957075071574214, "learning_rate": 2.7830795168871123e-08, "loss": 0.1276, "step": 1704 }, { "epoch": 0.6824, "grad_norm": 0.912369643362412, "learning_rate": 2.77057302545878e-08, "loss": 0.0809, "step": 1706 }, { "epoch": 0.6832, "grad_norm": 12.398334532058389, "learning_rate": 2.7580839195498396e-08, "loss": 0.6571, "step": 1708 }, { "epoch": 0.684, "grad_norm": 1.8705294347889838, "learning_rate": 2.7456122965528473e-08, "loss": -0.0012, "step": 1710 }, { "epoch": 0.6848, "grad_norm": 8.43780230171603, "learning_rate": 2.7331582537240237e-08, "loss": 0.5914, "step": 1712 }, { "epoch": 0.6856, "grad_norm": 6.045183963191954, "learning_rate": 2.720721888182501e-08, "loss": 0.2553, "step": 1714 }, { "epoch": 0.6864, "grad_norm": 10.933471488223638, "learning_rate": 2.7083032969095502e-08, "loss": -0.1032, "step": 1716 }, { "epoch": 0.6872, "grad_norm": 6.965055302019932, "learning_rate": 2.695902576747846e-08, "loss": -0.526, "step": 1718 }, { "epoch": 0.688, "grad_norm": 8.512965363552777, "learning_rate": 2.6835198244006924e-08, "loss": -0.8081, "step": 1720 }, { "epoch": 0.6888, "grad_norm": 1.192736035070821, "learning_rate": 2.6711551364312785e-08, "loss": -0.0847, "step": 1722 }, { "epoch": 0.6896, "grad_norm": 1.6765610057391003, "learning_rate": 2.6588086092619278e-08, "loss": -0.27, "step": 1724 }, { "epoch": 0.6904, "grad_norm": 1.2188199326627063, "learning_rate": 2.646480339173337e-08, "loss": 0.1273, "step": 1726 }, { "epoch": 0.6912, "grad_norm": 1.8123646203452417, "learning_rate": 2.6341704223038347e-08, "loss": 0.2395, "step": 1728 }, { "epoch": 0.692, "grad_norm": 4.3649576952135485, "learning_rate": 2.6218789546486232e-08, "loss": -0.7106, "step": 1730 }, { "epoch": 0.6928, "grad_norm": 4.342633194501302, "learning_rate": 2.609606032059039e-08, "loss": -0.5549, "step": 1732 }, { "epoch": 0.6936, "grad_norm": 1.2870887711942558, "learning_rate": 2.5973517502417963e-08, "loss": 0.0922, "step": 1734 }, { "epoch": 0.6944, "grad_norm": 8.857731444056965, "learning_rate": 2.5851162047582476e-08, "loss": -1.2119, "step": 1736 }, { "epoch": 0.6952, "grad_norm": 0.9514384344774223, "learning_rate": 2.5728994910236302e-08, "loss": 0.111, "step": 1738 }, { "epoch": 0.696, "grad_norm": 11.6142955162987, "learning_rate": 2.5607017043063355e-08, "loss": 0.415, "step": 1740 }, { "epoch": 0.6968, "grad_norm": 4.579542574808095, "learning_rate": 2.548522939727156e-08, "loss": 0.3388, "step": 1742 }, { "epoch": 0.6976, "grad_norm": 1.4400831949708577, "learning_rate": 2.5363632922585427e-08, "loss": -0.0981, "step": 1744 }, { "epoch": 0.6984, "grad_norm": 2.087009445621481, "learning_rate": 2.5242228567238687e-08, "loss": 0.2813, "step": 1746 }, { "epoch": 0.6992, "grad_norm": 2.6328146915202986, "learning_rate": 2.512101727796687e-08, "loss": 0.1758, "step": 1748 }, { "epoch": 0.7, "grad_norm": 7.056808040316399, "learning_rate": 2.500000000000001e-08, "loss": 0.3373, "step": 1750 }, { "epoch": 0.7008, "grad_norm": 1.0437467398666396, "learning_rate": 2.4879177677055098e-08, "loss": 0.099, "step": 1752 }, { "epoch": 0.7016, "grad_norm": 3.8414096686764676, "learning_rate": 2.475855125132892e-08, "loss": -0.2663, "step": 1754 }, { "epoch": 0.7024, "grad_norm": 2.439424508673481, "learning_rate": 2.463812166349054e-08, "loss": 0.2117, "step": 1756 }, { "epoch": 0.7032, "grad_norm": 2.387800089257116, "learning_rate": 2.4517889852674113e-08, "loss": -0.0708, "step": 1758 }, { "epoch": 0.704, "grad_norm": 1.3000333539494446, "learning_rate": 2.439785675647143e-08, "loss": 0.1515, "step": 1760 }, { "epoch": 0.7048, "grad_norm": 2.198671971396822, "learning_rate": 2.4278023310924674e-08, "loss": 0.065, "step": 1762 }, { "epoch": 0.7056, "grad_norm": 17.120279372145863, "learning_rate": 2.4158390450519155e-08, "loss": -0.0036, "step": 1764 }, { "epoch": 0.7064, "grad_norm": 2.324948949045274, "learning_rate": 2.4038959108175926e-08, "loss": 0.1964, "step": 1766 }, { "epoch": 0.7072, "grad_norm": 3.109048925076997, "learning_rate": 2.391973021524461e-08, "loss": -0.0786, "step": 1768 }, { "epoch": 0.708, "grad_norm": 1.653300372155837, "learning_rate": 2.3800704701496048e-08, "loss": -0.0222, "step": 1770 }, { "epoch": 0.7088, "grad_norm": 1.2147631669738923, "learning_rate": 2.3681883495115112e-08, "loss": 0.6688, "step": 1772 }, { "epoch": 0.7096, "grad_norm": 1.0924259194845596, "learning_rate": 2.3563267522693414e-08, "loss": -0.0123, "step": 1774 }, { "epoch": 0.7104, "grad_norm": 0.4425561025986487, "learning_rate": 2.3444857709222176e-08, "loss": -0.065, "step": 1776 }, { "epoch": 0.7112, "grad_norm": 0.7572768433247227, "learning_rate": 2.332665497808487e-08, "loss": -0.1034, "step": 1778 }, { "epoch": 0.712, "grad_norm": 1.9702052124787697, "learning_rate": 2.3208660251050153e-08, "loss": 0.1152, "step": 1780 }, { "epoch": 0.7128, "grad_norm": 2.562823426275877, "learning_rate": 2.3090874448264637e-08, "loss": -0.3823, "step": 1782 }, { "epoch": 0.7136, "grad_norm": 2.7558169515582054, "learning_rate": 2.2973298488245645e-08, "loss": 0.1862, "step": 1784 }, { "epoch": 0.7144, "grad_norm": 0.9205424687808403, "learning_rate": 2.2855933287874134e-08, "loss": -0.2344, "step": 1786 }, { "epoch": 0.7152, "grad_norm": 4.049891733614343, "learning_rate": 2.2738779762387494e-08, "loss": 0.5732, "step": 1788 }, { "epoch": 0.716, "grad_norm": 5.109297282246342, "learning_rate": 2.2621838825372492e-08, "loss": 0.1657, "step": 1790 }, { "epoch": 0.7168, "grad_norm": 3.492531909450355, "learning_rate": 2.2505111388758008e-08, "loss": 0.5486, "step": 1792 }, { "epoch": 0.7176, "grad_norm": 1.7475753446352464, "learning_rate": 2.238859836280807e-08, "loss": 0.4643, "step": 1794 }, { "epoch": 0.7184, "grad_norm": 0.361925949755798, "learning_rate": 2.2272300656114645e-08, "loss": 0.1032, "step": 1796 }, { "epoch": 0.7192, "grad_norm": 3.604668638859301, "learning_rate": 2.215621917559062e-08, "loss": 0.0287, "step": 1798 }, { "epoch": 0.72, "grad_norm": 1.4617719449367537, "learning_rate": 2.2040354826462664e-08, "loss": 0.0297, "step": 1800 }, { "epoch": 0.7208, "grad_norm": 3.620213951707984, "learning_rate": 2.1924708512264277e-08, "loss": 0.1951, "step": 1802 }, { "epoch": 0.7216, "grad_norm": 0.7583235187586578, "learning_rate": 2.180928113482866e-08, "loss": -0.1489, "step": 1804 }, { "epoch": 0.7224, "grad_norm": 2.119491033850218, "learning_rate": 2.169407359428166e-08, "loss": -0.2957, "step": 1806 }, { "epoch": 0.7232, "grad_norm": 1.162852312754879, "learning_rate": 2.1579086789034866e-08, "loss": -0.2817, "step": 1808 }, { "epoch": 0.724, "grad_norm": 1.166771156161991, "learning_rate": 2.146432161577842e-08, "loss": 0.1943, "step": 1810 }, { "epoch": 0.7248, "grad_norm": 2.0229771272952326, "learning_rate": 2.1349778969474248e-08, "loss": -0.0322, "step": 1812 }, { "epoch": 0.7256, "grad_norm": 0.5930882949036291, "learning_rate": 2.123545974334887e-08, "loss": 0.1233, "step": 1814 }, { "epoch": 0.7264, "grad_norm": 5.29569204399418, "learning_rate": 2.1121364828886627e-08, "loss": -0.4993, "step": 1816 }, { "epoch": 0.7272, "grad_norm": 2.929051905849708, "learning_rate": 2.1007495115822537e-08, "loss": 0.1715, "step": 1818 }, { "epoch": 0.728, "grad_norm": 15.257361025978756, "learning_rate": 2.0893851492135533e-08, "loss": 0.2343, "step": 1820 }, { "epoch": 0.7288, "grad_norm": 2.803107615477731, "learning_rate": 2.07804348440414e-08, "loss": 1.046, "step": 1822 }, { "epoch": 0.7296, "grad_norm": 2.86449910527955, "learning_rate": 2.0667246055985938e-08, "loss": -0.0914, "step": 1824 }, { "epoch": 0.7304, "grad_norm": 4.345282427711913, "learning_rate": 2.0554286010638074e-08, "loss": 0.1557, "step": 1826 }, { "epoch": 0.7312, "grad_norm": 1.0680979217606996, "learning_rate": 2.0441555588882897e-08, "loss": 0.112, "step": 1828 }, { "epoch": 0.732, "grad_norm": 22.35076669899949, "learning_rate": 2.032905566981493e-08, "loss": -0.7291, "step": 1830 }, { "epoch": 0.7328, "grad_norm": 6.147459767039074, "learning_rate": 2.02167871307311e-08, "loss": 0.0367, "step": 1832 }, { "epoch": 0.7336, "grad_norm": 0.9403821792893974, "learning_rate": 2.0104750847124076e-08, "loss": 0.1168, "step": 1834 }, { "epoch": 0.7344, "grad_norm": 1.0648295993596697, "learning_rate": 1.9992947692675227e-08, "loss": -0.0064, "step": 1836 }, { "epoch": 0.7352, "grad_norm": 6.834382123337444, "learning_rate": 1.9881378539248077e-08, "loss": -0.0209, "step": 1838 }, { "epoch": 0.736, "grad_norm": 12.194299716833651, "learning_rate": 1.977004425688126e-08, "loss": -0.6629, "step": 1840 }, { "epoch": 0.7368, "grad_norm": 1.6811402432981566, "learning_rate": 1.9658945713781883e-08, "loss": -0.3665, "step": 1842 }, { "epoch": 0.7376, "grad_norm": 4.974628272901464, "learning_rate": 1.9548083776318723e-08, "loss": 0.3637, "step": 1844 }, { "epoch": 0.7384, "grad_norm": 0.8026057709807365, "learning_rate": 1.9437459309015425e-08, "loss": -0.1781, "step": 1846 }, { "epoch": 0.7392, "grad_norm": 0.9672604377243258, "learning_rate": 1.93270731745438e-08, "loss": 0.0373, "step": 1848 }, { "epoch": 0.74, "grad_norm": 1.6981668123356704, "learning_rate": 1.9216926233717085e-08, "loss": -0.0772, "step": 1850 }, { "epoch": 0.7408, "grad_norm": 4.5757533711944856, "learning_rate": 1.910701934548329e-08, "loss": 0.1371, "step": 1852 }, { "epoch": 0.7416, "grad_norm": 6.385512164095145, "learning_rate": 1.899735336691837e-08, "loss": -0.3518, "step": 1854 }, { "epoch": 0.7424, "grad_norm": 3.248106093810886, "learning_rate": 1.8887929153219685e-08, "loss": 0.2268, "step": 1856 }, { "epoch": 0.7432, "grad_norm": 2.8252792830886575, "learning_rate": 1.877874755769922e-08, "loss": 0.1589, "step": 1858 }, { "epoch": 0.744, "grad_norm": 5.264540846855634, "learning_rate": 1.866980943177699e-08, "loss": 0.0568, "step": 1860 }, { "epoch": 0.7448, "grad_norm": 1.275428245866234, "learning_rate": 1.8561115624974373e-08, "loss": 0.1517, "step": 1862 }, { "epoch": 0.7456, "grad_norm": 14.19208598391273, "learning_rate": 1.8452666984907517e-08, "loss": 0.2998, "step": 1864 }, { "epoch": 0.7464, "grad_norm": 7.6814967219852655, "learning_rate": 1.834446435728072e-08, "loss": 0.4844, "step": 1866 }, { "epoch": 0.7472, "grad_norm": 1.4193640099469234, "learning_rate": 1.8236508585879777e-08, "loss": 0.3173, "step": 1868 }, { "epoch": 0.748, "grad_norm": 2.041627690513066, "learning_rate": 1.8128800512565513e-08, "loss": 0.3641, "step": 1870 }, { "epoch": 0.7488, "grad_norm": 8.578127916608778, "learning_rate": 1.80213409772671e-08, "loss": -0.2812, "step": 1872 }, { "epoch": 0.7496, "grad_norm": 9.164783253693107, "learning_rate": 1.7914130817975594e-08, "loss": -0.9407, "step": 1874 }, { "epoch": 0.7504, "grad_norm": 3.716026385677171, "learning_rate": 1.7807170870737316e-08, "loss": -0.3692, "step": 1876 }, { "epoch": 0.7512, "grad_norm": 1.6681777892033154, "learning_rate": 1.770046196964747e-08, "loss": 0.3245, "step": 1878 }, { "epoch": 0.752, "grad_norm": 14.618819444480113, "learning_rate": 1.7594004946843455e-08, "loss": -0.1325, "step": 1880 }, { "epoch": 0.7528, "grad_norm": 3.4870170103139935, "learning_rate": 1.7487800632498543e-08, "loss": 0.4117, "step": 1882 }, { "epoch": 0.7536, "grad_norm": 2.1920712515819907, "learning_rate": 1.7381849854815355e-08, "loss": 0.4374, "step": 1884 }, { "epoch": 0.7544, "grad_norm": 7.829348703682726, "learning_rate": 1.7276153440019257e-08, "loss": -0.3397, "step": 1886 }, { "epoch": 0.7552, "grad_norm": 1.6655805866682127, "learning_rate": 1.7170712212352185e-08, "loss": -0.0982, "step": 1888 }, { "epoch": 0.756, "grad_norm": 0.593137714722611, "learning_rate": 1.7065526994065972e-08, "loss": -0.1808, "step": 1890 }, { "epoch": 0.7568, "grad_norm": 3.2668043762805024, "learning_rate": 1.6960598605416114e-08, "loss": -0.1903, "step": 1892 }, { "epoch": 0.7576, "grad_norm": 5.3787240854624745, "learning_rate": 1.6855927864655238e-08, "loss": 0.0949, "step": 1894 }, { "epoch": 0.7584, "grad_norm": 1.936247550747232, "learning_rate": 1.6751515588026826e-08, "loss": -0.3573, "step": 1896 }, { "epoch": 0.7592, "grad_norm": 2.071377080812025, "learning_rate": 1.6647362589758786e-08, "loss": 0.1047, "step": 1898 }, { "epoch": 0.76, "grad_norm": 6.789235393980865, "learning_rate": 1.6543469682057103e-08, "loss": 0.3172, "step": 1900 }, { "epoch": 0.7608, "grad_norm": 5.474981449589401, "learning_rate": 1.6439837675099537e-08, "loss": -0.2288, "step": 1902 }, { "epoch": 0.7616, "grad_norm": 3.199305250881023, "learning_rate": 1.6336467377029305e-08, "loss": -0.2315, "step": 1904 }, { "epoch": 0.7624, "grad_norm": 2.482569440776884, "learning_rate": 1.6233359593948776e-08, "loss": -0.025, "step": 1906 }, { "epoch": 0.7632, "grad_norm": 8.551827628823498, "learning_rate": 1.613051512991314e-08, "loss": -0.1971, "step": 1908 }, { "epoch": 0.764, "grad_norm": 13.097052250880473, "learning_rate": 1.6027934786924186e-08, "loss": 0.166, "step": 1910 }, { "epoch": 0.7648, "grad_norm": 1.4383545215228888, "learning_rate": 1.5925619364924016e-08, "loss": 0.3462, "step": 1912 }, { "epoch": 0.7656, "grad_norm": 4.345257668964619, "learning_rate": 1.5823569661788878e-08, "loss": -0.4007, "step": 1914 }, { "epoch": 0.7664, "grad_norm": 2.6275961887041532, "learning_rate": 1.5721786473322823e-08, "loss": 0.1353, "step": 1916 }, { "epoch": 0.7672, "grad_norm": 1.1508314194699722, "learning_rate": 1.5620270593251633e-08, "loss": 0.171, "step": 1918 }, { "epoch": 0.768, "grad_norm": 8.731880546216733, "learning_rate": 1.551902281321651e-08, "loss": 0.3329, "step": 1920 }, { "epoch": 0.7688, "grad_norm": 1.9434870913531628, "learning_rate": 1.5418043922768e-08, "loss": 0.6301, "step": 1922 }, { "epoch": 0.7696, "grad_norm": 0.9056332881493195, "learning_rate": 1.5317334709359757e-08, "loss": 0.1506, "step": 1924 }, { "epoch": 0.7704, "grad_norm": 7.321492065454491, "learning_rate": 1.5216895958342456e-08, "loss": -0.0844, "step": 1926 }, { "epoch": 0.7712, "grad_norm": 1.2676581635722497, "learning_rate": 1.5116728452957683e-08, "loss": 0.235, "step": 1928 }, { "epoch": 0.772, "grad_norm": 6.770596765408979, "learning_rate": 1.5016832974331723e-08, "loss": -0.1452, "step": 1930 }, { "epoch": 0.7728, "grad_norm": 0.774237776575187, "learning_rate": 1.4917210301469628e-08, "loss": -0.3171, "step": 1932 }, { "epoch": 0.7736, "grad_norm": 2.358703253521802, "learning_rate": 1.4817861211248994e-08, "loss": -0.3403, "step": 1934 }, { "epoch": 0.7744, "grad_norm": 2.091359277579041, "learning_rate": 1.471878647841398e-08, "loss": 0.3112, "step": 1936 }, { "epoch": 0.7752, "grad_norm": 2.4844540414134952, "learning_rate": 1.4619986875569246e-08, "loss": 0.0934, "step": 1938 }, { "epoch": 0.776, "grad_norm": 0.3358727391238236, "learning_rate": 1.4521463173173964e-08, "loss": -0.3601, "step": 1940 }, { "epoch": 0.7768, "grad_norm": 2.8912433454858864, "learning_rate": 1.4423216139535732e-08, "loss": -0.2012, "step": 1942 }, { "epoch": 0.7776, "grad_norm": 6.6239087581701135, "learning_rate": 1.4325246540804669e-08, "loss": -0.2919, "step": 1944 }, { "epoch": 0.7784, "grad_norm": 1.874229260557601, "learning_rate": 1.4227555140967401e-08, "loss": 0.3964, "step": 1946 }, { "epoch": 0.7792, "grad_norm": 3.4765126127575923, "learning_rate": 1.4130142701841075e-08, "loss": 0.0069, "step": 1948 }, { "epoch": 0.78, "grad_norm": 0.9836616938537979, "learning_rate": 1.4033009983067451e-08, "loss": 0.1279, "step": 1950 }, { "epoch": 0.7808, "grad_norm": 1.8450406813526212, "learning_rate": 1.3936157742106974e-08, "loss": -0.8598, "step": 1952 }, { "epoch": 0.7816, "grad_norm": 1.1621228145442448, "learning_rate": 1.3839586734232905e-08, "loss": -0.3734, "step": 1954 }, { "epoch": 0.7824, "grad_norm": 2.080888564116752, "learning_rate": 1.3743297712525331e-08, "loss": -0.2687, "step": 1956 }, { "epoch": 0.7832, "grad_norm": 1.8724244373925105, "learning_rate": 1.3647291427865415e-08, "loss": -0.0416, "step": 1958 }, { "epoch": 0.784, "grad_norm": 1.499292821299661, "learning_rate": 1.3551568628929434e-08, "loss": 0.211, "step": 1960 }, { "epoch": 0.7848, "grad_norm": 2.0052361060709814, "learning_rate": 1.3456130062183001e-08, "loss": -0.0768, "step": 1962 }, { "epoch": 0.7856, "grad_norm": 2.2666289421242523, "learning_rate": 1.3360976471875224e-08, "loss": 0.2792, "step": 1964 }, { "epoch": 0.7864, "grad_norm": 2.723585168664323, "learning_rate": 1.3266108600032928e-08, "loss": -0.4011, "step": 1966 }, { "epoch": 0.7872, "grad_norm": 1.7268778334192665, "learning_rate": 1.317152718645484e-08, "loss": 0.6589, "step": 1968 }, { "epoch": 0.788, "grad_norm": 2.0199606733082254, "learning_rate": 1.3077232968705804e-08, "loss": 0.1737, "step": 1970 }, { "epoch": 0.7888, "grad_norm": 4.804739415956699, "learning_rate": 1.2983226682111093e-08, "loss": -0.2499, "step": 1972 }, { "epoch": 0.7896, "grad_norm": 2.4472702050288295, "learning_rate": 1.2889509059750603e-08, "loss": 0.0527, "step": 1974 }, { "epoch": 0.7904, "grad_norm": 1.229092797828586, "learning_rate": 1.2796080832453182e-08, "loss": 0.0615, "step": 1976 }, { "epoch": 0.7912, "grad_norm": 4.579918800198668, "learning_rate": 1.2702942728790894e-08, "loss": -0.3586, "step": 1978 }, { "epoch": 0.792, "grad_norm": 0.901599843591529, "learning_rate": 1.2610095475073413e-08, "loss": 0.0544, "step": 1980 }, { "epoch": 0.7928, "grad_norm": 1.4180499738504504, "learning_rate": 1.2517539795342247e-08, "loss": -0.3661, "step": 1982 }, { "epoch": 0.7936, "grad_norm": 19.19099294718674, "learning_rate": 1.24252764113652e-08, "loss": 0.1257, "step": 1984 }, { "epoch": 0.7944, "grad_norm": 2.976073756733255, "learning_rate": 1.233330604263067e-08, "loss": -0.5093, "step": 1986 }, { "epoch": 0.7952, "grad_norm": 1.447954681140385, "learning_rate": 1.2241629406342047e-08, "loss": 0.4885, "step": 1988 }, { "epoch": 0.796, "grad_norm": 5.02311224236895, "learning_rate": 1.2150247217412186e-08, "loss": 0.4777, "step": 1990 }, { "epoch": 0.7968, "grad_norm": 1.3931834037300923, "learning_rate": 1.2059160188457724e-08, "loss": 0.1565, "step": 1992 }, { "epoch": 0.7976, "grad_norm": 2.213190281008005, "learning_rate": 1.196836902979364e-08, "loss": -0.1925, "step": 1994 }, { "epoch": 0.7984, "grad_norm": 3.692697293182129, "learning_rate": 1.18778744494276e-08, "loss": -0.3022, "step": 1996 }, { "epoch": 0.7992, "grad_norm": 1.368876462797977, "learning_rate": 1.1787677153054548e-08, "loss": 0.3078, "step": 1998 }, { "epoch": 0.8, "grad_norm": 10.774817453056974, "learning_rate": 1.1697777844051105e-08, "loss": 0.488, "step": 2000 }, { "epoch": 0.8008, "grad_norm": 1.9545132619300545, "learning_rate": 1.1608177223470139e-08, "loss": 0.3029, "step": 2002 }, { "epoch": 0.8016, "grad_norm": 1.806725462227303, "learning_rate": 1.1518875990035277e-08, "loss": -0.016, "step": 2004 }, { "epoch": 0.8024, "grad_norm": 3.92077777933892, "learning_rate": 1.142987484013549e-08, "loss": 0.2598, "step": 2006 }, { "epoch": 0.8032, "grad_norm": 1.1561570657627025, "learning_rate": 1.1341174467819637e-08, "loss": -0.216, "step": 2008 }, { "epoch": 0.804, "grad_norm": 4.944528510946132, "learning_rate": 1.1252775564791022e-08, "loss": -0.1682, "step": 2010 }, { "epoch": 0.8048, "grad_norm": 7.806682630259347, "learning_rate": 1.1164678820402057e-08, "loss": -0.2217, "step": 2012 }, { "epoch": 0.8056, "grad_norm": 0.5906600478058831, "learning_rate": 1.1076884921648832e-08, "loss": -0.0115, "step": 2014 }, { "epoch": 0.8064, "grad_norm": 1.5923039456841297, "learning_rate": 1.0989394553165831e-08, "loss": -0.4475, "step": 2016 }, { "epoch": 0.8072, "grad_norm": 5.561765344754305, "learning_rate": 1.0902208397220497e-08, "loss": 0.2437, "step": 2018 }, { "epoch": 0.808, "grad_norm": 0.8867433808948969, "learning_rate": 1.0815327133708013e-08, "loss": -0.0309, "step": 2020 }, { "epoch": 0.8088, "grad_norm": 1.200813453561211, "learning_rate": 1.0728751440145906e-08, "loss": -0.3607, "step": 2022 }, { "epoch": 0.8096, "grad_norm": 1.9415562238348962, "learning_rate": 1.0642481991668839e-08, "loss": -0.0327, "step": 2024 }, { "epoch": 0.8104, "grad_norm": 1.5620529100957672, "learning_rate": 1.05565194610233e-08, "loss": 0.2341, "step": 2026 }, { "epoch": 0.8112, "grad_norm": 2.647699103427376, "learning_rate": 1.047086451856235e-08, "loss": -0.2593, "step": 2028 }, { "epoch": 0.812, "grad_norm": 2.5701556161139893, "learning_rate": 1.038551783224047e-08, "loss": 0.6179, "step": 2030 }, { "epoch": 0.8128, "grad_norm": 0.8549953817365237, "learning_rate": 1.030048006760823e-08, "loss": -0.0904, "step": 2032 }, { "epoch": 0.8136, "grad_norm": 7.129839478186939, "learning_rate": 1.0215751887807228e-08, "loss": 0.3566, "step": 2034 }, { "epoch": 0.8144, "grad_norm": 3.6986430600685414, "learning_rate": 1.0131333953564824e-08, "loss": -1.0713, "step": 2036 }, { "epoch": 0.8152, "grad_norm": 2.1872738933486096, "learning_rate": 1.0047226923189023e-08, "loss": 0.3407, "step": 2038 }, { "epoch": 0.816, "grad_norm": 0.3128925347394902, "learning_rate": 9.96343145256333e-09, "loss": 0.0907, "step": 2040 }, { "epoch": 0.8168, "grad_norm": 4.786453975612638, "learning_rate": 9.87994819514168e-09, "loss": -0.0753, "step": 2042 }, { "epoch": 0.8176, "grad_norm": 14.82356796616775, "learning_rate": 9.796777801943268e-09, "loss": 0.0171, "step": 2044 }, { "epoch": 0.8184, "grad_norm": 2.633420400987255, "learning_rate": 9.71392092154753e-09, "loss": 0.161, "step": 2046 }, { "epoch": 0.8192, "grad_norm": 1.1007379261044739, "learning_rate": 9.63137820008908e-09, "loss": -0.364, "step": 2048 }, { "epoch": 0.82, "grad_norm": 0.8467779244521326, "learning_rate": 9.549150281252633e-09, "loss": 0.1341, "step": 2050 }, { "epoch": 0.8208, "grad_norm": 0.8244126140483381, "learning_rate": 9.467237806268008e-09, "loss": -0.2897, "step": 2052 }, { "epoch": 0.8216, "grad_norm": 5.462830167075669, "learning_rate": 9.385641413905138e-09, "loss": 0.0616, "step": 2054 }, { "epoch": 0.8224, "grad_norm": 0.9567013585298734, "learning_rate": 9.304361740469101e-09, "loss": 0.1821, "step": 2056 }, { "epoch": 0.8232, "grad_norm": 2.639434982510996, "learning_rate": 9.223399419795091e-09, "loss": -0.0443, "step": 2058 }, { "epoch": 0.824, "grad_norm": 5.980931285372222, "learning_rate": 9.142755083243575e-09, "loss": -0.5847, "step": 2060 }, { "epoch": 0.8248, "grad_norm": 1.221477188934739, "learning_rate": 9.062429359695279e-09, "loss": -0.6101, "step": 2062 }, { "epoch": 0.8256, "grad_norm": 2.592258170134965, "learning_rate": 8.98242287554633e-09, "loss": -0.6055, "step": 2064 }, { "epoch": 0.8264, "grad_norm": 14.286819868582796, "learning_rate": 8.902736254703347e-09, "loss": 0.1833, "step": 2066 }, { "epoch": 0.8272, "grad_norm": 0.6219523174946232, "learning_rate": 8.823370118578627e-09, "loss": 0.1085, "step": 2068 }, { "epoch": 0.828, "grad_norm": 2.4747243856386367, "learning_rate": 8.744325086085247e-09, "loss": 0.1628, "step": 2070 }, { "epoch": 0.8288, "grad_norm": 7.017050884644161, "learning_rate": 8.665601773632225e-09, "loss": -0.4075, "step": 2072 }, { "epoch": 0.8296, "grad_norm": 3.0344111746964733, "learning_rate": 8.587200795119792e-09, "loss": 0.1941, "step": 2074 }, { "epoch": 0.8304, "grad_norm": 3.63122732750374, "learning_rate": 8.509122761934518e-09, "loss": -0.6316, "step": 2076 }, { "epoch": 0.8312, "grad_norm": 2.727294872192778, "learning_rate": 8.431368282944584e-09, "loss": -0.5869, "step": 2078 }, { "epoch": 0.832, "grad_norm": 8.87803956255867, "learning_rate": 8.353937964495027e-09, "loss": -0.0651, "step": 2080 }, { "epoch": 0.8328, "grad_norm": 23.769707582315764, "learning_rate": 8.27683241040305e-09, "loss": -0.5846, "step": 2082 }, { "epoch": 0.8336, "grad_norm": 9.259758087731516, "learning_rate": 8.20005222195323e-09, "loss": 0.3543, "step": 2084 }, { "epoch": 0.8344, "grad_norm": 0.9286906076444958, "learning_rate": 8.123597997892918e-09, "loss": -0.3555, "step": 2086 }, { "epoch": 0.8352, "grad_norm": 2.541911515775242, "learning_rate": 8.047470334427504e-09, "loss": 0.5603, "step": 2088 }, { "epoch": 0.836, "grad_norm": 1.6475676211765033, "learning_rate": 7.971669825215788e-09, "loss": 0.3738, "step": 2090 }, { "epoch": 0.8368, "grad_norm": 2.3364186215000884, "learning_rate": 7.89619706136539e-09, "loss": -0.2587, "step": 2092 }, { "epoch": 0.8376, "grad_norm": 6.851362368895333, "learning_rate": 7.82105263142806e-09, "loss": -0.234, "step": 2094 }, { "epoch": 0.8384, "grad_norm": 3.042257259563013, "learning_rate": 7.746237121395184e-09, "loss": 0.0635, "step": 2096 }, { "epoch": 0.8392, "grad_norm": 5.577355769208644, "learning_rate": 7.671751114693104e-09, "loss": 0.346, "step": 2098 }, { "epoch": 0.84, "grad_norm": 16.068257540691818, "learning_rate": 7.597595192178703e-09, "loss": -0.234, "step": 2100 }, { "epoch": 0.8408, "grad_norm": 1.4550019286994413, "learning_rate": 7.523769932134739e-09, "loss": 0.3045, "step": 2102 }, { "epoch": 0.8416, "grad_norm": 11.674263008573776, "learning_rate": 7.450275910265413e-09, "loss": 0.0012, "step": 2104 }, { "epoch": 0.8424, "grad_norm": 3.5922243417097186, "learning_rate": 7.377113699691878e-09, "loss": 0.0608, "step": 2106 }, { "epoch": 0.8432, "grad_norm": 4.565894897934769, "learning_rate": 7.3042838709477476e-09, "loss": -0.3134, "step": 2108 }, { "epoch": 0.844, "grad_norm": 1.0790754610715667, "learning_rate": 7.23178699197467e-09, "loss": -0.7712, "step": 2110 }, { "epoch": 0.8448, "grad_norm": 2.5576412226752945, "learning_rate": 7.159623628117856e-09, "loss": -0.6058, "step": 2112 }, { "epoch": 0.8456, "grad_norm": 4.462503611601651, "learning_rate": 7.087794342121723e-09, "loss": -0.0884, "step": 2114 }, { "epoch": 0.8464, "grad_norm": 1.1927787940264971, "learning_rate": 7.0162996941254495e-09, "loss": 0.0271, "step": 2116 }, { "epoch": 0.8472, "grad_norm": 1.3778837117436245, "learning_rate": 6.945140241658687e-09, "loss": 0.0777, "step": 2118 }, { "epoch": 0.848, "grad_norm": 1.2376536517660377, "learning_rate": 6.874316539637126e-09, "loss": 0.2371, "step": 2120 }, { "epoch": 0.8488, "grad_norm": 10.4099869029666, "learning_rate": 6.803829140358236e-09, "loss": 0.0905, "step": 2122 }, { "epoch": 0.8496, "grad_norm": 2.0945162332974863, "learning_rate": 6.7336785934969e-09, "loss": -0.2163, "step": 2124 }, { "epoch": 0.8504, "grad_norm": 1.084301127375519, "learning_rate": 6.663865446101191e-09, "loss": 0.3561, "step": 2126 }, { "epoch": 0.8512, "grad_norm": 3.3122087703299163, "learning_rate": 6.594390242588044e-09, "loss": 0.654, "step": 2128 }, { "epoch": 0.852, "grad_norm": 16.300685078686683, "learning_rate": 6.525253524739049e-09, "loss": -0.4144, "step": 2130 }, { "epoch": 0.8528, "grad_norm": 1.9622079833970945, "learning_rate": 6.456455831696234e-09, "loss": 0.1632, "step": 2132 }, { "epoch": 0.8536, "grad_norm": 1.1269237351058636, "learning_rate": 6.3879976999578146e-09, "loss": 0.0483, "step": 2134 }, { "epoch": 0.8544, "grad_norm": 1.2343076260198358, "learning_rate": 6.319879663374067e-09, "loss": -0.173, "step": 2136 }, { "epoch": 0.8552, "grad_norm": 6.5182378553408125, "learning_rate": 6.252102253143121e-09, "loss": 0.3039, "step": 2138 }, { "epoch": 0.856, "grad_norm": 0.9652570226909292, "learning_rate": 6.184665997806831e-09, "loss": 0.3249, "step": 2140 }, { "epoch": 0.8568, "grad_norm": 1.0862936513727042, "learning_rate": 6.1175714232466534e-09, "loss": -0.0158, "step": 2142 }, { "epoch": 0.8576, "grad_norm": 4.249676407264014, "learning_rate": 6.050819052679585e-09, "loss": 0.1583, "step": 2144 }, { "epoch": 0.8584, "grad_norm": 2.9048667267483523, "learning_rate": 5.9844094066539896e-09, "loss": -0.1592, "step": 2146 }, { "epoch": 0.8592, "grad_norm": 1.7700629849804357, "learning_rate": 5.918343003045656e-09, "loss": 0.4752, "step": 2148 }, { "epoch": 0.86, "grad_norm": 0.6095234167162628, "learning_rate": 5.8526203570536504e-09, "loss": 0.1604, "step": 2150 }, { "epoch": 0.8608, "grad_norm": 2.2319947159315983, "learning_rate": 5.787241981196383e-09, "loss": -0.2655, "step": 2152 }, { "epoch": 0.8616, "grad_norm": 2.385565400146363, "learning_rate": 5.7222083853075585e-09, "loss": 0.1889, "step": 2154 }, { "epoch": 0.8624, "grad_norm": 1.2740430540187664, "learning_rate": 5.657520076532207e-09, "loss": -0.0907, "step": 2156 }, { "epoch": 0.8632, "grad_norm": 0.8496723396985106, "learning_rate": 5.593177559322776e-09, "loss": -0.1871, "step": 2158 }, { "epoch": 0.864, "grad_norm": 4.349013100883454, "learning_rate": 5.529181335435124e-09, "loss": -0.2457, "step": 2160 }, { "epoch": 0.8648, "grad_norm": 2.624184256356982, "learning_rate": 5.46553190392467e-09, "loss": -0.0564, "step": 2162 }, { "epoch": 0.8656, "grad_norm": 4.891384467420414, "learning_rate": 5.402229761142463e-09, "loss": 0.0126, "step": 2164 }, { "epoch": 0.8664, "grad_norm": 1.0690277941852824, "learning_rate": 5.33927540073133e-09, "loss": 0.1096, "step": 2166 }, { "epoch": 0.8672, "grad_norm": 1.0864935645910627, "learning_rate": 5.276669313622012e-09, "loss": 0.4332, "step": 2168 }, { "epoch": 0.868, "grad_norm": 1.3794846000185592, "learning_rate": 5.214411988029355e-09, "loss": -0.2237, "step": 2170 }, { "epoch": 0.8688, "grad_norm": 2.1597838851262567, "learning_rate": 5.1525039094485025e-09, "loss": -0.0318, "step": 2172 }, { "epoch": 0.8696, "grad_norm": 8.604568980124318, "learning_rate": 5.090945560651072e-09, "loss": 0.7412, "step": 2174 }, { "epoch": 0.8704, "grad_norm": 9.383895725071664, "learning_rate": 5.029737421681446e-09, "loss": 0.1443, "step": 2176 }, { "epoch": 0.8712, "grad_norm": 1.4793862750428757, "learning_rate": 4.968879969852985e-09, "loss": -0.0098, "step": 2178 }, { "epoch": 0.872, "grad_norm": 1.3610591443697106, "learning_rate": 4.9083736797443155e-09, "loss": 0.1644, "step": 2180 }, { "epoch": 0.8728, "grad_norm": 3.385230548173844, "learning_rate": 4.848219023195643e-09, "loss": -0.344, "step": 2182 }, { "epoch": 0.8736, "grad_norm": 4.705608399579721, "learning_rate": 4.788416469305068e-09, "loss": -0.4683, "step": 2184 }, { "epoch": 0.8744, "grad_norm": 1.8452932381974783, "learning_rate": 4.728966484424912e-09, "loss": 0.6021, "step": 2186 }, { "epoch": 0.8752, "grad_norm": 8.270747639931717, "learning_rate": 4.669869532158116e-09, "loss": -0.0376, "step": 2188 }, { "epoch": 0.876, "grad_norm": 1.5872349158016041, "learning_rate": 4.611126073354571e-09, "loss": -0.1316, "step": 2190 }, { "epoch": 0.8768, "grad_norm": 1.1707857137806994, "learning_rate": 4.552736566107562e-09, "loss": 0.1637, "step": 2192 }, { "epoch": 0.8776, "grad_norm": 3.206746320008537, "learning_rate": 4.494701465750217e-09, "loss": 0.0884, "step": 2194 }, { "epoch": 0.8784, "grad_norm": 0.43228851408832186, "learning_rate": 4.437021224851889e-09, "loss": -0.07, "step": 2196 }, { "epoch": 0.8792, "grad_norm": 5.848134722891433, "learning_rate": 4.379696293214696e-09, "loss": 0.3384, "step": 2198 }, { "epoch": 0.88, "grad_norm": 8.755920463976658, "learning_rate": 4.322727117869951e-09, "loss": -0.1251, "step": 2200 }, { "epoch": 0.8808, "grad_norm": 3.618392493004251, "learning_rate": 4.26611414307475e-09, "loss": -0.5217, "step": 2202 }, { "epoch": 0.8816, "grad_norm": 3.074281375800813, "learning_rate": 4.209857810308437e-09, "loss": -0.2288, "step": 2204 }, { "epoch": 0.8824, "grad_norm": 2.4682659786410603, "learning_rate": 4.1539585582691885e-09, "loss": 0.4702, "step": 2206 }, { "epoch": 0.8832, "grad_norm": 0.6352925864601222, "learning_rate": 4.098416822870593e-09, "loss": 0.1221, "step": 2208 }, { "epoch": 0.884, "grad_norm": 18.89293797441348, "learning_rate": 4.043233037238281e-09, "loss": -0.1175, "step": 2210 }, { "epoch": 0.8848, "grad_norm": 1.2424133172966803, "learning_rate": 3.9884076317064805e-09, "loss": 0.1494, "step": 2212 }, { "epoch": 0.8856, "grad_norm": 3.3762433833453573, "learning_rate": 3.933941033814736e-09, "loss": 0.4943, "step": 2214 }, { "epoch": 0.8864, "grad_norm": 2.941556993324549, "learning_rate": 3.879833668304505e-09, "loss": 0.2086, "step": 2216 }, { "epoch": 0.8872, "grad_norm": 1.2160258295220527, "learning_rate": 3.826085957115887e-09, "loss": 0.2668, "step": 2218 }, { "epoch": 0.888, "grad_norm": 1.9880657961417336, "learning_rate": 3.772698319384349e-09, "loss": 0.0593, "step": 2220 }, { "epoch": 0.8888, "grad_norm": 4.467655660380731, "learning_rate": 3.719671171437394e-09, "loss": -1.063, "step": 2222 }, { "epoch": 0.8896, "grad_norm": 3.831471428262152, "learning_rate": 3.667004926791395e-09, "loss": -0.4956, "step": 2224 }, { "epoch": 0.8904, "grad_norm": 2.917151972577474, "learning_rate": 3.614699996148285e-09, "loss": -0.1451, "step": 2226 }, { "epoch": 0.8912, "grad_norm": 0.614337892942797, "learning_rate": 3.5627567873924514e-09, "loss": -0.1331, "step": 2228 }, { "epoch": 0.892, "grad_norm": 1.1621377172904874, "learning_rate": 3.5111757055874323e-09, "loss": 0.5733, "step": 2230 }, { "epoch": 0.8928, "grad_norm": 1.6634098853629664, "learning_rate": 3.4599571529728867e-09, "loss": 0.044, "step": 2232 }, { "epoch": 0.8936, "grad_norm": 2.3445443840858844, "learning_rate": 3.4091015289613777e-09, "loss": -0.0438, "step": 2234 }, { "epoch": 0.8944, "grad_norm": 2.676333362752966, "learning_rate": 3.3586092301352675e-09, "loss": -0.2776, "step": 2236 }, { "epoch": 0.8952, "grad_norm": 4.7697980831198405, "learning_rate": 3.3084806502436613e-09, "loss": -0.3659, "step": 2238 }, { "epoch": 0.896, "grad_norm": 6.057959897509257, "learning_rate": 3.2587161801992778e-09, "loss": -0.1006, "step": 2240 }, { "epoch": 0.8968, "grad_norm": 2.4224067497579043, "learning_rate": 3.209316208075463e-09, "loss": 0.4279, "step": 2242 }, { "epoch": 0.8976, "grad_norm": 2.2956297867062876, "learning_rate": 3.1602811191031085e-09, "loss": -0.2779, "step": 2244 }, { "epoch": 0.8984, "grad_norm": 2.676327435440919, "learning_rate": 3.111611295667704e-09, "loss": 0.1528, "step": 2246 }, { "epoch": 0.8992, "grad_norm": 2.4298741519867852, "learning_rate": 3.0633071173062965e-09, "loss": 0.7115, "step": 2248 }, { "epoch": 0.9, "grad_norm": 1.2550146008412881, "learning_rate": 3.015368960704584e-09, "loss": 0.2171, "step": 2250 }, { "epoch": 0.9008, "grad_norm": 1.8243762323293855, "learning_rate": 2.967797199693928e-09, "loss": -0.7472, "step": 2252 }, { "epoch": 0.9016, "grad_norm": 0.9210039784654722, "learning_rate": 2.9205922052484953e-09, "loss": -0.1888, "step": 2254 }, { "epoch": 0.9024, "grad_norm": 0.8824442352985862, "learning_rate": 2.873754345482299e-09, "loss": -0.023, "step": 2256 }, { "epoch": 0.9032, "grad_norm": 0.7664098550437426, "learning_rate": 2.827283985646378e-09, "loss": 0.1549, "step": 2258 }, { "epoch": 0.904, "grad_norm": 4.9714840928816155, "learning_rate": 2.78118148812595e-09, "loss": -0.3595, "step": 2260 }, { "epoch": 0.9048, "grad_norm": 8.654708151874246, "learning_rate": 2.7354472124375303e-09, "loss": -0.7394, "step": 2262 }, { "epoch": 0.9056, "grad_norm": 4.601144208392627, "learning_rate": 2.690081515226206e-09, "loss": 0.2249, "step": 2264 }, { "epoch": 0.9064, "grad_norm": 3.654256720460866, "learning_rate": 2.6450847502627882e-09, "loss": 0.3918, "step": 2266 }, { "epoch": 0.9072, "grad_norm": 2.8319108774051296, "learning_rate": 2.600457268441092e-09, "loss": -0.6636, "step": 2268 }, { "epoch": 0.908, "grad_norm": 1.0973890560354937, "learning_rate": 2.5561994177751735e-09, "loss": -0.0566, "step": 2270 }, { "epoch": 0.9088, "grad_norm": 6.841431014654473, "learning_rate": 2.5123115433966615e-09, "loss": 0.4878, "step": 2272 }, { "epoch": 0.9096, "grad_norm": 0.9320764615479279, "learning_rate": 2.468793987551998e-09, "loss": 0.1557, "step": 2274 }, { "epoch": 0.9104, "grad_norm": 6.370923388386578, "learning_rate": 2.425647089599836e-09, "loss": -0.1932, "step": 2276 }, { "epoch": 0.9112, "grad_norm": 4.183725642916174, "learning_rate": 2.3828711860083674e-09, "loss": 0.098, "step": 2278 }, { "epoch": 0.912, "grad_norm": 4.068230796992909, "learning_rate": 2.340466610352654e-09, "loss": -0.3505, "step": 2280 }, { "epoch": 0.9128, "grad_norm": 5.482675919877662, "learning_rate": 2.2984336933121073e-09, "loss": -0.7297, "step": 2282 }, { "epoch": 0.9136, "grad_norm": 3.566681537761448, "learning_rate": 2.2567727626678524e-09, "loss": -0.8328, "step": 2284 }, { "epoch": 0.9144, "grad_norm": 9.269477254720458, "learning_rate": 2.215484143300206e-09, "loss": 0.0203, "step": 2286 }, { "epoch": 0.9152, "grad_norm": 0.7216914942401899, "learning_rate": 2.174568157186102e-09, "loss": -0.1531, "step": 2288 }, { "epoch": 0.916, "grad_norm": 1.1979336525277888, "learning_rate": 2.1340251233966378e-09, "loss": 0.021, "step": 2290 }, { "epoch": 0.9168, "grad_norm": 1.922137573312157, "learning_rate": 2.0938553580945204e-09, "loss": 0.0329, "step": 2292 }, { "epoch": 0.9176, "grad_norm": 3.2309428357308225, "learning_rate": 2.054059174531653e-09, "loss": 0.2099, "step": 2294 }, { "epoch": 0.9184, "grad_norm": 1.2013871635397861, "learning_rate": 2.0146368830466664e-09, "loss": -0.114, "step": 2296 }, { "epoch": 0.9192, "grad_norm": 2.574041866507427, "learning_rate": 1.97558879106251e-09, "loss": -0.0334, "step": 2298 }, { "epoch": 0.92, "grad_norm": 2.1091928529055455, "learning_rate": 1.9369152030840553e-09, "loss": -0.1678, "step": 2300 }, { "epoch": 0.9208, "grad_norm": 1.3784969035589727, "learning_rate": 1.8986164206957035e-09, "loss": 0.0433, "step": 2302 }, { "epoch": 0.9216, "grad_norm": 1.445665066799248, "learning_rate": 1.8606927425590612e-09, "loss": -0.3298, "step": 2304 }, { "epoch": 0.9224, "grad_norm": 4.39979141208013, "learning_rate": 1.8231444644105754e-09, "loss": -0.2644, "step": 2306 }, { "epoch": 0.9232, "grad_norm": 6.792092961183168, "learning_rate": 1.7859718790592726e-09, "loss": 0.2548, "step": 2308 }, { "epoch": 0.924, "grad_norm": 8.882254523598194, "learning_rate": 1.7491752763844291e-09, "loss": -0.3878, "step": 2310 }, { "epoch": 0.9248, "grad_norm": 0.3523571433732991, "learning_rate": 1.7127549433333555e-09, "loss": 0.1046, "step": 2312 }, { "epoch": 0.9256, "grad_norm": 1.222101846793018, "learning_rate": 1.67671116391912e-09, "loss": 0.0512, "step": 2314 }, { "epoch": 0.9264, "grad_norm": 1.0258685268101109, "learning_rate": 1.641044219218357e-09, "loss": -0.0404, "step": 2316 }, { "epoch": 0.9272, "grad_norm": 0.8920962345335293, "learning_rate": 1.6057543873690682e-09, "loss": -0.505, "step": 2318 }, { "epoch": 0.928, "grad_norm": 4.300808456044036, "learning_rate": 1.570841943568446e-09, "loss": -0.0736, "step": 2320 }, { "epoch": 0.9288, "grad_norm": 1.7383370212962577, "learning_rate": 1.5363071600707434e-09, "loss": -0.2897, "step": 2322 }, { "epoch": 0.9296, "grad_norm": 3.8514538101100926, "learning_rate": 1.5021503061851348e-09, "loss": 0.5155, "step": 2324 }, { "epoch": 0.9304, "grad_norm": 9.611231070810984, "learning_rate": 1.4683716482736363e-09, "loss": -1.3277, "step": 2326 }, { "epoch": 0.9312, "grad_norm": 15.037380136585435, "learning_rate": 1.4349714497490006e-09, "loss": -0.1934, "step": 2328 }, { "epoch": 0.932, "grad_norm": 5.102302088022175, "learning_rate": 1.401949971072691e-09, "loss": -0.1637, "step": 2330 }, { "epoch": 0.9328, "grad_norm": 3.2589093792296877, "learning_rate": 1.369307469752823e-09, "loss": -0.4259, "step": 2332 }, { "epoch": 0.9336, "grad_norm": 0.5647745584410777, "learning_rate": 1.337044200342191e-09, "loss": 0.0052, "step": 2334 }, { "epoch": 0.9344, "grad_norm": 2.8587476735191135, "learning_rate": 1.3051604144362404e-09, "loss": -0.1694, "step": 2336 }, { "epoch": 0.9352, "grad_norm": 6.105887846076872, "learning_rate": 1.2736563606711382e-09, "loss": -0.093, "step": 2338 }, { "epoch": 0.936, "grad_norm": 0.946233622816971, "learning_rate": 1.2425322847218367e-09, "loss": 0.1256, "step": 2340 }, { "epoch": 0.9368, "grad_norm": 0.8448247771748791, "learning_rate": 1.2117884293001256e-09, "loss": 0.3261, "step": 2342 }, { "epoch": 0.9376, "grad_norm": 1.1930610173772718, "learning_rate": 1.181425034152761e-09, "loss": 0.3508, "step": 2344 }, { "epoch": 0.9384, "grad_norm": 1.0424606834190295, "learning_rate": 1.1514423360595938e-09, "loss": 0.0268, "step": 2346 }, { "epoch": 0.9392, "grad_norm": 1.9104458999656608, "learning_rate": 1.1218405688317445e-09, "loss": 0.35, "step": 2348 }, { "epoch": 0.94, "grad_norm": 2.7868033745601655, "learning_rate": 1.0926199633097156e-09, "loss": 0.3013, "step": 2350 }, { "epoch": 0.9408, "grad_norm": 3.114424044546039, "learning_rate": 1.063780747361681e-09, "loss": 0.4124, "step": 2352 }, { "epoch": 0.9416, "grad_norm": 0.9758662069822744, "learning_rate": 1.0353231458816337e-09, "loss": 0.0924, "step": 2354 }, { "epoch": 0.9424, "grad_norm": 5.730640616704293, "learning_rate": 1.007247380787657e-09, "loss": -0.4145, "step": 2356 }, { "epoch": 0.9432, "grad_norm": 1.2895873153425517, "learning_rate": 9.795536710202169e-10, "loss": 0.1583, "step": 2358 }, { "epoch": 0.944, "grad_norm": 2.3790760214218243, "learning_rate": 9.522422325404233e-10, "loss": 0.008, "step": 2360 }, { "epoch": 0.9448, "grad_norm": 1.3863491242757506, "learning_rate": 9.253132783283546e-10, "loss": 0.1381, "step": 2362 }, { "epoch": 0.9456, "grad_norm": 11.01027091184072, "learning_rate": 8.987670183814134e-10, "loss": -0.4539, "step": 2364 }, { "epoch": 0.9464, "grad_norm": 2.65682385350019, "learning_rate": 8.726036597126618e-10, "loss": 0.0709, "step": 2366 }, { "epoch": 0.9472, "grad_norm": 2.744888591204184, "learning_rate": 8.468234063492285e-10, "loss": 0.4649, "step": 2368 }, { "epoch": 0.948, "grad_norm": 1.105547898891178, "learning_rate": 8.214264593307096e-10, "loss": 0.3651, "step": 2370 }, { "epoch": 0.9488, "grad_norm": 11.040756137143747, "learning_rate": 7.964130167075922e-10, "loss": -0.3469, "step": 2372 }, { "epoch": 0.9496, "grad_norm": 3.7513293798622853, "learning_rate": 7.717832735397334e-10, "loss": 0.3173, "step": 2374 }, { "epoch": 0.9504, "grad_norm": 8.41123333843685, "learning_rate": 7.475374218948116e-10, "loss": -0.2059, "step": 2376 }, { "epoch": 0.9512, "grad_norm": 7.723511557670311, "learning_rate": 7.236756508468611e-10, "loss": -0.3296, "step": 2378 }, { "epoch": 0.952, "grad_norm": 1.7434046344380822, "learning_rate": 7.001981464747564e-10, "loss": -0.2653, "step": 2380 }, { "epoch": 0.9528, "grad_norm": 1.2597364465844136, "learning_rate": 6.771050918607913e-10, "loss": 0.0558, "step": 2382 }, { "epoch": 0.9536, "grad_norm": 0.4546682108800604, "learning_rate": 6.543966670892464e-10, "loss": 0.1425, "step": 2384 }, { "epoch": 0.9544, "grad_norm": 10.810141795864281, "learning_rate": 6.320730492449799e-10, "loss": 0.6105, "step": 2386 }, { "epoch": 0.9552, "grad_norm": 6.814183252292936, "learning_rate": 6.101344124120555e-10, "loss": 0.0885, "step": 2388 }, { "epoch": 0.956, "grad_norm": 1.8255386397646123, "learning_rate": 5.885809276723608e-10, "loss": 0.1283, "step": 2390 }, { "epoch": 0.9568, "grad_norm": 4.4064865857808835, "learning_rate": 5.674127631043024e-10, "loss": -0.0873, "step": 2392 }, { "epoch": 0.9576, "grad_norm": 5.893788093122514, "learning_rate": 5.466300837814797e-10, "loss": -0.2882, "step": 2394 }, { "epoch": 0.9584, "grad_norm": 2.015069846194777, "learning_rate": 5.262330517713964e-10, "loss": 0.0329, "step": 2396 }, { "epoch": 0.9592, "grad_norm": 4.38595870418589, "learning_rate": 5.062218261342122e-10, "loss": -0.4992, "step": 2398 }, { "epoch": 0.96, "grad_norm": 1.7573595331693213, "learning_rate": 4.865965629214819e-10, "loss": 0.3999, "step": 2400 }, { "epoch": 0.9608, "grad_norm": 2.8275795449554244, "learning_rate": 4.673574151749571e-10, "loss": 0.2888, "step": 2402 }, { "epoch": 0.9616, "grad_norm": 2.782006626438932, "learning_rate": 4.485045329253645e-10, "loss": -0.3105, "step": 2404 }, { "epoch": 0.9624, "grad_norm": 2.067801654579334, "learning_rate": 4.3003806319127365e-10, "loss": -0.0758, "step": 2406 }, { "epoch": 0.9632, "grad_norm": 0.905276767266366, "learning_rate": 4.119581499779201e-10, "loss": 0.0992, "step": 2408 }, { "epoch": 0.964, "grad_norm": 6.919526983565309, "learning_rate": 3.9426493427611176e-10, "loss": -0.2644, "step": 2410 }, { "epoch": 0.9648, "grad_norm": 9.034362661975814, "learning_rate": 3.769585540610798e-10, "loss": -0.2986, "step": 2412 }, { "epoch": 0.9656, "grad_norm": 3.14086560297048, "learning_rate": 3.600391442914741e-10, "loss": 0.7052, "step": 2414 }, { "epoch": 0.9664, "grad_norm": 1.3381605274973811, "learning_rate": 3.4350683690823055e-10, "loss": 0.6932, "step": 2416 }, { "epoch": 0.9672, "grad_norm": 48.51483665029063, "learning_rate": 3.273617608336221e-10, "loss": -1.0707, "step": 2418 }, { "epoch": 0.968, "grad_norm": 3.755662155712364, "learning_rate": 3.116040419701815e-10, "loss": 0.1557, "step": 2420 }, { "epoch": 0.9688, "grad_norm": 1.933882174975995, "learning_rate": 2.962338031997691e-10, "loss": -0.3932, "step": 2422 }, { "epoch": 0.9696, "grad_norm": 3.292274309661597, "learning_rate": 2.81251164382601e-10, "loss": 0.2927, "step": 2424 }, { "epoch": 0.9704, "grad_norm": 0.9395481380197626, "learning_rate": 2.666562423562946e-10, "loss": -0.0198, "step": 2426 }, { "epoch": 0.9712, "grad_norm": 2.9117731605023116, "learning_rate": 2.5244915093499133e-10, "loss": 0.383, "step": 2428 }, { "epoch": 0.972, "grad_norm": 0.7403534269223006, "learning_rate": 2.3863000090844075e-10, "loss": 0.2351, "step": 2430 }, { "epoch": 0.9728, "grad_norm": 4.024365992664171, "learning_rate": 2.2519890004115138e-10, "loss": -0.7044, "step": 2432 }, { "epoch": 0.9736, "grad_norm": 1.791122582491928, "learning_rate": 2.1215595307154665e-10, "loss": 0.3281, "step": 2434 }, { "epoch": 0.9744, "grad_norm": 3.643294879721196, "learning_rate": 1.9950126171114357e-10, "loss": -0.3029, "step": 2436 }, { "epoch": 0.9752, "grad_norm": 2.401813097073988, "learning_rate": 1.872349246437699e-10, "loss": -0.346, "step": 2438 }, { "epoch": 0.976, "grad_norm": 1.161470321037417, "learning_rate": 1.753570375247815e-10, "loss": 0.0214, "step": 2440 }, { "epoch": 0.9768, "grad_norm": 2.0536425465151313, "learning_rate": 1.6386769298034064e-10, "loss": 0.221, "step": 2442 }, { "epoch": 0.9776, "grad_norm": 2.1285226467223963, "learning_rate": 1.5276698060665004e-10, "loss": -0.3179, "step": 2444 }, { "epoch": 0.9784, "grad_norm": 2.1485854952235712, "learning_rate": 1.4205498696930328e-10, "loss": 0.1872, "step": 2446 }, { "epoch": 0.9792, "grad_norm": 2.6297283339660535, "learning_rate": 1.317317956025743e-10, "loss": 0.3872, "step": 2448 }, { "epoch": 0.98, "grad_norm": 1.0558228777462992, "learning_rate": 1.2179748700879012e-10, "loss": -0.3332, "step": 2450 }, { "epoch": 0.9808, "grad_norm": 1.9479430382919645, "learning_rate": 1.1225213865767024e-10, "loss": -0.212, "step": 2452 }, { "epoch": 0.9816, "grad_norm": 10.17784962779835, "learning_rate": 1.0309582498577718e-10, "loss": -0.6738, "step": 2454 }, { "epoch": 0.9824, "grad_norm": 3.2715225558994083, "learning_rate": 9.432861739586683e-11, "loss": -0.2205, "step": 2456 }, { "epoch": 0.9832, "grad_norm": 12.126713223030167, "learning_rate": 8.595058425640011e-11, "loss": -0.026, "step": 2458 }, { "epoch": 0.984, "grad_norm": 2.5444346731980882, "learning_rate": 7.79617909009489e-11, "loss": 0.2714, "step": 2460 }, { "epoch": 0.9848, "grad_norm": 3.2112718047857647, "learning_rate": 7.036229962774087e-11, "loss": -0.4355, "step": 2462 }, { "epoch": 0.9856, "grad_norm": 2.4317779702655082, "learning_rate": 6.315216969912662e-11, "loss": -0.1779, "step": 2464 }, { "epoch": 0.9864, "grad_norm": 1.0366427411833663, "learning_rate": 5.633145734114664e-11, "loss": 0.0066, "step": 2466 }, { "epoch": 0.9872, "grad_norm": 9.102037371991102, "learning_rate": 4.990021574309833e-11, "loss": -0.5161, "step": 2468 }, { "epoch": 0.988, "grad_norm": 4.114717937151245, "learning_rate": 4.3858495057080834e-11, "loss": -0.3626, "step": 2470 }, { "epoch": 0.9888, "grad_norm": 7.563987228393988, "learning_rate": 3.820634239765641e-11, "loss": 0.3373, "step": 2472 }, { "epoch": 0.9896, "grad_norm": 2.948617798085717, "learning_rate": 3.294380184143963e-11, "loss": -0.1291, "step": 2474 }, { "epoch": 0.9904, "grad_norm": 3.8406470359549023, "learning_rate": 2.8070914426786552e-11, "loss": -0.2217, "step": 2476 }, { "epoch": 0.9912, "grad_norm": 4.310729814484611, "learning_rate": 2.3587718153444957e-11, "loss": 0.6706, "step": 2478 }, { "epoch": 0.992, "grad_norm": 3.119684178739935, "learning_rate": 1.9494247982282384e-11, "loss": 0.203, "step": 2480 }, { "epoch": 0.9928, "grad_norm": 2.0436367262108153, "learning_rate": 1.5790535835003004e-11, "loss": 0.0699, "step": 2482 }, { "epoch": 0.9936, "grad_norm": 1.9758044654494797, "learning_rate": 1.2476610593892268e-11, "loss": -0.2463, "step": 2484 }, { "epoch": 0.9944, "grad_norm": 7.343691779754545, "learning_rate": 9.552498101611517e-12, "loss": -0.2488, "step": 2486 }, { "epoch": 0.9952, "grad_norm": 0.6231400638141823, "learning_rate": 7.0182211609814965e-12, "loss": 0.6614, "step": 2488 }, { "epoch": 0.996, "grad_norm": 2.775645305565287, "learning_rate": 4.873799534788059e-12, "loss": 0.1694, "step": 2490 }, { "epoch": 0.9968, "grad_norm": 10.227058254229792, "learning_rate": 3.119249945676694e-12, "loss": -0.0563, "step": 2492 }, { "epoch": 0.9976, "grad_norm": 2.487983479578251, "learning_rate": 1.7545860759693443e-12, "loss": 0.1768, "step": 2494 }, { "epoch": 0.9984, "grad_norm": 0.8570358384984528, "learning_rate": 7.798185675866875e-13, "loss": 0.3369, "step": 2496 }, { "epoch": 0.9992, "grad_norm": 5.047844589765427, "learning_rate": 1.9495502197042212e-13, "loss": -0.8027, "step": 2498 }, { "epoch": 1.0, "grad_norm": 1.124613481673833, "learning_rate": 0.0, "loss": 0.1599, "step": 2500 }, { "epoch": 1.0, "step": 2500, "total_flos": 75625527656448.0, "train_loss": -0.011922632639855147, "train_runtime": 6151.6354, "train_samples_per_second": 1.625, "train_steps_per_second": 0.406 } ], "logging_steps": 2, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 75625527656448.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }