diff --git "a/rm-harmless-hhg/checkpoint-1250/trainer_state.json" "b/rm-harmless-hhg/checkpoint-1250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/rm-harmless-hhg/checkpoint-1250/trainer_state.json" @@ -0,0 +1,20021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 1, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 45.0, + "learning_rate": 4.996e-05, + "loss": 0.5, + "step": 1 + }, + { + "epoch": 0.004, + "eval_accuracy": 0.458, + "eval_loss": 0.7124062776565552, + "eval_runtime": 29.0829, + "eval_samples_per_second": 17.192, + "eval_steps_per_second": 2.166, + "step": 1 + }, + { + "epoch": 0.008, + "grad_norm": 54.5, + "learning_rate": 4.992e-05, + "loss": 0.7109, + "step": 2 + }, + { + "epoch": 0.008, + "eval_accuracy": 0.45, + "eval_loss": 0.7394999861717224, + "eval_runtime": 29.3916, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.143, + "step": 2 + }, + { + "epoch": 0.012, + "grad_norm": 32.0, + "learning_rate": 4.9880000000000004e-05, + "loss": 0.4492, + "step": 3 + }, + { + "epoch": 0.012, + "eval_accuracy": 0.638, + "eval_loss": 0.6627500057220459, + "eval_runtime": 29.501, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.136, + "step": 3 + }, + { + "epoch": 0.016, + "grad_norm": 29.25, + "learning_rate": 4.9840000000000004e-05, + "loss": 0.6562, + "step": 4 + }, + { + "epoch": 0.016, + "eval_accuracy": 0.468, + "eval_loss": 0.7167500257492065, + "eval_runtime": 29.5886, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 4 + }, + { + "epoch": 0.02, + "grad_norm": 21.75, + "learning_rate": 4.9800000000000004e-05, + "loss": 0.5742, + "step": 5 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.476, + "eval_loss": 0.7177812457084656, + "eval_runtime": 29.6456, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 5 + }, + { + "epoch": 0.024, + "grad_norm": 35.0, + "learning_rate": 4.976e-05, + "loss": 1.2422, + "step": 6 + }, + { + "epoch": 0.024, + "eval_accuracy": 0.63, + "eval_loss": 0.6948437690734863, + "eval_runtime": 29.7155, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.12, + "step": 6 + }, + { + "epoch": 0.028, + "grad_norm": 17.5, + "learning_rate": 4.972e-05, + "loss": 0.7578, + "step": 7 + }, + { + "epoch": 0.028, + "eval_accuracy": 0.618, + "eval_loss": 0.6891250014305115, + "eval_runtime": 29.6993, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.121, + "step": 7 + }, + { + "epoch": 0.032, + "grad_norm": 19.25, + "learning_rate": 4.9680000000000005e-05, + "loss": 0.7891, + "step": 8 + }, + { + "epoch": 0.032, + "eval_accuracy": 0.53, + "eval_loss": 0.7268750071525574, + "eval_runtime": 29.6641, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 8 + }, + { + "epoch": 0.036, + "grad_norm": 18.625, + "learning_rate": 4.9640000000000006e-05, + "loss": 0.7383, + "step": 9 + }, + { + "epoch": 0.036, + "eval_accuracy": 0.408, + "eval_loss": 0.7724375128746033, + "eval_runtime": 29.6769, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 9 + }, + { + "epoch": 0.04, + "grad_norm": 54.75, + "learning_rate": 4.96e-05, + "loss": 0.7891, + "step": 10 + }, + { + "epoch": 0.04, + "eval_accuracy": 0.474, + "eval_loss": 0.7897031307220459, + "eval_runtime": 29.7442, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.118, + "step": 10 + }, + { + "epoch": 0.044, + "grad_norm": 23.5, + "learning_rate": 4.956e-05, + "loss": 0.7422, + "step": 11 + }, + { + "epoch": 0.044, + "eval_accuracy": 0.512, + "eval_loss": 0.7368593811988831, + "eval_runtime": 29.7047, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.121, + "step": 11 + }, + { + "epoch": 0.048, + "grad_norm": 31.875, + "learning_rate": 4.952e-05, + "loss": 0.9648, + "step": 12 + }, + { + "epoch": 0.048, + "eval_accuracy": 0.502, + "eval_loss": 0.7394062280654907, + "eval_runtime": 29.6121, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.128, + "step": 12 + }, + { + "epoch": 0.052, + "grad_norm": 69.0, + "learning_rate": 4.948000000000001e-05, + "loss": 1.0938, + "step": 13 + }, + { + "epoch": 0.052, + "eval_accuracy": 0.464, + "eval_loss": 0.7652812600135803, + "eval_runtime": 29.5696, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 13 + }, + { + "epoch": 0.056, + "grad_norm": 25.625, + "learning_rate": 4.944e-05, + "loss": 0.5, + "step": 14 + }, + { + "epoch": 0.056, + "eval_accuracy": 0.442, + "eval_loss": 0.7733906507492065, + "eval_runtime": 29.4548, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.139, + "step": 14 + }, + { + "epoch": 0.06, + "grad_norm": 27.0, + "learning_rate": 4.94e-05, + "loss": 0.6367, + "step": 15 + }, + { + "epoch": 0.06, + "eval_accuracy": 0.422, + "eval_loss": 0.8111249804496765, + "eval_runtime": 29.5364, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.133, + "step": 15 + }, + { + "epoch": 0.064, + "grad_norm": 33.5, + "learning_rate": 4.936e-05, + "loss": 0.6836, + "step": 16 + }, + { + "epoch": 0.064, + "eval_accuracy": 0.412, + "eval_loss": 0.8485312461853027, + "eval_runtime": 29.6476, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 16 + }, + { + "epoch": 0.068, + "grad_norm": 64.5, + "learning_rate": 4.932e-05, + "loss": 0.8672, + "step": 17 + }, + { + "epoch": 0.068, + "eval_accuracy": 0.446, + "eval_loss": 0.8228750228881836, + "eval_runtime": 29.6878, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.122, + "step": 17 + }, + { + "epoch": 0.072, + "grad_norm": 45.25, + "learning_rate": 4.928e-05, + "loss": 0.8867, + "step": 18 + }, + { + "epoch": 0.072, + "eval_accuracy": 0.48, + "eval_loss": 0.781000018119812, + "eval_runtime": 29.7017, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.121, + "step": 18 + }, + { + "epoch": 0.076, + "grad_norm": 33.75, + "learning_rate": 4.924e-05, + "loss": 0.6211, + "step": 19 + }, + { + "epoch": 0.076, + "eval_accuracy": 0.498, + "eval_loss": 0.7408124804496765, + "eval_runtime": 29.7015, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.121, + "step": 19 + }, + { + "epoch": 0.08, + "grad_norm": 21.125, + "learning_rate": 4.92e-05, + "loss": 0.668, + "step": 20 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.516, + "eval_loss": 0.7375937700271606, + "eval_runtime": 29.6975, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.121, + "step": 20 + }, + { + "epoch": 0.084, + "grad_norm": 81.0, + "learning_rate": 4.9160000000000004e-05, + "loss": 0.6055, + "step": 21 + }, + { + "epoch": 0.084, + "eval_accuracy": 0.516, + "eval_loss": 0.7243124842643738, + "eval_runtime": 29.6516, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 21 + }, + { + "epoch": 0.088, + "grad_norm": 19.875, + "learning_rate": 4.9120000000000004e-05, + "loss": 0.6836, + "step": 22 + }, + { + "epoch": 0.088, + "eval_accuracy": 0.504, + "eval_loss": 0.7167500257492065, + "eval_runtime": 29.6555, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 22 + }, + { + "epoch": 0.092, + "grad_norm": 26.75, + "learning_rate": 4.9080000000000004e-05, + "loss": 0.7812, + "step": 23 + }, + { + "epoch": 0.092, + "eval_accuracy": 0.546, + "eval_loss": 0.703125, + "eval_runtime": 29.6513, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 23 + }, + { + "epoch": 0.096, + "grad_norm": 27.375, + "learning_rate": 4.9040000000000005e-05, + "loss": 0.6641, + "step": 24 + }, + { + "epoch": 0.096, + "eval_accuracy": 0.604, + "eval_loss": 0.6928125023841858, + "eval_runtime": 29.6621, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 24 + }, + { + "epoch": 0.1, + "grad_norm": 20.5, + "learning_rate": 4.9e-05, + "loss": 0.668, + "step": 25 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.634, + "eval_loss": 0.6862499713897705, + "eval_runtime": 29.6741, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.123, + "step": 25 + }, + { + "epoch": 0.104, + "grad_norm": 22.75, + "learning_rate": 4.896e-05, + "loss": 0.7617, + "step": 26 + }, + { + "epoch": 0.104, + "eval_accuracy": 0.638, + "eval_loss": 0.6759687662124634, + "eval_runtime": 29.6821, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.122, + "step": 26 + }, + { + "epoch": 0.108, + "grad_norm": 87.0, + "learning_rate": 4.8920000000000006e-05, + "loss": 0.9727, + "step": 27 + }, + { + "epoch": 0.108, + "eval_accuracy": 0.638, + "eval_loss": 0.6718124747276306, + "eval_runtime": 29.6658, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 27 + }, + { + "epoch": 0.112, + "grad_norm": 29.0, + "learning_rate": 4.8880000000000006e-05, + "loss": 0.8242, + "step": 28 + }, + { + "epoch": 0.112, + "eval_accuracy": 0.666, + "eval_loss": 0.6636250019073486, + "eval_runtime": 29.7251, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.119, + "step": 28 + }, + { + "epoch": 0.116, + "grad_norm": 12.25, + "learning_rate": 4.884e-05, + "loss": 0.5664, + "step": 29 + }, + { + "epoch": 0.116, + "eval_accuracy": 0.726, + "eval_loss": 0.6483749747276306, + "eval_runtime": 29.6779, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 29 + }, + { + "epoch": 0.12, + "grad_norm": 13.75, + "learning_rate": 4.88e-05, + "loss": 0.668, + "step": 30 + }, + { + "epoch": 0.12, + "eval_accuracy": 0.776, + "eval_loss": 0.6269999742507935, + "eval_runtime": 29.6171, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 30 + }, + { + "epoch": 0.124, + "grad_norm": 19.75, + "learning_rate": 4.876e-05, + "loss": 0.6953, + "step": 31 + }, + { + "epoch": 0.124, + "eval_accuracy": 0.798, + "eval_loss": 0.6130937337875366, + "eval_runtime": 29.5817, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 31 + }, + { + "epoch": 0.128, + "grad_norm": 20.75, + "learning_rate": 4.872000000000001e-05, + "loss": 0.5547, + "step": 32 + }, + { + "epoch": 0.128, + "eval_accuracy": 0.808, + "eval_loss": 0.6037812232971191, + "eval_runtime": 29.5078, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.135, + "step": 32 + }, + { + "epoch": 0.132, + "grad_norm": 23.875, + "learning_rate": 4.868e-05, + "loss": 0.7695, + "step": 33 + }, + { + "epoch": 0.132, + "eval_accuracy": 0.798, + "eval_loss": 0.6041874885559082, + "eval_runtime": 29.5414, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.133, + "step": 33 + }, + { + "epoch": 0.136, + "grad_norm": 15.6875, + "learning_rate": 4.864e-05, + "loss": 0.9688, + "step": 34 + }, + { + "epoch": 0.136, + "eval_accuracy": 0.808, + "eval_loss": 0.6112499833106995, + "eval_runtime": 29.625, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 34 + }, + { + "epoch": 0.14, + "grad_norm": 12.6875, + "learning_rate": 4.86e-05, + "loss": 0.6641, + "step": 35 + }, + { + "epoch": 0.14, + "eval_accuracy": 0.798, + "eval_loss": 0.6251562237739563, + "eval_runtime": 29.6521, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.125, + "step": 35 + }, + { + "epoch": 0.144, + "grad_norm": 13.6875, + "learning_rate": 4.856e-05, + "loss": 0.7812, + "step": 36 + }, + { + "epoch": 0.144, + "eval_accuracy": 0.808, + "eval_loss": 0.6315937638282776, + "eval_runtime": 29.719, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.12, + "step": 36 + }, + { + "epoch": 0.148, + "grad_norm": 9.4375, + "learning_rate": 4.852e-05, + "loss": 0.7305, + "step": 37 + }, + { + "epoch": 0.148, + "eval_accuracy": 0.814, + "eval_loss": 0.6296250224113464, + "eval_runtime": 29.656, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 37 + }, + { + "epoch": 0.152, + "grad_norm": 37.75, + "learning_rate": 4.8480000000000003e-05, + "loss": 0.7891, + "step": 38 + }, + { + "epoch": 0.152, + "eval_accuracy": 0.81, + "eval_loss": 0.6253437399864197, + "eval_runtime": 29.6593, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 38 + }, + { + "epoch": 0.156, + "grad_norm": 7.09375, + "learning_rate": 4.8440000000000004e-05, + "loss": 0.5156, + "step": 39 + }, + { + "epoch": 0.156, + "eval_accuracy": 0.798, + "eval_loss": 0.6187812685966492, + "eval_runtime": 29.6667, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 39 + }, + { + "epoch": 0.16, + "grad_norm": 13.625, + "learning_rate": 4.8400000000000004e-05, + "loss": 0.5273, + "step": 40 + }, + { + "epoch": 0.16, + "eval_accuracy": 0.796, + "eval_loss": 0.609624981880188, + "eval_runtime": 29.6559, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 40 + }, + { + "epoch": 0.164, + "grad_norm": 19.75, + "learning_rate": 4.836e-05, + "loss": 0.6484, + "step": 41 + }, + { + "epoch": 0.164, + "eval_accuracy": 0.788, + "eval_loss": 0.601687490940094, + "eval_runtime": 29.715, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.12, + "step": 41 + }, + { + "epoch": 0.168, + "grad_norm": 13.75, + "learning_rate": 4.8320000000000005e-05, + "loss": 0.6719, + "step": 42 + }, + { + "epoch": 0.168, + "eval_accuracy": 0.786, + "eval_loss": 0.597781240940094, + "eval_runtime": 29.6718, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 42 + }, + { + "epoch": 0.172, + "grad_norm": 29.125, + "learning_rate": 4.8280000000000005e-05, + "loss": 0.7969, + "step": 43 + }, + { + "epoch": 0.172, + "eval_accuracy": 0.802, + "eval_loss": 0.5973125100135803, + "eval_runtime": 29.6829, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.122, + "step": 43 + }, + { + "epoch": 0.176, + "grad_norm": 18.125, + "learning_rate": 4.824e-05, + "loss": 0.6953, + "step": 44 + }, + { + "epoch": 0.176, + "eval_accuracy": 0.79, + "eval_loss": 0.5939687490463257, + "eval_runtime": 29.6719, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 44 + }, + { + "epoch": 0.18, + "grad_norm": 19.625, + "learning_rate": 4.82e-05, + "loss": 0.6523, + "step": 45 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.794, + "eval_loss": 0.5889687538146973, + "eval_runtime": 29.6634, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 45 + }, + { + "epoch": 0.184, + "grad_norm": 8.1875, + "learning_rate": 4.816e-05, + "loss": 0.5156, + "step": 46 + }, + { + "epoch": 0.184, + "eval_accuracy": 0.81, + "eval_loss": 0.5811874866485596, + "eval_runtime": 29.5751, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 46 + }, + { + "epoch": 0.188, + "grad_norm": 11.125, + "learning_rate": 4.812000000000001e-05, + "loss": 0.6172, + "step": 47 + }, + { + "epoch": 0.188, + "eval_accuracy": 0.824, + "eval_loss": 0.5709062218666077, + "eval_runtime": 29.5824, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 47 + }, + { + "epoch": 0.192, + "grad_norm": 22.75, + "learning_rate": 4.808e-05, + "loss": 0.6836, + "step": 48 + }, + { + "epoch": 0.192, + "eval_accuracy": 0.83, + "eval_loss": 0.5614374876022339, + "eval_runtime": 29.5626, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 48 + }, + { + "epoch": 0.196, + "grad_norm": 22.125, + "learning_rate": 4.804e-05, + "loss": 1.0156, + "step": 49 + }, + { + "epoch": 0.196, + "eval_accuracy": 0.838, + "eval_loss": 0.5536562204360962, + "eval_runtime": 29.5412, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 49 + }, + { + "epoch": 0.2, + "grad_norm": 10.3125, + "learning_rate": 4.8e-05, + "loss": 0.7734, + "step": 50 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.842, + "eval_loss": 0.550906240940094, + "eval_runtime": 29.5731, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 50 + }, + { + "epoch": 0.204, + "grad_norm": 10.1875, + "learning_rate": 4.796e-05, + "loss": 0.4648, + "step": 51 + }, + { + "epoch": 0.204, + "eval_accuracy": 0.838, + "eval_loss": 0.5446875095367432, + "eval_runtime": 29.5403, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 51 + }, + { + "epoch": 0.208, + "grad_norm": 11.875, + "learning_rate": 4.792e-05, + "loss": 0.6523, + "step": 52 + }, + { + "epoch": 0.208, + "eval_accuracy": 0.84, + "eval_loss": 0.5341562628746033, + "eval_runtime": 29.4583, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.139, + "step": 52 + }, + { + "epoch": 0.212, + "grad_norm": 12.4375, + "learning_rate": 4.788e-05, + "loss": 0.6328, + "step": 53 + }, + { + "epoch": 0.212, + "eval_accuracy": 0.842, + "eval_loss": 0.5233749747276306, + "eval_runtime": 29.6166, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 53 + }, + { + "epoch": 0.216, + "grad_norm": 12.75, + "learning_rate": 4.784e-05, + "loss": 0.5117, + "step": 54 + }, + { + "epoch": 0.216, + "eval_accuracy": 0.858, + "eval_loss": 0.5138437747955322, + "eval_runtime": 29.6698, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 54 + }, + { + "epoch": 0.22, + "grad_norm": 29.375, + "learning_rate": 4.78e-05, + "loss": 0.7969, + "step": 55 + }, + { + "epoch": 0.22, + "eval_accuracy": 0.862, + "eval_loss": 0.5067812204360962, + "eval_runtime": 29.6292, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 55 + }, + { + "epoch": 0.224, + "grad_norm": 13.5625, + "learning_rate": 4.7760000000000004e-05, + "loss": 0.6094, + "step": 56 + }, + { + "epoch": 0.224, + "eval_accuracy": 0.858, + "eval_loss": 0.4999062418937683, + "eval_runtime": 29.6425, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 56 + }, + { + "epoch": 0.228, + "grad_norm": 8.875, + "learning_rate": 4.7720000000000004e-05, + "loss": 0.4551, + "step": 57 + }, + { + "epoch": 0.228, + "eval_accuracy": 0.862, + "eval_loss": 0.4843437373638153, + "eval_runtime": 29.6992, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.121, + "step": 57 + }, + { + "epoch": 0.232, + "grad_norm": 15.625, + "learning_rate": 4.7680000000000004e-05, + "loss": 0.5469, + "step": 58 + }, + { + "epoch": 0.232, + "eval_accuracy": 0.864, + "eval_loss": 0.47040626406669617, + "eval_runtime": 29.6655, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 58 + }, + { + "epoch": 0.236, + "grad_norm": 13.9375, + "learning_rate": 4.7640000000000005e-05, + "loss": 0.6523, + "step": 59 + }, + { + "epoch": 0.236, + "eval_accuracy": 0.864, + "eval_loss": 0.4586562514305115, + "eval_runtime": 29.6684, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.123, + "step": 59 + }, + { + "epoch": 0.24, + "grad_norm": 12.0625, + "learning_rate": 4.76e-05, + "loss": 0.5625, + "step": 60 + }, + { + "epoch": 0.24, + "eval_accuracy": 0.868, + "eval_loss": 0.4503749907016754, + "eval_runtime": 29.6717, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 60 + }, + { + "epoch": 0.244, + "grad_norm": 31.375, + "learning_rate": 4.7560000000000005e-05, + "loss": 0.6758, + "step": 61 + }, + { + "epoch": 0.244, + "eval_accuracy": 0.872, + "eval_loss": 0.4425937533378601, + "eval_runtime": 29.6674, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 61 + }, + { + "epoch": 0.248, + "grad_norm": 12.3125, + "learning_rate": 4.7520000000000006e-05, + "loss": 0.5273, + "step": 62 + }, + { + "epoch": 0.248, + "eval_accuracy": 0.864, + "eval_loss": 0.43607813119888306, + "eval_runtime": 29.7062, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.121, + "step": 62 + }, + { + "epoch": 0.252, + "grad_norm": 26.375, + "learning_rate": 4.748e-05, + "loss": 1.2344, + "step": 63 + }, + { + "epoch": 0.252, + "eval_accuracy": 0.878, + "eval_loss": 0.4342343807220459, + "eval_runtime": 29.6437, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 63 + }, + { + "epoch": 0.256, + "grad_norm": 10.625, + "learning_rate": 4.744e-05, + "loss": 0.4219, + "step": 64 + }, + { + "epoch": 0.256, + "eval_accuracy": 0.89, + "eval_loss": 0.4313906133174896, + "eval_runtime": 29.5484, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 64 + }, + { + "epoch": 0.26, + "grad_norm": 6.875, + "learning_rate": 4.74e-05, + "loss": 0.2412, + "step": 65 + }, + { + "epoch": 0.26, + "eval_accuracy": 0.888, + "eval_loss": 0.4232812523841858, + "eval_runtime": 29.5186, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.134, + "step": 65 + }, + { + "epoch": 0.264, + "grad_norm": 12.0625, + "learning_rate": 4.736000000000001e-05, + "loss": 0.3027, + "step": 66 + }, + { + "epoch": 0.264, + "eval_accuracy": 0.888, + "eval_loss": 0.41115623712539673, + "eval_runtime": 29.4994, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.136, + "step": 66 + }, + { + "epoch": 0.268, + "grad_norm": 15.0625, + "learning_rate": 4.732e-05, + "loss": 0.7227, + "step": 67 + }, + { + "epoch": 0.268, + "eval_accuracy": 0.894, + "eval_loss": 0.4058906137943268, + "eval_runtime": 29.5424, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.133, + "step": 67 + }, + { + "epoch": 0.272, + "grad_norm": 16.875, + "learning_rate": 4.728e-05, + "loss": 0.6172, + "step": 68 + }, + { + "epoch": 0.272, + "eval_accuracy": 0.896, + "eval_loss": 0.40062499046325684, + "eval_runtime": 29.4685, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.138, + "step": 68 + }, + { + "epoch": 0.276, + "grad_norm": 8.4375, + "learning_rate": 4.724e-05, + "loss": 0.3047, + "step": 69 + }, + { + "epoch": 0.276, + "eval_accuracy": 0.908, + "eval_loss": 0.3950468897819519, + "eval_runtime": 29.5757, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 69 + }, + { + "epoch": 0.28, + "grad_norm": 13.25, + "learning_rate": 4.72e-05, + "loss": 0.4414, + "step": 70 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.91, + "eval_loss": 0.3856250047683716, + "eval_runtime": 29.6181, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 70 + }, + { + "epoch": 0.284, + "grad_norm": 9.3125, + "learning_rate": 4.716e-05, + "loss": 0.4102, + "step": 71 + }, + { + "epoch": 0.284, + "eval_accuracy": 0.904, + "eval_loss": 0.3766250014305115, + "eval_runtime": 29.6404, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.125, + "step": 71 + }, + { + "epoch": 0.288, + "grad_norm": 22.375, + "learning_rate": 4.712e-05, + "loss": 0.8281, + "step": 72 + }, + { + "epoch": 0.288, + "eval_accuracy": 0.908, + "eval_loss": 0.3656406104564667, + "eval_runtime": 29.6484, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 72 + }, + { + "epoch": 0.292, + "grad_norm": 14.4375, + "learning_rate": 4.708e-05, + "loss": 0.6211, + "step": 73 + }, + { + "epoch": 0.292, + "eval_accuracy": 0.908, + "eval_loss": 0.35199999809265137, + "eval_runtime": 29.6563, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 73 + }, + { + "epoch": 0.296, + "grad_norm": 9.6875, + "learning_rate": 4.7040000000000004e-05, + "loss": 0.3105, + "step": 74 + }, + { + "epoch": 0.296, + "eval_accuracy": 0.904, + "eval_loss": 0.3412812352180481, + "eval_runtime": 29.6597, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 74 + }, + { + "epoch": 0.3, + "grad_norm": 13.125, + "learning_rate": 4.7e-05, + "loss": 0.6055, + "step": 75 + }, + { + "epoch": 0.3, + "eval_accuracy": 0.91, + "eval_loss": 0.33653125166893005, + "eval_runtime": 29.6623, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 75 + }, + { + "epoch": 0.304, + "grad_norm": 12.25, + "learning_rate": 4.6960000000000004e-05, + "loss": 0.4102, + "step": 76 + }, + { + "epoch": 0.304, + "eval_accuracy": 0.914, + "eval_loss": 0.3292500078678131, + "eval_runtime": 29.6618, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 76 + }, + { + "epoch": 0.308, + "grad_norm": 11.6875, + "learning_rate": 4.6920000000000005e-05, + "loss": 0.2559, + "step": 77 + }, + { + "epoch": 0.308, + "eval_accuracy": 0.918, + "eval_loss": 0.320125013589859, + "eval_runtime": 29.7131, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.12, + "step": 77 + }, + { + "epoch": 0.312, + "grad_norm": 11.1875, + "learning_rate": 4.688e-05, + "loss": 0.3594, + "step": 78 + }, + { + "epoch": 0.312, + "eval_accuracy": 0.922, + "eval_loss": 0.3079531192779541, + "eval_runtime": 29.6528, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.125, + "step": 78 + }, + { + "epoch": 0.316, + "grad_norm": 14.1875, + "learning_rate": 4.684e-05, + "loss": 0.2754, + "step": 79 + }, + { + "epoch": 0.316, + "eval_accuracy": 0.924, + "eval_loss": 0.29627344012260437, + "eval_runtime": 29.7011, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.121, + "step": 79 + }, + { + "epoch": 0.32, + "grad_norm": 8.375, + "learning_rate": 4.6800000000000006e-05, + "loss": 0.252, + "step": 80 + }, + { + "epoch": 0.32, + "eval_accuracy": 0.922, + "eval_loss": 0.28282031416893005, + "eval_runtime": 29.7136, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.12, + "step": 80 + }, + { + "epoch": 0.324, + "grad_norm": 12.3125, + "learning_rate": 4.6760000000000006e-05, + "loss": 0.2793, + "step": 81 + }, + { + "epoch": 0.324, + "eval_accuracy": 0.924, + "eval_loss": 0.27314063906669617, + "eval_runtime": 29.6718, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 81 + }, + { + "epoch": 0.328, + "grad_norm": 3.703125, + "learning_rate": 4.672e-05, + "loss": 0.0947, + "step": 82 + }, + { + "epoch": 0.328, + "eval_accuracy": 0.922, + "eval_loss": 0.26402342319488525, + "eval_runtime": 29.6751, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.123, + "step": 82 + }, + { + "epoch": 0.332, + "grad_norm": 25.625, + "learning_rate": 4.668e-05, + "loss": 0.7617, + "step": 83 + }, + { + "epoch": 0.332, + "eval_accuracy": 0.918, + "eval_loss": 0.2584453225135803, + "eval_runtime": 29.7047, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.121, + "step": 83 + }, + { + "epoch": 0.336, + "grad_norm": 21.25, + "learning_rate": 4.664e-05, + "loss": 0.5742, + "step": 84 + }, + { + "epoch": 0.336, + "eval_accuracy": 0.922, + "eval_loss": 0.2520507872104645, + "eval_runtime": 29.6589, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 84 + }, + { + "epoch": 0.34, + "grad_norm": 19.0, + "learning_rate": 4.660000000000001e-05, + "loss": 0.2988, + "step": 85 + }, + { + "epoch": 0.34, + "eval_accuracy": 0.932, + "eval_loss": 0.24216796457767487, + "eval_runtime": 29.6629, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 85 + }, + { + "epoch": 0.344, + "grad_norm": 8.9375, + "learning_rate": 4.656e-05, + "loss": 0.1689, + "step": 86 + }, + { + "epoch": 0.344, + "eval_accuracy": 0.93, + "eval_loss": 0.23786328732967377, + "eval_runtime": 29.6953, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 86 + }, + { + "epoch": 0.348, + "grad_norm": 2.390625, + "learning_rate": 4.652e-05, + "loss": 0.0143, + "step": 87 + }, + { + "epoch": 0.348, + "eval_accuracy": 0.93, + "eval_loss": 0.23689453303813934, + "eval_runtime": 29.6685, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.123, + "step": 87 + }, + { + "epoch": 0.352, + "grad_norm": 27.0, + "learning_rate": 4.648e-05, + "loss": 0.6562, + "step": 88 + }, + { + "epoch": 0.352, + "eval_accuracy": 0.926, + "eval_loss": 0.243958979845047, + "eval_runtime": 29.7107, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.12, + "step": 88 + }, + { + "epoch": 0.356, + "grad_norm": 52.0, + "learning_rate": 4.644e-05, + "loss": 0.6406, + "step": 89 + }, + { + "epoch": 0.356, + "eval_accuracy": 0.916, + "eval_loss": 0.25474607944488525, + "eval_runtime": 29.6658, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 89 + }, + { + "epoch": 0.36, + "grad_norm": 7.0625, + "learning_rate": 4.64e-05, + "loss": 0.0874, + "step": 90 + }, + { + "epoch": 0.36, + "eval_accuracy": 0.91, + "eval_loss": 0.26084569096565247, + "eval_runtime": 29.7147, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.12, + "step": 90 + }, + { + "epoch": 0.364, + "grad_norm": 17.375, + "learning_rate": 4.636e-05, + "loss": 0.2383, + "step": 91 + }, + { + "epoch": 0.364, + "eval_accuracy": 0.896, + "eval_loss": 0.2603535056114197, + "eval_runtime": 29.7131, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.12, + "step": 91 + }, + { + "epoch": 0.368, + "grad_norm": 10.125, + "learning_rate": 4.6320000000000004e-05, + "loss": 0.083, + "step": 92 + }, + { + "epoch": 0.368, + "eval_accuracy": 0.892, + "eval_loss": 0.25163769721984863, + "eval_runtime": 29.7008, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.121, + "step": 92 + }, + { + "epoch": 0.372, + "grad_norm": 54.0, + "learning_rate": 4.6280000000000004e-05, + "loss": 1.3438, + "step": 93 + }, + { + "epoch": 0.372, + "eval_accuracy": 0.906, + "eval_loss": 0.24094481766223907, + "eval_runtime": 29.6288, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 93 + }, + { + "epoch": 0.376, + "grad_norm": 97.5, + "learning_rate": 4.624e-05, + "loss": 0.625, + "step": 94 + }, + { + "epoch": 0.376, + "eval_accuracy": 0.912, + "eval_loss": 0.23179003596305847, + "eval_runtime": 29.6121, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.128, + "step": 94 + }, + { + "epoch": 0.38, + "grad_norm": 3.625, + "learning_rate": 4.6200000000000005e-05, + "loss": 0.0142, + "step": 95 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.91, + "eval_loss": 0.22752636671066284, + "eval_runtime": 29.6055, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 95 + }, + { + "epoch": 0.384, + "grad_norm": 101.5, + "learning_rate": 4.6160000000000005e-05, + "loss": 0.4551, + "step": 96 + }, + { + "epoch": 0.384, + "eval_accuracy": 0.92, + "eval_loss": 0.2265285700559616, + "eval_runtime": 29.6102, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 96 + }, + { + "epoch": 0.388, + "grad_norm": 120.0, + "learning_rate": 4.612e-05, + "loss": 0.5977, + "step": 97 + }, + { + "epoch": 0.388, + "eval_accuracy": 0.932, + "eval_loss": 0.2259543389081955, + "eval_runtime": 29.6002, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 97 + }, + { + "epoch": 0.392, + "grad_norm": 24.875, + "learning_rate": 4.608e-05, + "loss": 0.3887, + "step": 98 + }, + { + "epoch": 0.392, + "eval_accuracy": 0.934, + "eval_loss": 0.23052392899990082, + "eval_runtime": 29.6058, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 98 + }, + { + "epoch": 0.396, + "grad_norm": 3.625, + "learning_rate": 4.604e-05, + "loss": 0.0194, + "step": 99 + }, + { + "epoch": 0.396, + "eval_accuracy": 0.932, + "eval_loss": 0.24512499570846558, + "eval_runtime": 29.6578, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.124, + "step": 99 + }, + { + "epoch": 0.4, + "grad_norm": 73.0, + "learning_rate": 4.600000000000001e-05, + "loss": 1.6016, + "step": 100 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.928, + "eval_loss": 0.24807128310203552, + "eval_runtime": 29.6053, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 100 + }, + { + "epoch": 0.404, + "grad_norm": 45.75, + "learning_rate": 4.596e-05, + "loss": 0.5078, + "step": 101 + }, + { + "epoch": 0.404, + "eval_accuracy": 0.916, + "eval_loss": 0.25956445932388306, + "eval_runtime": 29.5954, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 101 + }, + { + "epoch": 0.408, + "grad_norm": 20.625, + "learning_rate": 4.592e-05, + "loss": 0.3164, + "step": 102 + }, + { + "epoch": 0.408, + "eval_accuracy": 0.912, + "eval_loss": 0.2644296884536743, + "eval_runtime": 29.4961, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.136, + "step": 102 + }, + { + "epoch": 0.412, + "grad_norm": 69.0, + "learning_rate": 4.588e-05, + "loss": 1.7969, + "step": 103 + }, + { + "epoch": 0.412, + "eval_accuracy": 0.904, + "eval_loss": 0.26585790514945984, + "eval_runtime": 29.5911, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 103 + }, + { + "epoch": 0.416, + "grad_norm": 4.78125, + "learning_rate": 4.584e-05, + "loss": 0.0111, + "step": 104 + }, + { + "epoch": 0.416, + "eval_accuracy": 0.91, + "eval_loss": 0.24243700504302979, + "eval_runtime": 29.6322, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 104 + }, + { + "epoch": 0.42, + "grad_norm": 54.75, + "learning_rate": 4.58e-05, + "loss": 1.1641, + "step": 105 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.922, + "eval_loss": 0.22700390219688416, + "eval_runtime": 29.657, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.124, + "step": 105 + }, + { + "epoch": 0.424, + "grad_norm": 26.0, + "learning_rate": 4.576e-05, + "loss": 0.2734, + "step": 106 + }, + { + "epoch": 0.424, + "eval_accuracy": 0.922, + "eval_loss": 0.21595019102096558, + "eval_runtime": 29.6513, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 106 + }, + { + "epoch": 0.428, + "grad_norm": 2.671875, + "learning_rate": 4.572e-05, + "loss": 0.0172, + "step": 107 + }, + { + "epoch": 0.428, + "eval_accuracy": 0.922, + "eval_loss": 0.20528613030910492, + "eval_runtime": 29.6702, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 107 + }, + { + "epoch": 0.432, + "grad_norm": 0.439453125, + "learning_rate": 4.568e-05, + "loss": 0.0028, + "step": 108 + }, + { + "epoch": 0.432, + "eval_accuracy": 0.922, + "eval_loss": 0.2044650912284851, + "eval_runtime": 29.6726, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 108 + }, + { + "epoch": 0.436, + "grad_norm": 5.21875, + "learning_rate": 4.564e-05, + "loss": 0.0297, + "step": 109 + }, + { + "epoch": 0.436, + "eval_accuracy": 0.916, + "eval_loss": 0.20484326779842377, + "eval_runtime": 29.6769, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 109 + }, + { + "epoch": 0.44, + "grad_norm": 16.125, + "learning_rate": 4.5600000000000004e-05, + "loss": 0.1719, + "step": 110 + }, + { + "epoch": 0.44, + "eval_accuracy": 0.926, + "eval_loss": 0.20142553746700287, + "eval_runtime": 29.6762, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 110 + }, + { + "epoch": 0.444, + "grad_norm": 32.75, + "learning_rate": 4.5560000000000004e-05, + "loss": 0.2559, + "step": 111 + }, + { + "epoch": 0.444, + "eval_accuracy": 0.93, + "eval_loss": 0.2016633301973343, + "eval_runtime": 29.6657, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 111 + }, + { + "epoch": 0.448, + "grad_norm": 3.015625, + "learning_rate": 4.5520000000000005e-05, + "loss": 0.0156, + "step": 112 + }, + { + "epoch": 0.448, + "eval_accuracy": 0.928, + "eval_loss": 0.20660974085330963, + "eval_runtime": 29.6618, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 112 + }, + { + "epoch": 0.452, + "grad_norm": 5.1875, + "learning_rate": 4.548e-05, + "loss": 0.0261, + "step": 113 + }, + { + "epoch": 0.452, + "eval_accuracy": 0.93, + "eval_loss": 0.22146350145339966, + "eval_runtime": 29.6699, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 113 + }, + { + "epoch": 0.456, + "grad_norm": 61.75, + "learning_rate": 4.5440000000000005e-05, + "loss": 0.9922, + "step": 114 + }, + { + "epoch": 0.456, + "eval_accuracy": 0.926, + "eval_loss": 0.22674255073070526, + "eval_runtime": 29.6772, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 114 + }, + { + "epoch": 0.46, + "grad_norm": 20.25, + "learning_rate": 4.5400000000000006e-05, + "loss": 0.165, + "step": 115 + }, + { + "epoch": 0.46, + "eval_accuracy": 0.92, + "eval_loss": 0.23935754597187042, + "eval_runtime": 29.6452, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 115 + }, + { + "epoch": 0.464, + "grad_norm": 60.75, + "learning_rate": 4.536e-05, + "loss": 1.6016, + "step": 116 + }, + { + "epoch": 0.464, + "eval_accuracy": 0.922, + "eval_loss": 0.2547971308231354, + "eval_runtime": 29.6467, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 116 + }, + { + "epoch": 0.468, + "grad_norm": 33.75, + "learning_rate": 4.532e-05, + "loss": 0.1602, + "step": 117 + }, + { + "epoch": 0.468, + "eval_accuracy": 0.91, + "eval_loss": 0.2719508111476898, + "eval_runtime": 29.5525, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 117 + }, + { + "epoch": 0.472, + "grad_norm": 6.34375, + "learning_rate": 4.528e-05, + "loss": 0.0312, + "step": 118 + }, + { + "epoch": 0.472, + "eval_accuracy": 0.904, + "eval_loss": 0.27943530678749084, + "eval_runtime": 29.5803, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 118 + }, + { + "epoch": 0.476, + "grad_norm": 1.5625, + "learning_rate": 4.524000000000001e-05, + "loss": 0.0079, + "step": 119 + }, + { + "epoch": 0.476, + "eval_accuracy": 0.896, + "eval_loss": 0.29967138171195984, + "eval_runtime": 29.5216, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.134, + "step": 119 + }, + { + "epoch": 0.48, + "grad_norm": 21.0, + "learning_rate": 4.52e-05, + "loss": 0.054, + "step": 120 + }, + { + "epoch": 0.48, + "eval_accuracy": 0.896, + "eval_loss": 0.29985204339027405, + "eval_runtime": 29.4974, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.136, + "step": 120 + }, + { + "epoch": 0.484, + "grad_norm": 1.265625, + "learning_rate": 4.516e-05, + "loss": 0.0043, + "step": 121 + }, + { + "epoch": 0.484, + "eval_accuracy": 0.9, + "eval_loss": 0.28141626715660095, + "eval_runtime": 29.5482, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 121 + }, + { + "epoch": 0.488, + "grad_norm": 12.625, + "learning_rate": 4.512e-05, + "loss": 0.0918, + "step": 122 + }, + { + "epoch": 0.488, + "eval_accuracy": 0.92, + "eval_loss": 0.269345223903656, + "eval_runtime": 29.4923, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.136, + "step": 122 + }, + { + "epoch": 0.492, + "grad_norm": 32.75, + "learning_rate": 4.508e-05, + "loss": 2.4688, + "step": 123 + }, + { + "epoch": 0.492, + "eval_accuracy": 0.924, + "eval_loss": 0.2683058977127075, + "eval_runtime": 29.4852, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.137, + "step": 123 + }, + { + "epoch": 0.496, + "grad_norm": 17.875, + "learning_rate": 4.504e-05, + "loss": 0.1099, + "step": 124 + }, + { + "epoch": 0.496, + "eval_accuracy": 0.92, + "eval_loss": 0.27239134907722473, + "eval_runtime": 29.5371, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.133, + "step": 124 + }, + { + "epoch": 0.5, + "grad_norm": 8.75, + "learning_rate": 4.5e-05, + "loss": 0.0452, + "step": 125 + }, + { + "epoch": 0.5, + "eval_accuracy": 0.922, + "eval_loss": 0.27604541182518005, + "eval_runtime": 29.5793, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 125 + }, + { + "epoch": 0.504, + "grad_norm": 7.84375, + "learning_rate": 4.496e-05, + "loss": 0.0664, + "step": 126 + }, + { + "epoch": 0.504, + "eval_accuracy": 0.926, + "eval_loss": 0.2819291949272156, + "eval_runtime": 29.6502, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 126 + }, + { + "epoch": 0.508, + "grad_norm": 77.5, + "learning_rate": 4.4920000000000004e-05, + "loss": 1.1016, + "step": 127 + }, + { + "epoch": 0.508, + "eval_accuracy": 0.922, + "eval_loss": 0.2950668931007385, + "eval_runtime": 29.6267, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.126, + "step": 127 + }, + { + "epoch": 0.512, + "grad_norm": 0.11962890625, + "learning_rate": 4.488e-05, + "loss": 0.0009, + "step": 128 + }, + { + "epoch": 0.512, + "eval_accuracy": 0.924, + "eval_loss": 0.29832518100738525, + "eval_runtime": 29.6392, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 128 + }, + { + "epoch": 0.516, + "grad_norm": 9.8125, + "learning_rate": 4.4840000000000004e-05, + "loss": 0.0576, + "step": 129 + }, + { + "epoch": 0.516, + "eval_accuracy": 0.936, + "eval_loss": 0.2895214855670929, + "eval_runtime": 29.6917, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.122, + "step": 129 + }, + { + "epoch": 0.52, + "grad_norm": 3.171875, + "learning_rate": 4.4800000000000005e-05, + "loss": 0.0352, + "step": 130 + }, + { + "epoch": 0.52, + "eval_accuracy": 0.934, + "eval_loss": 0.28699952363967896, + "eval_runtime": 29.6534, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.125, + "step": 130 + }, + { + "epoch": 0.524, + "grad_norm": 15.625, + "learning_rate": 4.4760000000000005e-05, + "loss": 0.2559, + "step": 131 + }, + { + "epoch": 0.524, + "eval_accuracy": 0.926, + "eval_loss": 0.286679208278656, + "eval_runtime": 29.7049, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.121, + "step": 131 + }, + { + "epoch": 0.528, + "grad_norm": 6.53125, + "learning_rate": 4.472e-05, + "loss": 0.0503, + "step": 132 + }, + { + "epoch": 0.528, + "eval_accuracy": 0.928, + "eval_loss": 0.2788745164871216, + "eval_runtime": 29.71, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.12, + "step": 132 + }, + { + "epoch": 0.532, + "grad_norm": 59.25, + "learning_rate": 4.468e-05, + "loss": 0.8398, + "step": 133 + }, + { + "epoch": 0.532, + "eval_accuracy": 0.928, + "eval_loss": 0.270631343126297, + "eval_runtime": 29.6612, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 133 + }, + { + "epoch": 0.536, + "grad_norm": 4.125, + "learning_rate": 4.4640000000000006e-05, + "loss": 0.0303, + "step": 134 + }, + { + "epoch": 0.536, + "eval_accuracy": 0.936, + "eval_loss": 0.2580249011516571, + "eval_runtime": 29.6572, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.124, + "step": 134 + }, + { + "epoch": 0.54, + "grad_norm": 10.625, + "learning_rate": 4.46e-05, + "loss": 0.1729, + "step": 135 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.936, + "eval_loss": 0.247315913438797, + "eval_runtime": 29.6474, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 135 + }, + { + "epoch": 0.544, + "grad_norm": 24.375, + "learning_rate": 4.456e-05, + "loss": 0.0306, + "step": 136 + }, + { + "epoch": 0.544, + "eval_accuracy": 0.938, + "eval_loss": 0.24405469000339508, + "eval_runtime": 29.6516, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.125, + "step": 136 + }, + { + "epoch": 0.548, + "grad_norm": 69.0, + "learning_rate": 4.452e-05, + "loss": 1.6797, + "step": 137 + }, + { + "epoch": 0.548, + "eval_accuracy": 0.93, + "eval_loss": 0.2395351529121399, + "eval_runtime": 29.6513, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 137 + }, + { + "epoch": 0.552, + "grad_norm": 51.5, + "learning_rate": 4.448e-05, + "loss": 0.1572, + "step": 138 + }, + { + "epoch": 0.552, + "eval_accuracy": 0.94, + "eval_loss": 0.22542382776737213, + "eval_runtime": 29.6658, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 138 + }, + { + "epoch": 0.556, + "grad_norm": 14.625, + "learning_rate": 4.444e-05, + "loss": 0.2402, + "step": 139 + }, + { + "epoch": 0.556, + "eval_accuracy": 0.934, + "eval_loss": 0.22569629549980164, + "eval_runtime": 29.7083, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.121, + "step": 139 + }, + { + "epoch": 0.56, + "grad_norm": 69.0, + "learning_rate": 4.44e-05, + "loss": 0.5312, + "step": 140 + }, + { + "epoch": 0.56, + "eval_accuracy": 0.932, + "eval_loss": 0.22229784727096558, + "eval_runtime": 29.7166, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.12, + "step": 140 + }, + { + "epoch": 0.564, + "grad_norm": 18.0, + "learning_rate": 4.436e-05, + "loss": 0.1953, + "step": 141 + }, + { + "epoch": 0.564, + "eval_accuracy": 0.944, + "eval_loss": 0.21418945491313934, + "eval_runtime": 29.6445, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 141 + }, + { + "epoch": 0.568, + "grad_norm": 16.375, + "learning_rate": 4.432e-05, + "loss": 0.1982, + "step": 142 + }, + { + "epoch": 0.568, + "eval_accuracy": 0.946, + "eval_loss": 0.20872265100479126, + "eval_runtime": 29.6434, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 142 + }, + { + "epoch": 0.572, + "grad_norm": 4.3125, + "learning_rate": 4.428e-05, + "loss": 0.0393, + "step": 143 + }, + { + "epoch": 0.572, + "eval_accuracy": 0.952, + "eval_loss": 0.2076674848794937, + "eval_runtime": 29.6509, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 143 + }, + { + "epoch": 0.576, + "grad_norm": 32.0, + "learning_rate": 4.424e-05, + "loss": 0.2754, + "step": 144 + }, + { + "epoch": 0.576, + "eval_accuracy": 0.95, + "eval_loss": 0.2188076227903366, + "eval_runtime": 29.6678, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 144 + }, + { + "epoch": 0.58, + "grad_norm": 33.0, + "learning_rate": 4.4200000000000004e-05, + "loss": 0.2832, + "step": 145 + }, + { + "epoch": 0.58, + "eval_accuracy": 0.948, + "eval_loss": 0.2288239747285843, + "eval_runtime": 29.6399, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.126, + "step": 145 + }, + { + "epoch": 0.584, + "grad_norm": 16.125, + "learning_rate": 4.4160000000000004e-05, + "loss": 0.2236, + "step": 146 + }, + { + "epoch": 0.584, + "eval_accuracy": 0.946, + "eval_loss": 0.23677490651607513, + "eval_runtime": 29.6737, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.123, + "step": 146 + }, + { + "epoch": 0.588, + "grad_norm": 10.125, + "learning_rate": 4.412e-05, + "loss": 0.063, + "step": 147 + }, + { + "epoch": 0.588, + "eval_accuracy": 0.942, + "eval_loss": 0.2572605013847351, + "eval_runtime": 29.6085, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 147 + }, + { + "epoch": 0.592, + "grad_norm": 38.75, + "learning_rate": 4.4080000000000005e-05, + "loss": 0.3867, + "step": 148 + }, + { + "epoch": 0.592, + "eval_accuracy": 0.926, + "eval_loss": 0.2880162298679352, + "eval_runtime": 29.609, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 148 + }, + { + "epoch": 0.596, + "grad_norm": 60.25, + "learning_rate": 4.4040000000000005e-05, + "loss": 0.3086, + "step": 149 + }, + { + "epoch": 0.596, + "eval_accuracy": 0.902, + "eval_loss": 0.3244847357273102, + "eval_runtime": 29.66, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 149 + }, + { + "epoch": 0.6, + "grad_norm": 0.107421875, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0007, + "step": 150 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.878, + "eval_loss": 0.38226020336151123, + "eval_runtime": 29.6623, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 150 + }, + { + "epoch": 0.604, + "grad_norm": 7.78125, + "learning_rate": 4.396e-05, + "loss": 0.0659, + "step": 151 + }, + { + "epoch": 0.604, + "eval_accuracy": 0.836, + "eval_loss": 0.49504920840263367, + "eval_runtime": 29.5385, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 151 + }, + { + "epoch": 0.608, + "grad_norm": 18.5, + "learning_rate": 4.392e-05, + "loss": 0.0742, + "step": 152 + }, + { + "epoch": 0.608, + "eval_accuracy": 0.81, + "eval_loss": 0.6082227826118469, + "eval_runtime": 29.5001, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.136, + "step": 152 + }, + { + "epoch": 0.612, + "grad_norm": 50.75, + "learning_rate": 4.388000000000001e-05, + "loss": 1.4609, + "step": 153 + }, + { + "epoch": 0.612, + "eval_accuracy": 0.804, + "eval_loss": 0.6597026586532593, + "eval_runtime": 29.5913, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 153 + }, + { + "epoch": 0.616, + "grad_norm": 18.875, + "learning_rate": 4.384e-05, + "loss": 0.1128, + "step": 154 + }, + { + "epoch": 0.616, + "eval_accuracy": 0.796, + "eval_loss": 0.6944511532783508, + "eval_runtime": 29.6382, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 154 + }, + { + "epoch": 0.62, + "grad_norm": 130.0, + "learning_rate": 4.38e-05, + "loss": 0.7109, + "step": 155 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.814, + "eval_loss": 0.5749375224113464, + "eval_runtime": 29.6454, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 155 + }, + { + "epoch": 0.624, + "grad_norm": 70.5, + "learning_rate": 4.376e-05, + "loss": 0.5156, + "step": 156 + }, + { + "epoch": 0.624, + "eval_accuracy": 0.86, + "eval_loss": 0.3731687068939209, + "eval_runtime": 29.6982, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.121, + "step": 156 + }, + { + "epoch": 0.628, + "grad_norm": 68.0, + "learning_rate": 4.372e-05, + "loss": 1.0625, + "step": 157 + }, + { + "epoch": 0.628, + "eval_accuracy": 0.912, + "eval_loss": 0.23024819791316986, + "eval_runtime": 29.6475, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 157 + }, + { + "epoch": 0.632, + "grad_norm": 0.015869140625, + "learning_rate": 4.368e-05, + "loss": 0.0001, + "step": 158 + }, + { + "epoch": 0.632, + "eval_accuracy": 0.944, + "eval_loss": 0.17584720253944397, + "eval_runtime": 29.7018, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.121, + "step": 158 + }, + { + "epoch": 0.636, + "grad_norm": 52.25, + "learning_rate": 4.364e-05, + "loss": 1.4922, + "step": 159 + }, + { + "epoch": 0.636, + "eval_accuracy": 0.956, + "eval_loss": 0.1370401233434677, + "eval_runtime": 29.7028, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.121, + "step": 159 + }, + { + "epoch": 0.64, + "grad_norm": 18.25, + "learning_rate": 4.36e-05, + "loss": 0.1494, + "step": 160 + }, + { + "epoch": 0.64, + "eval_accuracy": 0.966, + "eval_loss": 0.11924795806407928, + "eval_runtime": 29.7076, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.121, + "step": 160 + }, + { + "epoch": 0.644, + "grad_norm": 2.109375, + "learning_rate": 4.356e-05, + "loss": 0.0149, + "step": 161 + }, + { + "epoch": 0.644, + "eval_accuracy": 0.962, + "eval_loss": 0.10812396556138992, + "eval_runtime": 29.6629, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 161 + }, + { + "epoch": 0.648, + "grad_norm": 0.0201416015625, + "learning_rate": 4.352e-05, + "loss": 0.0001, + "step": 162 + }, + { + "epoch": 0.648, + "eval_accuracy": 0.97, + "eval_loss": 0.10453478991985321, + "eval_runtime": 29.6989, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.121, + "step": 162 + }, + { + "epoch": 0.652, + "grad_norm": 0.2255859375, + "learning_rate": 4.3480000000000004e-05, + "loss": 0.002, + "step": 163 + }, + { + "epoch": 0.652, + "eval_accuracy": 0.968, + "eval_loss": 0.10361865162849426, + "eval_runtime": 29.649, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 163 + }, + { + "epoch": 0.656, + "grad_norm": 2.375, + "learning_rate": 4.3440000000000004e-05, + "loss": 0.0075, + "step": 164 + }, + { + "epoch": 0.656, + "eval_accuracy": 0.97, + "eval_loss": 0.1029207780957222, + "eval_runtime": 29.6467, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 164 + }, + { + "epoch": 0.66, + "grad_norm": 3.6875, + "learning_rate": 4.3400000000000005e-05, + "loss": 0.0128, + "step": 165 + }, + { + "epoch": 0.66, + "eval_accuracy": 0.97, + "eval_loss": 0.1034383550286293, + "eval_runtime": 29.6994, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.121, + "step": 165 + }, + { + "epoch": 0.664, + "grad_norm": 26.125, + "learning_rate": 4.336e-05, + "loss": 0.2656, + "step": 166 + }, + { + "epoch": 0.664, + "eval_accuracy": 0.972, + "eval_loss": 0.10060693323612213, + "eval_runtime": 29.6473, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 166 + }, + { + "epoch": 0.668, + "grad_norm": 5.4375, + "learning_rate": 4.332e-05, + "loss": 0.0249, + "step": 167 + }, + { + "epoch": 0.668, + "eval_accuracy": 0.968, + "eval_loss": 0.09896374493837357, + "eval_runtime": 29.6452, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 167 + }, + { + "epoch": 0.672, + "grad_norm": 0.080078125, + "learning_rate": 4.3280000000000006e-05, + "loss": 0.0006, + "step": 168 + }, + { + "epoch": 0.672, + "eval_accuracy": 0.966, + "eval_loss": 0.10120019316673279, + "eval_runtime": 29.6468, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 168 + }, + { + "epoch": 0.676, + "grad_norm": 0.036376953125, + "learning_rate": 4.324e-05, + "loss": 0.0003, + "step": 169 + }, + { + "epoch": 0.676, + "eval_accuracy": 0.966, + "eval_loss": 0.10330157727003098, + "eval_runtime": 29.7001, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.121, + "step": 169 + }, + { + "epoch": 0.68, + "grad_norm": 35.0, + "learning_rate": 4.32e-05, + "loss": 0.4863, + "step": 170 + }, + { + "epoch": 0.68, + "eval_accuracy": 0.966, + "eval_loss": 0.10276532173156738, + "eval_runtime": 29.6873, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.122, + "step": 170 + }, + { + "epoch": 0.684, + "grad_norm": 1.875, + "learning_rate": 4.316e-05, + "loss": 0.0072, + "step": 171 + }, + { + "epoch": 0.684, + "eval_accuracy": 0.966, + "eval_loss": 0.10432815551757812, + "eval_runtime": 29.6954, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 171 + }, + { + "epoch": 0.688, + "grad_norm": 17.5, + "learning_rate": 4.312000000000001e-05, + "loss": 0.2852, + "step": 172 + }, + { + "epoch": 0.688, + "eval_accuracy": 0.966, + "eval_loss": 0.10174956917762756, + "eval_runtime": 29.6941, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 172 + }, + { + "epoch": 0.692, + "grad_norm": 2.21875, + "learning_rate": 4.308e-05, + "loss": 0.0086, + "step": 173 + }, + { + "epoch": 0.692, + "eval_accuracy": 0.966, + "eval_loss": 0.09696061909198761, + "eval_runtime": 29.6458, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 173 + }, + { + "epoch": 0.696, + "grad_norm": 18.75, + "learning_rate": 4.304e-05, + "loss": 0.1738, + "step": 174 + }, + { + "epoch": 0.696, + "eval_accuracy": 0.968, + "eval_loss": 0.09676006436347961, + "eval_runtime": 29.642, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 174 + }, + { + "epoch": 0.7, + "grad_norm": 0.45703125, + "learning_rate": 4.3e-05, + "loss": 0.001, + "step": 175 + }, + { + "epoch": 0.7, + "eval_accuracy": 0.968, + "eval_loss": 0.09500230848789215, + "eval_runtime": 29.6349, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 175 + }, + { + "epoch": 0.704, + "grad_norm": 18.0, + "learning_rate": 4.296e-05, + "loss": 0.0918, + "step": 176 + }, + { + "epoch": 0.704, + "eval_accuracy": 0.968, + "eval_loss": 0.0910501554608345, + "eval_runtime": 29.6806, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.123, + "step": 176 + }, + { + "epoch": 0.708, + "grad_norm": 0.75390625, + "learning_rate": 4.292e-05, + "loss": 0.003, + "step": 177 + }, + { + "epoch": 0.708, + "eval_accuracy": 0.97, + "eval_loss": 0.08599226176738739, + "eval_runtime": 29.6376, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 177 + }, + { + "epoch": 0.712, + "grad_norm": 5.125, + "learning_rate": 4.288e-05, + "loss": 0.0177, + "step": 178 + }, + { + "epoch": 0.712, + "eval_accuracy": 0.97, + "eval_loss": 0.08361060917377472, + "eval_runtime": 29.6391, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 178 + }, + { + "epoch": 0.716, + "grad_norm": 0.10009765625, + "learning_rate": 4.284e-05, + "loss": 0.0005, + "step": 179 + }, + { + "epoch": 0.716, + "eval_accuracy": 0.968, + "eval_loss": 0.09491007775068283, + "eval_runtime": 29.6391, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 179 + }, + { + "epoch": 0.72, + "grad_norm": 13.75, + "learning_rate": 4.2800000000000004e-05, + "loss": 0.1113, + "step": 180 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.974, + "eval_loss": 0.09185592085123062, + "eval_runtime": 29.6872, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.122, + "step": 180 + }, + { + "epoch": 0.724, + "grad_norm": 55.25, + "learning_rate": 4.276e-05, + "loss": 0.7812, + "step": 181 + }, + { + "epoch": 0.724, + "eval_accuracy": 0.972, + "eval_loss": 0.09238330274820328, + "eval_runtime": 29.695, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 181 + }, + { + "epoch": 0.728, + "grad_norm": 18.375, + "learning_rate": 4.2720000000000004e-05, + "loss": 0.3047, + "step": 182 + }, + { + "epoch": 0.728, + "eval_accuracy": 0.972, + "eval_loss": 0.09142916649580002, + "eval_runtime": 29.6647, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 182 + }, + { + "epoch": 0.732, + "grad_norm": 2.859375, + "learning_rate": 4.2680000000000005e-05, + "loss": 0.0085, + "step": 183 + }, + { + "epoch": 0.732, + "eval_accuracy": 0.972, + "eval_loss": 0.09288589656352997, + "eval_runtime": 29.6393, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 183 + }, + { + "epoch": 0.736, + "grad_norm": 1.0625, + "learning_rate": 4.2640000000000005e-05, + "loss": 0.0046, + "step": 184 + }, + { + "epoch": 0.736, + "eval_accuracy": 0.974, + "eval_loss": 0.08880114555358887, + "eval_runtime": 29.6955, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 184 + }, + { + "epoch": 0.74, + "grad_norm": 0.0225830078125, + "learning_rate": 4.26e-05, + "loss": 0.0001, + "step": 185 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.974, + "eval_loss": 0.08820316940546036, + "eval_runtime": 29.6327, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 185 + }, + { + "epoch": 0.744, + "grad_norm": 14.0625, + "learning_rate": 4.256e-05, + "loss": 0.0334, + "step": 186 + }, + { + "epoch": 0.744, + "eval_accuracy": 0.974, + "eval_loss": 0.08765304833650589, + "eval_runtime": 29.6819, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.123, + "step": 186 + }, + { + "epoch": 0.748, + "grad_norm": 0.054931640625, + "learning_rate": 4.2520000000000006e-05, + "loss": 0.0002, + "step": 187 + }, + { + "epoch": 0.748, + "eval_accuracy": 0.974, + "eval_loss": 0.08688275516033173, + "eval_runtime": 29.6196, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 187 + }, + { + "epoch": 0.752, + "grad_norm": 0.0185546875, + "learning_rate": 4.248e-05, + "loss": 0.0001, + "step": 188 + }, + { + "epoch": 0.752, + "eval_accuracy": 0.976, + "eval_loss": 0.08738788962364197, + "eval_runtime": 29.6235, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 188 + }, + { + "epoch": 0.756, + "grad_norm": 28.375, + "learning_rate": 4.244e-05, + "loss": 0.5742, + "step": 189 + }, + { + "epoch": 0.756, + "eval_accuracy": 0.976, + "eval_loss": 0.0873558297753334, + "eval_runtime": 29.6216, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 189 + }, + { + "epoch": 0.76, + "grad_norm": 42.25, + "learning_rate": 4.24e-05, + "loss": 1.7266, + "step": 190 + }, + { + "epoch": 0.76, + "eval_accuracy": 0.976, + "eval_loss": 0.08426640182733536, + "eval_runtime": 29.6265, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.126, + "step": 190 + }, + { + "epoch": 0.764, + "grad_norm": 0.11767578125, + "learning_rate": 4.236e-05, + "loss": 0.0005, + "step": 191 + }, + { + "epoch": 0.764, + "eval_accuracy": 0.976, + "eval_loss": 0.0822475478053093, + "eval_runtime": 29.6798, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.123, + "step": 191 + }, + { + "epoch": 0.768, + "grad_norm": 132.0, + "learning_rate": 4.232e-05, + "loss": 0.416, + "step": 192 + }, + { + "epoch": 0.768, + "eval_accuracy": 0.976, + "eval_loss": 0.07963664084672928, + "eval_runtime": 29.6373, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 192 + }, + { + "epoch": 0.772, + "grad_norm": 258.0, + "learning_rate": 4.228e-05, + "loss": 0.3223, + "step": 193 + }, + { + "epoch": 0.772, + "eval_accuracy": 0.976, + "eval_loss": 0.07625701278448105, + "eval_runtime": 29.6308, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 193 + }, + { + "epoch": 0.776, + "grad_norm": 47.5, + "learning_rate": 4.224e-05, + "loss": 0.4453, + "step": 194 + }, + { + "epoch": 0.776, + "eval_accuracy": 0.974, + "eval_loss": 0.07586584240198135, + "eval_runtime": 29.6841, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.122, + "step": 194 + }, + { + "epoch": 0.78, + "grad_norm": 17.125, + "learning_rate": 4.22e-05, + "loss": 0.1172, + "step": 195 + }, + { + "epoch": 0.78, + "eval_accuracy": 0.974, + "eval_loss": 0.07492456585168839, + "eval_runtime": 29.6368, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 195 + }, + { + "epoch": 0.784, + "grad_norm": 0.1611328125, + "learning_rate": 4.2159999999999996e-05, + "loss": 0.001, + "step": 196 + }, + { + "epoch": 0.784, + "eval_accuracy": 0.974, + "eval_loss": 0.07631567865610123, + "eval_runtime": 29.6283, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 196 + }, + { + "epoch": 0.788, + "grad_norm": 0.427734375, + "learning_rate": 4.212e-05, + "loss": 0.0009, + "step": 197 + }, + { + "epoch": 0.788, + "eval_accuracy": 0.974, + "eval_loss": 0.07704315334558487, + "eval_runtime": 29.6245, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 197 + }, + { + "epoch": 0.792, + "grad_norm": 28.125, + "learning_rate": 4.2080000000000004e-05, + "loss": 0.4688, + "step": 198 + }, + { + "epoch": 0.792, + "eval_accuracy": 0.976, + "eval_loss": 0.07680439203977585, + "eval_runtime": 29.628, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 198 + }, + { + "epoch": 0.796, + "grad_norm": 0.236328125, + "learning_rate": 4.2040000000000004e-05, + "loss": 0.0012, + "step": 199 + }, + { + "epoch": 0.796, + "eval_accuracy": 0.976, + "eval_loss": 0.07766655087471008, + "eval_runtime": 29.6796, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.123, + "step": 199 + }, + { + "epoch": 0.8, + "grad_norm": 0.333984375, + "learning_rate": 4.2e-05, + "loss": 0.0024, + "step": 200 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.978, + "eval_loss": 0.07604081928730011, + "eval_runtime": 29.6397, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.126, + "step": 200 + }, + { + "epoch": 0.804, + "grad_norm": 38.25, + "learning_rate": 4.196e-05, + "loss": 0.2041, + "step": 201 + }, + { + "epoch": 0.804, + "eval_accuracy": 0.978, + "eval_loss": 0.07405334711074829, + "eval_runtime": 29.6133, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 201 + }, + { + "epoch": 0.808, + "grad_norm": 0.42578125, + "learning_rate": 4.1920000000000005e-05, + "loss": 0.0026, + "step": 202 + }, + { + "epoch": 0.808, + "eval_accuracy": 0.978, + "eval_loss": 0.07225973159074783, + "eval_runtime": 29.4349, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.14, + "step": 202 + }, + { + "epoch": 0.812, + "grad_norm": 53.5, + "learning_rate": 4.1880000000000006e-05, + "loss": 0.1436, + "step": 203 + }, + { + "epoch": 0.812, + "eval_accuracy": 0.978, + "eval_loss": 0.07016235589981079, + "eval_runtime": 29.526, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.134, + "step": 203 + }, + { + "epoch": 0.816, + "grad_norm": 183.0, + "learning_rate": 4.184e-05, + "loss": 1.0391, + "step": 204 + }, + { + "epoch": 0.816, + "eval_accuracy": 0.978, + "eval_loss": 0.06834210455417633, + "eval_runtime": 29.6343, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 204 + }, + { + "epoch": 0.82, + "grad_norm": 0.369140625, + "learning_rate": 4.18e-05, + "loss": 0.0021, + "step": 205 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.98, + "eval_loss": 0.06732385605573654, + "eval_runtime": 29.6568, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 205 + }, + { + "epoch": 0.824, + "grad_norm": 0.408203125, + "learning_rate": 4.176000000000001e-05, + "loss": 0.0016, + "step": 206 + }, + { + "epoch": 0.824, + "eval_accuracy": 0.98, + "eval_loss": 0.06818614155054092, + "eval_runtime": 29.6605, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 206 + }, + { + "epoch": 0.828, + "grad_norm": 1.1328125, + "learning_rate": 4.172e-05, + "loss": 0.0034, + "step": 207 + }, + { + "epoch": 0.828, + "eval_accuracy": 0.98, + "eval_loss": 0.06563514471054077, + "eval_runtime": 29.6147, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 207 + }, + { + "epoch": 0.832, + "grad_norm": 19.5, + "learning_rate": 4.168e-05, + "loss": 0.1133, + "step": 208 + }, + { + "epoch": 0.832, + "eval_accuracy": 0.982, + "eval_loss": 0.0643063336610794, + "eval_runtime": 29.6624, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 208 + }, + { + "epoch": 0.836, + "grad_norm": 2.453125, + "learning_rate": 4.164e-05, + "loss": 0.0031, + "step": 209 + }, + { + "epoch": 0.836, + "eval_accuracy": 0.982, + "eval_loss": 0.06570375710725784, + "eval_runtime": 29.602, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 209 + }, + { + "epoch": 0.84, + "grad_norm": 6.9375, + "learning_rate": 4.16e-05, + "loss": 0.0549, + "step": 210 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.98, + "eval_loss": 0.06586217135190964, + "eval_runtime": 29.6147, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 210 + }, + { + "epoch": 0.844, + "grad_norm": 0.169921875, + "learning_rate": 4.156e-05, + "loss": 0.0009, + "step": 211 + }, + { + "epoch": 0.844, + "eval_accuracy": 0.98, + "eval_loss": 0.06533855944871902, + "eval_runtime": 29.618, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 211 + }, + { + "epoch": 0.848, + "grad_norm": 0.0159912109375, + "learning_rate": 4.152e-05, + "loss": 0.0001, + "step": 212 + }, + { + "epoch": 0.848, + "eval_accuracy": 0.98, + "eval_loss": 0.06079791113734245, + "eval_runtime": 29.671, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 212 + }, + { + "epoch": 0.852, + "grad_norm": 1.0, + "learning_rate": 4.148e-05, + "loss": 0.0031, + "step": 213 + }, + { + "epoch": 0.852, + "eval_accuracy": 0.98, + "eval_loss": 0.061831165105104446, + "eval_runtime": 29.6177, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 213 + }, + { + "epoch": 0.856, + "grad_norm": 2.890625, + "learning_rate": 4.144e-05, + "loss": 0.0201, + "step": 214 + }, + { + "epoch": 0.856, + "eval_accuracy": 0.978, + "eval_loss": 0.06264770030975342, + "eval_runtime": 29.6639, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 214 + }, + { + "epoch": 0.86, + "grad_norm": 41.0, + "learning_rate": 4.14e-05, + "loss": 0.3223, + "step": 215 + }, + { + "epoch": 0.86, + "eval_accuracy": 0.978, + "eval_loss": 0.06678374111652374, + "eval_runtime": 29.6139, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 215 + }, + { + "epoch": 0.864, + "grad_norm": 0.2001953125, + "learning_rate": 4.1360000000000004e-05, + "loss": 0.0006, + "step": 216 + }, + { + "epoch": 0.864, + "eval_accuracy": 0.976, + "eval_loss": 0.07042150944471359, + "eval_runtime": 29.6566, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 216 + }, + { + "epoch": 0.868, + "grad_norm": 49.5, + "learning_rate": 4.1320000000000004e-05, + "loss": 1.2656, + "step": 217 + }, + { + "epoch": 0.868, + "eval_accuracy": 0.97, + "eval_loss": 0.07522651553153992, + "eval_runtime": 29.6646, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 217 + }, + { + "epoch": 0.872, + "grad_norm": 92.5, + "learning_rate": 4.1280000000000005e-05, + "loss": 0.5898, + "step": 218 + }, + { + "epoch": 0.872, + "eval_accuracy": 0.972, + "eval_loss": 0.07427488267421722, + "eval_runtime": 29.6084, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 218 + }, + { + "epoch": 0.876, + "grad_norm": 9.5, + "learning_rate": 4.124e-05, + "loss": 0.032, + "step": 219 + }, + { + "epoch": 0.876, + "eval_accuracy": 0.97, + "eval_loss": 0.07966498285531998, + "eval_runtime": 29.6084, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 219 + }, + { + "epoch": 0.88, + "grad_norm": 13.375, + "learning_rate": 4.12e-05, + "loss": 0.0164, + "step": 220 + }, + { + "epoch": 0.88, + "eval_accuracy": 0.974, + "eval_loss": 0.08513650298118591, + "eval_runtime": 29.6607, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 220 + }, + { + "epoch": 0.884, + "grad_norm": 14.5625, + "learning_rate": 4.1160000000000006e-05, + "loss": 0.1562, + "step": 221 + }, + { + "epoch": 0.884, + "eval_accuracy": 0.97, + "eval_loss": 0.08612356334924698, + "eval_runtime": 29.6598, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 221 + }, + { + "epoch": 0.888, + "grad_norm": 1.5625, + "learning_rate": 4.1120000000000006e-05, + "loss": 0.007, + "step": 222 + }, + { + "epoch": 0.888, + "eval_accuracy": 0.972, + "eval_loss": 0.08614454418420792, + "eval_runtime": 29.6609, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 222 + }, + { + "epoch": 0.892, + "grad_norm": 53.0, + "learning_rate": 4.108e-05, + "loss": 1.6328, + "step": 223 + }, + { + "epoch": 0.892, + "eval_accuracy": 0.976, + "eval_loss": 0.08457397669553757, + "eval_runtime": 29.6614, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 223 + }, + { + "epoch": 0.896, + "grad_norm": 97.5, + "learning_rate": 4.104e-05, + "loss": 1.1016, + "step": 224 + }, + { + "epoch": 0.896, + "eval_accuracy": 0.972, + "eval_loss": 0.08589766174554825, + "eval_runtime": 29.6619, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 224 + }, + { + "epoch": 0.9, + "grad_norm": 14.5, + "learning_rate": 4.1e-05, + "loss": 0.2168, + "step": 225 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.974, + "eval_loss": 0.0851806253194809, + "eval_runtime": 29.6567, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 225 + }, + { + "epoch": 0.904, + "grad_norm": 3.625, + "learning_rate": 4.096e-05, + "loss": 0.0192, + "step": 226 + }, + { + "epoch": 0.904, + "eval_accuracy": 0.974, + "eval_loss": 0.08372227847576141, + "eval_runtime": 29.6162, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 226 + }, + { + "epoch": 0.908, + "grad_norm": 1.234375, + "learning_rate": 4.092e-05, + "loss": 0.007, + "step": 227 + }, + { + "epoch": 0.908, + "eval_accuracy": 0.976, + "eval_loss": 0.07853132486343384, + "eval_runtime": 29.6081, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 227 + }, + { + "epoch": 0.912, + "grad_norm": 7.65625, + "learning_rate": 4.088e-05, + "loss": 0.0352, + "step": 228 + }, + { + "epoch": 0.912, + "eval_accuracy": 0.98, + "eval_loss": 0.07481507956981659, + "eval_runtime": 29.67, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 228 + }, + { + "epoch": 0.916, + "grad_norm": 0.050537109375, + "learning_rate": 4.084e-05, + "loss": 0.0001, + "step": 229 + }, + { + "epoch": 0.916, + "eval_accuracy": 0.978, + "eval_loss": 0.0743974819779396, + "eval_runtime": 29.6699, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 229 + }, + { + "epoch": 0.92, + "grad_norm": 0.003631591796875, + "learning_rate": 4.08e-05, + "loss": 0.0, + "step": 230 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.98, + "eval_loss": 0.07246476411819458, + "eval_runtime": 29.6677, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 230 + }, + { + "epoch": 0.924, + "grad_norm": 8.1875, + "learning_rate": 4.076e-05, + "loss": 0.0635, + "step": 231 + }, + { + "epoch": 0.924, + "eval_accuracy": 0.982, + "eval_loss": 0.07053600996732712, + "eval_runtime": 29.668, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 231 + }, + { + "epoch": 0.928, + "grad_norm": 4.71875, + "learning_rate": 4.072e-05, + "loss": 0.0221, + "step": 232 + }, + { + "epoch": 0.928, + "eval_accuracy": 0.982, + "eval_loss": 0.06312824785709381, + "eval_runtime": 29.6235, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 232 + }, + { + "epoch": 0.932, + "grad_norm": 0.0242919921875, + "learning_rate": 4.0680000000000004e-05, + "loss": 0.0001, + "step": 233 + }, + { + "epoch": 0.932, + "eval_accuracy": 0.982, + "eval_loss": 0.059605829417705536, + "eval_runtime": 29.6658, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 233 + }, + { + "epoch": 0.936, + "grad_norm": 0.341796875, + "learning_rate": 4.064e-05, + "loss": 0.0004, + "step": 234 + }, + { + "epoch": 0.936, + "eval_accuracy": 0.982, + "eval_loss": 0.0584360733628273, + "eval_runtime": 29.6739, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.123, + "step": 234 + }, + { + "epoch": 0.94, + "grad_norm": 0.388671875, + "learning_rate": 4.0600000000000004e-05, + "loss": 0.0031, + "step": 235 + }, + { + "epoch": 0.94, + "eval_accuracy": 0.982, + "eval_loss": 0.05637088045477867, + "eval_runtime": 29.6695, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 235 + }, + { + "epoch": 0.944, + "grad_norm": 0.05322265625, + "learning_rate": 4.0560000000000005e-05, + "loss": 0.0002, + "step": 236 + }, + { + "epoch": 0.944, + "eval_accuracy": 0.984, + "eval_loss": 0.05546841397881508, + "eval_runtime": 29.6765, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 236 + }, + { + "epoch": 0.948, + "grad_norm": 25.625, + "learning_rate": 4.0520000000000005e-05, + "loss": 0.1729, + "step": 237 + }, + { + "epoch": 0.948, + "eval_accuracy": 0.984, + "eval_loss": 0.0563667006790638, + "eval_runtime": 29.6653, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 237 + }, + { + "epoch": 0.952, + "grad_norm": 63.25, + "learning_rate": 4.048e-05, + "loss": 0.4844, + "step": 238 + }, + { + "epoch": 0.952, + "eval_accuracy": 0.986, + "eval_loss": 0.05481307953596115, + "eval_runtime": 29.6215, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 238 + }, + { + "epoch": 0.956, + "grad_norm": 0.0062255859375, + "learning_rate": 4.044e-05, + "loss": 0.0, + "step": 239 + }, + { + "epoch": 0.956, + "eval_accuracy": 0.986, + "eval_loss": 0.05165258049964905, + "eval_runtime": 29.6637, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 239 + }, + { + "epoch": 0.96, + "grad_norm": 4.21875, + "learning_rate": 4.0400000000000006e-05, + "loss": 0.025, + "step": 240 + }, + { + "epoch": 0.96, + "eval_accuracy": 0.988, + "eval_loss": 0.05035816878080368, + "eval_runtime": 29.6169, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 240 + }, + { + "epoch": 0.964, + "grad_norm": 0.0751953125, + "learning_rate": 4.0360000000000007e-05, + "loss": 0.0003, + "step": 241 + }, + { + "epoch": 0.964, + "eval_accuracy": 0.988, + "eval_loss": 0.04841839522123337, + "eval_runtime": 29.664, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 241 + }, + { + "epoch": 0.968, + "grad_norm": 41.5, + "learning_rate": 4.032e-05, + "loss": 1.4141, + "step": 242 + }, + { + "epoch": 0.968, + "eval_accuracy": 0.988, + "eval_loss": 0.04907584562897682, + "eval_runtime": 29.6205, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 242 + }, + { + "epoch": 0.972, + "grad_norm": 36.0, + "learning_rate": 4.028e-05, + "loss": 0.022, + "step": 243 + }, + { + "epoch": 0.972, + "eval_accuracy": 0.988, + "eval_loss": 0.04775354266166687, + "eval_runtime": 29.6694, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 243 + }, + { + "epoch": 0.976, + "grad_norm": 0.0027008056640625, + "learning_rate": 4.024e-05, + "loss": 0.0, + "step": 244 + }, + { + "epoch": 0.976, + "eval_accuracy": 0.986, + "eval_loss": 0.04700706899166107, + "eval_runtime": 29.668, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 244 + }, + { + "epoch": 0.98, + "grad_norm": 0.640625, + "learning_rate": 4.02e-05, + "loss": 0.0014, + "step": 245 + }, + { + "epoch": 0.98, + "eval_accuracy": 0.988, + "eval_loss": 0.047397430986166, + "eval_runtime": 29.6093, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 245 + }, + { + "epoch": 0.984, + "grad_norm": 1.15625, + "learning_rate": 4.016e-05, + "loss": 0.0039, + "step": 246 + }, + { + "epoch": 0.984, + "eval_accuracy": 0.988, + "eval_loss": 0.044841740280389786, + "eval_runtime": 29.6106, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 246 + }, + { + "epoch": 0.988, + "grad_norm": 0.0084228515625, + "learning_rate": 4.012e-05, + "loss": 0.0, + "step": 247 + }, + { + "epoch": 0.988, + "eval_accuracy": 0.988, + "eval_loss": 0.042322371155023575, + "eval_runtime": 29.6048, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 247 + }, + { + "epoch": 0.992, + "grad_norm": 9.0, + "learning_rate": 4.008e-05, + "loss": 0.0596, + "step": 248 + }, + { + "epoch": 0.992, + "eval_accuracy": 0.984, + "eval_loss": 0.037741873413324356, + "eval_runtime": 29.6597, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 248 + }, + { + "epoch": 0.996, + "grad_norm": 249.0, + "learning_rate": 4.004e-05, + "loss": 0.5664, + "step": 249 + }, + { + "epoch": 0.996, + "eval_accuracy": 0.986, + "eval_loss": 0.04508182033896446, + "eval_runtime": 29.6537, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.125, + "step": 249 + }, + { + "epoch": 1.0, + "grad_norm": 0.0072021484375, + "learning_rate": 4e-05, + "loss": 0.0, + "step": 250 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.984, + "eval_loss": 0.05019988864660263, + "eval_runtime": 29.6482, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 250 + }, + { + "epoch": 1.004, + "grad_norm": 27.5, + "learning_rate": 3.9960000000000004e-05, + "loss": 0.0942, + "step": 251 + }, + { + "epoch": 1.004, + "eval_accuracy": 0.984, + "eval_loss": 0.05814649909734726, + "eval_runtime": 29.5712, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 251 + }, + { + "epoch": 1.008, + "grad_norm": 0.0015869140625, + "learning_rate": 3.9920000000000004e-05, + "loss": 0.0, + "step": 252 + }, + { + "epoch": 1.008, + "eval_accuracy": 0.984, + "eval_loss": 0.05021660402417183, + "eval_runtime": 29.4491, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.139, + "step": 252 + }, + { + "epoch": 1.012, + "grad_norm": 1.125, + "learning_rate": 3.988e-05, + "loss": 0.0049, + "step": 253 + }, + { + "epoch": 1.012, + "eval_accuracy": 0.982, + "eval_loss": 0.05302654579281807, + "eval_runtime": 29.5379, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 253 + }, + { + "epoch": 1.016, + "grad_norm": 1.7578125, + "learning_rate": 3.984e-05, + "loss": 0.0128, + "step": 254 + }, + { + "epoch": 1.016, + "eval_accuracy": 0.982, + "eval_loss": 0.060937847942113876, + "eval_runtime": 29.5778, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 254 + }, + { + "epoch": 1.02, + "grad_norm": 12.8125, + "learning_rate": 3.9800000000000005e-05, + "loss": 0.0723, + "step": 255 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.982, + "eval_loss": 0.06790424883365631, + "eval_runtime": 29.5795, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 255 + }, + { + "epoch": 1.024, + "grad_norm": 0.291015625, + "learning_rate": 3.9760000000000006e-05, + "loss": 0.0022, + "step": 256 + }, + { + "epoch": 1.024, + "eval_accuracy": 0.98, + "eval_loss": 0.07177788019180298, + "eval_runtime": 29.6279, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 256 + }, + { + "epoch": 1.028, + "grad_norm": 39.75, + "learning_rate": 3.972e-05, + "loss": 0.1045, + "step": 257 + }, + { + "epoch": 1.028, + "eval_accuracy": 0.976, + "eval_loss": 0.07963619381189346, + "eval_runtime": 29.6288, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 257 + }, + { + "epoch": 1.032, + "grad_norm": 1.6953125, + "learning_rate": 3.968e-05, + "loss": 0.0081, + "step": 258 + }, + { + "epoch": 1.032, + "eval_accuracy": 0.976, + "eval_loss": 0.08145671337842941, + "eval_runtime": 29.6514, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 258 + }, + { + "epoch": 1.036, + "grad_norm": 2.015625, + "learning_rate": 3.964e-05, + "loss": 0.0099, + "step": 259 + }, + { + "epoch": 1.036, + "eval_accuracy": 0.978, + "eval_loss": 0.07274888455867767, + "eval_runtime": 29.6067, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 259 + }, + { + "epoch": 1.04, + "grad_norm": 1.7890625, + "learning_rate": 3.960000000000001e-05, + "loss": 0.0147, + "step": 260 + }, + { + "epoch": 1.04, + "eval_accuracy": 0.982, + "eval_loss": 0.06440262496471405, + "eval_runtime": 29.6054, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 260 + }, + { + "epoch": 1.044, + "grad_norm": 48.75, + "learning_rate": 3.956e-05, + "loss": 0.4512, + "step": 261 + }, + { + "epoch": 1.044, + "eval_accuracy": 0.988, + "eval_loss": 0.05720052495598793, + "eval_runtime": 29.5889, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 261 + }, + { + "epoch": 1.048, + "grad_norm": 0.2890625, + "learning_rate": 3.952e-05, + "loss": 0.0014, + "step": 262 + }, + { + "epoch": 1.048, + "eval_accuracy": 0.988, + "eval_loss": 0.052172012627124786, + "eval_runtime": 29.57, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 262 + }, + { + "epoch": 1.052, + "grad_norm": 1.5859375, + "learning_rate": 3.948e-05, + "loss": 0.011, + "step": 263 + }, + { + "epoch": 1.052, + "eval_accuracy": 0.986, + "eval_loss": 0.04594781994819641, + "eval_runtime": 29.5708, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.13, + "step": 263 + }, + { + "epoch": 1.056, + "grad_norm": 0.032470703125, + "learning_rate": 3.944e-05, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 1.056, + "eval_accuracy": 0.986, + "eval_loss": 0.042698562145233154, + "eval_runtime": 29.577, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 264 + }, + { + "epoch": 1.06, + "grad_norm": 0.197265625, + "learning_rate": 3.94e-05, + "loss": 0.0012, + "step": 265 + }, + { + "epoch": 1.06, + "eval_accuracy": 0.984, + "eval_loss": 0.04107264056801796, + "eval_runtime": 29.5974, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 265 + }, + { + "epoch": 1.064, + "grad_norm": 0.265625, + "learning_rate": 3.936e-05, + "loss": 0.0009, + "step": 266 + }, + { + "epoch": 1.064, + "eval_accuracy": 0.984, + "eval_loss": 0.03928465396165848, + "eval_runtime": 29.6279, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 266 + }, + { + "epoch": 1.068, + "grad_norm": 0.302734375, + "learning_rate": 3.932e-05, + "loss": 0.0021, + "step": 267 + }, + { + "epoch": 1.068, + "eval_accuracy": 0.982, + "eval_loss": 0.03902623802423477, + "eval_runtime": 29.5777, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 267 + }, + { + "epoch": 1.072, + "grad_norm": 14.5, + "learning_rate": 3.9280000000000003e-05, + "loss": 0.0654, + "step": 268 + }, + { + "epoch": 1.072, + "eval_accuracy": 0.982, + "eval_loss": 0.038833752274513245, + "eval_runtime": 29.5774, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 268 + }, + { + "epoch": 1.076, + "grad_norm": 0.30859375, + "learning_rate": 3.9240000000000004e-05, + "loss": 0.0015, + "step": 269 + }, + { + "epoch": 1.076, + "eval_accuracy": 0.982, + "eval_loss": 0.03884024918079376, + "eval_runtime": 29.619, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 269 + }, + { + "epoch": 1.08, + "grad_norm": 0.00775146484375, + "learning_rate": 3.9200000000000004e-05, + "loss": 0.0001, + "step": 270 + }, + { + "epoch": 1.08, + "eval_accuracy": 0.982, + "eval_loss": 0.03861307352781296, + "eval_runtime": 29.5876, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 270 + }, + { + "epoch": 1.084, + "grad_norm": 0.96875, + "learning_rate": 3.9160000000000005e-05, + "loss": 0.0056, + "step": 271 + }, + { + "epoch": 1.084, + "eval_accuracy": 0.982, + "eval_loss": 0.038595061749219894, + "eval_runtime": 29.577, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 271 + }, + { + "epoch": 1.088, + "grad_norm": 0.000743865966796875, + "learning_rate": 3.912e-05, + "loss": 0.0, + "step": 272 + }, + { + "epoch": 1.088, + "eval_accuracy": 0.982, + "eval_loss": 0.0383642353117466, + "eval_runtime": 29.6319, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 272 + }, + { + "epoch": 1.092, + "grad_norm": 0.0021514892578125, + "learning_rate": 3.908e-05, + "loss": 0.0, + "step": 273 + }, + { + "epoch": 1.092, + "eval_accuracy": 0.982, + "eval_loss": 0.03839904069900513, + "eval_runtime": 29.5772, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 273 + }, + { + "epoch": 1.096, + "grad_norm": 0.400390625, + "learning_rate": 3.9040000000000006e-05, + "loss": 0.0022, + "step": 274 + }, + { + "epoch": 1.096, + "eval_accuracy": 0.984, + "eval_loss": 0.03887038305401802, + "eval_runtime": 29.6243, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 274 + }, + { + "epoch": 1.1, + "grad_norm": 0.12158203125, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0005, + "step": 275 + }, + { + "epoch": 1.1, + "eval_accuracy": 0.984, + "eval_loss": 0.039365388453006744, + "eval_runtime": 29.625, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 275 + }, + { + "epoch": 1.104, + "grad_norm": 18.0, + "learning_rate": 3.896e-05, + "loss": 0.0354, + "step": 276 + }, + { + "epoch": 1.104, + "eval_accuracy": 0.984, + "eval_loss": 0.0399734303355217, + "eval_runtime": 29.6308, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 276 + }, + { + "epoch": 1.108, + "grad_norm": 80.0, + "learning_rate": 3.892e-05, + "loss": 1.0391, + "step": 277 + }, + { + "epoch": 1.108, + "eval_accuracy": 0.986, + "eval_loss": 0.041828054934740067, + "eval_runtime": 29.5887, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 277 + }, + { + "epoch": 1.112, + "grad_norm": 0.01446533203125, + "learning_rate": 3.888e-05, + "loss": 0.0001, + "step": 278 + }, + { + "epoch": 1.112, + "eval_accuracy": 0.984, + "eval_loss": 0.041034799069166183, + "eval_runtime": 29.5896, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 278 + }, + { + "epoch": 1.116, + "grad_norm": 1.40625, + "learning_rate": 3.884e-05, + "loss": 0.0077, + "step": 279 + }, + { + "epoch": 1.116, + "eval_accuracy": 0.982, + "eval_loss": 0.0417436808347702, + "eval_runtime": 29.5841, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 279 + }, + { + "epoch": 1.12, + "grad_norm": 6.84375, + "learning_rate": 3.88e-05, + "loss": 0.0356, + "step": 280 + }, + { + "epoch": 1.12, + "eval_accuracy": 0.982, + "eval_loss": 0.04216504842042923, + "eval_runtime": 29.5822, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 280 + }, + { + "epoch": 1.124, + "grad_norm": 0.016357421875, + "learning_rate": 3.876e-05, + "loss": 0.0001, + "step": 281 + }, + { + "epoch": 1.124, + "eval_accuracy": 0.984, + "eval_loss": 0.04284616932272911, + "eval_runtime": 29.5851, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 281 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 39.25, + "learning_rate": 3.872e-05, + "loss": 0.334, + "step": 282 + }, + { + "epoch": 1.1280000000000001, + "eval_accuracy": 0.982, + "eval_loss": 0.043841175734996796, + "eval_runtime": 29.5905, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 282 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 9.6875, + "learning_rate": 3.868e-05, + "loss": 0.0374, + "step": 283 + }, + { + "epoch": 1.1320000000000001, + "eval_accuracy": 0.982, + "eval_loss": 0.04576905071735382, + "eval_runtime": 29.5975, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 283 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.609375, + "learning_rate": 3.864e-05, + "loss": 0.0029, + "step": 284 + }, + { + "epoch": 1.1360000000000001, + "eval_accuracy": 0.984, + "eval_loss": 0.044647734612226486, + "eval_runtime": 29.639, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 284 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.001922607421875, + "learning_rate": 3.86e-05, + "loss": 0.0, + "step": 285 + }, + { + "epoch": 1.1400000000000001, + "eval_accuracy": 0.986, + "eval_loss": 0.04621968790888786, + "eval_runtime": 29.5921, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 285 + }, + { + "epoch": 1.144, + "grad_norm": 121.0, + "learning_rate": 3.8560000000000004e-05, + "loss": 0.7461, + "step": 286 + }, + { + "epoch": 1.144, + "eval_accuracy": 0.986, + "eval_loss": 0.04369563236832619, + "eval_runtime": 29.5981, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 286 + }, + { + "epoch": 1.148, + "grad_norm": 0.01177978515625, + "learning_rate": 3.8520000000000004e-05, + "loss": 0.0, + "step": 287 + }, + { + "epoch": 1.148, + "eval_accuracy": 0.986, + "eval_loss": 0.04458891972899437, + "eval_runtime": 29.5939, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 287 + }, + { + "epoch": 1.152, + "grad_norm": 57.75, + "learning_rate": 3.848e-05, + "loss": 2.1719, + "step": 288 + }, + { + "epoch": 1.152, + "eval_accuracy": 0.986, + "eval_loss": 0.045197661966085434, + "eval_runtime": 29.6042, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 288 + }, + { + "epoch": 1.156, + "grad_norm": 69.0, + "learning_rate": 3.8440000000000005e-05, + "loss": 0.3574, + "step": 289 + }, + { + "epoch": 1.156, + "eval_accuracy": 0.986, + "eval_loss": 0.042459942400455475, + "eval_runtime": 29.6505, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 289 + }, + { + "epoch": 1.16, + "grad_norm": 14.25, + "learning_rate": 3.8400000000000005e-05, + "loss": 0.0449, + "step": 290 + }, + { + "epoch": 1.16, + "eval_accuracy": 0.988, + "eval_loss": 0.041883524507284164, + "eval_runtime": 29.6564, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 290 + }, + { + "epoch": 1.164, + "grad_norm": 0.09130859375, + "learning_rate": 3.836e-05, + "loss": 0.0002, + "step": 291 + }, + { + "epoch": 1.164, + "eval_accuracy": 0.988, + "eval_loss": 0.04172814264893532, + "eval_runtime": 29.6538, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.125, + "step": 291 + }, + { + "epoch": 1.168, + "grad_norm": 3.4831464290618896e-07, + "learning_rate": 3.832e-05, + "loss": 0.0, + "step": 292 + }, + { + "epoch": 1.168, + "eval_accuracy": 0.988, + "eval_loss": 0.04241952300071716, + "eval_runtime": 29.5939, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 292 + }, + { + "epoch": 1.172, + "grad_norm": 0.00159454345703125, + "learning_rate": 3.828e-05, + "loss": 0.0, + "step": 293 + }, + { + "epoch": 1.172, + "eval_accuracy": 0.988, + "eval_loss": 0.04328082501888275, + "eval_runtime": 29.6002, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 293 + }, + { + "epoch": 1.176, + "grad_norm": 94.0, + "learning_rate": 3.8240000000000007e-05, + "loss": 0.332, + "step": 294 + }, + { + "epoch": 1.176, + "eval_accuracy": 0.988, + "eval_loss": 0.0429229736328125, + "eval_runtime": 29.5952, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 294 + }, + { + "epoch": 1.18, + "grad_norm": 0.054443359375, + "learning_rate": 3.82e-05, + "loss": 0.0001, + "step": 295 + }, + { + "epoch": 1.18, + "eval_accuracy": 0.99, + "eval_loss": 0.043315187096595764, + "eval_runtime": 29.6488, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 295 + }, + { + "epoch": 1.184, + "grad_norm": 1.734375, + "learning_rate": 3.816e-05, + "loss": 0.004, + "step": 296 + }, + { + "epoch": 1.184, + "eval_accuracy": 0.988, + "eval_loss": 0.04346512630581856, + "eval_runtime": 29.5976, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 296 + }, + { + "epoch": 1.188, + "grad_norm": 0.0157470703125, + "learning_rate": 3.812e-05, + "loss": 0.0, + "step": 297 + }, + { + "epoch": 1.188, + "eval_accuracy": 0.988, + "eval_loss": 0.04286235570907593, + "eval_runtime": 29.6442, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 297 + }, + { + "epoch": 1.192, + "grad_norm": 0.00010967254638671875, + "learning_rate": 3.808e-05, + "loss": 0.0, + "step": 298 + }, + { + "epoch": 1.192, + "eval_accuracy": 0.988, + "eval_loss": 0.043164145201444626, + "eval_runtime": 29.6054, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 298 + }, + { + "epoch": 1.196, + "grad_norm": 7.534027099609375e-05, + "learning_rate": 3.804e-05, + "loss": 0.0, + "step": 299 + }, + { + "epoch": 1.196, + "eval_accuracy": 0.988, + "eval_loss": 0.04308565706014633, + "eval_runtime": 29.6031, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 299 + }, + { + "epoch": 1.2, + "grad_norm": 1.6640625, + "learning_rate": 3.8e-05, + "loss": 0.0084, + "step": 300 + }, + { + "epoch": 1.2, + "eval_accuracy": 0.988, + "eval_loss": 0.04339505732059479, + "eval_runtime": 29.5937, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 300 + }, + { + "epoch": 1.204, + "grad_norm": 0.0008087158203125, + "learning_rate": 3.796e-05, + "loss": 0.0, + "step": 301 + }, + { + "epoch": 1.204, + "eval_accuracy": 0.988, + "eval_loss": 0.04419083148241043, + "eval_runtime": 29.5369, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.133, + "step": 301 + }, + { + "epoch": 1.208, + "grad_norm": 0.01611328125, + "learning_rate": 3.792e-05, + "loss": 0.0001, + "step": 302 + }, + { + "epoch": 1.208, + "eval_accuracy": 0.99, + "eval_loss": 0.04429430142045021, + "eval_runtime": 29.4602, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.138, + "step": 302 + }, + { + "epoch": 1.212, + "grad_norm": 0.00238037109375, + "learning_rate": 3.788e-05, + "loss": 0.0, + "step": 303 + }, + { + "epoch": 1.212, + "eval_accuracy": 0.99, + "eval_loss": 0.04510888084769249, + "eval_runtime": 29.5892, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 303 + }, + { + "epoch": 1.216, + "grad_norm": 0.2392578125, + "learning_rate": 3.7840000000000004e-05, + "loss": 0.0004, + "step": 304 + }, + { + "epoch": 1.216, + "eval_accuracy": 0.99, + "eval_loss": 0.044927872717380524, + "eval_runtime": 29.6643, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 304 + }, + { + "epoch": 1.22, + "grad_norm": 0.310546875, + "learning_rate": 3.7800000000000004e-05, + "loss": 0.0008, + "step": 305 + }, + { + "epoch": 1.22, + "eval_accuracy": 0.99, + "eval_loss": 0.0457804799079895, + "eval_runtime": 29.5933, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 305 + }, + { + "epoch": 1.224, + "grad_norm": 6.0, + "learning_rate": 3.776e-05, + "loss": 0.0291, + "step": 306 + }, + { + "epoch": 1.224, + "eval_accuracy": 0.99, + "eval_loss": 0.04592352360486984, + "eval_runtime": 29.6018, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 306 + }, + { + "epoch": 1.228, + "grad_norm": 0.000576019287109375, + "learning_rate": 3.772e-05, + "loss": 0.0, + "step": 307 + }, + { + "epoch": 1.228, + "eval_accuracy": 0.99, + "eval_loss": 0.04750025272369385, + "eval_runtime": 29.6173, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 307 + }, + { + "epoch": 1.232, + "grad_norm": 0.001068115234375, + "learning_rate": 3.7680000000000005e-05, + "loss": 0.0, + "step": 308 + }, + { + "epoch": 1.232, + "eval_accuracy": 0.99, + "eval_loss": 0.048875801265239716, + "eval_runtime": 29.668, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 308 + }, + { + "epoch": 1.236, + "grad_norm": 2.609375, + "learning_rate": 3.7640000000000006e-05, + "loss": 0.0054, + "step": 309 + }, + { + "epoch": 1.236, + "eval_accuracy": 0.988, + "eval_loss": 0.05109934136271477, + "eval_runtime": 29.6068, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 309 + }, + { + "epoch": 1.24, + "grad_norm": 0.62890625, + "learning_rate": 3.76e-05, + "loss": 0.0016, + "step": 310 + }, + { + "epoch": 1.24, + "eval_accuracy": 0.988, + "eval_loss": 0.0544138103723526, + "eval_runtime": 29.6661, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 310 + }, + { + "epoch": 1.244, + "grad_norm": 0.0001678466796875, + "learning_rate": 3.756e-05, + "loss": 0.0, + "step": 311 + }, + { + "epoch": 1.244, + "eval_accuracy": 0.988, + "eval_loss": 0.05521370843052864, + "eval_runtime": 29.6158, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 311 + }, + { + "epoch": 1.248, + "grad_norm": 1.3172626495361328e-05, + "learning_rate": 3.752e-05, + "loss": 0.0, + "step": 312 + }, + { + "epoch": 1.248, + "eval_accuracy": 0.988, + "eval_loss": 0.056619398295879364, + "eval_runtime": 29.6073, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 312 + }, + { + "epoch": 1.252, + "grad_norm": 0.0247802734375, + "learning_rate": 3.748000000000001e-05, + "loss": 0.0, + "step": 313 + }, + { + "epoch": 1.252, + "eval_accuracy": 0.988, + "eval_loss": 0.062135856598615646, + "eval_runtime": 29.6069, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 313 + }, + { + "epoch": 1.256, + "grad_norm": 1.015625, + "learning_rate": 3.744e-05, + "loss": 0.0044, + "step": 314 + }, + { + "epoch": 1.256, + "eval_accuracy": 0.988, + "eval_loss": 0.06401406228542328, + "eval_runtime": 29.6507, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 314 + }, + { + "epoch": 1.26, + "grad_norm": 3.453125, + "learning_rate": 3.74e-05, + "loss": 0.0032, + "step": 315 + }, + { + "epoch": 1.26, + "eval_accuracy": 0.986, + "eval_loss": 0.07324115931987762, + "eval_runtime": 29.5913, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 315 + }, + { + "epoch": 1.264, + "grad_norm": 0.169921875, + "learning_rate": 3.736e-05, + "loss": 0.0006, + "step": 316 + }, + { + "epoch": 1.264, + "eval_accuracy": 0.986, + "eval_loss": 0.07768232375383377, + "eval_runtime": 29.6567, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 316 + }, + { + "epoch": 1.268, + "grad_norm": 76.0, + "learning_rate": 3.732e-05, + "loss": 0.3984, + "step": 317 + }, + { + "epoch": 1.268, + "eval_accuracy": 0.986, + "eval_loss": 0.07793063670396805, + "eval_runtime": 29.6676, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 317 + }, + { + "epoch": 1.272, + "grad_norm": 113.0, + "learning_rate": 3.728e-05, + "loss": 0.5391, + "step": 318 + }, + { + "epoch": 1.272, + "eval_accuracy": 0.986, + "eval_loss": 0.07860060036182404, + "eval_runtime": 29.6607, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 318 + }, + { + "epoch": 1.276, + "grad_norm": 56.5, + "learning_rate": 3.724e-05, + "loss": 0.2598, + "step": 319 + }, + { + "epoch": 1.276, + "eval_accuracy": 0.986, + "eval_loss": 0.07452204078435898, + "eval_runtime": 29.6623, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 319 + }, + { + "epoch": 1.28, + "grad_norm": 0.00145721435546875, + "learning_rate": 3.72e-05, + "loss": 0.0, + "step": 320 + }, + { + "epoch": 1.28, + "eval_accuracy": 0.986, + "eval_loss": 0.07128868252038956, + "eval_runtime": 29.6218, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 320 + }, + { + "epoch": 1.284, + "grad_norm": 2.359375, + "learning_rate": 3.716e-05, + "loss": 0.007, + "step": 321 + }, + { + "epoch": 1.284, + "eval_accuracy": 0.986, + "eval_loss": 0.06924331933259964, + "eval_runtime": 29.6655, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 321 + }, + { + "epoch": 1.288, + "grad_norm": 0.00061798095703125, + "learning_rate": 3.712e-05, + "loss": 0.0, + "step": 322 + }, + { + "epoch": 1.288, + "eval_accuracy": 0.986, + "eval_loss": 0.06757459044456482, + "eval_runtime": 29.6095, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 322 + }, + { + "epoch": 1.292, + "grad_norm": 0.16015625, + "learning_rate": 3.7080000000000004e-05, + "loss": 0.0004, + "step": 323 + }, + { + "epoch": 1.292, + "eval_accuracy": 0.988, + "eval_loss": 0.06412693113088608, + "eval_runtime": 29.6023, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 323 + }, + { + "epoch": 1.296, + "grad_norm": 3.0040740966796875e-05, + "learning_rate": 3.7040000000000005e-05, + "loss": 0.0, + "step": 324 + }, + { + "epoch": 1.296, + "eval_accuracy": 0.988, + "eval_loss": 0.062421128153800964, + "eval_runtime": 29.6472, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 324 + }, + { + "epoch": 1.3, + "grad_norm": 7.25, + "learning_rate": 3.7e-05, + "loss": 0.0293, + "step": 325 + }, + { + "epoch": 1.3, + "eval_accuracy": 0.988, + "eval_loss": 0.0573146715760231, + "eval_runtime": 29.6535, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.125, + "step": 325 + }, + { + "epoch": 1.304, + "grad_norm": 0.0205078125, + "learning_rate": 3.696e-05, + "loss": 0.0001, + "step": 326 + }, + { + "epoch": 1.304, + "eval_accuracy": 0.988, + "eval_loss": 0.057243190705776215, + "eval_runtime": 29.6699, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 326 + }, + { + "epoch": 1.308, + "grad_norm": 0.5390625, + "learning_rate": 3.692e-05, + "loss": 0.0014, + "step": 327 + }, + { + "epoch": 1.308, + "eval_accuracy": 0.988, + "eval_loss": 0.05443909019231796, + "eval_runtime": 29.6591, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 327 + }, + { + "epoch": 1.312, + "grad_norm": 0.0751953125, + "learning_rate": 3.6880000000000006e-05, + "loss": 0.0002, + "step": 328 + }, + { + "epoch": 1.312, + "eval_accuracy": 0.988, + "eval_loss": 0.054459791630506516, + "eval_runtime": 29.6583, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.124, + "step": 328 + }, + { + "epoch": 1.316, + "grad_norm": 0.5234375, + "learning_rate": 3.684e-05, + "loss": 0.0009, + "step": 329 + }, + { + "epoch": 1.316, + "eval_accuracy": 0.986, + "eval_loss": 0.05431881174445152, + "eval_runtime": 29.6059, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 329 + }, + { + "epoch": 1.32, + "grad_norm": 0.0003032684326171875, + "learning_rate": 3.68e-05, + "loss": 0.0, + "step": 330 + }, + { + "epoch": 1.32, + "eval_accuracy": 0.986, + "eval_loss": 0.0546875037252903, + "eval_runtime": 29.6059, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 330 + }, + { + "epoch": 1.324, + "grad_norm": 5.875, + "learning_rate": 3.676e-05, + "loss": 0.0222, + "step": 331 + }, + { + "epoch": 1.324, + "eval_accuracy": 0.984, + "eval_loss": 0.0545102134346962, + "eval_runtime": 29.6094, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 331 + }, + { + "epoch": 1.328, + "grad_norm": 0.546875, + "learning_rate": 3.672000000000001e-05, + "loss": 0.0013, + "step": 332 + }, + { + "epoch": 1.328, + "eval_accuracy": 0.984, + "eval_loss": 0.05404158681631088, + "eval_runtime": 29.6548, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.124, + "step": 332 + }, + { + "epoch": 1.332, + "grad_norm": 0.00732421875, + "learning_rate": 3.668e-05, + "loss": 0.0, + "step": 333 + }, + { + "epoch": 1.332, + "eval_accuracy": 0.986, + "eval_loss": 0.05502966791391373, + "eval_runtime": 29.6614, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 333 + }, + { + "epoch": 1.336, + "grad_norm": 0.017333984375, + "learning_rate": 3.664e-05, + "loss": 0.0, + "step": 334 + }, + { + "epoch": 1.336, + "eval_accuracy": 0.986, + "eval_loss": 0.055158209055662155, + "eval_runtime": 29.6006, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 334 + }, + { + "epoch": 1.34, + "grad_norm": 0.0009918212890625, + "learning_rate": 3.66e-05, + "loss": 0.0, + "step": 335 + }, + { + "epoch": 1.34, + "eval_accuracy": 0.984, + "eval_loss": 0.05652189627289772, + "eval_runtime": 29.5995, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 335 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 110.5, + "learning_rate": 3.656e-05, + "loss": 1.5469, + "step": 336 + }, + { + "epoch": 1.3439999999999999, + "eval_accuracy": 0.986, + "eval_loss": 0.055844374001026154, + "eval_runtime": 29.6478, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 336 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 0.045166015625, + "learning_rate": 3.652e-05, + "loss": 0.0002, + "step": 337 + }, + { + "epoch": 1.3479999999999999, + "eval_accuracy": 0.986, + "eval_loss": 0.055635880678892136, + "eval_runtime": 29.5999, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 337 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 14.3125, + "learning_rate": 3.648e-05, + "loss": 0.0378, + "step": 338 + }, + { + "epoch": 1.3519999999999999, + "eval_accuracy": 0.986, + "eval_loss": 0.058380722999572754, + "eval_runtime": 29.6226, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 338 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 1.984375, + "learning_rate": 3.6440000000000003e-05, + "loss": 0.004, + "step": 339 + }, + { + "epoch": 1.3559999999999999, + "eval_accuracy": 0.986, + "eval_loss": 0.05692053213715553, + "eval_runtime": 29.6394, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.126, + "step": 339 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.00726318359375, + "learning_rate": 3.6400000000000004e-05, + "loss": 0.0, + "step": 340 + }, + { + "epoch": 1.3599999999999999, + "eval_accuracy": 0.986, + "eval_loss": 0.05603298544883728, + "eval_runtime": 29.6119, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.128, + "step": 340 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 62.0, + "learning_rate": 3.636e-05, + "loss": 0.1816, + "step": 341 + }, + { + "epoch": 1.3639999999999999, + "eval_accuracy": 0.984, + "eval_loss": 0.05057508125901222, + "eval_runtime": 29.6036, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 341 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 41.25, + "learning_rate": 3.6320000000000005e-05, + "loss": 0.1226, + "step": 342 + }, + { + "epoch": 1.3679999999999999, + "eval_accuracy": 0.984, + "eval_loss": 0.047154005616903305, + "eval_runtime": 29.5982, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 342 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 1.296875, + "learning_rate": 3.6280000000000005e-05, + "loss": 0.0038, + "step": 343 + }, + { + "epoch": 1.3719999999999999, + "eval_accuracy": 0.982, + "eval_loss": 0.05200347304344177, + "eval_runtime": 29.6518, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.125, + "step": 343 + }, + { + "epoch": 1.376, + "grad_norm": 0.00113677978515625, + "learning_rate": 3.624e-05, + "loss": 0.0, + "step": 344 + }, + { + "epoch": 1.376, + "eval_accuracy": 0.974, + "eval_loss": 0.06347780674695969, + "eval_runtime": 29.6472, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 344 + }, + { + "epoch": 1.38, + "grad_norm": 0.10888671875, + "learning_rate": 3.62e-05, + "loss": 0.0002, + "step": 345 + }, + { + "epoch": 1.38, + "eval_accuracy": 0.972, + "eval_loss": 0.07466176152229309, + "eval_runtime": 29.5967, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 345 + }, + { + "epoch": 1.384, + "grad_norm": 6.15625, + "learning_rate": 3.616e-05, + "loss": 0.0166, + "step": 346 + }, + { + "epoch": 1.384, + "eval_accuracy": 0.956, + "eval_loss": 0.09891492873430252, + "eval_runtime": 29.599, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 346 + }, + { + "epoch": 1.388, + "grad_norm": 8.625, + "learning_rate": 3.6120000000000007e-05, + "loss": 0.0278, + "step": 347 + }, + { + "epoch": 1.388, + "eval_accuracy": 0.948, + "eval_loss": 0.13347820937633514, + "eval_runtime": 29.6504, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 347 + }, + { + "epoch": 1.392, + "grad_norm": 0.08837890625, + "learning_rate": 3.608e-05, + "loss": 0.0003, + "step": 348 + }, + { + "epoch": 1.392, + "eval_accuracy": 0.932, + "eval_loss": 0.16687442362308502, + "eval_runtime": 29.6, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 348 + }, + { + "epoch": 1.396, + "grad_norm": 110.0, + "learning_rate": 3.604e-05, + "loss": 2.3438, + "step": 349 + }, + { + "epoch": 1.396, + "eval_accuracy": 0.936, + "eval_loss": 0.1718183010816574, + "eval_runtime": 29.6141, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 349 + }, + { + "epoch": 1.4, + "grad_norm": 62.25, + "learning_rate": 3.6e-05, + "loss": 0.334, + "step": 350 + }, + { + "epoch": 1.4, + "eval_accuracy": 0.94, + "eval_loss": 0.14843949675559998, + "eval_runtime": 29.6008, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 350 + }, + { + "epoch": 1.404, + "grad_norm": 25.5, + "learning_rate": 3.596e-05, + "loss": 0.1377, + "step": 351 + }, + { + "epoch": 1.404, + "eval_accuracy": 0.958, + "eval_loss": 0.09256642311811447, + "eval_runtime": 29.5236, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.134, + "step": 351 + }, + { + "epoch": 1.408, + "grad_norm": 0.08203125, + "learning_rate": 3.592e-05, + "loss": 0.0001, + "step": 352 + }, + { + "epoch": 1.408, + "eval_accuracy": 0.972, + "eval_loss": 0.06676935404539108, + "eval_runtime": 29.5774, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 352 + }, + { + "epoch": 1.412, + "grad_norm": 41.0, + "learning_rate": 3.588e-05, + "loss": 0.2432, + "step": 353 + }, + { + "epoch": 1.412, + "eval_accuracy": 0.988, + "eval_loss": 0.05320088192820549, + "eval_runtime": 29.6262, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.126, + "step": 353 + }, + { + "epoch": 1.416, + "grad_norm": 3.765625, + "learning_rate": 3.584e-05, + "loss": 0.0082, + "step": 354 + }, + { + "epoch": 1.416, + "eval_accuracy": 0.99, + "eval_loss": 0.03936154022812843, + "eval_runtime": 29.651, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 354 + }, + { + "epoch": 1.42, + "grad_norm": 0.6328125, + "learning_rate": 3.58e-05, + "loss": 0.0015, + "step": 355 + }, + { + "epoch": 1.42, + "eval_accuracy": 0.99, + "eval_loss": 0.03550048917531967, + "eval_runtime": 29.6444, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 355 + }, + { + "epoch": 1.424, + "grad_norm": 0.033203125, + "learning_rate": 3.5759999999999996e-05, + "loss": 0.0001, + "step": 356 + }, + { + "epoch": 1.424, + "eval_accuracy": 0.988, + "eval_loss": 0.03779887408018112, + "eval_runtime": 29.639, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 356 + }, + { + "epoch": 1.428, + "grad_norm": 0.87890625, + "learning_rate": 3.5720000000000004e-05, + "loss": 0.0029, + "step": 357 + }, + { + "epoch": 1.428, + "eval_accuracy": 0.988, + "eval_loss": 0.039121538400650024, + "eval_runtime": 29.594, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 357 + }, + { + "epoch": 1.432, + "grad_norm": 10.75, + "learning_rate": 3.5680000000000004e-05, + "loss": 0.0459, + "step": 358 + }, + { + "epoch": 1.432, + "eval_accuracy": 0.988, + "eval_loss": 0.041328269988298416, + "eval_runtime": 29.605, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 358 + }, + { + "epoch": 1.436, + "grad_norm": 0.02001953125, + "learning_rate": 3.5640000000000004e-05, + "loss": 0.0001, + "step": 359 + }, + { + "epoch": 1.436, + "eval_accuracy": 0.988, + "eval_loss": 0.04410649091005325, + "eval_runtime": 29.6391, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 359 + }, + { + "epoch": 1.44, + "grad_norm": 0.00799560546875, + "learning_rate": 3.56e-05, + "loss": 0.0, + "step": 360 + }, + { + "epoch": 1.44, + "eval_accuracy": 0.99, + "eval_loss": 0.04554106667637825, + "eval_runtime": 29.605, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 360 + }, + { + "epoch": 1.444, + "grad_norm": 0.030029296875, + "learning_rate": 3.5560000000000005e-05, + "loss": 0.0001, + "step": 361 + }, + { + "epoch": 1.444, + "eval_accuracy": 0.99, + "eval_loss": 0.0456966869533062, + "eval_runtime": 29.5916, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 361 + }, + { + "epoch": 1.448, + "grad_norm": 0.000782012939453125, + "learning_rate": 3.5520000000000006e-05, + "loss": 0.0, + "step": 362 + }, + { + "epoch": 1.448, + "eval_accuracy": 0.99, + "eval_loss": 0.04337631165981293, + "eval_runtime": 29.5918, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 362 + }, + { + "epoch": 1.452, + "grad_norm": 0.008544921875, + "learning_rate": 3.548e-05, + "loss": 0.0, + "step": 363 + }, + { + "epoch": 1.452, + "eval_accuracy": 0.988, + "eval_loss": 0.04492032527923584, + "eval_runtime": 29.597, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 363 + }, + { + "epoch": 1.456, + "grad_norm": 0.00101470947265625, + "learning_rate": 3.544e-05, + "loss": 0.0, + "step": 364 + }, + { + "epoch": 1.456, + "eval_accuracy": 0.988, + "eval_loss": 0.045494042336940765, + "eval_runtime": 29.6563, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 364 + }, + { + "epoch": 1.46, + "grad_norm": 0.056640625, + "learning_rate": 3.54e-05, + "loss": 0.0002, + "step": 365 + }, + { + "epoch": 1.46, + "eval_accuracy": 0.99, + "eval_loss": 0.04683837667107582, + "eval_runtime": 29.6561, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 365 + }, + { + "epoch": 1.464, + "grad_norm": 0.00160980224609375, + "learning_rate": 3.536000000000001e-05, + "loss": 0.0, + "step": 366 + }, + { + "epoch": 1.464, + "eval_accuracy": 0.988, + "eval_loss": 0.048011139035224915, + "eval_runtime": 29.6606, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 366 + }, + { + "epoch": 1.468, + "grad_norm": 0.333984375, + "learning_rate": 3.532e-05, + "loss": 0.0014, + "step": 367 + }, + { + "epoch": 1.468, + "eval_accuracy": 0.988, + "eval_loss": 0.05059017613530159, + "eval_runtime": 29.6005, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 367 + }, + { + "epoch": 1.472, + "grad_norm": 8.0, + "learning_rate": 3.528e-05, + "loss": 0.0337, + "step": 368 + }, + { + "epoch": 1.472, + "eval_accuracy": 0.988, + "eval_loss": 0.05110400170087814, + "eval_runtime": 29.6145, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 368 + }, + { + "epoch": 1.476, + "grad_norm": 0.023193359375, + "learning_rate": 3.524e-05, + "loss": 0.0001, + "step": 369 + }, + { + "epoch": 1.476, + "eval_accuracy": 0.99, + "eval_loss": 0.04757150262594223, + "eval_runtime": 29.6176, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 369 + }, + { + "epoch": 1.48, + "grad_norm": 21.5, + "learning_rate": 3.52e-05, + "loss": 0.3066, + "step": 370 + }, + { + "epoch": 1.48, + "eval_accuracy": 0.988, + "eval_loss": 0.04958709329366684, + "eval_runtime": 29.672, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 370 + }, + { + "epoch": 1.484, + "grad_norm": 0.21484375, + "learning_rate": 3.516e-05, + "loss": 0.0007, + "step": 371 + }, + { + "epoch": 1.484, + "eval_accuracy": 0.988, + "eval_loss": 0.05002021789550781, + "eval_runtime": 29.6024, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 371 + }, + { + "epoch": 1.488, + "grad_norm": 0.000431060791015625, + "learning_rate": 3.512e-05, + "loss": 0.0, + "step": 372 + }, + { + "epoch": 1.488, + "eval_accuracy": 0.988, + "eval_loss": 0.0518474280834198, + "eval_runtime": 29.5926, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 372 + }, + { + "epoch": 1.492, + "grad_norm": 5.1021575927734375e-05, + "learning_rate": 3.508e-05, + "loss": 0.0, + "step": 373 + }, + { + "epoch": 1.492, + "eval_accuracy": 0.988, + "eval_loss": 0.05172526463866234, + "eval_runtime": 29.6469, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 373 + }, + { + "epoch": 1.496, + "grad_norm": 0.2177734375, + "learning_rate": 3.504e-05, + "loss": 0.0006, + "step": 374 + }, + { + "epoch": 1.496, + "eval_accuracy": 0.988, + "eval_loss": 0.05098959431052208, + "eval_runtime": 29.6679, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 374 + }, + { + "epoch": 1.5, + "grad_norm": 33.5, + "learning_rate": 3.5e-05, + "loss": 0.1162, + "step": 375 + }, + { + "epoch": 1.5, + "eval_accuracy": 0.988, + "eval_loss": 0.05054827779531479, + "eval_runtime": 29.6147, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 375 + }, + { + "epoch": 1.504, + "grad_norm": 1.0859375, + "learning_rate": 3.4960000000000004e-05, + "loss": 0.0037, + "step": 376 + }, + { + "epoch": 1.504, + "eval_accuracy": 0.988, + "eval_loss": 0.05049055814743042, + "eval_runtime": 29.6004, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 376 + }, + { + "epoch": 1.508, + "grad_norm": 0.000530242919921875, + "learning_rate": 3.4920000000000004e-05, + "loss": 0.0, + "step": 377 + }, + { + "epoch": 1.508, + "eval_accuracy": 0.988, + "eval_loss": 0.0517900176346302, + "eval_runtime": 29.5966, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 377 + }, + { + "epoch": 1.512, + "grad_norm": 0.0390625, + "learning_rate": 3.4880000000000005e-05, + "loss": 0.0002, + "step": 378 + }, + { + "epoch": 1.512, + "eval_accuracy": 0.988, + "eval_loss": 0.05203340947628021, + "eval_runtime": 29.5912, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 378 + }, + { + "epoch": 1.516, + "grad_norm": 0.0130615234375, + "learning_rate": 3.484e-05, + "loss": 0.0, + "step": 379 + }, + { + "epoch": 1.516, + "eval_accuracy": 0.99, + "eval_loss": 0.050449371337890625, + "eval_runtime": 29.5825, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 379 + }, + { + "epoch": 1.52, + "grad_norm": 0.0126953125, + "learning_rate": 3.48e-05, + "loss": 0.0001, + "step": 380 + }, + { + "epoch": 1.52, + "eval_accuracy": 0.99, + "eval_loss": 0.05180507153272629, + "eval_runtime": 29.6416, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 380 + }, + { + "epoch": 1.524, + "grad_norm": 5.71875, + "learning_rate": 3.4760000000000006e-05, + "loss": 0.021, + "step": 381 + }, + { + "epoch": 1.524, + "eval_accuracy": 0.99, + "eval_loss": 0.05119476094841957, + "eval_runtime": 29.6409, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.125, + "step": 381 + }, + { + "epoch": 1.528, + "grad_norm": 0.003875732421875, + "learning_rate": 3.472e-05, + "loss": 0.0, + "step": 382 + }, + { + "epoch": 1.528, + "eval_accuracy": 0.99, + "eval_loss": 0.05229797214269638, + "eval_runtime": 29.5939, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 382 + }, + { + "epoch": 1.532, + "grad_norm": 0.00096893310546875, + "learning_rate": 3.468e-05, + "loss": 0.0, + "step": 383 + }, + { + "epoch": 1.532, + "eval_accuracy": 0.99, + "eval_loss": 0.05340004712343216, + "eval_runtime": 29.6364, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 383 + }, + { + "epoch": 1.536, + "grad_norm": 0.00537109375, + "learning_rate": 3.464e-05, + "loss": 0.0, + "step": 384 + }, + { + "epoch": 1.536, + "eval_accuracy": 0.99, + "eval_loss": 0.053413160145282745, + "eval_runtime": 29.5925, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 384 + }, + { + "epoch": 1.54, + "grad_norm": 0.0013580322265625, + "learning_rate": 3.46e-05, + "loss": 0.0, + "step": 385 + }, + { + "epoch": 1.54, + "eval_accuracy": 0.99, + "eval_loss": 0.05419563874602318, + "eval_runtime": 29.6446, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 385 + }, + { + "epoch": 1.544, + "grad_norm": 0.000797271728515625, + "learning_rate": 3.456e-05, + "loss": 0.0, + "step": 386 + }, + { + "epoch": 1.544, + "eval_accuracy": 0.99, + "eval_loss": 0.05513488128781319, + "eval_runtime": 29.598, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 386 + }, + { + "epoch": 1.548, + "grad_norm": 17.375, + "learning_rate": 3.452e-05, + "loss": 0.0337, + "step": 387 + }, + { + "epoch": 1.548, + "eval_accuracy": 0.99, + "eval_loss": 0.0535738579928875, + "eval_runtime": 29.63, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 387 + }, + { + "epoch": 1.552, + "grad_norm": 6.28125, + "learning_rate": 3.448e-05, + "loss": 0.0201, + "step": 388 + }, + { + "epoch": 1.552, + "eval_accuracy": 0.99, + "eval_loss": 0.05296257883310318, + "eval_runtime": 29.6323, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 388 + }, + { + "epoch": 1.556, + "grad_norm": 3.28125, + "learning_rate": 3.444e-05, + "loss": 0.0092, + "step": 389 + }, + { + "epoch": 1.556, + "eval_accuracy": 0.99, + "eval_loss": 0.05016641691327095, + "eval_runtime": 29.5858, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 389 + }, + { + "epoch": 1.56, + "grad_norm": 59.25, + "learning_rate": 3.4399999999999996e-05, + "loss": 1.4219, + "step": 390 + }, + { + "epoch": 1.56, + "eval_accuracy": 0.99, + "eval_loss": 0.048158910125494, + "eval_runtime": 29.6363, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 390 + }, + { + "epoch": 1.564, + "grad_norm": 0.006378173828125, + "learning_rate": 3.436e-05, + "loss": 0.0, + "step": 391 + }, + { + "epoch": 1.564, + "eval_accuracy": 0.99, + "eval_loss": 0.04928350821137428, + "eval_runtime": 29.6543, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.124, + "step": 391 + }, + { + "epoch": 1.568, + "grad_norm": 0.3046875, + "learning_rate": 3.4320000000000003e-05, + "loss": 0.0014, + "step": 392 + }, + { + "epoch": 1.568, + "eval_accuracy": 0.99, + "eval_loss": 0.048312097787857056, + "eval_runtime": 29.6103, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 392 + }, + { + "epoch": 1.572, + "grad_norm": 0.10693359375, + "learning_rate": 3.4280000000000004e-05, + "loss": 0.0002, + "step": 393 + }, + { + "epoch": 1.572, + "eval_accuracy": 0.99, + "eval_loss": 0.047681793570518494, + "eval_runtime": 29.5949, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 393 + }, + { + "epoch": 1.576, + "grad_norm": 0.0341796875, + "learning_rate": 3.424e-05, + "loss": 0.0001, + "step": 394 + }, + { + "epoch": 1.576, + "eval_accuracy": 0.99, + "eval_loss": 0.0469617024064064, + "eval_runtime": 29.6317, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 394 + }, + { + "epoch": 1.58, + "grad_norm": 0.2265625, + "learning_rate": 3.4200000000000005e-05, + "loss": 0.0008, + "step": 395 + }, + { + "epoch": 1.58, + "eval_accuracy": 0.988, + "eval_loss": 0.04611307755112648, + "eval_runtime": 29.631, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 395 + }, + { + "epoch": 1.584, + "grad_norm": 0.00054168701171875, + "learning_rate": 3.4160000000000005e-05, + "loss": 0.0, + "step": 396 + }, + { + "epoch": 1.584, + "eval_accuracy": 0.988, + "eval_loss": 0.04510416463017464, + "eval_runtime": 29.5879, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 396 + }, + { + "epoch": 1.588, + "grad_norm": 0.0009002685546875, + "learning_rate": 3.412e-05, + "loss": 0.0, + "step": 397 + }, + { + "epoch": 1.588, + "eval_accuracy": 0.986, + "eval_loss": 0.04454531893134117, + "eval_runtime": 29.5873, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 397 + }, + { + "epoch": 1.592, + "grad_norm": 2.515625, + "learning_rate": 3.408e-05, + "loss": 0.0151, + "step": 398 + }, + { + "epoch": 1.592, + "eval_accuracy": 0.988, + "eval_loss": 0.04505619406700134, + "eval_runtime": 29.6413, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 398 + }, + { + "epoch": 1.596, + "grad_norm": 3.65625, + "learning_rate": 3.404e-05, + "loss": 0.0156, + "step": 399 + }, + { + "epoch": 1.596, + "eval_accuracy": 0.986, + "eval_loss": 0.04331839829683304, + "eval_runtime": 29.6349, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 399 + }, + { + "epoch": 1.6, + "grad_norm": 0.00286865234375, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0, + "step": 400 + }, + { + "epoch": 1.6, + "eval_accuracy": 0.986, + "eval_loss": 0.04207012802362442, + "eval_runtime": 29.6228, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 400 + }, + { + "epoch": 1.604, + "grad_norm": 0.279296875, + "learning_rate": 3.396e-05, + "loss": 0.0011, + "step": 401 + }, + { + "epoch": 1.604, + "eval_accuracy": 0.988, + "eval_loss": 0.04162340611219406, + "eval_runtime": 29.5187, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.134, + "step": 401 + }, + { + "epoch": 1.608, + "grad_norm": 0.03173828125, + "learning_rate": 3.392e-05, + "loss": 0.0001, + "step": 402 + }, + { + "epoch": 1.608, + "eval_accuracy": 0.986, + "eval_loss": 0.04250640422105789, + "eval_runtime": 29.4917, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.136, + "step": 402 + }, + { + "epoch": 1.612, + "grad_norm": 1.0546875, + "learning_rate": 3.388e-05, + "loss": 0.0029, + "step": 403 + }, + { + "epoch": 1.612, + "eval_accuracy": 0.988, + "eval_loss": 0.04337010160088539, + "eval_runtime": 29.6002, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 403 + }, + { + "epoch": 1.616, + "grad_norm": 0.13671875, + "learning_rate": 3.384e-05, + "loss": 0.0006, + "step": 404 + }, + { + "epoch": 1.616, + "eval_accuracy": 0.986, + "eval_loss": 0.0438896045088768, + "eval_runtime": 29.5743, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 404 + }, + { + "epoch": 1.62, + "grad_norm": 0.134765625, + "learning_rate": 3.38e-05, + "loss": 0.0003, + "step": 405 + }, + { + "epoch": 1.62, + "eval_accuracy": 0.988, + "eval_loss": 0.04510389640927315, + "eval_runtime": 29.6372, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 405 + }, + { + "epoch": 1.624, + "grad_norm": 0.00051116943359375, + "learning_rate": 3.376e-05, + "loss": 0.0, + "step": 406 + }, + { + "epoch": 1.624, + "eval_accuracy": 0.986, + "eval_loss": 0.04580816254019737, + "eval_runtime": 29.6357, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 406 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 21.75, + "learning_rate": 3.372e-05, + "loss": 0.1099, + "step": 407 + }, + { + "epoch": 1.6280000000000001, + "eval_accuracy": 0.986, + "eval_loss": 0.04627891629934311, + "eval_runtime": 29.6293, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 407 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 4.7124922275543213e-07, + "learning_rate": 3.368e-05, + "loss": 0.0, + "step": 408 + }, + { + "epoch": 1.6320000000000001, + "eval_accuracy": 0.986, + "eval_loss": 0.0470556803047657, + "eval_runtime": 29.6245, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 408 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 50.5, + "learning_rate": 3.3639999999999996e-05, + "loss": 0.2539, + "step": 409 + }, + { + "epoch": 1.6360000000000001, + "eval_accuracy": 0.986, + "eval_loss": 0.04783182591199875, + "eval_runtime": 29.5778, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 409 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.014404296875, + "learning_rate": 3.3600000000000004e-05, + "loss": 0.0, + "step": 410 + }, + { + "epoch": 1.6400000000000001, + "eval_accuracy": 0.986, + "eval_loss": 0.046517953276634216, + "eval_runtime": 29.6213, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 410 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 0.0184326171875, + "learning_rate": 3.3560000000000004e-05, + "loss": 0.0001, + "step": 411 + }, + { + "epoch": 1.6440000000000001, + "eval_accuracy": 0.988, + "eval_loss": 0.046509597450494766, + "eval_runtime": 29.6302, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 411 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 3.361701965332031e-05, + "learning_rate": 3.3520000000000004e-05, + "loss": 0.0, + "step": 412 + }, + { + "epoch": 1.6480000000000001, + "eval_accuracy": 0.988, + "eval_loss": 0.04597162455320358, + "eval_runtime": 29.6427, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 412 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 0.014404296875, + "learning_rate": 3.348e-05, + "loss": 0.0, + "step": 413 + }, + { + "epoch": 1.6520000000000001, + "eval_accuracy": 0.988, + "eval_loss": 0.0458352267742157, + "eval_runtime": 29.6189, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 413 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.94921875, + "learning_rate": 3.344e-05, + "loss": 0.0043, + "step": 414 + }, + { + "epoch": 1.6560000000000001, + "eval_accuracy": 0.99, + "eval_loss": 0.04465677589178085, + "eval_runtime": 29.5671, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 414 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.21484375, + "learning_rate": 3.3400000000000005e-05, + "loss": 0.0002, + "step": 415 + }, + { + "epoch": 1.6600000000000001, + "eval_accuracy": 0.99, + "eval_loss": 0.04561813175678253, + "eval_runtime": 29.5659, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 415 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.296875, + "learning_rate": 3.336e-05, + "loss": 0.0014, + "step": 416 + }, + { + "epoch": 1.6640000000000001, + "eval_accuracy": 0.99, + "eval_loss": 0.04659847170114517, + "eval_runtime": 29.5803, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 416 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 6.628036499023438e-05, + "learning_rate": 3.332e-05, + "loss": 0.0, + "step": 417 + }, + { + "epoch": 1.6680000000000001, + "eval_accuracy": 0.99, + "eval_loss": 0.04810212180018425, + "eval_runtime": 29.6348, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 417 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.004974365234375, + "learning_rate": 3.328e-05, + "loss": 0.0, + "step": 418 + }, + { + "epoch": 1.6720000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.0490386076271534, + "eval_runtime": 29.5772, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 418 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 9.393692016601562e-05, + "learning_rate": 3.324e-05, + "loss": 0.0, + "step": 419 + }, + { + "epoch": 1.6760000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.048343438655138016, + "eval_runtime": 29.5679, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 419 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.017822265625, + "learning_rate": 3.32e-05, + "loss": 0.0001, + "step": 420 + }, + { + "epoch": 1.6800000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04972007870674133, + "eval_runtime": 29.6232, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 420 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 4.25, + "learning_rate": 3.316e-05, + "loss": 0.0087, + "step": 421 + }, + { + "epoch": 1.6840000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04986485838890076, + "eval_runtime": 29.6181, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 421 + }, + { + "epoch": 1.688, + "grad_norm": 0.00118255615234375, + "learning_rate": 3.312e-05, + "loss": 0.0, + "step": 422 + }, + { + "epoch": 1.688, + "eval_accuracy": 0.992, + "eval_loss": 0.050420623272657394, + "eval_runtime": 29.5769, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 422 + }, + { + "epoch": 1.692, + "grad_norm": 59.0, + "learning_rate": 3.308e-05, + "loss": 0.3066, + "step": 423 + }, + { + "epoch": 1.692, + "eval_accuracy": 0.99, + "eval_loss": 0.05228077992796898, + "eval_runtime": 29.5879, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 423 + }, + { + "epoch": 1.696, + "grad_norm": 0.0023956298828125, + "learning_rate": 3.304e-05, + "loss": 0.0, + "step": 424 + }, + { + "epoch": 1.696, + "eval_accuracy": 0.99, + "eval_loss": 0.05255788564682007, + "eval_runtime": 29.5822, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 424 + }, + { + "epoch": 1.7, + "grad_norm": 31.0, + "learning_rate": 3.3e-05, + "loss": 0.1436, + "step": 425 + }, + { + "epoch": 1.7, + "eval_accuracy": 0.99, + "eval_loss": 0.05423716828227043, + "eval_runtime": 29.5748, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 425 + }, + { + "epoch": 1.704, + "grad_norm": 0.1845703125, + "learning_rate": 3.296e-05, + "loss": 0.0005, + "step": 426 + }, + { + "epoch": 1.704, + "eval_accuracy": 0.99, + "eval_loss": 0.05527259781956673, + "eval_runtime": 29.5701, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 426 + }, + { + "epoch": 1.708, + "grad_norm": 4.3125, + "learning_rate": 3.292e-05, + "loss": 0.0194, + "step": 427 + }, + { + "epoch": 1.708, + "eval_accuracy": 0.99, + "eval_loss": 0.05572051182389259, + "eval_runtime": 29.5821, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 427 + }, + { + "epoch": 1.712, + "grad_norm": 0.4140625, + "learning_rate": 3.288e-05, + "loss": 0.0007, + "step": 428 + }, + { + "epoch": 1.712, + "eval_accuracy": 0.99, + "eval_loss": 0.056864380836486816, + "eval_runtime": 29.6073, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 428 + }, + { + "epoch": 1.716, + "grad_norm": 0.00165557861328125, + "learning_rate": 3.2840000000000004e-05, + "loss": 0.0, + "step": 429 + }, + { + "epoch": 1.716, + "eval_accuracy": 0.99, + "eval_loss": 0.0584588497877121, + "eval_runtime": 29.6002, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 429 + }, + { + "epoch": 1.72, + "grad_norm": 0.0014190673828125, + "learning_rate": 3.2800000000000004e-05, + "loss": 0.0, + "step": 430 + }, + { + "epoch": 1.72, + "eval_accuracy": 0.99, + "eval_loss": 0.05960097536444664, + "eval_runtime": 29.6585, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.124, + "step": 430 + }, + { + "epoch": 1.724, + "grad_norm": 0.0130615234375, + "learning_rate": 3.2760000000000005e-05, + "loss": 0.0, + "step": 431 + }, + { + "epoch": 1.724, + "eval_accuracy": 0.99, + "eval_loss": 0.06020892411470413, + "eval_runtime": 29.6055, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 431 + }, + { + "epoch": 1.728, + "grad_norm": 13.25, + "learning_rate": 3.272e-05, + "loss": 0.0299, + "step": 432 + }, + { + "epoch": 1.728, + "eval_accuracy": 0.99, + "eval_loss": 0.059249553829431534, + "eval_runtime": 29.6344, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 432 + }, + { + "epoch": 1.732, + "grad_norm": 0.00677490234375, + "learning_rate": 3.268e-05, + "loss": 0.0, + "step": 433 + }, + { + "epoch": 1.732, + "eval_accuracy": 0.99, + "eval_loss": 0.0583471916615963, + "eval_runtime": 29.5732, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 433 + }, + { + "epoch": 1.736, + "grad_norm": 1.7523765563964844e-05, + "learning_rate": 3.2640000000000006e-05, + "loss": 0.0, + "step": 434 + }, + { + "epoch": 1.736, + "eval_accuracy": 0.99, + "eval_loss": 0.05761747807264328, + "eval_runtime": 29.5685, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 434 + }, + { + "epoch": 1.74, + "grad_norm": 1.6171875, + "learning_rate": 3.26e-05, + "loss": 0.0028, + "step": 435 + }, + { + "epoch": 1.74, + "eval_accuracy": 0.99, + "eval_loss": 0.057616934180259705, + "eval_runtime": 29.6281, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 435 + }, + { + "epoch": 1.744, + "grad_norm": 0.00140380859375, + "learning_rate": 3.256e-05, + "loss": 0.0, + "step": 436 + }, + { + "epoch": 1.744, + "eval_accuracy": 0.99, + "eval_loss": 0.05754851549863815, + "eval_runtime": 29.5853, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 436 + }, + { + "epoch": 1.748, + "grad_norm": 0.0255126953125, + "learning_rate": 3.252e-05, + "loss": 0.0001, + "step": 437 + }, + { + "epoch": 1.748, + "eval_accuracy": 0.99, + "eval_loss": 0.057762786746025085, + "eval_runtime": 29.5674, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 437 + }, + { + "epoch": 1.752, + "grad_norm": 2.3096799850463867e-06, + "learning_rate": 3.248e-05, + "loss": 0.0, + "step": 438 + }, + { + "epoch": 1.752, + "eval_accuracy": 0.99, + "eval_loss": 0.057130929082632065, + "eval_runtime": 29.6083, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 438 + }, + { + "epoch": 1.756, + "grad_norm": 0.0167236328125, + "learning_rate": 3.244e-05, + "loss": 0.0001, + "step": 439 + }, + { + "epoch": 1.756, + "eval_accuracy": 0.99, + "eval_loss": 0.05753396078944206, + "eval_runtime": 29.6095, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 439 + }, + { + "epoch": 1.76, + "grad_norm": 0.00018787384033203125, + "learning_rate": 3.24e-05, + "loss": 0.0, + "step": 440 + }, + { + "epoch": 1.76, + "eval_accuracy": 0.99, + "eval_loss": 0.05803915858268738, + "eval_runtime": 29.5705, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.13, + "step": 440 + }, + { + "epoch": 1.764, + "grad_norm": 6.1875, + "learning_rate": 3.236e-05, + "loss": 0.0129, + "step": 441 + }, + { + "epoch": 1.764, + "eval_accuracy": 0.99, + "eval_loss": 0.055975936353206635, + "eval_runtime": 29.58, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 441 + }, + { + "epoch": 1.768, + "grad_norm": 0.1640625, + "learning_rate": 3.232e-05, + "loss": 0.0003, + "step": 442 + }, + { + "epoch": 1.768, + "eval_accuracy": 0.99, + "eval_loss": 0.054880402982234955, + "eval_runtime": 29.5806, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 442 + }, + { + "epoch": 1.772, + "grad_norm": 9.3125, + "learning_rate": 3.2279999999999996e-05, + "loss": 0.0222, + "step": 443 + }, + { + "epoch": 1.772, + "eval_accuracy": 0.99, + "eval_loss": 0.051887646317481995, + "eval_runtime": 29.5792, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 443 + }, + { + "epoch": 1.776, + "grad_norm": 0.00909423828125, + "learning_rate": 3.224e-05, + "loss": 0.0, + "step": 444 + }, + { + "epoch": 1.776, + "eval_accuracy": 0.99, + "eval_loss": 0.050190314650535583, + "eval_runtime": 29.617, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 444 + }, + { + "epoch": 1.78, + "grad_norm": 0.1171875, + "learning_rate": 3.2200000000000003e-05, + "loss": 0.0005, + "step": 445 + }, + { + "epoch": 1.78, + "eval_accuracy": 0.992, + "eval_loss": 0.04871083423495293, + "eval_runtime": 29.6294, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 445 + }, + { + "epoch": 1.784, + "grad_norm": 8.165836334228516e-06, + "learning_rate": 3.2160000000000004e-05, + "loss": 0.0, + "step": 446 + }, + { + "epoch": 1.784, + "eval_accuracy": 0.99, + "eval_loss": 0.0485992468893528, + "eval_runtime": 29.5771, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 446 + }, + { + "epoch": 1.788, + "grad_norm": 0.00823974609375, + "learning_rate": 3.212e-05, + "loss": 0.0, + "step": 447 + }, + { + "epoch": 1.788, + "eval_accuracy": 0.992, + "eval_loss": 0.04576271027326584, + "eval_runtime": 29.6208, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 447 + }, + { + "epoch": 1.792, + "grad_norm": 0.05078125, + "learning_rate": 3.208e-05, + "loss": 0.0001, + "step": 448 + }, + { + "epoch": 1.792, + "eval_accuracy": 0.992, + "eval_loss": 0.04532519355416298, + "eval_runtime": 29.6331, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 448 + }, + { + "epoch": 1.796, + "grad_norm": 51.25, + "learning_rate": 3.2040000000000005e-05, + "loss": 0.3281, + "step": 449 + }, + { + "epoch": 1.796, + "eval_accuracy": 0.99, + "eval_loss": 0.044846054166555405, + "eval_runtime": 29.6318, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 449 + }, + { + "epoch": 1.8, + "grad_norm": 0.000774383544921875, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0, + "step": 450 + }, + { + "epoch": 1.8, + "eval_accuracy": 0.99, + "eval_loss": 0.04396031051874161, + "eval_runtime": 29.6207, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 450 + }, + { + "epoch": 1.804, + "grad_norm": 0.000286102294921875, + "learning_rate": 3.196e-05, + "loss": 0.0, + "step": 451 + }, + { + "epoch": 1.804, + "eval_accuracy": 0.99, + "eval_loss": 0.0420415885746479, + "eval_runtime": 29.3964, + "eval_samples_per_second": 17.009, + "eval_steps_per_second": 2.143, + "step": 451 + }, + { + "epoch": 1.808, + "grad_norm": 0.6953125, + "learning_rate": 3.192e-05, + "loss": 0.0027, + "step": 452 + }, + { + "epoch": 1.808, + "eval_accuracy": 0.99, + "eval_loss": 0.04303241893649101, + "eval_runtime": 29.4436, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.14, + "step": 452 + }, + { + "epoch": 1.812, + "grad_norm": 0.0986328125, + "learning_rate": 3.188e-05, + "loss": 0.0003, + "step": 453 + }, + { + "epoch": 1.812, + "eval_accuracy": 0.99, + "eval_loss": 0.04184306412935257, + "eval_runtime": 29.4815, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.137, + "step": 453 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 1.0609626770019531e-05, + "learning_rate": 3.184e-05, + "loss": 0.0, + "step": 454 + }, + { + "epoch": 1.8159999999999998, + "eval_accuracy": 0.988, + "eval_loss": 0.042281508445739746, + "eval_runtime": 29.5843, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 454 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.0137939453125, + "learning_rate": 3.18e-05, + "loss": 0.0, + "step": 455 + }, + { + "epoch": 1.8199999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.042153824120759964, + "eval_runtime": 29.5938, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 455 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.0030517578125, + "learning_rate": 3.176e-05, + "loss": 0.0, + "step": 456 + }, + { + "epoch": 1.8239999999999998, + "eval_accuracy": 0.988, + "eval_loss": 0.042469702661037445, + "eval_runtime": 29.609, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 456 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 3.084540367126465e-06, + "learning_rate": 3.172e-05, + "loss": 0.0, + "step": 457 + }, + { + "epoch": 1.8279999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.0422566756606102, + "eval_runtime": 29.5703, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 457 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.11474609375, + "learning_rate": 3.168e-05, + "loss": 0.0005, + "step": 458 + }, + { + "epoch": 1.8319999999999999, + "eval_accuracy": 0.99, + "eval_loss": 0.041420578956604004, + "eval_runtime": 29.6134, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 458 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 4.28125, + "learning_rate": 3.164e-05, + "loss": 0.0084, + "step": 459 + }, + { + "epoch": 1.8359999999999999, + "eval_accuracy": 0.99, + "eval_loss": 0.040694572031497955, + "eval_runtime": 29.5741, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 459 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 5.698204040527344e-05, + "learning_rate": 3.16e-05, + "loss": 0.0, + "step": 460 + }, + { + "epoch": 1.8399999999999999, + "eval_accuracy": 0.99, + "eval_loss": 0.04125688225030899, + "eval_runtime": 29.5653, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 460 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 0.00101470947265625, + "learning_rate": 3.156e-05, + "loss": 0.0, + "step": 461 + }, + { + "epoch": 1.8439999999999999, + "eval_accuracy": 0.99, + "eval_loss": 0.04060182347893715, + "eval_runtime": 29.5632, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 461 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 18.375, + "learning_rate": 3.1519999999999996e-05, + "loss": 0.0029, + "step": 462 + }, + { + "epoch": 1.8479999999999999, + "eval_accuracy": 0.988, + "eval_loss": 0.040559835731983185, + "eval_runtime": 29.623, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 462 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 18.375, + "learning_rate": 3.1480000000000004e-05, + "loss": 0.0742, + "step": 463 + }, + { + "epoch": 1.8519999999999999, + "eval_accuracy": 0.988, + "eval_loss": 0.04083109647035599, + "eval_runtime": 29.5894, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 463 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 9.679794311523438e-05, + "learning_rate": 3.1440000000000004e-05, + "loss": 0.0, + "step": 464 + }, + { + "epoch": 1.8559999999999999, + "eval_accuracy": 0.988, + "eval_loss": 0.03934440016746521, + "eval_runtime": 29.5854, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 464 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 54.5, + "learning_rate": 3.1400000000000004e-05, + "loss": 0.3086, + "step": 465 + }, + { + "epoch": 1.8599999999999999, + "eval_accuracy": 0.988, + "eval_loss": 0.04127156734466553, + "eval_runtime": 29.574, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 465 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 5.1875, + "learning_rate": 3.136e-05, + "loss": 0.1069, + "step": 466 + }, + { + "epoch": 1.8639999999999999, + "eval_accuracy": 0.99, + "eval_loss": 0.04152120277285576, + "eval_runtime": 29.5735, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 466 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 0.0179443359375, + "learning_rate": 3.132e-05, + "loss": 0.0001, + "step": 467 + }, + { + "epoch": 1.8679999999999999, + "eval_accuracy": 0.988, + "eval_loss": 0.0414867103099823, + "eval_runtime": 29.6299, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 467 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.00025177001953125, + "learning_rate": 3.1280000000000005e-05, + "loss": 0.0, + "step": 468 + }, + { + "epoch": 1.8719999999999999, + "eval_accuracy": 0.99, + "eval_loss": 0.04226494953036308, + "eval_runtime": 29.6422, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 468 + }, + { + "epoch": 1.876, + "grad_norm": 0.0023345947265625, + "learning_rate": 3.1240000000000006e-05, + "loss": 0.0, + "step": 469 + }, + { + "epoch": 1.876, + "eval_accuracy": 0.99, + "eval_loss": 0.04160862788558006, + "eval_runtime": 29.5832, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 469 + }, + { + "epoch": 1.88, + "grad_norm": 0.0262451171875, + "learning_rate": 3.12e-05, + "loss": 0.0001, + "step": 470 + }, + { + "epoch": 1.88, + "eval_accuracy": 0.99, + "eval_loss": 0.04169413074851036, + "eval_runtime": 29.6343, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 470 + }, + { + "epoch": 1.884, + "grad_norm": 0.0177001953125, + "learning_rate": 3.116e-05, + "loss": 0.0, + "step": 471 + }, + { + "epoch": 1.884, + "eval_accuracy": 0.99, + "eval_loss": 0.04050302132964134, + "eval_runtime": 29.5854, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 471 + }, + { + "epoch": 1.888, + "grad_norm": 2.078125, + "learning_rate": 3.112e-05, + "loss": 0.0058, + "step": 472 + }, + { + "epoch": 1.888, + "eval_accuracy": 0.99, + "eval_loss": 0.041037626564502716, + "eval_runtime": 29.6041, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 472 + }, + { + "epoch": 1.892, + "grad_norm": 56.25, + "learning_rate": 3.108e-05, + "loss": 0.4121, + "step": 473 + }, + { + "epoch": 1.892, + "eval_accuracy": 0.99, + "eval_loss": 0.04145205020904541, + "eval_runtime": 29.5967, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 473 + }, + { + "epoch": 1.896, + "grad_norm": 0.015869140625, + "learning_rate": 3.104e-05, + "loss": 0.0001, + "step": 474 + }, + { + "epoch": 1.896, + "eval_accuracy": 0.988, + "eval_loss": 0.04072720184922218, + "eval_runtime": 29.581, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 474 + }, + { + "epoch": 1.9, + "grad_norm": 0.0013275146484375, + "learning_rate": 3.1e-05, + "loss": 0.0, + "step": 475 + }, + { + "epoch": 1.9, + "eval_accuracy": 0.988, + "eval_loss": 0.04090461879968643, + "eval_runtime": 29.5731, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 475 + }, + { + "epoch": 1.904, + "grad_norm": 7.264316082000732e-07, + "learning_rate": 3.096e-05, + "loss": 0.0, + "step": 476 + }, + { + "epoch": 1.904, + "eval_accuracy": 0.99, + "eval_loss": 0.04181768000125885, + "eval_runtime": 29.5946, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 476 + }, + { + "epoch": 1.908, + "grad_norm": 6.341934204101562e-05, + "learning_rate": 3.092e-05, + "loss": 0.0, + "step": 477 + }, + { + "epoch": 1.908, + "eval_accuracy": 0.988, + "eval_loss": 0.04083319753408432, + "eval_runtime": 29.6494, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 477 + }, + { + "epoch": 1.912, + "grad_norm": 0.00225830078125, + "learning_rate": 3.088e-05, + "loss": 0.0, + "step": 478 + }, + { + "epoch": 1.912, + "eval_accuracy": 0.988, + "eval_loss": 0.04145211726427078, + "eval_runtime": 29.6044, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 478 + }, + { + "epoch": 1.916, + "grad_norm": 1.1640625, + "learning_rate": 3.084e-05, + "loss": 0.004, + "step": 479 + }, + { + "epoch": 1.916, + "eval_accuracy": 0.988, + "eval_loss": 0.04327717423439026, + "eval_runtime": 29.6485, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 479 + }, + { + "epoch": 1.92, + "grad_norm": 4.9591064453125e-05, + "learning_rate": 3.08e-05, + "loss": 0.0, + "step": 480 + }, + { + "epoch": 1.92, + "eval_accuracy": 0.986, + "eval_loss": 0.04455342888832092, + "eval_runtime": 29.5948, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 480 + }, + { + "epoch": 1.924, + "grad_norm": 82.0, + "learning_rate": 3.076e-05, + "loss": 0.3281, + "step": 481 + }, + { + "epoch": 1.924, + "eval_accuracy": 0.988, + "eval_loss": 0.046624235808849335, + "eval_runtime": 29.6468, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 481 + }, + { + "epoch": 1.928, + "grad_norm": 2.25, + "learning_rate": 3.072e-05, + "loss": 0.0092, + "step": 482 + }, + { + "epoch": 1.928, + "eval_accuracy": 0.986, + "eval_loss": 0.04853527992963791, + "eval_runtime": 29.5771, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 482 + }, + { + "epoch": 1.932, + "grad_norm": 0.01422119140625, + "learning_rate": 3.0680000000000004e-05, + "loss": 0.0, + "step": 483 + }, + { + "epoch": 1.932, + "eval_accuracy": 0.988, + "eval_loss": 0.049415215849876404, + "eval_runtime": 29.5623, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 483 + }, + { + "epoch": 1.936, + "grad_norm": 75.5, + "learning_rate": 3.0640000000000005e-05, + "loss": 0.9961, + "step": 484 + }, + { + "epoch": 1.936, + "eval_accuracy": 0.986, + "eval_loss": 0.0494716539978981, + "eval_runtime": 29.6256, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.127, + "step": 484 + }, + { + "epoch": 1.94, + "grad_norm": 0.02587890625, + "learning_rate": 3.06e-05, + "loss": 0.0001, + "step": 485 + }, + { + "epoch": 1.94, + "eval_accuracy": 0.99, + "eval_loss": 0.050509169697761536, + "eval_runtime": 29.6255, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.127, + "step": 485 + }, + { + "epoch": 1.944, + "grad_norm": 0.0016937255859375, + "learning_rate": 3.056e-05, + "loss": 0.0, + "step": 486 + }, + { + "epoch": 1.944, + "eval_accuracy": 0.988, + "eval_loss": 0.05192381888628006, + "eval_runtime": 29.6167, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 486 + }, + { + "epoch": 1.948, + "grad_norm": 0.035888671875, + "learning_rate": 3.0520000000000006e-05, + "loss": 0.0001, + "step": 487 + }, + { + "epoch": 1.948, + "eval_accuracy": 0.99, + "eval_loss": 0.0512448213994503, + "eval_runtime": 29.6297, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 487 + }, + { + "epoch": 1.952, + "grad_norm": 33.75, + "learning_rate": 3.0480000000000003e-05, + "loss": 0.1436, + "step": 488 + }, + { + "epoch": 1.952, + "eval_accuracy": 0.99, + "eval_loss": 0.05024087429046631, + "eval_runtime": 29.5758, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 488 + }, + { + "epoch": 1.956, + "grad_norm": 0.0269775390625, + "learning_rate": 3.0440000000000003e-05, + "loss": 0.0001, + "step": 489 + }, + { + "epoch": 1.956, + "eval_accuracy": 0.99, + "eval_loss": 0.0507158525288105, + "eval_runtime": 29.5742, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 489 + }, + { + "epoch": 1.96, + "grad_norm": 0.000186920166015625, + "learning_rate": 3.04e-05, + "loss": 0.0, + "step": 490 + }, + { + "epoch": 1.96, + "eval_accuracy": 0.99, + "eval_loss": 0.05065899342298508, + "eval_runtime": 29.6215, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 490 + }, + { + "epoch": 1.964, + "grad_norm": 24.875, + "learning_rate": 3.036e-05, + "loss": 0.0869, + "step": 491 + }, + { + "epoch": 1.964, + "eval_accuracy": 0.99, + "eval_loss": 0.05039619281888008, + "eval_runtime": 29.6225, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 491 + }, + { + "epoch": 1.968, + "grad_norm": 2.5153160095214844e-05, + "learning_rate": 3.0320000000000004e-05, + "loss": 0.0, + "step": 492 + }, + { + "epoch": 1.968, + "eval_accuracy": 0.99, + "eval_loss": 0.050637245178222656, + "eval_runtime": 29.6394, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.126, + "step": 492 + }, + { + "epoch": 1.972, + "grad_norm": 0.00016689300537109375, + "learning_rate": 3.028e-05, + "loss": 0.0, + "step": 493 + }, + { + "epoch": 1.972, + "eval_accuracy": 0.99, + "eval_loss": 0.05068213492631912, + "eval_runtime": 29.6581, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.124, + "step": 493 + }, + { + "epoch": 1.976, + "grad_norm": 0.13671875, + "learning_rate": 3.0240000000000002e-05, + "loss": 0.0004, + "step": 494 + }, + { + "epoch": 1.976, + "eval_accuracy": 0.99, + "eval_loss": 0.0499936044216156, + "eval_runtime": 29.6512, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 494 + }, + { + "epoch": 1.98, + "grad_norm": 147.0, + "learning_rate": 3.02e-05, + "loss": 0.6328, + "step": 495 + }, + { + "epoch": 1.98, + "eval_accuracy": 0.99, + "eval_loss": 0.04688640683889389, + "eval_runtime": 29.5956, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 495 + }, + { + "epoch": 1.984, + "grad_norm": 0.0400390625, + "learning_rate": 3.016e-05, + "loss": 0.0002, + "step": 496 + }, + { + "epoch": 1.984, + "eval_accuracy": 0.99, + "eval_loss": 0.04589138180017471, + "eval_runtime": 29.6434, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 496 + }, + { + "epoch": 1.988, + "grad_norm": 6.5, + "learning_rate": 3.0120000000000003e-05, + "loss": 0.0183, + "step": 497 + }, + { + "epoch": 1.988, + "eval_accuracy": 0.99, + "eval_loss": 0.04594644531607628, + "eval_runtime": 29.5803, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 497 + }, + { + "epoch": 1.992, + "grad_norm": 0.000873565673828125, + "learning_rate": 3.0080000000000003e-05, + "loss": 0.0, + "step": 498 + }, + { + "epoch": 1.992, + "eval_accuracy": 0.99, + "eval_loss": 0.045654188841581345, + "eval_runtime": 29.5742, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 498 + }, + { + "epoch": 1.996, + "grad_norm": 0.0019989013671875, + "learning_rate": 3.004e-05, + "loss": 0.0, + "step": 499 + }, + { + "epoch": 1.996, + "eval_accuracy": 0.99, + "eval_loss": 0.04669250175356865, + "eval_runtime": 29.6277, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 499 + }, + { + "epoch": 2.0, + "grad_norm": 0.107421875, + "learning_rate": 3e-05, + "loss": 0.0005, + "step": 500 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.992, + "eval_loss": 0.04524150863289833, + "eval_runtime": 29.5937, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 500 + }, + { + "epoch": 2.004, + "grad_norm": 0.038818359375, + "learning_rate": 2.9959999999999998e-05, + "loss": 0.0002, + "step": 501 + }, + { + "epoch": 2.004, + "eval_accuracy": 0.99, + "eval_loss": 0.045684993267059326, + "eval_runtime": 29.5055, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 501 + }, + { + "epoch": 2.008, + "grad_norm": 1.3113021850585938e-06, + "learning_rate": 2.9920000000000005e-05, + "loss": 0.0, + "step": 502 + }, + { + "epoch": 2.008, + "eval_accuracy": 0.99, + "eval_loss": 0.04524633288383484, + "eval_runtime": 29.4251, + "eval_samples_per_second": 16.992, + "eval_steps_per_second": 2.141, + "step": 502 + }, + { + "epoch": 2.012, + "grad_norm": 0.66796875, + "learning_rate": 2.9880000000000002e-05, + "loss": 0.0024, + "step": 503 + }, + { + "epoch": 2.012, + "eval_accuracy": 0.99, + "eval_loss": 0.04375443980097771, + "eval_runtime": 29.4976, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.136, + "step": 503 + }, + { + "epoch": 2.016, + "grad_norm": 0.00013637542724609375, + "learning_rate": 2.9840000000000002e-05, + "loss": 0.0, + "step": 504 + }, + { + "epoch": 2.016, + "eval_accuracy": 0.99, + "eval_loss": 0.042756035923957825, + "eval_runtime": 29.5527, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 504 + }, + { + "epoch": 2.02, + "grad_norm": 3.65625, + "learning_rate": 2.98e-05, + "loss": 0.0171, + "step": 505 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.99, + "eval_loss": 0.042427610605955124, + "eval_runtime": 29.6155, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 505 + }, + { + "epoch": 2.024, + "grad_norm": 0.000743865966796875, + "learning_rate": 2.976e-05, + "loss": 0.0, + "step": 506 + }, + { + "epoch": 2.024, + "eval_accuracy": 0.99, + "eval_loss": 0.04111619293689728, + "eval_runtime": 29.6151, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 506 + }, + { + "epoch": 2.028, + "grad_norm": 7.75, + "learning_rate": 2.9720000000000003e-05, + "loss": 0.0327, + "step": 507 + }, + { + "epoch": 2.028, + "eval_accuracy": 0.99, + "eval_loss": 0.04136914387345314, + "eval_runtime": 29.6327, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 507 + }, + { + "epoch": 2.032, + "grad_norm": 0.0390625, + "learning_rate": 2.9680000000000004e-05, + "loss": 0.0001, + "step": 508 + }, + { + "epoch": 2.032, + "eval_accuracy": 0.99, + "eval_loss": 0.041296690702438354, + "eval_runtime": 29.5786, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 508 + }, + { + "epoch": 2.036, + "grad_norm": 0.0189208984375, + "learning_rate": 2.964e-05, + "loss": 0.0, + "step": 509 + }, + { + "epoch": 2.036, + "eval_accuracy": 0.99, + "eval_loss": 0.0424400195479393, + "eval_runtime": 29.6327, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 509 + }, + { + "epoch": 2.04, + "grad_norm": 17.25, + "learning_rate": 2.96e-05, + "loss": 0.0918, + "step": 510 + }, + { + "epoch": 2.04, + "eval_accuracy": 0.99, + "eval_loss": 0.04133867472410202, + "eval_runtime": 29.6187, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 510 + }, + { + "epoch": 2.044, + "grad_norm": 7.43865966796875e-05, + "learning_rate": 2.9559999999999998e-05, + "loss": 0.0, + "step": 511 + }, + { + "epoch": 2.044, + "eval_accuracy": 0.99, + "eval_loss": 0.04067375138401985, + "eval_runtime": 29.5727, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 511 + }, + { + "epoch": 2.048, + "grad_norm": 4.0, + "learning_rate": 2.9520000000000002e-05, + "loss": 0.0111, + "step": 512 + }, + { + "epoch": 2.048, + "eval_accuracy": 0.99, + "eval_loss": 0.040394917130470276, + "eval_runtime": 29.6184, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 512 + }, + { + "epoch": 2.052, + "grad_norm": 0.0019989013671875, + "learning_rate": 2.9480000000000002e-05, + "loss": 0.0, + "step": 513 + }, + { + "epoch": 2.052, + "eval_accuracy": 0.99, + "eval_loss": 0.03869184851646423, + "eval_runtime": 29.5871, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 513 + }, + { + "epoch": 2.056, + "grad_norm": 0.08251953125, + "learning_rate": 2.944e-05, + "loss": 0.0003, + "step": 514 + }, + { + "epoch": 2.056, + "eval_accuracy": 0.99, + "eval_loss": 0.03792296722531319, + "eval_runtime": 29.5852, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 514 + }, + { + "epoch": 2.06, + "grad_norm": 8.821487426757812e-05, + "learning_rate": 2.94e-05, + "loss": 0.0, + "step": 515 + }, + { + "epoch": 2.06, + "eval_accuracy": 0.99, + "eval_loss": 0.036994513124227524, + "eval_runtime": 29.5719, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 515 + }, + { + "epoch": 2.064, + "grad_norm": 6.71875, + "learning_rate": 2.9360000000000003e-05, + "loss": 0.0181, + "step": 516 + }, + { + "epoch": 2.064, + "eval_accuracy": 0.99, + "eval_loss": 0.036868028342723846, + "eval_runtime": 29.5693, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 516 + }, + { + "epoch": 2.068, + "grad_norm": 0.44921875, + "learning_rate": 2.9320000000000004e-05, + "loss": 0.0023, + "step": 517 + }, + { + "epoch": 2.068, + "eval_accuracy": 0.992, + "eval_loss": 0.03566519543528557, + "eval_runtime": 29.5766, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 517 + }, + { + "epoch": 2.072, + "grad_norm": 5.1975250244140625e-05, + "learning_rate": 2.928e-05, + "loss": 0.0, + "step": 518 + }, + { + "epoch": 2.072, + "eval_accuracy": 0.992, + "eval_loss": 0.03529760241508484, + "eval_runtime": 29.5679, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 518 + }, + { + "epoch": 2.076, + "grad_norm": 0.0751953125, + "learning_rate": 2.924e-05, + "loss": 0.0003, + "step": 519 + }, + { + "epoch": 2.076, + "eval_accuracy": 0.992, + "eval_loss": 0.035436466336250305, + "eval_runtime": 29.6188, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 519 + }, + { + "epoch": 2.08, + "grad_norm": 2.359375, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.004, + "step": 520 + }, + { + "epoch": 2.08, + "eval_accuracy": 0.992, + "eval_loss": 0.03488504886627197, + "eval_runtime": 29.6339, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 520 + }, + { + "epoch": 2.084, + "grad_norm": 0.00836181640625, + "learning_rate": 2.9160000000000005e-05, + "loss": 0.0, + "step": 521 + }, + { + "epoch": 2.084, + "eval_accuracy": 0.992, + "eval_loss": 0.03614828363060951, + "eval_runtime": 29.5878, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 521 + }, + { + "epoch": 2.088, + "grad_norm": 0.045654296875, + "learning_rate": 2.9120000000000002e-05, + "loss": 0.0002, + "step": 522 + }, + { + "epoch": 2.088, + "eval_accuracy": 0.992, + "eval_loss": 0.03571697324514389, + "eval_runtime": 29.5809, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 522 + }, + { + "epoch": 2.092, + "grad_norm": 0.007171630859375, + "learning_rate": 2.9080000000000003e-05, + "loss": 0.0, + "step": 523 + }, + { + "epoch": 2.092, + "eval_accuracy": 0.992, + "eval_loss": 0.03579781576991081, + "eval_runtime": 29.5652, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 523 + }, + { + "epoch": 2.096, + "grad_norm": 2.625, + "learning_rate": 2.904e-05, + "loss": 0.0062, + "step": 524 + }, + { + "epoch": 2.096, + "eval_accuracy": 0.992, + "eval_loss": 0.035188838839530945, + "eval_runtime": 29.6283, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 524 + }, + { + "epoch": 2.1, + "grad_norm": 0.0006561279296875, + "learning_rate": 2.9e-05, + "loss": 0.0, + "step": 525 + }, + { + "epoch": 2.1, + "eval_accuracy": 0.992, + "eval_loss": 0.035695482045412064, + "eval_runtime": 29.574, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 525 + }, + { + "epoch": 2.104, + "grad_norm": 5.8650970458984375e-05, + "learning_rate": 2.8960000000000004e-05, + "loss": 0.0, + "step": 526 + }, + { + "epoch": 2.104, + "eval_accuracy": 0.992, + "eval_loss": 0.03589154779911041, + "eval_runtime": 29.5799, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 526 + }, + { + "epoch": 2.108, + "grad_norm": 0.1259765625, + "learning_rate": 2.8920000000000004e-05, + "loss": 0.0005, + "step": 527 + }, + { + "epoch": 2.108, + "eval_accuracy": 0.992, + "eval_loss": 0.03575721010565758, + "eval_runtime": 29.5872, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 527 + }, + { + "epoch": 2.112, + "grad_norm": 0.0869140625, + "learning_rate": 2.888e-05, + "loss": 0.0003, + "step": 528 + }, + { + "epoch": 2.112, + "eval_accuracy": 0.992, + "eval_loss": 0.03624149039387703, + "eval_runtime": 29.6038, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 528 + }, + { + "epoch": 2.116, + "grad_norm": 0.0015716552734375, + "learning_rate": 2.8840000000000002e-05, + "loss": 0.0, + "step": 529 + }, + { + "epoch": 2.116, + "eval_accuracy": 0.992, + "eval_loss": 0.0360187329351902, + "eval_runtime": 29.5864, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 529 + }, + { + "epoch": 2.12, + "grad_norm": 2.7179718017578125e-05, + "learning_rate": 2.88e-05, + "loss": 0.0, + "step": 530 + }, + { + "epoch": 2.12, + "eval_accuracy": 0.992, + "eval_loss": 0.036163799464702606, + "eval_runtime": 29.5753, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 530 + }, + { + "epoch": 2.124, + "grad_norm": 1.7881393432617188e-05, + "learning_rate": 2.8760000000000002e-05, + "loss": 0.0, + "step": 531 + }, + { + "epoch": 2.124, + "eval_accuracy": 0.992, + "eval_loss": 0.03639086335897446, + "eval_runtime": 29.6256, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.127, + "step": 531 + }, + { + "epoch": 2.128, + "grad_norm": 6.891787052154541e-07, + "learning_rate": 2.8720000000000003e-05, + "loss": 0.0, + "step": 532 + }, + { + "epoch": 2.128, + "eval_accuracy": 0.992, + "eval_loss": 0.035950351506471634, + "eval_runtime": 29.5791, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 532 + }, + { + "epoch": 2.132, + "grad_norm": 0.08544921875, + "learning_rate": 2.868e-05, + "loss": 0.0003, + "step": 533 + }, + { + "epoch": 2.132, + "eval_accuracy": 0.992, + "eval_loss": 0.03563554212450981, + "eval_runtime": 29.6243, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 533 + }, + { + "epoch": 2.136, + "grad_norm": 0.154296875, + "learning_rate": 2.864e-05, + "loss": 0.0004, + "step": 534 + }, + { + "epoch": 2.136, + "eval_accuracy": 0.992, + "eval_loss": 0.035604532808065414, + "eval_runtime": 29.5854, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 534 + }, + { + "epoch": 2.14, + "grad_norm": 63.5, + "learning_rate": 2.86e-05, + "loss": 0.5078, + "step": 535 + }, + { + "epoch": 2.14, + "eval_accuracy": 0.992, + "eval_loss": 0.03586398437619209, + "eval_runtime": 29.5814, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 535 + }, + { + "epoch": 2.144, + "grad_norm": 6.6875, + "learning_rate": 2.8560000000000004e-05, + "loss": 0.014, + "step": 536 + }, + { + "epoch": 2.144, + "eval_accuracy": 0.99, + "eval_loss": 0.03639049455523491, + "eval_runtime": 29.6192, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 536 + }, + { + "epoch": 2.148, + "grad_norm": 1.735985279083252e-06, + "learning_rate": 2.852e-05, + "loss": 0.0, + "step": 537 + }, + { + "epoch": 2.148, + "eval_accuracy": 0.99, + "eval_loss": 0.03739704564213753, + "eval_runtime": 29.6289, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 537 + }, + { + "epoch": 2.152, + "grad_norm": 7.75, + "learning_rate": 2.8480000000000002e-05, + "loss": 0.0119, + "step": 538 + }, + { + "epoch": 2.152, + "eval_accuracy": 0.99, + "eval_loss": 0.037241335958242416, + "eval_runtime": 29.6106, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 538 + }, + { + "epoch": 2.156, + "grad_norm": 0.1259765625, + "learning_rate": 2.844e-05, + "loss": 0.0006, + "step": 539 + }, + { + "epoch": 2.156, + "eval_accuracy": 0.99, + "eval_loss": 0.03755702078342438, + "eval_runtime": 29.5738, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 539 + }, + { + "epoch": 2.16, + "grad_norm": 2.9653310775756836e-06, + "learning_rate": 2.84e-05, + "loss": 0.0, + "step": 540 + }, + { + "epoch": 2.16, + "eval_accuracy": 0.99, + "eval_loss": 0.03782615065574646, + "eval_runtime": 29.5767, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 540 + }, + { + "epoch": 2.164, + "grad_norm": 0.03173828125, + "learning_rate": 2.8360000000000003e-05, + "loss": 0.0001, + "step": 541 + }, + { + "epoch": 2.164, + "eval_accuracy": 0.99, + "eval_loss": 0.03809650242328644, + "eval_runtime": 29.5756, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 541 + }, + { + "epoch": 2.168, + "grad_norm": 0.03271484375, + "learning_rate": 2.8320000000000003e-05, + "loss": 0.0001, + "step": 542 + }, + { + "epoch": 2.168, + "eval_accuracy": 0.988, + "eval_loss": 0.038281407207250595, + "eval_runtime": 29.6166, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 542 + }, + { + "epoch": 2.172, + "grad_norm": 2.6941299438476562e-05, + "learning_rate": 2.828e-05, + "loss": 0.0, + "step": 543 + }, + { + "epoch": 2.172, + "eval_accuracy": 0.988, + "eval_loss": 0.03809748962521553, + "eval_runtime": 29.6234, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 543 + }, + { + "epoch": 2.176, + "grad_norm": 0.0002460479736328125, + "learning_rate": 2.824e-05, + "loss": 0.0, + "step": 544 + }, + { + "epoch": 2.176, + "eval_accuracy": 0.988, + "eval_loss": 0.038248710334300995, + "eval_runtime": 29.6504, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.125, + "step": 544 + }, + { + "epoch": 2.18, + "grad_norm": 0.0028839111328125, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.0, + "step": 545 + }, + { + "epoch": 2.18, + "eval_accuracy": 0.988, + "eval_loss": 0.03828221186995506, + "eval_runtime": 29.6443, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 545 + }, + { + "epoch": 2.184, + "grad_norm": 5.555152893066406e-05, + "learning_rate": 2.816e-05, + "loss": 0.0, + "step": 546 + }, + { + "epoch": 2.184, + "eval_accuracy": 0.988, + "eval_loss": 0.03876853361725807, + "eval_runtime": 29.6312, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 546 + }, + { + "epoch": 2.188, + "grad_norm": 0.005889892578125, + "learning_rate": 2.8120000000000002e-05, + "loss": 0.0, + "step": 547 + }, + { + "epoch": 2.188, + "eval_accuracy": 0.988, + "eval_loss": 0.03910652920603752, + "eval_runtime": 29.6349, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 547 + }, + { + "epoch": 2.192, + "grad_norm": 0.2294921875, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.0009, + "step": 548 + }, + { + "epoch": 2.192, + "eval_accuracy": 0.988, + "eval_loss": 0.03857544809579849, + "eval_runtime": 29.6678, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.124, + "step": 548 + }, + { + "epoch": 2.196, + "grad_norm": 1.5735626220703125e-05, + "learning_rate": 2.804e-05, + "loss": 0.0, + "step": 549 + }, + { + "epoch": 2.196, + "eval_accuracy": 0.988, + "eval_loss": 0.03866400197148323, + "eval_runtime": 29.6496, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 549 + }, + { + "epoch": 2.2, + "grad_norm": 0.0003662109375, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0, + "step": 550 + }, + { + "epoch": 2.2, + "eval_accuracy": 0.988, + "eval_loss": 0.038353774696588516, + "eval_runtime": 29.6054, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 550 + }, + { + "epoch": 2.204, + "grad_norm": 0.0008087158203125, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.0, + "step": 551 + }, + { + "epoch": 2.204, + "eval_accuracy": 0.988, + "eval_loss": 0.03830600157380104, + "eval_runtime": 29.4999, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.136, + "step": 551 + }, + { + "epoch": 2.208, + "grad_norm": 8.8125, + "learning_rate": 2.792e-05, + "loss": 0.0449, + "step": 552 + }, + { + "epoch": 2.208, + "eval_accuracy": 0.988, + "eval_loss": 0.03918537124991417, + "eval_runtime": 29.466, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.138, + "step": 552 + }, + { + "epoch": 2.212, + "grad_norm": 0.01171875, + "learning_rate": 2.788e-05, + "loss": 0.0001, + "step": 553 + }, + { + "epoch": 2.212, + "eval_accuracy": 0.988, + "eval_loss": 0.039132971316576004, + "eval_runtime": 29.5489, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 553 + }, + { + "epoch": 2.216, + "grad_norm": 82.0, + "learning_rate": 2.7839999999999998e-05, + "loss": 0.7734, + "step": 554 + }, + { + "epoch": 2.216, + "eval_accuracy": 0.988, + "eval_loss": 0.04014599695801735, + "eval_runtime": 29.548, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.132, + "step": 554 + }, + { + "epoch": 2.22, + "grad_norm": 0.006927490234375, + "learning_rate": 2.7800000000000005e-05, + "loss": 0.0, + "step": 555 + }, + { + "epoch": 2.22, + "eval_accuracy": 0.99, + "eval_loss": 0.04191231355071068, + "eval_runtime": 29.5661, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 555 + }, + { + "epoch": 2.224, + "grad_norm": 4.410743713378906e-05, + "learning_rate": 2.7760000000000002e-05, + "loss": 0.0, + "step": 556 + }, + { + "epoch": 2.224, + "eval_accuracy": 0.99, + "eval_loss": 0.04290500655770302, + "eval_runtime": 29.5724, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 556 + }, + { + "epoch": 2.228, + "grad_norm": 0.0091552734375, + "learning_rate": 2.7720000000000002e-05, + "loss": 0.0, + "step": 557 + }, + { + "epoch": 2.228, + "eval_accuracy": 0.99, + "eval_loss": 0.04387121647596359, + "eval_runtime": 29.5819, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 557 + }, + { + "epoch": 2.232, + "grad_norm": 0.00021457672119140625, + "learning_rate": 2.768e-05, + "loss": 0.0, + "step": 558 + }, + { + "epoch": 2.232, + "eval_accuracy": 0.99, + "eval_loss": 0.044530417770147324, + "eval_runtime": 29.6189, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 558 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 4.470348358154297e-06, + "learning_rate": 2.764e-05, + "loss": 0.0, + "step": 559 + }, + { + "epoch": 2.2359999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.0453888438642025, + "eval_runtime": 29.6242, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 559 + }, + { + "epoch": 2.24, + "grad_norm": 0.00121307373046875, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.0, + "step": 560 + }, + { + "epoch": 2.24, + "eval_accuracy": 0.99, + "eval_loss": 0.04467358812689781, + "eval_runtime": 29.6355, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 560 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 0.00653076171875, + "learning_rate": 2.7560000000000004e-05, + "loss": 0.0, + "step": 561 + }, + { + "epoch": 2.2439999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04584978520870209, + "eval_runtime": 29.5928, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 561 + }, + { + "epoch": 2.248, + "grad_norm": 0.271484375, + "learning_rate": 2.752e-05, + "loss": 0.0004, + "step": 562 + }, + { + "epoch": 2.248, + "eval_accuracy": 0.99, + "eval_loss": 0.04464581981301308, + "eval_runtime": 29.5749, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 562 + }, + { + "epoch": 2.252, + "grad_norm": 0.2060546875, + "learning_rate": 2.748e-05, + "loss": 0.0007, + "step": 563 + }, + { + "epoch": 2.252, + "eval_accuracy": 0.99, + "eval_loss": 0.045410629361867905, + "eval_runtime": 29.5689, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 563 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.020751953125, + "learning_rate": 2.7439999999999998e-05, + "loss": 0.0, + "step": 564 + }, + { + "epoch": 2.2560000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.045135047286748886, + "eval_runtime": 29.5649, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 564 + }, + { + "epoch": 2.26, + "grad_norm": 338.0, + "learning_rate": 2.7400000000000002e-05, + "loss": 1.375, + "step": 565 + }, + { + "epoch": 2.26, + "eval_accuracy": 0.99, + "eval_loss": 0.04230556637048721, + "eval_runtime": 29.5705, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.13, + "step": 565 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 0.00106048583984375, + "learning_rate": 2.7360000000000002e-05, + "loss": 0.0, + "step": 566 + }, + { + "epoch": 2.2640000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04303897172212601, + "eval_runtime": 29.6145, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 566 + }, + { + "epoch": 2.268, + "grad_norm": 7.59375, + "learning_rate": 2.7320000000000003e-05, + "loss": 0.0469, + "step": 567 + }, + { + "epoch": 2.268, + "eval_accuracy": 0.99, + "eval_loss": 0.04300323501229286, + "eval_runtime": 29.6222, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 567 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.00164031982421875, + "learning_rate": 2.728e-05, + "loss": 0.0, + "step": 568 + }, + { + "epoch": 2.2720000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.043842192739248276, + "eval_runtime": 29.5865, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 568 + }, + { + "epoch": 2.276, + "grad_norm": 0.00022411346435546875, + "learning_rate": 2.724e-05, + "loss": 0.0, + "step": 569 + }, + { + "epoch": 2.276, + "eval_accuracy": 0.99, + "eval_loss": 0.043567270040512085, + "eval_runtime": 29.6249, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 569 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 35.25, + "learning_rate": 2.7200000000000004e-05, + "loss": 0.1621, + "step": 570 + }, + { + "epoch": 2.2800000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04446366801857948, + "eval_runtime": 29.5686, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 570 + }, + { + "epoch": 2.284, + "grad_norm": 0.275390625, + "learning_rate": 2.716e-05, + "loss": 0.0011, + "step": 571 + }, + { + "epoch": 2.284, + "eval_accuracy": 0.99, + "eval_loss": 0.04352005943655968, + "eval_runtime": 29.5762, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 571 + }, + { + "epoch": 2.288, + "grad_norm": 5.91278076171875e-05, + "learning_rate": 2.712e-05, + "loss": 0.0, + "step": 572 + }, + { + "epoch": 2.288, + "eval_accuracy": 0.99, + "eval_loss": 0.04475704953074455, + "eval_runtime": 29.5691, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 572 + }, + { + "epoch": 2.292, + "grad_norm": 1.1015625, + "learning_rate": 2.7079999999999998e-05, + "loss": 0.0015, + "step": 573 + }, + { + "epoch": 2.292, + "eval_accuracy": 0.99, + "eval_loss": 0.045727815479040146, + "eval_runtime": 29.5656, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 573 + }, + { + "epoch": 2.296, + "grad_norm": 4.112720489501953e-06, + "learning_rate": 2.704e-05, + "loss": 0.0, + "step": 574 + }, + { + "epoch": 2.296, + "eval_accuracy": 0.99, + "eval_loss": 0.04615223407745361, + "eval_runtime": 29.5708, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.13, + "step": 574 + }, + { + "epoch": 2.3, + "grad_norm": 0.1982421875, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0011, + "step": 575 + }, + { + "epoch": 2.3, + "eval_accuracy": 0.99, + "eval_loss": 0.046739671379327774, + "eval_runtime": 29.5619, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 575 + }, + { + "epoch": 2.304, + "grad_norm": 60.0, + "learning_rate": 2.6960000000000003e-05, + "loss": 0.4785, + "step": 576 + }, + { + "epoch": 2.304, + "eval_accuracy": 0.99, + "eval_loss": 0.045252349227666855, + "eval_runtime": 29.5507, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.132, + "step": 576 + }, + { + "epoch": 2.308, + "grad_norm": 0.031982421875, + "learning_rate": 2.692e-05, + "loss": 0.0001, + "step": 577 + }, + { + "epoch": 2.308, + "eval_accuracy": 0.99, + "eval_loss": 0.044204771518707275, + "eval_runtime": 29.5493, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 577 + }, + { + "epoch": 2.312, + "grad_norm": 0.000823974609375, + "learning_rate": 2.688e-05, + "loss": 0.0, + "step": 578 + }, + { + "epoch": 2.312, + "eval_accuracy": 0.99, + "eval_loss": 0.0439019501209259, + "eval_runtime": 29.6206, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 578 + }, + { + "epoch": 2.316, + "grad_norm": 1.296875, + "learning_rate": 2.6840000000000004e-05, + "loss": 0.0022, + "step": 579 + }, + { + "epoch": 2.316, + "eval_accuracy": 0.99, + "eval_loss": 0.04325909540057182, + "eval_runtime": 29.5595, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 579 + }, + { + "epoch": 2.32, + "grad_norm": 0.00311279296875, + "learning_rate": 2.6800000000000004e-05, + "loss": 0.0, + "step": 580 + }, + { + "epoch": 2.32, + "eval_accuracy": 0.99, + "eval_loss": 0.04274817556142807, + "eval_runtime": 29.5662, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 580 + }, + { + "epoch": 2.324, + "grad_norm": 0.6171875, + "learning_rate": 2.676e-05, + "loss": 0.0019, + "step": 581 + }, + { + "epoch": 2.324, + "eval_accuracy": 0.99, + "eval_loss": 0.04229842498898506, + "eval_runtime": 29.6213, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 581 + }, + { + "epoch": 2.328, + "grad_norm": 0.49609375, + "learning_rate": 2.672e-05, + "loss": 0.0012, + "step": 582 + }, + { + "epoch": 2.328, + "eval_accuracy": 0.992, + "eval_loss": 0.042954154312610626, + "eval_runtime": 29.619, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 582 + }, + { + "epoch": 2.332, + "grad_norm": 0.002349853515625, + "learning_rate": 2.668e-05, + "loss": 0.0, + "step": 583 + }, + { + "epoch": 2.332, + "eval_accuracy": 0.992, + "eval_loss": 0.04318434000015259, + "eval_runtime": 29.5736, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 583 + }, + { + "epoch": 2.336, + "grad_norm": 0.006683349609375, + "learning_rate": 2.6640000000000002e-05, + "loss": 0.0, + "step": 584 + }, + { + "epoch": 2.336, + "eval_accuracy": 0.992, + "eval_loss": 0.04294447600841522, + "eval_runtime": 29.6044, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 584 + }, + { + "epoch": 2.34, + "grad_norm": 1.0078125, + "learning_rate": 2.6600000000000003e-05, + "loss": 0.0029, + "step": 585 + }, + { + "epoch": 2.34, + "eval_accuracy": 0.992, + "eval_loss": 0.04387723654508591, + "eval_runtime": 29.598, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 585 + }, + { + "epoch": 2.344, + "grad_norm": 0.00653076171875, + "learning_rate": 2.6560000000000003e-05, + "loss": 0.0, + "step": 586 + }, + { + "epoch": 2.344, + "eval_accuracy": 0.992, + "eval_loss": 0.045031800866127014, + "eval_runtime": 29.5919, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 586 + }, + { + "epoch": 2.348, + "grad_norm": 0.00055694580078125, + "learning_rate": 2.652e-05, + "loss": 0.0, + "step": 587 + }, + { + "epoch": 2.348, + "eval_accuracy": 0.992, + "eval_loss": 0.04491906613111496, + "eval_runtime": 29.5624, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 587 + }, + { + "epoch": 2.352, + "grad_norm": 0.002655029296875, + "learning_rate": 2.648e-05, + "loss": 0.0, + "step": 588 + }, + { + "epoch": 2.352, + "eval_accuracy": 0.992, + "eval_loss": 0.045189693570137024, + "eval_runtime": 29.6102, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 588 + }, + { + "epoch": 2.356, + "grad_norm": 0.0106201171875, + "learning_rate": 2.6440000000000004e-05, + "loss": 0.0, + "step": 589 + }, + { + "epoch": 2.356, + "eval_accuracy": 0.992, + "eval_loss": 0.044962234795093536, + "eval_runtime": 29.5974, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 589 + }, + { + "epoch": 2.36, + "grad_norm": 0.185546875, + "learning_rate": 2.64e-05, + "loss": 0.0006, + "step": 590 + }, + { + "epoch": 2.36, + "eval_accuracy": 0.992, + "eval_loss": 0.04491714388132095, + "eval_runtime": 29.6017, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 590 + }, + { + "epoch": 2.364, + "grad_norm": 4.506111145019531e-05, + "learning_rate": 2.6360000000000002e-05, + "loss": 0.0, + "step": 591 + }, + { + "epoch": 2.364, + "eval_accuracy": 0.992, + "eval_loss": 0.04514503479003906, + "eval_runtime": 29.5615, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 591 + }, + { + "epoch": 2.368, + "grad_norm": 0.00063323974609375, + "learning_rate": 2.632e-05, + "loss": 0.0, + "step": 592 + }, + { + "epoch": 2.368, + "eval_accuracy": 0.992, + "eval_loss": 0.045434221625328064, + "eval_runtime": 29.5634, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 592 + }, + { + "epoch": 2.372, + "grad_norm": 0.484375, + "learning_rate": 2.628e-05, + "loss": 0.0019, + "step": 593 + }, + { + "epoch": 2.372, + "eval_accuracy": 0.992, + "eval_loss": 0.04527191072702408, + "eval_runtime": 29.5717, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 593 + }, + { + "epoch": 2.376, + "grad_norm": 0.004486083984375, + "learning_rate": 2.6240000000000003e-05, + "loss": 0.0, + "step": 594 + }, + { + "epoch": 2.376, + "eval_accuracy": 0.992, + "eval_loss": 0.04596880450844765, + "eval_runtime": 29.6112, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 594 + }, + { + "epoch": 2.38, + "grad_norm": 4.380941390991211e-06, + "learning_rate": 2.6200000000000003e-05, + "loss": 0.0, + "step": 595 + }, + { + "epoch": 2.38, + "eval_accuracy": 0.992, + "eval_loss": 0.046034738421440125, + "eval_runtime": 29.5739, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 595 + }, + { + "epoch": 2.384, + "grad_norm": 1.6640625, + "learning_rate": 2.616e-05, + "loss": 0.0053, + "step": 596 + }, + { + "epoch": 2.384, + "eval_accuracy": 0.992, + "eval_loss": 0.04650269076228142, + "eval_runtime": 29.6288, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 596 + }, + { + "epoch": 2.388, + "grad_norm": 2.03125, + "learning_rate": 2.612e-05, + "loss": 0.0045, + "step": 597 + }, + { + "epoch": 2.388, + "eval_accuracy": 0.992, + "eval_loss": 0.04628569632768631, + "eval_runtime": 29.6267, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.126, + "step": 597 + }, + { + "epoch": 2.392, + "grad_norm": 0.5625, + "learning_rate": 2.6079999999999998e-05, + "loss": 0.0023, + "step": 598 + }, + { + "epoch": 2.392, + "eval_accuracy": 0.992, + "eval_loss": 0.04611435905098915, + "eval_runtime": 29.5655, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 598 + }, + { + "epoch": 2.396, + "grad_norm": 0.0074462890625, + "learning_rate": 2.6040000000000005e-05, + "loss": 0.0, + "step": 599 + }, + { + "epoch": 2.396, + "eval_accuracy": 0.992, + "eval_loss": 0.04599509760737419, + "eval_runtime": 29.6063, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 599 + }, + { + "epoch": 2.4, + "grad_norm": 0.0380859375, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0002, + "step": 600 + }, + { + "epoch": 2.4, + "eval_accuracy": 0.992, + "eval_loss": 0.045650359243154526, + "eval_runtime": 29.618, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 600 + }, + { + "epoch": 2.404, + "grad_norm": 3.6656856536865234e-06, + "learning_rate": 2.5960000000000002e-05, + "loss": 0.0, + "step": 601 + }, + { + "epoch": 2.404, + "eval_accuracy": 0.992, + "eval_loss": 0.0458352193236351, + "eval_runtime": 29.5079, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.135, + "step": 601 + }, + { + "epoch": 2.408, + "grad_norm": 0.267578125, + "learning_rate": 2.592e-05, + "loss": 0.0005, + "step": 602 + }, + { + "epoch": 2.408, + "eval_accuracy": 0.992, + "eval_loss": 0.04518042132258415, + "eval_runtime": 29.4318, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.141, + "step": 602 + }, + { + "epoch": 2.412, + "grad_norm": 38.5, + "learning_rate": 2.588e-05, + "loss": 0.1582, + "step": 603 + }, + { + "epoch": 2.412, + "eval_accuracy": 0.992, + "eval_loss": 0.044891566038131714, + "eval_runtime": 29.5725, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 603 + }, + { + "epoch": 2.416, + "grad_norm": 0.054443359375, + "learning_rate": 2.5840000000000003e-05, + "loss": 0.0001, + "step": 604 + }, + { + "epoch": 2.416, + "eval_accuracy": 0.99, + "eval_loss": 0.04503171518445015, + "eval_runtime": 29.5991, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 604 + }, + { + "epoch": 2.42, + "grad_norm": 0.01092529296875, + "learning_rate": 2.58e-05, + "loss": 0.0, + "step": 605 + }, + { + "epoch": 2.42, + "eval_accuracy": 0.99, + "eval_loss": 0.04481023922562599, + "eval_runtime": 29.5904, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 605 + }, + { + "epoch": 2.424, + "grad_norm": 0.2578125, + "learning_rate": 2.576e-05, + "loss": 0.0005, + "step": 606 + }, + { + "epoch": 2.424, + "eval_accuracy": 0.99, + "eval_loss": 0.04564554616808891, + "eval_runtime": 29.5524, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 606 + }, + { + "epoch": 2.428, + "grad_norm": 94.0, + "learning_rate": 2.572e-05, + "loss": 0.6133, + "step": 607 + }, + { + "epoch": 2.428, + "eval_accuracy": 0.992, + "eval_loss": 0.046624038368463516, + "eval_runtime": 29.5535, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 607 + }, + { + "epoch": 2.432, + "grad_norm": 16.25, + "learning_rate": 2.5679999999999998e-05, + "loss": 0.0825, + "step": 608 + }, + { + "epoch": 2.432, + "eval_accuracy": 0.99, + "eval_loss": 0.0472550131380558, + "eval_runtime": 29.597, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 608 + }, + { + "epoch": 2.436, + "grad_norm": 1.8046875, + "learning_rate": 2.5640000000000002e-05, + "loss": 0.0049, + "step": 609 + }, + { + "epoch": 2.436, + "eval_accuracy": 0.99, + "eval_loss": 0.04957030341029167, + "eval_runtime": 29.6157, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 609 + }, + { + "epoch": 2.44, + "grad_norm": 0.0015106201171875, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.0, + "step": 610 + }, + { + "epoch": 2.44, + "eval_accuracy": 0.99, + "eval_loss": 0.052037451416254044, + "eval_runtime": 29.5702, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 610 + }, + { + "epoch": 2.444, + "grad_norm": 0.0224609375, + "learning_rate": 2.556e-05, + "loss": 0.0001, + "step": 611 + }, + { + "epoch": 2.444, + "eval_accuracy": 0.99, + "eval_loss": 0.05374222621321678, + "eval_runtime": 29.5682, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 611 + }, + { + "epoch": 2.448, + "grad_norm": 0.8515625, + "learning_rate": 2.552e-05, + "loss": 0.0024, + "step": 612 + }, + { + "epoch": 2.448, + "eval_accuracy": 0.99, + "eval_loss": 0.05670953169465065, + "eval_runtime": 29.6179, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 612 + }, + { + "epoch": 2.452, + "grad_norm": 9.685754776000977e-08, + "learning_rate": 2.5480000000000003e-05, + "loss": 0.0, + "step": 613 + }, + { + "epoch": 2.452, + "eval_accuracy": 0.99, + "eval_loss": 0.056698914617300034, + "eval_runtime": 29.6171, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 613 + }, + { + "epoch": 2.456, + "grad_norm": 0.01373291015625, + "learning_rate": 2.5440000000000004e-05, + "loss": 0.0, + "step": 614 + }, + { + "epoch": 2.456, + "eval_accuracy": 0.99, + "eval_loss": 0.05461630970239639, + "eval_runtime": 29.5695, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 614 + }, + { + "epoch": 2.46, + "grad_norm": 35.25, + "learning_rate": 2.54e-05, + "loss": 0.4902, + "step": 615 + }, + { + "epoch": 2.46, + "eval_accuracy": 0.99, + "eval_loss": 0.05321922153234482, + "eval_runtime": 29.6211, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 615 + }, + { + "epoch": 2.464, + "grad_norm": 6.15625, + "learning_rate": 2.536e-05, + "loss": 0.0317, + "step": 616 + }, + { + "epoch": 2.464, + "eval_accuracy": 0.99, + "eval_loss": 0.05391997843980789, + "eval_runtime": 29.5824, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 616 + }, + { + "epoch": 2.468, + "grad_norm": 0.076171875, + "learning_rate": 2.5319999999999998e-05, + "loss": 0.0002, + "step": 617 + }, + { + "epoch": 2.468, + "eval_accuracy": 0.99, + "eval_loss": 0.053858548402786255, + "eval_runtime": 29.615, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 617 + }, + { + "epoch": 2.472, + "grad_norm": 6.866455078125e-05, + "learning_rate": 2.5280000000000005e-05, + "loss": 0.0, + "step": 618 + }, + { + "epoch": 2.472, + "eval_accuracy": 0.99, + "eval_loss": 0.05427628383040428, + "eval_runtime": 29.5778, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 618 + }, + { + "epoch": 2.476, + "grad_norm": 0.05029296875, + "learning_rate": 2.5240000000000002e-05, + "loss": 0.0001, + "step": 619 + }, + { + "epoch": 2.476, + "eval_accuracy": 0.99, + "eval_loss": 0.053539618849754333, + "eval_runtime": 29.5689, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 619 + }, + { + "epoch": 2.48, + "grad_norm": 4.6193599700927734e-07, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.0, + "step": 620 + }, + { + "epoch": 2.48, + "eval_accuracy": 0.99, + "eval_loss": 0.053566209971904755, + "eval_runtime": 29.5814, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 620 + }, + { + "epoch": 2.484, + "grad_norm": 17.375, + "learning_rate": 2.516e-05, + "loss": 0.1006, + "step": 621 + }, + { + "epoch": 2.484, + "eval_accuracy": 0.99, + "eval_loss": 0.054140232503414154, + "eval_runtime": 29.5803, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 621 + }, + { + "epoch": 2.488, + "grad_norm": 0.3671875, + "learning_rate": 2.512e-05, + "loss": 0.0005, + "step": 622 + }, + { + "epoch": 2.488, + "eval_accuracy": 0.99, + "eval_loss": 0.05433822050690651, + "eval_runtime": 29.5876, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 622 + }, + { + "epoch": 2.492, + "grad_norm": 0.0027923583984375, + "learning_rate": 2.5080000000000004e-05, + "loss": 0.0, + "step": 623 + }, + { + "epoch": 2.492, + "eval_accuracy": 0.99, + "eval_loss": 0.054077088832855225, + "eval_runtime": 29.611, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 623 + }, + { + "epoch": 2.496, + "grad_norm": 3.2901763916015625e-05, + "learning_rate": 2.504e-05, + "loss": 0.0, + "step": 624 + }, + { + "epoch": 2.496, + "eval_accuracy": 0.99, + "eval_loss": 0.054623086005449295, + "eval_runtime": 29.606, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 624 + }, + { + "epoch": 2.5, + "grad_norm": 5.699694156646729e-07, + "learning_rate": 2.5e-05, + "loss": 0.0, + "step": 625 + }, + { + "epoch": 2.5, + "eval_accuracy": 0.99, + "eval_loss": 0.05433819815516472, + "eval_runtime": 29.5672, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 625 + }, + { + "epoch": 2.504, + "grad_norm": 1.049041748046875e-05, + "learning_rate": 2.496e-05, + "loss": 0.0, + "step": 626 + }, + { + "epoch": 2.504, + "eval_accuracy": 0.99, + "eval_loss": 0.05417303740978241, + "eval_runtime": 29.6213, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 626 + }, + { + "epoch": 2.508, + "grad_norm": 39.25, + "learning_rate": 2.4920000000000002e-05, + "loss": 0.126, + "step": 627 + }, + { + "epoch": 2.508, + "eval_accuracy": 0.99, + "eval_loss": 0.05326959863305092, + "eval_runtime": 29.6291, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 627 + }, + { + "epoch": 2.512, + "grad_norm": 0.96484375, + "learning_rate": 2.488e-05, + "loss": 0.0016, + "step": 628 + }, + { + "epoch": 2.512, + "eval_accuracy": 0.99, + "eval_loss": 0.05069474130868912, + "eval_runtime": 29.5648, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 628 + }, + { + "epoch": 2.516, + "grad_norm": 0.00836181640625, + "learning_rate": 2.4840000000000003e-05, + "loss": 0.0, + "step": 629 + }, + { + "epoch": 2.516, + "eval_accuracy": 0.99, + "eval_loss": 0.04954442009329796, + "eval_runtime": 29.5567, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.131, + "step": 629 + }, + { + "epoch": 2.52, + "grad_norm": 1.233816146850586e-05, + "learning_rate": 2.48e-05, + "loss": 0.0, + "step": 630 + }, + { + "epoch": 2.52, + "eval_accuracy": 0.99, + "eval_loss": 0.04910340905189514, + "eval_runtime": 29.5661, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 630 + }, + { + "epoch": 2.524, + "grad_norm": 0.01239013671875, + "learning_rate": 2.476e-05, + "loss": 0.0, + "step": 631 + }, + { + "epoch": 2.524, + "eval_accuracy": 0.99, + "eval_loss": 0.04852326214313507, + "eval_runtime": 29.5667, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 631 + }, + { + "epoch": 2.528, + "grad_norm": 0.00012874603271484375, + "learning_rate": 2.472e-05, + "loss": 0.0, + "step": 632 + }, + { + "epoch": 2.528, + "eval_accuracy": 0.99, + "eval_loss": 0.04870692268013954, + "eval_runtime": 29.6103, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 632 + }, + { + "epoch": 2.532, + "grad_norm": 1.5735626220703125e-05, + "learning_rate": 2.468e-05, + "loss": 0.0, + "step": 633 + }, + { + "epoch": 2.532, + "eval_accuracy": 0.99, + "eval_loss": 0.04821067303419113, + "eval_runtime": 29.5546, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 633 + }, + { + "epoch": 2.536, + "grad_norm": 0.005096435546875, + "learning_rate": 2.464e-05, + "loss": 0.0, + "step": 634 + }, + { + "epoch": 2.536, + "eval_accuracy": 0.99, + "eval_loss": 0.04783326014876366, + "eval_runtime": 29.5653, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 634 + }, + { + "epoch": 2.54, + "grad_norm": 3.765625, + "learning_rate": 2.46e-05, + "loss": 0.0087, + "step": 635 + }, + { + "epoch": 2.54, + "eval_accuracy": 0.99, + "eval_loss": 0.04818731173872948, + "eval_runtime": 29.5745, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 635 + }, + { + "epoch": 2.544, + "grad_norm": 0.000324249267578125, + "learning_rate": 2.4560000000000002e-05, + "loss": 0.0, + "step": 636 + }, + { + "epoch": 2.544, + "eval_accuracy": 0.99, + "eval_loss": 0.04767238348722458, + "eval_runtime": 29.6294, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 636 + }, + { + "epoch": 2.548, + "grad_norm": 0.00909423828125, + "learning_rate": 2.4520000000000002e-05, + "loss": 0.0, + "step": 637 + }, + { + "epoch": 2.548, + "eval_accuracy": 0.988, + "eval_loss": 0.0474889911711216, + "eval_runtime": 29.6395, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.126, + "step": 637 + }, + { + "epoch": 2.552, + "grad_norm": 1.3485550880432129e-06, + "learning_rate": 2.448e-05, + "loss": 0.0, + "step": 638 + }, + { + "epoch": 2.552, + "eval_accuracy": 0.988, + "eval_loss": 0.046911563724279404, + "eval_runtime": 29.63, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 638 + }, + { + "epoch": 2.556, + "grad_norm": 0.412109375, + "learning_rate": 2.4440000000000003e-05, + "loss": 0.0023, + "step": 639 + }, + { + "epoch": 2.556, + "eval_accuracy": 0.99, + "eval_loss": 0.04622817412018776, + "eval_runtime": 29.5628, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 639 + }, + { + "epoch": 2.56, + "grad_norm": 0.0220947265625, + "learning_rate": 2.44e-05, + "loss": 0.0001, + "step": 640 + }, + { + "epoch": 2.56, + "eval_accuracy": 0.988, + "eval_loss": 0.04699818044900894, + "eval_runtime": 29.5601, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 640 + }, + { + "epoch": 2.564, + "grad_norm": 0.00121307373046875, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.0, + "step": 641 + }, + { + "epoch": 2.564, + "eval_accuracy": 0.988, + "eval_loss": 0.0462079793214798, + "eval_runtime": 29.5659, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 641 + }, + { + "epoch": 2.568, + "grad_norm": 0.00909423828125, + "learning_rate": 2.432e-05, + "loss": 0.0, + "step": 642 + }, + { + "epoch": 2.568, + "eval_accuracy": 0.99, + "eval_loss": 0.046004801988601685, + "eval_runtime": 29.5529, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 642 + }, + { + "epoch": 2.572, + "grad_norm": 0.005767822265625, + "learning_rate": 2.428e-05, + "loss": 0.0, + "step": 643 + }, + { + "epoch": 2.572, + "eval_accuracy": 0.988, + "eval_loss": 0.04656299576163292, + "eval_runtime": 29.5672, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 643 + }, + { + "epoch": 2.576, + "grad_norm": 0.047607421875, + "learning_rate": 2.4240000000000002e-05, + "loss": 0.0002, + "step": 644 + }, + { + "epoch": 2.576, + "eval_accuracy": 0.99, + "eval_loss": 0.0464743971824646, + "eval_runtime": 29.5677, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 644 + }, + { + "epoch": 2.58, + "grad_norm": 0.016357421875, + "learning_rate": 2.4200000000000002e-05, + "loss": 0.0001, + "step": 645 + }, + { + "epoch": 2.58, + "eval_accuracy": 0.988, + "eval_loss": 0.045848287642002106, + "eval_runtime": 29.6174, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 645 + }, + { + "epoch": 2.584, + "grad_norm": 0.00011205673217773438, + "learning_rate": 2.4160000000000002e-05, + "loss": 0.0, + "step": 646 + }, + { + "epoch": 2.584, + "eval_accuracy": 0.99, + "eval_loss": 0.04553021863102913, + "eval_runtime": 29.5786, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 646 + }, + { + "epoch": 2.588, + "grad_norm": 0.000270843505859375, + "learning_rate": 2.412e-05, + "loss": 0.0, + "step": 647 + }, + { + "epoch": 2.588, + "eval_accuracy": 0.988, + "eval_loss": 0.04596227407455444, + "eval_runtime": 29.5772, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 647 + }, + { + "epoch": 2.592, + "grad_norm": 0.013427734375, + "learning_rate": 2.408e-05, + "loss": 0.0, + "step": 648 + }, + { + "epoch": 2.592, + "eval_accuracy": 0.99, + "eval_loss": 0.04561232775449753, + "eval_runtime": 29.6203, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 648 + }, + { + "epoch": 2.596, + "grad_norm": 3.5762786865234375e-05, + "learning_rate": 2.404e-05, + "loss": 0.0, + "step": 649 + }, + { + "epoch": 2.596, + "eval_accuracy": 0.99, + "eval_loss": 0.04489670693874359, + "eval_runtime": 29.6275, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 649 + }, + { + "epoch": 2.6, + "grad_norm": 0.049560546875, + "learning_rate": 2.4e-05, + "loss": 0.0002, + "step": 650 + }, + { + "epoch": 2.6, + "eval_accuracy": 0.99, + "eval_loss": 0.04551723226904869, + "eval_runtime": 29.5807, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 650 + }, + { + "epoch": 2.604, + "grad_norm": 0.9609375, + "learning_rate": 2.396e-05, + "loss": 0.0021, + "step": 651 + }, + { + "epoch": 2.604, + "eval_accuracy": 0.99, + "eval_loss": 0.045753296464681625, + "eval_runtime": 29.3844, + "eval_samples_per_second": 17.016, + "eval_steps_per_second": 2.144, + "step": 651 + }, + { + "epoch": 2.608, + "grad_norm": 1.0788440704345703e-05, + "learning_rate": 2.392e-05, + "loss": 0.0, + "step": 652 + }, + { + "epoch": 2.608, + "eval_accuracy": 0.99, + "eval_loss": 0.04582967236638069, + "eval_runtime": 29.4175, + "eval_samples_per_second": 16.997, + "eval_steps_per_second": 2.142, + "step": 652 + }, + { + "epoch": 2.612, + "grad_norm": 0.003173828125, + "learning_rate": 2.3880000000000002e-05, + "loss": 0.0, + "step": 653 + }, + { + "epoch": 2.612, + "eval_accuracy": 0.99, + "eval_loss": 0.04627445340156555, + "eval_runtime": 29.4994, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.136, + "step": 653 + }, + { + "epoch": 2.616, + "grad_norm": 20.25, + "learning_rate": 2.3840000000000002e-05, + "loss": 0.0386, + "step": 654 + }, + { + "epoch": 2.616, + "eval_accuracy": 0.99, + "eval_loss": 0.04740177094936371, + "eval_runtime": 29.5952, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 654 + }, + { + "epoch": 2.62, + "grad_norm": 0.0400390625, + "learning_rate": 2.38e-05, + "loss": 0.0001, + "step": 655 + }, + { + "epoch": 2.62, + "eval_accuracy": 0.99, + "eval_loss": 0.0473930723965168, + "eval_runtime": 29.6018, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 655 + }, + { + "epoch": 2.624, + "grad_norm": 0.000476837158203125, + "learning_rate": 2.3760000000000003e-05, + "loss": 0.0, + "step": 656 + }, + { + "epoch": 2.624, + "eval_accuracy": 0.99, + "eval_loss": 0.048024822026491165, + "eval_runtime": 29.6124, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.127, + "step": 656 + }, + { + "epoch": 2.628, + "grad_norm": 0.390625, + "learning_rate": 2.372e-05, + "loss": 0.0016, + "step": 657 + }, + { + "epoch": 2.628, + "eval_accuracy": 0.99, + "eval_loss": 0.04743267968297005, + "eval_runtime": 29.5648, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 657 + }, + { + "epoch": 2.632, + "grad_norm": 0.462890625, + "learning_rate": 2.3680000000000004e-05, + "loss": 0.0003, + "step": 658 + }, + { + "epoch": 2.632, + "eval_accuracy": 0.99, + "eval_loss": 0.047982972115278244, + "eval_runtime": 29.5544, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 658 + }, + { + "epoch": 2.636, + "grad_norm": 0.0001392364501953125, + "learning_rate": 2.364e-05, + "loss": 0.0, + "step": 659 + }, + { + "epoch": 2.636, + "eval_accuracy": 0.99, + "eval_loss": 0.04943491518497467, + "eval_runtime": 29.6063, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 659 + }, + { + "epoch": 2.64, + "grad_norm": 0.001190185546875, + "learning_rate": 2.36e-05, + "loss": 0.0, + "step": 660 + }, + { + "epoch": 2.64, + "eval_accuracy": 0.99, + "eval_loss": 0.05000165104866028, + "eval_runtime": 29.6089, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 660 + }, + { + "epoch": 2.644, + "grad_norm": 0.0027313232421875, + "learning_rate": 2.356e-05, + "loss": 0.0, + "step": 661 + }, + { + "epoch": 2.644, + "eval_accuracy": 0.99, + "eval_loss": 0.051509130746126175, + "eval_runtime": 29.6061, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 661 + }, + { + "epoch": 2.648, + "grad_norm": 4.553794860839844e-05, + "learning_rate": 2.3520000000000002e-05, + "loss": 0.0, + "step": 662 + }, + { + "epoch": 2.648, + "eval_accuracy": 0.99, + "eval_loss": 0.05128471553325653, + "eval_runtime": 29.5654, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 662 + }, + { + "epoch": 2.652, + "grad_norm": 1.6540288925170898e-06, + "learning_rate": 2.3480000000000002e-05, + "loss": 0.0, + "step": 663 + }, + { + "epoch": 2.652, + "eval_accuracy": 0.99, + "eval_loss": 0.052438460290431976, + "eval_runtime": 29.6126, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.127, + "step": 663 + }, + { + "epoch": 2.656, + "grad_norm": 0.0029754638671875, + "learning_rate": 2.344e-05, + "loss": 0.0, + "step": 664 + }, + { + "epoch": 2.656, + "eval_accuracy": 0.99, + "eval_loss": 0.05244976654648781, + "eval_runtime": 29.5669, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 664 + }, + { + "epoch": 2.66, + "grad_norm": 0.0040283203125, + "learning_rate": 2.3400000000000003e-05, + "loss": 0.0, + "step": 665 + }, + { + "epoch": 2.66, + "eval_accuracy": 0.99, + "eval_loss": 0.05354370176792145, + "eval_runtime": 29.611, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 665 + }, + { + "epoch": 2.664, + "grad_norm": 59.5, + "learning_rate": 2.336e-05, + "loss": 1.7734, + "step": 666 + }, + { + "epoch": 2.664, + "eval_accuracy": 0.99, + "eval_loss": 0.05377240106463432, + "eval_runtime": 29.5627, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 666 + }, + { + "epoch": 2.668, + "grad_norm": 0.0390625, + "learning_rate": 2.332e-05, + "loss": 0.0001, + "step": 667 + }, + { + "epoch": 2.668, + "eval_accuracy": 0.99, + "eval_loss": 0.054078273475170135, + "eval_runtime": 29.5513, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.132, + "step": 667 + }, + { + "epoch": 2.672, + "grad_norm": 1.55717134475708e-06, + "learning_rate": 2.328e-05, + "loss": 0.0, + "step": 668 + }, + { + "epoch": 2.672, + "eval_accuracy": 0.99, + "eval_loss": 0.05282522737979889, + "eval_runtime": 29.5623, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 668 + }, + { + "epoch": 2.676, + "grad_norm": 0.150390625, + "learning_rate": 2.324e-05, + "loss": 0.0004, + "step": 669 + }, + { + "epoch": 2.676, + "eval_accuracy": 0.99, + "eval_loss": 0.05277922749519348, + "eval_runtime": 29.5649, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 669 + }, + { + "epoch": 2.68, + "grad_norm": 0.0277099609375, + "learning_rate": 2.32e-05, + "loss": 0.0001, + "step": 670 + }, + { + "epoch": 2.68, + "eval_accuracy": 0.99, + "eval_loss": 0.053095199167728424, + "eval_runtime": 29.6082, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 670 + }, + { + "epoch": 2.684, + "grad_norm": 9.3125, + "learning_rate": 2.3160000000000002e-05, + "loss": 0.0232, + "step": 671 + }, + { + "epoch": 2.684, + "eval_accuracy": 0.99, + "eval_loss": 0.052703745663166046, + "eval_runtime": 29.563, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 671 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.00787353515625, + "learning_rate": 2.312e-05, + "loss": 0.0, + "step": 672 + }, + { + "epoch": 2.6879999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.051467057317495346, + "eval_runtime": 29.5651, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 672 + }, + { + "epoch": 2.692, + "grad_norm": 0.408203125, + "learning_rate": 2.3080000000000003e-05, + "loss": 0.0011, + "step": 673 + }, + { + "epoch": 2.692, + "eval_accuracy": 0.99, + "eval_loss": 0.051942236721515656, + "eval_runtime": 29.5732, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 673 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 1.6328125, + "learning_rate": 2.304e-05, + "loss": 0.0034, + "step": 674 + }, + { + "epoch": 2.6959999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.05009673163294792, + "eval_runtime": 29.5703, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 674 + }, + { + "epoch": 2.7, + "grad_norm": 0.00010347366333007812, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0, + "step": 675 + }, + { + "epoch": 2.7, + "eval_accuracy": 0.99, + "eval_loss": 0.05019540339708328, + "eval_runtime": 29.5638, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 675 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 0.01953125, + "learning_rate": 2.296e-05, + "loss": 0.0001, + "step": 676 + }, + { + "epoch": 2.7039999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.0487380214035511, + "eval_runtime": 29.5659, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 676 + }, + { + "epoch": 2.708, + "grad_norm": 0.037109375, + "learning_rate": 2.292e-05, + "loss": 0.0001, + "step": 677 + }, + { + "epoch": 2.708, + "eval_accuracy": 0.99, + "eval_loss": 0.049743782728910446, + "eval_runtime": 29.5732, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 677 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 1.3984375, + "learning_rate": 2.288e-05, + "loss": 0.0035, + "step": 678 + }, + { + "epoch": 2.7119999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.04858223721385002, + "eval_runtime": 29.6185, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 678 + }, + { + "epoch": 2.716, + "grad_norm": 0.000278472900390625, + "learning_rate": 2.284e-05, + "loss": 0.0, + "step": 679 + }, + { + "epoch": 2.716, + "eval_accuracy": 0.99, + "eval_loss": 0.04874448478221893, + "eval_runtime": 29.6182, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 679 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.248046875, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.0008, + "step": 680 + }, + { + "epoch": 2.7199999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04635223373770714, + "eval_runtime": 29.5847, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.129, + "step": 680 + }, + { + "epoch": 2.724, + "grad_norm": 0.0001811981201171875, + "learning_rate": 2.2760000000000002e-05, + "loss": 0.0, + "step": 681 + }, + { + "epoch": 2.724, + "eval_accuracy": 0.99, + "eval_loss": 0.046702850610017776, + "eval_runtime": 29.6221, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 681 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 7.363269105553627e-09, + "learning_rate": 2.2720000000000003e-05, + "loss": 0.0, + "step": 682 + }, + { + "epoch": 2.7279999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04602330923080444, + "eval_runtime": 29.5655, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 682 + }, + { + "epoch": 2.732, + "grad_norm": 0.0006866455078125, + "learning_rate": 2.268e-05, + "loss": 0.0, + "step": 683 + }, + { + "epoch": 2.732, + "eval_accuracy": 0.99, + "eval_loss": 0.044871073216199875, + "eval_runtime": 29.5611, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 683 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 0.0177001953125, + "learning_rate": 2.264e-05, + "loss": 0.0001, + "step": 684 + }, + { + "epoch": 2.7359999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.044409800320863724, + "eval_runtime": 29.5496, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 684 + }, + { + "epoch": 2.74, + "grad_norm": 0.0011749267578125, + "learning_rate": 2.26e-05, + "loss": 0.0, + "step": 685 + }, + { + "epoch": 2.74, + "eval_accuracy": 0.99, + "eval_loss": 0.044831205159425735, + "eval_runtime": 29.5454, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.132, + "step": 685 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 2.75, + "learning_rate": 2.256e-05, + "loss": 0.0153, + "step": 686 + }, + { + "epoch": 2.7439999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.0456419475376606, + "eval_runtime": 29.5532, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 686 + }, + { + "epoch": 2.748, + "grad_norm": 23.0, + "learning_rate": 2.252e-05, + "loss": 0.041, + "step": 687 + }, + { + "epoch": 2.748, + "eval_accuracy": 0.99, + "eval_loss": 0.04434041678905487, + "eval_runtime": 29.5848, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.129, + "step": 687 + }, + { + "epoch": 2.752, + "grad_norm": 5.781650543212891e-06, + "learning_rate": 2.248e-05, + "loss": 0.0, + "step": 688 + }, + { + "epoch": 2.752, + "eval_accuracy": 0.99, + "eval_loss": 0.0449509359896183, + "eval_runtime": 29.5676, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 688 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 0.04052734375, + "learning_rate": 2.244e-05, + "loss": 0.0001, + "step": 689 + }, + { + "epoch": 2.7560000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04488706588745117, + "eval_runtime": 29.5648, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 689 + }, + { + "epoch": 2.76, + "grad_norm": 1.2159347534179688e-05, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.0, + "step": 690 + }, + { + "epoch": 2.76, + "eval_accuracy": 0.99, + "eval_loss": 0.04448350891470909, + "eval_runtime": 29.5655, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 690 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 0.1845703125, + "learning_rate": 2.236e-05, + "loss": 0.0006, + "step": 691 + }, + { + "epoch": 2.7640000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04574616998434067, + "eval_runtime": 29.5546, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 691 + }, + { + "epoch": 2.768, + "grad_norm": 18.25, + "learning_rate": 2.2320000000000003e-05, + "loss": 0.0232, + "step": 692 + }, + { + "epoch": 2.768, + "eval_accuracy": 0.99, + "eval_loss": 0.04586224630475044, + "eval_runtime": 29.5667, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 692 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 0.00020599365234375, + "learning_rate": 2.228e-05, + "loss": 0.0, + "step": 693 + }, + { + "epoch": 2.7720000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04390132054686546, + "eval_runtime": 29.6136, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 693 + }, + { + "epoch": 2.776, + "grad_norm": 0.0020751953125, + "learning_rate": 2.224e-05, + "loss": 0.0, + "step": 694 + }, + { + "epoch": 2.776, + "eval_accuracy": 0.99, + "eval_loss": 0.04451962932944298, + "eval_runtime": 29.5533, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 694 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 0.17578125, + "learning_rate": 2.22e-05, + "loss": 0.0006, + "step": 695 + }, + { + "epoch": 2.7800000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.0437055341899395, + "eval_runtime": 29.557, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.131, + "step": 695 + }, + { + "epoch": 2.784, + "grad_norm": 1.6689300537109375e-06, + "learning_rate": 2.216e-05, + "loss": 0.0, + "step": 696 + }, + { + "epoch": 2.784, + "eval_accuracy": 0.99, + "eval_loss": 0.044053442776203156, + "eval_runtime": 29.5969, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 696 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 0.00011014938354492188, + "learning_rate": 2.212e-05, + "loss": 0.0, + "step": 697 + }, + { + "epoch": 2.7880000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.044515788555145264, + "eval_runtime": 29.5431, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 697 + }, + { + "epoch": 2.792, + "grad_norm": 0.0498046875, + "learning_rate": 2.2080000000000002e-05, + "loss": 0.0001, + "step": 698 + }, + { + "epoch": 2.792, + "eval_accuracy": 0.99, + "eval_loss": 0.04467964172363281, + "eval_runtime": 29.5565, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 698 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 1.6570091247558594e-05, + "learning_rate": 2.2040000000000002e-05, + "loss": 0.0, + "step": 699 + }, + { + "epoch": 2.7960000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.043875306844711304, + "eval_runtime": 29.5727, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 699 + }, + { + "epoch": 2.8, + "grad_norm": 0.78515625, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0017, + "step": 700 + }, + { + "epoch": 2.8, + "eval_accuracy": 0.99, + "eval_loss": 0.04336533322930336, + "eval_runtime": 29.5873, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 700 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 0.0693359375, + "learning_rate": 2.196e-05, + "loss": 0.0001, + "step": 701 + }, + { + "epoch": 2.8040000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.04361381754279137, + "eval_runtime": 29.3607, + "eval_samples_per_second": 17.03, + "eval_steps_per_second": 2.146, + "step": 701 + }, + { + "epoch": 2.808, + "grad_norm": 0.1962890625, + "learning_rate": 2.192e-05, + "loss": 0.0006, + "step": 702 + }, + { + "epoch": 2.808, + "eval_accuracy": 0.99, + "eval_loss": 0.04354557767510414, + "eval_runtime": 29.3668, + "eval_samples_per_second": 17.026, + "eval_steps_per_second": 2.145, + "step": 702 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 0.212890625, + "learning_rate": 2.188e-05, + "loss": 0.0005, + "step": 703 + }, + { + "epoch": 2.8120000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.0430317297577858, + "eval_runtime": 29.4731, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.138, + "step": 703 + }, + { + "epoch": 2.816, + "grad_norm": 1.0069925338029861e-08, + "learning_rate": 2.184e-05, + "loss": 0.0, + "step": 704 + }, + { + "epoch": 2.816, + "eval_accuracy": 0.99, + "eval_loss": 0.04229697212576866, + "eval_runtime": 29.5125, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.135, + "step": 704 + }, + { + "epoch": 2.82, + "grad_norm": 0.043701171875, + "learning_rate": 2.18e-05, + "loss": 0.0001, + "step": 705 + }, + { + "epoch": 2.82, + "eval_accuracy": 0.99, + "eval_loss": 0.04303913936018944, + "eval_runtime": 29.5816, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 705 + }, + { + "epoch": 2.824, + "grad_norm": 0.000186920166015625, + "learning_rate": 2.176e-05, + "loss": 0.0, + "step": 706 + }, + { + "epoch": 2.824, + "eval_accuracy": 0.99, + "eval_loss": 0.043075211346149445, + "eval_runtime": 29.5349, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.133, + "step": 706 + }, + { + "epoch": 2.828, + "grad_norm": 0.0022735595703125, + "learning_rate": 2.1720000000000002e-05, + "loss": 0.0, + "step": 707 + }, + { + "epoch": 2.828, + "eval_accuracy": 0.99, + "eval_loss": 0.04217405989766121, + "eval_runtime": 29.5629, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 707 + }, + { + "epoch": 2.832, + "grad_norm": 0.0033416748046875, + "learning_rate": 2.168e-05, + "loss": 0.0, + "step": 708 + }, + { + "epoch": 2.832, + "eval_accuracy": 0.99, + "eval_loss": 0.04271232336759567, + "eval_runtime": 29.5506, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.132, + "step": 708 + }, + { + "epoch": 2.836, + "grad_norm": 0.057373046875, + "learning_rate": 2.1640000000000003e-05, + "loss": 0.0002, + "step": 709 + }, + { + "epoch": 2.836, + "eval_accuracy": 0.99, + "eval_loss": 0.04288361221551895, + "eval_runtime": 29.6264, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.126, + "step": 709 + }, + { + "epoch": 2.84, + "grad_norm": 100.0, + "learning_rate": 2.16e-05, + "loss": 0.8359, + "step": 710 + }, + { + "epoch": 2.84, + "eval_accuracy": 0.99, + "eval_loss": 0.04283612221479416, + "eval_runtime": 29.6222, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 710 + }, + { + "epoch": 2.844, + "grad_norm": 0.0030670166015625, + "learning_rate": 2.1560000000000004e-05, + "loss": 0.0, + "step": 711 + }, + { + "epoch": 2.844, + "eval_accuracy": 0.99, + "eval_loss": 0.042107194662094116, + "eval_runtime": 29.6211, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 711 + }, + { + "epoch": 2.848, + "grad_norm": 0.00213623046875, + "learning_rate": 2.152e-05, + "loss": 0.0, + "step": 712 + }, + { + "epoch": 2.848, + "eval_accuracy": 0.99, + "eval_loss": 0.041511375457048416, + "eval_runtime": 29.6017, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 712 + }, + { + "epoch": 2.852, + "grad_norm": 0.00311279296875, + "learning_rate": 2.148e-05, + "loss": 0.0, + "step": 713 + }, + { + "epoch": 2.852, + "eval_accuracy": 0.99, + "eval_loss": 0.041817326098680496, + "eval_runtime": 29.5538, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 713 + }, + { + "epoch": 2.856, + "grad_norm": 0.228515625, + "learning_rate": 2.144e-05, + "loss": 0.0007, + "step": 714 + }, + { + "epoch": 2.856, + "eval_accuracy": 0.99, + "eval_loss": 0.04130000248551369, + "eval_runtime": 29.5486, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 714 + }, + { + "epoch": 2.86, + "grad_norm": 0.640625, + "learning_rate": 2.1400000000000002e-05, + "loss": 0.0025, + "step": 715 + }, + { + "epoch": 2.86, + "eval_accuracy": 0.99, + "eval_loss": 0.041503094136714935, + "eval_runtime": 29.546, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.132, + "step": 715 + }, + { + "epoch": 2.864, + "grad_norm": 0.00093841552734375, + "learning_rate": 2.1360000000000002e-05, + "loss": 0.0, + "step": 716 + }, + { + "epoch": 2.864, + "eval_accuracy": 0.99, + "eval_loss": 0.041138000786304474, + "eval_runtime": 29.5981, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 716 + }, + { + "epoch": 2.868, + "grad_norm": 0.00023174285888671875, + "learning_rate": 2.1320000000000003e-05, + "loss": 0.0, + "step": 717 + }, + { + "epoch": 2.868, + "eval_accuracy": 0.99, + "eval_loss": 0.04053974896669388, + "eval_runtime": 29.6031, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 717 + }, + { + "epoch": 2.872, + "grad_norm": 0.09912109375, + "learning_rate": 2.128e-05, + "loss": 0.0003, + "step": 718 + }, + { + "epoch": 2.872, + "eval_accuracy": 0.99, + "eval_loss": 0.04098721221089363, + "eval_runtime": 29.5521, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 718 + }, + { + "epoch": 2.876, + "grad_norm": 0.0002880096435546875, + "learning_rate": 2.124e-05, + "loss": 0.0, + "step": 719 + }, + { + "epoch": 2.876, + "eval_accuracy": 0.99, + "eval_loss": 0.040826644748449326, + "eval_runtime": 29.5431, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 719 + }, + { + "epoch": 2.88, + "grad_norm": 0.12451171875, + "learning_rate": 2.12e-05, + "loss": 0.0005, + "step": 720 + }, + { + "epoch": 2.88, + "eval_accuracy": 0.99, + "eval_loss": 0.04069136828184128, + "eval_runtime": 29.6009, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 720 + }, + { + "epoch": 2.884, + "grad_norm": 0.0002002716064453125, + "learning_rate": 2.116e-05, + "loss": 0.0, + "step": 721 + }, + { + "epoch": 2.884, + "eval_accuracy": 0.99, + "eval_loss": 0.04054451361298561, + "eval_runtime": 29.5993, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 721 + }, + { + "epoch": 2.888, + "grad_norm": 0.06982421875, + "learning_rate": 2.112e-05, + "loss": 0.0002, + "step": 722 + }, + { + "epoch": 2.888, + "eval_accuracy": 0.99, + "eval_loss": 0.040723107755184174, + "eval_runtime": 29.5474, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.132, + "step": 722 + }, + { + "epoch": 2.892, + "grad_norm": 0.006072998046875, + "learning_rate": 2.1079999999999998e-05, + "loss": 0.0, + "step": 723 + }, + { + "epoch": 2.892, + "eval_accuracy": 0.99, + "eval_loss": 0.040542006492614746, + "eval_runtime": 29.5447, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.132, + "step": 723 + }, + { + "epoch": 2.896, + "grad_norm": 8.754432201385498e-07, + "learning_rate": 2.1040000000000002e-05, + "loss": 0.0, + "step": 724 + }, + { + "epoch": 2.896, + "eval_accuracy": 0.99, + "eval_loss": 0.03937985375523567, + "eval_runtime": 29.4578, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.139, + "step": 724 + }, + { + "epoch": 2.9, + "grad_norm": 0.0004825592041015625, + "learning_rate": 2.1e-05, + "loss": 0.0, + "step": 725 + }, + { + "epoch": 2.9, + "eval_accuracy": 0.99, + "eval_loss": 0.04061904549598694, + "eval_runtime": 29.5534, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 725 + }, + { + "epoch": 2.904, + "grad_norm": 6.198883056640625e-06, + "learning_rate": 2.0960000000000003e-05, + "loss": 0.0, + "step": 726 + }, + { + "epoch": 2.904, + "eval_accuracy": 0.99, + "eval_loss": 0.04089447483420372, + "eval_runtime": 29.5398, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 726 + }, + { + "epoch": 2.908, + "grad_norm": 0.00014209747314453125, + "learning_rate": 2.092e-05, + "loss": 0.0, + "step": 727 + }, + { + "epoch": 2.908, + "eval_accuracy": 0.99, + "eval_loss": 0.0402870699763298, + "eval_runtime": 29.5422, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.133, + "step": 727 + }, + { + "epoch": 2.912, + "grad_norm": 6.437301635742188e-05, + "learning_rate": 2.0880000000000003e-05, + "loss": 0.0, + "step": 728 + }, + { + "epoch": 2.912, + "eval_accuracy": 0.99, + "eval_loss": 0.04053816944360733, + "eval_runtime": 29.5908, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 728 + }, + { + "epoch": 2.916, + "grad_norm": 0.00014972686767578125, + "learning_rate": 2.084e-05, + "loss": 0.0, + "step": 729 + }, + { + "epoch": 2.916, + "eval_accuracy": 0.99, + "eval_loss": 0.040060777217149734, + "eval_runtime": 29.537, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.133, + "step": 729 + }, + { + "epoch": 2.92, + "grad_norm": 0.0152587890625, + "learning_rate": 2.08e-05, + "loss": 0.0001, + "step": 730 + }, + { + "epoch": 2.92, + "eval_accuracy": 0.99, + "eval_loss": 0.04033130779862404, + "eval_runtime": 29.5488, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 730 + }, + { + "epoch": 2.924, + "grad_norm": 7.915496826171875e-05, + "learning_rate": 2.076e-05, + "loss": 0.0, + "step": 731 + }, + { + "epoch": 2.924, + "eval_accuracy": 0.99, + "eval_loss": 0.040370773524045944, + "eval_runtime": 29.5437, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 731 + }, + { + "epoch": 2.928, + "grad_norm": 0.000164031982421875, + "learning_rate": 2.072e-05, + "loss": 0.0, + "step": 732 + }, + { + "epoch": 2.928, + "eval_accuracy": 0.99, + "eval_loss": 0.040525566786527634, + "eval_runtime": 29.5942, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 732 + }, + { + "epoch": 2.932, + "grad_norm": 0.0634765625, + "learning_rate": 2.0680000000000002e-05, + "loss": 0.0002, + "step": 733 + }, + { + "epoch": 2.932, + "eval_accuracy": 0.99, + "eval_loss": 0.04057765752077103, + "eval_runtime": 29.5364, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.133, + "step": 733 + }, + { + "epoch": 2.936, + "grad_norm": 0.216796875, + "learning_rate": 2.0640000000000002e-05, + "loss": 0.0006, + "step": 734 + }, + { + "epoch": 2.936, + "eval_accuracy": 0.99, + "eval_loss": 0.04139241576194763, + "eval_runtime": 29.5376, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.133, + "step": 734 + }, + { + "epoch": 2.94, + "grad_norm": 0.0218505859375, + "learning_rate": 2.06e-05, + "loss": 0.0001, + "step": 735 + }, + { + "epoch": 2.94, + "eval_accuracy": 0.99, + "eval_loss": 0.04033602774143219, + "eval_runtime": 29.5965, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 735 + }, + { + "epoch": 2.944, + "grad_norm": 0.006500244140625, + "learning_rate": 2.0560000000000003e-05, + "loss": 0.0, + "step": 736 + }, + { + "epoch": 2.944, + "eval_accuracy": 0.99, + "eval_loss": 0.04087941721081734, + "eval_runtime": 29.5391, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 736 + }, + { + "epoch": 2.948, + "grad_norm": 1.341104507446289e-07, + "learning_rate": 2.052e-05, + "loss": 0.0, + "step": 737 + }, + { + "epoch": 2.948, + "eval_accuracy": 0.99, + "eval_loss": 0.04039126634597778, + "eval_runtime": 29.5407, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 737 + }, + { + "epoch": 2.952, + "grad_norm": 0.011474609375, + "learning_rate": 2.048e-05, + "loss": 0.0, + "step": 738 + }, + { + "epoch": 2.952, + "eval_accuracy": 0.99, + "eval_loss": 0.0400020070374012, + "eval_runtime": 29.5964, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 738 + }, + { + "epoch": 2.956, + "grad_norm": 35.75, + "learning_rate": 2.044e-05, + "loss": 0.3633, + "step": 739 + }, + { + "epoch": 2.956, + "eval_accuracy": 0.99, + "eval_loss": 0.04048335179686546, + "eval_runtime": 29.5419, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.133, + "step": 739 + }, + { + "epoch": 2.96, + "grad_norm": 0.0004215240478515625, + "learning_rate": 2.04e-05, + "loss": 0.0, + "step": 740 + }, + { + "epoch": 2.96, + "eval_accuracy": 0.99, + "eval_loss": 0.04052147641777992, + "eval_runtime": 29.544, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 740 + }, + { + "epoch": 2.964, + "grad_norm": 30.875, + "learning_rate": 2.036e-05, + "loss": 0.1504, + "step": 741 + }, + { + "epoch": 2.964, + "eval_accuracy": 0.99, + "eval_loss": 0.039878539741039276, + "eval_runtime": 29.5978, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 741 + }, + { + "epoch": 2.968, + "grad_norm": 1.2200325727462769e-07, + "learning_rate": 2.032e-05, + "loss": 0.0, + "step": 742 + }, + { + "epoch": 2.968, + "eval_accuracy": 0.99, + "eval_loss": 0.039830610156059265, + "eval_runtime": 29.5441, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 742 + }, + { + "epoch": 2.972, + "grad_norm": 7.486343383789062e-05, + "learning_rate": 2.0280000000000002e-05, + "loss": 0.0, + "step": 743 + }, + { + "epoch": 2.972, + "eval_accuracy": 0.99, + "eval_loss": 0.039037469774484634, + "eval_runtime": 29.5423, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.133, + "step": 743 + }, + { + "epoch": 2.976, + "grad_norm": 0.08740234375, + "learning_rate": 2.024e-05, + "loss": 0.0003, + "step": 744 + }, + { + "epoch": 2.976, + "eval_accuracy": 0.99, + "eval_loss": 0.03924645110964775, + "eval_runtime": 29.593, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 744 + }, + { + "epoch": 2.98, + "grad_norm": 1.0356307029724121e-06, + "learning_rate": 2.0200000000000003e-05, + "loss": 0.0, + "step": 745 + }, + { + "epoch": 2.98, + "eval_accuracy": 0.99, + "eval_loss": 0.03921208903193474, + "eval_runtime": 29.5983, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 745 + }, + { + "epoch": 2.984, + "grad_norm": 1.3783574104309082e-07, + "learning_rate": 2.016e-05, + "loss": 0.0, + "step": 746 + }, + { + "epoch": 2.984, + "eval_accuracy": 0.99, + "eval_loss": 0.038118671625852585, + "eval_runtime": 29.5881, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 746 + }, + { + "epoch": 2.988, + "grad_norm": 0.00057220458984375, + "learning_rate": 2.012e-05, + "loss": 0.0, + "step": 747 + }, + { + "epoch": 2.988, + "eval_accuracy": 0.99, + "eval_loss": 0.03855925053358078, + "eval_runtime": 29.5403, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 747 + }, + { + "epoch": 2.992, + "grad_norm": 2.3990869522094727e-06, + "learning_rate": 2.008e-05, + "loss": 0.0, + "step": 748 + }, + { + "epoch": 2.992, + "eval_accuracy": 0.99, + "eval_loss": 0.03877728804945946, + "eval_runtime": 29.5921, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 748 + }, + { + "epoch": 2.996, + "grad_norm": 0.0223388671875, + "learning_rate": 2.004e-05, + "loss": 0.0001, + "step": 749 + }, + { + "epoch": 2.996, + "eval_accuracy": 0.99, + "eval_loss": 0.038941629230976105, + "eval_runtime": 29.5944, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 749 + }, + { + "epoch": 3.0, + "grad_norm": 13.625, + "learning_rate": 2e-05, + "loss": 0.0654, + "step": 750 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.99, + "eval_loss": 0.039803396910429, + "eval_runtime": 29.6013, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 750 + }, + { + "epoch": 3.004, + "grad_norm": 0.00011110305786132812, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.0, + "step": 751 + }, + { + "epoch": 3.004, + "eval_accuracy": 0.992, + "eval_loss": 0.038968030363321304, + "eval_runtime": 29.4844, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.137, + "step": 751 + }, + { + "epoch": 3.008, + "grad_norm": 0.06787109375, + "learning_rate": 1.992e-05, + "loss": 0.0002, + "step": 752 + }, + { + "epoch": 3.008, + "eval_accuracy": 0.99, + "eval_loss": 0.03990323469042778, + "eval_runtime": 29.3847, + "eval_samples_per_second": 17.016, + "eval_steps_per_second": 2.144, + "step": 752 + }, + { + "epoch": 3.012, + "grad_norm": 0.004119873046875, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.0, + "step": 753 + }, + { + "epoch": 3.012, + "eval_accuracy": 0.992, + "eval_loss": 0.03944515436887741, + "eval_runtime": 29.4774, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.137, + "step": 753 + }, + { + "epoch": 3.016, + "grad_norm": 0.001678466796875, + "learning_rate": 1.984e-05, + "loss": 0.0, + "step": 754 + }, + { + "epoch": 3.016, + "eval_accuracy": 0.992, + "eval_loss": 0.04024035856127739, + "eval_runtime": 29.5164, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.134, + "step": 754 + }, + { + "epoch": 3.02, + "grad_norm": 3.910064697265625e-05, + "learning_rate": 1.9800000000000004e-05, + "loss": 0.0, + "step": 755 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.99, + "eval_loss": 0.040287841111421585, + "eval_runtime": 29.5831, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 755 + }, + { + "epoch": 3.024, + "grad_norm": 0.0002880096435546875, + "learning_rate": 1.976e-05, + "loss": 0.0, + "step": 756 + }, + { + "epoch": 3.024, + "eval_accuracy": 0.992, + "eval_loss": 0.04013988375663757, + "eval_runtime": 29.5378, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 756 + }, + { + "epoch": 3.028, + "grad_norm": 0.000232696533203125, + "learning_rate": 1.972e-05, + "loss": 0.0, + "step": 757 + }, + { + "epoch": 3.028, + "eval_accuracy": 0.992, + "eval_loss": 0.04050203412771225, + "eval_runtime": 29.5486, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 757 + }, + { + "epoch": 3.032, + "grad_norm": 0.09423828125, + "learning_rate": 1.968e-05, + "loss": 0.0004, + "step": 758 + }, + { + "epoch": 3.032, + "eval_accuracy": 0.99, + "eval_loss": 0.04019290208816528, + "eval_runtime": 29.5446, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 758 + }, + { + "epoch": 3.036, + "grad_norm": 0.0002346038818359375, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.0, + "step": 759 + }, + { + "epoch": 3.036, + "eval_accuracy": 0.992, + "eval_loss": 0.04038957506418228, + "eval_runtime": 29.5958, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 759 + }, + { + "epoch": 3.04, + "grad_norm": 0.000507354736328125, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.0, + "step": 760 + }, + { + "epoch": 3.04, + "eval_accuracy": 0.992, + "eval_loss": 0.04035268351435661, + "eval_runtime": 29.5403, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 760 + }, + { + "epoch": 3.044, + "grad_norm": 57.0, + "learning_rate": 1.956e-05, + "loss": 0.2344, + "step": 761 + }, + { + "epoch": 3.044, + "eval_accuracy": 0.992, + "eval_loss": 0.040206458419561386, + "eval_runtime": 29.5393, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 761 + }, + { + "epoch": 3.048, + "grad_norm": 0.01312255859375, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.0, + "step": 762 + }, + { + "epoch": 3.048, + "eval_accuracy": 0.99, + "eval_loss": 0.040016282349824905, + "eval_runtime": 29.5824, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 762 + }, + { + "epoch": 3.052, + "grad_norm": 0.00060272216796875, + "learning_rate": 1.948e-05, + "loss": 0.0, + "step": 763 + }, + { + "epoch": 3.052, + "eval_accuracy": 0.99, + "eval_loss": 0.039626188576221466, + "eval_runtime": 29.5795, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 763 + }, + { + "epoch": 3.056, + "grad_norm": 4.559755325317383e-06, + "learning_rate": 1.944e-05, + "loss": 0.0, + "step": 764 + }, + { + "epoch": 3.056, + "eval_accuracy": 0.992, + "eval_loss": 0.038529641926288605, + "eval_runtime": 29.5784, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 764 + }, + { + "epoch": 3.06, + "grad_norm": 1.990795135498047e-05, + "learning_rate": 1.94e-05, + "loss": 0.0, + "step": 765 + }, + { + "epoch": 3.06, + "eval_accuracy": 0.99, + "eval_loss": 0.039258018136024475, + "eval_runtime": 29.5691, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 765 + }, + { + "epoch": 3.064, + "grad_norm": 0.00110626220703125, + "learning_rate": 1.936e-05, + "loss": 0.0, + "step": 766 + }, + { + "epoch": 3.064, + "eval_accuracy": 0.99, + "eval_loss": 0.03960692882537842, + "eval_runtime": 29.5816, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 766 + }, + { + "epoch": 3.068, + "grad_norm": 3.987224772572517e-09, + "learning_rate": 1.932e-05, + "loss": 0.0, + "step": 767 + }, + { + "epoch": 3.068, + "eval_accuracy": 0.99, + "eval_loss": 0.04007257521152496, + "eval_runtime": 29.5788, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 767 + }, + { + "epoch": 3.072, + "grad_norm": 0.1171875, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.0004, + "step": 768 + }, + { + "epoch": 3.072, + "eval_accuracy": 0.99, + "eval_loss": 0.03949208930134773, + "eval_runtime": 29.5445, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 768 + }, + { + "epoch": 3.076, + "grad_norm": 0.000225067138671875, + "learning_rate": 1.924e-05, + "loss": 0.0, + "step": 769 + }, + { + "epoch": 3.076, + "eval_accuracy": 0.99, + "eval_loss": 0.03969990462064743, + "eval_runtime": 29.5906, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 769 + }, + { + "epoch": 3.08, + "grad_norm": 5.269050598144531e-05, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.0, + "step": 770 + }, + { + "epoch": 3.08, + "eval_accuracy": 0.99, + "eval_loss": 0.03954623267054558, + "eval_runtime": 29.5303, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.133, + "step": 770 + }, + { + "epoch": 3.084, + "grad_norm": 7.43865966796875e-05, + "learning_rate": 1.916e-05, + "loss": 0.0, + "step": 771 + }, + { + "epoch": 3.084, + "eval_accuracy": 0.99, + "eval_loss": 0.04028802737593651, + "eval_runtime": 29.519, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.134, + "step": 771 + }, + { + "epoch": 3.088, + "grad_norm": 9.834766387939453e-06, + "learning_rate": 1.9120000000000003e-05, + "loss": 0.0, + "step": 772 + }, + { + "epoch": 3.088, + "eval_accuracy": 0.99, + "eval_loss": 0.03980438411235809, + "eval_runtime": 29.5569, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.131, + "step": 772 + }, + { + "epoch": 3.092, + "grad_norm": 1.0952353477478027e-06, + "learning_rate": 1.908e-05, + "loss": 0.0, + "step": 773 + }, + { + "epoch": 3.092, + "eval_accuracy": 0.99, + "eval_loss": 0.04079170897603035, + "eval_runtime": 29.559, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 773 + }, + { + "epoch": 3.096, + "grad_norm": 5.5789947509765625e-05, + "learning_rate": 1.904e-05, + "loss": 0.0, + "step": 774 + }, + { + "epoch": 3.096, + "eval_accuracy": 0.99, + "eval_loss": 0.039865873754024506, + "eval_runtime": 29.583, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 774 + }, + { + "epoch": 3.1, + "grad_norm": 6.75, + "learning_rate": 1.9e-05, + "loss": 0.015, + "step": 775 + }, + { + "epoch": 3.1, + "eval_accuracy": 0.992, + "eval_loss": 0.039904601871967316, + "eval_runtime": 29.566, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 775 + }, + { + "epoch": 3.104, + "grad_norm": 0.59375, + "learning_rate": 1.896e-05, + "loss": 0.0015, + "step": 776 + }, + { + "epoch": 3.104, + "eval_accuracy": 0.99, + "eval_loss": 0.04019094631075859, + "eval_runtime": 29.5645, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 776 + }, + { + "epoch": 3.108, + "grad_norm": 0.005340576171875, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.0, + "step": 777 + }, + { + "epoch": 3.108, + "eval_accuracy": 0.99, + "eval_loss": 0.04081292077898979, + "eval_runtime": 29.5058, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 777 + }, + { + "epoch": 3.112, + "grad_norm": 13.125, + "learning_rate": 1.888e-05, + "loss": 0.0547, + "step": 778 + }, + { + "epoch": 3.112, + "eval_accuracy": 0.99, + "eval_loss": 0.04073144868016243, + "eval_runtime": 29.5095, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.135, + "step": 778 + }, + { + "epoch": 3.116, + "grad_norm": 3.4332275390625e-05, + "learning_rate": 1.8840000000000003e-05, + "loss": 0.0, + "step": 779 + }, + { + "epoch": 3.116, + "eval_accuracy": 0.99, + "eval_loss": 0.04141020029783249, + "eval_runtime": 29.4949, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.136, + "step": 779 + }, + { + "epoch": 3.12, + "grad_norm": 0.84375, + "learning_rate": 1.88e-05, + "loss": 0.0029, + "step": 780 + }, + { + "epoch": 3.12, + "eval_accuracy": 0.99, + "eval_loss": 0.0429188534617424, + "eval_runtime": 29.5511, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.132, + "step": 780 + }, + { + "epoch": 3.124, + "grad_norm": 0.3046875, + "learning_rate": 1.876e-05, + "loss": 0.0014, + "step": 781 + }, + { + "epoch": 3.124, + "eval_accuracy": 0.99, + "eval_loss": 0.04302583262324333, + "eval_runtime": 29.5026, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.135, + "step": 781 + }, + { + "epoch": 3.128, + "grad_norm": 0.0196533203125, + "learning_rate": 1.872e-05, + "loss": 0.0001, + "step": 782 + }, + { + "epoch": 3.128, + "eval_accuracy": 0.99, + "eval_loss": 0.04397689178586006, + "eval_runtime": 29.4836, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.137, + "step": 782 + }, + { + "epoch": 3.132, + "grad_norm": 0.009521484375, + "learning_rate": 1.868e-05, + "loss": 0.0, + "step": 783 + }, + { + "epoch": 3.132, + "eval_accuracy": 0.99, + "eval_loss": 0.04501033574342728, + "eval_runtime": 29.4932, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.136, + "step": 783 + }, + { + "epoch": 3.136, + "grad_norm": 0.00909423828125, + "learning_rate": 1.864e-05, + "loss": 0.0, + "step": 784 + }, + { + "epoch": 3.136, + "eval_accuracy": 0.99, + "eval_loss": 0.045251671224832535, + "eval_runtime": 29.5049, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 784 + }, + { + "epoch": 3.14, + "grad_norm": 0.0031585693359375, + "learning_rate": 1.86e-05, + "loss": 0.0, + "step": 785 + }, + { + "epoch": 3.14, + "eval_accuracy": 0.99, + "eval_loss": 0.04533965885639191, + "eval_runtime": 29.534, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.133, + "step": 785 + }, + { + "epoch": 3.144, + "grad_norm": 0.2216796875, + "learning_rate": 1.856e-05, + "loss": 0.0011, + "step": 786 + }, + { + "epoch": 3.144, + "eval_accuracy": 0.99, + "eval_loss": 0.04656612500548363, + "eval_runtime": 29.5356, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.133, + "step": 786 + }, + { + "epoch": 3.148, + "grad_norm": 0.0002536773681640625, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.0, + "step": 787 + }, + { + "epoch": 3.148, + "eval_accuracy": 0.99, + "eval_loss": 0.045900486409664154, + "eval_runtime": 29.5469, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.132, + "step": 787 + }, + { + "epoch": 3.152, + "grad_norm": 2.1576881408691406e-05, + "learning_rate": 1.848e-05, + "loss": 0.0, + "step": 788 + }, + { + "epoch": 3.152, + "eval_accuracy": 0.99, + "eval_loss": 0.0470956414937973, + "eval_runtime": 29.557, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.131, + "step": 788 + }, + { + "epoch": 3.156, + "grad_norm": 0.00079345703125, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.0, + "step": 789 + }, + { + "epoch": 3.156, + "eval_accuracy": 0.99, + "eval_loss": 0.046438541263341904, + "eval_runtime": 29.619, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 789 + }, + { + "epoch": 3.16, + "grad_norm": 0.00165557861328125, + "learning_rate": 1.84e-05, + "loss": 0.0, + "step": 790 + }, + { + "epoch": 3.16, + "eval_accuracy": 0.99, + "eval_loss": 0.047281038016080856, + "eval_runtime": 29.5844, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.129, + "step": 790 + }, + { + "epoch": 3.164, + "grad_norm": 0.0260009765625, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.0001, + "step": 791 + }, + { + "epoch": 3.164, + "eval_accuracy": 0.99, + "eval_loss": 0.04764244705438614, + "eval_runtime": 29.5843, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 791 + }, + { + "epoch": 3.168, + "grad_norm": 1.800060272216797e-05, + "learning_rate": 1.832e-05, + "loss": 0.0, + "step": 792 + }, + { + "epoch": 3.168, + "eval_accuracy": 0.99, + "eval_loss": 0.047347117215394974, + "eval_runtime": 29.5852, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 792 + }, + { + "epoch": 3.172, + "grad_norm": 0.00341796875, + "learning_rate": 1.828e-05, + "loss": 0.0, + "step": 793 + }, + { + "epoch": 3.172, + "eval_accuracy": 0.99, + "eval_loss": 0.047585275024175644, + "eval_runtime": 29.5613, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 793 + }, + { + "epoch": 3.176, + "grad_norm": 0.005340576171875, + "learning_rate": 1.824e-05, + "loss": 0.0, + "step": 794 + }, + { + "epoch": 3.176, + "eval_accuracy": 0.99, + "eval_loss": 0.04780645668506622, + "eval_runtime": 29.5602, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 794 + }, + { + "epoch": 3.18, + "grad_norm": 3.248453140258789e-06, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.0, + "step": 795 + }, + { + "epoch": 3.18, + "eval_accuracy": 0.99, + "eval_loss": 0.04831121489405632, + "eval_runtime": 29.5583, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.131, + "step": 795 + }, + { + "epoch": 3.184, + "grad_norm": 0.00016117095947265625, + "learning_rate": 1.8160000000000002e-05, + "loss": 0.0, + "step": 796 + }, + { + "epoch": 3.184, + "eval_accuracy": 0.99, + "eval_loss": 0.048098694533109665, + "eval_runtime": 29.5739, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 796 + }, + { + "epoch": 3.188, + "grad_norm": 1.2814998626708984e-05, + "learning_rate": 1.812e-05, + "loss": 0.0, + "step": 797 + }, + { + "epoch": 3.188, + "eval_accuracy": 0.99, + "eval_loss": 0.04814353585243225, + "eval_runtime": 29.5825, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 797 + }, + { + "epoch": 3.192, + "grad_norm": 1.55717134475708e-06, + "learning_rate": 1.808e-05, + "loss": 0.0, + "step": 798 + }, + { + "epoch": 3.192, + "eval_accuracy": 0.99, + "eval_loss": 0.04732260853052139, + "eval_runtime": 29.579, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 798 + }, + { + "epoch": 3.196, + "grad_norm": 5.066394805908203e-06, + "learning_rate": 1.804e-05, + "loss": 0.0, + "step": 799 + }, + { + "epoch": 3.196, + "eval_accuracy": 0.99, + "eval_loss": 0.04807932302355766, + "eval_runtime": 29.5865, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 799 + }, + { + "epoch": 3.2, + "grad_norm": 0.00167083740234375, + "learning_rate": 1.8e-05, + "loss": 0.0, + "step": 800 + }, + { + "epoch": 3.2, + "eval_accuracy": 0.99, + "eval_loss": 0.04750071465969086, + "eval_runtime": 29.5911, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 800 + }, + { + "epoch": 3.204, + "grad_norm": 174.0, + "learning_rate": 1.796e-05, + "loss": 0.8516, + "step": 801 + }, + { + "epoch": 3.204, + "eval_accuracy": 0.99, + "eval_loss": 0.047799624502658844, + "eval_runtime": 29.5802, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 801 + }, + { + "epoch": 3.208, + "grad_norm": 0.0068359375, + "learning_rate": 1.792e-05, + "loss": 0.0, + "step": 802 + }, + { + "epoch": 3.208, + "eval_accuracy": 0.99, + "eval_loss": 0.047932762652635574, + "eval_runtime": 29.4334, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.14, + "step": 802 + }, + { + "epoch": 3.212, + "grad_norm": 4.202127456665039e-06, + "learning_rate": 1.7879999999999998e-05, + "loss": 0.0, + "step": 803 + }, + { + "epoch": 3.212, + "eval_accuracy": 0.99, + "eval_loss": 0.04759809002280235, + "eval_runtime": 29.489, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.136, + "step": 803 + }, + { + "epoch": 3.216, + "grad_norm": 8.463859558105469e-06, + "learning_rate": 1.7840000000000002e-05, + "loss": 0.0, + "step": 804 + }, + { + "epoch": 3.216, + "eval_accuracy": 0.99, + "eval_loss": 0.047727134078741074, + "eval_runtime": 29.5207, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.134, + "step": 804 + }, + { + "epoch": 3.22, + "grad_norm": 9.202957153320312e-05, + "learning_rate": 1.78e-05, + "loss": 0.0, + "step": 805 + }, + { + "epoch": 3.22, + "eval_accuracy": 0.99, + "eval_loss": 0.0481918640434742, + "eval_runtime": 29.5156, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.134, + "step": 805 + }, + { + "epoch": 3.224, + "grad_norm": 1.5497207641601562e-06, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.0, + "step": 806 + }, + { + "epoch": 3.224, + "eval_accuracy": 0.99, + "eval_loss": 0.047733698040246964, + "eval_runtime": 29.5123, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.135, + "step": 806 + }, + { + "epoch": 3.228, + "grad_norm": 0.328125, + "learning_rate": 1.772e-05, + "loss": 0.0007, + "step": 807 + }, + { + "epoch": 3.228, + "eval_accuracy": 0.99, + "eval_loss": 0.04704129695892334, + "eval_runtime": 29.5626, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 807 + }, + { + "epoch": 3.232, + "grad_norm": 0.049560546875, + "learning_rate": 1.7680000000000004e-05, + "loss": 0.0002, + "step": 808 + }, + { + "epoch": 3.232, + "eval_accuracy": 0.99, + "eval_loss": 0.04724576696753502, + "eval_runtime": 29.514, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.135, + "step": 808 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 8.404254913330078e-06, + "learning_rate": 1.764e-05, + "loss": 0.0, + "step": 809 + }, + { + "epoch": 3.2359999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04765217751264572, + "eval_runtime": 29.5034, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.135, + "step": 809 + }, + { + "epoch": 3.24, + "grad_norm": 2.46875, + "learning_rate": 1.76e-05, + "loss": 0.0047, + "step": 810 + }, + { + "epoch": 3.24, + "eval_accuracy": 0.99, + "eval_loss": 0.04763682931661606, + "eval_runtime": 29.5031, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.135, + "step": 810 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 0.0002918243408203125, + "learning_rate": 1.756e-05, + "loss": 0.0, + "step": 811 + }, + { + "epoch": 3.2439999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04729170352220535, + "eval_runtime": 29.4935, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.136, + "step": 811 + }, + { + "epoch": 3.248, + "grad_norm": 0.23828125, + "learning_rate": 1.752e-05, + "loss": 0.0008, + "step": 812 + }, + { + "epoch": 3.248, + "eval_accuracy": 0.99, + "eval_loss": 0.04724510759115219, + "eval_runtime": 29.544, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 812 + }, + { + "epoch": 3.252, + "grad_norm": 0.0169677734375, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.0, + "step": 813 + }, + { + "epoch": 3.252, + "eval_accuracy": 0.99, + "eval_loss": 0.04806523025035858, + "eval_runtime": 29.556, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 813 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 0.1748046875, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.0007, + "step": 814 + }, + { + "epoch": 3.2560000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04831075295805931, + "eval_runtime": 29.5557, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 814 + }, + { + "epoch": 3.26, + "grad_norm": 1.4375, + "learning_rate": 1.74e-05, + "loss": 0.0051, + "step": 815 + }, + { + "epoch": 3.26, + "eval_accuracy": 0.99, + "eval_loss": 0.047930411994457245, + "eval_runtime": 29.5046, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.135, + "step": 815 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 0.005462646484375, + "learning_rate": 1.736e-05, + "loss": 0.0, + "step": 816 + }, + { + "epoch": 3.2640000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04883721098303795, + "eval_runtime": 29.5059, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 816 + }, + { + "epoch": 3.268, + "grad_norm": 63.25, + "learning_rate": 1.732e-05, + "loss": 0.2432, + "step": 817 + }, + { + "epoch": 3.268, + "eval_accuracy": 0.99, + "eval_loss": 0.04773944616317749, + "eval_runtime": 29.5532, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 817 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 3.993511199951172e-06, + "learning_rate": 1.728e-05, + "loss": 0.0, + "step": 818 + }, + { + "epoch": 3.2720000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.048273585736751556, + "eval_runtime": 29.5055, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 818 + }, + { + "epoch": 3.276, + "grad_norm": 0.8046875, + "learning_rate": 1.724e-05, + "loss": 0.002, + "step": 819 + }, + { + "epoch": 3.276, + "eval_accuracy": 0.99, + "eval_loss": 0.048879630863666534, + "eval_runtime": 29.514, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.135, + "step": 819 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 1.328125, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.0038, + "step": 820 + }, + { + "epoch": 3.2800000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04881564527750015, + "eval_runtime": 29.5678, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 820 + }, + { + "epoch": 3.284, + "grad_norm": 6.818771362304688e-05, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.0, + "step": 821 + }, + { + "epoch": 3.284, + "eval_accuracy": 0.99, + "eval_loss": 0.0481451153755188, + "eval_runtime": 29.5642, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 821 + }, + { + "epoch": 3.288, + "grad_norm": 4.0531158447265625e-06, + "learning_rate": 1.712e-05, + "loss": 0.0, + "step": 822 + }, + { + "epoch": 3.288, + "eval_accuracy": 0.99, + "eval_loss": 0.04888517037034035, + "eval_runtime": 29.5088, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.135, + "step": 822 + }, + { + "epoch": 3.292, + "grad_norm": 0.0087890625, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.0, + "step": 823 + }, + { + "epoch": 3.292, + "eval_accuracy": 0.99, + "eval_loss": 0.04852079600095749, + "eval_runtime": 29.5629, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 823 + }, + { + "epoch": 3.296, + "grad_norm": 0.01556396484375, + "learning_rate": 1.704e-05, + "loss": 0.0, + "step": 824 + }, + { + "epoch": 3.296, + "eval_accuracy": 0.99, + "eval_loss": 0.04906541481614113, + "eval_runtime": 29.5684, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 824 + }, + { + "epoch": 3.3, + "grad_norm": 0.35546875, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.001, + "step": 825 + }, + { + "epoch": 3.3, + "eval_accuracy": 0.99, + "eval_loss": 0.04913512244820595, + "eval_runtime": 29.5299, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.133, + "step": 825 + }, + { + "epoch": 3.304, + "grad_norm": 0.039794921875, + "learning_rate": 1.696e-05, + "loss": 0.0001, + "step": 826 + }, + { + "epoch": 3.304, + "eval_accuracy": 0.99, + "eval_loss": 0.048599932342767715, + "eval_runtime": 29.6013, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 826 + }, + { + "epoch": 3.308, + "grad_norm": 0.0025177001953125, + "learning_rate": 1.692e-05, + "loss": 0.0, + "step": 827 + }, + { + "epoch": 3.308, + "eval_accuracy": 0.99, + "eval_loss": 0.04838579148054123, + "eval_runtime": 29.6326, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 827 + }, + { + "epoch": 3.312, + "grad_norm": 0.1083984375, + "learning_rate": 1.688e-05, + "loss": 0.0003, + "step": 828 + }, + { + "epoch": 3.312, + "eval_accuracy": 0.99, + "eval_loss": 0.048215966671705246, + "eval_runtime": 29.5919, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 828 + }, + { + "epoch": 3.316, + "grad_norm": 37.5, + "learning_rate": 1.684e-05, + "loss": 0.2041, + "step": 829 + }, + { + "epoch": 3.316, + "eval_accuracy": 0.99, + "eval_loss": 0.04854058846831322, + "eval_runtime": 29.5879, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 829 + }, + { + "epoch": 3.32, + "grad_norm": 0.0002460479736328125, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.0, + "step": 830 + }, + { + "epoch": 3.32, + "eval_accuracy": 0.99, + "eval_loss": 0.049079779535532, + "eval_runtime": 29.5922, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 830 + }, + { + "epoch": 3.324, + "grad_norm": 0.1591796875, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.0004, + "step": 831 + }, + { + "epoch": 3.324, + "eval_accuracy": 0.99, + "eval_loss": 0.04882955178618431, + "eval_runtime": 29.5932, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 831 + }, + { + "epoch": 3.328, + "grad_norm": 2.372264862060547e-05, + "learning_rate": 1.672e-05, + "loss": 0.0, + "step": 832 + }, + { + "epoch": 3.328, + "eval_accuracy": 0.99, + "eval_loss": 0.04865111783146858, + "eval_runtime": 29.6006, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 832 + }, + { + "epoch": 3.332, + "grad_norm": 9.0625, + "learning_rate": 1.668e-05, + "loss": 0.0282, + "step": 833 + }, + { + "epoch": 3.332, + "eval_accuracy": 0.99, + "eval_loss": 0.04838387668132782, + "eval_runtime": 29.6389, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 833 + }, + { + "epoch": 3.336, + "grad_norm": 0.00189971923828125, + "learning_rate": 1.664e-05, + "loss": 0.0, + "step": 834 + }, + { + "epoch": 3.336, + "eval_accuracy": 0.99, + "eval_loss": 0.04762662202119827, + "eval_runtime": 29.6476, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 834 + }, + { + "epoch": 3.34, + "grad_norm": 0.00078582763671875, + "learning_rate": 1.66e-05, + "loss": 0.0, + "step": 835 + }, + { + "epoch": 3.34, + "eval_accuracy": 0.99, + "eval_loss": 0.04745440557599068, + "eval_runtime": 29.6429, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 835 + }, + { + "epoch": 3.344, + "grad_norm": 26.125, + "learning_rate": 1.656e-05, + "loss": 0.1318, + "step": 836 + }, + { + "epoch": 3.344, + "eval_accuracy": 0.99, + "eval_loss": 0.04818786680698395, + "eval_runtime": 29.5881, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 836 + }, + { + "epoch": 3.348, + "grad_norm": 0.00038909912109375, + "learning_rate": 1.652e-05, + "loss": 0.0, + "step": 837 + }, + { + "epoch": 3.348, + "eval_accuracy": 0.99, + "eval_loss": 0.047738902270793915, + "eval_runtime": 29.5778, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 837 + }, + { + "epoch": 3.352, + "grad_norm": 45.25, + "learning_rate": 1.648e-05, + "loss": 0.6289, + "step": 838 + }, + { + "epoch": 3.352, + "eval_accuracy": 0.99, + "eval_loss": 0.04696950688958168, + "eval_runtime": 29.6275, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 838 + }, + { + "epoch": 3.356, + "grad_norm": 8.106231689453125e-06, + "learning_rate": 1.644e-05, + "loss": 0.0, + "step": 839 + }, + { + "epoch": 3.356, + "eval_accuracy": 0.99, + "eval_loss": 0.04669180512428284, + "eval_runtime": 29.5672, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 839 + }, + { + "epoch": 3.36, + "grad_norm": 1.2945383787155151e-07, + "learning_rate": 1.6400000000000002e-05, + "loss": 0.0, + "step": 840 + }, + { + "epoch": 3.36, + "eval_accuracy": 0.99, + "eval_loss": 0.046532776206731796, + "eval_runtime": 29.5542, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 840 + }, + { + "epoch": 3.364, + "grad_norm": 0.00090789794921875, + "learning_rate": 1.636e-05, + "loss": 0.0, + "step": 841 + }, + { + "epoch": 3.364, + "eval_accuracy": 0.99, + "eval_loss": 0.04659077525138855, + "eval_runtime": 29.5496, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 841 + }, + { + "epoch": 3.368, + "grad_norm": 0.328125, + "learning_rate": 1.6320000000000003e-05, + "loss": 0.0016, + "step": 842 + }, + { + "epoch": 3.368, + "eval_accuracy": 0.99, + "eval_loss": 0.0460612028837204, + "eval_runtime": 29.6059, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 842 + }, + { + "epoch": 3.372, + "grad_norm": 0.00029754638671875, + "learning_rate": 1.628e-05, + "loss": 0.0, + "step": 843 + }, + { + "epoch": 3.372, + "eval_accuracy": 0.99, + "eval_loss": 0.04585425555706024, + "eval_runtime": 29.6202, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 843 + }, + { + "epoch": 3.376, + "grad_norm": 0.00179290771484375, + "learning_rate": 1.624e-05, + "loss": 0.0, + "step": 844 + }, + { + "epoch": 3.376, + "eval_accuracy": 0.99, + "eval_loss": 0.04610791802406311, + "eval_runtime": 29.6201, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 844 + }, + { + "epoch": 3.38, + "grad_norm": 3.769993782043457e-06, + "learning_rate": 1.62e-05, + "loss": 0.0, + "step": 845 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.99, + "eval_loss": 0.04646699130535126, + "eval_runtime": 29.5591, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 845 + }, + { + "epoch": 3.384, + "grad_norm": 1.4841556549072266e-05, + "learning_rate": 1.616e-05, + "loss": 0.0, + "step": 846 + }, + { + "epoch": 3.384, + "eval_accuracy": 0.99, + "eval_loss": 0.047105856239795685, + "eval_runtime": 29.5613, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 846 + }, + { + "epoch": 3.388, + "grad_norm": 2.609375, + "learning_rate": 1.612e-05, + "loss": 0.0132, + "step": 847 + }, + { + "epoch": 3.388, + "eval_accuracy": 0.99, + "eval_loss": 0.04555107280611992, + "eval_runtime": 29.5684, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 847 + }, + { + "epoch": 3.392, + "grad_norm": 0.00017547607421875, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.0, + "step": 848 + }, + { + "epoch": 3.392, + "eval_accuracy": 0.99, + "eval_loss": 0.04632912203669548, + "eval_runtime": 29.5667, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 848 + }, + { + "epoch": 3.396, + "grad_norm": 0.00274658203125, + "learning_rate": 1.604e-05, + "loss": 0.0, + "step": 849 + }, + { + "epoch": 3.396, + "eval_accuracy": 0.99, + "eval_loss": 0.045997172594070435, + "eval_runtime": 29.5635, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 849 + }, + { + "epoch": 3.4, + "grad_norm": 0.01953125, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0, + "step": 850 + }, + { + "epoch": 3.4, + "eval_accuracy": 0.99, + "eval_loss": 0.04594970867037773, + "eval_runtime": 29.5689, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 850 + }, + { + "epoch": 3.404, + "grad_norm": 0.0159912109375, + "learning_rate": 1.596e-05, + "loss": 0.0, + "step": 851 + }, + { + "epoch": 3.404, + "eval_accuracy": 0.99, + "eval_loss": 0.04550854489207268, + "eval_runtime": 29.5434, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.132, + "step": 851 + }, + { + "epoch": 3.408, + "grad_norm": 0.0260009765625, + "learning_rate": 1.592e-05, + "loss": 0.0001, + "step": 852 + }, + { + "epoch": 3.408, + "eval_accuracy": 0.99, + "eval_loss": 0.04582465812563896, + "eval_runtime": 29.4449, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.14, + "step": 852 + }, + { + "epoch": 3.412, + "grad_norm": 0.12451171875, + "learning_rate": 1.588e-05, + "loss": 0.0004, + "step": 853 + }, + { + "epoch": 3.412, + "eval_accuracy": 0.99, + "eval_loss": 0.045550890266895294, + "eval_runtime": 29.5674, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 853 + }, + { + "epoch": 3.416, + "grad_norm": 0.0238037109375, + "learning_rate": 1.584e-05, + "loss": 0.0001, + "step": 854 + }, + { + "epoch": 3.416, + "eval_accuracy": 0.99, + "eval_loss": 0.045643631368875504, + "eval_runtime": 29.5401, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.133, + "step": 854 + }, + { + "epoch": 3.42, + "grad_norm": 0.000232696533203125, + "learning_rate": 1.58e-05, + "loss": 0.0, + "step": 855 + }, + { + "epoch": 3.42, + "eval_accuracy": 0.99, + "eval_loss": 0.04545675590634346, + "eval_runtime": 29.5491, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 855 + }, + { + "epoch": 3.424, + "grad_norm": 0.000423431396484375, + "learning_rate": 1.5759999999999998e-05, + "loss": 0.0, + "step": 856 + }, + { + "epoch": 3.424, + "eval_accuracy": 0.99, + "eval_loss": 0.046077191829681396, + "eval_runtime": 29.6123, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.127, + "step": 856 + }, + { + "epoch": 3.428, + "grad_norm": 0.01348876953125, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.0, + "step": 857 + }, + { + "epoch": 3.428, + "eval_accuracy": 0.99, + "eval_loss": 0.04559361934661865, + "eval_runtime": 29.5718, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 857 + }, + { + "epoch": 3.432, + "grad_norm": 0.036865234375, + "learning_rate": 1.568e-05, + "loss": 0.0001, + "step": 858 + }, + { + "epoch": 3.432, + "eval_accuracy": 0.99, + "eval_loss": 0.045690521597862244, + "eval_runtime": 29.5721, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 858 + }, + { + "epoch": 3.436, + "grad_norm": 2.4437904357910156e-05, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.0, + "step": 859 + }, + { + "epoch": 3.436, + "eval_accuracy": 0.99, + "eval_loss": 0.04639036953449249, + "eval_runtime": 29.5764, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 859 + }, + { + "epoch": 3.44, + "grad_norm": 2.518296241760254e-06, + "learning_rate": 1.56e-05, + "loss": 0.0, + "step": 860 + }, + { + "epoch": 3.44, + "eval_accuracy": 0.99, + "eval_loss": 0.045957814902067184, + "eval_runtime": 29.624, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 860 + }, + { + "epoch": 3.444, + "grad_norm": 0.625, + "learning_rate": 1.556e-05, + "loss": 0.002, + "step": 861 + }, + { + "epoch": 3.444, + "eval_accuracy": 0.99, + "eval_loss": 0.045427270233631134, + "eval_runtime": 29.5817, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 861 + }, + { + "epoch": 3.448, + "grad_norm": 2.294778823852539e-06, + "learning_rate": 1.552e-05, + "loss": 0.0, + "step": 862 + }, + { + "epoch": 3.448, + "eval_accuracy": 0.99, + "eval_loss": 0.0455552339553833, + "eval_runtime": 29.5988, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.128, + "step": 862 + }, + { + "epoch": 3.452, + "grad_norm": 1.5390625, + "learning_rate": 1.548e-05, + "loss": 0.0039, + "step": 863 + }, + { + "epoch": 3.452, + "eval_accuracy": 0.99, + "eval_loss": 0.04656358063220978, + "eval_runtime": 29.649, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 863 + }, + { + "epoch": 3.456, + "grad_norm": 0.00091552734375, + "learning_rate": 1.544e-05, + "loss": 0.0, + "step": 864 + }, + { + "epoch": 3.456, + "eval_accuracy": 0.99, + "eval_loss": 0.04577600210905075, + "eval_runtime": 29.6386, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 864 + }, + { + "epoch": 3.46, + "grad_norm": 9.655952453613281e-06, + "learning_rate": 1.54e-05, + "loss": 0.0, + "step": 865 + }, + { + "epoch": 3.46, + "eval_accuracy": 0.99, + "eval_loss": 0.046325474977493286, + "eval_runtime": 29.6422, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.125, + "step": 865 + }, + { + "epoch": 3.464, + "grad_norm": 25.625, + "learning_rate": 1.536e-05, + "loss": 0.0967, + "step": 866 + }, + { + "epoch": 3.464, + "eval_accuracy": 0.99, + "eval_loss": 0.04598759487271309, + "eval_runtime": 29.5993, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 866 + }, + { + "epoch": 3.468, + "grad_norm": 0.0091552734375, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.0, + "step": 867 + }, + { + "epoch": 3.468, + "eval_accuracy": 0.99, + "eval_loss": 0.046821847558021545, + "eval_runtime": 29.577, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 867 + }, + { + "epoch": 3.472, + "grad_norm": 0.000423431396484375, + "learning_rate": 1.528e-05, + "loss": 0.0, + "step": 868 + }, + { + "epoch": 3.472, + "eval_accuracy": 0.99, + "eval_loss": 0.04548504576086998, + "eval_runtime": 29.6397, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.126, + "step": 868 + }, + { + "epoch": 3.476, + "grad_norm": 1.0788440704345703e-05, + "learning_rate": 1.5240000000000001e-05, + "loss": 0.0, + "step": 869 + }, + { + "epoch": 3.476, + "eval_accuracy": 0.99, + "eval_loss": 0.04586688056588173, + "eval_runtime": 29.6043, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 869 + }, + { + "epoch": 3.48, + "grad_norm": 0.000507354736328125, + "learning_rate": 1.52e-05, + "loss": 0.0, + "step": 870 + }, + { + "epoch": 3.48, + "eval_accuracy": 0.99, + "eval_loss": 0.04534892737865448, + "eval_runtime": 29.603, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 870 + }, + { + "epoch": 3.484, + "grad_norm": 0.00078582763671875, + "learning_rate": 1.5160000000000002e-05, + "loss": 0.0, + "step": 871 + }, + { + "epoch": 3.484, + "eval_accuracy": 0.99, + "eval_loss": 0.04481690376996994, + "eval_runtime": 29.6076, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 871 + }, + { + "epoch": 3.488, + "grad_norm": 3.748573362827301e-08, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.0, + "step": 872 + }, + { + "epoch": 3.488, + "eval_accuracy": 0.99, + "eval_loss": 0.04542669653892517, + "eval_runtime": 29.6131, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 872 + }, + { + "epoch": 3.492, + "grad_norm": 8.106231689453125e-06, + "learning_rate": 1.508e-05, + "loss": 0.0, + "step": 873 + }, + { + "epoch": 3.492, + "eval_accuracy": 0.99, + "eval_loss": 0.04502712935209274, + "eval_runtime": 29.6227, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 873 + }, + { + "epoch": 3.496, + "grad_norm": 0.0003032684326171875, + "learning_rate": 1.5040000000000002e-05, + "loss": 0.0, + "step": 874 + }, + { + "epoch": 3.496, + "eval_accuracy": 0.99, + "eval_loss": 0.04480856657028198, + "eval_runtime": 29.6315, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 874 + }, + { + "epoch": 3.5, + "grad_norm": 3.8623809814453125e-05, + "learning_rate": 1.5e-05, + "loss": 0.0, + "step": 875 + }, + { + "epoch": 3.5, + "eval_accuracy": 0.99, + "eval_loss": 0.04453170672059059, + "eval_runtime": 29.6912, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.122, + "step": 875 + }, + { + "epoch": 3.504, + "grad_norm": 0.000156402587890625, + "learning_rate": 1.4960000000000002e-05, + "loss": 0.0, + "step": 876 + }, + { + "epoch": 3.504, + "eval_accuracy": 0.99, + "eval_loss": 0.045568838715553284, + "eval_runtime": 29.6476, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 876 + }, + { + "epoch": 3.508, + "grad_norm": 0.00168609619140625, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.0, + "step": 877 + }, + { + "epoch": 3.508, + "eval_accuracy": 0.99, + "eval_loss": 0.0450400747358799, + "eval_runtime": 29.695, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 877 + }, + { + "epoch": 3.512, + "grad_norm": 2.562999725341797e-06, + "learning_rate": 1.488e-05, + "loss": 0.0, + "step": 878 + }, + { + "epoch": 3.512, + "eval_accuracy": 0.99, + "eval_loss": 0.045760851353406906, + "eval_runtime": 29.6667, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 878 + }, + { + "epoch": 3.516, + "grad_norm": 3.4123659133911133e-06, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.0, + "step": 879 + }, + { + "epoch": 3.516, + "eval_accuracy": 0.99, + "eval_loss": 0.04497358575463295, + "eval_runtime": 29.5297, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.133, + "step": 879 + }, + { + "epoch": 3.52, + "grad_norm": 1.1026859283447266e-05, + "learning_rate": 1.48e-05, + "loss": 0.0, + "step": 880 + }, + { + "epoch": 3.52, + "eval_accuracy": 0.99, + "eval_loss": 0.045344699174165726, + "eval_runtime": 29.4509, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.139, + "step": 880 + }, + { + "epoch": 3.524, + "grad_norm": 0.08447265625, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.0002, + "step": 881 + }, + { + "epoch": 3.524, + "eval_accuracy": 0.99, + "eval_loss": 0.045165497809648514, + "eval_runtime": 29.4549, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.139, + "step": 881 + }, + { + "epoch": 3.528, + "grad_norm": 1.5020370483398438e-05, + "learning_rate": 1.472e-05, + "loss": 0.0, + "step": 882 + }, + { + "epoch": 3.528, + "eval_accuracy": 0.99, + "eval_loss": 0.04507038742303848, + "eval_runtime": 29.4981, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.136, + "step": 882 + }, + { + "epoch": 3.532, + "grad_norm": 0.00017642974853515625, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.0, + "step": 883 + }, + { + "epoch": 3.532, + "eval_accuracy": 0.99, + "eval_loss": 0.04473983123898506, + "eval_runtime": 29.5635, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 883 + }, + { + "epoch": 3.536, + "grad_norm": 9.584426879882812e-05, + "learning_rate": 1.464e-05, + "loss": 0.0, + "step": 884 + }, + { + "epoch": 3.536, + "eval_accuracy": 0.99, + "eval_loss": 0.045080073177814484, + "eval_runtime": 29.5627, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 884 + }, + { + "epoch": 3.54, + "grad_norm": 3.4809112548828125e-05, + "learning_rate": 1.4599999999999999e-05, + "loss": 0.0, + "step": 885 + }, + { + "epoch": 3.54, + "eval_accuracy": 0.99, + "eval_loss": 0.04503197595477104, + "eval_runtime": 29.5826, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 885 + }, + { + "epoch": 3.544, + "grad_norm": 0.00151824951171875, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.0, + "step": 886 + }, + { + "epoch": 3.544, + "eval_accuracy": 0.99, + "eval_loss": 0.04451673850417137, + "eval_runtime": 29.5856, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 886 + }, + { + "epoch": 3.548, + "grad_norm": 7.867813110351562e-06, + "learning_rate": 1.452e-05, + "loss": 0.0, + "step": 887 + }, + { + "epoch": 3.548, + "eval_accuracy": 0.99, + "eval_loss": 0.044902872294187546, + "eval_runtime": 29.5137, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.135, + "step": 887 + }, + { + "epoch": 3.552, + "grad_norm": 0.000560760498046875, + "learning_rate": 1.4480000000000002e-05, + "loss": 0.0, + "step": 888 + }, + { + "epoch": 3.552, + "eval_accuracy": 0.99, + "eval_loss": 0.04528782516717911, + "eval_runtime": 29.516, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.134, + "step": 888 + }, + { + "epoch": 3.556, + "grad_norm": 0.078125, + "learning_rate": 1.444e-05, + "loss": 0.0001, + "step": 889 + }, + { + "epoch": 3.556, + "eval_accuracy": 0.99, + "eval_loss": 0.045340392738580704, + "eval_runtime": 29.5106, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.135, + "step": 889 + }, + { + "epoch": 3.56, + "grad_norm": 0.006256103515625, + "learning_rate": 1.44e-05, + "loss": 0.0, + "step": 890 + }, + { + "epoch": 3.56, + "eval_accuracy": 0.99, + "eval_loss": 0.04489907994866371, + "eval_runtime": 29.5046, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 890 + }, + { + "epoch": 3.564, + "grad_norm": 0.000335693359375, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.0, + "step": 891 + }, + { + "epoch": 3.564, + "eval_accuracy": 0.99, + "eval_loss": 0.045272696763277054, + "eval_runtime": 29.5252, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.134, + "step": 891 + }, + { + "epoch": 3.568, + "grad_norm": 1.0132789611816406e-05, + "learning_rate": 1.432e-05, + "loss": 0.0, + "step": 892 + }, + { + "epoch": 3.568, + "eval_accuracy": 0.99, + "eval_loss": 0.04524017870426178, + "eval_runtime": 29.5357, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.133, + "step": 892 + }, + { + "epoch": 3.572, + "grad_norm": 0.005340576171875, + "learning_rate": 1.4280000000000002e-05, + "loss": 0.0, + "step": 893 + }, + { + "epoch": 3.572, + "eval_accuracy": 0.99, + "eval_loss": 0.0450594462454319, + "eval_runtime": 29.5832, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 893 + }, + { + "epoch": 3.576, + "grad_norm": 0.00054168701171875, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.0, + "step": 894 + }, + { + "epoch": 3.576, + "eval_accuracy": 0.99, + "eval_loss": 0.045167628675699234, + "eval_runtime": 29.5669, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 894 + }, + { + "epoch": 3.58, + "grad_norm": 1.4543533325195312e-05, + "learning_rate": 1.42e-05, + "loss": 0.0, + "step": 895 + }, + { + "epoch": 3.58, + "eval_accuracy": 0.99, + "eval_loss": 0.04533691331744194, + "eval_runtime": 29.4997, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.136, + "step": 895 + }, + { + "epoch": 3.584, + "grad_norm": 0.04150390625, + "learning_rate": 1.4160000000000002e-05, + "loss": 0.0001, + "step": 896 + }, + { + "epoch": 3.584, + "eval_accuracy": 0.99, + "eval_loss": 0.04472129046916962, + "eval_runtime": 29.5083, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.135, + "step": 896 + }, + { + "epoch": 3.588, + "grad_norm": 0.002227783203125, + "learning_rate": 1.412e-05, + "loss": 0.0, + "step": 897 + }, + { + "epoch": 3.588, + "eval_accuracy": 0.99, + "eval_loss": 0.04468397796154022, + "eval_runtime": 29.5101, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.135, + "step": 897 + }, + { + "epoch": 3.592, + "grad_norm": 0.001495361328125, + "learning_rate": 1.408e-05, + "loss": 0.0, + "step": 898 + }, + { + "epoch": 3.592, + "eval_accuracy": 0.99, + "eval_loss": 0.04517534747719765, + "eval_runtime": 29.4784, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.137, + "step": 898 + }, + { + "epoch": 3.596, + "grad_norm": 3.2633543014526367e-06, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.0, + "step": 899 + }, + { + "epoch": 3.596, + "eval_accuracy": 0.99, + "eval_loss": 0.04489591345191002, + "eval_runtime": 29.5543, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 899 + }, + { + "epoch": 3.6, + "grad_norm": 0.01251220703125, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.0001, + "step": 900 + }, + { + "epoch": 3.6, + "eval_accuracy": 0.99, + "eval_loss": 0.04463127627968788, + "eval_runtime": 29.5081, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.135, + "step": 900 + }, + { + "epoch": 3.604, + "grad_norm": 0.0002593994140625, + "learning_rate": 1.396e-05, + "loss": 0.0, + "step": 901 + }, + { + "epoch": 3.604, + "eval_accuracy": 0.99, + "eval_loss": 0.0448782779276371, + "eval_runtime": 29.4538, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.139, + "step": 901 + }, + { + "epoch": 3.608, + "grad_norm": 0.00016498565673828125, + "learning_rate": 1.3919999999999999e-05, + "loss": 0.0, + "step": 902 + }, + { + "epoch": 3.608, + "eval_accuracy": 0.99, + "eval_loss": 0.044676780700683594, + "eval_runtime": 29.3753, + "eval_samples_per_second": 17.021, + "eval_steps_per_second": 2.145, + "step": 902 + }, + { + "epoch": 3.612, + "grad_norm": 2.921875, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.004, + "step": 903 + }, + { + "epoch": 3.612, + "eval_accuracy": 0.99, + "eval_loss": 0.044511664658784866, + "eval_runtime": 29.5012, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.136, + "step": 903 + }, + { + "epoch": 3.616, + "grad_norm": 0.018798828125, + "learning_rate": 1.384e-05, + "loss": 0.0, + "step": 904 + }, + { + "epoch": 3.616, + "eval_accuracy": 0.99, + "eval_loss": 0.04487626627087593, + "eval_runtime": 29.5292, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.133, + "step": 904 + }, + { + "epoch": 3.62, + "grad_norm": 1.328125, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.0038, + "step": 905 + }, + { + "epoch": 3.62, + "eval_accuracy": 0.99, + "eval_loss": 0.04518745839595795, + "eval_runtime": 29.4865, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.137, + "step": 905 + }, + { + "epoch": 3.624, + "grad_norm": 5.745887756347656e-05, + "learning_rate": 1.376e-05, + "loss": 0.0, + "step": 906 + }, + { + "epoch": 3.624, + "eval_accuracy": 0.99, + "eval_loss": 0.04448705166578293, + "eval_runtime": 29.5052, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 906 + }, + { + "epoch": 3.628, + "grad_norm": 9.625, + "learning_rate": 1.3719999999999999e-05, + "loss": 0.025, + "step": 907 + }, + { + "epoch": 3.628, + "eval_accuracy": 0.99, + "eval_loss": 0.04501795023679733, + "eval_runtime": 29.5385, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 907 + }, + { + "epoch": 3.632, + "grad_norm": 3.8929283618927e-07, + "learning_rate": 1.3680000000000001e-05, + "loss": 0.0, + "step": 908 + }, + { + "epoch": 3.632, + "eval_accuracy": 0.99, + "eval_loss": 0.04413921758532524, + "eval_runtime": 29.5164, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.134, + "step": 908 + }, + { + "epoch": 3.636, + "grad_norm": 0.00020503997802734375, + "learning_rate": 1.364e-05, + "loss": 0.0, + "step": 909 + }, + { + "epoch": 3.636, + "eval_accuracy": 0.99, + "eval_loss": 0.044362444430589676, + "eval_runtime": 29.5337, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.133, + "step": 909 + }, + { + "epoch": 3.64, + "grad_norm": 0.0025634765625, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.0, + "step": 910 + }, + { + "epoch": 3.64, + "eval_accuracy": 0.99, + "eval_loss": 0.04414486140012741, + "eval_runtime": 29.5337, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.133, + "step": 910 + }, + { + "epoch": 3.644, + "grad_norm": 1.7421875, + "learning_rate": 1.356e-05, + "loss": 0.0049, + "step": 911 + }, + { + "epoch": 3.644, + "eval_accuracy": 0.99, + "eval_loss": 0.04438505694270134, + "eval_runtime": 29.5869, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 911 + }, + { + "epoch": 3.648, + "grad_norm": 0.059814453125, + "learning_rate": 1.352e-05, + "loss": 0.0002, + "step": 912 + }, + { + "epoch": 3.648, + "eval_accuracy": 0.99, + "eval_loss": 0.044002216309309006, + "eval_runtime": 29.5381, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 912 + }, + { + "epoch": 3.652, + "grad_norm": 3.1478703022003174e-07, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.0, + "step": 913 + }, + { + "epoch": 3.652, + "eval_accuracy": 0.99, + "eval_loss": 0.04424380883574486, + "eval_runtime": 29.5947, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 913 + }, + { + "epoch": 3.656, + "grad_norm": 0.0269775390625, + "learning_rate": 1.344e-05, + "loss": 0.0001, + "step": 914 + }, + { + "epoch": 3.656, + "eval_accuracy": 0.99, + "eval_loss": 0.04349396377801895, + "eval_runtime": 29.5571, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.131, + "step": 914 + }, + { + "epoch": 3.66, + "grad_norm": 0.16796875, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.0005, + "step": 915 + }, + { + "epoch": 3.66, + "eval_accuracy": 0.99, + "eval_loss": 0.043777357786893845, + "eval_runtime": 29.5614, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 915 + }, + { + "epoch": 3.664, + "grad_norm": 7.05718994140625e-05, + "learning_rate": 1.336e-05, + "loss": 0.0, + "step": 916 + }, + { + "epoch": 3.664, + "eval_accuracy": 0.99, + "eval_loss": 0.042920198291540146, + "eval_runtime": 29.6197, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 916 + }, + { + "epoch": 3.668, + "grad_norm": 1.1640625, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.0031, + "step": 917 + }, + { + "epoch": 3.668, + "eval_accuracy": 0.99, + "eval_loss": 0.04266909137368202, + "eval_runtime": 29.6058, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 917 + }, + { + "epoch": 3.672, + "grad_norm": 0.00116729736328125, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.0, + "step": 918 + }, + { + "epoch": 3.672, + "eval_accuracy": 0.99, + "eval_loss": 0.04336573928594589, + "eval_runtime": 29.5724, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 918 + }, + { + "epoch": 3.676, + "grad_norm": 0.019775390625, + "learning_rate": 1.324e-05, + "loss": 0.0, + "step": 919 + }, + { + "epoch": 3.676, + "eval_accuracy": 0.99, + "eval_loss": 0.04331319406628609, + "eval_runtime": 29.5741, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 919 + }, + { + "epoch": 3.68, + "grad_norm": 0.00012302398681640625, + "learning_rate": 1.32e-05, + "loss": 0.0, + "step": 920 + }, + { + "epoch": 3.68, + "eval_accuracy": 0.99, + "eval_loss": 0.04347749426960945, + "eval_runtime": 29.5936, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 920 + }, + { + "epoch": 3.684, + "grad_norm": 2.1875, + "learning_rate": 1.316e-05, + "loss": 0.0062, + "step": 921 + }, + { + "epoch": 3.684, + "eval_accuracy": 0.99, + "eval_loss": 0.04306238889694214, + "eval_runtime": 29.5858, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 921 + }, + { + "epoch": 3.6879999999999997, + "grad_norm": 3.703125, + "learning_rate": 1.3120000000000001e-05, + "loss": 0.0019, + "step": 922 + }, + { + "epoch": 3.6879999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.0435580350458622, + "eval_runtime": 29.6359, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 922 + }, + { + "epoch": 3.692, + "grad_norm": 42.0, + "learning_rate": 1.308e-05, + "loss": 0.1729, + "step": 923 + }, + { + "epoch": 3.692, + "eval_accuracy": 0.99, + "eval_loss": 0.04330654814839363, + "eval_runtime": 29.5979, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 923 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 0.000759124755859375, + "learning_rate": 1.3039999999999999e-05, + "loss": 0.0, + "step": 924 + }, + { + "epoch": 3.6959999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.04379289224743843, + "eval_runtime": 29.6055, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 924 + }, + { + "epoch": 3.7, + "grad_norm": 0.0111083984375, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.0, + "step": 925 + }, + { + "epoch": 3.7, + "eval_accuracy": 0.99, + "eval_loss": 0.04407418891787529, + "eval_runtime": 29.6139, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 925 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 0.0002918243408203125, + "learning_rate": 1.296e-05, + "loss": 0.0, + "step": 926 + }, + { + "epoch": 3.7039999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.04353560507297516, + "eval_runtime": 29.5948, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 926 + }, + { + "epoch": 3.708, + "grad_norm": 0.0238037109375, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.0001, + "step": 927 + }, + { + "epoch": 3.708, + "eval_accuracy": 0.99, + "eval_loss": 0.043574340641498566, + "eval_runtime": 29.5868, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 927 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 1.765625, + "learning_rate": 1.288e-05, + "loss": 0.0078, + "step": 928 + }, + { + "epoch": 3.7119999999999997, + "eval_accuracy": 0.99, + "eval_loss": 0.043133530765771866, + "eval_runtime": 29.6, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 928 + }, + { + "epoch": 3.716, + "grad_norm": 2.3096799850463867e-06, + "learning_rate": 1.2839999999999999e-05, + "loss": 0.0, + "step": 929 + }, + { + "epoch": 3.716, + "eval_accuracy": 0.99, + "eval_loss": 0.04431530088186264, + "eval_runtime": 29.6441, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 929 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 0.0013885498046875, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.0, + "step": 930 + }, + { + "epoch": 3.7199999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04348866641521454, + "eval_runtime": 29.6552, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.124, + "step": 930 + }, + { + "epoch": 3.724, + "grad_norm": 0.07275390625, + "learning_rate": 1.276e-05, + "loss": 0.0001, + "step": 931 + }, + { + "epoch": 3.724, + "eval_accuracy": 0.99, + "eval_loss": 0.04372880607843399, + "eval_runtime": 29.6531, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.125, + "step": 931 + }, + { + "epoch": 3.7279999999999998, + "grad_norm": 0.13671875, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.0004, + "step": 932 + }, + { + "epoch": 3.7279999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.043952103704214096, + "eval_runtime": 29.6167, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 932 + }, + { + "epoch": 3.732, + "grad_norm": 1.1171875, + "learning_rate": 1.268e-05, + "loss": 0.0022, + "step": 933 + }, + { + "epoch": 3.732, + "eval_accuracy": 0.99, + "eval_loss": 0.04300355911254883, + "eval_runtime": 29.6143, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 933 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 8.487701416015625e-05, + "learning_rate": 1.2640000000000003e-05, + "loss": 0.0, + "step": 934 + }, + { + "epoch": 3.7359999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04370502382516861, + "eval_runtime": 29.6185, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 934 + }, + { + "epoch": 3.74, + "grad_norm": 1.7762184143066406e-05, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.0, + "step": 935 + }, + { + "epoch": 3.74, + "eval_accuracy": 0.99, + "eval_loss": 0.043567948043346405, + "eval_runtime": 29.669, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.123, + "step": 935 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 0.01904296875, + "learning_rate": 1.256e-05, + "loss": 0.0, + "step": 936 + }, + { + "epoch": 3.7439999999999998, + "eval_accuracy": 0.99, + "eval_loss": 0.04243483766913414, + "eval_runtime": 29.6134, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 936 + }, + { + "epoch": 3.748, + "grad_norm": 0.006011962890625, + "learning_rate": 1.252e-05, + "loss": 0.0, + "step": 937 + }, + { + "epoch": 3.748, + "eval_accuracy": 0.99, + "eval_loss": 0.04223757982254028, + "eval_runtime": 29.5988, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.128, + "step": 937 + }, + { + "epoch": 3.752, + "grad_norm": 1.043081283569336e-05, + "learning_rate": 1.248e-05, + "loss": 0.0, + "step": 938 + }, + { + "epoch": 3.752, + "eval_accuracy": 0.99, + "eval_loss": 0.04328092187643051, + "eval_runtime": 29.5941, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.129, + "step": 938 + }, + { + "epoch": 3.7560000000000002, + "grad_norm": 4.029273986816406e-05, + "learning_rate": 1.244e-05, + "loss": 0.0, + "step": 939 + }, + { + "epoch": 3.7560000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04292558133602142, + "eval_runtime": 29.5806, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 939 + }, + { + "epoch": 3.76, + "grad_norm": 0.003143310546875, + "learning_rate": 1.24e-05, + "loss": 0.0, + "step": 940 + }, + { + "epoch": 3.76, + "eval_accuracy": 0.99, + "eval_loss": 0.04296302795410156, + "eval_runtime": 29.5916, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 940 + }, + { + "epoch": 3.7640000000000002, + "grad_norm": 9.953975677490234e-06, + "learning_rate": 1.236e-05, + "loss": 0.0, + "step": 941 + }, + { + "epoch": 3.7640000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04307146742939949, + "eval_runtime": 29.6368, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 941 + }, + { + "epoch": 3.768, + "grad_norm": 7.46875, + "learning_rate": 1.232e-05, + "loss": 0.0186, + "step": 942 + }, + { + "epoch": 3.768, + "eval_accuracy": 0.99, + "eval_loss": 0.04303477331995964, + "eval_runtime": 29.6266, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.126, + "step": 942 + }, + { + "epoch": 3.7720000000000002, + "grad_norm": 0.037353515625, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.0002, + "step": 943 + }, + { + "epoch": 3.7720000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04279663786292076, + "eval_runtime": 29.5735, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 943 + }, + { + "epoch": 3.776, + "grad_norm": 0.20703125, + "learning_rate": 1.224e-05, + "loss": 0.0005, + "step": 944 + }, + { + "epoch": 3.776, + "eval_accuracy": 0.99, + "eval_loss": 0.04312938451766968, + "eval_runtime": 29.5743, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 944 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 1.055002212524414e-05, + "learning_rate": 1.22e-05, + "loss": 0.0, + "step": 945 + }, + { + "epoch": 3.7800000000000002, + "eval_accuracy": 0.99, + "eval_loss": 0.04254954308271408, + "eval_runtime": 29.6179, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 945 + }, + { + "epoch": 3.784, + "grad_norm": 0.83203125, + "learning_rate": 1.216e-05, + "loss": 0.0028, + "step": 946 + }, + { + "epoch": 3.784, + "eval_accuracy": 0.99, + "eval_loss": 0.042534299194812775, + "eval_runtime": 29.5755, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 946 + }, + { + "epoch": 3.7880000000000003, + "grad_norm": 0.1953125, + "learning_rate": 1.2120000000000001e-05, + "loss": 0.0006, + "step": 947 + }, + { + "epoch": 3.7880000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.04329424723982811, + "eval_runtime": 29.5835, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 947 + }, + { + "epoch": 3.792, + "grad_norm": 1.8742866814136505e-08, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.0, + "step": 948 + }, + { + "epoch": 3.792, + "eval_accuracy": 0.99, + "eval_loss": 0.04258767142891884, + "eval_runtime": 29.6229, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 948 + }, + { + "epoch": 3.7960000000000003, + "grad_norm": 0.016357421875, + "learning_rate": 1.204e-05, + "loss": 0.0, + "step": 949 + }, + { + "epoch": 3.7960000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.04225219786167145, + "eval_runtime": 29.5801, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 949 + }, + { + "epoch": 3.8, + "grad_norm": 0.00732421875, + "learning_rate": 1.2e-05, + "loss": 0.0, + "step": 950 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.99, + "eval_loss": 0.04215637966990471, + "eval_runtime": 29.6235, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 950 + }, + { + "epoch": 3.8040000000000003, + "grad_norm": 1.4424324035644531e-05, + "learning_rate": 1.196e-05, + "loss": 0.0, + "step": 951 + }, + { + "epoch": 3.8040000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.04148517921566963, + "eval_runtime": 29.4999, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.136, + "step": 951 + }, + { + "epoch": 3.808, + "grad_norm": 3.0994415283203125e-06, + "learning_rate": 1.1920000000000001e-05, + "loss": 0.0, + "step": 952 + }, + { + "epoch": 3.808, + "eval_accuracy": 0.99, + "eval_loss": 0.04249126836657524, + "eval_runtime": 29.4638, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.138, + "step": 952 + }, + { + "epoch": 3.8120000000000003, + "grad_norm": 0.00019359588623046875, + "learning_rate": 1.1880000000000001e-05, + "loss": 0.0, + "step": 953 + }, + { + "epoch": 3.8120000000000003, + "eval_accuracy": 0.99, + "eval_loss": 0.042157188057899475, + "eval_runtime": 29.5669, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 953 + }, + { + "epoch": 3.816, + "grad_norm": 4.410743713378906e-06, + "learning_rate": 1.1840000000000002e-05, + "loss": 0.0, + "step": 954 + }, + { + "epoch": 3.816, + "eval_accuracy": 0.99, + "eval_loss": 0.04181208834052086, + "eval_runtime": 29.5515, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.132, + "step": 954 + }, + { + "epoch": 3.82, + "grad_norm": 0.000148773193359375, + "learning_rate": 1.18e-05, + "loss": 0.0, + "step": 955 + }, + { + "epoch": 3.82, + "eval_accuracy": 0.99, + "eval_loss": 0.041841376572847366, + "eval_runtime": 29.5693, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 955 + }, + { + "epoch": 3.824, + "grad_norm": 6.794929504394531e-06, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.0, + "step": 956 + }, + { + "epoch": 3.824, + "eval_accuracy": 0.99, + "eval_loss": 0.04177083447575569, + "eval_runtime": 29.6296, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 956 + }, + { + "epoch": 3.828, + "grad_norm": 1.4435499906539917e-08, + "learning_rate": 1.172e-05, + "loss": 0.0, + "step": 957 + }, + { + "epoch": 3.828, + "eval_accuracy": 0.99, + "eval_loss": 0.04161449149250984, + "eval_runtime": 29.5967, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 957 + }, + { + "epoch": 3.832, + "grad_norm": 3.562308847904205e-08, + "learning_rate": 1.168e-05, + "loss": 0.0, + "step": 958 + }, + { + "epoch": 3.832, + "eval_accuracy": 0.99, + "eval_loss": 0.04125643149018288, + "eval_runtime": 29.6359, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 958 + }, + { + "epoch": 3.836, + "grad_norm": 0.0027008056640625, + "learning_rate": 1.164e-05, + "loss": 0.0, + "step": 959 + }, + { + "epoch": 3.836, + "eval_accuracy": 0.99, + "eval_loss": 0.04197949543595314, + "eval_runtime": 29.6361, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 959 + }, + { + "epoch": 3.84, + "grad_norm": 5.736947059631348e-07, + "learning_rate": 1.16e-05, + "loss": 0.0, + "step": 960 + }, + { + "epoch": 3.84, + "eval_accuracy": 0.99, + "eval_loss": 0.04174867272377014, + "eval_runtime": 29.5932, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 960 + }, + { + "epoch": 3.844, + "grad_norm": 94.5, + "learning_rate": 1.156e-05, + "loss": 0.1816, + "step": 961 + }, + { + "epoch": 3.844, + "eval_accuracy": 0.99, + "eval_loss": 0.04199570417404175, + "eval_runtime": 29.6406, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.125, + "step": 961 + }, + { + "epoch": 3.848, + "grad_norm": 2.8312206268310547e-07, + "learning_rate": 1.152e-05, + "loss": 0.0, + "step": 962 + }, + { + "epoch": 3.848, + "eval_accuracy": 0.99, + "eval_loss": 0.041954588145017624, + "eval_runtime": 29.646, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 962 + }, + { + "epoch": 3.852, + "grad_norm": 0.1162109375, + "learning_rate": 1.148e-05, + "loss": 0.0004, + "step": 963 + }, + { + "epoch": 3.852, + "eval_accuracy": 0.99, + "eval_loss": 0.041209135204553604, + "eval_runtime": 29.609, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 963 + }, + { + "epoch": 3.856, + "grad_norm": 6.295740604400635e-07, + "learning_rate": 1.144e-05, + "loss": 0.0, + "step": 964 + }, + { + "epoch": 3.856, + "eval_accuracy": 0.99, + "eval_loss": 0.04198392480611801, + "eval_runtime": 29.6064, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 964 + }, + { + "epoch": 3.86, + "grad_norm": 7.724761962890625e-05, + "learning_rate": 1.1400000000000001e-05, + "loss": 0.0, + "step": 965 + }, + { + "epoch": 3.86, + "eval_accuracy": 0.99, + "eval_loss": 0.041905585676431656, + "eval_runtime": 29.6146, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 965 + }, + { + "epoch": 3.864, + "grad_norm": 7.799826562404633e-09, + "learning_rate": 1.1360000000000001e-05, + "loss": 0.0, + "step": 966 + }, + { + "epoch": 3.864, + "eval_accuracy": 0.99, + "eval_loss": 0.04150322079658508, + "eval_runtime": 29.673, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.123, + "step": 966 + }, + { + "epoch": 3.868, + "grad_norm": 0.0020904541015625, + "learning_rate": 1.132e-05, + "loss": 0.0, + "step": 967 + }, + { + "epoch": 3.868, + "eval_accuracy": 0.99, + "eval_loss": 0.04134644940495491, + "eval_runtime": 29.6817, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.123, + "step": 967 + }, + { + "epoch": 3.872, + "grad_norm": 0.003631591796875, + "learning_rate": 1.128e-05, + "loss": 0.0, + "step": 968 + }, + { + "epoch": 3.872, + "eval_accuracy": 0.99, + "eval_loss": 0.041748009622097015, + "eval_runtime": 29.6321, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 968 + }, + { + "epoch": 3.876, + "grad_norm": 1.2442469596862793e-06, + "learning_rate": 1.124e-05, + "loss": 0.0, + "step": 969 + }, + { + "epoch": 3.876, + "eval_accuracy": 0.99, + "eval_loss": 0.040733445435762405, + "eval_runtime": 29.6286, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.126, + "step": 969 + }, + { + "epoch": 3.88, + "grad_norm": 0.0003223419189453125, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.0, + "step": 970 + }, + { + "epoch": 3.88, + "eval_accuracy": 0.99, + "eval_loss": 0.04287834092974663, + "eval_runtime": 29.6231, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 970 + }, + { + "epoch": 3.884, + "grad_norm": 6.151199340820312e-05, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.0, + "step": 971 + }, + { + "epoch": 3.884, + "eval_accuracy": 0.99, + "eval_loss": 0.04103665426373482, + "eval_runtime": 29.6387, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 971 + }, + { + "epoch": 3.888, + "grad_norm": 0.0002994537353515625, + "learning_rate": 1.112e-05, + "loss": 0.0, + "step": 972 + }, + { + "epoch": 3.888, + "eval_accuracy": 0.99, + "eval_loss": 0.04129284992814064, + "eval_runtime": 29.6066, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 972 + }, + { + "epoch": 3.892, + "grad_norm": 1.5854835510253906e-05, + "learning_rate": 1.108e-05, + "loss": 0.0, + "step": 973 + }, + { + "epoch": 3.892, + "eval_accuracy": 0.99, + "eval_loss": 0.04203477129340172, + "eval_runtime": 29.5546, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 973 + }, + { + "epoch": 3.896, + "grad_norm": 7.420778274536133e-06, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.0, + "step": 974 + }, + { + "epoch": 3.896, + "eval_accuracy": 0.99, + "eval_loss": 0.04125024378299713, + "eval_runtime": 29.5337, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.133, + "step": 974 + }, + { + "epoch": 3.9, + "grad_norm": 0.1640625, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.0003, + "step": 975 + }, + { + "epoch": 3.9, + "eval_accuracy": 0.99, + "eval_loss": 0.04228623956441879, + "eval_runtime": 29.5753, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 975 + }, + { + "epoch": 3.904, + "grad_norm": 1.2265625, + "learning_rate": 1.096e-05, + "loss": 0.0031, + "step": 976 + }, + { + "epoch": 3.904, + "eval_accuracy": 0.99, + "eval_loss": 0.04101424291729927, + "eval_runtime": 29.5041, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.135, + "step": 976 + }, + { + "epoch": 3.908, + "grad_norm": 17.375, + "learning_rate": 1.092e-05, + "loss": 0.2344, + "step": 977 + }, + { + "epoch": 3.908, + "eval_accuracy": 0.99, + "eval_loss": 0.04132353141903877, + "eval_runtime": 29.4882, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.136, + "step": 977 + }, + { + "epoch": 3.912, + "grad_norm": 0.01904296875, + "learning_rate": 1.088e-05, + "loss": 0.0001, + "step": 978 + }, + { + "epoch": 3.912, + "eval_accuracy": 0.99, + "eval_loss": 0.04085206985473633, + "eval_runtime": 29.4711, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.138, + "step": 978 + }, + { + "epoch": 3.916, + "grad_norm": 6.5, + "learning_rate": 1.084e-05, + "loss": 0.0175, + "step": 979 + }, + { + "epoch": 3.916, + "eval_accuracy": 0.99, + "eval_loss": 0.03972804173827171, + "eval_runtime": 29.4558, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.139, + "step": 979 + }, + { + "epoch": 3.92, + "grad_norm": 2.078125, + "learning_rate": 1.08e-05, + "loss": 0.001, + "step": 980 + }, + { + "epoch": 3.92, + "eval_accuracy": 0.99, + "eval_loss": 0.040775902569293976, + "eval_runtime": 29.4499, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.139, + "step": 980 + }, + { + "epoch": 3.924, + "grad_norm": 0.000186920166015625, + "learning_rate": 1.076e-05, + "loss": 0.0, + "step": 981 + }, + { + "epoch": 3.924, + "eval_accuracy": 0.99, + "eval_loss": 0.04043996334075928, + "eval_runtime": 29.4468, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.139, + "step": 981 + }, + { + "epoch": 3.928, + "grad_norm": 3.6954879760742188e-06, + "learning_rate": 1.072e-05, + "loss": 0.0, + "step": 982 + }, + { + "epoch": 3.928, + "eval_accuracy": 0.99, + "eval_loss": 0.040118250995874405, + "eval_runtime": 29.4366, + "eval_samples_per_second": 16.986, + "eval_steps_per_second": 2.14, + "step": 982 + }, + { + "epoch": 3.932, + "grad_norm": 0.072265625, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.0003, + "step": 983 + }, + { + "epoch": 3.932, + "eval_accuracy": 0.99, + "eval_loss": 0.0395524799823761, + "eval_runtime": 29.4799, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.137, + "step": 983 + }, + { + "epoch": 3.936, + "grad_norm": 2.5727786123752594e-08, + "learning_rate": 1.064e-05, + "loss": 0.0, + "step": 984 + }, + { + "epoch": 3.936, + "eval_accuracy": 0.99, + "eval_loss": 0.04052968695759773, + "eval_runtime": 29.6227, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.127, + "step": 984 + }, + { + "epoch": 3.94, + "grad_norm": 0.0009765625, + "learning_rate": 1.06e-05, + "loss": 0.0, + "step": 985 + }, + { + "epoch": 3.94, + "eval_accuracy": 0.99, + "eval_loss": 0.040278397500514984, + "eval_runtime": 29.6667, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.124, + "step": 985 + }, + { + "epoch": 3.944, + "grad_norm": 1.0848045349121094e-05, + "learning_rate": 1.056e-05, + "loss": 0.0, + "step": 986 + }, + { + "epoch": 3.944, + "eval_accuracy": 0.99, + "eval_loss": 0.03977198898792267, + "eval_runtime": 29.6288, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 986 + }, + { + "epoch": 3.948, + "grad_norm": 0.10595703125, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.0002, + "step": 987 + }, + { + "epoch": 3.948, + "eval_accuracy": 0.99, + "eval_loss": 0.03944958373904228, + "eval_runtime": 29.6755, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.123, + "step": 987 + }, + { + "epoch": 3.952, + "grad_norm": 63.75, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.4004, + "step": 988 + }, + { + "epoch": 3.952, + "eval_accuracy": 0.99, + "eval_loss": 0.039371557533741, + "eval_runtime": 29.6207, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 988 + }, + { + "epoch": 3.956, + "grad_norm": 0.00160980224609375, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.0, + "step": 989 + }, + { + "epoch": 3.956, + "eval_accuracy": 0.99, + "eval_loss": 0.0394514724612236, + "eval_runtime": 29.6241, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 989 + }, + { + "epoch": 3.96, + "grad_norm": 0.000782012939453125, + "learning_rate": 1.04e-05, + "loss": 0.0, + "step": 990 + }, + { + "epoch": 3.96, + "eval_accuracy": 0.99, + "eval_loss": 0.0388411246240139, + "eval_runtime": 29.6869, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.122, + "step": 990 + }, + { + "epoch": 3.964, + "grad_norm": 0.00015163421630859375, + "learning_rate": 1.036e-05, + "loss": 0.0, + "step": 991 + }, + { + "epoch": 3.964, + "eval_accuracy": 0.99, + "eval_loss": 0.03952277451753616, + "eval_runtime": 29.6456, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 991 + }, + { + "epoch": 3.968, + "grad_norm": 1.5273690223693848e-06, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.0, + "step": 992 + }, + { + "epoch": 3.968, + "eval_accuracy": 0.99, + "eval_loss": 0.03920529782772064, + "eval_runtime": 29.6293, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 992 + }, + { + "epoch": 3.972, + "grad_norm": 0.025146484375, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.0, + "step": 993 + }, + { + "epoch": 3.972, + "eval_accuracy": 0.99, + "eval_loss": 0.03968721628189087, + "eval_runtime": 29.6824, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.122, + "step": 993 + }, + { + "epoch": 3.976, + "grad_norm": 0.0120849609375, + "learning_rate": 1.024e-05, + "loss": 0.0, + "step": 994 + }, + { + "epoch": 3.976, + "eval_accuracy": 0.99, + "eval_loss": 0.03966926410794258, + "eval_runtime": 29.608, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 994 + }, + { + "epoch": 3.98, + "grad_norm": 0.0003871917724609375, + "learning_rate": 1.02e-05, + "loss": 0.0, + "step": 995 + }, + { + "epoch": 3.98, + "eval_accuracy": 0.99, + "eval_loss": 0.03921307250857353, + "eval_runtime": 29.5521, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 995 + }, + { + "epoch": 3.984, + "grad_norm": 7.4803829193115234e-06, + "learning_rate": 1.016e-05, + "loss": 0.0, + "step": 996 + }, + { + "epoch": 3.984, + "eval_accuracy": 0.99, + "eval_loss": 0.03962932154536247, + "eval_runtime": 29.5241, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.134, + "step": 996 + }, + { + "epoch": 3.988, + "grad_norm": 0.0024871826171875, + "learning_rate": 1.012e-05, + "loss": 0.0, + "step": 997 + }, + { + "epoch": 3.988, + "eval_accuracy": 0.99, + "eval_loss": 0.03896590322256088, + "eval_runtime": 29.5085, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.135, + "step": 997 + }, + { + "epoch": 3.992, + "grad_norm": 1.3515625, + "learning_rate": 1.008e-05, + "loss": 0.004, + "step": 998 + }, + { + "epoch": 3.992, + "eval_accuracy": 0.99, + "eval_loss": 0.03928358480334282, + "eval_runtime": 29.5088, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.135, + "step": 998 + }, + { + "epoch": 3.996, + "grad_norm": 4.506111145019531e-05, + "learning_rate": 1.004e-05, + "loss": 0.0, + "step": 999 + }, + { + "epoch": 3.996, + "eval_accuracy": 0.99, + "eval_loss": 0.03890083730220795, + "eval_runtime": 29.5053, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 999 + }, + { + "epoch": 4.0, + "grad_norm": 9.778887033462524e-08, + "learning_rate": 1e-05, + "loss": 0.0, + "step": 1000 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.99, + "eval_loss": 0.03954102471470833, + "eval_runtime": 29.5043, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.135, + "step": 1000 + }, + { + "epoch": 4.004, + "grad_norm": 0.0001850128173828125, + "learning_rate": 9.96e-06, + "loss": 0.0, + "step": 1001 + }, + { + "epoch": 4.004, + "eval_accuracy": 0.99, + "eval_loss": 0.039021410048007965, + "eval_runtime": 29.4021, + "eval_samples_per_second": 17.006, + "eval_steps_per_second": 2.143, + "step": 1001 + }, + { + "epoch": 4.008, + "grad_norm": 0.10107421875, + "learning_rate": 9.92e-06, + "loss": 0.0003, + "step": 1002 + }, + { + "epoch": 4.008, + "eval_accuracy": 0.99, + "eval_loss": 0.038015760481357574, + "eval_runtime": 29.3869, + "eval_samples_per_second": 17.014, + "eval_steps_per_second": 2.144, + "step": 1002 + }, + { + "epoch": 4.012, + "grad_norm": 0.019775390625, + "learning_rate": 9.88e-06, + "loss": 0.0, + "step": 1003 + }, + { + "epoch": 4.012, + "eval_accuracy": 0.99, + "eval_loss": 0.03934221714735031, + "eval_runtime": 29.5271, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.134, + "step": 1003 + }, + { + "epoch": 4.016, + "grad_norm": 1.811981201171875e-05, + "learning_rate": 9.84e-06, + "loss": 0.0, + "step": 1004 + }, + { + "epoch": 4.016, + "eval_accuracy": 0.99, + "eval_loss": 0.039433617144823074, + "eval_runtime": 29.6652, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 1004 + }, + { + "epoch": 4.02, + "grad_norm": 0.0107421875, + "learning_rate": 9.800000000000001e-06, + "loss": 0.0, + "step": 1005 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.99, + "eval_loss": 0.03989221900701523, + "eval_runtime": 29.6361, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 1005 + }, + { + "epoch": 4.024, + "grad_norm": 41.75, + "learning_rate": 9.760000000000001e-06, + "loss": 0.2188, + "step": 1006 + }, + { + "epoch": 4.024, + "eval_accuracy": 0.99, + "eval_loss": 0.03940678387880325, + "eval_runtime": 29.662, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 1006 + }, + { + "epoch": 4.028, + "grad_norm": 4.589557647705078e-06, + "learning_rate": 9.72e-06, + "loss": 0.0, + "step": 1007 + }, + { + "epoch": 4.028, + "eval_accuracy": 0.99, + "eval_loss": 0.039036281406879425, + "eval_runtime": 29.6652, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 1007 + }, + { + "epoch": 4.032, + "grad_norm": 0.00028228759765625, + "learning_rate": 9.68e-06, + "loss": 0.0, + "step": 1008 + }, + { + "epoch": 4.032, + "eval_accuracy": 0.99, + "eval_loss": 0.03895065560936928, + "eval_runtime": 29.6627, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 1008 + }, + { + "epoch": 4.036, + "grad_norm": 0.000659942626953125, + "learning_rate": 9.640000000000001e-06, + "loss": 0.0, + "step": 1009 + }, + { + "epoch": 4.036, + "eval_accuracy": 0.99, + "eval_loss": 0.03935617208480835, + "eval_runtime": 29.6961, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.121, + "step": 1009 + }, + { + "epoch": 4.04, + "grad_norm": 0.01287841796875, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0, + "step": 1010 + }, + { + "epoch": 4.04, + "eval_accuracy": 0.99, + "eval_loss": 0.03903951868414879, + "eval_runtime": 29.5666, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 1010 + }, + { + "epoch": 4.044, + "grad_norm": 0.216796875, + "learning_rate": 9.560000000000002e-06, + "loss": 0.0007, + "step": 1011 + }, + { + "epoch": 4.044, + "eval_accuracy": 0.99, + "eval_loss": 0.03764825686812401, + "eval_runtime": 29.5034, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.135, + "step": 1011 + }, + { + "epoch": 4.048, + "grad_norm": 18.875, + "learning_rate": 9.52e-06, + "loss": 0.0762, + "step": 1012 + }, + { + "epoch": 4.048, + "eval_accuracy": 0.99, + "eval_loss": 0.0392136350274086, + "eval_runtime": 29.5192, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.134, + "step": 1012 + }, + { + "epoch": 4.052, + "grad_norm": 0.00093841552734375, + "learning_rate": 9.48e-06, + "loss": 0.0, + "step": 1013 + }, + { + "epoch": 4.052, + "eval_accuracy": 0.99, + "eval_loss": 0.039286114275455475, + "eval_runtime": 29.4582, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.139, + "step": 1013 + }, + { + "epoch": 4.056, + "grad_norm": 9.441375732421875e-05, + "learning_rate": 9.44e-06, + "loss": 0.0, + "step": 1014 + }, + { + "epoch": 4.056, + "eval_accuracy": 0.99, + "eval_loss": 0.039109617471694946, + "eval_runtime": 29.4302, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.141, + "step": 1014 + }, + { + "epoch": 4.06, + "grad_norm": 0.00016307830810546875, + "learning_rate": 9.4e-06, + "loss": 0.0, + "step": 1015 + }, + { + "epoch": 4.06, + "eval_accuracy": 0.99, + "eval_loss": 0.03794221580028534, + "eval_runtime": 29.4185, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.142, + "step": 1015 + }, + { + "epoch": 4.064, + "grad_norm": 53.25, + "learning_rate": 9.36e-06, + "loss": 0.1309, + "step": 1016 + }, + { + "epoch": 4.064, + "eval_accuracy": 0.99, + "eval_loss": 0.03839749097824097, + "eval_runtime": 29.4863, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.137, + "step": 1016 + }, + { + "epoch": 4.068, + "grad_norm": 49.0, + "learning_rate": 9.32e-06, + "loss": 0.6875, + "step": 1017 + }, + { + "epoch": 4.068, + "eval_accuracy": 0.99, + "eval_loss": 0.03825528547167778, + "eval_runtime": 29.4896, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.136, + "step": 1017 + }, + { + "epoch": 4.072, + "grad_norm": 3.046875, + "learning_rate": 9.28e-06, + "loss": 0.0095, + "step": 1018 + }, + { + "epoch": 4.072, + "eval_accuracy": 0.99, + "eval_loss": 0.03824084997177124, + "eval_runtime": 29.6687, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.123, + "step": 1018 + }, + { + "epoch": 4.076, + "grad_norm": 4.4345855712890625e-05, + "learning_rate": 9.24e-06, + "loss": 0.0, + "step": 1019 + }, + { + "epoch": 4.076, + "eval_accuracy": 0.99, + "eval_loss": 0.037776555866003036, + "eval_runtime": 29.6638, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.124, + "step": 1019 + }, + { + "epoch": 4.08, + "grad_norm": 0.00153350830078125, + "learning_rate": 9.2e-06, + "loss": 0.0, + "step": 1020 + }, + { + "epoch": 4.08, + "eval_accuracy": 0.99, + "eval_loss": 0.03957686945796013, + "eval_runtime": 29.6698, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.123, + "step": 1020 + }, + { + "epoch": 4.084, + "grad_norm": 0.88671875, + "learning_rate": 9.16e-06, + "loss": 0.0033, + "step": 1021 + }, + { + "epoch": 4.084, + "eval_accuracy": 0.99, + "eval_loss": 0.03899029642343521, + "eval_runtime": 29.7242, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.119, + "step": 1021 + }, + { + "epoch": 4.088, + "grad_norm": 26.625, + "learning_rate": 9.12e-06, + "loss": 0.1816, + "step": 1022 + }, + { + "epoch": 4.088, + "eval_accuracy": 0.99, + "eval_loss": 0.039245299994945526, + "eval_runtime": 29.7142, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.12, + "step": 1022 + }, + { + "epoch": 4.092, + "grad_norm": 4.124641418457031e-05, + "learning_rate": 9.080000000000001e-06, + "loss": 0.0, + "step": 1023 + }, + { + "epoch": 4.092, + "eval_accuracy": 0.99, + "eval_loss": 0.0391768254339695, + "eval_runtime": 29.5963, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 1023 + }, + { + "epoch": 4.096, + "grad_norm": 3.170967102050781e-05, + "learning_rate": 9.04e-06, + "loss": 0.0, + "step": 1024 + }, + { + "epoch": 4.096, + "eval_accuracy": 0.99, + "eval_loss": 0.039372269064188004, + "eval_runtime": 29.5111, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.135, + "step": 1024 + }, + { + "epoch": 4.1, + "grad_norm": 3.1478703022003174e-07, + "learning_rate": 9e-06, + "loss": 0.0, + "step": 1025 + }, + { + "epoch": 4.1, + "eval_accuracy": 0.99, + "eval_loss": 0.03929569944739342, + "eval_runtime": 29.5214, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.134, + "step": 1025 + }, + { + "epoch": 4.104, + "grad_norm": 0.0003662109375, + "learning_rate": 8.96e-06, + "loss": 0.0, + "step": 1026 + }, + { + "epoch": 4.104, + "eval_accuracy": 0.99, + "eval_loss": 0.038825128227472305, + "eval_runtime": 29.4277, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.141, + "step": 1026 + }, + { + "epoch": 4.108, + "grad_norm": 0.000274658203125, + "learning_rate": 8.920000000000001e-06, + "loss": 0.0, + "step": 1027 + }, + { + "epoch": 4.108, + "eval_accuracy": 0.99, + "eval_loss": 0.038858287036418915, + "eval_runtime": 29.4117, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.142, + "step": 1027 + }, + { + "epoch": 4.112, + "grad_norm": 9.632110595703125e-05, + "learning_rate": 8.880000000000001e-06, + "loss": 0.0, + "step": 1028 + }, + { + "epoch": 4.112, + "eval_accuracy": 0.99, + "eval_loss": 0.03774901106953621, + "eval_runtime": 29.477, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.137, + "step": 1028 + }, + { + "epoch": 4.116, + "grad_norm": 0.000888824462890625, + "learning_rate": 8.840000000000002e-06, + "loss": 0.0, + "step": 1029 + }, + { + "epoch": 4.116, + "eval_accuracy": 0.99, + "eval_loss": 0.03896695002913475, + "eval_runtime": 29.6493, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 1029 + }, + { + "epoch": 4.12, + "grad_norm": 3.981590270996094e-05, + "learning_rate": 8.8e-06, + "loss": 0.0, + "step": 1030 + }, + { + "epoch": 4.12, + "eval_accuracy": 0.99, + "eval_loss": 0.04000476747751236, + "eval_runtime": 29.6518, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.125, + "step": 1030 + }, + { + "epoch": 4.124, + "grad_norm": 0.00020599365234375, + "learning_rate": 8.76e-06, + "loss": 0.0, + "step": 1031 + }, + { + "epoch": 4.124, + "eval_accuracy": 0.99, + "eval_loss": 0.039399441331624985, + "eval_runtime": 29.7104, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.12, + "step": 1031 + }, + { + "epoch": 4.128, + "grad_norm": 0.04345703125, + "learning_rate": 8.720000000000001e-06, + "loss": 0.0001, + "step": 1032 + }, + { + "epoch": 4.128, + "eval_accuracy": 0.99, + "eval_loss": 0.039167679846286774, + "eval_runtime": 29.7239, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.12, + "step": 1032 + }, + { + "epoch": 4.132, + "grad_norm": 0.01446533203125, + "learning_rate": 8.68e-06, + "loss": 0.0001, + "step": 1033 + }, + { + "epoch": 4.132, + "eval_accuracy": 0.99, + "eval_loss": 0.0390964113175869, + "eval_runtime": 29.7217, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.12, + "step": 1033 + }, + { + "epoch": 4.136, + "grad_norm": 0.00157928466796875, + "learning_rate": 8.64e-06, + "loss": 0.0, + "step": 1034 + }, + { + "epoch": 4.136, + "eval_accuracy": 0.99, + "eval_loss": 0.03853569179773331, + "eval_runtime": 29.6347, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.126, + "step": 1034 + }, + { + "epoch": 4.14, + "grad_norm": 0.0029754638671875, + "learning_rate": 8.599999999999999e-06, + "loss": 0.0, + "step": 1035 + }, + { + "epoch": 4.14, + "eval_accuracy": 0.99, + "eval_loss": 0.03876250237226486, + "eval_runtime": 29.5551, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1035 + }, + { + "epoch": 4.144, + "grad_norm": 0.00019741058349609375, + "learning_rate": 8.56e-06, + "loss": 0.0, + "step": 1036 + }, + { + "epoch": 4.144, + "eval_accuracy": 0.99, + "eval_loss": 0.03972771391272545, + "eval_runtime": 29.5098, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.135, + "step": 1036 + }, + { + "epoch": 4.148, + "grad_norm": 1.3203125, + "learning_rate": 8.52e-06, + "loss": 0.0048, + "step": 1037 + }, + { + "epoch": 4.148, + "eval_accuracy": 0.99, + "eval_loss": 0.04004419595003128, + "eval_runtime": 29.4814, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.137, + "step": 1037 + }, + { + "epoch": 4.152, + "grad_norm": 88.5, + "learning_rate": 8.48e-06, + "loss": 1.2344, + "step": 1038 + }, + { + "epoch": 4.152, + "eval_accuracy": 0.99, + "eval_loss": 0.03941133990883827, + "eval_runtime": 29.4697, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.138, + "step": 1038 + }, + { + "epoch": 4.156, + "grad_norm": 0.0498046875, + "learning_rate": 8.44e-06, + "loss": 0.0002, + "step": 1039 + }, + { + "epoch": 4.156, + "eval_accuracy": 0.99, + "eval_loss": 0.039659980684518814, + "eval_runtime": 29.4565, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.139, + "step": 1039 + }, + { + "epoch": 4.16, + "grad_norm": 0.0021209716796875, + "learning_rate": 8.400000000000001e-06, + "loss": 0.0, + "step": 1040 + }, + { + "epoch": 4.16, + "eval_accuracy": 0.99, + "eval_loss": 0.04016843065619469, + "eval_runtime": 29.5052, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 1040 + }, + { + "epoch": 4.164, + "grad_norm": 0.004730224609375, + "learning_rate": 8.36e-06, + "loss": 0.0, + "step": 1041 + }, + { + "epoch": 4.164, + "eval_accuracy": 0.99, + "eval_loss": 0.04058794304728508, + "eval_runtime": 29.5074, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.135, + "step": 1041 + }, + { + "epoch": 4.168, + "grad_norm": 0.00018978118896484375, + "learning_rate": 8.32e-06, + "loss": 0.0, + "step": 1042 + }, + { + "epoch": 4.168, + "eval_accuracy": 0.99, + "eval_loss": 0.04041881114244461, + "eval_runtime": 29.5027, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.135, + "step": 1042 + }, + { + "epoch": 4.172, + "grad_norm": 1.4483928680419922e-05, + "learning_rate": 8.28e-06, + "loss": 0.0, + "step": 1043 + }, + { + "epoch": 4.172, + "eval_accuracy": 0.99, + "eval_loss": 0.03954387828707695, + "eval_runtime": 29.4526, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.139, + "step": 1043 + }, + { + "epoch": 4.176, + "grad_norm": 0.1845703125, + "learning_rate": 8.24e-06, + "loss": 0.0007, + "step": 1044 + }, + { + "epoch": 4.176, + "eval_accuracy": 0.99, + "eval_loss": 0.0387566015124321, + "eval_runtime": 29.5233, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.134, + "step": 1044 + }, + { + "epoch": 4.18, + "grad_norm": 0.1689453125, + "learning_rate": 8.200000000000001e-06, + "loss": 0.0005, + "step": 1045 + }, + { + "epoch": 4.18, + "eval_accuracy": 0.99, + "eval_loss": 0.03895098716020584, + "eval_runtime": 29.6924, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.122, + "step": 1045 + }, + { + "epoch": 4.184, + "grad_norm": 0.00128173828125, + "learning_rate": 8.160000000000001e-06, + "loss": 0.0, + "step": 1046 + }, + { + "epoch": 4.184, + "eval_accuracy": 0.99, + "eval_loss": 0.03941427543759346, + "eval_runtime": 29.7365, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.119, + "step": 1046 + }, + { + "epoch": 4.188, + "grad_norm": 0.0011444091796875, + "learning_rate": 8.12e-06, + "loss": 0.0, + "step": 1047 + }, + { + "epoch": 4.188, + "eval_accuracy": 0.99, + "eval_loss": 0.039452120661735535, + "eval_runtime": 29.6792, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.123, + "step": 1047 + }, + { + "epoch": 4.192, + "grad_norm": 128.0, + "learning_rate": 8.08e-06, + "loss": 0.9375, + "step": 1048 + }, + { + "epoch": 4.192, + "eval_accuracy": 0.99, + "eval_loss": 0.041386596858501434, + "eval_runtime": 29.6787, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.123, + "step": 1048 + }, + { + "epoch": 4.196, + "grad_norm": 7.392372936010361e-09, + "learning_rate": 8.040000000000001e-06, + "loss": 0.0, + "step": 1049 + }, + { + "epoch": 4.196, + "eval_accuracy": 0.99, + "eval_loss": 0.03989020735025406, + "eval_runtime": 29.6645, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 1049 + }, + { + "epoch": 4.2, + "grad_norm": 2.16066837310791e-06, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0, + "step": 1050 + }, + { + "epoch": 4.2, + "eval_accuracy": 0.99, + "eval_loss": 0.040255263447761536, + "eval_runtime": 29.5869, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 1050 + }, + { + "epoch": 4.204, + "grad_norm": 4.291534423828125e-05, + "learning_rate": 7.96e-06, + "loss": 0.0, + "step": 1051 + }, + { + "epoch": 4.204, + "eval_accuracy": 0.99, + "eval_loss": 0.03939709812402725, + "eval_runtime": 29.4268, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.141, + "step": 1051 + }, + { + "epoch": 4.208, + "grad_norm": 0.000171661376953125, + "learning_rate": 7.92e-06, + "loss": 0.0, + "step": 1052 + }, + { + "epoch": 4.208, + "eval_accuracy": 0.99, + "eval_loss": 0.03950495272874832, + "eval_runtime": 29.4705, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.138, + "step": 1052 + }, + { + "epoch": 4.212, + "grad_norm": 0.006805419921875, + "learning_rate": 7.879999999999999e-06, + "loss": 0.0, + "step": 1053 + }, + { + "epoch": 4.212, + "eval_accuracy": 0.99, + "eval_loss": 0.04080510511994362, + "eval_runtime": 29.6038, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 1053 + }, + { + "epoch": 4.216, + "grad_norm": 4.59375, + "learning_rate": 7.84e-06, + "loss": 0.0181, + "step": 1054 + }, + { + "epoch": 4.216, + "eval_accuracy": 0.99, + "eval_loss": 0.039391010999679565, + "eval_runtime": 29.7114, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.12, + "step": 1054 + }, + { + "epoch": 4.22, + "grad_norm": 0.001556396484375, + "learning_rate": 7.8e-06, + "loss": 0.0, + "step": 1055 + }, + { + "epoch": 4.22, + "eval_accuracy": 0.99, + "eval_loss": 0.04029190167784691, + "eval_runtime": 29.6918, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.122, + "step": 1055 + }, + { + "epoch": 4.224, + "grad_norm": 2.9355287551879883e-06, + "learning_rate": 7.76e-06, + "loss": 0.0, + "step": 1056 + }, + { + "epoch": 4.224, + "eval_accuracy": 0.99, + "eval_loss": 0.04256340116262436, + "eval_runtime": 29.7047, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.121, + "step": 1056 + }, + { + "epoch": 4.228, + "grad_norm": 3.600120544433594e-05, + "learning_rate": 7.72e-06, + "loss": 0.0, + "step": 1057 + }, + { + "epoch": 4.228, + "eval_accuracy": 0.99, + "eval_loss": 0.040913090109825134, + "eval_runtime": 29.687, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.122, + "step": 1057 + }, + { + "epoch": 4.232, + "grad_norm": 0.0004558563232421875, + "learning_rate": 7.68e-06, + "loss": 0.0, + "step": 1058 + }, + { + "epoch": 4.232, + "eval_accuracy": 0.99, + "eval_loss": 0.03999725356698036, + "eval_runtime": 29.5907, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 1058 + }, + { + "epoch": 4.236, + "grad_norm": 1.1757947504520416e-08, + "learning_rate": 7.64e-06, + "loss": 0.0, + "step": 1059 + }, + { + "epoch": 4.236, + "eval_accuracy": 0.99, + "eval_loss": 0.039890944957733154, + "eval_runtime": 29.5784, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 1059 + }, + { + "epoch": 4.24, + "grad_norm": 0.01007080078125, + "learning_rate": 7.6e-06, + "loss": 0.0001, + "step": 1060 + }, + { + "epoch": 4.24, + "eval_accuracy": 0.99, + "eval_loss": 0.04023539647459984, + "eval_runtime": 29.5547, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1060 + }, + { + "epoch": 4.244, + "grad_norm": 0.001434326171875, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.0, + "step": 1061 + }, + { + "epoch": 4.244, + "eval_accuracy": 0.99, + "eval_loss": 0.03972232714295387, + "eval_runtime": 29.659, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.124, + "step": 1061 + }, + { + "epoch": 4.248, + "grad_norm": 0.0869140625, + "learning_rate": 7.520000000000001e-06, + "loss": 0.0002, + "step": 1062 + }, + { + "epoch": 4.248, + "eval_accuracy": 0.99, + "eval_loss": 0.03943677991628647, + "eval_runtime": 29.6897, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.122, + "step": 1062 + }, + { + "epoch": 4.252, + "grad_norm": 0.0020751953125, + "learning_rate": 7.480000000000001e-06, + "loss": 0.0, + "step": 1063 + }, + { + "epoch": 4.252, + "eval_accuracy": 0.99, + "eval_loss": 0.04217665269970894, + "eval_runtime": 29.6881, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.122, + "step": 1063 + }, + { + "epoch": 4.256, + "grad_norm": 32.75, + "learning_rate": 7.44e-06, + "loss": 0.0952, + "step": 1064 + }, + { + "epoch": 4.256, + "eval_accuracy": 0.99, + "eval_loss": 0.040702592581510544, + "eval_runtime": 29.682, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.123, + "step": 1064 + }, + { + "epoch": 4.26, + "grad_norm": 1.6670674085617065e-07, + "learning_rate": 7.4e-06, + "loss": 0.0, + "step": 1065 + }, + { + "epoch": 4.26, + "eval_accuracy": 0.99, + "eval_loss": 0.040398627519607544, + "eval_runtime": 29.63, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 1065 + }, + { + "epoch": 4.264, + "grad_norm": 1.78125, + "learning_rate": 7.36e-06, + "loss": 0.0079, + "step": 1066 + }, + { + "epoch": 4.264, + "eval_accuracy": 0.99, + "eval_loss": 0.040352705866098404, + "eval_runtime": 29.5387, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 1066 + }, + { + "epoch": 4.268, + "grad_norm": 47.0, + "learning_rate": 7.32e-06, + "loss": 0.2344, + "step": 1067 + }, + { + "epoch": 4.268, + "eval_accuracy": 0.99, + "eval_loss": 0.04044386371970177, + "eval_runtime": 29.5014, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.135, + "step": 1067 + }, + { + "epoch": 4.272, + "grad_norm": 0.09326171875, + "learning_rate": 7.280000000000001e-06, + "loss": 0.0003, + "step": 1068 + }, + { + "epoch": 4.272, + "eval_accuracy": 0.99, + "eval_loss": 0.040361903607845306, + "eval_runtime": 29.409, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 2.142, + "step": 1068 + }, + { + "epoch": 4.276, + "grad_norm": 6.29425048828125e-05, + "learning_rate": 7.240000000000001e-06, + "loss": 0.0, + "step": 1069 + }, + { + "epoch": 4.276, + "eval_accuracy": 0.99, + "eval_loss": 0.03990791365504265, + "eval_runtime": 29.4691, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.138, + "step": 1069 + }, + { + "epoch": 4.28, + "grad_norm": 1.214444637298584e-06, + "learning_rate": 7.2e-06, + "loss": 0.0, + "step": 1070 + }, + { + "epoch": 4.28, + "eval_accuracy": 0.99, + "eval_loss": 0.04080616682767868, + "eval_runtime": 29.5731, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 1070 + }, + { + "epoch": 4.284, + "grad_norm": 1.0896474123001099e-07, + "learning_rate": 7.16e-06, + "loss": 0.0, + "step": 1071 + }, + { + "epoch": 4.284, + "eval_accuracy": 0.99, + "eval_loss": 0.040195535868406296, + "eval_runtime": 29.6653, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 1071 + }, + { + "epoch": 4.288, + "grad_norm": 0.00830078125, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.0, + "step": 1072 + }, + { + "epoch": 4.288, + "eval_accuracy": 0.99, + "eval_loss": 0.041274294257164, + "eval_runtime": 29.6845, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.122, + "step": 1072 + }, + { + "epoch": 4.292, + "grad_norm": 0.0027313232421875, + "learning_rate": 7.080000000000001e-06, + "loss": 0.0, + "step": 1073 + }, + { + "epoch": 4.292, + "eval_accuracy": 0.99, + "eval_loss": 0.04054008424282074, + "eval_runtime": 29.729, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.119, + "step": 1073 + }, + { + "epoch": 4.296, + "grad_norm": 0.005157470703125, + "learning_rate": 7.04e-06, + "loss": 0.0, + "step": 1074 + }, + { + "epoch": 4.296, + "eval_accuracy": 0.99, + "eval_loss": 0.040245767682790756, + "eval_runtime": 29.6251, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 1074 + }, + { + "epoch": 4.3, + "grad_norm": 8.296966552734375e-05, + "learning_rate": 7.000000000000001e-06, + "loss": 0.0, + "step": 1075 + }, + { + "epoch": 4.3, + "eval_accuracy": 0.99, + "eval_loss": 0.04039880260825157, + "eval_runtime": 29.6161, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.127, + "step": 1075 + }, + { + "epoch": 4.304, + "grad_norm": 0.000270843505859375, + "learning_rate": 6.9599999999999994e-06, + "loss": 0.0, + "step": 1076 + }, + { + "epoch": 4.304, + "eval_accuracy": 0.99, + "eval_loss": 0.03998670354485512, + "eval_runtime": 29.5246, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.134, + "step": 1076 + }, + { + "epoch": 4.308, + "grad_norm": 2.193450927734375e-05, + "learning_rate": 6.92e-06, + "loss": 0.0, + "step": 1077 + }, + { + "epoch": 4.308, + "eval_accuracy": 0.99, + "eval_loss": 0.0394357405602932, + "eval_runtime": 29.5047, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.135, + "step": 1077 + }, + { + "epoch": 4.312, + "grad_norm": 7.40625, + "learning_rate": 6.88e-06, + "loss": 0.0216, + "step": 1078 + }, + { + "epoch": 4.312, + "eval_accuracy": 0.99, + "eval_loss": 0.04010191559791565, + "eval_runtime": 29.5392, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.133, + "step": 1078 + }, + { + "epoch": 4.316, + "grad_norm": 0.1796875, + "learning_rate": 6.840000000000001e-06, + "loss": 0.0006, + "step": 1079 + }, + { + "epoch": 4.316, + "eval_accuracy": 0.99, + "eval_loss": 0.04057632386684418, + "eval_runtime": 29.5811, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1079 + }, + { + "epoch": 4.32, + "grad_norm": 0.00396728515625, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0, + "step": 1080 + }, + { + "epoch": 4.32, + "eval_accuracy": 0.99, + "eval_loss": 0.039622604846954346, + "eval_runtime": 29.7101, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.12, + "step": 1080 + }, + { + "epoch": 4.324, + "grad_norm": 0.16015625, + "learning_rate": 6.76e-06, + "loss": 0.0006, + "step": 1081 + }, + { + "epoch": 4.324, + "eval_accuracy": 0.99, + "eval_loss": 0.04032677412033081, + "eval_runtime": 29.6952, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.122, + "step": 1081 + }, + { + "epoch": 4.328, + "grad_norm": 0.0118408203125, + "learning_rate": 6.72e-06, + "loss": 0.0, + "step": 1082 + }, + { + "epoch": 4.328, + "eval_accuracy": 0.99, + "eval_loss": 0.040385764092206955, + "eval_runtime": 29.6724, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.123, + "step": 1082 + }, + { + "epoch": 4.332, + "grad_norm": 0.005767822265625, + "learning_rate": 6.68e-06, + "loss": 0.0, + "step": 1083 + }, + { + "epoch": 4.332, + "eval_accuracy": 0.99, + "eval_loss": 0.03904436528682709, + "eval_runtime": 29.6179, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 1083 + }, + { + "epoch": 4.336, + "grad_norm": 0.279296875, + "learning_rate": 6.640000000000001e-06, + "loss": 0.0011, + "step": 1084 + }, + { + "epoch": 4.336, + "eval_accuracy": 0.99, + "eval_loss": 0.0394558422267437, + "eval_runtime": 29.5872, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 1084 + }, + { + "epoch": 4.34, + "grad_norm": 0.0023193359375, + "learning_rate": 6.6e-06, + "loss": 0.0, + "step": 1085 + }, + { + "epoch": 4.34, + "eval_accuracy": 0.99, + "eval_loss": 0.039561282843351364, + "eval_runtime": 29.4624, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.138, + "step": 1085 + }, + { + "epoch": 4.344, + "grad_norm": 25.125, + "learning_rate": 6.560000000000001e-06, + "loss": 0.0742, + "step": 1086 + }, + { + "epoch": 4.344, + "eval_accuracy": 0.99, + "eval_loss": 0.03975018113851547, + "eval_runtime": 29.4634, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.138, + "step": 1086 + }, + { + "epoch": 4.348, + "grad_norm": 5.078315734863281e-05, + "learning_rate": 6.519999999999999e-06, + "loss": 0.0, + "step": 1087 + }, + { + "epoch": 4.348, + "eval_accuracy": 0.99, + "eval_loss": 0.039539169520139694, + "eval_runtime": 29.4234, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.141, + "step": 1087 + }, + { + "epoch": 4.352, + "grad_norm": 4.023313522338867e-07, + "learning_rate": 6.48e-06, + "loss": 0.0, + "step": 1088 + }, + { + "epoch": 4.352, + "eval_accuracy": 0.99, + "eval_loss": 0.04068602994084358, + "eval_runtime": 29.6169, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 1088 + }, + { + "epoch": 4.356, + "grad_norm": 8.381903171539307e-08, + "learning_rate": 6.44e-06, + "loss": 0.0, + "step": 1089 + }, + { + "epoch": 4.356, + "eval_accuracy": 0.99, + "eval_loss": 0.04027983918786049, + "eval_runtime": 29.6643, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.124, + "step": 1089 + }, + { + "epoch": 4.36, + "grad_norm": 0.00171661376953125, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0, + "step": 1090 + }, + { + "epoch": 4.36, + "eval_accuracy": 0.99, + "eval_loss": 0.041512519121170044, + "eval_runtime": 29.6769, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.123, + "step": 1090 + }, + { + "epoch": 4.364, + "grad_norm": 0.48046875, + "learning_rate": 6.360000000000001e-06, + "loss": 0.0012, + "step": 1091 + }, + { + "epoch": 4.364, + "eval_accuracy": 0.99, + "eval_loss": 0.04021277651190758, + "eval_runtime": 29.6241, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 1091 + }, + { + "epoch": 4.368, + "grad_norm": 101.0, + "learning_rate": 6.320000000000001e-06, + "loss": 1.0, + "step": 1092 + }, + { + "epoch": 4.368, + "eval_accuracy": 0.99, + "eval_loss": 0.03953231871128082, + "eval_runtime": 29.6091, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 1092 + }, + { + "epoch": 4.372, + "grad_norm": 0.0810546875, + "learning_rate": 6.28e-06, + "loss": 0.0003, + "step": 1093 + }, + { + "epoch": 4.372, + "eval_accuracy": 0.99, + "eval_loss": 0.03951101750135422, + "eval_runtime": 29.6098, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 1093 + }, + { + "epoch": 4.376, + "grad_norm": 0.00091552734375, + "learning_rate": 6.24e-06, + "loss": 0.0, + "step": 1094 + }, + { + "epoch": 4.376, + "eval_accuracy": 0.99, + "eval_loss": 0.040841083973646164, + "eval_runtime": 29.6618, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.124, + "step": 1094 + }, + { + "epoch": 4.38, + "grad_norm": 0.7890625, + "learning_rate": 6.2e-06, + "loss": 0.0006, + "step": 1095 + }, + { + "epoch": 4.38, + "eval_accuracy": 0.99, + "eval_loss": 0.040827225893735886, + "eval_runtime": 29.6071, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 1095 + }, + { + "epoch": 4.384, + "grad_norm": 0.00018310546875, + "learning_rate": 6.16e-06, + "loss": 0.0, + "step": 1096 + }, + { + "epoch": 4.384, + "eval_accuracy": 0.99, + "eval_loss": 0.03984154760837555, + "eval_runtime": 29.6018, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 1096 + }, + { + "epoch": 4.388, + "grad_norm": 0.162109375, + "learning_rate": 6.12e-06, + "loss": 0.0006, + "step": 1097 + }, + { + "epoch": 4.388, + "eval_accuracy": 0.99, + "eval_loss": 0.04108361527323723, + "eval_runtime": 29.6366, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.126, + "step": 1097 + }, + { + "epoch": 4.392, + "grad_norm": 4.675239324569702e-07, + "learning_rate": 6.08e-06, + "loss": 0.0, + "step": 1098 + }, + { + "epoch": 4.392, + "eval_accuracy": 0.99, + "eval_loss": 0.04145931452512741, + "eval_runtime": 29.5839, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 1098 + }, + { + "epoch": 4.396, + "grad_norm": 5.6743621826171875e-05, + "learning_rate": 6.040000000000001e-06, + "loss": 0.0, + "step": 1099 + }, + { + "epoch": 4.396, + "eval_accuracy": 0.99, + "eval_loss": 0.040889304131269455, + "eval_runtime": 29.5802, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1099 + }, + { + "epoch": 4.4, + "grad_norm": 1.0788440704345703e-05, + "learning_rate": 6e-06, + "loss": 0.0, + "step": 1100 + }, + { + "epoch": 4.4, + "eval_accuracy": 0.99, + "eval_loss": 0.040737319737672806, + "eval_runtime": 29.6191, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.127, + "step": 1100 + }, + { + "epoch": 4.404, + "grad_norm": 4.696846008300781e-05, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.0, + "step": 1101 + }, + { + "epoch": 4.404, + "eval_accuracy": 0.99, + "eval_loss": 0.04015502333641052, + "eval_runtime": 29.4114, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.142, + "step": 1101 + }, + { + "epoch": 4.408, + "grad_norm": 1.3485550880432129e-06, + "learning_rate": 5.920000000000001e-06, + "loss": 0.0, + "step": 1102 + }, + { + "epoch": 4.408, + "eval_accuracy": 0.99, + "eval_loss": 0.040009573101997375, + "eval_runtime": 29.3166, + "eval_samples_per_second": 17.055, + "eval_steps_per_second": 2.149, + "step": 1102 + }, + { + "epoch": 4.412, + "grad_norm": 1.1980533599853516e-05, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.0, + "step": 1103 + }, + { + "epoch": 4.412, + "eval_accuracy": 0.99, + "eval_loss": 0.039327431470155716, + "eval_runtime": 29.4636, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.138, + "step": 1103 + }, + { + "epoch": 4.416, + "grad_norm": 3.844499588012695e-06, + "learning_rate": 5.84e-06, + "loss": 0.0, + "step": 1104 + }, + { + "epoch": 4.416, + "eval_accuracy": 0.99, + "eval_loss": 0.04115131497383118, + "eval_runtime": 29.5228, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.134, + "step": 1104 + }, + { + "epoch": 4.42, + "grad_norm": 7.534027099609375e-05, + "learning_rate": 5.8e-06, + "loss": 0.0, + "step": 1105 + }, + { + "epoch": 4.42, + "eval_accuracy": 0.99, + "eval_loss": 0.0398724302649498, + "eval_runtime": 29.6058, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 1105 + }, + { + "epoch": 4.424, + "grad_norm": 4.470348358154297e-06, + "learning_rate": 5.76e-06, + "loss": 0.0, + "step": 1106 + }, + { + "epoch": 4.424, + "eval_accuracy": 0.99, + "eval_loss": 0.04051186516880989, + "eval_runtime": 29.5554, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 1106 + }, + { + "epoch": 4.428, + "grad_norm": 1.40625, + "learning_rate": 5.72e-06, + "loss": 0.0022, + "step": 1107 + }, + { + "epoch": 4.428, + "eval_accuracy": 0.99, + "eval_loss": 0.04041445255279541, + "eval_runtime": 29.6109, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 1107 + }, + { + "epoch": 4.432, + "grad_norm": 0.921875, + "learning_rate": 5.680000000000001e-06, + "loss": 0.0025, + "step": 1108 + }, + { + "epoch": 4.432, + "eval_accuracy": 0.99, + "eval_loss": 0.040283020585775375, + "eval_runtime": 29.5603, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 1108 + }, + { + "epoch": 4.436, + "grad_norm": 0.009765625, + "learning_rate": 5.64e-06, + "loss": 0.0, + "step": 1109 + }, + { + "epoch": 4.436, + "eval_accuracy": 0.99, + "eval_loss": 0.04118441790342331, + "eval_runtime": 29.5559, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 1109 + }, + { + "epoch": 4.44, + "grad_norm": 4.857778549194336e-06, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0, + "step": 1110 + }, + { + "epoch": 4.44, + "eval_accuracy": 0.99, + "eval_loss": 0.04124853014945984, + "eval_runtime": 29.609, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 1110 + }, + { + "epoch": 4.444, + "grad_norm": 0.033203125, + "learning_rate": 5.56e-06, + "loss": 0.0, + "step": 1111 + }, + { + "epoch": 4.444, + "eval_accuracy": 0.99, + "eval_loss": 0.04115122929215431, + "eval_runtime": 29.5607, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 1111 + }, + { + "epoch": 4.448, + "grad_norm": 0.55859375, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.0013, + "step": 1112 + }, + { + "epoch": 4.448, + "eval_accuracy": 0.99, + "eval_loss": 0.04051363840699196, + "eval_runtime": 29.5567, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.131, + "step": 1112 + }, + { + "epoch": 4.452, + "grad_norm": 0.000606536865234375, + "learning_rate": 5.48e-06, + "loss": 0.0, + "step": 1113 + }, + { + "epoch": 4.452, + "eval_accuracy": 0.99, + "eval_loss": 0.03960827365517616, + "eval_runtime": 29.6095, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 1113 + }, + { + "epoch": 4.456, + "grad_norm": 7.152557373046875e-07, + "learning_rate": 5.44e-06, + "loss": 0.0, + "step": 1114 + }, + { + "epoch": 4.456, + "eval_accuracy": 0.99, + "eval_loss": 0.0406189039349556, + "eval_runtime": 29.6064, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 1114 + }, + { + "epoch": 4.46, + "grad_norm": 0.049560546875, + "learning_rate": 5.4e-06, + "loss": 0.0002, + "step": 1115 + }, + { + "epoch": 4.46, + "eval_accuracy": 0.99, + "eval_loss": 0.040841229259967804, + "eval_runtime": 29.5563, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 1115 + }, + { + "epoch": 4.464, + "grad_norm": 1.484375, + "learning_rate": 5.36e-06, + "loss": 0.004, + "step": 1116 + }, + { + "epoch": 4.464, + "eval_accuracy": 0.99, + "eval_loss": 0.04231882095336914, + "eval_runtime": 29.5997, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 1116 + }, + { + "epoch": 4.468, + "grad_norm": 6.29425048828125e-05, + "learning_rate": 5.32e-06, + "loss": 0.0, + "step": 1117 + }, + { + "epoch": 4.468, + "eval_accuracy": 0.99, + "eval_loss": 0.041371993720531464, + "eval_runtime": 29.6018, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 1117 + }, + { + "epoch": 4.4719999999999995, + "grad_norm": 0.000705718994140625, + "learning_rate": 5.28e-06, + "loss": 0.0, + "step": 1118 + }, + { + "epoch": 4.4719999999999995, + "eval_accuracy": 0.99, + "eval_loss": 0.04194151982665062, + "eval_runtime": 29.6019, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 1118 + }, + { + "epoch": 4.476, + "grad_norm": 1.51805579662323e-07, + "learning_rate": 5.240000000000001e-06, + "loss": 0.0, + "step": 1119 + }, + { + "epoch": 4.476, + "eval_accuracy": 0.99, + "eval_loss": 0.04085003212094307, + "eval_runtime": 29.5989, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.128, + "step": 1119 + }, + { + "epoch": 4.48, + "grad_norm": 0.27734375, + "learning_rate": 5.2e-06, + "loss": 0.0002, + "step": 1120 + }, + { + "epoch": 4.48, + "eval_accuracy": 0.99, + "eval_loss": 0.040493208914995193, + "eval_runtime": 29.5603, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 1120 + }, + { + "epoch": 4.484, + "grad_norm": 0.000514984130859375, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.0, + "step": 1121 + }, + { + "epoch": 4.484, + "eval_accuracy": 0.99, + "eval_loss": 0.0410919226706028, + "eval_runtime": 29.5731, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 1121 + }, + { + "epoch": 4.4879999999999995, + "grad_norm": 1.5079975128173828e-05, + "learning_rate": 5.12e-06, + "loss": 0.0, + "step": 1122 + }, + { + "epoch": 4.4879999999999995, + "eval_accuracy": 0.99, + "eval_loss": 0.040586281567811966, + "eval_runtime": 29.5987, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.128, + "step": 1122 + }, + { + "epoch": 4.492, + "grad_norm": 0.54296875, + "learning_rate": 5.08e-06, + "loss": 0.0017, + "step": 1123 + }, + { + "epoch": 4.492, + "eval_accuracy": 0.99, + "eval_loss": 0.040461767464876175, + "eval_runtime": 29.6165, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 1123 + }, + { + "epoch": 4.496, + "grad_norm": 0.0004482269287109375, + "learning_rate": 5.04e-06, + "loss": 0.0, + "step": 1124 + }, + { + "epoch": 4.496, + "eval_accuracy": 0.99, + "eval_loss": 0.04066528379917145, + "eval_runtime": 29.5466, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.132, + "step": 1124 + }, + { + "epoch": 4.5, + "grad_norm": 0.0927734375, + "learning_rate": 5e-06, + "loss": 0.0004, + "step": 1125 + }, + { + "epoch": 4.5, + "eval_accuracy": 0.99, + "eval_loss": 0.04074591398239136, + "eval_runtime": 29.5529, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 1125 + }, + { + "epoch": 4.504, + "grad_norm": 1.640625, + "learning_rate": 4.96e-06, + "loss": 0.0045, + "step": 1126 + }, + { + "epoch": 4.504, + "eval_accuracy": 0.99, + "eval_loss": 0.04060285910964012, + "eval_runtime": 29.5493, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 1126 + }, + { + "epoch": 4.508, + "grad_norm": 4.100799560546875e-05, + "learning_rate": 4.92e-06, + "loss": 0.0, + "step": 1127 + }, + { + "epoch": 4.508, + "eval_accuracy": 0.99, + "eval_loss": 0.04019388556480408, + "eval_runtime": 29.5487, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 1127 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.00023174285888671875, + "learning_rate": 4.880000000000001e-06, + "loss": 0.0, + "step": 1128 + }, + { + "epoch": 4.5120000000000005, + "eval_accuracy": 0.99, + "eval_loss": 0.040705468505620956, + "eval_runtime": 29.5492, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 1128 + }, + { + "epoch": 4.516, + "grad_norm": 0.1337890625, + "learning_rate": 4.84e-06, + "loss": 0.0004, + "step": 1129 + }, + { + "epoch": 4.516, + "eval_accuracy": 0.99, + "eval_loss": 0.04058826342225075, + "eval_runtime": 29.5486, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.132, + "step": 1129 + }, + { + "epoch": 4.52, + "grad_norm": 0.01251220703125, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0, + "step": 1130 + }, + { + "epoch": 4.52, + "eval_accuracy": 0.99, + "eval_loss": 0.039115119725465775, + "eval_runtime": 29.5973, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.129, + "step": 1130 + }, + { + "epoch": 4.524, + "grad_norm": 28.5, + "learning_rate": 4.76e-06, + "loss": 0.1187, + "step": 1131 + }, + { + "epoch": 4.524, + "eval_accuracy": 0.99, + "eval_loss": 0.040023814886808395, + "eval_runtime": 29.5542, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1131 + }, + { + "epoch": 4.5280000000000005, + "grad_norm": 7.534027099609375e-05, + "learning_rate": 4.72e-06, + "loss": 0.0, + "step": 1132 + }, + { + "epoch": 4.5280000000000005, + "eval_accuracy": 0.99, + "eval_loss": 0.04031150043010712, + "eval_runtime": 29.603, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.128, + "step": 1132 + }, + { + "epoch": 4.532, + "grad_norm": 0.000324249267578125, + "learning_rate": 4.68e-06, + "loss": 0.0, + "step": 1133 + }, + { + "epoch": 4.532, + "eval_accuracy": 0.99, + "eval_loss": 0.03942564129829407, + "eval_runtime": 29.5986, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.128, + "step": 1133 + }, + { + "epoch": 4.536, + "grad_norm": 0.2158203125, + "learning_rate": 4.64e-06, + "loss": 0.0006, + "step": 1134 + }, + { + "epoch": 4.536, + "eval_accuracy": 0.99, + "eval_loss": 0.04046883061528206, + "eval_runtime": 29.5548, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1134 + }, + { + "epoch": 4.54, + "grad_norm": 0.000537872314453125, + "learning_rate": 4.6e-06, + "loss": 0.0, + "step": 1135 + }, + { + "epoch": 4.54, + "eval_accuracy": 0.99, + "eval_loss": 0.040097832679748535, + "eval_runtime": 29.5544, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1135 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 2.4586915969848633e-07, + "learning_rate": 4.56e-06, + "loss": 0.0, + "step": 1136 + }, + { + "epoch": 4.5440000000000005, + "eval_accuracy": 0.99, + "eval_loss": 0.0395977720618248, + "eval_runtime": 29.5551, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1136 + }, + { + "epoch": 4.548, + "grad_norm": 0.010986328125, + "learning_rate": 4.52e-06, + "loss": 0.0, + "step": 1137 + }, + { + "epoch": 4.548, + "eval_accuracy": 0.99, + "eval_loss": 0.039543841034173965, + "eval_runtime": 29.5906, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.129, + "step": 1137 + }, + { + "epoch": 4.552, + "grad_norm": 0.00010633468627929688, + "learning_rate": 4.48e-06, + "loss": 0.0, + "step": 1138 + }, + { + "epoch": 4.552, + "eval_accuracy": 0.99, + "eval_loss": 0.039590492844581604, + "eval_runtime": 29.5562, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 1138 + }, + { + "epoch": 4.556, + "grad_norm": 2.7567148208618164e-07, + "learning_rate": 4.440000000000001e-06, + "loss": 0.0, + "step": 1139 + }, + { + "epoch": 4.556, + "eval_accuracy": 0.99, + "eval_loss": 0.04002032428979874, + "eval_runtime": 29.6057, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 1139 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 7.12275505065918e-06, + "learning_rate": 4.4e-06, + "loss": 0.0, + "step": 1140 + }, + { + "epoch": 4.5600000000000005, + "eval_accuracy": 0.99, + "eval_loss": 0.04044245183467865, + "eval_runtime": 29.6013, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 1140 + }, + { + "epoch": 4.564, + "grad_norm": 5.5, + "learning_rate": 4.360000000000001e-06, + "loss": 0.0137, + "step": 1141 + }, + { + "epoch": 4.564, + "eval_accuracy": 0.99, + "eval_loss": 0.03964082896709442, + "eval_runtime": 29.6009, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 1141 + }, + { + "epoch": 4.568, + "grad_norm": 1.564621925354004e-06, + "learning_rate": 4.32e-06, + "loss": 0.0, + "step": 1142 + }, + { + "epoch": 4.568, + "eval_accuracy": 0.99, + "eval_loss": 0.03999270871281624, + "eval_runtime": 29.5587, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.131, + "step": 1142 + }, + { + "epoch": 4.572, + "grad_norm": 0.032470703125, + "learning_rate": 4.28e-06, + "loss": 0.0001, + "step": 1143 + }, + { + "epoch": 4.572, + "eval_accuracy": 0.99, + "eval_loss": 0.041225966066122055, + "eval_runtime": 29.5531, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 1143 + }, + { + "epoch": 4.576, + "grad_norm": 4.649162292480469e-05, + "learning_rate": 4.24e-06, + "loss": 0.0, + "step": 1144 + }, + { + "epoch": 4.576, + "eval_accuracy": 0.99, + "eval_loss": 0.04027758538722992, + "eval_runtime": 29.5548, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1144 + }, + { + "epoch": 4.58, + "grad_norm": 2.71875, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0123, + "step": 1145 + }, + { + "epoch": 4.58, + "eval_accuracy": 0.99, + "eval_loss": 0.039853960275650024, + "eval_runtime": 29.5599, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 1145 + }, + { + "epoch": 4.584, + "grad_norm": 73.5, + "learning_rate": 4.16e-06, + "loss": 0.7266, + "step": 1146 + }, + { + "epoch": 4.584, + "eval_accuracy": 0.99, + "eval_loss": 0.03949594125151634, + "eval_runtime": 29.5923, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.129, + "step": 1146 + }, + { + "epoch": 4.588, + "grad_norm": 0.000690460205078125, + "learning_rate": 4.12e-06, + "loss": 0.0, + "step": 1147 + }, + { + "epoch": 4.588, + "eval_accuracy": 0.99, + "eval_loss": 0.038385454565286636, + "eval_runtime": 29.5517, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 1147 + }, + { + "epoch": 4.592, + "grad_norm": 0.0006866455078125, + "learning_rate": 4.080000000000001e-06, + "loss": 0.0, + "step": 1148 + }, + { + "epoch": 4.592, + "eval_accuracy": 0.99, + "eval_loss": 0.03963778167963028, + "eval_runtime": 29.5647, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 1148 + }, + { + "epoch": 4.596, + "grad_norm": 0.00457763671875, + "learning_rate": 4.04e-06, + "loss": 0.0, + "step": 1149 + }, + { + "epoch": 4.596, + "eval_accuracy": 0.99, + "eval_loss": 0.03961542248725891, + "eval_runtime": 29.5708, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.13, + "step": 1149 + }, + { + "epoch": 4.6, + "grad_norm": 0.08740234375, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0003, + "step": 1150 + }, + { + "epoch": 4.6, + "eval_accuracy": 0.99, + "eval_loss": 0.040243443101644516, + "eval_runtime": 29.5719, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 1150 + }, + { + "epoch": 4.604, + "grad_norm": 0.000957489013671875, + "learning_rate": 3.96e-06, + "loss": 0.0, + "step": 1151 + }, + { + "epoch": 4.604, + "eval_accuracy": 0.99, + "eval_loss": 0.03905465826392174, + "eval_runtime": 29.4665, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.138, + "step": 1151 + }, + { + "epoch": 4.608, + "grad_norm": 1.0609626770019531e-05, + "learning_rate": 3.92e-06, + "loss": 0.0, + "step": 1152 + }, + { + "epoch": 4.608, + "eval_accuracy": 0.99, + "eval_loss": 0.04010017216205597, + "eval_runtime": 29.3166, + "eval_samples_per_second": 17.055, + "eval_steps_per_second": 2.149, + "step": 1152 + }, + { + "epoch": 4.612, + "grad_norm": 0.056396484375, + "learning_rate": 3.88e-06, + "loss": 0.0002, + "step": 1153 + }, + { + "epoch": 4.612, + "eval_accuracy": 0.99, + "eval_loss": 0.03944189473986626, + "eval_runtime": 29.4615, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.138, + "step": 1153 + }, + { + "epoch": 4.616, + "grad_norm": 2.109375, + "learning_rate": 3.84e-06, + "loss": 0.0068, + "step": 1154 + }, + { + "epoch": 4.616, + "eval_accuracy": 0.99, + "eval_loss": 0.04013573005795479, + "eval_runtime": 29.5723, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 1154 + }, + { + "epoch": 4.62, + "grad_norm": 0.00115203857421875, + "learning_rate": 3.8e-06, + "loss": 0.0, + "step": 1155 + }, + { + "epoch": 4.62, + "eval_accuracy": 0.99, + "eval_loss": 0.038464102894067764, + "eval_runtime": 29.5522, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.132, + "step": 1155 + }, + { + "epoch": 4.624, + "grad_norm": 6.884336471557617e-06, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.0, + "step": 1156 + }, + { + "epoch": 4.624, + "eval_accuracy": 0.99, + "eval_loss": 0.0391128845512867, + "eval_runtime": 29.554, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1156 + }, + { + "epoch": 4.628, + "grad_norm": 1.4975666999816895e-06, + "learning_rate": 3.72e-06, + "loss": 0.0, + "step": 1157 + }, + { + "epoch": 4.628, + "eval_accuracy": 0.99, + "eval_loss": 0.03988758102059364, + "eval_runtime": 29.5541, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1157 + }, + { + "epoch": 4.632, + "grad_norm": 0.00017070770263671875, + "learning_rate": 3.68e-06, + "loss": 0.0, + "step": 1158 + }, + { + "epoch": 4.632, + "eval_accuracy": 0.99, + "eval_loss": 0.04001377522945404, + "eval_runtime": 29.6175, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 1158 + }, + { + "epoch": 4.636, + "grad_norm": 0.03759765625, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.0001, + "step": 1159 + }, + { + "epoch": 4.636, + "eval_accuracy": 0.99, + "eval_loss": 0.04001560062170029, + "eval_runtime": 29.59, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 1159 + }, + { + "epoch": 4.64, + "grad_norm": 0.0036773681640625, + "learning_rate": 3.6e-06, + "loss": 0.0, + "step": 1160 + }, + { + "epoch": 4.64, + "eval_accuracy": 0.99, + "eval_loss": 0.03931561857461929, + "eval_runtime": 29.5831, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 1160 + }, + { + "epoch": 4.644, + "grad_norm": 0.150390625, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.0004, + "step": 1161 + }, + { + "epoch": 4.644, + "eval_accuracy": 0.99, + "eval_loss": 0.03851315751671791, + "eval_runtime": 29.6438, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.125, + "step": 1161 + }, + { + "epoch": 4.648, + "grad_norm": 26.5, + "learning_rate": 3.52e-06, + "loss": 0.0967, + "step": 1162 + }, + { + "epoch": 4.648, + "eval_accuracy": 0.99, + "eval_loss": 0.039514634758234024, + "eval_runtime": 29.6204, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 1162 + }, + { + "epoch": 4.652, + "grad_norm": 2.3543834686279297e-06, + "learning_rate": 3.4799999999999997e-06, + "loss": 0.0, + "step": 1163 + }, + { + "epoch": 4.652, + "eval_accuracy": 0.99, + "eval_loss": 0.04001433774828911, + "eval_runtime": 29.5692, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 1163 + }, + { + "epoch": 4.656, + "grad_norm": 8.106231689453125e-06, + "learning_rate": 3.44e-06, + "loss": 0.0, + "step": 1164 + }, + { + "epoch": 4.656, + "eval_accuracy": 0.99, + "eval_loss": 0.039934102445840836, + "eval_runtime": 29.5545, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1164 + }, + { + "epoch": 4.66, + "grad_norm": 5.438923835754395e-07, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0, + "step": 1165 + }, + { + "epoch": 4.66, + "eval_accuracy": 0.99, + "eval_loss": 0.03987100347876549, + "eval_runtime": 29.5678, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 1165 + }, + { + "epoch": 4.664, + "grad_norm": 6.794929504394531e-06, + "learning_rate": 3.36e-06, + "loss": 0.0, + "step": 1166 + }, + { + "epoch": 4.664, + "eval_accuracy": 0.99, + "eval_loss": 0.039547957479953766, + "eval_runtime": 29.5539, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.132, + "step": 1166 + }, + { + "epoch": 4.668, + "grad_norm": 0.0029754638671875, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.0, + "step": 1167 + }, + { + "epoch": 4.668, + "eval_accuracy": 0.99, + "eval_loss": 0.03945533186197281, + "eval_runtime": 29.5552, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 1167 + }, + { + "epoch": 4.672, + "grad_norm": 7.82012939453125e-05, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.0, + "step": 1168 + }, + { + "epoch": 4.672, + "eval_accuracy": 0.99, + "eval_loss": 0.03809399902820587, + "eval_runtime": 29.5607, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 1168 + }, + { + "epoch": 4.676, + "grad_norm": 0.0010986328125, + "learning_rate": 3.24e-06, + "loss": 0.0, + "step": 1169 + }, + { + "epoch": 4.676, + "eval_accuracy": 0.99, + "eval_loss": 0.03964677453041077, + "eval_runtime": 29.6086, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 1169 + }, + { + "epoch": 4.68, + "grad_norm": 0.0003032684326171875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0, + "step": 1170 + }, + { + "epoch": 4.68, + "eval_accuracy": 0.99, + "eval_loss": 0.039588186889886856, + "eval_runtime": 29.5701, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 1170 + }, + { + "epoch": 4.684, + "grad_norm": 0.001617431640625, + "learning_rate": 3.1600000000000007e-06, + "loss": 0.0, + "step": 1171 + }, + { + "epoch": 4.684, + "eval_accuracy": 0.99, + "eval_loss": 0.0396348237991333, + "eval_runtime": 29.5688, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 1171 + }, + { + "epoch": 4.688, + "grad_norm": 8.149072527885437e-08, + "learning_rate": 3.12e-06, + "loss": 0.0, + "step": 1172 + }, + { + "epoch": 4.688, + "eval_accuracy": 0.99, + "eval_loss": 0.03861615061759949, + "eval_runtime": 29.5702, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 1172 + }, + { + "epoch": 4.692, + "grad_norm": 3.039836883544922e-06, + "learning_rate": 3.08e-06, + "loss": 0.0, + "step": 1173 + }, + { + "epoch": 4.692, + "eval_accuracy": 0.99, + "eval_loss": 0.03992399573326111, + "eval_runtime": 29.6208, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 1173 + }, + { + "epoch": 4.696, + "grad_norm": 1.6242265701293945e-06, + "learning_rate": 3.04e-06, + "loss": 0.0, + "step": 1174 + }, + { + "epoch": 4.696, + "eval_accuracy": 0.99, + "eval_loss": 0.03942912071943283, + "eval_runtime": 29.6121, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.128, + "step": 1174 + }, + { + "epoch": 4.7, + "grad_norm": 41.75, + "learning_rate": 3e-06, + "loss": 0.3066, + "step": 1175 + }, + { + "epoch": 4.7, + "eval_accuracy": 0.99, + "eval_loss": 0.03989109769463539, + "eval_runtime": 29.576, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 1175 + }, + { + "epoch": 4.704, + "grad_norm": 40.25, + "learning_rate": 2.9600000000000005e-06, + "loss": 0.2061, + "step": 1176 + }, + { + "epoch": 4.704, + "eval_accuracy": 0.99, + "eval_loss": 0.039516299962997437, + "eval_runtime": 29.6338, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 1176 + }, + { + "epoch": 4.708, + "grad_norm": 9.5367431640625e-05, + "learning_rate": 2.92e-06, + "loss": 0.0, + "step": 1177 + }, + { + "epoch": 4.708, + "eval_accuracy": 0.99, + "eval_loss": 0.04029437527060509, + "eval_runtime": 29.6257, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.127, + "step": 1177 + }, + { + "epoch": 4.712, + "grad_norm": 13.8125, + "learning_rate": 2.88e-06, + "loss": 0.0786, + "step": 1178 + }, + { + "epoch": 4.712, + "eval_accuracy": 0.99, + "eval_loss": 0.039798539131879807, + "eval_runtime": 29.5731, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 1178 + }, + { + "epoch": 4.716, + "grad_norm": 0.06298828125, + "learning_rate": 2.8400000000000003e-06, + "loss": 0.0001, + "step": 1179 + }, + { + "epoch": 4.716, + "eval_accuracy": 0.99, + "eval_loss": 0.03999168798327446, + "eval_runtime": 29.6011, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.128, + "step": 1179 + }, + { + "epoch": 4.72, + "grad_norm": 8.58306884765625e-05, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0, + "step": 1180 + }, + { + "epoch": 4.72, + "eval_accuracy": 0.99, + "eval_loss": 0.03966984152793884, + "eval_runtime": 29.6098, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.128, + "step": 1180 + }, + { + "epoch": 4.724, + "grad_norm": 3.769993782043457e-06, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.0, + "step": 1181 + }, + { + "epoch": 4.724, + "eval_accuracy": 0.99, + "eval_loss": 0.03956753388047218, + "eval_runtime": 29.5651, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 1181 + }, + { + "epoch": 4.728, + "grad_norm": 32.5, + "learning_rate": 2.72e-06, + "loss": 0.1943, + "step": 1182 + }, + { + "epoch": 4.728, + "eval_accuracy": 0.99, + "eval_loss": 0.03934183344244957, + "eval_runtime": 29.5623, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 1182 + }, + { + "epoch": 4.732, + "grad_norm": 7.748603820800781e-06, + "learning_rate": 2.68e-06, + "loss": 0.0, + "step": 1183 + }, + { + "epoch": 4.732, + "eval_accuracy": 0.99, + "eval_loss": 0.039091743528842926, + "eval_runtime": 29.5658, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 1183 + }, + { + "epoch": 4.736, + "grad_norm": 0.060302734375, + "learning_rate": 2.64e-06, + "loss": 0.0002, + "step": 1184 + }, + { + "epoch": 4.736, + "eval_accuracy": 0.99, + "eval_loss": 0.03988803178071976, + "eval_runtime": 29.5719, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 1184 + }, + { + "epoch": 4.74, + "grad_norm": 0.007720947265625, + "learning_rate": 2.6e-06, + "loss": 0.0, + "step": 1185 + }, + { + "epoch": 4.74, + "eval_accuracy": 0.99, + "eval_loss": 0.03944027051329613, + "eval_runtime": 29.5641, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 1185 + }, + { + "epoch": 4.744, + "grad_norm": 1.0331859812140465e-09, + "learning_rate": 2.56e-06, + "loss": 0.0, + "step": 1186 + }, + { + "epoch": 4.744, + "eval_accuracy": 0.99, + "eval_loss": 0.039343636482954025, + "eval_runtime": 29.6119, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.128, + "step": 1186 + }, + { + "epoch": 4.748, + "grad_norm": 0.69140625, + "learning_rate": 2.52e-06, + "loss": 0.0028, + "step": 1187 + }, + { + "epoch": 4.748, + "eval_accuracy": 0.99, + "eval_loss": 0.039592448621988297, + "eval_runtime": 29.5602, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.131, + "step": 1187 + }, + { + "epoch": 4.752, + "grad_norm": 0.00011110305786132812, + "learning_rate": 2.48e-06, + "loss": 0.0, + "step": 1188 + }, + { + "epoch": 4.752, + "eval_accuracy": 0.99, + "eval_loss": 0.03827960044145584, + "eval_runtime": 29.5607, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.131, + "step": 1188 + }, + { + "epoch": 4.756, + "grad_norm": 0.058349609375, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.0002, + "step": 1189 + }, + { + "epoch": 4.756, + "eval_accuracy": 0.99, + "eval_loss": 0.03910020366311073, + "eval_runtime": 29.612, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.128, + "step": 1189 + }, + { + "epoch": 4.76, + "grad_norm": 0.0002460479736328125, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0, + "step": 1190 + }, + { + "epoch": 4.76, + "eval_accuracy": 0.99, + "eval_loss": 0.038503438234329224, + "eval_runtime": 29.5765, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 1190 + }, + { + "epoch": 4.764, + "grad_norm": 0.40234375, + "learning_rate": 2.36e-06, + "loss": 0.0011, + "step": 1191 + }, + { + "epoch": 4.764, + "eval_accuracy": 0.99, + "eval_loss": 0.039353713393211365, + "eval_runtime": 29.5802, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1191 + }, + { + "epoch": 4.768, + "grad_norm": 0.000431060791015625, + "learning_rate": 2.32e-06, + "loss": 0.0, + "step": 1192 + }, + { + "epoch": 4.768, + "eval_accuracy": 0.99, + "eval_loss": 0.040622152388095856, + "eval_runtime": 29.6169, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 1192 + }, + { + "epoch": 4.772, + "grad_norm": 6.593763828277588e-07, + "learning_rate": 2.28e-06, + "loss": 0.0, + "step": 1193 + }, + { + "epoch": 4.772, + "eval_accuracy": 0.99, + "eval_loss": 0.039329156279563904, + "eval_runtime": 29.5635, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 1193 + }, + { + "epoch": 4.776, + "grad_norm": 0.000110626220703125, + "learning_rate": 2.24e-06, + "loss": 0.0, + "step": 1194 + }, + { + "epoch": 4.776, + "eval_accuracy": 0.99, + "eval_loss": 0.04027732461690903, + "eval_runtime": 29.565, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.131, + "step": 1194 + }, + { + "epoch": 4.78, + "grad_norm": 8.869171142578125e-05, + "learning_rate": 2.2e-06, + "loss": 0.0, + "step": 1195 + }, + { + "epoch": 4.78, + "eval_accuracy": 0.99, + "eval_loss": 0.038827378302812576, + "eval_runtime": 29.577, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 1195 + }, + { + "epoch": 4.784, + "grad_norm": 0.0081787109375, + "learning_rate": 2.16e-06, + "loss": 0.0, + "step": 1196 + }, + { + "epoch": 4.784, + "eval_accuracy": 0.99, + "eval_loss": 0.04018732160329819, + "eval_runtime": 29.5799, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1196 + }, + { + "epoch": 4.788, + "grad_norm": 0.1357421875, + "learning_rate": 2.12e-06, + "loss": 0.0004, + "step": 1197 + }, + { + "epoch": 4.788, + "eval_accuracy": 0.99, + "eval_loss": 0.03886519372463226, + "eval_runtime": 29.5683, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 1197 + }, + { + "epoch": 4.792, + "grad_norm": 5.030632019042969e-05, + "learning_rate": 2.08e-06, + "loss": 0.0, + "step": 1198 + }, + { + "epoch": 4.792, + "eval_accuracy": 0.99, + "eval_loss": 0.03988344594836235, + "eval_runtime": 29.6178, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.127, + "step": 1198 + }, + { + "epoch": 4.796, + "grad_norm": 6.723403930664062e-05, + "learning_rate": 2.0400000000000004e-06, + "loss": 0.0, + "step": 1199 + }, + { + "epoch": 4.796, + "eval_accuracy": 0.99, + "eval_loss": 0.04019662365317345, + "eval_runtime": 29.638, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 1199 + }, + { + "epoch": 4.8, + "grad_norm": 2.7060508728027344e-05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0, + "step": 1200 + }, + { + "epoch": 4.8, + "eval_accuracy": 0.99, + "eval_loss": 0.039890188723802567, + "eval_runtime": 29.6043, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.128, + "step": 1200 + }, + { + "epoch": 4.804, + "grad_norm": 0.00012302398681640625, + "learning_rate": 1.96e-06, + "loss": 0.0, + "step": 1201 + }, + { + "epoch": 4.804, + "eval_accuracy": 0.99, + "eval_loss": 0.03935541585087776, + "eval_runtime": 29.5561, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.132, + "step": 1201 + }, + { + "epoch": 4.808, + "grad_norm": 1.796875, + "learning_rate": 1.92e-06, + "loss": 0.0056, + "step": 1202 + }, + { + "epoch": 4.808, + "eval_accuracy": 0.99, + "eval_loss": 0.039739277213811874, + "eval_runtime": 29.3525, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 2.146, + "step": 1202 + }, + { + "epoch": 4.812, + "grad_norm": 10.8125, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.0825, + "step": 1203 + }, + { + "epoch": 4.812, + "eval_accuracy": 0.99, + "eval_loss": 0.040054816752672195, + "eval_runtime": 29.491, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.136, + "step": 1203 + }, + { + "epoch": 4.816, + "grad_norm": 6.67572021484375e-06, + "learning_rate": 1.84e-06, + "loss": 0.0, + "step": 1204 + }, + { + "epoch": 4.816, + "eval_accuracy": 0.99, + "eval_loss": 0.03980724513530731, + "eval_runtime": 29.5991, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 1204 + }, + { + "epoch": 4.82, + "grad_norm": 1.9354047253727913e-09, + "learning_rate": 1.8e-06, + "loss": 0.0, + "step": 1205 + }, + { + "epoch": 4.82, + "eval_accuracy": 0.99, + "eval_loss": 0.03939307481050491, + "eval_runtime": 29.5723, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 1205 + }, + { + "epoch": 4.824, + "grad_norm": 0.005035400390625, + "learning_rate": 1.76e-06, + "loss": 0.0, + "step": 1206 + }, + { + "epoch": 4.824, + "eval_accuracy": 0.99, + "eval_loss": 0.03938554599881172, + "eval_runtime": 29.5835, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.13, + "step": 1206 + }, + { + "epoch": 4.828, + "grad_norm": 0.0012664794921875, + "learning_rate": 1.72e-06, + "loss": 0.0, + "step": 1207 + }, + { + "epoch": 4.828, + "eval_accuracy": 0.99, + "eval_loss": 0.03997182473540306, + "eval_runtime": 29.5895, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.129, + "step": 1207 + }, + { + "epoch": 4.832, + "grad_norm": 0.0001697540283203125, + "learning_rate": 1.68e-06, + "loss": 0.0, + "step": 1208 + }, + { + "epoch": 4.832, + "eval_accuracy": 0.99, + "eval_loss": 0.04033800587058067, + "eval_runtime": 29.5703, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.131, + "step": 1208 + }, + { + "epoch": 4.836, + "grad_norm": 4.112720489501953e-06, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.0, + "step": 1209 + }, + { + "epoch": 4.836, + "eval_accuracy": 0.99, + "eval_loss": 0.03902960941195488, + "eval_runtime": 29.5724, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 1209 + }, + { + "epoch": 4.84, + "grad_norm": 1.3690441846847534e-07, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0, + "step": 1210 + }, + { + "epoch": 4.84, + "eval_accuracy": 0.99, + "eval_loss": 0.04041973873972893, + "eval_runtime": 29.6243, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.127, + "step": 1210 + }, + { + "epoch": 4.844, + "grad_norm": 0.00323486328125, + "learning_rate": 1.56e-06, + "loss": 0.0, + "step": 1211 + }, + { + "epoch": 4.844, + "eval_accuracy": 0.99, + "eval_loss": 0.04026920720934868, + "eval_runtime": 29.5967, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.129, + "step": 1211 + }, + { + "epoch": 4.848, + "grad_norm": 26.375, + "learning_rate": 1.52e-06, + "loss": 0.1143, + "step": 1212 + }, + { + "epoch": 4.848, + "eval_accuracy": 0.99, + "eval_loss": 0.03972255811095238, + "eval_runtime": 29.5812, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1212 + }, + { + "epoch": 4.852, + "grad_norm": 0.003204345703125, + "learning_rate": 1.4800000000000002e-06, + "loss": 0.0, + "step": 1213 + }, + { + "epoch": 4.852, + "eval_accuracy": 0.99, + "eval_loss": 0.03963300213217735, + "eval_runtime": 29.5851, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 1213 + }, + { + "epoch": 4.856, + "grad_norm": 0.00031280517578125, + "learning_rate": 1.44e-06, + "loss": 0.0, + "step": 1214 + }, + { + "epoch": 4.856, + "eval_accuracy": 0.99, + "eval_loss": 0.039066459983587265, + "eval_runtime": 29.5855, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.129, + "step": 1214 + }, + { + "epoch": 4.86, + "grad_norm": 6.794929504394531e-06, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0, + "step": 1215 + }, + { + "epoch": 4.86, + "eval_accuracy": 0.99, + "eval_loss": 0.03944956511259079, + "eval_runtime": 29.5755, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 1215 + }, + { + "epoch": 4.864, + "grad_norm": 0.00013637542724609375, + "learning_rate": 1.36e-06, + "loss": 0.0, + "step": 1216 + }, + { + "epoch": 4.864, + "eval_accuracy": 0.99, + "eval_loss": 0.0399693064391613, + "eval_runtime": 29.6206, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.127, + "step": 1216 + }, + { + "epoch": 4.868, + "grad_norm": 1.3096723705530167e-08, + "learning_rate": 1.32e-06, + "loss": 0.0, + "step": 1217 + }, + { + "epoch": 4.868, + "eval_accuracy": 0.99, + "eval_loss": 0.03994003310799599, + "eval_runtime": 29.5682, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 1217 + }, + { + "epoch": 4.872, + "grad_norm": 1.214444637298584e-06, + "learning_rate": 1.28e-06, + "loss": 0.0, + "step": 1218 + }, + { + "epoch": 4.872, + "eval_accuracy": 0.99, + "eval_loss": 0.03892018273472786, + "eval_runtime": 29.6321, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.126, + "step": 1218 + }, + { + "epoch": 4.876, + "grad_norm": 1.5735626220703125e-05, + "learning_rate": 1.24e-06, + "loss": 0.0, + "step": 1219 + }, + { + "epoch": 4.876, + "eval_accuracy": 0.99, + "eval_loss": 0.03942938521504402, + "eval_runtime": 29.6481, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.125, + "step": 1219 + }, + { + "epoch": 4.88, + "grad_norm": 0.0849609375, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0003, + "step": 1220 + }, + { + "epoch": 4.88, + "eval_accuracy": 0.99, + "eval_loss": 0.039689820259809494, + "eval_runtime": 29.5996, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.128, + "step": 1220 + }, + { + "epoch": 4.884, + "grad_norm": 0.00055694580078125, + "learning_rate": 1.16e-06, + "loss": 0.0, + "step": 1221 + }, + { + "epoch": 4.884, + "eval_accuracy": 0.99, + "eval_loss": 0.03987071290612221, + "eval_runtime": 29.577, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 1221 + }, + { + "epoch": 4.888, + "grad_norm": 0.0031280517578125, + "learning_rate": 1.12e-06, + "loss": 0.0, + "step": 1222 + }, + { + "epoch": 4.888, + "eval_accuracy": 0.99, + "eval_loss": 0.03916815295815468, + "eval_runtime": 29.5669, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 1222 + }, + { + "epoch": 4.892, + "grad_norm": 0.236328125, + "learning_rate": 1.08e-06, + "loss": 0.0006, + "step": 1223 + }, + { + "epoch": 4.892, + "eval_accuracy": 0.99, + "eval_loss": 0.03995658829808235, + "eval_runtime": 29.5715, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.13, + "step": 1223 + }, + { + "epoch": 4.896, + "grad_norm": 0.0174560546875, + "learning_rate": 1.04e-06, + "loss": 0.0, + "step": 1224 + }, + { + "epoch": 4.896, + "eval_accuracy": 0.99, + "eval_loss": 0.039282578974962234, + "eval_runtime": 29.5514, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.132, + "step": 1224 + }, + { + "epoch": 4.9, + "grad_norm": 0.0030364990234375, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0, + "step": 1225 + }, + { + "epoch": 4.9, + "eval_accuracy": 0.99, + "eval_loss": 0.03948935121297836, + "eval_runtime": 29.6084, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.128, + "step": 1225 + }, + { + "epoch": 4.904, + "grad_norm": 2.47955322265625e-05, + "learning_rate": 9.6e-07, + "loss": 0.0, + "step": 1226 + }, + { + "epoch": 4.904, + "eval_accuracy": 0.99, + "eval_loss": 0.04002528265118599, + "eval_runtime": 29.5663, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 1226 + }, + { + "epoch": 4.908, + "grad_norm": 0.0537109375, + "learning_rate": 9.2e-07, + "loss": 0.0002, + "step": 1227 + }, + { + "epoch": 4.908, + "eval_accuracy": 0.99, + "eval_loss": 0.04045616090297699, + "eval_runtime": 29.5569, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.131, + "step": 1227 + }, + { + "epoch": 4.912, + "grad_norm": 0.011962890625, + "learning_rate": 8.8e-07, + "loss": 0.0, + "step": 1228 + }, + { + "epoch": 4.912, + "eval_accuracy": 0.99, + "eval_loss": 0.03996226191520691, + "eval_runtime": 29.5669, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 1228 + }, + { + "epoch": 4.916, + "grad_norm": 9.42964106798172e-09, + "learning_rate": 8.4e-07, + "loss": 0.0, + "step": 1229 + }, + { + "epoch": 4.916, + "eval_accuracy": 0.99, + "eval_loss": 0.040111929178237915, + "eval_runtime": 29.5742, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.13, + "step": 1229 + }, + { + "epoch": 4.92, + "grad_norm": 0.000766754150390625, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "step": 1230 + }, + { + "epoch": 4.92, + "eval_accuracy": 0.99, + "eval_loss": 0.040205780416727066, + "eval_runtime": 29.5774, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 1230 + }, + { + "epoch": 4.924, + "grad_norm": 0.00555419921875, + "learning_rate": 7.6e-07, + "loss": 0.0, + "step": 1231 + }, + { + "epoch": 4.924, + "eval_accuracy": 0.99, + "eval_loss": 0.039955589920282364, + "eval_runtime": 29.6135, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 1231 + }, + { + "epoch": 4.928, + "grad_norm": 0.00066375732421875, + "learning_rate": 7.2e-07, + "loss": 0.0, + "step": 1232 + }, + { + "epoch": 4.928, + "eval_accuracy": 0.99, + "eval_loss": 0.039896104484796524, + "eval_runtime": 29.563, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 1232 + }, + { + "epoch": 4.932, + "grad_norm": 0.017333984375, + "learning_rate": 6.8e-07, + "loss": 0.0, + "step": 1233 + }, + { + "epoch": 4.932, + "eval_accuracy": 0.99, + "eval_loss": 0.040225252509117126, + "eval_runtime": 29.6143, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.127, + "step": 1233 + }, + { + "epoch": 4.936, + "grad_norm": 300.0, + "learning_rate": 6.4e-07, + "loss": 0.6523, + "step": 1234 + }, + { + "epoch": 4.936, + "eval_accuracy": 0.99, + "eval_loss": 0.04042859375476837, + "eval_runtime": 29.6296, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.126, + "step": 1234 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 143.0, + "learning_rate": 6.000000000000001e-07, + "loss": 0.625, + "step": 1235 + }, + { + "epoch": 4.9399999999999995, + "eval_accuracy": 0.99, + "eval_loss": 0.039302945137023926, + "eval_runtime": 29.5826, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.13, + "step": 1235 + }, + { + "epoch": 4.944, + "grad_norm": 0.01507568359375, + "learning_rate": 5.6e-07, + "loss": 0.0, + "step": 1236 + }, + { + "epoch": 4.944, + "eval_accuracy": 0.99, + "eval_loss": 0.03978666663169861, + "eval_runtime": 29.6386, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.126, + "step": 1236 + }, + { + "epoch": 4.948, + "grad_norm": 12.4375, + "learning_rate": 5.2e-07, + "loss": 0.0266, + "step": 1237 + }, + { + "epoch": 4.948, + "eval_accuracy": 0.99, + "eval_loss": 0.03954610601067543, + "eval_runtime": 29.6468, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.125, + "step": 1237 + }, + { + "epoch": 4.952, + "grad_norm": 1.40625, + "learning_rate": 4.8e-07, + "loss": 0.002, + "step": 1238 + }, + { + "epoch": 4.952, + "eval_accuracy": 0.99, + "eval_loss": 0.03866248577833176, + "eval_runtime": 29.6446, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.125, + "step": 1238 + }, + { + "epoch": 4.9559999999999995, + "grad_norm": 7.486343383789062e-05, + "learning_rate": 4.4e-07, + "loss": 0.0, + "step": 1239 + }, + { + "epoch": 4.9559999999999995, + "eval_accuracy": 0.99, + "eval_loss": 0.040864985436201096, + "eval_runtime": 29.5881, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.129, + "step": 1239 + }, + { + "epoch": 4.96, + "grad_norm": 0.00433349609375, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "step": 1240 + }, + { + "epoch": 4.96, + "eval_accuracy": 0.99, + "eval_loss": 0.040501151233911514, + "eval_runtime": 29.5805, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1240 + }, + { + "epoch": 4.964, + "grad_norm": 0.00010585784912109375, + "learning_rate": 3.6e-07, + "loss": 0.0, + "step": 1241 + }, + { + "epoch": 4.964, + "eval_accuracy": 0.99, + "eval_loss": 0.03989541158080101, + "eval_runtime": 29.5622, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.131, + "step": 1241 + }, + { + "epoch": 4.968, + "grad_norm": 0.000514984130859375, + "learning_rate": 3.2e-07, + "loss": 0.0, + "step": 1242 + }, + { + "epoch": 4.968, + "eval_accuracy": 0.99, + "eval_loss": 0.03939824551343918, + "eval_runtime": 29.6076, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.128, + "step": 1242 + }, + { + "epoch": 4.9719999999999995, + "grad_norm": 0.0576171875, + "learning_rate": 2.8e-07, + "loss": 0.0002, + "step": 1243 + }, + { + "epoch": 4.9719999999999995, + "eval_accuracy": 0.99, + "eval_loss": 0.03874744102358818, + "eval_runtime": 29.5805, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1243 + }, + { + "epoch": 4.976, + "grad_norm": 7.343292236328125e-05, + "learning_rate": 2.4e-07, + "loss": 0.0, + "step": 1244 + }, + { + "epoch": 4.976, + "eval_accuracy": 0.99, + "eval_loss": 0.039426933974027634, + "eval_runtime": 29.5791, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.13, + "step": 1244 + }, + { + "epoch": 4.98, + "grad_norm": 0.0007476806640625, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "step": 1245 + }, + { + "epoch": 4.98, + "eval_accuracy": 0.99, + "eval_loss": 0.039973627775907516, + "eval_runtime": 29.5769, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.13, + "step": 1245 + }, + { + "epoch": 4.984, + "grad_norm": 6.631016731262207e-07, + "learning_rate": 1.6e-07, + "loss": 0.0, + "step": 1246 + }, + { + "epoch": 4.984, + "eval_accuracy": 0.99, + "eval_loss": 0.03948342427611351, + "eval_runtime": 29.6331, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.126, + "step": 1246 + }, + { + "epoch": 4.9879999999999995, + "grad_norm": 0.78515625, + "learning_rate": 1.2e-07, + "loss": 0.0032, + "step": 1247 + }, + { + "epoch": 4.9879999999999995, + "eval_accuracy": 0.99, + "eval_loss": 0.03993542119860649, + "eval_runtime": 29.5808, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.13, + "step": 1247 + }, + { + "epoch": 4.992, + "grad_norm": 4.38690185546875e-05, + "learning_rate": 8e-08, + "loss": 0.0, + "step": 1248 + }, + { + "epoch": 4.992, + "eval_accuracy": 0.99, + "eval_loss": 0.04022332653403282, + "eval_runtime": 29.5752, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.13, + "step": 1248 + }, + { + "epoch": 4.996, + "grad_norm": 0.0264892578125, + "learning_rate": 4e-08, + "loss": 0.0001, + "step": 1249 + }, + { + "epoch": 4.996, + "eval_accuracy": 0.99, + "eval_loss": 0.03989771381020546, + "eval_runtime": 29.5691, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.131, + "step": 1249 + }, + { + "epoch": 5.0, + "grad_norm": 0.01470947265625, + "learning_rate": 0.0, + "loss": 0.0001, + "step": 1250 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.99, + "eval_loss": 0.03989832103252411, + "eval_runtime": 29.5661, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.131, + "step": 1250 + } + ], + "logging_steps": 1, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}