diff --git "a/rm-harmless-cai/checkpoint-1250/trainer_state.json" "b/rm-harmless-cai/checkpoint-1250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/rm-harmless-cai/checkpoint-1250/trainer_state.json" @@ -0,0 +1,20021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 1, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 27.25, + "learning_rate": 4.996e-05, + "loss": 0.5391, + "step": 1 + }, + { + "epoch": 0.004, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.671749472618103, + "eval_runtime": 28.5903, + "eval_samples_per_second": 17.419, + "eval_steps_per_second": 2.204, + "step": 1 + }, + { + "epoch": 0.008, + "grad_norm": 42.5, + "learning_rate": 4.992e-05, + "loss": 1.0, + "step": 2 + }, + { + "epoch": 0.008, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.6584305763244629, + "eval_runtime": 28.8781, + "eval_samples_per_second": 17.245, + "eval_steps_per_second": 2.182, + "step": 2 + }, + { + "epoch": 0.012, + "grad_norm": 16.75, + "learning_rate": 4.9880000000000004e-05, + "loss": 0.4805, + "step": 3 + }, + { + "epoch": 0.012, + "eval_accuracy": 0.7429718875502008, + "eval_loss": 0.6664156913757324, + "eval_runtime": 29.0527, + "eval_samples_per_second": 17.141, + "eval_steps_per_second": 2.168, + "step": 3 + }, + { + "epoch": 0.016, + "grad_norm": 25.625, + "learning_rate": 4.9840000000000004e-05, + "loss": 0.6523, + "step": 4 + }, + { + "epoch": 0.016, + "eval_accuracy": 0.7469879518072289, + "eval_loss": 0.6629329919815063, + "eval_runtime": 29.3114, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 4 + }, + { + "epoch": 0.02, + "grad_norm": 62.0, + "learning_rate": 4.9800000000000004e-05, + "loss": 0.8008, + "step": 5 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.7429718875502008, + "eval_loss": 0.6590580940246582, + "eval_runtime": 29.4037, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 5 + }, + { + "epoch": 0.024, + "grad_norm": 19.25, + "learning_rate": 4.976e-05, + "loss": 0.7812, + "step": 6 + }, + { + "epoch": 0.024, + "eval_accuracy": 0.7269076305220884, + "eval_loss": 0.6581011414527893, + "eval_runtime": 29.467, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 6 + }, + { + "epoch": 0.028, + "grad_norm": 20.625, + "learning_rate": 4.972e-05, + "loss": 0.5391, + "step": 7 + }, + { + "epoch": 0.028, + "eval_accuracy": 0.7429718875502008, + "eval_loss": 0.6529948115348816, + "eval_runtime": 29.5386, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 7 + }, + { + "epoch": 0.032, + "grad_norm": 14.75, + "learning_rate": 4.9680000000000005e-05, + "loss": 0.5742, + "step": 8 + }, + { + "epoch": 0.032, + "eval_accuracy": 0.7369477911646586, + "eval_loss": 0.6508749723434448, + "eval_runtime": 29.5481, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.132, + "step": 8 + }, + { + "epoch": 0.036, + "grad_norm": 19.875, + "learning_rate": 4.9640000000000006e-05, + "loss": 0.7578, + "step": 9 + }, + { + "epoch": 0.036, + "eval_accuracy": 0.7349397590361446, + "eval_loss": 0.6519476175308228, + "eval_runtime": 29.4914, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 9 + }, + { + "epoch": 0.04, + "grad_norm": 15.1875, + "learning_rate": 4.96e-05, + "loss": 0.5781, + "step": 10 + }, + { + "epoch": 0.04, + "eval_accuracy": 0.7349397590361446, + "eval_loss": 0.6504769325256348, + "eval_runtime": 29.5394, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 10 + }, + { + "epoch": 0.044, + "grad_norm": 135.0, + "learning_rate": 4.956e-05, + "loss": 0.9531, + "step": 11 + }, + { + "epoch": 0.044, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6459823846817017, + "eval_runtime": 29.5175, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 11 + }, + { + "epoch": 0.048, + "grad_norm": 13.125, + "learning_rate": 4.952e-05, + "loss": 0.5391, + "step": 12 + }, + { + "epoch": 0.048, + "eval_accuracy": 0.7369477911646586, + "eval_loss": 0.6683923006057739, + "eval_runtime": 29.3877, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.144, + "step": 12 + }, + { + "epoch": 0.052, + "grad_norm": 23.875, + "learning_rate": 4.948000000000001e-05, + "loss": 0.6562, + "step": 13 + }, + { + "epoch": 0.052, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.6653802990913391, + "eval_runtime": 29.3563, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 13 + }, + { + "epoch": 0.056, + "grad_norm": 114.5, + "learning_rate": 4.944e-05, + "loss": 0.9688, + "step": 14 + }, + { + "epoch": 0.056, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.6690433621406555, + "eval_runtime": 29.3244, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.148, + "step": 14 + }, + { + "epoch": 0.06, + "grad_norm": 30.125, + "learning_rate": 4.94e-05, + "loss": 0.875, + "step": 15 + }, + { + "epoch": 0.06, + "eval_accuracy": 0.7469879518072289, + "eval_loss": 0.6654351949691772, + "eval_runtime": 29.3055, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 15 + }, + { + "epoch": 0.064, + "grad_norm": 122.5, + "learning_rate": 4.936e-05, + "loss": 1.2266, + "step": 16 + }, + { + "epoch": 0.064, + "eval_accuracy": 0.7550200803212851, + "eval_loss": 0.6579756736755371, + "eval_runtime": 29.3521, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.146, + "step": 16 + }, + { + "epoch": 0.068, + "grad_norm": 30.375, + "learning_rate": 4.932e-05, + "loss": 0.7227, + "step": 17 + }, + { + "epoch": 0.068, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6493473649024963, + "eval_runtime": 29.4681, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 17 + }, + { + "epoch": 0.072, + "grad_norm": 13.1875, + "learning_rate": 4.928e-05, + "loss": 0.8047, + "step": 18 + }, + { + "epoch": 0.072, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.6470922827720642, + "eval_runtime": 29.5202, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 18 + }, + { + "epoch": 0.076, + "grad_norm": 19.625, + "learning_rate": 4.924e-05, + "loss": 0.6523, + "step": 19 + }, + { + "epoch": 0.076, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6470138430595398, + "eval_runtime": 29.4993, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 19 + }, + { + "epoch": 0.08, + "grad_norm": 57.5, + "learning_rate": 4.92e-05, + "loss": 0.6875, + "step": 20 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6478198170661926, + "eval_runtime": 29.4831, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 20 + }, + { + "epoch": 0.084, + "grad_norm": 11.4375, + "learning_rate": 4.9160000000000004e-05, + "loss": 0.5391, + "step": 21 + }, + { + "epoch": 0.084, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6534634828567505, + "eval_runtime": 29.4689, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 21 + }, + { + "epoch": 0.088, + "grad_norm": 10.4375, + "learning_rate": 4.9120000000000004e-05, + "loss": 0.4844, + "step": 22 + }, + { + "epoch": 0.088, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.655241072177887, + "eval_runtime": 29.457, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 22 + }, + { + "epoch": 0.092, + "grad_norm": 39.0, + "learning_rate": 4.9080000000000004e-05, + "loss": 0.5391, + "step": 23 + }, + { + "epoch": 0.092, + "eval_accuracy": 0.7650602409638554, + "eval_loss": 0.6521533131599426, + "eval_runtime": 29.3777, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.144, + "step": 23 + }, + { + "epoch": 0.096, + "grad_norm": 18.25, + "learning_rate": 4.9040000000000005e-05, + "loss": 0.7461, + "step": 24 + }, + { + "epoch": 0.096, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6577795743942261, + "eval_runtime": 29.3085, + "eval_samples_per_second": 16.992, + "eval_steps_per_second": 2.15, + "step": 24 + }, + { + "epoch": 0.1, + "grad_norm": 11.0625, + "learning_rate": 4.9e-05, + "loss": 0.625, + "step": 25 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6604307889938354, + "eval_runtime": 29.3395, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 25 + }, + { + "epoch": 0.104, + "grad_norm": 9.0, + "learning_rate": 4.896e-05, + "loss": 0.6133, + "step": 26 + }, + { + "epoch": 0.104, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.6686707735061646, + "eval_runtime": 29.3977, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.143, + "step": 26 + }, + { + "epoch": 0.108, + "grad_norm": 10.125, + "learning_rate": 4.8920000000000006e-05, + "loss": 0.9336, + "step": 27 + }, + { + "epoch": 0.108, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.6728358864784241, + "eval_runtime": 29.4491, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.139, + "step": 27 + }, + { + "epoch": 0.112, + "grad_norm": 9.8125, + "learning_rate": 4.8880000000000006e-05, + "loss": 0.875, + "step": 28 + }, + { + "epoch": 0.112, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.6621760725975037, + "eval_runtime": 29.5084, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 28 + }, + { + "epoch": 0.116, + "grad_norm": 8.375, + "learning_rate": 4.884e-05, + "loss": 0.6406, + "step": 29 + }, + { + "epoch": 0.116, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.6655136346817017, + "eval_runtime": 29.5104, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 29 + }, + { + "epoch": 0.12, + "grad_norm": 8.4375, + "learning_rate": 4.88e-05, + "loss": 0.5625, + "step": 30 + }, + { + "epoch": 0.12, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6655449867248535, + "eval_runtime": 29.4548, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 30 + }, + { + "epoch": 0.124, + "grad_norm": 7.34375, + "learning_rate": 4.876e-05, + "loss": 0.7188, + "step": 31 + }, + { + "epoch": 0.124, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.6709141135215759, + "eval_runtime": 29.4545, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 31 + }, + { + "epoch": 0.128, + "grad_norm": 6.53125, + "learning_rate": 4.872000000000001e-05, + "loss": 0.7109, + "step": 32 + }, + { + "epoch": 0.128, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6684707403182983, + "eval_runtime": 29.3608, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 32 + }, + { + "epoch": 0.132, + "grad_norm": 11.125, + "learning_rate": 4.868e-05, + "loss": 0.5977, + "step": 33 + }, + { + "epoch": 0.132, + "eval_accuracy": 0.7650602409638554, + "eval_loss": 0.6656391024589539, + "eval_runtime": 29.3099, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.149, + "step": 33 + }, + { + "epoch": 0.136, + "grad_norm": 7.28125, + "learning_rate": 4.864e-05, + "loss": 0.7695, + "step": 34 + }, + { + "epoch": 0.136, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6640076041221619, + "eval_runtime": 29.4165, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 34 + }, + { + "epoch": 0.14, + "grad_norm": 7.9375, + "learning_rate": 4.86e-05, + "loss": 0.6641, + "step": 35 + }, + { + "epoch": 0.14, + "eval_accuracy": 0.7369477911646586, + "eval_loss": 0.6778441667556763, + "eval_runtime": 29.4501, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.139, + "step": 35 + }, + { + "epoch": 0.144, + "grad_norm": 7.34375, + "learning_rate": 4.856e-05, + "loss": 0.7188, + "step": 36 + }, + { + "epoch": 0.144, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.690837562084198, + "eval_runtime": 29.5156, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 36 + }, + { + "epoch": 0.148, + "grad_norm": 9.3125, + "learning_rate": 4.852e-05, + "loss": 0.7188, + "step": 37 + }, + { + "epoch": 0.148, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6788521409034729, + "eval_runtime": 29.5082, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 37 + }, + { + "epoch": 0.152, + "grad_norm": 7.09375, + "learning_rate": 4.8480000000000003e-05, + "loss": 0.7227, + "step": 38 + }, + { + "epoch": 0.152, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6907473802566528, + "eval_runtime": 29.4618, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.138, + "step": 38 + }, + { + "epoch": 0.156, + "grad_norm": 5.09375, + "learning_rate": 4.8440000000000004e-05, + "loss": 0.4199, + "step": 39 + }, + { + "epoch": 0.156, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.685415506362915, + "eval_runtime": 29.4012, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 39 + }, + { + "epoch": 0.16, + "grad_norm": 8.25, + "learning_rate": 4.8400000000000004e-05, + "loss": 0.3809, + "step": 40 + }, + { + "epoch": 0.16, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.6820744276046753, + "eval_runtime": 29.3004, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.15, + "step": 40 + }, + { + "epoch": 0.164, + "grad_norm": 178.0, + "learning_rate": 4.836e-05, + "loss": 1.0547, + "step": 41 + }, + { + "epoch": 0.164, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6859321594238281, + "eval_runtime": 29.2393, + "eval_samples_per_second": 17.032, + "eval_steps_per_second": 2.155, + "step": 41 + }, + { + "epoch": 0.168, + "grad_norm": 7.375, + "learning_rate": 4.8320000000000005e-05, + "loss": 0.7383, + "step": 42 + }, + { + "epoch": 0.168, + "eval_accuracy": 0.7650602409638554, + "eval_loss": 0.6873440742492676, + "eval_runtime": 29.2131, + "eval_samples_per_second": 17.047, + "eval_steps_per_second": 2.157, + "step": 42 + }, + { + "epoch": 0.172, + "grad_norm": 240.0, + "learning_rate": 4.8280000000000005e-05, + "loss": 1.0703, + "step": 43 + }, + { + "epoch": 0.172, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6906698346138, + "eval_runtime": 29.1905, + "eval_samples_per_second": 17.06, + "eval_steps_per_second": 2.158, + "step": 43 + }, + { + "epoch": 0.176, + "grad_norm": 85.5, + "learning_rate": 4.824e-05, + "loss": 2.4531, + "step": 44 + }, + { + "epoch": 0.176, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6974478960037231, + "eval_runtime": 29.3024, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 44 + }, + { + "epoch": 0.18, + "grad_norm": 7.9375, + "learning_rate": 4.82e-05, + "loss": 0.5586, + "step": 45 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.7034720778465271, + "eval_runtime": 29.4383, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.14, + "step": 45 + }, + { + "epoch": 0.184, + "grad_norm": 9.5625, + "learning_rate": 4.816e-05, + "loss": 0.2617, + "step": 46 + }, + { + "epoch": 0.184, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.7064840793609619, + "eval_runtime": 29.4358, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.14, + "step": 46 + }, + { + "epoch": 0.188, + "grad_norm": 31.0, + "learning_rate": 4.812000000000001e-05, + "loss": 1.3672, + "step": 47 + }, + { + "epoch": 0.188, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.7108449339866638, + "eval_runtime": 29.49, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 47 + }, + { + "epoch": 0.192, + "grad_norm": 5.53125, + "learning_rate": 4.808e-05, + "loss": 0.498, + "step": 48 + }, + { + "epoch": 0.192, + "eval_accuracy": 0.7670682730923695, + "eval_loss": 0.7097460031509399, + "eval_runtime": 29.5007, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 48 + }, + { + "epoch": 0.196, + "grad_norm": 56.25, + "learning_rate": 4.804e-05, + "loss": 2.5, + "step": 49 + }, + { + "epoch": 0.196, + "eval_accuracy": 0.7670682730923695, + "eval_loss": 0.7019027471542358, + "eval_runtime": 29.4831, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 49 + }, + { + "epoch": 0.2, + "grad_norm": 5.34375, + "learning_rate": 4.8e-05, + "loss": 0.3711, + "step": 50 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6896992325782776, + "eval_runtime": 29.4812, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 50 + }, + { + "epoch": 0.204, + "grad_norm": 42.5, + "learning_rate": 4.796e-05, + "loss": 0.3965, + "step": 51 + }, + { + "epoch": 0.204, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6737646460533142, + "eval_runtime": 29.3948, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.143, + "step": 51 + }, + { + "epoch": 0.208, + "grad_norm": 20.625, + "learning_rate": 4.792e-05, + "loss": 0.6641, + "step": 52 + }, + { + "epoch": 0.208, + "eval_accuracy": 0.7670682730923695, + "eval_loss": 0.6650082468986511, + "eval_runtime": 29.3879, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.144, + "step": 52 + }, + { + "epoch": 0.212, + "grad_norm": 6.53125, + "learning_rate": 4.788e-05, + "loss": 0.5586, + "step": 53 + }, + { + "epoch": 0.212, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6571245789527893, + "eval_runtime": 29.473, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 53 + }, + { + "epoch": 0.216, + "grad_norm": 6.15625, + "learning_rate": 4.784e-05, + "loss": 0.6523, + "step": 54 + }, + { + "epoch": 0.216, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6644390225410461, + "eval_runtime": 29.4938, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 54 + }, + { + "epoch": 0.22, + "grad_norm": 5.5625, + "learning_rate": 4.78e-05, + "loss": 0.5352, + "step": 55 + }, + { + "epoch": 0.22, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6522221565246582, + "eval_runtime": 29.4364, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.14, + "step": 55 + }, + { + "epoch": 0.224, + "grad_norm": 6.125, + "learning_rate": 4.7760000000000004e-05, + "loss": 0.668, + "step": 56 + }, + { + "epoch": 0.224, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.6511220932006836, + "eval_runtime": 29.4473, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.139, + "step": 56 + }, + { + "epoch": 0.228, + "grad_norm": 6.28125, + "learning_rate": 4.7720000000000004e-05, + "loss": 0.5352, + "step": 57 + }, + { + "epoch": 0.228, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.6502337455749512, + "eval_runtime": 29.4999, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 57 + }, + { + "epoch": 0.232, + "grad_norm": 52.0, + "learning_rate": 4.7680000000000004e-05, + "loss": 1.6719, + "step": 58 + }, + { + "epoch": 0.232, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6462059020996094, + "eval_runtime": 29.3713, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.145, + "step": 58 + }, + { + "epoch": 0.236, + "grad_norm": 7.25, + "learning_rate": 4.7640000000000005e-05, + "loss": 0.7266, + "step": 59 + }, + { + "epoch": 0.236, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6470491290092468, + "eval_runtime": 29.315, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.149, + "step": 59 + }, + { + "epoch": 0.24, + "grad_norm": 5.28125, + "learning_rate": 4.76e-05, + "loss": 0.7109, + "step": 60 + }, + { + "epoch": 0.24, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6490787267684937, + "eval_runtime": 29.199, + "eval_samples_per_second": 17.055, + "eval_steps_per_second": 2.158, + "step": 60 + }, + { + "epoch": 0.244, + "grad_norm": 5.8125, + "learning_rate": 4.7560000000000005e-05, + "loss": 0.5781, + "step": 61 + }, + { + "epoch": 0.244, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6494572162628174, + "eval_runtime": 29.2376, + "eval_samples_per_second": 17.033, + "eval_steps_per_second": 2.155, + "step": 61 + }, + { + "epoch": 0.248, + "grad_norm": 5.59375, + "learning_rate": 4.7520000000000006e-05, + "loss": 0.6094, + "step": 62 + }, + { + "epoch": 0.248, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6492964029312134, + "eval_runtime": 29.2679, + "eval_samples_per_second": 17.015, + "eval_steps_per_second": 2.153, + "step": 62 + }, + { + "epoch": 0.252, + "grad_norm": 6.15625, + "learning_rate": 4.748e-05, + "loss": 0.5117, + "step": 63 + }, + { + "epoch": 0.252, + "eval_accuracy": 0.7650602409638554, + "eval_loss": 0.6466471552848816, + "eval_runtime": 29.415, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.142, + "step": 63 + }, + { + "epoch": 0.256, + "grad_norm": 5.09375, + "learning_rate": 4.744e-05, + "loss": 0.6484, + "step": 64 + }, + { + "epoch": 0.256, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.6458666920661926, + "eval_runtime": 29.472, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 64 + }, + { + "epoch": 0.26, + "grad_norm": 5.1875, + "learning_rate": 4.74e-05, + "loss": 0.5508, + "step": 65 + }, + { + "epoch": 0.26, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6450666189193726, + "eval_runtime": 29.483, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 65 + }, + { + "epoch": 0.264, + "grad_norm": 4.6875, + "learning_rate": 4.736000000000001e-05, + "loss": 0.5039, + "step": 66 + }, + { + "epoch": 0.264, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.645880401134491, + "eval_runtime": 29.4213, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.141, + "step": 66 + }, + { + "epoch": 0.268, + "grad_norm": 4.71875, + "learning_rate": 4.732e-05, + "loss": 0.4551, + "step": 67 + }, + { + "epoch": 0.268, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.6462588906288147, + "eval_runtime": 29.4112, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.142, + "step": 67 + }, + { + "epoch": 0.272, + "grad_norm": 4.875, + "learning_rate": 4.728e-05, + "loss": 0.4336, + "step": 68 + }, + { + "epoch": 0.272, + "eval_accuracy": 0.7550200803212851, + "eval_loss": 0.6485679149627686, + "eval_runtime": 29.4136, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 68 + }, + { + "epoch": 0.276, + "grad_norm": 5.25, + "learning_rate": 4.724e-05, + "loss": 0.4961, + "step": 69 + }, + { + "epoch": 0.276, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.6498944759368896, + "eval_runtime": 29.3815, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 69 + }, + { + "epoch": 0.28, + "grad_norm": 57.0, + "learning_rate": 4.72e-05, + "loss": 0.5703, + "step": 70 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.6515544652938843, + "eval_runtime": 29.3138, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 70 + }, + { + "epoch": 0.284, + "grad_norm": 8.625, + "learning_rate": 4.716e-05, + "loss": 0.7305, + "step": 71 + }, + { + "epoch": 0.284, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.6519848704338074, + "eval_runtime": 29.2089, + "eval_samples_per_second": 17.05, + "eval_steps_per_second": 2.157, + "step": 71 + }, + { + "epoch": 0.288, + "grad_norm": 4.84375, + "learning_rate": 4.712e-05, + "loss": 0.1758, + "step": 72 + }, + { + "epoch": 0.288, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.6570667624473572, + "eval_runtime": 29.227, + "eval_samples_per_second": 17.039, + "eval_steps_per_second": 2.156, + "step": 72 + }, + { + "epoch": 0.292, + "grad_norm": 12.0, + "learning_rate": 4.708e-05, + "loss": 0.6055, + "step": 73 + }, + { + "epoch": 0.292, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.661380410194397, + "eval_runtime": 29.1734, + "eval_samples_per_second": 17.07, + "eval_steps_per_second": 2.159, + "step": 73 + }, + { + "epoch": 0.296, + "grad_norm": 10.875, + "learning_rate": 4.7040000000000004e-05, + "loss": 0.4434, + "step": 74 + }, + { + "epoch": 0.296, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.6655927896499634, + "eval_runtime": 29.2959, + "eval_samples_per_second": 16.999, + "eval_steps_per_second": 2.15, + "step": 74 + }, + { + "epoch": 0.3, + "grad_norm": 15.0, + "learning_rate": 4.7e-05, + "loss": 0.9531, + "step": 75 + }, + { + "epoch": 0.3, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6597763299942017, + "eval_runtime": 29.4189, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.141, + "step": 75 + }, + { + "epoch": 0.304, + "grad_norm": 6.0, + "learning_rate": 4.6960000000000004e-05, + "loss": 0.4707, + "step": 76 + }, + { + "epoch": 0.304, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.6556656360626221, + "eval_runtime": 29.4088, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.142, + "step": 76 + }, + { + "epoch": 0.308, + "grad_norm": 13.1875, + "learning_rate": 4.6920000000000005e-05, + "loss": 0.6367, + "step": 77 + }, + { + "epoch": 0.308, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6510534286499023, + "eval_runtime": 29.4604, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.138, + "step": 77 + }, + { + "epoch": 0.312, + "grad_norm": 7.5625, + "learning_rate": 4.688e-05, + "loss": 0.4238, + "step": 78 + }, + { + "epoch": 0.312, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6499336957931519, + "eval_runtime": 29.4319, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.141, + "step": 78 + }, + { + "epoch": 0.316, + "grad_norm": 13.0, + "learning_rate": 4.684e-05, + "loss": 0.6992, + "step": 79 + }, + { + "epoch": 0.316, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.650346040725708, + "eval_runtime": 29.4523, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 79 + }, + { + "epoch": 0.32, + "grad_norm": 9.1875, + "learning_rate": 4.6800000000000006e-05, + "loss": 0.8828, + "step": 80 + }, + { + "epoch": 0.32, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.650997519493103, + "eval_runtime": 29.4298, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.141, + "step": 80 + }, + { + "epoch": 0.324, + "grad_norm": 12.3125, + "learning_rate": 4.6760000000000006e-05, + "loss": 0.9609, + "step": 81 + }, + { + "epoch": 0.324, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.6494449377059937, + "eval_runtime": 29.3625, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.146, + "step": 81 + }, + { + "epoch": 0.328, + "grad_norm": 11.0, + "learning_rate": 4.672e-05, + "loss": 0.8945, + "step": 82 + }, + { + "epoch": 0.328, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.6491120457649231, + "eval_runtime": 29.2754, + "eval_samples_per_second": 17.011, + "eval_steps_per_second": 2.152, + "step": 82 + }, + { + "epoch": 0.332, + "grad_norm": 6.65625, + "learning_rate": 4.668e-05, + "loss": 0.4141, + "step": 83 + }, + { + "epoch": 0.332, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6492424607276917, + "eval_runtime": 29.2228, + "eval_samples_per_second": 17.042, + "eval_steps_per_second": 2.156, + "step": 83 + }, + { + "epoch": 0.336, + "grad_norm": 5.65625, + "learning_rate": 4.664e-05, + "loss": 0.5469, + "step": 84 + }, + { + "epoch": 0.336, + "eval_accuracy": 0.7891566265060241, + "eval_loss": 0.6485850811004639, + "eval_runtime": 29.2629, + "eval_samples_per_second": 17.018, + "eval_steps_per_second": 2.153, + "step": 84 + }, + { + "epoch": 0.34, + "grad_norm": 8.25, + "learning_rate": 4.660000000000001e-05, + "loss": 0.7891, + "step": 85 + }, + { + "epoch": 0.34, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6485472917556763, + "eval_runtime": 29.3836, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 85 + }, + { + "epoch": 0.344, + "grad_norm": 8.4375, + "learning_rate": 4.656e-05, + "loss": 0.7969, + "step": 86 + }, + { + "epoch": 0.344, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6483581066131592, + "eval_runtime": 29.4836, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 86 + }, + { + "epoch": 0.348, + "grad_norm": 3.71875, + "learning_rate": 4.652e-05, + "loss": 0.1416, + "step": 87 + }, + { + "epoch": 0.348, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.6490346193313599, + "eval_runtime": 29.4358, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.14, + "step": 87 + }, + { + "epoch": 0.352, + "grad_norm": 4.6875, + "learning_rate": 4.648e-05, + "loss": 0.3496, + "step": 88 + }, + { + "epoch": 0.352, + "eval_accuracy": 0.7891566265060241, + "eval_loss": 0.6501420736312866, + "eval_runtime": 29.444, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 88 + }, + { + "epoch": 0.356, + "grad_norm": 11.3125, + "learning_rate": 4.644e-05, + "loss": 0.9141, + "step": 89 + }, + { + "epoch": 0.356, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.6521397829055786, + "eval_runtime": 29.4428, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 89 + }, + { + "epoch": 0.36, + "grad_norm": 6.65625, + "learning_rate": 4.64e-05, + "loss": 0.4102, + "step": 90 + }, + { + "epoch": 0.36, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6548557281494141, + "eval_runtime": 29.4523, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 90 + }, + { + "epoch": 0.364, + "grad_norm": 6.96875, + "learning_rate": 4.636e-05, + "loss": 0.625, + "step": 91 + }, + { + "epoch": 0.364, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6554092168807983, + "eval_runtime": 29.2741, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.152, + "step": 91 + }, + { + "epoch": 0.368, + "grad_norm": 6.78125, + "learning_rate": 4.6320000000000004e-05, + "loss": 0.4883, + "step": 92 + }, + { + "epoch": 0.368, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6591306924819946, + "eval_runtime": 29.1988, + "eval_samples_per_second": 17.055, + "eval_steps_per_second": 2.158, + "step": 92 + }, + { + "epoch": 0.372, + "grad_norm": 11.5625, + "learning_rate": 4.6280000000000004e-05, + "loss": 1.1562, + "step": 93 + }, + { + "epoch": 0.372, + "eval_accuracy": 0.7911646586345381, + "eval_loss": 0.6637662649154663, + "eval_runtime": 29.1725, + "eval_samples_per_second": 17.071, + "eval_steps_per_second": 2.16, + "step": 93 + }, + { + "epoch": 0.376, + "grad_norm": 19.75, + "learning_rate": 4.624e-05, + "loss": 1.2266, + "step": 94 + }, + { + "epoch": 0.376, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6651781797409058, + "eval_runtime": 29.2942, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 94 + }, + { + "epoch": 0.38, + "grad_norm": 7.40625, + "learning_rate": 4.6200000000000005e-05, + "loss": 0.5781, + "step": 95 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6659625768661499, + "eval_runtime": 29.3851, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.144, + "step": 95 + }, + { + "epoch": 0.384, + "grad_norm": 5.84375, + "learning_rate": 4.6160000000000005e-05, + "loss": 0.4883, + "step": 96 + }, + { + "epoch": 0.384, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6672186851501465, + "eval_runtime": 29.4745, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.137, + "step": 96 + }, + { + "epoch": 0.388, + "grad_norm": 12.625, + "learning_rate": 4.612e-05, + "loss": 1.2734, + "step": 97 + }, + { + "epoch": 0.388, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6654327511787415, + "eval_runtime": 29.4827, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 97 + }, + { + "epoch": 0.392, + "grad_norm": 43.0, + "learning_rate": 4.608e-05, + "loss": 1.0, + "step": 98 + }, + { + "epoch": 0.392, + "eval_accuracy": 0.7670682730923695, + "eval_loss": 0.6617677807807922, + "eval_runtime": 29.4251, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.141, + "step": 98 + }, + { + "epoch": 0.396, + "grad_norm": 7.5, + "learning_rate": 4.604e-05, + "loss": 0.5352, + "step": 99 + }, + { + "epoch": 0.396, + "eval_accuracy": 0.7670682730923695, + "eval_loss": 0.6578513979911804, + "eval_runtime": 29.4914, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 99 + }, + { + "epoch": 0.4, + "grad_norm": 9.0, + "learning_rate": 4.600000000000001e-05, + "loss": 0.6836, + "step": 100 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6557869911193848, + "eval_runtime": 29.4471, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.139, + "step": 100 + }, + { + "epoch": 0.404, + "grad_norm": 6.3125, + "learning_rate": 4.596e-05, + "loss": 0.5156, + "step": 101 + }, + { + "epoch": 0.404, + "eval_accuracy": 0.7650602409638554, + "eval_loss": 0.6549768447875977, + "eval_runtime": 29.2944, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 101 + }, + { + "epoch": 0.408, + "grad_norm": 7.125, + "learning_rate": 4.592e-05, + "loss": 0.5195, + "step": 102 + }, + { + "epoch": 0.408, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6506882309913635, + "eval_runtime": 29.2638, + "eval_samples_per_second": 17.018, + "eval_steps_per_second": 2.153, + "step": 102 + }, + { + "epoch": 0.412, + "grad_norm": 9.25, + "learning_rate": 4.588e-05, + "loss": 0.6602, + "step": 103 + }, + { + "epoch": 0.412, + "eval_accuracy": 0.7670682730923695, + "eval_loss": 0.6481528878211975, + "eval_runtime": 29.4395, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 103 + }, + { + "epoch": 0.416, + "grad_norm": 15.625, + "learning_rate": 4.584e-05, + "loss": 1.1406, + "step": 104 + }, + { + "epoch": 0.416, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6472609043121338, + "eval_runtime": 29.5025, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 104 + }, + { + "epoch": 0.42, + "grad_norm": 8.8125, + "learning_rate": 4.58e-05, + "loss": 0.6641, + "step": 105 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6437375545501709, + "eval_runtime": 29.4431, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 105 + }, + { + "epoch": 0.424, + "grad_norm": 8.125, + "learning_rate": 4.576e-05, + "loss": 0.6016, + "step": 106 + }, + { + "epoch": 0.424, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6424432992935181, + "eval_runtime": 29.4322, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.141, + "step": 106 + }, + { + "epoch": 0.428, + "grad_norm": 7.78125, + "learning_rate": 4.572e-05, + "loss": 0.543, + "step": 107 + }, + { + "epoch": 0.428, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6407426595687866, + "eval_runtime": 29.4803, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.137, + "step": 107 + }, + { + "epoch": 0.432, + "grad_norm": 10.625, + "learning_rate": 4.568e-05, + "loss": 0.7266, + "step": 108 + }, + { + "epoch": 0.432, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6403288841247559, + "eval_runtime": 29.349, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 108 + }, + { + "epoch": 0.436, + "grad_norm": 12.4375, + "learning_rate": 4.564e-05, + "loss": 1.1641, + "step": 109 + }, + { + "epoch": 0.436, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6392258405685425, + "eval_runtime": 29.2625, + "eval_samples_per_second": 17.018, + "eval_steps_per_second": 2.153, + "step": 109 + }, + { + "epoch": 0.44, + "grad_norm": 9.375, + "learning_rate": 4.5600000000000004e-05, + "loss": 0.6133, + "step": 110 + }, + { + "epoch": 0.44, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.6392258405685425, + "eval_runtime": 29.2202, + "eval_samples_per_second": 17.043, + "eval_steps_per_second": 2.156, + "step": 110 + }, + { + "epoch": 0.444, + "grad_norm": 15.0, + "learning_rate": 4.5560000000000004e-05, + "loss": 0.8555, + "step": 111 + }, + { + "epoch": 0.444, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6401661038398743, + "eval_runtime": 29.2537, + "eval_samples_per_second": 17.023, + "eval_steps_per_second": 2.154, + "step": 111 + }, + { + "epoch": 0.448, + "grad_norm": 5.625, + "learning_rate": 4.5520000000000005e-05, + "loss": 0.5469, + "step": 112 + }, + { + "epoch": 0.448, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6417447328567505, + "eval_runtime": 29.3793, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 112 + }, + { + "epoch": 0.452, + "grad_norm": 10.75, + "learning_rate": 4.548e-05, + "loss": 0.6484, + "step": 113 + }, + { + "epoch": 0.452, + "eval_accuracy": 0.7911646586345381, + "eval_loss": 0.6428291201591492, + "eval_runtime": 29.4232, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 113 + }, + { + "epoch": 0.456, + "grad_norm": 4.34375, + "learning_rate": 4.5440000000000005e-05, + "loss": 0.5391, + "step": 114 + }, + { + "epoch": 0.456, + "eval_accuracy": 0.7931726907630522, + "eval_loss": 0.6437684297561646, + "eval_runtime": 29.444, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 114 + }, + { + "epoch": 0.46, + "grad_norm": 5.0625, + "learning_rate": 4.5400000000000006e-05, + "loss": 0.7031, + "step": 115 + }, + { + "epoch": 0.46, + "eval_accuracy": 0.7971887550200804, + "eval_loss": 0.6435370445251465, + "eval_runtime": 29.5059, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.135, + "step": 115 + }, + { + "epoch": 0.464, + "grad_norm": 4.625, + "learning_rate": 4.536e-05, + "loss": 0.5938, + "step": 116 + }, + { + "epoch": 0.464, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6446332335472107, + "eval_runtime": 29.4842, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 116 + }, + { + "epoch": 0.468, + "grad_norm": 7.3125, + "learning_rate": 4.532e-05, + "loss": 0.9258, + "step": 117 + }, + { + "epoch": 0.468, + "eval_accuracy": 0.7931726907630522, + "eval_loss": 0.6450920701026917, + "eval_runtime": 29.4528, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.139, + "step": 117 + }, + { + "epoch": 0.472, + "grad_norm": 6.78125, + "learning_rate": 4.528e-05, + "loss": 0.6602, + "step": 118 + }, + { + "epoch": 0.472, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6467589139938354, + "eval_runtime": 29.3844, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 118 + }, + { + "epoch": 0.476, + "grad_norm": 4.78125, + "learning_rate": 4.524000000000001e-05, + "loss": 0.4473, + "step": 119 + }, + { + "epoch": 0.476, + "eval_accuracy": 0.7931726907630522, + "eval_loss": 0.6463823914527893, + "eval_runtime": 29.272, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 119 + }, + { + "epoch": 0.48, + "grad_norm": 5.1875, + "learning_rate": 4.52e-05, + "loss": 0.582, + "step": 120 + }, + { + "epoch": 0.48, + "eval_accuracy": 0.7971887550200804, + "eval_loss": 0.644772469997406, + "eval_runtime": 29.2297, + "eval_samples_per_second": 17.037, + "eval_steps_per_second": 2.155, + "step": 120 + }, + { + "epoch": 0.484, + "grad_norm": 7.75, + "learning_rate": 4.516e-05, + "loss": 0.6602, + "step": 121 + }, + { + "epoch": 0.484, + "eval_accuracy": 0.7931726907630522, + "eval_loss": 0.650191605091095, + "eval_runtime": 29.2715, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 121 + }, + { + "epoch": 0.488, + "grad_norm": 4.875, + "learning_rate": 4.512e-05, + "loss": 0.6406, + "step": 122 + }, + { + "epoch": 0.488, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6495802402496338, + "eval_runtime": 29.2086, + "eval_samples_per_second": 17.05, + "eval_steps_per_second": 2.157, + "step": 122 + }, + { + "epoch": 0.492, + "grad_norm": 5.28125, + "learning_rate": 4.508e-05, + "loss": 0.5078, + "step": 123 + }, + { + "epoch": 0.492, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6482556462287903, + "eval_runtime": 29.2097, + "eval_samples_per_second": 17.049, + "eval_steps_per_second": 2.157, + "step": 123 + }, + { + "epoch": 0.496, + "grad_norm": 5.1875, + "learning_rate": 4.504e-05, + "loss": 0.3613, + "step": 124 + }, + { + "epoch": 0.496, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6573554873466492, + "eval_runtime": 29.2807, + "eval_samples_per_second": 17.008, + "eval_steps_per_second": 2.152, + "step": 124 + }, + { + "epoch": 0.5, + "grad_norm": 5.71875, + "learning_rate": 4.5e-05, + "loss": 0.4883, + "step": 125 + }, + { + "epoch": 0.5, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6616586446762085, + "eval_runtime": 29.4814, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 125 + }, + { + "epoch": 0.504, + "grad_norm": 5.9375, + "learning_rate": 4.496e-05, + "loss": 0.5742, + "step": 126 + }, + { + "epoch": 0.504, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6651045083999634, + "eval_runtime": 29.4904, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 126 + }, + { + "epoch": 0.508, + "grad_norm": 22.0, + "learning_rate": 4.4920000000000004e-05, + "loss": 0.9453, + "step": 127 + }, + { + "epoch": 0.508, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.6711576581001282, + "eval_runtime": 29.5517, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.132, + "step": 127 + }, + { + "epoch": 0.512, + "grad_norm": 13.8125, + "learning_rate": 4.488e-05, + "loss": 0.5391, + "step": 128 + }, + { + "epoch": 0.512, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6996178030967712, + "eval_runtime": 29.4782, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 128 + }, + { + "epoch": 0.516, + "grad_norm": 20.375, + "learning_rate": 4.4840000000000004e-05, + "loss": 0.2773, + "step": 129 + }, + { + "epoch": 0.516, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.7277112603187561, + "eval_runtime": 29.4884, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 129 + }, + { + "epoch": 0.52, + "grad_norm": 18.375, + "learning_rate": 4.4800000000000005e-05, + "loss": 0.5703, + "step": 130 + }, + { + "epoch": 0.52, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.7843846082687378, + "eval_runtime": 29.4259, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.141, + "step": 130 + }, + { + "epoch": 0.524, + "grad_norm": 57.5, + "learning_rate": 4.4760000000000005e-05, + "loss": 0.9453, + "step": 131 + }, + { + "epoch": 0.524, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.7313159108161926, + "eval_runtime": 29.4633, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.138, + "step": 131 + }, + { + "epoch": 0.528, + "grad_norm": 26.375, + "learning_rate": 4.472e-05, + "loss": 0.4766, + "step": 132 + }, + { + "epoch": 0.528, + "eval_accuracy": 0.748995983935743, + "eval_loss": 0.7071724534034729, + "eval_runtime": 29.4418, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.14, + "step": 132 + }, + { + "epoch": 0.532, + "grad_norm": 16.875, + "learning_rate": 4.468e-05, + "loss": 0.5859, + "step": 133 + }, + { + "epoch": 0.532, + "eval_accuracy": 0.7208835341365462, + "eval_loss": 0.6997835040092468, + "eval_runtime": 29.4247, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 133 + }, + { + "epoch": 0.536, + "grad_norm": 18.75, + "learning_rate": 4.4640000000000006e-05, + "loss": 0.4062, + "step": 134 + }, + { + "epoch": 0.536, + "eval_accuracy": 0.7409638554216867, + "eval_loss": 0.6913278102874756, + "eval_runtime": 29.4184, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 134 + }, + { + "epoch": 0.54, + "grad_norm": 50.75, + "learning_rate": 4.46e-05, + "loss": 1.1016, + "step": 135 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.6827152371406555, + "eval_runtime": 29.4496, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.139, + "step": 135 + }, + { + "epoch": 0.544, + "grad_norm": 18.5, + "learning_rate": 4.456e-05, + "loss": 0.7344, + "step": 136 + }, + { + "epoch": 0.544, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6621172428131104, + "eval_runtime": 29.5878, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 136 + }, + { + "epoch": 0.548, + "grad_norm": 36.0, + "learning_rate": 4.452e-05, + "loss": 0.6914, + "step": 137 + }, + { + "epoch": 0.548, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6528614163398743, + "eval_runtime": 29.631, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.126, + "step": 137 + }, + { + "epoch": 0.552, + "grad_norm": 26.125, + "learning_rate": 4.448e-05, + "loss": 0.625, + "step": 138 + }, + { + "epoch": 0.552, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6560872197151184, + "eval_runtime": 29.593, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.129, + "step": 138 + }, + { + "epoch": 0.556, + "grad_norm": 19.625, + "learning_rate": 4.444e-05, + "loss": 0.6328, + "step": 139 + }, + { + "epoch": 0.556, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6539046764373779, + "eval_runtime": 29.6333, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.126, + "step": 139 + }, + { + "epoch": 0.56, + "grad_norm": 16.125, + "learning_rate": 4.44e-05, + "loss": 0.7578, + "step": 140 + }, + { + "epoch": 0.56, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.6554067730903625, + "eval_runtime": 29.6041, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.128, + "step": 140 + }, + { + "epoch": 0.564, + "grad_norm": 10.75, + "learning_rate": 4.436e-05, + "loss": 0.6055, + "step": 141 + }, + { + "epoch": 0.564, + "eval_accuracy": 0.7550200803212851, + "eval_loss": 0.6580933332443237, + "eval_runtime": 29.4937, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 141 + }, + { + "epoch": 0.568, + "grad_norm": 17.875, + "learning_rate": 4.432e-05, + "loss": 0.6445, + "step": 142 + }, + { + "epoch": 0.568, + "eval_accuracy": 0.7570281124497992, + "eval_loss": 0.6513554453849792, + "eval_runtime": 29.4192, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.141, + "step": 142 + }, + { + "epoch": 0.572, + "grad_norm": 25.25, + "learning_rate": 4.428e-05, + "loss": 0.7031, + "step": 143 + }, + { + "epoch": 0.572, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6542263031005859, + "eval_runtime": 29.3623, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 143 + }, + { + "epoch": 0.576, + "grad_norm": 26.875, + "learning_rate": 4.424e-05, + "loss": 0.832, + "step": 144 + }, + { + "epoch": 0.576, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6659764051437378, + "eval_runtime": 29.3366, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.147, + "step": 144 + }, + { + "epoch": 0.58, + "grad_norm": 9.9375, + "learning_rate": 4.4200000000000004e-05, + "loss": 0.6289, + "step": 145 + }, + { + "epoch": 0.58, + "eval_accuracy": 0.7730923694779116, + "eval_loss": 0.6821191310882568, + "eval_runtime": 29.3021, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 145 + }, + { + "epoch": 0.584, + "grad_norm": 24.125, + "learning_rate": 4.4160000000000004e-05, + "loss": 0.5703, + "step": 146 + }, + { + "epoch": 0.584, + "eval_accuracy": 0.7550200803212851, + "eval_loss": 0.6942927837371826, + "eval_runtime": 29.2958, + "eval_samples_per_second": 16.999, + "eval_steps_per_second": 2.15, + "step": 146 + }, + { + "epoch": 0.588, + "grad_norm": 24.25, + "learning_rate": 4.412e-05, + "loss": 0.7578, + "step": 147 + }, + { + "epoch": 0.588, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.7125925421714783, + "eval_runtime": 29.342, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 147 + }, + { + "epoch": 0.592, + "grad_norm": 36.75, + "learning_rate": 4.4080000000000005e-05, + "loss": 0.5469, + "step": 148 + }, + { + "epoch": 0.592, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.7129729986190796, + "eval_runtime": 29.3399, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.147, + "step": 148 + }, + { + "epoch": 0.596, + "grad_norm": 24.75, + "learning_rate": 4.4040000000000005e-05, + "loss": 0.4883, + "step": 149 + }, + { + "epoch": 0.596, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.7162556648254395, + "eval_runtime": 29.2746, + "eval_samples_per_second": 17.011, + "eval_steps_per_second": 2.152, + "step": 149 + }, + { + "epoch": 0.6, + "grad_norm": 32.25, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.5312, + "step": 150 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.7125101685523987, + "eval_runtime": 29.3194, + "eval_samples_per_second": 16.985, + "eval_steps_per_second": 2.149, + "step": 150 + }, + { + "epoch": 0.604, + "grad_norm": 19.0, + "learning_rate": 4.396e-05, + "loss": 0.6055, + "step": 151 + }, + { + "epoch": 0.604, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.7113532423973083, + "eval_runtime": 29.2784, + "eval_samples_per_second": 17.009, + "eval_steps_per_second": 2.152, + "step": 151 + }, + { + "epoch": 0.608, + "grad_norm": 18.0, + "learning_rate": 4.392e-05, + "loss": 0.7578, + "step": 152 + }, + { + "epoch": 0.608, + "eval_accuracy": 0.7891566265060241, + "eval_loss": 0.7059978246688843, + "eval_runtime": 29.3272, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.148, + "step": 152 + }, + { + "epoch": 0.612, + "grad_norm": 31.75, + "learning_rate": 4.388000000000001e-05, + "loss": 0.7227, + "step": 153 + }, + { + "epoch": 0.612, + "eval_accuracy": 0.7831325301204819, + "eval_loss": 0.6918577551841736, + "eval_runtime": 29.4346, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.14, + "step": 153 + }, + { + "epoch": 0.616, + "grad_norm": 21.875, + "learning_rate": 4.384e-05, + "loss": 0.6719, + "step": 154 + }, + { + "epoch": 0.616, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6881245970726013, + "eval_runtime": 29.5191, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 154 + }, + { + "epoch": 0.62, + "grad_norm": 19.25, + "learning_rate": 4.38e-05, + "loss": 0.707, + "step": 155 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.7891566265060241, + "eval_loss": 0.6831045150756836, + "eval_runtime": 29.4747, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.137, + "step": 155 + }, + { + "epoch": 0.624, + "grad_norm": 17.875, + "learning_rate": 4.376e-05, + "loss": 1.4219, + "step": 156 + }, + { + "epoch": 0.624, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6754797697067261, + "eval_runtime": 29.4689, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 156 + }, + { + "epoch": 0.628, + "grad_norm": 19.5, + "learning_rate": 4.372e-05, + "loss": 0.9258, + "step": 157 + }, + { + "epoch": 0.628, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6684707403182983, + "eval_runtime": 29.4572, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 157 + }, + { + "epoch": 0.632, + "grad_norm": 10.125, + "learning_rate": 4.368e-05, + "loss": 0.6055, + "step": 158 + }, + { + "epoch": 0.632, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6605474352836609, + "eval_runtime": 29.4325, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.14, + "step": 158 + }, + { + "epoch": 0.636, + "grad_norm": 6.71875, + "learning_rate": 4.364e-05, + "loss": 0.377, + "step": 159 + }, + { + "epoch": 0.636, + "eval_accuracy": 0.7891566265060241, + "eval_loss": 0.651968240737915, + "eval_runtime": 29.3567, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 159 + }, + { + "epoch": 0.64, + "grad_norm": 6.09375, + "learning_rate": 4.36e-05, + "loss": 0.6133, + "step": 160 + }, + { + "epoch": 0.64, + "eval_accuracy": 0.7811244979919679, + "eval_loss": 0.6439772844314575, + "eval_runtime": 29.3073, + "eval_samples_per_second": 16.992, + "eval_steps_per_second": 2.15, + "step": 160 + }, + { + "epoch": 0.644, + "grad_norm": 7.65625, + "learning_rate": 4.356e-05, + "loss": 0.7656, + "step": 161 + }, + { + "epoch": 0.644, + "eval_accuracy": 0.7751004016064257, + "eval_loss": 0.6407005190849304, + "eval_runtime": 29.2141, + "eval_samples_per_second": 17.047, + "eval_steps_per_second": 2.156, + "step": 161 + }, + { + "epoch": 0.648, + "grad_norm": 6.0, + "learning_rate": 4.352e-05, + "loss": 0.7852, + "step": 162 + }, + { + "epoch": 0.648, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6381105184555054, + "eval_runtime": 29.1926, + "eval_samples_per_second": 17.059, + "eval_steps_per_second": 2.158, + "step": 162 + }, + { + "epoch": 0.652, + "grad_norm": 3.21875, + "learning_rate": 4.3480000000000004e-05, + "loss": 0.0723, + "step": 163 + }, + { + "epoch": 0.652, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6373477578163147, + "eval_runtime": 29.1843, + "eval_samples_per_second": 17.064, + "eval_steps_per_second": 2.159, + "step": 163 + }, + { + "epoch": 0.656, + "grad_norm": 5.28125, + "learning_rate": 4.3440000000000004e-05, + "loss": 0.8125, + "step": 164 + }, + { + "epoch": 0.656, + "eval_accuracy": 0.7791164658634538, + "eval_loss": 0.6373776197433472, + "eval_runtime": 29.2352, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 2.155, + "step": 164 + }, + { + "epoch": 0.66, + "grad_norm": 4.90625, + "learning_rate": 4.3400000000000005e-05, + "loss": 0.5469, + "step": 165 + }, + { + "epoch": 0.66, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6374040842056274, + "eval_runtime": 29.4366, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.14, + "step": 165 + }, + { + "epoch": 0.664, + "grad_norm": 17.375, + "learning_rate": 4.336e-05, + "loss": 1.4844, + "step": 166 + }, + { + "epoch": 0.664, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6372727155685425, + "eval_runtime": 29.491, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 166 + }, + { + "epoch": 0.668, + "grad_norm": 6.65625, + "learning_rate": 4.332e-05, + "loss": 0.6875, + "step": 167 + }, + { + "epoch": 0.668, + "eval_accuracy": 0.7911646586345381, + "eval_loss": 0.6371325254440308, + "eval_runtime": 29.4557, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 167 + }, + { + "epoch": 0.672, + "grad_norm": 4.5625, + "learning_rate": 4.3280000000000006e-05, + "loss": 0.5977, + "step": 168 + }, + { + "epoch": 0.672, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6372649073600769, + "eval_runtime": 29.4686, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 168 + }, + { + "epoch": 0.676, + "grad_norm": 4.125, + "learning_rate": 4.324e-05, + "loss": 0.375, + "step": 169 + }, + { + "epoch": 0.676, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.6376364827156067, + "eval_runtime": 29.4704, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 169 + }, + { + "epoch": 0.68, + "grad_norm": 12.6875, + "learning_rate": 4.32e-05, + "loss": 0.4766, + "step": 170 + }, + { + "epoch": 0.68, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6374933123588562, + "eval_runtime": 29.4427, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 170 + }, + { + "epoch": 0.684, + "grad_norm": 29.625, + "learning_rate": 4.316e-05, + "loss": 0.793, + "step": 171 + }, + { + "epoch": 0.684, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6368040442466736, + "eval_runtime": 29.337, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.147, + "step": 171 + }, + { + "epoch": 0.688, + "grad_norm": 99.0, + "learning_rate": 4.312000000000001e-05, + "loss": 1.1797, + "step": 172 + }, + { + "epoch": 0.688, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.637872040271759, + "eval_runtime": 29.426, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.141, + "step": 172 + }, + { + "epoch": 0.692, + "grad_norm": 5.90625, + "learning_rate": 4.308e-05, + "loss": 0.5391, + "step": 173 + }, + { + "epoch": 0.692, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6481320858001709, + "eval_runtime": 29.5174, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 173 + }, + { + "epoch": 0.696, + "grad_norm": 5.46875, + "learning_rate": 4.304e-05, + "loss": 0.5195, + "step": 174 + }, + { + "epoch": 0.696, + "eval_accuracy": 0.8032128514056225, + "eval_loss": 0.6470403075218201, + "eval_runtime": 29.4898, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 174 + }, + { + "epoch": 0.7, + "grad_norm": 7.21875, + "learning_rate": 4.3e-05, + "loss": 0.9023, + "step": 175 + }, + { + "epoch": 0.7, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6459460854530334, + "eval_runtime": 29.5135, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 175 + }, + { + "epoch": 0.704, + "grad_norm": 3.53125, + "learning_rate": 4.296e-05, + "loss": 0.4668, + "step": 176 + }, + { + "epoch": 0.704, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6484264731407166, + "eval_runtime": 29.4504, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.139, + "step": 176 + }, + { + "epoch": 0.708, + "grad_norm": 17.5, + "learning_rate": 4.292e-05, + "loss": 0.3105, + "step": 177 + }, + { + "epoch": 0.708, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6484919190406799, + "eval_runtime": 29.3597, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 177 + }, + { + "epoch": 0.712, + "grad_norm": 4.5625, + "learning_rate": 4.288e-05, + "loss": 0.4824, + "step": 178 + }, + { + "epoch": 0.712, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.652421236038208, + "eval_runtime": 29.2594, + "eval_samples_per_second": 17.02, + "eval_steps_per_second": 2.153, + "step": 178 + }, + { + "epoch": 0.716, + "grad_norm": 4.625, + "learning_rate": 4.284e-05, + "loss": 0.6094, + "step": 179 + }, + { + "epoch": 0.716, + "eval_accuracy": 0.8072289156626506, + "eval_loss": 0.6532085537910461, + "eval_runtime": 29.2012, + "eval_samples_per_second": 17.054, + "eval_steps_per_second": 2.157, + "step": 179 + }, + { + "epoch": 0.72, + "grad_norm": 5.15625, + "learning_rate": 4.2800000000000004e-05, + "loss": 0.6172, + "step": 180 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.8052208835341366, + "eval_loss": 0.6537114977836609, + "eval_runtime": 29.2484, + "eval_samples_per_second": 17.027, + "eval_steps_per_second": 2.154, + "step": 180 + }, + { + "epoch": 0.724, + "grad_norm": 5.875, + "learning_rate": 4.276e-05, + "loss": 0.4297, + "step": 181 + }, + { + "epoch": 0.724, + "eval_accuracy": 0.8052208835341366, + "eval_loss": 0.6581933498382568, + "eval_runtime": 29.4649, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.138, + "step": 181 + }, + { + "epoch": 0.728, + "grad_norm": 4.875, + "learning_rate": 4.2720000000000004e-05, + "loss": 0.418, + "step": 182 + }, + { + "epoch": 0.728, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6617971062660217, + "eval_runtime": 29.5251, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 182 + }, + { + "epoch": 0.732, + "grad_norm": 26.375, + "learning_rate": 4.2680000000000005e-05, + "loss": 0.6094, + "step": 183 + }, + { + "epoch": 0.732, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.659504234790802, + "eval_runtime": 29.4777, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 183 + }, + { + "epoch": 0.736, + "grad_norm": 3.765625, + "learning_rate": 4.2640000000000005e-05, + "loss": 0.3965, + "step": 184 + }, + { + "epoch": 0.736, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6592799425125122, + "eval_runtime": 29.4792, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.137, + "step": 184 + }, + { + "epoch": 0.74, + "grad_norm": 3.625, + "learning_rate": 4.26e-05, + "loss": 0.377, + "step": 185 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.8072289156626506, + "eval_loss": 0.6573922634124756, + "eval_runtime": 29.4812, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 185 + }, + { + "epoch": 0.744, + "grad_norm": 6.15625, + "learning_rate": 4.256e-05, + "loss": 0.5781, + "step": 186 + }, + { + "epoch": 0.744, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6524920463562012, + "eval_runtime": 29.4734, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 186 + }, + { + "epoch": 0.748, + "grad_norm": 19.0, + "learning_rate": 4.2520000000000006e-05, + "loss": 1.0469, + "step": 187 + }, + { + "epoch": 0.748, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6469986438751221, + "eval_runtime": 29.3412, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.147, + "step": 187 + }, + { + "epoch": 0.752, + "grad_norm": 5.34375, + "learning_rate": 4.248e-05, + "loss": 0.707, + "step": 188 + }, + { + "epoch": 0.752, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6440456509590149, + "eval_runtime": 29.3534, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.146, + "step": 188 + }, + { + "epoch": 0.756, + "grad_norm": 19.875, + "learning_rate": 4.244e-05, + "loss": 1.8203, + "step": 189 + }, + { + "epoch": 0.756, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6401629447937012, + "eval_runtime": 29.4514, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 189 + }, + { + "epoch": 0.76, + "grad_norm": 6.65625, + "learning_rate": 4.24e-05, + "loss": 0.6172, + "step": 190 + }, + { + "epoch": 0.76, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6371276378631592, + "eval_runtime": 29.5253, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 190 + }, + { + "epoch": 0.764, + "grad_norm": 15.75, + "learning_rate": 4.236e-05, + "loss": 0.7266, + "step": 191 + }, + { + "epoch": 0.764, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6346008777618408, + "eval_runtime": 29.4653, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.138, + "step": 191 + }, + { + "epoch": 0.768, + "grad_norm": 4.5, + "learning_rate": 4.232e-05, + "loss": 0.3184, + "step": 192 + }, + { + "epoch": 0.768, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6346347332000732, + "eval_runtime": 29.4518, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 192 + }, + { + "epoch": 0.772, + "grad_norm": 5.9375, + "learning_rate": 4.228e-05, + "loss": 0.7266, + "step": 193 + }, + { + "epoch": 0.772, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6346646547317505, + "eval_runtime": 29.4956, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.136, + "step": 193 + }, + { + "epoch": 0.776, + "grad_norm": 4.1875, + "learning_rate": 4.224e-05, + "loss": 0.5898, + "step": 194 + }, + { + "epoch": 0.776, + "eval_accuracy": 0.8112449799196787, + "eval_loss": 0.6348209977149963, + "eval_runtime": 29.3816, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 194 + }, + { + "epoch": 0.78, + "grad_norm": 3.609375, + "learning_rate": 4.22e-05, + "loss": 0.4648, + "step": 195 + }, + { + "epoch": 0.78, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.635009765625, + "eval_runtime": 29.367, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.145, + "step": 195 + }, + { + "epoch": 0.784, + "grad_norm": 6.09375, + "learning_rate": 4.2159999999999996e-05, + "loss": 0.9062, + "step": 196 + }, + { + "epoch": 0.784, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6350136995315552, + "eval_runtime": 29.2696, + "eval_samples_per_second": 17.014, + "eval_steps_per_second": 2.152, + "step": 196 + }, + { + "epoch": 0.788, + "grad_norm": 7.5625, + "learning_rate": 4.212e-05, + "loss": 0.8359, + "step": 197 + }, + { + "epoch": 0.788, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.6343601942062378, + "eval_runtime": 29.3059, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 197 + }, + { + "epoch": 0.792, + "grad_norm": 4.5625, + "learning_rate": 4.2080000000000004e-05, + "loss": 0.543, + "step": 198 + }, + { + "epoch": 0.792, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6343650817871094, + "eval_runtime": 29.4781, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 198 + }, + { + "epoch": 0.796, + "grad_norm": 15.25, + "learning_rate": 4.2040000000000004e-05, + "loss": 0.8203, + "step": 199 + }, + { + "epoch": 0.796, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6326767206192017, + "eval_runtime": 29.4928, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 199 + }, + { + "epoch": 0.8, + "grad_norm": 4.625, + "learning_rate": 4.2e-05, + "loss": 0.6602, + "step": 200 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.8232931726907631, + "eval_loss": 0.6309682130813599, + "eval_runtime": 29.5423, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 200 + }, + { + "epoch": 0.804, + "grad_norm": 4.34375, + "learning_rate": 4.196e-05, + "loss": 0.2676, + "step": 201 + }, + { + "epoch": 0.804, + "eval_accuracy": 0.8032128514056225, + "eval_loss": 0.6321163773536682, + "eval_runtime": 29.4096, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.142, + "step": 201 + }, + { + "epoch": 0.808, + "grad_norm": 9.6875, + "learning_rate": 4.1920000000000005e-05, + "loss": 0.5117, + "step": 202 + }, + { + "epoch": 0.808, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6299161314964294, + "eval_runtime": 29.3754, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.145, + "step": 202 + }, + { + "epoch": 0.812, + "grad_norm": 6.75, + "learning_rate": 4.1880000000000006e-05, + "loss": 0.543, + "step": 203 + }, + { + "epoch": 0.812, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6286630630493164, + "eval_runtime": 29.446, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.14, + "step": 203 + }, + { + "epoch": 0.816, + "grad_norm": 37.5, + "learning_rate": 4.184e-05, + "loss": 0.6875, + "step": 204 + }, + { + "epoch": 0.816, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6287434697151184, + "eval_runtime": 29.544, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.132, + "step": 204 + }, + { + "epoch": 0.82, + "grad_norm": 4.5, + "learning_rate": 4.18e-05, + "loss": 0.668, + "step": 205 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6302201151847839, + "eval_runtime": 29.4972, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.136, + "step": 205 + }, + { + "epoch": 0.824, + "grad_norm": 4.28125, + "learning_rate": 4.176000000000001e-05, + "loss": 0.5391, + "step": 206 + }, + { + "epoch": 0.824, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6312853693962097, + "eval_runtime": 29.4949, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.136, + "step": 206 + }, + { + "epoch": 0.828, + "grad_norm": 3.578125, + "learning_rate": 4.172e-05, + "loss": 0.332, + "step": 207 + }, + { + "epoch": 0.828, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6343954801559448, + "eval_runtime": 29.503, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 207 + }, + { + "epoch": 0.832, + "grad_norm": 4.53125, + "learning_rate": 4.168e-05, + "loss": 0.5703, + "step": 208 + }, + { + "epoch": 0.832, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6375725269317627, + "eval_runtime": 29.4119, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.142, + "step": 208 + }, + { + "epoch": 0.836, + "grad_norm": 6.40625, + "learning_rate": 4.164e-05, + "loss": 0.6562, + "step": 209 + }, + { + "epoch": 0.836, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6397570371627808, + "eval_runtime": 29.2712, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 209 + }, + { + "epoch": 0.84, + "grad_norm": 8.0625, + "learning_rate": 4.16e-05, + "loss": 0.7109, + "step": 210 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.8052208835341366, + "eval_loss": 0.6415677070617676, + "eval_runtime": 29.2348, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 2.155, + "step": 210 + }, + { + "epoch": 0.844, + "grad_norm": 3.5, + "learning_rate": 4.156e-05, + "loss": 0.0693, + "step": 211 + }, + { + "epoch": 0.844, + "eval_accuracy": 0.8032128514056225, + "eval_loss": 0.6449123024940491, + "eval_runtime": 29.2107, + "eval_samples_per_second": 17.049, + "eval_steps_per_second": 2.157, + "step": 211 + }, + { + "epoch": 0.848, + "grad_norm": 5.21875, + "learning_rate": 4.152e-05, + "loss": 0.5391, + "step": 212 + }, + { + "epoch": 0.848, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6505219340324402, + "eval_runtime": 29.2813, + "eval_samples_per_second": 17.007, + "eval_steps_per_second": 2.152, + "step": 212 + }, + { + "epoch": 0.852, + "grad_norm": 29.125, + "learning_rate": 4.148e-05, + "loss": 0.5273, + "step": 213 + }, + { + "epoch": 0.852, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6555063128471375, + "eval_runtime": 29.4867, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.137, + "step": 213 + }, + { + "epoch": 0.856, + "grad_norm": 3.96875, + "learning_rate": 4.144e-05, + "loss": 0.3594, + "step": 214 + }, + { + "epoch": 0.856, + "eval_accuracy": 0.8112449799196787, + "eval_loss": 0.6595506072044373, + "eval_runtime": 29.5035, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 214 + }, + { + "epoch": 0.86, + "grad_norm": 3.90625, + "learning_rate": 4.14e-05, + "loss": 0.4824, + "step": 215 + }, + { + "epoch": 0.86, + "eval_accuracy": 0.8032128514056225, + "eval_loss": 0.6629687547683716, + "eval_runtime": 29.5072, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 215 + }, + { + "epoch": 0.864, + "grad_norm": 6.59375, + "learning_rate": 4.1360000000000004e-05, + "loss": 0.793, + "step": 216 + }, + { + "epoch": 0.864, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6646623015403748, + "eval_runtime": 29.549, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 216 + }, + { + "epoch": 0.868, + "grad_norm": 5.03125, + "learning_rate": 4.1320000000000004e-05, + "loss": 0.4609, + "step": 217 + }, + { + "epoch": 0.868, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6672655940055847, + "eval_runtime": 29.43, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.141, + "step": 217 + }, + { + "epoch": 0.872, + "grad_norm": 5.21875, + "learning_rate": 4.1280000000000005e-05, + "loss": 0.4434, + "step": 218 + }, + { + "epoch": 0.872, + "eval_accuracy": 0.8112449799196787, + "eval_loss": 0.6746382117271423, + "eval_runtime": 29.3573, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 218 + }, + { + "epoch": 0.876, + "grad_norm": 20.0, + "learning_rate": 4.124e-05, + "loss": 1.7969, + "step": 219 + }, + { + "epoch": 0.876, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6743885278701782, + "eval_runtime": 29.3699, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 219 + }, + { + "epoch": 0.88, + "grad_norm": 51.75, + "learning_rate": 4.12e-05, + "loss": 0.7422, + "step": 220 + }, + { + "epoch": 0.88, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.672954261302948, + "eval_runtime": 29.2797, + "eval_samples_per_second": 17.008, + "eval_steps_per_second": 2.152, + "step": 220 + }, + { + "epoch": 0.884, + "grad_norm": 17.375, + "learning_rate": 4.1160000000000006e-05, + "loss": 0.4414, + "step": 221 + }, + { + "epoch": 0.884, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6762338876724243, + "eval_runtime": 29.3221, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 221 + }, + { + "epoch": 0.888, + "grad_norm": 33.0, + "learning_rate": 4.1120000000000006e-05, + "loss": 0.918, + "step": 222 + }, + { + "epoch": 0.888, + "eval_accuracy": 0.7690763052208835, + "eval_loss": 0.6697311997413635, + "eval_runtime": 29.3771, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 222 + }, + { + "epoch": 0.892, + "grad_norm": 23.25, + "learning_rate": 4.108e-05, + "loss": 0.9336, + "step": 223 + }, + { + "epoch": 0.892, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.6647114753723145, + "eval_runtime": 29.5013, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.135, + "step": 223 + }, + { + "epoch": 0.896, + "grad_norm": 30.125, + "learning_rate": 4.104e-05, + "loss": 1.1953, + "step": 224 + }, + { + "epoch": 0.896, + "eval_accuracy": 0.7871485943775101, + "eval_loss": 0.6471275687217712, + "eval_runtime": 29.4888, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 224 + }, + { + "epoch": 0.9, + "grad_norm": 27.875, + "learning_rate": 4.1e-05, + "loss": 0.6016, + "step": 225 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.7710843373493976, + "eval_loss": 0.6439782381057739, + "eval_runtime": 29.5609, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 225 + }, + { + "epoch": 0.904, + "grad_norm": 27.25, + "learning_rate": 4.096e-05, + "loss": 0.7266, + "step": 226 + }, + { + "epoch": 0.904, + "eval_accuracy": 0.7469879518072289, + "eval_loss": 0.644743025302887, + "eval_runtime": 29.5146, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 226 + }, + { + "epoch": 0.908, + "grad_norm": 22.875, + "learning_rate": 4.092e-05, + "loss": 1.1641, + "step": 227 + }, + { + "epoch": 0.908, + "eval_accuracy": 0.7590361445783133, + "eval_loss": 0.6448999047279358, + "eval_runtime": 29.4982, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 227 + }, + { + "epoch": 0.912, + "grad_norm": 11.9375, + "learning_rate": 4.088e-05, + "loss": 0.8164, + "step": 228 + }, + { + "epoch": 0.912, + "eval_accuracy": 0.7530120481927711, + "eval_loss": 0.6458274722099304, + "eval_runtime": 29.4655, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.138, + "step": 228 + }, + { + "epoch": 0.916, + "grad_norm": 55.0, + "learning_rate": 4.084e-05, + "loss": 0.6641, + "step": 229 + }, + { + "epoch": 0.916, + "eval_accuracy": 0.7550200803212851, + "eval_loss": 0.6474276185035706, + "eval_runtime": 29.3707, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 229 + }, + { + "epoch": 0.92, + "grad_norm": 10.875, + "learning_rate": 4.08e-05, + "loss": 0.7578, + "step": 230 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.7610441767068273, + "eval_loss": 0.6499199867248535, + "eval_runtime": 29.2615, + "eval_samples_per_second": 17.019, + "eval_steps_per_second": 2.153, + "step": 230 + }, + { + "epoch": 0.924, + "grad_norm": 9.0625, + "learning_rate": 4.076e-05, + "loss": 0.7188, + "step": 231 + }, + { + "epoch": 0.924, + "eval_accuracy": 0.7630522088353414, + "eval_loss": 0.6519476175308228, + "eval_runtime": 29.2253, + "eval_samples_per_second": 17.04, + "eval_steps_per_second": 2.156, + "step": 231 + }, + { + "epoch": 0.928, + "grad_norm": 5.84375, + "learning_rate": 4.072e-05, + "loss": 0.7188, + "step": 232 + }, + { + "epoch": 0.928, + "eval_accuracy": 0.7771084337349398, + "eval_loss": 0.6540654897689819, + "eval_runtime": 29.3121, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 232 + }, + { + "epoch": 0.932, + "grad_norm": 5.96875, + "learning_rate": 4.0680000000000004e-05, + "loss": 0.6289, + "step": 233 + }, + { + "epoch": 0.932, + "eval_accuracy": 0.785140562248996, + "eval_loss": 0.6539674401283264, + "eval_runtime": 29.4582, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 233 + }, + { + "epoch": 0.936, + "grad_norm": 4.6875, + "learning_rate": 4.064e-05, + "loss": 0.5391, + "step": 234 + }, + { + "epoch": 0.936, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6508651971817017, + "eval_runtime": 29.5115, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 234 + }, + { + "epoch": 0.94, + "grad_norm": 10.375, + "learning_rate": 4.0600000000000004e-05, + "loss": 0.8086, + "step": 235 + }, + { + "epoch": 0.94, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.6485022306442261, + "eval_runtime": 29.522, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 235 + }, + { + "epoch": 0.944, + "grad_norm": 4.875, + "learning_rate": 4.0560000000000005e-05, + "loss": 0.4805, + "step": 236 + }, + { + "epoch": 0.944, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6449763774871826, + "eval_runtime": 29.5439, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.132, + "step": 236 + }, + { + "epoch": 0.948, + "grad_norm": 6.0, + "learning_rate": 4.0520000000000005e-05, + "loss": 0.7109, + "step": 237 + }, + { + "epoch": 0.948, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6437900066375732, + "eval_runtime": 29.4676, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 237 + }, + { + "epoch": 0.952, + "grad_norm": 4.1875, + "learning_rate": 4.048e-05, + "loss": 0.6953, + "step": 238 + }, + { + "epoch": 0.952, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6419016122817993, + "eval_runtime": 29.4563, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 238 + }, + { + "epoch": 0.956, + "grad_norm": 8.0625, + "learning_rate": 4.044e-05, + "loss": 0.4668, + "step": 239 + }, + { + "epoch": 0.956, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6412584185600281, + "eval_runtime": 29.4007, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 239 + }, + { + "epoch": 0.96, + "grad_norm": 24.125, + "learning_rate": 4.0400000000000006e-05, + "loss": 0.9453, + "step": 240 + }, + { + "epoch": 0.96, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.642693817615509, + "eval_runtime": 29.3125, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 240 + }, + { + "epoch": 0.964, + "grad_norm": 4.0, + "learning_rate": 4.0360000000000007e-05, + "loss": 0.4629, + "step": 241 + }, + { + "epoch": 0.964, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6476825475692749, + "eval_runtime": 29.3417, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 241 + }, + { + "epoch": 0.968, + "grad_norm": 10.125, + "learning_rate": 4.032e-05, + "loss": 1.0469, + "step": 242 + }, + { + "epoch": 0.968, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6528339982032776, + "eval_runtime": 29.2598, + "eval_samples_per_second": 17.02, + "eval_steps_per_second": 2.153, + "step": 242 + }, + { + "epoch": 0.972, + "grad_norm": 10.375, + "learning_rate": 4.028e-05, + "loss": 0.7227, + "step": 243 + }, + { + "epoch": 0.972, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.6575148105621338, + "eval_runtime": 29.2917, + "eval_samples_per_second": 17.001, + "eval_steps_per_second": 2.151, + "step": 243 + }, + { + "epoch": 0.976, + "grad_norm": 3.5625, + "learning_rate": 4.024e-05, + "loss": 0.5703, + "step": 244 + }, + { + "epoch": 0.976, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6566799283027649, + "eval_runtime": 29.235, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 2.155, + "step": 244 + }, + { + "epoch": 0.98, + "grad_norm": 4.03125, + "learning_rate": 4.02e-05, + "loss": 0.6641, + "step": 245 + }, + { + "epoch": 0.98, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.650460958480835, + "eval_runtime": 29.2297, + "eval_samples_per_second": 17.037, + "eval_steps_per_second": 2.155, + "step": 245 + }, + { + "epoch": 0.984, + "grad_norm": 3.953125, + "learning_rate": 4.016e-05, + "loss": 0.5, + "step": 246 + }, + { + "epoch": 0.984, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6482563614845276, + "eval_runtime": 29.2832, + "eval_samples_per_second": 17.006, + "eval_steps_per_second": 2.151, + "step": 246 + }, + { + "epoch": 0.988, + "grad_norm": 8.1875, + "learning_rate": 4.012e-05, + "loss": 0.4512, + "step": 247 + }, + { + "epoch": 0.988, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6485304236412048, + "eval_runtime": 29.284, + "eval_samples_per_second": 17.006, + "eval_steps_per_second": 2.151, + "step": 247 + }, + { + "epoch": 0.992, + "grad_norm": 12.6875, + "learning_rate": 4.008e-05, + "loss": 0.4629, + "step": 248 + }, + { + "epoch": 0.992, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6495280265808105, + "eval_runtime": 29.3547, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.146, + "step": 248 + }, + { + "epoch": 0.996, + "grad_norm": 30.125, + "learning_rate": 4.004e-05, + "loss": 1.0312, + "step": 249 + }, + { + "epoch": 0.996, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6497771143913269, + "eval_runtime": 29.468, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 249 + }, + { + "epoch": 1.0, + "grad_norm": 5.6875, + "learning_rate": 4e-05, + "loss": 0.6445, + "step": 250 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8112449799196787, + "eval_loss": 0.6489596366882324, + "eval_runtime": 29.4709, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 250 + }, + { + "epoch": 1.004, + "grad_norm": 11.5, + "learning_rate": 3.9960000000000004e-05, + "loss": 0.3711, + "step": 251 + }, + { + "epoch": 1.004, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6521584391593933, + "eval_runtime": 29.4613, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.138, + "step": 251 + }, + { + "epoch": 1.008, + "grad_norm": 22.375, + "learning_rate": 3.9920000000000004e-05, + "loss": 0.9219, + "step": 252 + }, + { + "epoch": 1.008, + "eval_accuracy": 0.7971887550200804, + "eval_loss": 0.6547936201095581, + "eval_runtime": 29.4168, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 252 + }, + { + "epoch": 1.012, + "grad_norm": 18.0, + "learning_rate": 3.988e-05, + "loss": 0.377, + "step": 253 + }, + { + "epoch": 1.012, + "eval_accuracy": 0.7911646586345381, + "eval_loss": 0.6584678292274475, + "eval_runtime": 29.4857, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 253 + }, + { + "epoch": 1.016, + "grad_norm": 5.90625, + "learning_rate": 3.984e-05, + "loss": 0.4668, + "step": 254 + }, + { + "epoch": 1.016, + "eval_accuracy": 0.8072289156626506, + "eval_loss": 0.6416667699813843, + "eval_runtime": 29.5033, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 254 + }, + { + "epoch": 1.02, + "grad_norm": 4.65625, + "learning_rate": 3.9800000000000005e-05, + "loss": 0.5859, + "step": 255 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.7911646586345381, + "eval_loss": 0.6434365510940552, + "eval_runtime": 29.4992, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 255 + }, + { + "epoch": 1.024, + "grad_norm": 5.75, + "learning_rate": 3.9760000000000006e-05, + "loss": 0.6602, + "step": 256 + }, + { + "epoch": 1.024, + "eval_accuracy": 0.7931726907630522, + "eval_loss": 0.6493797302246094, + "eval_runtime": 29.4945, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 256 + }, + { + "epoch": 1.028, + "grad_norm": 27.625, + "learning_rate": 3.972e-05, + "loss": 0.668, + "step": 257 + }, + { + "epoch": 1.028, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6521559953689575, + "eval_runtime": 29.492, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 257 + }, + { + "epoch": 1.032, + "grad_norm": 10.0, + "learning_rate": 3.968e-05, + "loss": 0.5664, + "step": 258 + }, + { + "epoch": 1.032, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.6567534804344177, + "eval_runtime": 29.4215, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.141, + "step": 258 + }, + { + "epoch": 1.036, + "grad_norm": 4.0625, + "learning_rate": 3.964e-05, + "loss": 0.4805, + "step": 259 + }, + { + "epoch": 1.036, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6619220972061157, + "eval_runtime": 29.3187, + "eval_samples_per_second": 16.986, + "eval_steps_per_second": 2.149, + "step": 259 + }, + { + "epoch": 1.04, + "grad_norm": 7.0, + "learning_rate": 3.960000000000001e-05, + "loss": 0.5039, + "step": 260 + }, + { + "epoch": 1.04, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6648949384689331, + "eval_runtime": 29.3471, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 260 + }, + { + "epoch": 1.044, + "grad_norm": 4.03125, + "learning_rate": 3.956e-05, + "loss": 0.5078, + "step": 261 + }, + { + "epoch": 1.044, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6687761545181274, + "eval_runtime": 29.269, + "eval_samples_per_second": 17.015, + "eval_steps_per_second": 2.152, + "step": 261 + }, + { + "epoch": 1.048, + "grad_norm": 15.0625, + "learning_rate": 3.952e-05, + "loss": 0.582, + "step": 262 + }, + { + "epoch": 1.048, + "eval_accuracy": 0.7971887550200804, + "eval_loss": 0.6758898496627808, + "eval_runtime": 29.2606, + "eval_samples_per_second": 17.019, + "eval_steps_per_second": 2.153, + "step": 262 + }, + { + "epoch": 1.052, + "grad_norm": 48.25, + "learning_rate": 3.948e-05, + "loss": 1.4609, + "step": 263 + }, + { + "epoch": 1.052, + "eval_accuracy": 0.7971887550200804, + "eval_loss": 0.684258759021759, + "eval_runtime": 29.295, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 263 + }, + { + "epoch": 1.056, + "grad_norm": 10.9375, + "learning_rate": 3.944e-05, + "loss": 0.4395, + "step": 264 + }, + { + "epoch": 1.056, + "eval_accuracy": 0.8032128514056225, + "eval_loss": 0.6901208162307739, + "eval_runtime": 29.2917, + "eval_samples_per_second": 17.001, + "eval_steps_per_second": 2.151, + "step": 264 + }, + { + "epoch": 1.06, + "grad_norm": 5.375, + "learning_rate": 3.94e-05, + "loss": 0.7109, + "step": 265 + }, + { + "epoch": 1.06, + "eval_accuracy": 0.7991967871485943, + "eval_loss": 0.7001298666000366, + "eval_runtime": 29.3034, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 265 + }, + { + "epoch": 1.064, + "grad_norm": 134.0, + "learning_rate": 3.936e-05, + "loss": 0.7656, + "step": 266 + }, + { + "epoch": 1.064, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.7038322687149048, + "eval_runtime": 29.4429, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 266 + }, + { + "epoch": 1.068, + "grad_norm": 5.96875, + "learning_rate": 3.932e-05, + "loss": 0.457, + "step": 267 + }, + { + "epoch": 1.068, + "eval_accuracy": 0.7891566265060241, + "eval_loss": 0.7022006511688232, + "eval_runtime": 29.5412, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 267 + }, + { + "epoch": 1.072, + "grad_norm": 25.0, + "learning_rate": 3.9280000000000003e-05, + "loss": 0.1709, + "step": 268 + }, + { + "epoch": 1.072, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.698022186756134, + "eval_runtime": 29.4969, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.136, + "step": 268 + }, + { + "epoch": 1.076, + "grad_norm": 8.25, + "learning_rate": 3.9240000000000004e-05, + "loss": 0.5273, + "step": 269 + }, + { + "epoch": 1.076, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6917117834091187, + "eval_runtime": 29.5067, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.135, + "step": 269 + }, + { + "epoch": 1.08, + "grad_norm": 14.375, + "learning_rate": 3.9200000000000004e-05, + "loss": 0.5781, + "step": 270 + }, + { + "epoch": 1.08, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6915860176086426, + "eval_runtime": 29.5502, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 270 + }, + { + "epoch": 1.084, + "grad_norm": 4.21875, + "learning_rate": 3.9160000000000005e-05, + "loss": 0.4277, + "step": 271 + }, + { + "epoch": 1.084, + "eval_accuracy": 0.7951807228915663, + "eval_loss": 0.6946919560432434, + "eval_runtime": 29.5166, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 271 + }, + { + "epoch": 1.088, + "grad_norm": 26.25, + "learning_rate": 3.912e-05, + "loss": 0.6094, + "step": 272 + }, + { + "epoch": 1.088, + "eval_accuracy": 0.7911646586345381, + "eval_loss": 0.6938756108283997, + "eval_runtime": 29.4323, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.141, + "step": 272 + }, + { + "epoch": 1.092, + "grad_norm": 23.75, + "learning_rate": 3.908e-05, + "loss": 0.7344, + "step": 273 + }, + { + "epoch": 1.092, + "eval_accuracy": 0.7931726907630522, + "eval_loss": 0.6916788816452026, + "eval_runtime": 29.3176, + "eval_samples_per_second": 16.986, + "eval_steps_per_second": 2.149, + "step": 273 + }, + { + "epoch": 1.096, + "grad_norm": 4.96875, + "learning_rate": 3.9040000000000006e-05, + "loss": 0.4414, + "step": 274 + }, + { + "epoch": 1.096, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.6881642937660217, + "eval_runtime": 29.2712, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 274 + }, + { + "epoch": 1.1, + "grad_norm": 86.0, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.5859, + "step": 275 + }, + { + "epoch": 1.1, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.6879757642745972, + "eval_runtime": 29.3657, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 275 + }, + { + "epoch": 1.104, + "grad_norm": 6.15625, + "learning_rate": 3.896e-05, + "loss": 0.4863, + "step": 276 + }, + { + "epoch": 1.104, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6844304203987122, + "eval_runtime": 29.485, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 276 + }, + { + "epoch": 1.108, + "grad_norm": 5.46875, + "learning_rate": 3.892e-05, + "loss": 0.5781, + "step": 277 + }, + { + "epoch": 1.108, + "eval_accuracy": 0.8092369477911646, + "eval_loss": 0.6837713718414307, + "eval_runtime": 29.49, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 277 + }, + { + "epoch": 1.112, + "grad_norm": 10.1875, + "learning_rate": 3.888e-05, + "loss": 0.5234, + "step": 278 + }, + { + "epoch": 1.112, + "eval_accuracy": 0.8012048192771084, + "eval_loss": 0.682453453540802, + "eval_runtime": 29.4992, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 278 + }, + { + "epoch": 1.116, + "grad_norm": 6.09375, + "learning_rate": 3.884e-05, + "loss": 0.2227, + "step": 279 + }, + { + "epoch": 1.116, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6830494403839111, + "eval_runtime": 29.4916, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 279 + }, + { + "epoch": 1.12, + "grad_norm": 6.0, + "learning_rate": 3.88e-05, + "loss": 0.5195, + "step": 280 + }, + { + "epoch": 1.12, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6841792464256287, + "eval_runtime": 29.501, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 280 + }, + { + "epoch": 1.124, + "grad_norm": 4.5625, + "learning_rate": 3.876e-05, + "loss": 0.3887, + "step": 281 + }, + { + "epoch": 1.124, + "eval_accuracy": 0.8132530120481928, + "eval_loss": 0.6870343089103699, + "eval_runtime": 29.4769, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.137, + "step": 281 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 5.375, + "learning_rate": 3.872e-05, + "loss": 0.332, + "step": 282 + }, + { + "epoch": 1.1280000000000001, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6879751682281494, + "eval_runtime": 29.4273, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 282 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 96.0, + "learning_rate": 3.868e-05, + "loss": 1.3203, + "step": 283 + }, + { + "epoch": 1.1320000000000001, + "eval_accuracy": 0.8232931726907631, + "eval_loss": 0.6941245198249817, + "eval_runtime": 29.3137, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 283 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 7.0625, + "learning_rate": 3.864e-05, + "loss": 0.7188, + "step": 284 + }, + { + "epoch": 1.1360000000000001, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6961638927459717, + "eval_runtime": 29.2726, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 284 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 40.0, + "learning_rate": 3.86e-05, + "loss": 1.1406, + "step": 285 + }, + { + "epoch": 1.1400000000000001, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.6967912912368774, + "eval_runtime": 29.3796, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 285 + }, + { + "epoch": 1.144, + "grad_norm": 11.4375, + "learning_rate": 3.8560000000000004e-05, + "loss": 0.5547, + "step": 286 + }, + { + "epoch": 1.144, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6960384249687195, + "eval_runtime": 29.4957, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.136, + "step": 286 + }, + { + "epoch": 1.148, + "grad_norm": 7.90625, + "learning_rate": 3.8520000000000004e-05, + "loss": 0.7148, + "step": 287 + }, + { + "epoch": 1.148, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6940308213233948, + "eval_runtime": 29.5033, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 287 + }, + { + "epoch": 1.152, + "grad_norm": 26.375, + "learning_rate": 3.848e-05, + "loss": 1.9609, + "step": 288 + }, + { + "epoch": 1.152, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6865020990371704, + "eval_runtime": 29.5251, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 288 + }, + { + "epoch": 1.156, + "grad_norm": 9.5625, + "learning_rate": 3.8440000000000005e-05, + "loss": 0.4805, + "step": 289 + }, + { + "epoch": 1.156, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6784096360206604, + "eval_runtime": 29.5897, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.129, + "step": 289 + }, + { + "epoch": 1.16, + "grad_norm": 5.0625, + "learning_rate": 3.8400000000000005e-05, + "loss": 0.4766, + "step": 290 + }, + { + "epoch": 1.16, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6718859076499939, + "eval_runtime": 29.5431, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.132, + "step": 290 + }, + { + "epoch": 1.164, + "grad_norm": 16.25, + "learning_rate": 3.836e-05, + "loss": 0.668, + "step": 291 + }, + { + "epoch": 1.164, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6660835146903992, + "eval_runtime": 29.494, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 291 + }, + { + "epoch": 1.168, + "grad_norm": 26.875, + "learning_rate": 3.832e-05, + "loss": 1.8359, + "step": 292 + }, + { + "epoch": 1.168, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6586211919784546, + "eval_runtime": 29.3939, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.143, + "step": 292 + }, + { + "epoch": 1.172, + "grad_norm": 20.625, + "learning_rate": 3.828e-05, + "loss": 1.2969, + "step": 293 + }, + { + "epoch": 1.172, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6489054560661316, + "eval_runtime": 29.3839, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 293 + }, + { + "epoch": 1.176, + "grad_norm": 25.25, + "learning_rate": 3.8240000000000007e-05, + "loss": 0.7734, + "step": 294 + }, + { + "epoch": 1.176, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6432080864906311, + "eval_runtime": 29.288, + "eval_samples_per_second": 17.004, + "eval_steps_per_second": 2.151, + "step": 294 + }, + { + "epoch": 1.18, + "grad_norm": 5.53125, + "learning_rate": 3.82e-05, + "loss": 0.4902, + "step": 295 + }, + { + "epoch": 1.18, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6381080746650696, + "eval_runtime": 29.321, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 295 + }, + { + "epoch": 1.184, + "grad_norm": 4.375, + "learning_rate": 3.816e-05, + "loss": 0.334, + "step": 296 + }, + { + "epoch": 1.184, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6357020139694214, + "eval_runtime": 29.2451, + "eval_samples_per_second": 17.029, + "eval_steps_per_second": 2.154, + "step": 296 + }, + { + "epoch": 1.188, + "grad_norm": 14.75, + "learning_rate": 3.812e-05, + "loss": 0.9766, + "step": 297 + }, + { + "epoch": 1.188, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6308260560035706, + "eval_runtime": 29.2927, + "eval_samples_per_second": 17.001, + "eval_steps_per_second": 2.151, + "step": 297 + }, + { + "epoch": 1.192, + "grad_norm": 5.21875, + "learning_rate": 3.808e-05, + "loss": 0.7109, + "step": 298 + }, + { + "epoch": 1.192, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6252666711807251, + "eval_runtime": 29.2325, + "eval_samples_per_second": 17.036, + "eval_steps_per_second": 2.155, + "step": 298 + }, + { + "epoch": 1.196, + "grad_norm": 4.71875, + "learning_rate": 3.804e-05, + "loss": 0.6211, + "step": 299 + }, + { + "epoch": 1.196, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6222232580184937, + "eval_runtime": 29.3934, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.143, + "step": 299 + }, + { + "epoch": 1.2, + "grad_norm": 4.3125, + "learning_rate": 3.8e-05, + "loss": 0.3125, + "step": 300 + }, + { + "epoch": 1.2, + "eval_accuracy": 0.8534136546184738, + "eval_loss": 0.6201074123382568, + "eval_runtime": 29.5048, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 300 + }, + { + "epoch": 1.204, + "grad_norm": 3.15625, + "learning_rate": 3.796e-05, + "loss": 0.1963, + "step": 301 + }, + { + "epoch": 1.204, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6194053292274475, + "eval_runtime": 29.4067, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.142, + "step": 301 + }, + { + "epoch": 1.208, + "grad_norm": 4.0625, + "learning_rate": 3.792e-05, + "loss": 0.375, + "step": 302 + }, + { + "epoch": 1.208, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6189758777618408, + "eval_runtime": 29.3344, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 302 + }, + { + "epoch": 1.212, + "grad_norm": 14.75, + "learning_rate": 3.788e-05, + "loss": 0.7812, + "step": 303 + }, + { + "epoch": 1.212, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6191229820251465, + "eval_runtime": 29.4586, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 303 + }, + { + "epoch": 1.216, + "grad_norm": 5.3125, + "learning_rate": 3.7840000000000004e-05, + "loss": 0.6016, + "step": 304 + }, + { + "epoch": 1.216, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6190827488899231, + "eval_runtime": 29.5036, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 304 + }, + { + "epoch": 1.22, + "grad_norm": 9.6875, + "learning_rate": 3.7800000000000004e-05, + "loss": 0.4629, + "step": 305 + }, + { + "epoch": 1.22, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6190347075462341, + "eval_runtime": 29.521, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 305 + }, + { + "epoch": 1.224, + "grad_norm": 5.84375, + "learning_rate": 3.776e-05, + "loss": 0.377, + "step": 306 + }, + { + "epoch": 1.224, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6175458431243896, + "eval_runtime": 29.517, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 306 + }, + { + "epoch": 1.228, + "grad_norm": 12.3125, + "learning_rate": 3.772e-05, + "loss": 0.5586, + "step": 307 + }, + { + "epoch": 1.228, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6228066682815552, + "eval_runtime": 29.5736, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.13, + "step": 307 + }, + { + "epoch": 1.232, + "grad_norm": 6.5, + "learning_rate": 3.7680000000000005e-05, + "loss": 0.1924, + "step": 308 + }, + { + "epoch": 1.232, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.6264520883560181, + "eval_runtime": 29.4562, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 308 + }, + { + "epoch": 1.236, + "grad_norm": 29.75, + "learning_rate": 3.7640000000000006e-05, + "loss": 0.75, + "step": 309 + }, + { + "epoch": 1.236, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6260706782341003, + "eval_runtime": 29.3821, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 309 + }, + { + "epoch": 1.24, + "grad_norm": 21.5, + "learning_rate": 3.76e-05, + "loss": 0.6406, + "step": 310 + }, + { + "epoch": 1.24, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6266849637031555, + "eval_runtime": 29.3913, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.143, + "step": 310 + }, + { + "epoch": 1.244, + "grad_norm": 49.0, + "learning_rate": 3.756e-05, + "loss": 1.2812, + "step": 311 + }, + { + "epoch": 1.244, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6301166415214539, + "eval_runtime": 29.3701, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 311 + }, + { + "epoch": 1.248, + "grad_norm": 7.4375, + "learning_rate": 3.752e-05, + "loss": 0.3477, + "step": 312 + }, + { + "epoch": 1.248, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6331087350845337, + "eval_runtime": 29.2967, + "eval_samples_per_second": 16.999, + "eval_steps_per_second": 2.15, + "step": 312 + }, + { + "epoch": 1.252, + "grad_norm": 29.375, + "learning_rate": 3.748000000000001e-05, + "loss": 0.4883, + "step": 313 + }, + { + "epoch": 1.252, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.6360381841659546, + "eval_runtime": 29.2886, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.151, + "step": 313 + }, + { + "epoch": 1.256, + "grad_norm": 26.625, + "learning_rate": 3.744e-05, + "loss": 0.7461, + "step": 314 + }, + { + "epoch": 1.256, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6370413303375244, + "eval_runtime": 29.3829, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 314 + }, + { + "epoch": 1.26, + "grad_norm": 10.75, + "learning_rate": 3.74e-05, + "loss": 0.3281, + "step": 315 + }, + { + "epoch": 1.26, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6382951140403748, + "eval_runtime": 29.4668, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 315 + }, + { + "epoch": 1.264, + "grad_norm": 68.0, + "learning_rate": 3.736e-05, + "loss": 1.0781, + "step": 316 + }, + { + "epoch": 1.264, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6400014162063599, + "eval_runtime": 29.5067, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.135, + "step": 316 + }, + { + "epoch": 1.268, + "grad_norm": 21.125, + "learning_rate": 3.732e-05, + "loss": 0.5508, + "step": 317 + }, + { + "epoch": 1.268, + "eval_accuracy": 0.8172690763052208, + "eval_loss": 0.6404228806495667, + "eval_runtime": 29.5219, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 317 + }, + { + "epoch": 1.272, + "grad_norm": 14.6875, + "learning_rate": 3.728e-05, + "loss": 0.6797, + "step": 318 + }, + { + "epoch": 1.272, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6398886442184448, + "eval_runtime": 29.5279, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 318 + }, + { + "epoch": 1.276, + "grad_norm": 65.5, + "learning_rate": 3.724e-05, + "loss": 1.4141, + "step": 319 + }, + { + "epoch": 1.276, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6383988261222839, + "eval_runtime": 29.5629, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.131, + "step": 319 + }, + { + "epoch": 1.28, + "grad_norm": 9.5625, + "learning_rate": 3.72e-05, + "loss": 0.5195, + "step": 320 + }, + { + "epoch": 1.28, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6379119753837585, + "eval_runtime": 29.4669, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 320 + }, + { + "epoch": 1.284, + "grad_norm": 29.875, + "learning_rate": 3.716e-05, + "loss": 0.8867, + "step": 321 + }, + { + "epoch": 1.284, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6378484964370728, + "eval_runtime": 29.3854, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.144, + "step": 321 + }, + { + "epoch": 1.288, + "grad_norm": 12.4375, + "learning_rate": 3.712e-05, + "loss": 0.5508, + "step": 322 + }, + { + "epoch": 1.288, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6374092698097229, + "eval_runtime": 29.3346, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 322 + }, + { + "epoch": 1.292, + "grad_norm": 53.75, + "learning_rate": 3.7080000000000004e-05, + "loss": 0.6055, + "step": 323 + }, + { + "epoch": 1.292, + "eval_accuracy": 0.8152610441767069, + "eval_loss": 0.637550950050354, + "eval_runtime": 29.3065, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 323 + }, + { + "epoch": 1.296, + "grad_norm": 6.4375, + "learning_rate": 3.7040000000000005e-05, + "loss": 0.3926, + "step": 324 + }, + { + "epoch": 1.296, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.634836733341217, + "eval_runtime": 29.3511, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.146, + "step": 324 + }, + { + "epoch": 1.3, + "grad_norm": 15.3125, + "learning_rate": 3.7e-05, + "loss": 0.8984, + "step": 325 + }, + { + "epoch": 1.3, + "eval_accuracy": 0.8232931726907631, + "eval_loss": 0.635025680065155, + "eval_runtime": 29.5204, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 325 + }, + { + "epoch": 1.304, + "grad_norm": 12.1875, + "learning_rate": 3.696e-05, + "loss": 0.8281, + "step": 326 + }, + { + "epoch": 1.304, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6336625814437866, + "eval_runtime": 29.515, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 326 + }, + { + "epoch": 1.308, + "grad_norm": 3.671875, + "learning_rate": 3.692e-05, + "loss": 0.1914, + "step": 327 + }, + { + "epoch": 1.308, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6350756883621216, + "eval_runtime": 29.5108, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 327 + }, + { + "epoch": 1.312, + "grad_norm": 7.78125, + "learning_rate": 3.6880000000000006e-05, + "loss": 0.3613, + "step": 328 + }, + { + "epoch": 1.312, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6354835629463196, + "eval_runtime": 29.5555, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 328 + }, + { + "epoch": 1.316, + "grad_norm": 5.3125, + "learning_rate": 3.684e-05, + "loss": 0.377, + "step": 329 + }, + { + "epoch": 1.316, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6376200318336487, + "eval_runtime": 29.5401, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 329 + }, + { + "epoch": 1.32, + "grad_norm": 7.0625, + "learning_rate": 3.68e-05, + "loss": 0.5977, + "step": 330 + }, + { + "epoch": 1.32, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6396616697311401, + "eval_runtime": 29.4141, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 330 + }, + { + "epoch": 1.324, + "grad_norm": 10.0625, + "learning_rate": 3.676e-05, + "loss": 0.6523, + "step": 331 + }, + { + "epoch": 1.324, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.6406833529472351, + "eval_runtime": 29.3384, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 331 + }, + { + "epoch": 1.328, + "grad_norm": 31.75, + "learning_rate": 3.672000000000001e-05, + "loss": 1.4219, + "step": 332 + }, + { + "epoch": 1.328, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6403575539588928, + "eval_runtime": 29.2903, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 2.151, + "step": 332 + }, + { + "epoch": 1.332, + "grad_norm": 6.03125, + "learning_rate": 3.668e-05, + "loss": 0.6406, + "step": 333 + }, + { + "epoch": 1.332, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6405338048934937, + "eval_runtime": 29.2509, + "eval_samples_per_second": 17.025, + "eval_steps_per_second": 2.154, + "step": 333 + }, + { + "epoch": 1.336, + "grad_norm": 13.0625, + "learning_rate": 3.664e-05, + "loss": 0.8008, + "step": 334 + }, + { + "epoch": 1.336, + "eval_accuracy": 0.8192771084337349, + "eval_loss": 0.6398153901100159, + "eval_runtime": 29.2315, + "eval_samples_per_second": 17.036, + "eval_steps_per_second": 2.155, + "step": 334 + }, + { + "epoch": 1.34, + "grad_norm": 14.4375, + "learning_rate": 3.66e-05, + "loss": 0.5742, + "step": 335 + }, + { + "epoch": 1.34, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6390331983566284, + "eval_runtime": 29.2136, + "eval_samples_per_second": 17.047, + "eval_steps_per_second": 2.157, + "step": 335 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 36.25, + "learning_rate": 3.656e-05, + "loss": 0.918, + "step": 336 + }, + { + "epoch": 1.3439999999999999, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6358059048652649, + "eval_runtime": 29.3751, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.145, + "step": 336 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 8.625, + "learning_rate": 3.652e-05, + "loss": 0.8125, + "step": 337 + }, + { + "epoch": 1.3479999999999999, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6332679390907288, + "eval_runtime": 29.4298, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.141, + "step": 337 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 43.75, + "learning_rate": 3.648e-05, + "loss": 0.4707, + "step": 338 + }, + { + "epoch": 1.3519999999999999, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6299147009849548, + "eval_runtime": 29.5114, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 338 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 11.5625, + "learning_rate": 3.6440000000000003e-05, + "loss": 0.9453, + "step": 339 + }, + { + "epoch": 1.3559999999999999, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.6302053928375244, + "eval_runtime": 29.5259, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 339 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.15625, + "learning_rate": 3.6400000000000004e-05, + "loss": 0.8438, + "step": 340 + }, + { + "epoch": 1.3599999999999999, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6298357248306274, + "eval_runtime": 29.4701, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 340 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 4.125, + "learning_rate": 3.636e-05, + "loss": 0.6797, + "step": 341 + }, + { + "epoch": 1.3639999999999999, + "eval_accuracy": 0.8273092369477911, + "eval_loss": 0.6281463503837585, + "eval_runtime": 29.4712, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 341 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 5.09375, + "learning_rate": 3.6320000000000005e-05, + "loss": 0.7188, + "step": 342 + }, + { + "epoch": 1.3679999999999999, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6309162378311157, + "eval_runtime": 29.5157, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 342 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 4.125, + "learning_rate": 3.6280000000000005e-05, + "loss": 0.8281, + "step": 343 + }, + { + "epoch": 1.3719999999999999, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6306083798408508, + "eval_runtime": 29.4142, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 343 + }, + { + "epoch": 1.376, + "grad_norm": 5.6875, + "learning_rate": 3.624e-05, + "loss": 0.6172, + "step": 344 + }, + { + "epoch": 1.376, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6295798420906067, + "eval_runtime": 29.3382, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 344 + }, + { + "epoch": 1.38, + "grad_norm": 3.484375, + "learning_rate": 3.62e-05, + "loss": 0.6484, + "step": 345 + }, + { + "epoch": 1.38, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.630338728427887, + "eval_runtime": 29.2124, + "eval_samples_per_second": 17.048, + "eval_steps_per_second": 2.157, + "step": 345 + }, + { + "epoch": 1.384, + "grad_norm": 9.8125, + "learning_rate": 3.616e-05, + "loss": 0.6289, + "step": 346 + }, + { + "epoch": 1.384, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6309735774993896, + "eval_runtime": 29.2503, + "eval_samples_per_second": 17.025, + "eval_steps_per_second": 2.154, + "step": 346 + }, + { + "epoch": 1.388, + "grad_norm": 3.21875, + "learning_rate": 3.6120000000000007e-05, + "loss": 0.6953, + "step": 347 + }, + { + "epoch": 1.388, + "eval_accuracy": 0.8534136546184738, + "eval_loss": 0.631514847278595, + "eval_runtime": 29.3745, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.145, + "step": 347 + }, + { + "epoch": 1.392, + "grad_norm": 3.5625, + "learning_rate": 3.608e-05, + "loss": 0.7148, + "step": 348 + }, + { + "epoch": 1.392, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6342528462409973, + "eval_runtime": 29.4308, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 348 + }, + { + "epoch": 1.396, + "grad_norm": 2.640625, + "learning_rate": 3.604e-05, + "loss": 0.5039, + "step": 349 + }, + { + "epoch": 1.396, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6351686120033264, + "eval_runtime": 29.4748, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.137, + "step": 349 + }, + { + "epoch": 1.4, + "grad_norm": 30.5, + "learning_rate": 3.6e-05, + "loss": 1.1641, + "step": 350 + }, + { + "epoch": 1.4, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6345489621162415, + "eval_runtime": 29.5259, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 350 + }, + { + "epoch": 1.404, + "grad_norm": 9.5625, + "learning_rate": 3.596e-05, + "loss": 1.375, + "step": 351 + }, + { + "epoch": 1.404, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6354637145996094, + "eval_runtime": 29.278, + "eval_samples_per_second": 17.009, + "eval_steps_per_second": 2.152, + "step": 351 + }, + { + "epoch": 1.408, + "grad_norm": 4.6875, + "learning_rate": 3.592e-05, + "loss": 0.5938, + "step": 352 + }, + { + "epoch": 1.408, + "eval_accuracy": 0.821285140562249, + "eval_loss": 0.6351823210716248, + "eval_runtime": 29.305, + "eval_samples_per_second": 16.994, + "eval_steps_per_second": 2.15, + "step": 352 + }, + { + "epoch": 1.412, + "grad_norm": 2.640625, + "learning_rate": 3.588e-05, + "loss": 0.4922, + "step": 353 + }, + { + "epoch": 1.412, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.6351715326309204, + "eval_runtime": 29.4231, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 353 + }, + { + "epoch": 1.416, + "grad_norm": 9.5625, + "learning_rate": 3.584e-05, + "loss": 0.6328, + "step": 354 + }, + { + "epoch": 1.416, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6343871355056763, + "eval_runtime": 29.4767, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.137, + "step": 354 + }, + { + "epoch": 1.42, + "grad_norm": 3.390625, + "learning_rate": 3.58e-05, + "loss": 0.7617, + "step": 355 + }, + { + "epoch": 1.42, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6335380673408508, + "eval_runtime": 29.4771, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 355 + }, + { + "epoch": 1.424, + "grad_norm": 12.1875, + "learning_rate": 3.5759999999999996e-05, + "loss": 0.2949, + "step": 356 + }, + { + "epoch": 1.424, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6333341002464294, + "eval_runtime": 29.4801, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.137, + "step": 356 + }, + { + "epoch": 1.428, + "grad_norm": 4.03125, + "learning_rate": 3.5720000000000004e-05, + "loss": 0.4746, + "step": 357 + }, + { + "epoch": 1.428, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6345891356468201, + "eval_runtime": 29.5037, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 357 + }, + { + "epoch": 1.432, + "grad_norm": 2.875, + "learning_rate": 3.5680000000000004e-05, + "loss": 0.3496, + "step": 358 + }, + { + "epoch": 1.432, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6342185139656067, + "eval_runtime": 29.4043, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 358 + }, + { + "epoch": 1.436, + "grad_norm": 3.15625, + "learning_rate": 3.5640000000000004e-05, + "loss": 0.5469, + "step": 359 + }, + { + "epoch": 1.436, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.633249819278717, + "eval_runtime": 29.3249, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.148, + "step": 359 + }, + { + "epoch": 1.44, + "grad_norm": 5.125, + "learning_rate": 3.56e-05, + "loss": 0.4785, + "step": 360 + }, + { + "epoch": 1.44, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6333478689193726, + "eval_runtime": 29.2719, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 360 + }, + { + "epoch": 1.444, + "grad_norm": 3.046875, + "learning_rate": 3.5560000000000005e-05, + "loss": 0.6445, + "step": 361 + }, + { + "epoch": 1.444, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6334155201911926, + "eval_runtime": 29.3014, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.15, + "step": 361 + }, + { + "epoch": 1.448, + "grad_norm": 6.0625, + "learning_rate": 3.5520000000000006e-05, + "loss": 0.3984, + "step": 362 + }, + { + "epoch": 1.448, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6342656016349792, + "eval_runtime": 29.404, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 362 + }, + { + "epoch": 1.452, + "grad_norm": 6.6875, + "learning_rate": 3.548e-05, + "loss": 0.5547, + "step": 363 + }, + { + "epoch": 1.452, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6347166299819946, + "eval_runtime": 29.5202, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 363 + }, + { + "epoch": 1.456, + "grad_norm": 2.875, + "learning_rate": 3.544e-05, + "loss": 0.5234, + "step": 364 + }, + { + "epoch": 1.456, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6340979337692261, + "eval_runtime": 29.4782, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 364 + }, + { + "epoch": 1.46, + "grad_norm": 13.25, + "learning_rate": 3.54e-05, + "loss": 1.2578, + "step": 365 + }, + { + "epoch": 1.46, + "eval_accuracy": 0.8253012048192772, + "eval_loss": 0.634187638759613, + "eval_runtime": 29.4809, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 365 + }, + { + "epoch": 1.464, + "grad_norm": 3.21875, + "learning_rate": 3.536000000000001e-05, + "loss": 0.6094, + "step": 366 + }, + { + "epoch": 1.464, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6336777806282043, + "eval_runtime": 29.5342, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 366 + }, + { + "epoch": 1.468, + "grad_norm": 6.875, + "learning_rate": 3.532e-05, + "loss": 0.6172, + "step": 367 + }, + { + "epoch": 1.468, + "eval_accuracy": 0.8232931726907631, + "eval_loss": 0.6337635517120361, + "eval_runtime": 29.4515, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 367 + }, + { + "epoch": 1.472, + "grad_norm": 3.09375, + "learning_rate": 3.528e-05, + "loss": 0.6719, + "step": 368 + }, + { + "epoch": 1.472, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.635013222694397, + "eval_runtime": 29.3732, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.145, + "step": 368 + }, + { + "epoch": 1.476, + "grad_norm": 4.71875, + "learning_rate": 3.524e-05, + "loss": 0.6211, + "step": 369 + }, + { + "epoch": 1.476, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6356078386306763, + "eval_runtime": 29.4637, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.138, + "step": 369 + }, + { + "epoch": 1.48, + "grad_norm": 3.234375, + "learning_rate": 3.52e-05, + "loss": 0.6914, + "step": 370 + }, + { + "epoch": 1.48, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.635135293006897, + "eval_runtime": 29.4883, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 370 + }, + { + "epoch": 1.484, + "grad_norm": 6.8125, + "learning_rate": 3.516e-05, + "loss": 0.5703, + "step": 371 + }, + { + "epoch": 1.484, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6337194442749023, + "eval_runtime": 29.4769, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.137, + "step": 371 + }, + { + "epoch": 1.488, + "grad_norm": 2.875, + "learning_rate": 3.512e-05, + "loss": 0.625, + "step": 372 + }, + { + "epoch": 1.488, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6327438950538635, + "eval_runtime": 29.4722, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 372 + }, + { + "epoch": 1.492, + "grad_norm": 3.328125, + "learning_rate": 3.508e-05, + "loss": 0.6797, + "step": 373 + }, + { + "epoch": 1.492, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6329924464225769, + "eval_runtime": 29.4244, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 373 + }, + { + "epoch": 1.496, + "grad_norm": 5.34375, + "learning_rate": 3.504e-05, + "loss": 0.6875, + "step": 374 + }, + { + "epoch": 1.496, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.633053183555603, + "eval_runtime": 29.3363, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 374 + }, + { + "epoch": 1.5, + "grad_norm": 3.453125, + "learning_rate": 3.5e-05, + "loss": 0.6094, + "step": 375 + }, + { + "epoch": 1.5, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6324237585067749, + "eval_runtime": 29.3281, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 375 + }, + { + "epoch": 1.504, + "grad_norm": 18.0, + "learning_rate": 3.4960000000000004e-05, + "loss": 1.4062, + "step": 376 + }, + { + "epoch": 1.504, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6318898797035217, + "eval_runtime": 29.4388, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 376 + }, + { + "epoch": 1.508, + "grad_norm": 2.703125, + "learning_rate": 3.4920000000000004e-05, + "loss": 0.4766, + "step": 377 + }, + { + "epoch": 1.508, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6319864392280579, + "eval_runtime": 29.5334, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 377 + }, + { + "epoch": 1.512, + "grad_norm": 3.359375, + "learning_rate": 3.4880000000000005e-05, + "loss": 0.7305, + "step": 378 + }, + { + "epoch": 1.512, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.632082998752594, + "eval_runtime": 29.5334, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 378 + }, + { + "epoch": 1.516, + "grad_norm": 18.25, + "learning_rate": 3.484e-05, + "loss": 0.5898, + "step": 379 + }, + { + "epoch": 1.516, + "eval_accuracy": 0.8293172690763052, + "eval_loss": 0.6330262422561646, + "eval_runtime": 29.5286, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 379 + }, + { + "epoch": 1.52, + "grad_norm": 4.34375, + "learning_rate": 3.48e-05, + "loss": 0.6875, + "step": 380 + }, + { + "epoch": 1.52, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6317697763442993, + "eval_runtime": 29.4718, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 380 + }, + { + "epoch": 1.524, + "grad_norm": 31.125, + "learning_rate": 3.4760000000000006e-05, + "loss": 1.3125, + "step": 381 + }, + { + "epoch": 1.524, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.633310079574585, + "eval_runtime": 29.4168, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 381 + }, + { + "epoch": 1.528, + "grad_norm": 3.28125, + "learning_rate": 3.472e-05, + "loss": 0.6094, + "step": 382 + }, + { + "epoch": 1.528, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6315545439720154, + "eval_runtime": 29.3828, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 382 + }, + { + "epoch": 1.532, + "grad_norm": 3.953125, + "learning_rate": 3.468e-05, + "loss": 0.8672, + "step": 383 + }, + { + "epoch": 1.532, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.630361795425415, + "eval_runtime": 29.3586, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 383 + }, + { + "epoch": 1.536, + "grad_norm": 3.765625, + "learning_rate": 3.464e-05, + "loss": 0.7461, + "step": 384 + }, + { + "epoch": 1.536, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6303333640098572, + "eval_runtime": 29.2884, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.151, + "step": 384 + }, + { + "epoch": 1.54, + "grad_norm": 2.71875, + "learning_rate": 3.46e-05, + "loss": 0.418, + "step": 385 + }, + { + "epoch": 1.54, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6297984719276428, + "eval_runtime": 29.3344, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 385 + }, + { + "epoch": 1.544, + "grad_norm": 3.640625, + "learning_rate": 3.456e-05, + "loss": 0.6445, + "step": 386 + }, + { + "epoch": 1.544, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.629108726978302, + "eval_runtime": 29.372, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.145, + "step": 386 + }, + { + "epoch": 1.548, + "grad_norm": 3.109375, + "learning_rate": 3.452e-05, + "loss": 0.5156, + "step": 387 + }, + { + "epoch": 1.548, + "eval_accuracy": 0.8554216867469879, + "eval_loss": 0.6281659603118896, + "eval_runtime": 29.4224, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.141, + "step": 387 + }, + { + "epoch": 1.552, + "grad_norm": 3.109375, + "learning_rate": 3.448e-05, + "loss": 0.6328, + "step": 388 + }, + { + "epoch": 1.552, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6271923780441284, + "eval_runtime": 29.4912, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 388 + }, + { + "epoch": 1.556, + "grad_norm": 3.515625, + "learning_rate": 3.444e-05, + "loss": 0.5703, + "step": 389 + }, + { + "epoch": 1.556, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6269736886024475, + "eval_runtime": 29.5312, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 389 + }, + { + "epoch": 1.56, + "grad_norm": 3.3125, + "learning_rate": 3.4399999999999996e-05, + "loss": 0.625, + "step": 390 + }, + { + "epoch": 1.56, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6267845034599304, + "eval_runtime": 29.4819, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 390 + }, + { + "epoch": 1.564, + "grad_norm": 5.21875, + "learning_rate": 3.436e-05, + "loss": 0.6328, + "step": 391 + }, + { + "epoch": 1.564, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6249951124191284, + "eval_runtime": 29.5186, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 391 + }, + { + "epoch": 1.568, + "grad_norm": 23.75, + "learning_rate": 3.4320000000000003e-05, + "loss": 0.8594, + "step": 392 + }, + { + "epoch": 1.568, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6245225071907043, + "eval_runtime": 29.4359, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.14, + "step": 392 + }, + { + "epoch": 1.572, + "grad_norm": 3.515625, + "learning_rate": 3.4280000000000004e-05, + "loss": 0.6758, + "step": 393 + }, + { + "epoch": 1.572, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6226375102996826, + "eval_runtime": 29.3224, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 393 + }, + { + "epoch": 1.576, + "grad_norm": 2.765625, + "learning_rate": 3.424e-05, + "loss": 0.3633, + "step": 394 + }, + { + "epoch": 1.576, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6224120259284973, + "eval_runtime": 29.2672, + "eval_samples_per_second": 17.016, + "eval_steps_per_second": 2.153, + "step": 394 + }, + { + "epoch": 1.58, + "grad_norm": 3.8125, + "learning_rate": 3.4200000000000005e-05, + "loss": 0.6875, + "step": 395 + }, + { + "epoch": 1.58, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6217790842056274, + "eval_runtime": 29.2245, + "eval_samples_per_second": 17.041, + "eval_steps_per_second": 2.156, + "step": 395 + }, + { + "epoch": 1.584, + "grad_norm": 6.84375, + "learning_rate": 3.4160000000000005e-05, + "loss": 0.7305, + "step": 396 + }, + { + "epoch": 1.584, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6199250221252441, + "eval_runtime": 29.3114, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 396 + }, + { + "epoch": 1.588, + "grad_norm": 3.46875, + "learning_rate": 3.412e-05, + "loss": 0.7266, + "step": 397 + }, + { + "epoch": 1.588, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6191999316215515, + "eval_runtime": 29.4081, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.142, + "step": 397 + }, + { + "epoch": 1.592, + "grad_norm": 4.34375, + "learning_rate": 3.408e-05, + "loss": 0.75, + "step": 398 + }, + { + "epoch": 1.592, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6189788579940796, + "eval_runtime": 29.4608, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.138, + "step": 398 + }, + { + "epoch": 1.596, + "grad_norm": 25.25, + "learning_rate": 3.404e-05, + "loss": 0.4805, + "step": 399 + }, + { + "epoch": 1.596, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.620235800743103, + "eval_runtime": 29.4699, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 399 + }, + { + "epoch": 1.6, + "grad_norm": 3.9375, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.4453, + "step": 400 + }, + { + "epoch": 1.6, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6202980875968933, + "eval_runtime": 29.4984, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 400 + }, + { + "epoch": 1.604, + "grad_norm": 15.625, + "learning_rate": 3.396e-05, + "loss": 0.4375, + "step": 401 + }, + { + "epoch": 1.604, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6182885766029358, + "eval_runtime": 29.413, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 401 + }, + { + "epoch": 1.608, + "grad_norm": 3.828125, + "learning_rate": 3.392e-05, + "loss": 0.6016, + "step": 402 + }, + { + "epoch": 1.608, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6179728507995605, + "eval_runtime": 29.3559, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 402 + }, + { + "epoch": 1.612, + "grad_norm": 3.625, + "learning_rate": 3.388e-05, + "loss": 0.625, + "step": 403 + }, + { + "epoch": 1.612, + "eval_accuracy": 0.8313253012048193, + "eval_loss": 0.6179694533348083, + "eval_runtime": 29.4684, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 403 + }, + { + "epoch": 1.616, + "grad_norm": 2.9375, + "learning_rate": 3.384e-05, + "loss": 0.3809, + "step": 404 + }, + { + "epoch": 1.616, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6179674863815308, + "eval_runtime": 29.5212, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 404 + }, + { + "epoch": 1.62, + "grad_norm": 14.4375, + "learning_rate": 3.38e-05, + "loss": 0.9922, + "step": 405 + }, + { + "epoch": 1.62, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6177811622619629, + "eval_runtime": 29.5637, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.131, + "step": 405 + }, + { + "epoch": 1.624, + "grad_norm": 4.625, + "learning_rate": 3.376e-05, + "loss": 0.7383, + "step": 406 + }, + { + "epoch": 1.624, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6172487735748291, + "eval_runtime": 29.5575, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.131, + "step": 406 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 3.46875, + "learning_rate": 3.372e-05, + "loss": 0.4824, + "step": 407 + }, + { + "epoch": 1.6280000000000001, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6165599822998047, + "eval_runtime": 29.4911, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 407 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 24.5, + "learning_rate": 3.368e-05, + "loss": 1.2266, + "step": 408 + }, + { + "epoch": 1.6320000000000001, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6159079670906067, + "eval_runtime": 29.4268, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 408 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 25.625, + "learning_rate": 3.3639999999999996e-05, + "loss": 0.5664, + "step": 409 + }, + { + "epoch": 1.6360000000000001, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6148152351379395, + "eval_runtime": 29.3068, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 409 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 4.8125, + "learning_rate": 3.3600000000000004e-05, + "loss": 0.4453, + "step": 410 + }, + { + "epoch": 1.6400000000000001, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6148456335067749, + "eval_runtime": 29.3652, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 410 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 3.03125, + "learning_rate": 3.3560000000000004e-05, + "loss": 0.4141, + "step": 411 + }, + { + "epoch": 1.6440000000000001, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6157559752464294, + "eval_runtime": 29.4295, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.141, + "step": 411 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 3.703125, + "learning_rate": 3.3520000000000004e-05, + "loss": 0.7109, + "step": 412 + }, + { + "epoch": 1.6480000000000001, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6145029067993164, + "eval_runtime": 29.4709, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 412 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 8.625, + "learning_rate": 3.348e-05, + "loss": 0.7188, + "step": 413 + }, + { + "epoch": 1.6520000000000001, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6140013933181763, + "eval_runtime": 29.4909, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 413 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 3.5625, + "learning_rate": 3.344e-05, + "loss": 0.5039, + "step": 414 + }, + { + "epoch": 1.6560000000000001, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6121198534965515, + "eval_runtime": 29.5379, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 414 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 3.5, + "learning_rate": 3.3400000000000005e-05, + "loss": 0.5352, + "step": 415 + }, + { + "epoch": 1.6600000000000001, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6117095351219177, + "eval_runtime": 29.5423, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 415 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 3.125, + "learning_rate": 3.336e-05, + "loss": 0.6172, + "step": 416 + }, + { + "epoch": 1.6640000000000001, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6116756796836853, + "eval_runtime": 29.5134, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 416 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 3.421875, + "learning_rate": 3.332e-05, + "loss": 0.6953, + "step": 417 + }, + { + "epoch": 1.6680000000000001, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6102927327156067, + "eval_runtime": 29.3927, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.143, + "step": 417 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 3.71875, + "learning_rate": 3.328e-05, + "loss": 0.6055, + "step": 418 + }, + { + "epoch": 1.6720000000000002, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.6110751628875732, + "eval_runtime": 29.3832, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 418 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 15.75, + "learning_rate": 3.324e-05, + "loss": 1.0234, + "step": 419 + }, + { + "epoch": 1.6760000000000002, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6104172468185425, + "eval_runtime": 29.343, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 419 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 8.75, + "learning_rate": 3.32e-05, + "loss": 0.5469, + "step": 420 + }, + { + "epoch": 1.6800000000000002, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6094480752944946, + "eval_runtime": 29.315, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.149, + "step": 420 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 4.125, + "learning_rate": 3.316e-05, + "loss": 0.7617, + "step": 421 + }, + { + "epoch": 1.6840000000000002, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6088871955871582, + "eval_runtime": 29.286, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.151, + "step": 421 + }, + { + "epoch": 1.688, + "grad_norm": 8.5, + "learning_rate": 3.312e-05, + "loss": 0.3086, + "step": 422 + }, + { + "epoch": 1.688, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.6085445284843445, + "eval_runtime": 29.2403, + "eval_samples_per_second": 17.031, + "eval_steps_per_second": 2.155, + "step": 422 + }, + { + "epoch": 1.692, + "grad_norm": 4.375, + "learning_rate": 3.308e-05, + "loss": 0.5859, + "step": 423 + }, + { + "epoch": 1.692, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6077581644058228, + "eval_runtime": 29.4151, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.142, + "step": 423 + }, + { + "epoch": 1.696, + "grad_norm": 3.3125, + "learning_rate": 3.304e-05, + "loss": 0.3281, + "step": 424 + }, + { + "epoch": 1.696, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6068767309188843, + "eval_runtime": 29.4578, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 424 + }, + { + "epoch": 1.7, + "grad_norm": 3.578125, + "learning_rate": 3.3e-05, + "loss": 0.4531, + "step": 425 + }, + { + "epoch": 1.7, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6049260497093201, + "eval_runtime": 29.5398, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 425 + }, + { + "epoch": 1.704, + "grad_norm": 4.4375, + "learning_rate": 3.296e-05, + "loss": 0.4609, + "step": 426 + }, + { + "epoch": 1.704, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6041048765182495, + "eval_runtime": 29.5039, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 426 + }, + { + "epoch": 1.708, + "grad_norm": 3.328125, + "learning_rate": 3.292e-05, + "loss": 0.252, + "step": 427 + }, + { + "epoch": 1.708, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6034393906593323, + "eval_runtime": 29.5459, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.132, + "step": 427 + }, + { + "epoch": 1.712, + "grad_norm": 5.0625, + "learning_rate": 3.288e-05, + "loss": 0.582, + "step": 428 + }, + { + "epoch": 1.712, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.603621780872345, + "eval_runtime": 29.4934, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 428 + }, + { + "epoch": 1.716, + "grad_norm": 3.453125, + "learning_rate": 3.2840000000000004e-05, + "loss": 0.2676, + "step": 429 + }, + { + "epoch": 1.716, + "eval_accuracy": 0.8353413654618473, + "eval_loss": 0.6035209894180298, + "eval_runtime": 29.4847, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 429 + }, + { + "epoch": 1.72, + "grad_norm": 4.9375, + "learning_rate": 3.2800000000000004e-05, + "loss": 0.6484, + "step": 430 + }, + { + "epoch": 1.72, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.6033256649971008, + "eval_runtime": 29.451, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 430 + }, + { + "epoch": 1.724, + "grad_norm": 9.5, + "learning_rate": 3.2760000000000005e-05, + "loss": 0.7891, + "step": 431 + }, + { + "epoch": 1.724, + "eval_accuracy": 0.8373493975903614, + "eval_loss": 0.604296088218689, + "eval_runtime": 29.3885, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.144, + "step": 431 + }, + { + "epoch": 1.728, + "grad_norm": 4.5, + "learning_rate": 3.272e-05, + "loss": 0.5, + "step": 432 + }, + { + "epoch": 1.728, + "eval_accuracy": 0.8413654618473896, + "eval_loss": 0.6046375632286072, + "eval_runtime": 29.2944, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 432 + }, + { + "epoch": 1.732, + "grad_norm": 44.25, + "learning_rate": 3.268e-05, + "loss": 0.3086, + "step": 433 + }, + { + "epoch": 1.732, + "eval_accuracy": 0.8393574297188755, + "eval_loss": 0.6066742539405823, + "eval_runtime": 29.3924, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.143, + "step": 433 + }, + { + "epoch": 1.736, + "grad_norm": 4.84375, + "learning_rate": 3.2640000000000006e-05, + "loss": 0.4961, + "step": 434 + }, + { + "epoch": 1.736, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6063256859779358, + "eval_runtime": 29.4573, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 434 + }, + { + "epoch": 1.74, + "grad_norm": 39.0, + "learning_rate": 3.26e-05, + "loss": 0.9375, + "step": 435 + }, + { + "epoch": 1.74, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.6054756045341492, + "eval_runtime": 29.5071, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 435 + }, + { + "epoch": 1.744, + "grad_norm": 7.65625, + "learning_rate": 3.256e-05, + "loss": 0.6289, + "step": 436 + }, + { + "epoch": 1.744, + "eval_accuracy": 0.8554216867469879, + "eval_loss": 0.6073887944221497, + "eval_runtime": 29.5171, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 436 + }, + { + "epoch": 1.748, + "grad_norm": 5.28125, + "learning_rate": 3.252e-05, + "loss": 0.4238, + "step": 437 + }, + { + "epoch": 1.748, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.6079205870628357, + "eval_runtime": 29.5061, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.135, + "step": 437 + }, + { + "epoch": 1.752, + "grad_norm": 5.09375, + "learning_rate": 3.248e-05, + "loss": 0.4004, + "step": 438 + }, + { + "epoch": 1.752, + "eval_accuracy": 0.8554216867469879, + "eval_loss": 0.6067585945129395, + "eval_runtime": 29.5614, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 438 + }, + { + "epoch": 1.756, + "grad_norm": 6.46875, + "learning_rate": 3.244e-05, + "loss": 0.5234, + "step": 439 + }, + { + "epoch": 1.756, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.6056599617004395, + "eval_runtime": 29.5287, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 439 + }, + { + "epoch": 1.76, + "grad_norm": 7.5625, + "learning_rate": 3.24e-05, + "loss": 0.3535, + "step": 440 + }, + { + "epoch": 1.76, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.6048735976219177, + "eval_runtime": 29.4233, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 440 + }, + { + "epoch": 1.764, + "grad_norm": 6.03125, + "learning_rate": 3.236e-05, + "loss": 0.5195, + "step": 441 + }, + { + "epoch": 1.764, + "eval_accuracy": 0.8594377510040161, + "eval_loss": 0.6018902659416199, + "eval_runtime": 29.2849, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.151, + "step": 441 + }, + { + "epoch": 1.768, + "grad_norm": 6.53125, + "learning_rate": 3.232e-05, + "loss": 0.6562, + "step": 442 + }, + { + "epoch": 1.768, + "eval_accuracy": 0.8594377510040161, + "eval_loss": 0.5998172760009766, + "eval_runtime": 29.298, + "eval_samples_per_second": 16.998, + "eval_steps_per_second": 2.15, + "step": 442 + }, + { + "epoch": 1.772, + "grad_norm": 6.90625, + "learning_rate": 3.2279999999999996e-05, + "loss": 0.4395, + "step": 443 + }, + { + "epoch": 1.772, + "eval_accuracy": 0.8594377510040161, + "eval_loss": 0.5983887910842896, + "eval_runtime": 29.29, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 2.151, + "step": 443 + }, + { + "epoch": 1.776, + "grad_norm": 10.25, + "learning_rate": 3.224e-05, + "loss": 0.3828, + "step": 444 + }, + { + "epoch": 1.776, + "eval_accuracy": 0.8554216867469879, + "eval_loss": 0.5973514318466187, + "eval_runtime": 29.4437, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 444 + }, + { + "epoch": 1.78, + "grad_norm": 5.40625, + "learning_rate": 3.2200000000000003e-05, + "loss": 0.1631, + "step": 445 + }, + { + "epoch": 1.78, + "eval_accuracy": 0.8554216867469879, + "eval_loss": 0.5965815186500549, + "eval_runtime": 29.5156, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 445 + }, + { + "epoch": 1.784, + "grad_norm": 14.375, + "learning_rate": 3.2160000000000004e-05, + "loss": 0.8633, + "step": 446 + }, + { + "epoch": 1.784, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.596753716468811, + "eval_runtime": 29.5772, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.13, + "step": 446 + }, + { + "epoch": 1.788, + "grad_norm": 7.65625, + "learning_rate": 3.212e-05, + "loss": 0.5234, + "step": 447 + }, + { + "epoch": 1.788, + "eval_accuracy": 0.8654618473895582, + "eval_loss": 0.5969735980033875, + "eval_runtime": 29.5204, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 447 + }, + { + "epoch": 1.792, + "grad_norm": 13.5, + "learning_rate": 3.208e-05, + "loss": 0.8281, + "step": 448 + }, + { + "epoch": 1.792, + "eval_accuracy": 0.8634538152610441, + "eval_loss": 0.596095860004425, + "eval_runtime": 29.5037, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 448 + }, + { + "epoch": 1.796, + "grad_norm": 7.40625, + "learning_rate": 3.2040000000000005e-05, + "loss": 0.5156, + "step": 449 + }, + { + "epoch": 1.796, + "eval_accuracy": 0.8654618473895582, + "eval_loss": 0.5946374535560608, + "eval_runtime": 29.3957, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.143, + "step": 449 + }, + { + "epoch": 1.8, + "grad_norm": 12.75, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.5586, + "step": 450 + }, + { + "epoch": 1.8, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5936810374259949, + "eval_runtime": 29.3599, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 450 + }, + { + "epoch": 1.804, + "grad_norm": 7.3125, + "learning_rate": 3.196e-05, + "loss": 0.5625, + "step": 451 + }, + { + "epoch": 1.804, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5925204753875732, + "eval_runtime": 29.3463, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.147, + "step": 451 + }, + { + "epoch": 1.808, + "grad_norm": 19.25, + "learning_rate": 3.192e-05, + "loss": 1.2969, + "step": 452 + }, + { + "epoch": 1.808, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5897141098976135, + "eval_runtime": 29.311, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 452 + }, + { + "epoch": 1.812, + "grad_norm": 9.75, + "learning_rate": 3.188e-05, + "loss": 0.2793, + "step": 453 + }, + { + "epoch": 1.812, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5871118903160095, + "eval_runtime": 29.4385, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.14, + "step": 453 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 14.875, + "learning_rate": 3.184e-05, + "loss": 0.9648, + "step": 454 + }, + { + "epoch": 1.8159999999999998, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5850282311439514, + "eval_runtime": 29.5016, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 454 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 7.78125, + "learning_rate": 3.18e-05, + "loss": 0.5195, + "step": 455 + }, + { + "epoch": 1.8199999999999998, + "eval_accuracy": 0.8734939759036144, + "eval_loss": 0.5830850005149841, + "eval_runtime": 29.57, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 455 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 24.375, + "learning_rate": 3.176e-05, + "loss": 1.2891, + "step": 456 + }, + { + "epoch": 1.8239999999999998, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5791831612586975, + "eval_runtime": 29.5658, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.131, + "step": 456 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 19.75, + "learning_rate": 3.172e-05, + "loss": 1.0312, + "step": 457 + }, + { + "epoch": 1.8279999999999998, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5776224732398987, + "eval_runtime": 29.5602, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 457 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 8.1875, + "learning_rate": 3.168e-05, + "loss": 0.2871, + "step": 458 + }, + { + "epoch": 1.8319999999999999, + "eval_accuracy": 0.8775100401606426, + "eval_loss": 0.5767478942871094, + "eval_runtime": 29.426, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.141, + "step": 458 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 6.1875, + "learning_rate": 3.164e-05, + "loss": 0.4258, + "step": 459 + }, + { + "epoch": 1.8359999999999999, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.574334442615509, + "eval_runtime": 29.3709, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 459 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 10.5, + "learning_rate": 3.16e-05, + "loss": 0.5391, + "step": 460 + }, + { + "epoch": 1.8399999999999999, + "eval_accuracy": 0.8755020080321285, + "eval_loss": 0.5736115574836731, + "eval_runtime": 29.3214, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 460 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 25.875, + "learning_rate": 3.156e-05, + "loss": 0.875, + "step": 461 + }, + { + "epoch": 1.8439999999999999, + "eval_accuracy": 0.8775100401606426, + "eval_loss": 0.5710091590881348, + "eval_runtime": 29.2403, + "eval_samples_per_second": 17.031, + "eval_steps_per_second": 2.155, + "step": 461 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 27.75, + "learning_rate": 3.1519999999999996e-05, + "loss": 0.8359, + "step": 462 + }, + { + "epoch": 1.8479999999999999, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5670624375343323, + "eval_runtime": 29.3544, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.146, + "step": 462 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 5.40625, + "learning_rate": 3.1480000000000004e-05, + "loss": 0.2734, + "step": 463 + }, + { + "epoch": 1.8519999999999999, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5626578330993652, + "eval_runtime": 29.4835, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 463 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 11.375, + "learning_rate": 3.1440000000000004e-05, + "loss": 0.6797, + "step": 464 + }, + { + "epoch": 1.8559999999999999, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5601122975349426, + "eval_runtime": 29.5901, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.129, + "step": 464 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 9.75, + "learning_rate": 3.1400000000000004e-05, + "loss": 0.7266, + "step": 465 + }, + { + "epoch": 1.8599999999999999, + "eval_accuracy": 0.8614457831325302, + "eval_loss": 0.5589438080787659, + "eval_runtime": 29.5523, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 465 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 9.625, + "learning_rate": 3.136e-05, + "loss": 0.6836, + "step": 466 + }, + { + "epoch": 1.8639999999999999, + "eval_accuracy": 0.8674698795180723, + "eval_loss": 0.557289719581604, + "eval_runtime": 29.5897, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.129, + "step": 466 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 8.625, + "learning_rate": 3.132e-05, + "loss": 0.5352, + "step": 467 + }, + { + "epoch": 1.8679999999999999, + "eval_accuracy": 0.8614457831325302, + "eval_loss": 0.5569063425064087, + "eval_runtime": 29.5241, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 467 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 25.0, + "learning_rate": 3.1280000000000005e-05, + "loss": 1.0312, + "step": 468 + }, + { + "epoch": 1.8719999999999999, + "eval_accuracy": 0.857429718875502, + "eval_loss": 0.5564112067222595, + "eval_runtime": 29.4582, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 468 + }, + { + "epoch": 1.876, + "grad_norm": 25.5, + "learning_rate": 3.1240000000000006e-05, + "loss": 0.7773, + "step": 469 + }, + { + "epoch": 1.876, + "eval_accuracy": 0.8594377510040161, + "eval_loss": 0.5570720434188843, + "eval_runtime": 29.4153, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.142, + "step": 469 + }, + { + "epoch": 1.88, + "grad_norm": 14.625, + "learning_rate": 3.12e-05, + "loss": 0.4648, + "step": 470 + }, + { + "epoch": 1.88, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.557974100112915, + "eval_runtime": 29.4915, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 470 + }, + { + "epoch": 1.884, + "grad_norm": 11.3125, + "learning_rate": 3.116e-05, + "loss": 0.7539, + "step": 471 + }, + { + "epoch": 1.884, + "eval_accuracy": 0.8734939759036144, + "eval_loss": 0.5587559938430786, + "eval_runtime": 29.51, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.135, + "step": 471 + }, + { + "epoch": 1.888, + "grad_norm": 6.9375, + "learning_rate": 3.112e-05, + "loss": 0.3418, + "step": 472 + }, + { + "epoch": 1.888, + "eval_accuracy": 0.8775100401606426, + "eval_loss": 0.5596703290939331, + "eval_runtime": 29.5597, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 472 + }, + { + "epoch": 1.892, + "grad_norm": 6.46875, + "learning_rate": 3.108e-05, + "loss": 0.2832, + "step": 473 + }, + { + "epoch": 1.892, + "eval_accuracy": 0.8755020080321285, + "eval_loss": 0.5594918727874756, + "eval_runtime": 29.5141, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 473 + }, + { + "epoch": 1.896, + "grad_norm": 6.59375, + "learning_rate": 3.104e-05, + "loss": 0.5039, + "step": 474 + }, + { + "epoch": 1.896, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5598622560501099, + "eval_runtime": 29.5021, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 474 + }, + { + "epoch": 1.9, + "grad_norm": 9.5, + "learning_rate": 3.1e-05, + "loss": 0.5195, + "step": 475 + }, + { + "epoch": 1.9, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5618553161621094, + "eval_runtime": 29.4423, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 475 + }, + { + "epoch": 1.904, + "grad_norm": 5.3125, + "learning_rate": 3.096e-05, + "loss": 0.4531, + "step": 476 + }, + { + "epoch": 1.904, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.564973771572113, + "eval_runtime": 29.5021, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 476 + }, + { + "epoch": 1.908, + "grad_norm": 5.0625, + "learning_rate": 3.092e-05, + "loss": 0.5312, + "step": 477 + }, + { + "epoch": 1.908, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5684834122657776, + "eval_runtime": 29.5109, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 477 + }, + { + "epoch": 1.912, + "grad_norm": 9.6875, + "learning_rate": 3.088e-05, + "loss": 0.3281, + "step": 478 + }, + { + "epoch": 1.912, + "eval_accuracy": 0.8775100401606426, + "eval_loss": 0.571491539478302, + "eval_runtime": 29.4995, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 478 + }, + { + "epoch": 1.916, + "grad_norm": 6.65625, + "learning_rate": 3.084e-05, + "loss": 0.9062, + "step": 479 + }, + { + "epoch": 1.916, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5731853246688843, + "eval_runtime": 29.5276, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 479 + }, + { + "epoch": 1.92, + "grad_norm": 6.0625, + "learning_rate": 3.08e-05, + "loss": 0.5625, + "step": 480 + }, + { + "epoch": 1.92, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5743445158004761, + "eval_runtime": 29.4946, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.136, + "step": 480 + }, + { + "epoch": 1.924, + "grad_norm": 4.78125, + "learning_rate": 3.076e-05, + "loss": 0.6328, + "step": 481 + }, + { + "epoch": 1.924, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.5752830505371094, + "eval_runtime": 29.4006, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 481 + }, + { + "epoch": 1.928, + "grad_norm": 3.421875, + "learning_rate": 3.072e-05, + "loss": 0.1934, + "step": 482 + }, + { + "epoch": 1.928, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5757492780685425, + "eval_runtime": 29.3542, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.146, + "step": 482 + }, + { + "epoch": 1.932, + "grad_norm": 3.8125, + "learning_rate": 3.0680000000000004e-05, + "loss": 0.4941, + "step": 483 + }, + { + "epoch": 1.932, + "eval_accuracy": 0.8775100401606426, + "eval_loss": 0.5761831402778625, + "eval_runtime": 29.4519, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 483 + }, + { + "epoch": 1.936, + "grad_norm": 3.765625, + "learning_rate": 3.0640000000000005e-05, + "loss": 0.4551, + "step": 484 + }, + { + "epoch": 1.936, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5744856595993042, + "eval_runtime": 29.4918, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 484 + }, + { + "epoch": 1.94, + "grad_norm": 7.0, + "learning_rate": 3.06e-05, + "loss": 0.4648, + "step": 485 + }, + { + "epoch": 1.94, + "eval_accuracy": 0.8674698795180723, + "eval_loss": 0.5740101337432861, + "eval_runtime": 29.4878, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 485 + }, + { + "epoch": 1.944, + "grad_norm": 5.0625, + "learning_rate": 3.056e-05, + "loss": 0.498, + "step": 486 + }, + { + "epoch": 1.944, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5711672306060791, + "eval_runtime": 29.5013, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 486 + }, + { + "epoch": 1.948, + "grad_norm": 4.59375, + "learning_rate": 3.0520000000000006e-05, + "loss": 0.5156, + "step": 487 + }, + { + "epoch": 1.948, + "eval_accuracy": 0.8734939759036144, + "eval_loss": 0.5706632733345032, + "eval_runtime": 29.5486, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.132, + "step": 487 + }, + { + "epoch": 1.952, + "grad_norm": 4.6875, + "learning_rate": 3.0480000000000003e-05, + "loss": 0.3516, + "step": 488 + }, + { + "epoch": 1.952, + "eval_accuracy": 0.8755020080321285, + "eval_loss": 0.5681500434875488, + "eval_runtime": 29.4288, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.141, + "step": 488 + }, + { + "epoch": 1.956, + "grad_norm": 6.65625, + "learning_rate": 3.0440000000000003e-05, + "loss": 0.6289, + "step": 489 + }, + { + "epoch": 1.956, + "eval_accuracy": 0.8755020080321285, + "eval_loss": 0.5638794302940369, + "eval_runtime": 29.379, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 489 + }, + { + "epoch": 1.96, + "grad_norm": 6.03125, + "learning_rate": 3.04e-05, + "loss": 0.3848, + "step": 490 + }, + { + "epoch": 1.96, + "eval_accuracy": 0.8634538152610441, + "eval_loss": 0.5608639121055603, + "eval_runtime": 29.3422, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 490 + }, + { + "epoch": 1.964, + "grad_norm": 5.46875, + "learning_rate": 3.036e-05, + "loss": 0.5664, + "step": 491 + }, + { + "epoch": 1.964, + "eval_accuracy": 0.8734939759036144, + "eval_loss": 0.5565158128738403, + "eval_runtime": 29.3736, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.145, + "step": 491 + }, + { + "epoch": 1.968, + "grad_norm": 5.125, + "learning_rate": 3.0320000000000004e-05, + "loss": 0.4375, + "step": 492 + }, + { + "epoch": 1.968, + "eval_accuracy": 0.8674698795180723, + "eval_loss": 0.5503478646278381, + "eval_runtime": 29.364, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.145, + "step": 492 + }, + { + "epoch": 1.972, + "grad_norm": 8.125, + "learning_rate": 3.028e-05, + "loss": 0.582, + "step": 493 + }, + { + "epoch": 1.972, + "eval_accuracy": 0.8674698795180723, + "eval_loss": 0.5465025305747986, + "eval_runtime": 29.3004, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.15, + "step": 493 + }, + { + "epoch": 1.976, + "grad_norm": 24.625, + "learning_rate": 3.0240000000000002e-05, + "loss": 0.8398, + "step": 494 + }, + { + "epoch": 1.976, + "eval_accuracy": 0.857429718875502, + "eval_loss": 0.544101357460022, + "eval_runtime": 29.2728, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.152, + "step": 494 + }, + { + "epoch": 1.98, + "grad_norm": 5.9375, + "learning_rate": 3.02e-05, + "loss": 0.5039, + "step": 495 + }, + { + "epoch": 1.98, + "eval_accuracy": 0.857429718875502, + "eval_loss": 0.5431907176971436, + "eval_runtime": 29.3319, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 495 + }, + { + "epoch": 1.984, + "grad_norm": 7.59375, + "learning_rate": 3.016e-05, + "loss": 0.1455, + "step": 496 + }, + { + "epoch": 1.984, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.5421074032783508, + "eval_runtime": 29.2709, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 496 + }, + { + "epoch": 1.988, + "grad_norm": 11.75, + "learning_rate": 3.0120000000000003e-05, + "loss": 0.6094, + "step": 497 + }, + { + "epoch": 1.988, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.542059600353241, + "eval_runtime": 29.3183, + "eval_samples_per_second": 16.986, + "eval_steps_per_second": 2.149, + "step": 497 + }, + { + "epoch": 1.992, + "grad_norm": 10.9375, + "learning_rate": 3.0080000000000003e-05, + "loss": 0.3711, + "step": 498 + }, + { + "epoch": 1.992, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.5440202355384827, + "eval_runtime": 29.3151, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.149, + "step": 498 + }, + { + "epoch": 1.996, + "grad_norm": 11.9375, + "learning_rate": 3.004e-05, + "loss": 0.3223, + "step": 499 + }, + { + "epoch": 1.996, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.5460752248764038, + "eval_runtime": 29.2676, + "eval_samples_per_second": 17.015, + "eval_steps_per_second": 2.153, + "step": 499 + }, + { + "epoch": 2.0, + "grad_norm": 56.0, + "learning_rate": 3e-05, + "loss": 1.75, + "step": 500 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.5491499900817871, + "eval_runtime": 29.2586, + "eval_samples_per_second": 17.021, + "eval_steps_per_second": 2.153, + "step": 500 + }, + { + "epoch": 2.004, + "grad_norm": 5.8125, + "learning_rate": 2.9959999999999998e-05, + "loss": 0.1689, + "step": 501 + }, + { + "epoch": 2.004, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.5482244491577148, + "eval_runtime": 29.2555, + "eval_samples_per_second": 17.022, + "eval_steps_per_second": 2.153, + "step": 501 + }, + { + "epoch": 2.008, + "grad_norm": 10.125, + "learning_rate": 2.9920000000000005e-05, + "loss": 0.2773, + "step": 502 + }, + { + "epoch": 2.008, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.547487199306488, + "eval_runtime": 29.3055, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 502 + }, + { + "epoch": 2.012, + "grad_norm": 10.375, + "learning_rate": 2.9880000000000002e-05, + "loss": 0.1895, + "step": 503 + }, + { + "epoch": 2.012, + "eval_accuracy": 0.8493975903614458, + "eval_loss": 0.5485382080078125, + "eval_runtime": 29.4771, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 503 + }, + { + "epoch": 2.016, + "grad_norm": 22.25, + "learning_rate": 2.9840000000000002e-05, + "loss": 0.5156, + "step": 504 + }, + { + "epoch": 2.016, + "eval_accuracy": 0.8433734939759037, + "eval_loss": 0.5505305528640747, + "eval_runtime": 29.4934, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 504 + }, + { + "epoch": 2.02, + "grad_norm": 7.0, + "learning_rate": 2.98e-05, + "loss": 0.1523, + "step": 505 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.5501540899276733, + "eval_runtime": 29.5569, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.131, + "step": 505 + }, + { + "epoch": 2.024, + "grad_norm": 10.0, + "learning_rate": 2.976e-05, + "loss": 0.2168, + "step": 506 + }, + { + "epoch": 2.024, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.5502482056617737, + "eval_runtime": 29.5712, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.13, + "step": 506 + }, + { + "epoch": 2.028, + "grad_norm": 2.484375, + "learning_rate": 2.9720000000000003e-05, + "loss": 0.0459, + "step": 507 + }, + { + "epoch": 2.028, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.5531346797943115, + "eval_runtime": 29.524, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 507 + }, + { + "epoch": 2.032, + "grad_norm": 21.875, + "learning_rate": 2.9680000000000004e-05, + "loss": 0.7188, + "step": 508 + }, + { + "epoch": 2.032, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.5537151098251343, + "eval_runtime": 29.4789, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.137, + "step": 508 + }, + { + "epoch": 2.036, + "grad_norm": 11.5, + "learning_rate": 2.964e-05, + "loss": 0.377, + "step": 509 + }, + { + "epoch": 2.036, + "eval_accuracy": 0.8453815261044176, + "eval_loss": 0.5540758967399597, + "eval_runtime": 29.3813, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 509 + }, + { + "epoch": 2.04, + "grad_norm": 9.25, + "learning_rate": 2.96e-05, + "loss": 0.4355, + "step": 510 + }, + { + "epoch": 2.04, + "eval_accuracy": 0.8473895582329317, + "eval_loss": 0.5528837442398071, + "eval_runtime": 29.3136, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 510 + }, + { + "epoch": 2.044, + "grad_norm": 33.0, + "learning_rate": 2.9559999999999998e-05, + "loss": 0.957, + "step": 511 + }, + { + "epoch": 2.044, + "eval_accuracy": 0.8514056224899599, + "eval_loss": 0.5505934953689575, + "eval_runtime": 29.2739, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.152, + "step": 511 + }, + { + "epoch": 2.048, + "grad_norm": 7.90625, + "learning_rate": 2.9520000000000002e-05, + "loss": 0.3926, + "step": 512 + }, + { + "epoch": 2.048, + "eval_accuracy": 0.8554216867469879, + "eval_loss": 0.5476288199424744, + "eval_runtime": 29.2445, + "eval_samples_per_second": 17.029, + "eval_steps_per_second": 2.154, + "step": 512 + }, + { + "epoch": 2.052, + "grad_norm": 14.125, + "learning_rate": 2.9480000000000002e-05, + "loss": 0.7773, + "step": 513 + }, + { + "epoch": 2.052, + "eval_accuracy": 0.857429718875502, + "eval_loss": 0.5464844107627869, + "eval_runtime": 29.3495, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 513 + }, + { + "epoch": 2.056, + "grad_norm": 14.625, + "learning_rate": 2.944e-05, + "loss": 0.4629, + "step": 514 + }, + { + "epoch": 2.056, + "eval_accuracy": 0.8634538152610441, + "eval_loss": 0.5462808012962341, + "eval_runtime": 29.4769, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.137, + "step": 514 + }, + { + "epoch": 2.06, + "grad_norm": 21.375, + "learning_rate": 2.94e-05, + "loss": 0.3047, + "step": 515 + }, + { + "epoch": 2.06, + "eval_accuracy": 0.8614457831325302, + "eval_loss": 0.5446965098381042, + "eval_runtime": 29.5477, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.132, + "step": 515 + }, + { + "epoch": 2.064, + "grad_norm": 9.9375, + "learning_rate": 2.9360000000000003e-05, + "loss": 0.6914, + "step": 516 + }, + { + "epoch": 2.064, + "eval_accuracy": 0.8594377510040161, + "eval_loss": 0.5427827835083008, + "eval_runtime": 29.5097, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.135, + "step": 516 + }, + { + "epoch": 2.068, + "grad_norm": 32.0, + "learning_rate": 2.9320000000000004e-05, + "loss": 1.0391, + "step": 517 + }, + { + "epoch": 2.068, + "eval_accuracy": 0.8694779116465864, + "eval_loss": 0.5410264134407043, + "eval_runtime": 29.5008, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 517 + }, + { + "epoch": 2.072, + "grad_norm": 35.5, + "learning_rate": 2.928e-05, + "loss": 1.2656, + "step": 518 + }, + { + "epoch": 2.072, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5422987937927246, + "eval_runtime": 29.5442, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.132, + "step": 518 + }, + { + "epoch": 2.076, + "grad_norm": 8.6875, + "learning_rate": 2.924e-05, + "loss": 0.4531, + "step": 519 + }, + { + "epoch": 2.076, + "eval_accuracy": 0.8775100401606426, + "eval_loss": 0.5431312918663025, + "eval_runtime": 29.4777, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 519 + }, + { + "epoch": 2.08, + "grad_norm": 3.9375, + "learning_rate": 2.9199999999999998e-05, + "loss": 0.2734, + "step": 520 + }, + { + "epoch": 2.08, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5426461100578308, + "eval_runtime": 29.4389, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 520 + }, + { + "epoch": 2.084, + "grad_norm": 4.96875, + "learning_rate": 2.9160000000000005e-05, + "loss": 0.5586, + "step": 521 + }, + { + "epoch": 2.084, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.5422388315200806, + "eval_runtime": 29.3021, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 521 + }, + { + "epoch": 2.088, + "grad_norm": 5.0, + "learning_rate": 2.9120000000000002e-05, + "loss": 0.332, + "step": 522 + }, + { + "epoch": 2.088, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.5421609282493591, + "eval_runtime": 29.2555, + "eval_samples_per_second": 17.022, + "eval_steps_per_second": 2.153, + "step": 522 + }, + { + "epoch": 2.092, + "grad_norm": 6.34375, + "learning_rate": 2.9080000000000003e-05, + "loss": 0.5547, + "step": 523 + }, + { + "epoch": 2.092, + "eval_accuracy": 0.8895582329317269, + "eval_loss": 0.5419729948043823, + "eval_runtime": 29.345, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.147, + "step": 523 + }, + { + "epoch": 2.096, + "grad_norm": 5.34375, + "learning_rate": 2.904e-05, + "loss": 0.6445, + "step": 524 + }, + { + "epoch": 2.096, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5431819558143616, + "eval_runtime": 29.4432, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 524 + }, + { + "epoch": 2.1, + "grad_norm": 10.4375, + "learning_rate": 2.9e-05, + "loss": 0.6914, + "step": 525 + }, + { + "epoch": 2.1, + "eval_accuracy": 0.893574297188755, + "eval_loss": 0.5430887341499329, + "eval_runtime": 29.5046, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 525 + }, + { + "epoch": 2.104, + "grad_norm": 4.625, + "learning_rate": 2.8960000000000004e-05, + "loss": 0.5312, + "step": 526 + }, + { + "epoch": 2.104, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5440460443496704, + "eval_runtime": 29.5664, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.131, + "step": 526 + }, + { + "epoch": 2.108, + "grad_norm": 21.625, + "learning_rate": 2.8920000000000004e-05, + "loss": 1.1484, + "step": 527 + }, + { + "epoch": 2.108, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5430581569671631, + "eval_runtime": 29.5114, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 527 + }, + { + "epoch": 2.112, + "grad_norm": 7.78125, + "learning_rate": 2.888e-05, + "loss": 0.6992, + "step": 528 + }, + { + "epoch": 2.112, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5427139401435852, + "eval_runtime": 29.5094, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.135, + "step": 528 + }, + { + "epoch": 2.116, + "grad_norm": 4.4375, + "learning_rate": 2.8840000000000002e-05, + "loss": 0.4082, + "step": 529 + }, + { + "epoch": 2.116, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5439698696136475, + "eval_runtime": 29.3766, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 529 + }, + { + "epoch": 2.12, + "grad_norm": 14.6875, + "learning_rate": 2.88e-05, + "loss": 0.2949, + "step": 530 + }, + { + "epoch": 2.12, + "eval_accuracy": 0.9016064257028112, + "eval_loss": 0.5445825457572937, + "eval_runtime": 29.3212, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 530 + }, + { + "epoch": 2.124, + "grad_norm": 4.40625, + "learning_rate": 2.8760000000000002e-05, + "loss": 0.4844, + "step": 531 + }, + { + "epoch": 2.124, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5437197685241699, + "eval_runtime": 29.3503, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.146, + "step": 531 + }, + { + "epoch": 2.128, + "grad_norm": 6.09375, + "learning_rate": 2.8720000000000003e-05, + "loss": 0.4512, + "step": 532 + }, + { + "epoch": 2.128, + "eval_accuracy": 0.9016064257028112, + "eval_loss": 0.5439391732215881, + "eval_runtime": 29.2726, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 532 + }, + { + "epoch": 2.132, + "grad_norm": 5.71875, + "learning_rate": 2.868e-05, + "loss": 0.5586, + "step": 533 + }, + { + "epoch": 2.132, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5430291891098022, + "eval_runtime": 29.3199, + "eval_samples_per_second": 16.985, + "eval_steps_per_second": 2.149, + "step": 533 + }, + { + "epoch": 2.136, + "grad_norm": 5.28125, + "learning_rate": 2.864e-05, + "loss": 0.6211, + "step": 534 + }, + { + "epoch": 2.136, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.541820228099823, + "eval_runtime": 29.4479, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.139, + "step": 534 + }, + { + "epoch": 2.14, + "grad_norm": 5.09375, + "learning_rate": 2.86e-05, + "loss": 0.5273, + "step": 535 + }, + { + "epoch": 2.14, + "eval_accuracy": 0.9016064257028112, + "eval_loss": 0.539638876914978, + "eval_runtime": 29.5379, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 535 + }, + { + "epoch": 2.144, + "grad_norm": 4.625, + "learning_rate": 2.8560000000000004e-05, + "loss": 0.3633, + "step": 536 + }, + { + "epoch": 2.144, + "eval_accuracy": 0.9016064257028112, + "eval_loss": 0.5370483994483948, + "eval_runtime": 29.4958, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.136, + "step": 536 + }, + { + "epoch": 2.148, + "grad_norm": 21.375, + "learning_rate": 2.852e-05, + "loss": 0.7891, + "step": 537 + }, + { + "epoch": 2.148, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5342397689819336, + "eval_runtime": 29.4912, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 537 + }, + { + "epoch": 2.152, + "grad_norm": 4.78125, + "learning_rate": 2.8480000000000002e-05, + "loss": 0.332, + "step": 538 + }, + { + "epoch": 2.152, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5307087302207947, + "eval_runtime": 29.4466, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.139, + "step": 538 + }, + { + "epoch": 2.156, + "grad_norm": 7.59375, + "learning_rate": 2.844e-05, + "loss": 0.543, + "step": 539 + }, + { + "epoch": 2.156, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5306609869003296, + "eval_runtime": 29.3395, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 539 + }, + { + "epoch": 2.16, + "grad_norm": 6.40625, + "learning_rate": 2.84e-05, + "loss": 0.4863, + "step": 540 + }, + { + "epoch": 2.16, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5283539295196533, + "eval_runtime": 29.2729, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.152, + "step": 540 + }, + { + "epoch": 2.164, + "grad_norm": 5.53125, + "learning_rate": 2.8360000000000003e-05, + "loss": 0.5938, + "step": 541 + }, + { + "epoch": 2.164, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5278202295303345, + "eval_runtime": 29.2883, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.151, + "step": 541 + }, + { + "epoch": 2.168, + "grad_norm": 11.0625, + "learning_rate": 2.8320000000000003e-05, + "loss": 0.5781, + "step": 542 + }, + { + "epoch": 2.168, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5269098877906799, + "eval_runtime": 29.2726, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 542 + }, + { + "epoch": 2.172, + "grad_norm": 7.09375, + "learning_rate": 2.828e-05, + "loss": 0.4707, + "step": 543 + }, + { + "epoch": 2.172, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5270978808403015, + "eval_runtime": 29.4844, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 543 + }, + { + "epoch": 2.176, + "grad_norm": 24.0, + "learning_rate": 2.824e-05, + "loss": 0.6719, + "step": 544 + }, + { + "epoch": 2.176, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5265326499938965, + "eval_runtime": 29.4941, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 544 + }, + { + "epoch": 2.18, + "grad_norm": 5.625, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.4844, + "step": 545 + }, + { + "epoch": 2.18, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5279444456100464, + "eval_runtime": 29.5079, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 545 + }, + { + "epoch": 2.184, + "grad_norm": 6.46875, + "learning_rate": 2.816e-05, + "loss": 0.3789, + "step": 546 + }, + { + "epoch": 2.184, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5268769860267639, + "eval_runtime": 29.5533, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 546 + }, + { + "epoch": 2.188, + "grad_norm": 3.078125, + "learning_rate": 2.8120000000000002e-05, + "loss": 0.1924, + "step": 547 + }, + { + "epoch": 2.188, + "eval_accuracy": 0.8895582329317269, + "eval_loss": 0.5257623791694641, + "eval_runtime": 29.4574, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 547 + }, + { + "epoch": 2.192, + "grad_norm": 6.15625, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.3945, + "step": 548 + }, + { + "epoch": 2.192, + "eval_accuracy": 0.893574297188755, + "eval_loss": 0.5240671038627625, + "eval_runtime": 29.3521, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.146, + "step": 548 + }, + { + "epoch": 2.196, + "grad_norm": 13.4375, + "learning_rate": 2.804e-05, + "loss": 0.3379, + "step": 549 + }, + { + "epoch": 2.196, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5238152742385864, + "eval_runtime": 29.4101, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.142, + "step": 549 + }, + { + "epoch": 2.2, + "grad_norm": 7.53125, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.5273, + "step": 550 + }, + { + "epoch": 2.2, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5237206220626831, + "eval_runtime": 29.4628, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.138, + "step": 550 + }, + { + "epoch": 2.204, + "grad_norm": 18.125, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.3438, + "step": 551 + }, + { + "epoch": 2.204, + "eval_accuracy": 0.8855421686746988, + "eval_loss": 0.5233593583106995, + "eval_runtime": 29.2968, + "eval_samples_per_second": 16.998, + "eval_steps_per_second": 2.15, + "step": 551 + }, + { + "epoch": 2.208, + "grad_norm": 6.78125, + "learning_rate": 2.792e-05, + "loss": 0.0654, + "step": 552 + }, + { + "epoch": 2.208, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.529853880405426, + "eval_runtime": 29.301, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.15, + "step": 552 + }, + { + "epoch": 2.212, + "grad_norm": 10.0, + "learning_rate": 2.788e-05, + "loss": 0.3906, + "step": 553 + }, + { + "epoch": 2.212, + "eval_accuracy": 0.8855421686746988, + "eval_loss": 0.5395488142967224, + "eval_runtime": 29.4737, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.137, + "step": 553 + }, + { + "epoch": 2.216, + "grad_norm": 6.125, + "learning_rate": 2.7839999999999998e-05, + "loss": 0.1934, + "step": 554 + }, + { + "epoch": 2.216, + "eval_accuracy": 0.8734939759036144, + "eval_loss": 0.552130401134491, + "eval_runtime": 29.5877, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 554 + }, + { + "epoch": 2.22, + "grad_norm": 48.5, + "learning_rate": 2.7800000000000005e-05, + "loss": 0.8867, + "step": 555 + }, + { + "epoch": 2.22, + "eval_accuracy": 0.8674698795180723, + "eval_loss": 0.5590487718582153, + "eval_runtime": 29.5846, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.129, + "step": 555 + }, + { + "epoch": 2.224, + "grad_norm": 13.875, + "learning_rate": 2.7760000000000002e-05, + "loss": 0.2734, + "step": 556 + }, + { + "epoch": 2.224, + "eval_accuracy": 0.8634538152610441, + "eval_loss": 0.5597860813140869, + "eval_runtime": 29.5568, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.131, + "step": 556 + }, + { + "epoch": 2.228, + "grad_norm": 30.25, + "learning_rate": 2.7720000000000002e-05, + "loss": 0.5352, + "step": 557 + }, + { + "epoch": 2.228, + "eval_accuracy": 0.8614457831325302, + "eval_loss": 0.5617784261703491, + "eval_runtime": 29.4276, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 557 + }, + { + "epoch": 2.232, + "grad_norm": 71.5, + "learning_rate": 2.768e-05, + "loss": 1.2891, + "step": 558 + }, + { + "epoch": 2.232, + "eval_accuracy": 0.8755020080321285, + "eval_loss": 0.5553778409957886, + "eval_runtime": 29.3593, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 558 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 26.25, + "learning_rate": 2.764e-05, + "loss": 0.5273, + "step": 559 + }, + { + "epoch": 2.2359999999999998, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5490242838859558, + "eval_runtime": 29.3208, + "eval_samples_per_second": 16.985, + "eval_steps_per_second": 2.149, + "step": 559 + }, + { + "epoch": 2.24, + "grad_norm": 15.5, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.3652, + "step": 560 + }, + { + "epoch": 2.24, + "eval_accuracy": 0.8855421686746988, + "eval_loss": 0.5460122227668762, + "eval_runtime": 29.294, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 560 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 24.0, + "learning_rate": 2.7560000000000004e-05, + "loss": 0.7305, + "step": 561 + }, + { + "epoch": 2.2439999999999998, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5451180934906006, + "eval_runtime": 29.2678, + "eval_samples_per_second": 17.015, + "eval_steps_per_second": 2.153, + "step": 561 + }, + { + "epoch": 2.248, + "grad_norm": 86.0, + "learning_rate": 2.752e-05, + "loss": 1.7344, + "step": 562 + }, + { + "epoch": 2.248, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.5494793653488159, + "eval_runtime": 29.3111, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 562 + }, + { + "epoch": 2.252, + "grad_norm": 46.0, + "learning_rate": 2.748e-05, + "loss": 1.25, + "step": 563 + }, + { + "epoch": 2.252, + "eval_accuracy": 0.8714859437751004, + "eval_loss": 0.5555666089057922, + "eval_runtime": 29.3224, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 563 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 18.625, + "learning_rate": 2.7439999999999998e-05, + "loss": 0.4277, + "step": 564 + }, + { + "epoch": 2.2560000000000002, + "eval_accuracy": 0.8674698795180723, + "eval_loss": 0.5525238513946533, + "eval_runtime": 29.3945, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.143, + "step": 564 + }, + { + "epoch": 2.26, + "grad_norm": 17.625, + "learning_rate": 2.7400000000000002e-05, + "loss": 0.6133, + "step": 565 + }, + { + "epoch": 2.26, + "eval_accuracy": 0.8694779116465864, + "eval_loss": 0.5513808131217957, + "eval_runtime": 29.471, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 565 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 19.875, + "learning_rate": 2.7360000000000002e-05, + "loss": 0.5586, + "step": 566 + }, + { + "epoch": 2.2640000000000002, + "eval_accuracy": 0.8694779116465864, + "eval_loss": 0.5439775586128235, + "eval_runtime": 29.5008, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 566 + }, + { + "epoch": 2.268, + "grad_norm": 26.0, + "learning_rate": 2.7320000000000003e-05, + "loss": 0.7852, + "step": 567 + }, + { + "epoch": 2.268, + "eval_accuracy": 0.8694779116465864, + "eval_loss": 0.538253128528595, + "eval_runtime": 29.5619, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 567 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 9.75, + "learning_rate": 2.728e-05, + "loss": 0.1904, + "step": 568 + }, + { + "epoch": 2.2720000000000002, + "eval_accuracy": 0.8734939759036144, + "eval_loss": 0.5339057445526123, + "eval_runtime": 29.523, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 568 + }, + { + "epoch": 2.276, + "grad_norm": 23.875, + "learning_rate": 2.724e-05, + "loss": 0.5898, + "step": 569 + }, + { + "epoch": 2.276, + "eval_accuracy": 0.8795180722891566, + "eval_loss": 0.5264682173728943, + "eval_runtime": 29.5833, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 569 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 40.5, + "learning_rate": 2.7200000000000004e-05, + "loss": 0.8789, + "step": 570 + }, + { + "epoch": 2.2800000000000002, + "eval_accuracy": 0.8855421686746988, + "eval_loss": 0.5228438973426819, + "eval_runtime": 29.5723, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.13, + "step": 570 + }, + { + "epoch": 2.284, + "grad_norm": 11.625, + "learning_rate": 2.716e-05, + "loss": 0.5938, + "step": 571 + }, + { + "epoch": 2.284, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.5181211233139038, + "eval_runtime": 29.471, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 571 + }, + { + "epoch": 2.288, + "grad_norm": 14.75, + "learning_rate": 2.712e-05, + "loss": 0.5391, + "step": 572 + }, + { + "epoch": 2.288, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5149523019790649, + "eval_runtime": 29.3328, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 572 + }, + { + "epoch": 2.292, + "grad_norm": 13.8125, + "learning_rate": 2.7079999999999998e-05, + "loss": 0.5859, + "step": 573 + }, + { + "epoch": 2.292, + "eval_accuracy": 0.8835341365461847, + "eval_loss": 0.5128970742225647, + "eval_runtime": 29.2775, + "eval_samples_per_second": 17.01, + "eval_steps_per_second": 2.152, + "step": 573 + }, + { + "epoch": 2.296, + "grad_norm": 27.25, + "learning_rate": 2.704e-05, + "loss": 1.0234, + "step": 574 + }, + { + "epoch": 2.296, + "eval_accuracy": 0.8895582329317269, + "eval_loss": 0.5105754733085632, + "eval_runtime": 29.3673, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.145, + "step": 574 + }, + { + "epoch": 2.3, + "grad_norm": 12.25, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.2891, + "step": 575 + }, + { + "epoch": 2.3, + "eval_accuracy": 0.8895582329317269, + "eval_loss": 0.5097286701202393, + "eval_runtime": 29.439, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 575 + }, + { + "epoch": 2.304, + "grad_norm": 13.125, + "learning_rate": 2.6960000000000003e-05, + "loss": 0.6133, + "step": 576 + }, + { + "epoch": 2.304, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5089132785797119, + "eval_runtime": 29.4994, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 576 + }, + { + "epoch": 2.308, + "grad_norm": 16.875, + "learning_rate": 2.692e-05, + "loss": 0.7227, + "step": 577 + }, + { + "epoch": 2.308, + "eval_accuracy": 0.8895582329317269, + "eval_loss": 0.5112680196762085, + "eval_runtime": 29.5164, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 577 + }, + { + "epoch": 2.312, + "grad_norm": 14.625, + "learning_rate": 2.688e-05, + "loss": 0.5742, + "step": 578 + }, + { + "epoch": 2.312, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5109400153160095, + "eval_runtime": 29.5031, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 578 + }, + { + "epoch": 2.316, + "grad_norm": 27.75, + "learning_rate": 2.6840000000000004e-05, + "loss": 0.6094, + "step": 579 + }, + { + "epoch": 2.316, + "eval_accuracy": 0.8855421686746988, + "eval_loss": 0.5112075209617615, + "eval_runtime": 29.5609, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 579 + }, + { + "epoch": 2.32, + "grad_norm": 22.625, + "learning_rate": 2.6800000000000004e-05, + "loss": 0.6367, + "step": 580 + }, + { + "epoch": 2.32, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5086026191711426, + "eval_runtime": 29.4914, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 580 + }, + { + "epoch": 2.324, + "grad_norm": 14.375, + "learning_rate": 2.676e-05, + "loss": 0.6719, + "step": 581 + }, + { + "epoch": 2.324, + "eval_accuracy": 0.8815261044176707, + "eval_loss": 0.5069695711135864, + "eval_runtime": 29.4633, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.138, + "step": 581 + }, + { + "epoch": 2.328, + "grad_norm": 23.875, + "learning_rate": 2.672e-05, + "loss": 0.7148, + "step": 582 + }, + { + "epoch": 2.328, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5065613985061646, + "eval_runtime": 29.4013, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 582 + }, + { + "epoch": 2.332, + "grad_norm": 9.375, + "learning_rate": 2.668e-05, + "loss": 0.4902, + "step": 583 + }, + { + "epoch": 2.332, + "eval_accuracy": 0.8875502008032129, + "eval_loss": 0.5070946216583252, + "eval_runtime": 29.3111, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 583 + }, + { + "epoch": 2.336, + "grad_norm": 4.65625, + "learning_rate": 2.6640000000000002e-05, + "loss": 0.2031, + "step": 584 + }, + { + "epoch": 2.336, + "eval_accuracy": 0.8855421686746988, + "eval_loss": 0.5058700442314148, + "eval_runtime": 29.3574, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 584 + }, + { + "epoch": 2.34, + "grad_norm": 7.09375, + "learning_rate": 2.6600000000000003e-05, + "loss": 0.291, + "step": 585 + }, + { + "epoch": 2.34, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5027631521224976, + "eval_runtime": 29.3469, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 585 + }, + { + "epoch": 2.344, + "grad_norm": 9.25, + "learning_rate": 2.6560000000000003e-05, + "loss": 0.2256, + "step": 586 + }, + { + "epoch": 2.344, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5005035400390625, + "eval_runtime": 29.4678, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 586 + }, + { + "epoch": 2.348, + "grad_norm": 29.5, + "learning_rate": 2.652e-05, + "loss": 0.625, + "step": 587 + }, + { + "epoch": 2.348, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.49910691380500793, + "eval_runtime": 29.5107, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 587 + }, + { + "epoch": 2.352, + "grad_norm": 10.5, + "learning_rate": 2.648e-05, + "loss": 0.5312, + "step": 588 + }, + { + "epoch": 2.352, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.4992321729660034, + "eval_runtime": 29.5172, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 588 + }, + { + "epoch": 2.356, + "grad_norm": 27.25, + "learning_rate": 2.6440000000000004e-05, + "loss": 0.5312, + "step": 589 + }, + { + "epoch": 2.356, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.4985103905200958, + "eval_runtime": 29.5079, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 589 + }, + { + "epoch": 2.36, + "grad_norm": 8.4375, + "learning_rate": 2.64e-05, + "loss": 0.2422, + "step": 590 + }, + { + "epoch": 2.36, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.4986044466495514, + "eval_runtime": 29.5206, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 590 + }, + { + "epoch": 2.364, + "grad_norm": 8.5625, + "learning_rate": 2.6360000000000002e-05, + "loss": 0.1484, + "step": 591 + }, + { + "epoch": 2.364, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.49888670444488525, + "eval_runtime": 29.4846, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 591 + }, + { + "epoch": 2.368, + "grad_norm": 57.25, + "learning_rate": 2.632e-05, + "loss": 0.4746, + "step": 592 + }, + { + "epoch": 2.368, + "eval_accuracy": 0.893574297188755, + "eval_loss": 0.49971798062324524, + "eval_runtime": 29.3916, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.143, + "step": 592 + }, + { + "epoch": 2.372, + "grad_norm": 6.46875, + "learning_rate": 2.628e-05, + "loss": 0.3184, + "step": 593 + }, + { + "epoch": 2.372, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.500674843788147, + "eval_runtime": 29.4022, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 593 + }, + { + "epoch": 2.376, + "grad_norm": 30.625, + "learning_rate": 2.6240000000000003e-05, + "loss": 0.9258, + "step": 594 + }, + { + "epoch": 2.376, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5037181973457336, + "eval_runtime": 29.4979, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.136, + "step": 594 + }, + { + "epoch": 2.38, + "grad_norm": 21.5, + "learning_rate": 2.6200000000000003e-05, + "loss": 0.5273, + "step": 595 + }, + { + "epoch": 2.38, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5048006176948547, + "eval_runtime": 29.5811, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.13, + "step": 595 + }, + { + "epoch": 2.384, + "grad_norm": 12.0625, + "learning_rate": 2.616e-05, + "loss": 0.4551, + "step": 596 + }, + { + "epoch": 2.384, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5057418942451477, + "eval_runtime": 29.5256, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 596 + }, + { + "epoch": 2.388, + "grad_norm": 11.3125, + "learning_rate": 2.612e-05, + "loss": 0.2539, + "step": 597 + }, + { + "epoch": 2.388, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5085029006004333, + "eval_runtime": 29.5199, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 597 + }, + { + "epoch": 2.392, + "grad_norm": 45.5, + "learning_rate": 2.6079999999999998e-05, + "loss": 0.7773, + "step": 598 + }, + { + "epoch": 2.392, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5092715620994568, + "eval_runtime": 29.4659, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.138, + "step": 598 + }, + { + "epoch": 2.396, + "grad_norm": 33.5, + "learning_rate": 2.6040000000000005e-05, + "loss": 0.9961, + "step": 599 + }, + { + "epoch": 2.396, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5119227766990662, + "eval_runtime": 29.3274, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.148, + "step": 599 + }, + { + "epoch": 2.4, + "grad_norm": 4.4375, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.25, + "step": 600 + }, + { + "epoch": 2.4, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5150603652000427, + "eval_runtime": 29.3297, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.148, + "step": 600 + }, + { + "epoch": 2.404, + "grad_norm": 64.0, + "learning_rate": 2.5960000000000002e-05, + "loss": 1.0078, + "step": 601 + }, + { + "epoch": 2.404, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5178840756416321, + "eval_runtime": 29.3061, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 601 + }, + { + "epoch": 2.408, + "grad_norm": 32.5, + "learning_rate": 2.592e-05, + "loss": 0.1206, + "step": 602 + }, + { + "epoch": 2.408, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5229042172431946, + "eval_runtime": 29.2464, + "eval_samples_per_second": 17.028, + "eval_steps_per_second": 2.154, + "step": 602 + }, + { + "epoch": 2.412, + "grad_norm": 27.75, + "learning_rate": 2.588e-05, + "loss": 0.543, + "step": 603 + }, + { + "epoch": 2.412, + "eval_accuracy": 0.893574297188755, + "eval_loss": 0.5262143611907959, + "eval_runtime": 29.4069, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.142, + "step": 603 + }, + { + "epoch": 2.416, + "grad_norm": 5.125, + "learning_rate": 2.5840000000000003e-05, + "loss": 0.3047, + "step": 604 + }, + { + "epoch": 2.416, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5326778292655945, + "eval_runtime": 29.5504, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 604 + }, + { + "epoch": 2.42, + "grad_norm": 3.96875, + "learning_rate": 2.58e-05, + "loss": 0.1777, + "step": 605 + }, + { + "epoch": 2.42, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5391021966934204, + "eval_runtime": 29.592, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 605 + }, + { + "epoch": 2.424, + "grad_norm": 84.0, + "learning_rate": 2.576e-05, + "loss": 0.8008, + "step": 606 + }, + { + "epoch": 2.424, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5571268200874329, + "eval_runtime": 29.589, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 606 + }, + { + "epoch": 2.428, + "grad_norm": 124.0, + "learning_rate": 2.572e-05, + "loss": 1.2422, + "step": 607 + }, + { + "epoch": 2.428, + "eval_accuracy": 0.891566265060241, + "eval_loss": 0.5656678080558777, + "eval_runtime": 29.5289, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 607 + }, + { + "epoch": 2.432, + "grad_norm": 35.5, + "learning_rate": 2.5679999999999998e-05, + "loss": 0.4375, + "step": 608 + }, + { + "epoch": 2.432, + "eval_accuracy": 0.8955823293172691, + "eval_loss": 0.5666506290435791, + "eval_runtime": 29.4412, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.14, + "step": 608 + }, + { + "epoch": 2.436, + "grad_norm": 12.5625, + "learning_rate": 2.5640000000000002e-05, + "loss": 0.377, + "step": 609 + }, + { + "epoch": 2.436, + "eval_accuracy": 0.9036144578313253, + "eval_loss": 0.5632814168930054, + "eval_runtime": 29.3813, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 609 + }, + { + "epoch": 2.44, + "grad_norm": 10.0625, + "learning_rate": 2.5600000000000002e-05, + "loss": 0.5781, + "step": 610 + }, + { + "epoch": 2.44, + "eval_accuracy": 0.9036144578313253, + "eval_loss": 0.5601478219032288, + "eval_runtime": 29.3347, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 610 + }, + { + "epoch": 2.444, + "grad_norm": 14.375, + "learning_rate": 2.556e-05, + "loss": 0.5898, + "step": 611 + }, + { + "epoch": 2.444, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.554327666759491, + "eval_runtime": 29.3398, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 611 + }, + { + "epoch": 2.448, + "grad_norm": 23.875, + "learning_rate": 2.552e-05, + "loss": 0.5977, + "step": 612 + }, + { + "epoch": 2.448, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5505782961845398, + "eval_runtime": 29.5197, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 612 + }, + { + "epoch": 2.452, + "grad_norm": 16.25, + "learning_rate": 2.5480000000000003e-05, + "loss": 0.3926, + "step": 613 + }, + { + "epoch": 2.452, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5453718900680542, + "eval_runtime": 29.531, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.133, + "step": 613 + }, + { + "epoch": 2.456, + "grad_norm": 54.25, + "learning_rate": 2.5440000000000004e-05, + "loss": 0.8047, + "step": 614 + }, + { + "epoch": 2.456, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5330707430839539, + "eval_runtime": 29.5398, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 614 + }, + { + "epoch": 2.46, + "grad_norm": 22.0, + "learning_rate": 2.54e-05, + "loss": 0.2695, + "step": 615 + }, + { + "epoch": 2.46, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5197996497154236, + "eval_runtime": 29.5819, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.13, + "step": 615 + }, + { + "epoch": 2.464, + "grad_norm": 45.0, + "learning_rate": 2.536e-05, + "loss": 1.2109, + "step": 616 + }, + { + "epoch": 2.464, + "eval_accuracy": 0.9096385542168675, + "eval_loss": 0.5071935653686523, + "eval_runtime": 29.5275, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 616 + }, + { + "epoch": 2.468, + "grad_norm": 5.25, + "learning_rate": 2.5319999999999998e-05, + "loss": 0.2344, + "step": 617 + }, + { + "epoch": 2.468, + "eval_accuracy": 0.9036144578313253, + "eval_loss": 0.5021207928657532, + "eval_runtime": 29.4847, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 617 + }, + { + "epoch": 2.472, + "grad_norm": 9.5625, + "learning_rate": 2.5280000000000005e-05, + "loss": 0.4785, + "step": 618 + }, + { + "epoch": 2.472, + "eval_accuracy": 0.9056224899598394, + "eval_loss": 0.5007221102714539, + "eval_runtime": 29.4385, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.14, + "step": 618 + }, + { + "epoch": 2.476, + "grad_norm": 7.34375, + "learning_rate": 2.5240000000000002e-05, + "loss": 0.3945, + "step": 619 + }, + { + "epoch": 2.476, + "eval_accuracy": 0.9076305220883534, + "eval_loss": 0.5001117587089539, + "eval_runtime": 29.3497, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 619 + }, + { + "epoch": 2.48, + "grad_norm": 23.25, + "learning_rate": 2.5200000000000003e-05, + "loss": 0.6797, + "step": 620 + }, + { + "epoch": 2.48, + "eval_accuracy": 0.8975903614457831, + "eval_loss": 0.5012707114219666, + "eval_runtime": 29.3703, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 620 + }, + { + "epoch": 2.484, + "grad_norm": 7.09375, + "learning_rate": 2.516e-05, + "loss": 0.1777, + "step": 621 + }, + { + "epoch": 2.484, + "eval_accuracy": 0.9016064257028112, + "eval_loss": 0.5018197894096375, + "eval_runtime": 29.5037, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 621 + }, + { + "epoch": 2.488, + "grad_norm": 18.0, + "learning_rate": 2.512e-05, + "loss": 0.4766, + "step": 622 + }, + { + "epoch": 2.488, + "eval_accuracy": 0.9056224899598394, + "eval_loss": 0.5017197728157043, + "eval_runtime": 29.5372, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 622 + }, + { + "epoch": 2.492, + "grad_norm": 10.625, + "learning_rate": 2.5080000000000004e-05, + "loss": 0.377, + "step": 623 + }, + { + "epoch": 2.492, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.502036452293396, + "eval_runtime": 29.5342, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 623 + }, + { + "epoch": 2.496, + "grad_norm": 13.3125, + "learning_rate": 2.504e-05, + "loss": 0.5625, + "step": 624 + }, + { + "epoch": 2.496, + "eval_accuracy": 0.8995983935742972, + "eval_loss": 0.5010805130004883, + "eval_runtime": 29.5385, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 624 + }, + { + "epoch": 2.5, + "grad_norm": 12.875, + "learning_rate": 2.5e-05, + "loss": 0.4746, + "step": 625 + }, + { + "epoch": 2.5, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.4989655911922455, + "eval_runtime": 29.5632, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.131, + "step": 625 + }, + { + "epoch": 2.504, + "grad_norm": 11.875, + "learning_rate": 2.496e-05, + "loss": 0.6094, + "step": 626 + }, + { + "epoch": 2.504, + "eval_accuracy": 0.9056224899598394, + "eval_loss": 0.4957020580768585, + "eval_runtime": 29.4176, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 626 + }, + { + "epoch": 2.508, + "grad_norm": 16.125, + "learning_rate": 2.4920000000000002e-05, + "loss": 0.3457, + "step": 627 + }, + { + "epoch": 2.508, + "eval_accuracy": 0.9076305220883534, + "eval_loss": 0.493520587682724, + "eval_runtime": 29.3358, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 627 + }, + { + "epoch": 2.512, + "grad_norm": 11.9375, + "learning_rate": 2.488e-05, + "loss": 0.2988, + "step": 628 + }, + { + "epoch": 2.512, + "eval_accuracy": 0.9096385542168675, + "eval_loss": 0.4923821985721588, + "eval_runtime": 29.3483, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 628 + }, + { + "epoch": 2.516, + "grad_norm": 3.984375, + "learning_rate": 2.4840000000000003e-05, + "loss": 0.0405, + "step": 629 + }, + { + "epoch": 2.516, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.4894757866859436, + "eval_runtime": 29.344, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.147, + "step": 629 + }, + { + "epoch": 2.52, + "grad_norm": 27.0, + "learning_rate": 2.48e-05, + "loss": 0.6289, + "step": 630 + }, + { + "epoch": 2.52, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48743510246276855, + "eval_runtime": 29.433, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.14, + "step": 630 + }, + { + "epoch": 2.524, + "grad_norm": 6.75, + "learning_rate": 2.476e-05, + "loss": 0.3359, + "step": 631 + }, + { + "epoch": 2.524, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48705822229385376, + "eval_runtime": 29.5117, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 631 + }, + { + "epoch": 2.528, + "grad_norm": 43.75, + "learning_rate": 2.472e-05, + "loss": 0.8125, + "step": 632 + }, + { + "epoch": 2.528, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.48730921745300293, + "eval_runtime": 29.5252, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 632 + }, + { + "epoch": 2.532, + "grad_norm": 11.75, + "learning_rate": 2.468e-05, + "loss": 0.2891, + "step": 633 + }, + { + "epoch": 2.532, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.48823484778404236, + "eval_runtime": 29.5754, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.13, + "step": 633 + }, + { + "epoch": 2.536, + "grad_norm": 5.21875, + "learning_rate": 2.464e-05, + "loss": 0.21, + "step": 634 + }, + { + "epoch": 2.536, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4887055456638336, + "eval_runtime": 29.5206, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 634 + }, + { + "epoch": 2.54, + "grad_norm": 11.0625, + "learning_rate": 2.46e-05, + "loss": 0.4355, + "step": 635 + }, + { + "epoch": 2.54, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.48920777440071106, + "eval_runtime": 29.5547, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 635 + }, + { + "epoch": 2.544, + "grad_norm": 19.5, + "learning_rate": 2.4560000000000002e-05, + "loss": 0.3711, + "step": 636 + }, + { + "epoch": 2.544, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4909493327140808, + "eval_runtime": 29.4574, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 636 + }, + { + "epoch": 2.548, + "grad_norm": 2.578125, + "learning_rate": 2.4520000000000002e-05, + "loss": 0.1514, + "step": 637 + }, + { + "epoch": 2.548, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.49160832166671753, + "eval_runtime": 29.3995, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.143, + "step": 637 + }, + { + "epoch": 2.552, + "grad_norm": 7.15625, + "learning_rate": 2.448e-05, + "loss": 0.3203, + "step": 638 + }, + { + "epoch": 2.552, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.49245554208755493, + "eval_runtime": 29.5156, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 638 + }, + { + "epoch": 2.556, + "grad_norm": 6.0625, + "learning_rate": 2.4440000000000003e-05, + "loss": 0.2949, + "step": 639 + }, + { + "epoch": 2.556, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.49371063709259033, + "eval_runtime": 29.5888, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 639 + }, + { + "epoch": 2.56, + "grad_norm": 5.25, + "learning_rate": 2.44e-05, + "loss": 0.1934, + "step": 640 + }, + { + "epoch": 2.56, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4942125678062439, + "eval_runtime": 29.5274, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 640 + }, + { + "epoch": 2.564, + "grad_norm": 11.5625, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.3594, + "step": 641 + }, + { + "epoch": 2.564, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4950597584247589, + "eval_runtime": 29.4981, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 641 + }, + { + "epoch": 2.568, + "grad_norm": 14.625, + "learning_rate": 2.432e-05, + "loss": 0.3613, + "step": 642 + }, + { + "epoch": 2.568, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.49349096417427063, + "eval_runtime": 29.4494, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.139, + "step": 642 + }, + { + "epoch": 2.572, + "grad_norm": 61.5, + "learning_rate": 2.428e-05, + "loss": 0.4199, + "step": 643 + }, + { + "epoch": 2.572, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4939301908016205, + "eval_runtime": 29.443, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 643 + }, + { + "epoch": 2.576, + "grad_norm": 9.0, + "learning_rate": 2.4240000000000002e-05, + "loss": 0.2734, + "step": 644 + }, + { + "epoch": 2.576, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.494071364402771, + "eval_runtime": 29.5057, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.135, + "step": 644 + }, + { + "epoch": 2.58, + "grad_norm": 4.8125, + "learning_rate": 2.4200000000000002e-05, + "loss": 0.1406, + "step": 645 + }, + { + "epoch": 2.58, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4930828809738159, + "eval_runtime": 29.5287, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 645 + }, + { + "epoch": 2.584, + "grad_norm": 6.96875, + "learning_rate": 2.4160000000000002e-05, + "loss": 0.168, + "step": 646 + }, + { + "epoch": 2.584, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.49325528740882874, + "eval_runtime": 29.5245, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 646 + }, + { + "epoch": 2.588, + "grad_norm": 20.0, + "learning_rate": 2.412e-05, + "loss": 0.5703, + "step": 647 + }, + { + "epoch": 2.588, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.49262765049934387, + "eval_runtime": 29.515, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 647 + }, + { + "epoch": 2.592, + "grad_norm": 8.625, + "learning_rate": 2.408e-05, + "loss": 0.4219, + "step": 648 + }, + { + "epoch": 2.592, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.49210986495018005, + "eval_runtime": 29.4095, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.142, + "step": 648 + }, + { + "epoch": 2.596, + "grad_norm": 29.25, + "learning_rate": 2.404e-05, + "loss": 0.4023, + "step": 649 + }, + { + "epoch": 2.596, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4921254813671112, + "eval_runtime": 29.3261, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.148, + "step": 649 + }, + { + "epoch": 2.6, + "grad_norm": 9.375, + "learning_rate": 2.4e-05, + "loss": 0.1533, + "step": 650 + }, + { + "epoch": 2.6, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4913724362850189, + "eval_runtime": 29.3404, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.147, + "step": 650 + }, + { + "epoch": 2.604, + "grad_norm": 8.1875, + "learning_rate": 2.396e-05, + "loss": 0.2969, + "step": 651 + }, + { + "epoch": 2.604, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4901016652584076, + "eval_runtime": 29.2083, + "eval_samples_per_second": 17.05, + "eval_steps_per_second": 2.157, + "step": 651 + }, + { + "epoch": 2.608, + "grad_norm": 26.375, + "learning_rate": 2.392e-05, + "loss": 0.8867, + "step": 652 + }, + { + "epoch": 2.608, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4907761812210083, + "eval_runtime": 29.3589, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 652 + }, + { + "epoch": 2.612, + "grad_norm": 5.34375, + "learning_rate": 2.3880000000000002e-05, + "loss": 0.4395, + "step": 653 + }, + { + "epoch": 2.612, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4917173981666565, + "eval_runtime": 29.5109, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 653 + }, + { + "epoch": 2.616, + "grad_norm": 5.9375, + "learning_rate": 2.3840000000000002e-05, + "loss": 0.4316, + "step": 654 + }, + { + "epoch": 2.616, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4917173683643341, + "eval_runtime": 29.5354, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 654 + }, + { + "epoch": 2.62, + "grad_norm": 28.875, + "learning_rate": 2.38e-05, + "loss": 0.8398, + "step": 655 + }, + { + "epoch": 2.62, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.49215665459632874, + "eval_runtime": 29.5311, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.133, + "step": 655 + }, + { + "epoch": 2.624, + "grad_norm": 18.25, + "learning_rate": 2.3760000000000003e-05, + "loss": 0.5586, + "step": 656 + }, + { + "epoch": 2.624, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4909173548221588, + "eval_runtime": 29.5704, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 656 + }, + { + "epoch": 2.628, + "grad_norm": 81.5, + "learning_rate": 2.372e-05, + "loss": 1.4609, + "step": 657 + }, + { + "epoch": 2.628, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.49115270376205444, + "eval_runtime": 29.5184, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 657 + }, + { + "epoch": 2.632, + "grad_norm": 3.171875, + "learning_rate": 2.3680000000000004e-05, + "loss": 0.0315, + "step": 658 + }, + { + "epoch": 2.632, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.4893486201763153, + "eval_runtime": 29.4159, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.142, + "step": 658 + }, + { + "epoch": 2.636, + "grad_norm": 12.5, + "learning_rate": 2.364e-05, + "loss": 0.4043, + "step": 659 + }, + { + "epoch": 2.636, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.49076053500175476, + "eval_runtime": 29.362, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 659 + }, + { + "epoch": 2.64, + "grad_norm": 37.5, + "learning_rate": 2.36e-05, + "loss": 1.0781, + "step": 660 + }, + { + "epoch": 2.64, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.49046245217323303, + "eval_runtime": 29.5387, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 660 + }, + { + "epoch": 2.644, + "grad_norm": 12.25, + "learning_rate": 2.356e-05, + "loss": 0.3477, + "step": 661 + }, + { + "epoch": 2.644, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48980364203453064, + "eval_runtime": 29.5269, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 661 + }, + { + "epoch": 2.648, + "grad_norm": 17.25, + "learning_rate": 2.3520000000000002e-05, + "loss": 0.6094, + "step": 662 + }, + { + "epoch": 2.648, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.4897409975528717, + "eval_runtime": 29.5322, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 662 + }, + { + "epoch": 2.652, + "grad_norm": 25.75, + "learning_rate": 2.3480000000000002e-05, + "loss": 1.3047, + "step": 663 + }, + { + "epoch": 2.652, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.48850181698799133, + "eval_runtime": 29.5804, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.13, + "step": 663 + }, + { + "epoch": 2.656, + "grad_norm": 8.875, + "learning_rate": 2.344e-05, + "loss": 0.2402, + "step": 664 + }, + { + "epoch": 2.656, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4880784749984741, + "eval_runtime": 29.4817, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 664 + }, + { + "epoch": 2.66, + "grad_norm": 6.21875, + "learning_rate": 2.3400000000000003e-05, + "loss": 0.2637, + "step": 665 + }, + { + "epoch": 2.66, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4876079857349396, + "eval_runtime": 29.3586, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 665 + }, + { + "epoch": 2.664, + "grad_norm": 13.625, + "learning_rate": 2.336e-05, + "loss": 0.4102, + "step": 666 + }, + { + "epoch": 2.664, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4868551194667816, + "eval_runtime": 29.2854, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.151, + "step": 666 + }, + { + "epoch": 2.668, + "grad_norm": 12.0625, + "learning_rate": 2.332e-05, + "loss": 0.4824, + "step": 667 + }, + { + "epoch": 2.668, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48702773451805115, + "eval_runtime": 29.2698, + "eval_samples_per_second": 17.014, + "eval_steps_per_second": 2.152, + "step": 667 + }, + { + "epoch": 2.672, + "grad_norm": 11.1875, + "learning_rate": 2.328e-05, + "loss": 0.5859, + "step": 668 + }, + { + "epoch": 2.672, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48583561182022095, + "eval_runtime": 29.4173, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 668 + }, + { + "epoch": 2.676, + "grad_norm": 17.875, + "learning_rate": 2.324e-05, + "loss": 0.5273, + "step": 669 + }, + { + "epoch": 2.676, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48555347323417664, + "eval_runtime": 29.5533, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 669 + }, + { + "epoch": 2.68, + "grad_norm": 37.75, + "learning_rate": 2.32e-05, + "loss": 0.6406, + "step": 670 + }, + { + "epoch": 2.68, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48608723282814026, + "eval_runtime": 29.5408, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 670 + }, + { + "epoch": 2.684, + "grad_norm": 12.8125, + "learning_rate": 2.3160000000000002e-05, + "loss": 0.4121, + "step": 671 + }, + { + "epoch": 2.684, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4861818850040436, + "eval_runtime": 29.5326, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 671 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 9.3125, + "learning_rate": 2.312e-05, + "loss": 0.4746, + "step": 672 + }, + { + "epoch": 2.6879999999999997, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4862135946750641, + "eval_runtime": 29.5281, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 672 + }, + { + "epoch": 2.692, + "grad_norm": 13.9375, + "learning_rate": 2.3080000000000003e-05, + "loss": 0.5312, + "step": 673 + }, + { + "epoch": 2.692, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4857116937637329, + "eval_runtime": 29.5269, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 673 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 24.5, + "learning_rate": 2.304e-05, + "loss": 1.1875, + "step": 674 + }, + { + "epoch": 2.6959999999999997, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4854767620563507, + "eval_runtime": 29.4035, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 674 + }, + { + "epoch": 2.7, + "grad_norm": 19.25, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.8242, + "step": 675 + }, + { + "epoch": 2.7, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.48597896099090576, + "eval_runtime": 29.3554, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 675 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 20.125, + "learning_rate": 2.296e-05, + "loss": 0.6172, + "step": 676 + }, + { + "epoch": 2.7039999999999997, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48549285531044006, + "eval_runtime": 29.3242, + "eval_samples_per_second": 16.983, + "eval_steps_per_second": 2.148, + "step": 676 + }, + { + "epoch": 2.708, + "grad_norm": 18.875, + "learning_rate": 2.292e-05, + "loss": 0.7773, + "step": 677 + }, + { + "epoch": 2.708, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.48532044887542725, + "eval_runtime": 29.3009, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.15, + "step": 677 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 6.5625, + "learning_rate": 2.288e-05, + "loss": 0.293, + "step": 678 + }, + { + "epoch": 2.7119999999999997, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48484981060028076, + "eval_runtime": 29.4578, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 678 + }, + { + "epoch": 2.716, + "grad_norm": 25.25, + "learning_rate": 2.284e-05, + "loss": 1.1484, + "step": 679 + }, + { + "epoch": 2.716, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4845987856388092, + "eval_runtime": 29.5277, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 679 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 12.75, + "learning_rate": 2.2800000000000002e-05, + "loss": 0.4238, + "step": 680 + }, + { + "epoch": 2.7199999999999998, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4841282367706299, + "eval_runtime": 29.5898, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.129, + "step": 680 + }, + { + "epoch": 2.724, + "grad_norm": 19.375, + "learning_rate": 2.2760000000000002e-05, + "loss": 0.7891, + "step": 681 + }, + { + "epoch": 2.724, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.48398739099502563, + "eval_runtime": 29.539, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 681 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 7.0625, + "learning_rate": 2.2720000000000003e-05, + "loss": 0.1592, + "step": 682 + }, + { + "epoch": 2.7279999999999998, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48401883244514465, + "eval_runtime": 29.4843, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 682 + }, + { + "epoch": 2.732, + "grad_norm": 8.9375, + "learning_rate": 2.268e-05, + "loss": 0.3984, + "step": 683 + }, + { + "epoch": 2.732, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4839405119419098, + "eval_runtime": 29.4032, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 683 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 21.5, + "learning_rate": 2.264e-05, + "loss": 0.6602, + "step": 684 + }, + { + "epoch": 2.7359999999999998, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48400309681892395, + "eval_runtime": 29.3517, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.146, + "step": 684 + }, + { + "epoch": 2.74, + "grad_norm": 14.625, + "learning_rate": 2.26e-05, + "loss": 0.4121, + "step": 685 + }, + { + "epoch": 2.74, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.48437967896461487, + "eval_runtime": 29.3329, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 685 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 10.0, + "learning_rate": 2.256e-05, + "loss": 0.1562, + "step": 686 + }, + { + "epoch": 2.7439999999999998, + "eval_accuracy": 0.9116465863453815, + "eval_loss": 0.484159916639328, + "eval_runtime": 29.3801, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 686 + }, + { + "epoch": 2.748, + "grad_norm": 10.5, + "learning_rate": 2.252e-05, + "loss": 0.1367, + "step": 687 + }, + { + "epoch": 2.748, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4845520853996277, + "eval_runtime": 29.512, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 687 + }, + { + "epoch": 2.752, + "grad_norm": 37.0, + "learning_rate": 2.248e-05, + "loss": 0.6484, + "step": 688 + }, + { + "epoch": 2.752, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48379889130592346, + "eval_runtime": 29.5355, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 688 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 11.75, + "learning_rate": 2.244e-05, + "loss": 0.5234, + "step": 689 + }, + { + "epoch": 2.7560000000000002, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4838770031929016, + "eval_runtime": 29.5316, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 689 + }, + { + "epoch": 2.76, + "grad_norm": 26.75, + "learning_rate": 2.2400000000000002e-05, + "loss": 0.8047, + "step": 690 + }, + { + "epoch": 2.76, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.48381394147872925, + "eval_runtime": 29.5305, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.133, + "step": 690 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 8.4375, + "learning_rate": 2.236e-05, + "loss": 0.4277, + "step": 691 + }, + { + "epoch": 2.7640000000000002, + "eval_accuracy": 0.9116465863453815, + "eval_loss": 0.48414289951324463, + "eval_runtime": 29.4998, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 691 + }, + { + "epoch": 2.768, + "grad_norm": 15.5, + "learning_rate": 2.2320000000000003e-05, + "loss": 0.4316, + "step": 692 + }, + { + "epoch": 2.768, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.4851935803890228, + "eval_runtime": 29.4176, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 692 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 17.0, + "learning_rate": 2.228e-05, + "loss": 0.7891, + "step": 693 + }, + { + "epoch": 2.7720000000000002, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4849894344806671, + "eval_runtime": 29.3622, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 693 + }, + { + "epoch": 2.776, + "grad_norm": 17.0, + "learning_rate": 2.224e-05, + "loss": 0.625, + "step": 694 + }, + { + "epoch": 2.776, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.48429882526397705, + "eval_runtime": 29.3957, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.143, + "step": 694 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 30.5, + "learning_rate": 2.22e-05, + "loss": 1.125, + "step": 695 + }, + { + "epoch": 2.7800000000000002, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4839377701282501, + "eval_runtime": 29.5019, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 695 + }, + { + "epoch": 2.784, + "grad_norm": 9.4375, + "learning_rate": 2.216e-05, + "loss": 0.4453, + "step": 696 + }, + { + "epoch": 2.784, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.4839218258857727, + "eval_runtime": 29.5372, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 696 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 28.75, + "learning_rate": 2.212e-05, + "loss": 0.7695, + "step": 697 + }, + { + "epoch": 2.7880000000000003, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4833882749080658, + "eval_runtime": 29.5276, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 697 + }, + { + "epoch": 2.792, + "grad_norm": 7.40625, + "learning_rate": 2.2080000000000002e-05, + "loss": 0.2969, + "step": 698 + }, + { + "epoch": 2.792, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48170948028564453, + "eval_runtime": 29.5008, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 698 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 7.6875, + "learning_rate": 2.2040000000000002e-05, + "loss": 0.1553, + "step": 699 + }, + { + "epoch": 2.7960000000000003, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4812857508659363, + "eval_runtime": 29.4525, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 699 + }, + { + "epoch": 2.8, + "grad_norm": 6.0, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3789, + "step": 700 + }, + { + "epoch": 2.8, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4799834191799164, + "eval_runtime": 29.3778, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.144, + "step": 700 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 19.75, + "learning_rate": 2.196e-05, + "loss": 0.3691, + "step": 701 + }, + { + "epoch": 2.8040000000000003, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4790734648704529, + "eval_runtime": 29.4128, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 701 + }, + { + "epoch": 2.808, + "grad_norm": 9.25, + "learning_rate": 2.192e-05, + "loss": 0.3809, + "step": 702 + }, + { + "epoch": 2.808, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47915181517601013, + "eval_runtime": 29.3474, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 702 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 10.1875, + "learning_rate": 2.188e-05, + "loss": 0.4316, + "step": 703 + }, + { + "epoch": 2.8120000000000003, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4791046380996704, + "eval_runtime": 29.4781, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.137, + "step": 703 + }, + { + "epoch": 2.816, + "grad_norm": 27.375, + "learning_rate": 2.184e-05, + "loss": 0.9336, + "step": 704 + }, + { + "epoch": 2.816, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4783516526222229, + "eval_runtime": 29.5486, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.132, + "step": 704 + }, + { + "epoch": 2.82, + "grad_norm": 12.9375, + "learning_rate": 2.18e-05, + "loss": 0.2021, + "step": 705 + }, + { + "epoch": 2.82, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4784614145755768, + "eval_runtime": 29.5559, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 705 + }, + { + "epoch": 2.824, + "grad_norm": 9.4375, + "learning_rate": 2.176e-05, + "loss": 0.1729, + "step": 706 + }, + { + "epoch": 2.824, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47747305035591125, + "eval_runtime": 29.5503, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 706 + }, + { + "epoch": 2.828, + "grad_norm": 11.8125, + "learning_rate": 2.1720000000000002e-05, + "loss": 0.2598, + "step": 707 + }, + { + "epoch": 2.828, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47753575444221497, + "eval_runtime": 29.5293, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.133, + "step": 707 + }, + { + "epoch": 2.832, + "grad_norm": 7.71875, + "learning_rate": 2.168e-05, + "loss": 0.4551, + "step": 708 + }, + { + "epoch": 2.832, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4766886830329895, + "eval_runtime": 29.5009, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 708 + }, + { + "epoch": 2.836, + "grad_norm": 18.125, + "learning_rate": 2.1640000000000003e-05, + "loss": 0.291, + "step": 709 + }, + { + "epoch": 2.836, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47698667645454407, + "eval_runtime": 29.4432, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 709 + }, + { + "epoch": 2.84, + "grad_norm": 11.5625, + "learning_rate": 2.16e-05, + "loss": 0.3691, + "step": 710 + }, + { + "epoch": 2.84, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4768768846988678, + "eval_runtime": 29.3595, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 710 + }, + { + "epoch": 2.844, + "grad_norm": 13.125, + "learning_rate": 2.1560000000000004e-05, + "loss": 0.2354, + "step": 711 + }, + { + "epoch": 2.844, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47678279876708984, + "eval_runtime": 29.3974, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.143, + "step": 711 + }, + { + "epoch": 2.848, + "grad_norm": 33.0, + "learning_rate": 2.152e-05, + "loss": 1.2031, + "step": 712 + }, + { + "epoch": 2.848, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47610822319984436, + "eval_runtime": 29.3257, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.148, + "step": 712 + }, + { + "epoch": 2.852, + "grad_norm": 10.125, + "learning_rate": 2.148e-05, + "loss": 0.0791, + "step": 713 + }, + { + "epoch": 2.852, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4763436019420624, + "eval_runtime": 29.3808, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 713 + }, + { + "epoch": 2.856, + "grad_norm": 10.125, + "learning_rate": 2.144e-05, + "loss": 0.3359, + "step": 714 + }, + { + "epoch": 2.856, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47686129808425903, + "eval_runtime": 29.5652, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.131, + "step": 714 + }, + { + "epoch": 2.86, + "grad_norm": 18.25, + "learning_rate": 2.1400000000000002e-05, + "loss": 0.5703, + "step": 715 + }, + { + "epoch": 2.86, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4771122932434082, + "eval_runtime": 29.6002, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.128, + "step": 715 + }, + { + "epoch": 2.864, + "grad_norm": 12.5, + "learning_rate": 2.1360000000000002e-05, + "loss": 0.5547, + "step": 716 + }, + { + "epoch": 2.864, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4772692024707794, + "eval_runtime": 29.6017, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.128, + "step": 716 + }, + { + "epoch": 2.868, + "grad_norm": 13.75, + "learning_rate": 2.1320000000000003e-05, + "loss": 0.7773, + "step": 717 + }, + { + "epoch": 2.868, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4770495593547821, + "eval_runtime": 29.5359, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 717 + }, + { + "epoch": 2.872, + "grad_norm": 13.625, + "learning_rate": 2.128e-05, + "loss": 0.168, + "step": 718 + }, + { + "epoch": 2.872, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4777871370315552, + "eval_runtime": 29.4877, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 718 + }, + { + "epoch": 2.876, + "grad_norm": 18.0, + "learning_rate": 2.124e-05, + "loss": 0.6484, + "step": 719 + }, + { + "epoch": 2.876, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4785715639591217, + "eval_runtime": 29.3602, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 719 + }, + { + "epoch": 2.88, + "grad_norm": 28.75, + "learning_rate": 2.12e-05, + "loss": 0.8281, + "step": 720 + }, + { + "epoch": 2.88, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47901102900505066, + "eval_runtime": 29.2885, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.151, + "step": 720 + }, + { + "epoch": 2.884, + "grad_norm": 9.6875, + "learning_rate": 2.116e-05, + "loss": 0.2031, + "step": 721 + }, + { + "epoch": 2.884, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47915220260620117, + "eval_runtime": 29.3233, + "eval_samples_per_second": 16.983, + "eval_steps_per_second": 2.148, + "step": 721 + }, + { + "epoch": 2.888, + "grad_norm": 42.0, + "learning_rate": 2.112e-05, + "loss": 0.8906, + "step": 722 + }, + { + "epoch": 2.888, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47948190569877625, + "eval_runtime": 29.3292, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 722 + }, + { + "epoch": 2.892, + "grad_norm": 10.5, + "learning_rate": 2.1079999999999998e-05, + "loss": 0.5977, + "step": 723 + }, + { + "epoch": 2.892, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4792781174182892, + "eval_runtime": 29.4805, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 723 + }, + { + "epoch": 2.896, + "grad_norm": 12.25, + "learning_rate": 2.1040000000000002e-05, + "loss": 0.5977, + "step": 724 + }, + { + "epoch": 2.896, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.479215532541275, + "eval_runtime": 29.5507, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.132, + "step": 724 + }, + { + "epoch": 2.9, + "grad_norm": 7.78125, + "learning_rate": 2.1e-05, + "loss": 0.2012, + "step": 725 + }, + { + "epoch": 2.9, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4792943000793457, + "eval_runtime": 29.6024, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.128, + "step": 725 + }, + { + "epoch": 2.904, + "grad_norm": 42.0, + "learning_rate": 2.0960000000000003e-05, + "loss": 1.3281, + "step": 726 + }, + { + "epoch": 2.904, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47872981429100037, + "eval_runtime": 29.5907, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.129, + "step": 726 + }, + { + "epoch": 2.908, + "grad_norm": 6.5625, + "learning_rate": 2.092e-05, + "loss": 0.1621, + "step": 727 + }, + { + "epoch": 2.908, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4787771701812744, + "eval_runtime": 29.5377, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 727 + }, + { + "epoch": 2.912, + "grad_norm": 7.375, + "learning_rate": 2.0880000000000003e-05, + "loss": 0.1787, + "step": 728 + }, + { + "epoch": 2.912, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47788330912590027, + "eval_runtime": 29.421, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.141, + "step": 728 + }, + { + "epoch": 2.916, + "grad_norm": 13.3125, + "learning_rate": 2.084e-05, + "loss": 0.0923, + "step": 729 + }, + { + "epoch": 2.916, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4776478409767151, + "eval_runtime": 29.3593, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 729 + }, + { + "epoch": 2.92, + "grad_norm": 3.78125, + "learning_rate": 2.08e-05, + "loss": 0.125, + "step": 730 + }, + { + "epoch": 2.92, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47731825709342957, + "eval_runtime": 29.3866, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.144, + "step": 730 + }, + { + "epoch": 2.924, + "grad_norm": 50.5, + "learning_rate": 2.076e-05, + "loss": 1.0, + "step": 731 + }, + { + "epoch": 2.924, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4773494601249695, + "eval_runtime": 29.3827, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 731 + }, + { + "epoch": 2.928, + "grad_norm": 31.0, + "learning_rate": 2.072e-05, + "loss": 0.4902, + "step": 732 + }, + { + "epoch": 2.928, + "eval_accuracy": 0.9377510040160643, + "eval_loss": 0.47670599818229675, + "eval_runtime": 29.5188, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 732 + }, + { + "epoch": 2.932, + "grad_norm": 3.9375, + "learning_rate": 2.0680000000000002e-05, + "loss": 0.1182, + "step": 733 + }, + { + "epoch": 2.932, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47722363471984863, + "eval_runtime": 29.6015, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.128, + "step": 733 + }, + { + "epoch": 2.936, + "grad_norm": 40.25, + "learning_rate": 2.0640000000000002e-05, + "loss": 1.0391, + "step": 734 + }, + { + "epoch": 2.936, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.4760158360004425, + "eval_runtime": 29.5536, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 734 + }, + { + "epoch": 2.94, + "grad_norm": 42.25, + "learning_rate": 2.06e-05, + "loss": 0.5273, + "step": 735 + }, + { + "epoch": 2.94, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.4754351079463959, + "eval_runtime": 29.5473, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.132, + "step": 735 + }, + { + "epoch": 2.944, + "grad_norm": 17.125, + "learning_rate": 2.0560000000000003e-05, + "loss": 0.2324, + "step": 736 + }, + { + "epoch": 2.944, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47609400749206543, + "eval_runtime": 29.5042, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 736 + }, + { + "epoch": 2.948, + "grad_norm": 10.0625, + "learning_rate": 2.052e-05, + "loss": 0.4004, + "step": 737 + }, + { + "epoch": 2.948, + "eval_accuracy": 0.9377510040160643, + "eval_loss": 0.47567033767700195, + "eval_runtime": 29.4062, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.142, + "step": 737 + }, + { + "epoch": 2.952, + "grad_norm": 12.375, + "learning_rate": 2.048e-05, + "loss": 0.418, + "step": 738 + }, + { + "epoch": 2.952, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47563886642456055, + "eval_runtime": 29.36, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 738 + }, + { + "epoch": 2.956, + "grad_norm": 10.0, + "learning_rate": 2.044e-05, + "loss": 0.4707, + "step": 739 + }, + { + "epoch": 2.956, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47549766302108765, + "eval_runtime": 29.32, + "eval_samples_per_second": 16.985, + "eval_steps_per_second": 2.149, + "step": 739 + }, + { + "epoch": 2.96, + "grad_norm": 4.90625, + "learning_rate": 2.04e-05, + "loss": 0.207, + "step": 740 + }, + { + "epoch": 2.96, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4756545424461365, + "eval_runtime": 29.2573, + "eval_samples_per_second": 17.021, + "eval_steps_per_second": 2.153, + "step": 740 + }, + { + "epoch": 2.964, + "grad_norm": 20.5, + "learning_rate": 2.036e-05, + "loss": 0.582, + "step": 741 + }, + { + "epoch": 2.964, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47552913427352905, + "eval_runtime": 29.4226, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.141, + "step": 741 + }, + { + "epoch": 2.968, + "grad_norm": 6.96875, + "learning_rate": 2.032e-05, + "loss": 0.3145, + "step": 742 + }, + { + "epoch": 2.968, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4748702347278595, + "eval_runtime": 29.5713, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.13, + "step": 742 + }, + { + "epoch": 2.972, + "grad_norm": 22.5, + "learning_rate": 2.0280000000000002e-05, + "loss": 0.8633, + "step": 743 + }, + { + "epoch": 2.972, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.475340873003006, + "eval_runtime": 29.5505, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 743 + }, + { + "epoch": 2.976, + "grad_norm": 2.9375, + "learning_rate": 2.024e-05, + "loss": 0.0776, + "step": 744 + }, + { + "epoch": 2.976, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4750898778438568, + "eval_runtime": 29.5458, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.132, + "step": 744 + }, + { + "epoch": 2.98, + "grad_norm": 14.25, + "learning_rate": 2.0200000000000003e-05, + "loss": 0.4297, + "step": 745 + }, + { + "epoch": 2.98, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47535642981529236, + "eval_runtime": 29.5126, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 745 + }, + { + "epoch": 2.984, + "grad_norm": 13.25, + "learning_rate": 2.016e-05, + "loss": 0.6055, + "step": 746 + }, + { + "epoch": 2.984, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47588998079299927, + "eval_runtime": 29.4306, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 746 + }, + { + "epoch": 2.988, + "grad_norm": 9.125, + "learning_rate": 2.012e-05, + "loss": 0.291, + "step": 747 + }, + { + "epoch": 2.988, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4752626121044159, + "eval_runtime": 29.3804, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 747 + }, + { + "epoch": 2.992, + "grad_norm": 16.25, + "learning_rate": 2.008e-05, + "loss": 0.9531, + "step": 748 + }, + { + "epoch": 2.992, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4748079478740692, + "eval_runtime": 29.4134, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 748 + }, + { + "epoch": 2.996, + "grad_norm": 9.0, + "learning_rate": 2.004e-05, + "loss": 0.1641, + "step": 749 + }, + { + "epoch": 2.996, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47661250829696655, + "eval_runtime": 29.3441, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.147, + "step": 749 + }, + { + "epoch": 3.0, + "grad_norm": 14.4375, + "learning_rate": 2e-05, + "loss": 0.418, + "step": 750 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4772248864173889, + "eval_runtime": 29.4267, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 750 + }, + { + "epoch": 3.004, + "grad_norm": 21.75, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.6953, + "step": 751 + }, + { + "epoch": 3.004, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47868505120277405, + "eval_runtime": 29.4811, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 751 + }, + { + "epoch": 3.008, + "grad_norm": 11.9375, + "learning_rate": 1.992e-05, + "loss": 0.0991, + "step": 752 + }, + { + "epoch": 3.008, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47907841205596924, + "eval_runtime": 29.4537, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.139, + "step": 752 + }, + { + "epoch": 3.012, + "grad_norm": 11.4375, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.5664, + "step": 753 + }, + { + "epoch": 3.012, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4798010289669037, + "eval_runtime": 29.5417, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 753 + }, + { + "epoch": 3.016, + "grad_norm": 6.46875, + "learning_rate": 1.984e-05, + "loss": 0.1426, + "step": 754 + }, + { + "epoch": 3.016, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4802097976207733, + "eval_runtime": 29.6115, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 754 + }, + { + "epoch": 3.02, + "grad_norm": 10.75, + "learning_rate": 1.9800000000000004e-05, + "loss": 0.3965, + "step": 755 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.48146510124206543, + "eval_runtime": 29.5436, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.132, + "step": 755 + }, + { + "epoch": 3.024, + "grad_norm": 6.84375, + "learning_rate": 1.976e-05, + "loss": 0.2891, + "step": 756 + }, + { + "epoch": 3.024, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4820157587528229, + "eval_runtime": 29.532, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 756 + }, + { + "epoch": 3.028, + "grad_norm": 29.0, + "learning_rate": 1.972e-05, + "loss": 0.7188, + "step": 757 + }, + { + "epoch": 3.028, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48294299840927124, + "eval_runtime": 29.4448, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 757 + }, + { + "epoch": 3.032, + "grad_norm": 22.125, + "learning_rate": 1.968e-05, + "loss": 0.8164, + "step": 758 + }, + { + "epoch": 3.032, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48402729630470276, + "eval_runtime": 29.3908, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 758 + }, + { + "epoch": 3.036, + "grad_norm": 18.625, + "learning_rate": 1.9640000000000002e-05, + "loss": 0.6328, + "step": 759 + }, + { + "epoch": 3.036, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48420029878616333, + "eval_runtime": 29.3482, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 759 + }, + { + "epoch": 3.04, + "grad_norm": 14.625, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.4707, + "step": 760 + }, + { + "epoch": 3.04, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48523569107055664, + "eval_runtime": 29.2604, + "eval_samples_per_second": 17.02, + "eval_steps_per_second": 2.153, + "step": 760 + }, + { + "epoch": 3.044, + "grad_norm": 3.515625, + "learning_rate": 1.956e-05, + "loss": 0.1113, + "step": 761 + }, + { + "epoch": 3.044, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48617780208587646, + "eval_runtime": 29.3517, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.146, + "step": 761 + }, + { + "epoch": 3.048, + "grad_norm": 19.75, + "learning_rate": 1.9520000000000003e-05, + "loss": 0.4844, + "step": 762 + }, + { + "epoch": 3.048, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48529842495918274, + "eval_runtime": 29.4986, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 762 + }, + { + "epoch": 3.052, + "grad_norm": 46.0, + "learning_rate": 1.948e-05, + "loss": 1.0469, + "step": 763 + }, + { + "epoch": 3.052, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48610126972198486, + "eval_runtime": 29.5926, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 763 + }, + { + "epoch": 3.056, + "grad_norm": 35.0, + "learning_rate": 1.944e-05, + "loss": 1.3594, + "step": 764 + }, + { + "epoch": 3.056, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48761463165283203, + "eval_runtime": 29.5603, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 764 + }, + { + "epoch": 3.06, + "grad_norm": 10.0625, + "learning_rate": 1.94e-05, + "loss": 0.4629, + "step": 765 + }, + { + "epoch": 3.06, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4877530038356781, + "eval_runtime": 29.6117, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 765 + }, + { + "epoch": 3.064, + "grad_norm": 2.015625, + "learning_rate": 1.936e-05, + "loss": 0.0242, + "step": 766 + }, + { + "epoch": 3.064, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.4890938401222229, + "eval_runtime": 29.5667, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.131, + "step": 766 + }, + { + "epoch": 3.068, + "grad_norm": 7.03125, + "learning_rate": 1.932e-05, + "loss": 0.2109, + "step": 767 + }, + { + "epoch": 3.068, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.48864033818244934, + "eval_runtime": 29.3974, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.143, + "step": 767 + }, + { + "epoch": 3.072, + "grad_norm": 11.75, + "learning_rate": 1.9280000000000002e-05, + "loss": 0.4277, + "step": 768 + }, + { + "epoch": 3.072, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48980712890625, + "eval_runtime": 29.332, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 768 + }, + { + "epoch": 3.076, + "grad_norm": 8.625, + "learning_rate": 1.924e-05, + "loss": 0.3574, + "step": 769 + }, + { + "epoch": 3.076, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4891921281814575, + "eval_runtime": 29.3756, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.145, + "step": 769 + }, + { + "epoch": 3.08, + "grad_norm": 13.5, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.334, + "step": 770 + }, + { + "epoch": 3.08, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.48923107981681824, + "eval_runtime": 29.5622, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 770 + }, + { + "epoch": 3.084, + "grad_norm": 68.0, + "learning_rate": 1.916e-05, + "loss": 0.7266, + "step": 771 + }, + { + "epoch": 3.084, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.4883752465248108, + "eval_runtime": 29.5548, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 771 + }, + { + "epoch": 3.088, + "grad_norm": 45.5, + "learning_rate": 1.9120000000000003e-05, + "loss": 0.3984, + "step": 772 + }, + { + "epoch": 3.088, + "eval_accuracy": 0.9156626506024096, + "eval_loss": 0.4891144037246704, + "eval_runtime": 29.5565, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 772 + }, + { + "epoch": 3.092, + "grad_norm": 29.625, + "learning_rate": 1.908e-05, + "loss": 1.4453, + "step": 773 + }, + { + "epoch": 3.092, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4888458847999573, + "eval_runtime": 29.5975, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 773 + }, + { + "epoch": 3.096, + "grad_norm": 5.90625, + "learning_rate": 1.904e-05, + "loss": 0.2041, + "step": 774 + }, + { + "epoch": 3.096, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4885486662387848, + "eval_runtime": 29.5296, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.133, + "step": 774 + }, + { + "epoch": 3.1, + "grad_norm": 8.4375, + "learning_rate": 1.9e-05, + "loss": 0.1318, + "step": 775 + }, + { + "epoch": 3.1, + "eval_accuracy": 0.9136546184738956, + "eval_loss": 0.4883410632610321, + "eval_runtime": 29.4399, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 775 + }, + { + "epoch": 3.104, + "grad_norm": 14.5, + "learning_rate": 1.896e-05, + "loss": 0.1865, + "step": 776 + }, + { + "epoch": 3.104, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.48852452635765076, + "eval_runtime": 29.3787, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 776 + }, + { + "epoch": 3.108, + "grad_norm": 21.875, + "learning_rate": 1.8920000000000002e-05, + "loss": 0.6992, + "step": 777 + }, + { + "epoch": 3.108, + "eval_accuracy": 0.9176706827309237, + "eval_loss": 0.4873410761356354, + "eval_runtime": 29.2849, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.151, + "step": 777 + }, + { + "epoch": 3.112, + "grad_norm": 14.625, + "learning_rate": 1.888e-05, + "loss": 0.6133, + "step": 778 + }, + { + "epoch": 3.112, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4854056239128113, + "eval_runtime": 29.2669, + "eval_samples_per_second": 17.016, + "eval_steps_per_second": 2.153, + "step": 778 + }, + { + "epoch": 3.116, + "grad_norm": 10.6875, + "learning_rate": 1.8840000000000003e-05, + "loss": 0.103, + "step": 779 + }, + { + "epoch": 3.116, + "eval_accuracy": 0.9196787148594378, + "eval_loss": 0.4846171438694, + "eval_runtime": 29.3598, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 779 + }, + { + "epoch": 3.12, + "grad_norm": 31.25, + "learning_rate": 1.88e-05, + "loss": 0.1846, + "step": 780 + }, + { + "epoch": 3.12, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4829975366592407, + "eval_runtime": 29.4907, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 780 + }, + { + "epoch": 3.124, + "grad_norm": 12.6875, + "learning_rate": 1.876e-05, + "loss": 0.418, + "step": 781 + }, + { + "epoch": 3.124, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48100391030311584, + "eval_runtime": 29.539, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 781 + }, + { + "epoch": 3.128, + "grad_norm": 9.875, + "learning_rate": 1.872e-05, + "loss": 0.3594, + "step": 782 + }, + { + "epoch": 3.128, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47949734330177307, + "eval_runtime": 29.5836, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 782 + }, + { + "epoch": 3.132, + "grad_norm": 29.5, + "learning_rate": 1.868e-05, + "loss": 0.6797, + "step": 783 + }, + { + "epoch": 3.132, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47888532280921936, + "eval_runtime": 29.5879, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 783 + }, + { + "epoch": 3.136, + "grad_norm": 6.84375, + "learning_rate": 1.864e-05, + "loss": 0.3516, + "step": 784 + }, + { + "epoch": 3.136, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47857141494750977, + "eval_runtime": 29.5075, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 784 + }, + { + "epoch": 3.14, + "grad_norm": 8.25, + "learning_rate": 1.86e-05, + "loss": 0.3613, + "step": 785 + }, + { + "epoch": 3.14, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47833600640296936, + "eval_runtime": 29.3983, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.143, + "step": 785 + }, + { + "epoch": 3.144, + "grad_norm": 16.5, + "learning_rate": 1.856e-05, + "loss": 0.7617, + "step": 786 + }, + { + "epoch": 3.144, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47811636328697205, + "eval_runtime": 29.3139, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 786 + }, + { + "epoch": 3.148, + "grad_norm": 2.765625, + "learning_rate": 1.8520000000000002e-05, + "loss": 0.0903, + "step": 787 + }, + { + "epoch": 3.148, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4782888889312744, + "eval_runtime": 29.3652, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 787 + }, + { + "epoch": 3.152, + "grad_norm": 11.0625, + "learning_rate": 1.848e-05, + "loss": 0.5508, + "step": 788 + }, + { + "epoch": 3.152, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4788222014904022, + "eval_runtime": 29.3936, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.143, + "step": 788 + }, + { + "epoch": 3.156, + "grad_norm": 9.6875, + "learning_rate": 1.8440000000000003e-05, + "loss": 0.6523, + "step": 789 + }, + { + "epoch": 3.156, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47891631722450256, + "eval_runtime": 29.5217, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 789 + }, + { + "epoch": 3.16, + "grad_norm": 9.4375, + "learning_rate": 1.84e-05, + "loss": 0.1982, + "step": 790 + }, + { + "epoch": 3.16, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4787750840187073, + "eval_runtime": 29.5314, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 790 + }, + { + "epoch": 3.164, + "grad_norm": 13.4375, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.373, + "step": 791 + }, + { + "epoch": 3.164, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4783514440059662, + "eval_runtime": 29.582, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.13, + "step": 791 + }, + { + "epoch": 3.168, + "grad_norm": 7.34375, + "learning_rate": 1.832e-05, + "loss": 0.3379, + "step": 792 + }, + { + "epoch": 3.168, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4786337912082672, + "eval_runtime": 29.5364, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 792 + }, + { + "epoch": 3.172, + "grad_norm": 36.5, + "learning_rate": 1.828e-05, + "loss": 0.8984, + "step": 793 + }, + { + "epoch": 3.172, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47832009196281433, + "eval_runtime": 29.5294, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.133, + "step": 793 + }, + { + "epoch": 3.176, + "grad_norm": 9.125, + "learning_rate": 1.824e-05, + "loss": 0.5273, + "step": 794 + }, + { + "epoch": 3.176, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4786181151866913, + "eval_runtime": 29.4483, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.139, + "step": 794 + }, + { + "epoch": 3.18, + "grad_norm": 11.625, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.5703, + "step": 795 + }, + { + "epoch": 3.18, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47813180088996887, + "eval_runtime": 29.4418, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.14, + "step": 795 + }, + { + "epoch": 3.184, + "grad_norm": 31.375, + "learning_rate": 1.8160000000000002e-05, + "loss": 1.8281, + "step": 796 + }, + { + "epoch": 3.184, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47838276624679565, + "eval_runtime": 29.3575, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 796 + }, + { + "epoch": 3.188, + "grad_norm": 12.0625, + "learning_rate": 1.812e-05, + "loss": 0.4121, + "step": 797 + }, + { + "epoch": 3.188, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47817882895469666, + "eval_runtime": 29.3323, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 797 + }, + { + "epoch": 3.192, + "grad_norm": 9.5, + "learning_rate": 1.808e-05, + "loss": 0.4414, + "step": 798 + }, + { + "epoch": 3.192, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4781317710876465, + "eval_runtime": 29.3214, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 798 + }, + { + "epoch": 3.196, + "grad_norm": 10.375, + "learning_rate": 1.804e-05, + "loss": 0.6406, + "step": 799 + }, + { + "epoch": 3.196, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47860240936279297, + "eval_runtime": 29.3107, + "eval_samples_per_second": 16.99, + "eval_steps_per_second": 2.149, + "step": 799 + }, + { + "epoch": 3.2, + "grad_norm": 8.75, + "learning_rate": 1.8e-05, + "loss": 0.2891, + "step": 800 + }, + { + "epoch": 3.2, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47842979431152344, + "eval_runtime": 29.3008, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.15, + "step": 800 + }, + { + "epoch": 3.204, + "grad_norm": 11.375, + "learning_rate": 1.796e-05, + "loss": 0.3594, + "step": 801 + }, + { + "epoch": 3.204, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4788220226764679, + "eval_runtime": 29.4006, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 801 + }, + { + "epoch": 3.208, + "grad_norm": 7.0625, + "learning_rate": 1.792e-05, + "loss": 0.3242, + "step": 802 + }, + { + "epoch": 3.208, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47960638999938965, + "eval_runtime": 29.3598, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 802 + }, + { + "epoch": 3.212, + "grad_norm": 10.4375, + "learning_rate": 1.7879999999999998e-05, + "loss": 0.3438, + "step": 803 + }, + { + "epoch": 3.212, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4796377718448639, + "eval_runtime": 29.4887, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 803 + }, + { + "epoch": 3.216, + "grad_norm": 16.25, + "learning_rate": 1.7840000000000002e-05, + "loss": 0.7539, + "step": 804 + }, + { + "epoch": 3.216, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.479778915643692, + "eval_runtime": 29.5957, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.129, + "step": 804 + }, + { + "epoch": 3.22, + "grad_norm": 14.125, + "learning_rate": 1.78e-05, + "loss": 0.4102, + "step": 805 + }, + { + "epoch": 3.22, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4793396592140198, + "eval_runtime": 29.5994, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.128, + "step": 805 + }, + { + "epoch": 3.224, + "grad_norm": 23.25, + "learning_rate": 1.7760000000000003e-05, + "loss": 0.668, + "step": 806 + }, + { + "epoch": 3.224, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.48042210936546326, + "eval_runtime": 29.4933, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 806 + }, + { + "epoch": 3.228, + "grad_norm": 7.125, + "learning_rate": 1.772e-05, + "loss": 0.2891, + "step": 807 + }, + { + "epoch": 3.228, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4803122878074646, + "eval_runtime": 29.4481, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.139, + "step": 807 + }, + { + "epoch": 3.232, + "grad_norm": 17.125, + "learning_rate": 1.7680000000000004e-05, + "loss": 0.5156, + "step": 808 + }, + { + "epoch": 3.232, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4802652597427368, + "eval_runtime": 29.4558, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 808 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 15.75, + "learning_rate": 1.764e-05, + "loss": 0.5938, + "step": 809 + }, + { + "epoch": 3.2359999999999998, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4806574881076813, + "eval_runtime": 29.3718, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.145, + "step": 809 + }, + { + "epoch": 3.24, + "grad_norm": 5.15625, + "learning_rate": 1.76e-05, + "loss": 0.1187, + "step": 810 + }, + { + "epoch": 3.24, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48153606057167053, + "eval_runtime": 29.4102, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.142, + "step": 810 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 15.375, + "learning_rate": 1.756e-05, + "loss": 0.4824, + "step": 811 + }, + { + "epoch": 3.2439999999999998, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.48141056299209595, + "eval_runtime": 29.3395, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 811 + }, + { + "epoch": 3.248, + "grad_norm": 8.25, + "learning_rate": 1.752e-05, + "loss": 0.4102, + "step": 812 + }, + { + "epoch": 3.248, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48181837797164917, + "eval_runtime": 29.3905, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 812 + }, + { + "epoch": 3.252, + "grad_norm": 23.75, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.9609, + "step": 813 + }, + { + "epoch": 3.252, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.48163020610809326, + "eval_runtime": 29.558, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.131, + "step": 813 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 0.11376953125, + "learning_rate": 1.7440000000000002e-05, + "loss": 0.0012, + "step": 814 + }, + { + "epoch": 3.2560000000000002, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4806889295578003, + "eval_runtime": 29.5359, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 814 + }, + { + "epoch": 3.26, + "grad_norm": 7.25, + "learning_rate": 1.74e-05, + "loss": 0.2305, + "step": 815 + }, + { + "epoch": 3.26, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.48100268840789795, + "eval_runtime": 29.5924, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 815 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 8.9375, + "learning_rate": 1.736e-05, + "loss": 0.2188, + "step": 816 + }, + { + "epoch": 3.2640000000000002, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.48051637411117554, + "eval_runtime": 29.5118, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 816 + }, + { + "epoch": 3.268, + "grad_norm": 24.375, + "learning_rate": 1.732e-05, + "loss": 1.0391, + "step": 817 + }, + { + "epoch": 3.268, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48037517070770264, + "eval_runtime": 29.4535, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.139, + "step": 817 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 16.875, + "learning_rate": 1.728e-05, + "loss": 0.3066, + "step": 818 + }, + { + "epoch": 3.2720000000000002, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48109686374664307, + "eval_runtime": 29.396, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.143, + "step": 818 + }, + { + "epoch": 3.276, + "grad_norm": 49.0, + "learning_rate": 1.724e-05, + "loss": 0.8438, + "step": 819 + }, + { + "epoch": 3.276, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4798732399940491, + "eval_runtime": 29.3579, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 819 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 50.0, + "learning_rate": 1.7199999999999998e-05, + "loss": 0.9492, + "step": 820 + }, + { + "epoch": 3.2800000000000002, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4799359440803528, + "eval_runtime": 29.3323, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 820 + }, + { + "epoch": 3.284, + "grad_norm": 7.375, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.1416, + "step": 821 + }, + { + "epoch": 3.284, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47955945134162903, + "eval_runtime": 29.3024, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 821 + }, + { + "epoch": 3.288, + "grad_norm": 19.0, + "learning_rate": 1.712e-05, + "loss": 1.0625, + "step": 822 + }, + { + "epoch": 3.288, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47880643606185913, + "eval_runtime": 29.3485, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 822 + }, + { + "epoch": 3.292, + "grad_norm": 8.375, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.3203, + "step": 823 + }, + { + "epoch": 3.292, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4791986346244812, + "eval_runtime": 29.3424, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 823 + }, + { + "epoch": 3.296, + "grad_norm": 7.46875, + "learning_rate": 1.704e-05, + "loss": 0.3164, + "step": 824 + }, + { + "epoch": 3.296, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4789319634437561, + "eval_runtime": 29.2745, + "eval_samples_per_second": 17.011, + "eval_steps_per_second": 2.152, + "step": 824 + }, + { + "epoch": 3.3, + "grad_norm": 15.5, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.6406, + "step": 825 + }, + { + "epoch": 3.3, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4779750108718872, + "eval_runtime": 29.2659, + "eval_samples_per_second": 17.016, + "eval_steps_per_second": 2.153, + "step": 825 + }, + { + "epoch": 3.304, + "grad_norm": 9.5, + "learning_rate": 1.696e-05, + "loss": 0.8242, + "step": 826 + }, + { + "epoch": 3.304, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4781632721424103, + "eval_runtime": 29.3218, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 826 + }, + { + "epoch": 3.308, + "grad_norm": 29.0, + "learning_rate": 1.692e-05, + "loss": 0.5078, + "step": 827 + }, + { + "epoch": 3.308, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.478461354970932, + "eval_runtime": 29.3398, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 827 + }, + { + "epoch": 3.312, + "grad_norm": 6.1875, + "learning_rate": 1.688e-05, + "loss": 0.3359, + "step": 828 + }, + { + "epoch": 3.312, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47789648175239563, + "eval_runtime": 29.5078, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 828 + }, + { + "epoch": 3.316, + "grad_norm": 6.71875, + "learning_rate": 1.684e-05, + "loss": 0.3906, + "step": 829 + }, + { + "epoch": 3.316, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4778807759284973, + "eval_runtime": 29.5206, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 829 + }, + { + "epoch": 3.32, + "grad_norm": 6.09375, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.1299, + "step": 830 + }, + { + "epoch": 3.32, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47784939408302307, + "eval_runtime": 29.5733, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.13, + "step": 830 + }, + { + "epoch": 3.324, + "grad_norm": 9.5625, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.5781, + "step": 831 + }, + { + "epoch": 3.324, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4782886207103729, + "eval_runtime": 29.5241, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 831 + }, + { + "epoch": 3.328, + "grad_norm": 17.5, + "learning_rate": 1.672e-05, + "loss": 0.4961, + "step": 832 + }, + { + "epoch": 3.328, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4777865707874298, + "eval_runtime": 29.5188, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 832 + }, + { + "epoch": 3.332, + "grad_norm": 4.15625, + "learning_rate": 1.668e-05, + "loss": 0.1523, + "step": 833 + }, + { + "epoch": 3.332, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4783827066421509, + "eval_runtime": 29.4805, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.137, + "step": 833 + }, + { + "epoch": 3.336, + "grad_norm": 2.734375, + "learning_rate": 1.664e-05, + "loss": 0.0889, + "step": 834 + }, + { + "epoch": 3.336, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4781787693500519, + "eval_runtime": 29.3281, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 834 + }, + { + "epoch": 3.34, + "grad_norm": 0.1103515625, + "learning_rate": 1.66e-05, + "loss": 0.0014, + "step": 835 + }, + { + "epoch": 3.34, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.478445440530777, + "eval_runtime": 29.2621, + "eval_samples_per_second": 17.019, + "eval_steps_per_second": 2.153, + "step": 835 + }, + { + "epoch": 3.344, + "grad_norm": 15.0625, + "learning_rate": 1.656e-05, + "loss": 0.3984, + "step": 836 + }, + { + "epoch": 3.344, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4789317846298218, + "eval_runtime": 29.248, + "eval_samples_per_second": 17.027, + "eval_steps_per_second": 2.154, + "step": 836 + }, + { + "epoch": 3.348, + "grad_norm": 5.3125, + "learning_rate": 1.652e-05, + "loss": 0.2334, + "step": 837 + }, + { + "epoch": 3.348, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4789160490036011, + "eval_runtime": 29.3813, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 837 + }, + { + "epoch": 3.352, + "grad_norm": 4.84375, + "learning_rate": 1.648e-05, + "loss": 0.2559, + "step": 838 + }, + { + "epoch": 3.352, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4785395562648773, + "eval_runtime": 29.4807, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 838 + }, + { + "epoch": 3.356, + "grad_norm": 16.875, + "learning_rate": 1.644e-05, + "loss": 0.9805, + "step": 839 + }, + { + "epoch": 3.356, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47902584075927734, + "eval_runtime": 29.5222, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 839 + }, + { + "epoch": 3.36, + "grad_norm": 6.8125, + "learning_rate": 1.6400000000000002e-05, + "loss": 0.4004, + "step": 840 + }, + { + "epoch": 3.36, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.479465126991272, + "eval_runtime": 29.5247, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 840 + }, + { + "epoch": 3.364, + "grad_norm": 9.0625, + "learning_rate": 1.636e-05, + "loss": 0.6172, + "step": 841 + }, + { + "epoch": 3.364, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4794180393218994, + "eval_runtime": 29.5294, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.133, + "step": 841 + }, + { + "epoch": 3.368, + "grad_norm": 10.8125, + "learning_rate": 1.6320000000000003e-05, + "loss": 0.2773, + "step": 842 + }, + { + "epoch": 3.368, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4794650971889496, + "eval_runtime": 29.5759, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.13, + "step": 842 + }, + { + "epoch": 3.372, + "grad_norm": 8.3125, + "learning_rate": 1.628e-05, + "loss": 0.3262, + "step": 843 + }, + { + "epoch": 3.372, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4790101647377014, + "eval_runtime": 29.5048, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 843 + }, + { + "epoch": 3.376, + "grad_norm": 11.4375, + "learning_rate": 1.624e-05, + "loss": 0.6055, + "step": 844 + }, + { + "epoch": 3.376, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4792768359184265, + "eval_runtime": 29.3749, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.145, + "step": 844 + }, + { + "epoch": 3.38, + "grad_norm": 7.78125, + "learning_rate": 1.62e-05, + "loss": 0.4688, + "step": 845 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47979453206062317, + "eval_runtime": 29.3294, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 845 + }, + { + "epoch": 3.384, + "grad_norm": 7.4375, + "learning_rate": 1.616e-05, + "loss": 0.2451, + "step": 846 + }, + { + "epoch": 3.384, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.479135662317276, + "eval_runtime": 29.3636, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.146, + "step": 846 + }, + { + "epoch": 3.388, + "grad_norm": 8.625, + "learning_rate": 1.612e-05, + "loss": 0.248, + "step": 847 + }, + { + "epoch": 3.388, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4789160192012787, + "eval_runtime": 29.3475, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 847 + }, + { + "epoch": 3.392, + "grad_norm": 3.265625, + "learning_rate": 1.6080000000000002e-05, + "loss": 0.0869, + "step": 848 + }, + { + "epoch": 3.392, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4794180393218994, + "eval_runtime": 29.4739, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.137, + "step": 848 + }, + { + "epoch": 3.396, + "grad_norm": 22.25, + "learning_rate": 1.604e-05, + "loss": 0.5312, + "step": 849 + }, + { + "epoch": 3.396, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.479543536901474, + "eval_runtime": 29.5371, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 849 + }, + { + "epoch": 3.4, + "grad_norm": 7.46875, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3789, + "step": 850 + }, + { + "epoch": 3.4, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4791356027126312, + "eval_runtime": 29.5859, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.129, + "step": 850 + }, + { + "epoch": 3.404, + "grad_norm": 4.9375, + "learning_rate": 1.596e-05, + "loss": 0.2148, + "step": 851 + }, + { + "epoch": 3.404, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4789630174636841, + "eval_runtime": 29.4825, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 851 + }, + { + "epoch": 3.408, + "grad_norm": 21.75, + "learning_rate": 1.592e-05, + "loss": 0.9844, + "step": 852 + }, + { + "epoch": 3.408, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47921404242515564, + "eval_runtime": 29.36, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.146, + "step": 852 + }, + { + "epoch": 3.412, + "grad_norm": 13.625, + "learning_rate": 1.588e-05, + "loss": 0.8047, + "step": 853 + }, + { + "epoch": 3.412, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4796375632286072, + "eval_runtime": 29.4929, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 853 + }, + { + "epoch": 3.416, + "grad_norm": 16.125, + "learning_rate": 1.584e-05, + "loss": 1.0078, + "step": 854 + }, + { + "epoch": 3.416, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4801395833492279, + "eval_runtime": 29.5411, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 854 + }, + { + "epoch": 3.42, + "grad_norm": 8.6875, + "learning_rate": 1.58e-05, + "loss": 0.334, + "step": 855 + }, + { + "epoch": 3.42, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4797316789627075, + "eval_runtime": 29.5454, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.132, + "step": 855 + }, + { + "epoch": 3.424, + "grad_norm": 12.625, + "learning_rate": 1.5759999999999998e-05, + "loss": 0.4512, + "step": 856 + }, + { + "epoch": 3.424, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48023366928100586, + "eval_runtime": 29.5257, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 856 + }, + { + "epoch": 3.428, + "grad_norm": 5.25, + "learning_rate": 1.5720000000000002e-05, + "loss": 0.2471, + "step": 857 + }, + { + "epoch": 3.428, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4798571765422821, + "eval_runtime": 29.4566, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 857 + }, + { + "epoch": 3.432, + "grad_norm": 13.9375, + "learning_rate": 1.568e-05, + "loss": 0.4316, + "step": 858 + }, + { + "epoch": 3.432, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.48017093539237976, + "eval_runtime": 29.3945, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.143, + "step": 858 + }, + { + "epoch": 3.436, + "grad_norm": 5.84375, + "learning_rate": 1.5640000000000003e-05, + "loss": 0.1387, + "step": 859 + }, + { + "epoch": 3.436, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4799042344093323, + "eval_runtime": 29.2858, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.151, + "step": 859 + }, + { + "epoch": 3.44, + "grad_norm": 11.4375, + "learning_rate": 1.56e-05, + "loss": 0.7461, + "step": 860 + }, + { + "epoch": 3.44, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.48015522956848145, + "eval_runtime": 29.3064, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 860 + }, + { + "epoch": 3.444, + "grad_norm": 37.25, + "learning_rate": 1.556e-05, + "loss": 0.7344, + "step": 861 + }, + { + "epoch": 3.444, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47941792011260986, + "eval_runtime": 29.3349, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 861 + }, + { + "epoch": 3.448, + "grad_norm": 20.625, + "learning_rate": 1.552e-05, + "loss": 1.4766, + "step": 862 + }, + { + "epoch": 3.448, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4790413975715637, + "eval_runtime": 29.4428, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 862 + }, + { + "epoch": 3.452, + "grad_norm": 11.3125, + "learning_rate": 1.548e-05, + "loss": 0.3945, + "step": 863 + }, + { + "epoch": 3.452, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4790257215499878, + "eval_runtime": 29.5283, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 863 + }, + { + "epoch": 3.456, + "grad_norm": 9.875, + "learning_rate": 1.544e-05, + "loss": 0.167, + "step": 864 + }, + { + "epoch": 3.456, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4784296154975891, + "eval_runtime": 29.5954, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.129, + "step": 864 + }, + { + "epoch": 3.46, + "grad_norm": 12.9375, + "learning_rate": 1.54e-05, + "loss": 0.3965, + "step": 865 + }, + { + "epoch": 3.46, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47849240899086, + "eval_runtime": 29.5533, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 865 + }, + { + "epoch": 3.464, + "grad_norm": 6.0625, + "learning_rate": 1.536e-05, + "loss": 0.1396, + "step": 866 + }, + { + "epoch": 3.464, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4778178334236145, + "eval_runtime": 29.5408, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 866 + }, + { + "epoch": 3.468, + "grad_norm": 9.0, + "learning_rate": 1.5320000000000002e-05, + "loss": 0.4961, + "step": 867 + }, + { + "epoch": 3.468, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.477755069732666, + "eval_runtime": 29.458, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 867 + }, + { + "epoch": 3.472, + "grad_norm": 9.4375, + "learning_rate": 1.528e-05, + "loss": 0.5469, + "step": 868 + }, + { + "epoch": 3.472, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4771275818347931, + "eval_runtime": 29.3921, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.143, + "step": 868 + }, + { + "epoch": 3.476, + "grad_norm": 16.5, + "learning_rate": 1.5240000000000001e-05, + "loss": 0.7344, + "step": 869 + }, + { + "epoch": 3.476, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4770962595939636, + "eval_runtime": 29.2618, + "eval_samples_per_second": 17.019, + "eval_steps_per_second": 2.153, + "step": 869 + }, + { + "epoch": 3.48, + "grad_norm": 25.25, + "learning_rate": 1.52e-05, + "loss": 1.2031, + "step": 870 + }, + { + "epoch": 3.48, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4767354428768158, + "eval_runtime": 29.302, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 870 + }, + { + "epoch": 3.484, + "grad_norm": 10.625, + "learning_rate": 1.5160000000000002e-05, + "loss": 0.5703, + "step": 871 + }, + { + "epoch": 3.484, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47678256034851074, + "eval_runtime": 29.4623, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.138, + "step": 871 + }, + { + "epoch": 3.488, + "grad_norm": 8.625, + "learning_rate": 1.5120000000000001e-05, + "loss": 0.1475, + "step": 872 + }, + { + "epoch": 3.488, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47657862305641174, + "eval_runtime": 29.5932, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.129, + "step": 872 + }, + { + "epoch": 3.492, + "grad_norm": 11.1875, + "learning_rate": 1.508e-05, + "loss": 0.3008, + "step": 873 + }, + { + "epoch": 3.492, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4759197235107422, + "eval_runtime": 29.5599, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 873 + }, + { + "epoch": 3.496, + "grad_norm": 8.125, + "learning_rate": 1.5040000000000002e-05, + "loss": 0.3555, + "step": 874 + }, + { + "epoch": 3.496, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47590407729148865, + "eval_runtime": 29.5801, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.13, + "step": 874 + }, + { + "epoch": 3.5, + "grad_norm": 13.0625, + "learning_rate": 1.5e-05, + "loss": 0.2988, + "step": 875 + }, + { + "epoch": 3.5, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4764217436313629, + "eval_runtime": 29.5288, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 875 + }, + { + "epoch": 3.504, + "grad_norm": 7.5, + "learning_rate": 1.4960000000000002e-05, + "loss": 0.2207, + "step": 876 + }, + { + "epoch": 3.504, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4756530821323395, + "eval_runtime": 29.4595, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 876 + }, + { + "epoch": 3.508, + "grad_norm": 9.125, + "learning_rate": 1.4920000000000001e-05, + "loss": 0.2305, + "step": 877 + }, + { + "epoch": 3.508, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4759354889392853, + "eval_runtime": 29.4437, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 877 + }, + { + "epoch": 3.512, + "grad_norm": 6.75, + "learning_rate": 1.488e-05, + "loss": 0.2871, + "step": 878 + }, + { + "epoch": 3.512, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4752765893936157, + "eval_runtime": 29.4635, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.138, + "step": 878 + }, + { + "epoch": 3.516, + "grad_norm": 11.9375, + "learning_rate": 1.4840000000000002e-05, + "loss": 0.6328, + "step": 879 + }, + { + "epoch": 3.516, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47563740611076355, + "eval_runtime": 29.5665, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.131, + "step": 879 + }, + { + "epoch": 3.52, + "grad_norm": 6.8125, + "learning_rate": 1.48e-05, + "loss": 0.3262, + "step": 880 + }, + { + "epoch": 3.52, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4752609133720398, + "eval_runtime": 29.5897, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.129, + "step": 880 + }, + { + "epoch": 3.524, + "grad_norm": 11.875, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.5625, + "step": 881 + }, + { + "epoch": 3.524, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4753079116344452, + "eval_runtime": 29.5889, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 881 + }, + { + "epoch": 3.528, + "grad_norm": 8.25, + "learning_rate": 1.472e-05, + "loss": 0.4668, + "step": 882 + }, + { + "epoch": 3.528, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4757471978664398, + "eval_runtime": 29.5923, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 882 + }, + { + "epoch": 3.532, + "grad_norm": 8.0, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.4258, + "step": 883 + }, + { + "epoch": 3.532, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47502559423446655, + "eval_runtime": 29.5268, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 883 + }, + { + "epoch": 3.536, + "grad_norm": 22.375, + "learning_rate": 1.464e-05, + "loss": 1.1562, + "step": 884 + }, + { + "epoch": 3.536, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47515109181404114, + "eval_runtime": 29.4623, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.138, + "step": 884 + }, + { + "epoch": 3.54, + "grad_norm": 19.25, + "learning_rate": 1.4599999999999999e-05, + "loss": 0.3906, + "step": 885 + }, + { + "epoch": 3.54, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47541773319244385, + "eval_runtime": 29.4244, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 885 + }, + { + "epoch": 3.544, + "grad_norm": 6.78125, + "learning_rate": 1.4560000000000001e-05, + "loss": 0.2949, + "step": 886 + }, + { + "epoch": 3.544, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4746333360671997, + "eval_runtime": 29.3548, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.146, + "step": 886 + }, + { + "epoch": 3.548, + "grad_norm": 11.5, + "learning_rate": 1.452e-05, + "loss": 0.4609, + "step": 887 + }, + { + "epoch": 3.548, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47494709491729736, + "eval_runtime": 29.2912, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 2.151, + "step": 887 + }, + { + "epoch": 3.552, + "grad_norm": 4.9375, + "learning_rate": 1.4480000000000002e-05, + "loss": 0.1914, + "step": 888 + }, + { + "epoch": 3.552, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47494709491729736, + "eval_runtime": 29.3383, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 888 + }, + { + "epoch": 3.556, + "grad_norm": 4.1875, + "learning_rate": 1.444e-05, + "loss": 0.1299, + "step": 889 + }, + { + "epoch": 3.556, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.47453922033309937, + "eval_runtime": 29.3288, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 889 + }, + { + "epoch": 3.56, + "grad_norm": 8.0, + "learning_rate": 1.44e-05, + "loss": 0.418, + "step": 890 + }, + { + "epoch": 3.56, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47446078062057495, + "eval_runtime": 29.2641, + "eval_samples_per_second": 17.017, + "eval_steps_per_second": 2.153, + "step": 890 + }, + { + "epoch": 3.564, + "grad_norm": 15.625, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.8477, + "step": 891 + }, + { + "epoch": 3.564, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47441366314888, + "eval_runtime": 29.32, + "eval_samples_per_second": 16.985, + "eval_steps_per_second": 2.149, + "step": 891 + }, + { + "epoch": 3.568, + "grad_norm": 12.375, + "learning_rate": 1.432e-05, + "loss": 0.3516, + "step": 892 + }, + { + "epoch": 3.568, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4743352234363556, + "eval_runtime": 29.2569, + "eval_samples_per_second": 17.022, + "eval_steps_per_second": 2.153, + "step": 892 + }, + { + "epoch": 3.572, + "grad_norm": 10.0625, + "learning_rate": 1.4280000000000002e-05, + "loss": 0.4375, + "step": 893 + }, + { + "epoch": 3.572, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4739900827407837, + "eval_runtime": 29.3122, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 893 + }, + { + "epoch": 3.576, + "grad_norm": 13.625, + "learning_rate": 1.4240000000000001e-05, + "loss": 0.4062, + "step": 894 + }, + { + "epoch": 3.576, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47438228130340576, + "eval_runtime": 29.3127, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 894 + }, + { + "epoch": 3.58, + "grad_norm": 19.375, + "learning_rate": 1.42e-05, + "loss": 0.7148, + "step": 895 + }, + { + "epoch": 3.58, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4745548367500305, + "eval_runtime": 29.3133, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.149, + "step": 895 + }, + { + "epoch": 3.584, + "grad_norm": 20.875, + "learning_rate": 1.4160000000000002e-05, + "loss": 0.9375, + "step": 896 + }, + { + "epoch": 3.584, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4744606912136078, + "eval_runtime": 29.3766, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 896 + }, + { + "epoch": 3.588, + "grad_norm": 8.125, + "learning_rate": 1.412e-05, + "loss": 0.3203, + "step": 897 + }, + { + "epoch": 3.588, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4743822515010834, + "eval_runtime": 29.4549, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 897 + }, + { + "epoch": 3.592, + "grad_norm": 14.625, + "learning_rate": 1.408e-05, + "loss": 0.5039, + "step": 898 + }, + { + "epoch": 3.592, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47479012608528137, + "eval_runtime": 29.5152, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.134, + "step": 898 + }, + { + "epoch": 3.596, + "grad_norm": 5.125, + "learning_rate": 1.4040000000000001e-05, + "loss": 0.1689, + "step": 899 + }, + { + "epoch": 3.596, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47458615899086, + "eval_runtime": 29.5258, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 899 + }, + { + "epoch": 3.6, + "grad_norm": 4.34375, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.2461, + "step": 900 + }, + { + "epoch": 3.6, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47463321685791016, + "eval_runtime": 29.527, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 900 + }, + { + "epoch": 3.604, + "grad_norm": 9.25, + "learning_rate": 1.396e-05, + "loss": 0.5547, + "step": 901 + }, + { + "epoch": 3.604, + "eval_accuracy": 0.9216867469879518, + "eval_loss": 0.4746802747249603, + "eval_runtime": 29.2948, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 901 + }, + { + "epoch": 3.608, + "grad_norm": 9.875, + "learning_rate": 1.3919999999999999e-05, + "loss": 0.4102, + "step": 902 + }, + { + "epoch": 3.608, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4744449555873871, + "eval_runtime": 29.2914, + "eval_samples_per_second": 17.002, + "eval_steps_per_second": 2.151, + "step": 902 + }, + { + "epoch": 3.612, + "grad_norm": 9.0625, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.5195, + "step": 903 + }, + { + "epoch": 3.612, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47475871443748474, + "eval_runtime": 29.4616, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.138, + "step": 903 + }, + { + "epoch": 3.616, + "grad_norm": 13.3125, + "learning_rate": 1.384e-05, + "loss": 0.543, + "step": 904 + }, + { + "epoch": 3.616, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4751352071762085, + "eval_runtime": 29.5114, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 904 + }, + { + "epoch": 3.62, + "grad_norm": 12.0625, + "learning_rate": 1.3800000000000002e-05, + "loss": 0.5625, + "step": 905 + }, + { + "epoch": 3.62, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47504109144210815, + "eval_runtime": 29.5862, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.129, + "step": 905 + }, + { + "epoch": 3.624, + "grad_norm": 11.6875, + "learning_rate": 1.376e-05, + "loss": 0.75, + "step": 906 + }, + { + "epoch": 3.624, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4757470488548279, + "eval_runtime": 29.5769, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.13, + "step": 906 + }, + { + "epoch": 3.628, + "grad_norm": 17.0, + "learning_rate": 1.3719999999999999e-05, + "loss": 0.5352, + "step": 907 + }, + { + "epoch": 3.628, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47599804401397705, + "eval_runtime": 29.5016, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 907 + }, + { + "epoch": 3.632, + "grad_norm": 8.1875, + "learning_rate": 1.3680000000000001e-05, + "loss": 0.4688, + "step": 908 + }, + { + "epoch": 3.632, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47606080770492554, + "eval_runtime": 29.3838, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 908 + }, + { + "epoch": 3.636, + "grad_norm": 6.90625, + "learning_rate": 1.364e-05, + "loss": 0.25, + "step": 909 + }, + { + "epoch": 3.636, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4761078655719757, + "eval_runtime": 29.3585, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 909 + }, + { + "epoch": 3.64, + "grad_norm": 14.3125, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.6172, + "step": 910 + }, + { + "epoch": 3.64, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47613924741744995, + "eval_runtime": 29.3143, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.149, + "step": 910 + }, + { + "epoch": 3.644, + "grad_norm": 12.3125, + "learning_rate": 1.356e-05, + "loss": 0.498, + "step": 911 + }, + { + "epoch": 3.644, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47660985589027405, + "eval_runtime": 29.3718, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.145, + "step": 911 + }, + { + "epoch": 3.648, + "grad_norm": 17.0, + "learning_rate": 1.352e-05, + "loss": 0.5117, + "step": 912 + }, + { + "epoch": 3.648, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47642162442207336, + "eval_runtime": 29.4876, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 912 + }, + { + "epoch": 3.652, + "grad_norm": 14.1875, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.3691, + "step": 913 + }, + { + "epoch": 3.652, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47717463970184326, + "eval_runtime": 29.5625, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 913 + }, + { + "epoch": 3.656, + "grad_norm": 6.0625, + "learning_rate": 1.344e-05, + "loss": 0.2969, + "step": 914 + }, + { + "epoch": 3.656, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4776923358440399, + "eval_runtime": 29.5233, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 914 + }, + { + "epoch": 3.66, + "grad_norm": 5.5625, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.3555, + "step": 915 + }, + { + "epoch": 3.66, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4773158133029938, + "eval_runtime": 29.5171, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 915 + }, + { + "epoch": 3.664, + "grad_norm": 16.875, + "learning_rate": 1.336e-05, + "loss": 1.125, + "step": 916 + }, + { + "epoch": 3.664, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47792765498161316, + "eval_runtime": 29.5288, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 916 + }, + { + "epoch": 3.668, + "grad_norm": 6.4375, + "learning_rate": 1.3320000000000001e-05, + "loss": 0.2148, + "step": 917 + }, + { + "epoch": 3.668, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47770801186561584, + "eval_runtime": 29.4674, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.138, + "step": 917 + }, + { + "epoch": 3.672, + "grad_norm": 9.6875, + "learning_rate": 1.3280000000000002e-05, + "loss": 0.5312, + "step": 918 + }, + { + "epoch": 3.672, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47783350944519043, + "eval_runtime": 29.3835, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 918 + }, + { + "epoch": 3.676, + "grad_norm": 6.28125, + "learning_rate": 1.324e-05, + "loss": 0.2061, + "step": 919 + }, + { + "epoch": 3.676, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4779433310031891, + "eval_runtime": 29.345, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.147, + "step": 919 + }, + { + "epoch": 3.68, + "grad_norm": 5.1875, + "learning_rate": 1.32e-05, + "loss": 0.3145, + "step": 920 + }, + { + "epoch": 3.68, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47762957215309143, + "eval_runtime": 29.3768, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 920 + }, + { + "epoch": 3.684, + "grad_norm": 10.625, + "learning_rate": 1.316e-05, + "loss": 0.4473, + "step": 921 + }, + { + "epoch": 3.684, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47737857699394226, + "eval_runtime": 29.4945, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.136, + "step": 921 + }, + { + "epoch": 3.6879999999999997, + "grad_norm": 11.5, + "learning_rate": 1.3120000000000001e-05, + "loss": 0.5938, + "step": 922 + }, + { + "epoch": 3.6879999999999997, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4778805673122406, + "eval_runtime": 29.5139, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 922 + }, + { + "epoch": 3.692, + "grad_norm": 5.84375, + "learning_rate": 1.308e-05, + "loss": 0.3438, + "step": 923 + }, + { + "epoch": 3.692, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47764521837234497, + "eval_runtime": 29.5309, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.133, + "step": 923 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 6.6875, + "learning_rate": 1.3039999999999999e-05, + "loss": 0.1177, + "step": 924 + }, + { + "epoch": 3.6959999999999997, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47734716534614563, + "eval_runtime": 29.525, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 924 + }, + { + "epoch": 3.7, + "grad_norm": 15.1875, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.4902, + "step": 925 + }, + { + "epoch": 3.7, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4774099290370941, + "eval_runtime": 29.4913, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 925 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 11.5, + "learning_rate": 1.296e-05, + "loss": 0.498, + "step": 926 + }, + { + "epoch": 3.7039999999999997, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.477033406496048, + "eval_runtime": 29.3698, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 926 + }, + { + "epoch": 3.708, + "grad_norm": 1.3203125, + "learning_rate": 1.2920000000000002e-05, + "loss": 0.0206, + "step": 927 + }, + { + "epoch": 3.708, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4767667055130005, + "eval_runtime": 29.3526, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.146, + "step": 927 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 4.875, + "learning_rate": 1.288e-05, + "loss": 0.2715, + "step": 928 + }, + { + "epoch": 3.7119999999999997, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4761548936367035, + "eval_runtime": 29.3196, + "eval_samples_per_second": 16.985, + "eval_steps_per_second": 2.149, + "step": 928 + }, + { + "epoch": 3.716, + "grad_norm": 5.28125, + "learning_rate": 1.2839999999999999e-05, + "loss": 0.2129, + "step": 929 + }, + { + "epoch": 3.716, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4756685793399811, + "eval_runtime": 29.4714, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 929 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 10.25, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.459, + "step": 930 + }, + { + "epoch": 3.7199999999999998, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4756058156490326, + "eval_runtime": 29.5254, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 930 + }, + { + "epoch": 3.724, + "grad_norm": 8.4375, + "learning_rate": 1.276e-05, + "loss": 0.3828, + "step": 931 + }, + { + "epoch": 3.724, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4755430817604065, + "eval_runtime": 29.5925, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 931 + }, + { + "epoch": 3.7279999999999998, + "grad_norm": 8.75, + "learning_rate": 1.2720000000000002e-05, + "loss": 0.2227, + "step": 932 + }, + { + "epoch": 3.7279999999999998, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4755901098251343, + "eval_runtime": 29.5288, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 932 + }, + { + "epoch": 3.732, + "grad_norm": 5.84375, + "learning_rate": 1.268e-05, + "loss": 0.103, + "step": 933 + }, + { + "epoch": 3.732, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4753391444683075, + "eval_runtime": 29.4857, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 933 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 8.125, + "learning_rate": 1.2640000000000003e-05, + "loss": 0.334, + "step": 934 + }, + { + "epoch": 3.7359999999999998, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4749469459056854, + "eval_runtime": 29.3981, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.143, + "step": 934 + }, + { + "epoch": 3.74, + "grad_norm": 15.5, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.6133, + "step": 935 + }, + { + "epoch": 3.74, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4747900664806366, + "eval_runtime": 29.3428, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 935 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 14.5625, + "learning_rate": 1.256e-05, + "loss": 0.5703, + "step": 936 + }, + { + "epoch": 3.7439999999999998, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47422531247138977, + "eval_runtime": 29.3068, + "eval_samples_per_second": 16.993, + "eval_steps_per_second": 2.15, + "step": 936 + }, + { + "epoch": 3.748, + "grad_norm": 12.125, + "learning_rate": 1.252e-05, + "loss": 0.5781, + "step": 937 + }, + { + "epoch": 3.748, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4743037223815918, + "eval_runtime": 29.2884, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.151, + "step": 937 + }, + { + "epoch": 3.752, + "grad_norm": 63.5, + "learning_rate": 1.248e-05, + "loss": 1.5859, + "step": 938 + }, + { + "epoch": 3.752, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4743821918964386, + "eval_runtime": 29.2952, + "eval_samples_per_second": 16.999, + "eval_steps_per_second": 2.151, + "step": 938 + }, + { + "epoch": 3.7560000000000002, + "grad_norm": 10.125, + "learning_rate": 1.244e-05, + "loss": 0.4629, + "step": 939 + }, + { + "epoch": 3.7560000000000002, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4735036790370941, + "eval_runtime": 29.3391, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 939 + }, + { + "epoch": 3.76, + "grad_norm": 14.1875, + "learning_rate": 1.24e-05, + "loss": 0.5742, + "step": 940 + }, + { + "epoch": 3.76, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47311148047447205, + "eval_runtime": 29.5254, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 940 + }, + { + "epoch": 3.7640000000000002, + "grad_norm": 70.0, + "learning_rate": 1.236e-05, + "loss": 0.6094, + "step": 941 + }, + { + "epoch": 3.7640000000000002, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4729546010494232, + "eval_runtime": 29.5689, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 941 + }, + { + "epoch": 3.768, + "grad_norm": 15.125, + "learning_rate": 1.232e-05, + "loss": 0.3438, + "step": 942 + }, + { + "epoch": 3.768, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4735664129257202, + "eval_runtime": 29.5913, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 942 + }, + { + "epoch": 3.7720000000000002, + "grad_norm": 11.0625, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.2734, + "step": 943 + }, + { + "epoch": 3.7720000000000002, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47337815165519714, + "eval_runtime": 29.5421, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 943 + }, + { + "epoch": 3.776, + "grad_norm": 7.8125, + "learning_rate": 1.224e-05, + "loss": 0.375, + "step": 944 + }, + { + "epoch": 3.776, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4730958044528961, + "eval_runtime": 29.5273, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 944 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 6.03125, + "learning_rate": 1.22e-05, + "loss": 0.1328, + "step": 945 + }, + { + "epoch": 3.7800000000000002, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4730173349380493, + "eval_runtime": 29.4586, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 945 + }, + { + "epoch": 3.784, + "grad_norm": 14.1875, + "learning_rate": 1.216e-05, + "loss": 0.6562, + "step": 946 + }, + { + "epoch": 3.784, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4732056260108948, + "eval_runtime": 29.4381, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.14, + "step": 946 + }, + { + "epoch": 3.7880000000000003, + "grad_norm": 21.625, + "learning_rate": 1.2120000000000001e-05, + "loss": 1.0625, + "step": 947 + }, + { + "epoch": 3.7880000000000003, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.473331093788147, + "eval_runtime": 29.3488, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 947 + }, + { + "epoch": 3.792, + "grad_norm": 11.25, + "learning_rate": 1.2080000000000001e-05, + "loss": 0.3418, + "step": 948 + }, + { + "epoch": 3.792, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.473237007856369, + "eval_runtime": 29.3823, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 948 + }, + { + "epoch": 3.7960000000000003, + "grad_norm": 10.1875, + "learning_rate": 1.204e-05, + "loss": 0.3828, + "step": 949 + }, + { + "epoch": 3.7960000000000003, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4732683598995209, + "eval_runtime": 29.3558, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 949 + }, + { + "epoch": 3.8, + "grad_norm": 11.75, + "learning_rate": 1.2e-05, + "loss": 0.7031, + "step": 950 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47318992018699646, + "eval_runtime": 29.2977, + "eval_samples_per_second": 16.998, + "eval_steps_per_second": 2.15, + "step": 950 + }, + { + "epoch": 3.8040000000000003, + "grad_norm": 50.75, + "learning_rate": 1.196e-05, + "loss": 0.6172, + "step": 951 + }, + { + "epoch": 3.8040000000000003, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4732840657234192, + "eval_runtime": 29.3419, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 951 + }, + { + "epoch": 3.808, + "grad_norm": 7.75, + "learning_rate": 1.1920000000000001e-05, + "loss": 0.4336, + "step": 952 + }, + { + "epoch": 3.808, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4735507369041443, + "eval_runtime": 29.2785, + "eval_samples_per_second": 17.009, + "eval_steps_per_second": 2.152, + "step": 952 + }, + { + "epoch": 3.8120000000000003, + "grad_norm": 7.25, + "learning_rate": 1.1880000000000001e-05, + "loss": 0.375, + "step": 953 + }, + { + "epoch": 3.8120000000000003, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47359779477119446, + "eval_runtime": 29.4542, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.139, + "step": 953 + }, + { + "epoch": 3.816, + "grad_norm": 6.875, + "learning_rate": 1.1840000000000002e-05, + "loss": 0.2852, + "step": 954 + }, + { + "epoch": 3.816, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47366052865982056, + "eval_runtime": 29.5208, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 954 + }, + { + "epoch": 3.82, + "grad_norm": 2.703125, + "learning_rate": 1.18e-05, + "loss": 0.0859, + "step": 955 + }, + { + "epoch": 3.82, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4735507369041443, + "eval_runtime": 29.5376, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 955 + }, + { + "epoch": 3.824, + "grad_norm": 6.4375, + "learning_rate": 1.1760000000000001e-05, + "loss": 0.2871, + "step": 956 + }, + { + "epoch": 3.824, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47359779477119446, + "eval_runtime": 29.5339, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 956 + }, + { + "epoch": 3.828, + "grad_norm": 11.6875, + "learning_rate": 1.172e-05, + "loss": 0.3262, + "step": 957 + }, + { + "epoch": 3.828, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47373896837234497, + "eval_runtime": 29.5835, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 957 + }, + { + "epoch": 3.832, + "grad_norm": 21.125, + "learning_rate": 1.168e-05, + "loss": 0.7188, + "step": 958 + }, + { + "epoch": 3.832, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4739586114883423, + "eval_runtime": 29.4684, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 958 + }, + { + "epoch": 3.836, + "grad_norm": 21.625, + "learning_rate": 1.164e-05, + "loss": 1.0156, + "step": 959 + }, + { + "epoch": 3.836, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4737546741962433, + "eval_runtime": 29.3496, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 959 + }, + { + "epoch": 3.84, + "grad_norm": 9.25, + "learning_rate": 1.16e-05, + "loss": 0.3184, + "step": 960 + }, + { + "epoch": 3.84, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4737389385700226, + "eval_runtime": 29.3654, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 960 + }, + { + "epoch": 3.844, + "grad_norm": 14.5625, + "learning_rate": 1.156e-05, + "loss": 0.7578, + "step": 961 + }, + { + "epoch": 3.844, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4740840792655945, + "eval_runtime": 29.4912, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 961 + }, + { + "epoch": 3.848, + "grad_norm": 6.53125, + "learning_rate": 1.152e-05, + "loss": 0.2734, + "step": 962 + }, + { + "epoch": 3.848, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47406840324401855, + "eval_runtime": 29.5363, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 962 + }, + { + "epoch": 3.852, + "grad_norm": 10.25, + "learning_rate": 1.148e-05, + "loss": 0.3594, + "step": 963 + }, + { + "epoch": 3.852, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47398996353149414, + "eval_runtime": 29.5847, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.129, + "step": 963 + }, + { + "epoch": 3.856, + "grad_norm": 21.625, + "learning_rate": 1.144e-05, + "loss": 0.3789, + "step": 964 + }, + { + "epoch": 3.856, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4736762046813965, + "eval_runtime": 29.5716, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.13, + "step": 964 + }, + { + "epoch": 3.86, + "grad_norm": 18.75, + "learning_rate": 1.1400000000000001e-05, + "loss": 0.6875, + "step": 965 + }, + { + "epoch": 3.86, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47358205914497375, + "eval_runtime": 29.5639, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.131, + "step": 965 + }, + { + "epoch": 3.864, + "grad_norm": 10.625, + "learning_rate": 1.1360000000000001e-05, + "loss": 0.2891, + "step": 966 + }, + { + "epoch": 3.864, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47364482283592224, + "eval_runtime": 29.4077, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.142, + "step": 966 + }, + { + "epoch": 3.868, + "grad_norm": 9.9375, + "learning_rate": 1.132e-05, + "loss": 0.4922, + "step": 967 + }, + { + "epoch": 3.868, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47378602623939514, + "eval_runtime": 29.3166, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.149, + "step": 967 + }, + { + "epoch": 3.872, + "grad_norm": 10.5, + "learning_rate": 1.128e-05, + "loss": 0.3867, + "step": 968 + }, + { + "epoch": 3.872, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4737075865268707, + "eval_runtime": 29.333, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 968 + }, + { + "epoch": 3.876, + "grad_norm": 9.75, + "learning_rate": 1.124e-05, + "loss": 0.3438, + "step": 969 + }, + { + "epoch": 3.876, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4738801419734955, + "eval_runtime": 29.3324, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 969 + }, + { + "epoch": 3.88, + "grad_norm": 9.3125, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.4746, + "step": 970 + }, + { + "epoch": 3.88, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4739742577075958, + "eval_runtime": 29.4702, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 970 + }, + { + "epoch": 3.884, + "grad_norm": 24.5, + "learning_rate": 1.1160000000000002e-05, + "loss": 1.0156, + "step": 971 + }, + { + "epoch": 3.884, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47413113713264465, + "eval_runtime": 29.5613, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 971 + }, + { + "epoch": 3.888, + "grad_norm": 15.75, + "learning_rate": 1.112e-05, + "loss": 0.7969, + "step": 972 + }, + { + "epoch": 3.888, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4739585816860199, + "eval_runtime": 29.5846, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.129, + "step": 972 + }, + { + "epoch": 3.892, + "grad_norm": 7.28125, + "learning_rate": 1.108e-05, + "loss": 0.1572, + "step": 973 + }, + { + "epoch": 3.892, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4744919538497925, + "eval_runtime": 29.5382, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 973 + }, + { + "epoch": 3.896, + "grad_norm": 15.125, + "learning_rate": 1.1040000000000001e-05, + "loss": 0.4609, + "step": 974 + }, + { + "epoch": 3.896, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4742880165576935, + "eval_runtime": 29.534, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 974 + }, + { + "epoch": 3.9, + "grad_norm": 11.75, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.6875, + "step": 975 + }, + { + "epoch": 3.9, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47427234053611755, + "eval_runtime": 29.5182, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 975 + }, + { + "epoch": 3.904, + "grad_norm": 4.6875, + "learning_rate": 1.096e-05, + "loss": 0.2061, + "step": 976 + }, + { + "epoch": 3.904, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4740370213985443, + "eval_runtime": 29.4389, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 976 + }, + { + "epoch": 3.908, + "grad_norm": 7.15625, + "learning_rate": 1.092e-05, + "loss": 0.4062, + "step": 977 + }, + { + "epoch": 3.908, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47441351413726807, + "eval_runtime": 29.3658, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.145, + "step": 977 + }, + { + "epoch": 3.912, + "grad_norm": 9.875, + "learning_rate": 1.088e-05, + "loss": 0.3203, + "step": 978 + }, + { + "epoch": 3.912, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4745076596736908, + "eval_runtime": 29.4399, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.14, + "step": 978 + }, + { + "epoch": 3.916, + "grad_norm": 5.75, + "learning_rate": 1.084e-05, + "loss": 0.1924, + "step": 979 + }, + { + "epoch": 3.916, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4743821620941162, + "eval_runtime": 29.5504, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 979 + }, + { + "epoch": 3.92, + "grad_norm": 9.5625, + "learning_rate": 1.08e-05, + "loss": 0.3496, + "step": 980 + }, + { + "epoch": 3.92, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47433507442474365, + "eval_runtime": 29.607, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.128, + "step": 980 + }, + { + "epoch": 3.924, + "grad_norm": 4.65625, + "learning_rate": 1.076e-05, + "loss": 0.1855, + "step": 981 + }, + { + "epoch": 3.924, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4747272729873657, + "eval_runtime": 29.5563, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 981 + }, + { + "epoch": 3.928, + "grad_norm": 18.25, + "learning_rate": 1.072e-05, + "loss": 1.0312, + "step": 982 + }, + { + "epoch": 3.928, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4740840494632721, + "eval_runtime": 29.5885, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 982 + }, + { + "epoch": 3.932, + "grad_norm": 6.625, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.2539, + "step": 983 + }, + { + "epoch": 3.932, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4745076298713684, + "eval_runtime": 29.5431, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.132, + "step": 983 + }, + { + "epoch": 3.936, + "grad_norm": 5.5625, + "learning_rate": 1.064e-05, + "loss": 0.0928, + "step": 984 + }, + { + "epoch": 3.936, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47466450929641724, + "eval_runtime": 29.4588, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 984 + }, + { + "epoch": 3.94, + "grad_norm": 16.75, + "learning_rate": 1.06e-05, + "loss": 0.6211, + "step": 985 + }, + { + "epoch": 3.94, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47419387102127075, + "eval_runtime": 29.4142, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 985 + }, + { + "epoch": 3.944, + "grad_norm": 18.125, + "learning_rate": 1.056e-05, + "loss": 0.8906, + "step": 986 + }, + { + "epoch": 3.944, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4746174216270447, + "eval_runtime": 29.3213, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 986 + }, + { + "epoch": 3.948, + "grad_norm": 15.5625, + "learning_rate": 1.0520000000000001e-05, + "loss": 0.5859, + "step": 987 + }, + { + "epoch": 3.948, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47455471754074097, + "eval_runtime": 29.3572, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.146, + "step": 987 + }, + { + "epoch": 3.952, + "grad_norm": 9.9375, + "learning_rate": 1.0480000000000001e-05, + "loss": 0.3047, + "step": 988 + }, + { + "epoch": 3.952, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4741625189781189, + "eval_runtime": 29.424, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 988 + }, + { + "epoch": 3.956, + "grad_norm": 7.6875, + "learning_rate": 1.0440000000000002e-05, + "loss": 0.3887, + "step": 989 + }, + { + "epoch": 3.956, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47446057200431824, + "eval_runtime": 29.5018, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 989 + }, + { + "epoch": 3.96, + "grad_norm": 20.0, + "learning_rate": 1.04e-05, + "loss": 0.8633, + "step": 990 + }, + { + "epoch": 3.96, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47414684295654297, + "eval_runtime": 29.5529, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 990 + }, + { + "epoch": 3.964, + "grad_norm": 9.4375, + "learning_rate": 1.036e-05, + "loss": 0.4414, + "step": 991 + }, + { + "epoch": 3.964, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47433507442474365, + "eval_runtime": 29.5387, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 991 + }, + { + "epoch": 3.968, + "grad_norm": 3.125, + "learning_rate": 1.0320000000000001e-05, + "loss": 0.0452, + "step": 992 + }, + { + "epoch": 3.968, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47366049885749817, + "eval_runtime": 29.579, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.13, + "step": 992 + }, + { + "epoch": 3.972, + "grad_norm": 7.28125, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.3926, + "step": 993 + }, + { + "epoch": 3.972, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47380170226097107, + "eval_runtime": 29.5671, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.131, + "step": 993 + }, + { + "epoch": 3.976, + "grad_norm": 7.78125, + "learning_rate": 1.024e-05, + "loss": 0.2715, + "step": 994 + }, + { + "epoch": 3.976, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4740840792655945, + "eval_runtime": 29.4755, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.137, + "step": 994 + }, + { + "epoch": 3.98, + "grad_norm": 9.375, + "learning_rate": 1.02e-05, + "loss": 0.5352, + "step": 995 + }, + { + "epoch": 3.98, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47406840324401855, + "eval_runtime": 29.3871, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.144, + "step": 995 + }, + { + "epoch": 3.984, + "grad_norm": 20.625, + "learning_rate": 1.016e-05, + "loss": 0.6953, + "step": 996 + }, + { + "epoch": 3.984, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47398996353149414, + "eval_runtime": 29.332, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 996 + }, + { + "epoch": 3.988, + "grad_norm": 13.375, + "learning_rate": 1.012e-05, + "loss": 0.7734, + "step": 997 + }, + { + "epoch": 3.988, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4743193984031677, + "eval_runtime": 29.3474, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 997 + }, + { + "epoch": 3.992, + "grad_norm": 27.625, + "learning_rate": 1.008e-05, + "loss": 0.6719, + "step": 998 + }, + { + "epoch": 3.992, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4744448959827423, + "eval_runtime": 29.441, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.14, + "step": 998 + }, + { + "epoch": 3.996, + "grad_norm": 8.8125, + "learning_rate": 1.004e-05, + "loss": 0.4863, + "step": 999 + }, + { + "epoch": 3.996, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47475865483283997, + "eval_runtime": 29.5164, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 999 + }, + { + "epoch": 4.0, + "grad_norm": 16.625, + "learning_rate": 1e-05, + "loss": 0.6406, + "step": 1000 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4746645390987396, + "eval_runtime": 29.5393, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 1000 + }, + { + "epoch": 4.004, + "grad_norm": 20.875, + "learning_rate": 9.96e-06, + "loss": 0.8047, + "step": 1001 + }, + { + "epoch": 4.004, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47460177540779114, + "eval_runtime": 29.3424, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 1001 + }, + { + "epoch": 4.008, + "grad_norm": 12.5, + "learning_rate": 9.92e-06, + "loss": 0.6484, + "step": 1002 + }, + { + "epoch": 4.008, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47433507442474365, + "eval_runtime": 29.4561, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 1002 + }, + { + "epoch": 4.012, + "grad_norm": 7.40625, + "learning_rate": 9.88e-06, + "loss": 0.4727, + "step": 1003 + }, + { + "epoch": 4.012, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4745076596736908, + "eval_runtime": 29.5492, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 1003 + }, + { + "epoch": 4.016, + "grad_norm": 12.1875, + "learning_rate": 9.84e-06, + "loss": 0.4492, + "step": 1004 + }, + { + "epoch": 4.016, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47510379552841187, + "eval_runtime": 29.53, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.133, + "step": 1004 + }, + { + "epoch": 4.02, + "grad_norm": 6.375, + "learning_rate": 9.800000000000001e-06, + "loss": 0.252, + "step": 1005 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4747272729873657, + "eval_runtime": 29.536, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 1005 + }, + { + "epoch": 4.024, + "grad_norm": 8.4375, + "learning_rate": 9.760000000000001e-06, + "loss": 0.3906, + "step": 1006 + }, + { + "epoch": 4.024, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4746645390987396, + "eval_runtime": 29.5815, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.13, + "step": 1006 + }, + { + "epoch": 4.028, + "grad_norm": 6.125, + "learning_rate": 9.72e-06, + "loss": 0.2285, + "step": 1007 + }, + { + "epoch": 4.028, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47441351413726807, + "eval_runtime": 29.4916, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.136, + "step": 1007 + }, + { + "epoch": 4.032, + "grad_norm": 19.75, + "learning_rate": 9.68e-06, + "loss": 0.6602, + "step": 1008 + }, + { + "epoch": 4.032, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.4748370945453644, + "eval_runtime": 29.4568, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.139, + "step": 1008 + }, + { + "epoch": 4.036, + "grad_norm": 20.0, + "learning_rate": 9.640000000000001e-06, + "loss": 1.0312, + "step": 1009 + }, + { + "epoch": 4.036, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47508808970451355, + "eval_runtime": 29.3541, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.146, + "step": 1009 + }, + { + "epoch": 4.04, + "grad_norm": 4.59375, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0972, + "step": 1010 + }, + { + "epoch": 4.04, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47468021512031555, + "eval_runtime": 29.3162, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.149, + "step": 1010 + }, + { + "epoch": 4.044, + "grad_norm": 6.65625, + "learning_rate": 9.560000000000002e-06, + "loss": 0.2539, + "step": 1011 + }, + { + "epoch": 4.044, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4747115969657898, + "eval_runtime": 29.3467, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.147, + "step": 1011 + }, + { + "epoch": 4.048, + "grad_norm": 12.6875, + "learning_rate": 9.52e-06, + "loss": 0.6914, + "step": 1012 + }, + { + "epoch": 4.048, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4747115969657898, + "eval_runtime": 29.4831, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.137, + "step": 1012 + }, + { + "epoch": 4.052, + "grad_norm": 11.1875, + "learning_rate": 9.48e-06, + "loss": 0.4277, + "step": 1013 + }, + { + "epoch": 4.052, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4744919538497925, + "eval_runtime": 29.5752, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.13, + "step": 1013 + }, + { + "epoch": 4.056, + "grad_norm": 9.3125, + "learning_rate": 9.44e-06, + "loss": 0.3496, + "step": 1014 + }, + { + "epoch": 4.056, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47496259212493896, + "eval_runtime": 29.5319, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 1014 + }, + { + "epoch": 4.06, + "grad_norm": 8.375, + "learning_rate": 9.4e-06, + "loss": 0.373, + "step": 1015 + }, + { + "epoch": 4.06, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47480571269989014, + "eval_runtime": 29.5743, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.13, + "step": 1015 + }, + { + "epoch": 4.064, + "grad_norm": 10.25, + "learning_rate": 9.36e-06, + "loss": 0.582, + "step": 1016 + }, + { + "epoch": 4.064, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47455471754074097, + "eval_runtime": 29.4731, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 1016 + }, + { + "epoch": 4.068, + "grad_norm": 3.84375, + "learning_rate": 9.32e-06, + "loss": 0.1338, + "step": 1017 + }, + { + "epoch": 4.068, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4746645390987396, + "eval_runtime": 29.4429, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.14, + "step": 1017 + }, + { + "epoch": 4.072, + "grad_norm": 6.78125, + "learning_rate": 9.28e-06, + "loss": 0.1621, + "step": 1018 + }, + { + "epoch": 4.072, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47461745142936707, + "eval_runtime": 29.393, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.143, + "step": 1018 + }, + { + "epoch": 4.076, + "grad_norm": 4.125, + "learning_rate": 9.24e-06, + "loss": 0.0608, + "step": 1019 + }, + { + "epoch": 4.076, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4747743308544159, + "eval_runtime": 29.377, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 1019 + }, + { + "epoch": 4.08, + "grad_norm": 6.875, + "learning_rate": 9.2e-06, + "loss": 0.1924, + "step": 1020 + }, + { + "epoch": 4.08, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4747115969657898, + "eval_runtime": 29.4849, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 1020 + }, + { + "epoch": 4.084, + "grad_norm": 10.9375, + "learning_rate": 9.16e-06, + "loss": 0.6211, + "step": 1021 + }, + { + "epoch": 4.084, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4748684763908386, + "eval_runtime": 29.5358, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 1021 + }, + { + "epoch": 4.088, + "grad_norm": 13.4375, + "learning_rate": 9.12e-06, + "loss": 0.5859, + "step": 1022 + }, + { + "epoch": 4.088, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4746018052101135, + "eval_runtime": 29.5295, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.133, + "step": 1022 + }, + { + "epoch": 4.092, + "grad_norm": 13.5625, + "learning_rate": 9.080000000000001e-06, + "loss": 0.7461, + "step": 1023 + }, + { + "epoch": 4.092, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4745703935623169, + "eval_runtime": 29.571, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.13, + "step": 1023 + }, + { + "epoch": 4.096, + "grad_norm": 1.9453125, + "learning_rate": 9.04e-06, + "loss": 0.0349, + "step": 1024 + }, + { + "epoch": 4.096, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4749312102794647, + "eval_runtime": 29.505, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.135, + "step": 1024 + }, + { + "epoch": 4.1, + "grad_norm": 12.3125, + "learning_rate": 9e-06, + "loss": 0.5469, + "step": 1025 + }, + { + "epoch": 4.1, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4747900366783142, + "eval_runtime": 29.4205, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.141, + "step": 1025 + }, + { + "epoch": 4.104, + "grad_norm": 8.9375, + "learning_rate": 8.96e-06, + "loss": 0.3555, + "step": 1026 + }, + { + "epoch": 4.104, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4742880165576935, + "eval_runtime": 29.4064, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.142, + "step": 1026 + }, + { + "epoch": 4.108, + "grad_norm": 7.3125, + "learning_rate": 8.920000000000001e-06, + "loss": 0.4766, + "step": 1027 + }, + { + "epoch": 4.108, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47453901171684265, + "eval_runtime": 29.3764, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 1027 + }, + { + "epoch": 4.112, + "grad_norm": 5.1875, + "learning_rate": 8.880000000000001e-06, + "loss": 0.2256, + "step": 1028 + }, + { + "epoch": 4.112, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47435078024864197, + "eval_runtime": 29.3621, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 1028 + }, + { + "epoch": 4.116, + "grad_norm": 22.75, + "learning_rate": 8.840000000000002e-06, + "loss": 0.5312, + "step": 1029 + }, + { + "epoch": 4.116, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47435078024864197, + "eval_runtime": 29.5333, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 1029 + }, + { + "epoch": 4.12, + "grad_norm": 7.65625, + "learning_rate": 8.8e-06, + "loss": 0.3027, + "step": 1030 + }, + { + "epoch": 4.12, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4742880165576935, + "eval_runtime": 29.5698, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 1030 + }, + { + "epoch": 4.124, + "grad_norm": 7.0625, + "learning_rate": 8.76e-06, + "loss": 0.4023, + "step": 1031 + }, + { + "epoch": 4.124, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47461745142936707, + "eval_runtime": 29.594, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.129, + "step": 1031 + }, + { + "epoch": 4.128, + "grad_norm": 10.1875, + "learning_rate": 8.720000000000001e-06, + "loss": 0.3047, + "step": 1032 + }, + { + "epoch": 4.128, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47427234053611755, + "eval_runtime": 29.5518, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.132, + "step": 1032 + }, + { + "epoch": 4.132, + "grad_norm": 12.75, + "learning_rate": 8.68e-06, + "loss": 0.5039, + "step": 1033 + }, + { + "epoch": 4.132, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4744448959827423, + "eval_runtime": 29.5637, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.131, + "step": 1033 + }, + { + "epoch": 4.136, + "grad_norm": 6.75, + "learning_rate": 8.64e-06, + "loss": 0.2832, + "step": 1034 + }, + { + "epoch": 4.136, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47420957684516907, + "eval_runtime": 29.4189, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.141, + "step": 1034 + }, + { + "epoch": 4.14, + "grad_norm": 7.1875, + "learning_rate": 8.599999999999999e-06, + "loss": 0.2295, + "step": 1035 + }, + { + "epoch": 4.14, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4743664562702179, + "eval_runtime": 29.3225, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 1035 + }, + { + "epoch": 4.144, + "grad_norm": 7.0625, + "learning_rate": 8.56e-06, + "loss": 0.2275, + "step": 1036 + }, + { + "epoch": 4.144, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4740997552871704, + "eval_runtime": 29.2709, + "eval_samples_per_second": 17.013, + "eval_steps_per_second": 2.152, + "step": 1036 + }, + { + "epoch": 4.148, + "grad_norm": 1.8125, + "learning_rate": 8.52e-06, + "loss": 0.0255, + "step": 1037 + }, + { + "epoch": 4.148, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47392717003822327, + "eval_runtime": 29.3614, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 1037 + }, + { + "epoch": 4.152, + "grad_norm": 42.5, + "learning_rate": 8.48e-06, + "loss": 1.0938, + "step": 1038 + }, + { + "epoch": 4.152, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4736134111881256, + "eval_runtime": 29.4594, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 1038 + }, + { + "epoch": 4.156, + "grad_norm": 40.5, + "learning_rate": 8.44e-06, + "loss": 0.7422, + "step": 1039 + }, + { + "epoch": 4.156, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47345656156539917, + "eval_runtime": 29.5524, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 1039 + }, + { + "epoch": 4.16, + "grad_norm": 7.40625, + "learning_rate": 8.400000000000001e-06, + "loss": 0.2197, + "step": 1040 + }, + { + "epoch": 4.16, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47337809205055237, + "eval_runtime": 29.5703, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 1040 + }, + { + "epoch": 4.164, + "grad_norm": 13.0625, + "learning_rate": 8.36e-06, + "loss": 0.6719, + "step": 1041 + }, + { + "epoch": 4.164, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47356635332107544, + "eval_runtime": 29.5218, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 1041 + }, + { + "epoch": 4.168, + "grad_norm": 10.25, + "learning_rate": 8.32e-06, + "loss": 0.4082, + "step": 1042 + }, + { + "epoch": 4.168, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4734252095222473, + "eval_runtime": 29.5286, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 1042 + }, + { + "epoch": 4.172, + "grad_norm": 10.4375, + "learning_rate": 8.28e-06, + "loss": 0.6602, + "step": 1043 + }, + { + "epoch": 4.172, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47331538796424866, + "eval_runtime": 29.5338, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.133, + "step": 1043 + }, + { + "epoch": 4.176, + "grad_norm": 19.625, + "learning_rate": 8.24e-06, + "loss": 0.6445, + "step": 1044 + }, + { + "epoch": 4.176, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47336244583129883, + "eval_runtime": 29.4554, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 1044 + }, + { + "epoch": 4.18, + "grad_norm": 7.125, + "learning_rate": 8.200000000000001e-06, + "loss": 0.1895, + "step": 1045 + }, + { + "epoch": 4.18, + "eval_accuracy": 0.9377510040160643, + "eval_loss": 0.47325262427330017, + "eval_runtime": 29.4055, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.142, + "step": 1045 + }, + { + "epoch": 4.184, + "grad_norm": 13.3125, + "learning_rate": 8.160000000000001e-06, + "loss": 0.3496, + "step": 1046 + }, + { + "epoch": 4.184, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47260940074920654, + "eval_runtime": 29.3145, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.149, + "step": 1046 + }, + { + "epoch": 4.188, + "grad_norm": 17.125, + "learning_rate": 8.12e-06, + "loss": 0.3867, + "step": 1047 + }, + { + "epoch": 4.188, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47358205914497375, + "eval_runtime": 29.2954, + "eval_samples_per_second": 16.999, + "eval_steps_per_second": 2.151, + "step": 1047 + }, + { + "epoch": 4.192, + "grad_norm": 7.625, + "learning_rate": 8.08e-06, + "loss": 0.2695, + "step": 1048 + }, + { + "epoch": 4.192, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.473001629114151, + "eval_runtime": 29.3536, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.146, + "step": 1048 + }, + { + "epoch": 4.196, + "grad_norm": 6.96875, + "learning_rate": 8.040000000000001e-06, + "loss": 0.2734, + "step": 1049 + }, + { + "epoch": 4.196, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4730643332004547, + "eval_runtime": 29.3415, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.147, + "step": 1049 + }, + { + "epoch": 4.2, + "grad_norm": 27.25, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4922, + "step": 1050 + }, + { + "epoch": 4.2, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47295457124710083, + "eval_runtime": 29.4033, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 1050 + }, + { + "epoch": 4.204, + "grad_norm": 1.5546875, + "learning_rate": 7.96e-06, + "loss": 0.015, + "step": 1051 + }, + { + "epoch": 4.204, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47282904386520386, + "eval_runtime": 29.3432, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 1051 + }, + { + "epoch": 4.208, + "grad_norm": 10.8125, + "learning_rate": 7.92e-06, + "loss": 0.5195, + "step": 1052 + }, + { + "epoch": 4.208, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47275060415267944, + "eval_runtime": 29.3648, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 1052 + }, + { + "epoch": 4.212, + "grad_norm": 8.25, + "learning_rate": 7.879999999999999e-06, + "loss": 0.373, + "step": 1053 + }, + { + "epoch": 4.212, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4725152850151062, + "eval_runtime": 29.466, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.138, + "step": 1053 + }, + { + "epoch": 4.216, + "grad_norm": 12.6875, + "learning_rate": 7.84e-06, + "loss": 0.4434, + "step": 1054 + }, + { + "epoch": 4.216, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47287610173225403, + "eval_runtime": 29.5723, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.13, + "step": 1054 + }, + { + "epoch": 4.22, + "grad_norm": 16.125, + "learning_rate": 7.8e-06, + "loss": 0.7148, + "step": 1055 + }, + { + "epoch": 4.22, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47287610173225403, + "eval_runtime": 29.5874, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 1055 + }, + { + "epoch": 4.224, + "grad_norm": 9.375, + "learning_rate": 7.76e-06, + "loss": 0.3516, + "step": 1056 + }, + { + "epoch": 4.224, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47309571504592896, + "eval_runtime": 29.5918, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.129, + "step": 1056 + }, + { + "epoch": 4.228, + "grad_norm": 22.375, + "learning_rate": 7.72e-06, + "loss": 0.8242, + "step": 1057 + }, + { + "epoch": 4.228, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47254663705825806, + "eval_runtime": 29.5558, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 1057 + }, + { + "epoch": 4.232, + "grad_norm": 18.25, + "learning_rate": 7.68e-06, + "loss": 0.1689, + "step": 1058 + }, + { + "epoch": 4.232, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47282904386520386, + "eval_runtime": 29.4204, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.141, + "step": 1058 + }, + { + "epoch": 4.236, + "grad_norm": 3.265625, + "learning_rate": 7.64e-06, + "loss": 0.0566, + "step": 1059 + }, + { + "epoch": 4.236, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4727035164833069, + "eval_runtime": 29.3552, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.146, + "step": 1059 + }, + { + "epoch": 4.24, + "grad_norm": 5.625, + "learning_rate": 7.6e-06, + "loss": 0.2422, + "step": 1060 + }, + { + "epoch": 4.24, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47273486852645874, + "eval_runtime": 29.384, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 1060 + }, + { + "epoch": 4.244, + "grad_norm": 10.0625, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.2051, + "step": 1061 + }, + { + "epoch": 4.244, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4726407527923584, + "eval_runtime": 29.3695, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 1061 + }, + { + "epoch": 4.248, + "grad_norm": 11.25, + "learning_rate": 7.520000000000001e-06, + "loss": 0.4551, + "step": 1062 + }, + { + "epoch": 4.248, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47281330823898315, + "eval_runtime": 29.3565, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 1062 + }, + { + "epoch": 4.252, + "grad_norm": 16.875, + "learning_rate": 7.480000000000001e-06, + "loss": 0.4746, + "step": 1063 + }, + { + "epoch": 4.252, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472766250371933, + "eval_runtime": 29.3491, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 1063 + }, + { + "epoch": 4.256, + "grad_norm": 3.171875, + "learning_rate": 7.44e-06, + "loss": 0.0977, + "step": 1064 + }, + { + "epoch": 4.256, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4730486273765564, + "eval_runtime": 29.482, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.137, + "step": 1064 + }, + { + "epoch": 4.26, + "grad_norm": 12.625, + "learning_rate": 7.4e-06, + "loss": 0.1885, + "step": 1065 + }, + { + "epoch": 4.26, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47275057435035706, + "eval_runtime": 29.5368, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 1065 + }, + { + "epoch": 4.264, + "grad_norm": 6.28125, + "learning_rate": 7.36e-06, + "loss": 0.2734, + "step": 1066 + }, + { + "epoch": 4.264, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472766250371933, + "eval_runtime": 29.5427, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 1066 + }, + { + "epoch": 4.268, + "grad_norm": 12.625, + "learning_rate": 7.32e-06, + "loss": 0.7109, + "step": 1067 + }, + { + "epoch": 4.268, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47289174795150757, + "eval_runtime": 29.5388, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 1067 + }, + { + "epoch": 4.272, + "grad_norm": 22.375, + "learning_rate": 7.280000000000001e-06, + "loss": 0.7773, + "step": 1068 + }, + { + "epoch": 4.272, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4723740518093109, + "eval_runtime": 29.5728, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.13, + "step": 1068 + }, + { + "epoch": 4.276, + "grad_norm": 19.125, + "learning_rate": 7.240000000000001e-06, + "loss": 0.6953, + "step": 1069 + }, + { + "epoch": 4.276, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47260937094688416, + "eval_runtime": 29.4611, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.138, + "step": 1069 + }, + { + "epoch": 4.28, + "grad_norm": 15.6875, + "learning_rate": 7.2e-06, + "loss": 0.6211, + "step": 1070 + }, + { + "epoch": 4.28, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.3694, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 1070 + }, + { + "epoch": 4.284, + "grad_norm": 31.375, + "learning_rate": 7.16e-06, + "loss": 0.8711, + "step": 1071 + }, + { + "epoch": 4.284, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47253093123435974, + "eval_runtime": 29.3313, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.148, + "step": 1071 + }, + { + "epoch": 4.288, + "grad_norm": 7.65625, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.167, + "step": 1072 + }, + { + "epoch": 4.288, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4728289842605591, + "eval_runtime": 29.3035, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 1072 + }, + { + "epoch": 4.292, + "grad_norm": 8.0, + "learning_rate": 7.080000000000001e-06, + "loss": 0.3418, + "step": 1073 + }, + { + "epoch": 4.292, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4725152552127838, + "eval_runtime": 29.2932, + "eval_samples_per_second": 17.001, + "eval_steps_per_second": 2.151, + "step": 1073 + }, + { + "epoch": 4.296, + "grad_norm": 23.25, + "learning_rate": 7.04e-06, + "loss": 0.9805, + "step": 1074 + }, + { + "epoch": 4.296, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4720759987831116, + "eval_runtime": 29.2749, + "eval_samples_per_second": 17.011, + "eval_steps_per_second": 2.152, + "step": 1074 + }, + { + "epoch": 4.3, + "grad_norm": 18.125, + "learning_rate": 7.000000000000001e-06, + "loss": 0.8359, + "step": 1075 + }, + { + "epoch": 4.3, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47254660725593567, + "eval_runtime": 29.2695, + "eval_samples_per_second": 17.014, + "eval_steps_per_second": 2.152, + "step": 1075 + }, + { + "epoch": 4.304, + "grad_norm": 11.0, + "learning_rate": 6.9599999999999994e-06, + "loss": 0.3633, + "step": 1076 + }, + { + "epoch": 4.304, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4726564288139343, + "eval_runtime": 29.3301, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.148, + "step": 1076 + }, + { + "epoch": 4.308, + "grad_norm": 3.125, + "learning_rate": 6.92e-06, + "loss": 0.0898, + "step": 1077 + }, + { + "epoch": 4.308, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47204461693763733, + "eval_runtime": 29.5191, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 1077 + }, + { + "epoch": 4.312, + "grad_norm": 13.3125, + "learning_rate": 6.88e-06, + "loss": 0.5312, + "step": 1078 + }, + { + "epoch": 4.312, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47234266996383667, + "eval_runtime": 29.5372, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 1078 + }, + { + "epoch": 4.316, + "grad_norm": 10.125, + "learning_rate": 6.840000000000001e-06, + "loss": 0.5508, + "step": 1079 + }, + { + "epoch": 4.316, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47268781065940857, + "eval_runtime": 29.5395, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.133, + "step": 1079 + }, + { + "epoch": 4.32, + "grad_norm": 7.25, + "learning_rate": 6.800000000000001e-06, + "loss": 0.334, + "step": 1080 + }, + { + "epoch": 4.32, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4721073508262634, + "eval_runtime": 29.5275, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 1080 + }, + { + "epoch": 4.324, + "grad_norm": 8.9375, + "learning_rate": 6.76e-06, + "loss": 0.3555, + "step": 1081 + }, + { + "epoch": 4.324, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4722328782081604, + "eval_runtime": 29.5558, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 1081 + }, + { + "epoch": 4.328, + "grad_norm": 11.3125, + "learning_rate": 6.72e-06, + "loss": 0.418, + "step": 1082 + }, + { + "epoch": 4.328, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47213873267173767, + "eval_runtime": 29.4889, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.136, + "step": 1082 + }, + { + "epoch": 4.332, + "grad_norm": 13.625, + "learning_rate": 6.68e-06, + "loss": 0.7227, + "step": 1083 + }, + { + "epoch": 4.332, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4723740518093109, + "eval_runtime": 29.4058, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.142, + "step": 1083 + }, + { + "epoch": 4.336, + "grad_norm": 12.3125, + "learning_rate": 6.640000000000001e-06, + "loss": 0.4727, + "step": 1084 + }, + { + "epoch": 4.336, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.3664, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.145, + "step": 1084 + }, + { + "epoch": 4.34, + "grad_norm": 10.5625, + "learning_rate": 6.6e-06, + "loss": 0.0938, + "step": 1085 + }, + { + "epoch": 4.34, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47191911935806274, + "eval_runtime": 29.4126, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.142, + "step": 1085 + }, + { + "epoch": 4.344, + "grad_norm": 11.875, + "learning_rate": 6.560000000000001e-06, + "loss": 0.5781, + "step": 1086 + }, + { + "epoch": 4.344, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47191911935806274, + "eval_runtime": 29.5288, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 1086 + }, + { + "epoch": 4.348, + "grad_norm": 10.9375, + "learning_rate": 6.519999999999999e-06, + "loss": 0.5938, + "step": 1087 + }, + { + "epoch": 4.348, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4721701145172119, + "eval_runtime": 29.5647, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.131, + "step": 1087 + }, + { + "epoch": 4.352, + "grad_norm": 7.71875, + "learning_rate": 6.48e-06, + "loss": 0.3125, + "step": 1088 + }, + { + "epoch": 4.352, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47204461693763733, + "eval_runtime": 29.5292, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.133, + "step": 1088 + }, + { + "epoch": 4.356, + "grad_norm": 8.5625, + "learning_rate": 6.44e-06, + "loss": 0.2891, + "step": 1089 + }, + { + "epoch": 4.356, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4721701145172119, + "eval_runtime": 29.5447, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.132, + "step": 1089 + }, + { + "epoch": 4.36, + "grad_norm": 15.1875, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.5078, + "step": 1090 + }, + { + "epoch": 4.36, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47227993607521057, + "eval_runtime": 29.6065, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.128, + "step": 1090 + }, + { + "epoch": 4.364, + "grad_norm": 0.9921875, + "learning_rate": 6.360000000000001e-06, + "loss": 0.0172, + "step": 1091 + }, + { + "epoch": 4.364, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4727191925048828, + "eval_runtime": 29.5575, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.131, + "step": 1091 + }, + { + "epoch": 4.368, + "grad_norm": 9.6875, + "learning_rate": 6.320000000000001e-06, + "loss": 0.3203, + "step": 1092 + }, + { + "epoch": 4.368, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47232699394226074, + "eval_runtime": 29.4211, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.141, + "step": 1092 + }, + { + "epoch": 4.372, + "grad_norm": 22.5, + "learning_rate": 6.28e-06, + "loss": 0.5273, + "step": 1093 + }, + { + "epoch": 4.372, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472358375787735, + "eval_runtime": 29.3306, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.148, + "step": 1093 + }, + { + "epoch": 4.376, + "grad_norm": 17.75, + "learning_rate": 6.24e-06, + "loss": 0.5781, + "step": 1094 + }, + { + "epoch": 4.376, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4723740518093109, + "eval_runtime": 29.2647, + "eval_samples_per_second": 17.017, + "eval_steps_per_second": 2.153, + "step": 1094 + }, + { + "epoch": 4.38, + "grad_norm": 13.1875, + "learning_rate": 6.2e-06, + "loss": 0.5938, + "step": 1095 + }, + { + "epoch": 4.38, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4728446900844574, + "eval_runtime": 29.4649, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.138, + "step": 1095 + }, + { + "epoch": 4.384, + "grad_norm": 25.375, + "learning_rate": 6.16e-06, + "loss": 0.8984, + "step": 1096 + }, + { + "epoch": 4.384, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47260937094688416, + "eval_runtime": 29.4971, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.136, + "step": 1096 + }, + { + "epoch": 4.388, + "grad_norm": 15.3125, + "learning_rate": 6.12e-06, + "loss": 0.6055, + "step": 1097 + }, + { + "epoch": 4.388, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47232699394226074, + "eval_runtime": 29.5258, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 1097 + }, + { + "epoch": 4.392, + "grad_norm": 4.375, + "learning_rate": 6.08e-06, + "loss": 0.2012, + "step": 1098 + }, + { + "epoch": 4.392, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47193479537963867, + "eval_runtime": 29.5323, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 1098 + }, + { + "epoch": 4.396, + "grad_norm": 7.40625, + "learning_rate": 6.040000000000001e-06, + "loss": 0.2539, + "step": 1099 + }, + { + "epoch": 4.396, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724524915218353, + "eval_runtime": 29.5357, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 1099 + }, + { + "epoch": 4.4, + "grad_norm": 12.0, + "learning_rate": 6e-06, + "loss": 0.4824, + "step": 1100 + }, + { + "epoch": 4.4, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47260937094688416, + "eval_runtime": 29.5008, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 1100 + }, + { + "epoch": 4.404, + "grad_norm": 9.0625, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.4043, + "step": 1101 + }, + { + "epoch": 4.404, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4729074239730835, + "eval_runtime": 29.3457, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.147, + "step": 1101 + }, + { + "epoch": 4.408, + "grad_norm": 12.875, + "learning_rate": 5.920000000000001e-06, + "loss": 0.3984, + "step": 1102 + }, + { + "epoch": 4.408, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.472970187664032, + "eval_runtime": 29.3021, + "eval_samples_per_second": 16.995, + "eval_steps_per_second": 2.15, + "step": 1102 + }, + { + "epoch": 4.412, + "grad_norm": 9.875, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.4629, + "step": 1103 + }, + { + "epoch": 4.412, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4727034866809845, + "eval_runtime": 29.5047, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 1103 + }, + { + "epoch": 4.416, + "grad_norm": 9.375, + "learning_rate": 5.84e-06, + "loss": 0.4629, + "step": 1104 + }, + { + "epoch": 4.416, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47220149636268616, + "eval_runtime": 29.5622, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 1104 + }, + { + "epoch": 4.42, + "grad_norm": 8.0625, + "learning_rate": 5.8e-06, + "loss": 0.3027, + "step": 1105 + }, + { + "epoch": 4.42, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.4726407527923584, + "eval_runtime": 29.5348, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 1105 + }, + { + "epoch": 4.424, + "grad_norm": 16.625, + "learning_rate": 5.76e-06, + "loss": 0.6602, + "step": 1106 + }, + { + "epoch": 4.424, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4729074239730835, + "eval_runtime": 29.582, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.13, + "step": 1106 + }, + { + "epoch": 4.428, + "grad_norm": 11.9375, + "learning_rate": 5.72e-06, + "loss": 0.5469, + "step": 1107 + }, + { + "epoch": 4.428, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47268781065940857, + "eval_runtime": 29.5438, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.132, + "step": 1107 + }, + { + "epoch": 4.432, + "grad_norm": 4.59375, + "learning_rate": 5.680000000000001e-06, + "loss": 0.1807, + "step": 1108 + }, + { + "epoch": 4.432, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472766250371933, + "eval_runtime": 29.503, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 1108 + }, + { + "epoch": 4.436, + "grad_norm": 36.75, + "learning_rate": 5.64e-06, + "loss": 0.6055, + "step": 1109 + }, + { + "epoch": 4.436, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472562313079834, + "eval_runtime": 29.4141, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.142, + "step": 1109 + }, + { + "epoch": 4.44, + "grad_norm": 7.375, + "learning_rate": 5.600000000000001e-06, + "loss": 0.4688, + "step": 1110 + }, + { + "epoch": 4.44, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472358375787735, + "eval_runtime": 29.4175, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 1110 + }, + { + "epoch": 4.444, + "grad_norm": 7.84375, + "learning_rate": 5.56e-06, + "loss": 0.3125, + "step": 1111 + }, + { + "epoch": 4.444, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4723113179206848, + "eval_runtime": 29.3827, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.144, + "step": 1111 + }, + { + "epoch": 4.448, + "grad_norm": 6.625, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.2969, + "step": 1112 + }, + { + "epoch": 4.448, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4718877375125885, + "eval_runtime": 29.3615, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.146, + "step": 1112 + }, + { + "epoch": 4.452, + "grad_norm": 8.5, + "learning_rate": 5.48e-06, + "loss": 0.3848, + "step": 1113 + }, + { + "epoch": 4.452, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47260937094688416, + "eval_runtime": 29.289, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.151, + "step": 1113 + }, + { + "epoch": 4.456, + "grad_norm": 27.25, + "learning_rate": 5.44e-06, + "loss": 0.6094, + "step": 1114 + }, + { + "epoch": 4.456, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47224855422973633, + "eval_runtime": 29.3386, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 1114 + }, + { + "epoch": 4.46, + "grad_norm": 12.9375, + "learning_rate": 5.4e-06, + "loss": 0.3262, + "step": 1115 + }, + { + "epoch": 4.46, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4728446900844574, + "eval_runtime": 29.3695, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.145, + "step": 1115 + }, + { + "epoch": 4.464, + "grad_norm": 10.125, + "learning_rate": 5.36e-06, + "loss": 0.3184, + "step": 1116 + }, + { + "epoch": 4.464, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4727976322174072, + "eval_runtime": 29.334, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 1116 + }, + { + "epoch": 4.468, + "grad_norm": 17.75, + "learning_rate": 5.32e-06, + "loss": 0.2295, + "step": 1117 + }, + { + "epoch": 4.468, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47275057435035706, + "eval_runtime": 29.3642, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 1117 + }, + { + "epoch": 4.4719999999999995, + "grad_norm": 6.9375, + "learning_rate": 5.28e-06, + "loss": 0.4258, + "step": 1118 + }, + { + "epoch": 4.4719999999999995, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47314274311065674, + "eval_runtime": 29.3566, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.146, + "step": 1118 + }, + { + "epoch": 4.476, + "grad_norm": 6.75, + "learning_rate": 5.240000000000001e-06, + "loss": 0.2754, + "step": 1119 + }, + { + "epoch": 4.476, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.472766250371933, + "eval_runtime": 29.3953, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.143, + "step": 1119 + }, + { + "epoch": 4.48, + "grad_norm": 10.6875, + "learning_rate": 5.2e-06, + "loss": 0.2432, + "step": 1120 + }, + { + "epoch": 4.48, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4726564288139343, + "eval_runtime": 29.4896, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.136, + "step": 1120 + }, + { + "epoch": 4.484, + "grad_norm": 20.375, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.9883, + "step": 1121 + }, + { + "epoch": 4.484, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.4995, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 1121 + }, + { + "epoch": 4.4879999999999995, + "grad_norm": 25.75, + "learning_rate": 5.12e-06, + "loss": 0.6523, + "step": 1122 + }, + { + "epoch": 4.4879999999999995, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47275057435035706, + "eval_runtime": 29.4965, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.136, + "step": 1122 + }, + { + "epoch": 4.492, + "grad_norm": 11.125, + "learning_rate": 5.08e-06, + "loss": 0.4004, + "step": 1123 + }, + { + "epoch": 4.492, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47224855422973633, + "eval_runtime": 29.5606, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 1123 + }, + { + "epoch": 4.496, + "grad_norm": 15.0625, + "learning_rate": 5.04e-06, + "loss": 0.2188, + "step": 1124 + }, + { + "epoch": 4.496, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47254660725593567, + "eval_runtime": 29.5077, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.135, + "step": 1124 + }, + { + "epoch": 4.5, + "grad_norm": 18.25, + "learning_rate": 5e-06, + "loss": 0.8594, + "step": 1125 + }, + { + "epoch": 4.5, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.502, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 1125 + }, + { + "epoch": 4.504, + "grad_norm": 12.6875, + "learning_rate": 4.96e-06, + "loss": 0.7148, + "step": 1126 + }, + { + "epoch": 4.504, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47206029295921326, + "eval_runtime": 29.4527, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.139, + "step": 1126 + }, + { + "epoch": 4.508, + "grad_norm": 7.59375, + "learning_rate": 4.92e-06, + "loss": 0.2236, + "step": 1127 + }, + { + "epoch": 4.508, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4727976322174072, + "eval_runtime": 29.4753, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.137, + "step": 1127 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 12.5, + "learning_rate": 4.880000000000001e-06, + "loss": 0.6953, + "step": 1128 + }, + { + "epoch": 4.5120000000000005, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4728289842605591, + "eval_runtime": 29.4034, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 1128 + }, + { + "epoch": 4.516, + "grad_norm": 20.25, + "learning_rate": 4.84e-06, + "loss": 0.5273, + "step": 1129 + }, + { + "epoch": 4.516, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4727819263935089, + "eval_runtime": 29.379, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 1129 + }, + { + "epoch": 4.52, + "grad_norm": 54.0, + "learning_rate": 4.800000000000001e-06, + "loss": 0.8516, + "step": 1130 + }, + { + "epoch": 4.52, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472154438495636, + "eval_runtime": 29.3691, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.145, + "step": 1130 + }, + { + "epoch": 4.524, + "grad_norm": 10.375, + "learning_rate": 4.76e-06, + "loss": 0.2158, + "step": 1131 + }, + { + "epoch": 4.524, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4726250469684601, + "eval_runtime": 29.3646, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 1131 + }, + { + "epoch": 4.5280000000000005, + "grad_norm": 10.75, + "learning_rate": 4.72e-06, + "loss": 0.459, + "step": 1132 + }, + { + "epoch": 4.5280000000000005, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47220149636268616, + "eval_runtime": 29.4184, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 1132 + }, + { + "epoch": 4.532, + "grad_norm": 0.275390625, + "learning_rate": 4.68e-06, + "loss": 0.0032, + "step": 1133 + }, + { + "epoch": 4.532, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47253093123435974, + "eval_runtime": 29.4233, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 1133 + }, + { + "epoch": 4.536, + "grad_norm": 14.6875, + "learning_rate": 4.64e-06, + "loss": 0.8242, + "step": 1134 + }, + { + "epoch": 4.536, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.4231, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.141, + "step": 1134 + }, + { + "epoch": 4.54, + "grad_norm": 19.125, + "learning_rate": 4.6e-06, + "loss": 0.2471, + "step": 1135 + }, + { + "epoch": 4.54, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4731898307800293, + "eval_runtime": 29.4194, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.141, + "step": 1135 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 16.375, + "learning_rate": 4.56e-06, + "loss": 0.668, + "step": 1136 + }, + { + "epoch": 4.5440000000000005, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4727976322174072, + "eval_runtime": 29.4196, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.141, + "step": 1136 + }, + { + "epoch": 4.548, + "grad_norm": 9.25, + "learning_rate": 4.52e-06, + "loss": 0.3145, + "step": 1137 + }, + { + "epoch": 4.548, + "eval_accuracy": 0.9236947791164659, + "eval_loss": 0.47267213463783264, + "eval_runtime": 29.4195, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.141, + "step": 1137 + }, + { + "epoch": 4.552, + "grad_norm": 6.53125, + "learning_rate": 4.48e-06, + "loss": 0.3359, + "step": 1138 + }, + { + "epoch": 4.552, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47240543365478516, + "eval_runtime": 29.3653, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.145, + "step": 1138 + }, + { + "epoch": 4.556, + "grad_norm": 62.5, + "learning_rate": 4.440000000000001e-06, + "loss": 1.0156, + "step": 1139 + }, + { + "epoch": 4.556, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47268781065940857, + "eval_runtime": 29.4222, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.141, + "step": 1139 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 4.6875, + "learning_rate": 4.4e-06, + "loss": 0.1963, + "step": 1140 + }, + { + "epoch": 4.5600000000000005, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.4279, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 1140 + }, + { + "epoch": 4.564, + "grad_norm": 7.125, + "learning_rate": 4.360000000000001e-06, + "loss": 0.3789, + "step": 1141 + }, + { + "epoch": 4.564, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47227993607521057, + "eval_runtime": 29.4329, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.14, + "step": 1141 + }, + { + "epoch": 4.568, + "grad_norm": 1.453125, + "learning_rate": 4.32e-06, + "loss": 0.0197, + "step": 1142 + }, + { + "epoch": 4.568, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4725936949253082, + "eval_runtime": 29.4369, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.14, + "step": 1142 + }, + { + "epoch": 4.572, + "grad_norm": 8.25, + "learning_rate": 4.28e-06, + "loss": 0.4023, + "step": 1143 + }, + { + "epoch": 4.572, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47275057435035706, + "eval_runtime": 29.3791, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 1143 + }, + { + "epoch": 4.576, + "grad_norm": 10.875, + "learning_rate": 4.24e-06, + "loss": 0.6406, + "step": 1144 + }, + { + "epoch": 4.576, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4721073508262634, + "eval_runtime": 29.3766, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 1144 + }, + { + "epoch": 4.58, + "grad_norm": 11.625, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5234, + "step": 1145 + }, + { + "epoch": 4.58, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4724368155002594, + "eval_runtime": 29.4371, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.14, + "step": 1145 + }, + { + "epoch": 4.584, + "grad_norm": 7.09375, + "learning_rate": 4.16e-06, + "loss": 0.3984, + "step": 1146 + }, + { + "epoch": 4.584, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4727191925048828, + "eval_runtime": 29.3812, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 1146 + }, + { + "epoch": 4.588, + "grad_norm": 10.4375, + "learning_rate": 4.12e-06, + "loss": 0.2012, + "step": 1147 + }, + { + "epoch": 4.588, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.3797, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 1147 + }, + { + "epoch": 4.592, + "grad_norm": 17.0, + "learning_rate": 4.080000000000001e-06, + "loss": 0.3926, + "step": 1148 + }, + { + "epoch": 4.592, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4726407527923584, + "eval_runtime": 29.3955, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.143, + "step": 1148 + }, + { + "epoch": 4.596, + "grad_norm": 12.5625, + "learning_rate": 4.04e-06, + "loss": 0.2334, + "step": 1149 + }, + { + "epoch": 4.596, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4726407527923584, + "eval_runtime": 29.4009, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 1149 + }, + { + "epoch": 4.6, + "grad_norm": 11.0, + "learning_rate": 4.000000000000001e-06, + "loss": 0.4414, + "step": 1150 + }, + { + "epoch": 4.6, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4728289842605591, + "eval_runtime": 29.4053, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.142, + "step": 1150 + }, + { + "epoch": 4.604, + "grad_norm": 35.25, + "learning_rate": 3.96e-06, + "loss": 0.9492, + "step": 1151 + }, + { + "epoch": 4.604, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.4227, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.141, + "step": 1151 + }, + { + "epoch": 4.608, + "grad_norm": 7.09375, + "learning_rate": 3.92e-06, + "loss": 0.3477, + "step": 1152 + }, + { + "epoch": 4.608, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4729074239730835, + "eval_runtime": 29.4281, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 1152 + }, + { + "epoch": 4.612, + "grad_norm": 8.1875, + "learning_rate": 3.88e-06, + "loss": 0.459, + "step": 1153 + }, + { + "epoch": 4.612, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472358375787735, + "eval_runtime": 29.4868, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.137, + "step": 1153 + }, + { + "epoch": 4.616, + "grad_norm": 22.125, + "learning_rate": 3.84e-06, + "loss": 0.2451, + "step": 1154 + }, + { + "epoch": 4.616, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.5664, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.131, + "step": 1154 + }, + { + "epoch": 4.62, + "grad_norm": 12.125, + "learning_rate": 3.8e-06, + "loss": 0.6133, + "step": 1155 + }, + { + "epoch": 4.62, + "eval_accuracy": 0.9257028112449799, + "eval_loss": 0.47308000922203064, + "eval_runtime": 29.5401, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 1155 + }, + { + "epoch": 4.624, + "grad_norm": 8.1875, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.5156, + "step": 1156 + }, + { + "epoch": 4.624, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4728446900844574, + "eval_runtime": 29.5608, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.131, + "step": 1156 + }, + { + "epoch": 4.628, + "grad_norm": 11.875, + "learning_rate": 3.72e-06, + "loss": 0.5859, + "step": 1157 + }, + { + "epoch": 4.628, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4728446900844574, + "eval_runtime": 29.5888, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.129, + "step": 1157 + }, + { + "epoch": 4.632, + "grad_norm": 15.25, + "learning_rate": 3.68e-06, + "loss": 0.7578, + "step": 1158 + }, + { + "epoch": 4.632, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4725152552127838, + "eval_runtime": 29.4684, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.138, + "step": 1158 + }, + { + "epoch": 4.636, + "grad_norm": 12.375, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.4355, + "step": 1159 + }, + { + "epoch": 4.636, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4726564288139343, + "eval_runtime": 29.3782, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 1159 + }, + { + "epoch": 4.64, + "grad_norm": 40.75, + "learning_rate": 3.6e-06, + "loss": 1.3281, + "step": 1160 + }, + { + "epoch": 4.64, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47240543365478516, + "eval_runtime": 29.3293, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 1160 + }, + { + "epoch": 4.644, + "grad_norm": 9.6875, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.3184, + "step": 1161 + }, + { + "epoch": 4.644, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47260937094688416, + "eval_runtime": 29.2936, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 1161 + }, + { + "epoch": 4.648, + "grad_norm": 16.875, + "learning_rate": 3.52e-06, + "loss": 0.6875, + "step": 1162 + }, + { + "epoch": 4.648, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.3302, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.148, + "step": 1162 + }, + { + "epoch": 4.652, + "grad_norm": 11.0625, + "learning_rate": 3.4799999999999997e-06, + "loss": 0.5234, + "step": 1163 + }, + { + "epoch": 4.652, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47248387336730957, + "eval_runtime": 29.2555, + "eval_samples_per_second": 17.022, + "eval_steps_per_second": 2.153, + "step": 1163 + }, + { + "epoch": 4.656, + "grad_norm": 6.65625, + "learning_rate": 3.44e-06, + "loss": 0.2754, + "step": 1164 + }, + { + "epoch": 4.656, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4726407527923584, + "eval_runtime": 29.2494, + "eval_samples_per_second": 17.026, + "eval_steps_per_second": 2.154, + "step": 1164 + }, + { + "epoch": 4.66, + "grad_norm": 8.5, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.4004, + "step": 1165 + }, + { + "epoch": 4.66, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47308000922203064, + "eval_runtime": 29.3411, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.147, + "step": 1165 + }, + { + "epoch": 4.664, + "grad_norm": 12.625, + "learning_rate": 3.36e-06, + "loss": 0.7383, + "step": 1166 + }, + { + "epoch": 4.664, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4729074239730835, + "eval_runtime": 29.449, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.139, + "step": 1166 + }, + { + "epoch": 4.668, + "grad_norm": 8.5625, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.4336, + "step": 1167 + }, + { + "epoch": 4.668, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4729074239730835, + "eval_runtime": 29.5559, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 1167 + }, + { + "epoch": 4.672, + "grad_norm": 23.375, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.9883, + "step": 1168 + }, + { + "epoch": 4.672, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.472562313079834, + "eval_runtime": 29.523, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 1168 + }, + { + "epoch": 4.676, + "grad_norm": 5.09375, + "learning_rate": 3.24e-06, + "loss": 0.1689, + "step": 1169 + }, + { + "epoch": 4.676, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47273486852645874, + "eval_runtime": 29.5177, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 1169 + }, + { + "epoch": 4.68, + "grad_norm": 22.125, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.9492, + "step": 1170 + }, + { + "epoch": 4.68, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472358375787735, + "eval_runtime": 29.5182, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 1170 + }, + { + "epoch": 4.684, + "grad_norm": 6.125, + "learning_rate": 3.1600000000000007e-06, + "loss": 0.2246, + "step": 1171 + }, + { + "epoch": 4.684, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4726250469684601, + "eval_runtime": 29.5522, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.132, + "step": 1171 + }, + { + "epoch": 4.688, + "grad_norm": 15.5625, + "learning_rate": 3.12e-06, + "loss": 0.7578, + "step": 1172 + }, + { + "epoch": 4.688, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472562313079834, + "eval_runtime": 29.4011, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 1172 + }, + { + "epoch": 4.692, + "grad_norm": 15.75, + "learning_rate": 3.08e-06, + "loss": 0.6211, + "step": 1173 + }, + { + "epoch": 4.692, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4725936949253082, + "eval_runtime": 29.3343, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 1173 + }, + { + "epoch": 4.696, + "grad_norm": 3.3125, + "learning_rate": 3.04e-06, + "loss": 0.0625, + "step": 1174 + }, + { + "epoch": 4.696, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47224855422973633, + "eval_runtime": 29.4176, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 1174 + }, + { + "epoch": 4.7, + "grad_norm": 6.28125, + "learning_rate": 3e-06, + "loss": 0.2539, + "step": 1175 + }, + { + "epoch": 4.7, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.4724524915218353, + "eval_runtime": 29.5211, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.134, + "step": 1175 + }, + { + "epoch": 4.704, + "grad_norm": 14.25, + "learning_rate": 2.9600000000000005e-06, + "loss": 0.4258, + "step": 1176 + }, + { + "epoch": 4.704, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.5185, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 1176 + }, + { + "epoch": 4.708, + "grad_norm": 8.75, + "learning_rate": 2.92e-06, + "loss": 0.3809, + "step": 1177 + }, + { + "epoch": 4.708, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47275057435035706, + "eval_runtime": 29.5325, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.133, + "step": 1177 + }, + { + "epoch": 4.712, + "grad_norm": 12.8125, + "learning_rate": 2.88e-06, + "loss": 0.5195, + "step": 1178 + }, + { + "epoch": 4.712, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.5756, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.13, + "step": 1178 + }, + { + "epoch": 4.716, + "grad_norm": 10.5625, + "learning_rate": 2.8400000000000003e-06, + "loss": 0.3613, + "step": 1179 + }, + { + "epoch": 4.716, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47273486852645874, + "eval_runtime": 29.5256, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 1179 + }, + { + "epoch": 4.72, + "grad_norm": 25.0, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.1016, + "step": 1180 + }, + { + "epoch": 4.72, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724211096763611, + "eval_runtime": 29.5361, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 1180 + }, + { + "epoch": 4.724, + "grad_norm": 8.0, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.1963, + "step": 1181 + }, + { + "epoch": 4.724, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47254660725593567, + "eval_runtime": 29.379, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.144, + "step": 1181 + }, + { + "epoch": 4.728, + "grad_norm": 12.375, + "learning_rate": 2.72e-06, + "loss": 0.5664, + "step": 1182 + }, + { + "epoch": 4.728, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47295451164245605, + "eval_runtime": 29.3721, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.145, + "step": 1182 + }, + { + "epoch": 4.732, + "grad_norm": 4.3125, + "learning_rate": 2.68e-06, + "loss": 0.1377, + "step": 1183 + }, + { + "epoch": 4.732, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4725936949253082, + "eval_runtime": 29.2729, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.152, + "step": 1183 + }, + { + "epoch": 4.736, + "grad_norm": 5.875, + "learning_rate": 2.64e-06, + "loss": 0.2012, + "step": 1184 + }, + { + "epoch": 4.736, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4722956120967865, + "eval_runtime": 29.3183, + "eval_samples_per_second": 16.986, + "eval_steps_per_second": 2.149, + "step": 1184 + }, + { + "epoch": 4.74, + "grad_norm": 12.75, + "learning_rate": 2.6e-06, + "loss": 0.4238, + "step": 1185 + }, + { + "epoch": 4.74, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4726250469684601, + "eval_runtime": 29.4484, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.139, + "step": 1185 + }, + { + "epoch": 4.744, + "grad_norm": 40.5, + "learning_rate": 2.56e-06, + "loss": 0.9375, + "step": 1186 + }, + { + "epoch": 4.744, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4726407527923584, + "eval_runtime": 29.5007, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 1186 + }, + { + "epoch": 4.748, + "grad_norm": 12.6875, + "learning_rate": 2.52e-06, + "loss": 0.5938, + "step": 1187 + }, + { + "epoch": 4.748, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4726564288139343, + "eval_runtime": 29.5187, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 1187 + }, + { + "epoch": 4.752, + "grad_norm": 5.625, + "learning_rate": 2.48e-06, + "loss": 0.1221, + "step": 1188 + }, + { + "epoch": 4.752, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4724368155002594, + "eval_runtime": 29.5137, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 1188 + }, + { + "epoch": 4.756, + "grad_norm": 21.75, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.5039, + "step": 1189 + }, + { + "epoch": 4.756, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724524915218353, + "eval_runtime": 29.5141, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 1189 + }, + { + "epoch": 4.76, + "grad_norm": 6.59375, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.1992, + "step": 1190 + }, + { + "epoch": 4.76, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.473174124956131, + "eval_runtime": 29.4956, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.136, + "step": 1190 + }, + { + "epoch": 4.764, + "grad_norm": 27.75, + "learning_rate": 2.36e-06, + "loss": 0.6367, + "step": 1191 + }, + { + "epoch": 4.764, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4728603661060333, + "eval_runtime": 29.3944, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.143, + "step": 1191 + }, + { + "epoch": 4.768, + "grad_norm": 8.375, + "learning_rate": 2.32e-06, + "loss": 0.1387, + "step": 1192 + }, + { + "epoch": 4.768, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47220149636268616, + "eval_runtime": 29.3842, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.144, + "step": 1192 + }, + { + "epoch": 4.772, + "grad_norm": 3.515625, + "learning_rate": 2.28e-06, + "loss": 0.1211, + "step": 1193 + }, + { + "epoch": 4.772, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47234266996383667, + "eval_runtime": 29.3423, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 1193 + }, + { + "epoch": 4.776, + "grad_norm": 17.875, + "learning_rate": 2.24e-06, + "loss": 0.6719, + "step": 1194 + }, + { + "epoch": 4.776, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.472562313079834, + "eval_runtime": 29.2499, + "eval_samples_per_second": 17.026, + "eval_steps_per_second": 2.154, + "step": 1194 + }, + { + "epoch": 4.78, + "grad_norm": 26.25, + "learning_rate": 2.2e-06, + "loss": 0.8086, + "step": 1195 + }, + { + "epoch": 4.78, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4724524915218353, + "eval_runtime": 29.3422, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 1195 + }, + { + "epoch": 4.784, + "grad_norm": 19.625, + "learning_rate": 2.16e-06, + "loss": 0.5234, + "step": 1196 + }, + { + "epoch": 4.784, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4724211096763611, + "eval_runtime": 29.5014, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.135, + "step": 1196 + }, + { + "epoch": 4.788, + "grad_norm": 19.75, + "learning_rate": 2.12e-06, + "loss": 0.9219, + "step": 1197 + }, + { + "epoch": 4.788, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4720916748046875, + "eval_runtime": 29.5535, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.132, + "step": 1197 + }, + { + "epoch": 4.792, + "grad_norm": 16.25, + "learning_rate": 2.08e-06, + "loss": 0.5273, + "step": 1198 + }, + { + "epoch": 4.792, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4724368155002594, + "eval_runtime": 29.5183, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 1198 + }, + { + "epoch": 4.796, + "grad_norm": 7.34375, + "learning_rate": 2.0400000000000004e-06, + "loss": 0.1729, + "step": 1199 + }, + { + "epoch": 4.796, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.513, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 1199 + }, + { + "epoch": 4.8, + "grad_norm": 7.53125, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3359, + "step": 1200 + }, + { + "epoch": 4.8, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472562313079834, + "eval_runtime": 29.5105, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 1200 + }, + { + "epoch": 4.804, + "grad_norm": 5.5625, + "learning_rate": 1.96e-06, + "loss": 0.2373, + "step": 1201 + }, + { + "epoch": 4.804, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47246816754341125, + "eval_runtime": 29.4282, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 1201 + }, + { + "epoch": 4.808, + "grad_norm": 14.6875, + "learning_rate": 1.92e-06, + "loss": 0.6992, + "step": 1202 + }, + { + "epoch": 4.808, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47224855422973633, + "eval_runtime": 29.3346, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 1202 + }, + { + "epoch": 4.812, + "grad_norm": 18.375, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.4043, + "step": 1203 + }, + { + "epoch": 4.812, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47240543365478516, + "eval_runtime": 29.4549, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.139, + "step": 1203 + }, + { + "epoch": 4.816, + "grad_norm": 14.5, + "learning_rate": 1.84e-06, + "loss": 0.875, + "step": 1204 + }, + { + "epoch": 4.816, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4727819263935089, + "eval_runtime": 29.5493, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.132, + "step": 1204 + }, + { + "epoch": 4.82, + "grad_norm": 8.0625, + "learning_rate": 1.8e-06, + "loss": 0.3926, + "step": 1205 + }, + { + "epoch": 4.82, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472766250371933, + "eval_runtime": 29.5635, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.131, + "step": 1205 + }, + { + "epoch": 4.824, + "grad_norm": 7.96875, + "learning_rate": 1.76e-06, + "loss": 0.3359, + "step": 1206 + }, + { + "epoch": 4.824, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47268781065940857, + "eval_runtime": 29.5689, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 1206 + }, + { + "epoch": 4.828, + "grad_norm": 10.375, + "learning_rate": 1.72e-06, + "loss": 0.3359, + "step": 1207 + }, + { + "epoch": 4.828, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472766250371933, + "eval_runtime": 29.5685, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 1207 + }, + { + "epoch": 4.832, + "grad_norm": 5.21875, + "learning_rate": 1.68e-06, + "loss": 0.1914, + "step": 1208 + }, + { + "epoch": 4.832, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47212305665016174, + "eval_runtime": 29.503, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.135, + "step": 1208 + }, + { + "epoch": 4.836, + "grad_norm": 0.484375, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.0066, + "step": 1209 + }, + { + "epoch": 4.836, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47220149636268616, + "eval_runtime": 29.4619, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.138, + "step": 1209 + }, + { + "epoch": 4.84, + "grad_norm": 10.125, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.5, + "step": 1210 + }, + { + "epoch": 4.84, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4729074239730835, + "eval_runtime": 29.3287, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.148, + "step": 1210 + }, + { + "epoch": 4.844, + "grad_norm": 23.25, + "learning_rate": 1.56e-06, + "loss": 0.4551, + "step": 1211 + }, + { + "epoch": 4.844, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47240543365478516, + "eval_runtime": 29.3383, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 1211 + }, + { + "epoch": 4.848, + "grad_norm": 11.125, + "learning_rate": 1.52e-06, + "loss": 0.6172, + "step": 1212 + }, + { + "epoch": 4.848, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724211096763611, + "eval_runtime": 29.3803, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.144, + "step": 1212 + }, + { + "epoch": 4.852, + "grad_norm": 14.5, + "learning_rate": 1.4800000000000002e-06, + "loss": 0.5391, + "step": 1213 + }, + { + "epoch": 4.852, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4724524915218353, + "eval_runtime": 29.4521, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.139, + "step": 1213 + }, + { + "epoch": 4.856, + "grad_norm": 5.71875, + "learning_rate": 1.44e-06, + "loss": 0.1338, + "step": 1214 + }, + { + "epoch": 4.856, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47204461693763733, + "eval_runtime": 29.5038, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.135, + "step": 1214 + }, + { + "epoch": 4.86, + "grad_norm": 35.0, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4688, + "step": 1215 + }, + { + "epoch": 4.86, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4727034866809845, + "eval_runtime": 29.5191, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.134, + "step": 1215 + }, + { + "epoch": 4.864, + "grad_norm": 13.9375, + "learning_rate": 1.36e-06, + "loss": 0.6484, + "step": 1216 + }, + { + "epoch": 4.864, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.4723113179206848, + "eval_runtime": 29.5158, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.134, + "step": 1216 + }, + { + "epoch": 4.868, + "grad_norm": 7.78125, + "learning_rate": 1.32e-06, + "loss": 0.3262, + "step": 1217 + }, + { + "epoch": 4.868, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47246816754341125, + "eval_runtime": 29.5247, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 1217 + }, + { + "epoch": 4.872, + "grad_norm": 14.9375, + "learning_rate": 1.28e-06, + "loss": 0.5547, + "step": 1218 + }, + { + "epoch": 4.872, + "eval_accuracy": 0.927710843373494, + "eval_loss": 0.47212305665016174, + "eval_runtime": 29.5653, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.131, + "step": 1218 + }, + { + "epoch": 4.876, + "grad_norm": 15.9375, + "learning_rate": 1.24e-06, + "loss": 0.8281, + "step": 1219 + }, + { + "epoch": 4.876, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724368155002594, + "eval_runtime": 29.4273, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.141, + "step": 1219 + }, + { + "epoch": 4.88, + "grad_norm": 15.8125, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.5977, + "step": 1220 + }, + { + "epoch": 4.88, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47218579053878784, + "eval_runtime": 29.3433, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.147, + "step": 1220 + }, + { + "epoch": 4.884, + "grad_norm": 19.875, + "learning_rate": 1.16e-06, + "loss": 0.8672, + "step": 1221 + }, + { + "epoch": 4.884, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47254660725593567, + "eval_runtime": 29.2851, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.151, + "step": 1221 + }, + { + "epoch": 4.888, + "grad_norm": 19.5, + "learning_rate": 1.12e-06, + "loss": 0.4062, + "step": 1222 + }, + { + "epoch": 4.888, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47232699394226074, + "eval_runtime": 29.3854, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.144, + "step": 1222 + }, + { + "epoch": 4.892, + "grad_norm": 7.59375, + "learning_rate": 1.08e-06, + "loss": 0.3691, + "step": 1223 + }, + { + "epoch": 4.892, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47234266996383667, + "eval_runtime": 29.5255, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.134, + "step": 1223 + }, + { + "epoch": 4.896, + "grad_norm": 9.25, + "learning_rate": 1.04e-06, + "loss": 0.3438, + "step": 1224 + }, + { + "epoch": 4.896, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4722956120967865, + "eval_runtime": 29.5181, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.134, + "step": 1224 + }, + { + "epoch": 4.9, + "grad_norm": 14.1875, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.4551, + "step": 1225 + }, + { + "epoch": 4.9, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4726564288139343, + "eval_runtime": 29.5241, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.134, + "step": 1225 + }, + { + "epoch": 4.904, + "grad_norm": 10.375, + "learning_rate": 9.6e-07, + "loss": 0.2197, + "step": 1226 + }, + { + "epoch": 4.904, + "eval_accuracy": 0.9357429718875502, + "eval_loss": 0.47267213463783264, + "eval_runtime": 29.5382, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.133, + "step": 1226 + }, + { + "epoch": 4.908, + "grad_norm": 18.375, + "learning_rate": 9.2e-07, + "loss": 0.4082, + "step": 1227 + }, + { + "epoch": 4.908, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4721073508262634, + "eval_runtime": 29.5427, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 1227 + }, + { + "epoch": 4.912, + "grad_norm": 11.9375, + "learning_rate": 8.8e-07, + "loss": 0.3301, + "step": 1228 + }, + { + "epoch": 4.912, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47206029295921326, + "eval_runtime": 29.5668, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.131, + "step": 1228 + }, + { + "epoch": 4.916, + "grad_norm": 10.0625, + "learning_rate": 8.4e-07, + "loss": 0.5977, + "step": 1229 + }, + { + "epoch": 4.916, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.472562313079834, + "eval_runtime": 29.4291, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.141, + "step": 1229 + }, + { + "epoch": 4.92, + "grad_norm": 7.46875, + "learning_rate": 8.000000000000001e-07, + "loss": 0.3301, + "step": 1230 + }, + { + "epoch": 4.92, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47232699394226074, + "eval_runtime": 29.3517, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.146, + "step": 1230 + }, + { + "epoch": 4.924, + "grad_norm": 13.5, + "learning_rate": 7.6e-07, + "loss": 0.4727, + "step": 1231 + }, + { + "epoch": 4.924, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4725152552127838, + "eval_runtime": 29.3679, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.145, + "step": 1231 + }, + { + "epoch": 4.928, + "grad_norm": 5.4375, + "learning_rate": 7.2e-07, + "loss": 0.2021, + "step": 1232 + }, + { + "epoch": 4.928, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47246816754341125, + "eval_runtime": 29.3393, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.147, + "step": 1232 + }, + { + "epoch": 4.932, + "grad_norm": 18.875, + "learning_rate": 6.8e-07, + "loss": 0.6094, + "step": 1233 + }, + { + "epoch": 4.932, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47193479537963867, + "eval_runtime": 29.2663, + "eval_samples_per_second": 17.016, + "eval_steps_per_second": 2.153, + "step": 1233 + }, + { + "epoch": 4.936, + "grad_norm": 11.625, + "learning_rate": 6.4e-07, + "loss": 0.4141, + "step": 1234 + }, + { + "epoch": 4.936, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47254660725593567, + "eval_runtime": 29.2739, + "eval_samples_per_second": 17.012, + "eval_steps_per_second": 2.152, + "step": 1234 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 0.396484375, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0052, + "step": 1235 + }, + { + "epoch": 4.9399999999999995, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4724368155002594, + "eval_runtime": 29.4017, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.143, + "step": 1235 + }, + { + "epoch": 4.944, + "grad_norm": 14.625, + "learning_rate": 5.6e-07, + "loss": 0.6133, + "step": 1236 + }, + { + "epoch": 4.944, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.472358375787735, + "eval_runtime": 29.4731, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 1236 + }, + { + "epoch": 4.948, + "grad_norm": 18.75, + "learning_rate": 5.2e-07, + "loss": 0.3672, + "step": 1237 + }, + { + "epoch": 4.948, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4725152552127838, + "eval_runtime": 29.5011, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 1237 + }, + { + "epoch": 4.952, + "grad_norm": 8.9375, + "learning_rate": 4.8e-07, + "loss": 0.4297, + "step": 1238 + }, + { + "epoch": 4.952, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4721073508262634, + "eval_runtime": 29.5101, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.135, + "step": 1238 + }, + { + "epoch": 4.9559999999999995, + "grad_norm": 13.625, + "learning_rate": 4.4e-07, + "loss": 0.6758, + "step": 1239 + }, + { + "epoch": 4.9559999999999995, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.47238972783088684, + "eval_runtime": 29.559, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.131, + "step": 1239 + }, + { + "epoch": 4.96, + "grad_norm": 1.7109375, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0298, + "step": 1240 + }, + { + "epoch": 4.96, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.4727191925048828, + "eval_runtime": 29.5107, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.135, + "step": 1240 + }, + { + "epoch": 4.964, + "grad_norm": 13.25, + "learning_rate": 3.6e-07, + "loss": 0.5156, + "step": 1241 + }, + { + "epoch": 4.964, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.472358375787735, + "eval_runtime": 29.5137, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 1241 + }, + { + "epoch": 4.968, + "grad_norm": 16.875, + "learning_rate": 3.2e-07, + "loss": 0.9102, + "step": 1242 + }, + { + "epoch": 4.968, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47179362177848816, + "eval_runtime": 29.3776, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.144, + "step": 1242 + }, + { + "epoch": 4.9719999999999995, + "grad_norm": 7.375, + "learning_rate": 2.8e-07, + "loss": 0.3965, + "step": 1243 + }, + { + "epoch": 4.9719999999999995, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.47281330823898315, + "eval_runtime": 29.3173, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.149, + "step": 1243 + }, + { + "epoch": 4.976, + "grad_norm": 6.46875, + "learning_rate": 2.4e-07, + "loss": 0.2539, + "step": 1244 + }, + { + "epoch": 4.976, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.3436, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.147, + "step": 1244 + }, + { + "epoch": 4.98, + "grad_norm": 10.5625, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.6172, + "step": 1245 + }, + { + "epoch": 4.98, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4725779891014099, + "eval_runtime": 29.4609, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.138, + "step": 1245 + }, + { + "epoch": 4.984, + "grad_norm": 9.6875, + "learning_rate": 1.6e-07, + "loss": 0.5391, + "step": 1246 + }, + { + "epoch": 4.984, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4727191925048828, + "eval_runtime": 29.5513, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.132, + "step": 1246 + }, + { + "epoch": 4.9879999999999995, + "grad_norm": 11.1875, + "learning_rate": 1.2e-07, + "loss": 0.4512, + "step": 1247 + }, + { + "epoch": 4.9879999999999995, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4724995493888855, + "eval_runtime": 29.5625, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 1247 + }, + { + "epoch": 4.992, + "grad_norm": 8.4375, + "learning_rate": 8e-08, + "loss": 0.3027, + "step": 1248 + }, + { + "epoch": 4.992, + "eval_accuracy": 0.929718875502008, + "eval_loss": 0.4725936949253082, + "eval_runtime": 29.5626, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.131, + "step": 1248 + }, + { + "epoch": 4.996, + "grad_norm": 16.875, + "learning_rate": 4e-08, + "loss": 0.4941, + "step": 1249 + }, + { + "epoch": 4.996, + "eval_accuracy": 0.9337349397590361, + "eval_loss": 0.47232699394226074, + "eval_runtime": 29.4973, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.136, + "step": 1249 + }, + { + "epoch": 5.0, + "grad_norm": 12.75, + "learning_rate": 0.0, + "loss": 0.5195, + "step": 1250 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9317269076305221, + "eval_loss": 0.4724524915218353, + "eval_runtime": 29.5358, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.133, + "step": 1250 + } + ], + "logging_steps": 1, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}