diff --git "a/rm-harmless-shp/checkpoint-1200/trainer_state.json" "b/rm-harmless-shp/checkpoint-1200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/rm-harmless-shp/checkpoint-1200/trainer_state.json" @@ -0,0 +1,19221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.979253112033195, + "eval_steps": 1, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004149377593360996, + "grad_norm": 65.5, + "learning_rate": 4.9958506224066394e-05, + "loss": 0.8945, + "step": 1 + }, + { + "epoch": 0.004149377593360996, + "eval_accuracy": 0.5269709543568465, + "eval_loss": 0.6852956414222717, + "eval_runtime": 27.6916, + "eval_samples_per_second": 17.406, + "eval_steps_per_second": 2.203, + "step": 1 + }, + { + "epoch": 0.008298755186721992, + "grad_norm": 64.5, + "learning_rate": 4.9917012448132785e-05, + "loss": 0.9062, + "step": 2 + }, + { + "epoch": 0.008298755186721992, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6702865958213806, + "eval_runtime": 27.9441, + "eval_samples_per_second": 17.249, + "eval_steps_per_second": 2.183, + "step": 2 + }, + { + "epoch": 0.012448132780082987, + "grad_norm": 40.5, + "learning_rate": 4.987551867219917e-05, + "loss": 0.7578, + "step": 3 + }, + { + "epoch": 0.012448132780082987, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6633169054985046, + "eval_runtime": 28.138, + "eval_samples_per_second": 17.13, + "eval_steps_per_second": 2.168, + "step": 3 + }, + { + "epoch": 0.016597510373443983, + "grad_norm": 27.625, + "learning_rate": 4.983402489626556e-05, + "loss": 0.7227, + "step": 4 + }, + { + "epoch": 0.016597510373443983, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6697030663490295, + "eval_runtime": 28.3051, + "eval_samples_per_second": 17.029, + "eval_steps_per_second": 2.155, + "step": 4 + }, + { + "epoch": 0.02074688796680498, + "grad_norm": 32.25, + "learning_rate": 4.979253112033195e-05, + "loss": 0.6445, + "step": 5 + }, + { + "epoch": 0.02074688796680498, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6804493069648743, + "eval_runtime": 28.3624, + "eval_samples_per_second": 16.994, + "eval_steps_per_second": 2.151, + "step": 5 + }, + { + "epoch": 0.024896265560165973, + "grad_norm": 31.0, + "learning_rate": 4.9751037344398344e-05, + "loss": 0.8203, + "step": 6 + }, + { + "epoch": 0.024896265560165973, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.701893150806427, + "eval_runtime": 28.4785, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 6 + }, + { + "epoch": 0.029045643153526972, + "grad_norm": 19.25, + "learning_rate": 4.9709543568464736e-05, + "loss": 0.5469, + "step": 7 + }, + { + "epoch": 0.029045643153526972, + "eval_accuracy": 0.558091286307054, + "eval_loss": 0.7245364189147949, + "eval_runtime": 28.5034, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.14, + "step": 7 + }, + { + "epoch": 0.03319502074688797, + "grad_norm": 18.375, + "learning_rate": 4.966804979253112e-05, + "loss": 0.4512, + "step": 8 + }, + { + "epoch": 0.03319502074688797, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7468717694282532, + "eval_runtime": 28.4696, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.143, + "step": 8 + }, + { + "epoch": 0.03734439834024896, + "grad_norm": 19.875, + "learning_rate": 4.962655601659751e-05, + "loss": 0.6172, + "step": 9 + }, + { + "epoch": 0.03734439834024896, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7562564611434937, + "eval_runtime": 28.5367, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 9 + }, + { + "epoch": 0.04149377593360996, + "grad_norm": 41.75, + "learning_rate": 4.9585062240663904e-05, + "loss": 0.9883, + "step": 10 + }, + { + "epoch": 0.04149377593360996, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7626912593841553, + "eval_runtime": 28.5599, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 10 + }, + { + "epoch": 0.04564315352697095, + "grad_norm": 16.625, + "learning_rate": 4.9543568464730295e-05, + "loss": 0.707, + "step": 11 + }, + { + "epoch": 0.04564315352697095, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7904077768325806, + "eval_runtime": 28.5912, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.134, + "step": 11 + }, + { + "epoch": 0.04979253112033195, + "grad_norm": 11.125, + "learning_rate": 4.9502074688796687e-05, + "loss": 0.4102, + "step": 12 + }, + { + "epoch": 0.04979253112033195, + "eval_accuracy": 0.5228215767634855, + "eval_loss": 0.8400382399559021, + "eval_runtime": 28.5698, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 12 + }, + { + "epoch": 0.05394190871369295, + "grad_norm": 31.5, + "learning_rate": 4.946058091286308e-05, + "loss": 0.6133, + "step": 13 + }, + { + "epoch": 0.05394190871369295, + "eval_accuracy": 0.495850622406639, + "eval_loss": 0.8821236491203308, + "eval_runtime": 28.5758, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.135, + "step": 13 + }, + { + "epoch": 0.058091286307053944, + "grad_norm": 46.0, + "learning_rate": 4.941908713692946e-05, + "loss": 0.5234, + "step": 14 + }, + { + "epoch": 0.058091286307053944, + "eval_accuracy": 0.495850622406639, + "eval_loss": 0.9115825891494751, + "eval_runtime": 28.5973, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 14 + }, + { + "epoch": 0.06224066390041494, + "grad_norm": 77.5, + "learning_rate": 4.9377593360995854e-05, + "loss": 0.9844, + "step": 15 + }, + { + "epoch": 0.06224066390041494, + "eval_accuracy": 0.46265560165975106, + "eval_loss": 0.9630041122436523, + "eval_runtime": 28.6225, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 15 + }, + { + "epoch": 0.06639004149377593, + "grad_norm": 57.0, + "learning_rate": 4.9336099585062246e-05, + "loss": 2.1875, + "step": 16 + }, + { + "epoch": 0.06639004149377593, + "eval_accuracy": 0.475103734439834, + "eval_loss": 0.891257107257843, + "eval_runtime": 28.6286, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.131, + "step": 16 + }, + { + "epoch": 0.07053941908713693, + "grad_norm": 27.0, + "learning_rate": 4.929460580912864e-05, + "loss": 1.0312, + "step": 17 + }, + { + "epoch": 0.07053941908713693, + "eval_accuracy": 0.43775933609958506, + "eval_loss": 0.7866474390029907, + "eval_runtime": 28.6189, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 17 + }, + { + "epoch": 0.07468879668049792, + "grad_norm": 31.375, + "learning_rate": 4.925311203319503e-05, + "loss": 0.9219, + "step": 18 + }, + { + "epoch": 0.07468879668049792, + "eval_accuracy": 0.49377593360995853, + "eval_loss": 0.7316843867301941, + "eval_runtime": 28.649, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 18 + }, + { + "epoch": 0.07883817427385892, + "grad_norm": 21.25, + "learning_rate": 4.9211618257261413e-05, + "loss": 0.9336, + "step": 19 + }, + { + "epoch": 0.07883817427385892, + "eval_accuracy": 0.533195020746888, + "eval_loss": 0.7104188203811646, + "eval_runtime": 28.6523, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 19 + }, + { + "epoch": 0.08298755186721991, + "grad_norm": 9.8125, + "learning_rate": 4.91701244813278e-05, + "loss": 0.5352, + "step": 20 + }, + { + "epoch": 0.08298755186721991, + "eval_accuracy": 0.5414937759336099, + "eval_loss": 0.7062370181083679, + "eval_runtime": 28.6559, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.129, + "step": 20 + }, + { + "epoch": 0.08713692946058091, + "grad_norm": 11.625, + "learning_rate": 4.912863070539419e-05, + "loss": 0.5469, + "step": 21 + }, + { + "epoch": 0.08713692946058091, + "eval_accuracy": 0.5228215767634855, + "eval_loss": 0.7180044054985046, + "eval_runtime": 28.659, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 21 + }, + { + "epoch": 0.0912863070539419, + "grad_norm": 30.5, + "learning_rate": 4.908713692946058e-05, + "loss": 1.1406, + "step": 22 + }, + { + "epoch": 0.0912863070539419, + "eval_accuracy": 0.5560165975103735, + "eval_loss": 0.7288641333580017, + "eval_runtime": 28.6068, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 22 + }, + { + "epoch": 0.0954356846473029, + "grad_norm": 45.25, + "learning_rate": 4.904564315352697e-05, + "loss": 1.125, + "step": 23 + }, + { + "epoch": 0.0954356846473029, + "eval_accuracy": 0.5518672199170125, + "eval_loss": 0.7467582821846008, + "eval_runtime": 28.6494, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 23 + }, + { + "epoch": 0.0995850622406639, + "grad_norm": 34.5, + "learning_rate": 4.9004149377593364e-05, + "loss": 1.5078, + "step": 24 + }, + { + "epoch": 0.0995850622406639, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7434841990470886, + "eval_runtime": 28.6008, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 24 + }, + { + "epoch": 0.1037344398340249, + "grad_norm": 29.875, + "learning_rate": 4.896265560165975e-05, + "loss": 0.9961, + "step": 25 + }, + { + "epoch": 0.1037344398340249, + "eval_accuracy": 0.549792531120332, + "eval_loss": 0.7361903786659241, + "eval_runtime": 28.6497, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 25 + }, + { + "epoch": 0.1078838174273859, + "grad_norm": 18.0, + "learning_rate": 4.892116182572614e-05, + "loss": 0.6953, + "step": 26 + }, + { + "epoch": 0.1078838174273859, + "eval_accuracy": 0.5601659751037344, + "eval_loss": 0.7332404255867004, + "eval_runtime": 28.599, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 26 + }, + { + "epoch": 0.11203319502074689, + "grad_norm": 52.75, + "learning_rate": 4.887966804979253e-05, + "loss": 0.5625, + "step": 27 + }, + { + "epoch": 0.11203319502074689, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.7350314259529114, + "eval_runtime": 28.5997, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 27 + }, + { + "epoch": 0.11618257261410789, + "grad_norm": 14.625, + "learning_rate": 4.883817427385892e-05, + "loss": 0.7031, + "step": 28 + }, + { + "epoch": 0.11618257261410789, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.7414419054985046, + "eval_runtime": 28.6425, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.13, + "step": 28 + }, + { + "epoch": 0.12033195020746888, + "grad_norm": 14.875, + "learning_rate": 4.8796680497925315e-05, + "loss": 0.6055, + "step": 29 + }, + { + "epoch": 0.12033195020746888, + "eval_accuracy": 0.549792531120332, + "eval_loss": 0.7479333877563477, + "eval_runtime": 28.6149, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 29 + }, + { + "epoch": 0.12448132780082988, + "grad_norm": 149.0, + "learning_rate": 4.87551867219917e-05, + "loss": 0.9766, + "step": 30 + }, + { + "epoch": 0.12448132780082988, + "eval_accuracy": 0.5414937759336099, + "eval_loss": 0.7494245767593384, + "eval_runtime": 28.5979, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 30 + }, + { + "epoch": 0.12863070539419086, + "grad_norm": 11.125, + "learning_rate": 4.871369294605809e-05, + "loss": 0.6328, + "step": 31 + }, + { + "epoch": 0.12863070539419086, + "eval_accuracy": 0.5560165975103735, + "eval_loss": 0.7557864189147949, + "eval_runtime": 28.5977, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 31 + }, + { + "epoch": 0.13278008298755187, + "grad_norm": 9.0625, + "learning_rate": 4.867219917012448e-05, + "loss": 0.4902, + "step": 32 + }, + { + "epoch": 0.13278008298755187, + "eval_accuracy": 0.5518672199170125, + "eval_loss": 0.7639231085777283, + "eval_runtime": 28.5966, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 32 + }, + { + "epoch": 0.13692946058091288, + "grad_norm": 7.84375, + "learning_rate": 4.8630705394190874e-05, + "loss": 0.5625, + "step": 33 + }, + { + "epoch": 0.13692946058091288, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7694988250732422, + "eval_runtime": 28.6407, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 33 + }, + { + "epoch": 0.14107883817427386, + "grad_norm": 12.0625, + "learning_rate": 4.8589211618257265e-05, + "loss": 0.6328, + "step": 34 + }, + { + "epoch": 0.14107883817427386, + "eval_accuracy": 0.5435684647302904, + "eval_loss": 0.7758363485336304, + "eval_runtime": 28.6132, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 34 + }, + { + "epoch": 0.14522821576763487, + "grad_norm": 10.6875, + "learning_rate": 4.854771784232366e-05, + "loss": 0.7344, + "step": 35 + }, + { + "epoch": 0.14522821576763487, + "eval_accuracy": 0.553941908713693, + "eval_loss": 0.7782676219940186, + "eval_runtime": 28.5972, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 35 + }, + { + "epoch": 0.14937759336099585, + "grad_norm": 21.75, + "learning_rate": 4.850622406639004e-05, + "loss": 1.0156, + "step": 36 + }, + { + "epoch": 0.14937759336099585, + "eval_accuracy": 0.549792531120332, + "eval_loss": 0.771273672580719, + "eval_runtime": 28.6075, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 36 + }, + { + "epoch": 0.15352697095435686, + "grad_norm": 11.625, + "learning_rate": 4.846473029045643e-05, + "loss": 0.4785, + "step": 37 + }, + { + "epoch": 0.15352697095435686, + "eval_accuracy": 0.553941908713693, + "eval_loss": 0.7702849507331848, + "eval_runtime": 28.6417, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 37 + }, + { + "epoch": 0.15767634854771784, + "grad_norm": 23.0, + "learning_rate": 4.8423236514522824e-05, + "loss": 0.4961, + "step": 38 + }, + { + "epoch": 0.15767634854771784, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7809258103370667, + "eval_runtime": 28.6245, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 38 + }, + { + "epoch": 0.16182572614107885, + "grad_norm": 20.5, + "learning_rate": 4.8381742738589216e-05, + "loss": 1.1797, + "step": 39 + }, + { + "epoch": 0.16182572614107885, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7755932211875916, + "eval_runtime": 28.567, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 39 + }, + { + "epoch": 0.16597510373443983, + "grad_norm": 19.5, + "learning_rate": 4.834024896265561e-05, + "loss": 0.5156, + "step": 40 + }, + { + "epoch": 0.16597510373443983, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7631288766860962, + "eval_runtime": 28.5576, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 40 + }, + { + "epoch": 0.17012448132780084, + "grad_norm": 25.875, + "learning_rate": 4.829875518672199e-05, + "loss": 0.3477, + "step": 41 + }, + { + "epoch": 0.17012448132780084, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7569453716278076, + "eval_runtime": 28.5666, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 41 + }, + { + "epoch": 0.17427385892116182, + "grad_norm": 17.5, + "learning_rate": 4.8257261410788384e-05, + "loss": 0.9062, + "step": 42 + }, + { + "epoch": 0.17427385892116182, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7416607141494751, + "eval_runtime": 28.5629, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 42 + }, + { + "epoch": 0.17842323651452283, + "grad_norm": 13.3125, + "learning_rate": 4.8215767634854775e-05, + "loss": 0.6172, + "step": 43 + }, + { + "epoch": 0.17842323651452283, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7319599390029907, + "eval_runtime": 28.6093, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 43 + }, + { + "epoch": 0.1825726141078838, + "grad_norm": 17.5, + "learning_rate": 4.8174273858921166e-05, + "loss": 0.4238, + "step": 44 + }, + { + "epoch": 0.1825726141078838, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7264814376831055, + "eval_runtime": 28.5688, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 44 + }, + { + "epoch": 0.18672199170124482, + "grad_norm": 12.1875, + "learning_rate": 4.813278008298756e-05, + "loss": 0.6914, + "step": 45 + }, + { + "epoch": 0.18672199170124482, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7224131226539612, + "eval_runtime": 28.6055, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 45 + }, + { + "epoch": 0.1908713692946058, + "grad_norm": 12.0, + "learning_rate": 4.809128630705394e-05, + "loss": 0.6328, + "step": 46 + }, + { + "epoch": 0.1908713692946058, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.71815025806427, + "eval_runtime": 28.5617, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.136, + "step": 46 + }, + { + "epoch": 0.1950207468879668, + "grad_norm": 16.75, + "learning_rate": 4.8049792531120334e-05, + "loss": 0.5938, + "step": 47 + }, + { + "epoch": 0.1950207468879668, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7160593867301941, + "eval_runtime": 28.6129, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 47 + }, + { + "epoch": 0.1991701244813278, + "grad_norm": 49.5, + "learning_rate": 4.8008298755186726e-05, + "loss": 0.6914, + "step": 48 + }, + { + "epoch": 0.1991701244813278, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7155893445014954, + "eval_runtime": 28.6088, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 48 + }, + { + "epoch": 0.2033195020746888, + "grad_norm": 40.5, + "learning_rate": 4.796680497925312e-05, + "loss": 1.0078, + "step": 49 + }, + { + "epoch": 0.2033195020746888, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7102405428886414, + "eval_runtime": 28.5744, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.135, + "step": 49 + }, + { + "epoch": 0.2074688796680498, + "grad_norm": 56.75, + "learning_rate": 4.792531120331951e-05, + "loss": 1.1719, + "step": 50 + }, + { + "epoch": 0.2074688796680498, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7016013860702515, + "eval_runtime": 28.6155, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 50 + }, + { + "epoch": 0.21161825726141079, + "grad_norm": 141.0, + "learning_rate": 4.788381742738589e-05, + "loss": 0.918, + "step": 51 + }, + { + "epoch": 0.21161825726141079, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6948586702346802, + "eval_runtime": 28.4783, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 51 + }, + { + "epoch": 0.2157676348547718, + "grad_norm": 30.375, + "learning_rate": 4.7842323651452285e-05, + "loss": 0.582, + "step": 52 + }, + { + "epoch": 0.2157676348547718, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6955556273460388, + "eval_runtime": 28.4344, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.145, + "step": 52 + }, + { + "epoch": 0.21991701244813278, + "grad_norm": 8.5, + "learning_rate": 4.7800829875518676e-05, + "loss": 0.3555, + "step": 53 + }, + { + "epoch": 0.21991701244813278, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6979706883430481, + "eval_runtime": 28.536, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 53 + }, + { + "epoch": 0.22406639004149378, + "grad_norm": 14.3125, + "learning_rate": 4.775933609958507e-05, + "loss": 1.0234, + "step": 54 + }, + { + "epoch": 0.22406639004149378, + "eval_accuracy": 0.549792531120332, + "eval_loss": 0.7010664939880371, + "eval_runtime": 28.5648, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 54 + }, + { + "epoch": 0.22821576763485477, + "grad_norm": 30.125, + "learning_rate": 4.771784232365146e-05, + "loss": 0.6445, + "step": 55 + }, + { + "epoch": 0.22821576763485477, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6944048404693604, + "eval_runtime": 28.622, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 55 + }, + { + "epoch": 0.23236514522821577, + "grad_norm": 10.5625, + "learning_rate": 4.767634854771785e-05, + "loss": 0.5547, + "step": 56 + }, + { + "epoch": 0.23236514522821577, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6888777017593384, + "eval_runtime": 28.6344, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 56 + }, + { + "epoch": 0.23651452282157676, + "grad_norm": 9.4375, + "learning_rate": 4.763485477178423e-05, + "loss": 0.6875, + "step": 57 + }, + { + "epoch": 0.23651452282157676, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6887885928153992, + "eval_runtime": 28.5782, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 57 + }, + { + "epoch": 0.24066390041493776, + "grad_norm": 9.0, + "learning_rate": 4.759336099585062e-05, + "loss": 0.6406, + "step": 58 + }, + { + "epoch": 0.24066390041493776, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6901581883430481, + "eval_runtime": 28.5838, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 58 + }, + { + "epoch": 0.24481327800829875, + "grad_norm": 7.71875, + "learning_rate": 4.755186721991701e-05, + "loss": 0.7227, + "step": 59 + }, + { + "epoch": 0.24481327800829875, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6880267858505249, + "eval_runtime": 28.5877, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 59 + }, + { + "epoch": 0.24896265560165975, + "grad_norm": 9.75, + "learning_rate": 4.75103734439834e-05, + "loss": 0.6016, + "step": 60 + }, + { + "epoch": 0.24896265560165975, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6852632164955139, + "eval_runtime": 28.6319, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 60 + }, + { + "epoch": 0.25311203319502074, + "grad_norm": 12.25, + "learning_rate": 4.7468879668049795e-05, + "loss": 0.7773, + "step": 61 + }, + { + "epoch": 0.25311203319502074, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6845662593841553, + "eval_runtime": 28.5701, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 61 + }, + { + "epoch": 0.2572614107883817, + "grad_norm": 42.75, + "learning_rate": 4.7427385892116186e-05, + "loss": 0.5273, + "step": 62 + }, + { + "epoch": 0.2572614107883817, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6866490244865417, + "eval_runtime": 28.5684, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 62 + }, + { + "epoch": 0.26141078838174275, + "grad_norm": 10.3125, + "learning_rate": 4.738589211618257e-05, + "loss": 0.5742, + "step": 63 + }, + { + "epoch": 0.26141078838174275, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.688010573387146, + "eval_runtime": 28.6082, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 63 + }, + { + "epoch": 0.26556016597510373, + "grad_norm": 9.25, + "learning_rate": 4.734439834024896e-05, + "loss": 0.7266, + "step": 64 + }, + { + "epoch": 0.26556016597510373, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6893153786659241, + "eval_runtime": 28.5688, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 64 + }, + { + "epoch": 0.2697095435684647, + "grad_norm": 9.625, + "learning_rate": 4.7302904564315354e-05, + "loss": 0.5469, + "step": 65 + }, + { + "epoch": 0.2697095435684647, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6908470392227173, + "eval_runtime": 28.5636, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 65 + }, + { + "epoch": 0.27385892116182575, + "grad_norm": 12.125, + "learning_rate": 4.7261410788381745e-05, + "loss": 0.4297, + "step": 66 + }, + { + "epoch": 0.27385892116182575, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6945669054985046, + "eval_runtime": 28.5519, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 66 + }, + { + "epoch": 0.27800829875518673, + "grad_norm": 12.1875, + "learning_rate": 4.721991701244814e-05, + "loss": 0.6328, + "step": 67 + }, + { + "epoch": 0.27800829875518673, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7022173404693604, + "eval_runtime": 28.5519, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 67 + }, + { + "epoch": 0.2821576763485477, + "grad_norm": 43.0, + "learning_rate": 4.717842323651452e-05, + "loss": 0.5234, + "step": 68 + }, + { + "epoch": 0.2821576763485477, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7074364423751831, + "eval_runtime": 28.5457, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 68 + }, + { + "epoch": 0.2863070539419087, + "grad_norm": 10.8125, + "learning_rate": 4.713692946058091e-05, + "loss": 0.8711, + "step": 69 + }, + { + "epoch": 0.2863070539419087, + "eval_accuracy": 0.5601659751037344, + "eval_loss": 0.7042919993400574, + "eval_runtime": 28.5539, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 69 + }, + { + "epoch": 0.29045643153526973, + "grad_norm": 7.9375, + "learning_rate": 4.7095435684647304e-05, + "loss": 0.6836, + "step": 70 + }, + { + "epoch": 0.29045643153526973, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.7033681273460388, + "eval_runtime": 28.571, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 70 + }, + { + "epoch": 0.2946058091286307, + "grad_norm": 10.6875, + "learning_rate": 4.7053941908713696e-05, + "loss": 0.5039, + "step": 71 + }, + { + "epoch": 0.2946058091286307, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.7064801454544067, + "eval_runtime": 28.6125, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 71 + }, + { + "epoch": 0.2987551867219917, + "grad_norm": 12.25, + "learning_rate": 4.701244813278009e-05, + "loss": 0.8438, + "step": 72 + }, + { + "epoch": 0.2987551867219917, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7030277252197266, + "eval_runtime": 28.5838, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 72 + }, + { + "epoch": 0.3029045643153527, + "grad_norm": 15.5, + "learning_rate": 4.697095435684647e-05, + "loss": 0.7266, + "step": 73 + }, + { + "epoch": 0.3029045643153527, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6989594101905823, + "eval_runtime": 28.626, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 73 + }, + { + "epoch": 0.3070539419087137, + "grad_norm": 9.4375, + "learning_rate": 4.6929460580912863e-05, + "loss": 0.4961, + "step": 74 + }, + { + "epoch": 0.3070539419087137, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.6987324953079224, + "eval_runtime": 28.5886, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 74 + }, + { + "epoch": 0.3112033195020747, + "grad_norm": 7.75, + "learning_rate": 4.6887966804979255e-05, + "loss": 0.5781, + "step": 75 + }, + { + "epoch": 0.3112033195020747, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6988297700881958, + "eval_runtime": 28.5783, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 75 + }, + { + "epoch": 0.3153526970954357, + "grad_norm": 17.125, + "learning_rate": 4.6846473029045646e-05, + "loss": 0.7656, + "step": 76 + }, + { + "epoch": 0.3153526970954357, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.6988297700881958, + "eval_runtime": 28.6278, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 76 + }, + { + "epoch": 0.31950207468879666, + "grad_norm": 62.0, + "learning_rate": 4.680497925311204e-05, + "loss": 0.9258, + "step": 77 + }, + { + "epoch": 0.31950207468879666, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6969819664955139, + "eval_runtime": 28.5723, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.135, + "step": 77 + }, + { + "epoch": 0.3236514522821577, + "grad_norm": 21.25, + "learning_rate": 4.676348547717842e-05, + "loss": 0.7656, + "step": 78 + }, + { + "epoch": 0.3236514522821577, + "eval_accuracy": 0.5560165975103735, + "eval_loss": 0.6944696307182312, + "eval_runtime": 28.6016, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 78 + }, + { + "epoch": 0.3278008298755187, + "grad_norm": 28.125, + "learning_rate": 4.6721991701244814e-05, + "loss": 1.2266, + "step": 79 + }, + { + "epoch": 0.3278008298755187, + "eval_accuracy": 0.5373443983402489, + "eval_loss": 0.6880186796188354, + "eval_runtime": 28.5861, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.134, + "step": 79 + }, + { + "epoch": 0.33195020746887965, + "grad_norm": 10.375, + "learning_rate": 4.6680497925311206e-05, + "loss": 0.3125, + "step": 80 + }, + { + "epoch": 0.33195020746887965, + "eval_accuracy": 0.5352697095435685, + "eval_loss": 0.686867892742157, + "eval_runtime": 28.6108, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.132, + "step": 80 + }, + { + "epoch": 0.3360995850622407, + "grad_norm": 8.375, + "learning_rate": 4.66390041493776e-05, + "loss": 0.7891, + "step": 81 + }, + { + "epoch": 0.3360995850622407, + "eval_accuracy": 0.5414937759336099, + "eval_loss": 0.6846473217010498, + "eval_runtime": 28.5767, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.135, + "step": 81 + }, + { + "epoch": 0.34024896265560167, + "grad_norm": 11.6875, + "learning_rate": 4.659751037344399e-05, + "loss": 0.7422, + "step": 82 + }, + { + "epoch": 0.34024896265560167, + "eval_accuracy": 0.5352697095435685, + "eval_loss": 0.6813731789588928, + "eval_runtime": 28.5717, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 82 + }, + { + "epoch": 0.34439834024896265, + "grad_norm": 7.59375, + "learning_rate": 4.655601659751038e-05, + "loss": 0.5391, + "step": 83 + }, + { + "epoch": 0.34439834024896265, + "eval_accuracy": 0.533195020746888, + "eval_loss": 0.681729793548584, + "eval_runtime": 28.5604, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 83 + }, + { + "epoch": 0.34854771784232363, + "grad_norm": 6.03125, + "learning_rate": 4.6514522821576765e-05, + "loss": 0.6484, + "step": 84 + }, + { + "epoch": 0.34854771784232363, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.6817622184753418, + "eval_runtime": 28.618, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 84 + }, + { + "epoch": 0.35269709543568467, + "grad_norm": 6.4375, + "learning_rate": 4.6473029045643156e-05, + "loss": 0.5859, + "step": 85 + }, + { + "epoch": 0.35269709543568467, + "eval_accuracy": 0.5560165975103735, + "eval_loss": 0.6837396025657654, + "eval_runtime": 28.6226, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 85 + }, + { + "epoch": 0.35684647302904565, + "grad_norm": 29.5, + "learning_rate": 4.643153526970955e-05, + "loss": 0.7773, + "step": 86 + }, + { + "epoch": 0.35684647302904565, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6824753880500793, + "eval_runtime": 28.5741, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.135, + "step": 86 + }, + { + "epoch": 0.36099585062240663, + "grad_norm": 13.3125, + "learning_rate": 4.639004149377594e-05, + "loss": 0.6523, + "step": 87 + }, + { + "epoch": 0.36099585062240663, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6815676689147949, + "eval_runtime": 28.6078, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 87 + }, + { + "epoch": 0.3651452282157676, + "grad_norm": 20.5, + "learning_rate": 4.634854771784233e-05, + "loss": 0.9531, + "step": 88 + }, + { + "epoch": 0.3651452282157676, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6777100563049316, + "eval_runtime": 28.583, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 88 + }, + { + "epoch": 0.36929460580912865, + "grad_norm": 16.25, + "learning_rate": 4.6307053941908715e-05, + "loss": 0.8438, + "step": 89 + }, + { + "epoch": 0.36929460580912865, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.675359845161438, + "eval_runtime": 28.5884, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 89 + }, + { + "epoch": 0.37344398340248963, + "grad_norm": 11.375, + "learning_rate": 4.626556016597511e-05, + "loss": 0.6797, + "step": 90 + }, + { + "epoch": 0.37344398340248963, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6747276782989502, + "eval_runtime": 28.5951, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 90 + }, + { + "epoch": 0.3775933609958506, + "grad_norm": 16.25, + "learning_rate": 4.62240663900415e-05, + "loss": 0.668, + "step": 91 + }, + { + "epoch": 0.3775933609958506, + "eval_accuracy": 0.6182572614107884, + "eval_loss": 0.6710807681083679, + "eval_runtime": 28.6366, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.13, + "step": 91 + }, + { + "epoch": 0.3817427385892116, + "grad_norm": 8.125, + "learning_rate": 4.618257261410789e-05, + "loss": 0.6562, + "step": 92 + }, + { + "epoch": 0.3817427385892116, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6753436326980591, + "eval_runtime": 28.6341, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 92 + }, + { + "epoch": 0.38589211618257263, + "grad_norm": 7.15625, + "learning_rate": 4.614107883817428e-05, + "loss": 0.6719, + "step": 93 + }, + { + "epoch": 0.38589211618257263, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6758298873901367, + "eval_runtime": 28.6273, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 93 + }, + { + "epoch": 0.3900414937759336, + "grad_norm": 8.8125, + "learning_rate": 4.6099585062240666e-05, + "loss": 0.6133, + "step": 94 + }, + { + "epoch": 0.3900414937759336, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.6732851266860962, + "eval_runtime": 28.5697, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 94 + }, + { + "epoch": 0.3941908713692946, + "grad_norm": 10.4375, + "learning_rate": 4.605809128630705e-05, + "loss": 0.5547, + "step": 95 + }, + { + "epoch": 0.3941908713692946, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6729123592376709, + "eval_runtime": 28.6243, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 95 + }, + { + "epoch": 0.3983402489626556, + "grad_norm": 7.6875, + "learning_rate": 4.601659751037344e-05, + "loss": 0.6836, + "step": 96 + }, + { + "epoch": 0.3983402489626556, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6729123592376709, + "eval_runtime": 28.5786, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 96 + }, + { + "epoch": 0.4024896265560166, + "grad_norm": 9.5, + "learning_rate": 4.5975103734439834e-05, + "loss": 0.6719, + "step": 97 + }, + { + "epoch": 0.4024896265560166, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6717129349708557, + "eval_runtime": 28.5763, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.135, + "step": 97 + }, + { + "epoch": 0.4066390041493776, + "grad_norm": 6.34375, + "learning_rate": 4.5933609958506225e-05, + "loss": 0.7578, + "step": 98 + }, + { + "epoch": 0.4066390041493776, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6716805100440979, + "eval_runtime": 28.6219, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 98 + }, + { + "epoch": 0.4107883817427386, + "grad_norm": 15.1875, + "learning_rate": 4.5892116182572617e-05, + "loss": 0.7578, + "step": 99 + }, + { + "epoch": 0.4107883817427386, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6729447841644287, + "eval_runtime": 28.5724, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.135, + "step": 99 + }, + { + "epoch": 0.4149377593360996, + "grad_norm": 9.0, + "learning_rate": 4.5850622406639e-05, + "loss": 0.8398, + "step": 100 + }, + { + "epoch": 0.4149377593360996, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.674127995967865, + "eval_runtime": 28.5716, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 100 + }, + { + "epoch": 0.4190871369294606, + "grad_norm": 10.0625, + "learning_rate": 4.580912863070539e-05, + "loss": 0.6758, + "step": 101 + }, + { + "epoch": 0.4190871369294606, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6749464869499207, + "eval_runtime": 28.5032, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.14, + "step": 101 + }, + { + "epoch": 0.42323651452282157, + "grad_norm": 12.1875, + "learning_rate": 4.5767634854771784e-05, + "loss": 0.6133, + "step": 102 + }, + { + "epoch": 0.42323651452282157, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6771751642227173, + "eval_runtime": 28.338, + "eval_samples_per_second": 17.009, + "eval_steps_per_second": 2.153, + "step": 102 + }, + { + "epoch": 0.42738589211618255, + "grad_norm": 6.25, + "learning_rate": 4.5726141078838176e-05, + "loss": 0.4453, + "step": 103 + }, + { + "epoch": 0.42738589211618255, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6796064376831055, + "eval_runtime": 28.4811, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 103 + }, + { + "epoch": 0.4315352697095436, + "grad_norm": 16.5, + "learning_rate": 4.568464730290457e-05, + "loss": 0.8477, + "step": 104 + }, + { + "epoch": 0.4315352697095436, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.689963698387146, + "eval_runtime": 28.5378, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 104 + }, + { + "epoch": 0.43568464730290457, + "grad_norm": 10.3125, + "learning_rate": 4.564315352697096e-05, + "loss": 0.4844, + "step": 105 + }, + { + "epoch": 0.43568464730290457, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6996158361434937, + "eval_runtime": 28.607, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 105 + }, + { + "epoch": 0.43983402489626555, + "grad_norm": 16.875, + "learning_rate": 4.560165975103734e-05, + "loss": 0.6211, + "step": 106 + }, + { + "epoch": 0.43983402489626555, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7201195955276489, + "eval_runtime": 28.5601, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 106 + }, + { + "epoch": 0.44398340248962653, + "grad_norm": 8.25, + "learning_rate": 4.5560165975103735e-05, + "loss": 0.7969, + "step": 107 + }, + { + "epoch": 0.44398340248962653, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7468312382698059, + "eval_runtime": 28.5485, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 107 + }, + { + "epoch": 0.44813278008298757, + "grad_norm": 8.6875, + "learning_rate": 4.5518672199170126e-05, + "loss": 0.5391, + "step": 108 + }, + { + "epoch": 0.44813278008298757, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.7782027721405029, + "eval_runtime": 28.6091, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 108 + }, + { + "epoch": 0.45228215767634855, + "grad_norm": 25.75, + "learning_rate": 4.547717842323652e-05, + "loss": 1.1641, + "step": 109 + }, + { + "epoch": 0.45228215767634855, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7880251407623291, + "eval_runtime": 28.624, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 109 + }, + { + "epoch": 0.45643153526970953, + "grad_norm": 6.75, + "learning_rate": 4.543568464730291e-05, + "loss": 0.625, + "step": 110 + }, + { + "epoch": 0.45643153526970953, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7957161068916321, + "eval_runtime": 28.5783, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.134, + "step": 110 + }, + { + "epoch": 0.4605809128630705, + "grad_norm": 21.75, + "learning_rate": 4.5394190871369294e-05, + "loss": 0.543, + "step": 111 + }, + { + "epoch": 0.4605809128630705, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.8047361373901367, + "eval_runtime": 28.5702, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 111 + }, + { + "epoch": 0.46473029045643155, + "grad_norm": 16.5, + "learning_rate": 4.5352697095435685e-05, + "loss": 1.0469, + "step": 112 + }, + { + "epoch": 0.46473029045643155, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.8029612898826599, + "eval_runtime": 28.5622, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 112 + }, + { + "epoch": 0.46887966804979253, + "grad_norm": 26.25, + "learning_rate": 4.531120331950208e-05, + "loss": 0.6484, + "step": 113 + }, + { + "epoch": 0.46887966804979253, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.8136913180351257, + "eval_runtime": 28.563, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 113 + }, + { + "epoch": 0.4730290456431535, + "grad_norm": 8.125, + "learning_rate": 4.526970954356847e-05, + "loss": 0.5234, + "step": 114 + }, + { + "epoch": 0.4730290456431535, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.8162198662757874, + "eval_runtime": 28.5586, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 114 + }, + { + "epoch": 0.47717842323651455, + "grad_norm": 10.9375, + "learning_rate": 4.522821576763486e-05, + "loss": 0.1934, + "step": 115 + }, + { + "epoch": 0.47717842323651455, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.82286536693573, + "eval_runtime": 28.5549, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 115 + }, + { + "epoch": 0.48132780082987553, + "grad_norm": 8.875, + "learning_rate": 4.5186721991701245e-05, + "loss": 0.4863, + "step": 116 + }, + { + "epoch": 0.48132780082987553, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.8257180452346802, + "eval_runtime": 28.5676, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 116 + }, + { + "epoch": 0.4854771784232365, + "grad_norm": 13.0625, + "learning_rate": 4.5145228215767636e-05, + "loss": 0.7188, + "step": 117 + }, + { + "epoch": 0.4854771784232365, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.8404272794723511, + "eval_runtime": 28.6024, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 117 + }, + { + "epoch": 0.4896265560165975, + "grad_norm": 39.75, + "learning_rate": 4.510373443983403e-05, + "loss": 1.125, + "step": 118 + }, + { + "epoch": 0.4896265560165975, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.8527619242668152, + "eval_runtime": 28.537, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 118 + }, + { + "epoch": 0.49377593360995853, + "grad_norm": 12.25, + "learning_rate": 4.506224066390042e-05, + "loss": 0.9102, + "step": 119 + }, + { + "epoch": 0.49377593360995853, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.8534750938415527, + "eval_runtime": 28.5893, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.134, + "step": 119 + }, + { + "epoch": 0.4979253112033195, + "grad_norm": 5.84375, + "learning_rate": 4.502074688796681e-05, + "loss": 0.5898, + "step": 120 + }, + { + "epoch": 0.4979253112033195, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.8573651313781738, + "eval_runtime": 28.5872, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.134, + "step": 120 + }, + { + "epoch": 0.5020746887966805, + "grad_norm": 28.125, + "learning_rate": 4.4979253112033195e-05, + "loss": 1.7188, + "step": 121 + }, + { + "epoch": 0.5020746887966805, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.8463595509529114, + "eval_runtime": 28.5804, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 121 + }, + { + "epoch": 0.5062240663900415, + "grad_norm": 34.75, + "learning_rate": 4.493775933609959e-05, + "loss": 1.3984, + "step": 122 + }, + { + "epoch": 0.5062240663900415, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.8189510107040405, + "eval_runtime": 28.55, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.137, + "step": 122 + }, + { + "epoch": 0.5103734439834025, + "grad_norm": 32.25, + "learning_rate": 4.489626556016598e-05, + "loss": 1.1016, + "step": 123 + }, + { + "epoch": 0.5103734439834025, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.7776841521263123, + "eval_runtime": 28.5311, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.138, + "step": 123 + }, + { + "epoch": 0.5145228215767634, + "grad_norm": 21.75, + "learning_rate": 4.485477178423237e-05, + "loss": 1.3125, + "step": 124 + }, + { + "epoch": 0.5145228215767634, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7359958291053772, + "eval_runtime": 28.5718, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 124 + }, + { + "epoch": 0.5186721991701245, + "grad_norm": 4.9375, + "learning_rate": 4.481327800829876e-05, + "loss": 0.3125, + "step": 125 + }, + { + "epoch": 0.5186721991701245, + "eval_accuracy": 0.6161825726141079, + "eval_loss": 0.7127852439880371, + "eval_runtime": 28.5266, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 125 + }, + { + "epoch": 0.5228215767634855, + "grad_norm": 25.75, + "learning_rate": 4.477178423236515e-05, + "loss": 0.5508, + "step": 126 + }, + { + "epoch": 0.5228215767634855, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7030925750732422, + "eval_runtime": 28.5646, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.136, + "step": 126 + }, + { + "epoch": 0.5269709543568465, + "grad_norm": 16.625, + "learning_rate": 4.473029045643154e-05, + "loss": 0.3555, + "step": 127 + }, + { + "epoch": 0.5269709543568465, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6968361139297485, + "eval_runtime": 28.5247, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 127 + }, + { + "epoch": 0.5311203319502075, + "grad_norm": 5.03125, + "learning_rate": 4.468879668049793e-05, + "loss": 0.6016, + "step": 128 + }, + { + "epoch": 0.5311203319502075, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6918438673019409, + "eval_runtime": 28.5214, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 128 + }, + { + "epoch": 0.5352697095435685, + "grad_norm": 5.59375, + "learning_rate": 4.464730290456432e-05, + "loss": 0.7344, + "step": 129 + }, + { + "epoch": 0.5352697095435685, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6874189376831055, + "eval_runtime": 28.5249, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 129 + }, + { + "epoch": 0.5394190871369294, + "grad_norm": 18.75, + "learning_rate": 4.460580912863071e-05, + "loss": 0.9648, + "step": 130 + }, + { + "epoch": 0.5394190871369294, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6824429631233215, + "eval_runtime": 28.5349, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 130 + }, + { + "epoch": 0.5435684647302904, + "grad_norm": 6.0, + "learning_rate": 4.45643153526971e-05, + "loss": 0.5117, + "step": 131 + }, + { + "epoch": 0.5435684647302904, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6842096447944641, + "eval_runtime": 28.5282, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 131 + }, + { + "epoch": 0.5477178423236515, + "grad_norm": 7.21875, + "learning_rate": 4.452282157676349e-05, + "loss": 0.7227, + "step": 132 + }, + { + "epoch": 0.5477178423236515, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6857818961143494, + "eval_runtime": 28.5613, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.136, + "step": 132 + }, + { + "epoch": 0.5518672199170125, + "grad_norm": 5.9375, + "learning_rate": 4.448132780082987e-05, + "loss": 0.4043, + "step": 133 + }, + { + "epoch": 0.5518672199170125, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6863330006599426, + "eval_runtime": 28.5176, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.139, + "step": 133 + }, + { + "epoch": 0.5560165975103735, + "grad_norm": 5.125, + "learning_rate": 4.4439834024896264e-05, + "loss": 0.582, + "step": 134 + }, + { + "epoch": 0.5560165975103735, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.687921404838562, + "eval_runtime": 28.5197, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 134 + }, + { + "epoch": 0.5601659751037344, + "grad_norm": 8.4375, + "learning_rate": 4.4398340248962656e-05, + "loss": 0.7031, + "step": 135 + }, + { + "epoch": 0.5601659751037344, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.6914224624633789, + "eval_runtime": 28.5527, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 135 + }, + { + "epoch": 0.5643153526970954, + "grad_norm": 10.9375, + "learning_rate": 4.435684647302905e-05, + "loss": 0.8125, + "step": 136 + }, + { + "epoch": 0.5643153526970954, + "eval_accuracy": 0.6224066390041494, + "eval_loss": 0.6914548873901367, + "eval_runtime": 28.5381, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 136 + }, + { + "epoch": 0.5684647302904564, + "grad_norm": 7.03125, + "learning_rate": 4.431535269709544e-05, + "loss": 0.6211, + "step": 137 + }, + { + "epoch": 0.5684647302904564, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.6910982728004456, + "eval_runtime": 28.5401, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.137, + "step": 137 + }, + { + "epoch": 0.5726141078838174, + "grad_norm": 17.25, + "learning_rate": 4.427385892116182e-05, + "loss": 0.8594, + "step": 138 + }, + { + "epoch": 0.5726141078838174, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6911144852638245, + "eval_runtime": 28.486, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 138 + }, + { + "epoch": 0.5767634854771784, + "grad_norm": 26.875, + "learning_rate": 4.4232365145228215e-05, + "loss": 0.6016, + "step": 139 + }, + { + "epoch": 0.5767634854771784, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6915358901023865, + "eval_runtime": 28.5353, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 139 + }, + { + "epoch": 0.5809128630705395, + "grad_norm": 9.4375, + "learning_rate": 4.4190871369294606e-05, + "loss": 0.5625, + "step": 140 + }, + { + "epoch": 0.5809128630705395, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6924111843109131, + "eval_runtime": 28.4795, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 140 + }, + { + "epoch": 0.5850622406639004, + "grad_norm": 6.71875, + "learning_rate": 4.4149377593361e-05, + "loss": 0.6016, + "step": 141 + }, + { + "epoch": 0.5850622406639004, + "eval_accuracy": 0.558091286307054, + "eval_loss": 0.6933188438415527, + "eval_runtime": 28.534, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 141 + }, + { + "epoch": 0.5892116182572614, + "grad_norm": 4.15625, + "learning_rate": 4.410788381742739e-05, + "loss": 0.6484, + "step": 142 + }, + { + "epoch": 0.5892116182572614, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6930757164955139, + "eval_runtime": 28.5473, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 142 + }, + { + "epoch": 0.5933609958506224, + "grad_norm": 4.03125, + "learning_rate": 4.4066390041493774e-05, + "loss": 0.6719, + "step": 143 + }, + { + "epoch": 0.5933609958506224, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.6929785013198853, + "eval_runtime": 28.5164, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.139, + "step": 143 + }, + { + "epoch": 0.5975103734439834, + "grad_norm": 6.65625, + "learning_rate": 4.4024896265560165e-05, + "loss": 0.7695, + "step": 144 + }, + { + "epoch": 0.5975103734439834, + "eval_accuracy": 0.5560165975103735, + "eval_loss": 0.6928163766860962, + "eval_runtime": 28.5448, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 144 + }, + { + "epoch": 0.6016597510373444, + "grad_norm": 4.46875, + "learning_rate": 4.398340248962656e-05, + "loss": 0.7891, + "step": 145 + }, + { + "epoch": 0.6016597510373444, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6931243538856506, + "eval_runtime": 28.5312, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.138, + "step": 145 + }, + { + "epoch": 0.6058091286307054, + "grad_norm": 6.03125, + "learning_rate": 4.394190871369295e-05, + "loss": 0.6758, + "step": 146 + }, + { + "epoch": 0.6058091286307054, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6918438673019409, + "eval_runtime": 28.4857, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 146 + }, + { + "epoch": 0.6099585062240664, + "grad_norm": 6.65625, + "learning_rate": 4.390041493775934e-05, + "loss": 0.6445, + "step": 147 + }, + { + "epoch": 0.6099585062240664, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.69080650806427, + "eval_runtime": 28.4942, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.141, + "step": 147 + }, + { + "epoch": 0.6141078838174274, + "grad_norm": 27.0, + "learning_rate": 4.385892116182573e-05, + "loss": 0.6992, + "step": 148 + }, + { + "epoch": 0.6141078838174274, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6894450187683105, + "eval_runtime": 28.495, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 148 + }, + { + "epoch": 0.6182572614107884, + "grad_norm": 6.5625, + "learning_rate": 4.3817427385892116e-05, + "loss": 0.4082, + "step": 149 + }, + { + "epoch": 0.6182572614107884, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6904985904693604, + "eval_runtime": 28.5057, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.14, + "step": 149 + }, + { + "epoch": 0.6224066390041494, + "grad_norm": 5.96875, + "learning_rate": 4.377593360995851e-05, + "loss": 0.5781, + "step": 150 + }, + { + "epoch": 0.6224066390041494, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.6900609731674194, + "eval_runtime": 28.5591, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 150 + }, + { + "epoch": 0.6265560165975104, + "grad_norm": 32.25, + "learning_rate": 4.37344398340249e-05, + "loss": 0.3379, + "step": 151 + }, + { + "epoch": 0.6265560165975104, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6905472278594971, + "eval_runtime": 28.4825, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.142, + "step": 151 + }, + { + "epoch": 0.6307053941908713, + "grad_norm": 14.75, + "learning_rate": 4.369294605809129e-05, + "loss": 0.8242, + "step": 152 + }, + { + "epoch": 0.6307053941908713, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6895260810852051, + "eval_runtime": 28.2816, + "eval_samples_per_second": 17.043, + "eval_steps_per_second": 2.157, + "step": 152 + }, + { + "epoch": 0.6348547717842323, + "grad_norm": 10.3125, + "learning_rate": 4.365145228215768e-05, + "loss": 0.5195, + "step": 153 + }, + { + "epoch": 0.6348547717842323, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6892505288124084, + "eval_runtime": 28.4802, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 153 + }, + { + "epoch": 0.6390041493775933, + "grad_norm": 4.5, + "learning_rate": 4.3609958506224067e-05, + "loss": 0.5039, + "step": 154 + }, + { + "epoch": 0.6390041493775933, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6881158947944641, + "eval_runtime": 28.4913, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 154 + }, + { + "epoch": 0.6431535269709544, + "grad_norm": 4.5, + "learning_rate": 4.356846473029046e-05, + "loss": 0.5742, + "step": 155 + }, + { + "epoch": 0.6431535269709544, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6887642741203308, + "eval_runtime": 28.5463, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 155 + }, + { + "epoch": 0.6473029045643154, + "grad_norm": 5.28125, + "learning_rate": 4.352697095435685e-05, + "loss": 0.7422, + "step": 156 + }, + { + "epoch": 0.6473029045643154, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.688521146774292, + "eval_runtime": 28.554, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 156 + }, + { + "epoch": 0.6514522821576764, + "grad_norm": 7.4375, + "learning_rate": 4.348547717842324e-05, + "loss": 0.8555, + "step": 157 + }, + { + "epoch": 0.6514522821576764, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6885049343109131, + "eval_runtime": 28.5062, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.14, + "step": 157 + }, + { + "epoch": 0.6556016597510373, + "grad_norm": 8.375, + "learning_rate": 4.344398340248963e-05, + "loss": 0.7266, + "step": 158 + }, + { + "epoch": 0.6556016597510373, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6876944899559021, + "eval_runtime": 28.5138, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.139, + "step": 158 + }, + { + "epoch": 0.6597510373443983, + "grad_norm": 4.78125, + "learning_rate": 4.340248962655602e-05, + "loss": 0.8008, + "step": 159 + }, + { + "epoch": 0.6597510373443983, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6859115958213806, + "eval_runtime": 28.5123, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 159 + }, + { + "epoch": 0.6639004149377593, + "grad_norm": 7.8125, + "learning_rate": 4.336099585062241e-05, + "loss": 1.0234, + "step": 160 + }, + { + "epoch": 0.6639004149377593, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.6826698780059814, + "eval_runtime": 28.5667, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 160 + }, + { + "epoch": 0.6680497925311203, + "grad_norm": 6.15625, + "learning_rate": 4.33195020746888e-05, + "loss": 0.3867, + "step": 161 + }, + { + "epoch": 0.6680497925311203, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.680108904838562, + "eval_runtime": 28.5273, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 161 + }, + { + "epoch": 0.6721991701244814, + "grad_norm": 11.0625, + "learning_rate": 4.327800829875519e-05, + "loss": 0.8125, + "step": 162 + }, + { + "epoch": 0.6721991701244814, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6767051219940186, + "eval_runtime": 28.5357, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 162 + }, + { + "epoch": 0.6763485477178424, + "grad_norm": 4.46875, + "learning_rate": 4.323651452282158e-05, + "loss": 0.7656, + "step": 163 + }, + { + "epoch": 0.6763485477178424, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6739497184753418, + "eval_runtime": 28.5312, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.138, + "step": 163 + }, + { + "epoch": 0.6804979253112033, + "grad_norm": 9.0625, + "learning_rate": 4.319502074688797e-05, + "loss": 0.793, + "step": 164 + }, + { + "epoch": 0.6804979253112033, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6743224859237671, + "eval_runtime": 28.5332, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 164 + }, + { + "epoch": 0.6846473029045643, + "grad_norm": 6.625, + "learning_rate": 4.315352697095436e-05, + "loss": 0.4648, + "step": 165 + }, + { + "epoch": 0.6846473029045643, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6710969805717468, + "eval_runtime": 28.5287, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.138, + "step": 165 + }, + { + "epoch": 0.6887966804979253, + "grad_norm": 161.0, + "learning_rate": 4.311203319502075e-05, + "loss": 0.9609, + "step": 166 + }, + { + "epoch": 0.6887966804979253, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6685036420822144, + "eval_runtime": 28.5274, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 166 + }, + { + "epoch": 0.6929460580912863, + "grad_norm": 12.9375, + "learning_rate": 4.307053941908714e-05, + "loss": 0.6094, + "step": 167 + }, + { + "epoch": 0.6929460580912863, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6683091521263123, + "eval_runtime": 28.5208, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 167 + }, + { + "epoch": 0.6970954356846473, + "grad_norm": 7.84375, + "learning_rate": 4.3029045643153534e-05, + "loss": 0.7109, + "step": 168 + }, + { + "epoch": 0.6970954356846473, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6684712171554565, + "eval_runtime": 28.5533, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 168 + }, + { + "epoch": 0.7012448132780082, + "grad_norm": 10.1875, + "learning_rate": 4.298755186721992e-05, + "loss": 0.4883, + "step": 169 + }, + { + "epoch": 0.7012448132780082, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.669411301612854, + "eval_runtime": 28.5188, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 169 + }, + { + "epoch": 0.7053941908713693, + "grad_norm": 9.1875, + "learning_rate": 4.29460580912863e-05, + "loss": 0.4805, + "step": 170 + }, + { + "epoch": 0.7053941908713693, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6698975563049316, + "eval_runtime": 28.5574, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 170 + }, + { + "epoch": 0.7095435684647303, + "grad_norm": 16.625, + "learning_rate": 4.2904564315352695e-05, + "loss": 0.7148, + "step": 171 + }, + { + "epoch": 0.7095435684647303, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6731230616569519, + "eval_runtime": 28.5103, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.14, + "step": 171 + }, + { + "epoch": 0.7136929460580913, + "grad_norm": 8.5625, + "learning_rate": 4.2863070539419086e-05, + "loss": 0.6211, + "step": 172 + }, + { + "epoch": 0.7136929460580913, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.672685444355011, + "eval_runtime": 28.5044, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.14, + "step": 172 + }, + { + "epoch": 0.7178423236514523, + "grad_norm": 5.96875, + "learning_rate": 4.282157676348548e-05, + "loss": 0.7031, + "step": 173 + }, + { + "epoch": 0.7178423236514523, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6749060153961182, + "eval_runtime": 28.5058, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.14, + "step": 173 + }, + { + "epoch": 0.7219917012448133, + "grad_norm": 9.3125, + "learning_rate": 4.278008298755187e-05, + "loss": 0.6445, + "step": 174 + }, + { + "epoch": 0.7219917012448133, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.675149142742157, + "eval_runtime": 28.5552, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 174 + }, + { + "epoch": 0.7261410788381742, + "grad_norm": 4.59375, + "learning_rate": 4.273858921161826e-05, + "loss": 0.6016, + "step": 175 + }, + { + "epoch": 0.7261410788381742, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.673317551612854, + "eval_runtime": 28.5243, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.139, + "step": 175 + }, + { + "epoch": 0.7302904564315352, + "grad_norm": 17.75, + "learning_rate": 4.2697095435684645e-05, + "loss": 0.543, + "step": 176 + }, + { + "epoch": 0.7302904564315352, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6732364892959595, + "eval_runtime": 28.5125, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 176 + }, + { + "epoch": 0.7344398340248963, + "grad_norm": 3.59375, + "learning_rate": 4.265560165975104e-05, + "loss": 0.5117, + "step": 177 + }, + { + "epoch": 0.7344398340248963, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6679525375366211, + "eval_runtime": 28.5664, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 177 + }, + { + "epoch": 0.7385892116182573, + "grad_norm": 8.5, + "learning_rate": 4.261410788381743e-05, + "loss": 0.6484, + "step": 178 + }, + { + "epoch": 0.7385892116182573, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6668989658355713, + "eval_runtime": 28.5332, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 178 + }, + { + "epoch": 0.7427385892116183, + "grad_norm": 8.6875, + "learning_rate": 4.257261410788382e-05, + "loss": 0.7891, + "step": 179 + }, + { + "epoch": 0.7427385892116183, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6680498123168945, + "eval_runtime": 28.5728, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.135, + "step": 179 + }, + { + "epoch": 0.7468879668049793, + "grad_norm": 5.21875, + "learning_rate": 4.253112033195021e-05, + "loss": 0.7734, + "step": 180 + }, + { + "epoch": 0.7468879668049793, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6726043820381165, + "eval_runtime": 28.5381, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 180 + }, + { + "epoch": 0.7510373443983402, + "grad_norm": 5.28125, + "learning_rate": 4.2489626556016596e-05, + "loss": 0.75, + "step": 181 + }, + { + "epoch": 0.7510373443983402, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.676802396774292, + "eval_runtime": 28.51, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.14, + "step": 181 + }, + { + "epoch": 0.7551867219917012, + "grad_norm": 4.59375, + "learning_rate": 4.244813278008299e-05, + "loss": 0.5664, + "step": 182 + }, + { + "epoch": 0.7551867219917012, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.685247004032135, + "eval_runtime": 28.4659, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.143, + "step": 182 + }, + { + "epoch": 0.7593360995850622, + "grad_norm": 4.84375, + "learning_rate": 4.240663900414938e-05, + "loss": 0.4297, + "step": 183 + }, + { + "epoch": 0.7593360995850622, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6891856789588928, + "eval_runtime": 28.4277, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.146, + "step": 183 + }, + { + "epoch": 0.7634854771784232, + "grad_norm": 11.25, + "learning_rate": 4.236514522821577e-05, + "loss": 0.6602, + "step": 184 + }, + { + "epoch": 0.7634854771784232, + "eval_accuracy": 0.553941908713693, + "eval_loss": 0.6896881461143494, + "eval_runtime": 28.4061, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 184 + }, + { + "epoch": 0.7676348547717843, + "grad_norm": 6.78125, + "learning_rate": 4.232365145228216e-05, + "loss": 0.793, + "step": 185 + }, + { + "epoch": 0.7676348547717843, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6885049343109131, + "eval_runtime": 28.3835, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.149, + "step": 185 + }, + { + "epoch": 0.7717842323651453, + "grad_norm": 4.375, + "learning_rate": 4.2282157676348546e-05, + "loss": 0.5352, + "step": 186 + }, + { + "epoch": 0.7717842323651453, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6832533478736877, + "eval_runtime": 28.3731, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.15, + "step": 186 + }, + { + "epoch": 0.7759336099585062, + "grad_norm": 6.40625, + "learning_rate": 4.224066390041494e-05, + "loss": 0.6289, + "step": 187 + }, + { + "epoch": 0.7759336099585062, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6798819899559021, + "eval_runtime": 28.3705, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.15, + "step": 187 + }, + { + "epoch": 0.7800829875518672, + "grad_norm": 6.84375, + "learning_rate": 4.219917012448133e-05, + "loss": 0.6055, + "step": 188 + }, + { + "epoch": 0.7800829875518672, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.6770617365837097, + "eval_runtime": 28.3556, + "eval_samples_per_second": 16.998, + "eval_steps_per_second": 2.151, + "step": 188 + }, + { + "epoch": 0.7842323651452282, + "grad_norm": 8.4375, + "learning_rate": 4.215767634854772e-05, + "loss": 0.8164, + "step": 189 + }, + { + "epoch": 0.7842323651452282, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.6758298873901367, + "eval_runtime": 28.3968, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.148, + "step": 189 + }, + { + "epoch": 0.7883817427385892, + "grad_norm": 5.15625, + "learning_rate": 4.211618257261411e-05, + "loss": 0.6875, + "step": 190 + }, + { + "epoch": 0.7883817427385892, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6757164001464844, + "eval_runtime": 28.4941, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.141, + "step": 190 + }, + { + "epoch": 0.7925311203319502, + "grad_norm": 7.65625, + "learning_rate": 4.20746887966805e-05, + "loss": 0.5234, + "step": 191 + }, + { + "epoch": 0.7925311203319502, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6739497184753418, + "eval_runtime": 28.5665, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 191 + }, + { + "epoch": 0.7966804979253111, + "grad_norm": 5.9375, + "learning_rate": 4.203319502074689e-05, + "loss": 0.4141, + "step": 192 + }, + { + "epoch": 0.7966804979253111, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.6737551689147949, + "eval_runtime": 28.5409, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 192 + }, + { + "epoch": 0.8008298755186722, + "grad_norm": 8.8125, + "learning_rate": 4.199170124481328e-05, + "loss": 0.7422, + "step": 193 + }, + { + "epoch": 0.8008298755186722, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6743711233139038, + "eval_runtime": 28.5943, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 193 + }, + { + "epoch": 0.8049792531120332, + "grad_norm": 5.46875, + "learning_rate": 4.195020746887967e-05, + "loss": 0.5859, + "step": 194 + }, + { + "epoch": 0.8049792531120332, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6718101501464844, + "eval_runtime": 28.5577, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 194 + }, + { + "epoch": 0.8091286307053942, + "grad_norm": 35.0, + "learning_rate": 4.190871369294606e-05, + "loss": 1.1953, + "step": 195 + }, + { + "epoch": 0.8091286307053942, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6699299812316895, + "eval_runtime": 28.5596, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 195 + }, + { + "epoch": 0.8132780082987552, + "grad_norm": 14.1875, + "learning_rate": 4.1867219917012454e-05, + "loss": 0.6719, + "step": 196 + }, + { + "epoch": 0.8132780082987552, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6715508103370667, + "eval_runtime": 28.6047, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.133, + "step": 196 + }, + { + "epoch": 0.8174273858921162, + "grad_norm": 7.625, + "learning_rate": 4.182572614107884e-05, + "loss": 0.7266, + "step": 197 + }, + { + "epoch": 0.8174273858921162, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.6715832352638245, + "eval_runtime": 28.5375, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 197 + }, + { + "epoch": 0.8215767634854771, + "grad_norm": 5.46875, + "learning_rate": 4.178423236514523e-05, + "loss": 0.4004, + "step": 198 + }, + { + "epoch": 0.8215767634854771, + "eval_accuracy": 0.6203319502074689, + "eval_loss": 0.6776776313781738, + "eval_runtime": 28.4985, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 198 + }, + { + "epoch": 0.8257261410788381, + "grad_norm": 5.1875, + "learning_rate": 4.174273858921162e-05, + "loss": 0.6484, + "step": 199 + }, + { + "epoch": 0.8257261410788381, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.6850038766860962, + "eval_runtime": 28.4785, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 199 + }, + { + "epoch": 0.8298755186721992, + "grad_norm": 7.53125, + "learning_rate": 4.1701244813278014e-05, + "loss": 0.5391, + "step": 200 + }, + { + "epoch": 0.8298755186721992, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6946641802787781, + "eval_runtime": 28.5197, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 200 + }, + { + "epoch": 0.8340248962655602, + "grad_norm": 14.75, + "learning_rate": 4.1659751037344405e-05, + "loss": 1.2344, + "step": 201 + }, + { + "epoch": 0.8340248962655602, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7046810388565063, + "eval_runtime": 28.4104, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.147, + "step": 201 + }, + { + "epoch": 0.8381742738589212, + "grad_norm": 22.75, + "learning_rate": 4.161825726141079e-05, + "loss": 0.7969, + "step": 202 + }, + { + "epoch": 0.8381742738589212, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7091221213340759, + "eval_runtime": 28.3417, + "eval_samples_per_second": 17.007, + "eval_steps_per_second": 2.152, + "step": 202 + }, + { + "epoch": 0.8423236514522822, + "grad_norm": 20.5, + "learning_rate": 4.157676348547718e-05, + "loss": 0.9336, + "step": 203 + }, + { + "epoch": 0.8423236514522822, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7122179865837097, + "eval_runtime": 28.566, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 203 + }, + { + "epoch": 0.8464730290456431, + "grad_norm": 5.34375, + "learning_rate": 4.153526970954357e-05, + "loss": 0.3926, + "step": 204 + }, + { + "epoch": 0.8464730290456431, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7106133103370667, + "eval_runtime": 28.596, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 204 + }, + { + "epoch": 0.8506224066390041, + "grad_norm": 7.9375, + "learning_rate": 4.1493775933609964e-05, + "loss": 0.4629, + "step": 205 + }, + { + "epoch": 0.8506224066390041, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7054833173751831, + "eval_runtime": 28.6231, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 205 + }, + { + "epoch": 0.8547717842323651, + "grad_norm": 10.75, + "learning_rate": 4.1452282157676356e-05, + "loss": 0.7695, + "step": 206 + }, + { + "epoch": 0.8547717842323651, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6957014799118042, + "eval_runtime": 28.6255, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 206 + }, + { + "epoch": 0.8589211618257261, + "grad_norm": 5.9375, + "learning_rate": 4.141078838174274e-05, + "loss": 0.5078, + "step": 207 + }, + { + "epoch": 0.8589211618257261, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6852226853370667, + "eval_runtime": 28.671, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.128, + "step": 207 + }, + { + "epoch": 0.8630705394190872, + "grad_norm": 6.0625, + "learning_rate": 4.1369294605809125e-05, + "loss": 0.6602, + "step": 208 + }, + { + "epoch": 0.8630705394190872, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.679055392742157, + "eval_runtime": 28.6186, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 208 + }, + { + "epoch": 0.8672199170124482, + "grad_norm": 6.21875, + "learning_rate": 4.1327800829875517e-05, + "loss": 0.3535, + "step": 209 + }, + { + "epoch": 0.8672199170124482, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6771751642227173, + "eval_runtime": 28.6803, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.127, + "step": 209 + }, + { + "epoch": 0.8713692946058091, + "grad_norm": 7.71875, + "learning_rate": 4.128630705394191e-05, + "loss": 0.6562, + "step": 210 + }, + { + "epoch": 0.8713692946058091, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6780747771263123, + "eval_runtime": 28.6611, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.128, + "step": 210 + }, + { + "epoch": 0.8755186721991701, + "grad_norm": 8.875, + "learning_rate": 4.12448132780083e-05, + "loss": 0.6719, + "step": 211 + }, + { + "epoch": 0.8755186721991701, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6778883337974548, + "eval_runtime": 28.5708, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 211 + }, + { + "epoch": 0.8796680497925311, + "grad_norm": 6.46875, + "learning_rate": 4.120331950207469e-05, + "loss": 0.582, + "step": 212 + }, + { + "epoch": 0.8796680497925311, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6774669289588928, + "eval_runtime": 28.5835, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 212 + }, + { + "epoch": 0.8838174273858921, + "grad_norm": 20.0, + "learning_rate": 4.1161825726141076e-05, + "loss": 0.6953, + "step": 213 + }, + { + "epoch": 0.8838174273858921, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6789337992668152, + "eval_runtime": 28.5212, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 213 + }, + { + "epoch": 0.8879668049792531, + "grad_norm": 16.75, + "learning_rate": 4.112033195020747e-05, + "loss": 0.4375, + "step": 214 + }, + { + "epoch": 0.8879668049792531, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6844123005867004, + "eval_runtime": 28.5276, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 214 + }, + { + "epoch": 0.8921161825726142, + "grad_norm": 6.75, + "learning_rate": 4.107883817427386e-05, + "loss": 0.3633, + "step": 215 + }, + { + "epoch": 0.8921161825726142, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6905876994132996, + "eval_runtime": 28.5225, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.139, + "step": 215 + }, + { + "epoch": 0.8962655601659751, + "grad_norm": 6.25, + "learning_rate": 4.103734439834025e-05, + "loss": 0.5586, + "step": 216 + }, + { + "epoch": 0.8962655601659751, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.705588698387146, + "eval_runtime": 28.5349, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 216 + }, + { + "epoch": 0.9004149377593361, + "grad_norm": 15.5, + "learning_rate": 4.099585062240664e-05, + "loss": 0.6992, + "step": 217 + }, + { + "epoch": 0.9004149377593361, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.7079226970672607, + "eval_runtime": 28.4916, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 217 + }, + { + "epoch": 0.9045643153526971, + "grad_norm": 5.25, + "learning_rate": 4.095435684647303e-05, + "loss": 0.5352, + "step": 218 + }, + { + "epoch": 0.9045643153526971, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.7163673639297485, + "eval_runtime": 28.5345, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 218 + }, + { + "epoch": 0.9087136929460581, + "grad_norm": 21.125, + "learning_rate": 4.091286307053942e-05, + "loss": 0.5312, + "step": 219 + }, + { + "epoch": 0.9087136929460581, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7331917881965637, + "eval_runtime": 28.4741, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 219 + }, + { + "epoch": 0.9128630705394191, + "grad_norm": 32.0, + "learning_rate": 4.087136929460581e-05, + "loss": 1.0156, + "step": 220 + }, + { + "epoch": 0.9128630705394191, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7478118538856506, + "eval_runtime": 28.4738, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 220 + }, + { + "epoch": 0.91701244813278, + "grad_norm": 7.65625, + "learning_rate": 4.08298755186722e-05, + "loss": 0.3203, + "step": 221 + }, + { + "epoch": 0.91701244813278, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7606732845306396, + "eval_runtime": 28.4787, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 221 + }, + { + "epoch": 0.921161825726141, + "grad_norm": 41.5, + "learning_rate": 4.078838174273859e-05, + "loss": 0.3887, + "step": 222 + }, + { + "epoch": 0.921161825726141, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7808772325515747, + "eval_runtime": 28.4738, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 222 + }, + { + "epoch": 0.9253112033195021, + "grad_norm": 17.625, + "learning_rate": 4.0746887966804984e-05, + "loss": 0.5234, + "step": 223 + }, + { + "epoch": 0.9253112033195021, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.792336642742157, + "eval_runtime": 28.4985, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 223 + }, + { + "epoch": 0.9294605809128631, + "grad_norm": 19.875, + "learning_rate": 4.070539419087137e-05, + "loss": 0.2754, + "step": 224 + }, + { + "epoch": 0.9294605809128631, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.8007407188415527, + "eval_runtime": 28.495, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 224 + }, + { + "epoch": 0.9336099585062241, + "grad_norm": 24.125, + "learning_rate": 4.066390041493776e-05, + "loss": 1.0078, + "step": 225 + }, + { + "epoch": 0.9336099585062241, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7989497184753418, + "eval_runtime": 28.5366, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 225 + }, + { + "epoch": 0.9377593360995851, + "grad_norm": 7.8125, + "learning_rate": 4.062240663900415e-05, + "loss": 0.4043, + "step": 226 + }, + { + "epoch": 0.9377593360995851, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.8019644618034363, + "eval_runtime": 28.4883, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.141, + "step": 226 + }, + { + "epoch": 0.941908713692946, + "grad_norm": 56.25, + "learning_rate": 4.058091286307054e-05, + "loss": 0.9453, + "step": 227 + }, + { + "epoch": 0.941908713692946, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7974908947944641, + "eval_runtime": 28.5355, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 227 + }, + { + "epoch": 0.946058091286307, + "grad_norm": 48.25, + "learning_rate": 4.0539419087136934e-05, + "loss": 1.0703, + "step": 228 + }, + { + "epoch": 0.946058091286307, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.8000356554985046, + "eval_runtime": 28.4912, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 228 + }, + { + "epoch": 0.950207468879668, + "grad_norm": 22.0, + "learning_rate": 4.049792531120332e-05, + "loss": 0.9336, + "step": 229 + }, + { + "epoch": 0.950207468879668, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7939412593841553, + "eval_runtime": 28.4811, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 229 + }, + { + "epoch": 0.9543568464730291, + "grad_norm": 14.3125, + "learning_rate": 4.045643153526971e-05, + "loss": 0.6406, + "step": 230 + }, + { + "epoch": 0.9543568464730291, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7844430804252625, + "eval_runtime": 28.5187, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 230 + }, + { + "epoch": 0.9585062240663901, + "grad_norm": 7.65625, + "learning_rate": 4.04149377593361e-05, + "loss": 0.459, + "step": 231 + }, + { + "epoch": 0.9585062240663901, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7833327651023865, + "eval_runtime": 28.4731, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 231 + }, + { + "epoch": 0.9626556016597511, + "grad_norm": 17.375, + "learning_rate": 4.0373443983402494e-05, + "loss": 0.9453, + "step": 232 + }, + { + "epoch": 0.9626556016597511, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7691908478736877, + "eval_runtime": 28.5231, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.139, + "step": 232 + }, + { + "epoch": 0.966804979253112, + "grad_norm": 22.875, + "learning_rate": 4.0331950207468885e-05, + "loss": 0.8672, + "step": 233 + }, + { + "epoch": 0.966804979253112, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7557458877563477, + "eval_runtime": 28.5141, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.139, + "step": 233 + }, + { + "epoch": 0.970954356846473, + "grad_norm": 30.5, + "learning_rate": 4.029045643153527e-05, + "loss": 0.8281, + "step": 234 + }, + { + "epoch": 0.970954356846473, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7392699718475342, + "eval_runtime": 28.474, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 234 + }, + { + "epoch": 0.975103734439834, + "grad_norm": 21.125, + "learning_rate": 4.024896265560166e-05, + "loss": 0.8398, + "step": 235 + }, + { + "epoch": 0.975103734439834, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7236449718475342, + "eval_runtime": 28.5309, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.138, + "step": 235 + }, + { + "epoch": 0.979253112033195, + "grad_norm": 170.0, + "learning_rate": 4.020746887966805e-05, + "loss": 0.3477, + "step": 236 + }, + { + "epoch": 0.979253112033195, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7111968398094177, + "eval_runtime": 28.512, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 236 + }, + { + "epoch": 0.983402489626556, + "grad_norm": 19.0, + "learning_rate": 4.0165975103734444e-05, + "loss": 0.334, + "step": 237 + }, + { + "epoch": 0.983402489626556, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6991863250732422, + "eval_runtime": 28.5298, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.138, + "step": 237 + }, + { + "epoch": 0.9875518672199171, + "grad_norm": 20.25, + "learning_rate": 4.0124481327800836e-05, + "loss": 0.7266, + "step": 238 + }, + { + "epoch": 0.9875518672199171, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6941617131233215, + "eval_runtime": 28.4805, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 238 + }, + { + "epoch": 0.991701244813278, + "grad_norm": 20.25, + "learning_rate": 4.008298755186723e-05, + "loss": 0.4316, + "step": 239 + }, + { + "epoch": 0.991701244813278, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6904823780059814, + "eval_runtime": 28.4663, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.143, + "step": 239 + }, + { + "epoch": 0.995850622406639, + "grad_norm": 56.0, + "learning_rate": 4.004149377593361e-05, + "loss": 0.9414, + "step": 240 + }, + { + "epoch": 0.995850622406639, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.6870542764663696, + "eval_runtime": 28.5195, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 240 + }, + { + "epoch": 1.0, + "grad_norm": 11.8125, + "learning_rate": 4e-05, + "loss": 0.6875, + "step": 241 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6820701360702515, + "eval_runtime": 28.4743, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 241 + }, + { + "epoch": 1.004149377593361, + "grad_norm": 27.25, + "learning_rate": 3.9958506224066395e-05, + "loss": 0.7812, + "step": 242 + }, + { + "epoch": 1.004149377593361, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6876944899559021, + "eval_runtime": 28.4917, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 242 + }, + { + "epoch": 1.008298755186722, + "grad_norm": 11.0, + "learning_rate": 3.9917012448132786e-05, + "loss": 0.3828, + "step": 243 + }, + { + "epoch": 1.008298755186722, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6951261162757874, + "eval_runtime": 28.5245, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.139, + "step": 243 + }, + { + "epoch": 1.012448132780083, + "grad_norm": 41.75, + "learning_rate": 3.987551867219918e-05, + "loss": 0.5469, + "step": 244 + }, + { + "epoch": 1.012448132780083, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6961148381233215, + "eval_runtime": 28.5213, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 244 + }, + { + "epoch": 1.016597510373444, + "grad_norm": 10.875, + "learning_rate": 3.983402489626556e-05, + "loss": 0.4609, + "step": 245 + }, + { + "epoch": 1.016597510373444, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.697776198387146, + "eval_runtime": 28.4655, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.143, + "step": 245 + }, + { + "epoch": 1.020746887966805, + "grad_norm": 38.75, + "learning_rate": 3.979253112033195e-05, + "loss": 1.0234, + "step": 246 + }, + { + "epoch": 1.020746887966805, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7045270204544067, + "eval_runtime": 28.4476, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 246 + }, + { + "epoch": 1.0248962655601659, + "grad_norm": 16.0, + "learning_rate": 3.975103734439834e-05, + "loss": 1.2656, + "step": 247 + }, + { + "epoch": 1.0248962655601659, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7004181742668152, + "eval_runtime": 28.4446, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.145, + "step": 247 + }, + { + "epoch": 1.0290456431535269, + "grad_norm": 28.125, + "learning_rate": 3.970954356846473e-05, + "loss": 0.6328, + "step": 248 + }, + { + "epoch": 1.0290456431535269, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.696965754032135, + "eval_runtime": 28.4225, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.146, + "step": 248 + }, + { + "epoch": 1.033195020746888, + "grad_norm": 19.375, + "learning_rate": 3.966804979253112e-05, + "loss": 0.8359, + "step": 249 + }, + { + "epoch": 1.033195020746888, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6886507868766785, + "eval_runtime": 28.413, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.147, + "step": 249 + }, + { + "epoch": 1.037344398340249, + "grad_norm": 22.375, + "learning_rate": 3.962655601659751e-05, + "loss": 0.4023, + "step": 250 + }, + { + "epoch": 1.037344398340249, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6887966990470886, + "eval_runtime": 28.4572, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.144, + "step": 250 + }, + { + "epoch": 1.04149377593361, + "grad_norm": 18.875, + "learning_rate": 3.95850622406639e-05, + "loss": 1.7891, + "step": 251 + }, + { + "epoch": 1.04149377593361, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6792985200881958, + "eval_runtime": 28.3578, + "eval_samples_per_second": 16.997, + "eval_steps_per_second": 2.151, + "step": 251 + }, + { + "epoch": 1.045643153526971, + "grad_norm": 7.1875, + "learning_rate": 3.954356846473029e-05, + "loss": 0.3984, + "step": 252 + }, + { + "epoch": 1.045643153526971, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6826860904693604, + "eval_runtime": 28.3225, + "eval_samples_per_second": 17.018, + "eval_steps_per_second": 2.154, + "step": 252 + }, + { + "epoch": 1.049792531120332, + "grad_norm": 29.125, + "learning_rate": 3.950207468879668e-05, + "loss": 0.7812, + "step": 253 + }, + { + "epoch": 1.049792531120332, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6793957352638245, + "eval_runtime": 28.5375, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 253 + }, + { + "epoch": 1.053941908713693, + "grad_norm": 14.375, + "learning_rate": 3.946058091286307e-05, + "loss": 0.5312, + "step": 254 + }, + { + "epoch": 1.053941908713693, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6781152486801147, + "eval_runtime": 28.5308, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.138, + "step": 254 + }, + { + "epoch": 1.058091286307054, + "grad_norm": 6.84375, + "learning_rate": 3.9419087136929464e-05, + "loss": 0.4492, + "step": 255 + }, + { + "epoch": 1.058091286307054, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6802548170089722, + "eval_runtime": 28.5964, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 255 + }, + { + "epoch": 1.062240663900415, + "grad_norm": 19.125, + "learning_rate": 3.937759336099585e-05, + "loss": 0.8086, + "step": 256 + }, + { + "epoch": 1.062240663900415, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6811462640762329, + "eval_runtime": 28.5345, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 256 + }, + { + "epoch": 1.066390041493776, + "grad_norm": 13.8125, + "learning_rate": 3.933609958506224e-05, + "loss": 0.6797, + "step": 257 + }, + { + "epoch": 1.066390041493776, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6798171401023865, + "eval_runtime": 28.5475, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 257 + }, + { + "epoch": 1.070539419087137, + "grad_norm": 20.875, + "learning_rate": 3.929460580912863e-05, + "loss": 0.6875, + "step": 258 + }, + { + "epoch": 1.070539419087137, + "eval_accuracy": 0.6203319502074689, + "eval_loss": 0.6801737546920776, + "eval_runtime": 28.5858, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.134, + "step": 258 + }, + { + "epoch": 1.0746887966804979, + "grad_norm": 22.75, + "learning_rate": 3.925311203319502e-05, + "loss": 1.0078, + "step": 259 + }, + { + "epoch": 1.0746887966804979, + "eval_accuracy": 0.6182572614107884, + "eval_loss": 0.6810327768325806, + "eval_runtime": 28.5448, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 259 + }, + { + "epoch": 1.0788381742738589, + "grad_norm": 15.3125, + "learning_rate": 3.9211618257261414e-05, + "loss": 0.5039, + "step": 260 + }, + { + "epoch": 1.0788381742738589, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.6807572841644287, + "eval_runtime": 28.5456, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 260 + }, + { + "epoch": 1.0829875518672198, + "grad_norm": 15.375, + "learning_rate": 3.91701244813278e-05, + "loss": 0.3828, + "step": 261 + }, + { + "epoch": 1.0829875518672198, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6818918585777283, + "eval_runtime": 28.5442, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 261 + }, + { + "epoch": 1.0871369294605808, + "grad_norm": 16.75, + "learning_rate": 3.912863070539419e-05, + "loss": 0.8711, + "step": 262 + }, + { + "epoch": 1.0871369294605808, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6840313673019409, + "eval_runtime": 28.4561, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.144, + "step": 262 + }, + { + "epoch": 1.0912863070539418, + "grad_norm": 11.375, + "learning_rate": 3.908713692946058e-05, + "loss": 0.6523, + "step": 263 + }, + { + "epoch": 1.0912863070539418, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6872730851173401, + "eval_runtime": 28.4735, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 263 + }, + { + "epoch": 1.095435684647303, + "grad_norm": 11.75, + "learning_rate": 3.9045643153526973e-05, + "loss": 0.6875, + "step": 264 + }, + { + "epoch": 1.095435684647303, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6867544054985046, + "eval_runtime": 28.4047, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.148, + "step": 264 + }, + { + "epoch": 1.099585062240664, + "grad_norm": 11.875, + "learning_rate": 3.9004149377593365e-05, + "loss": 0.5508, + "step": 265 + }, + { + "epoch": 1.099585062240664, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6861870884895325, + "eval_runtime": 28.4046, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.148, + "step": 265 + }, + { + "epoch": 1.103734439834025, + "grad_norm": 6.4375, + "learning_rate": 3.8962655601659756e-05, + "loss": 0.3516, + "step": 266 + }, + { + "epoch": 1.103734439834025, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.6898016333580017, + "eval_runtime": 28.383, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.149, + "step": 266 + }, + { + "epoch": 1.107883817427386, + "grad_norm": 34.25, + "learning_rate": 3.892116182572614e-05, + "loss": 1.0859, + "step": 267 + }, + { + "epoch": 1.107883817427386, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.6913089752197266, + "eval_runtime": 28.3678, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.15, + "step": 267 + }, + { + "epoch": 1.112033195020747, + "grad_norm": 21.5, + "learning_rate": 3.887966804979253e-05, + "loss": 0.6328, + "step": 268 + }, + { + "epoch": 1.112033195020747, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6936268210411072, + "eval_runtime": 28.4023, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 268 + }, + { + "epoch": 1.116182572614108, + "grad_norm": 15.375, + "learning_rate": 3.8838174273858924e-05, + "loss": 0.5312, + "step": 269 + }, + { + "epoch": 1.116182572614108, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6951504349708557, + "eval_runtime": 28.5069, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.14, + "step": 269 + }, + { + "epoch": 1.120331950207469, + "grad_norm": 21.375, + "learning_rate": 3.8796680497925316e-05, + "loss": 0.8594, + "step": 270 + }, + { + "epoch": 1.120331950207469, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6953287124633789, + "eval_runtime": 28.5684, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 270 + }, + { + "epoch": 1.1244813278008299, + "grad_norm": 16.125, + "learning_rate": 3.875518672199171e-05, + "loss": 0.5859, + "step": 271 + }, + { + "epoch": 1.1244813278008299, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6952152252197266, + "eval_runtime": 28.5169, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.139, + "step": 271 + }, + { + "epoch": 1.1286307053941909, + "grad_norm": 11.375, + "learning_rate": 3.871369294605809e-05, + "loss": 0.5586, + "step": 272 + }, + { + "epoch": 1.1286307053941909, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6968361139297485, + "eval_runtime": 28.5457, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 272 + }, + { + "epoch": 1.1327800829875518, + "grad_norm": 41.0, + "learning_rate": 3.867219917012448e-05, + "loss": 0.875, + "step": 273 + }, + { + "epoch": 1.1327800829875518, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6974276900291443, + "eval_runtime": 28.5635, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 273 + }, + { + "epoch": 1.1369294605809128, + "grad_norm": 12.5, + "learning_rate": 3.8630705394190875e-05, + "loss": 0.5273, + "step": 274 + }, + { + "epoch": 1.1369294605809128, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6963903903961182, + "eval_runtime": 28.5707, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 274 + }, + { + "epoch": 1.1410788381742738, + "grad_norm": 13.875, + "learning_rate": 3.8589211618257266e-05, + "loss": 0.8555, + "step": 275 + }, + { + "epoch": 1.1410788381742738, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6944453716278076, + "eval_runtime": 28.5979, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 275 + }, + { + "epoch": 1.1452282157676348, + "grad_norm": 98.0, + "learning_rate": 3.854771784232366e-05, + "loss": 0.8555, + "step": 276 + }, + { + "epoch": 1.1452282157676348, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6781395673751831, + "eval_runtime": 28.5992, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 276 + }, + { + "epoch": 1.1493775933609958, + "grad_norm": 22.875, + "learning_rate": 3.850622406639004e-05, + "loss": 0.8555, + "step": 277 + }, + { + "epoch": 1.1493775933609958, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6751329302787781, + "eval_runtime": 28.5928, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 277 + }, + { + "epoch": 1.1535269709543567, + "grad_norm": 20.375, + "learning_rate": 3.8464730290456434e-05, + "loss": 0.6836, + "step": 278 + }, + { + "epoch": 1.1535269709543567, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6721667647361755, + "eval_runtime": 28.5638, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.136, + "step": 278 + }, + { + "epoch": 1.1576763485477177, + "grad_norm": 27.5, + "learning_rate": 3.8423236514522825e-05, + "loss": 0.5469, + "step": 279 + }, + { + "epoch": 1.1576763485477177, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6693302392959595, + "eval_runtime": 28.5368, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 279 + }, + { + "epoch": 1.161825726141079, + "grad_norm": 24.125, + "learning_rate": 3.838174273858922e-05, + "loss": 0.7461, + "step": 280 + }, + { + "epoch": 1.161825726141079, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6662182211875916, + "eval_runtime": 28.5648, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 280 + }, + { + "epoch": 1.16597510373444, + "grad_norm": 15.375, + "learning_rate": 3.834024896265561e-05, + "loss": 0.9062, + "step": 281 + }, + { + "epoch": 1.16597510373444, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6632844805717468, + "eval_runtime": 28.6078, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 281 + }, + { + "epoch": 1.170124481327801, + "grad_norm": 13.125, + "learning_rate": 3.829875518672199e-05, + "loss": 0.5195, + "step": 282 + }, + { + "epoch": 1.170124481327801, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6619067788124084, + "eval_runtime": 28.6431, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.13, + "step": 282 + }, + { + "epoch": 1.1742738589211619, + "grad_norm": 10.4375, + "learning_rate": 3.825726141078838e-05, + "loss": 0.6797, + "step": 283 + }, + { + "epoch": 1.1742738589211619, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.6613556742668152, + "eval_runtime": 28.7041, + "eval_samples_per_second": 16.792, + "eval_steps_per_second": 2.125, + "step": 283 + }, + { + "epoch": 1.1784232365145229, + "grad_norm": 9.3125, + "learning_rate": 3.821576763485477e-05, + "loss": 0.7266, + "step": 284 + }, + { + "epoch": 1.1784232365145229, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6623281836509705, + "eval_runtime": 28.7153, + "eval_samples_per_second": 16.785, + "eval_steps_per_second": 2.124, + "step": 284 + }, + { + "epoch": 1.1825726141078838, + "grad_norm": 9.5625, + "learning_rate": 3.817427385892116e-05, + "loss": 0.6406, + "step": 285 + }, + { + "epoch": 1.1825726141078838, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.662830650806427, + "eval_runtime": 28.7215, + "eval_samples_per_second": 16.782, + "eval_steps_per_second": 2.124, + "step": 285 + }, + { + "epoch": 1.1867219917012448, + "grad_norm": 25.125, + "learning_rate": 3.813278008298755e-05, + "loss": 0.6289, + "step": 286 + }, + { + "epoch": 1.1867219917012448, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6630089282989502, + "eval_runtime": 28.6256, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 286 + }, + { + "epoch": 1.1908713692946058, + "grad_norm": 15.1875, + "learning_rate": 3.8091286307053944e-05, + "loss": 0.7227, + "step": 287 + }, + { + "epoch": 1.1908713692946058, + "eval_accuracy": 0.5518672199170125, + "eval_loss": 0.6631224155426025, + "eval_runtime": 28.5465, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 287 + }, + { + "epoch": 1.1950207468879668, + "grad_norm": 12.0625, + "learning_rate": 3.8049792531120335e-05, + "loss": 0.5977, + "step": 288 + }, + { + "epoch": 1.1950207468879668, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6631224155426025, + "eval_runtime": 28.5345, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 288 + }, + { + "epoch": 1.1991701244813278, + "grad_norm": 21.375, + "learning_rate": 3.800829875518672e-05, + "loss": 0.875, + "step": 289 + }, + { + "epoch": 1.1991701244813278, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6644191145896912, + "eval_runtime": 28.4443, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.145, + "step": 289 + }, + { + "epoch": 1.2033195020746887, + "grad_norm": 13.25, + "learning_rate": 3.796680497925311e-05, + "loss": 0.7227, + "step": 290 + }, + { + "epoch": 1.2033195020746887, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6666882634162903, + "eval_runtime": 28.4732, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 290 + }, + { + "epoch": 1.2074688796680497, + "grad_norm": 16.125, + "learning_rate": 3.79253112033195e-05, + "loss": 0.9102, + "step": 291 + }, + { + "epoch": 1.2074688796680497, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6688277721405029, + "eval_runtime": 28.5052, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.14, + "step": 291 + }, + { + "epoch": 1.2116182572614107, + "grad_norm": 11.1875, + "learning_rate": 3.7883817427385894e-05, + "loss": 0.707, + "step": 292 + }, + { + "epoch": 1.2116182572614107, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6685360670089722, + "eval_runtime": 28.5731, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.135, + "step": 292 + }, + { + "epoch": 1.215767634854772, + "grad_norm": 12.875, + "learning_rate": 3.7842323651452286e-05, + "loss": 0.6602, + "step": 293 + }, + { + "epoch": 1.215767634854772, + "eval_accuracy": 0.5560165975103735, + "eval_loss": 0.6705135107040405, + "eval_runtime": 28.615, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 293 + }, + { + "epoch": 1.2199170124481329, + "grad_norm": 24.0, + "learning_rate": 3.780082987551867e-05, + "loss": 0.6953, + "step": 294 + }, + { + "epoch": 1.2199170124481329, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6730419993400574, + "eval_runtime": 28.6626, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.128, + "step": 294 + }, + { + "epoch": 1.2240663900414939, + "grad_norm": 12.125, + "learning_rate": 3.775933609958506e-05, + "loss": 0.6758, + "step": 295 + }, + { + "epoch": 1.2240663900414939, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6695895791053772, + "eval_runtime": 28.6225, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 295 + }, + { + "epoch": 1.2282157676348548, + "grad_norm": 15.0, + "learning_rate": 3.771784232365145e-05, + "loss": 0.8047, + "step": 296 + }, + { + "epoch": 1.2282157676348548, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.666736900806427, + "eval_runtime": 28.6212, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 296 + }, + { + "epoch": 1.2323651452282158, + "grad_norm": 10.875, + "learning_rate": 3.7676348547717845e-05, + "loss": 0.5352, + "step": 297 + }, + { + "epoch": 1.2323651452282158, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6662182211875916, + "eval_runtime": 28.6169, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 297 + }, + { + "epoch": 1.2365145228215768, + "grad_norm": 12.75, + "learning_rate": 3.7634854771784236e-05, + "loss": 0.6758, + "step": 298 + }, + { + "epoch": 1.2365145228215768, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6653591990470886, + "eval_runtime": 28.6147, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 298 + }, + { + "epoch": 1.2406639004149378, + "grad_norm": 12.1875, + "learning_rate": 3.759336099585062e-05, + "loss": 0.6172, + "step": 299 + }, + { + "epoch": 1.2406639004149378, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6665099859237671, + "eval_runtime": 28.5972, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 299 + }, + { + "epoch": 1.2448132780082988, + "grad_norm": 30.5, + "learning_rate": 3.755186721991701e-05, + "loss": 0.4805, + "step": 300 + }, + { + "epoch": 1.2448132780082988, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6660399436950684, + "eval_runtime": 28.6136, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 300 + }, + { + "epoch": 1.2489626556016598, + "grad_norm": 13.625, + "learning_rate": 3.7510373443983404e-05, + "loss": 0.6562, + "step": 301 + }, + { + "epoch": 1.2489626556016598, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6668584942817688, + "eval_runtime": 28.3526, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 301 + }, + { + "epoch": 1.2531120331950207, + "grad_norm": 18.0, + "learning_rate": 3.7468879668049795e-05, + "loss": 0.6016, + "step": 302 + }, + { + "epoch": 1.2531120331950207, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6677256226539612, + "eval_runtime": 28.4167, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.147, + "step": 302 + }, + { + "epoch": 1.2572614107883817, + "grad_norm": 11.25, + "learning_rate": 3.742738589211619e-05, + "loss": 0.5391, + "step": 303 + }, + { + "epoch": 1.2572614107883817, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6673204302787781, + "eval_runtime": 28.5643, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.136, + "step": 303 + }, + { + "epoch": 1.2614107883817427, + "grad_norm": 11.6875, + "learning_rate": 3.738589211618257e-05, + "loss": 0.7656, + "step": 304 + }, + { + "epoch": 1.2614107883817427, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6667693257331848, + "eval_runtime": 28.5702, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 304 + }, + { + "epoch": 1.2655601659751037, + "grad_norm": 10.375, + "learning_rate": 3.734439834024896e-05, + "loss": 0.5, + "step": 305 + }, + { + "epoch": 1.2655601659751037, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6679120063781738, + "eval_runtime": 28.654, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 305 + }, + { + "epoch": 1.2697095435684647, + "grad_norm": 18.75, + "learning_rate": 3.7302904564315355e-05, + "loss": 0.4824, + "step": 306 + }, + { + "epoch": 1.2697095435684647, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6676769852638245, + "eval_runtime": 28.6192, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 306 + }, + { + "epoch": 1.2738589211618256, + "grad_norm": 24.875, + "learning_rate": 3.7261410788381746e-05, + "loss": 0.4336, + "step": 307 + }, + { + "epoch": 1.2738589211618256, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6713887453079224, + "eval_runtime": 28.6264, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 307 + }, + { + "epoch": 1.2780082987551866, + "grad_norm": 8.5, + "learning_rate": 3.721991701244814e-05, + "loss": 0.5508, + "step": 308 + }, + { + "epoch": 1.2780082987551866, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.672085702419281, + "eval_runtime": 28.6204, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 308 + }, + { + "epoch": 1.2821576763485476, + "grad_norm": 11.3125, + "learning_rate": 3.717842323651453e-05, + "loss": 0.7148, + "step": 309 + }, + { + "epoch": 1.2821576763485476, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6753112077713013, + "eval_runtime": 28.609, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 309 + }, + { + "epoch": 1.2863070539419086, + "grad_norm": 9.125, + "learning_rate": 3.7136929460580914e-05, + "loss": 0.6484, + "step": 310 + }, + { + "epoch": 1.2863070539419086, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6780018210411072, + "eval_runtime": 28.6139, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 310 + }, + { + "epoch": 1.2904564315352698, + "grad_norm": 6.375, + "learning_rate": 3.7095435684647305e-05, + "loss": 0.6406, + "step": 311 + }, + { + "epoch": 1.2904564315352698, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6833668351173401, + "eval_runtime": 28.6578, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.129, + "step": 311 + }, + { + "epoch": 1.2946058091286308, + "grad_norm": 13.875, + "learning_rate": 3.70539419087137e-05, + "loss": 0.5781, + "step": 312 + }, + { + "epoch": 1.2946058091286308, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6939023733139038, + "eval_runtime": 28.5475, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 312 + }, + { + "epoch": 1.2987551867219918, + "grad_norm": 7.84375, + "learning_rate": 3.701244813278009e-05, + "loss": 0.7539, + "step": 313 + }, + { + "epoch": 1.2987551867219918, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7070879936218262, + "eval_runtime": 28.4418, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.145, + "step": 313 + }, + { + "epoch": 1.3029045643153527, + "grad_norm": 13.0625, + "learning_rate": 3.697095435684648e-05, + "loss": 0.5703, + "step": 314 + }, + { + "epoch": 1.3029045643153527, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7082144618034363, + "eval_runtime": 28.3815, + "eval_samples_per_second": 16.983, + "eval_steps_per_second": 2.149, + "step": 314 + }, + { + "epoch": 1.3070539419087137, + "grad_norm": 35.25, + "learning_rate": 3.6929460580912864e-05, + "loss": 1.0156, + "step": 315 + }, + { + "epoch": 1.3070539419087137, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7041623592376709, + "eval_runtime": 28.3446, + "eval_samples_per_second": 17.005, + "eval_steps_per_second": 2.152, + "step": 315 + }, + { + "epoch": 1.3112033195020747, + "grad_norm": 14.125, + "learning_rate": 3.6887966804979256e-05, + "loss": 0.8828, + "step": 316 + }, + { + "epoch": 1.3112033195020747, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7021362781524658, + "eval_runtime": 28.387, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.149, + "step": 316 + }, + { + "epoch": 1.3153526970954357, + "grad_norm": 12.3125, + "learning_rate": 3.684647302904565e-05, + "loss": 1.0938, + "step": 317 + }, + { + "epoch": 1.3153526970954357, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6985298991203308, + "eval_runtime": 28.5146, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.139, + "step": 317 + }, + { + "epoch": 1.3195020746887967, + "grad_norm": 16.125, + "learning_rate": 3.680497925311204e-05, + "loss": 0.5352, + "step": 318 + }, + { + "epoch": 1.3195020746887967, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6993646025657654, + "eval_runtime": 28.5515, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.136, + "step": 318 + }, + { + "epoch": 1.3236514522821576, + "grad_norm": 36.25, + "learning_rate": 3.676348547717843e-05, + "loss": 1.3438, + "step": 319 + }, + { + "epoch": 1.3236514522821576, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6976789236068726, + "eval_runtime": 28.568, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 319 + }, + { + "epoch": 1.3278008298755186, + "grad_norm": 7.96875, + "learning_rate": 3.6721991701244815e-05, + "loss": 0.4727, + "step": 320 + }, + { + "epoch": 1.3278008298755186, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6973223686218262, + "eval_runtime": 28.5705, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 320 + }, + { + "epoch": 1.3319502074688796, + "grad_norm": 6.5, + "learning_rate": 3.66804979253112e-05, + "loss": 0.5703, + "step": 321 + }, + { + "epoch": 1.3319502074688796, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6974682211875916, + "eval_runtime": 28.5622, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 321 + }, + { + "epoch": 1.3360995850622408, + "grad_norm": 12.25, + "learning_rate": 3.663900414937759e-05, + "loss": 0.4863, + "step": 322 + }, + { + "epoch": 1.3360995850622408, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7023956179618835, + "eval_runtime": 28.5743, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.135, + "step": 322 + }, + { + "epoch": 1.3402489626556018, + "grad_norm": 14.8125, + "learning_rate": 3.659751037344398e-05, + "loss": 0.7188, + "step": 323 + }, + { + "epoch": 1.3402489626556018, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7000454068183899, + "eval_runtime": 28.5442, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 323 + }, + { + "epoch": 1.3443983402489628, + "grad_norm": 8.875, + "learning_rate": 3.6556016597510374e-05, + "loss": 0.4141, + "step": 324 + }, + { + "epoch": 1.3443983402489628, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6987487077713013, + "eval_runtime": 28.4952, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 324 + }, + { + "epoch": 1.3485477178423237, + "grad_norm": 13.375, + "learning_rate": 3.6514522821576766e-05, + "loss": 0.9141, + "step": 325 + }, + { + "epoch": 1.3485477178423237, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.697565495967865, + "eval_runtime": 28.3725, + "eval_samples_per_second": 16.988, + "eval_steps_per_second": 2.15, + "step": 325 + }, + { + "epoch": 1.3526970954356847, + "grad_norm": 8.0625, + "learning_rate": 3.647302904564315e-05, + "loss": 0.5742, + "step": 326 + }, + { + "epoch": 1.3526970954356847, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6982948780059814, + "eval_runtime": 28.3921, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.148, + "step": 326 + }, + { + "epoch": 1.3568464730290457, + "grad_norm": 6.75, + "learning_rate": 3.643153526970954e-05, + "loss": 0.5977, + "step": 327 + }, + { + "epoch": 1.3568464730290457, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6991701126098633, + "eval_runtime": 28.3959, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.148, + "step": 327 + }, + { + "epoch": 1.3609958506224067, + "grad_norm": 10.8125, + "learning_rate": 3.639004149377593e-05, + "loss": 0.3438, + "step": 328 + }, + { + "epoch": 1.3609958506224067, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.701082706451416, + "eval_runtime": 28.5156, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.139, + "step": 328 + }, + { + "epoch": 1.3651452282157677, + "grad_norm": 15.0625, + "learning_rate": 3.6348547717842325e-05, + "loss": 0.9492, + "step": 329 + }, + { + "epoch": 1.3651452282157677, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7010017037391663, + "eval_runtime": 28.5721, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 329 + }, + { + "epoch": 1.3692946058091287, + "grad_norm": 25.5, + "learning_rate": 3.6307053941908716e-05, + "loss": 0.7422, + "step": 330 + }, + { + "epoch": 1.3692946058091287, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.702914297580719, + "eval_runtime": 28.5432, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.137, + "step": 330 + }, + { + "epoch": 1.3734439834024896, + "grad_norm": 13.25, + "learning_rate": 3.626556016597511e-05, + "loss": 0.6797, + "step": 331 + }, + { + "epoch": 1.3734439834024896, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7024280428886414, + "eval_runtime": 28.601, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 331 + }, + { + "epoch": 1.3775933609958506, + "grad_norm": 17.875, + "learning_rate": 3.622406639004149e-05, + "loss": 0.9688, + "step": 332 + }, + { + "epoch": 1.3775933609958506, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.701082706451416, + "eval_runtime": 28.6182, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.132, + "step": 332 + }, + { + "epoch": 1.3817427385892116, + "grad_norm": 9.75, + "learning_rate": 3.6182572614107884e-05, + "loss": 0.5547, + "step": 333 + }, + { + "epoch": 1.3817427385892116, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6996563673019409, + "eval_runtime": 28.572, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 333 + }, + { + "epoch": 1.3858921161825726, + "grad_norm": 13.5625, + "learning_rate": 3.6141078838174275e-05, + "loss": 0.8281, + "step": 334 + }, + { + "epoch": 1.3858921161825726, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6991376876831055, + "eval_runtime": 28.5676, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 334 + }, + { + "epoch": 1.3900414937759336, + "grad_norm": 13.0625, + "learning_rate": 3.609958506224067e-05, + "loss": 0.625, + "step": 335 + }, + { + "epoch": 1.3900414937759336, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6978410482406616, + "eval_runtime": 28.5388, + "eval_samples_per_second": 16.889, + "eval_steps_per_second": 2.137, + "step": 335 + }, + { + "epoch": 1.3941908713692945, + "grad_norm": 3.375, + "learning_rate": 3.605809128630706e-05, + "loss": 0.2314, + "step": 336 + }, + { + "epoch": 1.3941908713692945, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6983110904693604, + "eval_runtime": 28.4675, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.143, + "step": 336 + }, + { + "epoch": 1.3983402489626555, + "grad_norm": 7.53125, + "learning_rate": 3.601659751037344e-05, + "loss": 0.6445, + "step": 337 + }, + { + "epoch": 1.3983402489626555, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6992025375366211, + "eval_runtime": 28.4261, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 337 + }, + { + "epoch": 1.4024896265560165, + "grad_norm": 16.25, + "learning_rate": 3.5975103734439834e-05, + "loss": 0.6133, + "step": 338 + }, + { + "epoch": 1.4024896265560165, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7013744711875916, + "eval_runtime": 28.5014, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.14, + "step": 338 + }, + { + "epoch": 1.4066390041493775, + "grad_norm": 5.8125, + "learning_rate": 3.5933609958506226e-05, + "loss": 0.4766, + "step": 339 + }, + { + "epoch": 1.4066390041493775, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7052158713340759, + "eval_runtime": 28.5413, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 339 + }, + { + "epoch": 1.4107883817427385, + "grad_norm": 6.21875, + "learning_rate": 3.589211618257262e-05, + "loss": 0.6445, + "step": 340 + }, + { + "epoch": 1.4107883817427385, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7068853974342346, + "eval_runtime": 28.6083, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 340 + }, + { + "epoch": 1.4149377593360997, + "grad_norm": 7.40625, + "learning_rate": 3.585062240663901e-05, + "loss": 0.6641, + "step": 341 + }, + { + "epoch": 1.4149377593360997, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7107754349708557, + "eval_runtime": 28.5683, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 341 + }, + { + "epoch": 1.4190871369294606, + "grad_norm": 9.9375, + "learning_rate": 3.5809128630705394e-05, + "loss": 0.707, + "step": 342 + }, + { + "epoch": 1.4190871369294606, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7118613719940186, + "eval_runtime": 28.5527, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 342 + }, + { + "epoch": 1.4232365145228216, + "grad_norm": 10.8125, + "learning_rate": 3.5767634854771785e-05, + "loss": 0.5, + "step": 343 + }, + { + "epoch": 1.4232365145228216, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7114075422286987, + "eval_runtime": 28.5939, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 343 + }, + { + "epoch": 1.4273858921161826, + "grad_norm": 57.0, + "learning_rate": 3.5726141078838177e-05, + "loss": 0.9492, + "step": 344 + }, + { + "epoch": 1.4273858921161826, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7123152017593384, + "eval_runtime": 28.5614, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.136, + "step": 344 + }, + { + "epoch": 1.4315352697095436, + "grad_norm": 5.625, + "learning_rate": 3.568464730290457e-05, + "loss": 0.3672, + "step": 345 + }, + { + "epoch": 1.4315352697095436, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7133687734603882, + "eval_runtime": 28.4859, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 345 + }, + { + "epoch": 1.4356846473029046, + "grad_norm": 20.75, + "learning_rate": 3.564315352697096e-05, + "loss": 0.5781, + "step": 346 + }, + { + "epoch": 1.4356846473029046, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7163835763931274, + "eval_runtime": 28.5359, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 346 + }, + { + "epoch": 1.4398340248962656, + "grad_norm": 5.40625, + "learning_rate": 3.5601659751037344e-05, + "loss": 0.5156, + "step": 347 + }, + { + "epoch": 1.4398340248962656, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7204194664955139, + "eval_runtime": 28.6277, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 347 + }, + { + "epoch": 1.4439834024896265, + "grad_norm": 11.6875, + "learning_rate": 3.5560165975103736e-05, + "loss": 0.5586, + "step": 348 + }, + { + "epoch": 1.4439834024896265, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7237746119499207, + "eval_runtime": 28.5598, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 348 + }, + { + "epoch": 1.4481327800829875, + "grad_norm": 10.3125, + "learning_rate": 3.551867219917013e-05, + "loss": 0.7578, + "step": 349 + }, + { + "epoch": 1.4481327800829875, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7242122888565063, + "eval_runtime": 28.5602, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 349 + }, + { + "epoch": 1.4522821576763485, + "grad_norm": 11.625, + "learning_rate": 3.547717842323652e-05, + "loss": 0.9609, + "step": 350 + }, + { + "epoch": 1.4522821576763485, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7233856320381165, + "eval_runtime": 28.5999, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 350 + }, + { + "epoch": 1.4564315352697095, + "grad_norm": 14.1875, + "learning_rate": 3.543568464730291e-05, + "loss": 1.1875, + "step": 351 + }, + { + "epoch": 1.4564315352697095, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7228345274925232, + "eval_runtime": 28.5108, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.14, + "step": 351 + }, + { + "epoch": 1.4605809128630705, + "grad_norm": 6.125, + "learning_rate": 3.53941908713693e-05, + "loss": 0.5234, + "step": 352 + }, + { + "epoch": 1.4605809128630705, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7206301689147949, + "eval_runtime": 28.449, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 352 + }, + { + "epoch": 1.4647302904564317, + "grad_norm": 10.875, + "learning_rate": 3.5352697095435686e-05, + "loss": 0.5508, + "step": 353 + }, + { + "epoch": 1.4647302904564317, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7207598686218262, + "eval_runtime": 28.5283, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 353 + }, + { + "epoch": 1.4688796680497926, + "grad_norm": 5.21875, + "learning_rate": 3.531120331950208e-05, + "loss": 0.5508, + "step": 354 + }, + { + "epoch": 1.4688796680497926, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7211812734603882, + "eval_runtime": 28.5506, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.137, + "step": 354 + }, + { + "epoch": 1.4730290456431536, + "grad_norm": 7.3125, + "learning_rate": 3.526970954356847e-05, + "loss": 0.7383, + "step": 355 + }, + { + "epoch": 1.4730290456431536, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7188148498535156, + "eval_runtime": 28.5633, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 355 + }, + { + "epoch": 1.4771784232365146, + "grad_norm": 7.09375, + "learning_rate": 3.522821576763486e-05, + "loss": 0.543, + "step": 356 + }, + { + "epoch": 1.4771784232365146, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7181664705276489, + "eval_runtime": 28.5535, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 356 + }, + { + "epoch": 1.4813278008298756, + "grad_norm": 8.5625, + "learning_rate": 3.518672199170125e-05, + "loss": 0.75, + "step": 357 + }, + { + "epoch": 1.4813278008298756, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7157676219940186, + "eval_runtime": 28.5513, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.137, + "step": 357 + }, + { + "epoch": 1.4854771784232366, + "grad_norm": 7.6875, + "learning_rate": 3.514522821576764e-05, + "loss": 0.7305, + "step": 358 + }, + { + "epoch": 1.4854771784232366, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7093166708946228, + "eval_runtime": 28.538, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 358 + }, + { + "epoch": 1.4896265560165975, + "grad_norm": 7.75, + "learning_rate": 3.510373443983402e-05, + "loss": 0.7305, + "step": 359 + }, + { + "epoch": 1.4896265560165975, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7025576829910278, + "eval_runtime": 28.4791, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 359 + }, + { + "epoch": 1.4937759336099585, + "grad_norm": 12.8125, + "learning_rate": 3.506224066390041e-05, + "loss": 1.1016, + "step": 360 + }, + { + "epoch": 1.4937759336099585, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6933674812316895, + "eval_runtime": 28.4367, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.145, + "step": 360 + }, + { + "epoch": 1.4979253112033195, + "grad_norm": 11.4375, + "learning_rate": 3.5020746887966805e-05, + "loss": 0.7773, + "step": 361 + }, + { + "epoch": 1.4979253112033195, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6842907071113586, + "eval_runtime": 28.4058, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 361 + }, + { + "epoch": 1.5020746887966805, + "grad_norm": 8.4375, + "learning_rate": 3.4979253112033196e-05, + "loss": 0.5352, + "step": 362 + }, + { + "epoch": 1.5020746887966805, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6793633103370667, + "eval_runtime": 28.3872, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.149, + "step": 362 + }, + { + "epoch": 1.5062240663900415, + "grad_norm": 11.375, + "learning_rate": 3.493775933609959e-05, + "loss": 0.8281, + "step": 363 + }, + { + "epoch": 1.5062240663900415, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6765106320381165, + "eval_runtime": 28.4252, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.146, + "step": 363 + }, + { + "epoch": 1.5103734439834025, + "grad_norm": 8.3125, + "learning_rate": 3.489626556016597e-05, + "loss": 0.5195, + "step": 364 + }, + { + "epoch": 1.5103734439834025, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6736903786659241, + "eval_runtime": 28.4948, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 364 + }, + { + "epoch": 1.5145228215767634, + "grad_norm": 13.5, + "learning_rate": 3.4854771784232364e-05, + "loss": 0.5859, + "step": 365 + }, + { + "epoch": 1.5145228215767634, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6722801923751831, + "eval_runtime": 28.5883, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 365 + }, + { + "epoch": 1.5186721991701244, + "grad_norm": 15.25, + "learning_rate": 3.4813278008298755e-05, + "loss": 0.6562, + "step": 366 + }, + { + "epoch": 1.5186721991701244, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6710159778594971, + "eval_runtime": 28.3961, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.148, + "step": 366 + }, + { + "epoch": 1.5228215767634854, + "grad_norm": 14.4375, + "learning_rate": 3.477178423236515e-05, + "loss": 0.5156, + "step": 367 + }, + { + "epoch": 1.5228215767634854, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6699299812316895, + "eval_runtime": 28.4731, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 367 + }, + { + "epoch": 1.5269709543568464, + "grad_norm": 16.5, + "learning_rate": 3.473029045643154e-05, + "loss": 0.6914, + "step": 368 + }, + { + "epoch": 1.5269709543568464, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6698975563049316, + "eval_runtime": 28.4133, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.147, + "step": 368 + }, + { + "epoch": 1.5311203319502074, + "grad_norm": 6.34375, + "learning_rate": 3.468879668049792e-05, + "loss": 0.4707, + "step": 369 + }, + { + "epoch": 1.5311203319502074, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6685360670089722, + "eval_runtime": 28.4587, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 369 + }, + { + "epoch": 1.5352697095435683, + "grad_norm": 9.1875, + "learning_rate": 3.4647302904564314e-05, + "loss": 0.4492, + "step": 370 + }, + { + "epoch": 1.5352697095435683, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6689088344573975, + "eval_runtime": 28.5848, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.134, + "step": 370 + }, + { + "epoch": 1.5394190871369293, + "grad_norm": 10.75, + "learning_rate": 3.4605809128630706e-05, + "loss": 0.5234, + "step": 371 + }, + { + "epoch": 1.5394190871369293, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6704000234603882, + "eval_runtime": 28.6253, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 371 + }, + { + "epoch": 1.5435684647302903, + "grad_norm": 9.25, + "learning_rate": 3.45643153526971e-05, + "loss": 0.8555, + "step": 372 + }, + { + "epoch": 1.5435684647302903, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6704810857772827, + "eval_runtime": 28.5891, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 372 + }, + { + "epoch": 1.5477178423236515, + "grad_norm": 14.125, + "learning_rate": 3.452282157676349e-05, + "loss": 0.625, + "step": 373 + }, + { + "epoch": 1.5477178423236515, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6713076829910278, + "eval_runtime": 28.5668, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 373 + }, + { + "epoch": 1.5518672199170125, + "grad_norm": 10.3125, + "learning_rate": 3.4481327800829873e-05, + "loss": 0.6367, + "step": 374 + }, + { + "epoch": 1.5518672199170125, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6711131930351257, + "eval_runtime": 28.6052, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 374 + }, + { + "epoch": 1.5560165975103735, + "grad_norm": 10.0625, + "learning_rate": 3.4439834024896265e-05, + "loss": 0.8203, + "step": 375 + }, + { + "epoch": 1.5560165975103735, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6710321307182312, + "eval_runtime": 28.565, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 375 + }, + { + "epoch": 1.5601659751037344, + "grad_norm": 25.0, + "learning_rate": 3.4398340248962656e-05, + "loss": 0.8594, + "step": 376 + }, + { + "epoch": 1.5601659751037344, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6714698076248169, + "eval_runtime": 28.4969, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.141, + "step": 376 + }, + { + "epoch": 1.5643153526970954, + "grad_norm": 7.125, + "learning_rate": 3.435684647302905e-05, + "loss": 0.6016, + "step": 377 + }, + { + "epoch": 1.5643153526970954, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6716156601905823, + "eval_runtime": 28.4701, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.143, + "step": 377 + }, + { + "epoch": 1.5684647302904564, + "grad_norm": 6.5625, + "learning_rate": 3.431535269709544e-05, + "loss": 0.5156, + "step": 378 + }, + { + "epoch": 1.5684647302904564, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6721343398094177, + "eval_runtime": 28.4517, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.144, + "step": 378 + }, + { + "epoch": 1.5726141078838174, + "grad_norm": 14.0, + "learning_rate": 3.427385892116183e-05, + "loss": 0.8516, + "step": 379 + }, + { + "epoch": 1.5726141078838174, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6705945134162903, + "eval_runtime": 28.4394, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.145, + "step": 379 + }, + { + "epoch": 1.5767634854771784, + "grad_norm": 22.5, + "learning_rate": 3.4232365145228216e-05, + "loss": 0.9492, + "step": 380 + }, + { + "epoch": 1.5767634854771784, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.670221745967865, + "eval_runtime": 28.4345, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.145, + "step": 380 + }, + { + "epoch": 1.5809128630705396, + "grad_norm": 12.375, + "learning_rate": 3.419087136929461e-05, + "loss": 0.8047, + "step": 381 + }, + { + "epoch": 1.5809128630705396, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6691033244132996, + "eval_runtime": 28.4272, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 381 + }, + { + "epoch": 1.5850622406639006, + "grad_norm": 18.125, + "learning_rate": 3.4149377593361e-05, + "loss": 0.2373, + "step": 382 + }, + { + "epoch": 1.5850622406639006, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6682118773460388, + "eval_runtime": 28.428, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.146, + "step": 382 + }, + { + "epoch": 1.5892116182572615, + "grad_norm": 8.75, + "learning_rate": 3.410788381742739e-05, + "loss": 0.5742, + "step": 383 + }, + { + "epoch": 1.5892116182572615, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.667336642742157, + "eval_runtime": 28.4757, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 383 + }, + { + "epoch": 1.5933609958506225, + "grad_norm": 6.6875, + "learning_rate": 3.406639004149378e-05, + "loss": 0.4863, + "step": 384 + }, + { + "epoch": 1.5933609958506225, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6679039001464844, + "eval_runtime": 28.4678, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.143, + "step": 384 + }, + { + "epoch": 1.5975103734439835, + "grad_norm": 9.625, + "learning_rate": 3.4024896265560166e-05, + "loss": 0.6875, + "step": 385 + }, + { + "epoch": 1.5975103734439835, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6688277721405029, + "eval_runtime": 28.4591, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 385 + }, + { + "epoch": 1.6016597510373445, + "grad_norm": 5.3125, + "learning_rate": 3.398340248962656e-05, + "loss": 0.582, + "step": 386 + }, + { + "epoch": 1.6016597510373445, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6704000234603882, + "eval_runtime": 28.4008, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.148, + "step": 386 + }, + { + "epoch": 1.6058091286307055, + "grad_norm": 30.0, + "learning_rate": 3.394190871369295e-05, + "loss": 0.8086, + "step": 387 + }, + { + "epoch": 1.6058091286307055, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6704486608505249, + "eval_runtime": 28.3924, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 387 + }, + { + "epoch": 1.6099585062240664, + "grad_norm": 15.75, + "learning_rate": 3.390041493775934e-05, + "loss": 0.7539, + "step": 388 + }, + { + "epoch": 1.6099585062240664, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6708052158355713, + "eval_runtime": 28.3973, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.148, + "step": 388 + }, + { + "epoch": 1.6141078838174274, + "grad_norm": 9.375, + "learning_rate": 3.385892116182573e-05, + "loss": 0.5859, + "step": 389 + }, + { + "epoch": 1.6141078838174274, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6710321307182312, + "eval_runtime": 28.3877, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.149, + "step": 389 + }, + { + "epoch": 1.6182572614107884, + "grad_norm": 9.1875, + "learning_rate": 3.381742738589212e-05, + "loss": 0.7383, + "step": 390 + }, + { + "epoch": 1.6182572614107884, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6712914705276489, + "eval_runtime": 28.4339, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 390 + }, + { + "epoch": 1.6224066390041494, + "grad_norm": 12.4375, + "learning_rate": 3.377593360995851e-05, + "loss": 0.5469, + "step": 391 + }, + { + "epoch": 1.6224066390041494, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6732041239738464, + "eval_runtime": 28.5056, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.14, + "step": 391 + }, + { + "epoch": 1.6265560165975104, + "grad_norm": 7.875, + "learning_rate": 3.37344398340249e-05, + "loss": 0.5547, + "step": 392 + }, + { + "epoch": 1.6265560165975104, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6754246354103088, + "eval_runtime": 28.546, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 392 + }, + { + "epoch": 1.6307053941908713, + "grad_norm": 10.9375, + "learning_rate": 3.369294605809129e-05, + "loss": 0.7109, + "step": 393 + }, + { + "epoch": 1.6307053941908713, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6757326126098633, + "eval_runtime": 28.548, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 393 + }, + { + "epoch": 1.6348547717842323, + "grad_norm": 6.09375, + "learning_rate": 3.365145228215768e-05, + "loss": 0.4766, + "step": 394 + }, + { + "epoch": 1.6348547717842323, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6769644618034363, + "eval_runtime": 28.5295, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.138, + "step": 394 + }, + { + "epoch": 1.6390041493775933, + "grad_norm": 10.375, + "learning_rate": 3.360995850622407e-05, + "loss": 0.6055, + "step": 395 + }, + { + "epoch": 1.6390041493775933, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6767213344573975, + "eval_runtime": 28.5209, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 395 + }, + { + "epoch": 1.6431535269709543, + "grad_norm": 6.09375, + "learning_rate": 3.356846473029045e-05, + "loss": 0.6836, + "step": 396 + }, + { + "epoch": 1.6431535269709543, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6780666708946228, + "eval_runtime": 28.5402, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 396 + }, + { + "epoch": 1.6473029045643153, + "grad_norm": 8.6875, + "learning_rate": 3.3526970954356844e-05, + "loss": 0.6641, + "step": 397 + }, + { + "epoch": 1.6473029045643153, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6784880757331848, + "eval_runtime": 28.4481, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 397 + }, + { + "epoch": 1.6514522821576763, + "grad_norm": 5.5, + "learning_rate": 3.3485477178423235e-05, + "loss": 0.6172, + "step": 398 + }, + { + "epoch": 1.6514522821576763, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.678877055644989, + "eval_runtime": 28.3982, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.148, + "step": 398 + }, + { + "epoch": 1.6556016597510372, + "grad_norm": 8.1875, + "learning_rate": 3.3443983402489627e-05, + "loss": 0.6562, + "step": 399 + }, + { + "epoch": 1.6556016597510372, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6800279021263123, + "eval_runtime": 28.4724, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 399 + }, + { + "epoch": 1.6597510373443982, + "grad_norm": 9.75, + "learning_rate": 3.340248962655602e-05, + "loss": 0.6523, + "step": 400 + }, + { + "epoch": 1.6597510373443982, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6819567084312439, + "eval_runtime": 28.5368, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 400 + }, + { + "epoch": 1.6639004149377592, + "grad_norm": 6.28125, + "learning_rate": 3.336099585062241e-05, + "loss": 0.6094, + "step": 401 + }, + { + "epoch": 1.6639004149377592, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6826050281524658, + "eval_runtime": 28.4538, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.144, + "step": 401 + }, + { + "epoch": 1.6680497925311202, + "grad_norm": 7.9375, + "learning_rate": 3.3319502074688794e-05, + "loss": 0.6523, + "step": 402 + }, + { + "epoch": 1.6680497925311202, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6830912828445435, + "eval_runtime": 28.384, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.149, + "step": 402 + }, + { + "epoch": 1.6721991701244814, + "grad_norm": 12.625, + "learning_rate": 3.3278008298755186e-05, + "loss": 0.582, + "step": 403 + }, + { + "epoch": 1.6721991701244814, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6854252815246582, + "eval_runtime": 28.4675, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.143, + "step": 403 + }, + { + "epoch": 1.6763485477178424, + "grad_norm": 4.875, + "learning_rate": 3.323651452282158e-05, + "loss": 0.5234, + "step": 404 + }, + { + "epoch": 1.6763485477178424, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6869164705276489, + "eval_runtime": 28.5208, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 404 + }, + { + "epoch": 1.6804979253112033, + "grad_norm": 11.75, + "learning_rate": 3.319502074688797e-05, + "loss": 0.5508, + "step": 405 + }, + { + "epoch": 1.6804979253112033, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6875324249267578, + "eval_runtime": 28.5484, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 405 + }, + { + "epoch": 1.6846473029045643, + "grad_norm": 5.1875, + "learning_rate": 3.315352697095436e-05, + "loss": 0.8047, + "step": 406 + }, + { + "epoch": 1.6846473029045643, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6876458525657654, + "eval_runtime": 28.559, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 406 + }, + { + "epoch": 1.6887966804979253, + "grad_norm": 7.75, + "learning_rate": 3.3112033195020745e-05, + "loss": 0.4824, + "step": 407 + }, + { + "epoch": 1.6887966804979253, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6880186796188354, + "eval_runtime": 28.5364, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 407 + }, + { + "epoch": 1.6929460580912863, + "grad_norm": 19.25, + "learning_rate": 3.3070539419087136e-05, + "loss": 0.707, + "step": 408 + }, + { + "epoch": 1.6929460580912863, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6877917647361755, + "eval_runtime": 28.5285, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.138, + "step": 408 + }, + { + "epoch": 1.6970954356846473, + "grad_norm": 17.75, + "learning_rate": 3.302904564315353e-05, + "loss": 1.0859, + "step": 409 + }, + { + "epoch": 1.6970954356846473, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6883428692817688, + "eval_runtime": 28.5188, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 409 + }, + { + "epoch": 1.7012448132780082, + "grad_norm": 6.8125, + "learning_rate": 3.298755186721992e-05, + "loss": 0.6836, + "step": 410 + }, + { + "epoch": 1.7012448132780082, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6872730851173401, + "eval_runtime": 28.3993, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.148, + "step": 410 + }, + { + "epoch": 1.7053941908713695, + "grad_norm": 11.125, + "learning_rate": 3.294605809128631e-05, + "loss": 0.6523, + "step": 411 + }, + { + "epoch": 1.7053941908713695, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6874027252197266, + "eval_runtime": 28.3516, + "eval_samples_per_second": 17.001, + "eval_steps_per_second": 2.152, + "step": 411 + }, + { + "epoch": 1.7095435684647304, + "grad_norm": 12.125, + "learning_rate": 3.2904564315352695e-05, + "loss": 0.668, + "step": 412 + }, + { + "epoch": 1.7095435684647304, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6863654255867004, + "eval_runtime": 28.3273, + "eval_samples_per_second": 17.015, + "eval_steps_per_second": 2.153, + "step": 412 + }, + { + "epoch": 1.7136929460580914, + "grad_norm": 22.5, + "learning_rate": 3.286307053941909e-05, + "loss": 1.75, + "step": 413 + }, + { + "epoch": 1.7136929460580914, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.685636043548584, + "eval_runtime": 28.3088, + "eval_samples_per_second": 17.026, + "eval_steps_per_second": 2.155, + "step": 413 + }, + { + "epoch": 1.7178423236514524, + "grad_norm": 5.625, + "learning_rate": 3.282157676348548e-05, + "loss": 0.582, + "step": 414 + }, + { + "epoch": 1.7178423236514524, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6843879818916321, + "eval_runtime": 28.2966, + "eval_samples_per_second": 17.034, + "eval_steps_per_second": 2.156, + "step": 414 + }, + { + "epoch": 1.7219917012448134, + "grad_norm": 8.5, + "learning_rate": 3.278008298755187e-05, + "loss": 0.3848, + "step": 415 + }, + { + "epoch": 1.7219917012448134, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6831237077713013, + "eval_runtime": 28.2908, + "eval_samples_per_second": 17.037, + "eval_steps_per_second": 2.156, + "step": 415 + }, + { + "epoch": 1.7261410788381744, + "grad_norm": 5.6875, + "learning_rate": 3.273858921161826e-05, + "loss": 0.8438, + "step": 416 + }, + { + "epoch": 1.7261410788381744, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6818270087242126, + "eval_runtime": 28.3416, + "eval_samples_per_second": 17.007, + "eval_steps_per_second": 2.152, + "step": 416 + }, + { + "epoch": 1.7302904564315353, + "grad_norm": 8.375, + "learning_rate": 3.2697095435684646e-05, + "loss": 0.8125, + "step": 417 + }, + { + "epoch": 1.7302904564315353, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6807896494865417, + "eval_runtime": 28.5191, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 417 + }, + { + "epoch": 1.7344398340248963, + "grad_norm": 9.875, + "learning_rate": 3.265560165975104e-05, + "loss": 1.0625, + "step": 418 + }, + { + "epoch": 1.7344398340248963, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6792336702346802, + "eval_runtime": 28.5769, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.135, + "step": 418 + }, + { + "epoch": 1.7385892116182573, + "grad_norm": 12.5, + "learning_rate": 3.261410788381743e-05, + "loss": 0.5742, + "step": 419 + }, + { + "epoch": 1.7385892116182573, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.6794605851173401, + "eval_runtime": 28.6003, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 419 + }, + { + "epoch": 1.7427385892116183, + "grad_norm": 4.25, + "learning_rate": 3.257261410788382e-05, + "loss": 0.4512, + "step": 420 + }, + { + "epoch": 1.7427385892116183, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.6782287359237671, + "eval_runtime": 28.5357, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 420 + }, + { + "epoch": 1.7468879668049793, + "grad_norm": 7.46875, + "learning_rate": 3.253112033195021e-05, + "loss": 0.4082, + "step": 421 + }, + { + "epoch": 1.7468879668049793, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6788446307182312, + "eval_runtime": 28.5554, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 421 + }, + { + "epoch": 1.7510373443983402, + "grad_norm": 7.25, + "learning_rate": 3.2489626556016603e-05, + "loss": 0.5547, + "step": 422 + }, + { + "epoch": 1.7510373443983402, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6792336702346802, + "eval_runtime": 28.507, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.14, + "step": 422 + }, + { + "epoch": 1.7551867219917012, + "grad_norm": 9.125, + "learning_rate": 3.244813278008299e-05, + "loss": 0.7227, + "step": 423 + }, + { + "epoch": 1.7551867219917012, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6785205006599426, + "eval_runtime": 28.4765, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 423 + }, + { + "epoch": 1.7593360995850622, + "grad_norm": 6.75, + "learning_rate": 3.240663900414938e-05, + "loss": 0.7383, + "step": 424 + }, + { + "epoch": 1.7593360995850622, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6784394383430481, + "eval_runtime": 28.4586, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 424 + }, + { + "epoch": 1.7634854771784232, + "grad_norm": 6.9375, + "learning_rate": 3.236514522821577e-05, + "loss": 0.4336, + "step": 425 + }, + { + "epoch": 1.7634854771784232, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.6781963109970093, + "eval_runtime": 28.4943, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.141, + "step": 425 + }, + { + "epoch": 1.7676348547717842, + "grad_norm": 6.8125, + "learning_rate": 3.232365145228216e-05, + "loss": 0.3828, + "step": 426 + }, + { + "epoch": 1.7676348547717842, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6797523498535156, + "eval_runtime": 28.4836, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.142, + "step": 426 + }, + { + "epoch": 1.7717842323651452, + "grad_norm": 10.6875, + "learning_rate": 3.2282157676348554e-05, + "loss": 0.6758, + "step": 427 + }, + { + "epoch": 1.7717842323651452, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6797361373901367, + "eval_runtime": 28.4236, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.146, + "step": 427 + }, + { + "epoch": 1.7759336099585061, + "grad_norm": 12.8125, + "learning_rate": 3.224066390041494e-05, + "loss": 0.6953, + "step": 428 + }, + { + "epoch": 1.7759336099585061, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6798009872436523, + "eval_runtime": 28.4101, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.147, + "step": 428 + }, + { + "epoch": 1.7800829875518671, + "grad_norm": 3.96875, + "learning_rate": 3.219917012448133e-05, + "loss": 0.3281, + "step": 429 + }, + { + "epoch": 1.7800829875518671, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.680108904838562, + "eval_runtime": 28.4039, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.148, + "step": 429 + }, + { + "epoch": 1.784232365145228, + "grad_norm": 9.25, + "learning_rate": 3.215767634854772e-05, + "loss": 0.3945, + "step": 430 + }, + { + "epoch": 1.784232365145228, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6811138391494751, + "eval_runtime": 28.4472, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 430 + }, + { + "epoch": 1.788381742738589, + "grad_norm": 12.8125, + "learning_rate": 3.211618257261411e-05, + "loss": 0.7891, + "step": 431 + }, + { + "epoch": 1.788381742738589, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6806924343109131, + "eval_runtime": 28.4539, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.144, + "step": 431 + }, + { + "epoch": 1.79253112033195, + "grad_norm": 5.40625, + "learning_rate": 3.2074688796680505e-05, + "loss": 0.4453, + "step": 432 + }, + { + "epoch": 1.79253112033195, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6834316849708557, + "eval_runtime": 28.4549, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.144, + "step": 432 + }, + { + "epoch": 1.796680497925311, + "grad_norm": 12.6875, + "learning_rate": 3.203319502074689e-05, + "loss": 0.6016, + "step": 433 + }, + { + "epoch": 1.796680497925311, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6859601736068726, + "eval_runtime": 28.4108, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.147, + "step": 433 + }, + { + "epoch": 1.8008298755186722, + "grad_norm": 4.6875, + "learning_rate": 3.1991701244813274e-05, + "loss": 0.1455, + "step": 434 + }, + { + "epoch": 1.8008298755186722, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6896232962608337, + "eval_runtime": 28.4142, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 434 + }, + { + "epoch": 1.8049792531120332, + "grad_norm": 3.484375, + "learning_rate": 3.1950207468879666e-05, + "loss": 0.5039, + "step": 435 + }, + { + "epoch": 1.8049792531120332, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6958311796188354, + "eval_runtime": 28.4162, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.147, + "step": 435 + }, + { + "epoch": 1.8091286307053942, + "grad_norm": 4.34375, + "learning_rate": 3.190871369294606e-05, + "loss": 0.3477, + "step": 436 + }, + { + "epoch": 1.8091286307053942, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7014393210411072, + "eval_runtime": 28.4177, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.147, + "step": 436 + }, + { + "epoch": 1.8132780082987552, + "grad_norm": 8.0, + "learning_rate": 3.186721991701245e-05, + "loss": 0.6797, + "step": 437 + }, + { + "epoch": 1.8132780082987552, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7062370181083679, + "eval_runtime": 28.4043, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.148, + "step": 437 + }, + { + "epoch": 1.8174273858921162, + "grad_norm": 5.9375, + "learning_rate": 3.182572614107884e-05, + "loss": 0.4512, + "step": 438 + }, + { + "epoch": 1.8174273858921162, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7085062265396118, + "eval_runtime": 28.4139, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.147, + "step": 438 + }, + { + "epoch": 1.8215767634854771, + "grad_norm": 9.6875, + "learning_rate": 3.1784232365145225e-05, + "loss": 0.5078, + "step": 439 + }, + { + "epoch": 1.8215767634854771, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7123638391494751, + "eval_runtime": 28.415, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 439 + }, + { + "epoch": 1.8257261410788381, + "grad_norm": 7.0625, + "learning_rate": 3.1742738589211616e-05, + "loss": 0.7812, + "step": 440 + }, + { + "epoch": 1.8257261410788381, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7166104912757874, + "eval_runtime": 28.4253, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.146, + "step": 440 + }, + { + "epoch": 1.8298755186721993, + "grad_norm": 7.03125, + "learning_rate": 3.170124481327801e-05, + "loss": 0.2715, + "step": 441 + }, + { + "epoch": 1.8298755186721993, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.720792293548584, + "eval_runtime": 28.4854, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 441 + }, + { + "epoch": 1.8340248962655603, + "grad_norm": 4.625, + "learning_rate": 3.16597510373444e-05, + "loss": 0.4922, + "step": 442 + }, + { + "epoch": 1.8340248962655603, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7254927158355713, + "eval_runtime": 28.4527, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.144, + "step": 442 + }, + { + "epoch": 1.8381742738589213, + "grad_norm": 10.75, + "learning_rate": 3.161825726141079e-05, + "loss": 0.9453, + "step": 443 + }, + { + "epoch": 1.8381742738589213, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7260600328445435, + "eval_runtime": 28.4366, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.145, + "step": 443 + }, + { + "epoch": 1.8423236514522823, + "grad_norm": 3.015625, + "learning_rate": 3.157676348547718e-05, + "loss": 0.3613, + "step": 444 + }, + { + "epoch": 1.8423236514522823, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7292206883430481, + "eval_runtime": 28.4568, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.144, + "step": 444 + }, + { + "epoch": 1.8464730290456433, + "grad_norm": 4.21875, + "learning_rate": 3.153526970954357e-05, + "loss": 0.6758, + "step": 445 + }, + { + "epoch": 1.8464730290456433, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7331107258796692, + "eval_runtime": 28.4928, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 445 + }, + { + "epoch": 1.8506224066390042, + "grad_norm": 7.53125, + "learning_rate": 3.149377593360996e-05, + "loss": 0.3828, + "step": 446 + }, + { + "epoch": 1.8506224066390042, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7381516098976135, + "eval_runtime": 28.6078, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 446 + }, + { + "epoch": 1.8547717842323652, + "grad_norm": 8.6875, + "learning_rate": 3.145228215767635e-05, + "loss": 0.4336, + "step": 447 + }, + { + "epoch": 1.8547717842323652, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7431924343109131, + "eval_runtime": 28.5497, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.137, + "step": 447 + }, + { + "epoch": 1.8589211618257262, + "grad_norm": 6.8125, + "learning_rate": 3.141078838174274e-05, + "loss": 0.5156, + "step": 448 + }, + { + "epoch": 1.8589211618257262, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7455588579177856, + "eval_runtime": 28.5999, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 448 + }, + { + "epoch": 1.8630705394190872, + "grad_norm": 4.15625, + "learning_rate": 3.136929460580913e-05, + "loss": 0.4805, + "step": 449 + }, + { + "epoch": 1.8630705394190872, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7515722513198853, + "eval_runtime": 28.5584, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 449 + }, + { + "epoch": 1.8672199170124482, + "grad_norm": 7.6875, + "learning_rate": 3.132780082987552e-05, + "loss": 0.6055, + "step": 450 + }, + { + "epoch": 1.8672199170124482, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.7552515268325806, + "eval_runtime": 28.6016, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 450 + }, + { + "epoch": 1.8713692946058091, + "grad_norm": 10.75, + "learning_rate": 3.128630705394191e-05, + "loss": 1.0078, + "step": 451 + }, + { + "epoch": 1.8713692946058091, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.759255051612854, + "eval_runtime": 28.3622, + "eval_samples_per_second": 16.994, + "eval_steps_per_second": 2.151, + "step": 451 + }, + { + "epoch": 1.8755186721991701, + "grad_norm": 9.9375, + "learning_rate": 3.12448132780083e-05, + "loss": 0.2314, + "step": 452 + }, + { + "epoch": 1.8755186721991701, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7632747888565063, + "eval_runtime": 28.3102, + "eval_samples_per_second": 17.026, + "eval_steps_per_second": 2.155, + "step": 452 + }, + { + "epoch": 1.879668049792531, + "grad_norm": 47.25, + "learning_rate": 3.120331950207469e-05, + "loss": 1.5234, + "step": 453 + }, + { + "epoch": 1.879668049792531, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.7648956179618835, + "eval_runtime": 28.4698, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.143, + "step": 453 + }, + { + "epoch": 1.883817427385892, + "grad_norm": 42.75, + "learning_rate": 3.1161825726141083e-05, + "loss": 0.4258, + "step": 454 + }, + { + "epoch": 1.883817427385892, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.7679266333580017, + "eval_runtime": 28.5378, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 454 + }, + { + "epoch": 1.887966804979253, + "grad_norm": 16.875, + "learning_rate": 3.112033195020747e-05, + "loss": 1.3672, + "step": 455 + }, + { + "epoch": 1.887966804979253, + "eval_accuracy": 0.6161825726141079, + "eval_loss": 0.7674079537391663, + "eval_runtime": 28.6034, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.133, + "step": 455 + }, + { + "epoch": 1.892116182572614, + "grad_norm": 13.0625, + "learning_rate": 3.107883817427386e-05, + "loss": 0.9844, + "step": 456 + }, + { + "epoch": 1.892116182572614, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.7658032774925232, + "eval_runtime": 28.6274, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 456 + }, + { + "epoch": 1.896265560165975, + "grad_norm": 5.4375, + "learning_rate": 3.103734439834025e-05, + "loss": 0.4473, + "step": 457 + }, + { + "epoch": 1.896265560165975, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7638744711875916, + "eval_runtime": 28.6167, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 457 + }, + { + "epoch": 1.900414937759336, + "grad_norm": 10.125, + "learning_rate": 3.099585062240664e-05, + "loss": 0.6367, + "step": 458 + }, + { + "epoch": 1.900414937759336, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7624805569648743, + "eval_runtime": 28.5381, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.137, + "step": 458 + }, + { + "epoch": 1.904564315352697, + "grad_norm": 5.1875, + "learning_rate": 3.0954356846473034e-05, + "loss": 0.209, + "step": 459 + }, + { + "epoch": 1.904564315352697, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.7621239423751831, + "eval_runtime": 28.4908, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.141, + "step": 459 + }, + { + "epoch": 1.908713692946058, + "grad_norm": 23.0, + "learning_rate": 3.091286307053942e-05, + "loss": 1.1406, + "step": 460 + }, + { + "epoch": 1.908713692946058, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7601951360702515, + "eval_runtime": 28.4024, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 460 + }, + { + "epoch": 1.912863070539419, + "grad_norm": 10.0625, + "learning_rate": 3.087136929460581e-05, + "loss": 0.4922, + "step": 461 + }, + { + "epoch": 1.912863070539419, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.7605355381965637, + "eval_runtime": 28.4172, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.147, + "step": 461 + }, + { + "epoch": 1.91701244813278, + "grad_norm": 9.8125, + "learning_rate": 3.08298755186722e-05, + "loss": 0.4316, + "step": 462 + }, + { + "epoch": 1.91701244813278, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7621077299118042, + "eval_runtime": 28.3954, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.148, + "step": 462 + }, + { + "epoch": 1.921161825726141, + "grad_norm": 12.5625, + "learning_rate": 3.078838174273859e-05, + "loss": 1.7344, + "step": 463 + }, + { + "epoch": 1.921161825726141, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.7571155428886414, + "eval_runtime": 28.3834, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.149, + "step": 463 + }, + { + "epoch": 1.9253112033195021, + "grad_norm": 4.78125, + "learning_rate": 3.0746887966804985e-05, + "loss": 0.4258, + "step": 464 + }, + { + "epoch": 1.9253112033195021, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7554460763931274, + "eval_runtime": 28.4357, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.145, + "step": 464 + }, + { + "epoch": 1.929460580912863, + "grad_norm": 26.875, + "learning_rate": 3.070539419087137e-05, + "loss": 1.1719, + "step": 465 + }, + { + "epoch": 1.929460580912863, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.7532579302787781, + "eval_runtime": 28.5561, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 465 + }, + { + "epoch": 1.933609958506224, + "grad_norm": 22.625, + "learning_rate": 3.066390041493776e-05, + "loss": 0.4883, + "step": 466 + }, + { + "epoch": 1.933609958506224, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7510049343109131, + "eval_runtime": 28.5565, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 466 + }, + { + "epoch": 1.937759336099585, + "grad_norm": 9.5625, + "learning_rate": 3.062240663900415e-05, + "loss": 0.8008, + "step": 467 + }, + { + "epoch": 1.937759336099585, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7464989423751831, + "eval_runtime": 28.5642, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.136, + "step": 467 + }, + { + "epoch": 1.941908713692946, + "grad_norm": 15.6875, + "learning_rate": 3.0580912863070544e-05, + "loss": 1.0781, + "step": 468 + }, + { + "epoch": 1.941908713692946, + "eval_accuracy": 0.6078838174273858, + "eval_loss": 0.7421226501464844, + "eval_runtime": 28.6345, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 468 + }, + { + "epoch": 1.946058091286307, + "grad_norm": 20.75, + "learning_rate": 3.0539419087136935e-05, + "loss": 0.8086, + "step": 469 + }, + { + "epoch": 1.946058091286307, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7396265268325806, + "eval_runtime": 28.5762, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.135, + "step": 469 + }, + { + "epoch": 1.950207468879668, + "grad_norm": 23.625, + "learning_rate": 3.0497925311203323e-05, + "loss": 0.4863, + "step": 470 + }, + { + "epoch": 1.950207468879668, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.7395293116569519, + "eval_runtime": 28.5848, + "eval_samples_per_second": 16.862, + "eval_steps_per_second": 2.134, + "step": 470 + }, + { + "epoch": 1.9543568464730292, + "grad_norm": 3.546875, + "learning_rate": 3.0456431535269708e-05, + "loss": 0.3574, + "step": 471 + }, + { + "epoch": 1.9543568464730292, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.7393996119499207, + "eval_runtime": 28.4982, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 471 + }, + { + "epoch": 1.9585062240663902, + "grad_norm": 7.25, + "learning_rate": 3.04149377593361e-05, + "loss": 0.5391, + "step": 472 + }, + { + "epoch": 1.9585062240663902, + "eval_accuracy": 0.6058091286307054, + "eval_loss": 0.7385405898094177, + "eval_runtime": 28.3884, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.149, + "step": 472 + }, + { + "epoch": 1.9626556016597512, + "grad_norm": 11.25, + "learning_rate": 3.0373443983402488e-05, + "loss": 1.2109, + "step": 473 + }, + { + "epoch": 1.9626556016597512, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7370007634162903, + "eval_runtime": 28.4203, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.146, + "step": 473 + }, + { + "epoch": 1.9668049792531122, + "grad_norm": 5.4375, + "learning_rate": 3.033195020746888e-05, + "loss": 0.2266, + "step": 474 + }, + { + "epoch": 1.9668049792531122, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.7374060153961182, + "eval_runtime": 28.5798, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 474 + }, + { + "epoch": 1.9709543568464731, + "grad_norm": 6.25, + "learning_rate": 3.029045643153527e-05, + "loss": 0.4609, + "step": 475 + }, + { + "epoch": 1.9709543568464731, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.7371952533721924, + "eval_runtime": 28.5683, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 475 + }, + { + "epoch": 1.9751037344398341, + "grad_norm": 6.09375, + "learning_rate": 3.024896265560166e-05, + "loss": 0.5703, + "step": 476 + }, + { + "epoch": 1.9751037344398341, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.7384433150291443, + "eval_runtime": 28.5717, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 476 + }, + { + "epoch": 1.979253112033195, + "grad_norm": 10.875, + "learning_rate": 3.020746887966805e-05, + "loss": 0.7266, + "step": 477 + }, + { + "epoch": 1.979253112033195, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7398535013198853, + "eval_runtime": 28.5776, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.135, + "step": 477 + }, + { + "epoch": 1.983402489626556, + "grad_norm": 11.1875, + "learning_rate": 3.0165975103734438e-05, + "loss": 0.7969, + "step": 478 + }, + { + "epoch": 1.983402489626556, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.7412149906158447, + "eval_runtime": 28.5724, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.135, + "step": 478 + }, + { + "epoch": 1.987551867219917, + "grad_norm": 10.3125, + "learning_rate": 3.012448132780083e-05, + "loss": 0.7266, + "step": 479 + }, + { + "epoch": 1.987551867219917, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.7412149906158447, + "eval_runtime": 28.5943, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 479 + }, + { + "epoch": 1.991701244813278, + "grad_norm": 18.25, + "learning_rate": 3.008298755186722e-05, + "loss": 0.9688, + "step": 480 + }, + { + "epoch": 1.991701244813278, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.7382001876831055, + "eval_runtime": 28.4656, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.143, + "step": 480 + }, + { + "epoch": 1.995850622406639, + "grad_norm": 37.25, + "learning_rate": 3.004149377593361e-05, + "loss": 0.6289, + "step": 481 + }, + { + "epoch": 1.995850622406639, + "eval_accuracy": 0.6120331950207469, + "eval_loss": 0.7329810857772827, + "eval_runtime": 28.5202, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 481 + }, + { + "epoch": 2.0, + "grad_norm": 25.375, + "learning_rate": 3e-05, + "loss": 0.5898, + "step": 482 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.6224066390041494, + "eval_loss": 0.7301283478736877, + "eval_runtime": 28.5576, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 482 + }, + { + "epoch": 2.004149377593361, + "grad_norm": 4.25, + "learning_rate": 2.995850622406639e-05, + "loss": 0.4062, + "step": 483 + }, + { + "epoch": 2.004149377593361, + "eval_accuracy": 0.6161825726141079, + "eval_loss": 0.7285075187683105, + "eval_runtime": 28.6301, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 483 + }, + { + "epoch": 2.008298755186722, + "grad_norm": 10.9375, + "learning_rate": 2.991701244813278e-05, + "loss": 0.7812, + "step": 484 + }, + { + "epoch": 2.008298755186722, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.7275674343109131, + "eval_runtime": 28.6237, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 484 + }, + { + "epoch": 2.012448132780083, + "grad_norm": 7.0625, + "learning_rate": 2.9875518672199172e-05, + "loss": 0.7891, + "step": 485 + }, + { + "epoch": 2.012448132780083, + "eval_accuracy": 0.6161825726141079, + "eval_loss": 0.7257682681083679, + "eval_runtime": 28.5667, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 485 + }, + { + "epoch": 2.016597510373444, + "grad_norm": 12.8125, + "learning_rate": 2.983402489626556e-05, + "loss": 0.6562, + "step": 486 + }, + { + "epoch": 2.016597510373444, + "eval_accuracy": 0.6141078838174274, + "eval_loss": 0.7237422466278076, + "eval_runtime": 28.5805, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 486 + }, + { + "epoch": 2.020746887966805, + "grad_norm": 8.375, + "learning_rate": 2.979253112033195e-05, + "loss": 0.4961, + "step": 487 + }, + { + "epoch": 2.020746887966805, + "eval_accuracy": 0.6224066390041494, + "eval_loss": 0.7234991192817688, + "eval_runtime": 28.4495, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.144, + "step": 487 + }, + { + "epoch": 2.024896265560166, + "grad_norm": 24.25, + "learning_rate": 2.9751037344398343e-05, + "loss": 0.8398, + "step": 488 + }, + { + "epoch": 2.024896265560166, + "eval_accuracy": 0.6244813278008299, + "eval_loss": 0.7225427627563477, + "eval_runtime": 28.4473, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 488 + }, + { + "epoch": 2.029045643153527, + "grad_norm": 9.5, + "learning_rate": 2.970954356846473e-05, + "loss": 0.6172, + "step": 489 + }, + { + "epoch": 2.029045643153527, + "eval_accuracy": 0.6161825726141079, + "eval_loss": 0.7228021025657654, + "eval_runtime": 28.3554, + "eval_samples_per_second": 16.999, + "eval_steps_per_second": 2.151, + "step": 489 + }, + { + "epoch": 2.033195020746888, + "grad_norm": 27.75, + "learning_rate": 2.9668049792531122e-05, + "loss": 1.2891, + "step": 490 + }, + { + "epoch": 2.033195020746888, + "eval_accuracy": 0.6161825726141079, + "eval_loss": 0.7202411890029907, + "eval_runtime": 28.3842, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.149, + "step": 490 + }, + { + "epoch": 2.037344398340249, + "grad_norm": 9.6875, + "learning_rate": 2.962655601659751e-05, + "loss": 0.582, + "step": 491 + }, + { + "epoch": 2.037344398340249, + "eval_accuracy": 0.6182572614107884, + "eval_loss": 0.7207598686218262, + "eval_runtime": 28.4085, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.147, + "step": 491 + }, + { + "epoch": 2.04149377593361, + "grad_norm": 4.1875, + "learning_rate": 2.9585062240663902e-05, + "loss": 0.3555, + "step": 492 + }, + { + "epoch": 2.04149377593361, + "eval_accuracy": 0.6182572614107884, + "eval_loss": 0.7206301689147949, + "eval_runtime": 28.5471, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 492 + }, + { + "epoch": 2.045643153526971, + "grad_norm": 8.6875, + "learning_rate": 2.9543568464730293e-05, + "loss": 0.7852, + "step": 493 + }, + { + "epoch": 2.045643153526971, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.7205815315246582, + "eval_runtime": 28.6083, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 493 + }, + { + "epoch": 2.0497925311203318, + "grad_norm": 16.875, + "learning_rate": 2.950207468879668e-05, + "loss": 0.5742, + "step": 494 + }, + { + "epoch": 2.0497925311203318, + "eval_accuracy": 0.6182572614107884, + "eval_loss": 0.7172588109970093, + "eval_runtime": 28.6215, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 494 + }, + { + "epoch": 2.0539419087136928, + "grad_norm": 10.75, + "learning_rate": 2.9460580912863073e-05, + "loss": 0.418, + "step": 495 + }, + { + "epoch": 2.0539419087136928, + "eval_accuracy": 0.6099585062240664, + "eval_loss": 0.719560444355011, + "eval_runtime": 28.58, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 495 + }, + { + "epoch": 2.0580912863070537, + "grad_norm": 9.3125, + "learning_rate": 2.9419087136929465e-05, + "loss": 0.8125, + "step": 496 + }, + { + "epoch": 2.0580912863070537, + "eval_accuracy": 0.6037344398340249, + "eval_loss": 0.7235801219940186, + "eval_runtime": 28.6534, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 496 + }, + { + "epoch": 2.0622406639004147, + "grad_norm": 10.375, + "learning_rate": 2.9377593360995853e-05, + "loss": 0.6602, + "step": 497 + }, + { + "epoch": 2.0622406639004147, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7260438203811646, + "eval_runtime": 28.5975, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 497 + }, + { + "epoch": 2.066390041493776, + "grad_norm": 12.5625, + "learning_rate": 2.9336099585062244e-05, + "loss": 0.4258, + "step": 498 + }, + { + "epoch": 2.066390041493776, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7268218398094177, + "eval_runtime": 28.5702, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 498 + }, + { + "epoch": 2.070539419087137, + "grad_norm": 13.375, + "learning_rate": 2.9294605809128632e-05, + "loss": 0.8203, + "step": 499 + }, + { + "epoch": 2.070539419087137, + "eval_accuracy": 0.6016597510373444, + "eval_loss": 0.7275025844573975, + "eval_runtime": 28.5011, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.14, + "step": 499 + }, + { + "epoch": 2.074688796680498, + "grad_norm": 37.75, + "learning_rate": 2.9253112033195024e-05, + "loss": 0.8438, + "step": 500 + }, + { + "epoch": 2.074688796680498, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7242446541786194, + "eval_runtime": 28.4033, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 500 + }, + { + "epoch": 2.078838174273859, + "grad_norm": 7.03125, + "learning_rate": 2.9211618257261415e-05, + "loss": 0.4258, + "step": 501 + }, + { + "epoch": 2.078838174273859, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.723677396774292, + "eval_runtime": 28.4039, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 501 + }, + { + "epoch": 2.08298755186722, + "grad_norm": 7.25, + "learning_rate": 2.9170124481327803e-05, + "loss": 0.5977, + "step": 502 + }, + { + "epoch": 2.08298755186722, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7253225445747375, + "eval_runtime": 28.4028, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 502 + }, + { + "epoch": 2.087136929460581, + "grad_norm": 7.375, + "learning_rate": 2.9128630705394195e-05, + "loss": 0.9062, + "step": 503 + }, + { + "epoch": 2.087136929460581, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7231262922286987, + "eval_runtime": 28.5317, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 503 + }, + { + "epoch": 2.091286307053942, + "grad_norm": 10.5, + "learning_rate": 2.9087136929460583e-05, + "loss": 0.7383, + "step": 504 + }, + { + "epoch": 2.091286307053942, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7197062969207764, + "eval_runtime": 28.5879, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 504 + }, + { + "epoch": 2.095435684647303, + "grad_norm": 16.375, + "learning_rate": 2.9045643153526974e-05, + "loss": 0.4922, + "step": 505 + }, + { + "epoch": 2.095435684647303, + "eval_accuracy": 0.5995850622406639, + "eval_loss": 0.7179557681083679, + "eval_runtime": 28.615, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 505 + }, + { + "epoch": 2.099585062240664, + "grad_norm": 10.25, + "learning_rate": 2.9004149377593366e-05, + "loss": 0.668, + "step": 506 + }, + { + "epoch": 2.099585062240664, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7171777486801147, + "eval_runtime": 28.6489, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 506 + }, + { + "epoch": 2.103734439834025, + "grad_norm": 12.5625, + "learning_rate": 2.8962655601659754e-05, + "loss": 0.7266, + "step": 507 + }, + { + "epoch": 2.103734439834025, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.715864896774292, + "eval_runtime": 28.6096, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.132, + "step": 507 + }, + { + "epoch": 2.107883817427386, + "grad_norm": 8.8125, + "learning_rate": 2.8921161825726145e-05, + "loss": 0.5, + "step": 508 + }, + { + "epoch": 2.107883817427386, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7144547700881958, + "eval_runtime": 28.5928, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 508 + }, + { + "epoch": 2.112033195020747, + "grad_norm": 14.6875, + "learning_rate": 2.887966804979253e-05, + "loss": 0.9297, + "step": 509 + }, + { + "epoch": 2.112033195020747, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7129635810852051, + "eval_runtime": 28.5975, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 509 + }, + { + "epoch": 2.116182572614108, + "grad_norm": 6.59375, + "learning_rate": 2.883817427385892e-05, + "loss": 0.3828, + "step": 510 + }, + { + "epoch": 2.116182572614108, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7116182446479797, + "eval_runtime": 28.5221, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.139, + "step": 510 + }, + { + "epoch": 2.120331950207469, + "grad_norm": 5.21875, + "learning_rate": 2.879668049792531e-05, + "loss": 0.4102, + "step": 511 + }, + { + "epoch": 2.120331950207469, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7102243304252625, + "eval_runtime": 28.547, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 511 + }, + { + "epoch": 2.12448132780083, + "grad_norm": 28.5, + "learning_rate": 2.87551867219917e-05, + "loss": 0.707, + "step": 512 + }, + { + "epoch": 2.12448132780083, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7086682915687561, + "eval_runtime": 28.5343, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 512 + }, + { + "epoch": 2.128630705394191, + "grad_norm": 13.25, + "learning_rate": 2.871369294605809e-05, + "loss": 0.7578, + "step": 513 + }, + { + "epoch": 2.128630705394191, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7086439728736877, + "eval_runtime": 28.6338, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 513 + }, + { + "epoch": 2.132780082987552, + "grad_norm": 8.125, + "learning_rate": 2.867219917012448e-05, + "loss": 0.3301, + "step": 514 + }, + { + "epoch": 2.132780082987552, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7067638039588928, + "eval_runtime": 28.6641, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 514 + }, + { + "epoch": 2.136929460580913, + "grad_norm": 15.375, + "learning_rate": 2.8630705394190872e-05, + "loss": 0.5508, + "step": 515 + }, + { + "epoch": 2.136929460580913, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7045027017593384, + "eval_runtime": 28.6754, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.127, + "step": 515 + }, + { + "epoch": 2.141078838174274, + "grad_norm": 9.1875, + "learning_rate": 2.858921161825726e-05, + "loss": 0.6875, + "step": 516 + }, + { + "epoch": 2.141078838174274, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7006775140762329, + "eval_runtime": 28.6301, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 516 + }, + { + "epoch": 2.145228215767635, + "grad_norm": 15.8125, + "learning_rate": 2.8547717842323652e-05, + "loss": 0.6992, + "step": 517 + }, + { + "epoch": 2.145228215767635, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6959932446479797, + "eval_runtime": 28.624, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 517 + }, + { + "epoch": 2.1493775933609958, + "grad_norm": 6.53125, + "learning_rate": 2.8506224066390043e-05, + "loss": 0.1348, + "step": 518 + }, + { + "epoch": 2.1493775933609958, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6971926689147949, + "eval_runtime": 28.5572, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 518 + }, + { + "epoch": 2.1535269709543567, + "grad_norm": 11.1875, + "learning_rate": 2.846473029045643e-05, + "loss": 0.9453, + "step": 519 + }, + { + "epoch": 2.1535269709543567, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6957014799118042, + "eval_runtime": 28.5042, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.14, + "step": 519 + }, + { + "epoch": 2.1576763485477177, + "grad_norm": 10.5, + "learning_rate": 2.8423236514522823e-05, + "loss": 0.5586, + "step": 520 + }, + { + "epoch": 2.1576763485477177, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6949883103370667, + "eval_runtime": 28.5353, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 520 + }, + { + "epoch": 2.1618257261410787, + "grad_norm": 11.1875, + "learning_rate": 2.838174273858921e-05, + "loss": 0.6016, + "step": 521 + }, + { + "epoch": 2.1618257261410787, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6947938203811646, + "eval_runtime": 28.6548, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 521 + }, + { + "epoch": 2.1659751037344397, + "grad_norm": 8.625, + "learning_rate": 2.8340248962655602e-05, + "loss": 0.6836, + "step": 522 + }, + { + "epoch": 2.1659751037344397, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6942751407623291, + "eval_runtime": 28.7178, + "eval_samples_per_second": 16.784, + "eval_steps_per_second": 2.124, + "step": 522 + }, + { + "epoch": 2.1701244813278007, + "grad_norm": 24.75, + "learning_rate": 2.8298755186721994e-05, + "loss": 0.7539, + "step": 523 + }, + { + "epoch": 2.1701244813278007, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6932054162025452, + "eval_runtime": 28.7392, + "eval_samples_per_second": 16.772, + "eval_steps_per_second": 2.123, + "step": 523 + }, + { + "epoch": 2.1742738589211617, + "grad_norm": 9.5, + "learning_rate": 2.8257261410788382e-05, + "loss": 0.498, + "step": 524 + }, + { + "epoch": 2.1742738589211617, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6913900375366211, + "eval_runtime": 28.6717, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.128, + "step": 524 + }, + { + "epoch": 2.1784232365145226, + "grad_norm": 16.5, + "learning_rate": 2.8215767634854773e-05, + "loss": 0.6094, + "step": 525 + }, + { + "epoch": 2.1784232365145226, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6901095509529114, + "eval_runtime": 28.6645, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 525 + }, + { + "epoch": 2.1825726141078836, + "grad_norm": 9.625, + "learning_rate": 2.817427385892116e-05, + "loss": 0.4941, + "step": 526 + }, + { + "epoch": 2.1825726141078836, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6913251876831055, + "eval_runtime": 28.5924, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 526 + }, + { + "epoch": 2.186721991701245, + "grad_norm": 11.8125, + "learning_rate": 2.8132780082987553e-05, + "loss": 0.8242, + "step": 527 + }, + { + "epoch": 2.186721991701245, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6906120181083679, + "eval_runtime": 28.51, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.14, + "step": 527 + }, + { + "epoch": 2.190871369294606, + "grad_norm": 8.5625, + "learning_rate": 2.8091286307053944e-05, + "loss": 0.4844, + "step": 528 + }, + { + "epoch": 2.190871369294606, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6939023733139038, + "eval_runtime": 28.4595, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 528 + }, + { + "epoch": 2.195020746887967, + "grad_norm": 10.1875, + "learning_rate": 2.8049792531120333e-05, + "loss": 0.582, + "step": 529 + }, + { + "epoch": 2.195020746887967, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6939347982406616, + "eval_runtime": 28.4273, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 529 + }, + { + "epoch": 2.199170124481328, + "grad_norm": 9.1875, + "learning_rate": 2.8008298755186724e-05, + "loss": 0.8711, + "step": 530 + }, + { + "epoch": 2.199170124481328, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6943075656890869, + "eval_runtime": 28.4714, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 530 + }, + { + "epoch": 2.203319502074689, + "grad_norm": 21.75, + "learning_rate": 2.7966804979253115e-05, + "loss": 0.4551, + "step": 531 + }, + { + "epoch": 2.203319502074689, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.695101797580719, + "eval_runtime": 28.467, + "eval_samples_per_second": 16.932, + "eval_steps_per_second": 2.143, + "step": 531 + }, + { + "epoch": 2.20746887966805, + "grad_norm": 19.0, + "learning_rate": 2.7925311203319504e-05, + "loss": 0.6602, + "step": 532 + }, + { + "epoch": 2.20746887966805, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6941292881965637, + "eval_runtime": 28.6394, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.13, + "step": 532 + }, + { + "epoch": 2.211618257261411, + "grad_norm": 7.375, + "learning_rate": 2.7883817427385895e-05, + "loss": 0.457, + "step": 533 + }, + { + "epoch": 2.211618257261411, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6948586702346802, + "eval_runtime": 28.693, + "eval_samples_per_second": 16.799, + "eval_steps_per_second": 2.126, + "step": 533 + }, + { + "epoch": 2.215767634854772, + "grad_norm": 8.6875, + "learning_rate": 2.7842323651452283e-05, + "loss": 0.6602, + "step": 534 + }, + { + "epoch": 2.215767634854772, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6941536068916321, + "eval_runtime": 28.6598, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 534 + }, + { + "epoch": 2.219917012448133, + "grad_norm": 6.90625, + "learning_rate": 2.7800829875518675e-05, + "loss": 0.5781, + "step": 535 + }, + { + "epoch": 2.219917012448133, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6954988837242126, + "eval_runtime": 28.6681, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.128, + "step": 535 + }, + { + "epoch": 2.224066390041494, + "grad_norm": 13.5, + "learning_rate": 2.7759336099585066e-05, + "loss": 0.6445, + "step": 536 + }, + { + "epoch": 2.224066390041494, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6952476501464844, + "eval_runtime": 28.677, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.127, + "step": 536 + }, + { + "epoch": 2.228215767634855, + "grad_norm": 10.125, + "learning_rate": 2.7717842323651454e-05, + "loss": 0.7305, + "step": 537 + }, + { + "epoch": 2.228215767634855, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6971521377563477, + "eval_runtime": 28.6464, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 537 + }, + { + "epoch": 2.232365145228216, + "grad_norm": 14.9375, + "learning_rate": 2.7676348547717846e-05, + "loss": 0.8906, + "step": 538 + }, + { + "epoch": 2.232365145228216, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6975006461143494, + "eval_runtime": 28.6126, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 538 + }, + { + "epoch": 2.236514522821577, + "grad_norm": 8.4375, + "learning_rate": 2.7634854771784234e-05, + "loss": 0.4629, + "step": 539 + }, + { + "epoch": 2.236514522821577, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6986514329910278, + "eval_runtime": 28.5159, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.139, + "step": 539 + }, + { + "epoch": 2.240663900414938, + "grad_norm": 19.375, + "learning_rate": 2.7593360995850625e-05, + "loss": 0.4258, + "step": 540 + }, + { + "epoch": 2.240663900414938, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6990566849708557, + "eval_runtime": 28.4714, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 540 + }, + { + "epoch": 2.2448132780082988, + "grad_norm": 5.125, + "learning_rate": 2.7551867219917017e-05, + "loss": 0.4727, + "step": 541 + }, + { + "epoch": 2.2448132780082988, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6996887922286987, + "eval_runtime": 28.4377, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.145, + "step": 541 + }, + { + "epoch": 2.2489626556016598, + "grad_norm": 9.25, + "learning_rate": 2.7510373443983405e-05, + "loss": 0.8438, + "step": 542 + }, + { + "epoch": 2.2489626556016598, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6992754936218262, + "eval_runtime": 28.4215, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.146, + "step": 542 + }, + { + "epoch": 2.2531120331950207, + "grad_norm": 9.6875, + "learning_rate": 2.7468879668049796e-05, + "loss": 0.6328, + "step": 543 + }, + { + "epoch": 2.2531120331950207, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7009368538856506, + "eval_runtime": 28.4751, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 543 + }, + { + "epoch": 2.2572614107883817, + "grad_norm": 9.5625, + "learning_rate": 2.7427385892116188e-05, + "loss": 0.5508, + "step": 544 + }, + { + "epoch": 2.2572614107883817, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7018120884895325, + "eval_runtime": 28.6062, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 544 + }, + { + "epoch": 2.2614107883817427, + "grad_norm": 8.9375, + "learning_rate": 2.7385892116182576e-05, + "loss": 0.3457, + "step": 545 + }, + { + "epoch": 2.2614107883817427, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7013177275657654, + "eval_runtime": 28.7108, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.125, + "step": 545 + }, + { + "epoch": 2.2655601659751037, + "grad_norm": 10.375, + "learning_rate": 2.7344398340248967e-05, + "loss": 0.6484, + "step": 546 + }, + { + "epoch": 2.2655601659751037, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7023064494132996, + "eval_runtime": 28.6628, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.128, + "step": 546 + }, + { + "epoch": 2.2697095435684647, + "grad_norm": 20.375, + "learning_rate": 2.7302904564315352e-05, + "loss": 1.8203, + "step": 547 + }, + { + "epoch": 2.2697095435684647, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7015041708946228, + "eval_runtime": 28.666, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.128, + "step": 547 + }, + { + "epoch": 2.2738589211618256, + "grad_norm": 8.5, + "learning_rate": 2.726141078838174e-05, + "loss": 0.3066, + "step": 548 + }, + { + "epoch": 2.2738589211618256, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7002155780792236, + "eval_runtime": 28.7002, + "eval_samples_per_second": 16.794, + "eval_steps_per_second": 2.125, + "step": 548 + }, + { + "epoch": 2.2780082987551866, + "grad_norm": 14.6875, + "learning_rate": 2.721991701244813e-05, + "loss": 0.5859, + "step": 549 + }, + { + "epoch": 2.2780082987551866, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6998913884162903, + "eval_runtime": 28.6625, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.128, + "step": 549 + }, + { + "epoch": 2.2821576763485476, + "grad_norm": 7.6875, + "learning_rate": 2.7178423236514523e-05, + "loss": 0.3164, + "step": 550 + }, + { + "epoch": 2.2821576763485476, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7008233666419983, + "eval_runtime": 28.5987, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 550 + }, + { + "epoch": 2.2863070539419086, + "grad_norm": 9.625, + "learning_rate": 2.713692946058091e-05, + "loss": 0.707, + "step": 551 + }, + { + "epoch": 2.2863070539419086, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.69921875, + "eval_runtime": 28.4753, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 551 + }, + { + "epoch": 2.2904564315352696, + "grad_norm": 55.25, + "learning_rate": 2.7095435684647303e-05, + "loss": 0.543, + "step": 552 + }, + { + "epoch": 2.2904564315352696, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7007423639297485, + "eval_runtime": 28.4462, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 552 + }, + { + "epoch": 2.2946058091286305, + "grad_norm": 12.4375, + "learning_rate": 2.7053941908713694e-05, + "loss": 0.5312, + "step": 553 + }, + { + "epoch": 2.2946058091286305, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6980031132698059, + "eval_runtime": 28.6504, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 553 + }, + { + "epoch": 2.2987551867219915, + "grad_norm": 14.0, + "learning_rate": 2.7012448132780082e-05, + "loss": 1.0312, + "step": 554 + }, + { + "epoch": 2.2987551867219915, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.69861900806427, + "eval_runtime": 28.6586, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.129, + "step": 554 + }, + { + "epoch": 2.3029045643153525, + "grad_norm": 7.84375, + "learning_rate": 2.6970954356846474e-05, + "loss": 0.6133, + "step": 555 + }, + { + "epoch": 2.3029045643153525, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7004181742668152, + "eval_runtime": 28.678, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.127, + "step": 555 + }, + { + "epoch": 2.3070539419087135, + "grad_norm": 9.25, + "learning_rate": 2.6929460580912862e-05, + "loss": 0.5117, + "step": 556 + }, + { + "epoch": 2.3070539419087135, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7002561092376709, + "eval_runtime": 28.7393, + "eval_samples_per_second": 16.771, + "eval_steps_per_second": 2.123, + "step": 556 + }, + { + "epoch": 2.3112033195020745, + "grad_norm": 17.5, + "learning_rate": 2.6887966804979253e-05, + "loss": 0.9492, + "step": 557 + }, + { + "epoch": 2.3112033195020745, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6998346447944641, + "eval_runtime": 28.6728, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.127, + "step": 557 + }, + { + "epoch": 2.3153526970954355, + "grad_norm": 14.8125, + "learning_rate": 2.6846473029045645e-05, + "loss": 0.5391, + "step": 558 + }, + { + "epoch": 2.3153526970954355, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6989675164222717, + "eval_runtime": 28.6472, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.129, + "step": 558 + }, + { + "epoch": 2.3195020746887964, + "grad_norm": 11.75, + "learning_rate": 2.6804979253112033e-05, + "loss": 0.4863, + "step": 559 + }, + { + "epoch": 2.3195020746887964, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6978329420089722, + "eval_runtime": 28.6046, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.133, + "step": 559 + }, + { + "epoch": 2.323651452282158, + "grad_norm": 10.4375, + "learning_rate": 2.6763485477178424e-05, + "loss": 0.6211, + "step": 560 + }, + { + "epoch": 2.323651452282158, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6988459825515747, + "eval_runtime": 28.4997, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.14, + "step": 560 + }, + { + "epoch": 2.327800829875519, + "grad_norm": 16.25, + "learning_rate": 2.6721991701244812e-05, + "loss": 0.5664, + "step": 561 + }, + { + "epoch": 2.327800829875519, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6976627111434937, + "eval_runtime": 28.4558, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.144, + "step": 561 + }, + { + "epoch": 2.33195020746888, + "grad_norm": 7.5625, + "learning_rate": 2.6680497925311204e-05, + "loss": 0.7305, + "step": 562 + }, + { + "epoch": 2.33195020746888, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.6985380053520203, + "eval_runtime": 28.4373, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.145, + "step": 562 + }, + { + "epoch": 2.336099585062241, + "grad_norm": 25.25, + "learning_rate": 2.6639004149377595e-05, + "loss": 0.6055, + "step": 563 + }, + { + "epoch": 2.336099585062241, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6977275609970093, + "eval_runtime": 28.4709, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.143, + "step": 563 + }, + { + "epoch": 2.340248962655602, + "grad_norm": 11.5625, + "learning_rate": 2.6597510373443983e-05, + "loss": 0.6133, + "step": 564 + }, + { + "epoch": 2.340248962655602, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6976789236068726, + "eval_runtime": 28.5204, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 564 + }, + { + "epoch": 2.3443983402489628, + "grad_norm": 13.125, + "learning_rate": 2.6556016597510375e-05, + "loss": 0.6445, + "step": 565 + }, + { + "epoch": 2.3443983402489628, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6969009041786194, + "eval_runtime": 28.6184, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 565 + }, + { + "epoch": 2.3485477178423237, + "grad_norm": 20.125, + "learning_rate": 2.6514522821576766e-05, + "loss": 0.4746, + "step": 566 + }, + { + "epoch": 2.3485477178423237, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6970143914222717, + "eval_runtime": 28.7089, + "eval_samples_per_second": 16.789, + "eval_steps_per_second": 2.125, + "step": 566 + }, + { + "epoch": 2.3526970954356847, + "grad_norm": 7.46875, + "learning_rate": 2.6473029045643155e-05, + "loss": 0.6172, + "step": 567 + }, + { + "epoch": 2.3526970954356847, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.6966739892959595, + "eval_runtime": 28.7118, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.125, + "step": 567 + }, + { + "epoch": 2.3568464730290457, + "grad_norm": 25.25, + "learning_rate": 2.6431535269709546e-05, + "loss": 0.4512, + "step": 568 + }, + { + "epoch": 2.3568464730290457, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6965281367301941, + "eval_runtime": 28.654, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 568 + }, + { + "epoch": 2.3609958506224067, + "grad_norm": 9.375, + "learning_rate": 2.6390041493775934e-05, + "loss": 0.5547, + "step": 569 + }, + { + "epoch": 2.3609958506224067, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6974682211875916, + "eval_runtime": 28.6571, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.129, + "step": 569 + }, + { + "epoch": 2.3651452282157677, + "grad_norm": 26.125, + "learning_rate": 2.6348547717842326e-05, + "loss": 0.5156, + "step": 570 + }, + { + "epoch": 2.3651452282157677, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6983516216278076, + "eval_runtime": 28.6383, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 570 + }, + { + "epoch": 2.3692946058091287, + "grad_norm": 19.125, + "learning_rate": 2.6307053941908717e-05, + "loss": 0.7695, + "step": 571 + }, + { + "epoch": 2.3692946058091287, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7002074718475342, + "eval_runtime": 28.5513, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.137, + "step": 571 + }, + { + "epoch": 2.3734439834024896, + "grad_norm": 14.5625, + "learning_rate": 2.6265560165975105e-05, + "loss": 1.0469, + "step": 572 + }, + { + "epoch": 2.3734439834024896, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7005154490470886, + "eval_runtime": 28.492, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 572 + }, + { + "epoch": 2.3775933609958506, + "grad_norm": 22.25, + "learning_rate": 2.6224066390041497e-05, + "loss": 0.6133, + "step": 573 + }, + { + "epoch": 2.3775933609958506, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.699129581451416, + "eval_runtime": 28.5135, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.139, + "step": 573 + }, + { + "epoch": 2.3817427385892116, + "grad_norm": 6.84375, + "learning_rate": 2.6182572614107888e-05, + "loss": 0.3301, + "step": 574 + }, + { + "epoch": 2.3817427385892116, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6998994946479797, + "eval_runtime": 28.6161, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 574 + }, + { + "epoch": 2.3858921161825726, + "grad_norm": 35.75, + "learning_rate": 2.6141078838174276e-05, + "loss": 0.4902, + "step": 575 + }, + { + "epoch": 2.3858921161825726, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7001345157623291, + "eval_runtime": 28.6463, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 575 + }, + { + "epoch": 2.3900414937759336, + "grad_norm": 29.375, + "learning_rate": 2.6099585062240668e-05, + "loss": 1.4297, + "step": 576 + }, + { + "epoch": 2.3900414937759336, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6988621354103088, + "eval_runtime": 28.6661, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.128, + "step": 576 + }, + { + "epoch": 2.3941908713692945, + "grad_norm": 11.875, + "learning_rate": 2.6058091286307056e-05, + "loss": 0.5977, + "step": 577 + }, + { + "epoch": 2.3941908713692945, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6994699835777283, + "eval_runtime": 28.6479, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.129, + "step": 577 + }, + { + "epoch": 2.3983402489626555, + "grad_norm": 8.625, + "learning_rate": 2.6016597510373447e-05, + "loss": 0.3457, + "step": 578 + }, + { + "epoch": 2.3983402489626555, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7026387453079224, + "eval_runtime": 28.6409, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 578 + }, + { + "epoch": 2.4024896265560165, + "grad_norm": 20.0, + "learning_rate": 2.597510373443984e-05, + "loss": 1.1797, + "step": 579 + }, + { + "epoch": 2.4024896265560165, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7015122771263123, + "eval_runtime": 28.6576, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.129, + "step": 579 + }, + { + "epoch": 2.4066390041493775, + "grad_norm": 23.375, + "learning_rate": 2.5933609958506227e-05, + "loss": 1.0859, + "step": 580 + }, + { + "epoch": 2.4066390041493775, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7027683854103088, + "eval_runtime": 28.5402, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 580 + }, + { + "epoch": 2.4107883817427385, + "grad_norm": 13.9375, + "learning_rate": 2.5892116182572618e-05, + "loss": 0.7617, + "step": 581 + }, + { + "epoch": 2.4107883817427385, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7031574249267578, + "eval_runtime": 28.5615, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.136, + "step": 581 + }, + { + "epoch": 2.4149377593360994, + "grad_norm": 14.5, + "learning_rate": 2.5850622406639006e-05, + "loss": 0.8125, + "step": 582 + }, + { + "epoch": 2.4149377593360994, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7055238485336304, + "eval_runtime": 28.6265, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 582 + }, + { + "epoch": 2.4190871369294604, + "grad_norm": 22.5, + "learning_rate": 2.5809128630705398e-05, + "loss": 0.9023, + "step": 583 + }, + { + "epoch": 2.4190871369294604, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7068691849708557, + "eval_runtime": 28.668, + "eval_samples_per_second": 16.813, + "eval_steps_per_second": 2.128, + "step": 583 + }, + { + "epoch": 2.4232365145228214, + "grad_norm": 13.0625, + "learning_rate": 2.5767634854771783e-05, + "loss": 0.7891, + "step": 584 + }, + { + "epoch": 2.4232365145228214, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7048917412757874, + "eval_runtime": 28.6598, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 584 + }, + { + "epoch": 2.4273858921161824, + "grad_norm": 13.5625, + "learning_rate": 2.5726141078838174e-05, + "loss": 0.4062, + "step": 585 + }, + { + "epoch": 2.4273858921161824, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7036923170089722, + "eval_runtime": 28.6655, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 585 + }, + { + "epoch": 2.431535269709544, + "grad_norm": 9.75, + "learning_rate": 2.5684647302904562e-05, + "loss": 0.5547, + "step": 586 + }, + { + "epoch": 2.431535269709544, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7033032774925232, + "eval_runtime": 28.7275, + "eval_samples_per_second": 16.778, + "eval_steps_per_second": 2.123, + "step": 586 + }, + { + "epoch": 2.435684647302905, + "grad_norm": 6.71875, + "learning_rate": 2.5643153526970954e-05, + "loss": 0.3164, + "step": 587 + }, + { + "epoch": 2.435684647302905, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.705499529838562, + "eval_runtime": 28.6551, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 587 + }, + { + "epoch": 2.4398340248962658, + "grad_norm": 9.0625, + "learning_rate": 2.5601659751037345e-05, + "loss": 0.5, + "step": 588 + }, + { + "epoch": 2.4398340248962658, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.71033775806427, + "eval_runtime": 28.5803, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 588 + }, + { + "epoch": 2.4439834024896268, + "grad_norm": 21.75, + "learning_rate": 2.5560165975103733e-05, + "loss": 1.6484, + "step": 589 + }, + { + "epoch": 2.4439834024896268, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7086601853370667, + "eval_runtime": 28.548, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 589 + }, + { + "epoch": 2.4481327800829877, + "grad_norm": 7.3125, + "learning_rate": 2.5518672199170125e-05, + "loss": 0.6211, + "step": 590 + }, + { + "epoch": 2.4481327800829877, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7058318257331848, + "eval_runtime": 28.6518, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.129, + "step": 590 + }, + { + "epoch": 2.4522821576763487, + "grad_norm": 9.6875, + "learning_rate": 2.5477178423236513e-05, + "loss": 0.5195, + "step": 591 + }, + { + "epoch": 2.4522821576763487, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7050294876098633, + "eval_runtime": 28.7103, + "eval_samples_per_second": 16.788, + "eval_steps_per_second": 2.125, + "step": 591 + }, + { + "epoch": 2.4564315352697097, + "grad_norm": 42.0, + "learning_rate": 2.5435684647302904e-05, + "loss": 0.7188, + "step": 592 + }, + { + "epoch": 2.4564315352697097, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7047863602638245, + "eval_runtime": 28.7007, + "eval_samples_per_second": 16.794, + "eval_steps_per_second": 2.125, + "step": 592 + }, + { + "epoch": 2.4605809128630707, + "grad_norm": 4.96875, + "learning_rate": 2.5394190871369296e-05, + "loss": 0.3809, + "step": 593 + }, + { + "epoch": 2.4605809128630707, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7065287828445435, + "eval_runtime": 28.6534, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 593 + }, + { + "epoch": 2.4647302904564317, + "grad_norm": 13.625, + "learning_rate": 2.5352697095435684e-05, + "loss": 0.4297, + "step": 594 + }, + { + "epoch": 2.4647302904564317, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.708473801612854, + "eval_runtime": 28.6544, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 594 + }, + { + "epoch": 2.4688796680497926, + "grad_norm": 6.25, + "learning_rate": 2.5311203319502075e-05, + "loss": 0.5586, + "step": 595 + }, + { + "epoch": 2.4688796680497926, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7084413766860962, + "eval_runtime": 28.6112, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.132, + "step": 595 + }, + { + "epoch": 2.4730290456431536, + "grad_norm": 49.25, + "learning_rate": 2.5269709543568463e-05, + "loss": 0.5508, + "step": 596 + }, + { + "epoch": 2.4730290456431536, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7110104560852051, + "eval_runtime": 28.5966, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 596 + }, + { + "epoch": 2.4771784232365146, + "grad_norm": 10.75, + "learning_rate": 2.5228215767634855e-05, + "loss": 0.5898, + "step": 597 + }, + { + "epoch": 2.4771784232365146, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7125664353370667, + "eval_runtime": 28.5649, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.135, + "step": 597 + }, + { + "epoch": 2.4813278008298756, + "grad_norm": 25.25, + "learning_rate": 2.5186721991701246e-05, + "loss": 0.8906, + "step": 598 + }, + { + "epoch": 2.4813278008298756, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7136524319648743, + "eval_runtime": 28.5501, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.137, + "step": 598 + }, + { + "epoch": 2.4854771784232366, + "grad_norm": 10.9375, + "learning_rate": 2.5145228215767634e-05, + "loss": 0.4805, + "step": 599 + }, + { + "epoch": 2.4854771784232366, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7144142389297485, + "eval_runtime": 28.5339, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 599 + }, + { + "epoch": 2.4896265560165975, + "grad_norm": 11.875, + "learning_rate": 2.5103734439834026e-05, + "loss": 0.7969, + "step": 600 + }, + { + "epoch": 2.4896265560165975, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.712590754032135, + "eval_runtime": 28.4259, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 600 + }, + { + "epoch": 2.4937759336099585, + "grad_norm": 21.125, + "learning_rate": 2.5062240663900417e-05, + "loss": 0.6445, + "step": 601 + }, + { + "epoch": 2.4937759336099585, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7081455588340759, + "eval_runtime": 28.1989, + "eval_samples_per_second": 17.093, + "eval_steps_per_second": 2.163, + "step": 601 + }, + { + "epoch": 2.4979253112033195, + "grad_norm": 5.53125, + "learning_rate": 2.5020746887966805e-05, + "loss": 0.4082, + "step": 602 + }, + { + "epoch": 2.4979253112033195, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.7092518210411072, + "eval_runtime": 28.3168, + "eval_samples_per_second": 17.022, + "eval_steps_per_second": 2.154, + "step": 602 + }, + { + "epoch": 2.5020746887966805, + "grad_norm": 18.0, + "learning_rate": 2.4979253112033197e-05, + "loss": 0.4609, + "step": 603 + }, + { + "epoch": 2.5020746887966805, + "eval_accuracy": 0.5975103734439834, + "eval_loss": 0.7098029255867004, + "eval_runtime": 28.2953, + "eval_samples_per_second": 17.035, + "eval_steps_per_second": 2.156, + "step": 603 + }, + { + "epoch": 2.5062240663900415, + "grad_norm": 10.4375, + "learning_rate": 2.4937759336099585e-05, + "loss": 0.6523, + "step": 604 + }, + { + "epoch": 2.5062240663900415, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7129960060119629, + "eval_runtime": 28.307, + "eval_samples_per_second": 17.028, + "eval_steps_per_second": 2.155, + "step": 604 + }, + { + "epoch": 2.5103734439834025, + "grad_norm": 7.0, + "learning_rate": 2.4896265560165977e-05, + "loss": 0.6523, + "step": 605 + }, + { + "epoch": 2.5103734439834025, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7116182446479797, + "eval_runtime": 28.4232, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.146, + "step": 605 + }, + { + "epoch": 2.5145228215767634, + "grad_norm": 6.875, + "learning_rate": 2.4854771784232368e-05, + "loss": 0.3691, + "step": 606 + }, + { + "epoch": 2.5145228215767634, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7113953828811646, + "eval_runtime": 28.5554, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 606 + }, + { + "epoch": 2.5186721991701244, + "grad_norm": 11.125, + "learning_rate": 2.4813278008298756e-05, + "loss": 0.4512, + "step": 607 + }, + { + "epoch": 2.5186721991701244, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7128500938415527, + "eval_runtime": 28.6138, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 607 + }, + { + "epoch": 2.5228215767634854, + "grad_norm": 18.625, + "learning_rate": 2.4771784232365148e-05, + "loss": 1.0312, + "step": 608 + }, + { + "epoch": 2.5228215767634854, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.716886043548584, + "eval_runtime": 28.6833, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 608 + }, + { + "epoch": 2.5269709543568464, + "grad_norm": 9.5, + "learning_rate": 2.473029045643154e-05, + "loss": 0.498, + "step": 609 + }, + { + "epoch": 2.5269709543568464, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7184744477272034, + "eval_runtime": 28.702, + "eval_samples_per_second": 16.793, + "eval_steps_per_second": 2.125, + "step": 609 + }, + { + "epoch": 2.5311203319502074, + "grad_norm": 5.53125, + "learning_rate": 2.4688796680497927e-05, + "loss": 0.459, + "step": 610 + }, + { + "epoch": 2.5311203319502074, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7199169993400574, + "eval_runtime": 28.6506, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.129, + "step": 610 + }, + { + "epoch": 2.5352697095435683, + "grad_norm": 13.5, + "learning_rate": 2.464730290456432e-05, + "loss": 1.0859, + "step": 611 + }, + { + "epoch": 2.5352697095435683, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7193132638931274, + "eval_runtime": 28.717, + "eval_samples_per_second": 16.784, + "eval_steps_per_second": 2.124, + "step": 611 + }, + { + "epoch": 2.5394190871369293, + "grad_norm": 12.75, + "learning_rate": 2.4605809128630707e-05, + "loss": 0.6016, + "step": 612 + }, + { + "epoch": 2.5394190871369293, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7222753763198853, + "eval_runtime": 28.663, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.128, + "step": 612 + }, + { + "epoch": 2.5435684647302903, + "grad_norm": 7.375, + "learning_rate": 2.4564315352697095e-05, + "loss": 0.457, + "step": 613 + }, + { + "epoch": 2.5435684647302903, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7209057211875916, + "eval_runtime": 28.6836, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 613 + }, + { + "epoch": 2.5477178423236513, + "grad_norm": 24.375, + "learning_rate": 2.4522821576763486e-05, + "loss": 1.0938, + "step": 614 + }, + { + "epoch": 2.5477178423236513, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.719382107257843, + "eval_runtime": 28.6017, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 614 + }, + { + "epoch": 2.5518672199170123, + "grad_norm": 48.5, + "learning_rate": 2.4481327800829874e-05, + "loss": 0.543, + "step": 615 + }, + { + "epoch": 2.5518672199170123, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7181219458580017, + "eval_runtime": 28.5143, + "eval_samples_per_second": 16.904, + "eval_steps_per_second": 2.139, + "step": 615 + }, + { + "epoch": 2.5560165975103732, + "grad_norm": 11.125, + "learning_rate": 2.4439834024896266e-05, + "loss": 0.8047, + "step": 616 + }, + { + "epoch": 2.5560165975103732, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7153300046920776, + "eval_runtime": 28.4775, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 616 + }, + { + "epoch": 2.5601659751037342, + "grad_norm": 63.5, + "learning_rate": 2.4398340248962657e-05, + "loss": 0.8906, + "step": 617 + }, + { + "epoch": 2.5601659751037342, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7186567783355713, + "eval_runtime": 28.5076, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.14, + "step": 617 + }, + { + "epoch": 2.564315352697095, + "grad_norm": 5.3125, + "learning_rate": 2.4356846473029045e-05, + "loss": 0.5547, + "step": 618 + }, + { + "epoch": 2.564315352697095, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7178747653961182, + "eval_runtime": 28.6166, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 618 + }, + { + "epoch": 2.568464730290456, + "grad_norm": 11.6875, + "learning_rate": 2.4315352697095437e-05, + "loss": 1.0625, + "step": 619 + }, + { + "epoch": 2.568464730290456, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.718839168548584, + "eval_runtime": 28.7156, + "eval_samples_per_second": 16.785, + "eval_steps_per_second": 2.124, + "step": 619 + }, + { + "epoch": 2.572614107883817, + "grad_norm": 10.9375, + "learning_rate": 2.427385892116183e-05, + "loss": 0.707, + "step": 620 + }, + { + "epoch": 2.572614107883817, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7161890268325806, + "eval_runtime": 28.6642, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 620 + }, + { + "epoch": 2.576763485477178, + "grad_norm": 10.125, + "learning_rate": 2.4232365145228216e-05, + "loss": 0.5117, + "step": 621 + }, + { + "epoch": 2.576763485477178, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7131742835044861, + "eval_runtime": 28.6543, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 621 + }, + { + "epoch": 2.5809128630705396, + "grad_norm": 54.0, + "learning_rate": 2.4190871369294608e-05, + "loss": 0.8672, + "step": 622 + }, + { + "epoch": 2.5809128630705396, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7113913297653198, + "eval_runtime": 28.6411, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 622 + }, + { + "epoch": 2.5850622406639006, + "grad_norm": 11.5625, + "learning_rate": 2.4149377593360996e-05, + "loss": 0.6562, + "step": 623 + }, + { + "epoch": 2.5850622406639006, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7103216052055359, + "eval_runtime": 28.6308, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 623 + }, + { + "epoch": 2.5892116182572615, + "grad_norm": 11.625, + "learning_rate": 2.4107883817427388e-05, + "loss": 1.0391, + "step": 624 + }, + { + "epoch": 2.5892116182572615, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7068286538124084, + "eval_runtime": 28.6217, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 624 + }, + { + "epoch": 2.5933609958506225, + "grad_norm": 6.34375, + "learning_rate": 2.406639004149378e-05, + "loss": 0.543, + "step": 625 + }, + { + "epoch": 2.5933609958506225, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7054428458213806, + "eval_runtime": 28.5195, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 625 + }, + { + "epoch": 2.5975103734439835, + "grad_norm": 13.1875, + "learning_rate": 2.4024896265560167e-05, + "loss": 0.5781, + "step": 626 + }, + { + "epoch": 2.5975103734439835, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6965362429618835, + "eval_runtime": 28.527, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 626 + }, + { + "epoch": 2.6016597510373445, + "grad_norm": 6.375, + "learning_rate": 2.398340248962656e-05, + "loss": 0.3223, + "step": 627 + }, + { + "epoch": 2.6016597510373445, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6945831179618835, + "eval_runtime": 28.5159, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.139, + "step": 627 + }, + { + "epoch": 2.6058091286307055, + "grad_norm": 6.6875, + "learning_rate": 2.3941908713692947e-05, + "loss": 0.6211, + "step": 628 + }, + { + "epoch": 2.6058091286307055, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.692127525806427, + "eval_runtime": 28.6089, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 628 + }, + { + "epoch": 2.6099585062240664, + "grad_norm": 50.25, + "learning_rate": 2.3900414937759338e-05, + "loss": 0.6016, + "step": 629 + }, + { + "epoch": 2.6099585062240664, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6913657188415527, + "eval_runtime": 28.6523, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 629 + }, + { + "epoch": 2.6141078838174274, + "grad_norm": 10.0625, + "learning_rate": 2.385892116182573e-05, + "loss": 0.5625, + "step": 630 + }, + { + "epoch": 2.6141078838174274, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6918033361434937, + "eval_runtime": 28.6732, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.127, + "step": 630 + }, + { + "epoch": 2.6182572614107884, + "grad_norm": 4.40625, + "learning_rate": 2.3817427385892114e-05, + "loss": 0.3262, + "step": 631 + }, + { + "epoch": 2.6182572614107884, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.6936673521995544, + "eval_runtime": 28.66, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 631 + }, + { + "epoch": 2.6224066390041494, + "grad_norm": 20.875, + "learning_rate": 2.3775933609958506e-05, + "loss": 0.8594, + "step": 632 + }, + { + "epoch": 2.6224066390041494, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6991782188415527, + "eval_runtime": 28.7003, + "eval_samples_per_second": 16.794, + "eval_steps_per_second": 2.125, + "step": 632 + }, + { + "epoch": 2.6265560165975104, + "grad_norm": 12.25, + "learning_rate": 2.3734439834024897e-05, + "loss": 0.7734, + "step": 633 + }, + { + "epoch": 2.6265560165975104, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.6978896260261536, + "eval_runtime": 28.6284, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.131, + "step": 633 + }, + { + "epoch": 2.6307053941908713, + "grad_norm": 6.21875, + "learning_rate": 2.3692946058091285e-05, + "loss": 0.6562, + "step": 634 + }, + { + "epoch": 2.6307053941908713, + "eval_accuracy": 0.5954356846473029, + "eval_loss": 0.6957744359970093, + "eval_runtime": 28.5703, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 634 + }, + { + "epoch": 2.6348547717842323, + "grad_norm": 30.25, + "learning_rate": 2.3651452282157677e-05, + "loss": 0.5703, + "step": 635 + }, + { + "epoch": 2.6348547717842323, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.6971278786659241, + "eval_runtime": 28.5588, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 635 + }, + { + "epoch": 2.6390041493775933, + "grad_norm": 15.0625, + "learning_rate": 2.360995850622407e-05, + "loss": 0.4883, + "step": 636 + }, + { + "epoch": 2.6390041493775933, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7037895321846008, + "eval_runtime": 28.5351, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 636 + }, + { + "epoch": 2.6431535269709543, + "grad_norm": 14.5625, + "learning_rate": 2.3568464730290456e-05, + "loss": 0.6367, + "step": 637 + }, + { + "epoch": 2.6431535269709543, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7089560031890869, + "eval_runtime": 28.4682, + "eval_samples_per_second": 16.931, + "eval_steps_per_second": 2.143, + "step": 637 + }, + { + "epoch": 2.6473029045643153, + "grad_norm": 6.0625, + "learning_rate": 2.3526970954356848e-05, + "loss": 0.6055, + "step": 638 + }, + { + "epoch": 2.6473029045643153, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7118735313415527, + "eval_runtime": 28.4614, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.143, + "step": 638 + }, + { + "epoch": 2.6514522821576763, + "grad_norm": 16.125, + "learning_rate": 2.3485477178423236e-05, + "loss": 1.3281, + "step": 639 + }, + { + "epoch": 2.6514522821576763, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.713190495967865, + "eval_runtime": 28.4388, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.145, + "step": 639 + }, + { + "epoch": 2.6556016597510372, + "grad_norm": 17.375, + "learning_rate": 2.3443983402489627e-05, + "loss": 0.8594, + "step": 640 + }, + { + "epoch": 2.6556016597510372, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7124205827713013, + "eval_runtime": 28.4583, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 640 + }, + { + "epoch": 2.659751037344398, + "grad_norm": 5.34375, + "learning_rate": 2.340248962655602e-05, + "loss": 0.3633, + "step": 641 + }, + { + "epoch": 2.659751037344398, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7130283713340759, + "eval_runtime": 28.4878, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.141, + "step": 641 + }, + { + "epoch": 2.663900414937759, + "grad_norm": 7.375, + "learning_rate": 2.3360995850622407e-05, + "loss": 0.7422, + "step": 642 + }, + { + "epoch": 2.663900414937759, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7142764329910278, + "eval_runtime": 28.6509, + "eval_samples_per_second": 16.823, + "eval_steps_per_second": 2.129, + "step": 642 + }, + { + "epoch": 2.66804979253112, + "grad_norm": 10.4375, + "learning_rate": 2.33195020746888e-05, + "loss": 0.5, + "step": 643 + }, + { + "epoch": 2.66804979253112, + "eval_accuracy": 0.5912863070539419, + "eval_loss": 0.7151800990104675, + "eval_runtime": 28.7079, + "eval_samples_per_second": 16.79, + "eval_steps_per_second": 2.125, + "step": 643 + }, + { + "epoch": 2.6721991701244816, + "grad_norm": 8.125, + "learning_rate": 2.327800829875519e-05, + "loss": 0.3613, + "step": 644 + }, + { + "epoch": 2.6721991701244816, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7175830006599426, + "eval_runtime": 28.719, + "eval_samples_per_second": 16.783, + "eval_steps_per_second": 2.124, + "step": 644 + }, + { + "epoch": 2.6763485477178426, + "grad_norm": 11.5625, + "learning_rate": 2.3236514522821578e-05, + "loss": 0.7539, + "step": 645 + }, + { + "epoch": 2.6763485477178426, + "eval_accuracy": 0.5933609958506224, + "eval_loss": 0.7197873592376709, + "eval_runtime": 28.6532, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 645 + }, + { + "epoch": 2.6804979253112036, + "grad_norm": 10.0625, + "learning_rate": 2.319502074688797e-05, + "loss": 0.5781, + "step": 646 + }, + { + "epoch": 2.6804979253112036, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7202249765396118, + "eval_runtime": 28.6348, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 646 + }, + { + "epoch": 2.6846473029045645, + "grad_norm": 4.125, + "learning_rate": 2.3153526970954358e-05, + "loss": 0.4805, + "step": 647 + }, + { + "epoch": 2.6846473029045645, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7195523381233215, + "eval_runtime": 28.6165, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 647 + }, + { + "epoch": 2.6887966804979255, + "grad_norm": 10.8125, + "learning_rate": 2.311203319502075e-05, + "loss": 0.6445, + "step": 648 + }, + { + "epoch": 2.6887966804979255, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.720071017742157, + "eval_runtime": 28.6007, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 648 + }, + { + "epoch": 2.6929460580912865, + "grad_norm": 10.3125, + "learning_rate": 2.307053941908714e-05, + "loss": 1.0078, + "step": 649 + }, + { + "epoch": 2.6929460580912865, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7197306156158447, + "eval_runtime": 28.4937, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.141, + "step": 649 + }, + { + "epoch": 2.6970954356846475, + "grad_norm": 30.375, + "learning_rate": 2.3029045643153525e-05, + "loss": 1.0625, + "step": 650 + }, + { + "epoch": 2.6970954356846475, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7191876173019409, + "eval_runtime": 28.5096, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.14, + "step": 650 + }, + { + "epoch": 2.7012448132780085, + "grad_norm": 5.15625, + "learning_rate": 2.2987551867219917e-05, + "loss": 0.459, + "step": 651 + }, + { + "epoch": 2.7012448132780085, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7186567783355713, + "eval_runtime": 28.3483, + "eval_samples_per_second": 17.003, + "eval_steps_per_second": 2.152, + "step": 651 + }, + { + "epoch": 2.7053941908713695, + "grad_norm": 6.0625, + "learning_rate": 2.2946058091286308e-05, + "loss": 0.5586, + "step": 652 + }, + { + "epoch": 2.7053941908713695, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7180935740470886, + "eval_runtime": 28.3991, + "eval_samples_per_second": 16.972, + "eval_steps_per_second": 2.148, + "step": 652 + }, + { + "epoch": 2.7095435684647304, + "grad_norm": 23.0, + "learning_rate": 2.2904564315352696e-05, + "loss": 0.7109, + "step": 653 + }, + { + "epoch": 2.7095435684647304, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7188715934753418, + "eval_runtime": 28.5311, + "eval_samples_per_second": 16.894, + "eval_steps_per_second": 2.138, + "step": 653 + }, + { + "epoch": 2.7136929460580914, + "grad_norm": 8.0, + "learning_rate": 2.2863070539419088e-05, + "loss": 0.7812, + "step": 654 + }, + { + "epoch": 2.7136929460580914, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7193942666053772, + "eval_runtime": 28.6081, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 654 + }, + { + "epoch": 2.7178423236514524, + "grad_norm": 6.75, + "learning_rate": 2.282157676348548e-05, + "loss": 0.7578, + "step": 655 + }, + { + "epoch": 2.7178423236514524, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7193092107772827, + "eval_runtime": 28.6406, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 655 + }, + { + "epoch": 2.7219917012448134, + "grad_norm": 9.4375, + "learning_rate": 2.2780082987551867e-05, + "loss": 0.8047, + "step": 656 + }, + { + "epoch": 2.7219917012448134, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7194631695747375, + "eval_runtime": 28.6565, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.129, + "step": 656 + }, + { + "epoch": 2.7261410788381744, + "grad_norm": 4.625, + "learning_rate": 2.273858921161826e-05, + "loss": 0.332, + "step": 657 + }, + { + "epoch": 2.7261410788381744, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7189363837242126, + "eval_runtime": 28.709, + "eval_samples_per_second": 16.789, + "eval_steps_per_second": 2.125, + "step": 657 + }, + { + "epoch": 2.7302904564315353, + "grad_norm": 18.375, + "learning_rate": 2.2697095435684647e-05, + "loss": 0.6758, + "step": 658 + }, + { + "epoch": 2.7302904564315353, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7193335294723511, + "eval_runtime": 28.6455, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 658 + }, + { + "epoch": 2.7344398340248963, + "grad_norm": 9.4375, + "learning_rate": 2.265560165975104e-05, + "loss": 0.3164, + "step": 659 + }, + { + "epoch": 2.7344398340248963, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7199007868766785, + "eval_runtime": 28.6083, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 659 + }, + { + "epoch": 2.7385892116182573, + "grad_norm": 6.28125, + "learning_rate": 2.261410788381743e-05, + "loss": 0.4863, + "step": 660 + }, + { + "epoch": 2.7385892116182573, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7204437851905823, + "eval_runtime": 28.5317, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 660 + }, + { + "epoch": 2.7427385892116183, + "grad_norm": 5.4375, + "learning_rate": 2.2572614107883818e-05, + "loss": 0.457, + "step": 661 + }, + { + "epoch": 2.7427385892116183, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7214081883430481, + "eval_runtime": 28.4895, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.141, + "step": 661 + }, + { + "epoch": 2.7468879668049793, + "grad_norm": 15.25, + "learning_rate": 2.253112033195021e-05, + "loss": 0.5977, + "step": 662 + }, + { + "epoch": 2.7468879668049793, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7225266098976135, + "eval_runtime": 28.456, + "eval_samples_per_second": 16.938, + "eval_steps_per_second": 2.144, + "step": 662 + }, + { + "epoch": 2.7510373443983402, + "grad_norm": 9.4375, + "learning_rate": 2.2489626556016598e-05, + "loss": 0.3984, + "step": 663 + }, + { + "epoch": 2.7510373443983402, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7232073545455933, + "eval_runtime": 28.449, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 663 + }, + { + "epoch": 2.7551867219917012, + "grad_norm": 9.0625, + "learning_rate": 2.244813278008299e-05, + "loss": 0.6641, + "step": 664 + }, + { + "epoch": 2.7551867219917012, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7244392037391663, + "eval_runtime": 28.4733, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 664 + }, + { + "epoch": 2.759336099585062, + "grad_norm": 4.28125, + "learning_rate": 2.240663900414938e-05, + "loss": 0.2676, + "step": 665 + }, + { + "epoch": 2.759336099585062, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7280617952346802, + "eval_runtime": 28.6334, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 665 + }, + { + "epoch": 2.763485477178423, + "grad_norm": 16.375, + "learning_rate": 2.236514522821577e-05, + "loss": 0.8945, + "step": 666 + }, + { + "epoch": 2.763485477178423, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.730890154838562, + "eval_runtime": 28.6404, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 666 + }, + { + "epoch": 2.767634854771784, + "grad_norm": 55.75, + "learning_rate": 2.232365145228216e-05, + "loss": 0.5234, + "step": 667 + }, + { + "epoch": 2.767634854771784, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7329608201980591, + "eval_runtime": 28.6534, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 667 + }, + { + "epoch": 2.771784232365145, + "grad_norm": 3.859375, + "learning_rate": 2.228215767634855e-05, + "loss": 0.3555, + "step": 668 + }, + { + "epoch": 2.771784232365145, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7364658713340759, + "eval_runtime": 28.6887, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.126, + "step": 668 + }, + { + "epoch": 2.775933609958506, + "grad_norm": 8.75, + "learning_rate": 2.2240663900414936e-05, + "loss": 0.7305, + "step": 669 + }, + { + "epoch": 2.775933609958506, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7409637570381165, + "eval_runtime": 28.6388, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.13, + "step": 669 + }, + { + "epoch": 2.780082987551867, + "grad_norm": 4.9375, + "learning_rate": 2.2199170124481328e-05, + "loss": 0.5039, + "step": 670 + }, + { + "epoch": 2.780082987551867, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7434517741203308, + "eval_runtime": 28.6228, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 670 + }, + { + "epoch": 2.784232365145228, + "grad_norm": 7.9375, + "learning_rate": 2.215767634854772e-05, + "loss": 0.375, + "step": 671 + }, + { + "epoch": 2.784232365145228, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7464665174484253, + "eval_runtime": 28.5455, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 671 + }, + { + "epoch": 2.788381742738589, + "grad_norm": 5.59375, + "learning_rate": 2.2116182572614107e-05, + "loss": 0.4336, + "step": 672 + }, + { + "epoch": 2.788381742738589, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7519288063049316, + "eval_runtime": 28.4874, + "eval_samples_per_second": 16.92, + "eval_steps_per_second": 2.141, + "step": 672 + }, + { + "epoch": 2.79253112033195, + "grad_norm": 8.75, + "learning_rate": 2.20746887966805e-05, + "loss": 0.6758, + "step": 673 + }, + { + "epoch": 2.79253112033195, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7535334825515747, + "eval_runtime": 28.5084, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.14, + "step": 673 + }, + { + "epoch": 2.796680497925311, + "grad_norm": 19.0, + "learning_rate": 2.2033195020746887e-05, + "loss": 0.582, + "step": 674 + }, + { + "epoch": 2.796680497925311, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7567751407623291, + "eval_runtime": 28.4911, + "eval_samples_per_second": 16.918, + "eval_steps_per_second": 2.141, + "step": 674 + }, + { + "epoch": 2.800829875518672, + "grad_norm": 7.75, + "learning_rate": 2.199170124481328e-05, + "loss": 0.4551, + "step": 675 + }, + { + "epoch": 2.800829875518672, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7580069899559021, + "eval_runtime": 28.6125, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 675 + }, + { + "epoch": 2.804979253112033, + "grad_norm": 9.75, + "learning_rate": 2.195020746887967e-05, + "loss": 0.8359, + "step": 676 + }, + { + "epoch": 2.804979253112033, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7606327533721924, + "eval_runtime": 28.7066, + "eval_samples_per_second": 16.791, + "eval_steps_per_second": 2.125, + "step": 676 + }, + { + "epoch": 2.809128630705394, + "grad_norm": 4.34375, + "learning_rate": 2.1908713692946058e-05, + "loss": 0.2637, + "step": 677 + }, + { + "epoch": 2.809128630705394, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7631775140762329, + "eval_runtime": 28.6438, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.13, + "step": 677 + }, + { + "epoch": 2.813278008298755, + "grad_norm": 5.53125, + "learning_rate": 2.186721991701245e-05, + "loss": 0.375, + "step": 678 + }, + { + "epoch": 2.813278008298755, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7614431977272034, + "eval_runtime": 28.6657, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 678 + }, + { + "epoch": 2.817427385892116, + "grad_norm": 14.0, + "learning_rate": 2.182572614107884e-05, + "loss": 0.6172, + "step": 679 + }, + { + "epoch": 2.817427385892116, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7619943022727966, + "eval_runtime": 28.6474, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.129, + "step": 679 + }, + { + "epoch": 2.821576763485477, + "grad_norm": 31.75, + "learning_rate": 2.178423236514523e-05, + "loss": 1.0312, + "step": 680 + }, + { + "epoch": 2.821576763485477, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7638744711875916, + "eval_runtime": 28.6313, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 680 + }, + { + "epoch": 2.825726141078838, + "grad_norm": 7.96875, + "learning_rate": 2.174273858921162e-05, + "loss": 0.7227, + "step": 681 + }, + { + "epoch": 2.825726141078838, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7631288766860962, + "eval_runtime": 28.502, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.14, + "step": 681 + }, + { + "epoch": 2.8298755186721993, + "grad_norm": 8.9375, + "learning_rate": 2.170124481327801e-05, + "loss": 0.5, + "step": 682 + }, + { + "epoch": 2.8298755186721993, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7649766802787781, + "eval_runtime": 28.4424, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.145, + "step": 682 + }, + { + "epoch": 2.8340248962655603, + "grad_norm": 12.375, + "learning_rate": 2.16597510373444e-05, + "loss": 1.0547, + "step": 683 + }, + { + "epoch": 2.8340248962655603, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7648469805717468, + "eval_runtime": 28.4625, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.143, + "step": 683 + }, + { + "epoch": 2.8381742738589213, + "grad_norm": 13.0625, + "learning_rate": 2.161825726141079e-05, + "loss": 0.9531, + "step": 684 + }, + { + "epoch": 2.8381742738589213, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7641175985336304, + "eval_runtime": 28.3901, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.149, + "step": 684 + }, + { + "epoch": 2.8423236514522823, + "grad_norm": 33.5, + "learning_rate": 2.157676348547718e-05, + "loss": 1.0469, + "step": 685 + }, + { + "epoch": 2.8423236514522823, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7636799812316895, + "eval_runtime": 28.4377, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.145, + "step": 685 + }, + { + "epoch": 2.8464730290456433, + "grad_norm": 31.375, + "learning_rate": 2.153526970954357e-05, + "loss": 1.1953, + "step": 686 + }, + { + "epoch": 2.8464730290456433, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.761929452419281, + "eval_runtime": 28.6185, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 686 + }, + { + "epoch": 2.8506224066390042, + "grad_norm": 6.90625, + "learning_rate": 2.149377593360996e-05, + "loss": 0.6289, + "step": 687 + }, + { + "epoch": 2.8506224066390042, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7564347982406616, + "eval_runtime": 28.6117, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 687 + }, + { + "epoch": 2.854771784232365, + "grad_norm": 22.375, + "learning_rate": 2.1452282157676347e-05, + "loss": 1.0625, + "step": 688 + }, + { + "epoch": 2.854771784232365, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7530148029327393, + "eval_runtime": 28.626, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 688 + }, + { + "epoch": 2.858921161825726, + "grad_norm": 7.40625, + "learning_rate": 2.141078838174274e-05, + "loss": 0.6406, + "step": 689 + }, + { + "epoch": 2.858921161825726, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7501944899559021, + "eval_runtime": 28.6219, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 689 + }, + { + "epoch": 2.863070539419087, + "grad_norm": 5.0625, + "learning_rate": 2.136929460580913e-05, + "loss": 0.6719, + "step": 690 + }, + { + "epoch": 2.863070539419087, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7466448545455933, + "eval_runtime": 28.6727, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.127, + "step": 690 + }, + { + "epoch": 2.867219917012448, + "grad_norm": 8.0625, + "learning_rate": 2.132780082987552e-05, + "loss": 0.4629, + "step": 691 + }, + { + "epoch": 2.867219917012448, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7446349859237671, + "eval_runtime": 28.6011, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 691 + }, + { + "epoch": 2.871369294605809, + "grad_norm": 9.4375, + "learning_rate": 2.128630705394191e-05, + "loss": 0.4121, + "step": 692 + }, + { + "epoch": 2.871369294605809, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7449267506599426, + "eval_runtime": 28.5169, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.139, + "step": 692 + }, + { + "epoch": 2.87551867219917, + "grad_norm": 6.65625, + "learning_rate": 2.1244813278008298e-05, + "loss": 0.6328, + "step": 693 + }, + { + "epoch": 2.87551867219917, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7445539236068726, + "eval_runtime": 28.4384, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.145, + "step": 693 + }, + { + "epoch": 2.879668049792531, + "grad_norm": 19.875, + "learning_rate": 2.120331950207469e-05, + "loss": 1.2344, + "step": 694 + }, + { + "epoch": 2.879668049792531, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7438731789588928, + "eval_runtime": 28.4035, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 694 + }, + { + "epoch": 2.883817427385892, + "grad_norm": 8.0, + "learning_rate": 2.116182572614108e-05, + "loss": 0.957, + "step": 695 + }, + { + "epoch": 2.883817427385892, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.7415067553520203, + "eval_runtime": 28.4286, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.146, + "step": 695 + }, + { + "epoch": 2.887966804979253, + "grad_norm": 27.125, + "learning_rate": 2.112033195020747e-05, + "loss": 0.8672, + "step": 696 + }, + { + "epoch": 2.887966804979253, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7393186092376709, + "eval_runtime": 28.4769, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 696 + }, + { + "epoch": 2.892116182572614, + "grad_norm": 8.6875, + "learning_rate": 2.107883817427386e-05, + "loss": 0.5586, + "step": 697 + }, + { + "epoch": 2.892116182572614, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7377463579177856, + "eval_runtime": 28.5985, + "eval_samples_per_second": 16.854, + "eval_steps_per_second": 2.133, + "step": 697 + }, + { + "epoch": 2.896265560165975, + "grad_norm": 41.0, + "learning_rate": 2.103734439834025e-05, + "loss": 0.8672, + "step": 698 + }, + { + "epoch": 2.896265560165975, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7349423170089722, + "eval_runtime": 28.6494, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 698 + }, + { + "epoch": 2.900414937759336, + "grad_norm": 7.1875, + "learning_rate": 2.099585062240664e-05, + "loss": 0.6484, + "step": 699 + }, + { + "epoch": 2.900414937759336, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.73497474193573, + "eval_runtime": 28.6237, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 699 + }, + { + "epoch": 2.904564315352697, + "grad_norm": 18.625, + "learning_rate": 2.095435684647303e-05, + "loss": 0.3809, + "step": 700 + }, + { + "epoch": 2.904564315352697, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7333700656890869, + "eval_runtime": 28.6077, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 700 + }, + { + "epoch": 2.908713692946058, + "grad_norm": 6.5, + "learning_rate": 2.091286307053942e-05, + "loss": 0.3457, + "step": 701 + }, + { + "epoch": 2.908713692946058, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.7324624061584473, + "eval_runtime": 28.529, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.138, + "step": 701 + }, + { + "epoch": 2.912863070539419, + "grad_norm": 64.0, + "learning_rate": 2.087136929460581e-05, + "loss": 0.7344, + "step": 702 + }, + { + "epoch": 2.912863070539419, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.7315223217010498, + "eval_runtime": 28.5125, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 702 + }, + { + "epoch": 2.91701244813278, + "grad_norm": 6.6875, + "learning_rate": 2.0829875518672203e-05, + "loss": 0.3203, + "step": 703 + }, + { + "epoch": 2.91701244813278, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7326568961143494, + "eval_runtime": 28.6041, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.133, + "step": 703 + }, + { + "epoch": 2.921161825726141, + "grad_norm": 9.25, + "learning_rate": 2.078838174273859e-05, + "loss": 0.7617, + "step": 704 + }, + { + "epoch": 2.921161825726141, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.731911301612854, + "eval_runtime": 28.6179, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 704 + }, + { + "epoch": 2.9253112033195023, + "grad_norm": 27.125, + "learning_rate": 2.0746887966804982e-05, + "loss": 0.5547, + "step": 705 + }, + { + "epoch": 2.9253112033195023, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7315547466278076, + "eval_runtime": 28.6773, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.127, + "step": 705 + }, + { + "epoch": 2.9294605809128633, + "grad_norm": 24.5, + "learning_rate": 2.070539419087137e-05, + "loss": 1.0781, + "step": 706 + }, + { + "epoch": 2.9294605809128633, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.7315547466278076, + "eval_runtime": 28.6226, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 706 + }, + { + "epoch": 2.9336099585062243, + "grad_norm": 19.25, + "learning_rate": 2.0663900414937758e-05, + "loss": 0.9922, + "step": 707 + }, + { + "epoch": 2.9336099585062243, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.730679452419281, + "eval_runtime": 28.6182, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.132, + "step": 707 + }, + { + "epoch": 2.9377593360995853, + "grad_norm": 6.625, + "learning_rate": 2.062240663900415e-05, + "loss": 0.7891, + "step": 708 + }, + { + "epoch": 2.9377593360995853, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.729236900806427, + "eval_runtime": 28.5624, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 708 + }, + { + "epoch": 2.9419087136929463, + "grad_norm": 19.0, + "learning_rate": 2.0580912863070538e-05, + "loss": 0.6445, + "step": 709 + }, + { + "epoch": 2.9419087136929463, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7287830710411072, + "eval_runtime": 28.5287, + "eval_samples_per_second": 16.895, + "eval_steps_per_second": 2.138, + "step": 709 + }, + { + "epoch": 2.9460580912863072, + "grad_norm": 21.625, + "learning_rate": 2.053941908713693e-05, + "loss": 0.7109, + "step": 710 + }, + { + "epoch": 2.9460580912863072, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7285561561584473, + "eval_runtime": 28.4857, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 710 + }, + { + "epoch": 2.9502074688796682, + "grad_norm": 107.0, + "learning_rate": 2.049792531120332e-05, + "loss": 0.625, + "step": 711 + }, + { + "epoch": 2.9502074688796682, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7268542647361755, + "eval_runtime": 28.4586, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 711 + }, + { + "epoch": 2.954356846473029, + "grad_norm": 7.84375, + "learning_rate": 2.045643153526971e-05, + "loss": 0.459, + "step": 712 + }, + { + "epoch": 2.954356846473029, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7263355851173401, + "eval_runtime": 28.3866, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.149, + "step": 712 + }, + { + "epoch": 2.95850622406639, + "grad_norm": 10.25, + "learning_rate": 2.04149377593361e-05, + "loss": 0.957, + "step": 713 + }, + { + "epoch": 2.95850622406639, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7241312265396118, + "eval_runtime": 28.4218, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.146, + "step": 713 + }, + { + "epoch": 2.962655601659751, + "grad_norm": 8.4375, + "learning_rate": 2.0373443983402492e-05, + "loss": 0.8281, + "step": 714 + }, + { + "epoch": 2.962655601659751, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7210516333580017, + "eval_runtime": 28.476, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 714 + }, + { + "epoch": 2.966804979253112, + "grad_norm": 18.75, + "learning_rate": 2.033195020746888e-05, + "loss": 0.6406, + "step": 715 + }, + { + "epoch": 2.966804979253112, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7197225093841553, + "eval_runtime": 28.5229, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.139, + "step": 715 + }, + { + "epoch": 2.970954356846473, + "grad_norm": 35.25, + "learning_rate": 2.029045643153527e-05, + "loss": 0.8594, + "step": 716 + }, + { + "epoch": 2.970954356846473, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7171777486801147, + "eval_runtime": 28.6078, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 716 + }, + { + "epoch": 2.975103734439834, + "grad_norm": 9.25, + "learning_rate": 2.024896265560166e-05, + "loss": 0.5781, + "step": 717 + }, + { + "epoch": 2.975103734439834, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7160269618034363, + "eval_runtime": 28.6414, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 717 + }, + { + "epoch": 2.979253112033195, + "grad_norm": 12.75, + "learning_rate": 2.020746887966805e-05, + "loss": 1.3125, + "step": 718 + }, + { + "epoch": 2.979253112033195, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7152165174484253, + "eval_runtime": 28.6813, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.127, + "step": 718 + }, + { + "epoch": 2.983402489626556, + "grad_norm": 11.125, + "learning_rate": 2.0165975103734442e-05, + "loss": 0.9688, + "step": 719 + }, + { + "epoch": 2.983402489626556, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7137253880500793, + "eval_runtime": 28.6327, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 719 + }, + { + "epoch": 2.987551867219917, + "grad_norm": 33.0, + "learning_rate": 2.012448132780083e-05, + "loss": 0.5625, + "step": 720 + }, + { + "epoch": 2.987551867219917, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7116020321846008, + "eval_runtime": 28.6168, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 720 + }, + { + "epoch": 2.991701244813278, + "grad_norm": 5.84375, + "learning_rate": 2.0082987551867222e-05, + "loss": 0.3848, + "step": 721 + }, + { + "epoch": 2.991701244813278, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7104836702346802, + "eval_runtime": 28.5962, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 721 + }, + { + "epoch": 2.995850622406639, + "grad_norm": 11.1875, + "learning_rate": 2.0041493775933614e-05, + "loss": 0.543, + "step": 722 + }, + { + "epoch": 2.995850622406639, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7096732258796692, + "eval_runtime": 28.5817, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.134, + "step": 722 + }, + { + "epoch": 3.0, + "grad_norm": 13.9375, + "learning_rate": 2e-05, + "loss": 1.0156, + "step": 723 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7089114189147949, + "eval_runtime": 28.5963, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 723 + }, + { + "epoch": 3.004149377593361, + "grad_norm": 9.25, + "learning_rate": 1.9958506224066393e-05, + "loss": 0.7109, + "step": 724 + }, + { + "epoch": 3.004149377593361, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7082793116569519, + "eval_runtime": 28.6328, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 724 + }, + { + "epoch": 3.008298755186722, + "grad_norm": 5.21875, + "learning_rate": 1.991701244813278e-05, + "loss": 0.5469, + "step": 725 + }, + { + "epoch": 3.008298755186722, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7066746354103088, + "eval_runtime": 28.6852, + "eval_samples_per_second": 16.803, + "eval_steps_per_second": 2.127, + "step": 725 + }, + { + "epoch": 3.012448132780083, + "grad_norm": 40.25, + "learning_rate": 1.987551867219917e-05, + "loss": 0.6367, + "step": 726 + }, + { + "epoch": 3.012448132780083, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7059128880500793, + "eval_runtime": 28.6375, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 726 + }, + { + "epoch": 3.016597510373444, + "grad_norm": 17.5, + "learning_rate": 1.983402489626556e-05, + "loss": 0.5273, + "step": 727 + }, + { + "epoch": 3.016597510373444, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7061722278594971, + "eval_runtime": 28.6157, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 727 + }, + { + "epoch": 3.020746887966805, + "grad_norm": 7.03125, + "learning_rate": 1.979253112033195e-05, + "loss": 0.5625, + "step": 728 + }, + { + "epoch": 3.020746887966805, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.7047944664955139, + "eval_runtime": 28.5409, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 728 + }, + { + "epoch": 3.024896265560166, + "grad_norm": 11.375, + "learning_rate": 1.975103734439834e-05, + "loss": 1.2031, + "step": 729 + }, + { + "epoch": 3.024896265560166, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7060587406158447, + "eval_runtime": 28.4716, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 729 + }, + { + "epoch": 3.029045643153527, + "grad_norm": 13.625, + "learning_rate": 1.9709543568464732e-05, + "loss": 0.4219, + "step": 730 + }, + { + "epoch": 3.029045643153527, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7053131461143494, + "eval_runtime": 28.4428, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.145, + "step": 730 + }, + { + "epoch": 3.033195020746888, + "grad_norm": 16.125, + "learning_rate": 1.966804979253112e-05, + "loss": 0.3516, + "step": 731 + }, + { + "epoch": 3.033195020746888, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7060587406158447, + "eval_runtime": 28.4087, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.147, + "step": 731 + }, + { + "epoch": 3.037344398340249, + "grad_norm": 9.1875, + "learning_rate": 1.962655601659751e-05, + "loss": 0.6055, + "step": 732 + }, + { + "epoch": 3.037344398340249, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7064639329910278, + "eval_runtime": 28.4964, + "eval_samples_per_second": 16.914, + "eval_steps_per_second": 2.141, + "step": 732 + }, + { + "epoch": 3.04149377593361, + "grad_norm": 24.875, + "learning_rate": 1.95850622406639e-05, + "loss": 0.6055, + "step": 733 + }, + { + "epoch": 3.04149377593361, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7063667178153992, + "eval_runtime": 28.569, + "eval_samples_per_second": 16.871, + "eval_steps_per_second": 2.135, + "step": 733 + }, + { + "epoch": 3.045643153526971, + "grad_norm": 8.6875, + "learning_rate": 1.954356846473029e-05, + "loss": 0.6055, + "step": 734 + }, + { + "epoch": 3.045643153526971, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7053617835044861, + "eval_runtime": 28.6581, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.129, + "step": 734 + }, + { + "epoch": 3.0497925311203318, + "grad_norm": 10.0625, + "learning_rate": 1.9502074688796682e-05, + "loss": 0.6602, + "step": 735 + }, + { + "epoch": 3.0497925311203318, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7059128880500793, + "eval_runtime": 28.6196, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 735 + }, + { + "epoch": 3.0539419087136928, + "grad_norm": 5.375, + "learning_rate": 1.946058091286307e-05, + "loss": 0.6289, + "step": 736 + }, + { + "epoch": 3.0539419087136928, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7056049108505249, + "eval_runtime": 28.6269, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 736 + }, + { + "epoch": 3.0580912863070537, + "grad_norm": 13.9375, + "learning_rate": 1.9419087136929462e-05, + "loss": 0.8906, + "step": 737 + }, + { + "epoch": 3.0580912863070537, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7052969336509705, + "eval_runtime": 28.6814, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.127, + "step": 737 + }, + { + "epoch": 3.0622406639004147, + "grad_norm": 15.6875, + "learning_rate": 1.9377593360995853e-05, + "loss": 1.2734, + "step": 738 + }, + { + "epoch": 3.0622406639004147, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7048268914222717, + "eval_runtime": 28.6062, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 738 + }, + { + "epoch": 3.066390041493776, + "grad_norm": 25.75, + "learning_rate": 1.933609958506224e-05, + "loss": 0.7109, + "step": 739 + }, + { + "epoch": 3.066390041493776, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7048593163490295, + "eval_runtime": 28.632, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 739 + }, + { + "epoch": 3.070539419087137, + "grad_norm": 13.1875, + "learning_rate": 1.9294605809128633e-05, + "loss": 0.8906, + "step": 740 + }, + { + "epoch": 3.070539419087137, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7044702768325806, + "eval_runtime": 28.5632, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 740 + }, + { + "epoch": 3.074688796680498, + "grad_norm": 23.25, + "learning_rate": 1.925311203319502e-05, + "loss": 0.75, + "step": 741 + }, + { + "epoch": 3.074688796680498, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7041785717010498, + "eval_runtime": 28.5884, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 741 + }, + { + "epoch": 3.078838174273859, + "grad_norm": 7.25, + "learning_rate": 1.9211618257261413e-05, + "loss": 0.7773, + "step": 742 + }, + { + "epoch": 3.078838174273859, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7022335529327393, + "eval_runtime": 28.5767, + "eval_samples_per_second": 16.867, + "eval_steps_per_second": 2.135, + "step": 742 + }, + { + "epoch": 3.08298755186722, + "grad_norm": 16.375, + "learning_rate": 1.9170124481327804e-05, + "loss": 0.75, + "step": 743 + }, + { + "epoch": 3.08298755186722, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7026549577713013, + "eval_runtime": 28.5244, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.139, + "step": 743 + }, + { + "epoch": 3.087136929460581, + "grad_norm": 6.25, + "learning_rate": 1.912863070539419e-05, + "loss": 0.7461, + "step": 744 + }, + { + "epoch": 3.087136929460581, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7009854912757874, + "eval_runtime": 28.5221, + "eval_samples_per_second": 16.899, + "eval_steps_per_second": 2.139, + "step": 744 + }, + { + "epoch": 3.091286307053942, + "grad_norm": 5.78125, + "learning_rate": 1.908713692946058e-05, + "loss": 0.5938, + "step": 745 + }, + { + "epoch": 3.091286307053942, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7012124061584473, + "eval_runtime": 28.5676, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 745 + }, + { + "epoch": 3.095435684647303, + "grad_norm": 9.625, + "learning_rate": 1.9045643153526972e-05, + "loss": 0.5625, + "step": 746 + }, + { + "epoch": 3.095435684647303, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7018120884895325, + "eval_runtime": 28.5162, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.139, + "step": 746 + }, + { + "epoch": 3.099585062240664, + "grad_norm": 16.125, + "learning_rate": 1.900414937759336e-05, + "loss": 1.0391, + "step": 747 + }, + { + "epoch": 3.099585062240664, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7007423639297485, + "eval_runtime": 28.5412, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 747 + }, + { + "epoch": 3.103734439834025, + "grad_norm": 18.5, + "learning_rate": 1.896265560165975e-05, + "loss": 0.7773, + "step": 748 + }, + { + "epoch": 3.103734439834025, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7001264095306396, + "eval_runtime": 28.513, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 748 + }, + { + "epoch": 3.107883817427386, + "grad_norm": 5.625, + "learning_rate": 1.8921161825726143e-05, + "loss": 0.5234, + "step": 749 + }, + { + "epoch": 3.107883817427386, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6992673873901367, + "eval_runtime": 28.5542, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 749 + }, + { + "epoch": 3.112033195020747, + "grad_norm": 6.90625, + "learning_rate": 1.887966804979253e-05, + "loss": 0.2334, + "step": 750 + }, + { + "epoch": 3.112033195020747, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6993970274925232, + "eval_runtime": 28.5109, + "eval_samples_per_second": 16.906, + "eval_steps_per_second": 2.14, + "step": 750 + }, + { + "epoch": 3.116182572614108, + "grad_norm": 6.625, + "learning_rate": 1.8838174273858922e-05, + "loss": 0.5195, + "step": 751 + }, + { + "epoch": 3.116182572614108, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6995591521263123, + "eval_runtime": 28.4699, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.143, + "step": 751 + }, + { + "epoch": 3.120331950207469, + "grad_norm": 8.0, + "learning_rate": 1.879668049792531e-05, + "loss": 0.5586, + "step": 752 + }, + { + "epoch": 3.120331950207469, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6975817084312439, + "eval_runtime": 28.4154, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 752 + }, + { + "epoch": 3.12448132780083, + "grad_norm": 7.40625, + "learning_rate": 1.8755186721991702e-05, + "loss": 0.6719, + "step": 753 + }, + { + "epoch": 3.12448132780083, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.6981651782989502, + "eval_runtime": 28.5556, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 753 + }, + { + "epoch": 3.128630705394191, + "grad_norm": 6.03125, + "learning_rate": 1.8713692946058093e-05, + "loss": 0.7031, + "step": 754 + }, + { + "epoch": 3.128630705394191, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6979706883430481, + "eval_runtime": 28.6535, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 754 + }, + { + "epoch": 3.132780082987552, + "grad_norm": 7.78125, + "learning_rate": 1.867219917012448e-05, + "loss": 0.5859, + "step": 755 + }, + { + "epoch": 3.132780082987552, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.6974357962608337, + "eval_runtime": 28.6248, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 755 + }, + { + "epoch": 3.136929460580913, + "grad_norm": 23.375, + "learning_rate": 1.8630705394190873e-05, + "loss": 0.9453, + "step": 756 + }, + { + "epoch": 3.136929460580913, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.6979058384895325, + "eval_runtime": 28.6196, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 756 + }, + { + "epoch": 3.141078838174274, + "grad_norm": 344.0, + "learning_rate": 1.8589211618257264e-05, + "loss": 0.5664, + "step": 757 + }, + { + "epoch": 3.141078838174274, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6974844336509705, + "eval_runtime": 28.6745, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.127, + "step": 757 + }, + { + "epoch": 3.145228215767635, + "grad_norm": 10.125, + "learning_rate": 1.8547717842323653e-05, + "loss": 0.5312, + "step": 758 + }, + { + "epoch": 3.145228215767635, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6969495415687561, + "eval_runtime": 28.6565, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.129, + "step": 758 + }, + { + "epoch": 3.1493775933609958, + "grad_norm": 15.625, + "learning_rate": 1.8506224066390044e-05, + "loss": 0.7891, + "step": 759 + }, + { + "epoch": 3.1493775933609958, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6954745650291443, + "eval_runtime": 28.5827, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 759 + }, + { + "epoch": 3.1535269709543567, + "grad_norm": 11.0, + "learning_rate": 1.8464730290456432e-05, + "loss": 0.6602, + "step": 760 + }, + { + "epoch": 3.1535269709543567, + "eval_accuracy": 0.5643153526970954, + "eval_loss": 0.696544349193573, + "eval_runtime": 28.4601, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 760 + }, + { + "epoch": 3.1576763485477177, + "grad_norm": 13.4375, + "learning_rate": 1.8423236514522824e-05, + "loss": 0.6992, + "step": 761 + }, + { + "epoch": 3.1576763485477177, + "eval_accuracy": 0.5622406639004149, + "eval_loss": 0.6963498592376709, + "eval_runtime": 28.4804, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 761 + }, + { + "epoch": 3.1618257261410787, + "grad_norm": 7.71875, + "learning_rate": 1.8381742738589215e-05, + "loss": 0.7461, + "step": 762 + }, + { + "epoch": 3.1618257261410787, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.69591224193573, + "eval_runtime": 28.4018, + "eval_samples_per_second": 16.971, + "eval_steps_per_second": 2.148, + "step": 762 + }, + { + "epoch": 3.1659751037344397, + "grad_norm": 11.75, + "learning_rate": 1.83402489626556e-05, + "loss": 0.7773, + "step": 763 + }, + { + "epoch": 3.1659751037344397, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6966577768325806, + "eval_runtime": 28.3848, + "eval_samples_per_second": 16.981, + "eval_steps_per_second": 2.149, + "step": 763 + }, + { + "epoch": 3.1701244813278007, + "grad_norm": 4.375, + "learning_rate": 1.829875518672199e-05, + "loss": 0.4395, + "step": 764 + }, + { + "epoch": 3.1701244813278007, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6969171166419983, + "eval_runtime": 28.3714, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.15, + "step": 764 + }, + { + "epoch": 3.1742738589211617, + "grad_norm": 6.625, + "learning_rate": 1.8257261410788383e-05, + "loss": 0.543, + "step": 765 + }, + { + "epoch": 3.1742738589211617, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.6978410482406616, + "eval_runtime": 28.4763, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 765 + }, + { + "epoch": 3.1784232365145226, + "grad_norm": 12.1875, + "learning_rate": 1.821576763485477e-05, + "loss": 0.7344, + "step": 766 + }, + { + "epoch": 3.1784232365145226, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6976141333580017, + "eval_runtime": 28.551, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.137, + "step": 766 + }, + { + "epoch": 3.1825726141078836, + "grad_norm": 193.0, + "learning_rate": 1.8174273858921162e-05, + "loss": 0.8555, + "step": 767 + }, + { + "epoch": 3.1825726141078836, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6984569430351257, + "eval_runtime": 28.6607, + "eval_samples_per_second": 16.817, + "eval_steps_per_second": 2.128, + "step": 767 + }, + { + "epoch": 3.186721991701245, + "grad_norm": 27.125, + "learning_rate": 1.8132780082987554e-05, + "loss": 0.7305, + "step": 768 + }, + { + "epoch": 3.186721991701245, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.6981489658355713, + "eval_runtime": 28.6446, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.13, + "step": 768 + }, + { + "epoch": 3.190871369294606, + "grad_norm": 9.1875, + "learning_rate": 1.8091286307053942e-05, + "loss": 0.8633, + "step": 769 + }, + { + "epoch": 3.190871369294606, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6975817084312439, + "eval_runtime": 28.675, + "eval_samples_per_second": 16.809, + "eval_steps_per_second": 2.127, + "step": 769 + }, + { + "epoch": 3.195020746887967, + "grad_norm": 35.5, + "learning_rate": 1.8049792531120333e-05, + "loss": 0.6133, + "step": 770 + }, + { + "epoch": 3.195020746887967, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6972088813781738, + "eval_runtime": 28.6196, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 770 + }, + { + "epoch": 3.199170124481328, + "grad_norm": 5.0625, + "learning_rate": 1.800829875518672e-05, + "loss": 0.4531, + "step": 771 + }, + { + "epoch": 3.199170124481328, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6976464986801147, + "eval_runtime": 28.6093, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 771 + }, + { + "epoch": 3.203319502074689, + "grad_norm": 6.375, + "learning_rate": 1.7966804979253113e-05, + "loss": 0.6914, + "step": 772 + }, + { + "epoch": 3.203319502074689, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.6967874765396118, + "eval_runtime": 28.5249, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.138, + "step": 772 + }, + { + "epoch": 3.20746887966805, + "grad_norm": 10.125, + "learning_rate": 1.7925311203319504e-05, + "loss": 1.125, + "step": 773 + }, + { + "epoch": 3.20746887966805, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6964794993400574, + "eval_runtime": 28.4628, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.143, + "step": 773 + }, + { + "epoch": 3.211618257261411, + "grad_norm": 7.25, + "learning_rate": 1.7883817427385893e-05, + "loss": 0.5469, + "step": 774 + }, + { + "epoch": 3.211618257261411, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6965929865837097, + "eval_runtime": 28.4233, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.146, + "step": 774 + }, + { + "epoch": 3.215767634854772, + "grad_norm": 41.25, + "learning_rate": 1.7842323651452284e-05, + "loss": 0.8789, + "step": 775 + }, + { + "epoch": 3.215767634854772, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6973061561584473, + "eval_runtime": 28.4053, + "eval_samples_per_second": 16.969, + "eval_steps_per_second": 2.147, + "step": 775 + }, + { + "epoch": 3.219917012448133, + "grad_norm": 21.25, + "learning_rate": 1.7800829875518672e-05, + "loss": 0.6836, + "step": 776 + }, + { + "epoch": 3.219917012448133, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6966416239738464, + "eval_runtime": 28.4547, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.144, + "step": 776 + }, + { + "epoch": 3.224066390041494, + "grad_norm": 8.0625, + "learning_rate": 1.7759336099585064e-05, + "loss": 0.75, + "step": 777 + }, + { + "epoch": 3.224066390041494, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.6960743069648743, + "eval_runtime": 28.575, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.135, + "step": 777 + }, + { + "epoch": 3.228215767634855, + "grad_norm": 9.3125, + "learning_rate": 1.7717842323651455e-05, + "loss": 0.4746, + "step": 778 + }, + { + "epoch": 3.228215767634855, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6966254115104675, + "eval_runtime": 28.629, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.131, + "step": 778 + }, + { + "epoch": 3.232365145228216, + "grad_norm": 5.25, + "learning_rate": 1.7676348547717843e-05, + "loss": 0.5938, + "step": 779 + }, + { + "epoch": 3.232365145228216, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.6967226266860962, + "eval_runtime": 28.6651, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 779 + }, + { + "epoch": 3.236514522821577, + "grad_norm": 9.3125, + "learning_rate": 1.7634854771784235e-05, + "loss": 0.6602, + "step": 780 + }, + { + "epoch": 3.236514522821577, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6958636045455933, + "eval_runtime": 28.6459, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 780 + }, + { + "epoch": 3.240663900414938, + "grad_norm": 4.75, + "learning_rate": 1.7593360995850626e-05, + "loss": 0.3828, + "step": 781 + }, + { + "epoch": 3.240663900414938, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6961877346038818, + "eval_runtime": 28.6875, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.126, + "step": 781 + }, + { + "epoch": 3.2448132780082988, + "grad_norm": 22.0, + "learning_rate": 1.755186721991701e-05, + "loss": 0.4395, + "step": 782 + }, + { + "epoch": 3.2448132780082988, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6977599859237671, + "eval_runtime": 28.6804, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.127, + "step": 782 + }, + { + "epoch": 3.2489626556016598, + "grad_norm": 32.5, + "learning_rate": 1.7510373443983402e-05, + "loss": 0.9766, + "step": 783 + }, + { + "epoch": 3.2489626556016598, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6978410482406616, + "eval_runtime": 28.6158, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 783 + }, + { + "epoch": 3.2531120331950207, + "grad_norm": 8.875, + "learning_rate": 1.7468879668049794e-05, + "loss": 0.7148, + "step": 784 + }, + { + "epoch": 3.2531120331950207, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6987000703811646, + "eval_runtime": 28.4948, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 784 + }, + { + "epoch": 3.2572614107883817, + "grad_norm": 7.6875, + "learning_rate": 1.7427385892116182e-05, + "loss": 0.4785, + "step": 785 + }, + { + "epoch": 3.2572614107883817, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6988135576248169, + "eval_runtime": 28.5063, + "eval_samples_per_second": 16.909, + "eval_steps_per_second": 2.14, + "step": 785 + }, + { + "epoch": 3.2614107883817427, + "grad_norm": 4.71875, + "learning_rate": 1.7385892116182573e-05, + "loss": 0.4844, + "step": 786 + }, + { + "epoch": 3.2614107883817427, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.6995915174484253, + "eval_runtime": 28.408, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.147, + "step": 786 + }, + { + "epoch": 3.2655601659751037, + "grad_norm": 9.75, + "learning_rate": 1.734439834024896e-05, + "loss": 0.5039, + "step": 787 + }, + { + "epoch": 3.2655601659751037, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7007910013198853, + "eval_runtime": 28.4509, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.144, + "step": 787 + }, + { + "epoch": 3.2697095435684647, + "grad_norm": 9.0, + "learning_rate": 1.7302904564315353e-05, + "loss": 0.6055, + "step": 788 + }, + { + "epoch": 3.2697095435684647, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.702492892742157, + "eval_runtime": 28.6264, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 788 + }, + { + "epoch": 3.2738589211618256, + "grad_norm": 4.15625, + "learning_rate": 1.7261410788381744e-05, + "loss": 0.3496, + "step": 789 + }, + { + "epoch": 3.2738589211618256, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7033519148826599, + "eval_runtime": 28.6265, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 789 + }, + { + "epoch": 3.2780082987551866, + "grad_norm": 4.9375, + "learning_rate": 1.7219917012448132e-05, + "loss": 0.4473, + "step": 790 + }, + { + "epoch": 3.2780082987551866, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.704988956451416, + "eval_runtime": 28.6346, + "eval_samples_per_second": 16.833, + "eval_steps_per_second": 2.13, + "step": 790 + }, + { + "epoch": 3.2821576763485476, + "grad_norm": 7.46875, + "learning_rate": 1.7178423236514524e-05, + "loss": 0.4824, + "step": 791 + }, + { + "epoch": 3.2821576763485476, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7060911655426025, + "eval_runtime": 28.6356, + "eval_samples_per_second": 16.832, + "eval_steps_per_second": 2.13, + "step": 791 + }, + { + "epoch": 3.2863070539419086, + "grad_norm": 15.25, + "learning_rate": 1.7136929460580915e-05, + "loss": 1.2109, + "step": 792 + }, + { + "epoch": 3.2863070539419086, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7060749530792236, + "eval_runtime": 28.6238, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 792 + }, + { + "epoch": 3.2904564315352696, + "grad_norm": 19.25, + "learning_rate": 1.7095435684647304e-05, + "loss": 0.8203, + "step": 793 + }, + { + "epoch": 3.2904564315352696, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7065287828445435, + "eval_runtime": 28.5957, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 793 + }, + { + "epoch": 3.2946058091286305, + "grad_norm": 22.125, + "learning_rate": 1.7053941908713695e-05, + "loss": 0.8906, + "step": 794 + }, + { + "epoch": 3.2946058091286305, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7070474624633789, + "eval_runtime": 28.5272, + "eval_samples_per_second": 16.896, + "eval_steps_per_second": 2.138, + "step": 794 + }, + { + "epoch": 3.2987551867219915, + "grad_norm": 5.75, + "learning_rate": 1.7012448132780083e-05, + "loss": 0.4277, + "step": 795 + }, + { + "epoch": 3.2987551867219915, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.70703125, + "eval_runtime": 28.4726, + "eval_samples_per_second": 16.929, + "eval_steps_per_second": 2.142, + "step": 795 + }, + { + "epoch": 3.3029045643153525, + "grad_norm": 8.0, + "learning_rate": 1.6970954356846475e-05, + "loss": 0.6328, + "step": 796 + }, + { + "epoch": 3.3029045643153525, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7058804631233215, + "eval_runtime": 28.4392, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.145, + "step": 796 + }, + { + "epoch": 3.3070539419087135, + "grad_norm": 7.3125, + "learning_rate": 1.6929460580912866e-05, + "loss": 0.6484, + "step": 797 + }, + { + "epoch": 3.3070539419087135, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7054428458213806, + "eval_runtime": 28.5003, + "eval_samples_per_second": 16.912, + "eval_steps_per_second": 2.14, + "step": 797 + }, + { + "epoch": 3.3112033195020745, + "grad_norm": 5.4375, + "learning_rate": 1.6887966804979254e-05, + "loss": 0.4512, + "step": 798 + }, + { + "epoch": 3.3112033195020745, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7060263156890869, + "eval_runtime": 28.4741, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 798 + }, + { + "epoch": 3.3153526970954355, + "grad_norm": 5.4375, + "learning_rate": 1.6846473029045646e-05, + "loss": 0.7148, + "step": 799 + }, + { + "epoch": 3.3153526970954355, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7041785717010498, + "eval_runtime": 28.627, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 799 + }, + { + "epoch": 3.3195020746887964, + "grad_norm": 8.1875, + "learning_rate": 1.6804979253112034e-05, + "loss": 0.9766, + "step": 800 + }, + { + "epoch": 3.3195020746887964, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7051835060119629, + "eval_runtime": 28.6055, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 800 + }, + { + "epoch": 3.323651452282158, + "grad_norm": 17.75, + "learning_rate": 1.6763485477178422e-05, + "loss": 1.1875, + "step": 801 + }, + { + "epoch": 3.323651452282158, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7041785717010498, + "eval_runtime": 28.5334, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 801 + }, + { + "epoch": 3.327800829875519, + "grad_norm": 5.09375, + "learning_rate": 1.6721991701244813e-05, + "loss": 0.4941, + "step": 802 + }, + { + "epoch": 3.327800829875519, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.704567551612854, + "eval_runtime": 28.5414, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 802 + }, + { + "epoch": 3.33195020746888, + "grad_norm": 12.25, + "learning_rate": 1.6680497925311205e-05, + "loss": 0.8008, + "step": 803 + }, + { + "epoch": 3.33195020746888, + "eval_accuracy": 0.5663900414937759, + "eval_loss": 0.705588698387146, + "eval_runtime": 28.6068, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 803 + }, + { + "epoch": 3.336099585062241, + "grad_norm": 7.09375, + "learning_rate": 1.6639004149377593e-05, + "loss": 0.5977, + "step": 804 + }, + { + "epoch": 3.336099585062241, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7033843398094177, + "eval_runtime": 28.6202, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 804 + }, + { + "epoch": 3.340248962655602, + "grad_norm": 11.9375, + "learning_rate": 1.6597510373443984e-05, + "loss": 0.5547, + "step": 805 + }, + { + "epoch": 3.340248962655602, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7040650844573975, + "eval_runtime": 28.6637, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.128, + "step": 805 + }, + { + "epoch": 3.3443983402489628, + "grad_norm": 6.0625, + "learning_rate": 1.6556016597510372e-05, + "loss": 0.5469, + "step": 806 + }, + { + "epoch": 3.3443983402489628, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7047458291053772, + "eval_runtime": 28.6577, + "eval_samples_per_second": 16.819, + "eval_steps_per_second": 2.129, + "step": 806 + }, + { + "epoch": 3.3485477178423237, + "grad_norm": 6.65625, + "learning_rate": 1.6514522821576764e-05, + "loss": 0.707, + "step": 807 + }, + { + "epoch": 3.3485477178423237, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7044702768325806, + "eval_runtime": 28.6384, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 807 + }, + { + "epoch": 3.3526970954356847, + "grad_norm": 7.28125, + "learning_rate": 1.6473029045643155e-05, + "loss": 0.5547, + "step": 808 + }, + { + "epoch": 3.3526970954356847, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7041623592376709, + "eval_runtime": 28.5612, + "eval_samples_per_second": 16.876, + "eval_steps_per_second": 2.136, + "step": 808 + }, + { + "epoch": 3.3568464730290457, + "grad_norm": 5.40625, + "learning_rate": 1.6431535269709543e-05, + "loss": 0.5312, + "step": 809 + }, + { + "epoch": 3.3568464730290457, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7042433619499207, + "eval_runtime": 28.4516, + "eval_samples_per_second": 16.941, + "eval_steps_per_second": 2.144, + "step": 809 + }, + { + "epoch": 3.3609958506224067, + "grad_norm": 9.8125, + "learning_rate": 1.6390041493775935e-05, + "loss": 0.8672, + "step": 810 + }, + { + "epoch": 3.3609958506224067, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7046324014663696, + "eval_runtime": 28.4067, + "eval_samples_per_second": 16.968, + "eval_steps_per_second": 2.147, + "step": 810 + }, + { + "epoch": 3.3651452282157677, + "grad_norm": 5.78125, + "learning_rate": 1.6348547717842323e-05, + "loss": 0.4336, + "step": 811 + }, + { + "epoch": 3.3651452282157677, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7057669758796692, + "eval_runtime": 28.3799, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 811 + }, + { + "epoch": 3.3692946058091287, + "grad_norm": 8.0625, + "learning_rate": 1.6307053941908715e-05, + "loss": 0.9023, + "step": 812 + }, + { + "epoch": 3.3692946058091287, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7045999765396118, + "eval_runtime": 28.4767, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 812 + }, + { + "epoch": 3.3734439834024896, + "grad_norm": 28.625, + "learning_rate": 1.6265560165975106e-05, + "loss": 1.1953, + "step": 813 + }, + { + "epoch": 3.3734439834024896, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7045999765396118, + "eval_runtime": 28.5544, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 813 + }, + { + "epoch": 3.3775933609958506, + "grad_norm": 5.8125, + "learning_rate": 1.6224066390041494e-05, + "loss": 0.6367, + "step": 814 + }, + { + "epoch": 3.3775933609958506, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7030115127563477, + "eval_runtime": 28.6051, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 814 + }, + { + "epoch": 3.3817427385892116, + "grad_norm": 4.625, + "learning_rate": 1.6182572614107886e-05, + "loss": 0.7812, + "step": 815 + }, + { + "epoch": 3.3817427385892116, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7031898498535156, + "eval_runtime": 28.6278, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 815 + }, + { + "epoch": 3.3858921161825726, + "grad_norm": 5.03125, + "learning_rate": 1.6141078838174277e-05, + "loss": 0.6328, + "step": 816 + }, + { + "epoch": 3.3858921161825726, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7024442553520203, + "eval_runtime": 28.6164, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 816 + }, + { + "epoch": 3.3900414937759336, + "grad_norm": 8.375, + "learning_rate": 1.6099585062240665e-05, + "loss": 0.6094, + "step": 817 + }, + { + "epoch": 3.3900414937759336, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7024442553520203, + "eval_runtime": 28.6604, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 817 + }, + { + "epoch": 3.3941908713692945, + "grad_norm": 8.3125, + "learning_rate": 1.6058091286307057e-05, + "loss": 0.625, + "step": 818 + }, + { + "epoch": 3.3941908713692945, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7014231085777283, + "eval_runtime": 28.5591, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 818 + }, + { + "epoch": 3.3983402489626555, + "grad_norm": 12.8125, + "learning_rate": 1.6016597510373445e-05, + "loss": 0.6562, + "step": 819 + }, + { + "epoch": 3.3983402489626555, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7013906836509705, + "eval_runtime": 28.4853, + "eval_samples_per_second": 16.921, + "eval_steps_per_second": 2.141, + "step": 819 + }, + { + "epoch": 3.4024896265560165, + "grad_norm": 5.0, + "learning_rate": 1.5975103734439833e-05, + "loss": 0.4355, + "step": 820 + }, + { + "epoch": 3.4024896265560165, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7001912593841553, + "eval_runtime": 28.4306, + "eval_samples_per_second": 16.954, + "eval_steps_per_second": 2.146, + "step": 820 + }, + { + "epoch": 3.4066390041493775, + "grad_norm": 16.625, + "learning_rate": 1.5933609958506224e-05, + "loss": 0.5312, + "step": 821 + }, + { + "epoch": 3.4066390041493775, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7009368538856506, + "eval_runtime": 28.4037, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 821 + }, + { + "epoch": 3.4107883817427385, + "grad_norm": 8.9375, + "learning_rate": 1.5892116182572612e-05, + "loss": 0.6875, + "step": 822 + }, + { + "epoch": 3.4107883817427385, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7011151313781738, + "eval_runtime": 28.4396, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.145, + "step": 822 + }, + { + "epoch": 3.4149377593360994, + "grad_norm": 6.0, + "learning_rate": 1.5850622406639004e-05, + "loss": 0.6914, + "step": 823 + }, + { + "epoch": 3.4149377593360994, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6996239423751831, + "eval_runtime": 28.3741, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.15, + "step": 823 + }, + { + "epoch": 3.4190871369294604, + "grad_norm": 23.125, + "learning_rate": 1.5809128630705395e-05, + "loss": 0.3555, + "step": 824 + }, + { + "epoch": 3.4190871369294604, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7007585763931274, + "eval_runtime": 28.3598, + "eval_samples_per_second": 16.996, + "eval_steps_per_second": 2.151, + "step": 824 + }, + { + "epoch": 3.4232365145228214, + "grad_norm": 13.1875, + "learning_rate": 1.5767634854771783e-05, + "loss": 0.5391, + "step": 825 + }, + { + "epoch": 3.4232365145228214, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.6999805569648743, + "eval_runtime": 28.3975, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.148, + "step": 825 + }, + { + "epoch": 3.4273858921161824, + "grad_norm": 15.625, + "learning_rate": 1.5726141078838175e-05, + "loss": 0.6289, + "step": 826 + }, + { + "epoch": 3.4273858921161824, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7004992365837097, + "eval_runtime": 28.5329, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 826 + }, + { + "epoch": 3.431535269709544, + "grad_norm": 7.9375, + "learning_rate": 1.5684647302904566e-05, + "loss": 0.543, + "step": 827 + }, + { + "epoch": 3.431535269709544, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7000777721405029, + "eval_runtime": 28.5943, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 827 + }, + { + "epoch": 3.435684647302905, + "grad_norm": 6.5, + "learning_rate": 1.5643153526970954e-05, + "loss": 0.3555, + "step": 828 + }, + { + "epoch": 3.435684647302905, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7005640268325806, + "eval_runtime": 28.6597, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 828 + }, + { + "epoch": 3.4398340248962658, + "grad_norm": 29.25, + "learning_rate": 1.5601659751037346e-05, + "loss": 0.8242, + "step": 829 + }, + { + "epoch": 3.4398340248962658, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6996725797653198, + "eval_runtime": 28.6144, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 829 + }, + { + "epoch": 3.4439834024896268, + "grad_norm": 6.0, + "learning_rate": 1.5560165975103734e-05, + "loss": 0.6172, + "step": 830 + }, + { + "epoch": 3.4439834024896268, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.6999967694282532, + "eval_runtime": 28.6114, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 830 + }, + { + "epoch": 3.4481327800829877, + "grad_norm": 9.375, + "learning_rate": 1.5518672199170126e-05, + "loss": 0.8008, + "step": 831 + }, + { + "epoch": 3.4481327800829877, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7008233666419983, + "eval_runtime": 28.6155, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 831 + }, + { + "epoch": 3.4522821576763487, + "grad_norm": 7.28125, + "learning_rate": 1.5477178423236517e-05, + "loss": 0.6211, + "step": 832 + }, + { + "epoch": 3.4522821576763487, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7004343867301941, + "eval_runtime": 28.5643, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.136, + "step": 832 + }, + { + "epoch": 3.4564315352697097, + "grad_norm": 56.0, + "learning_rate": 1.5435684647302905e-05, + "loss": 0.6875, + "step": 833 + }, + { + "epoch": 3.4564315352697097, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.700239896774292, + "eval_runtime": 28.4752, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 833 + }, + { + "epoch": 3.4605809128630707, + "grad_norm": 8.0625, + "learning_rate": 1.5394190871369297e-05, + "loss": 0.3613, + "step": 834 + }, + { + "epoch": 3.4605809128630707, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7007585763931274, + "eval_runtime": 28.4209, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.146, + "step": 834 + }, + { + "epoch": 3.4647302904564317, + "grad_norm": 17.25, + "learning_rate": 1.5352697095435685e-05, + "loss": 0.957, + "step": 835 + }, + { + "epoch": 3.4647302904564317, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7008882164955139, + "eval_runtime": 28.448, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 835 + }, + { + "epoch": 3.4688796680497926, + "grad_norm": 12.8125, + "learning_rate": 1.5311203319502076e-05, + "loss": 0.373, + "step": 836 + }, + { + "epoch": 3.4688796680497926, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7008557915687561, + "eval_runtime": 28.6002, + "eval_samples_per_second": 16.853, + "eval_steps_per_second": 2.133, + "step": 836 + }, + { + "epoch": 3.4730290456431536, + "grad_norm": 6.875, + "learning_rate": 1.5269709543568468e-05, + "loss": 0.459, + "step": 837 + }, + { + "epoch": 3.4730290456431536, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7007910013198853, + "eval_runtime": 28.6526, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 837 + }, + { + "epoch": 3.4771784232365146, + "grad_norm": 36.0, + "learning_rate": 1.5228215767634854e-05, + "loss": 0.5312, + "step": 838 + }, + { + "epoch": 3.4771784232365146, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7009854912757874, + "eval_runtime": 28.6739, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.127, + "step": 838 + }, + { + "epoch": 3.4813278008298756, + "grad_norm": 15.875, + "learning_rate": 1.5186721991701244e-05, + "loss": 0.6172, + "step": 839 + }, + { + "epoch": 3.4813278008298756, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7022821307182312, + "eval_runtime": 28.6054, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 839 + }, + { + "epoch": 3.4854771784232366, + "grad_norm": 5.5625, + "learning_rate": 1.5145228215767635e-05, + "loss": 0.4082, + "step": 840 + }, + { + "epoch": 3.4854771784232366, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7032384872436523, + "eval_runtime": 28.5959, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 840 + }, + { + "epoch": 3.4896265560165975, + "grad_norm": 26.75, + "learning_rate": 1.5103734439834025e-05, + "loss": 0.832, + "step": 841 + }, + { + "epoch": 3.4896265560165975, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7038705945014954, + "eval_runtime": 28.5681, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 841 + }, + { + "epoch": 3.4937759336099585, + "grad_norm": 5.03125, + "learning_rate": 1.5062240663900415e-05, + "loss": 0.5391, + "step": 842 + }, + { + "epoch": 3.4937759336099585, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7038705945014954, + "eval_runtime": 28.5529, + "eval_samples_per_second": 16.881, + "eval_steps_per_second": 2.136, + "step": 842 + }, + { + "epoch": 3.4979253112033195, + "grad_norm": 9.6875, + "learning_rate": 1.5020746887966805e-05, + "loss": 0.7227, + "step": 843 + }, + { + "epoch": 3.4979253112033195, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7041785717010498, + "eval_runtime": 28.4531, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.144, + "step": 843 + }, + { + "epoch": 3.5020746887966805, + "grad_norm": 17.5, + "learning_rate": 1.4979253112033194e-05, + "loss": 0.6094, + "step": 844 + }, + { + "epoch": 3.5020746887966805, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7034816145896912, + "eval_runtime": 28.4216, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.146, + "step": 844 + }, + { + "epoch": 3.5062240663900415, + "grad_norm": 11.75, + "learning_rate": 1.4937759336099586e-05, + "loss": 0.9766, + "step": 845 + }, + { + "epoch": 3.5062240663900415, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7040326595306396, + "eval_runtime": 28.4604, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 845 + }, + { + "epoch": 3.5103734439834025, + "grad_norm": 7.78125, + "learning_rate": 1.4896265560165976e-05, + "loss": 0.5703, + "step": 846 + }, + { + "epoch": 3.5103734439834025, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7041623592376709, + "eval_runtime": 28.5016, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.14, + "step": 846 + }, + { + "epoch": 3.5145228215767634, + "grad_norm": 6.96875, + "learning_rate": 1.4854771784232365e-05, + "loss": 0.6836, + "step": 847 + }, + { + "epoch": 3.5145228215767634, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7041299343109131, + "eval_runtime": 28.5463, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 847 + }, + { + "epoch": 3.5186721991701244, + "grad_norm": 4.46875, + "learning_rate": 1.4813278008298755e-05, + "loss": 0.4668, + "step": 848 + }, + { + "epoch": 3.5186721991701244, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7040488719940186, + "eval_runtime": 28.603, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.133, + "step": 848 + }, + { + "epoch": 3.5228215767634854, + "grad_norm": 16.75, + "learning_rate": 1.4771784232365147e-05, + "loss": 0.6328, + "step": 849 + }, + { + "epoch": 3.5228215767634854, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7039678692817688, + "eval_runtime": 28.6212, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 849 + }, + { + "epoch": 3.5269709543568464, + "grad_norm": 5.78125, + "learning_rate": 1.4730290456431537e-05, + "loss": 0.8398, + "step": 850 + }, + { + "epoch": 3.5269709543568464, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7034329771995544, + "eval_runtime": 28.6279, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 850 + }, + { + "epoch": 3.5311203319502074, + "grad_norm": 8.1875, + "learning_rate": 1.4688796680497926e-05, + "loss": 0.4238, + "step": 851 + }, + { + "epoch": 3.5311203319502074, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7037408947944641, + "eval_runtime": 28.3925, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 851 + }, + { + "epoch": 3.5352697095435683, + "grad_norm": 13.375, + "learning_rate": 1.4647302904564316e-05, + "loss": 0.3828, + "step": 852 + }, + { + "epoch": 3.5352697095435683, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7032222747802734, + "eval_runtime": 28.4772, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 852 + }, + { + "epoch": 3.5394190871369293, + "grad_norm": 5.34375, + "learning_rate": 1.4605809128630708e-05, + "loss": 0.5039, + "step": 853 + }, + { + "epoch": 3.5394190871369293, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7033519148826599, + "eval_runtime": 28.6326, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 853 + }, + { + "epoch": 3.5435684647302903, + "grad_norm": 10.875, + "learning_rate": 1.4564315352697097e-05, + "loss": 0.7734, + "step": 854 + }, + { + "epoch": 3.5435684647302903, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7034816145896912, + "eval_runtime": 28.6154, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 854 + }, + { + "epoch": 3.5477178423236513, + "grad_norm": 9.5625, + "learning_rate": 1.4522821576763487e-05, + "loss": 0.8672, + "step": 855 + }, + { + "epoch": 3.5477178423236513, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7036274671554565, + "eval_runtime": 28.6144, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 855 + }, + { + "epoch": 3.5518672199170123, + "grad_norm": 12.625, + "learning_rate": 1.4481327800829877e-05, + "loss": 0.4551, + "step": 856 + }, + { + "epoch": 3.5518672199170123, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.703335702419281, + "eval_runtime": 28.6287, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.131, + "step": 856 + }, + { + "epoch": 3.5560165975103732, + "grad_norm": 9.5, + "learning_rate": 1.4439834024896265e-05, + "loss": 0.5703, + "step": 857 + }, + { + "epoch": 3.5560165975103732, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7043406367301941, + "eval_runtime": 28.6134, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 857 + }, + { + "epoch": 3.5601659751037342, + "grad_norm": 6.84375, + "learning_rate": 1.4398340248962655e-05, + "loss": 0.4375, + "step": 858 + }, + { + "epoch": 3.5601659751037342, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7052645087242126, + "eval_runtime": 28.5677, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 858 + }, + { + "epoch": 3.564315352697095, + "grad_norm": 20.5, + "learning_rate": 1.4356846473029045e-05, + "loss": 1.2031, + "step": 859 + }, + { + "epoch": 3.564315352697095, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7053942084312439, + "eval_runtime": 28.4983, + "eval_samples_per_second": 16.913, + "eval_steps_per_second": 2.14, + "step": 859 + }, + { + "epoch": 3.568464730290456, + "grad_norm": 21.375, + "learning_rate": 1.4315352697095436e-05, + "loss": 0.9023, + "step": 860 + }, + { + "epoch": 3.568464730290456, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7042433619499207, + "eval_runtime": 28.5079, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.14, + "step": 860 + }, + { + "epoch": 3.572614107883817, + "grad_norm": 10.25, + "learning_rate": 1.4273858921161826e-05, + "loss": 0.4902, + "step": 861 + }, + { + "epoch": 3.572614107883817, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7045189142227173, + "eval_runtime": 28.4285, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.146, + "step": 861 + }, + { + "epoch": 3.576763485477178, + "grad_norm": 18.25, + "learning_rate": 1.4232365145228216e-05, + "loss": 0.8945, + "step": 862 + }, + { + "epoch": 3.576763485477178, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7037085294723511, + "eval_runtime": 28.4706, + "eval_samples_per_second": 16.93, + "eval_steps_per_second": 2.143, + "step": 862 + }, + { + "epoch": 3.5809128630705396, + "grad_norm": 9.5625, + "learning_rate": 1.4190871369294605e-05, + "loss": 0.6406, + "step": 863 + }, + { + "epoch": 3.5809128630705396, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7040326595306396, + "eval_runtime": 28.4154, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 863 + }, + { + "epoch": 3.5850622406639006, + "grad_norm": 4.21875, + "learning_rate": 1.4149377593360997e-05, + "loss": 0.3379, + "step": 864 + }, + { + "epoch": 3.5850622406639006, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7041299343109131, + "eval_runtime": 28.4607, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 864 + }, + { + "epoch": 3.5892116182572615, + "grad_norm": 6.09375, + "learning_rate": 1.4107883817427387e-05, + "loss": 0.6016, + "step": 865 + }, + { + "epoch": 3.5892116182572615, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7053942084312439, + "eval_runtime": 28.3936, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 865 + }, + { + "epoch": 3.5933609958506225, + "grad_norm": 4.46875, + "learning_rate": 1.4066390041493776e-05, + "loss": 0.418, + "step": 866 + }, + { + "epoch": 3.5933609958506225, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7040002346038818, + "eval_runtime": 28.4325, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 866 + }, + { + "epoch": 3.5975103734439835, + "grad_norm": 18.375, + "learning_rate": 1.4024896265560166e-05, + "loss": 0.9688, + "step": 867 + }, + { + "epoch": 3.5975103734439835, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7045189142227173, + "eval_runtime": 28.3707, + "eval_samples_per_second": 16.989, + "eval_steps_per_second": 2.15, + "step": 867 + }, + { + "epoch": 3.6016597510373445, + "grad_norm": 7.28125, + "learning_rate": 1.3983402489626558e-05, + "loss": 0.4082, + "step": 868 + }, + { + "epoch": 3.6016597510373445, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7048431038856506, + "eval_runtime": 28.3688, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.15, + "step": 868 + }, + { + "epoch": 3.6058091286307055, + "grad_norm": 7.28125, + "learning_rate": 1.3941908713692948e-05, + "loss": 0.6875, + "step": 869 + }, + { + "epoch": 3.6058091286307055, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7055076360702515, + "eval_runtime": 28.3628, + "eval_samples_per_second": 16.994, + "eval_steps_per_second": 2.151, + "step": 869 + }, + { + "epoch": 3.6099585062240664, + "grad_norm": 272.0, + "learning_rate": 1.3900414937759337e-05, + "loss": 0.6445, + "step": 870 + }, + { + "epoch": 3.6099585062240664, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7054266333580017, + "eval_runtime": 28.4271, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 870 + }, + { + "epoch": 3.6141078838174274, + "grad_norm": 5.875, + "learning_rate": 1.3858921161825727e-05, + "loss": 0.4766, + "step": 871 + }, + { + "epoch": 3.6141078838174274, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.705167293548584, + "eval_runtime": 28.5499, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.137, + "step": 871 + }, + { + "epoch": 3.6182572614107884, + "grad_norm": 10.6875, + "learning_rate": 1.3817427385892117e-05, + "loss": 0.8047, + "step": 872 + }, + { + "epoch": 3.6182572614107884, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7050538063049316, + "eval_runtime": 28.595, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 872 + }, + { + "epoch": 3.6224066390041494, + "grad_norm": 4.5, + "learning_rate": 1.3775933609958508e-05, + "loss": 0.3633, + "step": 873 + }, + { + "epoch": 3.6224066390041494, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7061722278594971, + "eval_runtime": 28.6172, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 873 + }, + { + "epoch": 3.6265560165975104, + "grad_norm": 7.6875, + "learning_rate": 1.3734439834024898e-05, + "loss": 0.4863, + "step": 874 + }, + { + "epoch": 3.6265560165975104, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7061883807182312, + "eval_runtime": 28.616, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 874 + }, + { + "epoch": 3.6307053941908713, + "grad_norm": 4.15625, + "learning_rate": 1.3692946058091288e-05, + "loss": 0.4668, + "step": 875 + }, + { + "epoch": 3.6307053941908713, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7061398029327393, + "eval_runtime": 28.6144, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 875 + }, + { + "epoch": 3.6348547717842323, + "grad_norm": 12.875, + "learning_rate": 1.3651452282157676e-05, + "loss": 0.5586, + "step": 876 + }, + { + "epoch": 3.6348547717842323, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7064639329910278, + "eval_runtime": 28.6093, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 876 + }, + { + "epoch": 3.6390041493775933, + "grad_norm": 4.1875, + "learning_rate": 1.3609958506224066e-05, + "loss": 0.5664, + "step": 877 + }, + { + "epoch": 3.6390041493775933, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7063829302787781, + "eval_runtime": 28.5801, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 877 + }, + { + "epoch": 3.6431535269709543, + "grad_norm": 5.75, + "learning_rate": 1.3568464730290456e-05, + "loss": 0.4336, + "step": 878 + }, + { + "epoch": 3.6431535269709543, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7066584825515747, + "eval_runtime": 28.5117, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 878 + }, + { + "epoch": 3.6473029045643153, + "grad_norm": 10.375, + "learning_rate": 1.3526970954356847e-05, + "loss": 0.7656, + "step": 879 + }, + { + "epoch": 3.6473029045643153, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7061235904693604, + "eval_runtime": 28.4129, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.147, + "step": 879 + }, + { + "epoch": 3.6514522821576763, + "grad_norm": 13.8125, + "learning_rate": 1.3485477178423237e-05, + "loss": 0.4277, + "step": 880 + }, + { + "epoch": 3.6514522821576763, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7062694430351257, + "eval_runtime": 28.4382, + "eval_samples_per_second": 16.949, + "eval_steps_per_second": 2.145, + "step": 880 + }, + { + "epoch": 3.6556016597510372, + "grad_norm": 6.1875, + "learning_rate": 1.3443983402489627e-05, + "loss": 0.4297, + "step": 881 + }, + { + "epoch": 3.6556016597510372, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7079551219940186, + "eval_runtime": 28.5958, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 881 + }, + { + "epoch": 3.659751037344398, + "grad_norm": 8.8125, + "learning_rate": 1.3402489626556016e-05, + "loss": 0.8008, + "step": 882 + }, + { + "epoch": 3.659751037344398, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7077606320381165, + "eval_runtime": 28.6017, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 882 + }, + { + "epoch": 3.663900414937759, + "grad_norm": 45.0, + "learning_rate": 1.3360995850622406e-05, + "loss": 1.1172, + "step": 883 + }, + { + "epoch": 3.663900414937759, + "eval_accuracy": 0.5892116182572614, + "eval_loss": 0.7083117365837097, + "eval_runtime": 28.616, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 883 + }, + { + "epoch": 3.66804979253112, + "grad_norm": 7.0625, + "learning_rate": 1.3319502074688798e-05, + "loss": 0.4395, + "step": 884 + }, + { + "epoch": 3.66804979253112, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7102729678153992, + "eval_runtime": 28.6055, + "eval_samples_per_second": 16.85, + "eval_steps_per_second": 2.132, + "step": 884 + }, + { + "epoch": 3.6721991701244816, + "grad_norm": 8.5, + "learning_rate": 1.3278008298755187e-05, + "loss": 0.7812, + "step": 885 + }, + { + "epoch": 3.6721991701244816, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7103701829910278, + "eval_runtime": 28.5933, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 885 + }, + { + "epoch": 3.6763485477178426, + "grad_norm": 9.4375, + "learning_rate": 1.3236514522821577e-05, + "loss": 0.4297, + "step": 886 + }, + { + "epoch": 3.6763485477178426, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7112292647361755, + "eval_runtime": 28.535, + "eval_samples_per_second": 16.892, + "eval_steps_per_second": 2.138, + "step": 886 + }, + { + "epoch": 3.6804979253112036, + "grad_norm": 8.0, + "learning_rate": 1.3195020746887967e-05, + "loss": 0.6992, + "step": 887 + }, + { + "epoch": 3.6804979253112036, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7129635810852051, + "eval_runtime": 28.4641, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.143, + "step": 887 + }, + { + "epoch": 3.6846473029045645, + "grad_norm": 21.75, + "learning_rate": 1.3153526970954359e-05, + "loss": 0.6875, + "step": 888 + }, + { + "epoch": 3.6846473029045645, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7119424343109131, + "eval_runtime": 28.4198, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.146, + "step": 888 + }, + { + "epoch": 3.6887966804979255, + "grad_norm": 30.0, + "learning_rate": 1.3112033195020748e-05, + "loss": 1.0312, + "step": 889 + }, + { + "epoch": 3.6887966804979255, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7132067084312439, + "eval_runtime": 28.4488, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 889 + }, + { + "epoch": 3.6929460580912865, + "grad_norm": 7.375, + "learning_rate": 1.3070539419087138e-05, + "loss": 0.4961, + "step": 890 + }, + { + "epoch": 3.6929460580912865, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7131742835044861, + "eval_runtime": 28.3809, + "eval_samples_per_second": 16.983, + "eval_steps_per_second": 2.149, + "step": 890 + }, + { + "epoch": 3.6970954356846475, + "grad_norm": 9.0, + "learning_rate": 1.3029045643153528e-05, + "loss": 0.6328, + "step": 891 + }, + { + "epoch": 3.6970954356846475, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7127204537391663, + "eval_runtime": 28.3977, + "eval_samples_per_second": 16.973, + "eval_steps_per_second": 2.148, + "step": 891 + }, + { + "epoch": 3.7012448132780085, + "grad_norm": 10.8125, + "learning_rate": 1.298755186721992e-05, + "loss": 0.7031, + "step": 892 + }, + { + "epoch": 3.7012448132780085, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7132877111434937, + "eval_runtime": 28.4285, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.146, + "step": 892 + }, + { + "epoch": 3.7053941908713695, + "grad_norm": 13.6875, + "learning_rate": 1.2946058091286309e-05, + "loss": 0.9492, + "step": 893 + }, + { + "epoch": 3.7053941908713695, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7130932211875916, + "eval_runtime": 28.5508, + "eval_samples_per_second": 16.882, + "eval_steps_per_second": 2.137, + "step": 893 + }, + { + "epoch": 3.7095435684647304, + "grad_norm": 6.03125, + "learning_rate": 1.2904564315352699e-05, + "loss": 0.7617, + "step": 894 + }, + { + "epoch": 3.7095435684647304, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7135956883430481, + "eval_runtime": 28.5946, + "eval_samples_per_second": 16.856, + "eval_steps_per_second": 2.133, + "step": 894 + }, + { + "epoch": 3.7136929460580914, + "grad_norm": 7.28125, + "learning_rate": 1.2863070539419087e-05, + "loss": 0.8516, + "step": 895 + }, + { + "epoch": 3.7136929460580914, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7116506695747375, + "eval_runtime": 28.6632, + "eval_samples_per_second": 16.816, + "eval_steps_per_second": 2.128, + "step": 895 + }, + { + "epoch": 3.7178423236514524, + "grad_norm": 15.875, + "learning_rate": 1.2821576763485477e-05, + "loss": 0.7461, + "step": 896 + }, + { + "epoch": 3.7178423236514524, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.712169349193573, + "eval_runtime": 28.6119, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 896 + }, + { + "epoch": 3.7219917012448134, + "grad_norm": 6.34375, + "learning_rate": 1.2780082987551867e-05, + "loss": 0.4492, + "step": 897 + }, + { + "epoch": 3.7219917012448134, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7126556038856506, + "eval_runtime": 28.6644, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 897 + }, + { + "epoch": 3.7261410788381744, + "grad_norm": 6.21875, + "learning_rate": 1.2738589211618256e-05, + "loss": 0.6172, + "step": 898 + }, + { + "epoch": 3.7261410788381744, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7121855616569519, + "eval_runtime": 28.6503, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 898 + }, + { + "epoch": 3.7302904564315353, + "grad_norm": 3.8125, + "learning_rate": 1.2697095435684648e-05, + "loss": 0.4062, + "step": 899 + }, + { + "epoch": 3.7302904564315353, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7117803692817688, + "eval_runtime": 28.5584, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 899 + }, + { + "epoch": 3.7344398340248963, + "grad_norm": 15.8125, + "learning_rate": 1.2655601659751038e-05, + "loss": 0.707, + "step": 900 + }, + { + "epoch": 3.7344398340248963, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7124935388565063, + "eval_runtime": 28.4488, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 900 + }, + { + "epoch": 3.7385892116182573, + "grad_norm": 19.875, + "learning_rate": 1.2614107883817427e-05, + "loss": 0.8906, + "step": 901 + }, + { + "epoch": 3.7385892116182573, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7121044993400574, + "eval_runtime": 28.2718, + "eval_samples_per_second": 17.049, + "eval_steps_per_second": 2.158, + "step": 901 + }, + { + "epoch": 3.7427385892116183, + "grad_norm": 4.59375, + "learning_rate": 1.2572614107883817e-05, + "loss": 0.5117, + "step": 902 + }, + { + "epoch": 3.7427385892116183, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7126231789588928, + "eval_runtime": 28.4595, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 902 + }, + { + "epoch": 3.7468879668049793, + "grad_norm": 28.125, + "learning_rate": 1.2531120331950209e-05, + "loss": 0.8594, + "step": 903 + }, + { + "epoch": 3.7468879668049793, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7135794758796692, + "eval_runtime": 28.5545, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 903 + }, + { + "epoch": 3.7510373443983402, + "grad_norm": 5.375, + "learning_rate": 1.2489626556016598e-05, + "loss": 0.4785, + "step": 904 + }, + { + "epoch": 3.7510373443983402, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7119424343109131, + "eval_runtime": 28.5933, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 904 + }, + { + "epoch": 3.7551867219917012, + "grad_norm": 5.40625, + "learning_rate": 1.2448132780082988e-05, + "loss": 0.4375, + "step": 905 + }, + { + "epoch": 3.7551867219917012, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.616, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 905 + }, + { + "epoch": 3.759336099585062, + "grad_norm": 30.375, + "learning_rate": 1.2406639004149378e-05, + "loss": 1.5, + "step": 906 + }, + { + "epoch": 3.759336099585062, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7130122184753418, + "eval_runtime": 28.6659, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.128, + "step": 906 + }, + { + "epoch": 3.763485477178423, + "grad_norm": 36.0, + "learning_rate": 1.236514522821577e-05, + "loss": 1.1406, + "step": 907 + }, + { + "epoch": 3.763485477178423, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7133201360702515, + "eval_runtime": 28.6129, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 907 + }, + { + "epoch": 3.767634854771784, + "grad_norm": 3.921875, + "learning_rate": 1.232365145228216e-05, + "loss": 0.5391, + "step": 908 + }, + { + "epoch": 3.767634854771784, + "eval_accuracy": 0.5871369294605809, + "eval_loss": 0.7120396494865417, + "eval_runtime": 28.58, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 908 + }, + { + "epoch": 3.771784232365145, + "grad_norm": 8.6875, + "learning_rate": 1.2282157676348547e-05, + "loss": 0.416, + "step": 909 + }, + { + "epoch": 3.771784232365145, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7134336233139038, + "eval_runtime": 28.5017, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.14, + "step": 909 + }, + { + "epoch": 3.775933609958506, + "grad_norm": 3.40625, + "learning_rate": 1.2240663900414937e-05, + "loss": 0.2578, + "step": 910 + }, + { + "epoch": 3.775933609958506, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7143737077713013, + "eval_runtime": 28.5034, + "eval_samples_per_second": 16.91, + "eval_steps_per_second": 2.14, + "step": 910 + }, + { + "epoch": 3.780082987551867, + "grad_norm": 5.59375, + "learning_rate": 1.2199170124481329e-05, + "loss": 0.3203, + "step": 911 + }, + { + "epoch": 3.780082987551867, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7152813673019409, + "eval_runtime": 28.5893, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.134, + "step": 911 + }, + { + "epoch": 3.784232365145228, + "grad_norm": 31.25, + "learning_rate": 1.2157676348547718e-05, + "loss": 0.7344, + "step": 912 + }, + { + "epoch": 3.784232365145228, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7171939611434937, + "eval_runtime": 28.6291, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.131, + "step": 912 + }, + { + "epoch": 3.788381742738589, + "grad_norm": 4.84375, + "learning_rate": 1.2116182572614108e-05, + "loss": 0.4609, + "step": 913 + }, + { + "epoch": 3.788381742738589, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7159945368766785, + "eval_runtime": 28.6763, + "eval_samples_per_second": 16.808, + "eval_steps_per_second": 2.127, + "step": 913 + }, + { + "epoch": 3.79253112033195, + "grad_norm": 15.5, + "learning_rate": 1.2074688796680498e-05, + "loss": 1.0469, + "step": 914 + }, + { + "epoch": 3.79253112033195, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7169184684753418, + "eval_runtime": 28.6404, + "eval_samples_per_second": 16.829, + "eval_steps_per_second": 2.13, + "step": 914 + }, + { + "epoch": 3.796680497925311, + "grad_norm": 5.78125, + "learning_rate": 1.203319502074689e-05, + "loss": 0.2656, + "step": 915 + }, + { + "epoch": 3.796680497925311, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7177126407623291, + "eval_runtime": 28.6553, + "eval_samples_per_second": 16.821, + "eval_steps_per_second": 2.129, + "step": 915 + }, + { + "epoch": 3.800829875518672, + "grad_norm": 15.125, + "learning_rate": 1.199170124481328e-05, + "loss": 0.6328, + "step": 916 + }, + { + "epoch": 3.800829875518672, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7184420228004456, + "eval_runtime": 28.6155, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 916 + }, + { + "epoch": 3.804979253112033, + "grad_norm": 11.6875, + "learning_rate": 1.1950207468879669e-05, + "loss": 0.6172, + "step": 917 + }, + { + "epoch": 3.804979253112033, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7191389799118042, + "eval_runtime": 28.4929, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.141, + "step": 917 + }, + { + "epoch": 3.809128630705394, + "grad_norm": 3.671875, + "learning_rate": 1.1908713692946057e-05, + "loss": 0.3848, + "step": 918 + }, + { + "epoch": 3.809128630705394, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7204518914222717, + "eval_runtime": 28.4345, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.145, + "step": 918 + }, + { + "epoch": 3.813278008298755, + "grad_norm": 12.0, + "learning_rate": 1.1867219917012449e-05, + "loss": 0.8555, + "step": 919 + }, + { + "epoch": 3.813278008298755, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.720403254032135, + "eval_runtime": 28.4408, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.145, + "step": 919 + }, + { + "epoch": 3.817427385892116, + "grad_norm": 13.75, + "learning_rate": 1.1825726141078838e-05, + "loss": 0.6445, + "step": 920 + }, + { + "epoch": 3.817427385892116, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7208408713340759, + "eval_runtime": 28.4253, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.146, + "step": 920 + }, + { + "epoch": 3.821576763485477, + "grad_norm": 51.5, + "learning_rate": 1.1784232365145228e-05, + "loss": 1.2734, + "step": 921 + }, + { + "epoch": 3.821576763485477, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7208247184753418, + "eval_runtime": 28.5435, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 921 + }, + { + "epoch": 3.825726141078838, + "grad_norm": 5.5625, + "learning_rate": 1.1742738589211618e-05, + "loss": 0.3418, + "step": 922 + }, + { + "epoch": 3.825726141078838, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7207274436950684, + "eval_runtime": 28.5903, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.134, + "step": 922 + }, + { + "epoch": 3.8298755186721993, + "grad_norm": 18.25, + "learning_rate": 1.170124481327801e-05, + "loss": 0.7344, + "step": 923 + }, + { + "epoch": 3.8298755186721993, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7208732962608337, + "eval_runtime": 28.6088, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 923 + }, + { + "epoch": 3.8340248962655603, + "grad_norm": 10.3125, + "learning_rate": 1.16597510373444e-05, + "loss": 0.7188, + "step": 924 + }, + { + "epoch": 3.8340248962655603, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7202087640762329, + "eval_runtime": 28.6094, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 924 + }, + { + "epoch": 3.8381742738589213, + "grad_norm": 4.125, + "learning_rate": 1.1618257261410789e-05, + "loss": 0.4199, + "step": 925 + }, + { + "epoch": 3.8381742738589213, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7204194664955139, + "eval_runtime": 28.6167, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 925 + }, + { + "epoch": 3.8423236514522823, + "grad_norm": 13.0, + "learning_rate": 1.1576763485477179e-05, + "loss": 0.3789, + "step": 926 + }, + { + "epoch": 3.8423236514522823, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7218782305717468, + "eval_runtime": 28.5836, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 926 + }, + { + "epoch": 3.8464730290456433, + "grad_norm": 5.34375, + "learning_rate": 1.153526970954357e-05, + "loss": 0.5898, + "step": 927 + }, + { + "epoch": 3.8464730290456433, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7212623357772827, + "eval_runtime": 28.5191, + "eval_samples_per_second": 16.901, + "eval_steps_per_second": 2.139, + "step": 927 + }, + { + "epoch": 3.8506224066390042, + "grad_norm": 9.25, + "learning_rate": 1.1493775933609958e-05, + "loss": 0.7734, + "step": 928 + }, + { + "epoch": 3.8506224066390042, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7211812734603882, + "eval_runtime": 28.5169, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.139, + "step": 928 + }, + { + "epoch": 3.854771784232365, + "grad_norm": 19.375, + "learning_rate": 1.1452282157676348e-05, + "loss": 0.7305, + "step": 929 + }, + { + "epoch": 3.854771784232365, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7217161655426025, + "eval_runtime": 28.4357, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.145, + "step": 929 + }, + { + "epoch": 3.858921161825726, + "grad_norm": 16.75, + "learning_rate": 1.141078838174274e-05, + "loss": 0.6914, + "step": 930 + }, + { + "epoch": 3.858921161825726, + "eval_accuracy": 0.5850622406639004, + "eval_loss": 0.7211974859237671, + "eval_runtime": 28.4086, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.147, + "step": 930 + }, + { + "epoch": 3.863070539419087, + "grad_norm": 8.75, + "learning_rate": 1.136929460580913e-05, + "loss": 1.1484, + "step": 931 + }, + { + "epoch": 3.863070539419087, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7213271260261536, + "eval_runtime": 28.3943, + "eval_samples_per_second": 16.975, + "eval_steps_per_second": 2.148, + "step": 931 + }, + { + "epoch": 3.867219917012448, + "grad_norm": 5.90625, + "learning_rate": 1.132780082987552e-05, + "loss": 0.6133, + "step": 932 + }, + { + "epoch": 3.867219917012448, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7215216755867004, + "eval_runtime": 28.3927, + "eval_samples_per_second": 16.976, + "eval_steps_per_second": 2.148, + "step": 932 + }, + { + "epoch": 3.871369294605809, + "grad_norm": 7.8125, + "learning_rate": 1.1286307053941909e-05, + "loss": 0.6211, + "step": 933 + }, + { + "epoch": 3.871369294605809, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7210839986801147, + "eval_runtime": 28.3832, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.149, + "step": 933 + }, + { + "epoch": 3.87551867219917, + "grad_norm": 12.5, + "learning_rate": 1.1244813278008299e-05, + "loss": 0.3047, + "step": 934 + }, + { + "epoch": 3.87551867219917, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7218620181083679, + "eval_runtime": 28.3792, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 934 + }, + { + "epoch": 3.879668049792531, + "grad_norm": 7.46875, + "learning_rate": 1.120331950207469e-05, + "loss": 0.7188, + "step": 935 + }, + { + "epoch": 3.879668049792531, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7212461233139038, + "eval_runtime": 28.3769, + "eval_samples_per_second": 16.986, + "eval_steps_per_second": 2.15, + "step": 935 + }, + { + "epoch": 3.883817427385892, + "grad_norm": 12.4375, + "learning_rate": 1.116182572614108e-05, + "loss": 1.0859, + "step": 936 + }, + { + "epoch": 3.883817427385892, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7199980616569519, + "eval_runtime": 28.3684, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.15, + "step": 936 + }, + { + "epoch": 3.887966804979253, + "grad_norm": 5.6875, + "learning_rate": 1.1120331950207468e-05, + "loss": 0.6016, + "step": 937 + }, + { + "epoch": 3.887966804979253, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7209219336509705, + "eval_runtime": 28.4257, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.146, + "step": 937 + }, + { + "epoch": 3.892116182572614, + "grad_norm": 7.53125, + "learning_rate": 1.107883817427386e-05, + "loss": 0.457, + "step": 938 + }, + { + "epoch": 3.892116182572614, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7203384041786194, + "eval_runtime": 28.3674, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.15, + "step": 938 + }, + { + "epoch": 3.896265560165975, + "grad_norm": 6.3125, + "learning_rate": 1.103734439834025e-05, + "loss": 0.4902, + "step": 939 + }, + { + "epoch": 3.896265560165975, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7199169993400574, + "eval_runtime": 28.4788, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 939 + }, + { + "epoch": 3.900414937759336, + "grad_norm": 13.375, + "learning_rate": 1.099585062240664e-05, + "loss": 0.6328, + "step": 940 + }, + { + "epoch": 3.900414937759336, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7203060388565063, + "eval_runtime": 28.5467, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 940 + }, + { + "epoch": 3.904564315352697, + "grad_norm": 5.9375, + "learning_rate": 1.0954356846473029e-05, + "loss": 0.543, + "step": 941 + }, + { + "epoch": 3.904564315352697, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7201277017593384, + "eval_runtime": 28.5975, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 941 + }, + { + "epoch": 3.908713692946058, + "grad_norm": 7.375, + "learning_rate": 1.091286307053942e-05, + "loss": 0.6875, + "step": 942 + }, + { + "epoch": 3.908713692946058, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7192686796188354, + "eval_runtime": 28.6124, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 942 + }, + { + "epoch": 3.912863070539419, + "grad_norm": 20.75, + "learning_rate": 1.087136929460581e-05, + "loss": 1.1094, + "step": 943 + }, + { + "epoch": 3.912863070539419, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7190093398094177, + "eval_runtime": 28.6192, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 943 + }, + { + "epoch": 3.91701244813278, + "grad_norm": 6.625, + "learning_rate": 1.08298755186722e-05, + "loss": 0.4102, + "step": 944 + }, + { + "epoch": 3.91701244813278, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7194307446479797, + "eval_runtime": 28.6427, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.13, + "step": 944 + }, + { + "epoch": 3.921161825726141, + "grad_norm": 7.03125, + "learning_rate": 1.078838174273859e-05, + "loss": 0.625, + "step": 945 + }, + { + "epoch": 3.921161825726141, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7194631695747375, + "eval_runtime": 28.6456, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 945 + }, + { + "epoch": 3.9253112033195023, + "grad_norm": 12.6875, + "learning_rate": 1.074688796680498e-05, + "loss": 0.8672, + "step": 946 + }, + { + "epoch": 3.9253112033195023, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7193658947944641, + "eval_runtime": 28.5415, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 946 + }, + { + "epoch": 3.9294605809128633, + "grad_norm": 17.375, + "learning_rate": 1.070539419087137e-05, + "loss": 0.6094, + "step": 947 + }, + { + "epoch": 3.9294605809128633, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7189444899559021, + "eval_runtime": 28.4107, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.147, + "step": 947 + }, + { + "epoch": 3.9336099585062243, + "grad_norm": 21.375, + "learning_rate": 1.066390041493776e-05, + "loss": 1.0312, + "step": 948 + }, + { + "epoch": 3.9336099585062243, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7191389799118042, + "eval_runtime": 28.3536, + "eval_samples_per_second": 17.0, + "eval_steps_per_second": 2.151, + "step": 948 + }, + { + "epoch": 3.9377593360995853, + "grad_norm": 8.5625, + "learning_rate": 1.0622406639004149e-05, + "loss": 0.6758, + "step": 949 + }, + { + "epoch": 3.9377593360995853, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7193011045455933, + "eval_runtime": 28.4886, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.141, + "step": 949 + }, + { + "epoch": 3.9419087136929463, + "grad_norm": 5.78125, + "learning_rate": 1.058091286307054e-05, + "loss": 0.3262, + "step": 950 + }, + { + "epoch": 3.9419087136929463, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7191389799118042, + "eval_runtime": 28.5454, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 950 + }, + { + "epoch": 3.9460580912863072, + "grad_norm": 6.09375, + "learning_rate": 1.053941908713693e-05, + "loss": 0.4219, + "step": 951 + }, + { + "epoch": 3.9460580912863072, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7190255522727966, + "eval_runtime": 28.578, + "eval_samples_per_second": 16.866, + "eval_steps_per_second": 2.135, + "step": 951 + }, + { + "epoch": 3.9502074688796682, + "grad_norm": 9.0, + "learning_rate": 1.049792531120332e-05, + "loss": 0.5234, + "step": 952 + }, + { + "epoch": 3.9502074688796682, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7186203598976135, + "eval_runtime": 28.4444, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.145, + "step": 952 + }, + { + "epoch": 3.954356846473029, + "grad_norm": 14.125, + "learning_rate": 1.045643153526971e-05, + "loss": 0.334, + "step": 953 + }, + { + "epoch": 3.954356846473029, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7182637453079224, + "eval_runtime": 28.5569, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 953 + }, + { + "epoch": 3.95850622406639, + "grad_norm": 27.875, + "learning_rate": 1.0414937759336101e-05, + "loss": 0.7305, + "step": 954 + }, + { + "epoch": 3.95850622406639, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7187986373901367, + "eval_runtime": 28.5941, + "eval_samples_per_second": 16.857, + "eval_steps_per_second": 2.133, + "step": 954 + }, + { + "epoch": 3.962655601659751, + "grad_norm": 7.34375, + "learning_rate": 1.0373443983402491e-05, + "loss": 0.5039, + "step": 955 + }, + { + "epoch": 3.962655601659751, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7184420228004456, + "eval_runtime": 28.6221, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 955 + }, + { + "epoch": 3.966804979253112, + "grad_norm": 9.25, + "learning_rate": 1.0331950207468879e-05, + "loss": 0.3516, + "step": 956 + }, + { + "epoch": 3.966804979253112, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7187824249267578, + "eval_runtime": 28.6199, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 956 + }, + { + "epoch": 3.970954356846473, + "grad_norm": 6.3125, + "learning_rate": 1.0290456431535269e-05, + "loss": 0.5703, + "step": 957 + }, + { + "epoch": 3.970954356846473, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.717939555644989, + "eval_runtime": 28.6383, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 957 + }, + { + "epoch": 3.975103734439834, + "grad_norm": 9.9375, + "learning_rate": 1.024896265560166e-05, + "loss": 0.9141, + "step": 958 + }, + { + "epoch": 3.975103734439834, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7177126407623291, + "eval_runtime": 28.58, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 958 + }, + { + "epoch": 3.979253112033195, + "grad_norm": 14.375, + "learning_rate": 1.020746887966805e-05, + "loss": 0.5078, + "step": 959 + }, + { + "epoch": 3.979253112033195, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7175830006599426, + "eval_runtime": 28.4951, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 959 + }, + { + "epoch": 3.983402489626556, + "grad_norm": 6.25, + "learning_rate": 1.016597510373444e-05, + "loss": 0.3633, + "step": 960 + }, + { + "epoch": 3.983402489626556, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7171939611434937, + "eval_runtime": 28.4806, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 960 + }, + { + "epoch": 3.987551867219917, + "grad_norm": 19.5, + "learning_rate": 1.012448132780083e-05, + "loss": 0.8359, + "step": 961 + }, + { + "epoch": 3.987551867219917, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7185068726539612, + "eval_runtime": 28.5686, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 961 + }, + { + "epoch": 3.991701244813278, + "grad_norm": 54.25, + "learning_rate": 1.0082987551867221e-05, + "loss": 0.5742, + "step": 962 + }, + { + "epoch": 3.991701244813278, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7169670462608337, + "eval_runtime": 28.6223, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 962 + }, + { + "epoch": 3.995850622406639, + "grad_norm": 24.25, + "learning_rate": 1.0041493775933611e-05, + "loss": 1.1953, + "step": 963 + }, + { + "epoch": 3.995850622406639, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7172912359237671, + "eval_runtime": 28.6732, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.127, + "step": 963 + }, + { + "epoch": 4.0, + "grad_norm": 5.21875, + "learning_rate": 1e-05, + "loss": 0.3887, + "step": 964 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7183610200881958, + "eval_runtime": 28.6879, + "eval_samples_per_second": 16.802, + "eval_steps_per_second": 2.126, + "step": 964 + }, + { + "epoch": 4.004149377593361, + "grad_norm": 4.3125, + "learning_rate": 9.95850622406639e-06, + "loss": 0.3633, + "step": 965 + }, + { + "epoch": 4.004149377593361, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7182313203811646, + "eval_runtime": 28.6265, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 965 + }, + { + "epoch": 4.008298755186722, + "grad_norm": 14.8125, + "learning_rate": 9.91701244813278e-06, + "loss": 1.4531, + "step": 966 + }, + { + "epoch": 4.008298755186722, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7177774906158447, + "eval_runtime": 28.5643, + "eval_samples_per_second": 16.874, + "eval_steps_per_second": 2.136, + "step": 966 + }, + { + "epoch": 4.012448132780083, + "grad_norm": 9.375, + "learning_rate": 9.87551867219917e-06, + "loss": 0.6602, + "step": 967 + }, + { + "epoch": 4.012448132780083, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7178747653961182, + "eval_runtime": 28.4637, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.143, + "step": 967 + }, + { + "epoch": 4.016597510373444, + "grad_norm": 8.9375, + "learning_rate": 9.83402489626556e-06, + "loss": 0.5781, + "step": 968 + }, + { + "epoch": 4.016597510373444, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7176154255867004, + "eval_runtime": 28.4147, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 968 + }, + { + "epoch": 4.020746887966805, + "grad_norm": 23.75, + "learning_rate": 9.79253112033195e-06, + "loss": 0.3223, + "step": 969 + }, + { + "epoch": 4.020746887966805, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.718117892742157, + "eval_runtime": 28.3875, + "eval_samples_per_second": 16.979, + "eval_steps_per_second": 2.149, + "step": 969 + }, + { + "epoch": 4.024896265560166, + "grad_norm": 6.4375, + "learning_rate": 9.751037344398341e-06, + "loss": 0.4082, + "step": 970 + }, + { + "epoch": 4.024896265560166, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7177612781524658, + "eval_runtime": 28.368, + "eval_samples_per_second": 16.991, + "eval_steps_per_second": 2.15, + "step": 970 + }, + { + "epoch": 4.029045643153527, + "grad_norm": 3.953125, + "learning_rate": 9.709543568464731e-06, + "loss": 0.416, + "step": 971 + }, + { + "epoch": 4.029045643153527, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7173885107040405, + "eval_runtime": 28.4122, + "eval_samples_per_second": 16.965, + "eval_steps_per_second": 2.147, + "step": 971 + }, + { + "epoch": 4.033195020746888, + "grad_norm": 11.8125, + "learning_rate": 9.66804979253112e-06, + "loss": 1.1719, + "step": 972 + }, + { + "epoch": 4.033195020746888, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7187013626098633, + "eval_runtime": 28.5455, + "eval_samples_per_second": 16.885, + "eval_steps_per_second": 2.137, + "step": 972 + }, + { + "epoch": 4.037344398340249, + "grad_norm": 9.375, + "learning_rate": 9.62655601659751e-06, + "loss": 0.7969, + "step": 973 + }, + { + "epoch": 4.037344398340249, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7187175750732422, + "eval_runtime": 28.6499, + "eval_samples_per_second": 16.824, + "eval_steps_per_second": 2.129, + "step": 973 + }, + { + "epoch": 4.04149377593361, + "grad_norm": 7.875, + "learning_rate": 9.585062240663902e-06, + "loss": 0.668, + "step": 974 + }, + { + "epoch": 4.04149377593361, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7183610200881958, + "eval_runtime": 28.6184, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 974 + }, + { + "epoch": 4.045643153526971, + "grad_norm": 10.9375, + "learning_rate": 9.54356846473029e-06, + "loss": 0.5938, + "step": 975 + }, + { + "epoch": 4.045643153526971, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7189444899559021, + "eval_runtime": 28.5825, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 975 + }, + { + "epoch": 4.049792531120332, + "grad_norm": 13.4375, + "learning_rate": 9.50207468879668e-06, + "loss": 0.8516, + "step": 976 + }, + { + "epoch": 4.049792531120332, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7187175750732422, + "eval_runtime": 28.6223, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 976 + }, + { + "epoch": 4.053941908713693, + "grad_norm": 15.3125, + "learning_rate": 9.460580912863071e-06, + "loss": 1.3047, + "step": 977 + }, + { + "epoch": 4.053941908713693, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7185555100440979, + "eval_runtime": 28.6109, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.132, + "step": 977 + }, + { + "epoch": 4.058091286307054, + "grad_norm": 21.75, + "learning_rate": 9.419087136929461e-06, + "loss": 0.6172, + "step": 978 + }, + { + "epoch": 4.058091286307054, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7190417647361755, + "eval_runtime": 28.6091, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 978 + }, + { + "epoch": 4.062240663900415, + "grad_norm": 7.09375, + "learning_rate": 9.377593360995851e-06, + "loss": 0.6445, + "step": 979 + }, + { + "epoch": 4.062240663900415, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7180530428886414, + "eval_runtime": 28.6024, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 979 + }, + { + "epoch": 4.066390041493776, + "grad_norm": 15.375, + "learning_rate": 9.33609958506224e-06, + "loss": 0.7031, + "step": 980 + }, + { + "epoch": 4.066390041493776, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7185555100440979, + "eval_runtime": 28.6064, + "eval_samples_per_second": 16.849, + "eval_steps_per_second": 2.132, + "step": 980 + }, + { + "epoch": 4.070539419087137, + "grad_norm": 7.96875, + "learning_rate": 9.294605809128632e-06, + "loss": 0.3965, + "step": 981 + }, + { + "epoch": 4.070539419087137, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7175505757331848, + "eval_runtime": 28.4955, + "eval_samples_per_second": 16.915, + "eval_steps_per_second": 2.141, + "step": 981 + }, + { + "epoch": 4.074688796680498, + "grad_norm": 18.375, + "learning_rate": 9.253112033195022e-06, + "loss": 0.4883, + "step": 982 + }, + { + "epoch": 4.074688796680498, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7180692553520203, + "eval_runtime": 28.5017, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.14, + "step": 982 + }, + { + "epoch": 4.078838174273859, + "grad_norm": 13.1875, + "learning_rate": 9.211618257261412e-06, + "loss": 0.6133, + "step": 983 + }, + { + "epoch": 4.078838174273859, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7174533009529114, + "eval_runtime": 28.4834, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.142, + "step": 983 + }, + { + "epoch": 4.08298755186722, + "grad_norm": 19.375, + "learning_rate": 9.1701244813278e-06, + "loss": 0.7344, + "step": 984 + }, + { + "epoch": 4.08298755186722, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7173560857772827, + "eval_runtime": 28.4094, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.147, + "step": 984 + }, + { + "epoch": 4.087136929460581, + "grad_norm": 20.125, + "learning_rate": 9.128630705394191e-06, + "loss": 1.3281, + "step": 985 + }, + { + "epoch": 4.087136929460581, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7180044054985046, + "eval_runtime": 28.396, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.148, + "step": 985 + }, + { + "epoch": 4.091286307053942, + "grad_norm": 362.0, + "learning_rate": 9.087136929460581e-06, + "loss": 0.7539, + "step": 986 + }, + { + "epoch": 4.091286307053942, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7172912359237671, + "eval_runtime": 28.4459, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 986 + }, + { + "epoch": 4.095435684647303, + "grad_norm": 7.875, + "learning_rate": 9.045643153526971e-06, + "loss": 0.8594, + "step": 987 + }, + { + "epoch": 4.095435684647303, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7170805335044861, + "eval_runtime": 28.3833, + "eval_samples_per_second": 16.982, + "eval_steps_per_second": 2.149, + "step": 987 + }, + { + "epoch": 4.0995850622406635, + "grad_norm": 9.25, + "learning_rate": 9.00414937759336e-06, + "loss": 0.7656, + "step": 988 + }, + { + "epoch": 4.0995850622406635, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7159945368766785, + "eval_runtime": 28.4393, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.145, + "step": 988 + }, + { + "epoch": 4.1037344398340245, + "grad_norm": 12.5625, + "learning_rate": 8.962655601659752e-06, + "loss": 1.1562, + "step": 989 + }, + { + "epoch": 4.1037344398340245, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7160269618034363, + "eval_runtime": 28.436, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.145, + "step": 989 + }, + { + "epoch": 4.1078838174273855, + "grad_norm": 12.5625, + "learning_rate": 8.921161825726142e-06, + "loss": 0.8203, + "step": 990 + }, + { + "epoch": 4.1078838174273855, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7152489423751831, + "eval_runtime": 28.3813, + "eval_samples_per_second": 16.983, + "eval_steps_per_second": 2.149, + "step": 990 + }, + { + "epoch": 4.1120331950207465, + "grad_norm": 26.375, + "learning_rate": 8.879668049792532e-06, + "loss": 1.0312, + "step": 991 + }, + { + "epoch": 4.1120331950207465, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7145357728004456, + "eval_runtime": 28.3753, + "eval_samples_per_second": 16.987, + "eval_steps_per_second": 2.15, + "step": 991 + }, + { + "epoch": 4.1161825726141075, + "grad_norm": 18.875, + "learning_rate": 8.838174273858922e-06, + "loss": 0.5391, + "step": 992 + }, + { + "epoch": 4.1161825726141075, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7146978974342346, + "eval_runtime": 28.4776, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 992 + }, + { + "epoch": 4.1203319502074685, + "grad_norm": 9.5625, + "learning_rate": 8.796680497925313e-06, + "loss": 0.75, + "step": 993 + }, + { + "epoch": 4.1203319502074685, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7143899202346802, + "eval_runtime": 28.5908, + "eval_samples_per_second": 16.859, + "eval_steps_per_second": 2.134, + "step": 993 + }, + { + "epoch": 4.124481327800829, + "grad_norm": 6.125, + "learning_rate": 8.755186721991701e-06, + "loss": 0.6172, + "step": 994 + }, + { + "epoch": 4.124481327800829, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7148275375366211, + "eval_runtime": 28.6024, + "eval_samples_per_second": 16.852, + "eval_steps_per_second": 2.133, + "step": 994 + }, + { + "epoch": 4.12863070539419, + "grad_norm": 12.0, + "learning_rate": 8.713692946058091e-06, + "loss": 0.8516, + "step": 995 + }, + { + "epoch": 4.12863070539419, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7143737077713013, + "eval_runtime": 28.6798, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.127, + "step": 995 + }, + { + "epoch": 4.132780082987552, + "grad_norm": 5.875, + "learning_rate": 8.67219917012448e-06, + "loss": 0.5898, + "step": 996 + }, + { + "epoch": 4.132780082987552, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.713822603225708, + "eval_runtime": 28.6252, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 996 + }, + { + "epoch": 4.136929460580913, + "grad_norm": 11.25, + "learning_rate": 8.630705394190872e-06, + "loss": 1.1562, + "step": 997 + }, + { + "epoch": 4.136929460580913, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7136281132698059, + "eval_runtime": 28.6219, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 997 + }, + { + "epoch": 4.141078838174274, + "grad_norm": 6.3125, + "learning_rate": 8.589211618257262e-06, + "loss": 0.2354, + "step": 998 + }, + { + "epoch": 4.141078838174274, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7138874530792236, + "eval_runtime": 28.5792, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 998 + }, + { + "epoch": 4.145228215767635, + "grad_norm": 15.125, + "learning_rate": 8.547717842323652e-06, + "loss": 0.9258, + "step": 999 + }, + { + "epoch": 4.145228215767635, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7135308384895325, + "eval_runtime": 28.5446, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 999 + }, + { + "epoch": 4.149377593360996, + "grad_norm": 6.375, + "learning_rate": 8.506224066390042e-06, + "loss": 0.5156, + "step": 1000 + }, + { + "epoch": 4.149377593360996, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7133525609970093, + "eval_runtime": 28.4449, + "eval_samples_per_second": 16.945, + "eval_steps_per_second": 2.144, + "step": 1000 + }, + { + "epoch": 4.153526970954357, + "grad_norm": 4.84375, + "learning_rate": 8.464730290456433e-06, + "loss": 0.7188, + "step": 1001 + }, + { + "epoch": 4.153526970954357, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7127852439880371, + "eval_runtime": 28.3184, + "eval_samples_per_second": 17.021, + "eval_steps_per_second": 2.154, + "step": 1001 + }, + { + "epoch": 4.157676348547718, + "grad_norm": 18.375, + "learning_rate": 8.423236514522823e-06, + "loss": 0.9062, + "step": 1002 + }, + { + "epoch": 4.157676348547718, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7127852439880371, + "eval_runtime": 28.4372, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.145, + "step": 1002 + }, + { + "epoch": 4.161825726141079, + "grad_norm": 12.4375, + "learning_rate": 8.381742738589211e-06, + "loss": 1.375, + "step": 1003 + }, + { + "epoch": 4.161825726141079, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7130770087242126, + "eval_runtime": 28.5889, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 1003 + }, + { + "epoch": 4.16597510373444, + "grad_norm": 6.375, + "learning_rate": 8.340248962655602e-06, + "loss": 0.3867, + "step": 1004 + }, + { + "epoch": 4.16597510373444, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7116506695747375, + "eval_runtime": 28.6454, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 1004 + }, + { + "epoch": 4.170124481327801, + "grad_norm": 5.40625, + "learning_rate": 8.298755186721992e-06, + "loss": 0.2461, + "step": 1005 + }, + { + "epoch": 4.170124481327801, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7115533947944641, + "eval_runtime": 28.6166, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1005 + }, + { + "epoch": 4.174273858921162, + "grad_norm": 6.59375, + "learning_rate": 8.257261410788382e-06, + "loss": 0.5117, + "step": 1006 + }, + { + "epoch": 4.174273858921162, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7115858197212219, + "eval_runtime": 28.6225, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 1006 + }, + { + "epoch": 4.178423236514523, + "grad_norm": 9.875, + "learning_rate": 8.215767634854772e-06, + "loss": 0.418, + "step": 1007 + }, + { + "epoch": 4.178423236514523, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7119424343109131, + "eval_runtime": 28.623, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 1007 + }, + { + "epoch": 4.182572614107884, + "grad_norm": 12.6875, + "learning_rate": 8.174273858921162e-06, + "loss": 1.1641, + "step": 1008 + }, + { + "epoch": 4.182572614107884, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7113751173019409, + "eval_runtime": 28.6447, + "eval_samples_per_second": 16.827, + "eval_steps_per_second": 2.13, + "step": 1008 + }, + { + "epoch": 4.186721991701245, + "grad_norm": 9.125, + "learning_rate": 8.132780082987553e-06, + "loss": 0.582, + "step": 1009 + }, + { + "epoch": 4.186721991701245, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.710516095161438, + "eval_runtime": 28.5211, + "eval_samples_per_second": 16.9, + "eval_steps_per_second": 2.139, + "step": 1009 + }, + { + "epoch": 4.190871369294606, + "grad_norm": 9.125, + "learning_rate": 8.091286307053943e-06, + "loss": 0.4375, + "step": 1010 + }, + { + "epoch": 4.190871369294606, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7115533947944641, + "eval_runtime": 28.5066, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.14, + "step": 1010 + }, + { + "epoch": 4.195020746887967, + "grad_norm": 9.0, + "learning_rate": 8.049792531120333e-06, + "loss": 0.6953, + "step": 1011 + }, + { + "epoch": 4.195020746887967, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7114237546920776, + "eval_runtime": 28.5915, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.134, + "step": 1011 + }, + { + "epoch": 4.199170124481328, + "grad_norm": 10.875, + "learning_rate": 8.008298755186722e-06, + "loss": 1.1719, + "step": 1012 + }, + { + "epoch": 4.199170124481328, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7106133103370667, + "eval_runtime": 28.6222, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 1012 + }, + { + "epoch": 4.203319502074689, + "grad_norm": 4.25, + "learning_rate": 7.966804979253112e-06, + "loss": 0.5195, + "step": 1013 + }, + { + "epoch": 4.203319502074689, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7106781601905823, + "eval_runtime": 28.6673, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.128, + "step": 1013 + }, + { + "epoch": 4.20746887966805, + "grad_norm": 10.3125, + "learning_rate": 7.925311203319502e-06, + "loss": 0.8164, + "step": 1014 + }, + { + "epoch": 4.20746887966805, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7118775844573975, + "eval_runtime": 28.6126, + "eval_samples_per_second": 16.846, + "eval_steps_per_second": 2.132, + "step": 1014 + }, + { + "epoch": 4.211618257261411, + "grad_norm": 4.9375, + "learning_rate": 7.883817427385892e-06, + "loss": 0.5117, + "step": 1015 + }, + { + "epoch": 4.211618257261411, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7105808854103088, + "eval_runtime": 28.6039, + "eval_samples_per_second": 16.851, + "eval_steps_per_second": 2.133, + "step": 1015 + }, + { + "epoch": 4.215767634854772, + "grad_norm": 25.375, + "learning_rate": 7.842323651452283e-06, + "loss": 0.707, + "step": 1016 + }, + { + "epoch": 4.215767634854772, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7105970978736877, + "eval_runtime": 28.5961, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 1016 + }, + { + "epoch": 4.219917012448133, + "grad_norm": 6.1875, + "learning_rate": 7.800829875518673e-06, + "loss": 0.6836, + "step": 1017 + }, + { + "epoch": 4.219917012448133, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7106943726539612, + "eval_runtime": 28.4804, + "eval_samples_per_second": 16.924, + "eval_steps_per_second": 2.142, + "step": 1017 + }, + { + "epoch": 4.224066390041494, + "grad_norm": 16.5, + "learning_rate": 7.759336099585063e-06, + "loss": 0.9336, + "step": 1018 + }, + { + "epoch": 4.224066390041494, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7108078598976135, + "eval_runtime": 28.4396, + "eval_samples_per_second": 16.948, + "eval_steps_per_second": 2.145, + "step": 1018 + }, + { + "epoch": 4.228215767634855, + "grad_norm": 12.6875, + "learning_rate": 7.717842323651453e-06, + "loss": 0.3848, + "step": 1019 + }, + { + "epoch": 4.228215767634855, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7112616896629333, + "eval_runtime": 28.5164, + "eval_samples_per_second": 16.903, + "eval_steps_per_second": 2.139, + "step": 1019 + }, + { + "epoch": 4.232365145228216, + "grad_norm": 6.6875, + "learning_rate": 7.676348547717842e-06, + "loss": 0.3438, + "step": 1020 + }, + { + "epoch": 4.232365145228216, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7109050750732422, + "eval_runtime": 28.5624, + "eval_samples_per_second": 16.875, + "eval_steps_per_second": 2.136, + "step": 1020 + }, + { + "epoch": 4.236514522821577, + "grad_norm": 5.375, + "learning_rate": 7.634854771784234e-06, + "loss": 0.5547, + "step": 1021 + }, + { + "epoch": 4.236514522821577, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.711358904838562, + "eval_runtime": 28.6098, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.132, + "step": 1021 + }, + { + "epoch": 4.240663900414938, + "grad_norm": 32.75, + "learning_rate": 7.593360995850622e-06, + "loss": 0.6406, + "step": 1022 + }, + { + "epoch": 4.240663900414938, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7106619477272034, + "eval_runtime": 28.6237, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 1022 + }, + { + "epoch": 4.244813278008299, + "grad_norm": 33.75, + "learning_rate": 7.5518672199170125e-06, + "loss": 0.7891, + "step": 1023 + }, + { + "epoch": 4.244813278008299, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7097380757331848, + "eval_runtime": 28.6779, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.127, + "step": 1023 + }, + { + "epoch": 4.24896265560166, + "grad_norm": 11.25, + "learning_rate": 7.510373443983402e-06, + "loss": 0.7461, + "step": 1024 + }, + { + "epoch": 4.24896265560166, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7106457352638245, + "eval_runtime": 28.6159, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 1024 + }, + { + "epoch": 4.253112033195021, + "grad_norm": 7.0, + "learning_rate": 7.468879668049793e-06, + "loss": 0.7344, + "step": 1025 + }, + { + "epoch": 4.253112033195021, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7097542881965637, + "eval_runtime": 28.6142, + "eval_samples_per_second": 16.845, + "eval_steps_per_second": 2.132, + "step": 1025 + }, + { + "epoch": 4.257261410788382, + "grad_norm": 6.15625, + "learning_rate": 7.427385892116183e-06, + "loss": 0.5898, + "step": 1026 + }, + { + "epoch": 4.257261410788382, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7102567553520203, + "eval_runtime": 28.5441, + "eval_samples_per_second": 16.886, + "eval_steps_per_second": 2.137, + "step": 1026 + }, + { + "epoch": 4.261410788381743, + "grad_norm": 8.4375, + "learning_rate": 7.385892116182573e-06, + "loss": 0.5703, + "step": 1027 + }, + { + "epoch": 4.261410788381743, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7109212875366211, + "eval_runtime": 28.4916, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 1027 + }, + { + "epoch": 4.265560165975104, + "grad_norm": 15.9375, + "learning_rate": 7.344398340248963e-06, + "loss": 0.4531, + "step": 1028 + }, + { + "epoch": 4.265560165975104, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7111806273460388, + "eval_runtime": 28.4782, + "eval_samples_per_second": 16.925, + "eval_steps_per_second": 2.142, + "step": 1028 + }, + { + "epoch": 4.269709543568465, + "grad_norm": 4.25, + "learning_rate": 7.302904564315354e-06, + "loss": 0.3438, + "step": 1029 + }, + { + "epoch": 4.269709543568465, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7108240127563477, + "eval_runtime": 28.5131, + "eval_samples_per_second": 16.905, + "eval_steps_per_second": 2.139, + "step": 1029 + }, + { + "epoch": 4.273858921161826, + "grad_norm": 20.5, + "learning_rate": 7.2614107883817436e-06, + "loss": 0.5508, + "step": 1030 + }, + { + "epoch": 4.273858921161826, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.709284245967865, + "eval_runtime": 28.5916, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 1030 + }, + { + "epoch": 4.278008298755187, + "grad_norm": 4.875, + "learning_rate": 7.2199170124481325e-06, + "loss": 0.5352, + "step": 1031 + }, + { + "epoch": 4.278008298755187, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7105647325515747, + "eval_runtime": 28.6562, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.129, + "step": 1031 + }, + { + "epoch": 4.282157676348548, + "grad_norm": 3.765625, + "learning_rate": 7.178423236514522e-06, + "loss": 0.2715, + "step": 1032 + }, + { + "epoch": 4.282157676348548, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.710726797580719, + "eval_runtime": 28.618, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1032 + }, + { + "epoch": 4.286307053941909, + "grad_norm": 4.78125, + "learning_rate": 7.136929460580913e-06, + "loss": 0.5039, + "step": 1033 + }, + { + "epoch": 4.286307053941909, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7114723920822144, + "eval_runtime": 28.6235, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 1033 + }, + { + "epoch": 4.29045643153527, + "grad_norm": 7.1875, + "learning_rate": 7.095435684647303e-06, + "loss": 0.7383, + "step": 1034 + }, + { + "epoch": 4.29045643153527, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7110347747802734, + "eval_runtime": 28.6205, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 1034 + }, + { + "epoch": 4.2946058091286305, + "grad_norm": 6.09375, + "learning_rate": 7.053941908713693e-06, + "loss": 0.8047, + "step": 1035 + }, + { + "epoch": 4.2946058091286305, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7111157774925232, + "eval_runtime": 28.5816, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.134, + "step": 1035 + }, + { + "epoch": 4.2987551867219915, + "grad_norm": 4.09375, + "learning_rate": 7.012448132780083e-06, + "loss": 0.4414, + "step": 1036 + }, + { + "epoch": 4.2987551867219915, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7119100093841553, + "eval_runtime": 28.5025, + "eval_samples_per_second": 16.911, + "eval_steps_per_second": 2.14, + "step": 1036 + }, + { + "epoch": 4.3029045643153525, + "grad_norm": 9.75, + "learning_rate": 6.970954356846474e-06, + "loss": 0.6016, + "step": 1037 + }, + { + "epoch": 4.3029045643153525, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7108078598976135, + "eval_runtime": 28.4926, + "eval_samples_per_second": 16.917, + "eval_steps_per_second": 2.141, + "step": 1037 + }, + { + "epoch": 4.3070539419087135, + "grad_norm": 23.375, + "learning_rate": 6.9294605809128635e-06, + "loss": 1.3438, + "step": 1038 + }, + { + "epoch": 4.3070539419087135, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7107430100440979, + "eval_runtime": 28.4149, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 1038 + }, + { + "epoch": 4.3112033195020745, + "grad_norm": 5.0, + "learning_rate": 6.887966804979254e-06, + "loss": 0.5938, + "step": 1039 + }, + { + "epoch": 4.3112033195020745, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7123152017593384, + "eval_runtime": 28.386, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.149, + "step": 1039 + }, + { + "epoch": 4.3153526970954355, + "grad_norm": 12.375, + "learning_rate": 6.846473029045644e-06, + "loss": 0.6094, + "step": 1040 + }, + { + "epoch": 4.3153526970954355, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7104026079177856, + "eval_runtime": 28.3804, + "eval_samples_per_second": 16.984, + "eval_steps_per_second": 2.149, + "step": 1040 + }, + { + "epoch": 4.319502074688796, + "grad_norm": 9.375, + "learning_rate": 6.804979253112033e-06, + "loss": 0.6328, + "step": 1041 + }, + { + "epoch": 4.319502074688796, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7119100093841553, + "eval_runtime": 28.4258, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 1041 + }, + { + "epoch": 4.323651452282157, + "grad_norm": 9.5, + "learning_rate": 6.7634854771784235e-06, + "loss": 0.7852, + "step": 1042 + }, + { + "epoch": 4.323651452282157, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7112292647361755, + "eval_runtime": 28.42, + "eval_samples_per_second": 16.96, + "eval_steps_per_second": 2.146, + "step": 1042 + }, + { + "epoch": 4.327800829875518, + "grad_norm": 4.65625, + "learning_rate": 6.721991701244813e-06, + "loss": 0.4922, + "step": 1043 + }, + { + "epoch": 4.327800829875518, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7125259041786194, + "eval_runtime": 28.5379, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 1043 + }, + { + "epoch": 4.331950207468879, + "grad_norm": 7.625, + "learning_rate": 6.680497925311203e-06, + "loss": 0.9492, + "step": 1044 + }, + { + "epoch": 4.331950207468879, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7119100093841553, + "eval_runtime": 28.647, + "eval_samples_per_second": 16.825, + "eval_steps_per_second": 2.129, + "step": 1044 + }, + { + "epoch": 4.33609958506224, + "grad_norm": 5.90625, + "learning_rate": 6.639004149377594e-06, + "loss": 0.7227, + "step": 1045 + }, + { + "epoch": 4.33609958506224, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7107105851173401, + "eval_runtime": 28.6148, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 1045 + }, + { + "epoch": 4.340248962655601, + "grad_norm": 14.125, + "learning_rate": 6.5975103734439835e-06, + "loss": 0.7305, + "step": 1046 + }, + { + "epoch": 4.340248962655601, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7123638391494751, + "eval_runtime": 28.6201, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 1046 + }, + { + "epoch": 4.344398340248962, + "grad_norm": 6.78125, + "learning_rate": 6.556016597510374e-06, + "loss": 0.7695, + "step": 1047 + }, + { + "epoch": 4.344398340248962, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7127852439880371, + "eval_runtime": 28.6173, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1047 + }, + { + "epoch": 4.348547717842323, + "grad_norm": 5.25, + "learning_rate": 6.514522821576764e-06, + "loss": 0.3086, + "step": 1048 + }, + { + "epoch": 4.348547717842323, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7124124765396118, + "eval_runtime": 28.6111, + "eval_samples_per_second": 16.847, + "eval_steps_per_second": 2.132, + "step": 1048 + }, + { + "epoch": 4.352697095435684, + "grad_norm": 17.0, + "learning_rate": 6.4730290456431546e-06, + "loss": 0.5586, + "step": 1049 + }, + { + "epoch": 4.352697095435684, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7129311561584473, + "eval_runtime": 28.5476, + "eval_samples_per_second": 16.884, + "eval_steps_per_second": 2.137, + "step": 1049 + }, + { + "epoch": 4.356846473029045, + "grad_norm": 6.1875, + "learning_rate": 6.4315352697095435e-06, + "loss": 0.625, + "step": 1050 + }, + { + "epoch": 4.356846473029045, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7116993069648743, + "eval_runtime": 28.5404, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 1050 + }, + { + "epoch": 4.360995850622406, + "grad_norm": 22.125, + "learning_rate": 6.390041493775933e-06, + "loss": 0.5898, + "step": 1051 + }, + { + "epoch": 4.360995850622406, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7124611139297485, + "eval_runtime": 28.5866, + "eval_samples_per_second": 16.861, + "eval_steps_per_second": 2.134, + "step": 1051 + }, + { + "epoch": 4.365145228215767, + "grad_norm": 5.125, + "learning_rate": 6.348547717842324e-06, + "loss": 0.3613, + "step": 1052 + }, + { + "epoch": 4.365145228215767, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7122017741203308, + "eval_runtime": 28.5095, + "eval_samples_per_second": 16.907, + "eval_steps_per_second": 2.14, + "step": 1052 + }, + { + "epoch": 4.369294605809129, + "grad_norm": 4.53125, + "learning_rate": 6.307053941908714e-06, + "loss": 0.75, + "step": 1053 + }, + { + "epoch": 4.369294605809129, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7123638391494751, + "eval_runtime": 28.6197, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.131, + "step": 1053 + }, + { + "epoch": 4.37344398340249, + "grad_norm": 9.625, + "learning_rate": 6.265560165975104e-06, + "loss": 0.7539, + "step": 1054 + }, + { + "epoch": 4.37344398340249, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7126231789588928, + "eval_runtime": 28.6183, + "eval_samples_per_second": 16.842, + "eval_steps_per_second": 2.132, + "step": 1054 + }, + { + "epoch": 4.377593360995851, + "grad_norm": 34.25, + "learning_rate": 6.224066390041494e-06, + "loss": 0.4922, + "step": 1055 + }, + { + "epoch": 4.377593360995851, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7117803692817688, + "eval_runtime": 28.6315, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 1055 + }, + { + "epoch": 4.381742738589212, + "grad_norm": 10.0625, + "learning_rate": 6.182572614107885e-06, + "loss": 0.5586, + "step": 1056 + }, + { + "epoch": 4.381742738589212, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7126069664955139, + "eval_runtime": 28.6829, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 1056 + }, + { + "epoch": 4.385892116182573, + "grad_norm": 7.125, + "learning_rate": 6.141078838174274e-06, + "loss": 0.6719, + "step": 1057 + }, + { + "epoch": 4.385892116182573, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7128825187683105, + "eval_runtime": 28.6284, + "eval_samples_per_second": 16.836, + "eval_steps_per_second": 2.131, + "step": 1057 + }, + { + "epoch": 4.390041493775934, + "grad_norm": 7.34375, + "learning_rate": 6.099585062240664e-06, + "loss": 0.5469, + "step": 1058 + }, + { + "epoch": 4.390041493775934, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7122179865837097, + "eval_runtime": 28.5683, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 1058 + }, + { + "epoch": 4.394190871369295, + "grad_norm": 25.5, + "learning_rate": 6.058091286307054e-06, + "loss": 0.5859, + "step": 1059 + }, + { + "epoch": 4.394190871369295, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7123962640762329, + "eval_runtime": 28.5417, + "eval_samples_per_second": 16.888, + "eval_steps_per_second": 2.137, + "step": 1059 + }, + { + "epoch": 4.398340248962656, + "grad_norm": 11.5, + "learning_rate": 6.016597510373445e-06, + "loss": 0.6367, + "step": 1060 + }, + { + "epoch": 4.398340248962656, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7136605381965637, + "eval_runtime": 28.4944, + "eval_samples_per_second": 16.916, + "eval_steps_per_second": 2.141, + "step": 1060 + }, + { + "epoch": 4.402489626556017, + "grad_norm": 15.375, + "learning_rate": 5.9751037344398345e-06, + "loss": 0.4336, + "step": 1061 + }, + { + "epoch": 4.402489626556017, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7127204537391663, + "eval_runtime": 28.4219, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.146, + "step": 1061 + }, + { + "epoch": 4.406639004149378, + "grad_norm": 29.125, + "learning_rate": 5.933609958506224e-06, + "loss": 1.0, + "step": 1062 + }, + { + "epoch": 4.406639004149378, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7136605381965637, + "eval_runtime": 28.3893, + "eval_samples_per_second": 16.978, + "eval_steps_per_second": 2.149, + "step": 1062 + }, + { + "epoch": 4.410788381742739, + "grad_norm": 5.21875, + "learning_rate": 5.892116182572614e-06, + "loss": 0.7188, + "step": 1063 + }, + { + "epoch": 4.410788381742739, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7127690315246582, + "eval_runtime": 28.4324, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 1063 + }, + { + "epoch": 4.4149377593361, + "grad_norm": 22.375, + "learning_rate": 5.850622406639005e-06, + "loss": 0.7617, + "step": 1064 + }, + { + "epoch": 4.4149377593361, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7136281132698059, + "eval_runtime": 28.5671, + "eval_samples_per_second": 16.873, + "eval_steps_per_second": 2.135, + "step": 1064 + }, + { + "epoch": 4.419087136929461, + "grad_norm": 4.53125, + "learning_rate": 5.8091286307053945e-06, + "loss": 0.5898, + "step": 1065 + }, + { + "epoch": 4.419087136929461, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.712590754032135, + "eval_runtime": 28.6212, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 1065 + }, + { + "epoch": 4.423236514522822, + "grad_norm": 9.75, + "learning_rate": 5.767634854771785e-06, + "loss": 0.4746, + "step": 1066 + }, + { + "epoch": 4.423236514522822, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7133363485336304, + "eval_runtime": 28.6157, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 1066 + }, + { + "epoch": 4.427385892116183, + "grad_norm": 30.625, + "learning_rate": 5.726141078838174e-06, + "loss": 1.3984, + "step": 1067 + }, + { + "epoch": 4.427385892116183, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7124935388565063, + "eval_runtime": 28.6268, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 1067 + }, + { + "epoch": 4.431535269709544, + "grad_norm": 4.59375, + "learning_rate": 5.684647302904565e-06, + "loss": 0.6172, + "step": 1068 + }, + { + "epoch": 4.431535269709544, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7129149436950684, + "eval_runtime": 28.6841, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 1068 + }, + { + "epoch": 4.435684647302905, + "grad_norm": 11.875, + "learning_rate": 5.6431535269709545e-06, + "loss": 0.6953, + "step": 1069 + }, + { + "epoch": 4.435684647302905, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7140819430351257, + "eval_runtime": 28.6159, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 1069 + }, + { + "epoch": 4.439834024896266, + "grad_norm": 15.125, + "learning_rate": 5.601659751037345e-06, + "loss": 1.0078, + "step": 1070 + }, + { + "epoch": 4.439834024896266, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.5723, + "eval_samples_per_second": 16.869, + "eval_steps_per_second": 2.135, + "step": 1070 + }, + { + "epoch": 4.443983402489627, + "grad_norm": 6.78125, + "learning_rate": 5.560165975103734e-06, + "loss": 0.293, + "step": 1071 + }, + { + "epoch": 4.443983402489627, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.5236, + "eval_samples_per_second": 16.898, + "eval_steps_per_second": 2.139, + "step": 1071 + }, + { + "epoch": 4.448132780082988, + "grad_norm": 6.125, + "learning_rate": 5.518672199170125e-06, + "loss": 0.5117, + "step": 1072 + }, + { + "epoch": 4.448132780082988, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.713190495967865, + "eval_runtime": 28.4466, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 1072 + }, + { + "epoch": 4.452282157676349, + "grad_norm": 23.875, + "learning_rate": 5.4771784232365145e-06, + "loss": 0.8789, + "step": 1073 + }, + { + "epoch": 4.452282157676349, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7130770087242126, + "eval_runtime": 28.4023, + "eval_samples_per_second": 16.97, + "eval_steps_per_second": 2.148, + "step": 1073 + }, + { + "epoch": 4.45643153526971, + "grad_norm": 4.9375, + "learning_rate": 5.435684647302905e-06, + "loss": 0.5352, + "step": 1074 + }, + { + "epoch": 4.45643153526971, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7122827768325806, + "eval_runtime": 28.443, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.145, + "step": 1074 + }, + { + "epoch": 4.460580912863071, + "grad_norm": 5.375, + "learning_rate": 5.394190871369295e-06, + "loss": 0.4863, + "step": 1075 + }, + { + "epoch": 4.460580912863071, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7129960060119629, + "eval_runtime": 28.4412, + "eval_samples_per_second": 16.947, + "eval_steps_per_second": 2.145, + "step": 1075 + }, + { + "epoch": 4.464730290456432, + "grad_norm": 6.125, + "learning_rate": 5.352697095435685e-06, + "loss": 0.3984, + "step": 1076 + }, + { + "epoch": 4.464730290456432, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7129311561584473, + "eval_runtime": 28.5491, + "eval_samples_per_second": 16.883, + "eval_steps_per_second": 2.137, + "step": 1076 + }, + { + "epoch": 4.468879668049793, + "grad_norm": 21.25, + "learning_rate": 5.3112033195020745e-06, + "loss": 0.6602, + "step": 1077 + }, + { + "epoch": 4.468879668049793, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7124611139297485, + "eval_runtime": 28.6155, + "eval_samples_per_second": 16.844, + "eval_steps_per_second": 2.132, + "step": 1077 + }, + { + "epoch": 4.473029045643154, + "grad_norm": 8.0625, + "learning_rate": 5.269709543568465e-06, + "loss": 0.543, + "step": 1078 + }, + { + "epoch": 4.473029045643154, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7132391333580017, + "eval_runtime": 28.6304, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 1078 + }, + { + "epoch": 4.477178423236515, + "grad_norm": 5.5625, + "learning_rate": 5.228215767634855e-06, + "loss": 0.668, + "step": 1079 + }, + { + "epoch": 4.477178423236515, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7131094336509705, + "eval_runtime": 28.6738, + "eval_samples_per_second": 16.81, + "eval_steps_per_second": 2.127, + "step": 1079 + }, + { + "epoch": 4.481327800829876, + "grad_norm": 5.40625, + "learning_rate": 5.1867219917012455e-06, + "loss": 0.7539, + "step": 1080 + }, + { + "epoch": 4.481327800829876, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7124449014663696, + "eval_runtime": 28.6656, + "eval_samples_per_second": 16.815, + "eval_steps_per_second": 2.128, + "step": 1080 + }, + { + "epoch": 4.485477178423237, + "grad_norm": 4.9375, + "learning_rate": 5.1452282157676345e-06, + "loss": 0.4922, + "step": 1081 + }, + { + "epoch": 4.485477178423237, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7133039236068726, + "eval_runtime": 28.6455, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 1081 + }, + { + "epoch": 4.4896265560165975, + "grad_norm": 32.0, + "learning_rate": 5.103734439834025e-06, + "loss": 0.6016, + "step": 1082 + }, + { + "epoch": 4.4896265560165975, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7126718163490295, + "eval_runtime": 28.5167, + "eval_samples_per_second": 16.902, + "eval_steps_per_second": 2.139, + "step": 1082 + }, + { + "epoch": 4.4937759336099585, + "grad_norm": 4.5625, + "learning_rate": 5.062240663900415e-06, + "loss": 0.5703, + "step": 1083 + }, + { + "epoch": 4.4937759336099585, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7125259041786194, + "eval_runtime": 28.4773, + "eval_samples_per_second": 16.926, + "eval_steps_per_second": 2.142, + "step": 1083 + }, + { + "epoch": 4.4979253112033195, + "grad_norm": 11.1875, + "learning_rate": 5.0207468879668055e-06, + "loss": 0.8555, + "step": 1084 + }, + { + "epoch": 4.4979253112033195, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7130122184753418, + "eval_runtime": 28.4273, + "eval_samples_per_second": 16.956, + "eval_steps_per_second": 2.146, + "step": 1084 + }, + { + "epoch": 4.5020746887966805, + "grad_norm": 8.875, + "learning_rate": 4.979253112033195e-06, + "loss": 0.4492, + "step": 1085 + }, + { + "epoch": 4.5020746887966805, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7130770087242126, + "eval_runtime": 28.3963, + "eval_samples_per_second": 16.974, + "eval_steps_per_second": 2.148, + "step": 1085 + }, + { + "epoch": 4.5062240663900415, + "grad_norm": 8.0625, + "learning_rate": 4.937759336099585e-06, + "loss": 0.5117, + "step": 1086 + }, + { + "epoch": 4.5062240663900415, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7138550281524658, + "eval_runtime": 28.3861, + "eval_samples_per_second": 16.98, + "eval_steps_per_second": 2.149, + "step": 1086 + }, + { + "epoch": 4.5103734439834025, + "grad_norm": 6.40625, + "learning_rate": 4.896265560165975e-06, + "loss": 0.5859, + "step": 1087 + }, + { + "epoch": 4.5103734439834025, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7135308384895325, + "eval_runtime": 28.4826, + "eval_samples_per_second": 16.923, + "eval_steps_per_second": 2.142, + "step": 1087 + }, + { + "epoch": 4.514522821576763, + "grad_norm": 5.65625, + "learning_rate": 4.8547717842323655e-06, + "loss": 0.7578, + "step": 1088 + }, + { + "epoch": 4.514522821576763, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7129473686218262, + "eval_runtime": 28.5707, + "eval_samples_per_second": 16.87, + "eval_steps_per_second": 2.135, + "step": 1088 + }, + { + "epoch": 4.518672199170124, + "grad_norm": 5.65625, + "learning_rate": 4.813278008298755e-06, + "loss": 0.2578, + "step": 1089 + }, + { + "epoch": 4.518672199170124, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7147626876831055, + "eval_runtime": 28.6375, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 1089 + }, + { + "epoch": 4.522821576763485, + "grad_norm": 14.5, + "learning_rate": 4.771784232365145e-06, + "loss": 0.9766, + "step": 1090 + }, + { + "epoch": 4.522821576763485, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.713401198387146, + "eval_runtime": 28.6568, + "eval_samples_per_second": 16.82, + "eval_steps_per_second": 2.129, + "step": 1090 + }, + { + "epoch": 4.526970954356846, + "grad_norm": 9.0, + "learning_rate": 4.730290456431536e-06, + "loss": 0.7695, + "step": 1091 + }, + { + "epoch": 4.526970954356846, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.714033305644989, + "eval_runtime": 28.653, + "eval_samples_per_second": 16.822, + "eval_steps_per_second": 2.129, + "step": 1091 + }, + { + "epoch": 4.531120331950207, + "grad_norm": 5.25, + "learning_rate": 4.6887966804979255e-06, + "loss": 0.7617, + "step": 1092 + }, + { + "epoch": 4.531120331950207, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7132067084312439, + "eval_runtime": 28.6307, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 1092 + }, + { + "epoch": 4.535269709543568, + "grad_norm": 15.9375, + "learning_rate": 4.647302904564316e-06, + "loss": 0.8477, + "step": 1093 + }, + { + "epoch": 4.535269709543568, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.713611900806427, + "eval_runtime": 28.5793, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 1093 + }, + { + "epoch": 4.539419087136929, + "grad_norm": 10.8125, + "learning_rate": 4.605809128630706e-06, + "loss": 0.7305, + "step": 1094 + }, + { + "epoch": 4.539419087136929, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7131742835044861, + "eval_runtime": 28.5261, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 1094 + }, + { + "epoch": 4.54356846473029, + "grad_norm": 6.71875, + "learning_rate": 4.564315352697096e-06, + "loss": 0.3496, + "step": 1095 + }, + { + "epoch": 4.54356846473029, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7126231789588928, + "eval_runtime": 28.4211, + "eval_samples_per_second": 16.959, + "eval_steps_per_second": 2.146, + "step": 1095 + }, + { + "epoch": 4.547717842323651, + "grad_norm": 12.875, + "learning_rate": 4.5228215767634855e-06, + "loss": 0.5859, + "step": 1096 + }, + { + "epoch": 4.547717842323651, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7133525609970093, + "eval_runtime": 28.4315, + "eval_samples_per_second": 16.953, + "eval_steps_per_second": 2.146, + "step": 1096 + }, + { + "epoch": 4.551867219917012, + "grad_norm": 9.0, + "learning_rate": 4.481327800829876e-06, + "loss": 0.7031, + "step": 1097 + }, + { + "epoch": 4.551867219917012, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7126880288124084, + "eval_runtime": 28.4348, + "eval_samples_per_second": 16.951, + "eval_steps_per_second": 2.145, + "step": 1097 + }, + { + "epoch": 4.556016597510373, + "grad_norm": 5.96875, + "learning_rate": 4.439834024896266e-06, + "loss": 0.4863, + "step": 1098 + }, + { + "epoch": 4.556016597510373, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7127528786659241, + "eval_runtime": 28.621, + "eval_samples_per_second": 16.841, + "eval_steps_per_second": 2.131, + "step": 1098 + }, + { + "epoch": 4.560165975103734, + "grad_norm": 5.78125, + "learning_rate": 4.3983402489626565e-06, + "loss": 0.5898, + "step": 1099 + }, + { + "epoch": 4.560165975103734, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7124286890029907, + "eval_runtime": 28.6714, + "eval_samples_per_second": 16.811, + "eval_steps_per_second": 2.128, + "step": 1099 + }, + { + "epoch": 4.564315352697095, + "grad_norm": 13.5625, + "learning_rate": 4.3568464730290455e-06, + "loss": 0.4883, + "step": 1100 + }, + { + "epoch": 4.564315352697095, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7130445837974548, + "eval_runtime": 28.6378, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 1100 + }, + { + "epoch": 4.568464730290456, + "grad_norm": 5.84375, + "learning_rate": 4.315352697095436e-06, + "loss": 0.4336, + "step": 1101 + }, + { + "epoch": 4.568464730290456, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7132067084312439, + "eval_runtime": 28.555, + "eval_samples_per_second": 16.88, + "eval_steps_per_second": 2.136, + "step": 1101 + }, + { + "epoch": 4.572614107883817, + "grad_norm": 12.875, + "learning_rate": 4.273858921161826e-06, + "loss": 0.4023, + "step": 1102 + }, + { + "epoch": 4.572614107883817, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7130122184753418, + "eval_runtime": 28.4459, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 1102 + }, + { + "epoch": 4.576763485477178, + "grad_norm": 3.890625, + "learning_rate": 4.2323651452282165e-06, + "loss": 0.3887, + "step": 1103 + }, + { + "epoch": 4.576763485477178, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7133201360702515, + "eval_runtime": 28.6224, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 1103 + }, + { + "epoch": 4.580912863070539, + "grad_norm": 13.5, + "learning_rate": 4.1908713692946055e-06, + "loss": 0.8555, + "step": 1104 + }, + { + "epoch": 4.580912863070539, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7141467928886414, + "eval_runtime": 28.6804, + "eval_samples_per_second": 16.806, + "eval_steps_per_second": 2.127, + "step": 1104 + }, + { + "epoch": 4.5850622406639, + "grad_norm": 11.5, + "learning_rate": 4.149377593360996e-06, + "loss": 0.5586, + "step": 1105 + }, + { + "epoch": 4.5850622406639, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7132067084312439, + "eval_runtime": 28.6377, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 1105 + }, + { + "epoch": 4.589211618257261, + "grad_norm": 6.5625, + "learning_rate": 4.107883817427386e-06, + "loss": 0.4824, + "step": 1106 + }, + { + "epoch": 4.589211618257261, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7127690315246582, + "eval_runtime": 28.6277, + "eval_samples_per_second": 16.837, + "eval_steps_per_second": 2.131, + "step": 1106 + }, + { + "epoch": 4.593360995850622, + "grad_norm": 14.9375, + "learning_rate": 4.0663900414937765e-06, + "loss": 0.5664, + "step": 1107 + }, + { + "epoch": 4.593360995850622, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.6704, + "eval_samples_per_second": 16.812, + "eval_steps_per_second": 2.128, + "step": 1107 + }, + { + "epoch": 4.597510373443983, + "grad_norm": 17.375, + "learning_rate": 4.024896265560166e-06, + "loss": 0.8672, + "step": 1108 + }, + { + "epoch": 4.597510373443983, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7140495181083679, + "eval_runtime": 28.5883, + "eval_samples_per_second": 16.86, + "eval_steps_per_second": 2.134, + "step": 1108 + }, + { + "epoch": 4.601659751037344, + "grad_norm": 20.75, + "learning_rate": 3.983402489626556e-06, + "loss": 0.4746, + "step": 1109 + }, + { + "epoch": 4.601659751037344, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7132391333580017, + "eval_runtime": 28.4477, + "eval_samples_per_second": 16.943, + "eval_steps_per_second": 2.144, + "step": 1109 + }, + { + "epoch": 4.605809128630705, + "grad_norm": 14.25, + "learning_rate": 3.941908713692946e-06, + "loss": 0.6836, + "step": 1110 + }, + { + "epoch": 4.605809128630705, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7134660482406616, + "eval_runtime": 28.3911, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.149, + "step": 1110 + }, + { + "epoch": 4.609958506224066, + "grad_norm": 13.5, + "learning_rate": 3.9004149377593365e-06, + "loss": 0.4551, + "step": 1111 + }, + { + "epoch": 4.609958506224066, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7138388156890869, + "eval_runtime": 28.3916, + "eval_samples_per_second": 16.977, + "eval_steps_per_second": 2.149, + "step": 1111 + }, + { + "epoch": 4.614107883817427, + "grad_norm": 5.8125, + "learning_rate": 3.858921161825726e-06, + "loss": 0.5625, + "step": 1112 + }, + { + "epoch": 4.614107883817427, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7141467928886414, + "eval_runtime": 28.5373, + "eval_samples_per_second": 16.89, + "eval_steps_per_second": 2.138, + "step": 1112 + }, + { + "epoch": 4.618257261410788, + "grad_norm": 9.0625, + "learning_rate": 3.817427385892117e-06, + "loss": 0.5195, + "step": 1113 + }, + { + "epoch": 4.618257261410788, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7132714986801147, + "eval_runtime": 28.6096, + "eval_samples_per_second": 16.848, + "eval_steps_per_second": 2.132, + "step": 1113 + }, + { + "epoch": 4.622406639004149, + "grad_norm": 10.9375, + "learning_rate": 3.7759336099585063e-06, + "loss": 0.707, + "step": 1114 + }, + { + "epoch": 4.622406639004149, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7127204537391663, + "eval_runtime": 28.6664, + "eval_samples_per_second": 16.814, + "eval_steps_per_second": 2.128, + "step": 1114 + }, + { + "epoch": 4.62655601659751, + "grad_norm": 24.625, + "learning_rate": 3.7344398340248965e-06, + "loss": 0.8047, + "step": 1115 + }, + { + "epoch": 4.62655601659751, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7138712406158447, + "eval_runtime": 28.6881, + "eval_samples_per_second": 16.801, + "eval_steps_per_second": 2.126, + "step": 1115 + }, + { + "epoch": 4.630705394190871, + "grad_norm": 7.1875, + "learning_rate": 3.6929460580912867e-06, + "loss": 0.7969, + "step": 1116 + }, + { + "epoch": 4.630705394190871, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7129149436950684, + "eval_runtime": 28.6939, + "eval_samples_per_second": 16.798, + "eval_steps_per_second": 2.126, + "step": 1116 + }, + { + "epoch": 4.634854771784232, + "grad_norm": 5.0625, + "learning_rate": 3.651452282157677e-06, + "loss": 0.2021, + "step": 1117 + }, + { + "epoch": 4.634854771784232, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7134822607040405, + "eval_runtime": 28.5825, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 1117 + }, + { + "epoch": 4.639004149377593, + "grad_norm": 4.875, + "learning_rate": 3.6099585062240663e-06, + "loss": 0.4492, + "step": 1118 + }, + { + "epoch": 4.639004149377593, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7122341990470886, + "eval_runtime": 28.5568, + "eval_samples_per_second": 16.879, + "eval_steps_per_second": 2.136, + "step": 1118 + }, + { + "epoch": 4.643153526970955, + "grad_norm": 6.3125, + "learning_rate": 3.5684647302904565e-06, + "loss": 0.5273, + "step": 1119 + }, + { + "epoch": 4.643153526970955, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7143574953079224, + "eval_runtime": 28.4607, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 1119 + }, + { + "epoch": 4.647302904564316, + "grad_norm": 9.3125, + "learning_rate": 3.5269709543568467e-06, + "loss": 0.8711, + "step": 1120 + }, + { + "epoch": 4.647302904564316, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.4332, + "eval_samples_per_second": 16.952, + "eval_steps_per_second": 2.145, + "step": 1120 + }, + { + "epoch": 4.651452282157677, + "grad_norm": 10.25, + "learning_rate": 3.485477178423237e-06, + "loss": 0.5312, + "step": 1121 + }, + { + "epoch": 4.651452282157677, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7133039236068726, + "eval_runtime": 28.4658, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.143, + "step": 1121 + }, + { + "epoch": 4.655601659751038, + "grad_norm": 9.3125, + "learning_rate": 3.443983402489627e-06, + "loss": 0.8906, + "step": 1122 + }, + { + "epoch": 4.655601659751038, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.4529, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.144, + "step": 1122 + }, + { + "epoch": 4.659751037344399, + "grad_norm": 20.75, + "learning_rate": 3.4024896265560165e-06, + "loss": 0.875, + "step": 1123 + }, + { + "epoch": 4.659751037344399, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7139198780059814, + "eval_runtime": 28.4536, + "eval_samples_per_second": 16.94, + "eval_steps_per_second": 2.144, + "step": 1123 + }, + { + "epoch": 4.66390041493776, + "grad_norm": 15.6875, + "learning_rate": 3.3609958506224067e-06, + "loss": 0.9648, + "step": 1124 + }, + { + "epoch": 4.66390041493776, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.712801456451416, + "eval_runtime": 28.5825, + "eval_samples_per_second": 16.863, + "eval_steps_per_second": 2.134, + "step": 1124 + }, + { + "epoch": 4.668049792531121, + "grad_norm": 6.09375, + "learning_rate": 3.319502074688797e-06, + "loss": 0.4082, + "step": 1125 + }, + { + "epoch": 4.668049792531121, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7130283713340759, + "eval_runtime": 28.6264, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 1125 + }, + { + "epoch": 4.672199170124482, + "grad_norm": 5.96875, + "learning_rate": 3.278008298755187e-06, + "loss": 0.582, + "step": 1126 + }, + { + "epoch": 4.672199170124482, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7139847278594971, + "eval_runtime": 28.6324, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 1126 + }, + { + "epoch": 4.676348547717843, + "grad_norm": 5.4375, + "learning_rate": 3.2365145228215773e-06, + "loss": 0.4336, + "step": 1127 + }, + { + "epoch": 4.676348547717843, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7136605381965637, + "eval_runtime": 28.6179, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1127 + }, + { + "epoch": 4.680497925311204, + "grad_norm": 3.0625, + "learning_rate": 3.1950207468879666e-06, + "loss": 0.2266, + "step": 1128 + }, + { + "epoch": 4.680497925311204, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7141467928886414, + "eval_runtime": 28.6251, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 1128 + }, + { + "epoch": 4.6846473029045645, + "grad_norm": 5.09375, + "learning_rate": 3.153526970954357e-06, + "loss": 0.4609, + "step": 1129 + }, + { + "epoch": 4.6846473029045645, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7133039236068726, + "eval_runtime": 28.5919, + "eval_samples_per_second": 16.858, + "eval_steps_per_second": 2.133, + "step": 1129 + }, + { + "epoch": 4.6887966804979255, + "grad_norm": 9.8125, + "learning_rate": 3.112033195020747e-06, + "loss": 1.0312, + "step": 1130 + }, + { + "epoch": 4.6887966804979255, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7139036655426025, + "eval_runtime": 28.5591, + "eval_samples_per_second": 16.877, + "eval_steps_per_second": 2.136, + "step": 1130 + }, + { + "epoch": 4.6929460580912865, + "grad_norm": 11.1875, + "learning_rate": 3.070539419087137e-06, + "loss": 0.7422, + "step": 1131 + }, + { + "epoch": 4.6929460580912865, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7124286890029907, + "eval_runtime": 28.4493, + "eval_samples_per_second": 16.942, + "eval_steps_per_second": 2.144, + "step": 1131 + }, + { + "epoch": 4.6970954356846475, + "grad_norm": 7.59375, + "learning_rate": 3.029045643153527e-06, + "loss": 0.3691, + "step": 1132 + }, + { + "epoch": 4.6970954356846475, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7133039236068726, + "eval_runtime": 28.4761, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 1132 + }, + { + "epoch": 4.7012448132780085, + "grad_norm": 9.5, + "learning_rate": 2.9875518672199173e-06, + "loss": 0.5547, + "step": 1133 + }, + { + "epoch": 4.7012448132780085, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.4098, + "eval_samples_per_second": 16.966, + "eval_steps_per_second": 2.147, + "step": 1133 + }, + { + "epoch": 4.7053941908713695, + "grad_norm": 5.1875, + "learning_rate": 2.946058091286307e-06, + "loss": 0.5391, + "step": 1134 + }, + { + "epoch": 4.7053941908713695, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7128176689147949, + "eval_runtime": 28.4425, + "eval_samples_per_second": 16.946, + "eval_steps_per_second": 2.145, + "step": 1134 + }, + { + "epoch": 4.70954356846473, + "grad_norm": 5.46875, + "learning_rate": 2.9045643153526973e-06, + "loss": 0.5039, + "step": 1135 + }, + { + "epoch": 4.70954356846473, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7132229208946228, + "eval_runtime": 28.4606, + "eval_samples_per_second": 16.936, + "eval_steps_per_second": 2.143, + "step": 1135 + }, + { + "epoch": 4.713692946058091, + "grad_norm": 20.125, + "learning_rate": 2.863070539419087e-06, + "loss": 1.8047, + "step": 1136 + }, + { + "epoch": 4.713692946058091, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.713822603225708, + "eval_runtime": 28.6308, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 1136 + }, + { + "epoch": 4.717842323651452, + "grad_norm": 6.875, + "learning_rate": 2.8215767634854773e-06, + "loss": 0.8047, + "step": 1137 + }, + { + "epoch": 4.717842323651452, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7136443257331848, + "eval_runtime": 28.6831, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 1137 + }, + { + "epoch": 4.721991701244813, + "grad_norm": 4.96875, + "learning_rate": 2.780082987551867e-06, + "loss": 0.4473, + "step": 1138 + }, + { + "epoch": 4.721991701244813, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7122989892959595, + "eval_runtime": 28.6453, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 1138 + }, + { + "epoch": 4.726141078838174, + "grad_norm": 9.0625, + "learning_rate": 2.7385892116182572e-06, + "loss": 0.7812, + "step": 1139 + }, + { + "epoch": 4.726141078838174, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7130932211875916, + "eval_runtime": 28.679, + "eval_samples_per_second": 16.807, + "eval_steps_per_second": 2.127, + "step": 1139 + }, + { + "epoch": 4.730290456431535, + "grad_norm": 15.0625, + "learning_rate": 2.6970954356846475e-06, + "loss": 0.832, + "step": 1140 + }, + { + "epoch": 4.730290456431535, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7145195603370667, + "eval_runtime": 28.686, + "eval_samples_per_second": 16.803, + "eval_steps_per_second": 2.126, + "step": 1140 + }, + { + "epoch": 4.734439834024896, + "grad_norm": 6.40625, + "learning_rate": 2.6556016597510372e-06, + "loss": 0.7656, + "step": 1141 + }, + { + "epoch": 4.734439834024896, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7136929631233215, + "eval_runtime": 28.5966, + "eval_samples_per_second": 16.855, + "eval_steps_per_second": 2.133, + "step": 1141 + }, + { + "epoch": 4.738589211618257, + "grad_norm": 18.75, + "learning_rate": 2.6141078838174274e-06, + "loss": 0.6562, + "step": 1142 + }, + { + "epoch": 4.738589211618257, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7139198780059814, + "eval_runtime": 28.4883, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.141, + "step": 1142 + }, + { + "epoch": 4.742738589211618, + "grad_norm": 13.625, + "learning_rate": 2.5726141078838172e-06, + "loss": 0.5859, + "step": 1143 + }, + { + "epoch": 4.742738589211618, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7137577533721924, + "eval_runtime": 28.4233, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 2.146, + "step": 1143 + }, + { + "epoch": 4.746887966804979, + "grad_norm": 6.75, + "learning_rate": 2.5311203319502074e-06, + "loss": 0.6094, + "step": 1144 + }, + { + "epoch": 4.746887966804979, + "eval_accuracy": 0.5684647302904564, + "eval_loss": 0.7138874530792236, + "eval_runtime": 28.4143, + "eval_samples_per_second": 16.963, + "eval_steps_per_second": 2.147, + "step": 1144 + }, + { + "epoch": 4.75103734439834, + "grad_norm": 10.1875, + "learning_rate": 2.4896265560165977e-06, + "loss": 0.875, + "step": 1145 + }, + { + "epoch": 4.75103734439834, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7133525609970093, + "eval_runtime": 28.5365, + "eval_samples_per_second": 16.891, + "eval_steps_per_second": 2.138, + "step": 1145 + }, + { + "epoch": 4.755186721991701, + "grad_norm": 9.6875, + "learning_rate": 2.4481327800829874e-06, + "loss": 0.9336, + "step": 1146 + }, + { + "epoch": 4.755186721991701, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7137416005134583, + "eval_runtime": 28.6464, + "eval_samples_per_second": 16.826, + "eval_steps_per_second": 2.129, + "step": 1146 + }, + { + "epoch": 4.759336099585062, + "grad_norm": 12.6875, + "learning_rate": 2.4066390041493776e-06, + "loss": 0.3906, + "step": 1147 + }, + { + "epoch": 4.759336099585062, + "eval_accuracy": 0.5705394190871369, + "eval_loss": 0.7125259041786194, + "eval_runtime": 28.6238, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 1147 + }, + { + "epoch": 4.763485477178423, + "grad_norm": 3.28125, + "learning_rate": 2.365145228215768e-06, + "loss": 0.2012, + "step": 1148 + }, + { + "epoch": 4.763485477178423, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7146006226539612, + "eval_runtime": 28.6821, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.127, + "step": 1148 + }, + { + "epoch": 4.767634854771784, + "grad_norm": 7.1875, + "learning_rate": 2.323651452282158e-06, + "loss": 0.8359, + "step": 1149 + }, + { + "epoch": 4.767634854771784, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7130283713340759, + "eval_runtime": 28.6324, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 1149 + }, + { + "epoch": 4.771784232365145, + "grad_norm": 6.21875, + "learning_rate": 2.282157676348548e-06, + "loss": 0.7539, + "step": 1150 + }, + { + "epoch": 4.771784232365145, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7134498357772827, + "eval_runtime": 28.6242, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 1150 + }, + { + "epoch": 4.775933609958506, + "grad_norm": 4.40625, + "learning_rate": 2.240663900414938e-06, + "loss": 0.3652, + "step": 1151 + }, + { + "epoch": 4.775933609958506, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7140819430351257, + "eval_runtime": 28.5742, + "eval_samples_per_second": 16.868, + "eval_steps_per_second": 2.135, + "step": 1151 + }, + { + "epoch": 4.780082987551867, + "grad_norm": 11.0, + "learning_rate": 2.1991701244813283e-06, + "loss": 0.7344, + "step": 1152 + }, + { + "epoch": 4.780082987551867, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7135956883430481, + "eval_runtime": 28.455, + "eval_samples_per_second": 16.939, + "eval_steps_per_second": 2.144, + "step": 1152 + }, + { + "epoch": 4.784232365145228, + "grad_norm": 9.8125, + "learning_rate": 2.157676348547718e-06, + "loss": 0.7227, + "step": 1153 + }, + { + "epoch": 4.784232365145228, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7134336233139038, + "eval_runtime": 28.626, + "eval_samples_per_second": 16.838, + "eval_steps_per_second": 2.131, + "step": 1153 + }, + { + "epoch": 4.788381742738589, + "grad_norm": 44.25, + "learning_rate": 2.1161825726141083e-06, + "loss": 0.3262, + "step": 1154 + }, + { + "epoch": 4.788381742738589, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.713822603225708, + "eval_runtime": 28.6821, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.127, + "step": 1154 + }, + { + "epoch": 4.79253112033195, + "grad_norm": 4.40625, + "learning_rate": 2.074688796680498e-06, + "loss": 0.2178, + "step": 1155 + }, + { + "epoch": 4.79253112033195, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.712380051612854, + "eval_runtime": 28.6329, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 1155 + }, + { + "epoch": 4.796680497925311, + "grad_norm": 12.8125, + "learning_rate": 2.0331950207468883e-06, + "loss": 0.6016, + "step": 1156 + }, + { + "epoch": 4.796680497925311, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7127852439880371, + "eval_runtime": 28.6323, + "eval_samples_per_second": 16.834, + "eval_steps_per_second": 2.13, + "step": 1156 + }, + { + "epoch": 4.800829875518672, + "grad_norm": 5.21875, + "learning_rate": 1.991701244813278e-06, + "loss": 0.625, + "step": 1157 + }, + { + "epoch": 4.800829875518672, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7137253880500793, + "eval_runtime": 28.6226, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 1157 + }, + { + "epoch": 4.804979253112033, + "grad_norm": 5.40625, + "learning_rate": 1.9502074688796682e-06, + "loss": 0.668, + "step": 1158 + }, + { + "epoch": 4.804979253112033, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7135632634162903, + "eval_runtime": 28.5689, + "eval_samples_per_second": 16.872, + "eval_steps_per_second": 2.135, + "step": 1158 + }, + { + "epoch": 4.809128630705394, + "grad_norm": 19.875, + "learning_rate": 1.9087136929460585e-06, + "loss": 0.8086, + "step": 1159 + }, + { + "epoch": 4.809128630705394, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7132553458213806, + "eval_runtime": 28.5324, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 1159 + }, + { + "epoch": 4.813278008298755, + "grad_norm": 8.6875, + "learning_rate": 1.8672199170124482e-06, + "loss": 0.8906, + "step": 1160 + }, + { + "epoch": 4.813278008298755, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7134660482406616, + "eval_runtime": 28.618, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1160 + }, + { + "epoch": 4.817427385892116, + "grad_norm": 4.84375, + "learning_rate": 1.8257261410788384e-06, + "loss": 0.4766, + "step": 1161 + }, + { + "epoch": 4.817427385892116, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7133687734603882, + "eval_runtime": 28.6429, + "eval_samples_per_second": 16.828, + "eval_steps_per_second": 2.13, + "step": 1161 + }, + { + "epoch": 4.821576763485477, + "grad_norm": 13.6875, + "learning_rate": 1.7842323651452282e-06, + "loss": 0.5586, + "step": 1162 + }, + { + "epoch": 4.821576763485477, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7131580710411072, + "eval_runtime": 28.6374, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 1162 + }, + { + "epoch": 4.825726141078838, + "grad_norm": 5.46875, + "learning_rate": 1.7427385892116184e-06, + "loss": 0.5117, + "step": 1163 + }, + { + "epoch": 4.825726141078838, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7137739658355713, + "eval_runtime": 28.6598, + "eval_samples_per_second": 16.818, + "eval_steps_per_second": 2.128, + "step": 1163 + }, + { + "epoch": 4.829875518672199, + "grad_norm": 6.90625, + "learning_rate": 1.7012448132780082e-06, + "loss": 0.3027, + "step": 1164 + }, + { + "epoch": 4.829875518672199, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7130445837974548, + "eval_runtime": 28.5256, + "eval_samples_per_second": 16.897, + "eval_steps_per_second": 2.138, + "step": 1164 + }, + { + "epoch": 4.83402489626556, + "grad_norm": 9.5625, + "learning_rate": 1.6597510373443984e-06, + "loss": 0.5469, + "step": 1165 + }, + { + "epoch": 4.83402489626556, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7134660482406616, + "eval_runtime": 28.4841, + "eval_samples_per_second": 16.922, + "eval_steps_per_second": 2.142, + "step": 1165 + }, + { + "epoch": 4.838174273858921, + "grad_norm": 21.25, + "learning_rate": 1.6182572614107886e-06, + "loss": 0.9141, + "step": 1166 + }, + { + "epoch": 4.838174273858921, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7139198780059814, + "eval_runtime": 28.4371, + "eval_samples_per_second": 16.95, + "eval_steps_per_second": 2.145, + "step": 1166 + }, + { + "epoch": 4.842323651452282, + "grad_norm": 56.5, + "learning_rate": 1.5767634854771784e-06, + "loss": 0.6445, + "step": 1167 + }, + { + "epoch": 4.842323651452282, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7133687734603882, + "eval_runtime": 28.4752, + "eval_samples_per_second": 16.927, + "eval_steps_per_second": 2.142, + "step": 1167 + }, + { + "epoch": 4.846473029045643, + "grad_norm": 31.375, + "learning_rate": 1.5352697095435684e-06, + "loss": 0.4492, + "step": 1168 + }, + { + "epoch": 4.846473029045643, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7132229208946228, + "eval_runtime": 28.5803, + "eval_samples_per_second": 16.865, + "eval_steps_per_second": 2.134, + "step": 1168 + }, + { + "epoch": 4.850622406639004, + "grad_norm": 15.8125, + "learning_rate": 1.4937759336099586e-06, + "loss": 0.7578, + "step": 1169 + }, + { + "epoch": 4.850622406639004, + "eval_accuracy": 0.58298755186722, + "eval_loss": 0.7128825187683105, + "eval_runtime": 28.6173, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1169 + }, + { + "epoch": 4.854771784232365, + "grad_norm": 8.0625, + "learning_rate": 1.4522821576763486e-06, + "loss": 0.5352, + "step": 1170 + }, + { + "epoch": 4.854771784232365, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.712169349193573, + "eval_runtime": 28.6829, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 1170 + }, + { + "epoch": 4.858921161825726, + "grad_norm": 6.40625, + "learning_rate": 1.4107883817427386e-06, + "loss": 0.4297, + "step": 1171 + }, + { + "epoch": 4.858921161825726, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7136929631233215, + "eval_runtime": 28.5581, + "eval_samples_per_second": 16.878, + "eval_steps_per_second": 2.136, + "step": 1171 + }, + { + "epoch": 4.863070539419088, + "grad_norm": 14.8125, + "learning_rate": 1.3692946058091286e-06, + "loss": 1.1094, + "step": 1172 + }, + { + "epoch": 4.863070539419088, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7124286890029907, + "eval_runtime": 28.6817, + "eval_samples_per_second": 16.805, + "eval_steps_per_second": 2.127, + "step": 1172 + }, + { + "epoch": 4.867219917012449, + "grad_norm": 4.75, + "learning_rate": 1.3278008298755186e-06, + "loss": 0.5977, + "step": 1173 + }, + { + "epoch": 4.867219917012449, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7133687734603882, + "eval_runtime": 28.6373, + "eval_samples_per_second": 16.831, + "eval_steps_per_second": 2.13, + "step": 1173 + }, + { + "epoch": 4.87136929460581, + "grad_norm": 9.5, + "learning_rate": 1.2863070539419086e-06, + "loss": 0.8203, + "step": 1174 + }, + { + "epoch": 4.87136929460581, + "eval_accuracy": 0.5809128630705395, + "eval_loss": 0.7127528786659241, + "eval_runtime": 28.6178, + "eval_samples_per_second": 16.843, + "eval_steps_per_second": 2.132, + "step": 1174 + }, + { + "epoch": 4.875518672199171, + "grad_norm": 16.75, + "learning_rate": 1.2448132780082988e-06, + "loss": 0.5352, + "step": 1175 + }, + { + "epoch": 4.875518672199171, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7132067084312439, + "eval_runtime": 28.6391, + "eval_samples_per_second": 16.83, + "eval_steps_per_second": 2.13, + "step": 1175 + }, + { + "epoch": 4.8796680497925315, + "grad_norm": 5.75, + "learning_rate": 1.2033195020746888e-06, + "loss": 0.4941, + "step": 1176 + }, + { + "epoch": 4.8796680497925315, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7120720744132996, + "eval_runtime": 28.532, + "eval_samples_per_second": 16.893, + "eval_steps_per_second": 2.138, + "step": 1176 + }, + { + "epoch": 4.8838174273858925, + "grad_norm": 5.40625, + "learning_rate": 1.161825726141079e-06, + "loss": 0.5156, + "step": 1177 + }, + { + "epoch": 4.8838174273858925, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7135632634162903, + "eval_runtime": 28.489, + "eval_samples_per_second": 16.919, + "eval_steps_per_second": 2.141, + "step": 1177 + }, + { + "epoch": 4.8879668049792535, + "grad_norm": 9.9375, + "learning_rate": 1.120331950207469e-06, + "loss": 1.0938, + "step": 1178 + }, + { + "epoch": 4.8879668049792535, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7143899202346802, + "eval_runtime": 28.4622, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.143, + "step": 1178 + }, + { + "epoch": 4.8921161825726145, + "grad_norm": 6.6875, + "learning_rate": 1.078838174273859e-06, + "loss": 0.6875, + "step": 1179 + }, + { + "epoch": 4.8921161825726145, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7137577533721924, + "eval_runtime": 28.4475, + "eval_samples_per_second": 16.944, + "eval_steps_per_second": 2.144, + "step": 1179 + }, + { + "epoch": 4.8962655601659755, + "grad_norm": 11.5, + "learning_rate": 1.037344398340249e-06, + "loss": 0.8438, + "step": 1180 + }, + { + "epoch": 4.8962655601659755, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7130770087242126, + "eval_runtime": 28.4288, + "eval_samples_per_second": 16.955, + "eval_steps_per_second": 2.146, + "step": 1180 + }, + { + "epoch": 4.9004149377593365, + "grad_norm": 9.1875, + "learning_rate": 9.95850622406639e-07, + "loss": 0.7656, + "step": 1181 + }, + { + "epoch": 4.9004149377593365, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.713822603225708, + "eval_runtime": 28.4242, + "eval_samples_per_second": 16.957, + "eval_steps_per_second": 2.146, + "step": 1181 + }, + { + "epoch": 4.904564315352697, + "grad_norm": 10.8125, + "learning_rate": 9.543568464730292e-07, + "loss": 0.4531, + "step": 1182 + }, + { + "epoch": 4.904564315352697, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7133525609970093, + "eval_runtime": 28.4739, + "eval_samples_per_second": 16.928, + "eval_steps_per_second": 2.142, + "step": 1182 + }, + { + "epoch": 4.908713692946058, + "grad_norm": 5.0625, + "learning_rate": 9.128630705394192e-07, + "loss": 0.3203, + "step": 1183 + }, + { + "epoch": 4.908713692946058, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7131256461143494, + "eval_runtime": 28.4177, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.147, + "step": 1183 + }, + { + "epoch": 4.912863070539419, + "grad_norm": 9.6875, + "learning_rate": 8.713692946058092e-07, + "loss": 0.7031, + "step": 1184 + }, + { + "epoch": 4.912863070539419, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7128500938415527, + "eval_runtime": 28.4137, + "eval_samples_per_second": 16.964, + "eval_steps_per_second": 2.147, + "step": 1184 + }, + { + "epoch": 4.91701244813278, + "grad_norm": 14.625, + "learning_rate": 8.298755186721992e-07, + "loss": 0.6406, + "step": 1185 + }, + { + "epoch": 4.91701244813278, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7137577533721924, + "eval_runtime": 28.465, + "eval_samples_per_second": 16.933, + "eval_steps_per_second": 2.143, + "step": 1185 + }, + { + "epoch": 4.921161825726141, + "grad_norm": 8.3125, + "learning_rate": 7.883817427385892e-07, + "loss": 0.6172, + "step": 1186 + }, + { + "epoch": 4.921161825726141, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7131094336509705, + "eval_runtime": 28.4081, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.147, + "step": 1186 + }, + { + "epoch": 4.925311203319502, + "grad_norm": 11.5, + "learning_rate": 7.468879668049793e-07, + "loss": 0.5117, + "step": 1187 + }, + { + "epoch": 4.925311203319502, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.712979793548584, + "eval_runtime": 28.4587, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 1187 + }, + { + "epoch": 4.929460580912863, + "grad_norm": 8.125, + "learning_rate": 7.053941908713693e-07, + "loss": 0.4785, + "step": 1188 + }, + { + "epoch": 4.929460580912863, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7132067084312439, + "eval_runtime": 28.4161, + "eval_samples_per_second": 16.962, + "eval_steps_per_second": 2.147, + "step": 1188 + }, + { + "epoch": 4.933609958506224, + "grad_norm": 6.90625, + "learning_rate": 6.639004149377593e-07, + "loss": 0.3691, + "step": 1189 + }, + { + "epoch": 4.933609958506224, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7138712406158447, + "eval_runtime": 28.4589, + "eval_samples_per_second": 16.937, + "eval_steps_per_second": 2.143, + "step": 1189 + }, + { + "epoch": 4.937759336099585, + "grad_norm": 13.375, + "learning_rate": 6.224066390041494e-07, + "loss": 1.1016, + "step": 1190 + }, + { + "epoch": 4.937759336099585, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7131580710411072, + "eval_runtime": 28.463, + "eval_samples_per_second": 16.934, + "eval_steps_per_second": 2.143, + "step": 1190 + }, + { + "epoch": 4.941908713692946, + "grad_norm": 13.25, + "learning_rate": 5.809128630705395e-07, + "loss": 0.5312, + "step": 1191 + }, + { + "epoch": 4.941908713692946, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7127528786659241, + "eval_runtime": 28.4074, + "eval_samples_per_second": 16.967, + "eval_steps_per_second": 2.147, + "step": 1191 + }, + { + "epoch": 4.946058091286307, + "grad_norm": 5.71875, + "learning_rate": 5.394190871369295e-07, + "loss": 0.6836, + "step": 1192 + }, + { + "epoch": 4.946058091286307, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7133201360702515, + "eval_runtime": 28.4182, + "eval_samples_per_second": 16.961, + "eval_steps_per_second": 2.147, + "step": 1192 + }, + { + "epoch": 4.950207468879668, + "grad_norm": 3.28125, + "learning_rate": 4.979253112033195e-07, + "loss": 0.3047, + "step": 1193 + }, + { + "epoch": 4.950207468879668, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7134174108505249, + "eval_runtime": 28.4612, + "eval_samples_per_second": 16.935, + "eval_steps_per_second": 2.143, + "step": 1193 + }, + { + "epoch": 4.954356846473029, + "grad_norm": 10.0, + "learning_rate": 4.564315352697096e-07, + "loss": 0.8203, + "step": 1194 + }, + { + "epoch": 4.954356846473029, + "eval_accuracy": 0.578838174273859, + "eval_loss": 0.7134660482406616, + "eval_runtime": 28.507, + "eval_samples_per_second": 16.908, + "eval_steps_per_second": 2.14, + "step": 1194 + }, + { + "epoch": 4.95850622406639, + "grad_norm": 27.5, + "learning_rate": 4.149377593360996e-07, + "loss": 0.7422, + "step": 1195 + }, + { + "epoch": 4.95850622406639, + "eval_accuracy": 0.5726141078838174, + "eval_loss": 0.7130283713340759, + "eval_runtime": 28.5811, + "eval_samples_per_second": 16.864, + "eval_steps_per_second": 2.134, + "step": 1195 + }, + { + "epoch": 4.962655601659751, + "grad_norm": 24.75, + "learning_rate": 3.7344398340248966e-07, + "loss": 1.1719, + "step": 1196 + }, + { + "epoch": 4.962655601659751, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7129960060119629, + "eval_runtime": 28.6244, + "eval_samples_per_second": 16.839, + "eval_steps_per_second": 2.131, + "step": 1196 + }, + { + "epoch": 4.966804979253112, + "grad_norm": 8.3125, + "learning_rate": 3.3195020746887966e-07, + "loss": 0.5898, + "step": 1197 + }, + { + "epoch": 4.966804979253112, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7126718163490295, + "eval_runtime": 28.6829, + "eval_samples_per_second": 16.804, + "eval_steps_per_second": 2.127, + "step": 1197 + }, + { + "epoch": 4.970954356846473, + "grad_norm": 15.9375, + "learning_rate": 2.9045643153526976e-07, + "loss": 0.2471, + "step": 1198 + }, + { + "epoch": 4.970954356846473, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7127042412757874, + "eval_runtime": 28.6304, + "eval_samples_per_second": 16.835, + "eval_steps_per_second": 2.131, + "step": 1198 + }, + { + "epoch": 4.975103734439834, + "grad_norm": 6.65625, + "learning_rate": 2.4896265560165975e-07, + "loss": 0.4102, + "step": 1199 + }, + { + "epoch": 4.975103734439834, + "eval_accuracy": 0.5767634854771784, + "eval_loss": 0.7126231789588928, + "eval_runtime": 28.6217, + "eval_samples_per_second": 16.84, + "eval_steps_per_second": 2.131, + "step": 1199 + }, + { + "epoch": 4.979253112033195, + "grad_norm": 6.46875, + "learning_rate": 2.074688796680498e-07, + "loss": 0.8477, + "step": 1200 + }, + { + "epoch": 4.979253112033195, + "eval_accuracy": 0.5746887966804979, + "eval_loss": 0.7114886045455933, + "eval_runtime": 28.5429, + "eval_samples_per_second": 16.887, + "eval_steps_per_second": 2.137, + "step": 1200 + } + ], + "logging_steps": 1, + "max_steps": 1205, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}