| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 16.515276630883566, |
| "eval_steps": 2000, |
| "global_step": 120000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0, |
| "eval_accuracy": 1.2790688050781904e-05, |
| "eval_loss": 115.5, |
| "eval_runtime": 200.9168, |
| "eval_samples_per_second": 8229.121, |
| "eval_steps_per_second": 16.076, |
| "step": 0 |
| }, |
| { |
| "epoch": 0.13762730525736305, |
| "grad_norm": 26.74563980102539, |
| "learning_rate": 2.9639999999999997e-05, |
| "loss": 141.26440625, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2752546105147261, |
| "grad_norm": 22.885421752929688, |
| "learning_rate": 5.964e-05, |
| "loss": 59.12350390625, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2752546105147261, |
| "eval_accuracy": 0.676248915563685, |
| "eval_loss": 12.453125, |
| "eval_runtime": 168.1994, |
| "eval_samples_per_second": 9829.817, |
| "eval_steps_per_second": 19.203, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.41288191577208916, |
| "grad_norm": 21.04855728149414, |
| "learning_rate": 8.957999999999998e-05, |
| "loss": 50.17753125, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5505092210294522, |
| "grad_norm": 20.75350570678711, |
| "learning_rate": 0.00011957999999999999, |
| "loss": 46.33467578125, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.5505092210294522, |
| "eval_accuracy": 0.7075093226563655, |
| "eval_loss": 10.6796875, |
| "eval_runtime": 167.9148, |
| "eval_samples_per_second": 9846.476, |
| "eval_steps_per_second": 19.236, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6881365262868153, |
| "grad_norm": 19.514423370361328, |
| "learning_rate": 0.00014948999999999998, |
| "loss": 44.1445078125, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.8257638315441783, |
| "grad_norm": 18.586366653442383, |
| "learning_rate": 0.00017949, |
| "loss": 42.90046875, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.8257638315441783, |
| "eval_accuracy": 0.7171687154928088, |
| "eval_loss": 10.140625, |
| "eval_runtime": 169.3136, |
| "eval_samples_per_second": 9765.126, |
| "eval_steps_per_second": 19.077, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.9633911368015414, |
| "grad_norm": 17.66230010986328, |
| "learning_rate": 0.00020946, |
| "loss": 41.96175390625, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.1010184420589044, |
| "grad_norm": 19.431285858154297, |
| "learning_rate": 0.0002394, |
| "loss": 41.42766015625, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.1010184420589044, |
| "eval_accuracy": 0.7211219507641113, |
| "eval_loss": 9.921875, |
| "eval_runtime": 170.601, |
| "eval_samples_per_second": 9691.437, |
| "eval_steps_per_second": 18.933, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.2386457473162675, |
| "grad_norm": 19.449247360229492, |
| "learning_rate": 0.00026933999999999997, |
| "loss": 41.1454921875, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.3762730525736306, |
| "grad_norm": 21.419654846191406, |
| "learning_rate": 0.00029934, |
| "loss": 40.98086328125, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.3762730525736306, |
| "eval_accuracy": 0.7225334877311377, |
| "eval_loss": 9.8359375, |
| "eval_runtime": 167.923, |
| "eval_samples_per_second": 9845.998, |
| "eval_steps_per_second": 19.235, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.5139003578309937, |
| "grad_norm": 17.391115188598633, |
| "learning_rate": 0.0003, |
| "loss": 40.79280078125, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.6515276630883569, |
| "grad_norm": 29.895591735839844, |
| "learning_rate": 0.0003, |
| "loss": 40.4991640625, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.6515276630883569, |
| "eval_accuracy": 0.7232843609541934, |
| "eval_loss": 9.78125, |
| "eval_runtime": 170.7512, |
| "eval_samples_per_second": 9682.91, |
| "eval_steps_per_second": 18.916, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.7891549683457197, |
| "grad_norm": 16.52672576904297, |
| "learning_rate": 0.0003, |
| "loss": 40.5724921875, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.9267822736030829, |
| "grad_norm": 21.801998138427734, |
| "learning_rate": 0.0003, |
| "loss": 40.29026953125, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.9267822736030829, |
| "eval_accuracy": 0.7247579731972368, |
| "eval_loss": 9.7109375, |
| "eval_runtime": 169.376, |
| "eval_samples_per_second": 9761.53, |
| "eval_steps_per_second": 19.07, |
| "step": 14000 |
| }, |
| { |
| "epoch": 2.0644095788604457, |
| "grad_norm": 16.79710578918457, |
| "learning_rate": 0.0003, |
| "loss": 40.192109375, |
| "step": 15000 |
| }, |
| { |
| "epoch": 2.202036884117809, |
| "grad_norm": 19.166486740112305, |
| "learning_rate": 0.0003, |
| "loss": 39.6695, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.202036884117809, |
| "eval_accuracy": 0.726963905479327, |
| "eval_loss": 9.6171875, |
| "eval_runtime": 170.9384, |
| "eval_samples_per_second": 9672.308, |
| "eval_steps_per_second": 18.896, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.339664189375172, |
| "grad_norm": 15.058335304260254, |
| "learning_rate": 0.0003, |
| "loss": 39.4829609375, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.477291494632535, |
| "grad_norm": 15.027190208435059, |
| "learning_rate": 0.0003, |
| "loss": 39.3190859375, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.477291494632535, |
| "eval_accuracy": 0.7287604009112041, |
| "eval_loss": 9.546875, |
| "eval_runtime": 170.8414, |
| "eval_samples_per_second": 9677.8, |
| "eval_steps_per_second": 18.906, |
| "step": 18000 |
| }, |
| { |
| "epoch": 2.614918799889898, |
| "grad_norm": 17.88246726989746, |
| "learning_rate": 0.0003, |
| "loss": 39.1439296875, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.7525461051472613, |
| "grad_norm": 63.608863830566406, |
| "learning_rate": 0.0003, |
| "loss": 39.0614765625, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.7525461051472613, |
| "eval_accuracy": 0.7304020938141926, |
| "eval_loss": 9.46875, |
| "eval_runtime": 169.4479, |
| "eval_samples_per_second": 9757.391, |
| "eval_steps_per_second": 19.062, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.8901734104046244, |
| "grad_norm": 16.970745086669922, |
| "learning_rate": 0.0003, |
| "loss": 38.933671875, |
| "step": 21000 |
| }, |
| { |
| "epoch": 3.0278007156619875, |
| "grad_norm": 19.159212112426758, |
| "learning_rate": 0.0003, |
| "loss": 39.2828828125, |
| "step": 22000 |
| }, |
| { |
| "epoch": 3.0278007156619875, |
| "eval_accuracy": 0.7285464859640399, |
| "eval_loss": 9.5390625, |
| "eval_runtime": 168.2153, |
| "eval_samples_per_second": 9828.886, |
| "eval_steps_per_second": 19.202, |
| "step": 22000 |
| }, |
| { |
| "epoch": 3.1654280209193506, |
| "grad_norm": 28.30343246459961, |
| "learning_rate": 0.0003, |
| "loss": 39.1001796875, |
| "step": 23000 |
| }, |
| { |
| "epoch": 3.3030553261767133, |
| "grad_norm": 14.642151832580566, |
| "learning_rate": 0.0003, |
| "loss": 38.792859375, |
| "step": 24000 |
| }, |
| { |
| "epoch": 3.3030553261767133, |
| "eval_accuracy": 0.7310024378583697, |
| "eval_loss": 9.421875, |
| "eval_runtime": 168.799, |
| "eval_samples_per_second": 9794.899, |
| "eval_steps_per_second": 19.135, |
| "step": 24000 |
| }, |
| { |
| "epoch": 3.4406826314340764, |
| "grad_norm": 32.9803352355957, |
| "learning_rate": 0.0003, |
| "loss": 38.72437890625, |
| "step": 25000 |
| }, |
| { |
| "epoch": 3.5783099366914395, |
| "grad_norm": 15.326594352722168, |
| "learning_rate": 0.0003, |
| "loss": 38.7340703125, |
| "step": 26000 |
| }, |
| { |
| "epoch": 3.5783099366914395, |
| "eval_accuracy": 0.7314286407398777, |
| "eval_loss": 9.390625, |
| "eval_runtime": 168.4702, |
| "eval_samples_per_second": 9814.015, |
| "eval_steps_per_second": 19.173, |
| "step": 26000 |
| }, |
| { |
| "epoch": 3.7159372419488026, |
| "grad_norm": 17.456626892089844, |
| "learning_rate": 0.0003, |
| "loss": 38.50905078125, |
| "step": 27000 |
| }, |
| { |
| "epoch": 3.8535645472061657, |
| "grad_norm": 19.74641990661621, |
| "learning_rate": 0.0003, |
| "loss": 38.4493125, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.8535645472061657, |
| "eval_accuracy": 0.7326063700018453, |
| "eval_loss": 9.3359375, |
| "eval_runtime": 167.8973, |
| "eval_samples_per_second": 9847.504, |
| "eval_steps_per_second": 19.238, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.991191852463529, |
| "grad_norm": 16.116609573364258, |
| "learning_rate": 0.0003, |
| "loss": 38.4295859375, |
| "step": 29000 |
| }, |
| { |
| "epoch": 4.1288191577208915, |
| "grad_norm": 14.791277885437012, |
| "learning_rate": 0.0003, |
| "loss": 38.2604921875, |
| "step": 30000 |
| }, |
| { |
| "epoch": 4.1288191577208915, |
| "eval_accuracy": 0.73262638229311, |
| "eval_loss": 9.34375, |
| "eval_runtime": 169.3505, |
| "eval_samples_per_second": 9763.003, |
| "eval_steps_per_second": 19.073, |
| "step": 30000 |
| }, |
| { |
| "epoch": 4.266446462978255, |
| "grad_norm": 14.701274871826172, |
| "learning_rate": 0.0003, |
| "loss": 38.178046875, |
| "step": 31000 |
| }, |
| { |
| "epoch": 4.404073768235618, |
| "grad_norm": 32.28364181518555, |
| "learning_rate": 0.0003, |
| "loss": 38.118671875, |
| "step": 32000 |
| }, |
| { |
| "epoch": 4.404073768235618, |
| "eval_accuracy": 0.7330061560700385, |
| "eval_loss": 9.328125, |
| "eval_runtime": 169.1911, |
| "eval_samples_per_second": 9772.196, |
| "eval_steps_per_second": 19.091, |
| "step": 32000 |
| }, |
| { |
| "epoch": 4.541701073492981, |
| "grad_norm": 14.65267562866211, |
| "learning_rate": 0.0003, |
| "loss": 38.04671484375, |
| "step": 33000 |
| }, |
| { |
| "epoch": 4.679328378750344, |
| "grad_norm": 15.814043045043945, |
| "learning_rate": 0.0003, |
| "loss": 38.03919140625, |
| "step": 34000 |
| }, |
| { |
| "epoch": 4.679328378750344, |
| "eval_accuracy": 0.7350299683164979, |
| "eval_loss": 9.234375, |
| "eval_runtime": 170.3006, |
| "eval_samples_per_second": 9708.532, |
| "eval_steps_per_second": 18.966, |
| "step": 34000 |
| }, |
| { |
| "epoch": 4.8169556840077075, |
| "grad_norm": 22.082040786743164, |
| "learning_rate": 0.0003, |
| "loss": 38.00453125, |
| "step": 35000 |
| }, |
| { |
| "epoch": 4.95458298926507, |
| "grad_norm": 20.287931442260742, |
| "learning_rate": 0.0003, |
| "loss": 37.977375, |
| "step": 36000 |
| }, |
| { |
| "epoch": 4.95458298926507, |
| "eval_accuracy": 0.7339120258458323, |
| "eval_loss": 9.28125, |
| "eval_runtime": 171.801, |
| "eval_samples_per_second": 9623.744, |
| "eval_steps_per_second": 18.801, |
| "step": 36000 |
| }, |
| { |
| "epoch": 5.092210294522434, |
| "grad_norm": 30.58030128479004, |
| "learning_rate": 0.0003, |
| "loss": 37.9320703125, |
| "step": 37000 |
| }, |
| { |
| "epoch": 5.229837599779796, |
| "grad_norm": 19.570669174194336, |
| "learning_rate": 0.0003, |
| "loss": 37.9665234375, |
| "step": 38000 |
| }, |
| { |
| "epoch": 5.229837599779796, |
| "eval_accuracy": 0.7340658881695064, |
| "eval_loss": 9.28125, |
| "eval_runtime": 167.7847, |
| "eval_samples_per_second": 9854.11, |
| "eval_steps_per_second": 19.251, |
| "step": 38000 |
| }, |
| { |
| "epoch": 5.367464905037159, |
| "grad_norm": 17.29003143310547, |
| "learning_rate": 0.0003, |
| "loss": 37.9463984375, |
| "step": 39000 |
| }, |
| { |
| "epoch": 5.505092210294523, |
| "grad_norm": 16.568086624145508, |
| "learning_rate": 0.0003, |
| "loss": 37.7375625, |
| "step": 40000 |
| }, |
| { |
| "epoch": 5.505092210294523, |
| "eval_accuracy": 0.735108947509515, |
| "eval_loss": 9.2265625, |
| "eval_runtime": 170.3335, |
| "eval_samples_per_second": 9706.659, |
| "eval_steps_per_second": 18.963, |
| "step": 40000 |
| }, |
| { |
| "epoch": 5.642719515551885, |
| "grad_norm": 110.49578094482422, |
| "learning_rate": 0.0003, |
| "loss": 37.6945, |
| "step": 41000 |
| }, |
| { |
| "epoch": 5.780346820809249, |
| "grad_norm": 24.624027252197266, |
| "learning_rate": 0.0003, |
| "loss": 37.72793359375, |
| "step": 42000 |
| }, |
| { |
| "epoch": 5.780346820809249, |
| "eval_accuracy": 0.7357611562920721, |
| "eval_loss": 9.203125, |
| "eval_runtime": 168.0213, |
| "eval_samples_per_second": 9840.236, |
| "eval_steps_per_second": 19.224, |
| "step": 42000 |
| }, |
| { |
| "epoch": 5.917974126066611, |
| "grad_norm": 17.721742630004883, |
| "learning_rate": 0.0003, |
| "loss": 37.635171875, |
| "step": 43000 |
| }, |
| { |
| "epoch": 6.055601431323975, |
| "grad_norm": 14.108975410461426, |
| "learning_rate": 0.0003, |
| "loss": 37.572765625, |
| "step": 44000 |
| }, |
| { |
| "epoch": 6.055601431323975, |
| "eval_accuracy": 0.7358946142236276, |
| "eval_loss": 9.1953125, |
| "eval_runtime": 168.7352, |
| "eval_samples_per_second": 9798.601, |
| "eval_steps_per_second": 19.142, |
| "step": 44000 |
| }, |
| { |
| "epoch": 6.193228736581338, |
| "grad_norm": 22.010786056518555, |
| "learning_rate": 0.0003, |
| "loss": 37.4784453125, |
| "step": 45000 |
| }, |
| { |
| "epoch": 6.330856041838701, |
| "grad_norm": 28.03326988220215, |
| "learning_rate": 0.0003, |
| "loss": 37.5036328125, |
| "step": 46000 |
| }, |
| { |
| "epoch": 6.330856041838701, |
| "eval_accuracy": 0.7368965623776919, |
| "eval_loss": 9.171875, |
| "eval_runtime": 166.8883, |
| "eval_samples_per_second": 9907.04, |
| "eval_steps_per_second": 19.354, |
| "step": 46000 |
| }, |
| { |
| "epoch": 6.468483347096064, |
| "grad_norm": 17.36493492126465, |
| "learning_rate": 0.0003, |
| "loss": 37.4494453125, |
| "step": 47000 |
| }, |
| { |
| "epoch": 6.6061106523534265, |
| "grad_norm": 19.51688003540039, |
| "learning_rate": 0.0003, |
| "loss": 37.435234375, |
| "step": 48000 |
| }, |
| { |
| "epoch": 6.6061106523534265, |
| "eval_accuracy": 0.7361287527698055, |
| "eval_loss": 9.1796875, |
| "eval_runtime": 169.5516, |
| "eval_samples_per_second": 9751.421, |
| "eval_steps_per_second": 19.05, |
| "step": 48000 |
| }, |
| { |
| "epoch": 6.74373795761079, |
| "grad_norm": 15.679317474365234, |
| "learning_rate": 0.0003, |
| "loss": 37.3981484375, |
| "step": 49000 |
| }, |
| { |
| "epoch": 6.881365262868153, |
| "grad_norm": 42.613555908203125, |
| "learning_rate": 0.0003, |
| "loss": 37.51515625, |
| "step": 50000 |
| }, |
| { |
| "epoch": 6.881365262868153, |
| "eval_accuracy": 0.7378156733015306, |
| "eval_loss": 9.125, |
| "eval_runtime": 170.8004, |
| "eval_samples_per_second": 9680.126, |
| "eval_steps_per_second": 18.911, |
| "step": 50000 |
| }, |
| { |
| "epoch": 7.018992568125516, |
| "grad_norm": 15.565272331237793, |
| "learning_rate": 0.0003, |
| "loss": 37.4844375, |
| "step": 51000 |
| }, |
| { |
| "epoch": 7.156619873382879, |
| "grad_norm": 15.575469017028809, |
| "learning_rate": 0.0003, |
| "loss": 37.2694375, |
| "step": 52000 |
| }, |
| { |
| "epoch": 7.156619873382879, |
| "eval_accuracy": 0.7383814207101249, |
| "eval_loss": 9.09375, |
| "eval_runtime": 167.1627, |
| "eval_samples_per_second": 9890.776, |
| "eval_steps_per_second": 19.322, |
| "step": 52000 |
| }, |
| { |
| "epoch": 7.2942471786402425, |
| "grad_norm": 15.146500587463379, |
| "learning_rate": 0.0003, |
| "loss": 37.243421875, |
| "step": 53000 |
| }, |
| { |
| "epoch": 7.431874483897605, |
| "grad_norm": 17.28059959411621, |
| "learning_rate": 0.0003, |
| "loss": 37.23215625, |
| "step": 54000 |
| }, |
| { |
| "epoch": 7.431874483897605, |
| "eval_accuracy": 0.7379147365824262, |
| "eval_loss": 9.1015625, |
| "eval_runtime": 169.7822, |
| "eval_samples_per_second": 9738.179, |
| "eval_steps_per_second": 19.024, |
| "step": 54000 |
| }, |
| { |
| "epoch": 7.569501789154968, |
| "grad_norm": 29.069555282592773, |
| "learning_rate": 0.0003, |
| "loss": 37.22365625, |
| "step": 55000 |
| }, |
| { |
| "epoch": 7.707129094412331, |
| "grad_norm": 26.9329833984375, |
| "learning_rate": 0.0003, |
| "loss": 37.22718359375, |
| "step": 56000 |
| }, |
| { |
| "epoch": 7.707129094412331, |
| "eval_accuracy": 0.7364887765974263, |
| "eval_loss": 9.171875, |
| "eval_runtime": 170.0511, |
| "eval_samples_per_second": 9722.778, |
| "eval_steps_per_second": 18.994, |
| "step": 56000 |
| }, |
| { |
| "epoch": 7.844756399669695, |
| "grad_norm": 97.25556182861328, |
| "learning_rate": 0.0003, |
| "loss": 40.3630625, |
| "step": 57000 |
| }, |
| { |
| "epoch": 7.982383704927058, |
| "grad_norm": 19.714794158935547, |
| "learning_rate": 0.0003, |
| "loss": 37.60580859375, |
| "step": 58000 |
| }, |
| { |
| "epoch": 7.982383704927058, |
| "eval_accuracy": 0.7352168040424968, |
| "eval_loss": 9.234375, |
| "eval_runtime": 169.0419, |
| "eval_samples_per_second": 9780.822, |
| "eval_steps_per_second": 19.108, |
| "step": 58000 |
| }, |
| { |
| "epoch": 8.12001101018442, |
| "grad_norm": 14.792304039001465, |
| "learning_rate": 0.0003, |
| "loss": 37.54733984375, |
| "step": 59000 |
| }, |
| { |
| "epoch": 8.257638315441783, |
| "grad_norm": 15.469705581665039, |
| "learning_rate": 0.0003, |
| "loss": 37.5459453125, |
| "step": 60000 |
| }, |
| { |
| "epoch": 8.257638315441783, |
| "eval_accuracy": 0.7371664310551599, |
| "eval_loss": 9.1328125, |
| "eval_runtime": 166.3796, |
| "eval_samples_per_second": 9937.33, |
| "eval_steps_per_second": 19.413, |
| "step": 60000 |
| }, |
| { |
| "epoch": 8.395265620699147, |
| "grad_norm": 15.161516189575195, |
| "learning_rate": 0.0003, |
| "loss": 37.47540625, |
| "step": 61000 |
| }, |
| { |
| "epoch": 8.53289292595651, |
| "grad_norm": 14.802396774291992, |
| "learning_rate": 0.0003, |
| "loss": 37.30961328125, |
| "step": 62000 |
| }, |
| { |
| "epoch": 8.53289292595651, |
| "eval_accuracy": 0.7381438951141913, |
| "eval_loss": 9.09375, |
| "eval_runtime": 168.1235, |
| "eval_samples_per_second": 9834.251, |
| "eval_steps_per_second": 19.212, |
| "step": 62000 |
| }, |
| { |
| "epoch": 8.670520231213873, |
| "grad_norm": 20.38652229309082, |
| "learning_rate": 0.0003, |
| "loss": 37.20555859375, |
| "step": 63000 |
| }, |
| { |
| "epoch": 8.808147536471235, |
| "grad_norm": 15.676960945129395, |
| "learning_rate": 0.0003, |
| "loss": 37.14574609375, |
| "step": 64000 |
| }, |
| { |
| "epoch": 8.808147536471235, |
| "eval_accuracy": 0.7382565867319608, |
| "eval_loss": 9.1015625, |
| "eval_runtime": 171.8616, |
| "eval_samples_per_second": 9620.35, |
| "eval_steps_per_second": 18.794, |
| "step": 64000 |
| }, |
| { |
| "epoch": 8.9457748417286, |
| "grad_norm": 16.668174743652344, |
| "learning_rate": 0.0003, |
| "loss": 37.125671875, |
| "step": 65000 |
| }, |
| { |
| "epoch": 9.083402146985962, |
| "grad_norm": 16.503215789794922, |
| "learning_rate": 0.0003, |
| "loss": 37.01801171875, |
| "step": 66000 |
| }, |
| { |
| "epoch": 9.083402146985962, |
| "eval_accuracy": 0.7393124677323514, |
| "eval_loss": 9.046875, |
| "eval_runtime": 168.277, |
| "eval_samples_per_second": 9825.282, |
| "eval_steps_per_second": 19.195, |
| "step": 66000 |
| }, |
| { |
| "epoch": 9.221029452243325, |
| "grad_norm": 19.62793731689453, |
| "learning_rate": 0.0003, |
| "loss": 36.9910703125, |
| "step": 67000 |
| }, |
| { |
| "epoch": 9.358656757500688, |
| "grad_norm": 18.604116439819336, |
| "learning_rate": 0.0003, |
| "loss": 36.9631328125, |
| "step": 68000 |
| }, |
| { |
| "epoch": 9.358656757500688, |
| "eval_accuracy": 0.7395416473430355, |
| "eval_loss": 9.03125, |
| "eval_runtime": 166.5578, |
| "eval_samples_per_second": 9926.697, |
| "eval_steps_per_second": 19.393, |
| "step": 68000 |
| }, |
| { |
| "epoch": 9.49628406275805, |
| "grad_norm": 20.98838233947754, |
| "learning_rate": 0.0003, |
| "loss": 36.963703125, |
| "step": 69000 |
| }, |
| { |
| "epoch": 9.633911368015415, |
| "grad_norm": 17.784839630126953, |
| "learning_rate": 0.0003, |
| "loss": 36.96887109375, |
| "step": 70000 |
| }, |
| { |
| "epoch": 9.633911368015415, |
| "eval_accuracy": 0.7390765595619299, |
| "eval_loss": 9.0625, |
| "eval_runtime": 166.5724, |
| "eval_samples_per_second": 9925.827, |
| "eval_steps_per_second": 19.391, |
| "step": 70000 |
| }, |
| { |
| "epoch": 9.771538673272778, |
| "grad_norm": 27.863100051879883, |
| "learning_rate": 0.0003, |
| "loss": 37.0048671875, |
| "step": 71000 |
| }, |
| { |
| "epoch": 9.90916597853014, |
| "grad_norm": 16.188459396362305, |
| "learning_rate": 0.0003, |
| "loss": 36.9179296875, |
| "step": 72000 |
| }, |
| { |
| "epoch": 9.90916597853014, |
| "eval_accuracy": 0.7394581492526459, |
| "eval_loss": 9.03125, |
| "eval_runtime": 170.082, |
| "eval_samples_per_second": 9721.013, |
| "eval_steps_per_second": 18.991, |
| "step": 72000 |
| }, |
| { |
| "epoch": 10.046793283787503, |
| "grad_norm": 14.628756523132324, |
| "learning_rate": 0.0003, |
| "loss": 36.888484375, |
| "step": 73000 |
| }, |
| { |
| "epoch": 10.184420589044867, |
| "grad_norm": 14.934839248657227, |
| "learning_rate": 0.0003, |
| "loss": 36.7855625, |
| "step": 74000 |
| }, |
| { |
| "epoch": 10.184420589044867, |
| "eval_accuracy": 0.7404586215139355, |
| "eval_loss": 8.9921875, |
| "eval_runtime": 168.2373, |
| "eval_samples_per_second": 9827.6, |
| "eval_steps_per_second": 19.199, |
| "step": 74000 |
| }, |
| { |
| "epoch": 10.32204789430223, |
| "grad_norm": 15.220124244689941, |
| "learning_rate": 0.0003, |
| "loss": 36.9548515625, |
| "step": 75000 |
| }, |
| { |
| "epoch": 10.459675199559593, |
| "grad_norm": 15.933542251586914, |
| "learning_rate": 0.0003, |
| "loss": 36.85228125, |
| "step": 76000 |
| }, |
| { |
| "epoch": 10.459675199559593, |
| "eval_accuracy": 0.7398413045299678, |
| "eval_loss": 9.0234375, |
| "eval_runtime": 169.5447, |
| "eval_samples_per_second": 9751.82, |
| "eval_steps_per_second": 19.051, |
| "step": 76000 |
| }, |
| { |
| "epoch": 10.597302504816955, |
| "grad_norm": 15.21678638458252, |
| "learning_rate": 0.0003, |
| "loss": 36.8666171875, |
| "step": 77000 |
| }, |
| { |
| "epoch": 10.734929810074318, |
| "grad_norm": 16.906696319580078, |
| "learning_rate": 0.0003, |
| "loss": 36.78946875, |
| "step": 78000 |
| }, |
| { |
| "epoch": 10.734929810074318, |
| "eval_accuracy": 0.7396672771453636, |
| "eval_loss": 9.015625, |
| "eval_runtime": 168.0605, |
| "eval_samples_per_second": 9837.938, |
| "eval_steps_per_second": 19.219, |
| "step": 78000 |
| }, |
| { |
| "epoch": 10.872557115331682, |
| "grad_norm": 15.474593162536621, |
| "learning_rate": 0.0003, |
| "loss": 36.7689296875, |
| "step": 79000 |
| }, |
| { |
| "epoch": 11.010184420589045, |
| "grad_norm": 14.83968448638916, |
| "learning_rate": 0.0003, |
| "loss": 36.761296875, |
| "step": 80000 |
| }, |
| { |
| "epoch": 11.010184420589045, |
| "eval_accuracy": 0.7401702144442798, |
| "eval_loss": 9.0, |
| "eval_runtime": 167.1731, |
| "eval_samples_per_second": 9890.16, |
| "eval_steps_per_second": 19.321, |
| "step": 80000 |
| }, |
| { |
| "epoch": 11.147811725846408, |
| "grad_norm": 16.945621490478516, |
| "learning_rate": 0.0003, |
| "loss": 36.7302734375, |
| "step": 81000 |
| }, |
| { |
| "epoch": 11.28543903110377, |
| "grad_norm": 20.112407684326172, |
| "learning_rate": 0.0003, |
| "loss": 36.74803125, |
| "step": 82000 |
| }, |
| { |
| "epoch": 11.28543903110377, |
| "eval_accuracy": 0.7397283390276018, |
| "eval_loss": 9.046875, |
| "eval_runtime": 167.1017, |
| "eval_samples_per_second": 9894.387, |
| "eval_steps_per_second": 19.33, |
| "step": 82000 |
| }, |
| { |
| "epoch": 11.423066336361135, |
| "grad_norm": 15.264861106872559, |
| "learning_rate": 0.0003, |
| "loss": 36.7436875, |
| "step": 83000 |
| }, |
| { |
| "epoch": 11.560693641618498, |
| "grad_norm": 35.09538269042969, |
| "learning_rate": 0.0003, |
| "loss": 36.68840625, |
| "step": 84000 |
| }, |
| { |
| "epoch": 11.560693641618498, |
| "eval_accuracy": 0.7408698045825953, |
| "eval_loss": 8.96875, |
| "eval_runtime": 168.7302, |
| "eval_samples_per_second": 9798.89, |
| "eval_steps_per_second": 19.143, |
| "step": 84000 |
| }, |
| { |
| "epoch": 11.69832094687586, |
| "grad_norm": 19.97286605834961, |
| "learning_rate": 0.0003, |
| "loss": 36.69001171875, |
| "step": 85000 |
| }, |
| { |
| "epoch": 11.835948252133223, |
| "grad_norm": 17.521934509277344, |
| "learning_rate": 0.0003, |
| "loss": 36.67117578125, |
| "step": 86000 |
| }, |
| { |
| "epoch": 11.835948252133223, |
| "eval_accuracy": 0.7407812017839025, |
| "eval_loss": 8.9765625, |
| "eval_runtime": 169.2449, |
| "eval_samples_per_second": 9769.092, |
| "eval_steps_per_second": 19.085, |
| "step": 86000 |
| }, |
| { |
| "epoch": 11.973575557390586, |
| "grad_norm": 15.003190040588379, |
| "learning_rate": 0.0003, |
| "loss": 36.64848046875, |
| "step": 87000 |
| }, |
| { |
| "epoch": 12.11120286264795, |
| "grad_norm": 20.016794204711914, |
| "learning_rate": 0.0003, |
| "loss": 36.72609765625, |
| "step": 88000 |
| }, |
| { |
| "epoch": 12.11120286264795, |
| "eval_accuracy": 0.7402567288458476, |
| "eval_loss": 9.0, |
| "eval_runtime": 168.1699, |
| "eval_samples_per_second": 9831.539, |
| "eval_steps_per_second": 19.207, |
| "step": 88000 |
| }, |
| { |
| "epoch": 12.248830167905313, |
| "grad_norm": 16.020404815673828, |
| "learning_rate": 0.0003, |
| "loss": 36.8169140625, |
| "step": 89000 |
| }, |
| { |
| "epoch": 12.386457473162675, |
| "grad_norm": 16.005430221557617, |
| "learning_rate": 0.0003, |
| "loss": 36.652015625, |
| "step": 90000 |
| }, |
| { |
| "epoch": 12.386457473162675, |
| "eval_accuracy": 0.740178315482875, |
| "eval_loss": 9.0, |
| "eval_runtime": 169.3304, |
| "eval_samples_per_second": 9764.16, |
| "eval_steps_per_second": 19.075, |
| "step": 90000 |
| }, |
| { |
| "epoch": 12.524084778420038, |
| "grad_norm": 14.788511276245117, |
| "learning_rate": 0.0003, |
| "loss": 36.70846875, |
| "step": 91000 |
| }, |
| { |
| "epoch": 12.661712083677402, |
| "grad_norm": 15.171088218688965, |
| "learning_rate": 0.0003, |
| "loss": 36.9015078125, |
| "step": 92000 |
| }, |
| { |
| "epoch": 12.661712083677402, |
| "eval_accuracy": 0.7409616192832641, |
| "eval_loss": 8.9765625, |
| "eval_runtime": 167.4567, |
| "eval_samples_per_second": 9873.414, |
| "eval_steps_per_second": 19.289, |
| "step": 92000 |
| }, |
| { |
| "epoch": 12.799339388934765, |
| "grad_norm": 15.66518497467041, |
| "learning_rate": 0.0003, |
| "loss": 36.66136328125, |
| "step": 93000 |
| }, |
| { |
| "epoch": 12.936966694192128, |
| "grad_norm": 17.391189575195312, |
| "learning_rate": 0.0003, |
| "loss": 36.62637109375, |
| "step": 94000 |
| }, |
| { |
| "epoch": 12.936966694192128, |
| "eval_accuracy": 0.7408841425444469, |
| "eval_loss": 8.96875, |
| "eval_runtime": 168.9204, |
| "eval_samples_per_second": 9787.86, |
| "eval_steps_per_second": 19.121, |
| "step": 94000 |
| }, |
| { |
| "epoch": 13.07459399944949, |
| "grad_norm": 14.819647789001465, |
| "learning_rate": 0.0003, |
| "loss": 36.554375, |
| "step": 95000 |
| }, |
| { |
| "epoch": 13.212221304706853, |
| "grad_norm": 15.322741508483887, |
| "learning_rate": 0.0003, |
| "loss": 36.5329375, |
| "step": 96000 |
| }, |
| { |
| "epoch": 13.212221304706853, |
| "eval_accuracy": 0.7418048186655257, |
| "eval_loss": 8.9296875, |
| "eval_runtime": 168.0129, |
| "eval_samples_per_second": 9840.727, |
| "eval_steps_per_second": 19.225, |
| "step": 96000 |
| }, |
| { |
| "epoch": 13.349848609964218, |
| "grad_norm": 15.646768569946289, |
| "learning_rate": 0.0003, |
| "loss": 36.5148125, |
| "step": 97000 |
| }, |
| { |
| "epoch": 13.48747591522158, |
| "grad_norm": 22.885791778564453, |
| "learning_rate": 0.0003, |
| "loss": 36.550625, |
| "step": 98000 |
| }, |
| { |
| "epoch": 13.48747591522158, |
| "eval_accuracy": 0.7403524509758528, |
| "eval_loss": 9.0078125, |
| "eval_runtime": 169.9032, |
| "eval_samples_per_second": 9731.239, |
| "eval_steps_per_second": 19.011, |
| "step": 98000 |
| }, |
| { |
| "epoch": 13.625103220478943, |
| "grad_norm": 23.617691040039062, |
| "learning_rate": 0.0003, |
| "loss": 36.64653515625, |
| "step": 99000 |
| }, |
| { |
| "epoch": 13.762730525736306, |
| "grad_norm": 16.434221267700195, |
| "learning_rate": 0.0003, |
| "loss": 36.5250390625, |
| "step": 100000 |
| }, |
| { |
| "epoch": 13.762730525736306, |
| "eval_accuracy": 0.741298813475031, |
| "eval_loss": 8.9453125, |
| "eval_runtime": 170.5631, |
| "eval_samples_per_second": 9693.591, |
| "eval_steps_per_second": 18.937, |
| "step": 100000 |
| }, |
| { |
| "epoch": 13.90035783099367, |
| "grad_norm": 15.552685737609863, |
| "learning_rate": 0.0003, |
| "loss": 36.50655078125, |
| "step": 101000 |
| }, |
| { |
| "epoch": 14.037985136251033, |
| "grad_norm": 15.682257652282715, |
| "learning_rate": 0.0003, |
| "loss": 36.52517578125, |
| "step": 102000 |
| }, |
| { |
| "epoch": 14.037985136251033, |
| "eval_accuracy": 0.7417503019794978, |
| "eval_loss": 8.9296875, |
| "eval_runtime": 167.2274, |
| "eval_samples_per_second": 9886.95, |
| "eval_steps_per_second": 19.315, |
| "step": 102000 |
| }, |
| { |
| "epoch": 14.175612441508395, |
| "grad_norm": 16.582626342773438, |
| "learning_rate": 0.0003, |
| "loss": 36.4439453125, |
| "step": 103000 |
| }, |
| { |
| "epoch": 14.313239746765758, |
| "grad_norm": 15.228682518005371, |
| "learning_rate": 0.0003, |
| "loss": 36.7357578125, |
| "step": 104000 |
| }, |
| { |
| "epoch": 14.313239746765758, |
| "eval_accuracy": 0.7409231190118309, |
| "eval_loss": 8.984375, |
| "eval_runtime": 168.495, |
| "eval_samples_per_second": 9812.569, |
| "eval_steps_per_second": 19.17, |
| "step": 104000 |
| }, |
| { |
| "epoch": 14.45086705202312, |
| "grad_norm": 18.01194190979004, |
| "learning_rate": 0.0003, |
| "loss": 36.506953125, |
| "step": 105000 |
| }, |
| { |
| "epoch": 14.588494357280485, |
| "grad_norm": 16.423683166503906, |
| "learning_rate": 0.0003, |
| "loss": 36.4571796875, |
| "step": 106000 |
| }, |
| { |
| "epoch": 14.588494357280485, |
| "eval_accuracy": 0.7415011536669796, |
| "eval_loss": 8.9453125, |
| "eval_runtime": 170.7059, |
| "eval_samples_per_second": 9685.482, |
| "eval_steps_per_second": 18.921, |
| "step": 106000 |
| }, |
| { |
| "epoch": 14.726121662537848, |
| "grad_norm": 15.935735702514648, |
| "learning_rate": 0.0003, |
| "loss": 36.4399453125, |
| "step": 107000 |
| }, |
| { |
| "epoch": 14.86374896779521, |
| "grad_norm": 34.532562255859375, |
| "learning_rate": 0.0003, |
| "loss": 36.496796875, |
| "step": 108000 |
| }, |
| { |
| "epoch": 14.86374896779521, |
| "eval_accuracy": 0.740452263067793, |
| "eval_loss": 9.0, |
| "eval_runtime": 169.5593, |
| "eval_samples_per_second": 9750.98, |
| "eval_steps_per_second": 19.049, |
| "step": 108000 |
| }, |
| { |
| "epoch": 15.001376273052573, |
| "grad_norm": 17.5130615234375, |
| "learning_rate": 0.0003, |
| "loss": 36.44953125, |
| "step": 109000 |
| }, |
| { |
| "epoch": 15.139003578309937, |
| "grad_norm": 17.350488662719727, |
| "learning_rate": 0.0003, |
| "loss": 36.5754375, |
| "step": 110000 |
| }, |
| { |
| "epoch": 15.139003578309937, |
| "eval_accuracy": 0.7407140399047041, |
| "eval_loss": 8.984375, |
| "eval_runtime": 169.3269, |
| "eval_samples_per_second": 9764.36, |
| "eval_steps_per_second": 19.076, |
| "step": 110000 |
| }, |
| { |
| "epoch": 15.2766308835673, |
| "grad_norm": 16.463069915771484, |
| "learning_rate": 0.0003, |
| "loss": 36.57256640625, |
| "step": 111000 |
| }, |
| { |
| "epoch": 15.414258188824663, |
| "grad_norm": 15.29598331451416, |
| "learning_rate": 0.0003, |
| "loss": 36.57964453125, |
| "step": 112000 |
| }, |
| { |
| "epoch": 15.414258188824663, |
| "eval_accuracy": 0.7415344808420624, |
| "eval_loss": 8.9375, |
| "eval_runtime": 169.2422, |
| "eval_samples_per_second": 9769.248, |
| "eval_steps_per_second": 19.085, |
| "step": 112000 |
| }, |
| { |
| "epoch": 15.551885494082025, |
| "grad_norm": 26.094396591186523, |
| "learning_rate": 0.0003, |
| "loss": 36.45933203125, |
| "step": 113000 |
| }, |
| { |
| "epoch": 15.689512799339388, |
| "grad_norm": 15.42809009552002, |
| "learning_rate": 0.0003, |
| "loss": 36.4115546875, |
| "step": 114000 |
| }, |
| { |
| "epoch": 15.689512799339388, |
| "eval_accuracy": 0.7421097237159107, |
| "eval_loss": 8.921875, |
| "eval_runtime": 168.9547, |
| "eval_samples_per_second": 9785.869, |
| "eval_steps_per_second": 19.118, |
| "step": 114000 |
| }, |
| { |
| "epoch": 15.827140104596753, |
| "grad_norm": 18.432769775390625, |
| "learning_rate": 0.0003, |
| "loss": 36.36869921875, |
| "step": 115000 |
| }, |
| { |
| "epoch": 15.964767409854115, |
| "grad_norm": 16.479217529296875, |
| "learning_rate": 0.0003, |
| "loss": 36.33540625, |
| "step": 116000 |
| }, |
| { |
| "epoch": 15.964767409854115, |
| "eval_accuracy": 0.742688496326886, |
| "eval_loss": 8.9140625, |
| "eval_runtime": 171.1574, |
| "eval_samples_per_second": 9659.935, |
| "eval_steps_per_second": 18.872, |
| "step": 116000 |
| }, |
| { |
| "epoch": 16.10239471511148, |
| "grad_norm": 16.778072357177734, |
| "learning_rate": 0.0003, |
| "loss": 36.3029140625, |
| "step": 117000 |
| }, |
| { |
| "epoch": 16.24002202036884, |
| "grad_norm": 17.59860610961914, |
| "learning_rate": 0.0003, |
| "loss": 36.37992578125, |
| "step": 118000 |
| }, |
| { |
| "epoch": 16.24002202036884, |
| "eval_accuracy": 0.7424896095870896, |
| "eval_loss": 8.9140625, |
| "eval_runtime": 169.4524, |
| "eval_samples_per_second": 9757.132, |
| "eval_steps_per_second": 19.061, |
| "step": 118000 |
| }, |
| { |
| "epoch": 16.377649325626205, |
| "grad_norm": 19.460819244384766, |
| "learning_rate": 0.0003, |
| "loss": 36.3964765625, |
| "step": 119000 |
| }, |
| { |
| "epoch": 16.515276630883566, |
| "grad_norm": 47.38739776611328, |
| "learning_rate": 0.0003, |
| "loss": 36.399671875, |
| "step": 120000 |
| }, |
| { |
| "epoch": 16.515276630883566, |
| "eval_accuracy": 0.7421772826656658, |
| "eval_loss": 8.90625, |
| "eval_runtime": 168.2033, |
| "eval_samples_per_second": 9829.585, |
| "eval_steps_per_second": 19.203, |
| "step": 120000 |
| }, |
| { |
| "epoch": 16.515276630883566, |
| "step": 120000, |
| "total_flos": 1.617130434771026e+19, |
| "train_loss": 38.9186564453125, |
| "train_runtime": 44793.9304, |
| "train_samples_per_second": 5486.458, |
| "train_steps_per_second": 2.679 |
| } |
| ], |
| "logging_steps": 1000, |
| "max_steps": 120000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 17, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.617130434771026e+19, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|