| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 30.979827089337174, |
| "eval_steps": 1200, |
| "global_step": 21500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.7204610951008645, |
| "grad_norm": 0.19696219265460968, |
| "learning_rate": 0.0004967269843558504, |
| "loss": 0.115, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.440922190201729, |
| "grad_norm": 0.18757623434066772, |
| "learning_rate": 0.0004931223415759498, |
| "loss": 0.0605, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.729106628242075, |
| "eval_loss": 0.051301125437021255, |
| "eval_runtime": 20.8394, |
| "eval_samples_per_second": 111.039, |
| "eval_steps_per_second": 0.096, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.161383285302594, |
| "grad_norm": 0.23383216559886932, |
| "learning_rate": 0.0004895176987960493, |
| "loss": 0.06, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.881844380403458, |
| "grad_norm": 0.8578475713729858, |
| "learning_rate": 0.00048591305601614884, |
| "loss": 0.045, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.4582132564841497, |
| "eval_loss": 0.048763249069452286, |
| "eval_runtime": 21.4687, |
| "eval_samples_per_second": 107.785, |
| "eval_steps_per_second": 0.093, |
| "step": 2400 |
| }, |
| { |
| "epoch": 3.602305475504323, |
| "grad_norm": 0.5358479619026184, |
| "learning_rate": 0.0004823084132362483, |
| "loss": 0.034, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.322766570605188, |
| "grad_norm": 0.3984196186065674, |
| "learning_rate": 0.0004787037704563478, |
| "loss": 0.0278, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.043227665706052, |
| "grad_norm": 0.5611603856086731, |
| "learning_rate": 0.00047509912767644725, |
| "loss": 0.021, |
| "step": 3500 |
| }, |
| { |
| "epoch": 5.187319884726225, |
| "eval_loss": 0.05793336406350136, |
| "eval_runtime": 21.4122, |
| "eval_samples_per_second": 108.069, |
| "eval_steps_per_second": 0.093, |
| "step": 3600 |
| }, |
| { |
| "epoch": 5.763688760806916, |
| "grad_norm": 0.33251306414604187, |
| "learning_rate": 0.0004714944848965468, |
| "loss": 0.019, |
| "step": 4000 |
| }, |
| { |
| "epoch": 6.484149855907781, |
| "grad_norm": 0.5683927536010742, |
| "learning_rate": 0.00046788984211664625, |
| "loss": 0.0168, |
| "step": 4500 |
| }, |
| { |
| "epoch": 6.916426512968299, |
| "eval_loss": 0.047444652765989304, |
| "eval_runtime": 21.5287, |
| "eval_samples_per_second": 107.484, |
| "eval_steps_per_second": 0.093, |
| "step": 4800 |
| }, |
| { |
| "epoch": 7.204610951008646, |
| "grad_norm": 1.6696492433547974, |
| "learning_rate": 0.0004642851993367457, |
| "loss": 0.0153, |
| "step": 5000 |
| }, |
| { |
| "epoch": 7.92507204610951, |
| "grad_norm": 0.6783491373062134, |
| "learning_rate": 0.0004606805565568452, |
| "loss": 0.0116, |
| "step": 5500 |
| }, |
| { |
| "epoch": 8.645533141210375, |
| "grad_norm": 0.4771524667739868, |
| "learning_rate": 0.0004570759137769447, |
| "loss": 0.0118, |
| "step": 6000 |
| }, |
| { |
| "epoch": 8.645533141210375, |
| "eval_loss": 0.06448203325271606, |
| "eval_runtime": 20.7245, |
| "eval_samples_per_second": 111.655, |
| "eval_steps_per_second": 0.097, |
| "step": 6000 |
| }, |
| { |
| "epoch": 9.36599423631124, |
| "grad_norm": 0.45867717266082764, |
| "learning_rate": 0.0004534712709970442, |
| "loss": 0.0095, |
| "step": 6500 |
| }, |
| { |
| "epoch": 10.086455331412104, |
| "grad_norm": 1.0143071413040161, |
| "learning_rate": 0.0004498666282171437, |
| "loss": 0.0081, |
| "step": 7000 |
| }, |
| { |
| "epoch": 10.37463976945245, |
| "eval_loss": 0.059642400592565536, |
| "eval_runtime": 20.5013, |
| "eval_samples_per_second": 112.871, |
| "eval_steps_per_second": 0.098, |
| "step": 7200 |
| }, |
| { |
| "epoch": 10.806916426512968, |
| "grad_norm": 0.34545987844467163, |
| "learning_rate": 0.0004462619854372432, |
| "loss": 0.0077, |
| "step": 7500 |
| }, |
| { |
| "epoch": 11.527377521613833, |
| "grad_norm": 0.6745367050170898, |
| "learning_rate": 0.00044265734265734266, |
| "loss": 0.0073, |
| "step": 8000 |
| }, |
| { |
| "epoch": 12.103746397694524, |
| "eval_loss": 0.057360123842954636, |
| "eval_runtime": 21.5407, |
| "eval_samples_per_second": 107.425, |
| "eval_steps_per_second": 0.093, |
| "step": 8400 |
| }, |
| { |
| "epoch": 12.247838616714697, |
| "grad_norm": 0.3190229535102844, |
| "learning_rate": 0.0004390526998774422, |
| "loss": 0.0065, |
| "step": 8500 |
| }, |
| { |
| "epoch": 12.968299711815561, |
| "grad_norm": 0.20763935148715973, |
| "learning_rate": 0.00043544805709754166, |
| "loss": 0.0064, |
| "step": 9000 |
| }, |
| { |
| "epoch": 13.688760806916427, |
| "grad_norm": 0.11372426152229309, |
| "learning_rate": 0.00043184341431764113, |
| "loss": 0.0059, |
| "step": 9500 |
| }, |
| { |
| "epoch": 13.832853025936599, |
| "eval_loss": 0.08313994109630585, |
| "eval_runtime": 21.5595, |
| "eval_samples_per_second": 107.331, |
| "eval_steps_per_second": 0.093, |
| "step": 9600 |
| }, |
| { |
| "epoch": 14.409221902017292, |
| "grad_norm": 0.6901423335075378, |
| "learning_rate": 0.0004282387715377406, |
| "loss": 0.0055, |
| "step": 10000 |
| }, |
| { |
| "epoch": 15.129682997118156, |
| "grad_norm": 0.5882952213287354, |
| "learning_rate": 0.0004246341287578401, |
| "loss": 0.005, |
| "step": 10500 |
| }, |
| { |
| "epoch": 15.561959654178674, |
| "eval_loss": 0.06821350008249283, |
| "eval_runtime": 20.3166, |
| "eval_samples_per_second": 113.897, |
| "eval_steps_per_second": 0.098, |
| "step": 10800 |
| }, |
| { |
| "epoch": 15.85014409221902, |
| "grad_norm": 0.4642440676689148, |
| "learning_rate": 0.0004210294859779396, |
| "loss": 0.0049, |
| "step": 11000 |
| }, |
| { |
| "epoch": 16.570605187319885, |
| "grad_norm": 0.9032358527183533, |
| "learning_rate": 0.00041742484319803907, |
| "loss": 0.0048, |
| "step": 11500 |
| }, |
| { |
| "epoch": 17.29106628242075, |
| "grad_norm": 0.5521640777587891, |
| "learning_rate": 0.00041382020041813854, |
| "loss": 0.0046, |
| "step": 12000 |
| }, |
| { |
| "epoch": 17.29106628242075, |
| "eval_loss": 0.08423992991447449, |
| "eval_runtime": 21.1812, |
| "eval_samples_per_second": 109.248, |
| "eval_steps_per_second": 0.094, |
| "step": 12000 |
| }, |
| { |
| "epoch": 18.011527377521613, |
| "grad_norm": 0.7376463413238525, |
| "learning_rate": 0.000410215557638238, |
| "loss": 0.0044, |
| "step": 12500 |
| }, |
| { |
| "epoch": 18.73198847262248, |
| "grad_norm": 1.1471983194351196, |
| "learning_rate": 0.0004066109148583376, |
| "loss": 0.0045, |
| "step": 13000 |
| }, |
| { |
| "epoch": 19.020172910662826, |
| "eval_loss": 0.07880275696516037, |
| "eval_runtime": 21.5701, |
| "eval_samples_per_second": 107.278, |
| "eval_steps_per_second": 0.093, |
| "step": 13200 |
| }, |
| { |
| "epoch": 19.45244956772334, |
| "grad_norm": 0.053835347294807434, |
| "learning_rate": 0.00040300627207843706, |
| "loss": 0.0041, |
| "step": 13500 |
| }, |
| { |
| "epoch": 20.172910662824208, |
| "grad_norm": 0.7777488231658936, |
| "learning_rate": 0.00039940162929853653, |
| "loss": 0.0042, |
| "step": 14000 |
| }, |
| { |
| "epoch": 20.7492795389049, |
| "eval_loss": 0.062229253351688385, |
| "eval_runtime": 20.4938, |
| "eval_samples_per_second": 112.912, |
| "eval_steps_per_second": 0.098, |
| "step": 14400 |
| }, |
| { |
| "epoch": 20.89337175792507, |
| "grad_norm": 0.14320553839206696, |
| "learning_rate": 0.000395796986518636, |
| "loss": 0.004, |
| "step": 14500 |
| }, |
| { |
| "epoch": 21.613832853025936, |
| "grad_norm": 0.3327866494655609, |
| "learning_rate": 0.00039219234373873553, |
| "loss": 0.004, |
| "step": 15000 |
| }, |
| { |
| "epoch": 22.334293948126803, |
| "grad_norm": 0.29509493708610535, |
| "learning_rate": 0.000388587700958835, |
| "loss": 0.0037, |
| "step": 15500 |
| }, |
| { |
| "epoch": 22.478386167146976, |
| "eval_loss": 0.07450389117002487, |
| "eval_runtime": 21.8716, |
| "eval_samples_per_second": 105.799, |
| "eval_steps_per_second": 0.091, |
| "step": 15600 |
| }, |
| { |
| "epoch": 23.054755043227665, |
| "grad_norm": 0.5017435550689697, |
| "learning_rate": 0.00038498305817893447, |
| "loss": 0.0038, |
| "step": 16000 |
| }, |
| { |
| "epoch": 23.77521613832853, |
| "grad_norm": 0.05931377038359642, |
| "learning_rate": 0.00038137841539903394, |
| "loss": 0.0038, |
| "step": 16500 |
| }, |
| { |
| "epoch": 24.207492795389047, |
| "eval_loss": 0.09549176692962646, |
| "eval_runtime": 21.5513, |
| "eval_samples_per_second": 107.372, |
| "eval_steps_per_second": 0.093, |
| "step": 16800 |
| }, |
| { |
| "epoch": 24.495677233429394, |
| "grad_norm": 0.13349242508411407, |
| "learning_rate": 0.0003777737726191334, |
| "loss": 0.0034, |
| "step": 17000 |
| }, |
| { |
| "epoch": 25.21613832853026, |
| "grad_norm": 0.19320227205753326, |
| "learning_rate": 0.00037416912983923294, |
| "loss": 0.0034, |
| "step": 17500 |
| }, |
| { |
| "epoch": 25.936599423631122, |
| "grad_norm": 0.24608492851257324, |
| "learning_rate": 0.0003705644870593324, |
| "loss": 0.0034, |
| "step": 18000 |
| }, |
| { |
| "epoch": 25.936599423631122, |
| "eval_loss": 0.10036125034093857, |
| "eval_runtime": 22.0387, |
| "eval_samples_per_second": 104.997, |
| "eval_steps_per_second": 0.091, |
| "step": 18000 |
| }, |
| { |
| "epoch": 26.65706051873199, |
| "grad_norm": 0.11887585371732712, |
| "learning_rate": 0.0003669598442794319, |
| "loss": 0.0033, |
| "step": 18500 |
| }, |
| { |
| "epoch": 27.377521613832855, |
| "grad_norm": 0.5103694796562195, |
| "learning_rate": 0.0003633552014995314, |
| "loss": 0.0031, |
| "step": 19000 |
| }, |
| { |
| "epoch": 27.665706051873197, |
| "eval_loss": 0.0853080227971077, |
| "eval_runtime": 21.6671, |
| "eval_samples_per_second": 106.798, |
| "eval_steps_per_second": 0.092, |
| "step": 19200 |
| }, |
| { |
| "epoch": 28.097982708933717, |
| "grad_norm": 0.9122279286384583, |
| "learning_rate": 0.00035975055871963093, |
| "loss": 0.0034, |
| "step": 19500 |
| }, |
| { |
| "epoch": 28.818443804034583, |
| "grad_norm": 0.028490234166383743, |
| "learning_rate": 0.0003561459159397304, |
| "loss": 0.0035, |
| "step": 20000 |
| }, |
| { |
| "epoch": 29.394812680115272, |
| "eval_loss": 0.05787323787808418, |
| "eval_runtime": 21.1854, |
| "eval_samples_per_second": 109.226, |
| "eval_steps_per_second": 0.094, |
| "step": 20400 |
| }, |
| { |
| "epoch": 29.538904899135446, |
| "grad_norm": 0.32352131605148315, |
| "learning_rate": 0.0003525412731598299, |
| "loss": 0.0036, |
| "step": 20500 |
| }, |
| { |
| "epoch": 30.259365994236312, |
| "grad_norm": 0.43146830797195435, |
| "learning_rate": 0.00034893663037992935, |
| "loss": 0.0032, |
| "step": 21000 |
| }, |
| { |
| "epoch": 30.979827089337174, |
| "grad_norm": 0.22915582358837128, |
| "learning_rate": 0.0003453319876000288, |
| "loss": 0.0026, |
| "step": 21500 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 69400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 100, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.477693523839612e+17, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|