| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 237, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.06389776357827476, |
| "grad_norm": 17.501014709472656, |
| "learning_rate": 9.992973140107996e-07, |
| "loss": 1.3507, |
| "num_input_tokens_seen": 93984, |
| "step": 5, |
| "train_runtime": 556.6226, |
| "train_tokens_per_second": 168.847 |
| }, |
| { |
| "epoch": 0.12779552715654952, |
| "grad_norm": 11.144261360168457, |
| "learning_rate": 9.964460368509865e-07, |
| "loss": 1.0678, |
| "num_input_tokens_seen": 187200, |
| "step": 10, |
| "train_runtime": 1095.0218, |
| "train_tokens_per_second": 170.956 |
| }, |
| { |
| "epoch": 0.19169329073482427, |
| "grad_norm": 5.366312026977539, |
| "learning_rate": 9.914147615517526e-07, |
| "loss": 0.8995, |
| "num_input_tokens_seen": 282560, |
| "step": 15, |
| "train_runtime": 1632.6431, |
| "train_tokens_per_second": 173.069 |
| }, |
| { |
| "epoch": 0.25559105431309903, |
| "grad_norm": 5.131296157836914, |
| "learning_rate": 9.842255814927944e-07, |
| "loss": 0.8231, |
| "num_input_tokens_seen": 376064, |
| "step": 20, |
| "train_runtime": 2172.7033, |
| "train_tokens_per_second": 173.086 |
| }, |
| { |
| "epoch": 0.3194888178913738, |
| "grad_norm": 3.8358519077301025, |
| "learning_rate": 9.749100658638914e-07, |
| "loss": 0.7679, |
| "num_input_tokens_seen": 471616, |
| "step": 25, |
| "train_runtime": 2712.1273, |
| "train_tokens_per_second": 173.892 |
| }, |
| { |
| "epoch": 0.38338658146964855, |
| "grad_norm": 3.4662442207336426, |
| "learning_rate": 9.63509121038005e-07, |
| "loss": 0.7301, |
| "num_input_tokens_seen": 564352, |
| "step": 30, |
| "train_runtime": 3636.2616, |
| "train_tokens_per_second": 155.201 |
| }, |
| { |
| "epoch": 0.4472843450479233, |
| "grad_norm": 3.0572352409362793, |
| "learning_rate": 9.500728109428603e-07, |
| "loss": 0.7017, |
| "num_input_tokens_seen": 657568, |
| "step": 35, |
| "train_runtime": 3832.9972, |
| "train_tokens_per_second": 171.555 |
| }, |
| { |
| "epoch": 0.5111821086261981, |
| "grad_norm": 3.1490511894226074, |
| "learning_rate": 9.346601372197913e-07, |
| "loss": 0.7058, |
| "num_input_tokens_seen": 750720, |
| "step": 40, |
| "train_runtime": 4012.6034, |
| "train_tokens_per_second": 187.091 |
| }, |
| { |
| "epoch": 0.5750798722044729, |
| "grad_norm": 3.2475409507751465, |
| "learning_rate": 9.17338780135223e-07, |
| "loss": 0.6841, |
| "num_input_tokens_seen": 842784, |
| "step": 45, |
| "train_runtime": 4192.5527, |
| "train_tokens_per_second": 201.019 |
| }, |
| { |
| "epoch": 0.6389776357827476, |
| "grad_norm": 3.1415064334869385, |
| "learning_rate": 8.981848013824993e-07, |
| "loss": 0.6738, |
| "num_input_tokens_seen": 936128, |
| "step": 50, |
| "train_runtime": 4379.3543, |
| "train_tokens_per_second": 213.759 |
| }, |
| { |
| "epoch": 0.7028753993610224, |
| "grad_norm": 3.2890915870666504, |
| "learning_rate": 8.77282310079115e-07, |
| "loss": 0.6643, |
| "num_input_tokens_seen": 1028160, |
| "step": 55, |
| "train_runtime": 4592.6801, |
| "train_tokens_per_second": 223.869 |
| }, |
| { |
| "epoch": 0.7667731629392971, |
| "grad_norm": 2.9755921363830566, |
| "learning_rate": 8.547230934260311e-07, |
| "loss": 0.6449, |
| "num_input_tokens_seen": 1119552, |
| "step": 60, |
| "train_runtime": 5103.7665, |
| "train_tokens_per_second": 219.358 |
| }, |
| { |
| "epoch": 0.8306709265175719, |
| "grad_norm": 3.0064377784729004, |
| "learning_rate": 8.306062136509219e-07, |
| "loss": 0.6547, |
| "num_input_tokens_seen": 1212032, |
| "step": 65, |
| "train_runtime": 5281.6563, |
| "train_tokens_per_second": 229.48 |
| }, |
| { |
| "epoch": 0.8945686900958466, |
| "grad_norm": 3.0753512382507324, |
| "learning_rate": 8.050375730052621e-07, |
| "loss": 0.6543, |
| "num_input_tokens_seen": 1306368, |
| "step": 70, |
| "train_runtime": 5458.516, |
| "train_tokens_per_second": 239.327 |
| }, |
| { |
| "epoch": 0.9584664536741214, |
| "grad_norm": 2.9098427295684814, |
| "learning_rate": 7.781294487254435e-07, |
| "loss": 0.6579, |
| "num_input_tokens_seen": 1400576, |
| "step": 75, |
| "train_runtime": 5636.7125, |
| "train_tokens_per_second": 248.474 |
| }, |
| { |
| "epoch": 1.012779552715655, |
| "grad_norm": 3.033903121948242, |
| "learning_rate": 7.5e-07, |
| "loss": 0.6344, |
| "num_input_tokens_seen": 1481248, |
| "step": 80, |
| "train_runtime": 5831.8252, |
| "train_tokens_per_second": 253.994 |
| }, |
| { |
| "epoch": 1.0766773162939298, |
| "grad_norm": 2.6656150817871094, |
| "learning_rate": 7.207727491079559e-07, |
| "loss": 0.6292, |
| "num_input_tokens_seen": 1575104, |
| "step": 85, |
| "train_runtime": 6012.044, |
| "train_tokens_per_second": 261.991 |
| }, |
| { |
| "epoch": 1.1405750798722045, |
| "grad_norm": 2.7004282474517822, |
| "learning_rate": 6.905760390067234e-07, |
| "loss": 0.6239, |
| "num_input_tokens_seen": 1668064, |
| "step": 90, |
| "train_runtime": 6510.4742, |
| "train_tokens_per_second": 256.212 |
| }, |
| { |
| "epoch": 1.2044728434504792, |
| "grad_norm": 2.72955060005188, |
| "learning_rate": 6.595424697513963e-07, |
| "loss": 0.6157, |
| "num_input_tokens_seen": 1764128, |
| "step": 95, |
| "train_runtime": 6697.2327, |
| "train_tokens_per_second": 263.411 |
| }, |
| { |
| "epoch": 1.268370607028754, |
| "grad_norm": 2.819629192352295, |
| "learning_rate": 6.278083162202373e-07, |
| "loss": 0.6096, |
| "num_input_tokens_seen": 1858912, |
| "step": 100, |
| "train_runtime": 6880.9117, |
| "train_tokens_per_second": 270.155 |
| }, |
| { |
| "epoch": 1.3322683706070286, |
| "grad_norm": 2.837791919708252, |
| "learning_rate": 5.955129297032538e-07, |
| "loss": 0.5967, |
| "num_input_tokens_seen": 1952640, |
| "step": 105, |
| "train_runtime": 7062.8237, |
| "train_tokens_per_second": 276.467 |
| }, |
| { |
| "epoch": 1.3961661341853036, |
| "grad_norm": 2.6342546939849854, |
| "learning_rate": 5.62798125981604e-07, |
| "loss": 0.6051, |
| "num_input_tokens_seen": 2045792, |
| "step": 110, |
| "train_runtime": 7245.471, |
| "train_tokens_per_second": 282.355 |
| }, |
| { |
| "epoch": 1.4600638977635783, |
| "grad_norm": 2.5401482582092285, |
| "learning_rate": 5.298075625849099e-07, |
| "loss": 0.5899, |
| "num_input_tokens_seen": 2140736, |
| "step": 115, |
| "train_runtime": 7427.2464, |
| "train_tokens_per_second": 288.227 |
| }, |
| { |
| "epoch": 1.5239616613418532, |
| "grad_norm": 2.6414806842803955, |
| "learning_rate": 4.966861079610687e-07, |
| "loss": 0.5901, |
| "num_input_tokens_seen": 2233280, |
| "step": 120, |
| "train_runtime": 7929.4515, |
| "train_tokens_per_second": 281.644 |
| }, |
| { |
| "epoch": 1.5878594249201279, |
| "grad_norm": 2.817983865737915, |
| "learning_rate": 4.6357920532866816e-07, |
| "loss": 0.6011, |
| "num_input_tokens_seen": 2326144, |
| "step": 125, |
| "train_runtime": 8118.8041, |
| "train_tokens_per_second": 286.513 |
| }, |
| { |
| "epoch": 1.6517571884984026, |
| "grad_norm": 2.9443130493164062, |
| "learning_rate": 4.306322340054659e-07, |
| "loss": 0.5969, |
| "num_input_tokens_seen": 2418592, |
| "step": 130, |
| "train_runtime": 8306.8501, |
| "train_tokens_per_second": 291.156 |
| }, |
| { |
| "epoch": 1.7156549520766773, |
| "grad_norm": 2.630876302719116, |
| "learning_rate": 3.979898710174677e-07, |
| "loss": 0.5948, |
| "num_input_tokens_seen": 2512320, |
| "step": 135, |
| "train_runtime": 8501.9493, |
| "train_tokens_per_second": 295.499 |
| }, |
| { |
| "epoch": 1.779552715654952, |
| "grad_norm": 2.6901042461395264, |
| "learning_rate": 3.657954557919183e-07, |
| "loss": 0.598, |
| "num_input_tokens_seen": 2606112, |
| "step": 140, |
| "train_runtime": 8694.019, |
| "train_tokens_per_second": 299.759 |
| }, |
| { |
| "epoch": 1.8434504792332267, |
| "grad_norm": 2.8361966609954834, |
| "learning_rate": 3.3419036072396614e-07, |
| "loss": 0.5902, |
| "num_input_tokens_seen": 2699936, |
| "step": 145, |
| "train_runtime": 8889.9576, |
| "train_tokens_per_second": 303.706 |
| }, |
| { |
| "epoch": 1.9073482428115016, |
| "grad_norm": 2.87080979347229, |
| "learning_rate": 3.033133703809759e-07, |
| "loss": 0.5978, |
| "num_input_tokens_seen": 2795136, |
| "step": 150, |
| "train_runtime": 9406.664, |
| "train_tokens_per_second": 297.144 |
| }, |
| { |
| "epoch": 1.9712460063897763, |
| "grad_norm": 2.7429561614990234, |
| "learning_rate": 2.7330007207053406e-07, |
| "loss": 0.5946, |
| "num_input_tokens_seen": 2888960, |
| "step": 155, |
| "train_runtime": 9589.8224, |
| "train_tokens_per_second": 301.253 |
| }, |
| { |
| "epoch": 2.02555910543131, |
| "grad_norm": 2.6952402591705322, |
| "learning_rate": 2.442822604482889e-07, |
| "loss": 0.5918, |
| "num_input_tokens_seen": 2968224, |
| "step": 160, |
| "train_runtime": 9773.1495, |
| "train_tokens_per_second": 303.712 |
| }, |
| { |
| "epoch": 2.0894568690095845, |
| "grad_norm": 2.673067569732666, |
| "learning_rate": 2.16387358780116e-07, |
| "loss": 0.5663, |
| "num_input_tokens_seen": 3062400, |
| "step": 165, |
| "train_runtime": 9959.8127, |
| "train_tokens_per_second": 307.476 |
| }, |
| { |
| "epoch": 2.1533546325878596, |
| "grad_norm": 2.7801618576049805, |
| "learning_rate": 1.8973785939996927e-07, |
| "loss": 0.5791, |
| "num_input_tokens_seen": 3155520, |
| "step": 170, |
| "train_runtime": 10148.3528, |
| "train_tokens_per_second": 310.939 |
| }, |
| { |
| "epoch": 2.2172523961661343, |
| "grad_norm": 2.5863192081451416, |
| "learning_rate": 1.6445078582048154e-07, |
| "loss": 0.5695, |
| "num_input_tokens_seen": 3250496, |
| "step": 175, |
| "train_runtime": 10338.7555, |
| "train_tokens_per_second": 314.399 |
| }, |
| { |
| "epoch": 2.281150159744409, |
| "grad_norm": 2.7501046657562256, |
| "learning_rate": 1.4063717885830373e-07, |
| "loss": 0.5675, |
| "num_input_tokens_seen": 3344672, |
| "step": 180, |
| "train_runtime": 10847.5534, |
| "train_tokens_per_second": 308.334 |
| }, |
| { |
| "epoch": 2.3450479233226837, |
| "grad_norm": 2.7307002544403076, |
| "learning_rate": 1.184016090307059e-07, |
| "loss": 0.5657, |
| "num_input_tokens_seen": 3438784, |
| "step": 185, |
| "train_runtime": 11033.7768, |
| "train_tokens_per_second": 311.66 |
| }, |
| { |
| "epoch": 2.4089456869009584, |
| "grad_norm": 2.663017988204956, |
| "learning_rate": 9.78417173646176e-08, |
| "loss": 0.5745, |
| "num_input_tokens_seen": 3531840, |
| "step": 190, |
| "train_runtime": 11217.6988, |
| "train_tokens_per_second": 314.845 |
| }, |
| { |
| "epoch": 2.472843450479233, |
| "grad_norm": 2.6426873207092285, |
| "learning_rate": 7.904778663450323e-08, |
| "loss": 0.5885, |
| "num_input_tokens_seen": 3625984, |
| "step": 195, |
| "train_runtime": 11403.1929, |
| "train_tokens_per_second": 317.98 |
| }, |
| { |
| "epoch": 2.536741214057508, |
| "grad_norm": 2.7548089027404785, |
| "learning_rate": 6.210234491186079e-08, |
| "loss": 0.5748, |
| "num_input_tokens_seen": 3720352, |
| "step": 200, |
| "train_runtime": 11603.6555, |
| "train_tokens_per_second": 320.619 |
| }, |
| { |
| "epoch": 2.600638977635783, |
| "grad_norm": 2.6678242683410645, |
| "learning_rate": 4.7079803167238366e-08, |
| "loss": 0.5741, |
| "num_input_tokens_seen": 3812992, |
| "step": 205, |
| "train_runtime": 11793.3317, |
| "train_tokens_per_second": 323.318 |
| }, |
| { |
| "epoch": 2.6645367412140573, |
| "grad_norm": 2.8868303298950195, |
| "learning_rate": 3.4046128516136754e-08, |
| "loss": 0.5642, |
| "num_input_tokens_seen": 3905280, |
| "step": 210, |
| "train_runtime": 12299.9488, |
| "train_tokens_per_second": 317.504 |
| }, |
| { |
| "epoch": 2.7284345047923324, |
| "grad_norm": 2.5737545490264893, |
| "learning_rate": 2.3058554543638698e-08, |
| "loss": 0.5741, |
| "num_input_tokens_seen": 3999680, |
| "step": 215, |
| "train_runtime": 12494.1775, |
| "train_tokens_per_second": 320.124 |
| }, |
| { |
| "epoch": 2.792332268370607, |
| "grad_norm": 2.635117292404175, |
| "learning_rate": 1.4165329979794971e-08, |
| "loss": 0.5805, |
| "num_input_tokens_seen": 4094720, |
| "step": 220, |
| "train_runtime": 12687.8777, |
| "train_tokens_per_second": 322.727 |
| }, |
| { |
| "epoch": 2.856230031948882, |
| "grad_norm": 2.665903329849243, |
| "learning_rate": 7.405506829382735e-09, |
| "loss": 0.5779, |
| "num_input_tokens_seen": 4189248, |
| "step": 225, |
| "train_runtime": 12880.2851, |
| "train_tokens_per_second": 325.245 |
| }, |
| { |
| "epoch": 2.9201277955271565, |
| "grad_norm": 2.6100857257843018, |
| "learning_rate": 2.808768886403301e-09, |
| "loss": 0.5671, |
| "num_input_tokens_seen": 4282208, |
| "step": 230, |
| "train_runtime": 13076.8487, |
| "train_tokens_per_second": 327.465 |
| }, |
| { |
| "epoch": 2.984025559105431, |
| "grad_norm": 2.5199291706085205, |
| "learning_rate": 3.9530138634907837e-10, |
| "loss": 0.5685, |
| "num_input_tokens_seen": 4376096, |
| "step": 235, |
| "train_runtime": 13275.1963, |
| "train_tokens_per_second": 329.645 |
| }, |
| { |
| "epoch": 3.0, |
| "num_input_tokens_seen": 4400096, |
| "step": 237, |
| "total_flos": 1.8666841676395315e+17, |
| "train_loss": 0.6493978349468376, |
| "train_runtime": 13351.2629, |
| "train_samples_per_second": 2.247, |
| "train_steps_per_second": 0.018 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 237, |
| "num_input_tokens_seen": 4400096, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8666841676395315e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|