{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06389776357827476, "grad_norm": 17.501014709472656, "learning_rate": 9.992973140107996e-07, "loss": 1.3507, "num_input_tokens_seen": 93984, "step": 5, "train_runtime": 556.6226, "train_tokens_per_second": 168.847 }, { "epoch": 0.12779552715654952, "grad_norm": 11.144261360168457, "learning_rate": 9.964460368509865e-07, "loss": 1.0678, "num_input_tokens_seen": 187200, "step": 10, "train_runtime": 1095.0218, "train_tokens_per_second": 170.956 }, { "epoch": 0.19169329073482427, "grad_norm": 5.366312026977539, "learning_rate": 9.914147615517526e-07, "loss": 0.8995, "num_input_tokens_seen": 282560, "step": 15, "train_runtime": 1632.6431, "train_tokens_per_second": 173.069 }, { "epoch": 0.25559105431309903, "grad_norm": 5.131296157836914, "learning_rate": 9.842255814927944e-07, "loss": 0.8231, "num_input_tokens_seen": 376064, "step": 20, "train_runtime": 2172.7033, "train_tokens_per_second": 173.086 }, { "epoch": 0.3194888178913738, "grad_norm": 3.8358519077301025, "learning_rate": 9.749100658638914e-07, "loss": 0.7679, "num_input_tokens_seen": 471616, "step": 25, "train_runtime": 2712.1273, "train_tokens_per_second": 173.892 }, { "epoch": 0.38338658146964855, "grad_norm": 3.4662442207336426, "learning_rate": 9.63509121038005e-07, "loss": 0.7301, "num_input_tokens_seen": 564352, "step": 30, "train_runtime": 3636.2616, "train_tokens_per_second": 155.201 }, { "epoch": 0.4472843450479233, "grad_norm": 3.0572352409362793, "learning_rate": 9.500728109428603e-07, "loss": 0.7017, "num_input_tokens_seen": 657568, "step": 35, "train_runtime": 3832.9972, "train_tokens_per_second": 171.555 }, { "epoch": 0.5111821086261981, "grad_norm": 3.1490511894226074, "learning_rate": 9.346601372197913e-07, "loss": 0.7058, "num_input_tokens_seen": 750720, "step": 40, "train_runtime": 4012.6034, "train_tokens_per_second": 187.091 }, { "epoch": 0.5750798722044729, "grad_norm": 3.2475409507751465, "learning_rate": 9.17338780135223e-07, "loss": 0.6841, "num_input_tokens_seen": 842784, "step": 45, "train_runtime": 4192.5527, "train_tokens_per_second": 201.019 }, { "epoch": 0.6389776357827476, "grad_norm": 3.1415064334869385, "learning_rate": 8.981848013824993e-07, "loss": 0.6738, "num_input_tokens_seen": 936128, "step": 50, "train_runtime": 4379.3543, "train_tokens_per_second": 213.759 }, { "epoch": 0.7028753993610224, "grad_norm": 3.2890915870666504, "learning_rate": 8.77282310079115e-07, "loss": 0.6643, "num_input_tokens_seen": 1028160, "step": 55, "train_runtime": 4592.6801, "train_tokens_per_second": 223.869 }, { "epoch": 0.7667731629392971, "grad_norm": 2.9755921363830566, "learning_rate": 8.547230934260311e-07, "loss": 0.6449, "num_input_tokens_seen": 1119552, "step": 60, "train_runtime": 5103.7665, "train_tokens_per_second": 219.358 }, { "epoch": 0.8306709265175719, "grad_norm": 3.0064377784729004, "learning_rate": 8.306062136509219e-07, "loss": 0.6547, "num_input_tokens_seen": 1212032, "step": 65, "train_runtime": 5281.6563, "train_tokens_per_second": 229.48 }, { "epoch": 0.8945686900958466, "grad_norm": 3.0753512382507324, "learning_rate": 8.050375730052621e-07, "loss": 0.6543, "num_input_tokens_seen": 1306368, "step": 70, "train_runtime": 5458.516, "train_tokens_per_second": 239.327 }, { "epoch": 0.9584664536741214, "grad_norm": 2.9098427295684814, "learning_rate": 7.781294487254435e-07, "loss": 0.6579, "num_input_tokens_seen": 1400576, "step": 75, "train_runtime": 5636.7125, "train_tokens_per_second": 248.474 }, { "epoch": 1.012779552715655, "grad_norm": 3.033903121948242, "learning_rate": 7.5e-07, "loss": 0.6344, "num_input_tokens_seen": 1481248, "step": 80, "train_runtime": 5831.8252, "train_tokens_per_second": 253.994 }, { "epoch": 1.0766773162939298, "grad_norm": 2.6656150817871094, "learning_rate": 7.207727491079559e-07, "loss": 0.6292, "num_input_tokens_seen": 1575104, "step": 85, "train_runtime": 6012.044, "train_tokens_per_second": 261.991 }, { "epoch": 1.1405750798722045, "grad_norm": 2.7004282474517822, "learning_rate": 6.905760390067234e-07, "loss": 0.6239, "num_input_tokens_seen": 1668064, "step": 90, "train_runtime": 6510.4742, "train_tokens_per_second": 256.212 }, { "epoch": 1.2044728434504792, "grad_norm": 2.72955060005188, "learning_rate": 6.595424697513963e-07, "loss": 0.6157, "num_input_tokens_seen": 1764128, "step": 95, "train_runtime": 6697.2327, "train_tokens_per_second": 263.411 }, { "epoch": 1.268370607028754, "grad_norm": 2.819629192352295, "learning_rate": 6.278083162202373e-07, "loss": 0.6096, "num_input_tokens_seen": 1858912, "step": 100, "train_runtime": 6880.9117, "train_tokens_per_second": 270.155 }, { "epoch": 1.3322683706070286, "grad_norm": 2.837791919708252, "learning_rate": 5.955129297032538e-07, "loss": 0.5967, "num_input_tokens_seen": 1952640, "step": 105, "train_runtime": 7062.8237, "train_tokens_per_second": 276.467 }, { "epoch": 1.3961661341853036, "grad_norm": 2.6342546939849854, "learning_rate": 5.62798125981604e-07, "loss": 0.6051, "num_input_tokens_seen": 2045792, "step": 110, "train_runtime": 7245.471, "train_tokens_per_second": 282.355 }, { "epoch": 1.4600638977635783, "grad_norm": 2.5401482582092285, "learning_rate": 5.298075625849099e-07, "loss": 0.5899, "num_input_tokens_seen": 2140736, "step": 115, "train_runtime": 7427.2464, "train_tokens_per_second": 288.227 }, { "epoch": 1.5239616613418532, "grad_norm": 2.6414806842803955, "learning_rate": 4.966861079610687e-07, "loss": 0.5901, "num_input_tokens_seen": 2233280, "step": 120, "train_runtime": 7929.4515, "train_tokens_per_second": 281.644 }, { "epoch": 1.5878594249201279, "grad_norm": 2.817983865737915, "learning_rate": 4.6357920532866816e-07, "loss": 0.6011, "num_input_tokens_seen": 2326144, "step": 125, "train_runtime": 8118.8041, "train_tokens_per_second": 286.513 }, { "epoch": 1.6517571884984026, "grad_norm": 2.9443130493164062, "learning_rate": 4.306322340054659e-07, "loss": 0.5969, "num_input_tokens_seen": 2418592, "step": 130, "train_runtime": 8306.8501, "train_tokens_per_second": 291.156 }, { "epoch": 1.7156549520766773, "grad_norm": 2.630876302719116, "learning_rate": 3.979898710174677e-07, "loss": 0.5948, "num_input_tokens_seen": 2512320, "step": 135, "train_runtime": 8501.9493, "train_tokens_per_second": 295.499 }, { "epoch": 1.779552715654952, "grad_norm": 2.6901042461395264, "learning_rate": 3.657954557919183e-07, "loss": 0.598, "num_input_tokens_seen": 2606112, "step": 140, "train_runtime": 8694.019, "train_tokens_per_second": 299.759 }, { "epoch": 1.8434504792332267, "grad_norm": 2.8361966609954834, "learning_rate": 3.3419036072396614e-07, "loss": 0.5902, "num_input_tokens_seen": 2699936, "step": 145, "train_runtime": 8889.9576, "train_tokens_per_second": 303.706 }, { "epoch": 1.9073482428115016, "grad_norm": 2.87080979347229, "learning_rate": 3.033133703809759e-07, "loss": 0.5978, "num_input_tokens_seen": 2795136, "step": 150, "train_runtime": 9406.664, "train_tokens_per_second": 297.144 }, { "epoch": 1.9712460063897763, "grad_norm": 2.7429561614990234, "learning_rate": 2.7330007207053406e-07, "loss": 0.5946, "num_input_tokens_seen": 2888960, "step": 155, "train_runtime": 9589.8224, "train_tokens_per_second": 301.253 }, { "epoch": 2.02555910543131, "grad_norm": 2.6952402591705322, "learning_rate": 2.442822604482889e-07, "loss": 0.5918, "num_input_tokens_seen": 2968224, "step": 160, "train_runtime": 9773.1495, "train_tokens_per_second": 303.712 }, { "epoch": 2.0894568690095845, "grad_norm": 2.673067569732666, "learning_rate": 2.16387358780116e-07, "loss": 0.5663, "num_input_tokens_seen": 3062400, "step": 165, "train_runtime": 9959.8127, "train_tokens_per_second": 307.476 }, { "epoch": 2.1533546325878596, "grad_norm": 2.7801618576049805, "learning_rate": 1.8973785939996927e-07, "loss": 0.5791, "num_input_tokens_seen": 3155520, "step": 170, "train_runtime": 10148.3528, "train_tokens_per_second": 310.939 }, { "epoch": 2.2172523961661343, "grad_norm": 2.5863192081451416, "learning_rate": 1.6445078582048154e-07, "loss": 0.5695, "num_input_tokens_seen": 3250496, "step": 175, "train_runtime": 10338.7555, "train_tokens_per_second": 314.399 }, { "epoch": 2.281150159744409, "grad_norm": 2.7501046657562256, "learning_rate": 1.4063717885830373e-07, "loss": 0.5675, "num_input_tokens_seen": 3344672, "step": 180, "train_runtime": 10847.5534, "train_tokens_per_second": 308.334 }, { "epoch": 2.3450479233226837, "grad_norm": 2.7307002544403076, "learning_rate": 1.184016090307059e-07, "loss": 0.5657, "num_input_tokens_seen": 3438784, "step": 185, "train_runtime": 11033.7768, "train_tokens_per_second": 311.66 }, { "epoch": 2.4089456869009584, "grad_norm": 2.663017988204956, "learning_rate": 9.78417173646176e-08, "loss": 0.5745, "num_input_tokens_seen": 3531840, "step": 190, "train_runtime": 11217.6988, "train_tokens_per_second": 314.845 }, { "epoch": 2.472843450479233, "grad_norm": 2.6426873207092285, "learning_rate": 7.904778663450323e-08, "loss": 0.5885, "num_input_tokens_seen": 3625984, "step": 195, "train_runtime": 11403.1929, "train_tokens_per_second": 317.98 }, { "epoch": 2.536741214057508, "grad_norm": 2.7548089027404785, "learning_rate": 6.210234491186079e-08, "loss": 0.5748, "num_input_tokens_seen": 3720352, "step": 200, "train_runtime": 11603.6555, "train_tokens_per_second": 320.619 }, { "epoch": 2.600638977635783, "grad_norm": 2.6678242683410645, "learning_rate": 4.7079803167238366e-08, "loss": 0.5741, "num_input_tokens_seen": 3812992, "step": 205, "train_runtime": 11793.3317, "train_tokens_per_second": 323.318 }, { "epoch": 2.6645367412140573, "grad_norm": 2.8868303298950195, "learning_rate": 3.4046128516136754e-08, "loss": 0.5642, "num_input_tokens_seen": 3905280, "step": 210, "train_runtime": 12299.9488, "train_tokens_per_second": 317.504 }, { "epoch": 2.7284345047923324, "grad_norm": 2.5737545490264893, "learning_rate": 2.3058554543638698e-08, "loss": 0.5741, "num_input_tokens_seen": 3999680, "step": 215, "train_runtime": 12494.1775, "train_tokens_per_second": 320.124 }, { "epoch": 2.792332268370607, "grad_norm": 2.635117292404175, "learning_rate": 1.4165329979794971e-08, "loss": 0.5805, "num_input_tokens_seen": 4094720, "step": 220, "train_runtime": 12687.8777, "train_tokens_per_second": 322.727 }, { "epoch": 2.856230031948882, "grad_norm": 2.665903329849243, "learning_rate": 7.405506829382735e-09, "loss": 0.5779, "num_input_tokens_seen": 4189248, "step": 225, "train_runtime": 12880.2851, "train_tokens_per_second": 325.245 }, { "epoch": 2.9201277955271565, "grad_norm": 2.6100857257843018, "learning_rate": 2.808768886403301e-09, "loss": 0.5671, "num_input_tokens_seen": 4282208, "step": 230, "train_runtime": 13076.8487, "train_tokens_per_second": 327.465 }, { "epoch": 2.984025559105431, "grad_norm": 2.5199291706085205, "learning_rate": 3.9530138634907837e-10, "loss": 0.5685, "num_input_tokens_seen": 4376096, "step": 235, "train_runtime": 13275.1963, "train_tokens_per_second": 329.645 }, { "epoch": 3.0, "num_input_tokens_seen": 4400096, "step": 237, "total_flos": 1.8666841676395315e+17, "train_loss": 0.6493978349468376, "train_runtime": 13351.2629, "train_samples_per_second": 2.247, "train_steps_per_second": 0.018 } ], "logging_steps": 5, "max_steps": 237, "num_input_tokens_seen": 4400096, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8666841676395315e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }