{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.279503105590062, "eval_steps": 500, "global_step": 528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.055900621118012424, "grad_norm": 0.9968776702880859, "learning_rate": 3.2142857142857144e-05, "loss": 1.8589, "step": 9 }, { "epoch": 0.11180124223602485, "grad_norm": 0.7745970487594604, "learning_rate": 4.9991026578391245e-05, "loss": 1.5729, "step": 18 }, { "epoch": 0.16770186335403728, "grad_norm": 0.6643325686454773, "learning_rate": 4.990527244618566e-05, "loss": 1.4004, "step": 27 }, { "epoch": 0.2236024844720497, "grad_norm": 0.6641173362731934, "learning_rate": 4.972902867895191e-05, "loss": 1.309, "step": 36 }, { "epoch": 0.2795031055900621, "grad_norm": 0.7292973399162292, "learning_rate": 4.946293563243023e-05, "loss": 1.301, "step": 45 }, { "epoch": 0.33540372670807456, "grad_norm": 0.6836386919021606, "learning_rate": 4.910796011646843e-05, "loss": 1.2907, "step": 54 }, { "epoch": 0.391304347826087, "grad_norm": 0.7532292604446411, "learning_rate": 4.8665391882260856e-05, "loss": 1.2673, "step": 63 }, { "epoch": 0.4472049689440994, "grad_norm": 0.8352078795433044, "learning_rate": 4.8136838936227645e-05, "loss": 1.2422, "step": 72 }, { "epoch": 0.5031055900621118, "grad_norm": 0.8020253777503967, "learning_rate": 4.752422169756048e-05, "loss": 1.2132, "step": 81 }, { "epoch": 0.5590062111801242, "grad_norm": 0.7051873207092285, "learning_rate": 4.682976602066263e-05, "loss": 1.2494, "step": 90 }, { "epoch": 0.6149068322981367, "grad_norm": 0.7008059024810791, "learning_rate": 4.605599510783517e-05, "loss": 1.242, "step": 99 }, { "epoch": 0.6708074534161491, "grad_norm": 0.739669680595398, "learning_rate": 4.5205720341593556e-05, "loss": 1.2136, "step": 108 }, { "epoch": 0.7267080745341615, "grad_norm": 0.7449454069137573, "learning_rate": 4.4282031069923714e-05, "loss": 1.1958, "step": 117 }, { "epoch": 0.782608695652174, "grad_norm": 0.6761304140090942, "learning_rate": 4.328828338159173e-05, "loss": 1.2118, "step": 126 }, { "epoch": 0.8385093167701864, "grad_norm": 0.6721615195274353, "learning_rate": 4.222808791229016e-05, "loss": 1.2348, "step": 135 }, { "epoch": 0.8944099378881988, "grad_norm": 0.7120226621627808, "learning_rate": 4.110529672592568e-05, "loss": 1.1987, "step": 144 }, { "epoch": 0.9503105590062112, "grad_norm": 0.7605018615722656, "learning_rate": 3.992398931871285e-05, "loss": 1.1666, "step": 153 }, { "epoch": 1.0062111801242235, "grad_norm": 0.7596850991249084, "learning_rate": 3.868845779692618e-05, "loss": 1.1513, "step": 162 }, { "epoch": 1.062111801242236, "grad_norm": 0.734574019908905, "learning_rate": 3.7403191282164886e-05, "loss": 1.17, "step": 171 }, { "epoch": 1.1180124223602483, "grad_norm": 0.7106190323829651, "learning_rate": 3.607285960079146e-05, "loss": 1.1524, "step": 180 }, { "epoch": 1.1739130434782608, "grad_norm": 0.7839401960372925, "learning_rate": 3.4702296316806244e-05, "loss": 1.1252, "step": 189 }, { "epoch": 1.2298136645962732, "grad_norm": 0.7764331102371216, "learning_rate": 3.3296481169805274e-05, "loss": 1.1062, "step": 198 }, { "epoch": 1.2857142857142856, "grad_norm": 0.7929440140724182, "learning_rate": 3.186052198183081e-05, "loss": 1.1362, "step": 207 }, { "epoch": 1.341614906832298, "grad_norm": 0.8039306998252869, "learning_rate": 3.0399636098853114e-05, "loss": 1.1563, "step": 216 }, { "epoch": 1.3975155279503104, "grad_norm": 0.8428529500961304, "learning_rate": 2.8919131434313156e-05, "loss": 1.1278, "step": 225 }, { "epoch": 1.453416149068323, "grad_norm": 0.8190981149673462, "learning_rate": 2.7424387183601858e-05, "loss": 1.0945, "step": 234 }, { "epoch": 1.5093167701863353, "grad_norm": 0.9296110272407532, "learning_rate": 2.5920834279546775e-05, "loss": 1.0886, "step": 243 }, { "epoch": 1.5652173913043477, "grad_norm": 0.8043022155761719, "learning_rate": 2.441393565991849e-05, "loss": 1.1619, "step": 252 }, { "epoch": 1.62111801242236, "grad_norm": 0.8056561350822449, "learning_rate": 2.2909166418651832e-05, "loss": 1.1397, "step": 261 }, { "epoch": 1.6770186335403725, "grad_norm": 0.8094793558120728, "learning_rate": 2.1411993912899285e-05, "loss": 1.1026, "step": 270 }, { "epoch": 1.7329192546583851, "grad_norm": 0.7818030118942261, "learning_rate": 1.9927857898195064e-05, "loss": 1.0969, "step": 279 }, { "epoch": 1.7888198757763976, "grad_norm": 0.7848922610282898, "learning_rate": 1.846215076390543e-05, "loss": 1.1376, "step": 288 }, { "epoch": 1.84472049689441, "grad_norm": 0.7966899275779724, "learning_rate": 1.7020197940777067e-05, "loss": 1.1326, "step": 297 }, { "epoch": 1.9006211180124224, "grad_norm": 0.8150126934051514, "learning_rate": 1.5607238551769794e-05, "loss": 1.122, "step": 306 }, { "epoch": 1.9565217391304348, "grad_norm": 0.8451591730117798, "learning_rate": 1.4228406376475742e-05, "loss": 1.088, "step": 315 }, { "epoch": 2.012422360248447, "grad_norm": 0.8190770149230957, "learning_rate": 1.288871119828825e-05, "loss": 1.0823, "step": 324 }, { "epoch": 2.0683229813664594, "grad_norm": 0.7996165752410889, "learning_rate": 1.1593020602092605e-05, "loss": 1.1, "step": 333 }, { "epoch": 2.124223602484472, "grad_norm": 0.8338391184806824, "learning_rate": 1.0346042288614138e-05, "loss": 1.0777, "step": 342 }, { "epoch": 2.1801242236024843, "grad_norm": 0.8706255555152893, "learning_rate": 9.152306969681765e-06, "loss": 1.0542, "step": 351 }, { "epoch": 2.2360248447204967, "grad_norm": 0.8082641959190369, "learning_rate": 8.016151906554683e-06, "loss": 1.0248, "step": 360 }, { "epoch": 2.291925465838509, "grad_norm": 0.854958713054657, "learning_rate": 6.941705151123118e-06, "loss": 1.0754, "step": 369 }, { "epoch": 2.3478260869565215, "grad_norm": 0.8530688881874084, "learning_rate": 5.932870547240454e-06, "loss": 1.0822, "step": 378 }, { "epoch": 2.403726708074534, "grad_norm": 0.8880767226219177, "learning_rate": 4.993313546682271e-06, "loss": 1.0634, "step": 387 }, { "epoch": 2.4596273291925463, "grad_norm": 0.873835563659668, "learning_rate": 4.1264478912677846e-06, "loss": 1.031, "step": 396 }, { "epoch": 2.5155279503105588, "grad_norm": 0.8964288830757141, "learning_rate": 3.33542320953234e-06, "loss": 1.0296, "step": 405 }, { "epoch": 2.571428571428571, "grad_norm": 0.8553845286369324, "learning_rate": 2.6231135730165446e-06, "loss": 1.0958, "step": 414 }, { "epoch": 2.6273291925465836, "grad_norm": 0.8658971786499023, "learning_rate": 1.992107053751105e-06, "loss": 1.0697, "step": 423 }, { "epoch": 2.683229813664596, "grad_norm": 0.8648439049720764, "learning_rate": 1.4446963208787633e-06, "loss": 1.0435, "step": 432 }, { "epoch": 2.7391304347826084, "grad_norm": 0.8663669228553772, "learning_rate": 9.828703105789983e-07, "loss": 1.0234, "step": 441 }, { "epoch": 2.795031055900621, "grad_norm": 0.8901626467704773, "learning_rate": 6.083069995617113e-07, "loss": 1.0737, "step": 450 }, { "epoch": 2.8509316770186337, "grad_norm": 0.8699432611465454, "learning_rate": 3.2236730838628437e-07, "loss": 1.0765, "step": 459 }, { "epoch": 2.906832298136646, "grad_norm": 0.895370602607727, "learning_rate": 1.2609015675739134e-07, "loss": 1.0471, "step": 468 }, { "epoch": 2.9627329192546585, "grad_norm": 0.919094979763031, "learning_rate": 2.0188688763433938e-08, "loss": 1.0377, "step": 477 }, { "epoch": 3.018633540372671, "grad_norm": 0.9786181449890137, "learning_rate": 2.5920834279546775e-05, "loss": 1.0355, "step": 486 }, { "epoch": 3.0745341614906834, "grad_norm": 0.9231936931610107, "learning_rate": 2.516746104263722e-05, "loss": 1.0865, "step": 495 }, { "epoch": 3.130434782608696, "grad_norm": 1.004806399345398, "learning_rate": 2.441393565991849e-05, "loss": 1.0732, "step": 504 }, { "epoch": 3.186335403726708, "grad_norm": 0.9859076738357544, "learning_rate": 2.366094274273233e-05, "loss": 1.0404, "step": 513 }, { "epoch": 3.2422360248447206, "grad_norm": 0.9473730325698853, "learning_rate": 2.2909166418651832e-05, "loss": 1.013, "step": 522 } ], "logging_steps": 9, "max_steps": 966, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 48, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.4611520448793805e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }