| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.962732919254658, |
| "eval_steps": 500, |
| "global_step": 960, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.055900621118012424, |
| "grad_norm": 0.9968776702880859, |
| "learning_rate": 3.2142857142857144e-05, |
| "loss": 1.8589, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.11180124223602485, |
| "grad_norm": 0.7745970487594604, |
| "learning_rate": 4.9991026578391245e-05, |
| "loss": 1.5729, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.16770186335403728, |
| "grad_norm": 0.6643325686454773, |
| "learning_rate": 4.990527244618566e-05, |
| "loss": 1.4004, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.2236024844720497, |
| "grad_norm": 0.6641173362731934, |
| "learning_rate": 4.972902867895191e-05, |
| "loss": 1.309, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.2795031055900621, |
| "grad_norm": 0.7292973399162292, |
| "learning_rate": 4.946293563243023e-05, |
| "loss": 1.301, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.33540372670807456, |
| "grad_norm": 0.6836386919021606, |
| "learning_rate": 4.910796011646843e-05, |
| "loss": 1.2907, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.391304347826087, |
| "grad_norm": 0.7532292604446411, |
| "learning_rate": 4.8665391882260856e-05, |
| "loss": 1.2673, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.4472049689440994, |
| "grad_norm": 0.8352078795433044, |
| "learning_rate": 4.8136838936227645e-05, |
| "loss": 1.2422, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.5031055900621118, |
| "grad_norm": 0.8020253777503967, |
| "learning_rate": 4.752422169756048e-05, |
| "loss": 1.2132, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.5590062111801242, |
| "grad_norm": 0.7051873207092285, |
| "learning_rate": 4.682976602066263e-05, |
| "loss": 1.2494, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6149068322981367, |
| "grad_norm": 0.7008059024810791, |
| "learning_rate": 4.605599510783517e-05, |
| "loss": 1.242, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.6708074534161491, |
| "grad_norm": 0.739669680595398, |
| "learning_rate": 4.5205720341593556e-05, |
| "loss": 1.2136, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.7267080745341615, |
| "grad_norm": 0.7449454069137573, |
| "learning_rate": 4.4282031069923714e-05, |
| "loss": 1.1958, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.782608695652174, |
| "grad_norm": 0.6761304140090942, |
| "learning_rate": 4.328828338159173e-05, |
| "loss": 1.2118, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.8385093167701864, |
| "grad_norm": 0.6721615195274353, |
| "learning_rate": 4.222808791229016e-05, |
| "loss": 1.2348, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.8944099378881988, |
| "grad_norm": 0.7120226621627808, |
| "learning_rate": 4.110529672592568e-05, |
| "loss": 1.1987, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.9503105590062112, |
| "grad_norm": 0.7605018615722656, |
| "learning_rate": 3.992398931871285e-05, |
| "loss": 1.1666, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.0062111801242235, |
| "grad_norm": 0.7596850991249084, |
| "learning_rate": 3.868845779692618e-05, |
| "loss": 1.1513, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.062111801242236, |
| "grad_norm": 0.734574019908905, |
| "learning_rate": 3.7403191282164886e-05, |
| "loss": 1.17, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.1180124223602483, |
| "grad_norm": 0.7106190323829651, |
| "learning_rate": 3.607285960079146e-05, |
| "loss": 1.1524, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.1739130434782608, |
| "grad_norm": 0.7839401960372925, |
| "learning_rate": 3.4702296316806244e-05, |
| "loss": 1.1252, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.2298136645962732, |
| "grad_norm": 0.7764331102371216, |
| "learning_rate": 3.3296481169805274e-05, |
| "loss": 1.1062, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.2857142857142856, |
| "grad_norm": 0.7929440140724182, |
| "learning_rate": 3.186052198183081e-05, |
| "loss": 1.1362, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.341614906832298, |
| "grad_norm": 0.8039306998252869, |
| "learning_rate": 3.0399636098853114e-05, |
| "loss": 1.1563, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.3975155279503104, |
| "grad_norm": 0.8428529500961304, |
| "learning_rate": 2.8919131434313156e-05, |
| "loss": 1.1278, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.453416149068323, |
| "grad_norm": 0.8190981149673462, |
| "learning_rate": 2.7424387183601858e-05, |
| "loss": 1.0945, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.5093167701863353, |
| "grad_norm": 0.9296110272407532, |
| "learning_rate": 2.5920834279546775e-05, |
| "loss": 1.0886, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.5652173913043477, |
| "grad_norm": 0.8043022155761719, |
| "learning_rate": 2.441393565991849e-05, |
| "loss": 1.1619, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.62111801242236, |
| "grad_norm": 0.8056561350822449, |
| "learning_rate": 2.2909166418651832e-05, |
| "loss": 1.1397, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.6770186335403725, |
| "grad_norm": 0.8094793558120728, |
| "learning_rate": 2.1411993912899285e-05, |
| "loss": 1.1026, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.7329192546583851, |
| "grad_norm": 0.7818030118942261, |
| "learning_rate": 1.9927857898195064e-05, |
| "loss": 1.0969, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.7888198757763976, |
| "grad_norm": 0.7848922610282898, |
| "learning_rate": 1.846215076390543e-05, |
| "loss": 1.1376, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.84472049689441, |
| "grad_norm": 0.7966899275779724, |
| "learning_rate": 1.7020197940777067e-05, |
| "loss": 1.1326, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.9006211180124224, |
| "grad_norm": 0.8150126934051514, |
| "learning_rate": 1.5607238551769794e-05, |
| "loss": 1.122, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.9565217391304348, |
| "grad_norm": 0.8451591730117798, |
| "learning_rate": 1.4228406376475742e-05, |
| "loss": 1.088, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.012422360248447, |
| "grad_norm": 0.8190770149230957, |
| "learning_rate": 1.288871119828825e-05, |
| "loss": 1.0823, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.0683229813664594, |
| "grad_norm": 0.7996165752410889, |
| "learning_rate": 1.1593020602092605e-05, |
| "loss": 1.1, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.124223602484472, |
| "grad_norm": 0.8338391184806824, |
| "learning_rate": 1.0346042288614138e-05, |
| "loss": 1.0777, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.1801242236024843, |
| "grad_norm": 0.8706255555152893, |
| "learning_rate": 9.152306969681765e-06, |
| "loss": 1.0542, |
| "step": 351 |
| }, |
| { |
| "epoch": 2.2360248447204967, |
| "grad_norm": 0.8082641959190369, |
| "learning_rate": 8.016151906554683e-06, |
| "loss": 1.0248, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.291925465838509, |
| "grad_norm": 0.854958713054657, |
| "learning_rate": 6.941705151123118e-06, |
| "loss": 1.0754, |
| "step": 369 |
| }, |
| { |
| "epoch": 2.3478260869565215, |
| "grad_norm": 0.8530688881874084, |
| "learning_rate": 5.932870547240454e-06, |
| "loss": 1.0822, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.403726708074534, |
| "grad_norm": 0.8880767226219177, |
| "learning_rate": 4.993313546682271e-06, |
| "loss": 1.0634, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.4596273291925463, |
| "grad_norm": 0.873835563659668, |
| "learning_rate": 4.1264478912677846e-06, |
| "loss": 1.031, |
| "step": 396 |
| }, |
| { |
| "epoch": 2.5155279503105588, |
| "grad_norm": 0.8964288830757141, |
| "learning_rate": 3.33542320953234e-06, |
| "loss": 1.0296, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.571428571428571, |
| "grad_norm": 0.8553845286369324, |
| "learning_rate": 2.6231135730165446e-06, |
| "loss": 1.0958, |
| "step": 414 |
| }, |
| { |
| "epoch": 2.6273291925465836, |
| "grad_norm": 0.8658971786499023, |
| "learning_rate": 1.992107053751105e-06, |
| "loss": 1.0697, |
| "step": 423 |
| }, |
| { |
| "epoch": 2.683229813664596, |
| "grad_norm": 0.8648439049720764, |
| "learning_rate": 1.4446963208787633e-06, |
| "loss": 1.0435, |
| "step": 432 |
| }, |
| { |
| "epoch": 2.7391304347826084, |
| "grad_norm": 0.8663669228553772, |
| "learning_rate": 9.828703105789983e-07, |
| "loss": 1.0234, |
| "step": 441 |
| }, |
| { |
| "epoch": 2.795031055900621, |
| "grad_norm": 0.8901626467704773, |
| "learning_rate": 6.083069995617113e-07, |
| "loss": 1.0737, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.8509316770186337, |
| "grad_norm": 0.8699432611465454, |
| "learning_rate": 3.2236730838628437e-07, |
| "loss": 1.0765, |
| "step": 459 |
| }, |
| { |
| "epoch": 2.906832298136646, |
| "grad_norm": 0.895370602607727, |
| "learning_rate": 1.2609015675739134e-07, |
| "loss": 1.0471, |
| "step": 468 |
| }, |
| { |
| "epoch": 2.9627329192546585, |
| "grad_norm": 0.919094979763031, |
| "learning_rate": 2.0188688763433938e-08, |
| "loss": 1.0377, |
| "step": 477 |
| }, |
| { |
| "epoch": 3.018633540372671, |
| "grad_norm": 0.9786181449890137, |
| "learning_rate": 2.5920834279546775e-05, |
| "loss": 1.0355, |
| "step": 486 |
| }, |
| { |
| "epoch": 3.0745341614906834, |
| "grad_norm": 0.9231936931610107, |
| "learning_rate": 2.516746104263722e-05, |
| "loss": 1.0865, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.130434782608696, |
| "grad_norm": 1.004806399345398, |
| "learning_rate": 2.441393565991849e-05, |
| "loss": 1.0732, |
| "step": 504 |
| }, |
| { |
| "epoch": 3.186335403726708, |
| "grad_norm": 0.9859076738357544, |
| "learning_rate": 2.366094274273233e-05, |
| "loss": 1.0404, |
| "step": 513 |
| }, |
| { |
| "epoch": 3.2422360248447206, |
| "grad_norm": 0.9473730325698853, |
| "learning_rate": 2.2909166418651832e-05, |
| "loss": 1.013, |
| "step": 522 |
| }, |
| { |
| "epoch": 3.298136645962733, |
| "grad_norm": 0.9594529867172241, |
| "learning_rate": 2.215928970992113e-05, |
| "loss": 1.0819, |
| "step": 531 |
| }, |
| { |
| "epoch": 3.3540372670807455, |
| "grad_norm": 1.035379409790039, |
| "learning_rate": 2.1411993912899285e-05, |
| "loss": 1.0713, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.409937888198758, |
| "grad_norm": 1.0150120258331299, |
| "learning_rate": 2.0667957979072223e-05, |
| "loss": 1.0492, |
| "step": 549 |
| }, |
| { |
| "epoch": 3.4658385093167703, |
| "grad_norm": 1.0104522705078125, |
| "learning_rate": 1.9927857898195064e-05, |
| "loss": 1.022, |
| "step": 558 |
| }, |
| { |
| "epoch": 3.5217391304347827, |
| "grad_norm": 0.996560275554657, |
| "learning_rate": 1.9192366084125425e-05, |
| "loss": 1.0319, |
| "step": 567 |
| }, |
| { |
| "epoch": 3.577639751552795, |
| "grad_norm": 0.954988956451416, |
| "learning_rate": 1.846215076390543e-05, |
| "loss": 1.0823, |
| "step": 576 |
| }, |
| { |
| "epoch": 3.6335403726708075, |
| "grad_norm": 0.9988158941268921, |
| "learning_rate": 1.7737875370647733e-05, |
| "loss": 1.0526, |
| "step": 585 |
| }, |
| { |
| "epoch": 3.68944099378882, |
| "grad_norm": 0.9697257876396179, |
| "learning_rate": 1.7020197940777067e-05, |
| "loss": 1.0347, |
| "step": 594 |
| }, |
| { |
| "epoch": 3.7453416149068324, |
| "grad_norm": 0.9268921613693237, |
| "learning_rate": 1.6309770516174772e-05, |
| "loss": 1.0195, |
| "step": 603 |
| }, |
| { |
| "epoch": 3.801242236024845, |
| "grad_norm": 1.0004150867462158, |
| "learning_rate": 1.5607238551769794e-05, |
| "loss": 1.0926, |
| "step": 612 |
| }, |
| { |
| "epoch": 3.857142857142857, |
| "grad_norm": 0.9498361945152283, |
| "learning_rate": 1.4913240329114158e-05, |
| "loss": 1.0837, |
| "step": 621 |
| }, |
| { |
| "epoch": 3.9130434782608696, |
| "grad_norm": 1.0308877229690552, |
| "learning_rate": 1.4228406376475742e-05, |
| "loss": 1.0492, |
| "step": 630 |
| }, |
| { |
| "epoch": 3.968944099378882, |
| "grad_norm": 0.9514777064323425, |
| "learning_rate": 1.355335889597532e-05, |
| "loss": 1.0131, |
| "step": 639 |
| }, |
| { |
| "epoch": 4.024844720496894, |
| "grad_norm": 0.9895027279853821, |
| "learning_rate": 1.288871119828825e-05, |
| "loss": 1.0157, |
| "step": 648 |
| }, |
| { |
| "epoch": 4.080745341614906, |
| "grad_norm": 1.050930142402649, |
| "learning_rate": 1.223506714542438e-05, |
| "loss": 1.0364, |
| "step": 657 |
| }, |
| { |
| "epoch": 4.136645962732919, |
| "grad_norm": 1.0357706546783447, |
| "learning_rate": 1.1593020602092605e-05, |
| "loss": 0.9999, |
| "step": 666 |
| }, |
| { |
| "epoch": 4.192546583850931, |
| "grad_norm": 1.0587010383605957, |
| "learning_rate": 1.0963154896148325e-05, |
| "loss": 0.9767, |
| "step": 675 |
| }, |
| { |
| "epoch": 4.248447204968944, |
| "grad_norm": 1.050858974456787, |
| "learning_rate": 1.0346042288614138e-05, |
| "loss": 0.9531, |
| "step": 684 |
| }, |
| { |
| "epoch": 4.304347826086957, |
| "grad_norm": 1.0594791173934937, |
| "learning_rate": 9.742243453755202e-06, |
| "loss": 1.0215, |
| "step": 693 |
| }, |
| { |
| "epoch": 4.3602484472049685, |
| "grad_norm": 1.0404331684112549, |
| "learning_rate": 9.152306969681765e-06, |
| "loss": 1.0213, |
| "step": 702 |
| }, |
| { |
| "epoch": 4.416149068322982, |
| "grad_norm": 1.0493998527526855, |
| "learning_rate": 8.576768819941525e-06, |
| "loss": 0.9885, |
| "step": 711 |
| }, |
| { |
| "epoch": 4.472049689440993, |
| "grad_norm": 1.070451021194458, |
| "learning_rate": 8.016151906554683e-06, |
| "loss": 0.9767, |
| "step": 720 |
| }, |
| { |
| "epoch": 4.527950310559007, |
| "grad_norm": 1.1037689447402954, |
| "learning_rate": 7.470965574934282e-06, |
| "loss": 0.9987, |
| "step": 729 |
| }, |
| { |
| "epoch": 4.583850931677018, |
| "grad_norm": 1.0566823482513428, |
| "learning_rate": 6.941705151123118e-06, |
| "loss": 1.0185, |
| "step": 738 |
| }, |
| { |
| "epoch": 4.6397515527950315, |
| "grad_norm": 1.0436228513717651, |
| "learning_rate": 6.428851491768087e-06, |
| "loss": 1.0166, |
| "step": 747 |
| }, |
| { |
| "epoch": 4.695652173913043, |
| "grad_norm": 1.0555388927459717, |
| "learning_rate": 5.932870547240454e-06, |
| "loss": 0.9771, |
| "step": 756 |
| }, |
| { |
| "epoch": 4.751552795031056, |
| "grad_norm": 1.0716733932495117, |
| "learning_rate": 5.454212938299255e-06, |
| "loss": 0.9681, |
| "step": 765 |
| }, |
| { |
| "epoch": 4.807453416149068, |
| "grad_norm": 1.0682276487350464, |
| "learning_rate": 4.993313546682271e-06, |
| "loss": 1.0468, |
| "step": 774 |
| }, |
| { |
| "epoch": 4.863354037267081, |
| "grad_norm": 1.1288073062896729, |
| "learning_rate": 4.550591119996575e-06, |
| "loss": 1.0156, |
| "step": 783 |
| }, |
| { |
| "epoch": 4.919254658385093, |
| "grad_norm": 1.0215827226638794, |
| "learning_rate": 4.1264478912677846e-06, |
| "loss": 0.9885, |
| "step": 792 |
| }, |
| { |
| "epoch": 4.975155279503106, |
| "grad_norm": 1.0524131059646606, |
| "learning_rate": 3.7212692134933614e-06, |
| "loss": 0.9785, |
| "step": 801 |
| }, |
| { |
| "epoch": 5.031055900621118, |
| "grad_norm": 1.108382225036621, |
| "learning_rate": 3.33542320953234e-06, |
| "loss": 0.9786, |
| "step": 810 |
| }, |
| { |
| "epoch": 5.086956521739131, |
| "grad_norm": 1.0805060863494873, |
| "learning_rate": 2.969260437649293e-06, |
| "loss": 1.0016, |
| "step": 819 |
| }, |
| { |
| "epoch": 5.142857142857143, |
| "grad_norm": 1.0768829584121704, |
| "learning_rate": 2.6231135730165446e-06, |
| "loss": 0.9684, |
| "step": 828 |
| }, |
| { |
| "epoch": 5.198757763975156, |
| "grad_norm": 1.0885145664215088, |
| "learning_rate": 2.297297105463994e-06, |
| "loss": 0.9544, |
| "step": 837 |
| }, |
| { |
| "epoch": 5.254658385093168, |
| "grad_norm": 1.0752713680267334, |
| "learning_rate": 1.992107053751105e-06, |
| "loss": 0.9372, |
| "step": 846 |
| }, |
| { |
| "epoch": 5.3105590062111805, |
| "grad_norm": 1.1867809295654297, |
| "learning_rate": 1.7078206966206739e-06, |
| "loss": 1.0042, |
| "step": 855 |
| }, |
| { |
| "epoch": 5.366459627329193, |
| "grad_norm": 1.1268541812896729, |
| "learning_rate": 1.4446963208787633e-06, |
| "loss": 0.9865, |
| "step": 864 |
| }, |
| { |
| "epoch": 5.422360248447205, |
| "grad_norm": 1.0873429775238037, |
| "learning_rate": 1.202972986729653e-06, |
| "loss": 0.972, |
| "step": 873 |
| }, |
| { |
| "epoch": 5.478260869565218, |
| "grad_norm": 1.1025999784469604, |
| "learning_rate": 9.828703105789983e-07, |
| "loss": 0.9298, |
| "step": 882 |
| }, |
| { |
| "epoch": 5.53416149068323, |
| "grad_norm": 1.1131285429000854, |
| "learning_rate": 7.845882655025422e-07, |
| "loss": 0.9795, |
| "step": 891 |
| }, |
| { |
| "epoch": 5.590062111801243, |
| "grad_norm": 1.090147852897644, |
| "learning_rate": 6.083069995617113e-07, |
| "loss": 1.0019, |
| "step": 900 |
| }, |
| { |
| "epoch": 5.645962732919255, |
| "grad_norm": 1.085330843925476, |
| "learning_rate": 4.541866721310406e-07, |
| "loss": 0.9641, |
| "step": 909 |
| }, |
| { |
| "epoch": 5.701863354037267, |
| "grad_norm": 1.1173158884048462, |
| "learning_rate": 3.2236730838628437e-07, |
| "loss": 0.9549, |
| "step": 918 |
| }, |
| { |
| "epoch": 5.75776397515528, |
| "grad_norm": 1.124629259109497, |
| "learning_rate": 2.1296867208531467e-07, |
| "loss": 0.9571, |
| "step": 927 |
| }, |
| { |
| "epoch": 5.813664596273292, |
| "grad_norm": 1.068975806236267, |
| "learning_rate": 1.2609015675739134e-07, |
| "loss": 1.0009, |
| "step": 936 |
| }, |
| { |
| "epoch": 5.869565217391305, |
| "grad_norm": 1.0912953615188599, |
| "learning_rate": 6.181069539974716e-08, |
| "loss": 0.9811, |
| "step": 945 |
| }, |
| { |
| "epoch": 5.925465838509317, |
| "grad_norm": 1.156136155128479, |
| "learning_rate": 2.0188688763433938e-08, |
| "loss": 0.9623, |
| "step": 954 |
| } |
| ], |
| "logging_steps": 9, |
| "max_steps": 966, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 48, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.097963530422518e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|