| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.864, |
| "eval_steps": 54, |
| "global_step": 648, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013333333333333334, |
| "grad_norm": 3.1677727699279785, |
| "learning_rate": 1.323529411764706e-05, |
| "loss": 2.432534408569336, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02666666666666667, |
| "grad_norm": 2.737276077270508, |
| "learning_rate": 2.7941176470588236e-05, |
| "loss": 1.499803352355957, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 2.322286367416382, |
| "learning_rate": 4.2647058823529415e-05, |
| "loss": 0.958259391784668, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05333333333333334, |
| "grad_norm": 1.6951706409454346, |
| "learning_rate": 5.735294117647059e-05, |
| "loss": 0.6128009796142578, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06666666666666667, |
| "grad_norm": 1.5513038635253906, |
| "learning_rate": 7.205882352941177e-05, |
| "loss": 0.5434298992156983, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.072, |
| "eval_loss": 0.4876534342765808, |
| "eval_runtime": 46.2236, |
| "eval_samples_per_second": 116.802, |
| "eval_steps_per_second": 3.656, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 1.2719606161117554, |
| "learning_rate": 8.676470588235295e-05, |
| "loss": 0.49590306282043456, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09333333333333334, |
| "grad_norm": 1.4482001066207886, |
| "learning_rate": 9.995417048579285e-05, |
| "loss": 0.4701418876647949, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10666666666666667, |
| "grad_norm": 0.9346111416816711, |
| "learning_rate": 9.949587534372136e-05, |
| "loss": 0.42320098876953127, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.961360514163971, |
| "learning_rate": 9.903758020164987e-05, |
| "loss": 0.4108582973480225, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.13333333333333333, |
| "grad_norm": 1.0673013925552368, |
| "learning_rate": 9.857928505957838e-05, |
| "loss": 0.4146592617034912, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.144, |
| "eval_loss": 0.39331719279289246, |
| "eval_runtime": 45.2237, |
| "eval_samples_per_second": 119.384, |
| "eval_steps_per_second": 3.737, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.14666666666666667, |
| "grad_norm": 0.9028648138046265, |
| "learning_rate": 9.812098991750688e-05, |
| "loss": 0.395797872543335, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.9546720385551453, |
| "learning_rate": 9.766269477543539e-05, |
| "loss": 0.3759224653244019, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.17333333333333334, |
| "grad_norm": 1.0134034156799316, |
| "learning_rate": 9.720439963336389e-05, |
| "loss": 0.3860164642333984, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.18666666666666668, |
| "grad_norm": 1.0793383121490479, |
| "learning_rate": 9.67461044912924e-05, |
| "loss": 0.37697725296020507, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.8906319737434387, |
| "learning_rate": 9.62878093492209e-05, |
| "loss": 0.38399662971496584, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.21333333333333335, |
| "grad_norm": 0.9087033867835999, |
| "learning_rate": 9.58295142071494e-05, |
| "loss": 0.35976552963256836, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.216, |
| "eval_loss": 0.36686545610427856, |
| "eval_runtime": 45.33, |
| "eval_samples_per_second": 119.104, |
| "eval_steps_per_second": 3.728, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.22666666666666666, |
| "grad_norm": 0.8666670918464661, |
| "learning_rate": 9.53712190650779e-05, |
| "loss": 0.3579233169555664, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.8580495715141296, |
| "learning_rate": 9.491292392300642e-05, |
| "loss": 0.34284372329711915, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.25333333333333335, |
| "grad_norm": 0.7784494757652283, |
| "learning_rate": 9.445462878093493e-05, |
| "loss": 0.36840295791625977, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.26666666666666666, |
| "grad_norm": 0.8598802089691162, |
| "learning_rate": 9.399633363886343e-05, |
| "loss": 0.3700442314147949, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.8933939337730408, |
| "learning_rate": 9.353803849679193e-05, |
| "loss": 0.36236522197723386, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.288, |
| "eval_loss": 0.346111536026001, |
| "eval_runtime": 45.7874, |
| "eval_samples_per_second": 117.914, |
| "eval_steps_per_second": 3.691, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.29333333333333333, |
| "grad_norm": 1.05680251121521, |
| "learning_rate": 9.307974335472044e-05, |
| "loss": 0.35799968242645264, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.30666666666666664, |
| "grad_norm": 0.8355916142463684, |
| "learning_rate": 9.262144821264895e-05, |
| "loss": 0.35074672698974607, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.9081747531890869, |
| "learning_rate": 9.216315307057746e-05, |
| "loss": 0.3546539068222046, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.9269344210624695, |
| "learning_rate": 9.170485792850596e-05, |
| "loss": 0.3317249774932861, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3466666666666667, |
| "grad_norm": 0.7904302477836609, |
| "learning_rate": 9.124656278643447e-05, |
| "loss": 0.3290853023529053, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.7114660143852234, |
| "learning_rate": 9.078826764436298e-05, |
| "loss": 0.322760272026062, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.36, |
| "eval_loss": 0.33535903692245483, |
| "eval_runtime": 45.3828, |
| "eval_samples_per_second": 118.966, |
| "eval_steps_per_second": 3.724, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.37333333333333335, |
| "grad_norm": 0.8438096046447754, |
| "learning_rate": 9.032997250229149e-05, |
| "loss": 0.34818062782287595, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.38666666666666666, |
| "grad_norm": 0.7516797780990601, |
| "learning_rate": 8.987167736021999e-05, |
| "loss": 0.31596965789794923, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.8765379786491394, |
| "learning_rate": 8.94133822181485e-05, |
| "loss": 0.33709211349487306, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.41333333333333333, |
| "grad_norm": 0.8902734518051147, |
| "learning_rate": 8.8955087076077e-05, |
| "loss": 0.330603814125061, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.4266666666666667, |
| "grad_norm": 0.8690024018287659, |
| "learning_rate": 8.84967919340055e-05, |
| "loss": 0.32671523094177246, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.432, |
| "eval_loss": 0.33184683322906494, |
| "eval_runtime": 45.3915, |
| "eval_samples_per_second": 118.943, |
| "eval_steps_per_second": 3.723, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.8143032193183899, |
| "learning_rate": 8.8038496791934e-05, |
| "loss": 0.3163719177246094, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4533333333333333, |
| "grad_norm": 0.7447159290313721, |
| "learning_rate": 8.758020164986251e-05, |
| "loss": 0.3183164119720459, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4666666666666667, |
| "grad_norm": 0.7337270379066467, |
| "learning_rate": 8.712190650779101e-05, |
| "loss": 0.3220224857330322, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.6819722056388855, |
| "learning_rate": 8.666361136571953e-05, |
| "loss": 0.3218740940093994, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.49333333333333335, |
| "grad_norm": 0.7921403646469116, |
| "learning_rate": 8.620531622364803e-05, |
| "loss": 0.32049055099487306, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.504, |
| "eval_loss": 0.3204575181007385, |
| "eval_runtime": 45.5527, |
| "eval_samples_per_second": 118.522, |
| "eval_steps_per_second": 3.71, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.5066666666666667, |
| "grad_norm": 0.7351377010345459, |
| "learning_rate": 8.574702108157654e-05, |
| "loss": 0.3107901573181152, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.800105094909668, |
| "learning_rate": 8.528872593950504e-05, |
| "loss": 0.3166258096694946, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5333333333333333, |
| "grad_norm": 0.683992326259613, |
| "learning_rate": 8.483043079743355e-05, |
| "loss": 0.31937375068664553, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5466666666666666, |
| "grad_norm": 0.7428257465362549, |
| "learning_rate": 8.437213565536206e-05, |
| "loss": 0.3019124984741211, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.7256530523300171, |
| "learning_rate": 8.391384051329057e-05, |
| "loss": 0.3088233232498169, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5733333333333334, |
| "grad_norm": 0.8287463784217834, |
| "learning_rate": 8.345554537121907e-05, |
| "loss": 0.3135841369628906, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.576, |
| "eval_loss": 0.3146475553512573, |
| "eval_runtime": 45.5431, |
| "eval_samples_per_second": 118.547, |
| "eval_steps_per_second": 3.711, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.5866666666666667, |
| "grad_norm": 0.7201558947563171, |
| "learning_rate": 8.299725022914757e-05, |
| "loss": 0.33163981437683104, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.7666788697242737, |
| "learning_rate": 8.253895508707609e-05, |
| "loss": 0.3103063106536865, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6133333333333333, |
| "grad_norm": 0.7993035912513733, |
| "learning_rate": 8.20806599450046e-05, |
| "loss": 0.3024377584457397, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6266666666666667, |
| "grad_norm": 0.7305812835693359, |
| "learning_rate": 8.16223648029331e-05, |
| "loss": 0.30060317516326907, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.7218968868255615, |
| "learning_rate": 8.11640696608616e-05, |
| "loss": 0.30545334815979003, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.648, |
| "eval_loss": 0.30640655755996704, |
| "eval_runtime": 45.3513, |
| "eval_samples_per_second": 119.048, |
| "eval_steps_per_second": 3.726, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.6533333333333333, |
| "grad_norm": 0.7409582734107971, |
| "learning_rate": 8.07057745187901e-05, |
| "loss": 0.31014978885650635, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.8251257538795471, |
| "learning_rate": 8.024747937671861e-05, |
| "loss": 0.2921054124832153, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.68, |
| "grad_norm": 0.8492790460586548, |
| "learning_rate": 7.978918423464711e-05, |
| "loss": 0.2973308801651001, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6933333333333334, |
| "grad_norm": 0.9368008971214294, |
| "learning_rate": 7.933088909257562e-05, |
| "loss": 0.2915247917175293, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7066666666666667, |
| "grad_norm": 0.7164352536201477, |
| "learning_rate": 7.887259395050412e-05, |
| "loss": 0.2985499382019043, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 0.7182020545005798, |
| "learning_rate": 7.841429880843263e-05, |
| "loss": 0.29260077476501467, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.72, |
| "eval_loss": 0.301230788230896, |
| "eval_runtime": 45.2631, |
| "eval_samples_per_second": 119.281, |
| "eval_steps_per_second": 3.734, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7333333333333333, |
| "grad_norm": 0.667168140411377, |
| "learning_rate": 7.795600366636114e-05, |
| "loss": 0.2892777442932129, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7466666666666667, |
| "grad_norm": 0.8839274048805237, |
| "learning_rate": 7.749770852428965e-05, |
| "loss": 0.29940755367279054, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.7022384405136108, |
| "learning_rate": 7.703941338221815e-05, |
| "loss": 0.2847739696502686, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7733333333333333, |
| "grad_norm": 0.7643616199493408, |
| "learning_rate": 7.658111824014665e-05, |
| "loss": 0.3103649139404297, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7866666666666666, |
| "grad_norm": 0.8885356187820435, |
| "learning_rate": 7.612282309807517e-05, |
| "loss": 0.2783879518508911, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.792, |
| "eval_loss": 0.2951120138168335, |
| "eval_runtime": 45.5971, |
| "eval_samples_per_second": 118.407, |
| "eval_steps_per_second": 3.706, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.7724995017051697, |
| "learning_rate": 7.566452795600368e-05, |
| "loss": 0.30985236167907715, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8133333333333334, |
| "grad_norm": 0.9214587807655334, |
| "learning_rate": 7.520623281393218e-05, |
| "loss": 0.28750762939453123, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8266666666666667, |
| "grad_norm": 0.8178621530532837, |
| "learning_rate": 7.474793767186068e-05, |
| "loss": 0.3045247793197632, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 0.6371078491210938, |
| "learning_rate": 7.428964252978919e-05, |
| "loss": 0.2850461006164551, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8533333333333334, |
| "grad_norm": 0.6749277710914612, |
| "learning_rate": 7.383134738771769e-05, |
| "loss": 0.29334354400634766, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.864, |
| "eval_loss": 0.29118046164512634, |
| "eval_runtime": 45.5175, |
| "eval_samples_per_second": 118.614, |
| "eval_steps_per_second": 3.713, |
| "step": 648 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2250, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 324, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.141916003015066e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|