{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 198.0, "learning_rate": 1.747190540874941e-05, "loss": 0.5751, "step": 5 }, { "epoch": 0.05, "grad_norm": 186.0, "learning_rate": 3.931178716968617e-05, "loss": 0.4175, "step": 10 }, { "epoch": 0.075, "grad_norm": 270.0, "learning_rate": 6.115166893062294e-05, "loss": 0.3509, "step": 15 }, { "epoch": 0.1, "grad_norm": 110.5, "learning_rate": 8.29915506915597e-05, "loss": 0.3508, "step": 20 }, { "epoch": 0.125, "grad_norm": 111.0, "learning_rate": 0.00010483143245249646, "loss": 0.4222, "step": 25 }, { "epoch": 0.15, "grad_norm": 83.5, "learning_rate": 0.00012667131421343323, "loss": 0.6498, "step": 30 }, { "epoch": 0.175, "grad_norm": 65.0, "learning_rate": 0.00014851119597437, "loss": 0.8894, "step": 35 }, { "epoch": 0.2, "grad_norm": 104.5, "learning_rate": 0.0001528649930352754, "loss": 0.6775, "step": 40 }, { "epoch": 0.225, "grad_norm": 29.5, "learning_rate": 0.00015280740168512177, "loss": 0.6998, "step": 45 }, { "epoch": 0.25, "grad_norm": 17.0, "learning_rate": 0.00015270555654763282, "loss": 0.6402, "step": 50 }, { "epoch": 0.275, "grad_norm": 16.75, "learning_rate": 0.00015255953633738878, "loss": 0.4757, "step": 55 }, { "epoch": 0.3, "grad_norm": 11.75, "learning_rate": 0.0001523694539112214, "loss": 0.3831, "step": 60 }, { "epoch": 0.325, "grad_norm": 6.5625, "learning_rate": 0.00015213545618098876, "loss": 0.3437, "step": 65 }, { "epoch": 0.35, "grad_norm": 6.03125, "learning_rate": 0.00015185772400002907, "loss": 0.2986, "step": 70 }, { "epoch": 0.375, "grad_norm": 4.46875, "learning_rate": 0.00015153647202338207, "loss": 0.2774, "step": 75 }, { "epoch": 0.4, "grad_norm": 7.6875, "learning_rate": 0.00015117194854188525, "loss": 0.2703, "step": 80 }, { "epoch": 0.425, "grad_norm": 5.65625, "learning_rate": 0.00015076443529027353, "loss": 0.2501, "step": 85 }, { "epoch": 0.45, "grad_norm": 26.375, "learning_rate": 0.00015031424722943083, "loss": 0.3738, "step": 90 }, { "epoch": 0.475, "grad_norm": 8.1875, "learning_rate": 0.00014982173230296148, "loss": 0.2765, "step": 95 }, { "epoch": 0.5, "grad_norm": 7.125, "learning_rate": 0.00014928727116826976, "loss": 0.2929, "step": 100 }, { "epoch": 0.525, "grad_norm": 7.8125, "learning_rate": 0.00014871127690235564, "loss": 0.2699, "step": 105 }, { "epoch": 0.55, "grad_norm": 6.125, "learning_rate": 0.00014809419468255356, "loss": 0.269, "step": 110 }, { "epoch": 0.575, "grad_norm": 4.1875, "learning_rate": 0.00014743650144246167, "loss": 0.2615, "step": 115 }, { "epoch": 0.6, "grad_norm": 3.828125, "learning_rate": 0.00014673870550332703, "loss": 0.2325, "step": 120 }, { "epoch": 0.625, "grad_norm": 5.375, "learning_rate": 0.00014600134618117166, "loss": 0.2295, "step": 125 }, { "epoch": 0.65, "grad_norm": 3.953125, "learning_rate": 0.0001452249933699633, "loss": 0.2407, "step": 130 }, { "epoch": 0.675, "grad_norm": 2.09375, "learning_rate": 0.0001444102471011529, "loss": 0.2192, "step": 135 }, { "epoch": 0.7, "grad_norm": 1.8515625, "learning_rate": 0.00014355773707991926, "loss": 0.2133, "step": 140 }, { "epoch": 0.725, "grad_norm": 2.078125, "learning_rate": 0.00014266812219847945, "loss": 0.2014, "step": 145 }, { "epoch": 0.75, "grad_norm": 2.0625, "learning_rate": 0.00014174209002684087, "loss": 0.191, "step": 150 }, { "epoch": 0.775, "grad_norm": 2.828125, "learning_rate": 0.00014078035628138847, "loss": 0.2024, "step": 155 }, { "epoch": 0.8, "grad_norm": 2.015625, "learning_rate": 0.00013978366427171864, "loss": 0.1871, "step": 160 }, { "epoch": 0.825, "grad_norm": 1.90625, "learning_rate": 0.00013875278432614612, "loss": 0.1751, "step": 165 }, { "epoch": 0.85, "grad_norm": 1.1171875, "learning_rate": 0.00013768851319632887, "loss": 0.1679, "step": 170 }, { "epoch": 0.875, "grad_norm": 2.015625, "learning_rate": 0.00013659167344147067, "loss": 0.1818, "step": 175 }, { "epoch": 0.9, "grad_norm": 1.8671875, "learning_rate": 0.0001354631127925774, "loss": 0.1692, "step": 180 }, { "epoch": 0.925, "grad_norm": 1.5078125, "learning_rate": 0.0001343037034972584, "loss": 0.1619, "step": 185 }, { "epoch": 0.95, "grad_norm": 1.6328125, "learning_rate": 0.0001331143416455796, "loss": 0.1617, "step": 190 }, { "epoch": 0.975, "grad_norm": 1.6953125, "learning_rate": 0.00013189594647748868, "loss": 0.1615, "step": 195 }, { "epoch": 1.0, "grad_norm": 1.5, "learning_rate": 0.00013064945967234835, "loss": 0.1689, "step": 200 }, { "epoch": 1.0, "eval_loss": 0.16072207689285278, "eval_runtime": 0.5972, "eval_samples_per_second": 43.533, "eval_steps_per_second": 43.533, "step": 200 }, { "epoch": 1.025, "grad_norm": 1.828125, "learning_rate": 0.0001293758446211266, "loss": 0.1629, "step": 205 }, { "epoch": 1.05, "grad_norm": 1.59375, "learning_rate": 0.00012807608568180618, "loss": 0.1624, "step": 210 }, { "epoch": 1.075, "grad_norm": 1.0859375, "learning_rate": 0.00012675118741858906, "loss": 0.1614, "step": 215 }, { "epoch": 1.1, "grad_norm": 1.265625, "learning_rate": 0.00012540217382548384, "loss": 0.1636, "step": 220 }, { "epoch": 1.125, "grad_norm": 1.46875, "learning_rate": 0.0001240300875348761, "loss": 0.1567, "step": 225 }, { "epoch": 1.15, "grad_norm": 2.609375, "learning_rate": 0.0001226359890116935, "loss": 0.1663, "step": 230 }, { "epoch": 1.175, "grad_norm": 1.9140625, "learning_rate": 0.00012122095573378837, "loss": 0.1774, "step": 235 }, { "epoch": 1.2, "grad_norm": 3.015625, "learning_rate": 0.00011978608135917105, "loss": 0.1701, "step": 240 }, { "epoch": 1.225, "grad_norm": 2.59375, "learning_rate": 0.00011833247488073823, "loss": 0.1853, "step": 245 }, { "epoch": 1.25, "grad_norm": 2.453125, "learning_rate": 0.00011686125976914878, "loss": 0.1948, "step": 250 }, { "epoch": 1.275, "grad_norm": 1.3359375, "learning_rate": 0.00011537357310451031, "loss": 0.1733, "step": 255 }, { "epoch": 1.3, "grad_norm": 1.890625, "learning_rate": 0.00011387056469754679, "loss": 0.1624, "step": 260 }, { "epoch": 1.325, "grad_norm": 2.390625, "learning_rate": 0.00011235339620092721, "loss": 0.1684, "step": 265 }, { "epoch": 1.35, "grad_norm": 2.390625, "learning_rate": 0.0001108232402114416, "loss": 0.169, "step": 270 }, { "epoch": 1.375, "grad_norm": 2.65625, "learning_rate": 0.0001092812793637186, "loss": 0.1789, "step": 275 }, { "epoch": 1.4, "grad_norm": 1.6875, "learning_rate": 0.0001077287054161847, "loss": 0.1695, "step": 280 }, { "epoch": 1.425, "grad_norm": 1.4375, "learning_rate": 0.00010616671832997237, "loss": 0.1671, "step": 285 }, { "epoch": 1.45, "grad_norm": 1.0390625, "learning_rate": 0.00010459652534148764, "loss": 0.1675, "step": 290 }, { "epoch": 1.475, "grad_norm": 0.97265625, "learning_rate": 0.00010301934002935564, "loss": 0.159, "step": 295 }, { "epoch": 1.5, "grad_norm": 1.7109375, "learning_rate": 0.00010143638137646338, "loss": 0.1578, "step": 300 }, { "epoch": 1.525, "grad_norm": 1.171875, "learning_rate": 9.984887282782665e-05, "loss": 0.1588, "step": 305 }, { "epoch": 1.55, "grad_norm": 1.0546875, "learning_rate": 9.825804134500727e-05, "loss": 0.1535, "step": 310 }, { "epoch": 1.575, "grad_norm": 1.296875, "learning_rate": 9.666511645781328e-05, "loss": 0.1587, "step": 315 }, { "epoch": 1.6, "grad_norm": 1.109375, "learning_rate": 9.507132931401333e-05, "loss": 0.1514, "step": 320 }, { "epoch": 1.625, "grad_norm": 1.21875, "learning_rate": 9.347791172780155e-05, "loss": 0.1486, "step": 325 }, { "epoch": 1.65, "grad_norm": 1.0703125, "learning_rate": 9.188609522774628e-05, "loss": 0.149, "step": 330 }, { "epoch": 1.675, "grad_norm": 0.87109375, "learning_rate": 9.029711010496061e-05, "loss": 0.1467, "step": 335 }, { "epoch": 1.7, "grad_norm": 0.8984375, "learning_rate": 8.871218446222844e-05, "loss": 0.1456, "step": 340 }, { "epoch": 1.725, "grad_norm": 0.7734375, "learning_rate": 8.713254326482237e-05, "loss": 0.1439, "step": 345 }, { "epoch": 1.75, "grad_norm": 1.0625, "learning_rate": 8.555940739374653e-05, "loss": 0.1426, "step": 350 }, { "epoch": 1.775, "grad_norm": 0.9453125, "learning_rate": 8.399399270213575e-05, "loss": 0.1399, "step": 355 }, { "epoch": 1.8, "grad_norm": 0.953125, "learning_rate": 8.243750907554097e-05, "loss": 0.1436, "step": 360 }, { "epoch": 1.825, "grad_norm": 0.7421875, "learning_rate": 8.089115949682696e-05, "loss": 0.1375, "step": 365 }, { "epoch": 1.85, "grad_norm": 0.6640625, "learning_rate": 7.935613911640464e-05, "loss": 0.1384, "step": 370 }, { "epoch": 1.875, "grad_norm": 0.828125, "learning_rate": 7.783363432851746e-05, "loss": 0.1366, "step": 375 }, { "epoch": 1.9, "grad_norm": 0.859375, "learning_rate": 7.632482185429501e-05, "loss": 0.1374, "step": 380 }, { "epoch": 1.925, "grad_norm": 0.87890625, "learning_rate": 7.483086783228284e-05, "loss": 0.1413, "step": 385 }, { "epoch": 1.95, "grad_norm": 1.1015625, "learning_rate": 7.335292691715154e-05, "loss": 0.1371, "step": 390 }, { "epoch": 1.975, "grad_norm": 0.8359375, "learning_rate": 7.189214138728142e-05, "loss": 0.1413, "step": 395 }, { "epoch": 2.0, "grad_norm": 0.8359375, "learning_rate": 7.044964026191261e-05, "loss": 0.1378, "step": 400 }, { "epoch": 2.0, "eval_loss": 0.13336719572544098, "eval_runtime": 0.5924, "eval_samples_per_second": 43.887, "eval_steps_per_second": 43.887, "step": 400 }, { "epoch": 2.025, "grad_norm": 0.8125, "learning_rate": 6.902653842854314e-05, "loss": 0.1382, "step": 405 }, { "epoch": 2.05, "grad_norm": 0.6640625, "learning_rate": 6.762393578124894e-05, "loss": 0.1273, "step": 410 }, { "epoch": 2.075, "grad_norm": 0.6640625, "learning_rate": 6.624291637059237e-05, "loss": 0.1319, "step": 415 }, { "epoch": 2.1, "grad_norm": 0.6796875, "learning_rate": 6.48845475657757e-05, "loss": 0.1296, "step": 420 }, { "epoch": 2.125, "grad_norm": 0.68359375, "learning_rate": 6.354987922968741e-05, "loss": 0.136, "step": 425 }, { "epoch": 2.15, "grad_norm": 0.8203125, "learning_rate": 6.223994290747898e-05, "loss": 0.1332, "step": 430 }, { "epoch": 2.175, "grad_norm": 0.79296875, "learning_rate": 6.095575102929907e-05, "loss": 0.1337, "step": 435 }, { "epoch": 2.2, "grad_norm": 0.7421875, "learning_rate": 5.969829612780141e-05, "loss": 0.1295, "step": 440 }, { "epoch": 2.225, "grad_norm": 0.85546875, "learning_rate": 5.8468550071031296e-05, "loss": 0.1291, "step": 445 }, { "epoch": 2.25, "grad_norm": 1.0390625, "learning_rate": 5.726746331128316e-05, "loss": 0.1364, "step": 450 }, { "epoch": 2.275, "grad_norm": 1.0, "learning_rate": 5.609596415051039e-05, "loss": 0.1365, "step": 455 }, { "epoch": 2.3, "grad_norm": 1.0390625, "learning_rate": 5.495495802285465e-05, "loss": 0.1317, "step": 460 }, { "epoch": 2.325, "grad_norm": 0.7890625, "learning_rate": 5.384532679484933e-05, "loss": 0.1296, "step": 465 }, { "epoch": 2.35, "grad_norm": 0.66796875, "learning_rate": 5.276792808383817e-05, "loss": 0.1355, "step": 470 }, { "epoch": 2.375, "grad_norm": 0.875, "learning_rate": 5.1723594595135666e-05, "loss": 0.1324, "step": 475 }, { "epoch": 2.4, "grad_norm": 0.77734375, "learning_rate": 5.07131334784416e-05, "loss": 0.1315, "step": 480 }, { "epoch": 2.425, "grad_norm": 0.8359375, "learning_rate": 4.973732570400718e-05, "loss": 0.1284, "step": 485 }, { "epoch": 2.45, "grad_norm": 0.79296875, "learning_rate": 4.879692545903476e-05, "loss": 0.1345, "step": 490 }, { "epoch": 2.475, "grad_norm": 0.875, "learning_rate": 4.789265956477791e-05, "loss": 0.126, "step": 495 }, { "epoch": 2.5, "grad_norm": 0.953125, "learning_rate": 4.702522691479217e-05, "loss": 0.1345, "step": 500 }, { "epoch": 2.5, "eval_loss": 0.12545974552631378, "eval_runtime": 0.5917, "eval_samples_per_second": 43.943, "eval_steps_per_second": 43.943, "step": 500 }, { "epoch": 2.525, "grad_norm": 0.8125, "learning_rate": 4.619529793477068e-05, "loss": 0.1291, "step": 505 }, { "epoch": 2.55, "grad_norm": 1.078125, "learning_rate": 4.540351406438219e-05, "loss": 0.1301, "step": 510 }, { "epoch": 2.575, "grad_norm": 0.921875, "learning_rate": 4.465048726151201e-05, "loss": 0.128, "step": 515 }, { "epoch": 2.6, "grad_norm": 1.1875, "learning_rate": 4.393679952928885e-05, "loss": 0.1288, "step": 520 }, { "epoch": 2.625, "grad_norm": 1.203125, "learning_rate": 4.3263002466263436e-05, "loss": 0.1281, "step": 525 }, { "epoch": 2.65, "grad_norm": 0.9609375, "learning_rate": 4.262961684008613e-05, "loss": 0.1305, "step": 530 }, { "epoch": 2.675, "grad_norm": 0.88671875, "learning_rate": 4.203713218501353e-05, "loss": 0.1258, "step": 535 }, { "epoch": 2.7, "grad_norm": 0.9921875, "learning_rate": 4.1486006423554745e-05, "loss": 0.1284, "step": 540 }, { "epoch": 2.725, "grad_norm": 0.84765625, "learning_rate": 4.097666551254989e-05, "loss": 0.1269, "step": 545 }, { "epoch": 2.75, "grad_norm": 0.92578125, "learning_rate": 4.0509503113954545e-05, "loss": 0.128, "step": 550 }, { "epoch": 2.775, "grad_norm": 0.90234375, "learning_rate": 4.008488029058422e-05, "loss": 0.1293, "step": 555 }, { "epoch": 2.8, "grad_norm": 0.85546875, "learning_rate": 3.9703125227054376e-05, "loss": 0.132, "step": 560 }, { "epoch": 2.825, "grad_norm": 0.859375, "learning_rate": 3.9364532976131475e-05, "loss": 0.1247, "step": 565 }, { "epoch": 2.85, "grad_norm": 1.6796875, "learning_rate": 3.906936523069101e-05, "loss": 0.1246, "step": 570 }, { "epoch": 2.875, "grad_norm": 1.0390625, "learning_rate": 3.8817850121459174e-05, "loss": 0.1246, "step": 575 }, { "epoch": 2.9, "grad_norm": 1.03125, "learning_rate": 3.861018204069391e-05, "loss": 0.1322, "step": 580 }, { "epoch": 2.925, "grad_norm": 0.859375, "learning_rate": 3.8446521491942034e-05, "loss": 0.1277, "step": 585 }, { "epoch": 2.95, "grad_norm": 0.984375, "learning_rate": 3.832699496598859e-05, "loss": 0.1264, "step": 590 }, { "epoch": 2.975, "grad_norm": 0.81640625, "learning_rate": 3.8251694843093894e-05, "loss": 0.1277, "step": 595 }, { "epoch": 3.0, "grad_norm": 0.78515625, "learning_rate": 3.8220679321594226e-05, "loss": 0.1288, "step": 600 }, { "epoch": 3.0, "eval_loss": 0.1242036521434784, "eval_runtime": 0.5964, "eval_samples_per_second": 43.593, "eval_steps_per_second": 43.593, "step": 600 } ], "logging_steps": 5, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.19491360014336e+16, "train_batch_size": 46, "trial_name": null, "trial_params": null }