{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025682182985553772, "grad_norm": 41.89281463623047, "learning_rate": 5e-05, "loss": 41.062384033203124, "step": 10 }, { "epoch": 0.051364365971107544, "grad_norm": 19.25795555114746, "learning_rate": 0.00010555555555555557, "loss": 23.114556884765626, "step": 20 }, { "epoch": 0.07704654895666131, "grad_norm": 15.17092514038086, "learning_rate": 0.0001611111111111111, "loss": 19.892947387695312, "step": 30 }, { "epoch": 0.10272873194221509, "grad_norm": 12.6154146194458, "learning_rate": 0.0001999965463076377, "loss": 18.954251098632813, "step": 40 }, { "epoch": 0.12841091492776885, "grad_norm": 13.695039749145508, "learning_rate": 0.00019993515396856082, "loss": 18.60211944580078, "step": 50 }, { "epoch": 0.15409309791332262, "grad_norm": 13.03475570678711, "learning_rate": 0.00019979706714271113, "loss": 17.00911102294922, "step": 60 }, { "epoch": 0.1797752808988764, "grad_norm": 12.657264709472656, "learning_rate": 0.0001995823918037908, "loss": 16.737196350097655, "step": 70 }, { "epoch": 0.20545746388443017, "grad_norm": 9.450530052185059, "learning_rate": 0.00019929129270278366, "loss": 17.192404174804686, "step": 80 }, { "epoch": 0.23113964686998395, "grad_norm": 11.038394927978516, "learning_rate": 0.0001989239932415185, "loss": 17.626002502441406, "step": 90 }, { "epoch": 0.2568218298555377, "grad_norm": 10.112431526184082, "learning_rate": 0.00019848077530122083, "loss": 17.375982666015624, "step": 100 }, { "epoch": 0.2825040128410915, "grad_norm": 10.951290130615234, "learning_rate": 0.0001979619790261853, "loss": 16.52630157470703, "step": 110 }, { "epoch": 0.30818619582664525, "grad_norm": 13.097779273986816, "learning_rate": 0.00019736800256273457, "loss": 18.660000610351563, "step": 120 }, { "epoch": 0.33386837881219905, "grad_norm": 10.489601135253906, "learning_rate": 0.00019669930175366472, "loss": 18.038082885742188, "step": 130 }, { "epoch": 0.3595505617977528, "grad_norm": 12.037221908569336, "learning_rate": 0.0001959563897884124, "loss": 16.976956176757813, "step": 140 }, { "epoch": 0.3852327447833066, "grad_norm": 13.149781227111816, "learning_rate": 0.00019513983680921108, "loss": 17.55983123779297, "step": 150 }, { "epoch": 0.41091492776886035, "grad_norm": 12.796188354492188, "learning_rate": 0.00019425026947353992, "loss": 17.86356201171875, "step": 160 }, { "epoch": 0.43659711075441415, "grad_norm": 14.400433540344238, "learning_rate": 0.0001932883704732001, "loss": 17.5066162109375, "step": 170 }, { "epoch": 0.4622792937399679, "grad_norm": 16.03416633605957, "learning_rate": 0.00019225487801038788, "loss": 18.057034301757813, "step": 180 }, { "epoch": 0.48796147672552165, "grad_norm": 11.53113842010498, "learning_rate": 0.00019115058523116733, "loss": 16.775308227539064, "step": 190 }, { "epoch": 0.5136436597110754, "grad_norm": 12.079059600830078, "learning_rate": 0.00018997633961677582, "loss": 17.352998352050783, "step": 200 }, { "epoch": 0.5393258426966292, "grad_norm": 11.70594310760498, "learning_rate": 0.00018873304233323122, "loss": 18.17870635986328, "step": 210 }, { "epoch": 0.565008025682183, "grad_norm": 13.156736373901367, "learning_rate": 0.00018742164753973855, "loss": 16.766079711914063, "step": 220 }, { "epoch": 0.5906902086677368, "grad_norm": 11.801448822021484, "learning_rate": 0.0001860431616564278, "loss": 17.98457489013672, "step": 230 }, { "epoch": 0.6163723916532905, "grad_norm": 11.434338569641113, "learning_rate": 0.0001845986425919841, "loss": 17.011558532714844, "step": 240 }, { "epoch": 0.6420545746388443, "grad_norm": 11.592355728149414, "learning_rate": 0.00018308919893176396, "loss": 18.073362731933592, "step": 250 }, { "epoch": 0.6677367576243981, "grad_norm": 12.140549659729004, "learning_rate": 0.00018151598908701947, "loss": 17.267474365234374, "step": 260 }, { "epoch": 0.6934189406099518, "grad_norm": 12.207261085510254, "learning_rate": 0.00017988022040588416, "loss": 17.970367431640625, "step": 270 }, { "epoch": 0.7191011235955056, "grad_norm": 12.024649620056152, "learning_rate": 0.000178183148246803, "loss": 18.595138549804688, "step": 280 }, { "epoch": 0.7447833065810594, "grad_norm": 10.953705787658691, "learning_rate": 0.0001764260750151167, "loss": 17.661044311523437, "step": 290 }, { "epoch": 0.7704654895666132, "grad_norm": 12.161933898925781, "learning_rate": 0.0001746103491635407, "loss": 17.323040771484376, "step": 300 }, { "epoch": 0.7961476725521669, "grad_norm": 14.1478271484375, "learning_rate": 0.00017273736415730488, "loss": 17.589439392089844, "step": 310 }, { "epoch": 0.8218298555377207, "grad_norm": 11.344724655151367, "learning_rate": 0.0001708085574047494, "loss": 18.087261962890626, "step": 320 }, { "epoch": 0.8475120385232745, "grad_norm": 11.49538803100586, "learning_rate": 0.00016882540915419623, "loss": 16.709014892578125, "step": 330 }, { "epoch": 0.8731942215088283, "grad_norm": 11.975954055786133, "learning_rate": 0.00016678944135794374, "loss": 18.784584045410156, "step": 340 }, { "epoch": 0.898876404494382, "grad_norm": 10.979130744934082, "learning_rate": 0.00016470221650425582, "loss": 17.035598754882812, "step": 350 }, { "epoch": 0.9245585874799358, "grad_norm": 10.86307144165039, "learning_rate": 0.00016256533641824177, "loss": 17.912120056152343, "step": 360 }, { "epoch": 0.9502407704654896, "grad_norm": 11.18655776977539, "learning_rate": 0.00016038044103254775, "loss": 16.4998046875, "step": 370 }, { "epoch": 0.9759229534510433, "grad_norm": 11.321154594421387, "learning_rate": 0.00015814920712880267, "loss": 17.580471801757813, "step": 380 }, { "epoch": 1.0, "grad_norm": 6.271613597869873, "learning_rate": 0.0001558733470507847, "loss": 15.328689575195312, "step": 390 }, { "epoch": 1.0256821829855538, "grad_norm": 13.181764602661133, "learning_rate": 0.00015355460739029586, "loss": 15.265965270996094, "step": 400 }, { "epoch": 1.0513643659711076, "grad_norm": 11.904574394226074, "learning_rate": 0.00015119476764675305, "loss": 14.833596801757812, "step": 410 }, { "epoch": 1.0770465489566614, "grad_norm": 12.352519989013672, "learning_rate": 0.0001487956388615247, "loss": 13.833314514160156, "step": 420 }, { "epoch": 1.102728731942215, "grad_norm": 15.478001594543457, "learning_rate": 0.00014635906222806058, "loss": 15.767561340332032, "step": 430 }, { "epoch": 1.1284109149277688, "grad_norm": 15.335821151733398, "learning_rate": 0.00014388690767888154, "loss": 14.462684631347656, "step": 440 }, { "epoch": 1.1540930979133226, "grad_norm": 16.043373107910156, "learning_rate": 0.00014138107245051392, "loss": 15.908651733398438, "step": 450 }, { "epoch": 1.1797752808988764, "grad_norm": 12.852351188659668, "learning_rate": 0.00013884347962746948, "loss": 15.758128356933593, "step": 460 }, { "epoch": 1.2054574638844302, "grad_norm": 15.320487022399902, "learning_rate": 0.00013627607666638858, "loss": 15.051063537597656, "step": 470 }, { "epoch": 1.231139646869984, "grad_norm": 16.30264663696289, "learning_rate": 0.00013368083390147913, "loss": 14.599794006347656, "step": 480 }, { "epoch": 1.2568218298555376, "grad_norm": 13.734286308288574, "learning_rate": 0.00013105974303239838, "loss": 14.052903747558593, "step": 490 }, { "epoch": 1.2825040128410916, "grad_norm": 15.704442024230957, "learning_rate": 0.0001284148155957379, "loss": 13.530386352539063, "step": 500 }, { "epoch": 1.3081861958266452, "grad_norm": 14.808961868286133, "learning_rate": 0.00012574808142128477, "loss": 14.240873718261719, "step": 510 }, { "epoch": 1.333868378812199, "grad_norm": 13.389018058776855, "learning_rate": 0.00012306158707424403, "loss": 14.467668151855468, "step": 520 }, { "epoch": 1.3595505617977528, "grad_norm": 16.077634811401367, "learning_rate": 0.00012035739428461739, "loss": 13.67303466796875, "step": 530 }, { "epoch": 1.3852327447833066, "grad_norm": 14.300647735595703, "learning_rate": 0.00011763757836494403, "loss": 15.072747802734375, "step": 540 }, { "epoch": 1.4109149277688604, "grad_norm": 12.59499454498291, "learning_rate": 0.00011490422661761744, "loss": 14.030448913574219, "step": 550 }, { "epoch": 1.4365971107544142, "grad_norm": 17.819976806640625, "learning_rate": 0.00011215943673300093, "loss": 13.556326293945313, "step": 560 }, { "epoch": 1.462279293739968, "grad_norm": 15.307595252990723, "learning_rate": 0.00010940531517957073, "loss": 15.464706420898438, "step": 570 }, { "epoch": 1.4879614767255216, "grad_norm": 14.918625831604004, "learning_rate": 0.00010664397558732244, "loss": 14.040945434570313, "step": 580 }, { "epoch": 1.5136436597110754, "grad_norm": 13.040802955627441, "learning_rate": 0.0001038775371256817, "loss": 14.671842956542969, "step": 590 }, { "epoch": 1.5393258426966292, "grad_norm": 14.143183708190918, "learning_rate": 0.00010110812287716327, "loss": 14.198591613769532, "step": 600 }, { "epoch": 1.565008025682183, "grad_norm": 16.373567581176758, "learning_rate": 9.833785820802739e-05, "loss": 14.670704650878907, "step": 610 }, { "epoch": 1.5906902086677368, "grad_norm": 17.14202117919922, "learning_rate": 9.556886913718317e-05, "loss": 14.713813781738281, "step": 620 }, { "epoch": 1.6163723916532904, "grad_norm": 13.428468704223633, "learning_rate": 9.280328070459135e-05, "loss": 14.256681823730469, "step": 630 }, { "epoch": 1.6420545746388444, "grad_norm": 14.759570121765137, "learning_rate": 9.004321534041835e-05, "loss": 14.875436401367187, "step": 640 }, { "epoch": 1.667736757624398, "grad_norm": 14.605072021484375, "learning_rate": 8.729079123619286e-05, "loss": 14.151382446289062, "step": 650 }, { "epoch": 1.6934189406099518, "grad_norm": 15.300275802612305, "learning_rate": 8.454812071921596e-05, "loss": 14.209205627441406, "step": 660 }, { "epoch": 1.7191011235955056, "grad_norm": 15.565445899963379, "learning_rate": 8.181730863147093e-05, "loss": 15.246949768066406, "step": 670 }, { "epoch": 1.7447833065810594, "grad_norm": 13.463878631591797, "learning_rate": 7.910045071427829e-05, "loss": 14.081675720214843, "step": 680 }, { "epoch": 1.7704654895666132, "grad_norm": 15.799697875976562, "learning_rate": 7.63996319999347e-05, "loss": 14.659947204589844, "step": 690 }, { "epoch": 1.7961476725521668, "grad_norm": 15.421786308288574, "learning_rate": 7.371692521157048e-05, "loss": 13.642781066894532, "step": 700 }, { "epoch": 1.8218298555377208, "grad_norm": 16.400951385498047, "learning_rate": 7.10543891724537e-05, "loss": 14.9028076171875, "step": 710 }, { "epoch": 1.8475120385232744, "grad_norm": 16.549213409423828, "learning_rate": 6.841406722596191e-05, "loss": 14.747923278808594, "step": 720 }, { "epoch": 1.8731942215088284, "grad_norm": 14.868678092956543, "learning_rate": 6.579798566743314e-05, "loss": 14.379522705078125, "step": 730 }, { "epoch": 1.898876404494382, "grad_norm": 17.223217010498047, "learning_rate": 6.320815218910101e-05, "loss": 14.513031005859375, "step": 740 }, { "epoch": 1.9245585874799358, "grad_norm": 15.178607940673828, "learning_rate": 6.064655433930624e-05, "loss": 14.75238494873047, "step": 750 }, { "epoch": 1.9502407704654896, "grad_norm": 14.607568740844727, "learning_rate": 5.8115157997167536e-05, "loss": 14.215359497070313, "step": 760 }, { "epoch": 1.9759229534510432, "grad_norm": 15.849374771118164, "learning_rate": 5.561590586388221e-05, "loss": 13.844842529296875, "step": 770 }, { "epoch": 2.0, "grad_norm": 8.945368766784668, "learning_rate": 5.315071597181504e-05, "loss": 13.500308227539062, "step": 780 }, { "epoch": 2.0256821829855536, "grad_norm": 14.872233390808105, "learning_rate": 5.072148021251821e-05, "loss": 10.96926498413086, "step": 790 }, { "epoch": 2.0513643659711076, "grad_norm": 16.96340560913086, "learning_rate": 4.833006288481371e-05, "loss": 10.601210021972657, "step": 800 }, { "epoch": 2.077046548956661, "grad_norm": 17.987470626831055, "learning_rate": 4.597829926405075e-05, "loss": 11.348848724365235, "step": 810 }, { "epoch": 2.102728731942215, "grad_norm": 18.53151512145996, "learning_rate": 4.3667994193637796e-05, "loss": 10.507981109619141, "step": 820 }, { "epoch": 2.128410914927769, "grad_norm": 23.948802947998047, "learning_rate": 4.140092069992867e-05, "loss": 11.215933227539063, "step": 830 }, { "epoch": 2.154093097913323, "grad_norm": 21.3605899810791, "learning_rate": 3.91788186315269e-05, "loss": 10.393006896972656, "step": 840 }, { "epoch": 2.1797752808988764, "grad_norm": 26.548303604125977, "learning_rate": 3.7003393324051874e-05, "loss": 10.384098052978516, "step": 850 }, { "epoch": 2.20545746388443, "grad_norm": 23.979907989501953, "learning_rate": 3.487631429139183e-05, "loss": 10.094139862060548, "step": 860 }, { "epoch": 2.231139646869984, "grad_norm": 18.00309944152832, "learning_rate": 3.279921394444776e-05, "loss": 10.624467468261718, "step": 870 }, { "epoch": 2.2568218298555376, "grad_norm": 24.657569885253906, "learning_rate": 3.077368633835205e-05, "loss": 10.711078643798828, "step": 880 }, { "epoch": 2.2825040128410916, "grad_norm": 22.551204681396484, "learning_rate": 2.8801285949122593e-05, "loss": 10.329103088378906, "step": 890 }, { "epoch": 2.308186195826645, "grad_norm": 26.180330276489258, "learning_rate": 2.688352648069198e-05, "loss": 9.944695281982423, "step": 900 }, { "epoch": 2.333868378812199, "grad_norm": 34.10860061645508, "learning_rate": 2.502187970322657e-05, "loss": 10.465196990966797, "step": 910 }, { "epoch": 2.359550561797753, "grad_norm": 25.76052474975586, "learning_rate": 2.321777432362764e-05, "loss": 10.892754364013673, "step": 920 }, { "epoch": 2.3852327447833064, "grad_norm": 25.387516021728516, "learning_rate": 2.1472594889080756e-05, "loss": 10.628679656982422, "step": 930 }, { "epoch": 2.4109149277688604, "grad_norm": 22.750202178955078, "learning_rate": 1.9787680724495617e-05, "loss": 10.46801986694336, "step": 940 }, { "epoch": 2.436597110754414, "grad_norm": 29.39733123779297, "learning_rate": 1.8164324904650965e-05, "loss": 11.32564697265625, "step": 950 }, { "epoch": 2.462279293739968, "grad_norm": 23.36044692993164, "learning_rate": 1.660377326183412e-05, "loss": 10.110736083984374, "step": 960 }, { "epoch": 2.4879614767255216, "grad_norm": 22.447628021240234, "learning_rate": 1.5107223429736272e-05, "loss": 10.582487487792969, "step": 970 }, { "epoch": 2.513643659711075, "grad_norm": 21.37103271484375, "learning_rate": 1.3675823924337506e-05, "loss": 9.974002838134766, "step": 980 }, { "epoch": 2.539325842696629, "grad_norm": 23.942567825317383, "learning_rate": 1.2310673262486705e-05, "loss": 10.401480102539063, "step": 990 }, { "epoch": 2.5650080256821832, "grad_norm": 23.232219696044922, "learning_rate": 1.1012819118853147e-05, "loss": 10.403594970703125, "step": 1000 }, { "epoch": 2.590690208667737, "grad_norm": 26.392332077026367, "learning_rate": 9.783257521896227e-06, "loss": 11.170610046386718, "step": 1010 }, { "epoch": 2.6163723916532904, "grad_norm": 22.941421508789062, "learning_rate": 8.62293208947107e-06, "loss": 9.754792785644531, "step": 1020 }, { "epoch": 2.6420545746388444, "grad_norm": 31.97374153137207, "learning_rate": 7.532733304655848e-06, "loss": 10.895748138427734, "step": 1030 }, { "epoch": 2.667736757624398, "grad_norm": 22.15962791442871, "learning_rate": 6.5134978323574066e-06, "loss": 10.086806488037109, "step": 1040 }, { "epoch": 2.693418940609952, "grad_norm": 23.121976852416992, "learning_rate": 5.566007877218882e-06, "loss": 9.806757354736328, "step": 1050 }, { "epoch": 2.7191011235955056, "grad_norm": 22.89173698425293, "learning_rate": 4.6909905833226966e-06, "loss": 9.998442077636719, "step": 1060 }, { "epoch": 2.744783306581059, "grad_norm": 28.151565551757812, "learning_rate": 3.8891174761491735e-06, "loss": 10.232617950439453, "step": 1070 }, { "epoch": 2.770465489566613, "grad_norm": 27.67608070373535, "learning_rate": 3.161003947219421e-06, "loss": 11.071966552734375, "step": 1080 }, { "epoch": 2.796147672552167, "grad_norm": 26.361303329467773, "learning_rate": 2.5072087818176382e-06, "loss": 11.048786926269532, "step": 1090 }, { "epoch": 2.821829855537721, "grad_norm": 24.121335983276367, "learning_rate": 1.928233730155604e-06, "loss": 10.665110015869141, "step": 1100 }, { "epoch": 2.8475120385232744, "grad_norm": 26.22313117980957, "learning_rate": 1.4245231223081301e-06, "loss": 11.044110107421876, "step": 1110 }, { "epoch": 2.8731942215088284, "grad_norm": 26.499591827392578, "learning_rate": 9.964635272153633e-07, "loss": 10.147935485839843, "step": 1120 }, { "epoch": 2.898876404494382, "grad_norm": 23.90386390686035, "learning_rate": 6.443834560132534e-07, "loss": 10.296646881103516, "step": 1130 }, { "epoch": 2.924558587479936, "grad_norm": 24.210269927978516, "learning_rate": 3.685531099202111e-07, "loss": 10.592522430419923, "step": 1140 }, { "epoch": 2.9502407704654896, "grad_norm": 28.532909393310547, "learning_rate": 1.6918417287318245e-07, "loss": 9.853338623046875, "step": 1150 }, { "epoch": 2.975922953451043, "grad_norm": 27.218006134033203, "learning_rate": 4.642964907235481e-08, "loss": 10.533452606201172, "step": 1160 }, { "epoch": 3.0, "grad_norm": 14.907074928283691, "learning_rate": 3.837455592847761e-10, "loss": 9.5769775390625, "step": 1170 }, { "epoch": 3.0, "step": 1170, "total_flos": 2.709259627140219e+17, "train_loss": 14.446504472259782, "train_runtime": 5026.5595, "train_samples_per_second": 1.859, "train_steps_per_second": 0.233 } ], "logging_steps": 10, "max_steps": 1170, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.709259627140219e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }