| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.2953367875647668, |
| "eval_steps": 10000000, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012953367875647668, |
| "grad_norm": 33.1711293280208, |
| "learning_rate": 6.476683937823834e-09, |
| "loss": 3.2052, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.025906735751295335, |
| "grad_norm": 32.37649814316274, |
| "learning_rate": 1.2953367875647667e-08, |
| "loss": 3.0656, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.038860103626943004, |
| "grad_norm": 31.56149230496282, |
| "learning_rate": 1.9430051813471502e-08, |
| "loss": 3.1704, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05181347150259067, |
| "grad_norm": 31.942868827719224, |
| "learning_rate": 2.5906735751295334e-08, |
| "loss": 3.1568, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06476683937823834, |
| "grad_norm": 32.143099670749734, |
| "learning_rate": 3.238341968911917e-08, |
| "loss": 3.1613, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.07772020725388601, |
| "grad_norm": 32.24118466383636, |
| "learning_rate": 3.8860103626943005e-08, |
| "loss": 3.1164, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09067357512953368, |
| "grad_norm": 31.650055464493875, |
| "learning_rate": 4.533678756476684e-08, |
| "loss": 3.1494, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10362694300518134, |
| "grad_norm": 28.899262199995725, |
| "learning_rate": 5.181347150259067e-08, |
| "loss": 3.0801, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.11658031088082901, |
| "grad_norm": 28.89484749420223, |
| "learning_rate": 5.8290155440414504e-08, |
| "loss": 3.0142, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.12953367875647667, |
| "grad_norm": 28.383181515972513, |
| "learning_rate": 6.476683937823834e-08, |
| "loss": 2.9967, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14248704663212436, |
| "grad_norm": 27.801929424192455, |
| "learning_rate": 7.124352331606218e-08, |
| "loss": 2.984, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.15544041450777202, |
| "grad_norm": 21.516965224010576, |
| "learning_rate": 7.772020725388601e-08, |
| "loss": 2.822, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.16839378238341968, |
| "grad_norm": 19.65232033324181, |
| "learning_rate": 8.419689119170984e-08, |
| "loss": 2.7653, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.18134715025906736, |
| "grad_norm": 19.201263343869776, |
| "learning_rate": 9.067357512953368e-08, |
| "loss": 2.6609, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.19430051813471502, |
| "grad_norm": 14.459530975729104, |
| "learning_rate": 9.715025906735751e-08, |
| "loss": 2.5682, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.20725388601036268, |
| "grad_norm": 9.12934345966672, |
| "learning_rate": 1.0362694300518134e-07, |
| "loss": 2.432, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.22020725388601037, |
| "grad_norm": 7.256496950927029, |
| "learning_rate": 1.1010362694300518e-07, |
| "loss": 2.3733, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.23316062176165803, |
| "grad_norm": 6.662098845634381, |
| "learning_rate": 1.1658031088082901e-07, |
| "loss": 2.3659, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.24611398963730569, |
| "grad_norm": 6.095993583987254, |
| "learning_rate": 1.2305699481865284e-07, |
| "loss": 2.3295, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.25906735751295334, |
| "grad_norm": 5.693722335575032, |
| "learning_rate": 1.2953367875647668e-07, |
| "loss": 2.3366, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.27202072538860106, |
| "grad_norm": 5.047232489514463, |
| "learning_rate": 1.3601036269430052e-07, |
| "loss": 2.2315, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2849740932642487, |
| "grad_norm": 4.927414652144195, |
| "learning_rate": 1.4248704663212436e-07, |
| "loss": 2.2668, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2979274611398964, |
| "grad_norm": 4.835355533686139, |
| "learning_rate": 1.4896373056994818e-07, |
| "loss": 2.2649, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.31088082901554404, |
| "grad_norm": 4.87421791743561, |
| "learning_rate": 1.5544041450777202e-07, |
| "loss": 2.2932, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3238341968911917, |
| "grad_norm": 4.820276947385723, |
| "learning_rate": 1.6191709844559583e-07, |
| "loss": 2.2502, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.33678756476683935, |
| "grad_norm": 4.516053705523844, |
| "learning_rate": 1.6839378238341968e-07, |
| "loss": 2.2293, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.34974093264248707, |
| "grad_norm": 4.496107225758641, |
| "learning_rate": 1.7487046632124352e-07, |
| "loss": 2.2073, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3626943005181347, |
| "grad_norm": 4.349690749010343, |
| "learning_rate": 1.8134715025906736e-07, |
| "loss": 2.2241, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3756476683937824, |
| "grad_norm": 4.556595681247454, |
| "learning_rate": 1.8782383419689118e-07, |
| "loss": 2.2485, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.38860103626943004, |
| "grad_norm": 4.209430201616371, |
| "learning_rate": 1.9430051813471502e-07, |
| "loss": 2.2237, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4015544041450777, |
| "grad_norm": 4.489040941077934, |
| "learning_rate": 2.0077720207253883e-07, |
| "loss": 2.2653, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.41450777202072536, |
| "grad_norm": 4.803060461126722, |
| "learning_rate": 2.0725388601036267e-07, |
| "loss": 2.1946, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4274611398963731, |
| "grad_norm": 4.280584588616054, |
| "learning_rate": 2.1373056994818652e-07, |
| "loss": 2.2146, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.44041450777202074, |
| "grad_norm": 4.619244786650026, |
| "learning_rate": 2.2020725388601036e-07, |
| "loss": 2.2021, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4533678756476684, |
| "grad_norm": 4.4382532701001995, |
| "learning_rate": 2.2668393782383417e-07, |
| "loss": 2.1895, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.46632124352331605, |
| "grad_norm": 4.2716622332872145, |
| "learning_rate": 2.3316062176165802e-07, |
| "loss": 2.1829, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.4792746113989637, |
| "grad_norm": 4.273910475216059, |
| "learning_rate": 2.3963730569948183e-07, |
| "loss": 2.2089, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.49222797927461137, |
| "grad_norm": 4.478306118940495, |
| "learning_rate": 2.4611398963730567e-07, |
| "loss": 2.1644, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5051813471502591, |
| "grad_norm": 4.4518158510772485, |
| "learning_rate": 2.525906735751295e-07, |
| "loss": 2.1981, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5181347150259067, |
| "grad_norm": 4.288110327620116, |
| "learning_rate": 2.5906735751295336e-07, |
| "loss": 2.1446, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5310880829015544, |
| "grad_norm": 4.176981515512014, |
| "learning_rate": 2.655440414507772e-07, |
| "loss": 2.1793, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5440414507772021, |
| "grad_norm": 4.341974723955389, |
| "learning_rate": 2.7202072538860104e-07, |
| "loss": 2.1744, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5569948186528497, |
| "grad_norm": 4.283471991855415, |
| "learning_rate": 2.7849740932642483e-07, |
| "loss": 2.1733, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5699481865284974, |
| "grad_norm": 4.373337793890535, |
| "learning_rate": 2.849740932642487e-07, |
| "loss": 2.2163, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.582901554404145, |
| "grad_norm": 4.192319713165312, |
| "learning_rate": 2.914507772020725e-07, |
| "loss": 2.1688, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5958549222797928, |
| "grad_norm": 4.431919073381032, |
| "learning_rate": 2.9792746113989635e-07, |
| "loss": 2.1808, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6088082901554405, |
| "grad_norm": 4.4889229042752845, |
| "learning_rate": 3.044041450777202e-07, |
| "loss": 2.1981, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6217616580310881, |
| "grad_norm": 4.222020097797262, |
| "learning_rate": 3.1088082901554404e-07, |
| "loss": 2.1798, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6347150259067358, |
| "grad_norm": 4.216176333681839, |
| "learning_rate": 3.173575129533679e-07, |
| "loss": 2.1864, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6476683937823834, |
| "grad_norm": 4.366482754156596, |
| "learning_rate": 3.2383419689119167e-07, |
| "loss": 2.1344, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6606217616580311, |
| "grad_norm": 4.357005078373983, |
| "learning_rate": 3.303108808290155e-07, |
| "loss": 2.1424, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6735751295336787, |
| "grad_norm": 4.541933842125955, |
| "learning_rate": 3.3678756476683935e-07, |
| "loss": 2.1517, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6865284974093264, |
| "grad_norm": 4.161277678712947, |
| "learning_rate": 3.432642487046632e-07, |
| "loss": 2.1546, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6994818652849741, |
| "grad_norm": 4.236717017318247, |
| "learning_rate": 3.4974093264248704e-07, |
| "loss": 2.1327, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7124352331606217, |
| "grad_norm": 4.300988527799866, |
| "learning_rate": 3.562176165803109e-07, |
| "loss": 2.1501, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7253886010362695, |
| "grad_norm": 4.223261260043241, |
| "learning_rate": 3.626943005181347e-07, |
| "loss": 2.1615, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7383419689119171, |
| "grad_norm": 4.085308096354535, |
| "learning_rate": 3.691709844559585e-07, |
| "loss": 2.144, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7512953367875648, |
| "grad_norm": 4.31015677001362, |
| "learning_rate": 3.7564766839378235e-07, |
| "loss": 2.1859, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7642487046632125, |
| "grad_norm": 4.231574714412857, |
| "learning_rate": 3.8212435233160625e-07, |
| "loss": 2.1476, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7772020725388601, |
| "grad_norm": 4.206477374687759, |
| "learning_rate": 3.8860103626943004e-07, |
| "loss": 2.1658, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7901554404145078, |
| "grad_norm": 4.354390663140942, |
| "learning_rate": 3.950777202072539e-07, |
| "loss": 2.1599, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8031088082901554, |
| "grad_norm": 4.110842381635348, |
| "learning_rate": 4.0155440414507767e-07, |
| "loss": 2.1431, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8160621761658031, |
| "grad_norm": 4.237764332245077, |
| "learning_rate": 4.0803108808290156e-07, |
| "loss": 2.1543, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8290155440414507, |
| "grad_norm": 3.905536571258385, |
| "learning_rate": 4.1450777202072535e-07, |
| "loss": 2.1106, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.8419689119170984, |
| "grad_norm": 4.393170487432548, |
| "learning_rate": 4.209844559585492e-07, |
| "loss": 2.1392, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.8549222797927462, |
| "grad_norm": 4.251449853594785, |
| "learning_rate": 4.2746113989637303e-07, |
| "loss": 2.1434, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.8678756476683938, |
| "grad_norm": 4.230410652383188, |
| "learning_rate": 4.339378238341969e-07, |
| "loss": 2.1033, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.8808290155440415, |
| "grad_norm": 4.191297726929567, |
| "learning_rate": 4.404145077720207e-07, |
| "loss": 2.1668, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.8937823834196891, |
| "grad_norm": 4.157886217693691, |
| "learning_rate": 4.468911917098445e-07, |
| "loss": 2.1379, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9067357512953368, |
| "grad_norm": 4.485713372256864, |
| "learning_rate": 4.5336787564766835e-07, |
| "loss": 2.1352, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9196891191709845, |
| "grad_norm": 3.9736562746500805, |
| "learning_rate": 4.5984455958549224e-07, |
| "loss": 2.1097, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.9326424870466321, |
| "grad_norm": 4.118058611454383, |
| "learning_rate": 4.6632124352331603e-07, |
| "loss": 2.1121, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.9455958549222798, |
| "grad_norm": 4.059747364924617, |
| "learning_rate": 4.7279792746113987e-07, |
| "loss": 2.0727, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.9585492227979274, |
| "grad_norm": 3.9437377004412997, |
| "learning_rate": 4.792746113989637e-07, |
| "loss": 2.0997, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.9715025906735751, |
| "grad_norm": 4.038836900317155, |
| "learning_rate": 4.857512953367875e-07, |
| "loss": 2.1516, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.9844559585492227, |
| "grad_norm": 4.416716033210665, |
| "learning_rate": 4.922279792746113e-07, |
| "loss": 2.1124, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.9974093264248705, |
| "grad_norm": 4.138231942142784, |
| "learning_rate": 4.987046632124352e-07, |
| "loss": 2.0725, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0103626943005182, |
| "grad_norm": 4.29950033055081, |
| "learning_rate": 5.05181347150259e-07, |
| "loss": 2.0996, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.0233160621761659, |
| "grad_norm": 4.142376447802417, |
| "learning_rate": 5.116580310880829e-07, |
| "loss": 2.1324, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.0362694300518134, |
| "grad_norm": 3.917371610743461, |
| "learning_rate": 5.181347150259067e-07, |
| "loss": 2.1006, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.049222797927461, |
| "grad_norm": 4.038077523537081, |
| "learning_rate": 5.246113989637306e-07, |
| "loss": 2.1275, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.0621761658031088, |
| "grad_norm": 4.318310745831879, |
| "learning_rate": 5.310880829015544e-07, |
| "loss": 2.0793, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.0751295336787565, |
| "grad_norm": 4.26330338898587, |
| "learning_rate": 5.375647668393782e-07, |
| "loss": 2.0955, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.0880829015544042, |
| "grad_norm": 3.8965234945979663, |
| "learning_rate": 5.440414507772021e-07, |
| "loss": 2.0684, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1010362694300517, |
| "grad_norm": 4.288695103356495, |
| "learning_rate": 5.505181347150258e-07, |
| "loss": 2.1247, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1139896373056994, |
| "grad_norm": 4.035234944690109, |
| "learning_rate": 5.569948186528497e-07, |
| "loss": 2.1666, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.1269430051813472, |
| "grad_norm": 4.091744197400346, |
| "learning_rate": 5.634715025906735e-07, |
| "loss": 2.1217, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.1398963730569949, |
| "grad_norm": 3.927555977045572, |
| "learning_rate": 5.699481865284974e-07, |
| "loss": 2.1349, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.1528497409326426, |
| "grad_norm": 4.16752707585748, |
| "learning_rate": 5.764248704663213e-07, |
| "loss": 2.1126, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.16580310880829, |
| "grad_norm": 4.099847144482344, |
| "learning_rate": 5.82901554404145e-07, |
| "loss": 2.1144, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.1787564766839378, |
| "grad_norm": 4.174988920130071, |
| "learning_rate": 5.893782383419689e-07, |
| "loss": 2.0872, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.1917098445595855, |
| "grad_norm": 4.109961957930782, |
| "learning_rate": 5.958549222797927e-07, |
| "loss": 2.0567, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.2046632124352332, |
| "grad_norm": 4.180647847650424, |
| "learning_rate": 6.023316062176166e-07, |
| "loss": 2.1202, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.2176165803108807, |
| "grad_norm": 3.98575411050178, |
| "learning_rate": 6.088082901554404e-07, |
| "loss": 2.1152, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.2305699481865284, |
| "grad_norm": 3.9239892073269997, |
| "learning_rate": 6.152849740932642e-07, |
| "loss": 2.0765, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.2435233160621761, |
| "grad_norm": 4.266583390376126, |
| "learning_rate": 6.217616580310881e-07, |
| "loss": 2.099, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.2564766839378239, |
| "grad_norm": 3.958906112705802, |
| "learning_rate": 6.282383419689119e-07, |
| "loss": 2.0834, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.2694300518134716, |
| "grad_norm": 4.050771101367249, |
| "learning_rate": 6.347150259067358e-07, |
| "loss": 2.1043, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.2823834196891193, |
| "grad_norm": 4.145392853886648, |
| "learning_rate": 6.411917098445595e-07, |
| "loss": 2.0901, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.2953367875647668, |
| "grad_norm": 4.279480677144505, |
| "learning_rate": 6.476683937823833e-07, |
| "loss": 2.084, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 15440, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 21117560389632.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|