| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.2953367875647668, |
| "eval_steps": 10000000, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.012953367875647668, |
| "grad_norm": 25.608994557453922, |
| "learning_rate": 6.476683937823834e-09, |
| "loss": 2.9207, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.025906735751295335, |
| "grad_norm": 25.006509435749546, |
| "learning_rate": 1.2953367875647667e-08, |
| "loss": 2.8035, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.038860103626943004, |
| "grad_norm": 26.07092571978953, |
| "learning_rate": 1.9430051813471502e-08, |
| "loss": 2.8884, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05181347150259067, |
| "grad_norm": 24.9115069323374, |
| "learning_rate": 2.5906735751295334e-08, |
| "loss": 2.8786, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06476683937823834, |
| "grad_norm": 26.35515887725646, |
| "learning_rate": 3.238341968911917e-08, |
| "loss": 2.8856, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.07772020725388601, |
| "grad_norm": 26.802778574551475, |
| "learning_rate": 3.8860103626943005e-08, |
| "loss": 2.8532, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09067357512953368, |
| "grad_norm": 26.534962747282748, |
| "learning_rate": 4.533678756476684e-08, |
| "loss": 2.8952, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.10362694300518134, |
| "grad_norm": 23.051599328506548, |
| "learning_rate": 5.181347150259067e-08, |
| "loss": 2.8437, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.11658031088082901, |
| "grad_norm": 24.471329875902512, |
| "learning_rate": 5.8290155440414504e-08, |
| "loss": 2.8237, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.12953367875647667, |
| "grad_norm": 24.33959816120034, |
| "learning_rate": 6.476683937823834e-08, |
| "loss": 2.8199, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14248704663212436, |
| "grad_norm": 25.214396561848275, |
| "learning_rate": 7.124352331606218e-08, |
| "loss": 2.8259, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.15544041450777202, |
| "grad_norm": 20.508728927761148, |
| "learning_rate": 7.772020725388601e-08, |
| "loss": 2.7907, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.16839378238341968, |
| "grad_norm": 21.133956193530185, |
| "learning_rate": 8.419689119170984e-08, |
| "loss": 2.7776, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.18134715025906736, |
| "grad_norm": 20.216670897828962, |
| "learning_rate": 9.067357512953368e-08, |
| "loss": 2.6967, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.19430051813471502, |
| "grad_norm": 19.577892832244164, |
| "learning_rate": 9.715025906735751e-08, |
| "loss": 2.6823, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.20725388601036268, |
| "grad_norm": 14.26088592523892, |
| "learning_rate": 1.0362694300518134e-07, |
| "loss": 2.6147, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.22020725388601037, |
| "grad_norm": 12.389614421651768, |
| "learning_rate": 1.1010362694300518e-07, |
| "loss": 2.5705, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.23316062176165803, |
| "grad_norm": 10.537108203822184, |
| "learning_rate": 1.1658031088082901e-07, |
| "loss": 2.5779, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.24611398963730569, |
| "grad_norm": 8.566097835293071, |
| "learning_rate": 1.2305699481865284e-07, |
| "loss": 2.5464, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.25906735751295334, |
| "grad_norm": 7.5406588821241884, |
| "learning_rate": 1.2953367875647668e-07, |
| "loss": 2.5462, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.27202072538860106, |
| "grad_norm": 6.406776125251227, |
| "learning_rate": 1.3601036269430052e-07, |
| "loss": 2.4201, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2849740932642487, |
| "grad_norm": 5.0333340242378135, |
| "learning_rate": 1.4248704663212436e-07, |
| "loss": 2.4404, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2979274611398964, |
| "grad_norm": 4.96460074385302, |
| "learning_rate": 1.4896373056994818e-07, |
| "loss": 2.4234, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.31088082901554404, |
| "grad_norm": 4.718132167499178, |
| "learning_rate": 1.5544041450777202e-07, |
| "loss": 2.447, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3238341968911917, |
| "grad_norm": 4.473414832247284, |
| "learning_rate": 1.6191709844559583e-07, |
| "loss": 2.3905, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.33678756476683935, |
| "grad_norm": 4.231641138749174, |
| "learning_rate": 1.6839378238341968e-07, |
| "loss": 2.3678, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.34974093264248707, |
| "grad_norm": 3.950230234878744, |
| "learning_rate": 1.7487046632124352e-07, |
| "loss": 2.3378, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3626943005181347, |
| "grad_norm": 3.789360959290258, |
| "learning_rate": 1.8134715025906736e-07, |
| "loss": 2.3507, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3756476683937824, |
| "grad_norm": 3.752329492676831, |
| "learning_rate": 1.8782383419689118e-07, |
| "loss": 2.3752, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.38860103626943004, |
| "grad_norm": 3.8201206496833313, |
| "learning_rate": 1.9430051813471502e-07, |
| "loss": 2.3437, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4015544041450777, |
| "grad_norm": 3.8776779124718175, |
| "learning_rate": 2.0077720207253883e-07, |
| "loss": 2.3889, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.41450777202072536, |
| "grad_norm": 3.989802925727435, |
| "learning_rate": 2.0725388601036267e-07, |
| "loss": 2.3129, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4274611398963731, |
| "grad_norm": 3.651464736452137, |
| "learning_rate": 2.1373056994818652e-07, |
| "loss": 2.3276, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.44041450777202074, |
| "grad_norm": 3.784690196455203, |
| "learning_rate": 2.2020725388601036e-07, |
| "loss": 2.31, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4533678756476684, |
| "grad_norm": 3.5900123693564683, |
| "learning_rate": 2.2668393782383417e-07, |
| "loss": 2.3023, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.46632124352331605, |
| "grad_norm": 3.616764531573499, |
| "learning_rate": 2.3316062176165802e-07, |
| "loss": 2.2977, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.4792746113989637, |
| "grad_norm": 3.4970049253402076, |
| "learning_rate": 2.3963730569948183e-07, |
| "loss": 2.32, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.49222797927461137, |
| "grad_norm": 3.7715349943050733, |
| "learning_rate": 2.4611398963730567e-07, |
| "loss": 2.274, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5051813471502591, |
| "grad_norm": 3.7123935945294897, |
| "learning_rate": 2.525906735751295e-07, |
| "loss": 2.3086, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5181347150259067, |
| "grad_norm": 3.372291772174901, |
| "learning_rate": 2.5906735751295336e-07, |
| "loss": 2.2496, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5310880829015544, |
| "grad_norm": 3.784294595936631, |
| "learning_rate": 2.655440414507772e-07, |
| "loss": 2.2858, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5440414507772021, |
| "grad_norm": 3.4351612072380675, |
| "learning_rate": 2.7202072538860104e-07, |
| "loss": 2.28, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5569948186528497, |
| "grad_norm": 3.3411063107189753, |
| "learning_rate": 2.7849740932642483e-07, |
| "loss": 2.2764, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5699481865284974, |
| "grad_norm": 3.3765768734179993, |
| "learning_rate": 2.849740932642487e-07, |
| "loss": 2.3278, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.582901554404145, |
| "grad_norm": 3.501479583740947, |
| "learning_rate": 2.914507772020725e-07, |
| "loss": 2.275, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5958549222797928, |
| "grad_norm": 3.4545990517473895, |
| "learning_rate": 2.9792746113989635e-07, |
| "loss": 2.292, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6088082901554405, |
| "grad_norm": 3.4921131143644137, |
| "learning_rate": 3.044041450777202e-07, |
| "loss": 2.306, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6217616580310881, |
| "grad_norm": 3.3569942808113202, |
| "learning_rate": 3.1088082901554404e-07, |
| "loss": 2.2818, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6347150259067358, |
| "grad_norm": 3.473735761418456, |
| "learning_rate": 3.173575129533679e-07, |
| "loss": 2.292, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.6476683937823834, |
| "grad_norm": 3.4098418845884506, |
| "learning_rate": 3.2383419689119167e-07, |
| "loss": 2.2388, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6606217616580311, |
| "grad_norm": 3.2884572160731103, |
| "learning_rate": 3.303108808290155e-07, |
| "loss": 2.2485, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6735751295336787, |
| "grad_norm": 3.49059998854233, |
| "learning_rate": 3.3678756476683935e-07, |
| "loss": 2.2546, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6865284974093264, |
| "grad_norm": 3.228603206545892, |
| "learning_rate": 3.432642487046632e-07, |
| "loss": 2.2611, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6994818652849741, |
| "grad_norm": 3.201214883425356, |
| "learning_rate": 3.4974093264248704e-07, |
| "loss": 2.2423, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7124352331606217, |
| "grad_norm": 3.174572440904334, |
| "learning_rate": 3.562176165803109e-07, |
| "loss": 2.2575, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.7253886010362695, |
| "grad_norm": 3.6637540799374086, |
| "learning_rate": 3.626943005181347e-07, |
| "loss": 2.2703, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.7383419689119171, |
| "grad_norm": 3.2144805242394456, |
| "learning_rate": 3.691709844559585e-07, |
| "loss": 2.2536, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.7512953367875648, |
| "grad_norm": 3.2832998982931647, |
| "learning_rate": 3.7564766839378235e-07, |
| "loss": 2.2923, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7642487046632125, |
| "grad_norm": 3.339591881411431, |
| "learning_rate": 3.8212435233160625e-07, |
| "loss": 2.2582, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7772020725388601, |
| "grad_norm": 3.1793659057987114, |
| "learning_rate": 3.8860103626943004e-07, |
| "loss": 2.2725, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7901554404145078, |
| "grad_norm": 3.105051267095931, |
| "learning_rate": 3.950777202072539e-07, |
| "loss": 2.2609, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8031088082901554, |
| "grad_norm": 3.1058601287467837, |
| "learning_rate": 4.0155440414507767e-07, |
| "loss": 2.2564, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8160621761658031, |
| "grad_norm": 3.2145397711050374, |
| "learning_rate": 4.0803108808290156e-07, |
| "loss": 2.2643, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.8290155440414507, |
| "grad_norm": 2.9574287184508896, |
| "learning_rate": 4.1450777202072535e-07, |
| "loss": 2.2198, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.8419689119170984, |
| "grad_norm": 3.385438951564467, |
| "learning_rate": 4.209844559585492e-07, |
| "loss": 2.2482, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.8549222797927462, |
| "grad_norm": 3.232841492939655, |
| "learning_rate": 4.2746113989637303e-07, |
| "loss": 2.2553, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.8678756476683938, |
| "grad_norm": 3.2498023529603226, |
| "learning_rate": 4.339378238341969e-07, |
| "loss": 2.2091, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.8808290155440415, |
| "grad_norm": 3.387272764637681, |
| "learning_rate": 4.404145077720207e-07, |
| "loss": 2.2803, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.8937823834196891, |
| "grad_norm": 3.205923523135513, |
| "learning_rate": 4.468911917098445e-07, |
| "loss": 2.2416, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9067357512953368, |
| "grad_norm": 3.328638392908686, |
| "learning_rate": 4.5336787564766835e-07, |
| "loss": 2.2512, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9196891191709845, |
| "grad_norm": 3.223397811767207, |
| "learning_rate": 4.5984455958549224e-07, |
| "loss": 2.2233, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.9326424870466321, |
| "grad_norm": 2.903434123937875, |
| "learning_rate": 4.6632124352331603e-07, |
| "loss": 2.2221, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.9455958549222798, |
| "grad_norm": 3.167214093551616, |
| "learning_rate": 4.7279792746113987e-07, |
| "loss": 2.1797, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.9585492227979274, |
| "grad_norm": 2.8228184362789936, |
| "learning_rate": 4.792746113989637e-07, |
| "loss": 2.2113, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.9715025906735751, |
| "grad_norm": 3.2547987666473506, |
| "learning_rate": 4.857512953367875e-07, |
| "loss": 2.2603, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.9844559585492227, |
| "grad_norm": 3.2819941887944317, |
| "learning_rate": 4.922279792746113e-07, |
| "loss": 2.2197, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.9974093264248705, |
| "grad_norm": 3.1793621366904765, |
| "learning_rate": 4.987046632124352e-07, |
| "loss": 2.1802, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.0103626943005182, |
| "grad_norm": 3.129077964136437, |
| "learning_rate": 5.05181347150259e-07, |
| "loss": 2.2115, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.0233160621761659, |
| "grad_norm": 3.624077927910293, |
| "learning_rate": 5.116580310880829e-07, |
| "loss": 2.2505, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.0362694300518134, |
| "grad_norm": 3.001991950399265, |
| "learning_rate": 5.181347150259067e-07, |
| "loss": 2.2186, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.049222797927461, |
| "grad_norm": 3.2056710194420077, |
| "learning_rate": 5.246113989637306e-07, |
| "loss": 2.2441, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.0621761658031088, |
| "grad_norm": 3.3095386913147307, |
| "learning_rate": 5.310880829015544e-07, |
| "loss": 2.2022, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.0751295336787565, |
| "grad_norm": 3.2748460342108276, |
| "learning_rate": 5.375647668393782e-07, |
| "loss": 2.2102, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.0880829015544042, |
| "grad_norm": 3.1482065886570108, |
| "learning_rate": 5.440414507772021e-07, |
| "loss": 2.186, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.1010362694300517, |
| "grad_norm": 3.2541206943048016, |
| "learning_rate": 5.505181347150258e-07, |
| "loss": 2.2447, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.1139896373056994, |
| "grad_norm": 3.35168061407981, |
| "learning_rate": 5.569948186528497e-07, |
| "loss": 2.287, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.1269430051813472, |
| "grad_norm": 3.324702715900074, |
| "learning_rate": 5.634715025906735e-07, |
| "loss": 2.2381, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.1398963730569949, |
| "grad_norm": 2.981057376298506, |
| "learning_rate": 5.699481865284974e-07, |
| "loss": 2.253, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.1528497409326426, |
| "grad_norm": 3.2319048665014734, |
| "learning_rate": 5.764248704663213e-07, |
| "loss": 2.234, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.16580310880829, |
| "grad_norm": 3.197206460323895, |
| "learning_rate": 5.82901554404145e-07, |
| "loss": 2.2401, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.1787564766839378, |
| "grad_norm": 3.128653204841994, |
| "learning_rate": 5.893782383419689e-07, |
| "loss": 2.2044, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.1917098445595855, |
| "grad_norm": 3.194111726652451, |
| "learning_rate": 5.958549222797927e-07, |
| "loss": 2.1694, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.2046632124352332, |
| "grad_norm": 3.07095427002542, |
| "learning_rate": 6.023316062176166e-07, |
| "loss": 2.2416, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.2176165803108807, |
| "grad_norm": 3.0432092192539777, |
| "learning_rate": 6.088082901554404e-07, |
| "loss": 2.2327, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.2305699481865284, |
| "grad_norm": 3.045795113156715, |
| "learning_rate": 6.152849740932642e-07, |
| "loss": 2.1957, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.2435233160621761, |
| "grad_norm": 3.363436323137103, |
| "learning_rate": 6.217616580310881e-07, |
| "loss": 2.2182, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.2564766839378239, |
| "grad_norm": 3.0067901045334633, |
| "learning_rate": 6.282383419689119e-07, |
| "loss": 2.1986, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.2694300518134716, |
| "grad_norm": 3.047870744487349, |
| "learning_rate": 6.347150259067358e-07, |
| "loss": 2.2217, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.2823834196891193, |
| "grad_norm": 3.2325772640891395, |
| "learning_rate": 6.411917098445595e-07, |
| "loss": 2.2146, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.2953367875647668, |
| "grad_norm": 3.2882251454495006, |
| "learning_rate": 6.476683937823833e-07, |
| "loss": 2.203, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 15440, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 15311042248704.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|