| { |
| "best_global_step": 1650, |
| "best_metric": 0.006614842917770147, |
| "best_model_checkpoint": "/content/NH-SQL-finetuned/checkpoint-1650", |
| "epoch": 50.0, |
| "eval_steps": 500, |
| "global_step": 1650, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.030303030303030304, |
| "grad_norm": 6.595283031463623, |
| "learning_rate": 0.0, |
| "loss": 1.9666, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.6060606060606061, |
| "grad_norm": 6.3463263511657715, |
| "learning_rate": 5.757575757575758e-07, |
| "loss": 1.8746, |
| "step": 20 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.7834347486495972, |
| "eval_runtime": 0.9389, |
| "eval_samples_per_second": 140.592, |
| "eval_steps_per_second": 18.107, |
| "step": 33 |
| }, |
| { |
| "epoch": 1.2121212121212122, |
| "grad_norm": 4.871561050415039, |
| "learning_rate": 1.181818181818182e-06, |
| "loss": 1.7412, |
| "step": 40 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 3.2557735443115234, |
| "learning_rate": 1.787878787878788e-06, |
| "loss": 1.5522, |
| "step": 60 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 1.2882661819458008, |
| "eval_runtime": 0.9475, |
| "eval_samples_per_second": 139.315, |
| "eval_steps_per_second": 17.942, |
| "step": 66 |
| }, |
| { |
| "epoch": 2.4242424242424243, |
| "grad_norm": 1.9632468223571777, |
| "learning_rate": 2.393939393939394e-06, |
| "loss": 1.2669, |
| "step": 80 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.8929286003112793, |
| "eval_runtime": 0.9528, |
| "eval_samples_per_second": 138.545, |
| "eval_steps_per_second": 17.843, |
| "step": 99 |
| }, |
| { |
| "epoch": 3.0303030303030303, |
| "grad_norm": 1.6738139390945435, |
| "learning_rate": 3e-06, |
| "loss": 1.0527, |
| "step": 100 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "grad_norm": 1.4490736722946167, |
| "learning_rate": 3.606060606060606e-06, |
| "loss": 0.8491, |
| "step": 120 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.5932092070579529, |
| "eval_runtime": 0.9509, |
| "eval_samples_per_second": 138.815, |
| "eval_steps_per_second": 17.878, |
| "step": 132 |
| }, |
| { |
| "epoch": 4.242424242424242, |
| "grad_norm": 1.0791282653808594, |
| "learning_rate": 4.212121212121212e-06, |
| "loss": 0.6586, |
| "step": 140 |
| }, |
| { |
| "epoch": 4.848484848484849, |
| "grad_norm": 0.8789141178131104, |
| "learning_rate": 4.818181818181819e-06, |
| "loss": 0.4801, |
| "step": 160 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.42336800694465637, |
| "eval_runtime": 0.9459, |
| "eval_samples_per_second": 139.555, |
| "eval_steps_per_second": 17.973, |
| "step": 165 |
| }, |
| { |
| "epoch": 5.454545454545454, |
| "grad_norm": 0.8390600085258484, |
| "learning_rate": 4.9989035693310165e-06, |
| "loss": 0.4134, |
| "step": 180 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.3166239559650421, |
| "eval_runtime": 0.9402, |
| "eval_samples_per_second": 140.394, |
| "eval_steps_per_second": 18.081, |
| "step": 198 |
| }, |
| { |
| "epoch": 6.0606060606060606, |
| "grad_norm": 0.908724308013916, |
| "learning_rate": 4.993535611735464e-06, |
| "loss": 0.33, |
| "step": 200 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.8689959645271301, |
| "learning_rate": 4.983704338371375e-06, |
| "loss": 0.2941, |
| "step": 220 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.246794193983078, |
| "eval_runtime": 0.9539, |
| "eval_samples_per_second": 138.381, |
| "eval_steps_per_second": 17.822, |
| "step": 231 |
| }, |
| { |
| "epoch": 7.2727272727272725, |
| "grad_norm": 0.806349515914917, |
| "learning_rate": 4.969427346772643e-06, |
| "loss": 0.2513, |
| "step": 240 |
| }, |
| { |
| "epoch": 7.878787878787879, |
| "grad_norm": 0.8216381072998047, |
| "learning_rate": 4.950730192107368e-06, |
| "loss": 0.2244, |
| "step": 260 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.19389495253562927, |
| "eval_runtime": 0.9398, |
| "eval_samples_per_second": 140.46, |
| "eval_steps_per_second": 18.09, |
| "step": 264 |
| }, |
| { |
| "epoch": 8.484848484848484, |
| "grad_norm": 0.8309475183486938, |
| "learning_rate": 4.927646341435276e-06, |
| "loss": 0.1756, |
| "step": 280 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.1608426719903946, |
| "eval_runtime": 0.9567, |
| "eval_samples_per_second": 137.969, |
| "eval_steps_per_second": 17.769, |
| "step": 297 |
| }, |
| { |
| "epoch": 9.090909090909092, |
| "grad_norm": 0.807766854763031, |
| "learning_rate": 4.900217113803193e-06, |
| "loss": 0.1666, |
| "step": 300 |
| }, |
| { |
| "epoch": 9.696969696969697, |
| "grad_norm": 0.638713538646698, |
| "learning_rate": 4.868491606285823e-06, |
| "loss": 0.1756, |
| "step": 320 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.13285745680332184, |
| "eval_runtime": 0.9431, |
| "eval_samples_per_second": 139.968, |
| "eval_steps_per_second": 18.026, |
| "step": 330 |
| }, |
| { |
| "epoch": 10.303030303030303, |
| "grad_norm": 0.6577040553092957, |
| "learning_rate": 4.832526606104213e-06, |
| "loss": 0.1407, |
| "step": 340 |
| }, |
| { |
| "epoch": 10.909090909090908, |
| "grad_norm": 0.7107315063476562, |
| "learning_rate": 4.792386488979193e-06, |
| "loss": 0.1218, |
| "step": 360 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.1115950271487236, |
| "eval_runtime": 0.9468, |
| "eval_samples_per_second": 139.417, |
| "eval_steps_per_second": 17.955, |
| "step": 363 |
| }, |
| { |
| "epoch": 11.515151515151516, |
| "grad_norm": 0.7401019334793091, |
| "learning_rate": 4.74814310390176e-06, |
| "loss": 0.1362, |
| "step": 380 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.09207186847925186, |
| "eval_runtime": 0.9332, |
| "eval_samples_per_second": 141.446, |
| "eval_steps_per_second": 18.216, |
| "step": 396 |
| }, |
| { |
| "epoch": 12.121212121212121, |
| "grad_norm": 0.68468177318573, |
| "learning_rate": 4.699875644526633e-06, |
| "loss": 0.0987, |
| "step": 400 |
| }, |
| { |
| "epoch": 12.727272727272727, |
| "grad_norm": 0.7722787857055664, |
| "learning_rate": 4.647670507419206e-06, |
| "loss": 0.0989, |
| "step": 420 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.0792744979262352, |
| "eval_runtime": 0.935, |
| "eval_samples_per_second": 141.172, |
| "eval_steps_per_second": 18.181, |
| "step": 429 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 0.48431074619293213, |
| "learning_rate": 4.591621137409602e-06, |
| "loss": 0.0936, |
| "step": 440 |
| }, |
| { |
| "epoch": 13.93939393939394, |
| "grad_norm": 0.555296778678894, |
| "learning_rate": 4.53182786033067e-06, |
| "loss": 0.0945, |
| "step": 460 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.07043830305337906, |
| "eval_runtime": 0.9312, |
| "eval_samples_per_second": 141.753, |
| "eval_steps_per_second": 18.256, |
| "step": 462 |
| }, |
| { |
| "epoch": 14.545454545454545, |
| "grad_norm": 0.5280194878578186, |
| "learning_rate": 4.468397703439282e-06, |
| "loss": 0.0811, |
| "step": 480 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.06017066538333893, |
| "eval_runtime": 0.9305, |
| "eval_samples_per_second": 141.866, |
| "eval_steps_per_second": 18.271, |
| "step": 495 |
| }, |
| { |
| "epoch": 15.151515151515152, |
| "grad_norm": 0.5912495851516724, |
| "learning_rate": 4.401444203842396e-06, |
| "loss": 0.0742, |
| "step": 500 |
| }, |
| { |
| "epoch": 15.757575757575758, |
| "grad_norm": 1.0043439865112305, |
| "learning_rate": 4.331087205270778e-06, |
| "loss": 0.0778, |
| "step": 520 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.052856337279081345, |
| "eval_runtime": 0.9363, |
| "eval_samples_per_second": 140.988, |
| "eval_steps_per_second": 18.158, |
| "step": 528 |
| }, |
| { |
| "epoch": 16.363636363636363, |
| "grad_norm": 0.37315091490745544, |
| "learning_rate": 4.257452643564155e-06, |
| "loss": 0.0746, |
| "step": 540 |
| }, |
| { |
| "epoch": 16.96969696969697, |
| "grad_norm": 0.5725280046463013, |
| "learning_rate": 4.180672321251766e-06, |
| "loss": 0.0651, |
| "step": 560 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.04653371125459671, |
| "eval_runtime": 0.9263, |
| "eval_samples_per_second": 142.504, |
| "eval_steps_per_second": 18.353, |
| "step": 561 |
| }, |
| { |
| "epoch": 17.575757575757574, |
| "grad_norm": 0.6575049161911011, |
| "learning_rate": 4.100883671631806e-06, |
| "loss": 0.0529, |
| "step": 580 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.04210372641682625, |
| "eval_runtime": 0.9448, |
| "eval_samples_per_second": 139.719, |
| "eval_steps_per_second": 17.994, |
| "step": 594 |
| }, |
| { |
| "epoch": 18.181818181818183, |
| "grad_norm": 0.7622207999229431, |
| "learning_rate": 4.018229512772053e-06, |
| "loss": 0.0644, |
| "step": 600 |
| }, |
| { |
| "epoch": 18.78787878787879, |
| "grad_norm": 0.5367633700370789, |
| "learning_rate": 3.9328577918719916e-06, |
| "loss": 0.0551, |
| "step": 620 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.03950377553701401, |
| "eval_runtime": 0.9419, |
| "eval_samples_per_second": 140.144, |
| "eval_steps_per_second": 18.049, |
| "step": 627 |
| }, |
| { |
| "epoch": 19.393939393939394, |
| "grad_norm": 0.581658661365509, |
| "learning_rate": 3.844921320444031e-06, |
| "loss": 0.0566, |
| "step": 640 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.7774003148078918, |
| "learning_rate": 3.754577500787828e-06, |
| "loss": 0.0532, |
| "step": 660 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.0353802889585495, |
| "eval_runtime": 0.9534, |
| "eval_samples_per_second": 138.453, |
| "eval_steps_per_second": 17.831, |
| "step": 660 |
| }, |
| { |
| "epoch": 20.606060606060606, |
| "grad_norm": 0.6069843769073486, |
| "learning_rate": 3.66198804424729e-06, |
| "loss": 0.0436, |
| "step": 680 |
| }, |
| { |
| "epoch": 21.0, |
| "eval_loss": 0.031004376709461212, |
| "eval_runtime": 0.9391, |
| "eval_samples_per_second": 140.556, |
| "eval_steps_per_second": 18.102, |
| "step": 693 |
| }, |
| { |
| "epoch": 21.21212121212121, |
| "grad_norm": 0.5869702696800232, |
| "learning_rate": 3.5673186817546047e-06, |
| "loss": 0.0487, |
| "step": 700 |
| }, |
| { |
| "epoch": 21.818181818181817, |
| "grad_norm": 0.39276406168937683, |
| "learning_rate": 3.4707388671793814e-06, |
| "loss": 0.0459, |
| "step": 720 |
| }, |
| { |
| "epoch": 22.0, |
| "eval_loss": 0.027377676218748093, |
| "eval_runtime": 0.9405, |
| "eval_samples_per_second": 140.352, |
| "eval_steps_per_second": 18.076, |
| "step": 726 |
| }, |
| { |
| "epoch": 22.424242424242426, |
| "grad_norm": 0.5247741937637329, |
| "learning_rate": 3.3724214740138933e-06, |
| "loss": 0.0461, |
| "step": 740 |
| }, |
| { |
| "epoch": 23.0, |
| "eval_loss": 0.02525358274579048, |
| "eval_runtime": 0.9593, |
| "eval_samples_per_second": 137.596, |
| "eval_steps_per_second": 17.721, |
| "step": 759 |
| }, |
| { |
| "epoch": 23.03030303030303, |
| "grad_norm": 0.4420148730278015, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 0.0424, |
| "step": 760 |
| }, |
| { |
| "epoch": 23.636363636363637, |
| "grad_norm": 0.4988000690937042, |
| "learning_rate": 3.171280681813174e-06, |
| "loss": 0.0443, |
| "step": 780 |
| }, |
| { |
| "epoch": 24.0, |
| "eval_loss": 0.02300359681248665, |
| "eval_runtime": 0.9498, |
| "eval_samples_per_second": 138.973, |
| "eval_steps_per_second": 17.898, |
| "step": 792 |
| }, |
| { |
| "epoch": 24.242424242424242, |
| "grad_norm": 0.6696539521217346, |
| "learning_rate": 3.0688173156827454e-06, |
| "loss": 0.0346, |
| "step": 800 |
| }, |
| { |
| "epoch": 24.848484848484848, |
| "grad_norm": 0.6659572720527649, |
| "learning_rate": 2.9653357923290753e-06, |
| "loss": 0.0394, |
| "step": 820 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_loss": 0.02064535580575466, |
| "eval_runtime": 0.9419, |
| "eval_samples_per_second": 140.14, |
| "eval_steps_per_second": 18.048, |
| "step": 825 |
| }, |
| { |
| "epoch": 25.454545454545453, |
| "grad_norm": 0.477318674325943, |
| "learning_rate": 2.86102133899045e-06, |
| "loss": 0.0354, |
| "step": 840 |
| }, |
| { |
| "epoch": 26.0, |
| "eval_loss": 0.017997996881604195, |
| "eval_runtime": 0.9337, |
| "eval_samples_per_second": 141.375, |
| "eval_steps_per_second": 18.207, |
| "step": 858 |
| }, |
| { |
| "epoch": 26.060606060606062, |
| "grad_norm": 0.4188827574253082, |
| "learning_rate": 2.7560606738120947e-06, |
| "loss": 0.0379, |
| "step": 860 |
| }, |
| { |
| "epoch": 26.666666666666668, |
| "grad_norm": 0.37732234597206116, |
| "learning_rate": 2.6506416716291466e-06, |
| "loss": 0.0369, |
| "step": 880 |
| }, |
| { |
| "epoch": 27.0, |
| "eval_loss": 0.01666710339486599, |
| "eval_runtime": 0.9423, |
| "eval_samples_per_second": 140.084, |
| "eval_steps_per_second": 18.041, |
| "step": 891 |
| }, |
| { |
| "epoch": 27.272727272727273, |
| "grad_norm": 0.5058871507644653, |
| "learning_rate": 2.544953027679216e-06, |
| "loss": 0.0327, |
| "step": 900 |
| }, |
| { |
| "epoch": 27.87878787878788, |
| "grad_norm": 0.5595805644989014, |
| "learning_rate": 2.4391839198464613e-06, |
| "loss": 0.0338, |
| "step": 920 |
| }, |
| { |
| "epoch": 28.0, |
| "eval_loss": 0.015013493597507477, |
| "eval_runtime": 0.93, |
| "eval_samples_per_second": 141.934, |
| "eval_steps_per_second": 18.279, |
| "step": 924 |
| }, |
| { |
| "epoch": 28.484848484848484, |
| "grad_norm": 0.4609270393848419, |
| "learning_rate": 2.3335236700417404e-06, |
| "loss": 0.0306, |
| "step": 940 |
| }, |
| { |
| "epoch": 29.0, |
| "eval_loss": 0.014108900912106037, |
| "eval_runtime": 0.9248, |
| "eval_samples_per_second": 142.737, |
| "eval_steps_per_second": 18.383, |
| "step": 957 |
| }, |
| { |
| "epoch": 29.09090909090909, |
| "grad_norm": 0.3746008276939392, |
| "learning_rate": 2.2281614053249796e-06, |
| "loss": 0.0307, |
| "step": 960 |
| }, |
| { |
| "epoch": 29.696969696969695, |
| "grad_norm": 0.5330935716629028, |
| "learning_rate": 2.1232857193762923e-06, |
| "loss": 0.0298, |
| "step": 980 |
| }, |
| { |
| "epoch": 30.0, |
| "eval_loss": 0.013387720100581646, |
| "eval_runtime": 0.9587, |
| "eval_samples_per_second": 137.688, |
| "eval_steps_per_second": 17.733, |
| "step": 990 |
| }, |
| { |
| "epoch": 30.303030303030305, |
| "grad_norm": 0.31854015588760376, |
| "learning_rate": 2.019084334921849e-06, |
| "loss": 0.028, |
| "step": 1000 |
| }, |
| { |
| "epoch": 30.90909090909091, |
| "grad_norm": 0.38515424728393555, |
| "learning_rate": 1.9157437677186903e-06, |
| "loss": 0.031, |
| "step": 1020 |
| }, |
| { |
| "epoch": 31.0, |
| "eval_loss": 0.012279902584850788, |
| "eval_runtime": 0.9342, |
| "eval_samples_per_second": 141.305, |
| "eval_steps_per_second": 18.198, |
| "step": 1023 |
| }, |
| { |
| "epoch": 31.515151515151516, |
| "grad_norm": 0.45346567034721375, |
| "learning_rate": 1.8134489926999837e-06, |
| "loss": 0.033, |
| "step": 1040 |
| }, |
| { |
| "epoch": 32.0, |
| "eval_loss": 0.011261457577347755, |
| "eval_runtime": 0.9421, |
| "eval_samples_per_second": 140.119, |
| "eval_steps_per_second": 18.046, |
| "step": 1056 |
| }, |
| { |
| "epoch": 32.121212121212125, |
| "grad_norm": 0.44893690943717957, |
| "learning_rate": 1.7123831128782686e-06, |
| "loss": 0.0246, |
| "step": 1060 |
| }, |
| { |
| "epoch": 32.72727272727273, |
| "grad_norm": 0.4021283984184265, |
| "learning_rate": 1.612727031599356e-06, |
| "loss": 0.03, |
| "step": 1080 |
| }, |
| { |
| "epoch": 33.0, |
| "eval_loss": 0.010378457605838776, |
| "eval_runtime": 0.9543, |
| "eval_samples_per_second": 138.318, |
| "eval_steps_per_second": 17.814, |
| "step": 1089 |
| }, |
| { |
| "epoch": 33.333333333333336, |
| "grad_norm": 0.6586357951164246, |
| "learning_rate": 1.5146591287335452e-06, |
| "loss": 0.0266, |
| "step": 1100 |
| }, |
| { |
| "epoch": 33.93939393939394, |
| "grad_norm": 0.4133249521255493, |
| "learning_rate": 1.4183549413837288e-06, |
| "loss": 0.026, |
| "step": 1120 |
| }, |
| { |
| "epoch": 34.0, |
| "eval_loss": 0.009874224662780762, |
| "eval_runtime": 0.9426, |
| "eval_samples_per_second": 140.041, |
| "eval_steps_per_second": 18.036, |
| "step": 1122 |
| }, |
| { |
| "epoch": 34.54545454545455, |
| "grad_norm": 0.4618055522441864, |
| "learning_rate": 1.3239868496819407e-06, |
| "loss": 0.0278, |
| "step": 1140 |
| }, |
| { |
| "epoch": 35.0, |
| "eval_loss": 0.009275372140109539, |
| "eval_runtime": 0.9399, |
| "eval_samples_per_second": 140.441, |
| "eval_steps_per_second": 18.087, |
| "step": 1155 |
| }, |
| { |
| "epoch": 35.15151515151515, |
| "grad_norm": 0.3481239676475525, |
| "learning_rate": 1.2317237682367178e-06, |
| "loss": 0.0253, |
| "step": 1160 |
| }, |
| { |
| "epoch": 35.75757575757576, |
| "grad_norm": 0.42644399404525757, |
| "learning_rate": 1.1417308437836181e-06, |
| "loss": 0.0269, |
| "step": 1180 |
| }, |
| { |
| "epoch": 36.0, |
| "eval_loss": 0.008818729780614376, |
| "eval_runtime": 0.9417, |
| "eval_samples_per_second": 140.172, |
| "eval_steps_per_second": 18.052, |
| "step": 1188 |
| }, |
| { |
| "epoch": 36.36363636363637, |
| "grad_norm": 0.5186192393302917, |
| "learning_rate": 1.0541691595800338e-06, |
| "loss": 0.0231, |
| "step": 1200 |
| }, |
| { |
| "epoch": 36.96969696969697, |
| "grad_norm": 0.3892291784286499, |
| "learning_rate": 9.691954470734692e-07, |
| "loss": 0.0273, |
| "step": 1220 |
| }, |
| { |
| "epoch": 37.0, |
| "eval_loss": 0.008245617151260376, |
| "eval_runtime": 0.9541, |
| "eval_samples_per_second": 138.353, |
| "eval_steps_per_second": 17.818, |
| "step": 1221 |
| }, |
| { |
| "epoch": 37.57575757575758, |
| "grad_norm": 0.5318649411201477, |
| "learning_rate": 8.869618053593429e-07, |
| "loss": 0.0251, |
| "step": 1240 |
| }, |
| { |
| "epoch": 38.0, |
| "eval_loss": 0.007936290465295315, |
| "eval_runtime": 0.9281, |
| "eval_samples_per_second": 142.225, |
| "eval_steps_per_second": 18.317, |
| "step": 1254 |
| }, |
| { |
| "epoch": 38.18181818181818, |
| "grad_norm": 0.37824392318725586, |
| "learning_rate": 8.076154289305019e-07, |
| "loss": 0.0258, |
| "step": 1260 |
| }, |
| { |
| "epoch": 38.78787878787879, |
| "grad_norm": 0.402786523103714, |
| "learning_rate": 7.312983442057497e-07, |
| "loss": 0.0263, |
| "step": 1280 |
| }, |
| { |
| "epoch": 39.0, |
| "eval_loss": 0.007719958666712046, |
| "eval_runtime": 0.9423, |
| "eval_samples_per_second": 140.079, |
| "eval_steps_per_second": 18.041, |
| "step": 1287 |
| }, |
| { |
| "epoch": 39.39393939393939, |
| "grad_norm": 0.41106194257736206, |
| "learning_rate": 6.581471553089874e-07, |
| "loss": 0.0243, |
| "step": 1300 |
| }, |
| { |
| "epoch": 40.0, |
| "grad_norm": 0.5733346939086914, |
| "learning_rate": 5.882927995540266e-07, |
| "loss": 0.0247, |
| "step": 1320 |
| }, |
| { |
| "epoch": 40.0, |
| "eval_loss": 0.00738176517188549, |
| "eval_runtime": 0.9247, |
| "eval_samples_per_second": 142.756, |
| "eval_steps_per_second": 18.385, |
| "step": 1320 |
| }, |
| { |
| "epoch": 40.60606060606061, |
| "grad_norm": 0.3024619221687317, |
| "learning_rate": 5.218603130727243e-07, |
| "loss": 0.0243, |
| "step": 1340 |
| }, |
| { |
| "epoch": 41.0, |
| "eval_loss": 0.007284797262400389, |
| "eval_runtime": 0.9333, |
| "eval_samples_per_second": 141.426, |
| "eval_steps_per_second": 18.214, |
| "step": 1353 |
| }, |
| { |
| "epoch": 41.21212121212121, |
| "grad_norm": 0.48434221744537354, |
| "learning_rate": 4.589686070059762e-07, |
| "loss": 0.0245, |
| "step": 1360 |
| }, |
| { |
| "epoch": 41.81818181818182, |
| "grad_norm": 0.4191039204597473, |
| "learning_rate": 3.997302546581597e-07, |
| "loss": 0.0259, |
| "step": 1380 |
| }, |
| { |
| "epoch": 42.0, |
| "eval_loss": 0.007058488205075264, |
| "eval_runtime": 0.9549, |
| "eval_samples_per_second": 138.232, |
| "eval_steps_per_second": 17.803, |
| "step": 1386 |
| }, |
| { |
| "epoch": 42.42424242424242, |
| "grad_norm": 0.3011131286621094, |
| "learning_rate": 3.4425128999602265e-07, |
| "loss": 0.0234, |
| "step": 1400 |
| }, |
| { |
| "epoch": 43.0, |
| "eval_loss": 0.006919534411281347, |
| "eval_runtime": 0.9466, |
| "eval_samples_per_second": 139.447, |
| "eval_steps_per_second": 17.959, |
| "step": 1419 |
| }, |
| { |
| "epoch": 43.03030303030303, |
| "grad_norm": 0.49921727180480957, |
| "learning_rate": 2.9263101785268253e-07, |
| "loss": 0.0268, |
| "step": 1420 |
| }, |
| { |
| "epoch": 43.63636363636363, |
| "grad_norm": 0.41327381134033203, |
| "learning_rate": 2.449618361764788e-07, |
| "loss": 0.0232, |
| "step": 1440 |
| }, |
| { |
| "epoch": 44.0, |
| "eval_loss": 0.0067825643345713615, |
| "eval_runtime": 0.9457, |
| "eval_samples_per_second": 139.574, |
| "eval_steps_per_second": 17.975, |
| "step": 1452 |
| }, |
| { |
| "epoch": 44.24242424242424, |
| "grad_norm": 0.42085084319114685, |
| "learning_rate": 2.0132907064282837e-07, |
| "loss": 0.0222, |
| "step": 1460 |
| }, |
| { |
| "epoch": 44.84848484848485, |
| "grad_norm": 0.4812968075275421, |
| "learning_rate": 1.6181082192513352e-07, |
| "loss": 0.0245, |
| "step": 1480 |
| }, |
| { |
| "epoch": 45.0, |
| "eval_loss": 0.006708750035613775, |
| "eval_runtime": 0.944, |
| "eval_samples_per_second": 139.826, |
| "eval_steps_per_second": 18.008, |
| "step": 1485 |
| }, |
| { |
| "epoch": 45.45454545454545, |
| "grad_norm": 0.41410696506500244, |
| "learning_rate": 1.264778258981178e-07, |
| "loss": 0.0234, |
| "step": 1500 |
| }, |
| { |
| "epoch": 46.0, |
| "eval_loss": 0.006636774633079767, |
| "eval_runtime": 0.9246, |
| "eval_samples_per_second": 142.77, |
| "eval_steps_per_second": 18.387, |
| "step": 1518 |
| }, |
| { |
| "epoch": 46.06060606060606, |
| "grad_norm": 0.36860212683677673, |
| "learning_rate": 9.539332702381026e-08, |
| "loss": 0.0264, |
| "step": 1520 |
| }, |
| { |
| "epoch": 46.666666666666664, |
| "grad_norm": 0.3396029770374298, |
| "learning_rate": 6.86129651468273e-08, |
| "loss": 0.0229, |
| "step": 1540 |
| }, |
| { |
| "epoch": 47.0, |
| "eval_loss": 0.00661947438493371, |
| "eval_runtime": 0.9503, |
| "eval_samples_per_second": 138.905, |
| "eval_steps_per_second": 17.889, |
| "step": 1551 |
| }, |
| { |
| "epoch": 47.27272727272727, |
| "grad_norm": 0.4342035949230194, |
| "learning_rate": 4.618467590157133e-08, |
| "loss": 0.0233, |
| "step": 1560 |
| }, |
| { |
| "epoch": 47.878787878787875, |
| "grad_norm": 0.4722955822944641, |
| "learning_rate": 2.814860490961607e-08, |
| "loss": 0.0248, |
| "step": 1580 |
| }, |
| { |
| "epoch": 48.0, |
| "eval_loss": 0.006618270184844732, |
| "eval_runtime": 0.9496, |
| "eval_samples_per_second": 139.011, |
| "eval_steps_per_second": 17.903, |
| "step": 1584 |
| }, |
| { |
| "epoch": 48.484848484848484, |
| "grad_norm": 0.22162474691867828, |
| "learning_rate": 1.453703592086353e-08, |
| "loss": 0.0239, |
| "step": 1600 |
| }, |
| { |
| "epoch": 49.0, |
| "eval_loss": 0.006619932595640421, |
| "eval_runtime": 0.947, |
| "eval_samples_per_second": 139.389, |
| "eval_steps_per_second": 17.952, |
| "step": 1617 |
| }, |
| { |
| "epoch": 49.09090909090909, |
| "grad_norm": 0.4756720960140228, |
| "learning_rate": 5.374333027093892e-09, |
| "loss": 0.0236, |
| "step": 1620 |
| }, |
| { |
| "epoch": 49.696969696969695, |
| "grad_norm": 0.42004162073135376, |
| "learning_rate": 6.768970513457151e-10, |
| "loss": 0.0228, |
| "step": 1640 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_loss": 0.006614842917770147, |
| "eval_runtime": 0.9549, |
| "eval_samples_per_second": 138.227, |
| "eval_steps_per_second": 17.802, |
| "step": 1650 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 1650, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.91340755288064e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|