| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.17142857142857143, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 2693.6875610351562, |
| "entropy": 0.3662109375, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.12395373731851578, |
| "kl": 0.0, |
| "learning_rate": 6.666666666666667e-08, |
| "loss": 0.0, |
| "reward": 0.7708333535119891, |
| "reward_std": 0.4629540964961052, |
| "rewards/accuracy_reward": 0.25000001303851604, |
| "rewards/format_reward": 0.5208333386108279, |
| "step": 1 |
| }, |
| { |
| "completion_length": 3127.3958435058594, |
| "entropy": 0.353515625, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.14846429228782654, |
| "kl": 0.0, |
| "learning_rate": 1.3333333333333334e-07, |
| "loss": 0.0, |
| "reward": 0.6458333637565374, |
| "reward_std": 0.4249730706214905, |
| "rewards/accuracy_reward": 0.2812500102445483, |
| "rewards/format_reward": 0.3645833386108279, |
| "step": 2 |
| }, |
| { |
| "completion_length": 3685.041748046875, |
| "entropy": 0.4443359375, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.10399040579795837, |
| "kl": 4.1425228118896484e-05, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "reward": 0.23958333674818277, |
| "reward_std": 0.3668827787041664, |
| "rewards/accuracy_reward": 0.0729166679084301, |
| "rewards/format_reward": 0.16666667256504297, |
| "step": 3 |
| }, |
| { |
| "completion_length": 2380.291778564453, |
| "entropy": 0.40478515625, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.16352659463882446, |
| "kl": 3.409385681152344e-05, |
| "learning_rate": 2.6666666666666667e-07, |
| "loss": 0.0, |
| "reward": 0.8229166865348816, |
| "reward_std": 0.507609948515892, |
| "rewards/accuracy_reward": 0.19791667722165585, |
| "rewards/format_reward": 0.6250000223517418, |
| "step": 4 |
| }, |
| { |
| "completion_length": 3441.2188720703125, |
| "entropy": 0.45458984375, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.15812984108924866, |
| "kl": 4.1961669921875e-05, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.42708334885537624, |
| "reward_std": 0.5058739930391312, |
| "rewards/accuracy_reward": 0.07291666697710752, |
| "rewards/format_reward": 0.35416667722165585, |
| "step": 5 |
| }, |
| { |
| "completion_length": 3382.3438110351562, |
| "entropy": 0.45166015625, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.15454305708408356, |
| "kl": 4.26173210144043e-05, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "reward": 0.40625000558793545, |
| "reward_std": 0.5202516540884972, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.3229166744276881, |
| "step": 6 |
| }, |
| { |
| "completion_length": 3277.291748046875, |
| "entropy": 0.39404296875, |
| "epoch": 0.008, |
| "grad_norm": 0.13690507411956787, |
| "kl": 2.562999725341797e-05, |
| "learning_rate": 4.6666666666666666e-07, |
| "loss": 0.0, |
| "reward": 0.8854166865348816, |
| "reward_std": 0.6845719665288925, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/format_reward": 0.6145833432674408, |
| "step": 7 |
| }, |
| { |
| "completion_length": 2841.916748046875, |
| "entropy": 0.36083984375, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.1767321527004242, |
| "kl": 2.4050474166870117e-05, |
| "learning_rate": 5.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.8854166967794299, |
| "reward_std": 0.3672378845512867, |
| "rewards/accuracy_reward": 0.3958333535119891, |
| "rewards/format_reward": 0.4895833460614085, |
| "step": 8 |
| }, |
| { |
| "completion_length": 3480.6563110351562, |
| "entropy": 0.4384765625, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.15406936407089233, |
| "kl": 3.796815872192383e-05, |
| "learning_rate": 6e-07, |
| "loss": 0.0, |
| "reward": 0.5520833432674408, |
| "reward_std": 0.6496799141168594, |
| "rewards/accuracy_reward": 0.17708333488553762, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 9 |
| }, |
| { |
| "completion_length": 2963.572967529297, |
| "entropy": 0.3544921875, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.15688633918762207, |
| "kl": 2.5287270545959473e-05, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.0, |
| "reward": 0.5937500223517418, |
| "reward_std": 0.5099271312355995, |
| "rewards/accuracy_reward": 0.17708333861082792, |
| "rewards/format_reward": 0.4166666753590107, |
| "step": 10 |
| }, |
| { |
| "completion_length": 3573.7500610351562, |
| "entropy": 0.37890625, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.12983083724975586, |
| "kl": 2.5391578674316406e-05, |
| "learning_rate": 7.333333333333332e-07, |
| "loss": 0.0, |
| "reward": 0.3125000111758709, |
| "reward_std": 0.5802810192108154, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/format_reward": 0.20833334140479565, |
| "step": 11 |
| }, |
| { |
| "completion_length": 2520.8958740234375, |
| "entropy": 0.39111328125, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.20449091494083405, |
| "kl": 3.743171691894531e-05, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "reward": 0.8020833656191826, |
| "reward_std": 0.4411254972219467, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/format_reward": 0.6562500223517418, |
| "step": 12 |
| }, |
| { |
| "completion_length": 3038.041748046875, |
| "entropy": 0.3828125, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.14574433863162994, |
| "kl": 2.5153160095214844e-05, |
| "learning_rate": 8.666666666666667e-07, |
| "loss": 0.0, |
| "reward": 0.6875000298023224, |
| "reward_std": 0.3254704251885414, |
| "rewards/accuracy_reward": 0.22916666697710752, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 13 |
| }, |
| { |
| "completion_length": 3116.3125610351562, |
| "entropy": 0.37109375, |
| "epoch": 0.016, |
| "grad_norm": 0.202586367726326, |
| "kl": 1.9026920199394226e-05, |
| "learning_rate": 9.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.5833333507180214, |
| "reward_std": 0.4630111753940582, |
| "rewards/accuracy_reward": 0.21875001024454832, |
| "rewards/format_reward": 0.3645833395421505, |
| "step": 14 |
| }, |
| { |
| "completion_length": 2924.0521240234375, |
| "entropy": 0.36328125, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.09130721539258957, |
| "kl": 1.4469027519226074e-05, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "reward": 0.604166679084301, |
| "reward_std": 0.22134994342923164, |
| "rewards/accuracy_reward": 0.1979166716337204, |
| "rewards/format_reward": 0.4062500074505806, |
| "step": 15 |
| }, |
| { |
| "completion_length": 3887.5521850585938, |
| "entropy": 0.4755859375, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.11806491017341614, |
| "kl": 2.9832124710083008e-05, |
| "learning_rate": 9.998781585307575e-07, |
| "loss": 0.0, |
| "reward": 0.11458333674818277, |
| "reward_std": 0.26997610181570053, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.07291666977107525, |
| "step": 16 |
| }, |
| { |
| "completion_length": 2579.625030517578, |
| "entropy": 0.44091796875, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.2032601535320282, |
| "kl": 3.93986701965332e-05, |
| "learning_rate": 9.99512700102336e-07, |
| "loss": 0.0, |
| "reward": 0.7083333507180214, |
| "reward_std": 0.39187028259038925, |
| "rewards/accuracy_reward": 0.19791667442768812, |
| "rewards/format_reward": 0.5104166753590107, |
| "step": 17 |
| }, |
| { |
| "completion_length": 3089.104248046875, |
| "entropy": 0.3671875, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.11376938223838806, |
| "kl": 1.2531876564025879e-05, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0, |
| "reward": 0.5625000251457095, |
| "reward_std": 0.35285963863134384, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.3958333386108279, |
| "step": 18 |
| }, |
| { |
| "completion_length": 3130.760498046875, |
| "entropy": 0.39111328125, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.09636794775724411, |
| "kl": 2.8267502784729004e-05, |
| "learning_rate": 9.98051855792412e-07, |
| "loss": 0.0, |
| "reward": 0.8125000111758709, |
| "reward_std": 0.3496965616941452, |
| "rewards/accuracy_reward": 0.36458333395421505, |
| "rewards/format_reward": 0.44791667722165585, |
| "step": 19 |
| }, |
| { |
| "completion_length": 2585.9896545410156, |
| "entropy": 0.329833984375, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.15105831623077393, |
| "kl": 6.628036499023438e-05, |
| "learning_rate": 9.969572609838744e-07, |
| "loss": 0.0, |
| "reward": 0.9791666716337204, |
| "reward_std": 0.3452813923358917, |
| "rewards/accuracy_reward": 0.2812500037252903, |
| "rewards/format_reward": 0.6979166716337204, |
| "step": 20 |
| }, |
| { |
| "completion_length": 2804.229248046875, |
| "entropy": 0.42578125, |
| "epoch": 0.024, |
| "grad_norm": 0.2109123021364212, |
| "kl": 0.00016552209854125977, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0, |
| "reward": 0.6145833432674408, |
| "reward_std": 0.4177238382399082, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/format_reward": 0.46875002048909664, |
| "step": 21 |
| }, |
| { |
| "completion_length": 1903.1459045410156, |
| "entropy": 0.419921875, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.20996998250484467, |
| "kl": 0.00026351213455200195, |
| "learning_rate": 9.940426894506606e-07, |
| "loss": 0.0, |
| "reward": 1.1041667014360428, |
| "reward_std": 0.4033822976052761, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/format_reward": 0.8125000149011612, |
| "step": 22 |
| }, |
| { |
| "completion_length": 2714.0729370117188, |
| "entropy": 0.36865234375, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.16544093191623688, |
| "kl": 0.00011658668518066406, |
| "learning_rate": 9.922242910178859e-07, |
| "loss": 0.0, |
| "reward": 0.6770833507180214, |
| "reward_std": 0.6271640285849571, |
| "rewards/accuracy_reward": 0.1770833432674408, |
| "rewards/format_reward": 0.5000000223517418, |
| "step": 23 |
| }, |
| { |
| "completion_length": 2834.2396850585938, |
| "entropy": 0.373046875, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.10939397662878036, |
| "kl": 0.0001084059476852417, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.5711240321397781, |
| "rewards/accuracy_reward": 0.2187500074505806, |
| "rewards/format_reward": 0.572916679084301, |
| "step": 24 |
| }, |
| { |
| "completion_length": 2877.1354370117188, |
| "entropy": 0.4296875, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.10193013399839401, |
| "kl": 0.00018364191055297852, |
| "learning_rate": 9.878701917609207e-07, |
| "loss": 0.0, |
| "reward": 0.677083358168602, |
| "reward_std": 0.2898401468992233, |
| "rewards/accuracy_reward": 0.2395833432674408, |
| "rewards/format_reward": 0.4375, |
| "step": 25 |
| }, |
| { |
| "completion_length": 3221.2396850585938, |
| "entropy": 0.4248046875, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.07458896934986115, |
| "kl": 3.0487775802612305e-05, |
| "learning_rate": 9.853368487582886e-07, |
| "loss": 0.0, |
| "reward": 0.6562500149011612, |
| "reward_std": 0.25371449440717697, |
| "rewards/accuracy_reward": 0.19791666977107525, |
| "rewards/format_reward": 0.4583333358168602, |
| "step": 26 |
| }, |
| { |
| "completion_length": 3297.3959350585938, |
| "entropy": 0.45703125, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.0925775095820427, |
| "kl": 0.00012201815843582153, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0, |
| "reward": 0.541666679084301, |
| "reward_std": 0.4426998719573021, |
| "rewards/accuracy_reward": 0.15625000279396772, |
| "rewards/format_reward": 0.385416679084301, |
| "step": 27 |
| }, |
| { |
| "completion_length": 2984.2188110351562, |
| "entropy": 0.3994140625, |
| "epoch": 0.032, |
| "grad_norm": 0.12723609805107117, |
| "kl": 0.00015980005264282227, |
| "learning_rate": 9.795644345114794e-07, |
| "loss": 0.0, |
| "reward": 0.8437500447034836, |
| "reward_std": 0.48607436567544937, |
| "rewards/accuracy_reward": 0.3333333460614085, |
| "rewards/format_reward": 0.5104166865348816, |
| "step": 28 |
| }, |
| { |
| "completion_length": 3707.2501220703125, |
| "entropy": 0.43408203125, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.20130394399166107, |
| "kl": 0.0003858804702758789, |
| "learning_rate": 9.76328489131448e-07, |
| "loss": 0.0, |
| "reward": 0.2500000027939677, |
| "reward_std": 0.37919554859399796, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.18750001024454832, |
| "step": 29 |
| }, |
| { |
| "completion_length": 3099.9063110351562, |
| "entropy": 0.384765625, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.13564985990524292, |
| "kl": 0.0005750656127929688, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0, |
| "reward": 0.8854166828095913, |
| "reward_std": 0.5532158613204956, |
| "rewards/accuracy_reward": 0.3125000009313226, |
| "rewards/format_reward": 0.572916679084301, |
| "step": 30 |
| }, |
| { |
| "completion_length": 3310.8125610351562, |
| "entropy": 0.40087890625, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.14703762531280518, |
| "kl": 0.0007152557373046875, |
| "learning_rate": 9.69165882516764e-07, |
| "loss": 0.0, |
| "reward": 0.4583333469927311, |
| "reward_std": 0.4937985762953758, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 31 |
| }, |
| { |
| "completion_length": 3543.666748046875, |
| "entropy": 0.4521484375, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.11301636695861816, |
| "kl": 0.00032842159271240234, |
| "learning_rate": 9.65243099959949e-07, |
| "loss": 0.0, |
| "reward": 0.6250000223517418, |
| "reward_std": 0.46596524864435196, |
| "rewards/accuracy_reward": 0.28125000558793545, |
| "rewards/format_reward": 0.34375, |
| "step": 32 |
| }, |
| { |
| "completion_length": 3395.947998046875, |
| "entropy": 0.384765625, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.12107253074645996, |
| "kl": 0.00042450428009033203, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0, |
| "reward": 0.604166679084301, |
| "reward_std": 0.5497709587216377, |
| "rewards/accuracy_reward": 0.18750000093132257, |
| "rewards/format_reward": 0.41666667722165585, |
| "step": 33 |
| }, |
| { |
| "completion_length": 2621.218780517578, |
| "entropy": 0.45263671875, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.15317150950431824, |
| "kl": 0.0013637542724609375, |
| "learning_rate": 9.567251964768342e-07, |
| "loss": 0.0001, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.4670567326247692, |
| "rewards/accuracy_reward": 0.31250001303851604, |
| "rewards/format_reward": 0.5416666828095913, |
| "step": 34 |
| }, |
| { |
| "completion_length": 3166.3958740234375, |
| "entropy": 0.43115234375, |
| "epoch": 0.04, |
| "grad_norm": 0.1469903290271759, |
| "kl": 0.0011509060859680176, |
| "learning_rate": 9.521346881455354e-07, |
| "loss": 0.0, |
| "reward": 0.6458333656191826, |
| "reward_std": 0.6130613833665848, |
| "rewards/accuracy_reward": 0.23958333767950535, |
| "rewards/format_reward": 0.4062500149011612, |
| "step": 35 |
| }, |
| { |
| "completion_length": 3509.697998046875, |
| "entropy": 0.513671875, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.11033376306295395, |
| "kl": 0.0011191368103027344, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0, |
| "reward": 0.23958333395421505, |
| "reward_std": 0.24118434637784958, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.20833334140479565, |
| "step": 36 |
| }, |
| { |
| "completion_length": 3363.8333740234375, |
| "entropy": 0.42138671875, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.11778294295072556, |
| "kl": 0.0008115768432617188, |
| "learning_rate": 9.42302986163543e-07, |
| "loss": 0.0, |
| "reward": 0.2812500149011612, |
| "reward_std": 0.13804075866937637, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.25, |
| "step": 37 |
| }, |
| { |
| "completion_length": 3610.8438110351562, |
| "entropy": 0.44677734375, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.061884235590696335, |
| "kl": 0.0005736351013183594, |
| "learning_rate": 9.370671165529144e-07, |
| "loss": 0.0, |
| "reward": 0.21875000558793545, |
| "reward_std": 0.17128896713256836, |
| "rewards/accuracy_reward": 0.10416666697710752, |
| "rewards/format_reward": 0.11458333861082792, |
| "step": 38 |
| }, |
| { |
| "completion_length": 2926.4063110351562, |
| "entropy": 0.36669921875, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.1068028062582016, |
| "kl": 0.0011527538299560547, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0, |
| "reward": 0.7708333656191826, |
| "reward_std": 0.1930682435631752, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.5208333507180214, |
| "step": 39 |
| }, |
| { |
| "completion_length": 2785.1146545410156, |
| "entropy": 0.388671875, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.17572174966335297, |
| "kl": 0.0032024383544921875, |
| "learning_rate": 9.259695151358214e-07, |
| "loss": 0.0001, |
| "reward": 0.7291666902601719, |
| "reward_std": 0.3721684589982033, |
| "rewards/accuracy_reward": 0.1979166716337204, |
| "rewards/format_reward": 0.5312500186264515, |
| "step": 40 |
| }, |
| { |
| "completion_length": 3123.947998046875, |
| "entropy": 0.35791015625, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.144905224442482, |
| "kl": 0.0007574558258056641, |
| "learning_rate": 9.20113792876298e-07, |
| "loss": 0.0, |
| "reward": 0.5416666865348816, |
| "reward_std": 0.4893290549516678, |
| "rewards/accuracy_reward": 0.12500000651925802, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 41 |
| }, |
| { |
| "completion_length": 3056.197998046875, |
| "entropy": 0.48193359375, |
| "epoch": 0.048, |
| "grad_norm": 0.07001210004091263, |
| "kl": 0.000598907470703125, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0, |
| "reward": 0.30208333395421505, |
| "reward_std": 0.15690935403108597, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/format_reward": 0.2812500009313226, |
| "step": 42 |
| }, |
| { |
| "completion_length": 3097.479248046875, |
| "entropy": 0.4033203125, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.09348543733358383, |
| "kl": 0.001148223876953125, |
| "learning_rate": 9.078043584226815e-07, |
| "loss": 0.0, |
| "reward": 0.4895833358168602, |
| "reward_std": 0.3020758181810379, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.32291666977107525, |
| "step": 43 |
| }, |
| { |
| "completion_length": 2797.7084197998047, |
| "entropy": 0.39013671875, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.15556485950946808, |
| "kl": 0.0014767646789550781, |
| "learning_rate": 9.013573120044966e-07, |
| "loss": 0.0001, |
| "reward": 0.8020833386108279, |
| "reward_std": 0.3607782945036888, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/format_reward": 0.5312500102445483, |
| "step": 44 |
| }, |
| { |
| "completion_length": 3618.791748046875, |
| "entropy": 0.423828125, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.09923144429922104, |
| "kl": 0.0026645660400390625, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0001, |
| "reward": 0.5833333348855376, |
| "reward_std": 0.4189528524875641, |
| "rewards/accuracy_reward": 0.2604166716337204, |
| "rewards/format_reward": 0.3229166781529784, |
| "step": 45 |
| }, |
| { |
| "completion_length": 3482.479248046875, |
| "entropy": 0.50634765625, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.12269324064254761, |
| "kl": 0.0013875961303710938, |
| "learning_rate": 8.878960148416747e-07, |
| "loss": 0.0001, |
| "reward": 0.22916667722165585, |
| "reward_std": 0.26679350435733795, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/format_reward": 0.18750000186264515, |
| "step": 46 |
| }, |
| { |
| "completion_length": 2958.291748046875, |
| "entropy": 0.390625, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.16963143646717072, |
| "kl": 0.0011917352676391602, |
| "learning_rate": 8.808890536269229e-07, |
| "loss": 0.0, |
| "reward": 0.8854166967794299, |
| "reward_std": 0.5451135858893394, |
| "rewards/accuracy_reward": 0.3437500149011612, |
| "rewards/format_reward": 0.5416666669771075, |
| "step": 47 |
| }, |
| { |
| "completion_length": 2956.416717529297, |
| "entropy": 0.396484375, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.14105089008808136, |
| "kl": 0.0033426284790039062, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0001, |
| "reward": 0.7395833507180214, |
| "reward_std": 0.5457281768321991, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/format_reward": 0.4479166679084301, |
| "step": 48 |
| }, |
| { |
| "completion_length": 2448.1146850585938, |
| "entropy": 0.36865234375, |
| "epoch": 0.056, |
| "grad_norm": 0.15970121324062347, |
| "kl": 0.006764888763427734, |
| "learning_rate": 8.663414758415478e-07, |
| "loss": 0.0003, |
| "reward": 0.895833395421505, |
| "reward_std": 0.464010052382946, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.6458333507180214, |
| "step": 49 |
| }, |
| { |
| "completion_length": 3050.1041870117188, |
| "entropy": 0.34521484375, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.12386268377304077, |
| "kl": 0.0011911392211914062, |
| "learning_rate": 8.588087370409302e-07, |
| "loss": 0.0, |
| "reward": 0.6562500325962901, |
| "reward_std": 0.4570996016263962, |
| "rewards/accuracy_reward": 0.2916666818782687, |
| "rewards/format_reward": 0.3645833497866988, |
| "step": 50 |
| }, |
| { |
| "completion_length": 2495.2708740234375, |
| "entropy": 0.44873046875, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.12384030222892761, |
| "kl": 0.005596160888671875, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0002, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.3033446706831455, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 51 |
| }, |
| { |
| "completion_length": 3027.406280517578, |
| "entropy": 0.384765625, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.0901699811220169, |
| "kl": 0.0021982192993164062, |
| "learning_rate": 8.432457529696548e-07, |
| "loss": 0.0001, |
| "reward": 0.8750000298023224, |
| "reward_std": 0.5351639539003372, |
| "rewards/accuracy_reward": 0.3958333507180214, |
| "rewards/format_reward": 0.4791666865348816, |
| "step": 52 |
| }, |
| { |
| "completion_length": 2952.8646850585938, |
| "entropy": 0.41845703125, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.09236861765384674, |
| "kl": 0.0012669563293457031, |
| "learning_rate": 8.352239353294194e-07, |
| "loss": 0.0001, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.5214647725224495, |
| "rewards/accuracy_reward": 0.260416679084301, |
| "rewards/format_reward": 0.5937500074505806, |
| "step": 53 |
| }, |
| { |
| "completion_length": 2996.1250610351562, |
| "entropy": 0.3837890625, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.15135987102985382, |
| "kl": 0.0015659332275390625, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0001, |
| "reward": 0.9479166939854622, |
| "reward_std": 0.7639089524745941, |
| "rewards/accuracy_reward": 0.4062500111758709, |
| "rewards/format_reward": 0.5416666828095913, |
| "step": 54 |
| }, |
| { |
| "completion_length": 3076.2500610351562, |
| "entropy": 0.4130859375, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.12680813670158386, |
| "kl": 0.0023276805877685547, |
| "learning_rate": 8.187213662662538e-07, |
| "loss": 0.0001, |
| "reward": 0.6979166865348816, |
| "reward_std": 0.5675121322274208, |
| "rewards/accuracy_reward": 0.23958333861082792, |
| "rewards/format_reward": 0.458333358168602, |
| "step": 55 |
| }, |
| { |
| "completion_length": 3058.104248046875, |
| "entropy": 0.4072265625, |
| "epoch": 0.064, |
| "grad_norm": 0.10628776252269745, |
| "kl": 0.0009794235229492188, |
| "learning_rate": 8.102495512755938e-07, |
| "loss": 0.0, |
| "reward": 0.6562500298023224, |
| "reward_std": 0.3362164571881294, |
| "rewards/accuracy_reward": 0.19791666697710752, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 56 |
| }, |
| { |
| "completion_length": 3532.2813110351562, |
| "entropy": 0.3369140625, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.09391733258962631, |
| "kl": 0.0005993843078613281, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0, |
| "reward": 0.3854166865348816, |
| "reward_std": 0.3325711265206337, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.3020833432674408, |
| "step": 57 |
| }, |
| { |
| "completion_length": 2239.1145935058594, |
| "entropy": 0.322998046875, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.11698172241449356, |
| "kl": 0.0037631988525390625, |
| "learning_rate": 7.928877960781808e-07, |
| "loss": 0.0002, |
| "reward": 1.0000000223517418, |
| "reward_std": 0.39449498802423477, |
| "rewards/accuracy_reward": 0.2604166669771075, |
| "rewards/format_reward": 0.7395833358168602, |
| "step": 58 |
| }, |
| { |
| "completion_length": 3092.3438110351562, |
| "entropy": 0.3662109375, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.10882271081209183, |
| "kl": 0.001026153564453125, |
| "learning_rate": 7.840072575681468e-07, |
| "loss": 0.0, |
| "reward": 0.5625000298023224, |
| "reward_std": 0.39667778089642525, |
| "rewards/accuracy_reward": 0.19791667442768812, |
| "rewards/format_reward": 0.36458333395421505, |
| "step": 59 |
| }, |
| { |
| "completion_length": 3120.5209350585938, |
| "entropy": 0.38037109375, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.1319083720445633, |
| "kl": 0.0019273757934570312, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0001, |
| "reward": 0.583333358168602, |
| "reward_std": 0.49578939378261566, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.4479166902601719, |
| "step": 60 |
| }, |
| { |
| "completion_length": 2971.9166870117188, |
| "entropy": 0.36669921875, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.17734739184379578, |
| "kl": 0.0010924339294433594, |
| "learning_rate": 7.658709009626109e-07, |
| "loss": 0.0, |
| "reward": 0.8020833730697632, |
| "reward_std": 0.5149242952466011, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.5937500149011612, |
| "step": 61 |
| }, |
| { |
| "completion_length": 2529.7188720703125, |
| "entropy": 0.329833984375, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.26185768842697144, |
| "kl": 0.016622543334960938, |
| "learning_rate": 7.566249040241553e-07, |
| "loss": 0.0007, |
| "reward": 0.9270833656191826, |
| "reward_std": 0.4443442225456238, |
| "rewards/accuracy_reward": 0.27083334885537624, |
| "rewards/format_reward": 0.6562500149011612, |
| "step": 62 |
| }, |
| { |
| "completion_length": 2196.1250610351562, |
| "entropy": 0.36279296875, |
| "epoch": 0.072, |
| "grad_norm": 0.1202726885676384, |
| "kl": 0.0028791427612304688, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0001, |
| "reward": 1.1562500596046448, |
| "reward_std": 0.43789636343717575, |
| "rewards/accuracy_reward": 0.385416679084301, |
| "rewards/format_reward": 0.7708333432674408, |
| "step": 63 |
| }, |
| { |
| "completion_length": 3074.041717529297, |
| "entropy": 0.42041015625, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.10591074079275131, |
| "kl": 0.0019989013671875, |
| "learning_rate": 7.37802304516818e-07, |
| "loss": 0.0001, |
| "reward": 0.6250000149011612, |
| "reward_std": 0.4529266282916069, |
| "rewards/accuracy_reward": 0.18750000651925802, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 64 |
| }, |
| { |
| "completion_length": 2871.416748046875, |
| "entropy": 0.365478515625, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.16383042931556702, |
| "kl": 0.001850128173828125, |
| "learning_rate": 7.282358947176205e-07, |
| "loss": 0.0001, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.38071464747190475, |
| "rewards/accuracy_reward": 0.19791667722165585, |
| "rewards/format_reward": 0.4687500149011612, |
| "step": 65 |
| }, |
| { |
| "completion_length": 2014.6354370117188, |
| "entropy": 0.33642578125, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.26954010128974915, |
| "kl": 0.008558273315429688, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0003, |
| "reward": 0.9375000447034836, |
| "reward_std": 0.35192636400461197, |
| "rewards/accuracy_reward": 0.34375000838190317, |
| "rewards/format_reward": 0.59375, |
| "step": 66 |
| }, |
| { |
| "completion_length": 3587.479248046875, |
| "entropy": 0.36572265625, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.08370436728000641, |
| "kl": 0.001728057861328125, |
| "learning_rate": 7.08818754121241e-07, |
| "loss": 0.0001, |
| "reward": 0.22916667442768812, |
| "reward_std": 0.2259194478392601, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/format_reward": 0.21875000279396772, |
| "step": 67 |
| }, |
| { |
| "completion_length": 2018.3750457763672, |
| "entropy": 0.36474609375, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.18221524357795715, |
| "kl": 0.0046844482421875, |
| "learning_rate": 6.989785380482312e-07, |
| "loss": 0.0002, |
| "reward": 0.916666716337204, |
| "reward_std": 0.35510556399822235, |
| "rewards/accuracy_reward": 0.2604166716337204, |
| "rewards/format_reward": 0.6562500149011612, |
| "step": 68 |
| }, |
| { |
| "completion_length": 2231.1979370117188, |
| "entropy": 0.41796875, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.19485469162464142, |
| "kl": 0.0045928955078125, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0002, |
| "reward": 0.6666666939854622, |
| "reward_std": 0.3663952201604843, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.6041666865348816, |
| "step": 69 |
| }, |
| { |
| "completion_length": 3180.8854370117188, |
| "entropy": 0.38232421875, |
| "epoch": 0.08, |
| "grad_norm": 0.13257598876953125, |
| "kl": 0.00274658203125, |
| "learning_rate": 6.790614547199906e-07, |
| "loss": 0.0001, |
| "reward": 0.45833334885537624, |
| "reward_std": 0.4246904104948044, |
| "rewards/accuracy_reward": 0.07291666977107525, |
| "rewards/format_reward": 0.38541666977107525, |
| "step": 70 |
| }, |
| { |
| "completion_length": 2643.1771240234375, |
| "entropy": 0.43798828125, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.13770414888858795, |
| "kl": 0.0029726028442382812, |
| "learning_rate": 6.68995372916741e-07, |
| "loss": 0.0001, |
| "reward": 0.7500000149011612, |
| "reward_std": 0.30269280821084976, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.5000000149011612, |
| "step": 71 |
| }, |
| { |
| "completion_length": 2940.6146545410156, |
| "entropy": 0.4892578125, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.20104120671749115, |
| "kl": 0.0030574798583984375, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0001, |
| "reward": 0.5000000149011612, |
| "reward_std": 0.47806398570537567, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.4375000111758709, |
| "step": 72 |
| }, |
| { |
| "completion_length": 3779.4583740234375, |
| "entropy": 0.513671875, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.09181614220142365, |
| "kl": 0.0015926361083984375, |
| "learning_rate": 6.486753808845564e-07, |
| "loss": 0.0001, |
| "reward": 0.3229166744276881, |
| "reward_std": 0.4466712549328804, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.19791667070239782, |
| "step": 73 |
| }, |
| { |
| "completion_length": 3236.4688720703125, |
| "entropy": 0.4130859375, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.1495177149772644, |
| "kl": 0.0028803348541259766, |
| "learning_rate": 6.384324742897735e-07, |
| "loss": 0.0001, |
| "reward": 0.645833358168602, |
| "reward_std": 0.4744175747036934, |
| "rewards/accuracy_reward": 0.26041666977107525, |
| "rewards/format_reward": 0.385416679084301, |
| "step": 74 |
| }, |
| { |
| "completion_length": 3159.635498046875, |
| "entropy": 0.404296875, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.11836569011211395, |
| "kl": 0.0026121139526367188, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0001, |
| "reward": 0.697916679084301, |
| "reward_std": 0.46560388058423996, |
| "rewards/accuracy_reward": 0.22916666697710752, |
| "rewards/format_reward": 0.4687500111758709, |
| "step": 75 |
| }, |
| { |
| "completion_length": 2450.041748046875, |
| "entropy": 0.412353515625, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.1628679782152176, |
| "kl": 0.0018434524536132812, |
| "learning_rate": 6.178085705122674e-07, |
| "loss": 0.0001, |
| "reward": 0.6875, |
| "reward_std": 0.31900282949209213, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 76 |
| }, |
| { |
| "completion_length": 3208.3751220703125, |
| "entropy": 0.453125, |
| "epoch": 0.088, |
| "grad_norm": 0.1415146142244339, |
| "kl": 0.002140045166015625, |
| "learning_rate": 6.074387415372676e-07, |
| "loss": 0.0001, |
| "reward": 0.5104166818782687, |
| "reward_std": 0.44949568808078766, |
| "rewards/accuracy_reward": 0.09375000558793545, |
| "rewards/format_reward": 0.4166666744276881, |
| "step": 77 |
| }, |
| { |
| "completion_length": 2896.6771240234375, |
| "entropy": 0.3759765625, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.13142429292201996, |
| "kl": 0.0013580322265625, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0001, |
| "reward": 0.7500000298023224, |
| "reward_std": 0.5779594928026199, |
| "rewards/accuracy_reward": 0.250000006519258, |
| "rewards/format_reward": 0.5000000223517418, |
| "step": 78 |
| }, |
| { |
| "completion_length": 2312.0938110351562, |
| "entropy": 0.3544921875, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.1474093347787857, |
| "kl": 0.0019969940185546875, |
| "learning_rate": 5.866114036005362e-07, |
| "loss": 0.0001, |
| "reward": 0.8125000149011612, |
| "reward_std": 0.36936958134174347, |
| "rewards/accuracy_reward": 0.20833333674818277, |
| "rewards/format_reward": 0.604166679084301, |
| "step": 79 |
| }, |
| { |
| "completion_length": 3418.0833740234375, |
| "entropy": 0.49755859375, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.14068344235420227, |
| "kl": 0.002742767333984375, |
| "learning_rate": 5.761651730097142e-07, |
| "loss": 0.0001, |
| "reward": 0.5208333544433117, |
| "reward_std": 0.4246201291680336, |
| "rewards/accuracy_reward": 0.16666666697710752, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 80 |
| }, |
| { |
| "completion_length": 2992.0729370117188, |
| "entropy": 0.56787109375, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.1596606820821762, |
| "kl": 0.005756378173828125, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0002, |
| "reward": 0.4895833432674408, |
| "reward_std": 0.3185732662677765, |
| "rewards/accuracy_reward": 0.11458333395421505, |
| "rewards/format_reward": 0.3750000074505806, |
| "step": 81 |
| }, |
| { |
| "completion_length": 2483.9584350585938, |
| "entropy": 0.39306640625, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.13652034103870392, |
| "kl": 0.002727508544921875, |
| "learning_rate": 5.552358696106288e-07, |
| "loss": 0.0001, |
| "reward": 0.8020833432674408, |
| "reward_std": 0.24248424544930458, |
| "rewards/accuracy_reward": 0.3020833395421505, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 82 |
| }, |
| { |
| "completion_length": 2964.8646850585938, |
| "entropy": 0.48486328125, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.10738710314035416, |
| "kl": 0.00264739990234375, |
| "learning_rate": 5.447641303893714e-07, |
| "loss": 0.0001, |
| "reward": 0.5312500074505806, |
| "reward_std": 0.3985592797398567, |
| "rewards/accuracy_reward": 0.1770833395421505, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 83 |
| }, |
| { |
| "completion_length": 3069.7396240234375, |
| "entropy": 0.45263671875, |
| "epoch": 0.096, |
| "grad_norm": 0.14286305010318756, |
| "kl": 0.0017604827880859375, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0001, |
| "reward": 0.739583358168602, |
| "reward_std": 0.44136959314346313, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.4895833432674408, |
| "step": 84 |
| }, |
| { |
| "completion_length": 2664.2188110351562, |
| "entropy": 0.324951171875, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.13478516042232513, |
| "kl": 0.002002716064453125, |
| "learning_rate": 5.238348269902859e-07, |
| "loss": 0.0001, |
| "reward": 0.7604166716337204, |
| "reward_std": 0.5178688690066338, |
| "rewards/accuracy_reward": 0.15625000186264515, |
| "rewards/format_reward": 0.6041666679084301, |
| "step": 85 |
| }, |
| { |
| "completion_length": 2774.291748046875, |
| "entropy": 0.465576171875, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.16704270243644714, |
| "kl": 0.00365447998046875, |
| "learning_rate": 5.133885963994639e-07, |
| "loss": 0.0001, |
| "reward": 0.6250000102445483, |
| "reward_std": 0.2606133744120598, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.4583333386108279, |
| "step": 86 |
| }, |
| { |
| "completion_length": 2400.302215576172, |
| "entropy": 0.4453125, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.23015545308589935, |
| "kl": 0.0040435791015625, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0002, |
| "reward": 0.8125000447034836, |
| "reward_std": 0.5173326060175896, |
| "rewards/accuracy_reward": 0.18750001024454832, |
| "rewards/format_reward": 0.625, |
| "step": 87 |
| }, |
| { |
| "completion_length": 2469.7396697998047, |
| "entropy": 0.41943359375, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.16470564901828766, |
| "kl": 0.0041961669921875, |
| "learning_rate": 4.925612584627324e-07, |
| "loss": 0.0002, |
| "reward": 1.0208333730697632, |
| "reward_std": 0.693773627281189, |
| "rewards/accuracy_reward": 0.3750000186264515, |
| "rewards/format_reward": 0.6458333507180214, |
| "step": 88 |
| }, |
| { |
| "completion_length": 2834.635498046875, |
| "entropy": 0.37939453125, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.1899234652519226, |
| "kl": 0.003200531005859375, |
| "learning_rate": 4.821914294877326e-07, |
| "loss": 0.0001, |
| "reward": 0.6562500149011612, |
| "reward_std": 0.5321320816874504, |
| "rewards/accuracy_reward": 0.17708333395421505, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 89 |
| }, |
| { |
| "completion_length": 2226.3959045410156, |
| "entropy": 0.59619140625, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.15134188532829285, |
| "kl": 0.0078887939453125, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0003, |
| "reward": 0.6458333563059568, |
| "reward_std": 0.21650634706020355, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.6145833460614085, |
| "step": 90 |
| }, |
| { |
| "completion_length": 2510.8334045410156, |
| "entropy": 0.423828125, |
| "epoch": 0.104, |
| "grad_norm": 0.15344281494617462, |
| "kl": 0.0041046142578125, |
| "learning_rate": 4.6156752571022637e-07, |
| "loss": 0.0002, |
| "reward": 0.8958333637565374, |
| "reward_std": 0.40820014476776123, |
| "rewards/accuracy_reward": 0.2604166669771075, |
| "rewards/format_reward": 0.6354166818782687, |
| "step": 91 |
| }, |
| { |
| "completion_length": 2551.062530517578, |
| "entropy": 0.390869140625, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.12282641232013702, |
| "kl": 0.00506591796875, |
| "learning_rate": 4.513246191154434e-07, |
| "loss": 0.0002, |
| "reward": 0.6770833432674408, |
| "reward_std": 0.3805246874690056, |
| "rewards/accuracy_reward": 0.09375000093132257, |
| "rewards/format_reward": 0.5833333432674408, |
| "step": 92 |
| }, |
| { |
| "completion_length": 3784.7188110351562, |
| "entropy": 0.638671875, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.2033790946006775, |
| "kl": 0.0058460235595703125, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0002, |
| "reward": 0.0729166679084301, |
| "reward_std": 0.18205293267965317, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/format_reward": 0.0729166679084301, |
| "step": 93 |
| }, |
| { |
| "completion_length": 2915.9375, |
| "entropy": 0.5439453125, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.19546350836753845, |
| "kl": 0.004344940185546875, |
| "learning_rate": 4.3100462708325914e-07, |
| "loss": 0.0002, |
| "reward": 0.5833333395421505, |
| "reward_std": 0.427902989089489, |
| "rewards/accuracy_reward": 0.18750000279396772, |
| "rewards/format_reward": 0.3958333358168602, |
| "step": 94 |
| }, |
| { |
| "completion_length": 3644.4688110351562, |
| "entropy": 0.47607421875, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.09118141978979111, |
| "kl": 0.0022993087768554688, |
| "learning_rate": 4.209385452800095e-07, |
| "loss": 0.0001, |
| "reward": 0.3958333358168602, |
| "reward_std": 0.4476298391819, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 95 |
| }, |
| { |
| "completion_length": 2458.0833740234375, |
| "entropy": 0.37353515625, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.16451668739318848, |
| "kl": 0.0038547515869140625, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0002, |
| "reward": 0.8750000447034836, |
| "reward_std": 0.4621882885694504, |
| "rewards/accuracy_reward": 0.2708333348855376, |
| "rewards/format_reward": 0.6041666865348816, |
| "step": 96 |
| }, |
| { |
| "completion_length": 2523.0833740234375, |
| "entropy": 0.421875, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.1885625571012497, |
| "kl": 0.003082275390625, |
| "learning_rate": 4.0102146195176887e-07, |
| "loss": 0.0001, |
| "reward": 0.927083358168602, |
| "reward_std": 0.5721743106842041, |
| "rewards/accuracy_reward": 0.2708333348855376, |
| "rewards/format_reward": 0.6562500223517418, |
| "step": 97 |
| }, |
| { |
| "completion_length": 2336.5625915527344, |
| "entropy": 0.384033203125, |
| "epoch": 0.112, |
| "grad_norm": 0.15700186789035797, |
| "kl": 0.0025310516357421875, |
| "learning_rate": 3.911812458787591e-07, |
| "loss": 0.0001, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.2110944464802742, |
| "rewards/accuracy_reward": 0.13541666697710752, |
| "rewards/format_reward": 0.6562500223517418, |
| "step": 98 |
| }, |
| { |
| "completion_length": 2524.8958740234375, |
| "entropy": 0.379150390625, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.1743498593568802, |
| "kl": 0.003711700439453125, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0001, |
| "reward": 0.8125000204890966, |
| "reward_std": 0.4803639128804207, |
| "rewards/accuracy_reward": 0.2708333432674408, |
| "rewards/format_reward": 0.5416666697710752, |
| "step": 99 |
| }, |
| { |
| "completion_length": 2306.0521545410156, |
| "entropy": 0.35009765625, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.12883873283863068, |
| "kl": 0.003726959228515625, |
| "learning_rate": 3.7176410528237945e-07, |
| "loss": 0.0001, |
| "reward": 1.0416666865348816, |
| "reward_std": 0.44550345838069916, |
| "rewards/accuracy_reward": 0.3437500074505806, |
| "rewards/format_reward": 0.6979166716337204, |
| "step": 100 |
| }, |
| { |
| "completion_length": 2115.5313110351562, |
| "entropy": 0.437255859375, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.1613183170557022, |
| "kl": 0.00330352783203125, |
| "learning_rate": 3.62197695483182e-07, |
| "loss": 0.0001, |
| "reward": 0.833333358168602, |
| "reward_std": 0.2652370296418667, |
| "rewards/accuracy_reward": 0.15625000558793545, |
| "rewards/format_reward": 0.6770833358168602, |
| "step": 101 |
| }, |
| { |
| "completion_length": 1823.4167175292969, |
| "entropy": 0.369384765625, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.11039572954177856, |
| "kl": 0.0045318603515625, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0002, |
| "reward": 0.9375000298023224, |
| "reward_std": 0.2463684342801571, |
| "rewards/accuracy_reward": 0.12500000279396772, |
| "rewards/format_reward": 0.8125000298023224, |
| "step": 102 |
| }, |
| { |
| "completion_length": 2301.593780517578, |
| "entropy": 0.376953125, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.27249184250831604, |
| "kl": 0.00434112548828125, |
| "learning_rate": 3.433750959758446e-07, |
| "loss": 0.0002, |
| "reward": 0.895833358168602, |
| "reward_std": 0.5925451144576073, |
| "rewards/accuracy_reward": 0.19791667815297842, |
| "rewards/format_reward": 0.6979166865348816, |
| "step": 103 |
| }, |
| { |
| "completion_length": 2650.4063720703125, |
| "entropy": 0.45458984375, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.14005857706069946, |
| "kl": 0.00519561767578125, |
| "learning_rate": 3.3412909903738936e-07, |
| "loss": 0.0002, |
| "reward": 0.572916679084301, |
| "reward_std": 0.42935075983405113, |
| "rewards/accuracy_reward": 0.09375000186264515, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 104 |
| }, |
| { |
| "completion_length": 2248.6666870117188, |
| "entropy": 0.35205078125, |
| "epoch": 0.12, |
| "grad_norm": 0.18584585189819336, |
| "kl": 0.0032196044921875, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0001, |
| "reward": 0.885416716337204, |
| "reward_std": 0.5438356846570969, |
| "rewards/accuracy_reward": 0.25000000838190317, |
| "rewards/format_reward": 0.6354166865348816, |
| "step": 105 |
| }, |
| { |
| "completion_length": 2213.1250915527344, |
| "entropy": 0.30859375, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.13285432755947113, |
| "kl": 0.003154754638671875, |
| "learning_rate": 3.159927424318531e-07, |
| "loss": 0.0001, |
| "reward": 1.0625000204890966, |
| "reward_std": 0.4201487675309181, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.6562500055879354, |
| "step": 106 |
| }, |
| { |
| "completion_length": 2495.4166870117188, |
| "entropy": 0.54296875, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.2279452681541443, |
| "kl": 0.005664825439453125, |
| "learning_rate": 3.0711220392181934e-07, |
| "loss": 0.0002, |
| "reward": 0.8437500298023224, |
| "reward_std": 0.39580530673265457, |
| "rewards/accuracy_reward": 0.19791666697710752, |
| "rewards/format_reward": 0.645833358168602, |
| "step": 107 |
| }, |
| { |
| "completion_length": 2470.9896545410156, |
| "entropy": 0.41259765625, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.1852155178785324, |
| "kl": 0.005123138427734375, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0002, |
| "reward": 0.7604167014360428, |
| "reward_std": 0.44793232530355453, |
| "rewards/accuracy_reward": 0.1770833358168602, |
| "rewards/format_reward": 0.583333358168602, |
| "step": 108 |
| }, |
| { |
| "completion_length": 2746.9166870117188, |
| "entropy": 0.41796875, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.15881556272506714, |
| "kl": 0.00360107421875, |
| "learning_rate": 2.897504487244061e-07, |
| "loss": 0.0001, |
| "reward": 0.6458333656191826, |
| "reward_std": 0.3334706202149391, |
| "rewards/accuracy_reward": 0.15625, |
| "rewards/format_reward": 0.4895833507180214, |
| "step": 109 |
| }, |
| { |
| "completion_length": 2482.5000610351562, |
| "entropy": 0.400390625, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.1606108397245407, |
| "kl": 0.0030384063720703125, |
| "learning_rate": 2.812786337337463e-07, |
| "loss": 0.0001, |
| "reward": 0.8750000298023224, |
| "reward_std": 0.560508705675602, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/format_reward": 0.645833358168602, |
| "step": 110 |
| }, |
| { |
| "completion_length": 2674.791748046875, |
| "entropy": 0.486572265625, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.14915555715560913, |
| "kl": 0.00421905517578125, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0002, |
| "reward": 0.614583358168602, |
| "reward_std": 0.38519187271595, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/format_reward": 0.4895833507180214, |
| "step": 111 |
| }, |
| { |
| "completion_length": 2968.5834350585938, |
| "entropy": 0.470703125, |
| "epoch": 0.128, |
| "grad_norm": 0.2177121490240097, |
| "kl": 0.0030193328857421875, |
| "learning_rate": 2.6477606467058035e-07, |
| "loss": 0.0001, |
| "reward": 0.8229166865348816, |
| "reward_std": 0.5380749329924583, |
| "rewards/accuracy_reward": 0.2708333386108279, |
| "rewards/format_reward": 0.5520833507180214, |
| "step": 112 |
| }, |
| { |
| "completion_length": 1850.3125457763672, |
| "entropy": 0.37451171875, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.18240460753440857, |
| "kl": 0.004482269287109375, |
| "learning_rate": 2.567542470303452e-07, |
| "loss": 0.0002, |
| "reward": 0.9375000149011612, |
| "reward_std": 0.39232632517814636, |
| "rewards/accuracy_reward": 0.1979166716337204, |
| "rewards/format_reward": 0.7395833432674408, |
| "step": 113 |
| }, |
| { |
| "completion_length": 1991.9479675292969, |
| "entropy": 0.34228515625, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.12856441736221313, |
| "kl": 0.00384521484375, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0002, |
| "reward": 0.9687500298023224, |
| "reward_std": 0.41063307225704193, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.802083358168602, |
| "step": 114 |
| }, |
| { |
| "completion_length": 2642.2813110351562, |
| "entropy": 0.470703125, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.1557956039905548, |
| "kl": 0.005847930908203125, |
| "learning_rate": 2.411912629590699e-07, |
| "loss": 0.0002, |
| "reward": 0.7500000149011612, |
| "reward_std": 0.3538191542029381, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.5000000149011612, |
| "step": 115 |
| }, |
| { |
| "completion_length": 3452.9271240234375, |
| "entropy": 0.5498046875, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.12163397669792175, |
| "kl": 0.00417327880859375, |
| "learning_rate": 2.336585241584522e-07, |
| "loss": 0.0002, |
| "reward": 0.3125000009313226, |
| "reward_std": 0.3494237996637821, |
| "rewards/accuracy_reward": 0.09375000279396772, |
| "rewards/format_reward": 0.21875000558793545, |
| "step": 116 |
| }, |
| { |
| "completion_length": 2859.000030517578, |
| "entropy": 0.5322265625, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.23236258327960968, |
| "kl": 0.0061187744140625, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0002, |
| "reward": 0.48958336375653744, |
| "reward_std": 0.34661681205034256, |
| "rewards/accuracy_reward": 0.052083334885537624, |
| "rewards/format_reward": 0.4375000027939677, |
| "step": 117 |
| }, |
| { |
| "completion_length": 2824.354248046875, |
| "entropy": 0.3935546875, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.11868440359830856, |
| "kl": 0.002780914306640625, |
| "learning_rate": 2.1911094637307714e-07, |
| "loss": 0.0001, |
| "reward": 1.0416666865348816, |
| "reward_std": 0.6117755249142647, |
| "rewards/accuracy_reward": 0.4270833358168602, |
| "rewards/format_reward": 0.6145833432674408, |
| "step": 118 |
| }, |
| { |
| "completion_length": 2026.5937805175781, |
| "entropy": 0.446533203125, |
| "epoch": 0.136, |
| "grad_norm": 0.1918383240699768, |
| "kl": 0.00518035888671875, |
| "learning_rate": 2.1210398515832536e-07, |
| "loss": 0.0002, |
| "reward": 0.989583358168602, |
| "reward_std": 0.3108450919389725, |
| "rewards/accuracy_reward": 0.2604166716337204, |
| "rewards/format_reward": 0.7291666865348816, |
| "step": 119 |
| }, |
| { |
| "completion_length": 1974.4270935058594, |
| "entropy": 0.43359375, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.19848716259002686, |
| "kl": 0.006320953369140625, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0003, |
| "reward": 0.864583358168602, |
| "reward_std": 0.4301687255501747, |
| "rewards/accuracy_reward": 0.16666666697710752, |
| "rewards/format_reward": 0.6979166865348816, |
| "step": 120 |
| }, |
| { |
| "completion_length": 1066.2396240234375, |
| "entropy": 0.30810546875, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.15486152470111847, |
| "kl": 0.00482940673828125, |
| "learning_rate": 1.986426879955034e-07, |
| "loss": 0.0002, |
| "reward": 1.2500000298023224, |
| "reward_std": 0.25760992616415024, |
| "rewards/accuracy_reward": 0.3020833386108279, |
| "rewards/format_reward": 0.9479166865348816, |
| "step": 121 |
| }, |
| { |
| "completion_length": 2534.8958740234375, |
| "entropy": 0.4521484375, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.12797077000141144, |
| "kl": 0.00380706787109375, |
| "learning_rate": 1.9219564157731844e-07, |
| "loss": 0.0002, |
| "reward": 0.8229167014360428, |
| "reward_std": 0.3391122668981552, |
| "rewards/accuracy_reward": 0.20833333861082792, |
| "rewards/format_reward": 0.6145833432674408, |
| "step": 122 |
| }, |
| { |
| "completion_length": 2572.791748046875, |
| "entropy": 0.451416015625, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.1227995902299881, |
| "kl": 0.0032405853271484375, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0001, |
| "reward": 0.7083333544433117, |
| "reward_std": 0.3739900141954422, |
| "rewards/accuracy_reward": 0.1354166679084301, |
| "rewards/format_reward": 0.5729166828095913, |
| "step": 123 |
| }, |
| { |
| "completion_length": 2198.6563110351562, |
| "entropy": 0.33984375, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.2465568333864212, |
| "kl": 0.015537261962890625, |
| "learning_rate": 1.7988620712370195e-07, |
| "loss": 0.0006, |
| "reward": 0.9687500298023224, |
| "reward_std": 0.5658619552850723, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.7187500298023224, |
| "step": 124 |
| }, |
| { |
| "completion_length": 2865.250030517578, |
| "entropy": 0.431640625, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.10473211109638214, |
| "kl": 0.00345611572265625, |
| "learning_rate": 1.7403048486417868e-07, |
| "loss": 0.0001, |
| "reward": 0.6979166697710752, |
| "reward_std": 0.3267679661512375, |
| "rewards/accuracy_reward": 0.30208333395421505, |
| "rewards/format_reward": 0.3958333386108279, |
| "step": 125 |
| }, |
| { |
| "completion_length": 2886.2188720703125, |
| "entropy": 0.4560546875, |
| "epoch": 0.144, |
| "grad_norm": 0.09086798876523972, |
| "kl": 0.0029506683349609375, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0001, |
| "reward": 0.7395833358168602, |
| "reward_std": 0.3677559196949005, |
| "rewards/accuracy_reward": 0.2083333432674408, |
| "rewards/format_reward": 0.5312500149011612, |
| "step": 126 |
| }, |
| { |
| "completion_length": 2771.3438415527344, |
| "entropy": 0.41796875, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.15809084475040436, |
| "kl": 0.004436492919921875, |
| "learning_rate": 1.6293288344708566e-07, |
| "loss": 0.0002, |
| "reward": 0.635416679084301, |
| "reward_std": 0.49130160734057426, |
| "rewards/accuracy_reward": 0.0937500037252903, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 127 |
| }, |
| { |
| "completion_length": 2852.291748046875, |
| "entropy": 0.5322265625, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.17982318997383118, |
| "kl": 0.004756927490234375, |
| "learning_rate": 1.5769701383645698e-07, |
| "loss": 0.0002, |
| "reward": 0.9166666967794299, |
| "reward_std": 0.5036755502223969, |
| "rewards/accuracy_reward": 0.3645833358168602, |
| "rewards/format_reward": 0.5520833535119891, |
| "step": 128 |
| }, |
| { |
| "completion_length": 3367.9583740234375, |
| "entropy": 0.49658203125, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.1593668907880783, |
| "kl": 0.00467681884765625, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0002, |
| "reward": 0.4583333507180214, |
| "reward_std": 0.4737073630094528, |
| "rewards/accuracy_reward": 0.1666666753590107, |
| "rewards/format_reward": 0.2916666679084301, |
| "step": 129 |
| }, |
| { |
| "completion_length": 2817.7709350585938, |
| "entropy": 0.5, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.16384749114513397, |
| "kl": 0.00354766845703125, |
| "learning_rate": 1.4786531185446452e-07, |
| "loss": 0.0001, |
| "reward": 0.479166679084301, |
| "reward_std": 0.40873220562934875, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 130 |
| }, |
| { |
| "completion_length": 2720.8021850585938, |
| "entropy": 0.49658203125, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.24187295138835907, |
| "kl": 0.004924774169921875, |
| "learning_rate": 1.432748035231658e-07, |
| "loss": 0.0002, |
| "reward": 0.9062500298023224, |
| "reward_std": 0.504379153251648, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 0.5312500223517418, |
| "step": 131 |
| }, |
| { |
| "completion_length": 2590.0521545410156, |
| "entropy": 0.4248046875, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.13521018624305725, |
| "kl": 0.003265380859375, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0001, |
| "reward": 0.833333358168602, |
| "reward_std": 0.5318443104624748, |
| "rewards/accuracy_reward": 0.2916666744276881, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 132 |
| }, |
| { |
| "completion_length": 3037.8854370117188, |
| "entropy": 0.49072265625, |
| "epoch": 0.152, |
| "grad_norm": 0.17249611020088196, |
| "kl": 0.004932403564453125, |
| "learning_rate": 1.3475690004005097e-07, |
| "loss": 0.0002, |
| "reward": 0.5104166865348816, |
| "reward_std": 0.2723224312067032, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/format_reward": 0.4062500149011612, |
| "step": 133 |
| }, |
| { |
| "completion_length": 2485.322998046875, |
| "entropy": 0.5224609375, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.16694706678390503, |
| "kl": 0.00672149658203125, |
| "learning_rate": 1.308341174832359e-07, |
| "loss": 0.0003, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.49518734961748123, |
| "rewards/accuracy_reward": 0.2500000102445483, |
| "rewards/format_reward": 0.6250000149011612, |
| "step": 134 |
| }, |
| { |
| "completion_length": 1746.3958740234375, |
| "entropy": 0.373046875, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.1858878880739212, |
| "kl": 0.007049560546875, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0003, |
| "reward": 1.1145833432674408, |
| "reward_std": 0.3352552205324173, |
| "rewards/accuracy_reward": 0.43750000558793545, |
| "rewards/format_reward": 0.6770833432674408, |
| "step": 135 |
| }, |
| { |
| "completion_length": 2145.4584045410156, |
| "entropy": 0.35986328125, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.196150541305542, |
| "kl": 0.00507354736328125, |
| "learning_rate": 1.2367151086855187e-07, |
| "loss": 0.0002, |
| "reward": 0.9895833358168602, |
| "reward_std": 0.6333772391080856, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/format_reward": 0.6770833507180214, |
| "step": 136 |
| }, |
| { |
| "completion_length": 2813.510467529297, |
| "entropy": 0.387939453125, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.13014180958271027, |
| "kl": 0.00402069091796875, |
| "learning_rate": 1.2043556548852063e-07, |
| "loss": 0.0002, |
| "reward": 0.677083358168602, |
| "reward_std": 0.5024930611252785, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/format_reward": 0.5312500260770321, |
| "step": 137 |
| }, |
| { |
| "completion_length": 2061.0000610351562, |
| "entropy": 0.35107421875, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.10559725016355515, |
| "kl": 0.0037689208984375, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0002, |
| "reward": 0.9270833730697632, |
| "reward_std": 0.30403000861406326, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/format_reward": 0.739583358168602, |
| "step": 138 |
| }, |
| { |
| "completion_length": 3106.291748046875, |
| "entropy": 0.55615234375, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.15882417559623718, |
| "kl": 0.00519561767578125, |
| "learning_rate": 1.1466315124171128e-07, |
| "loss": 0.0002, |
| "reward": 0.708333358168602, |
| "reward_std": 0.5456142984330654, |
| "rewards/accuracy_reward": 0.1666666753590107, |
| "rewards/format_reward": 0.541666679084301, |
| "step": 139 |
| }, |
| { |
| "completion_length": 2395.885498046875, |
| "entropy": 0.4853515625, |
| "epoch": 0.16, |
| "grad_norm": 0.2862064242362976, |
| "kl": 0.006809234619140625, |
| "learning_rate": 1.1212980823907929e-07, |
| "loss": 0.0003, |
| "reward": 0.7500000298023224, |
| "reward_std": 0.38956041634082794, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.5833333507180214, |
| "step": 140 |
| }, |
| { |
| "completion_length": 1969.2292175292969, |
| "entropy": 0.33935546875, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.17285719513893127, |
| "kl": 0.0047760009765625, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0002, |
| "reward": 0.9895833730697632, |
| "reward_std": 0.520443569868803, |
| "rewards/accuracy_reward": 0.2187500074505806, |
| "rewards/format_reward": 0.770833358168602, |
| "step": 141 |
| }, |
| { |
| "completion_length": 2584.9688110351562, |
| "entropy": 0.447998046875, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.13187262415885925, |
| "kl": 0.0047740936279296875, |
| "learning_rate": 1.0777570898211405e-07, |
| "loss": 0.0002, |
| "reward": 0.9166667014360428, |
| "reward_std": 0.4435262605547905, |
| "rewards/accuracy_reward": 0.2187500111758709, |
| "rewards/format_reward": 0.6979166865348816, |
| "step": 142 |
| }, |
| { |
| "completion_length": 2300.7500610351562, |
| "entropy": 0.4326171875, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.25766721367836, |
| "kl": 0.00977325439453125, |
| "learning_rate": 1.0595731054933934e-07, |
| "loss": 0.0004, |
| "reward": 0.6875000074505806, |
| "reward_std": 0.3443669453263283, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.6041666716337204, |
| "step": 143 |
| }, |
| { |
| "completion_length": 2849.4688110351562, |
| "entropy": 0.45556640625, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.14796483516693115, |
| "kl": 0.00493621826171875, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0002, |
| "reward": 0.677083358168602, |
| "reward_std": 0.4717573896050453, |
| "rewards/accuracy_reward": 0.2187500037252903, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 144 |
| }, |
| { |
| "completion_length": 1885.4479522705078, |
| "entropy": 0.353515625, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.1617114096879959, |
| "kl": 0.005008697509765625, |
| "learning_rate": 1.0304273901612565e-07, |
| "loss": 0.0002, |
| "reward": 1.0208333730697632, |
| "reward_std": 0.3080247640609741, |
| "rewards/accuracy_reward": 0.3020833460614085, |
| "rewards/format_reward": 0.7187500149011612, |
| "step": 145 |
| }, |
| { |
| "completion_length": 1947.0312805175781, |
| "entropy": 0.376953125, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.11680302768945694, |
| "kl": 0.0033721923828125, |
| "learning_rate": 1.0194814420758804e-07, |
| "loss": 0.0001, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.22604453563690186, |
| "rewards/accuracy_reward": 0.07291666977107525, |
| "rewards/format_reward": 0.8020833432674408, |
| "step": 146 |
| }, |
| { |
| "completion_length": 2215.7084045410156, |
| "entropy": 0.392578125, |
| "epoch": 0.168, |
| "grad_norm": 0.21416479349136353, |
| "kl": 0.00583648681640625, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0002, |
| "reward": 0.8437500149011612, |
| "reward_std": 0.5214347615838051, |
| "rewards/accuracy_reward": 0.1770833395421505, |
| "rewards/format_reward": 0.6666666865348816, |
| "step": 147 |
| }, |
| { |
| "completion_length": 1631.229248046875, |
| "entropy": 0.302734375, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.17106008529663086, |
| "kl": 0.00476837158203125, |
| "learning_rate": 1.0048729989766394e-07, |
| "loss": 0.0002, |
| "reward": 0.9895833730697632, |
| "reward_std": 0.29220427572727203, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.8541666865348816, |
| "step": 148 |
| }, |
| { |
| "completion_length": 2407.8229370117188, |
| "entropy": 0.3408203125, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.15983973443508148, |
| "kl": 0.008087158203125, |
| "learning_rate": 1.0012184146924223e-07, |
| "loss": 0.0003, |
| "reward": 0.9270833432674408, |
| "reward_std": 0.48707588016986847, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.6770833432674408, |
| "step": 149 |
| }, |
| { |
| "completion_length": 2178.729217529297, |
| "entropy": 0.36962890625, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.19081099331378937, |
| "kl": 0.004444122314453125, |
| "learning_rate": 1e-07, |
| "loss": 0.0002, |
| "reward": 1.0312500298023224, |
| "reward_std": 0.5531396120786667, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/format_reward": 0.7395833432674408, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.17142857142857143, |
| "step": 150, |
| "total_flos": 0.0, |
| "train_loss": 0.00011944215420650531, |
| "train_runtime": 12092.6435, |
| "train_samples_per_second": 1.191, |
| "train_steps_per_second": 0.012 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 150, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|