| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.17142857142857143, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 2700.4271850585938, |
| "cov_mean": -6.0587970438064076e-05, |
| "cov_std": 0.35307812318205833, |
| "entropy": 0.36962890625, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.4682573080062866, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": -0.0446, |
| "reward": 0.7604166893288493, |
| "reward_std": 0.4268697127699852, |
| "rewards/accuracy_reward": 0.25000001303851604, |
| "rewards/format_reward": 0.5104166669771075, |
| "step": 1, |
| "w_high_ratio": 0.21827427297830582, |
| "w_low_ratio": 0.03724556043744087, |
| "w_max": 2.315404176712036, |
| "w_mean": 1.47113436460495, |
| "w_min": 0.0, |
| "w_std": 0.2791289445012808 |
| }, |
| { |
| "completion_length": 3127.3958435058594, |
| "cov_mean": -2.155053698515985e-05, |
| "cov_std": 0.310540571808815, |
| "entropy": 0.353515625, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.534198522567749, |
| "kl": 0.0, |
| "learning_rate": 6.666666666666667e-08, |
| "loss": 0.0058, |
| "reward": 0.6458333637565374, |
| "reward_std": 0.4249730706214905, |
| "rewards/accuracy_reward": 0.2812500102445483, |
| "rewards/format_reward": 0.3645833386108279, |
| "step": 2, |
| "w_high_ratio": 0.05722124548628926, |
| "w_low_ratio": 0.036368744214996696, |
| "w_max": 1.8768170773983002, |
| "w_mean": 1.2113382518291473, |
| "w_min": 0.0, |
| "w_std": 0.19011373445391655 |
| }, |
| { |
| "completion_length": 3699.729248046875, |
| "cov_mean": -7.181393357313937e-05, |
| "cov_std": 0.2876722402870655, |
| "entropy": 0.458984375, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.3820074200630188, |
| "kl": 4.845857620239258e-05, |
| "learning_rate": 1.3333333333333334e-07, |
| "loss": 0.0581, |
| "reward": 0.250000006519258, |
| "reward_std": 0.392750509083271, |
| "rewards/accuracy_reward": 0.0729166679084301, |
| "rewards/format_reward": 0.17708334233611822, |
| "step": 3, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03734566690400243, |
| "w_max": 1.4001508057117462, |
| "w_mean": 1.0752681195735931, |
| "w_min": 0.0, |
| "w_std": 0.16775447502732277 |
| }, |
| { |
| "completion_length": 2261.197998046875, |
| "cov_mean": 2.3754174435453024e-05, |
| "cov_std": 0.37356993183493614, |
| "entropy": 0.3896484375, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.6735726594924927, |
| "kl": 3.325939178466797e-05, |
| "learning_rate": 2e-07, |
| "loss": -0.0291, |
| "reward": 0.9166666865348816, |
| "reward_std": 0.4729222096502781, |
| "rewards/accuracy_reward": 0.19791667070239782, |
| "rewards/format_reward": 0.7187500223517418, |
| "step": 4, |
| "w_high_ratio": 0.1157110151834786, |
| "w_low_ratio": 0.03791455435566604, |
| "w_max": 2.3952889442443848, |
| "w_mean": 1.526582419872284, |
| "w_min": 0.0, |
| "w_std": 0.27415894344449043 |
| }, |
| { |
| "completion_length": 3429.5313720703125, |
| "cov_mean": 6.268694096434047e-05, |
| "cov_std": 0.43890176713466644, |
| "entropy": 0.45556640625, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.4359005391597748, |
| "kl": 4.1544437408447266e-05, |
| "learning_rate": 2.6666666666666667e-07, |
| "loss": -0.0158, |
| "reward": 0.385416679084301, |
| "reward_std": 0.4654032774269581, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/format_reward": 0.3437500074505806, |
| "step": 5, |
| "w_high_ratio": 0.02077934332191944, |
| "w_low_ratio": 0.05408582789823413, |
| "w_max": 1.6484719216823578, |
| "w_mean": 1.1780387163162231, |
| "w_min": 1.1194353007380611e-33, |
| "w_std": 0.2486402541399002 |
| }, |
| { |
| "completion_length": 3461.1875610351562, |
| "cov_mean": -5.967591278022155e-06, |
| "cov_std": 0.49887532368302345, |
| "entropy": 0.46533203125, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.3651033937931061, |
| "kl": 4.723668098449707e-05, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.0963, |
| "reward": 0.3750000027939677, |
| "reward_std": 0.4880600869655609, |
| "rewards/accuracy_reward": 0.09375000093132257, |
| "rewards/format_reward": 0.281250006519258, |
| "step": 6, |
| "w_high_ratio": 0.09943684190511703, |
| "w_low_ratio": 0.05423067696392536, |
| "w_max": 2.1359716653823853, |
| "w_mean": 1.2522149085998535, |
| "w_min": 0.0, |
| "w_std": 0.3030992951244116 |
| }, |
| { |
| "completion_length": 3263.5834350585938, |
| "cov_mean": 1.3439643225865439e-05, |
| "cov_std": 0.48358847945928574, |
| "entropy": 0.3818359375, |
| "epoch": 0.008, |
| "grad_norm": 0.30507004261016846, |
| "kl": 1.7881393432617188e-05, |
| "learning_rate": 4e-07, |
| "loss": -0.0382, |
| "reward": 0.895833358168602, |
| "reward_std": 0.5626667812466621, |
| "rewards/accuracy_reward": 0.2604166716337204, |
| "rewards/format_reward": 0.6354166865348816, |
| "step": 7, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.061702484264969826, |
| "w_max": 1.6040385067462921, |
| "w_mean": 1.2159111201763153, |
| "w_min": 0.0, |
| "w_std": 0.2597455531358719 |
| }, |
| { |
| "completion_length": 2891.2396240234375, |
| "cov_mean": -2.5899114461935824e-06, |
| "cov_std": 0.31939053907990456, |
| "entropy": 0.352783203125, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.2681833803653717, |
| "kl": 2.5928020477294922e-05, |
| "learning_rate": 4.6666666666666666e-07, |
| "loss": -0.0366, |
| "reward": 0.8645833544433117, |
| "reward_std": 0.5030707456171513, |
| "rewards/accuracy_reward": 0.38541667722165585, |
| "rewards/format_reward": 0.47916667722165585, |
| "step": 8, |
| "w_high_ratio": 0.008502780459821224, |
| "w_low_ratio": 0.032900307793170214, |
| "w_max": 1.777650386095047, |
| "w_mean": 1.312690258026123, |
| "w_min": 0.0, |
| "w_std": 0.19663411937654018 |
| }, |
| { |
| "completion_length": 3367.1146850585938, |
| "cov_mean": -1.6302776657539653e-05, |
| "cov_std": 0.46739284694194794, |
| "entropy": 0.455078125, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.3553212881088257, |
| "kl": 4.4405460357666016e-05, |
| "learning_rate": 5.333333333333333e-07, |
| "loss": -0.0055, |
| "reward": 0.4687500074505806, |
| "reward_std": 0.5153512582182884, |
| "rewards/accuracy_reward": 0.10416667070239782, |
| "rewards/format_reward": 0.3645833432674408, |
| "step": 9, |
| "w_high_ratio": 0.07327797263860703, |
| "w_low_ratio": 0.05655699595808983, |
| "w_max": 1.8923848271369934, |
| "w_mean": 1.2186144888401031, |
| "w_min": 0.0, |
| "w_std": 0.2829560115933418 |
| }, |
| { |
| "completion_length": 2910.5209350585938, |
| "cov_mean": -4.502756110014161e-05, |
| "cov_std": 0.2934259846806526, |
| "entropy": 0.34228515625, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.42780593037605286, |
| "kl": 2.6166439056396484e-05, |
| "learning_rate": 6e-07, |
| "loss": -0.1052, |
| "reward": 0.645833358168602, |
| "reward_std": 0.4341064542531967, |
| "rewards/accuracy_reward": 0.18750001024454832, |
| "rewards/format_reward": 0.4583333469927311, |
| "step": 10, |
| "w_high_ratio": 0.10562621057033539, |
| "w_low_ratio": 0.03302360652014613, |
| "w_max": 2.053061753511429, |
| "w_mean": 1.2806267738342285, |
| "w_min": 0.0, |
| "w_std": 0.18188510835170746 |
| }, |
| { |
| "completion_length": 3628.7188110351562, |
| "cov_mean": 7.874305651967006e-06, |
| "cov_std": 0.26691694743931293, |
| "entropy": 0.37451171875, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.3640754818916321, |
| "kl": 3.072619438171387e-05, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.028, |
| "reward": 0.22916667442768812, |
| "reward_std": 0.3545106574892998, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.16666666697710752, |
| "step": 11, |
| "w_high_ratio": 0.02717638947069645, |
| "w_low_ratio": 0.032686853082850575, |
| "w_max": 1.4469610452651978, |
| "w_mean": 1.1045592427253723, |
| "w_min": 0.25, |
| "w_std": 0.16260053776204586 |
| }, |
| { |
| "completion_length": 2600.3334350585938, |
| "cov_mean": -2.9306334909051657e-05, |
| "cov_std": 0.36532483994960785, |
| "entropy": 0.3974609375, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.5842437148094177, |
| "kl": 3.3795833587646484e-05, |
| "learning_rate": 7.333333333333332e-07, |
| "loss": -0.0312, |
| "reward": 0.8541667014360428, |
| "reward_std": 0.42210913449525833, |
| "rewards/accuracy_reward": 0.16666667070239782, |
| "rewards/format_reward": 0.6875000298023224, |
| "step": 12, |
| "w_high_ratio": 0.25580327026546, |
| "w_low_ratio": 0.03646009974181652, |
| "w_max": 2.4061461091041565, |
| "w_mean": 1.5557032227516174, |
| "w_min": 0.0, |
| "w_std": 0.2952301353216171 |
| }, |
| { |
| "completion_length": 3168.260498046875, |
| "cov_mean": -2.8395811568771023e-07, |
| "cov_std": 0.16673796251416206, |
| "entropy": 0.3974609375, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.32018211483955383, |
| "kl": 3.4928321838378906e-05, |
| "learning_rate": 8e-07, |
| "loss": 0.013, |
| "reward": 0.6666666865348816, |
| "reward_std": 0.21763009577989578, |
| "rewards/accuracy_reward": 0.21875000558793545, |
| "rewards/format_reward": 0.447916679084301, |
| "step": 13, |
| "w_high_ratio": 0.16240009665489197, |
| "w_low_ratio": 0.019987554755061865, |
| "w_max": 1.9363721311092377, |
| "w_mean": 1.3362610340118408, |
| "w_min": 0.25, |
| "w_std": 0.13108899258077145 |
| }, |
| { |
| "completion_length": 3131.4896240234375, |
| "cov_mean": 2.226169999630656e-05, |
| "cov_std": 0.4046770706772804, |
| "entropy": 0.37109375, |
| "epoch": 0.016, |
| "grad_norm": 0.37404048442840576, |
| "kl": 2.765655517578125e-05, |
| "learning_rate": 8.666666666666667e-07, |
| "loss": -0.0183, |
| "reward": 0.697916679084301, |
| "reward_std": 0.5398248583078384, |
| "rewards/accuracy_reward": 0.23958334140479565, |
| "rewards/format_reward": 0.4583333507180214, |
| "step": 14, |
| "w_high_ratio": 0.04599327966570854, |
| "w_low_ratio": 0.04479631967842579, |
| "w_max": 1.9890422523021698, |
| "w_mean": 1.2714892327785492, |
| "w_min": 0.0, |
| "w_std": 0.2778457775712013 |
| }, |
| { |
| "completion_length": 2876.5000610351562, |
| "cov_mean": 5.642831865770859e-05, |
| "cov_std": 0.2463381662964821, |
| "entropy": 0.3603515625, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.3123975694179535, |
| "kl": 2.4527311325073242e-05, |
| "learning_rate": 9.333333333333333e-07, |
| "loss": 0.0189, |
| "reward": 0.6145833358168602, |
| "reward_std": 0.282865684479475, |
| "rewards/accuracy_reward": 0.21875000279396772, |
| "rewards/format_reward": 0.3958333395421505, |
| "step": 15, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.025731001514941454, |
| "w_max": 1.6097791492938995, |
| "w_mean": 1.1706343591213226, |
| "w_min": 0.25, |
| "w_std": 0.15255355089902878 |
| }, |
| { |
| "completion_length": 3825.5313110351562, |
| "cov_mean": 4.868104133493034e-05, |
| "cov_std": 0.24393152818083763, |
| "entropy": 0.462890625, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.33849093317985535, |
| "kl": 3.88026237487793e-05, |
| "learning_rate": 1e-06, |
| "loss": 0.0219, |
| "reward": 0.1458333358168602, |
| "reward_std": 0.3029968775808811, |
| "rewards/accuracy_reward": 0.05208333395421505, |
| "rewards/format_reward": 0.09375, |
| "step": 16, |
| "w_high_ratio": 0.016145935282111168, |
| "w_low_ratio": 0.030266874469816685, |
| "w_max": 1.4482556581497192, |
| "w_mean": 1.039628803730011, |
| "w_min": 0.25, |
| "w_std": 0.12409967929124832 |
| }, |
| { |
| "completion_length": 2457.187530517578, |
| "cov_mean": -5.5577158491360024e-05, |
| "cov_std": 0.40591511130332947, |
| "entropy": 0.44580078125, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.4333915710449219, |
| "kl": 3.7103891372680664e-05, |
| "learning_rate": 9.998781585307575e-07, |
| "loss": -0.0184, |
| "reward": 0.8645833432674408, |
| "reward_std": 0.4666195958852768, |
| "rewards/accuracy_reward": 0.27083334140479565, |
| "rewards/format_reward": 0.5937500149011612, |
| "step": 17, |
| "w_high_ratio": 0.2576238848268986, |
| "w_low_ratio": 0.045077938586473465, |
| "w_max": 2.4780974686145782, |
| "w_mean": 1.4898322224617004, |
| "w_min": 0.25, |
| "w_std": 0.27607931941747665 |
| }, |
| { |
| "completion_length": 3082.572998046875, |
| "cov_mean": 2.8908147328365885e-05, |
| "cov_std": 0.37330811098217964, |
| "entropy": 0.35791015625, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.4184105098247528, |
| "kl": 1.868605613708496e-05, |
| "learning_rate": 9.99512700102336e-07, |
| "loss": -0.0108, |
| "reward": 0.5833333432674408, |
| "reward_std": 0.35651107877492905, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/format_reward": 0.4583333507180214, |
| "step": 18, |
| "w_high_ratio": 0.05808360502123833, |
| "w_low_ratio": 0.036713168025016785, |
| "w_max": 1.845141887664795, |
| "w_mean": 1.2742244899272919, |
| "w_min": 0.0, |
| "w_std": 0.20810226537287235 |
| }, |
| { |
| "completion_length": 3121.2084350585938, |
| "cov_mean": -7.294934039236978e-05, |
| "cov_std": 0.43627090007066727, |
| "entropy": 0.390625, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.37888550758361816, |
| "kl": 3.045797348022461e-05, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": -0.0515, |
| "reward": 0.8437500223517418, |
| "reward_std": 0.6305291801691055, |
| "rewards/accuracy_reward": 0.37500001676380634, |
| "rewards/format_reward": 0.4687500260770321, |
| "step": 19, |
| "w_high_ratio": 0.07324637286365032, |
| "w_low_ratio": 0.05016931891441345, |
| "w_max": 1.8255594372749329, |
| "w_mean": 1.2571382224559784, |
| "w_min": 0.0, |
| "w_std": 0.2677953541278839 |
| }, |
| { |
| "completion_length": 2640.885498046875, |
| "cov_mean": 4.735650145448744e-05, |
| "cov_std": 0.3673415333032608, |
| "entropy": 0.346923828125, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.5563057065010071, |
| "kl": 2.5674700736999512e-05, |
| "learning_rate": 9.98051855792412e-07, |
| "loss": -0.0634, |
| "reward": 0.927083358168602, |
| "reward_std": 0.4865139201283455, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/format_reward": 0.6354166865348816, |
| "step": 20, |
| "w_high_ratio": 0.1635199710726738, |
| "w_low_ratio": 0.04272336233407259, |
| "w_max": 2.4736950993537903, |
| "w_mean": 1.437729924917221, |
| "w_min": 0.0, |
| "w_std": 0.2519150599837303 |
| }, |
| { |
| "completion_length": 2853.541748046875, |
| "cov_mean": 2.0583035620802548e-05, |
| "cov_std": 0.330572672188282, |
| "entropy": 0.4111328125, |
| "epoch": 0.024, |
| "grad_norm": 1.0757092237472534, |
| "kl": 6.0230493545532227e-05, |
| "learning_rate": 9.969572609838744e-07, |
| "loss": -0.0628, |
| "reward": 0.6458333684131503, |
| "reward_std": 0.4320429190993309, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/format_reward": 0.4583333535119891, |
| "step": 21, |
| "w_high_ratio": 0.2964049205183983, |
| "w_low_ratio": 0.028110376093536615, |
| "w_max": 2.3169990181922913, |
| "w_mean": 1.5084502398967743, |
| "w_min": 0.0, |
| "w_std": 0.2527524419128895 |
| }, |
| { |
| "completion_length": 1911.4584045410156, |
| "cov_mean": 5.60426842639572e-05, |
| "cov_std": 0.40649277716875076, |
| "entropy": 0.4189453125, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.6271277666091919, |
| "kl": 7.963180541992188e-05, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0446, |
| "reward": 1.0937500447034836, |
| "reward_std": 0.43423888459801674, |
| "rewards/accuracy_reward": 0.26041666977107525, |
| "rewards/format_reward": 0.833333358168602, |
| "step": 22, |
| "w_high_ratio": 0.31319986283779144, |
| "w_low_ratio": 0.043129971250891685, |
| "w_max": 2.544324040412903, |
| "w_mean": 1.6584191024303436, |
| "w_min": 1.0509738482436128e-45, |
| "w_std": 0.2851412668824196 |
| }, |
| { |
| "completion_length": 2764.9896240234375, |
| "cov_mean": -5.482907317855279e-05, |
| "cov_std": 0.4773280769586563, |
| "entropy": 0.364013671875, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.4739457368850708, |
| "kl": 5.441904067993164e-05, |
| "learning_rate": 9.940426894506606e-07, |
| "loss": 0.0215, |
| "reward": 0.677083358168602, |
| "reward_std": 0.5219395384192467, |
| "rewards/accuracy_reward": 0.17708333861082792, |
| "rewards/format_reward": 0.5000000149011612, |
| "step": 23, |
| "w_high_ratio": 0.2707533538341522, |
| "w_low_ratio": 0.053491173312067986, |
| "w_max": 2.4205015003681183, |
| "w_mean": 1.5040302574634552, |
| "w_min": 0.0, |
| "w_std": 0.36683739349246025 |
| }, |
| { |
| "completion_length": 2940.791748046875, |
| "cov_mean": -2.262868292746134e-07, |
| "cov_std": 0.4811120182275772, |
| "entropy": 0.36572265625, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.7653826475143433, |
| "kl": 7.31348991394043e-05, |
| "learning_rate": 9.922242910178859e-07, |
| "loss": -0.0493, |
| "reward": 0.8750000149011612, |
| "reward_std": 0.580329179763794, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/format_reward": 0.5625, |
| "step": 24, |
| "w_high_ratio": 0.13745100796222687, |
| "w_low_ratio": 0.043023983016610146, |
| "w_max": 2.282612681388855, |
| "w_mean": 1.408221811056137, |
| "w_min": 0.0, |
| "w_std": 0.30761053785681725 |
| }, |
| { |
| "completion_length": 2838.385498046875, |
| "cov_mean": 5.768927030658233e-05, |
| "cov_std": 0.3588615171611309, |
| "entropy": 0.43115234375, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.4592779278755188, |
| "kl": 0.0001583397388458252, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0336, |
| "reward": 0.6458333535119891, |
| "reward_std": 0.42768918722867966, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.4375000102445483, |
| "step": 25, |
| "w_high_ratio": 0.25193173810839653, |
| "w_low_ratio": 0.03785138111561537, |
| "w_max": 2.232436418533325, |
| "w_mean": 1.3874212205410004, |
| "w_min": 0.0, |
| "w_std": 0.26341583393514156 |
| }, |
| { |
| "completion_length": 3263.9271850585938, |
| "cov_mean": 5.7232594372180756e-05, |
| "cov_std": 0.3583865277469158, |
| "entropy": 0.423828125, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.29540950059890747, |
| "kl": 2.8073787689208984e-05, |
| "learning_rate": 9.878701917609207e-07, |
| "loss": -0.0308, |
| "reward": 0.6562500298023224, |
| "reward_std": 0.30247221142053604, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/format_reward": 0.4479166716337204, |
| "step": 26, |
| "w_high_ratio": 0.043125174939632416, |
| "w_low_ratio": 0.034932715352624655, |
| "w_max": 1.7730375826358795, |
| "w_mean": 1.1945610046386719, |
| "w_min": 0.0, |
| "w_std": 0.21625201031565666 |
| }, |
| { |
| "completion_length": 3234.354248046875, |
| "cov_mean": -1.585684123028841e-05, |
| "cov_std": 0.3628169037401676, |
| "entropy": 0.4287109375, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.23347000777721405, |
| "kl": 7.574260234832764e-05, |
| "learning_rate": 9.853368487582886e-07, |
| "loss": 0.0016, |
| "reward": 0.5729166939854622, |
| "reward_std": 0.4602612778544426, |
| "rewards/accuracy_reward": 0.12500000279396772, |
| "rewards/format_reward": 0.447916679084301, |
| "step": 27, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.049336991272866726, |
| "w_max": 1.539461612701416, |
| "w_mean": 1.1640309989452362, |
| "w_min": 0.0, |
| "w_std": 0.19506899639964104 |
| }, |
| { |
| "completion_length": 3126.1458740234375, |
| "cov_mean": -2.0113498976570554e-05, |
| "cov_std": 0.4664968028664589, |
| "entropy": 0.40576171875, |
| "epoch": 0.032, |
| "grad_norm": 0.3652705252170563, |
| "kl": 9.274482727050781e-05, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0034, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.5775652155280113, |
| "rewards/accuracy_reward": 0.322916679084301, |
| "rewards/format_reward": 0.4687500149011612, |
| "step": 28, |
| "w_high_ratio": 0.0399150624871254, |
| "w_low_ratio": 0.05781116522848606, |
| "w_max": 1.8517873883247375, |
| "w_mean": 1.2485012710094452, |
| "w_min": 0.0, |
| "w_std": 0.2899218685925007 |
| }, |
| { |
| "completion_length": 3620.3854370117188, |
| "cov_mean": 3.1027989280119073e-06, |
| "cov_std": 0.3902217298746109, |
| "entropy": 0.42919921875, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.42134925723075867, |
| "kl": 0.00019466876983642578, |
| "learning_rate": 9.795644345114794e-07, |
| "loss": 0.054, |
| "reward": 0.26041667722165585, |
| "reward_std": 0.39330877363681793, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.21875000558793545, |
| "step": 29, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.05211624875664711, |
| "w_max": 1.389526218175888, |
| "w_mean": 1.1012998819351196, |
| "w_min": 0.25, |
| "w_std": 0.21629613637924194 |
| }, |
| { |
| "completion_length": 3127.28125, |
| "cov_mean": 7.65450022299774e-06, |
| "cov_std": 0.39614470303058624, |
| "entropy": 0.37109375, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.3341064751148224, |
| "kl": 0.0004119873046875, |
| "learning_rate": 9.76328489131448e-07, |
| "loss": 0.0409, |
| "reward": 0.7916666865348816, |
| "reward_std": 0.536014050245285, |
| "rewards/accuracy_reward": 0.2708333395421505, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 30, |
| "w_high_ratio": 0.09042909741401672, |
| "w_low_ratio": 0.04219994880259037, |
| "w_max": 2.0287185609340668, |
| "w_mean": 1.2833797633647919, |
| "w_min": 0.25, |
| "w_std": 0.2540416121482849 |
| }, |
| { |
| "completion_length": 3311.0418090820312, |
| "cov_mean": 1.0463507578606368e-05, |
| "cov_std": 0.2679142467677593, |
| "entropy": 0.392578125, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.34394925832748413, |
| "kl": 0.00025856494903564453, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.008, |
| "reward": 0.5104166772216558, |
| "reward_std": 0.41994429379701614, |
| "rewards/accuracy_reward": 0.19791666977107525, |
| "rewards/format_reward": 0.3125000027939677, |
| "step": 31, |
| "w_high_ratio": 0.013557232916355133, |
| "w_low_ratio": 0.029067183146253228, |
| "w_max": 1.5240460634231567, |
| "w_mean": 1.1379797160625458, |
| "w_min": 0.0, |
| "w_std": 0.1539093293249607 |
| }, |
| { |
| "completion_length": 3302.9793090820312, |
| "cov_mean": -3.0240359592426103e-05, |
| "cov_std": 0.38900837302207947, |
| "entropy": 0.4296875, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.3267619013786316, |
| "kl": 0.0003151893615722656, |
| "learning_rate": 9.69165882516764e-07, |
| "loss": 0.0316, |
| "reward": 0.708333333954215, |
| "reward_std": 0.4763314798474312, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/format_reward": 0.4166666744276881, |
| "step": 32, |
| "w_high_ratio": 0.03662256337702274, |
| "w_low_ratio": 0.04006141540594399, |
| "w_max": 1.6614282727241516, |
| "w_mean": 1.1914446651935577, |
| "w_min": 0.0, |
| "w_std": 0.22214871272444725 |
| }, |
| { |
| "completion_length": 3574.9063720703125, |
| "cov_mean": -1.1321862984914333e-05, |
| "cov_std": 0.35655253008008003, |
| "entropy": 0.375, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.22990448772907257, |
| "kl": 0.00036966800689697266, |
| "learning_rate": 9.65243099959949e-07, |
| "loss": -0.0046, |
| "reward": 0.614583358168602, |
| "reward_std": 0.5987343490123749, |
| "rewards/accuracy_reward": 0.2395833432674408, |
| "rewards/format_reward": 0.375, |
| "step": 33, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.050560859963297844, |
| "w_max": 1.4202565550804138, |
| "w_mean": 1.1010453402996063, |
| "w_min": 9.80908925027372e-45, |
| "w_std": 0.22285480797290802 |
| }, |
| { |
| "completion_length": 2701.3021545410156, |
| "cov_mean": -9.209951758748502e-05, |
| "cov_std": 0.40857063978910446, |
| "entropy": 0.455078125, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.49724769592285156, |
| "kl": 0.0010764598846435547, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": -0.0999, |
| "reward": 0.8229166883975267, |
| "reward_std": 0.44509488344192505, |
| "rewards/accuracy_reward": 0.32291668001562357, |
| "rewards/format_reward": 0.5000000158324838, |
| "step": 34, |
| "w_high_ratio": 0.255466103553772, |
| "w_low_ratio": 0.058159707114100456, |
| "w_max": 1.966733694076538, |
| "w_mean": 1.445367842912674, |
| "w_min": 0.0, |
| "w_std": 0.2951628230512142 |
| }, |
| { |
| "completion_length": 3312.5209350585938, |
| "cov_mean": -2.75732190857525e-06, |
| "cov_std": 0.4042964428663254, |
| "entropy": 0.43115234375, |
| "epoch": 0.04, |
| "grad_norm": 0.5397042036056519, |
| "kl": 0.0008138418197631836, |
| "learning_rate": 9.567251964768342e-07, |
| "loss": -0.0246, |
| "reward": 0.6041666716337204, |
| "reward_std": 0.5854796469211578, |
| "rewards/accuracy_reward": 0.2604166753590107, |
| "rewards/format_reward": 0.3437500111758709, |
| "step": 35, |
| "w_high_ratio": 0.05638222396373749, |
| "w_low_ratio": 0.05512247420847416, |
| "w_max": 2.192526876926422, |
| "w_mean": 1.2277101576328278, |
| "w_min": 5.989329002727702e-37, |
| "w_std": 0.2675026059150696 |
| }, |
| { |
| "completion_length": 3587.6354370117188, |
| "cov_mean": -1.1433875897637336e-05, |
| "cov_std": 0.2749627083539963, |
| "entropy": 0.48095703125, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.2379421591758728, |
| "kl": 0.000979304313659668, |
| "learning_rate": 9.521346881455354e-07, |
| "loss": -0.0089, |
| "reward": 0.22916666977107525, |
| "reward_std": 0.3393310159444809, |
| "rewards/accuracy_reward": 0.052083334885537624, |
| "rewards/format_reward": 0.17708334140479565, |
| "step": 36, |
| "w_high_ratio": 0.0930290725082159, |
| "w_low_ratio": 0.03267599269747734, |
| "w_max": 1.6210260689258575, |
| "w_mean": 1.149814784526825, |
| "w_min": 0.25, |
| "w_std": 0.14480087533593178 |
| }, |
| { |
| "completion_length": 3585.6458740234375, |
| "cov_mean": -6.411561662389431e-05, |
| "cov_std": 0.3158421888947487, |
| "entropy": 0.42236328125, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.2652978301048279, |
| "kl": 0.0006819963455200195, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0058, |
| "reward": 0.23958334419876337, |
| "reward_std": 0.20556553453207016, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/format_reward": 0.21875000558793545, |
| "step": 37, |
| "w_high_ratio": 0.054578784853219986, |
| "w_low_ratio": 0.03975548921152949, |
| "w_max": 1.6195516288280487, |
| "w_mean": 1.1414334774017334, |
| "w_min": 0.0, |
| "w_std": 0.17483297176659107 |
| }, |
| { |
| "completion_length": 3557.1563110351562, |
| "cov_mean": 1.9627160781965358e-05, |
| "cov_std": 0.1310347281396389, |
| "entropy": 0.4443359375, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.18166524171829224, |
| "kl": 0.00045049190521240234, |
| "learning_rate": 9.42302986163543e-07, |
| "loss": 0.0055, |
| "reward": 0.2916666716337204, |
| "reward_std": 0.20568452775478363, |
| "rewards/accuracy_reward": 0.1354166716337204, |
| "rewards/format_reward": 0.15625, |
| "step": 38, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.013907193206250668, |
| "w_max": 1.2318150103092194, |
| "w_mean": 1.0593293607234955, |
| "w_min": 0.5, |
| "w_std": 0.0720198005437851 |
| }, |
| { |
| "completion_length": 3032.9688110351562, |
| "cov_mean": 3.0818391678621992e-06, |
| "cov_std": 0.21192274242639542, |
| "entropy": 0.361083984375, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.19133131206035614, |
| "kl": 0.001926124095916748, |
| "learning_rate": 9.370671165529144e-07, |
| "loss": 0.0176, |
| "reward": 0.760416679084301, |
| "reward_std": 0.19024790823459625, |
| "rewards/accuracy_reward": 0.2604166669771075, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 39, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.031908176839351654, |
| "w_max": 1.4641892611980438, |
| "w_mean": 1.1698184311389923, |
| "w_min": 0.5, |
| "w_std": 0.11090587638318539 |
| }, |
| { |
| "completion_length": 3034.4375610351562, |
| "cov_mean": -2.096771822834853e-05, |
| "cov_std": 0.2704497389495373, |
| "entropy": 0.3974609375, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.2450522482395172, |
| "kl": 0.0029773712158203125, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0304, |
| "reward": 0.6145833469927311, |
| "reward_std": 0.3184027150273323, |
| "rewards/accuracy_reward": 0.16666666697710752, |
| "rewards/format_reward": 0.447916679084301, |
| "step": 40, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03790469467639923, |
| "w_max": 1.6503434479236603, |
| "w_mean": 1.2544237673282623, |
| "w_min": 0.0, |
| "w_std": 0.16268039494752884 |
| }, |
| { |
| "completion_length": 3323.0313110351562, |
| "cov_mean": 1.6318189409503248e-05, |
| "cov_std": 0.3728119507431984, |
| "entropy": 0.37890625, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.25028058886528015, |
| "kl": 0.0005707740783691406, |
| "learning_rate": 9.259695151358214e-07, |
| "loss": 0.0052, |
| "reward": 0.4166666716337204, |
| "reward_std": 0.42720501869916916, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 41, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04678645543754101, |
| "w_max": 1.4909087419509888, |
| "w_mean": 1.1125215888023376, |
| "w_min": 0.0, |
| "w_std": 0.20876972749829292 |
| }, |
| { |
| "completion_length": 2951.6458740234375, |
| "cov_mean": 5.0857947826443706e-05, |
| "cov_std": 0.20116684958338737, |
| "entropy": 0.45947265625, |
| "epoch": 0.048, |
| "grad_norm": 0.20272254943847656, |
| "kl": 0.0004711151123046875, |
| "learning_rate": 9.20113792876298e-07, |
| "loss": 0.0143, |
| "reward": 0.40625000558793545, |
| "reward_std": 0.2362503558397293, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.3750000027939677, |
| "step": 42, |
| "w_high_ratio": 0.125, |
| "w_low_ratio": 0.02315131900832057, |
| "w_max": 1.6171163022518158, |
| "w_mean": 1.2283784747123718, |
| "w_min": 0.5, |
| "w_std": 0.10980619117617607 |
| }, |
| { |
| "completion_length": 3174.875, |
| "cov_mean": 4.1157167999017474e-05, |
| "cov_std": 0.3693430423736572, |
| "entropy": 0.396484375, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.2846878170967102, |
| "kl": 0.0015649795532226562, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0363, |
| "reward": 0.5000000111758709, |
| "reward_std": 0.39679284393787384, |
| "rewards/accuracy_reward": 0.17708334140479565, |
| "rewards/format_reward": 0.32291666977107525, |
| "step": 43, |
| "w_high_ratio": 0.0364043265581131, |
| "w_low_ratio": 0.052212903276085854, |
| "w_max": 1.6643747389316559, |
| "w_mean": 1.1576766669750214, |
| "w_min": 0.0, |
| "w_std": 0.22118561156094074 |
| }, |
| { |
| "completion_length": 2870.6771697998047, |
| "cov_mean": -0.00011349500164214987, |
| "cov_std": 0.38516905158758163, |
| "entropy": 0.392578125, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.36889317631721497, |
| "kl": 0.001346588134765625, |
| "learning_rate": 9.078043584226815e-07, |
| "loss": -0.016, |
| "reward": 0.7708333469927311, |
| "reward_std": 0.4629998579621315, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.520833333954215, |
| "step": 44, |
| "w_high_ratio": 0.16294695064425468, |
| "w_low_ratio": 0.04376620473340154, |
| "w_max": 1.860895425081253, |
| "w_mean": 1.262012630701065, |
| "w_min": 0.0, |
| "w_std": 0.2177225835621357 |
| }, |
| { |
| "completion_length": 3653.2084350585938, |
| "cov_mean": 2.4295502612403652e-05, |
| "cov_std": 0.34152911603450775, |
| "entropy": 0.41552734375, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.3034785985946655, |
| "kl": 0.0036156177520751953, |
| "learning_rate": 9.013573120044966e-07, |
| "loss": 0.0183, |
| "reward": 0.4375, |
| "reward_std": 0.47292545437812805, |
| "rewards/accuracy_reward": 0.17708333861082792, |
| "rewards/format_reward": 0.26041667349636555, |
| "step": 45, |
| "w_high_ratio": 0.0399763397872448, |
| "w_low_ratio": 0.03463862743228674, |
| "w_max": 1.478510558605194, |
| "w_mean": 1.0995357930660248, |
| "w_min": 0.0, |
| "w_std": 0.18589595332741737 |
| }, |
| { |
| "completion_length": 3480.4583740234375, |
| "cov_mean": -8.883081477506494e-06, |
| "cov_std": 0.35656186379492283, |
| "entropy": 0.482421875, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.43441376090049744, |
| "kl": 0.0010223388671875, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.1236, |
| "reward": 0.2708333348855376, |
| "reward_std": 0.3359568640589714, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/format_reward": 0.22916667442768812, |
| "step": 46, |
| "w_high_ratio": 0.09412252902984619, |
| "w_low_ratio": 0.039234326453879476, |
| "w_max": 1.7104443907737732, |
| "w_mean": 1.1970912516117096, |
| "w_min": 0.0, |
| "w_std": 0.22640072740614414 |
| }, |
| { |
| "completion_length": 3027.9375610351562, |
| "cov_mean": -2.4718745407881215e-05, |
| "cov_std": 0.26147888600826263, |
| "entropy": 0.38720703125, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.29539671540260315, |
| "kl": 0.0011167526245117188, |
| "learning_rate": 8.878960148416747e-07, |
| "loss": -0.0436, |
| "reward": 0.8125000149011612, |
| "reward_std": 0.4830247238278389, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 47, |
| "w_high_ratio": 0.11265619844198227, |
| "w_low_ratio": 0.028757336549460888, |
| "w_max": 1.9260917007923126, |
| "w_mean": 1.3850333988666534, |
| "w_min": 0.25, |
| "w_std": 0.19246497750282288 |
| }, |
| { |
| "completion_length": 3071.947998046875, |
| "cov_mean": 6.93041113208892e-05, |
| "cov_std": 0.44929926097393036, |
| "entropy": 0.3916015625, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.429691344499588, |
| "kl": 0.0033349990844726562, |
| "learning_rate": 8.808890536269229e-07, |
| "loss": 0.0121, |
| "reward": 0.6250000074505806, |
| "reward_std": 0.5002652183175087, |
| "rewards/accuracy_reward": 0.21875000651925802, |
| "rewards/format_reward": 0.40625001303851604, |
| "step": 48, |
| "w_high_ratio": 0.11854390799999237, |
| "w_low_ratio": 0.049413095228374004, |
| "w_max": 2.040738523006439, |
| "w_mean": 1.2706988453865051, |
| "w_min": 0.0, |
| "w_std": 0.28789742290973663 |
| }, |
| { |
| "completion_length": 2590.0208740234375, |
| "cov_mean": 1.1296036518615438e-05, |
| "cov_std": 0.44136329740285873, |
| "entropy": 0.361328125, |
| "epoch": 0.056, |
| "grad_norm": 0.3813425898551941, |
| "kl": 0.006386756896972656, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0099, |
| "reward": 0.8541667014360428, |
| "reward_std": 0.5753171592950821, |
| "rewards/accuracy_reward": 0.3020833358168602, |
| "rewards/format_reward": 0.5520833507180214, |
| "step": 49, |
| "w_high_ratio": 0.08562466688454151, |
| "w_low_ratio": 0.04912099055945873, |
| "w_max": 2.1155774295330048, |
| "w_mean": 1.313397854566574, |
| "w_min": 1.0880903909928003e-39, |
| "w_std": 0.2910540699958801 |
| }, |
| { |
| "completion_length": 3252.7188110351562, |
| "cov_mean": -5.4081367125036195e-05, |
| "cov_std": 0.2936253622174263, |
| "entropy": 0.341796875, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.19821792840957642, |
| "kl": 0.0009126663208007812, |
| "learning_rate": 8.663414758415478e-07, |
| "loss": 0.0282, |
| "reward": 0.5937500204890966, |
| "reward_std": 0.34640391170978546, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.34375000558793545, |
| "step": 50, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.038133963476866484, |
| "w_max": 1.6234075427055359, |
| "w_mean": 1.1869377791881561, |
| "w_min": 0.25, |
| "w_std": 0.1625902745872736 |
| }, |
| { |
| "completion_length": 2461.135498046875, |
| "cov_mean": 2.296896900588763e-05, |
| "cov_std": 0.34488509595394135, |
| "entropy": 0.43603515625, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.37042999267578125, |
| "kl": 0.0047626495361328125, |
| "learning_rate": 8.588087370409302e-07, |
| "loss": 0.0031, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.3103678971529007, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/format_reward": 0.5416666716337204, |
| "step": 51, |
| "w_high_ratio": 0.012511095963418484, |
| "w_low_ratio": 0.04256652761250734, |
| "w_max": 1.5135074257850647, |
| "w_mean": 1.154930055141449, |
| "w_min": 0.25, |
| "w_std": 0.20846375823020935 |
| }, |
| { |
| "completion_length": 3097.0000610351562, |
| "cov_mean": 1.4503702914225869e-06, |
| "cov_std": 0.42206430435180664, |
| "entropy": 0.39306640625, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.7796747088432312, |
| "kl": 0.0031175613403320312, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": -0.0444, |
| "reward": 0.7083333684131503, |
| "reward_std": 0.5597149804234505, |
| "rewards/accuracy_reward": 0.2812500074505806, |
| "rewards/format_reward": 0.4270833386108279, |
| "step": 52, |
| "w_high_ratio": 0.1064748540520668, |
| "w_low_ratio": 0.0502536753192544, |
| "w_max": 1.7458641231060028, |
| "w_mean": 1.2538467645645142, |
| "w_min": 0.0, |
| "w_std": 0.25843533128499985 |
| }, |
| { |
| "completion_length": 2980.0833740234375, |
| "cov_mean": -1.4053926861379296e-05, |
| "cov_std": 0.5106082037091255, |
| "entropy": 0.4208984375, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.43870407342910767, |
| "kl": 0.0011355876922607422, |
| "learning_rate": 8.432457529696548e-07, |
| "loss": -0.0332, |
| "reward": 0.8229167014360428, |
| "reward_std": 0.6252040863037109, |
| "rewards/accuracy_reward": 0.2812500074505806, |
| "rewards/format_reward": 0.5416666865348816, |
| "step": 53, |
| "w_high_ratio": 0.15575578436255455, |
| "w_low_ratio": 0.04851931845769286, |
| "w_max": 2.0333048701286316, |
| "w_mean": 1.3303064107894897, |
| "w_min": 0.0, |
| "w_std": 0.3132343143224716 |
| }, |
| { |
| "completion_length": 3125.135498046875, |
| "cov_mean": 2.5879786335281096e-05, |
| "cov_std": 0.4214501827955246, |
| "entropy": 0.38623046875, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.41025811433792114, |
| "kl": 0.0010042190551757812, |
| "learning_rate": 8.352239353294194e-07, |
| "loss": -0.0491, |
| "reward": 0.9479167014360428, |
| "reward_std": 0.6265207231044769, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/format_reward": 0.5312500149011612, |
| "step": 54, |
| "w_high_ratio": 0.045562680810689926, |
| "w_low_ratio": 0.05255642905831337, |
| "w_max": 1.7965390384197235, |
| "w_mean": 1.294397234916687, |
| "w_min": 0.0, |
| "w_std": 0.2626797705888748 |
| }, |
| { |
| "completion_length": 3290.5521240234375, |
| "cov_mean": 6.297564050328219e-05, |
| "cov_std": 0.3725521042943001, |
| "entropy": 0.3955078125, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.3483085632324219, |
| "kl": 0.0009459257125854492, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0117, |
| "reward": 0.6979166716337204, |
| "reward_std": 0.5277387201786041, |
| "rewards/accuracy_reward": 0.2604166716337204, |
| "rewards/format_reward": 0.4375000149011612, |
| "step": 55, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04737149551510811, |
| "w_max": 1.5634404420852661, |
| "w_mean": 1.1619892120361328, |
| "w_min": 0.0, |
| "w_std": 0.2019548863172531 |
| }, |
| { |
| "completion_length": 3193.3438110351562, |
| "cov_mean": -6.364414912241045e-05, |
| "cov_std": 0.35638032108545303, |
| "entropy": 0.38623046875, |
| "epoch": 0.064, |
| "grad_norm": 0.22449523210525513, |
| "kl": 0.000545501708984375, |
| "learning_rate": 8.187213662662538e-07, |
| "loss": -0.0369, |
| "reward": 0.6770833432674408, |
| "reward_std": 0.3087990954518318, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/format_reward": 0.4479166865348816, |
| "step": 56, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.038341518957167864, |
| "w_max": 1.6050121486186981, |
| "w_mean": 1.1923900246620178, |
| "w_min": 0.0, |
| "w_std": 0.19538425654172897 |
| }, |
| { |
| "completion_length": 3553.1563110351562, |
| "cov_mean": 2.031017220360809e-05, |
| "cov_std": 0.3057239428162575, |
| "entropy": 0.31396484375, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.1648004949092865, |
| "kl": 0.0003190040588378906, |
| "learning_rate": 8.102495512755938e-07, |
| "loss": 0.0243, |
| "reward": 0.5520833544433117, |
| "reward_std": 0.47310057282447815, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/format_reward": 0.38541668467223644, |
| "step": 57, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04216110520064831, |
| "w_max": 1.4258966147899628, |
| "w_mean": 1.126460313796997, |
| "w_min": 0.0, |
| "w_std": 0.163984976708889 |
| }, |
| { |
| "completion_length": 2358.1771240234375, |
| "cov_mean": 0.00012649836571654305, |
| "cov_std": 0.3942733556032181, |
| "entropy": 0.32470703125, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.3121250867843628, |
| "kl": 0.0030040740966796875, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": -0.0315, |
| "reward": 0.9791667014360428, |
| "reward_std": 0.47231587767601013, |
| "rewards/accuracy_reward": 0.3020833395421505, |
| "rewards/format_reward": 0.6770833358168602, |
| "step": 58, |
| "w_high_ratio": 0.08487696945667267, |
| "w_low_ratio": 0.045225437730550766, |
| "w_max": 1.9617216885089874, |
| "w_mean": 1.3747560679912567, |
| "w_min": 3.898036555555448e-35, |
| "w_std": 0.2643117532134056 |
| }, |
| { |
| "completion_length": 3183.8021850585938, |
| "cov_mean": -4.030091076856479e-06, |
| "cov_std": 0.21410225331783295, |
| "entropy": 0.345703125, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.37277111411094666, |
| "kl": 0.0005140304565429688, |
| "learning_rate": 7.928877960781808e-07, |
| "loss": -0.0033, |
| "reward": 0.5104166977107525, |
| "reward_std": 0.2982303276658058, |
| "rewards/accuracy_reward": 0.17708333861082792, |
| "rewards/format_reward": 0.3333333432674408, |
| "step": 59, |
| "w_high_ratio": 0.1216658167541027, |
| "w_low_ratio": 0.023135079303756356, |
| "w_max": 1.7003150880336761, |
| "w_mean": 1.2053856253623962, |
| "w_min": 0.25, |
| "w_std": 0.15523223765194416 |
| }, |
| { |
| "completion_length": 3174.5000610351562, |
| "cov_mean": -1.2183483704575337e-05, |
| "cov_std": 0.27795324102044106, |
| "entropy": 0.3603515625, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.26638808846473694, |
| "kl": 0.0025773048400878906, |
| "learning_rate": 7.840072575681468e-07, |
| "loss": 0.0243, |
| "reward": 0.5729167014360428, |
| "reward_std": 0.4113100916147232, |
| "rewards/accuracy_reward": 0.14583334140479565, |
| "rewards/format_reward": 0.4270833507180214, |
| "step": 60, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03565680282190442, |
| "w_max": 1.410323053598404, |
| "w_mean": 1.124219536781311, |
| "w_min": 0.0, |
| "w_std": 0.15482920035719872 |
| }, |
| { |
| "completion_length": 3304.791748046875, |
| "cov_mean": 1.00996726359881e-05, |
| "cov_std": 0.37024862319231033, |
| "entropy": 0.3603515625, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.43320998549461365, |
| "kl": 0.0008983612060546875, |
| "learning_rate": 7.75e-07, |
| "loss": -0.0682, |
| "reward": 0.645833358168602, |
| "reward_std": 0.45523863658308983, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/format_reward": 0.4583333432674408, |
| "step": 61, |
| "w_high_ratio": 0.01961296983063221, |
| "w_low_ratio": 0.04602003050968051, |
| "w_max": 1.6873614192008972, |
| "w_mean": 1.205470085144043, |
| "w_min": 0.0, |
| "w_std": 0.22806508466601372 |
| }, |
| { |
| "completion_length": 2739.2500610351562, |
| "cov_mean": 5.323334062268259e-05, |
| "cov_std": 0.4578249305486679, |
| "entropy": 0.31982421875, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.3162655532360077, |
| "kl": 0.002288818359375, |
| "learning_rate": 7.658709009626109e-07, |
| "loss": 0.0286, |
| "reward": 0.916666679084301, |
| "reward_std": 0.6331392228603363, |
| "rewards/accuracy_reward": 0.2812500111758709, |
| "rewards/format_reward": 0.6354166939854622, |
| "step": 62, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.05417798087000847, |
| "w_max": 1.768731027841568, |
| "w_mean": 1.2571848034858704, |
| "w_min": 0.0, |
| "w_std": 0.26054797321558 |
| }, |
| { |
| "completion_length": 2615.3021850585938, |
| "cov_mean": 1.778580372047145e-05, |
| "cov_std": 0.4087640196084976, |
| "entropy": 0.4072265625, |
| "epoch": 0.072, |
| "grad_norm": 0.392840176820755, |
| "kl": 0.0025348663330078125, |
| "learning_rate": 7.566249040241553e-07, |
| "loss": -0.0491, |
| "reward": 1.0104167014360428, |
| "reward_std": 0.5578364282846451, |
| "rewards/accuracy_reward": 0.3333333507180214, |
| "rewards/format_reward": 0.677083358168602, |
| "step": 63, |
| "w_high_ratio": 0.008992652408778667, |
| "w_low_ratio": 0.04948492627590895, |
| "w_max": 1.822005033493042, |
| "w_mean": 1.33000847697258, |
| "w_min": 2.0318827732709848e-44, |
| "w_std": 0.24196847900748253 |
| }, |
| { |
| "completion_length": 3171.354248046875, |
| "cov_mean": -0.00014721620300406357, |
| "cov_std": 0.3767973370850086, |
| "entropy": 0.3984375, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.4929248094558716, |
| "kl": 0.0016901493072509766, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": -0.033, |
| "reward": 0.6875000204890966, |
| "reward_std": 0.5117540434002876, |
| "rewards/accuracy_reward": 0.2708333460614085, |
| "rewards/format_reward": 0.4166666669771075, |
| "step": 64, |
| "w_high_ratio": 0.14000318944454193, |
| "w_low_ratio": 0.03487167996354401, |
| "w_max": 1.7245290279388428, |
| "w_mean": 1.3096586167812347, |
| "w_min": 0.0, |
| "w_std": 0.2602265626192093 |
| }, |
| { |
| "completion_length": 2907.3126220703125, |
| "cov_mean": 3.645536344265565e-05, |
| "cov_std": 0.29790719598531723, |
| "entropy": 0.3271484375, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.32978716492652893, |
| "kl": 0.0017614364624023438, |
| "learning_rate": 7.37802304516818e-07, |
| "loss": -0.0106, |
| "reward": 0.6875000260770321, |
| "reward_std": 0.3368534557521343, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.5208333395421505, |
| "step": 65, |
| "w_high_ratio": 0.01945015788078308, |
| "w_low_ratio": 0.03530450165271759, |
| "w_max": 1.5614324808120728, |
| "w_mean": 1.165216714143753, |
| "w_min": 0.0, |
| "w_std": 0.17453981935977936 |
| }, |
| { |
| "completion_length": 2260.9896087646484, |
| "cov_mean": -8.368766430066898e-06, |
| "cov_std": 0.21803472004830837, |
| "entropy": 0.310791015625, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.49300873279571533, |
| "kl": 0.00205230712890625, |
| "learning_rate": 7.282358947176205e-07, |
| "loss": 0.0218, |
| "reward": 0.8750000055879354, |
| "reward_std": 0.27498848363757133, |
| "rewards/accuracy_reward": 0.3645833348855376, |
| "rewards/format_reward": 0.5104166669771075, |
| "step": 66, |
| "w_high_ratio": 0.0345294363796711, |
| "w_low_ratio": 0.024254919728264213, |
| "w_max": 1.7126049399375916, |
| "w_mean": 1.2293311953544617, |
| "w_min": 1.3662660027166966e-44, |
| "w_std": 0.14490841701626778 |
| }, |
| { |
| "completion_length": 3728.5833740234375, |
| "cov_mean": 5.284800408844603e-06, |
| "cov_std": 0.1643856093287468, |
| "entropy": 0.34765625, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.2334880828857422, |
| "kl": 0.001039743423461914, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0054, |
| "reward": 0.1770833358168602, |
| "reward_std": 0.19108106940984726, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/format_reward": 0.15625, |
| "step": 67, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.02258414216339588, |
| "w_max": 1.245382398366928, |
| "w_mean": 1.0691203624010086, |
| "w_min": 0.5, |
| "w_std": 0.10235420987010002 |
| }, |
| { |
| "completion_length": 2355.072982788086, |
| "cov_mean": -1.9176595742464997e-06, |
| "cov_std": 0.27589889243245125, |
| "entropy": 0.3740234375, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.37152960896492004, |
| "kl": 0.0041484832763671875, |
| "learning_rate": 7.08818754121241e-07, |
| "loss": -0.0423, |
| "reward": 0.8229167014360428, |
| "reward_std": 0.415886752307415, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/format_reward": 0.5937500074505806, |
| "step": 68, |
| "w_high_ratio": 0.012929616495966911, |
| "w_low_ratio": 0.031258232425898314, |
| "w_max": 1.8356628715991974, |
| "w_mean": 1.304297924041748, |
| "w_min": 0.25, |
| "w_std": 0.19349467381834984 |
| }, |
| { |
| "completion_length": 2847.2604370117188, |
| "cov_mean": -4.8151488954317756e-05, |
| "cov_std": 0.28809408843517303, |
| "entropy": 0.4375, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.3872397840023041, |
| "kl": 0.0045623779296875, |
| "learning_rate": 6.989785380482312e-07, |
| "loss": -0.0566, |
| "reward": 0.4479166939854622, |
| "reward_std": 0.33732588589191437, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.4166666865348816, |
| "step": 69, |
| "w_high_ratio": 0.19539788458496332, |
| "w_low_ratio": 0.03220258606597781, |
| "w_max": 2.5382341742515564, |
| "w_mean": 1.4389045536518097, |
| "w_min": 0.0, |
| "w_std": 0.1934449914842844 |
| }, |
| { |
| "completion_length": 3250.4063110351562, |
| "cov_mean": 4.1365366996615194e-05, |
| "cov_std": 0.29582661017775536, |
| "entropy": 0.34326171875, |
| "epoch": 0.08, |
| "grad_norm": 0.23539891839027405, |
| "kl": 0.0022635459899902344, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0138, |
| "reward": 0.6145833684131503, |
| "reward_std": 0.3936128318309784, |
| "rewards/accuracy_reward": 0.1562500037252903, |
| "rewards/format_reward": 0.4583333535119891, |
| "step": 70, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03546257223933935, |
| "w_max": 1.3784515261650085, |
| "w_mean": 1.1171683073043823, |
| "w_min": 0.0, |
| "w_std": 0.1633461881428957 |
| }, |
| { |
| "completion_length": 2797.4063415527344, |
| "cov_mean": 4.543551585811656e-05, |
| "cov_std": 0.2623286135494709, |
| "entropy": 0.37744140625, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 1.3858193159103394, |
| "kl": 0.05230998992919922, |
| "learning_rate": 6.790614547199906e-07, |
| "loss": 0.0548, |
| "reward": 0.5729166772216558, |
| "reward_std": 0.24960162490606308, |
| "rewards/accuracy_reward": 0.1770833358168602, |
| "rewards/format_reward": 0.39583333395421505, |
| "step": 71, |
| "w_high_ratio": 0.13244600966572762, |
| "w_low_ratio": 0.025677886325865984, |
| "w_max": 1.9256412386894226, |
| "w_mean": 1.2497790455818176, |
| "w_min": 0.0, |
| "w_std": 0.19411796145141125 |
| }, |
| { |
| "completion_length": 3362.5313720703125, |
| "cov_mean": 3.213349305042357e-05, |
| "cov_std": 0.4226767495274544, |
| "entropy": 0.486328125, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.4099564254283905, |
| "kl": 0.004467964172363281, |
| "learning_rate": 6.68995372916741e-07, |
| "loss": -0.0646, |
| "reward": 0.416666679084301, |
| "reward_std": 0.4769069701433182, |
| "rewards/accuracy_reward": 0.0520833358168602, |
| "rewards/format_reward": 0.3645833432674408, |
| "step": 72, |
| "w_high_ratio": 0.08334781229496002, |
| "w_low_ratio": 0.05627091834321618, |
| "w_max": 1.7420941889286041, |
| "w_mean": 1.216987669467926, |
| "w_min": 0.0, |
| "w_std": 0.27180150151252747 |
| }, |
| { |
| "completion_length": 3799.3334350585938, |
| "cov_mean": 6.445545591304835e-05, |
| "cov_std": 0.38433366641402245, |
| "entropy": 0.45068359375, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.2338305115699768, |
| "kl": 0.00043392181396484375, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0467, |
| "reward": 0.4062500027939677, |
| "reward_std": 0.48472320288419724, |
| "rewards/accuracy_reward": 0.1770833395421505, |
| "rewards/format_reward": 0.22916667442768812, |
| "step": 73, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.054560547694563866, |
| "w_max": 1.3258715867996216, |
| "w_mean": 1.0365844666957855, |
| "w_min": 2.0397733447937514e-38, |
| "w_std": 0.1829804591834545 |
| }, |
| { |
| "completion_length": 3411.354248046875, |
| "cov_mean": 2.615013909235131e-05, |
| "cov_std": 0.32939745485782623, |
| "entropy": 0.38916015625, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.31615766882896423, |
| "kl": 0.0020766258239746094, |
| "learning_rate": 6.486753808845564e-07, |
| "loss": 0.0447, |
| "reward": 0.5416666716337204, |
| "reward_std": 0.48268113285303116, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/format_reward": 0.2916666716337204, |
| "step": 74, |
| "w_high_ratio": 0.017251405864953995, |
| "w_low_ratio": 0.037632704712450504, |
| "w_max": 1.4793908894062042, |
| "w_mean": 1.120386153459549, |
| "w_min": 0.25, |
| "w_std": 0.18761293590068817 |
| }, |
| { |
| "completion_length": 3256.6876220703125, |
| "cov_mean": 7.561668553535128e-07, |
| "cov_std": 0.39095795527100563, |
| "entropy": 0.36669921875, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.29881325364112854, |
| "kl": 0.0019350051879882812, |
| "learning_rate": 6.384324742897735e-07, |
| "loss": 0.0709, |
| "reward": 0.5625000260770321, |
| "reward_std": 0.3942164406180382, |
| "rewards/accuracy_reward": 0.1979166716337204, |
| "rewards/format_reward": 0.36458334513008595, |
| "step": 75, |
| "w_high_ratio": 0.05663827061653137, |
| "w_low_ratio": 0.04413987882435322, |
| "w_max": 1.78597491979599, |
| "w_mean": 1.1896328330039978, |
| "w_min": 0.0, |
| "w_std": 0.20989646948873997 |
| }, |
| { |
| "completion_length": 3081.6875610351562, |
| "cov_mean": 7.851473583286861e-06, |
| "cov_std": 0.19348382577300072, |
| "entropy": 0.38671875, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.13885051012039185, |
| "kl": 0.0005955696105957031, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0033, |
| "reward": 0.4583333432674408, |
| "reward_std": 0.22215576469898224, |
| "rewards/accuracy_reward": 0.0520833358168602, |
| "rewards/format_reward": 0.4062500074505806, |
| "step": 76, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.024444932583719492, |
| "w_max": 1.385847419500351, |
| "w_mean": 1.104325294494629, |
| "w_min": 0.25, |
| "w_std": 0.09913814999163151 |
| }, |
| { |
| "completion_length": 3410.2084350585938, |
| "cov_mean": 1.7174193999380805e-05, |
| "cov_std": 0.28355711698532104, |
| "entropy": 0.41064453125, |
| "epoch": 0.088, |
| "grad_norm": 0.220694899559021, |
| "kl": 0.0006546974182128906, |
| "learning_rate": 6.178085705122674e-07, |
| "loss": -0.0111, |
| "reward": 0.46875000558793545, |
| "reward_std": 0.2968830242753029, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.34375000558793545, |
| "step": 77, |
| "w_high_ratio": 0.03967808559536934, |
| "w_low_ratio": 0.03201043838635087, |
| "w_max": 1.494775265455246, |
| "w_mean": 1.1476246118545532, |
| "w_min": 0.25, |
| "w_std": 0.1502416580915451 |
| }, |
| { |
| "completion_length": 3454.6771850585938, |
| "cov_mean": 2.8414152438926976e-05, |
| "cov_std": 0.42589128017425537, |
| "entropy": 0.376953125, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.24352695047855377, |
| "kl": 0.0004711151123046875, |
| "learning_rate": 6.074387415372676e-07, |
| "loss": -0.0208, |
| "reward": 0.6354166716337204, |
| "reward_std": 0.5373464152216911, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/format_reward": 0.3854166716337204, |
| "step": 78, |
| "w_high_ratio": 0.040874604135751724, |
| "w_low_ratio": 0.04665249306708574, |
| "w_max": 1.7644164860248566, |
| "w_mean": 1.1853148639202118, |
| "w_min": 1.401298464324817e-45, |
| "w_std": 0.24474802613258362 |
| }, |
| { |
| "completion_length": 2533.354217529297, |
| "cov_mean": 4.250231540936511e-06, |
| "cov_std": 0.28236184269189835, |
| "entropy": 0.322265625, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.2319362461566925, |
| "kl": 0.0014781951904296875, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0229, |
| "reward": 0.8541666865348816, |
| "reward_std": 0.3287995010614395, |
| "rewards/accuracy_reward": 0.23958333861082792, |
| "rewards/format_reward": 0.6145833507180214, |
| "step": 79, |
| "w_high_ratio": 0.16244123131036758, |
| "w_low_ratio": 0.035402802750468254, |
| "w_max": 2.477913051843643, |
| "w_mean": 1.4660456776618958, |
| "w_min": 0.3189111649990082, |
| "w_std": 0.194712957367301 |
| }, |
| { |
| "completion_length": 3480.1250610351562, |
| "cov_mean": -2.1006295810366282e-05, |
| "cov_std": 0.1846322864294052, |
| "entropy": 0.4541015625, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.3083387017250061, |
| "kl": 0.0014166831970214844, |
| "learning_rate": 5.866114036005362e-07, |
| "loss": 0.0178, |
| "reward": 0.6041666893288493, |
| "reward_std": 0.32849549502134323, |
| "rewards/accuracy_reward": 0.2187500111758709, |
| "rewards/format_reward": 0.3854166781529784, |
| "step": 80, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.023656398989260197, |
| "w_max": 1.5105722546577454, |
| "w_mean": 1.109735906124115, |
| "w_min": 0.25, |
| "w_std": 0.10967518202960491 |
| }, |
| { |
| "completion_length": 3358.6979370117188, |
| "cov_mean": 1.4671212284156354e-05, |
| "cov_std": 0.1834326833486557, |
| "entropy": 0.5087890625, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.17795835435390472, |
| "kl": 0.005743980407714844, |
| "learning_rate": 5.761651730097142e-07, |
| "loss": -0.0007, |
| "reward": 0.36458334885537624, |
| "reward_std": 0.28906675428152084, |
| "rewards/accuracy_reward": 0.09375000279396772, |
| "rewards/format_reward": 0.2708333386108279, |
| "step": 81, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.025394567288458347, |
| "w_max": 1.5137894749641418, |
| "w_mean": 1.1780100166797638, |
| "w_min": 0.25, |
| "w_std": 0.09445377439260483 |
| }, |
| { |
| "completion_length": 3041.3438110351562, |
| "cov_mean": -2.3124930976337055e-05, |
| "cov_std": 0.2670583054423332, |
| "entropy": 0.39501953125, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.30143725872039795, |
| "kl": 0.0016946792602539062, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0214, |
| "reward": 0.7604166716337204, |
| "reward_std": 0.34568001329898834, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/format_reward": 0.4479166716337204, |
| "step": 82, |
| "w_high_ratio": 0.04279303178191185, |
| "w_low_ratio": 0.03479792643338442, |
| "w_max": 1.7097257375717163, |
| "w_mean": 1.1838775873184204, |
| "w_min": 0.25, |
| "w_std": 0.16872986778616905 |
| }, |
| { |
| "completion_length": 3054.3646240234375, |
| "cov_mean": -5.689870704372879e-06, |
| "cov_std": 0.17481233924627304, |
| "entropy": 0.43115234375, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.4099189341068268, |
| "kl": 0.0016026496887207031, |
| "learning_rate": 5.552358696106288e-07, |
| "loss": 0.0418, |
| "reward": 0.46875, |
| "reward_std": 0.2636485621333122, |
| "rewards/accuracy_reward": 0.16666666697710752, |
| "rewards/format_reward": 0.30208333395421505, |
| "step": 83, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.01848737057298422, |
| "w_max": 1.5311354398727417, |
| "w_mean": 1.116348922252655, |
| "w_min": 0.5, |
| "w_std": 0.08476846665143967 |
| }, |
| { |
| "completion_length": 3265.4375, |
| "cov_mean": 2.6241104933433235e-06, |
| "cov_std": 0.3729252219200134, |
| "entropy": 0.40966796875, |
| "epoch": 0.096, |
| "grad_norm": 0.3238582909107208, |
| "kl": 0.0008416175842285156, |
| "learning_rate": 5.447641303893714e-07, |
| "loss": 0.0273, |
| "reward": 0.6770833432674408, |
| "reward_std": 0.4903585724532604, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.4270833432674408, |
| "step": 84, |
| "w_high_ratio": 0.01827232539653778, |
| "w_low_ratio": 0.04356031212955713, |
| "w_max": 1.6150497496128082, |
| "w_mean": 1.17939093708992, |
| "w_min": 0.0, |
| "w_std": 0.2226697877049446 |
| }, |
| { |
| "completion_length": 3305.8125, |
| "cov_mean": 5.61498741262767e-05, |
| "cov_std": 0.4452364891767502, |
| "entropy": 0.32861328125, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.29220932722091675, |
| "kl": 0.0005831718444824219, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0609, |
| "reward": 0.6354166865348816, |
| "reward_std": 0.68822330981493, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.447916679084301, |
| "step": 85, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.046569294296205044, |
| "w_max": 1.4473570883274078, |
| "w_mean": 1.1975693106651306, |
| "w_min": 0.0, |
| "w_std": 0.23973201215267181 |
| }, |
| { |
| "completion_length": 3247.2188110351562, |
| "cov_mean": -1.9998861716885585e-05, |
| "cov_std": 0.31147949025034904, |
| "entropy": 0.44189453125, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.5364130735397339, |
| "kl": 0.0019941329956054688, |
| "learning_rate": 5.238348269902859e-07, |
| "loss": -0.0757, |
| "reward": 0.6041666865348816, |
| "reward_std": 0.28646790236234665, |
| "rewards/accuracy_reward": 0.1979166716337204, |
| "rewards/format_reward": 0.40625, |
| "step": 86, |
| "w_high_ratio": 0.1384273413568735, |
| "w_low_ratio": 0.027202811557799578, |
| "w_max": 2.1997461020946503, |
| "w_mean": 1.3846492767333984, |
| "w_min": 0.25, |
| "w_std": 0.2737709581851959 |
| }, |
| { |
| "completion_length": 3057.2916870117188, |
| "cov_mean": -5.754891753895208e-05, |
| "cov_std": 0.5682927817106247, |
| "entropy": 0.4931640625, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.587156355381012, |
| "kl": 0.0027523040771484375, |
| "learning_rate": 5.133885963994639e-07, |
| "loss": -0.0103, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.560588151216507, |
| "rewards/accuracy_reward": 0.21875000279396772, |
| "rewards/format_reward": 0.4895833432674408, |
| "step": 87, |
| "w_high_ratio": 0.15174898132681847, |
| "w_low_ratio": 0.06858384050428867, |
| "w_max": 2.0574756860733032, |
| "w_mean": 1.3842451870441437, |
| "w_min": 0.0, |
| "w_std": 0.38674013316631317 |
| }, |
| { |
| "completion_length": 2906.0209350585938, |
| "cov_mean": 4.286354305804707e-05, |
| "cov_std": 0.5042391121387482, |
| "entropy": 0.4453125, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.44004324078559875, |
| "kl": 0.0045490264892578125, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0208, |
| "reward": 0.9791666865348816, |
| "reward_std": 0.5309341698884964, |
| "rewards/accuracy_reward": 0.3437500074505806, |
| "rewards/format_reward": 0.6354166716337204, |
| "step": 88, |
| "w_high_ratio": 0.17751162499189377, |
| "w_low_ratio": 0.05830758810043335, |
| "w_max": 2.042703092098236, |
| "w_mean": 1.399275004863739, |
| "w_min": 0.25, |
| "w_std": 0.35736314207315445 |
| }, |
| { |
| "completion_length": 3425.3333740234375, |
| "cov_mean": -2.9065696480756742e-05, |
| "cov_std": 0.3547215014696121, |
| "entropy": 0.39208984375, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.41978099942207336, |
| "kl": 0.0021228790283203125, |
| "learning_rate": 4.925612584627324e-07, |
| "loss": -0.0597, |
| "reward": 0.5625000223517418, |
| "reward_std": 0.5216004773974419, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.354166679084301, |
| "step": 89, |
| "w_high_ratio": 0.017988620325922966, |
| "w_low_ratio": 0.042721704579889774, |
| "w_max": 1.6596693396568298, |
| "w_mean": 1.1673058569431305, |
| "w_min": 0.0, |
| "w_std": 0.22802076116204262 |
| }, |
| { |
| "completion_length": 2660.4375915527344, |
| "cov_mean": 9.994783340516733e-06, |
| "cov_std": 0.2509019151329994, |
| "entropy": 0.546875, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.7049030661582947, |
| "kl": 0.0055828094482421875, |
| "learning_rate": 4.821914294877326e-07, |
| "loss": -0.0314, |
| "reward": 0.48958335630595684, |
| "reward_std": 0.24196770787239075, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/format_reward": 0.4687500176951289, |
| "step": 90, |
| "w_high_ratio": 0.09902577847242355, |
| "w_low_ratio": 0.025217004818841815, |
| "w_max": 2.4860771000385284, |
| "w_mean": 1.4202724397182465, |
| "w_min": 0.0, |
| "w_std": 0.18205549381673336 |
| }, |
| { |
| "completion_length": 3260.8334350585938, |
| "cov_mean": -1.9221572074457072e-05, |
| "cov_std": 0.2735915407538414, |
| "entropy": 0.41845703125, |
| "epoch": 0.104, |
| "grad_norm": 0.2550157308578491, |
| "kl": 0.0021266937255859375, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0421, |
| "reward": 0.5312500316649675, |
| "reward_std": 0.3842952623963356, |
| "rewards/accuracy_reward": 0.14583333674818277, |
| "rewards/format_reward": 0.385416672565043, |
| "step": 91, |
| "w_high_ratio": 0.042984794825315475, |
| "w_low_ratio": 0.027249778620898724, |
| "w_max": 1.7297326922416687, |
| "w_mean": 1.2372649610042572, |
| "w_min": 0.0, |
| "w_std": 0.1690264195203781 |
| }, |
| { |
| "completion_length": 2809.573028564453, |
| "cov_mean": -4.66689198219683e-06, |
| "cov_std": 0.41415752843022346, |
| "entropy": 0.34033203125, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.3232761323451996, |
| "kl": 0.0012707710266113281, |
| "learning_rate": 4.6156752571022637e-07, |
| "loss": -0.0264, |
| "reward": 0.7083333432674408, |
| "reward_std": 0.48402564972639084, |
| "rewards/accuracy_reward": 0.16666667722165585, |
| "rewards/format_reward": 0.5416666939854622, |
| "step": 92, |
| "w_high_ratio": 0.0625, |
| "w_low_ratio": 0.04879668727517128, |
| "w_max": 1.7188981473445892, |
| "w_mean": 1.264578402042389, |
| "w_min": 1.0121512603752466e-34, |
| "w_std": 0.23184099607169628 |
| }, |
| { |
| "completion_length": 3819.1458740234375, |
| "cov_mean": -4.416617707647674e-05, |
| "cov_std": 0.18136212974786758, |
| "entropy": 0.533203125, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.2244294136762619, |
| "kl": 0.002910614013671875, |
| "learning_rate": 4.513246191154434e-07, |
| "loss": 0.0303, |
| "reward": 0.0937500037252903, |
| "reward_std": 0.1851910501718521, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/format_reward": 0.08333333674818277, |
| "step": 93, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.025186134036630392, |
| "w_max": 1.3321870565414429, |
| "w_mean": 1.0297031998634338, |
| "w_min": 0.25, |
| "w_std": 0.09247609600424767 |
| }, |
| { |
| "completion_length": 3216.4376220703125, |
| "cov_mean": -2.7046048671763856e-05, |
| "cov_std": 0.25123433768749237, |
| "entropy": 0.5078125, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.2843908965587616, |
| "kl": 0.0036516189575195312, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0352, |
| "reward": 0.47916667722165585, |
| "reward_std": 0.28607168793678284, |
| "rewards/accuracy_reward": 0.17708333395421505, |
| "rewards/format_reward": 0.30208334140479565, |
| "step": 94, |
| "w_high_ratio": 0.04015461727976799, |
| "w_low_ratio": 0.02874834556132555, |
| "w_max": 1.7613461911678314, |
| "w_mean": 1.1813118755817413, |
| "w_min": 0.25, |
| "w_std": 0.16030845791101456 |
| }, |
| { |
| "completion_length": 3702.510498046875, |
| "cov_mean": 3.147694224026054e-05, |
| "cov_std": 0.3758469521999359, |
| "entropy": 0.4091796875, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.2823449671268463, |
| "kl": 0.0005347728729248047, |
| "learning_rate": 4.3100462708325914e-07, |
| "loss": 0.0264, |
| "reward": 0.416666679084301, |
| "reward_std": 0.5112503468990326, |
| "rewards/accuracy_reward": 0.11458333488553762, |
| "rewards/format_reward": 0.3020833432674408, |
| "step": 95, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.0470161447301507, |
| "w_max": 1.2909899652004242, |
| "w_mean": 1.0598113238811493, |
| "w_min": 0.25, |
| "w_std": 0.19865867495536804 |
| }, |
| { |
| "completion_length": 3136.7188110351562, |
| "cov_mean": 2.809584930218989e-05, |
| "cov_std": 0.38921112939715385, |
| "entropy": 0.41259765625, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.4165164828300476, |
| "kl": 0.004611968994140625, |
| "learning_rate": 4.209385452800095e-07, |
| "loss": -0.0197, |
| "reward": 0.677083358168602, |
| "reward_std": 0.4686770662665367, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/format_reward": 0.4270833432674408, |
| "step": 96, |
| "w_high_ratio": 0.1767394095659256, |
| "w_low_ratio": 0.046211169101297855, |
| "w_max": 2.1192705631256104, |
| "w_mean": 1.334891527891159, |
| "w_min": 2.0274371819188836e-36, |
| "w_std": 0.27778077498078346 |
| }, |
| { |
| "completion_length": 3338.3958740234375, |
| "cov_mean": 4.9627053158474155e-05, |
| "cov_std": 0.3888591527938843, |
| "entropy": 0.41650390625, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.41793736815452576, |
| "kl": 0.0014972686767578125, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": -0.0162, |
| "reward": 0.5520833563059568, |
| "reward_std": 0.5027666687965393, |
| "rewards/accuracy_reward": 0.20833334419876337, |
| "rewards/format_reward": 0.3437500176951289, |
| "step": 97, |
| "w_high_ratio": 0.036094631999731064, |
| "w_low_ratio": 0.03847068129107356, |
| "w_max": 1.7370452582836151, |
| "w_mean": 1.2170022130012512, |
| "w_min": 0.0, |
| "w_std": 0.2471493650227785 |
| }, |
| { |
| "completion_length": 3080.9584350585938, |
| "cov_mean": -3.7720649288530694e-05, |
| "cov_std": 0.35732631012797356, |
| "entropy": 0.37109375, |
| "epoch": 0.112, |
| "grad_norm": 0.4429682493209839, |
| "kl": 0.0007581710815429688, |
| "learning_rate": 4.0102146195176887e-07, |
| "loss": -0.087, |
| "reward": 0.614583358168602, |
| "reward_std": 0.3441091701388359, |
| "rewards/accuracy_reward": 0.15625000558793545, |
| "rewards/format_reward": 0.4583333507180214, |
| "step": 98, |
| "w_high_ratio": 0.07888161391019821, |
| "w_low_ratio": 0.0333328228443861, |
| "w_max": 1.875957041978836, |
| "w_mean": 1.2836172580718994, |
| "w_min": 0.0, |
| "w_std": 0.24231520667672157 |
| }, |
| { |
| "completion_length": 3051.885467529297, |
| "cov_mean": -8.294958661281271e-06, |
| "cov_std": 0.13039003312587738, |
| "entropy": 0.342529296875, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.09671887010335922, |
| "kl": 0.0012235641479492188, |
| "learning_rate": 3.911812458787591e-07, |
| "loss": 0.015, |
| "reward": 0.5520833395421505, |
| "reward_std": 0.22183798253536224, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/format_reward": 0.34375000558793545, |
| "step": 99, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.016014322638511658, |
| "w_max": 1.2210949659347534, |
| "w_mean": 1.0700498223304749, |
| "w_min": 0.5234909653663635, |
| "w_std": 0.06671209260821342 |
| }, |
| { |
| "completion_length": 2941.4271850585938, |
| "cov_mean": 3.641827424871735e-05, |
| "cov_std": 0.311247356235981, |
| "entropy": 0.352783203125, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.40575236082077026, |
| "kl": 0.0022411346435546875, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": -0.0488, |
| "reward": 0.8229167088866234, |
| "reward_std": 0.4384620487689972, |
| "rewards/accuracy_reward": 0.3020833507180214, |
| "rewards/format_reward": 0.5208333507180214, |
| "step": 100, |
| "w_high_ratio": 0.057203881442546844, |
| "w_low_ratio": 0.037438319995999336, |
| "w_max": 2.0794378519058228, |
| "w_mean": 1.3217148184776306, |
| "w_min": 0.0, |
| "w_std": 0.19731487706303596 |
| }, |
| { |
| "completion_length": 3064.322967529297, |
| "cov_mean": -0.00016853955486340055, |
| "cov_std": 0.3906340226531029, |
| "entropy": 0.49560546875, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.3836102783679962, |
| "kl": 0.0015306472778320312, |
| "learning_rate": 3.7176410528237945e-07, |
| "loss": 0.0071, |
| "reward": 0.6041666716337204, |
| "reward_std": 0.37709444761276245, |
| "rewards/accuracy_reward": 0.20833334419876337, |
| "rewards/format_reward": 0.3958333469927311, |
| "step": 101, |
| "w_high_ratio": 0.039534129202365875, |
| "w_low_ratio": 0.04537520185112953, |
| "w_max": 1.8732931017875671, |
| "w_mean": 1.2176434397697449, |
| "w_min": 1.2642545573704715e-38, |
| "w_std": 0.2533961348235607 |
| }, |
| { |
| "completion_length": 2447.7188415527344, |
| "cov_mean": 6.931000660870268e-05, |
| "cov_std": 0.49404649436473846, |
| "entropy": 0.41943359375, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.8746929168701172, |
| "kl": 0.0036649703979492188, |
| "learning_rate": 3.62197695483182e-07, |
| "loss": -0.1189, |
| "reward": 0.8541666939854622, |
| "reward_std": 0.36090725660324097, |
| "rewards/accuracy_reward": 0.15625000279396772, |
| "rewards/format_reward": 0.6979166865348816, |
| "step": 102, |
| "w_high_ratio": 0.21037982031702995, |
| "w_low_ratio": 0.04308201279491186, |
| "w_max": 3.2181393206119537, |
| "w_mean": 1.75843146443367, |
| "w_min": 0.0, |
| "w_std": 0.38950372859835625 |
| }, |
| { |
| "completion_length": 2925.8646850585938, |
| "cov_mean": 7.4649460657383315e-06, |
| "cov_std": 0.41748112440109253, |
| "entropy": 0.384765625, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.6049470901489258, |
| "kl": 0.0027923583984375, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": -0.0624, |
| "reward": 0.8125000298023224, |
| "reward_std": 0.4502665400505066, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.5937500149011612, |
| "step": 103, |
| "w_high_ratio": 0.21904528141021729, |
| "w_low_ratio": 0.04505133908241987, |
| "w_max": 2.339956372976303, |
| "w_mean": 1.4152106940746307, |
| "w_min": 0.0, |
| "w_std": 0.29690178483724594 |
| }, |
| { |
| "completion_length": 3003.3230590820312, |
| "cov_mean": -3.357986315677408e-07, |
| "cov_std": 0.30858776718378067, |
| "entropy": 0.4453125, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 1.4959157705307007, |
| "kl": 0.016025543212890625, |
| "learning_rate": 3.433750959758446e-07, |
| "loss": -0.0262, |
| "reward": 0.5833333432674408, |
| "reward_std": 0.418441042304039, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.4166666828095913, |
| "step": 104, |
| "w_high_ratio": 0.11394603550434113, |
| "w_low_ratio": 0.0399134736508131, |
| "w_max": 1.7974587678909302, |
| "w_mean": 1.2668271660804749, |
| "w_min": 0.25, |
| "w_std": 0.18237562477588654 |
| }, |
| { |
| "completion_length": 2974.947998046875, |
| "cov_mean": -6.136376214271877e-06, |
| "cov_std": 0.4218733385205269, |
| "entropy": 0.38671875, |
| "epoch": 0.12, |
| "grad_norm": 0.5280556678771973, |
| "kl": 0.004073143005371094, |
| "learning_rate": 3.3412909903738936e-07, |
| "loss": -0.0267, |
| "reward": 0.8020833535119891, |
| "reward_std": 0.6002020314335823, |
| "rewards/accuracy_reward": 0.322916679084301, |
| "rewards/format_reward": 0.4791666818782687, |
| "step": 105, |
| "w_high_ratio": 0.07510412856936455, |
| "w_low_ratio": 0.05238847387954593, |
| "w_max": 2.132063180208206, |
| "w_mean": 1.2634376883506775, |
| "w_min": 0.0, |
| "w_std": 0.2662976738065481 |
| }, |
| { |
| "completion_length": 2262.4375915527344, |
| "cov_mean": 7.913076842669398e-07, |
| "cov_std": 0.2457173652946949, |
| "entropy": 0.291748046875, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.2537207007408142, |
| "kl": 0.00292205810546875, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0557, |
| "reward": 1.0833333535119891, |
| "reward_std": 0.31829095631837845, |
| "rewards/accuracy_reward": 0.4270833395421505, |
| "rewards/format_reward": 0.6562500251457095, |
| "step": 106, |
| "w_high_ratio": 0.05819880962371826, |
| "w_low_ratio": 0.029597220942378044, |
| "w_max": 2.0277227461338043, |
| "w_mean": 1.3821330666542053, |
| "w_min": 0.0, |
| "w_std": 0.1694270297884941 |
| }, |
| { |
| "completion_length": 3115.3021240234375, |
| "cov_mean": -5.469087955134455e-05, |
| "cov_std": 0.3848882205784321, |
| "entropy": 0.5244140625, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.38348808884620667, |
| "kl": 0.0030164718627929688, |
| "learning_rate": 3.159927424318531e-07, |
| "loss": -0.0623, |
| "reward": 0.6250000251457095, |
| "reward_std": 0.3455836847424507, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.4583333535119891, |
| "step": 107, |
| "w_high_ratio": 0.16982319951057434, |
| "w_low_ratio": 0.04572468576952815, |
| "w_max": 1.996431291103363, |
| "w_mean": 1.4110250174999237, |
| "w_min": 0.0, |
| "w_std": 0.2217676378786564 |
| }, |
| { |
| "completion_length": 3326.885498046875, |
| "cov_mean": -3.166737906212802e-05, |
| "cov_std": 0.34976962953805923, |
| "entropy": 0.42529296875, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.4025486409664154, |
| "kl": 0.0019197463989257812, |
| "learning_rate": 3.0711220392181934e-07, |
| "loss": 0.0339, |
| "reward": 0.5104166716337204, |
| "reward_std": 0.45747723430395126, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/format_reward": 0.3229166716337204, |
| "step": 108, |
| "w_high_ratio": 0.05910671502351761, |
| "w_low_ratio": 0.04742087051272392, |
| "w_max": 1.6517033874988556, |
| "w_mean": 1.2254261672496796, |
| "w_min": 0.25, |
| "w_std": 0.23562095686793327 |
| }, |
| { |
| "completion_length": 3175.3541870117188, |
| "cov_mean": 7.027760148048401e-06, |
| "cov_std": 0.17652258835732937, |
| "entropy": 0.388671875, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.47681912779808044, |
| "kl": 0.0006990432739257812, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": -0.0293, |
| "reward": 0.4895833460614085, |
| "reward_std": 0.20087094604969025, |
| "rewards/accuracy_reward": 0.12500000279396772, |
| "rewards/format_reward": 0.3645833386108279, |
| "step": 109, |
| "w_high_ratio": 0.11448103934526443, |
| "w_low_ratio": 0.018031115527264774, |
| "w_max": 1.5564889311790466, |
| "w_mean": 1.2944257855415344, |
| "w_min": 0.25, |
| "w_std": 0.10801565833389759 |
| }, |
| { |
| "completion_length": 2903.9375610351562, |
| "cov_mean": 2.2163485482451506e-05, |
| "cov_std": 0.3046695999801159, |
| "entropy": 0.37548828125, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.574763298034668, |
| "kl": 0.0011281967163085938, |
| "learning_rate": 2.897504487244061e-07, |
| "loss": 0.0666, |
| "reward": 0.6458333656191826, |
| "reward_std": 0.4784049317240715, |
| "rewards/accuracy_reward": 0.16666667256504297, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 110, |
| "w_high_ratio": 0.11397865414619446, |
| "w_low_ratio": 0.036777073866687715, |
| "w_max": 1.7808756828308105, |
| "w_mean": 1.2882322669029236, |
| "w_min": 0.0, |
| "w_std": 0.17456290312111378 |
| }, |
| { |
| "completion_length": 3608.7813110351562, |
| "cov_mean": 2.5331736196676502e-05, |
| "cov_std": 0.3403046578168869, |
| "entropy": 0.54296875, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.28652071952819824, |
| "kl": 0.0020961761474609375, |
| "learning_rate": 2.812786337337463e-07, |
| "loss": -0.0593, |
| "reward": 0.4479166716337204, |
| "reward_std": 0.5819729715585709, |
| "rewards/accuracy_reward": 0.19791667442768812, |
| "rewards/format_reward": 0.25000000558793545, |
| "step": 111, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.040594917722046375, |
| "w_max": 1.5843260884284973, |
| "w_mean": 1.1159851551055908, |
| "w_min": 0.0, |
| "w_std": 0.1932711023837328 |
| }, |
| { |
| "completion_length": 3511.9791870117188, |
| "cov_mean": -8.262183837359771e-05, |
| "cov_std": 0.3419903479516506, |
| "entropy": 0.466796875, |
| "epoch": 0.128, |
| "grad_norm": 0.21528662741184235, |
| "kl": 0.0010385513305664062, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0036, |
| "reward": 0.7395833432674408, |
| "reward_std": 0.4909324310719967, |
| "rewards/accuracy_reward": 0.3229166716337204, |
| "rewards/format_reward": 0.4166666716337204, |
| "step": 112, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.039533226285129786, |
| "w_max": 1.4701470732688904, |
| "w_mean": 1.1230643689632416, |
| "w_min": 2.796174871268219e-27, |
| "w_std": 0.17366146482527256 |
| }, |
| { |
| "completion_length": 2989.6250610351562, |
| "cov_mean": -9.21895634746761e-05, |
| "cov_std": 0.4826783090829849, |
| "entropy": 0.47412109375, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.48934122920036316, |
| "kl": 0.0016803741455078125, |
| "learning_rate": 2.6477606467058035e-07, |
| "loss": -0.0453, |
| "reward": 0.739583358168602, |
| "reward_std": 0.46098607778549194, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/format_reward": 0.510416679084301, |
| "step": 113, |
| "w_high_ratio": 0.05894821137189865, |
| "w_low_ratio": 0.05880188010632992, |
| "w_max": 2.289375811815262, |
| "w_mean": 1.3520435392856598, |
| "w_min": 0.0, |
| "w_std": 0.3024050109088421 |
| }, |
| { |
| "completion_length": 2825.6771850585938, |
| "cov_mean": 8.050216638366692e-05, |
| "cov_std": 0.36271025612950325, |
| "entropy": 0.38232421875, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.3449488580226898, |
| "kl": 0.006592750549316406, |
| "learning_rate": 2.567542470303452e-07, |
| "loss": -0.0924, |
| "reward": 0.6250000298023224, |
| "reward_std": 0.24164991825819016, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.5625000149011612, |
| "step": 114, |
| "w_high_ratio": 0.04218151792883873, |
| "w_low_ratio": 0.03556477651000023, |
| "w_max": 1.9164948165416718, |
| "w_mean": 1.3027912080287933, |
| "w_min": 0.25, |
| "w_std": 0.2133668176829815 |
| }, |
| { |
| "completion_length": 3166.1146240234375, |
| "cov_mean": 5.38662197868689e-06, |
| "cov_std": 0.25203782320022583, |
| "entropy": 0.43505859375, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.26439687609672546, |
| "kl": 0.0024480819702148438, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": -0.0066, |
| "reward": 0.5937500298023224, |
| "reward_std": 0.28200745210051537, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/format_reward": 0.3229166716337204, |
| "step": 115, |
| "w_high_ratio": 0.10065623372793198, |
| "w_low_ratio": 0.02666568197309971, |
| "w_max": 1.5669940114021301, |
| "w_mean": 1.224227637052536, |
| "w_min": 0.25, |
| "w_std": 0.1499568410217762 |
| }, |
| { |
| "completion_length": 3656.2604370117188, |
| "cov_mean": -1.988131225516554e-05, |
| "cov_std": 0.35770974680781364, |
| "entropy": 0.5078125, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.29910722374916077, |
| "kl": 0.0016193389892578125, |
| "learning_rate": 2.411912629590699e-07, |
| "loss": 0.0044, |
| "reward": 0.2812500027939677, |
| "reward_std": 0.3736678585410118, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/format_reward": 0.19791667442768812, |
| "step": 116, |
| "w_high_ratio": 0.055632032454013824, |
| "w_low_ratio": 0.04639715701341629, |
| "w_max": 1.5884924530982971, |
| "w_mean": 1.1101520657539368, |
| "w_min": 0.25, |
| "w_std": 0.2023993842303753 |
| }, |
| { |
| "completion_length": 3502.0833740234375, |
| "cov_mean": 1.3217656032793457e-05, |
| "cov_std": 0.24507246166467667, |
| "entropy": 0.48486328125, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.19559521973133087, |
| "kl": 0.0014400482177734375, |
| "learning_rate": 2.336585241584522e-07, |
| "loss": 0.0266, |
| "reward": 0.31250000558793545, |
| "reward_std": 0.2934442162513733, |
| "rewards/accuracy_reward": 0.041666666977107525, |
| "rewards/format_reward": 0.2708333386108279, |
| "step": 117, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03416412137448788, |
| "w_max": 1.3934015035629272, |
| "w_mean": 1.121408373117447, |
| "w_min": 0.25, |
| "w_std": 0.12930788472294807 |
| }, |
| { |
| "completion_length": 3388.2291870117188, |
| "cov_mean": 8.877824029696058e-05, |
| "cov_std": 0.4343739002943039, |
| "entropy": 0.39794921875, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.2352270781993866, |
| "kl": 0.001239776611328125, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": -0.001, |
| "reward": 0.8750000596046448, |
| "reward_std": 0.7063143625855446, |
| "rewards/accuracy_reward": 0.4166666865348816, |
| "rewards/format_reward": 0.458333358168602, |
| "step": 118, |
| "w_high_ratio": 0.025391947478055954, |
| "w_low_ratio": 0.0545379314571619, |
| "w_max": 1.6127934455871582, |
| "w_mean": 1.1610458493232727, |
| "w_min": 0.0, |
| "w_std": 0.23880053497850895 |
| }, |
| { |
| "completion_length": 2543.125030517578, |
| "cov_mean": -5.281608531504389e-05, |
| "cov_std": 0.3761717230081558, |
| "entropy": 0.46533203125, |
| "epoch": 0.136, |
| "grad_norm": 0.4445283114910126, |
| "kl": 0.0053043365478515625, |
| "learning_rate": 2.1911094637307714e-07, |
| "loss": -0.0188, |
| "reward": 0.7812500298023224, |
| "reward_std": 0.3462969809770584, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/format_reward": 0.5729166865348816, |
| "step": 119, |
| "w_high_ratio": 0.14202763978391886, |
| "w_low_ratio": 0.04364374093711376, |
| "w_max": 2.151145786046982, |
| "w_mean": 1.4133342802524567, |
| "w_min": 0.0, |
| "w_std": 0.27028138749301434 |
| }, |
| { |
| "completion_length": 2731.8021240234375, |
| "cov_mean": -1.2950695236213505e-06, |
| "cov_std": 0.40660279989242554, |
| "entropy": 0.45849609375, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.44800451397895813, |
| "kl": 0.0019931793212890625, |
| "learning_rate": 2.1210398515832536e-07, |
| "loss": 0.0072, |
| "reward": 0.6770833432674408, |
| "reward_std": 0.37227439880371094, |
| "rewards/accuracy_reward": 0.17708333395421505, |
| "rewards/format_reward": 0.5000000074505806, |
| "step": 120, |
| "w_high_ratio": 0.06295246630907059, |
| "w_low_ratio": 0.051405247300863266, |
| "w_max": 1.8777723908424377, |
| "w_mean": 1.2852334678173065, |
| "w_min": 0.0, |
| "w_std": 0.24544718861579895 |
| }, |
| { |
| "completion_length": 1963.3125610351562, |
| "cov_mean": -5.540308461604582e-05, |
| "cov_std": 0.36702772229909897, |
| "entropy": 0.38525390625, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.5723959803581238, |
| "kl": 0.00885772705078125, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": -0.076, |
| "reward": 1.0104167014360428, |
| "reward_std": 0.406433891505003, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/format_reward": 0.7812500149011612, |
| "step": 121, |
| "w_high_ratio": 0.25104042887687683, |
| "w_low_ratio": 0.04047479620203376, |
| "w_max": 2.346793830394745, |
| "w_mean": 1.5671572387218475, |
| "w_min": 0.0, |
| "w_std": 0.25984594970941544 |
| }, |
| { |
| "completion_length": 3143.0208740234375, |
| "cov_mean": 1.6946690038821544e-05, |
| "cov_std": 0.2671542540192604, |
| "entropy": 0.453125, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.1998731791973114, |
| "kl": 0.0013976097106933594, |
| "learning_rate": 1.986426879955034e-07, |
| "loss": -0.0537, |
| "reward": 0.7812500149011612, |
| "reward_std": 0.4281647428870201, |
| "rewards/accuracy_reward": 0.3020833469927311, |
| "rewards/format_reward": 0.479166679084301, |
| "step": 122, |
| "w_high_ratio": 0.01932075247168541, |
| "w_low_ratio": 0.030062017496675253, |
| "w_max": 1.5524874925613403, |
| "w_mean": 1.134686678647995, |
| "w_min": 0.0, |
| "w_std": 0.1492646411061287 |
| }, |
| { |
| "completion_length": 3167.3959350585938, |
| "cov_mean": -5.797519952466246e-05, |
| "cov_std": 0.44271689653396606, |
| "entropy": 0.4228515625, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.32559123635292053, |
| "kl": 0.001354217529296875, |
| "learning_rate": 1.9219564157731844e-07, |
| "loss": -0.0282, |
| "reward": 0.6562500260770321, |
| "reward_std": 0.49003005772829056, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/format_reward": 0.4687500111758709, |
| "step": 123, |
| "w_high_ratio": 0.03850603708997369, |
| "w_low_ratio": 0.054318103939294815, |
| "w_max": 1.9482422471046448, |
| "w_mean": 1.2491904497146606, |
| "w_min": 0.0, |
| "w_std": 0.273440919816494 |
| }, |
| { |
| "completion_length": 2679.291748046875, |
| "cov_mean": 4.984476254321635e-05, |
| "cov_std": 0.3510932922363281, |
| "entropy": 0.32421875, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.5072447061538696, |
| "kl": 0.0061626434326171875, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0459, |
| "reward": 0.8645833730697632, |
| "reward_std": 0.44617248326539993, |
| "rewards/accuracy_reward": 0.29166667722165585, |
| "rewards/format_reward": 0.5729166865348816, |
| "step": 124, |
| "w_high_ratio": 0.0684627341106534, |
| "w_low_ratio": 0.03564309095963836, |
| "w_max": 2.0593042075634003, |
| "w_mean": 1.2975650131702423, |
| "w_min": 0.0, |
| "w_std": 0.22546635568141937 |
| }, |
| { |
| "completion_length": 2886.1666870117188, |
| "cov_mean": -3.579185113267158e-05, |
| "cov_std": 0.20644951611757278, |
| "entropy": 0.365478515625, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 1.0346649885177612, |
| "kl": 0.001129150390625, |
| "learning_rate": 1.7988620712370195e-07, |
| "loss": -0.0434, |
| "reward": 0.6041666772216558, |
| "reward_std": 0.16948115080595016, |
| "rewards/accuracy_reward": 0.23958333861082792, |
| "rewards/format_reward": 0.3645833386108279, |
| "step": 125, |
| "w_high_ratio": 0.10017836093902588, |
| "w_low_ratio": 0.013640805147588253, |
| "w_max": 1.8374318480491638, |
| "w_mean": 1.238489419221878, |
| "w_min": 0.25, |
| "w_std": 0.1324586421251297 |
| }, |
| { |
| "completion_length": 3072.1355590820312, |
| "cov_mean": -7.128902507247403e-05, |
| "cov_std": 0.268012635409832, |
| "entropy": 0.42138671875, |
| "epoch": 0.144, |
| "grad_norm": 0.24976088106632233, |
| "kl": 0.001007080078125, |
| "learning_rate": 1.7403048486417868e-07, |
| "loss": -0.003, |
| "reward": 0.7604166939854622, |
| "reward_std": 0.3946729302406311, |
| "rewards/accuracy_reward": 0.2395833395421505, |
| "rewards/format_reward": 0.5208333432674408, |
| "step": 126, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.02901885099709034, |
| "w_max": 1.5019842684268951, |
| "w_mean": 1.154728651046753, |
| "w_min": 0.25, |
| "w_std": 0.13917932659387589 |
| }, |
| { |
| "completion_length": 3639.2084350585938, |
| "cov_mean": -4.922010657537612e-05, |
| "cov_std": 0.40233808010816574, |
| "entropy": 0.4443359375, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.2971266508102417, |
| "kl": 0.001190185546875, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0023, |
| "reward": 0.291666679084301, |
| "reward_std": 0.3814757987856865, |
| "rewards/accuracy_reward": 0.031250000931322575, |
| "rewards/format_reward": 0.2604166716337204, |
| "step": 127, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.048589578829705715, |
| "w_max": 1.546887069940567, |
| "w_mean": 1.0961028933525085, |
| "w_min": 0.0, |
| "w_std": 0.21367743983864784 |
| }, |
| { |
| "completion_length": 3072.729248046875, |
| "cov_mean": 2.080060630760272e-05, |
| "cov_std": 0.1757410392165184, |
| "entropy": 0.4794921875, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.20411358773708344, |
| "kl": 0.0022602081298828125, |
| "learning_rate": 1.6293288344708566e-07, |
| "loss": -0.0426, |
| "reward": 0.8854166865348816, |
| "reward_std": 0.23283471912145615, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/format_reward": 0.46875, |
| "step": 128, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.023187558632344007, |
| "w_max": 1.2965095043182373, |
| "w_mean": 1.0975826382637024, |
| "w_min": 0.5, |
| "w_std": 0.09349002316594124 |
| }, |
| { |
| "completion_length": 3820.5000610351562, |
| "cov_mean": -7.538520003436133e-05, |
| "cov_std": 0.28777727484703064, |
| "entropy": 0.4658203125, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.15165992081165314, |
| "kl": 0.0022630691528320312, |
| "learning_rate": 1.5769701383645698e-07, |
| "loss": 0.0248, |
| "reward": 0.3125, |
| "reward_std": 0.40841156244277954, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.1875, |
| "step": 129, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.03958333469927311, |
| "w_max": 1.3145957589149475, |
| "w_mean": 1.0334198474884033, |
| "w_min": 0.25, |
| "w_std": 0.15427661687135696 |
| }, |
| { |
| "completion_length": 3234.9375610351562, |
| "cov_mean": -4.075149445270654e-05, |
| "cov_std": 0.40202101692557335, |
| "entropy": 0.4580078125, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.5406985282897949, |
| "kl": 0.0025501251220703125, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0054, |
| "reward": 0.43750001676380634, |
| "reward_std": 0.48847879469394684, |
| "rewards/accuracy_reward": 0.13541666697710752, |
| "rewards/format_reward": 0.30208334140479565, |
| "step": 130, |
| "w_high_ratio": 0.1068541444838047, |
| "w_low_ratio": 0.04362851567566395, |
| "w_max": 2.2782379388809204, |
| "w_mean": 1.2775286734104156, |
| "w_min": 0.0, |
| "w_std": 0.27442070841789246 |
| }, |
| { |
| "completion_length": 2971.3751220703125, |
| "cov_mean": 7.5797214776685e-05, |
| "cov_std": 0.296669140458107, |
| "entropy": 0.42236328125, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.3472649157047272, |
| "kl": 0.0027103424072265625, |
| "learning_rate": 1.4786531185446452e-07, |
| "loss": -0.0555, |
| "reward": 0.7812500223517418, |
| "reward_std": 0.35348715633153915, |
| "rewards/accuracy_reward": 0.3437500074505806, |
| "rewards/format_reward": 0.4375000074505806, |
| "step": 131, |
| "w_high_ratio": 0.03943436220288277, |
| "w_low_ratio": 0.02768976055085659, |
| "w_max": 1.649653136730194, |
| "w_mean": 1.195732295513153, |
| "w_min": 0.25, |
| "w_std": 0.17519061639904976 |
| }, |
| { |
| "completion_length": 3084.197998046875, |
| "cov_mean": 8.538130737179017e-05, |
| "cov_std": 0.4518180638551712, |
| "entropy": 0.4111328125, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.3087000846862793, |
| "kl": 0.0020132064819335938, |
| "learning_rate": 1.432748035231658e-07, |
| "loss": 0.0503, |
| "reward": 0.6458333507180214, |
| "reward_std": 0.5204262509942055, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/format_reward": 0.3958333432674408, |
| "step": 132, |
| "w_high_ratio": 0.16728225722908974, |
| "w_low_ratio": 0.057774459943175316, |
| "w_max": 2.160991072654724, |
| "w_mean": 1.3055765330791473, |
| "w_min": 0.0, |
| "w_std": 0.2905358038842678 |
| }, |
| { |
| "completion_length": 3655.4166870117188, |
| "cov_mean": 7.652423300896771e-05, |
| "cov_std": 0.2661324590444565, |
| "entropy": 0.49365234375, |
| "epoch": 0.152, |
| "grad_norm": 0.18705269694328308, |
| "kl": 0.001617431640625, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.075, |
| "reward": 0.3645833358168602, |
| "reward_std": 0.23336705565452576, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.1979166679084301, |
| "step": 133, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04366964288055897, |
| "w_max": 1.3597778677940369, |
| "w_mean": 1.095180168747902, |
| "w_min": 0.5, |
| "w_std": 0.1226506233215332 |
| }, |
| { |
| "completion_length": 2935.791748046875, |
| "cov_mean": 4.5339866119320504e-05, |
| "cov_std": 0.3691224604845047, |
| "entropy": 0.47900390625, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.3127249777317047, |
| "kl": 0.0026226043701171875, |
| "learning_rate": 1.3475690004005097e-07, |
| "loss": 0.0052, |
| "reward": 0.739583358168602, |
| "reward_std": 0.5291576012969017, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/format_reward": 0.4895833507180214, |
| "step": 134, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.049329387256875634, |
| "w_max": 1.5319222509860992, |
| "w_mean": 1.2102845907211304, |
| "w_min": 0.0, |
| "w_std": 0.20571278221905231 |
| }, |
| { |
| "completion_length": 2150.0625610351562, |
| "cov_mean": -6.792098974983674e-05, |
| "cov_std": 0.41625837981700897, |
| "entropy": 0.40966796875, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.5556284189224243, |
| "kl": 0.00433349609375, |
| "learning_rate": 1.308341174832359e-07, |
| "loss": -0.0542, |
| "reward": 1.2187500149011612, |
| "reward_std": 0.4045410081744194, |
| "rewards/accuracy_reward": 0.5312500074505806, |
| "rewards/format_reward": 0.6875000074505806, |
| "step": 135, |
| "w_high_ratio": 0.1767500415444374, |
| "w_low_ratio": 0.04722660221159458, |
| "w_max": 2.584423005580902, |
| "w_mean": 1.3822406232357025, |
| "w_min": 3.783363098268088e-33, |
| "w_std": 0.2943294197320938 |
| }, |
| { |
| "completion_length": 3219.2188110351562, |
| "cov_mean": 7.927787930839258e-05, |
| "cov_std": 0.36336907744407654, |
| "entropy": 0.36572265625, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.4164927303791046, |
| "kl": 0.0019321441650390625, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": -0.1128, |
| "reward": 0.9062500298023224, |
| "reward_std": 0.520418331027031, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/format_reward": 0.4687500149011612, |
| "step": 136, |
| "w_high_ratio": 0.09939645975828171, |
| "w_low_ratio": 0.0331453662365675, |
| "w_max": 1.8468493521213531, |
| "w_mean": 1.2761128842830658, |
| "w_min": 0.25, |
| "w_std": 0.22711537778377533 |
| }, |
| { |
| "completion_length": 3419.6458740234375, |
| "cov_mean": -3.98908814531751e-05, |
| "cov_std": 0.4879545792937279, |
| "entropy": 0.36328125, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.6263577938079834, |
| "kl": 0.0011224746704101562, |
| "learning_rate": 1.2367151086855187e-07, |
| "loss": 0.0354, |
| "reward": 0.4479166716337204, |
| "reward_std": 0.5504394620656967, |
| "rewards/accuracy_reward": 0.10416667070239782, |
| "rewards/format_reward": 0.3437500074505806, |
| "step": 137, |
| "w_high_ratio": 0.040824100375175476, |
| "w_low_ratio": 0.054094865918159485, |
| "w_max": 1.8979838490486145, |
| "w_mean": 1.1621877253055573, |
| "w_min": 0.0, |
| "w_std": 0.2735421061515808 |
| }, |
| { |
| "completion_length": 3128.0626220703125, |
| "cov_mean": -2.278413830936188e-05, |
| "cov_std": 0.27040576189756393, |
| "entropy": 0.38330078125, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.9205570816993713, |
| "kl": 0.049633026123046875, |
| "learning_rate": 1.2043556548852063e-07, |
| "loss": 0.036, |
| "reward": 0.5520833358168602, |
| "reward_std": 0.3116639107465744, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/format_reward": 0.385416679084301, |
| "step": 138, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.038355547934770584, |
| "w_max": 1.4525592625141144, |
| "w_mean": 1.1665604412555695, |
| "w_min": 0.25, |
| "w_std": 0.14510583132505417 |
| }, |
| { |
| "completion_length": 3528.5313110351562, |
| "cov_mean": 3.168399962305557e-05, |
| "cov_std": 0.5140600129961967, |
| "entropy": 0.498046875, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.4198448061943054, |
| "kl": 0.0019092559814453125, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": -0.0322, |
| "reward": 0.5104166865348816, |
| "reward_std": 0.562984399497509, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.3750000186264515, |
| "step": 139, |
| "w_high_ratio": 0.04249488562345505, |
| "w_low_ratio": 0.06252239271998405, |
| "w_max": 1.7476003468036652, |
| "w_mean": 1.1608193814754486, |
| "w_min": 0.0, |
| "w_std": 0.2752341143786907 |
| }, |
| { |
| "completion_length": 3422.666748046875, |
| "cov_mean": -5.857029464095831e-05, |
| "cov_std": 0.4820387288928032, |
| "entropy": 0.51123046875, |
| "epoch": 0.16, |
| "grad_norm": 0.747706413269043, |
| "kl": 0.00482940673828125, |
| "learning_rate": 1.1466315124171128e-07, |
| "loss": -0.0162, |
| "reward": 0.4791666865348816, |
| "reward_std": 0.521266907453537, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/format_reward": 0.2916666753590107, |
| "step": 140, |
| "w_high_ratio": 0.09309478104114532, |
| "w_low_ratio": 0.05631279572844505, |
| "w_max": 2.1824983656406403, |
| "w_mean": 1.2857783138751984, |
| "w_min": 4.5542200090556555e-45, |
| "w_std": 0.3347093164920807 |
| }, |
| { |
| "completion_length": 3233.166748046875, |
| "cov_mean": 2.940024387498852e-06, |
| "cov_std": 0.45166684687137604, |
| "entropy": 0.40185546875, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.3896685838699341, |
| "kl": 0.0029087066650390625, |
| "learning_rate": 1.1212980823907929e-07, |
| "loss": -0.0202, |
| "reward": 0.6875000149011612, |
| "reward_std": 0.5667420700192451, |
| "rewards/accuracy_reward": 0.21875000651925802, |
| "rewards/format_reward": 0.4687500149011612, |
| "step": 141, |
| "w_high_ratio": 0.04078603908419609, |
| "w_low_ratio": 0.0461601298302412, |
| "w_max": 1.661450743675232, |
| "w_mean": 1.181685209274292, |
| "w_min": 0.0, |
| "w_std": 0.24826455861330032 |
| }, |
| { |
| "completion_length": 3014.7084045410156, |
| "cov_mean": 1.010456662697834e-05, |
| "cov_std": 0.35788750648498535, |
| "entropy": 0.43212890625, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.37326905131340027, |
| "kl": 0.0032243728637695312, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0113, |
| "reward": 0.8020833730697632, |
| "reward_std": 0.4403490200638771, |
| "rewards/accuracy_reward": 0.23958334233611822, |
| "rewards/format_reward": 0.5625000074505806, |
| "step": 142, |
| "w_high_ratio": 0.07575653120875359, |
| "w_low_ratio": 0.04324484569951892, |
| "w_max": 1.8279287815093994, |
| "w_mean": 1.3297627568244934, |
| "w_min": 0.0, |
| "w_std": 0.21703040227293968 |
| }, |
| { |
| "completion_length": 3143.479248046875, |
| "cov_mean": 4.2679124817368574e-05, |
| "cov_std": 0.5287895128130913, |
| "entropy": 0.4609375, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.8503015637397766, |
| "kl": 0.004160881042480469, |
| "learning_rate": 1.0777570898211405e-07, |
| "loss": -0.0551, |
| "reward": 0.5208333507180214, |
| "reward_std": 0.4103339910507202, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/format_reward": 0.416666679084301, |
| "step": 143, |
| "w_high_ratio": 0.06985687837004662, |
| "w_low_ratio": 0.06702116876840591, |
| "w_max": 2.2525435388088226, |
| "w_mean": 1.2808727622032166, |
| "w_min": 0.0, |
| "w_std": 0.33132829889655113 |
| }, |
| { |
| "completion_length": 3187.3750610351562, |
| "cov_mean": -3.3114460165961646e-05, |
| "cov_std": 0.4341953620314598, |
| "entropy": 0.40185546875, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.36935943365097046, |
| "kl": 0.0021028518676757812, |
| "learning_rate": 1.0595731054933934e-07, |
| "loss": 0.0152, |
| "reward": 0.6250000223517418, |
| "reward_std": 0.54064517095685, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/format_reward": 0.3541666716337204, |
| "step": 144, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.04692948702722788, |
| "w_max": 1.7171835899353027, |
| "w_mean": 1.1371115744113922, |
| "w_min": 0.0, |
| "w_std": 0.24135740101337433 |
| }, |
| { |
| "completion_length": 2363.416748046875, |
| "cov_mean": 7.833678682800382e-05, |
| "cov_std": 0.23034526035189629, |
| "entropy": 0.3486328125, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.32663026452064514, |
| "kl": 0.0025043487548828125, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0397, |
| "reward": 0.916666692122817, |
| "reward_std": 0.28357625752687454, |
| "rewards/accuracy_reward": 0.3125000111758709, |
| "rewards/format_reward": 0.6041666772216558, |
| "step": 145, |
| "w_high_ratio": 0.0, |
| "w_low_ratio": 0.027998102828860283, |
| "w_max": 1.3861334919929504, |
| "w_mean": 1.1757619678974152, |
| "w_min": 0.2742290794849396, |
| "w_std": 0.13859782367944717 |
| }, |
| { |
| "completion_length": 2963.791748046875, |
| "cov_mean": 3.222269015168422e-05, |
| "cov_std": 0.2941744774580002, |
| "entropy": 0.43359375, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.466317743062973, |
| "kl": 0.0017948150634765625, |
| "learning_rate": 1.0304273901612565e-07, |
| "loss": -0.0398, |
| "reward": 0.5312500074505806, |
| "reward_std": 0.31367800384759903, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/format_reward": 0.4687500074505806, |
| "step": 146, |
| "w_high_ratio": 0.08747749403119087, |
| "w_low_ratio": 0.03374567721039057, |
| "w_max": 2.10350301861763, |
| "w_mean": 1.2755134403705597, |
| "w_min": 0.25, |
| "w_std": 0.2049087956547737 |
| }, |
| { |
| "completion_length": 3316.1875610351562, |
| "cov_mean": -2.5228303002222674e-07, |
| "cov_std": 0.2783627863973379, |
| "entropy": 0.42431640625, |
| "epoch": 0.168, |
| "grad_norm": 0.2964071035385132, |
| "kl": 0.00269317626953125, |
| "learning_rate": 1.0194814420758804e-07, |
| "loss": -0.0063, |
| "reward": 0.4687500102445483, |
| "reward_std": 0.41155891865491867, |
| "rewards/accuracy_reward": 0.1562500037252903, |
| "rewards/format_reward": 0.3125000102445483, |
| "step": 147, |
| "w_high_ratio": 0.08768598735332489, |
| "w_low_ratio": 0.027966859750449657, |
| "w_max": 1.8361600935459137, |
| "w_mean": 1.2343448996543884, |
| "w_min": 0.0, |
| "w_std": 0.19136979151517153 |
| }, |
| { |
| "completion_length": 2667.0625, |
| "cov_mean": 6.741791366948746e-05, |
| "cov_std": 0.2421601451933384, |
| "entropy": 0.3486328125, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.5170913338661194, |
| "kl": 0.0018463134765625, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": -0.0546, |
| "reward": 0.708333358168602, |
| "reward_std": 0.22040386497974396, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/format_reward": 0.5416666865348816, |
| "step": 148, |
| "w_high_ratio": 0.10121732205152512, |
| "w_low_ratio": 0.022710513323545456, |
| "w_max": 2.0310742557048798, |
| "w_mean": 1.4218811392784119, |
| "w_min": 0.25, |
| "w_std": 0.20712272450327873 |
| }, |
| { |
| "completion_length": 3187.947998046875, |
| "cov_mean": -3.065193595830351e-05, |
| "cov_std": 0.278855100274086, |
| "entropy": 0.34326171875, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.3262748122215271, |
| "kl": 0.011348247528076172, |
| "learning_rate": 1.0048729989766394e-07, |
| "loss": -0.0407, |
| "reward": 0.656250013038516, |
| "reward_std": 0.2951196879148483, |
| "rewards/accuracy_reward": 0.1979166716337204, |
| "rewards/format_reward": 0.45833334140479565, |
| "step": 149, |
| "w_high_ratio": 0.006521851755678654, |
| "w_low_ratio": 0.021616162732243538, |
| "w_max": 1.6156696677207947, |
| "w_mean": 1.1841600239276886, |
| "w_min": 0.0, |
| "w_std": 0.13553307205438614 |
| }, |
| { |
| "completion_length": 3107.3125, |
| "cov_mean": 1.709405751171289e-05, |
| "cov_std": 0.34910060465335846, |
| "entropy": 0.3984375, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.33226433396339417, |
| "kl": 0.0028133392333984375, |
| "learning_rate": 1.0012184146924223e-07, |
| "loss": 0.0548, |
| "reward": 0.520833358168602, |
| "reward_std": 0.40651097148656845, |
| "rewards/accuracy_reward": 0.13541666977107525, |
| "rewards/format_reward": 0.3854166679084301, |
| "step": 150, |
| "w_high_ratio": 0.05817456915974617, |
| "w_low_ratio": 0.04275808576494455, |
| "w_max": 1.5990401804447174, |
| "w_mean": 1.167884886264801, |
| "w_min": 0.25, |
| "w_std": 0.18246712163090706 |
| }, |
| { |
| "epoch": 0.17142857142857143, |
| "step": 150, |
| "total_flos": 0.0, |
| "train_loss": -0.007013439348277946, |
| "train_runtime": 12590.1794, |
| "train_samples_per_second": 1.144, |
| "train_steps_per_second": 0.012 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 150, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|