diff --git "a/checkpoint-550/trainer_state.json" "b/checkpoint-550/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-550/trainer_state.json" @@ -0,0 +1,3004 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.023706896551724137, + "eval_steps": 500, + "global_step": 550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 68.5, + "completions/mean_terminated_length": 68.5, + "completions/min_length": 26.8, + "completions/min_terminated_length": 26.8, + "entropy": 0.25826080311089755, + "epoch": 0.00021551724137931034, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.28125, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.10238287448883057, + "num_tokens": 12064.0, + "reward": 0.3290319949388504, + "reward_std": 0.40028320252895355, + "rewards/reward_fn/mean": 0.3290319949388504, + "rewards/reward_fn/std": 0.400283208489418, + "step": 5, + "step_time": 22.194597821800016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.6, + "completions/max_terminated_length": 87.6, + "completions/mean_length": 48.525, + "completions/mean_terminated_length": 48.525, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.2673827801831067, + "epoch": 0.0004310344827586207, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.484375, + "learning_rate": 2.25e-06, + "loss": 0.05849265456199646, + "num_tokens": 23089.0, + "reward": 0.45969198942184447, + "reward_std": 0.2795014828443527, + "rewards/reward_fn/mean": 0.45969198942184447, + "rewards/reward_fn/std": 0.27950150668621065, + "step": 10, + "step_time": 16.331819552399928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.6, + "completions/max_terminated_length": 85.6, + "completions/mean_length": 47.35, + "completions/mean_terminated_length": 47.35, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.20174795808270574, + "epoch": 0.000646551724137931, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.66015625, + "learning_rate": 3.5e-06, + "loss": -0.03555725216865539, + "num_tokens": 34239.0, + "reward": 0.5545999944210053, + "reward_std": 0.32832055240869523, + "rewards/reward_fn/mean": 0.5545999944210053, + "rewards/reward_fn/std": 0.3283205583691597, + "step": 15, + "step_time": 16.44816417159991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.2, + "completions/max_terminated_length": 98.2, + "completions/mean_length": 53.6, + "completions/mean_terminated_length": 53.6, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.2620095924474299, + "epoch": 0.0008620689655172414, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.5390625, + "learning_rate": 4.75e-06, + "loss": 0.017401468753814698, + "num_tokens": 45399.0, + "reward": 0.4466240078210831, + "reward_std": 0.27573536019772293, + "rewards/reward_fn/mean": 0.4466240078210831, + "rewards/reward_fn/std": 0.27573537137359383, + "step": 20, + "step_time": 17.988385831799953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.8, + "completions/max_terminated_length": 123.8, + "completions/mean_length": 59.15, + "completions/mean_terminated_length": 59.15, + "completions/min_length": 21.8, + "completions/min_terminated_length": 21.8, + "entropy": 0.33701689867302775, + "epoch": 0.0010775862068965517, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.74609375, + "learning_rate": 4.965517241379311e-06, + "loss": -0.02674364447593689, + "num_tokens": 56993.0, + "reward": 0.5006439983844757, + "reward_std": 0.2351181447505951, + "rewards/reward_fn/mean": 0.5006439983844757, + "rewards/reward_fn/std": 0.23511814773082734, + "step": 25, + "step_time": 21.11461545459997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.2, + "completions/max_terminated_length": 110.2, + "completions/mean_length": 47.95, + "completions/mean_terminated_length": 47.95, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.24504410615190864, + "epoch": 0.001293103448275862, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.92578125, + "learning_rate": 4.922413793103449e-06, + "loss": -0.05023183822631836, + "num_tokens": 67659.0, + "reward": 0.6457239985466003, + "reward_std": 0.11179900387069211, + "rewards/reward_fn/mean": 0.6457239985466003, + "rewards/reward_fn/std": 0.11179901438299567, + "step": 30, + "step_time": 19.134268762199827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 150.8, + "completions/max_terminated_length": 120.2, + "completions/mean_length": 73.375, + "completions/mean_terminated_length": 54.36666717529297, + "completions/min_length": 15.2, + "completions/min_terminated_length": 15.2, + "entropy": 0.23285924410447478, + "epoch": 0.0015086206896551724, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.58203125, + "learning_rate": 4.879310344827586e-06, + "loss": -0.05149807929992676, + "num_tokens": 79734.0, + "reward": 0.46414998471736907, + "reward_std": 0.4071369742392562, + "rewards/reward_fn/mean": 0.46414998471736907, + "rewards/reward_fn/std": 0.40713699695188554, + "step": 35, + "step_time": 24.679699858600042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.4, + "completions/max_terminated_length": 125.4, + "completions/mean_length": 68.925, + "completions/mean_terminated_length": 68.925, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.3110098702833056, + "epoch": 0.0017241379310344827, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.419921875, + "learning_rate": 4.836206896551724e-06, + "loss": 0.030162644386291505, + "num_tokens": 91695.0, + "reward": 0.5143539935350419, + "reward_std": 0.2790891878306866, + "rewards/reward_fn/mean": 0.5143539935350419, + "rewards/reward_fn/std": 0.279089218378067, + "step": 40, + "step_time": 21.565513409599998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "entropy": 0.23047098610550165, + "epoch": 0.001939655172413793, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.15625, + "learning_rate": 4.793103448275862e-06, + "loss": -0.04350074529647827, + "num_tokens": 102994.0, + "reward": 0.6398920059204102, + "reward_std": 0.2279826147481799, + "rewards/reward_fn/mean": 0.6398920059204102, + "rewards/reward_fn/std": 0.22798261437565087, + "step": 45, + "step_time": 18.23975218899968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.4, + "completions/max_terminated_length": 133.4, + "completions/mean_length": 75.975, + "completions/mean_terminated_length": 75.975, + "completions/min_length": 23.8, + "completions/min_terminated_length": 23.8, + "entropy": 0.34613882582634686, + "epoch": 0.0021551724137931034, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.1318359375, + "learning_rate": 4.75e-06, + "loss": -0.03668657541275024, + "num_tokens": 115289.0, + "reward": 0.33042599707841874, + "reward_std": 0.5918701648712158, + "rewards/reward_fn/mean": 0.33042599707841874, + "rewards/reward_fn/std": 0.5918702006340026, + "step": 50, + "step_time": 22.43372436519985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.2, + "completions/max_terminated_length": 92.2, + "completions/mean_length": 50.65, + "completions/mean_terminated_length": 50.65, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 0.3391244841972366, + "epoch": 0.0023706896551724138, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.39453125, + "learning_rate": 4.706896551724138e-06, + "loss": -0.0521969735622406, + "num_tokens": 126111.0, + "reward": 0.6139639973640442, + "reward_std": 0.18208687230944634, + "rewards/reward_fn/mean": 0.6139639973640442, + "rewards/reward_fn/std": 0.18208688013255597, + "step": 55, + "step_time": 16.889007100199933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 164.0, + "completions/max_terminated_length": 162.4, + "completions/mean_length": 82.1, + "completions/mean_terminated_length": 78.87857208251953, + "completions/min_length": 29.4, + "completions/min_terminated_length": 29.4, + "entropy": 0.28240502553526314, + "epoch": 0.002586206896551724, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2373046875, + "learning_rate": 4.663793103448276e-06, + "loss": -0.0108717679977417, + "num_tokens": 138667.0, + "reward": 0.6616780042648316, + "reward_std": 0.18088504523038865, + "rewards/reward_fn/mean": 0.6616780042648316, + "rewards/reward_fn/std": 0.18088504672050476, + "step": 60, + "step_time": 26.46498533039994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 134.8, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 69.55, + "completions/mean_terminated_length": 63.233334350585935, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.3688268234953284, + "epoch": 0.0028017241379310344, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.2216796875, + "learning_rate": 4.620689655172414e-06, + "loss": 0.09875075221061706, + "num_tokens": 150585.0, + "reward": 0.5008320093154908, + "reward_std": 0.2221744753420353, + "rewards/reward_fn/mean": 0.5008320093154908, + "rewards/reward_fn/std": 0.22217447832226753, + "step": 65, + "step_time": 22.69940989719953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 149.4, + "completions/max_terminated_length": 141.4, + "completions/mean_length": 82.125, + "completions/mean_terminated_length": 76.2107162475586, + "completions/min_length": 34.8, + "completions/min_terminated_length": 34.8, + "entropy": 0.2419425747357309, + "epoch": 0.003017241379310345, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.5859375, + "learning_rate": 4.577586206896552e-06, + "loss": -0.06608393192291259, + "num_tokens": 162726.0, + "reward": 0.35277000069618225, + "reward_std": 0.37774557205848397, + "rewards/reward_fn/mean": 0.35277000069618225, + "rewards/reward_fn/std": 0.37774557354860006, + "step": 70, + "step_time": 24.30778650340026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 137.6, + "completions/max_terminated_length": 135.6, + "completions/mean_length": 77.55, + "completions/mean_terminated_length": 74.46785888671874, + "completions/min_length": 41.8, + "completions/min_terminated_length": 41.8, + "entropy": 0.35793030727654696, + "epoch": 0.003232758620689655, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.63671875, + "learning_rate": 4.53448275862069e-06, + "loss": 0.10921311378479004, + "num_tokens": 175364.0, + "reward": 0.15934600830078124, + "reward_std": 0.5374872148036957, + "rewards/reward_fn/mean": 0.15934600830078124, + "rewards/reward_fn/std": 0.5374872386455536, + "step": 75, + "step_time": 23.099894465200123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.6, + "completions/max_terminated_length": 95.6, + "completions/mean_length": 53.175, + "completions/mean_terminated_length": 53.175, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.2758270605234429, + "epoch": 0.0034482758620689655, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.443359375, + "learning_rate": 4.4913793103448275e-06, + "loss": 0.11924625635147094, + "num_tokens": 186407.0, + "reward": 0.6774819850921631, + "reward_std": 0.11054287778679281, + "rewards/reward_fn/mean": 0.6774819850921631, + "rewards/reward_fn/std": 0.11054288879968226, + "step": 80, + "step_time": 17.348075130399685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.6, + "completions/max_terminated_length": 137.6, + "completions/mean_length": 68.25, + "completions/mean_terminated_length": 68.25, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.2696234828326851, + "epoch": 0.003663793103448276, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.287109375, + "learning_rate": 4.4482758620689656e-06, + "loss": -0.03822631537914276, + "num_tokens": 198241.0, + "reward": 0.5305620029568672, + "reward_std": 0.4029154841089621, + "rewards/reward_fn/mean": 0.5305620029568672, + "rewards/reward_fn/std": 0.4029154877178371, + "step": 85, + "step_time": 22.834117503599828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.2, + "completions/max_terminated_length": 131.2, + "completions/mean_length": 77.45, + "completions/mean_terminated_length": 77.45, + "completions/min_length": 19.8, + "completions/min_terminated_length": 19.8, + "entropy": 0.3707127865403891, + "epoch": 0.003879310344827586, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.50390625, + "learning_rate": 4.405172413793104e-06, + "loss": 0.035861659049987796, + "num_tokens": 210935.0, + "reward": 0.3616240084171295, + "reward_std": 0.48762375079095366, + "rewards/reward_fn/mean": 0.3616240084171295, + "rewards/reward_fn/std": 0.48762374445796014, + "step": 90, + "step_time": 22.48314213299982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 118.2, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 66.9, + "completions/mean_terminated_length": 56.7, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.342819757014513, + "epoch": 0.004094827586206897, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.78125, + "learning_rate": 4.362068965517242e-06, + "loss": -0.051665693521499634, + "num_tokens": 222647.0, + "reward": 0.46663198471069334, + "reward_std": 0.3400567059754394, + "rewards/reward_fn/mean": 0.46663198471069334, + "rewards/reward_fn/std": 0.34005671155173334, + "step": 95, + "step_time": 20.31996373819993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.2, + "completions/max_terminated_length": 82.2, + "completions/mean_length": 39.975, + "completions/mean_terminated_length": 39.975, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "entropy": 0.25842549465596676, + "epoch": 0.004310344827586207, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.64453125, + "learning_rate": 4.31896551724138e-06, + "loss": 0.029389530420303345, + "num_tokens": 233466.0, + "reward": 0.5307899951934815, + "reward_std": 0.28180397795513273, + "rewards/reward_fn/mean": 0.5307899951934815, + "rewards/reward_fn/std": 0.28180398060940204, + "step": 100, + "step_time": 15.753981888000453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.8, + "completions/max_terminated_length": 77.8, + "completions/mean_length": 49.6, + "completions/mean_terminated_length": 49.6, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "entropy": 0.27716706816572695, + "epoch": 0.004525862068965518, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.51953125, + "learning_rate": 4.275862068965518e-06, + "loss": 0.011613726615905762, + "num_tokens": 244550.0, + "reward": 0.6918019950389862, + "reward_std": 0.10386521331965923, + "rewards/reward_fn/mean": 0.6918019950389862, + "rewards/reward_fn/std": 0.10386521825566888, + "step": 105, + "step_time": 15.171548829199946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 47.1, + "completions/mean_terminated_length": 47.1, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.1942274721484864, + "epoch": 0.0047413793103448275, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.515625, + "learning_rate": 4.232758620689656e-06, + "loss": -0.000987192615866661, + "num_tokens": 255962.0, + "reward": 0.1787539839744568, + "reward_std": 0.5241983281448483, + "rewards/reward_fn/mean": 0.1787539839744568, + "rewards/reward_fn/std": 0.5241983408108354, + "step": 110, + "step_time": 17.40271624299976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.4, + "completions/max_terminated_length": 108.4, + "completions/mean_length": 54.025, + "completions/mean_terminated_length": 54.025, + "completions/min_length": 26.8, + "completions/min_terminated_length": 26.8, + "entropy": 0.21391218816424953, + "epoch": 0.004956896551724138, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.546875, + "learning_rate": 4.189655172413794e-06, + "loss": -0.07643186450004577, + "num_tokens": 267563.0, + "reward": 0.5654179990291596, + "reward_std": 0.2869014423340559, + "rewards/reward_fn/mean": 0.5654179990291596, + "rewards/reward_fn/std": 0.2869014598429203, + "step": 115, + "step_time": 19.255251389799923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 136.0, + "completions/max_terminated_length": 124.6, + "completions/mean_length": 80.575, + "completions/mean_terminated_length": 77.74285888671875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.31822405869606885, + "epoch": 0.005172413793103448, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.2421875, + "learning_rate": 4.146551724137932e-06, + "loss": 0.01645863652229309, + "num_tokens": 280074.0, + "reward": 0.06659798473119735, + "reward_std": 0.5049000799655914, + "rewards/reward_fn/mean": 0.06659798473119735, + "rewards/reward_fn/std": 0.5049000844359398, + "step": 120, + "step_time": 22.649546912000005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 138.4, + "completions/max_terminated_length": 136.2, + "completions/mean_length": 68.85, + "completions/mean_terminated_length": 61.975, + "completions/min_length": 25.4, + "completions/min_terminated_length": 25.4, + "entropy": 0.33708888398250564, + "epoch": 0.005387931034482759, + "frac_reward_zero_std": 0.1, + "grad_norm": 1.25, + "learning_rate": 4.103448275862069e-06, + "loss": 0.16269092559814452, + "num_tokens": 291760.0, + "reward": 0.09610399603843689, + "reward_std": 0.6181229472160339, + "rewards/reward_fn/mean": 0.09610399603843689, + "rewards/reward_fn/std": 0.6181229710578918, + "step": 125, + "step_time": 22.892032967400247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 121.6, + "completions/max_terminated_length": 119.6, + "completions/mean_length": 54.625, + "completions/mean_terminated_length": 45.516667175292966, + "completions/min_length": 15.4, + "completions/min_terminated_length": 15.4, + "entropy": 0.2316889252353576, + "epoch": 0.005603448275862069, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.78515625, + "learning_rate": 4.060344827586207e-06, + "loss": 0.05887578129768371, + "num_tokens": 302777.0, + "reward": 0.47827199250459673, + "reward_std": 0.31257240688428284, + "rewards/reward_fn/mean": 0.47827199250459673, + "rewards/reward_fn/std": 0.31257240623235705, + "step": 130, + "step_time": 20.756939186200178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.4, + "completions/max_terminated_length": 83.4, + "completions/mean_length": 43.375, + "completions/mean_terminated_length": 43.375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.14220066292909905, + "epoch": 0.00581896551724138, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.361328125, + "learning_rate": 4.017241379310346e-06, + "loss": 0.0067549549043178555, + "num_tokens": 313896.0, + "reward": 0.5540139883756637, + "reward_std": 0.3137821853160858, + "rewards/reward_fn/mean": 0.5540139883756637, + "rewards/reward_fn/std": 0.31378220971673726, + "step": 135, + "step_time": 16.04840490619972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.2, + "completions/max_terminated_length": 65.2, + "completions/mean_length": 33.825, + "completions/mean_terminated_length": 33.825, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.16708708838559688, + "epoch": 0.00603448275862069, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.53125, + "learning_rate": 3.974137931034483e-06, + "loss": 0.000922510027885437, + "num_tokens": 324093.0, + "reward": 0.5665820002555847, + "reward_std": 0.3455619063111953, + "rewards/reward_fn/mean": 0.5665820002555847, + "rewards/reward_fn/std": 0.34556191868614405, + "step": 140, + "step_time": 13.251863584200146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 175.8, + "completions/max_terminated_length": 149.4, + "completions/mean_length": 76.9, + "completions/mean_terminated_length": 58.73142929077149, + "completions/min_length": 25.6, + "completions/min_terminated_length": 25.6, + "entropy": 0.2752605409361422, + "epoch": 0.00625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.62109375, + "learning_rate": 3.931034482758621e-06, + "loss": -0.012939244508743286, + "num_tokens": 336493.0, + "reward": 0.03320800065994263, + "reward_std": 0.6782135218381882, + "rewards/reward_fn/mean": 0.03320800065994263, + "rewards/reward_fn/std": 0.6782135486602783, + "step": 145, + "step_time": 27.717435833599847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 159.6, + "completions/max_terminated_length": 159.4, + "completions/mean_length": 86.4, + "completions/mean_terminated_length": 78.2, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.3023837681859732, + "epoch": 0.00646551724137931, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.25, + "learning_rate": 3.887931034482759e-06, + "loss": -0.004035864770412445, + "num_tokens": 349213.0, + "reward": 0.2794659972190857, + "reward_std": 0.33909483794122935, + "rewards/reward_fn/mean": 0.2794659972190857, + "rewards/reward_fn/std": 0.3390948371961713, + "step": 150, + "step_time": 25.6254667358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 120.8, + "completions/max_terminated_length": 117.2, + "completions/mean_length": 76.35, + "completions/mean_terminated_length": 74.68571472167969, + "completions/min_length": 42.2, + "completions/min_terminated_length": 42.2, + "entropy": 0.2444805505729164, + "epoch": 0.006681034482758621, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.59375, + "learning_rate": 3.844827586206897e-06, + "loss": 0.06990929841995239, + "num_tokens": 361243.0, + "reward": 0.31372600197792055, + "reward_std": 0.16792407557368277, + "rewards/reward_fn/mean": 0.31372600197792055, + "rewards/reward_fn/std": 0.16792407706379892, + "step": 155, + "step_time": 20.657237647200326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 86.2, + "completions/max_terminated_length": 78.2, + "completions/mean_length": 57.375, + "completions/mean_terminated_length": 54.985714721679685, + "completions/min_length": 36.2, + "completions/min_terminated_length": 36.2, + "entropy": 0.2041449310956523, + "epoch": 0.006896551724137931, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.146484375, + "learning_rate": 3.801724137931035e-06, + "loss": -0.003799402713775635, + "num_tokens": 372606.0, + "reward": 0.6000540047883988, + "reward_std": 0.14438311448320745, + "rewards/reward_fn/mean": 0.6000540047883988, + "rewards/reward_fn/std": 0.1443831158336252, + "step": 160, + "step_time": 15.94448771479947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 196.6, + "completions/max_terminated_length": 173.8, + "completions/mean_length": 114.125, + "completions/mean_terminated_length": 102.28333435058593, + "completions/min_length": 40.8, + "completions/min_terminated_length": 40.8, + "entropy": 0.3578500231727958, + "epoch": 0.007112068965517242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.515625, + "learning_rate": 3.7586206896551727e-06, + "loss": 0.05463705062866211, + "num_tokens": 386899.0, + "reward": 0.29335000514984133, + "reward_std": 0.6011195421218872, + "rewards/reward_fn/mean": 0.29335000514984133, + "rewards/reward_fn/std": 0.6011195570230484, + "step": 165, + "step_time": 30.583657701599986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 120.6, + "completions/max_terminated_length": 105.4, + "completions/mean_length": 64.775, + "completions/mean_terminated_length": 60.79285736083985, + "completions/min_length": 27.2, + "completions/min_terminated_length": 27.2, + "entropy": 0.3172524501627777, + "epoch": 0.007327586206896552, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.380859375, + "learning_rate": 3.7155172413793107e-06, + "loss": -0.04064536094665527, + "num_tokens": 398878.0, + "reward": 0.2578879952430725, + "reward_std": 0.4871767422184348, + "rewards/reward_fn/mean": 0.2578879952430725, + "rewards/reward_fn/std": 0.4871767588891089, + "step": 170, + "step_time": 20.961449378399994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 49.1, + "completions/mean_terminated_length": 49.1, + "completions/min_length": 25.4, + "completions/min_terminated_length": 25.4, + "entropy": 0.28907319878344423, + "epoch": 0.007543103448275862, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.63671875, + "learning_rate": 3.6724137931034487e-06, + "loss": 0.07596501111984252, + "num_tokens": 409894.0, + "reward": 0.2182680130004883, + "reward_std": 0.2655696720117703, + "rewards/reward_fn/mean": 0.2182680130004883, + "rewards/reward_fn/std": 0.26556967524811625, + "step": 175, + "step_time": 16.00376275539984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 119.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 75.025, + "completions/mean_terminated_length": 60.685000610351565, + "completions/min_length": 30.4, + "completions/min_terminated_length": 30.4, + "entropy": 0.2300539159856271, + "epoch": 0.007758620689655172, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.33203125, + "learning_rate": 3.6293103448275863e-06, + "loss": 0.0008989959955215454, + "num_tokens": 422115.0, + "reward": 0.2748300015926361, + "reward_std": 0.4872116835438646, + "rewards/reward_fn/mean": 0.2748300015926361, + "rewards/reward_fn/std": 0.487211688212119, + "step": 180, + "step_time": 20.472563396999977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 140.8, + "completions/max_terminated_length": 130.4, + "completions/mean_length": 77.75, + "completions/mean_terminated_length": 73.95714416503907, + "completions/min_length": 28.4, + "completions/min_terminated_length": 28.4, + "entropy": 0.2450535054318607, + "epoch": 0.007974137931034483, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "learning_rate": 3.5862068965517243e-06, + "loss": 0.041662493348121644, + "num_tokens": 434661.0, + "reward": 0.052784007787704465, + "reward_std": 0.2650376170873642, + "rewards/reward_fn/mean": 0.052784007787704465, + "rewards/reward_fn/std": 0.26503762900829314, + "step": 185, + "step_time": 23.318358728199563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 195.6, + "completions/max_terminated_length": 192.8, + "completions/mean_length": 89.9, + "completions/mean_terminated_length": 84.275, + "completions/min_length": 26.2, + "completions/min_terminated_length": 26.2, + "entropy": 0.3329991169273853, + "epoch": 0.008189655172413794, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.3515625, + "learning_rate": 3.5431034482758623e-06, + "loss": 0.014610832929611206, + "num_tokens": 447801.0, + "reward": 0.449290007352829, + "reward_std": 0.2617717768996954, + "rewards/reward_fn/mean": 0.449290007352829, + "rewards/reward_fn/std": 0.26177178248763083, + "step": 190, + "step_time": 30.514870558599977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 174.6, + "completions/max_terminated_length": 167.2, + "completions/mean_length": 102.6, + "completions/mean_terminated_length": 93.59166717529297, + "completions/min_length": 33.8, + "completions/min_terminated_length": 33.8, + "entropy": 0.27610186783713286, + "epoch": 0.008405172413793103, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.33203125, + "learning_rate": 3.5e-06, + "loss": 0.014060372114181518, + "num_tokens": 461297.0, + "reward": 0.18859200328588485, + "reward_std": 0.20913713360205294, + "rewards/reward_fn/mean": 0.18859200328588485, + "rewards/reward_fn/std": 0.20913713917834684, + "step": 195, + "step_time": 27.840963185400142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 145.6, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 63.075, + "completions/mean_terminated_length": 43.47500076293945, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "entropy": 0.20764350929530337, + "epoch": 0.008620689655172414, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.2177734375, + "learning_rate": 3.4568965517241384e-06, + "loss": 0.01216265857219696, + "num_tokens": 473228.0, + "reward": 0.4162260115146637, + "reward_std": 0.5489736057817935, + "rewards/reward_fn/mean": 0.4162260115146637, + "rewards/reward_fn/std": 0.5489736313116736, + "step": 200, + "step_time": 24.10546086580016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.6, + "completions/max_terminated_length": 55.6, + "completions/mean_length": 23.825, + "completions/mean_terminated_length": 23.825, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.13016226456093136, + "epoch": 0.008836206896551724, + "frac_reward_zero_std": 0.3, + "grad_norm": 2.59375, + "learning_rate": 3.4137931034482764e-06, + "loss": 0.10798157453536987, + "num_tokens": 483053.0, + "reward": 0.7742319941520691, + "reward_std": 0.06249405808048323, + "rewards/reward_fn/mean": 0.7742319941520691, + "rewards/reward_fn/std": 0.06249405494891107, + "step": 205, + "step_time": 12.142383367000003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.8, + "completions/max_terminated_length": 104.8, + "completions/mean_length": 50.125, + "completions/mean_terminated_length": 50.125, + "completions/min_length": 22.4, + "completions/min_terminated_length": 22.4, + "entropy": 0.3128367411205545, + "epoch": 0.009051724137931035, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.57421875, + "learning_rate": 3.370689655172414e-06, + "loss": -0.012002941966056824, + "num_tokens": 494054.0, + "reward": 0.31811599135398866, + "reward_std": 0.6536980773322284, + "rewards/reward_fn/mean": 0.31811599135398866, + "rewards/reward_fn/std": 0.6536980641074479, + "step": 210, + "step_time": 18.774766382600138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.8, + "completions/max_terminated_length": 79.8, + "completions/mean_length": 54.7, + "completions/mean_terminated_length": 54.7, + "completions/min_length": 38.4, + "completions/min_terminated_length": 38.4, + "entropy": 0.18855432691198076, + "epoch": 0.009267241379310344, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.376953125, + "learning_rate": 3.327586206896552e-06, + "loss": -0.01920826584100723, + "num_tokens": 505226.0, + "reward": 0.5115379929542542, + "reward_std": 0.15735746768768877, + "rewards/reward_fn/mean": 0.5115379929542542, + "rewards/reward_fn/std": 0.1573574845213443, + "step": 215, + "step_time": 15.221036481800548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.2, + "completions/max_terminated_length": 126.2, + "completions/mean_length": 59.2, + "completions/mean_terminated_length": 59.2, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.31272616144269705, + "epoch": 0.009482758620689655, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.7578125, + "learning_rate": 3.28448275862069e-06, + "loss": 0.13071467876434326, + "num_tokens": 516730.0, + "reward": 0.6471440196037292, + "reward_std": 0.15736471198033541, + "rewards/reward_fn/mean": 0.6471440196037292, + "rewards/reward_fn/std": 0.15736472072312607, + "step": 220, + "step_time": 21.683129164199453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.6, + "completions/max_terminated_length": 75.6, + "completions/mean_length": 41.075, + "completions/mean_terminated_length": 41.075, + "completions/min_length": 21.2, + "completions/min_terminated_length": 21.2, + "entropy": 0.17901114720734768, + "epoch": 0.009698275862068966, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.390625, + "learning_rate": 3.2413793103448277e-06, + "loss": 0.0042393475770950316, + "num_tokens": 527637.0, + "reward": 0.5594579905271531, + "reward_std": 0.23395470455288886, + "rewards/reward_fn/mean": 0.5594579905271531, + "rewards/reward_fn/std": 0.23395471236435697, + "step": 225, + "step_time": 14.985401752000143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 145.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 77.85, + "completions/mean_terminated_length": 73.91785736083985, + "completions/min_length": 29.8, + "completions/min_terminated_length": 29.8, + "entropy": 0.28966000496875494, + "epoch": 0.009913793103448277, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.294921875, + "learning_rate": 3.1982758620689657e-06, + "loss": -0.03884480595588684, + "num_tokens": 539999.0, + "reward": 0.2210400015115738, + "reward_std": 0.45898649455048146, + "rewards/reward_fn/mean": 0.2210400015115738, + "rewards/reward_fn/std": 0.458986507600639, + "step": 230, + "step_time": 23.7599374403997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.2, + "completions/max_terminated_length": 117.2, + "completions/mean_length": 50.25, + "completions/mean_terminated_length": 50.25, + "completions/min_length": 15.4, + "completions/min_terminated_length": 15.4, + "entropy": 0.2198769545648247, + "epoch": 0.010129310344827586, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.5234375, + "learning_rate": 3.1551724137931037e-06, + "loss": -0.013682875037193298, + "num_tokens": 551073.0, + "reward": 0.4487079918384552, + "reward_std": 0.5221681177150458, + "rewards/reward_fn/mean": 0.4487079918384552, + "rewards/reward_fn/std": 0.5221681545488537, + "step": 235, + "step_time": 20.431722840200383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.6, + "completions/max_terminated_length": 118.6, + "completions/mean_length": 54.675, + "completions/mean_terminated_length": 54.675, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.20829223558394006, + "epoch": 0.010344827586206896, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.41015625, + "learning_rate": 3.1120689655172413e-06, + "loss": -0.030032938718795775, + "num_tokens": 562420.0, + "reward": 0.17456801533699035, + "reward_std": 0.46873694062232973, + "rewards/reward_fn/mean": 0.17456801533699035, + "rewards/reward_fn/std": 0.4687369393184781, + "step": 240, + "step_time": 20.53446474100001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.8, + "completions/max_terminated_length": 73.8, + "completions/mean_length": 42.475, + "completions/mean_terminated_length": 42.475, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.22942446915694745, + "epoch": 0.010560344827586207, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "learning_rate": 3.0689655172413797e-06, + "loss": 0.043540936708450315, + "num_tokens": 572919.0, + "reward": 0.5922660022974015, + "reward_std": 0.26113789578084834, + "rewards/reward_fn/mean": 0.5922660022974015, + "rewards/reward_fn/std": 0.2611379227571888, + "step": 245, + "step_time": 14.755489835400112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.4, + "completions/max_terminated_length": 57.4, + "completions/mean_length": 31.925, + "completions/mean_terminated_length": 31.925, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.16531706769019366, + "epoch": 0.010775862068965518, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7734375, + "learning_rate": 3.0258620689655178e-06, + "loss": 0.02320311963558197, + "num_tokens": 583104.0, + "reward": 0.7694700002670288, + "reward_std": 0.03285302005242556, + "rewards/reward_fn/mean": 0.7694700002670288, + "rewards/reward_fn/std": 0.03285302061121911, + "step": 250, + "step_time": 12.29388190760037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.2, + "completions/max_terminated_length": 62.2, + "completions/mean_length": 37.025, + "completions/mean_terminated_length": 37.025, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.17417828142351938, + "epoch": 0.010991379310344827, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.546875, + "learning_rate": 2.9827586206896554e-06, + "loss": 0.0023472100496292115, + "num_tokens": 593081.0, + "reward": 0.6150040000677108, + "reward_std": 0.19029689192248042, + "rewards/reward_fn/mean": 0.6150040000677108, + "rewards/reward_fn/std": 0.19029689457966015, + "step": 255, + "step_time": 12.741135484799088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.4, + "completions/max_terminated_length": 129.4, + "completions/mean_length": 63.825, + "completions/mean_terminated_length": 63.825, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.253861751453951, + "epoch": 0.011206896551724138, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.20703125, + "learning_rate": 2.9396551724137934e-06, + "loss": 0.019092470407485962, + "num_tokens": 605022.0, + "reward": 0.5227800011634827, + "reward_std": 0.22449488371494225, + "rewards/reward_fn/mean": 0.5227800011634827, + "rewards/reward_fn/std": 0.22449489417485893, + "step": 260, + "step_time": 21.9141057962006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 35.05, + "completions/mean_terminated_length": 35.05, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.1651882082573138, + "epoch": 0.011422413793103449, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.71875, + "learning_rate": 2.8965517241379314e-06, + "loss": -0.019537296891212464, + "num_tokens": 615204.0, + "reward": 0.6061379969120025, + "reward_std": 0.3120082815119531, + "rewards/reward_fn/mean": 0.6061379969120025, + "rewards/reward_fn/std": 0.31200829716108275, + "step": 265, + "step_time": 17.125141170400457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 56.975, + "completions/mean_terminated_length": 56.975, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.24561482777353377, + "epoch": 0.01163793103448276, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.7421875, + "learning_rate": 2.853448275862069e-06, + "loss": 0.04291326403617859, + "num_tokens": 626315.0, + "reward": 0.18831600546836852, + "reward_std": 0.5584077455103398, + "rewards/reward_fn/mean": 0.18831600546836852, + "rewards/reward_fn/std": 0.5584077499806881, + "step": 270, + "step_time": 21.739286486199852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.2, + "completions/max_terminated_length": 84.2, + "completions/mean_length": 57.575, + "completions/mean_terminated_length": 57.575, + "completions/min_length": 26.8, + "completions/min_terminated_length": 26.8, + "entropy": 0.23427433941978962, + "epoch": 0.011853448275862068, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.453125, + "learning_rate": 2.810344827586207e-06, + "loss": 0.039196133613586426, + "num_tokens": 638030.0, + "reward": 0.3995940089225769, + "reward_std": 0.3814970766892657, + "rewards/reward_fn/mean": 0.3995940089225769, + "rewards/reward_fn/std": 0.3814970819279552, + "step": 275, + "step_time": 16.096988868400878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 85.6, + "completions/mean_terminated_length": 85.6, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.2679269164043944, + "epoch": 0.01206896551724138, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "learning_rate": 2.7672413793103455e-06, + "loss": 0.03390491306781769, + "num_tokens": 650622.0, + "reward": 0.2944400191307068, + "reward_std": 0.6373479247093201, + "rewards/reward_fn/mean": 0.2944400191307068, + "rewards/reward_fn/std": 0.6373479485511779, + "step": 280, + "step_time": 26.78552907839967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 118.8, + "completions/max_terminated_length": 111.2, + "completions/mean_length": 59.8, + "completions/mean_terminated_length": 55.75000152587891, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "entropy": 0.2526095831533894, + "epoch": 0.01228448275862069, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.435546875, + "learning_rate": 2.724137931034483e-06, + "loss": 0.05690905451774597, + "num_tokens": 662374.0, + "reward": 0.1196260005235672, + "reward_std": 0.767779788421467, + "rewards/reward_fn/mean": 0.1196260005235672, + "rewards/reward_fn/std": 0.7677798029500991, + "step": 285, + "step_time": 20.659876349599834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 122.0, + "completions/max_terminated_length": 120.8, + "completions/mean_length": 67.7, + "completions/mean_terminated_length": 60.75833435058594, + "completions/min_length": 15.4, + "completions/min_terminated_length": 15.4, + "entropy": 0.2572842217050493, + "epoch": 0.0125, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.302734375, + "learning_rate": 2.681034482758621e-06, + "loss": 0.08332639336585998, + "num_tokens": 674146.0, + "reward": -0.005963999032974243, + "reward_std": 0.5873876449419185, + "rewards/reward_fn/mean": -0.005963999032974243, + "rewards/reward_fn/std": 0.5873876665486023, + "step": 290, + "step_time": 20.897897071199804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 49.4, + "completions/mean_terminated_length": 49.4, + "completions/min_length": 24.2, + "completions/min_terminated_length": 24.2, + "entropy": 0.1755124410497956, + "epoch": 0.01271551724137931, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.63671875, + "learning_rate": 2.637931034482759e-06, + "loss": 0.02972428798675537, + "num_tokens": 685162.0, + "reward": 0.5233660042285919, + "reward_std": 0.3009416783461347, + "rewards/reward_fn/mean": 0.5233660042285919, + "rewards/reward_fn/std": 0.3009416822576895, + "step": 295, + "step_time": 15.749949136399664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.8, + "completions/max_terminated_length": 93.8, + "completions/mean_length": 43.125, + "completions/mean_terminated_length": 43.125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.14239629595540465, + "epoch": 0.01293103448275862, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.29296875, + "learning_rate": 2.5948275862068967e-06, + "loss": 0.03822866082191467, + "num_tokens": 696263.0, + "reward": 0.7467899918556213, + "reward_std": 0.14996536182879935, + "rewards/reward_fn/mean": 0.7467899918556213, + "rewards/reward_fn/std": 0.14996537954139058, + "step": 300, + "step_time": 17.312333570999545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 131.2, + "completions/max_terminated_length": 100.6, + "completions/mean_length": 55.55, + "completions/mean_terminated_length": 50.860714721679685, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.22959837107919157, + "epoch": 0.013146551724137931, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.47265625, + "learning_rate": 2.5517241379310347e-06, + "loss": -0.094166100025177, + "num_tokens": 707373.0, + "reward": 0.13532000482082368, + "reward_std": 0.8043574929237366, + "rewards/reward_fn/mean": 0.13532000482082368, + "rewards/reward_fn/std": 0.8043575048446655, + "step": 305, + "step_time": 21.792155743800322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 108.2, + "completions/max_terminated_length": 89.8, + "completions/mean_length": 53.7, + "completions/mean_terminated_length": 49.05357208251953, + "completions/min_length": 16.8, + "completions/min_terminated_length": 16.8, + "entropy": 0.17698723110370337, + "epoch": 0.013362068965517242, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.455078125, + "learning_rate": 2.5086206896551723e-06, + "loss": 0.054443192481994626, + "num_tokens": 718741.0, + "reward": 0.29295199513435366, + "reward_std": 0.5932383094579563, + "rewards/reward_fn/mean": 0.29295199513435366, + "rewards/reward_fn/std": 0.5932383131046663, + "step": 310, + "step_time": 19.077739837599985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15, + "completions/max_length": 159.6, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 92.8, + "completions/mean_terminated_length": 73.96666717529297, + "completions/min_length": 34.8, + "completions/min_terminated_length": 34.8, + "entropy": 0.1539902521762997, + "epoch": 0.013577586206896551, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1796875, + "learning_rate": 2.4655172413793104e-06, + "loss": 0.04268620014190674, + "num_tokens": 732317.0, + "reward": 0.09846000373363495, + "reward_std": 0.654845405537344, + "rewards/reward_fn/mean": 0.09846000373363495, + "rewards/reward_fn/std": 0.6548454073446919, + "step": 315, + "step_time": 25.857181660000606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.8, + "completions/max_terminated_length": 46.8, + "completions/mean_length": 29.525, + "completions/mean_terminated_length": 29.525, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.13891502455808222, + "epoch": 0.013793103448275862, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 2.4224137931034484e-06, + "loss": 0.0070524528622627255, + "num_tokens": 742546.0, + "reward": 0.5995699942111969, + "reward_std": 0.21884493981051492, + "rewards/reward_fn/mean": 0.5995699942111969, + "rewards/reward_fn/std": 0.21884493713005213, + "step": 320, + "step_time": 11.21470864460007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 107.2, + "completions/max_terminated_length": 105.4, + "completions/mean_length": 58.825, + "completions/mean_terminated_length": 45.85, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.18881899882107972, + "epoch": 0.014008620689655173, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.0234375, + "learning_rate": 2.3793103448275864e-06, + "loss": -0.006822568923234939, + "num_tokens": 754463.0, + "reward": 0.05360400676727295, + "reward_std": 0.8347343930785428, + "rewards/reward_fn/mean": 0.05360400676727295, + "rewards/reward_fn/std": 0.8347344425696065, + "step": 325, + "step_time": 19.000760145999447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 137.8, + "completions/max_terminated_length": 133.2, + "completions/mean_length": 76.525, + "completions/mean_terminated_length": 70.575, + "completions/min_length": 29.4, + "completions/min_terminated_length": 29.4, + "entropy": 0.2549753186496673, + "epoch": 0.014224137931034483, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.291015625, + "learning_rate": 2.3362068965517244e-06, + "loss": -0.00032879412174224854, + "num_tokens": 766976.0, + "reward": 0.15794800221920013, + "reward_std": 0.39052192161325366, + "rewards/reward_fn/mean": 0.15794800221920013, + "rewards/reward_fn/std": 0.39052194436080756, + "step": 330, + "step_time": 23.229899759599355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.8, + "completions/max_terminated_length": 113.8, + "completions/mean_length": 44.9, + "completions/mean_terminated_length": 44.9, + "completions/min_length": 16.8, + "completions/min_terminated_length": 16.8, + "entropy": 0.16029987429210452, + "epoch": 0.014439655172413792, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 2.293103448275862e-06, + "loss": -0.1327621340751648, + "num_tokens": 777720.0, + "reward": -0.13512398451566696, + "reward_std": 0.6157536920160055, + "rewards/reward_fn/mean": -0.13512398451566696, + "rewards/reward_fn/std": 0.6157537158578634, + "step": 335, + "step_time": 19.72434388899965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15, + "completions/max_length": 124.8, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 89.45, + "completions/mean_terminated_length": 73.82000122070312, + "completions/min_length": 48.4, + "completions/min_terminated_length": 48.4, + "entropy": 0.22397661422437523, + "epoch": 0.014655172413793103, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.234375, + "learning_rate": 2.25e-06, + "loss": 0.020370352268218993, + "num_tokens": 791050.0, + "reward": 0.02506202459335327, + "reward_std": 0.030776232342759613, + "rewards/reward_fn/mean": 0.02506202459335327, + "rewards/reward_fn/std": 0.03077622954297112, + "step": 340, + "step_time": 21.360666513999785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 163.8, + "completions/max_terminated_length": 127.6, + "completions/mean_length": 75.875, + "completions/mean_terminated_length": 50.14833374023438, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.14475313751026989, + "epoch": 0.014870689655172414, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.490234375, + "learning_rate": 2.206896551724138e-06, + "loss": 0.08303472399711609, + "num_tokens": 803649.0, + "reward": 0.38839800357818605, + "reward_std": 0.44462206639145735, + "rewards/reward_fn/mean": 0.38839800357818605, + "rewards/reward_fn/std": 0.44462209131597774, + "step": 345, + "step_time": 26.33779034660056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1, + "completions/max_length": 186.8, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 85.5, + "completions/mean_terminated_length": 67.71666870117187, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.233237065654248, + "epoch": 0.015086206896551725, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.07177734375, + "learning_rate": 2.163793103448276e-06, + "loss": -0.04608747363090515, + "num_tokens": 816513.0, + "reward": -0.31583799719810485, + "reward_std": 0.6374753089621663, + "rewards/reward_fn/mean": -0.31583799719810485, + "rewards/reward_fn/std": 0.637475342862308, + "step": 350, + "step_time": 29.608003718000145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05, + "completions/max_length": 143.4, + "completions/max_terminated_length": 110.8, + "completions/mean_length": 61.4, + "completions/mean_terminated_length": 52.72500152587891, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "entropy": 0.20117253810167313, + "epoch": 0.015301724137931034, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.0, + "learning_rate": 2.120689655172414e-06, + "loss": -0.0012839555740356446, + "num_tokens": 828305.0, + "reward": 0.11507600247859955, + "reward_std": 0.5993056967010488, + "rewards/reward_fn/mean": 0.11507600247859955, + "rewards/reward_fn/std": 0.5993057217798196, + "step": 355, + "step_time": 23.633322461799253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.4, + "completions/max_terminated_length": 63.4, + "completions/mean_length": 30.9, + "completions/mean_terminated_length": 30.9, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.17519649318710434, + "epoch": 0.015517241379310345, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.99609375, + "learning_rate": 2.0775862068965517e-06, + "loss": 0.025579386949539186, + "num_tokens": 838501.0, + "reward": 0.4936339855194092, + "reward_std": 0.39777664675784763, + "rewards/reward_fn/mean": 0.4936339855194092, + "rewards/reward_fn/std": 0.3977766572148539, + "step": 360, + "step_time": 13.110204230200543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.6, + "completions/max_terminated_length": 61.6, + "completions/mean_length": 30.575, + "completions/mean_terminated_length": 30.575, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "entropy": 0.14497978072613477, + "epoch": 0.015732758620689655, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.43359375, + "learning_rate": 2.0344827586206897e-06, + "loss": 0.05366594791412353, + "num_tokens": 848536.0, + "reward": 0.3110899955034256, + "reward_std": 0.5116304856666829, + "rewards/reward_fn/mean": 0.3110899955034256, + "rewards/reward_fn/std": 0.5116304916300578, + "step": 365, + "step_time": 12.62919970000039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.6, + "completions/max_terminated_length": 82.6, + "completions/mean_length": 40.325, + "completions/mean_terminated_length": 40.325, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.1809167622952373, + "epoch": 0.015948275862068966, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.765625, + "learning_rate": 1.9913793103448278e-06, + "loss": 0.04847588837146759, + "num_tokens": 859473.0, + "reward": 0.2660140126943588, + "reward_std": 0.645512792468071, + "rewards/reward_fn/mean": 0.2660140126943588, + "rewards/reward_fn/std": 0.645512792468071, + "step": 370, + "step_time": 16.079390238599444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 52.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 24.85, + "completions/mean_terminated_length": 24.85, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.18615856370088296, + "epoch": 0.016163793103448277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4375, + "learning_rate": 1.948275862068966e-06, + "loss": -0.12370493412017822, + "num_tokens": 868927.0, + "reward": 0.3794360011816025, + "reward_std": 0.5513797464169329, + "rewards/reward_fn/mean": 0.3794360011816025, + "rewards/reward_fn/std": 0.5513797624618746, + "step": 375, + "step_time": 11.392414143199858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 62.25, + "completions/mean_terminated_length": 62.25, + "completions/min_length": 32.8, + "completions/min_terminated_length": 32.8, + "entropy": 0.12981002542364878, + "epoch": 0.016379310344827588, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 1.9051724137931036e-06, + "loss": -0.005396611988544464, + "num_tokens": 880641.0, + "reward": 0.37234801054000854, + "reward_std": 0.33126811534166334, + "rewards/reward_fn/mean": 0.37234801054000854, + "rewards/reward_fn/std": 0.33126811534166334, + "step": 380, + "step_time": 18.616042540200944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.2, + "completions/max_terminated_length": 54.2, + "completions/mean_length": 37.275, + "completions/mean_terminated_length": 37.275, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "entropy": 0.11609641392715275, + "epoch": 0.016594827586206895, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.67578125, + "learning_rate": 1.8620689655172416e-06, + "loss": 0.0155934676527977, + "num_tokens": 891240.0, + "reward": 0.5631299942731858, + "reward_std": 0.35218784370808864, + "rewards/reward_fn/mean": 0.5631299942731858, + "rewards/reward_fn/std": 0.35218784669123127, + "step": 385, + "step_time": 11.740256453400798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.8, + "completions/max_terminated_length": 112.8, + "completions/mean_length": 51.875, + "completions/mean_terminated_length": 51.875, + "completions/min_length": 17.4, + "completions/min_terminated_length": 17.4, + "entropy": 0.13740509356721303, + "epoch": 0.016810344827586206, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.49609375, + "learning_rate": 1.8189655172413794e-06, + "loss": 0.10463507175445556, + "num_tokens": 902783.0, + "reward": 0.6312699973583221, + "reward_std": 0.2348479055101052, + "rewards/reward_fn/mean": 0.6312699973583221, + "rewards/reward_fn/std": 0.23484793551033362, + "step": 390, + "step_time": 19.62515979279924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.8, + "completions/max_terminated_length": 68.8, + "completions/mean_length": 38.425, + "completions/mean_terminated_length": 38.425, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.22300243815625437, + "epoch": 0.017025862068965517, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.376953125, + "learning_rate": 1.7758620689655172e-06, + "loss": 0.020081929862499237, + "num_tokens": 913372.0, + "reward": 0.3120719850063324, + "reward_std": 0.6087781610287493, + "rewards/reward_fn/mean": 0.3120719850063324, + "rewards/reward_fn/std": 0.6087781770736911, + "step": 395, + "step_time": 13.894178912400093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.4, + "completions/max_terminated_length": 42.4, + "completions/mean_length": 26.6, + "completions/mean_terminated_length": 26.6, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.16512603219016456, + "epoch": 0.017241379310344827, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.71484375, + "learning_rate": 1.7327586206896555e-06, + "loss": -0.00851401463150978, + "num_tokens": 923000.0, + "reward": 0.1584160089492798, + "reward_std": 0.7910981756576803, + "rewards/reward_fn/mean": 0.1584160089492798, + "rewards/reward_fn/std": 0.7910982114233776, + "step": 400, + "step_time": 10.169051740999567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.2, + "completions/max_terminated_length": 83.2, + "completions/mean_length": 32.7, + "completions/mean_terminated_length": 32.7, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.2798259791918099, + "epoch": 0.017456896551724138, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.5859375, + "learning_rate": 1.6896551724137933e-06, + "loss": -0.05898996591567993, + "num_tokens": 933180.0, + "reward": -0.09678800404071808, + "reward_std": 0.8124626636505127, + "rewards/reward_fn/mean": -0.09678800404071808, + "rewards/reward_fn/std": 0.8124626994132995, + "step": 405, + "step_time": 15.84627854859973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 45.6, + "completions/mean_terminated_length": 45.6, + "completions/min_length": 27.4, + "completions/min_terminated_length": 27.4, + "entropy": 0.12936758662763168, + "epoch": 0.01767241379310345, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6953125, + "learning_rate": 1.646551724137931e-06, + "loss": -0.033120396733284, + "num_tokens": 944080.0, + "reward": 0.5422719717025757, + "reward_std": 0.28954650281812067, + "rewards/reward_fn/mean": 0.5422719717025757, + "rewards/reward_fn/std": 0.28954653043765577, + "step": 410, + "step_time": 15.51832699400038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 132.4, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 55.15, + "completions/mean_terminated_length": 50.94642944335938, + "completions/min_length": 16.8, + "completions/min_terminated_length": 16.8, + "entropy": 0.1441746059259458, + "epoch": 0.01788793103448276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.76953125, + "learning_rate": 1.603448275862069e-06, + "loss": -0.0725719690322876, + "num_tokens": 955402.0, + "reward": 0.16672799736261368, + "reward_std": 0.8173915803432464, + "rewards/reward_fn/mean": 0.16672799736261368, + "rewards/reward_fn/std": 0.8173915863037109, + "step": 415, + "step_time": 22.511086131800404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 102.2, + "completions/max_terminated_length": 94.4, + "completions/mean_length": 45.275, + "completions/mean_terminated_length": 40.475, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.13067209254659246, + "epoch": 0.01810344827586207, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.578125, + "learning_rate": 1.5603448275862071e-06, + "loss": -0.061251068115234376, + "num_tokens": 965973.0, + "reward": 0.04006999731063843, + "reward_std": 0.8386635899543762, + "rewards/reward_fn/mean": 0.04006999731063843, + "rewards/reward_fn/std": 0.8386636175215244, + "step": 420, + "step_time": 18.12668382439988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 112.4, + "completions/max_terminated_length": 111.6, + "completions/mean_length": 53.35, + "completions/mean_terminated_length": 39.21999969482422, + "completions/min_length": 17.8, + "completions/min_terminated_length": 17.8, + "entropy": 0.19069459346355871, + "epoch": 0.018318965517241378, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.22265625, + "learning_rate": 1.517241379310345e-06, + "loss": 0.048850050568580626, + "num_tokens": 977975.0, + "reward": 0.5497639954090119, + "reward_std": 0.17705658902705182, + "rewards/reward_fn/mean": 0.5497639954090119, + "rewards/reward_fn/std": 0.17705659940256738, + "step": 425, + "step_time": 20.06997448439979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.4, + "completions/max_terminated_length": 58.4, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 28.75, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.135710381649551, + "epoch": 0.01853448275862069, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.90625, + "learning_rate": 1.4741379310344828e-06, + "loss": 0.04218609035015106, + "num_tokens": 988389.0, + "reward": 0.6093420043587685, + "reward_std": 0.3344592633948196, + "rewards/reward_fn/mean": 0.6093420043587685, + "rewards/reward_fn/std": 0.3344592841720441, + "step": 430, + "step_time": 13.001832899203146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.4, + "completions/max_terminated_length": 74.4, + "completions/mean_length": 40.975, + "completions/mean_terminated_length": 40.975, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.21582197244933923, + "epoch": 0.01875, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 1.4310344827586208e-06, + "loss": -0.022132165729999542, + "num_tokens": 998772.0, + "reward": 0.37132199853658676, + "reward_std": 0.5101128663867712, + "rewards/reward_fn/mean": 0.37132199853658676, + "rewards/reward_fn/std": 0.510112880077213, + "step": 435, + "step_time": 14.487816170999213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 127.4, + "completions/max_terminated_length": 98.4, + "completions/mean_length": 51.825, + "completions/mean_terminated_length": 46.90714340209961, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.20477710807172117, + "epoch": 0.01896551724137931, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.435546875, + "learning_rate": 1.3879310344827588e-06, + "loss": 0.02955879271030426, + "num_tokens": 1010253.0, + "reward": 0.3175980031490326, + "reward_std": 0.5348470628261566, + "rewards/reward_fn/mean": 0.3175980031490326, + "rewards/reward_fn/std": 0.5348470747470856, + "step": 440, + "step_time": 21.879293413598862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 34.175, + "completions/mean_terminated_length": 34.175, + "completions/min_length": 15.4, + "completions/min_terminated_length": 15.4, + "entropy": 0.12146973262715619, + "epoch": 0.01918103448275862, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5703125, + "learning_rate": 1.3448275862068966e-06, + "loss": -0.06392813324928284, + "num_tokens": 1020428.0, + "reward": 0.3168379902839661, + "reward_std": 0.6938783974153921, + "rewards/reward_fn/mean": 0.3168379902839661, + "rewards/reward_fn/std": 0.6938784208497963, + "step": 445, + "step_time": 13.7290538027999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 33.3, + "completions/mean_terminated_length": 33.3, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.10337164305092301, + "epoch": 0.01939655172413793, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 1.3017241379310346e-06, + "loss": 0.018225130438804627, + "num_tokens": 1030752.0, + "reward": 0.4067739874124527, + "reward_std": 0.42165334834717216, + "rewards/reward_fn/mean": 0.4067739874124527, + "rewards/reward_fn/std": 0.4216533594531938, + "step": 450, + "step_time": 13.09169417019948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.6, + "completions/max_terminated_length": 91.6, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 41.75, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.18729911991395057, + "epoch": 0.019612068965517242, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.50390625, + "learning_rate": 1.2586206896551725e-06, + "loss": -0.09923816323280335, + "num_tokens": 1041430.0, + "reward": 0.28174002170562745, + "reward_std": 0.6190577574074269, + "rewards/reward_fn/mean": 0.28174002170562745, + "rewards/reward_fn/std": 0.6190577335655689, + "step": 455, + "step_time": 17.306821168800525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.6, + "completions/max_terminated_length": 72.6, + "completions/mean_length": 38.175, + "completions/mean_terminated_length": 38.175, + "completions/min_length": 16.8, + "completions/min_terminated_length": 16.8, + "entropy": 0.12451781287672929, + "epoch": 0.019827586206896553, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.96875, + "learning_rate": 1.2155172413793105e-06, + "loss": -0.06482980847358703, + "num_tokens": 1052265.0, + "reward": 0.3294659972190857, + "reward_std": 0.08181906587124103, + "rewards/reward_fn/mean": 0.3294659972190857, + "rewards/reward_fn/std": 0.08181906841491582, + "step": 460, + "step_time": 14.729962180000802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.8, + "completions/max_terminated_length": 67.8, + "completions/mean_length": 36.525, + "completions/mean_terminated_length": 36.525, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "entropy": 0.14964785871852654, + "epoch": 0.02004310344827586, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 1.1724137931034483e-06, + "loss": 0.03263564705848694, + "num_tokens": 1062750.0, + "reward": 0.7607880115509034, + "reward_std": 0.04108410105109215, + "rewards/reward_fn/mean": 0.7607880115509034, + "rewards/reward_fn/std": 0.04108410105109215, + "step": 465, + "step_time": 13.958972323600756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.4, + "completions/max_terminated_length": 78.4, + "completions/mean_length": 38.6, + "completions/mean_terminated_length": 38.6, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.07761492129939143, + "epoch": 0.02025862068965517, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.451171875, + "learning_rate": 1.1293103448275863e-06, + "loss": -0.038190174102783206, + "num_tokens": 1073254.0, + "reward": 0.39792599976062776, + "reward_std": 0.43699904829263686, + "rewards/reward_fn/mean": 0.39792599976062776, + "rewards/reward_fn/std": 0.4369990237057209, + "step": 470, + "step_time": 15.37113720279849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 27.875, + "completions/mean_terminated_length": 27.875, + "completions/min_length": 16.4, + "completions/min_terminated_length": 16.4, + "entropy": 0.10411922150960891, + "epoch": 0.020474137931034482, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.52734375, + "learning_rate": 1.0862068965517241e-06, + "loss": 0.08627685904502869, + "num_tokens": 1083473.0, + "reward": 0.4679439961910248, + "reward_std": 0.40033682010543997, + "rewards/reward_fn/mean": 0.4679439961910248, + "rewards/reward_fn/std": 0.4003368301899172, + "step": 475, + "step_time": 11.856545787199867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 54.2, + "completions/max_terminated_length": 54.2, + "completions/mean_length": 27.975, + "completions/mean_terminated_length": 27.975, + "completions/min_length": 17.2, + "completions/min_terminated_length": 17.2, + "entropy": 0.09195145816338482, + "epoch": 0.020689655172413793, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.84375, + "learning_rate": 1.0431034482758621e-06, + "loss": 0.11756571531295776, + "num_tokens": 1093300.0, + "reward": 0.5340119987726212, + "reward_std": 0.4810411691665649, + "rewards/reward_fn/mean": 0.5340119987726212, + "rewards/reward_fn/std": 0.481041194498539, + "step": 480, + "step_time": 11.739623094601484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.8, + "completions/max_terminated_length": 103.8, + "completions/mean_length": 56.05, + "completions/mean_terminated_length": 56.05, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.14636036474257708, + "epoch": 0.020905172413793104, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.86328125, + "learning_rate": 1.0000000000000002e-06, + "loss": -0.0425639271736145, + "num_tokens": 1104678.0, + "reward": 0.503628009557724, + "reward_std": 0.39048823765624546, + "rewards/reward_fn/mean": 0.503628009557724, + "rewards/reward_fn/std": 0.39048823688208356, + "step": 485, + "step_time": 18.426820938600212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.6, + "completions/max_terminated_length": 85.6, + "completions/mean_length": 44.8, + "completions/mean_terminated_length": 44.8, + "completions/min_length": 17.4, + "completions/min_terminated_length": 17.4, + "entropy": 0.13744304669089616, + "epoch": 0.021120689655172414, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.375, + "learning_rate": 9.56896551724138e-07, + "loss": -0.012553822994232178, + "num_tokens": 1115738.0, + "reward": 0.1133280098438263, + "reward_std": 0.7501570947468281, + "rewards/reward_fn/mean": 0.1133280098438263, + "rewards/reward_fn/std": 0.7501571170985699, + "step": 490, + "step_time": 16.32867533860008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 19.825, + "completions/mean_terminated_length": 19.825, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.09021760655959951, + "epoch": 0.021336206896551725, + "frac_reward_zero_std": 0.6, + "grad_norm": 1.4375, + "learning_rate": 9.13793103448276e-07, + "loss": -0.06582846641540527, + "num_tokens": 1125275.0, + "reward": 0.6070199966430664, + "reward_std": 0.2248265265574446, + "rewards/reward_fn/mean": 0.6070199966430664, + "rewards/reward_fn/std": 0.22482652704929934, + "step": 495, + "step_time": 10.234892192999178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.2, + "completions/max_terminated_length": 86.2, + "completions/mean_length": 30.6, + "completions/mean_terminated_length": 30.6, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.16520561957440805, + "epoch": 0.021551724137931036, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.65234375, + "learning_rate": 8.706896551724138e-07, + "loss": 0.05150673389434814, + "num_tokens": 1135535.0, + "reward": 0.27766999006271365, + "reward_std": 0.14229891437571496, + "rewards/reward_fn/mean": 0.27766999006271365, + "rewards/reward_fn/std": 0.14229891322320326, + "step": 500, + "step_time": 16.51249713080106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 104.4, + "completions/max_terminated_length": 87.4, + "completions/mean_length": 44.05, + "completions/mean_terminated_length": 39.35714416503906, + "completions/min_length": 15.8, + "completions/min_terminated_length": 15.8, + "entropy": 0.08128404353337829, + "epoch": 0.021767241379310343, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.0, + "learning_rate": 8.275862068965518e-07, + "loss": -0.03205357193946838, + "num_tokens": 1146069.0, + "reward": 0.19227599799633027, + "reward_std": 0.7330781102180481, + "rewards/reward_fn/mean": 0.19227599799633027, + "rewards/reward_fn/std": 0.7330781102180481, + "step": 505, + "step_time": 18.19536280339962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.4, + "completions/max_terminated_length": 60.4, + "completions/mean_length": 30.925, + "completions/mean_terminated_length": 30.925, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.177409560502565, + "epoch": 0.021982758620689654, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.478515625, + "learning_rate": 7.844827586206897e-07, + "loss": 0.09961764812469483, + "num_tokens": 1156114.0, + "reward": 0.6251720190048218, + "reward_std": 0.33523904105677504, + "rewards/reward_fn/mean": 0.6251720190048218, + "rewards/reward_fn/std": 0.33523902565793834, + "step": 510, + "step_time": 12.48947542400092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.2, + "completions/max_terminated_length": 61.2, + "completions/mean_length": 29.025, + "completions/mean_terminated_length": 29.025, + "completions/min_length": 15.6, + "completions/min_terminated_length": 15.6, + "entropy": 0.1548972967953887, + "epoch": 0.022198275862068965, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.96484375, + "learning_rate": 7.413793103448276e-07, + "loss": -0.04886127114295959, + "num_tokens": 1166031.0, + "reward": -0.02624799758195877, + "reward_std": 0.9727282524108887, + "rewards/reward_fn/mean": -0.02624799758195877, + "rewards/reward_fn/std": 0.9727282643318176, + "step": 515, + "step_time": 12.66001488919792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.4, + "completions/max_terminated_length": 83.4, + "completions/mean_length": 34.15, + "completions/mean_terminated_length": 34.15, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.1318921248661354, + "epoch": 0.022413793103448276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.37890625, + "learning_rate": 6.982758620689656e-07, + "loss": 0.16495332717895508, + "num_tokens": 1176785.0, + "reward": 0.649573989212513, + "reward_std": 0.20802882347197738, + "rewards/reward_fn/mean": 0.649573989212513, + "rewards/reward_fn/std": 0.20802885060838888, + "step": 520, + "step_time": 16.0061209066007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 63.6, + "completions/max_terminated_length": 63.6, + "completions/mean_length": 27.55, + "completions/mean_terminated_length": 27.55, + "completions/min_length": 16.2, + "completions/min_terminated_length": 16.2, + "entropy": 0.10987133319722489, + "epoch": 0.022629310344827586, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.412109375, + "learning_rate": 6.551724137931036e-07, + "loss": 0.061076796054840087, + "num_tokens": 1187031.0, + "reward": -0.0006740093231201172, + "reward_std": 0.42037363812560213, + "rewards/reward_fn/mean": -0.0006740093231201172, + "rewards/reward_fn/std": 0.42037364130374044, + "step": 525, + "step_time": 13.465872670798854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 90.0, + "completions/max_terminated_length": 81.6, + "completions/mean_length": 39.175, + "completions/mean_terminated_length": 34.67857208251953, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "entropy": 0.1547772396108485, + "epoch": 0.022844827586206897, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.515625, + "learning_rate": 6.120689655172414e-07, + "loss": 0.06957628726959228, + "num_tokens": 1197570.0, + "reward": 0.4232920050621033, + "reward_std": 0.3604313934542006, + "rewards/reward_fn/mean": 0.4232920050621033, + "rewards/reward_fn/std": 0.36043139236280697, + "step": 530, + "step_time": 16.512748561598347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.2, + "completions/max_terminated_length": 96.2, + "completions/mean_length": 44.25, + "completions/mean_terminated_length": 44.25, + "completions/min_length": 16.6, + "completions/min_terminated_length": 16.6, + "entropy": 0.18253832953050733, + "epoch": 0.023060344827586208, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.59375, + "learning_rate": 5.689655172413793e-07, + "loss": 0.006311050057411194, + "num_tokens": 1208500.0, + "reward": 0.15177001953125, + "reward_std": 0.35281400037929417, + "rewards/reward_fn/mean": 0.15177001953125, + "rewards/reward_fn/std": 0.3528140248730779, + "step": 535, + "step_time": 17.665802622400225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.4, + "completions/max_terminated_length": 83.4, + "completions/mean_length": 48.05, + "completions/mean_terminated_length": 48.05, + "completions/min_length": 24.8, + "completions/min_terminated_length": 24.8, + "entropy": 0.21351429815404116, + "epoch": 0.02327586206896552, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.49609375, + "learning_rate": 5.258620689655172e-07, + "loss": -0.027930429577827452, + "num_tokens": 1219822.0, + "reward": 0.38965801149606705, + "reward_std": 0.5316631376743317, + "rewards/reward_fn/mean": 0.38965801149606705, + "rewards/reward_fn/std": 0.5316631510853768, + "step": 540, + "step_time": 15.973668830200404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.075, + "completions/max_length": 121.2, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 79.1, + "completions/mean_terminated_length": 73.175, + "completions/min_length": 38.6, + "completions/min_terminated_length": 38.6, + "entropy": 0.24065397809899877, + "epoch": 0.023491379310344826, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.1142578125, + "learning_rate": 4.827586206896552e-07, + "loss": 0.016971246898174287, + "num_tokens": 1232174.0, + "reward": 0.2989579916000366, + "reward_std": 0.3972888857126236, + "rewards/reward_fn/mean": 0.2989579916000366, + "rewards/reward_fn/std": 0.3972889166325331, + "step": 545, + "step_time": 20.859514107201175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 97.2, + "completions/max_terminated_length": 77.6, + "completions/mean_length": 48.9, + "completions/mean_terminated_length": 45.27857208251953, + "completions/min_length": 24.4, + "completions/min_terminated_length": 24.4, + "entropy": 0.14531502816826106, + "epoch": 0.023706896551724137, + "frac_reward_zero_std": 0.4, + "grad_norm": 1.921875, + "learning_rate": 4.3965517241379313e-07, + "loss": 0.04016273319721222, + "num_tokens": 1243278.0, + "reward": 0.09395999610424041, + "reward_std": 0.7670043525766232, + "rewards/reward_fn/mean": 0.09395999610424041, + "rewards/reward_fn/std": 0.767004385330074, + "step": 550, + "step_time": 17.55344593800146 + } + ], + "logging_steps": 5, + "max_steps": 600, + "num_input_tokens_seen": 1243278, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}