diff --git "a/checkpoint-450/trainer_state.json" "b/checkpoint-450/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-450/trainer_state.json" @@ -0,0 +1,11284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5142857142857142, + "eval_steps": 500, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2837817668914795, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": -0.09800112247467041, + "reward_std": 0.3028089702129364, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2421981245279312, + "learning_rate": 2e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.020556632429361343, + "reward_std": 0.3545936942100525, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 1952.234375, + "completions/mean_terminated_length": 822.2000122070312, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24851329624652863, + "learning_rate": 4e-08, + "loss": -0.0, + "num_tokens": 375163.0, + "reward": -0.22721199691295624, + "reward_std": 0.14563649892807007, + "rewards/cosine_scaled_reward/mean": -0.22721199691295624, + "rewards/cosine_scaled_reward/std": 0.1709199845790863, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 1554.109375, + "completions/mean_terminated_length": 958.0344848632812, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29272863268852234, + "learning_rate": 6e-08, + "loss": -0.0, + "num_tokens": 484434.0, + "reward": -0.17542189359664917, + "reward_std": 0.18219107389450073, + "rewards/cosine_scaled_reward/mean": -0.17542189359664917, + "rewards/cosine_scaled_reward/std": 0.27975013852119446, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1943.0625, + "completions/mean_terminated_length": 1088.571533203125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2773251533508301, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 619606.0, + "reward": -0.2648562788963318, + "reward_std": 0.21638144552707672, + "rewards/cosine_scaled_reward/mean": -0.2648562788963318, + "rewards/cosine_scaled_reward/std": 0.23959198594093323, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1854.21875, + "completions/mean_terminated_length": 920.5454711914062, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27399909496307373, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 749924.0, + "reward": -0.19292885065078735, + "reward_std": 0.2666770815849304, + "rewards/cosine_scaled_reward/mean": -0.19292885065078735, + "rewards/cosine_scaled_reward/std": 0.295730322599411, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 1940.5625, + "completions/mean_terminated_length": 1065.71435546875, + "completions/min_length": 773.0, + "completions/min_terminated_length": 773.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23362359404563904, + "learning_rate": 1.2e-07, + "loss": 0.0, + "num_tokens": 884528.0, + "reward": -0.18198424577713013, + "reward_std": 0.18540163338184357, + "rewards/cosine_scaled_reward/mean": -0.18198424577713013, + "rewards/cosine_scaled_reward/std": 0.32407456636428833, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1708.5625, + "completions/mean_terminated_length": 1013.5238037109375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24677562713623047, + "learning_rate": 1.4e-07, + "loss": -0.0, + "num_tokens": 1004292.0, + "reward": -0.09573853015899658, + "reward_std": 0.22485454380512238, + "rewards/cosine_scaled_reward/mean": -0.09573852270841599, + "rewards/cosine_scaled_reward/std": 0.449250191450119, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 1979.359375, + "completions/mean_terminated_length": 949.75, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26966309547424316, + "learning_rate": 1.6e-07, + "loss": 0.0, + "num_tokens": 1142427.0, + "reward": -0.19992578029632568, + "reward_std": 0.20190927386283875, + "rewards/cosine_scaled_reward/mean": -0.19992581009864807, + "rewards/cosine_scaled_reward/std": 0.23785534501075745, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1652.59375, + "completions/mean_terminated_length": 897.727294921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3011312484741211, + "learning_rate": 1.8e-07, + "loss": 0.0, + "num_tokens": 1259025.0, + "reward": -0.11706389486789703, + "reward_std": 0.2934548258781433, + "rewards/cosine_scaled_reward/mean": -0.11706390231847763, + "rewards/cosine_scaled_reward/std": 0.3601698577404022, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 1946.6875, + "completions/mean_terminated_length": 967.3333740234375, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2451399564743042, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 1395285.0, + "reward": -0.2866281270980835, + "reward_std": 0.12184012681245804, + "rewards/cosine_scaled_reward/mean": -0.2866281270980835, + "rewards/cosine_scaled_reward/std": 0.15141677856445312, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1659.28125, + "completions/mean_terminated_length": 1190.137939453125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733561396598816, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "num_tokens": 1512423.0, + "reward": -0.13816070556640625, + "reward_std": 0.2968980073928833, + "rewards/cosine_scaled_reward/mean": -0.13816070556640625, + "rewards/cosine_scaled_reward/std": 0.3597467839717865, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 1807.796875, + "completions/mean_terminated_length": 1023.1333618164062, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25238803029060364, + "learning_rate": 2.4e-07, + "loss": 0.0, + "num_tokens": 1639162.0, + "reward": -0.13488636910915375, + "reward_std": 0.2661236524581909, + "rewards/cosine_scaled_reward/mean": -0.13488635420799255, + "rewards/cosine_scaled_reward/std": 0.3444243371486664, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 1846.921875, + "completions/mean_terminated_length": 1243.6875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2201598882675171, + "learning_rate": 2.6e-07, + "loss": -0.0, + "num_tokens": 1767973.0, + "reward": -0.20591925084590912, + "reward_std": 0.21505361795425415, + "rewards/cosine_scaled_reward/mean": -0.20591923594474792, + "rewards/cosine_scaled_reward/std": 0.323749840259552, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 1710.421875, + "completions/mean_terminated_length": 847.7222290039062, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2665213644504547, + "learning_rate": 2.8e-07, + "loss": 0.0, + "num_tokens": 1888360.0, + "reward": -0.0778750479221344, + "reward_std": 0.17502948641777039, + "rewards/cosine_scaled_reward/mean": -0.0778750628232956, + "rewards/cosine_scaled_reward/std": 0.47343766689300537, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 2031.03125, + "completions/mean_terminated_length": 962.0, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23009927570819855, + "learning_rate": 3e-07, + "loss": -0.0, + "num_tokens": 2028786.0, + "reward": -0.2619968056678772, + "reward_std": 0.16954168677330017, + "rewards/cosine_scaled_reward/mean": -0.2619968056678772, + "rewards/cosine_scaled_reward/std": 0.18357795476913452, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1533.15625, + "completions/mean_terminated_length": 780.6923217773438, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3392995297908783, + "learning_rate": 3.2e-07, + "loss": -0.0, + "num_tokens": 2137428.0, + "reward": -0.11706461012363434, + "reward_std": 0.3096129894256592, + "rewards/cosine_scaled_reward/mean": -0.11706460267305374, + "rewards/cosine_scaled_reward/std": 0.3810974657535553, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1774.46875, + "completions/mean_terminated_length": 1018.2352905273438, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23254038393497467, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "num_tokens": 2261370.0, + "reward": -0.18709540367126465, + "reward_std": 0.2795025110244751, + "rewards/cosine_scaled_reward/mean": -0.18709540367126465, + "rewards/cosine_scaled_reward/std": 0.3359416127204895, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1719.0, + "completions/mean_terminated_length": 995.2000122070312, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262045681476593, + "learning_rate": 3.6e-07, + "loss": -0.0, + "num_tokens": 2382642.0, + "reward": -0.02329203486442566, + "reward_std": 0.34684932231903076, + "rewards/cosine_scaled_reward/mean": -0.02329203486442566, + "rewards/cosine_scaled_reward/std": 0.47637447714805603, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1630.90625, + "completions/mean_terminated_length": 935.75, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250532329082489, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "num_tokens": 2498372.0, + "reward": -0.06319350004196167, + "reward_std": 0.2394939512014389, + "rewards/cosine_scaled_reward/mean": -0.06319350004196167, + "rewards/cosine_scaled_reward/std": 0.3889789879322052, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 1735.96875, + "completions/mean_terminated_length": 1140.272705078125, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2773231565952301, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 2620282.0, + "reward": -0.20884393155574799, + "reward_std": 0.20233216881752014, + "rewards/cosine_scaled_reward/mean": -0.20884393155574799, + "rewards/cosine_scaled_reward/std": 0.28432920575141907, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 1342.953125, + "completions/mean_terminated_length": 919.9249877929688, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34627005457878113, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "num_tokens": 2715247.0, + "reward": -0.09092864394187927, + "reward_std": 0.21042926609516144, + "rewards/cosine_scaled_reward/mean": -0.09092865139245987, + "rewards/cosine_scaled_reward/std": 0.43559205532073975, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1661.9375, + "completions/mean_terminated_length": 1132.888916015625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2705242335796356, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "num_tokens": 2832403.0, + "reward": -0.13339249789714813, + "reward_std": 0.2433384656906128, + "rewards/cosine_scaled_reward/mean": -0.13339248299598694, + "rewards/cosine_scaled_reward/std": 0.3815627098083496, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1802.296875, + "completions/mean_terminated_length": 1065.1875, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24961258471012115, + "learning_rate": 4.6e-07, + "loss": 0.0, + "num_tokens": 2958678.0, + "reward": -0.18733163177967072, + "reward_std": 0.2773033380508423, + "rewards/cosine_scaled_reward/mean": -0.1873316466808319, + "rewards/cosine_scaled_reward/std": 0.37051624059677124, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1731.53125, + "completions/mean_terminated_length": 982.0, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2662124037742615, + "learning_rate": 4.8e-07, + "loss": 0.0, + "num_tokens": 3079792.0, + "reward": -0.12407588213682175, + "reward_std": 0.25581949949264526, + "rewards/cosine_scaled_reward/mean": -0.12407589703798294, + "rewards/cosine_scaled_reward/std": 0.39043793082237244, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1965.46875, + "completions/mean_terminated_length": 1567.8182373046875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23202598094940186, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 3216214.0, + "reward": -0.0963105633854866, + "reward_std": 0.30887559056282043, + "rewards/cosine_scaled_reward/mean": -0.0963105633854866, + "rewards/cosine_scaled_reward/std": 0.39396020770072937, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1886.96875, + "completions/mean_terminated_length": 1111.0909423828125, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2878379225730896, + "learning_rate": 5.2e-07, + "loss": -0.0, + "num_tokens": 3347268.0, + "reward": -0.1645491123199463, + "reward_std": 0.28629785776138306, + "rewards/cosine_scaled_reward/mean": -0.1645491123199463, + "rewards/cosine_scaled_reward/std": 0.35050687193870544, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1843.640625, + "completions/mean_terminated_length": 1230.5625, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24996496737003326, + "learning_rate": 5.4e-07, + "loss": 0.0, + "num_tokens": 3475597.0, + "reward": -0.06605555862188339, + "reward_std": 0.2643629312515259, + "rewards/cosine_scaled_reward/mean": -0.06605555862188339, + "rewards/cosine_scaled_reward/std": 0.438128799200058, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 2020.5, + "completions/mean_terminated_length": 1608.0, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23316837847232819, + "learning_rate": 5.6e-07, + "loss": -0.0, + "num_tokens": 3615381.0, + "reward": -0.2015206664800644, + "reward_std": 0.15312039852142334, + "rewards/cosine_scaled_reward/mean": -0.2015206664800644, + "rewards/cosine_scaled_reward/std": 0.1648881882429123, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 1826.046875, + "completions/mean_terminated_length": 955.3077392578125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2410832792520523, + "learning_rate": 5.8e-07, + "loss": -0.0, + "num_tokens": 3742784.0, + "reward": -0.17509159445762634, + "reward_std": 0.18994277715682983, + "rewards/cosine_scaled_reward/mean": -0.17509159445762634, + "rewards/cosine_scaled_reward/std": 0.22516494989395142, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1781.4375, + "completions/mean_terminated_length": 910.6666870117188, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2693414092063904, + "learning_rate": 6e-07, + "loss": 0.0, + "num_tokens": 3867292.0, + "reward": -0.24513831734657288, + "reward_std": 0.28315529227256775, + "rewards/cosine_scaled_reward/mean": -0.24513831734657288, + "rewards/cosine_scaled_reward/std": 0.3480584919452667, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1969.28125, + "completions/mean_terminated_length": 1488.2222900390625, + "completions/min_length": 1088.0, + "completions/min_terminated_length": 1088.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24202018976211548, + "learning_rate": 6.2e-07, + "loss": 0.0, + "num_tokens": 4003678.0, + "reward": -0.18968716263771057, + "reward_std": 0.28299200534820557, + "rewards/cosine_scaled_reward/mean": -0.18968716263771057, + "rewards/cosine_scaled_reward/std": 0.3119950294494629, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22288212180137634, + "learning_rate": 6.4e-07, + "loss": 0.0, + "num_tokens": 4145966.0, + "reward": -0.2955162525177002, + "reward_std": 0.17793573439121246, + "rewards/cosine_scaled_reward/mean": -0.2955162525177002, + "rewards/cosine_scaled_reward/std": 0.22786569595336914, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 1589.640625, + "completions/mean_terminated_length": 1036.4482421875, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31030499935150146, + "learning_rate": 6.6e-07, + "loss": 0.0, + "num_tokens": 4257255.0, + "reward": 0.008002171292901039, + "reward_std": 0.3413254916667938, + "rewards/cosine_scaled_reward/mean": 0.008002176880836487, + "rewards/cosine_scaled_reward/std": 0.4431404769420624, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1785.921875, + "completions/mean_terminated_length": 757.769287109375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3145958483219147, + "learning_rate": 6.800000000000001e-07, + "loss": -0.0, + "num_tokens": 4383050.0, + "reward": -0.16386553645133972, + "reward_std": 0.2818174958229065, + "rewards/cosine_scaled_reward/mean": -0.16386555135250092, + "rewards/cosine_scaled_reward/std": 0.3242056965827942, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 2000.421875, + "completions/mean_terminated_length": 1033.0, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25796815752983093, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 4522189.0, + "reward": -0.2470606118440628, + "reward_std": 0.15509279072284698, + "rewards/cosine_scaled_reward/mean": -0.2470606118440628, + "rewards/cosine_scaled_reward/std": 0.16412879526615143, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1964.46875, + "completions/mean_terminated_length": 1284.2857666015625, + "completions/min_length": 931.0, + "completions/min_terminated_length": 931.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22452199459075928, + "learning_rate": 7.2e-07, + "loss": 0.0, + "num_tokens": 4658939.0, + "reward": -0.24706938862800598, + "reward_std": 0.18499845266342163, + "rewards/cosine_scaled_reward/mean": -0.24706941843032837, + "rewards/cosine_scaled_reward/std": 0.21092188358306885, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1925.234375, + "completions/mean_terminated_length": 1175.0, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23703666031360626, + "learning_rate": 7.4e-07, + "loss": -0.0, + "num_tokens": 4793866.0, + "reward": -0.11504355818033218, + "reward_std": 0.20660358667373657, + "rewards/cosine_scaled_reward/mean": -0.11504356563091278, + "rewards/cosine_scaled_reward/std": 0.3190351724624634, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 1740.546875, + "completions/mean_terminated_length": 642.5, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23829001188278198, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "num_tokens": 4916045.0, + "reward": -0.12095541507005692, + "reward_std": 0.1958026885986328, + "rewards/cosine_scaled_reward/mean": -0.12095542997121811, + "rewards/cosine_scaled_reward/std": 0.340241402387619, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1713.203125, + "completions/mean_terminated_length": 920.26318359375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24145744740962982, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0, + "num_tokens": 5035762.0, + "reward": -0.10936243832111359, + "reward_std": 0.14468500018119812, + "rewards/cosine_scaled_reward/mean": -0.10936242341995239, + "rewards/cosine_scaled_reward/std": 0.4288744330406189, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1909.71875, + "completions/mean_terminated_length": 1367.2308349609375, + "completions/min_length": 1138.0, + "completions/min_terminated_length": 1138.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22317881882190704, + "learning_rate": 8e-07, + "loss": 0.0, + "num_tokens": 5169136.0, + "reward": -0.2058967649936676, + "reward_std": 0.2325170338153839, + "rewards/cosine_scaled_reward/mean": -0.20589673519134521, + "rewards/cosine_scaled_reward/std": 0.28897321224212646, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 1727.71875, + "completions/mean_terminated_length": 583.857177734375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44688937067985535, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "num_tokens": 5290070.0, + "reward": -0.2254919707775116, + "reward_std": 0.1687203049659729, + "rewards/cosine_scaled_reward/mean": -0.2254919707775116, + "rewards/cosine_scaled_reward/std": 0.18203677237033844, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 1855.328125, + "completions/mean_terminated_length": 814.9000244140625, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430828958749771, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "num_tokens": 5420427.0, + "reward": -0.09104865789413452, + "reward_std": 0.18217626214027405, + "rewards/cosine_scaled_reward/mean": -0.09104865789413452, + "rewards/cosine_scaled_reward/std": 0.3521345257759094, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 1727.9375, + "completions/mean_terminated_length": 767.75, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32065215706825256, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "num_tokens": 5541711.0, + "reward": -0.17701950669288635, + "reward_std": 0.2957555055618286, + "rewards/cosine_scaled_reward/mean": -0.17701953649520874, + "rewards/cosine_scaled_reward/std": 0.38460060954093933, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 2013.9375, + "completions/mean_terminated_length": 1321.3333740234375, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22363637387752533, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "num_tokens": 5682259.0, + "reward": -0.20341511070728302, + "reward_std": 0.23104795813560486, + "rewards/cosine_scaled_reward/mean": -0.20341511070728302, + "rewards/cosine_scaled_reward/std": 0.3092363774776459, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 1909.0, + "completions/mean_terminated_length": 936.0, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26306217908859253, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 5815603.0, + "reward": -0.26145532727241516, + "reward_std": 0.17108051478862762, + "rewards/cosine_scaled_reward/mean": -0.2614552974700928, + "rewards/cosine_scaled_reward/std": 0.18312901258468628, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 1757.1875, + "completions/mean_terminated_length": 884.75, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2856813371181488, + "learning_rate": 9.2e-07, + "loss": 0.0, + "num_tokens": 5938463.0, + "reward": -0.20879247784614563, + "reward_std": 0.23861759901046753, + "rewards/cosine_scaled_reward/mean": -0.20879246294498444, + "rewards/cosine_scaled_reward/std": 0.39607998728752136, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1756.5, + "completions/mean_terminated_length": 1011.5555419921875, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27563413977622986, + "learning_rate": 9.399999999999999e-07, + "loss": -0.0, + "num_tokens": 6061423.0, + "reward": -0.16147920489311218, + "reward_std": 0.24055320024490356, + "rewards/cosine_scaled_reward/mean": -0.16147920489311218, + "rewards/cosine_scaled_reward/std": 0.3948959410190582, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 1538.078125, + "completions/mean_terminated_length": 839.2963256835938, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27617642283439636, + "learning_rate": 9.6e-07, + "loss": -0.0, + "num_tokens": 6169924.0, + "reward": -0.18436825275421143, + "reward_std": 0.27141550183296204, + "rewards/cosine_scaled_reward/mean": -0.18436823785305023, + "rewards/cosine_scaled_reward/std": 0.3920196294784546, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1749.0625, + "completions/mean_terminated_length": 772.5333862304688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23394836485385895, + "learning_rate": 9.8e-07, + "loss": 0.0, + "num_tokens": 6292680.0, + "reward": -0.10770958662033081, + "reward_std": 0.22513547539710999, + "rewards/cosine_scaled_reward/mean": -0.10770957916975021, + "rewards/cosine_scaled_reward/std": 0.421062707901001, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1482.25, + "completions/mean_terminated_length": 841.0667114257812, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3268967568874359, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 6397752.0, + "reward": -0.09745607525110245, + "reward_std": 0.25210899114608765, + "rewards/cosine_scaled_reward/mean": -0.09745605289936066, + "rewards/cosine_scaled_reward/std": 0.3351369798183441, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1743.953125, + "completions/mean_terminated_length": 750.7333984375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2918722927570343, + "learning_rate": 9.999890338174275e-07, + "loss": -0.0, + "num_tokens": 6520717.0, + "reward": -0.1890830397605896, + "reward_std": 0.21916288137435913, + "rewards/cosine_scaled_reward/mean": -0.1890830546617508, + "rewards/cosine_scaled_reward/std": 0.32568052411079407, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1772.421875, + "completions/mean_terminated_length": 1010.5294189453125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24523264169692993, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "num_tokens": 6644984.0, + "reward": -0.20969681441783905, + "reward_std": 0.1810423731803894, + "rewards/cosine_scaled_reward/mean": -0.20969681441783905, + "rewards/cosine_scaled_reward/std": 0.2371566891670227, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1838.859375, + "completions/mean_terminated_length": 1304.388916015625, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23284469544887543, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "num_tokens": 6773815.0, + "reward": -0.06641622632741928, + "reward_std": 0.30815836787223816, + "rewards/cosine_scaled_reward/mean": -0.06641621887683868, + "rewards/cosine_scaled_reward/std": 0.46219584345817566, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1750.125, + "completions/mean_terminated_length": 856.5, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2651103734970093, + "learning_rate": 9.998245517681593e-07, + "loss": -0.0, + "num_tokens": 6896111.0, + "reward": -0.10750342905521393, + "reward_std": 0.2286185324192047, + "rewards/cosine_scaled_reward/mean": -0.10750342160463333, + "rewards/cosine_scaled_reward/std": 0.43372800946235657, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1840.078125, + "completions/mean_terminated_length": 1097.5, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22967560589313507, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 7024836.0, + "reward": -0.10045827925205231, + "reward_std": 0.2548004388809204, + "rewards/cosine_scaled_reward/mean": -0.10045827925205231, + "rewards/cosine_scaled_reward/std": 0.41444358229637146, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 1991.1875, + "completions/mean_terminated_length": 1442.0, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20479348301887512, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "num_tokens": 7163840.0, + "reward": -0.27901512384414673, + "reward_std": 0.2130473554134369, + "rewards/cosine_scaled_reward/mean": -0.27901512384414673, + "rewards/cosine_scaled_reward/std": 0.2583855092525482, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1617.421875, + "completions/mean_terminated_length": 1129.433349609375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2690146267414093, + "learning_rate": 9.994627618036452e-07, + "loss": -0.0, + "num_tokens": 7277451.0, + "reward": -0.04198366403579712, + "reward_std": 0.4036104083061218, + "rewards/cosine_scaled_reward/mean": -0.04198366031050682, + "rewards/cosine_scaled_reward/std": 0.5008736252784729, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1736.09375, + "completions/mean_terminated_length": 997.368408203125, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2184475064277649, + "learning_rate": 9.992983438818915e-07, + "loss": -0.0, + "num_tokens": 7399025.0, + "reward": -0.1564982533454895, + "reward_std": 0.19560785591602325, + "rewards/cosine_scaled_reward/mean": -0.1564982533454895, + "rewards/cosine_scaled_reward/std": 0.3402426540851593, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 1785.40625, + "completions/mean_terminated_length": 847.5714721679688, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23538637161254883, + "learning_rate": 9.991120277927223e-07, + "loss": -0.0, + "num_tokens": 7524179.0, + "reward": -0.2697012424468994, + "reward_std": 0.17935499548912048, + "rewards/cosine_scaled_reward/mean": -0.2697012424468994, + "rewards/cosine_scaled_reward/std": 0.19757980108261108, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1884.484375, + "completions/mean_terminated_length": 1001.5, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.225452721118927, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "num_tokens": 7656306.0, + "reward": -0.1635127067565918, + "reward_std": 0.1931447982788086, + "rewards/cosine_scaled_reward/mean": -0.1635127067565918, + "rewards/cosine_scaled_reward/std": 0.23563610017299652, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1739.46875, + "completions/mean_terminated_length": 1060.7000732421875, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23771661520004272, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "num_tokens": 7777864.0, + "reward": -0.10127441585063934, + "reward_std": 0.2957979142665863, + "rewards/cosine_scaled_reward/mean": -0.10127442330121994, + "rewards/cosine_scaled_reward/std": 0.34053224325180054, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1522.953125, + "completions/mean_terminated_length": 1163.7105712890625, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27804723381996155, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "num_tokens": 7885589.0, + "reward": -0.036153122782707214, + "reward_std": 0.3305097818374634, + "rewards/cosine_scaled_reward/mean": -0.03615312650799751, + "rewards/cosine_scaled_reward/std": 0.4355940818786621, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 1760.390625, + "completions/mean_terminated_length": 1025.388916015625, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2333846092224121, + "learning_rate": 9.981479793771866e-07, + "loss": -0.0, + "num_tokens": 8009206.0, + "reward": -0.14333069324493408, + "reward_std": 0.28757935762405396, + "rewards/cosine_scaled_reward/mean": -0.14333069324493408, + "rewards/cosine_scaled_reward/std": 0.41007620096206665, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1651.515625, + "completions/mean_terminated_length": 638.2777709960938, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26348626613616943, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "num_tokens": 8125607.0, + "reward": -0.2117859125137329, + "reward_std": 0.15534773468971252, + "rewards/cosine_scaled_reward/mean": -0.2117859125137329, + "rewards/cosine_scaled_reward/std": 0.37395453453063965, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 1254.125, + "completions/mean_terminated_length": 596.3428344726562, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33443817496299744, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "num_tokens": 8216103.0, + "reward": 0.028336994349956512, + "reward_std": 0.25119709968566895, + "rewards/cosine_scaled_reward/mean": 0.02833697199821472, + "rewards/cosine_scaled_reward/std": 0.4882389008998871, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 1966.21875, + "completions/mean_terminated_length": 1175.666748046875, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2199370563030243, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0, + "num_tokens": 8352677.0, + "reward": -0.28747493028640747, + "reward_std": 0.15530282258987427, + "rewards/cosine_scaled_reward/mean": -0.28747493028640747, + "rewards/cosine_scaled_reward/std": 0.16220521926879883, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1357.109375, + "completions/mean_terminated_length": 747.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3341590464115143, + "learning_rate": 9.968344786479415e-07, + "loss": -0.0, + "num_tokens": 8448788.0, + "reward": -0.06672946363687515, + "reward_std": 0.28790342807769775, + "rewards/cosine_scaled_reward/mean": -0.06672945618629456, + "rewards/cosine_scaled_reward/std": 0.35960128903388977, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 944.107177734375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35159721970558167, + "learning_rate": 9.964516155915151e-07, + "loss": -0.0, + "num_tokens": 8559295.0, + "reward": -0.27992868423461914, + "reward_std": 0.20264248549938202, + "rewards/cosine_scaled_reward/mean": -0.27992868423461914, + "rewards/cosine_scaled_reward/std": 0.23891927301883698, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 1867.765625, + "completions/mean_terminated_length": 606.125, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23989427089691162, + "learning_rate": 9.960469931131936e-07, + "loss": -0.0, + "num_tokens": 8690288.0, + "reward": -0.2498025894165039, + "reward_std": 0.15823513269424438, + "rewards/cosine_scaled_reward/mean": -0.2498025894165039, + "rewards/cosine_scaled_reward/std": 0.17978127300739288, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 1669.125, + "completions/mean_terminated_length": 945.8182373046875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.335510790348053, + "learning_rate": 9.956206309337066e-07, + "loss": -0.0, + "num_tokens": 8807832.0, + "reward": -0.1673138290643692, + "reward_std": 0.2547321915626526, + "rewards/cosine_scaled_reward/mean": -0.1673138290643692, + "rewards/cosine_scaled_reward/std": 0.39353805780410767, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1632.59375, + "completions/mean_terminated_length": 892.0869750976562, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30721575021743774, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "num_tokens": 8922670.0, + "reward": -0.1493685096502304, + "reward_std": 0.23021411895751953, + "rewards/cosine_scaled_reward/mean": -0.1493685096502304, + "rewards/cosine_scaled_reward/std": 0.27729952335357666, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 2020.59375, + "completions/mean_terminated_length": 1463.3333740234375, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20856839418411255, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "num_tokens": 9062716.0, + "reward": -0.25696587562561035, + "reward_std": 0.19847074151039124, + "rewards/cosine_scaled_reward/mean": -0.25696590542793274, + "rewards/cosine_scaled_reward/std": 0.23918035626411438, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1926.984375, + "completions/mean_terminated_length": 1273.5, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23241353034973145, + "learning_rate": 9.942113192828444e-07, + "loss": -0.0, + "num_tokens": 9195971.0, + "reward": -0.12904082238674164, + "reward_std": 0.23554545640945435, + "rewards/cosine_scaled_reward/mean": -0.12904080748558044, + "rewards/cosine_scaled_reward/std": 0.4280695915222168, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 1868.890625, + "completions/mean_terminated_length": 1092.75, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.19846303761005402, + "learning_rate": 9.93698216681727e-07, + "loss": -0.0, + "num_tokens": 9326540.0, + "reward": -0.03926669806241989, + "reward_std": 0.2044709324836731, + "rewards/cosine_scaled_reward/mean": -0.039266690611839294, + "rewards/cosine_scaled_reward/std": 0.49658530950546265, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1805.296875, + "completions/mean_terminated_length": 1077.1875, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23998627066612244, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 9452479.0, + "reward": -0.23065510392189026, + "reward_std": 0.17413878440856934, + "rewards/cosine_scaled_reward/mean": -0.23065511882305145, + "rewards/cosine_scaled_reward/std": 0.21896763145923615, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 1857.328125, + "completions/mean_terminated_length": 1285.3125, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20421437919139862, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "num_tokens": 9582924.0, + "reward": -0.17972718179225922, + "reward_std": 0.209285706281662, + "rewards/cosine_scaled_reward/mean": -0.17972716689109802, + "rewards/cosine_scaled_reward/std": 0.2716500163078308, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1883.921875, + "completions/mean_terminated_length": 1093.3636474609375, + "completions/min_length": 712.0, + "completions/min_terminated_length": 712.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2156875878572464, + "learning_rate": 9.9202926282791e-07, + "loss": -0.0, + "num_tokens": 9714215.0, + "reward": -0.14897406101226807, + "reward_std": 0.2451157122850418, + "rewards/cosine_scaled_reward/mean": -0.14897406101226807, + "rewards/cosine_scaled_reward/std": 0.38884180784225464, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 1507.65625, + "completions/mean_terminated_length": 767.1851806640625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29943305253982544, + "learning_rate": 9.91429819907136e-07, + "loss": -0.0, + "num_tokens": 9820801.0, + "reward": -0.17114077508449554, + "reward_std": 0.23199111223220825, + "rewards/cosine_scaled_reward/mean": -0.17114077508449554, + "rewards/cosine_scaled_reward/std": 0.3217289447784424, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1976.125, + "completions/mean_terminated_length": 1536.888916015625, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26230743527412415, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "num_tokens": 9957665.0, + "reward": -0.21115826070308685, + "reward_std": 0.2435196340084076, + "rewards/cosine_scaled_reward/mean": -0.21115827560424805, + "rewards/cosine_scaled_reward/std": 0.28258123993873596, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1779.28125, + "completions/mean_terminated_length": 901.4667358398438, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33359771966934204, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 10082811.0, + "reward": -0.1508273482322693, + "reward_std": 0.2594776749610901, + "rewards/cosine_scaled_reward/mean": -0.1508273482322693, + "rewards/cosine_scaled_reward/std": 0.33812451362609863, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 1711.609375, + "completions/mean_terminated_length": 851.9444580078125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2805767059326172, + "learning_rate": 9.895025252503755e-07, + "loss": -0.0, + "num_tokens": 10202682.0, + "reward": -0.11850972473621368, + "reward_std": 0.2631937861442566, + "rewards/cosine_scaled_reward/mean": -0.11850972473621368, + "rewards/cosine_scaled_reward/std": 0.4419197142124176, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1749.984375, + "completions/mean_terminated_length": 1044.157958984375, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3109220266342163, + "learning_rate": 9.888172094375033e-07, + "loss": -0.0, + "num_tokens": 10325769.0, + "reward": -0.10190614312887192, + "reward_std": 0.2739119529724121, + "rewards/cosine_scaled_reward/mean": -0.10190614312887192, + "rewards/cosine_scaled_reward/std": 0.39238420128822327, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1800.390625, + "completions/mean_terminated_length": 829.0000610351562, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23385629057884216, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "num_tokens": 10451690.0, + "reward": -0.21778321266174316, + "reward_std": 0.25428956747055054, + "rewards/cosine_scaled_reward/mean": -0.21778322756290436, + "rewards/cosine_scaled_reward/std": 0.30295974016189575, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 1870.46875, + "completions/mean_terminated_length": 1337.875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21526271104812622, + "learning_rate": 9.873824502603459e-07, + "loss": -0.0, + "num_tokens": 10581720.0, + "reward": -0.19906702637672424, + "reward_std": 0.23402772843837738, + "rewards/cosine_scaled_reward/mean": -0.19906699657440186, + "rewards/cosine_scaled_reward/std": 0.28999006748199463, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 1734.875, + "completions/mean_terminated_length": 795.5, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24285966157913208, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "num_tokens": 10703608.0, + "reward": -0.16528445482254028, + "reward_std": 0.2592755854129791, + "rewards/cosine_scaled_reward/mean": -0.16528445482254028, + "rewards/cosine_scaled_reward/std": 0.37110546231269836, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1577.921875, + "completions/mean_terminated_length": 973.5357666015625, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30273520946502686, + "learning_rate": 9.85862422507884e-07, + "loss": -0.0, + "num_tokens": 10814715.0, + "reward": -0.20241931080818176, + "reward_std": 0.2693288326263428, + "rewards/cosine_scaled_reward/mean": -0.20241928100585938, + "rewards/cosine_scaled_reward/std": 0.33345305919647217, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1680.546875, + "completions/mean_terminated_length": 1068.125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2649252116680145, + "learning_rate": 9.850705248720068e-07, + "loss": -0.0, + "num_tokens": 10932782.0, + "reward": -0.018871163949370384, + "reward_std": 0.3073042631149292, + "rewards/cosine_scaled_reward/mean": -0.018871165812015533, + "rewards/cosine_scaled_reward/std": 0.3826298415660858, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1683.703125, + "completions/mean_terminated_length": 1151.269287109375, + "completions/min_length": 667.0, + "completions/min_terminated_length": 667.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24950510263442993, + "learning_rate": 9.8425742251254e-07, + "loss": -0.0, + "num_tokens": 11051539.0, + "reward": -0.11818082630634308, + "reward_std": 0.2949528694152832, + "rewards/cosine_scaled_reward/mean": -0.11818082630634308, + "rewards/cosine_scaled_reward/std": 0.34418320655822754, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1558.546875, + "completions/mean_terminated_length": 967.8275756835938, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36593058705329895, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "num_tokens": 11161286.0, + "reward": -0.26082760095596313, + "reward_std": 0.1802712082862854, + "rewards/cosine_scaled_reward/mean": -0.26082760095596313, + "rewards/cosine_scaled_reward/std": 0.2037661075592041, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1505.0, + "completions/mean_length": 1827.9375, + "completions/mean_terminated_length": 1109.0667724609375, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24167831242084503, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "num_tokens": 11288842.0, + "reward": -0.11456942558288574, + "reward_std": 0.26296502351760864, + "rewards/cosine_scaled_reward/mean": -0.11456942558288574, + "rewards/cosine_scaled_reward/std": 0.3274599611759186, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1581.546875, + "completions/mean_terminated_length": 899.8077392578125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2570616602897644, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "num_tokens": 11400053.0, + "reward": -0.17942462861537933, + "reward_std": 0.2633644640445709, + "rewards/cosine_scaled_reward/mean": -0.17942462861537933, + "rewards/cosine_scaled_reward/std": 0.30215632915496826, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 2022.328125, + "completions/mean_terminated_length": 1226.5, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25331902503967285, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0, + "num_tokens": 11540826.0, + "reward": -0.26418450474739075, + "reward_std": 0.1380012035369873, + "rewards/cosine_scaled_reward/mean": -0.26418450474739075, + "rewards/cosine_scaled_reward/std": 0.17390060424804688, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 1769.546875, + "completions/mean_terminated_length": 934.1875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29503753781318665, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0, + "num_tokens": 11663845.0, + "reward": -0.08299511671066284, + "reward_std": 0.18226617574691772, + "rewards/cosine_scaled_reward/mean": -0.08299513161182404, + "rewards/cosine_scaled_reward/std": 0.46436113119125366, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 2021.5, + "completions/mean_terminated_length": 1200.0, + "completions/min_length": 1100.0, + "completions/min_terminated_length": 1100.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20416001975536346, + "learning_rate": 9.78935800506826e-07, + "loss": -0.0, + "num_tokens": 11803749.0, + "reward": -0.22345861792564392, + "reward_std": 0.18781372904777527, + "rewards/cosine_scaled_reward/mean": -0.22345861792564392, + "rewards/cosine_scaled_reward/std": 0.24531956017017365, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 1582.890625, + "completions/mean_terminated_length": 903.1154174804688, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2593792974948883, + "learning_rate": 9.779754323328192e-07, + "loss": -0.0, + "num_tokens": 11916190.0, + "reward": 0.00020215287804603577, + "reward_std": 0.24673128128051758, + "rewards/cosine_scaled_reward/mean": 0.00020216405391693115, + "rewards/cosine_scaled_reward/std": 0.49432000517845154, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1748.859375, + "completions/mean_terminated_length": 1177.772705078125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2480001151561737, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0, + "num_tokens": 12038381.0, + "reward": -0.19425566494464874, + "reward_std": 0.21240204572677612, + "rewards/cosine_scaled_reward/mean": -0.19425567984580994, + "rewards/cosine_scaled_reward/std": 0.29181501269340515, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1632.171875, + "completions/mean_terminated_length": 1062.3333740234375, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2797771692276001, + "learning_rate": 9.759921670520634e-07, + "loss": -0.0, + "num_tokens": 12153904.0, + "reward": -0.11104464530944824, + "reward_std": 0.2755987048149109, + "rewards/cosine_scaled_reward/mean": -0.11104465276002884, + "rewards/cosine_scaled_reward/std": 0.4012855887413025, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 1651.078125, + "completions/mean_terminated_length": 553.7058715820312, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3114299476146698, + "learning_rate": 9.749693666068663e-07, + "loss": -0.0, + "num_tokens": 12270741.0, + "reward": -0.1317199319601059, + "reward_std": 0.14237020909786224, + "rewards/cosine_scaled_reward/mean": -0.1317199319601059, + "rewards/cosine_scaled_reward/std": 0.3707720935344696, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1544.765625, + "completions/mean_terminated_length": 937.413818359375, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2654109001159668, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "num_tokens": 12379318.0, + "reward": -0.018167953938245773, + "reward_std": 0.29768484830856323, + "rewards/cosine_scaled_reward/mean": -0.01816795952618122, + "rewards/cosine_scaled_reward/std": 0.44200995564460754, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1647.421875, + "completions/mean_terminated_length": 979.7916870117188, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2877754867076874, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "num_tokens": 12496185.0, + "reward": -0.10053972899913788, + "reward_std": 0.28722673654556274, + "rewards/cosine_scaled_reward/mean": -0.10053973644971848, + "rewards/cosine_scaled_reward/std": 0.36782190203666687, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1371.484375, + "completions/mean_terminated_length": 937.8204956054688, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30472108721733093, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "num_tokens": 12594112.0, + "reward": -0.20305150747299194, + "reward_std": 0.23292692005634308, + "rewards/cosine_scaled_reward/mean": -0.20305150747299194, + "rewards/cosine_scaled_reward/std": 0.3213489055633545, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 1737.984375, + "completions/mean_terminated_length": 807.9375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27034303545951843, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0, + "num_tokens": 12715695.0, + "reward": -0.29003486037254333, + "reward_std": 0.21371816098690033, + "rewards/cosine_scaled_reward/mean": -0.29003486037254333, + "rewards/cosine_scaled_reward/std": 0.224824920296669, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 1705.28125, + "completions/mean_terminated_length": 893.5789794921875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27687934041023254, + "learning_rate": 9.695457105469804e-07, + "loss": -0.0, + "num_tokens": 12835297.0, + "reward": -0.15606051683425903, + "reward_std": 0.18938840925693512, + "rewards/cosine_scaled_reward/mean": -0.15606051683425903, + "rewards/cosine_scaled_reward/std": 0.24088984727859497, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 1401.015625, + "completions/mean_terminated_length": 830.1470336914062, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2770017087459564, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0, + "num_tokens": 12936250.0, + "reward": 0.018527541309595108, + "reward_std": 0.36475759744644165, + "rewards/cosine_scaled_reward/mean": 0.018527545034885406, + "rewards/cosine_scaled_reward/std": 0.4995051920413971, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 1438.453125, + "completions/mean_terminated_length": 789.5806274414062, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26982930302619934, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "num_tokens": 13039143.0, + "reward": 0.07083749771118164, + "reward_std": 0.29650557041168213, + "rewards/cosine_scaled_reward/mean": 0.07083749771118164, + "rewards/cosine_scaled_reward/std": 0.5094331502914429, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1722.0, + "completions/mean_length": 1787.09375, + "completions/mean_terminated_length": 1065.7647705078125, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26255276799201965, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0, + "num_tokens": 13164669.0, + "reward": -0.2222379446029663, + "reward_std": 0.240003302693367, + "rewards/cosine_scaled_reward/mean": -0.2222379446029663, + "rewards/cosine_scaled_reward/std": 0.29153531789779663, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1660.96875, + "completions/mean_terminated_length": 1095.3077392578125, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30773183703422546, + "learning_rate": 9.648384182148252e-07, + "loss": -0.0, + "num_tokens": 13281331.0, + "reward": -0.21352165937423706, + "reward_std": 0.3123124837875366, + "rewards/cosine_scaled_reward/mean": -0.21352165937423706, + "rewards/cosine_scaled_reward/std": 0.3453315496444702, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1858.921875, + "completions/mean_terminated_length": 1117.1539306640625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24000757932662964, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0, + "num_tokens": 13411550.0, + "reward": -0.13601753115653992, + "reward_std": 0.1500597596168518, + "rewards/cosine_scaled_reward/mean": -0.1360175609588623, + "rewards/cosine_scaled_reward/std": 0.42859947681427, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1563.90625, + "completions/mean_terminated_length": 900.5185546875, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31709614396095276, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "num_tokens": 13522416.0, + "reward": -0.28067731857299805, + "reward_std": 0.1671288013458252, + "rewards/cosine_scaled_reward/mean": -0.28067731857299805, + "rewards/cosine_scaled_reward/std": 0.21458736062049866, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 1724.71875, + "completions/mean_terminated_length": 1013.5, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2646999657154083, + "learning_rate": 9.610954559391704e-07, + "loss": -0.0, + "num_tokens": 13642918.0, + "reward": -0.11896095424890518, + "reward_std": 0.28121650218963623, + "rewards/cosine_scaled_reward/mean": -0.11896096169948578, + "rewards/cosine_scaled_reward/std": 0.37855637073516846, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1918.0, + "completions/mean_terminated_length": 1216.0, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22158586978912354, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0, + "num_tokens": 13776766.0, + "reward": -0.1688530147075653, + "reward_std": 0.2535978853702545, + "rewards/cosine_scaled_reward/mean": -0.1688530296087265, + "rewards/cosine_scaled_reward/std": 0.3341792821884155, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 1556.125, + "completions/mean_terminated_length": 837.2307739257812, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2930087745189667, + "learning_rate": 9.58499865339809e-07, + "loss": -0.0, + "num_tokens": 13886654.0, + "reward": -0.10367631912231445, + "reward_std": 0.30835023522377014, + "rewards/cosine_scaled_reward/mean": -0.10367631912231445, + "rewards/cosine_scaled_reward/std": 0.42973947525024414, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1521.9375, + "completions/mean_terminated_length": 753.0769653320312, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3062143921852112, + "learning_rate": 9.571721736097088e-07, + "loss": -0.0, + "num_tokens": 13993906.0, + "reward": -0.22209212183952332, + "reward_std": 0.2074735462665558, + "rewards/cosine_scaled_reward/mean": -0.22209212183952332, + "rewards/cosine_scaled_reward/std": 0.29088398814201355, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1714.578125, + "completions/mean_terminated_length": 1031.857177734375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2564532160758972, + "learning_rate": 9.55824636882301e-07, + "loss": -0.0, + "num_tokens": 14114855.0, + "reward": -0.10947269201278687, + "reward_std": 0.30371129512786865, + "rewards/cosine_scaled_reward/mean": -0.10947269946336746, + "rewards/cosine_scaled_reward/std": 0.41030505299568176, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 1877.90625, + "completions/mean_terminated_length": 492.857177734375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25748053193092346, + "learning_rate": 9.54457320834625e-07, + "loss": -0.0, + "num_tokens": 14246425.0, + "reward": -0.19163870811462402, + "reward_std": 0.21010378003120422, + "rewards/cosine_scaled_reward/mean": -0.19163869321346283, + "rewards/cosine_scaled_reward/std": 0.3049132525920868, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 1922.546875, + "completions/mean_terminated_length": 1155.888916015625, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24102462828159332, + "learning_rate": 9.530702921077358e-07, + "loss": -0.0, + "num_tokens": 14380492.0, + "reward": -0.21347489953041077, + "reward_std": 0.19724325835704803, + "rewards/cosine_scaled_reward/mean": -0.21347489953041077, + "rewards/cosine_scaled_reward/std": 0.2647304832935333, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1793.546875, + "completions/mean_terminated_length": 1233.75, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2439616322517395, + "learning_rate": 9.516636183034564e-07, + "loss": -0.0, + "num_tokens": 14505815.0, + "reward": -0.08845303952693939, + "reward_std": 0.30429399013519287, + "rewards/cosine_scaled_reward/mean": -0.08845303952693939, + "rewards/cosine_scaled_reward/std": 0.4648522734642029, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1405.15625, + "completions/mean_terminated_length": 936.0540771484375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32119110226631165, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0, + "num_tokens": 14606153.0, + "reward": -0.04571840912103653, + "reward_std": 0.3056246340274811, + "rewards/cosine_scaled_reward/mean": -0.04571840912103653, + "rewards/cosine_scaled_reward/std": 0.49307262897491455, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1321.40625, + "completions/mean_terminated_length": 940.8095703125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3139563500881195, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0, + "num_tokens": 14701179.0, + "reward": -0.12771092355251312, + "reward_std": 0.3157998323440552, + "rewards/cosine_scaled_reward/mean": -0.12771093845367432, + "rewards/cosine_scaled_reward/std": 0.4336044490337372, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1024.5, + "completions/mean_terminated_length": 812.0755004882812, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3428559899330139, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 14776443.0, + "reward": -0.004689367488026619, + "reward_std": 0.297618567943573, + "rewards/cosine_scaled_reward/mean": -0.004689373075962067, + "rewards/cosine_scaled_reward/std": 0.46961408853530884, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1790.765625, + "completions/mean_terminated_length": 1133.388916015625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.13942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29122933745384216, + "learning_rate": 9.458418577899774e-07, + "loss": -0.0, + "num_tokens": 14902612.0, + "reward": -0.11110783368349075, + "reward_std": 0.22664329409599304, + "rewards/cosine_scaled_reward/mean": -0.11110783368349075, + "rewards/cosine_scaled_reward/std": 0.3362382650375366, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1658.46875, + "completions/mean_terminated_length": 1124.6666259765625, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2646903693675995, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0, + "num_tokens": 15018986.0, + "reward": -0.20784568786621094, + "reward_std": 0.270358681678772, + "rewards/cosine_scaled_reward/mean": -0.20784570276737213, + "rewards/cosine_scaled_reward/std": 0.35689592361450195, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 1494.9375, + "completions/mean_terminated_length": 868.1333618164062, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26702970266342163, + "learning_rate": 9.428149347714143e-07, + "loss": -0.0, + "num_tokens": 15125614.0, + "reward": -0.160624697804451, + "reward_std": 0.23646026849746704, + "rewards/cosine_scaled_reward/mean": -0.160624697804451, + "rewards/cosine_scaled_reward/std": 0.4083607792854309, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 1608.609375, + "completions/mean_terminated_length": 825.3478393554688, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2753336727619171, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0, + "num_tokens": 15239493.0, + "reward": -0.008194006979465485, + "reward_std": 0.21567228436470032, + "rewards/cosine_scaled_reward/mean": -0.008194014430046082, + "rewards/cosine_scaled_reward/std": 0.463446706533432, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1759.484375, + "completions/mean_terminated_length": 1076.157958984375, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24985821545124054, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 15363396.0, + "reward": -0.16068750619888306, + "reward_std": 0.22599664330482483, + "rewards/cosine_scaled_reward/mean": -0.16068752110004425, + "rewards/cosine_scaled_reward/std": 0.304392009973526, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 1842.859375, + "completions/mean_terminated_length": 1110.21435546875, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21972927451133728, + "learning_rate": 9.381311511432658e-07, + "loss": -0.0, + "num_tokens": 15492435.0, + "reward": -0.29198482632637024, + "reward_std": 0.17300401628017426, + "rewards/cosine_scaled_reward/mean": -0.29198482632637024, + "rewards/cosine_scaled_reward/std": 0.21628034114837646, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1694.578125, + "completions/mean_terminated_length": 1064.565185546875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24753950536251068, + "learning_rate": 9.36531953618799e-07, + "loss": -0.0, + "num_tokens": 15611240.0, + "reward": 0.04859344661235809, + "reward_std": 0.31105202436447144, + "rewards/cosine_scaled_reward/mean": 0.04859344661235809, + "rewards/cosine_scaled_reward/std": 0.4569285809993744, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 1917.5625, + "completions/mean_terminated_length": 1004.5, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23949742317199707, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0, + "num_tokens": 15744668.0, + "reward": -0.27834638953208923, + "reward_std": 0.16836056113243103, + "rewards/cosine_scaled_reward/mean": -0.27834638953208923, + "rewards/cosine_scaled_reward/std": 0.20021934807300568, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1502.0, + "completions/mean_length": 1725.796875, + "completions/mean_terminated_length": 902.388916015625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23063035309314728, + "learning_rate": 9.332771203643714e-07, + "loss": -0.0, + "num_tokens": 15865623.0, + "reward": -0.19732065498828888, + "reward_std": 0.19462591409683228, + "rewards/cosine_scaled_reward/mean": -0.19732065498828888, + "rewards/cosine_scaled_reward/std": 0.2627345323562622, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1640.8125, + "completions/mean_terminated_length": 863.45458984375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29630133509635925, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0, + "num_tokens": 15980371.0, + "reward": -0.07768938690423965, + "reward_std": 0.2543257176876068, + "rewards/cosine_scaled_reward/mean": -0.07768939435482025, + "rewards/cosine_scaled_reward/std": 0.4248148798942566, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 1570.890625, + "completions/mean_terminated_length": 826.5999755859375, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2735442817211151, + "learning_rate": 9.299475664759068e-07, + "loss": -0.0, + "num_tokens": 16091972.0, + "reward": -0.1057564914226532, + "reward_std": 0.32137495279312134, + "rewards/cosine_scaled_reward/mean": -0.105756476521492, + "rewards/cosine_scaled_reward/std": 0.4788062870502472, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1971.34375, + "completions/mean_terminated_length": 1347.1429443359375, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23230750858783722, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0, + "num_tokens": 16229362.0, + "reward": -0.21333375573158264, + "reward_std": 0.1880394071340561, + "rewards/cosine_scaled_reward/mean": -0.21333375573158264, + "rewards/cosine_scaled_reward/std": 0.2557979226112366, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1682.984375, + "completions/mean_terminated_length": 1113.5599365234375, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2776358127593994, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0, + "num_tokens": 16347641.0, + "reward": -0.07218431681394577, + "reward_std": 0.19744814932346344, + "rewards/cosine_scaled_reward/mean": -0.07218432426452637, + "rewards/cosine_scaled_reward/std": 0.41042155027389526, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 1269.171875, + "completions/mean_terminated_length": 736.2894897460938, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30510956048965454, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0, + "num_tokens": 16439340.0, + "reward": 0.1377476304769516, + "reward_std": 0.25976449251174927, + "rewards/cosine_scaled_reward/mean": 0.1377476155757904, + "rewards/cosine_scaled_reward/std": 0.4923737347126007, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 1624.203125, + "completions/mean_terminated_length": 917.875, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25474753975868225, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0, + "num_tokens": 16553961.0, + "reward": -0.04156734049320221, + "reward_std": 0.27987948060035706, + "rewards/cosine_scaled_reward/mean": -0.04156734049320221, + "rewards/cosine_scaled_reward/std": 0.4557124078273773, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1832.625, + "completions/mean_terminated_length": 1063.4285888671875, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2200661152601242, + "learning_rate": 9.213010742252327e-07, + "loss": -0.0, + "num_tokens": 16681857.0, + "reward": -0.2795522212982178, + "reward_std": 0.16735097765922546, + "rewards/cosine_scaled_reward/mean": -0.2795522212982178, + "rewards/cosine_scaled_reward/std": 0.22360830008983612, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 1547.90625, + "completions/mean_terminated_length": 981.1333618164062, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.247065007686615, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0, + "num_tokens": 16792235.0, + "reward": -0.1421782374382019, + "reward_std": 0.25017279386520386, + "rewards/cosine_scaled_reward/mean": -0.1421782374382019, + "rewards/cosine_scaled_reward/std": 0.3903765082359314, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1898.375, + "completions/mean_terminated_length": 1177.45458984375, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25471416115760803, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0, + "num_tokens": 16924371.0, + "reward": -0.24234679341316223, + "reward_std": 0.15713179111480713, + "rewards/cosine_scaled_reward/mean": -0.24234679341316223, + "rewards/cosine_scaled_reward/std": 0.17467617988586426, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1577.625, + "completions/mean_terminated_length": 1044.533447265625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2628695070743561, + "learning_rate": 9.158953424711624e-07, + "loss": -0.0, + "num_tokens": 17035563.0, + "reward": -0.12413343787193298, + "reward_std": 0.20063763856887817, + "rewards/cosine_scaled_reward/mean": -0.12413343787193298, + "rewards/cosine_scaled_reward/std": 0.5006609559059143, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1405.125, + "completions/mean_terminated_length": 993.025634765625, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2682877779006958, + "learning_rate": 9.140576474687263e-07, + "loss": -0.0, + "num_tokens": 17136051.0, + "reward": -0.02423717826604843, + "reward_std": 0.2661462128162384, + "rewards/cosine_scaled_reward/mean": -0.02423717826604843, + "rewards/cosine_scaled_reward/std": 0.502265214920044, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1763.515625, + "completions/mean_terminated_length": 1347.7308349609375, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24265550076961517, + "learning_rate": 9.122022088101613e-07, + "loss": -0.0, + "num_tokens": 17259420.0, + "reward": -0.23560766875743866, + "reward_std": 0.22989924252033234, + "rewards/cosine_scaled_reward/mean": -0.23560766875743866, + "rewards/cosine_scaled_reward/std": 0.28772976994514465, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 1600.671875, + "completions/mean_terminated_length": 1153.34375, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30536147952079773, + "learning_rate": 9.103291169269299e-07, + "loss": -0.0, + "num_tokens": 17372679.0, + "reward": -0.23412726819515228, + "reward_std": 0.226594477891922, + "rewards/cosine_scaled_reward/mean": -0.2341272532939911, + "rewards/cosine_scaled_reward/std": 0.2685011625289917, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 1708.0625, + "completions/mean_terminated_length": 1012.0, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2859592139720917, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0, + "num_tokens": 17493483.0, + "reward": -0.11928378790616989, + "reward_std": 0.2819562554359436, + "rewards/cosine_scaled_reward/mean": -0.11928380280733109, + "rewards/cosine_scaled_reward/std": 0.41741910576820374, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 1277.78125, + "completions/mean_terminated_length": 845.707275390625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.310493141412735, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0, + "num_tokens": 17585205.0, + "reward": 0.009949762374162674, + "reward_std": 0.32572609186172485, + "rewards/cosine_scaled_reward/mean": 0.009949766099452972, + "rewards/cosine_scaled_reward/std": 0.5299619436264038, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1550.625, + "completions/mean_terminated_length": 986.9334106445312, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2404046207666397, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 17695061.0, + "reward": -0.17625686526298523, + "reward_std": 0.2529022991657257, + "rewards/cosine_scaled_reward/mean": -0.17625686526298523, + "rewards/cosine_scaled_reward/std": 0.3359045386314392, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 1741.703125, + "completions/mean_terminated_length": 1156.95458984375, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2498754858970642, + "learning_rate": 9.026620557966279e-07, + "loss": -0.0, + "num_tokens": 17817314.0, + "reward": -0.26471418142318726, + "reward_std": 0.2048022449016571, + "rewards/cosine_scaled_reward/mean": -0.26471418142318726, + "rewards/cosine_scaled_reward/std": 0.2656060457229614, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1386.0625, + "completions/mean_terminated_length": 903.0270385742188, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2538217306137085, + "learning_rate": 9.007020842191634e-07, + "loss": -0.0, + "num_tokens": 17917206.0, + "reward": -0.10874275863170624, + "reward_std": 0.24236595630645752, + "rewards/cosine_scaled_reward/mean": -0.10874275863170624, + "rewards/cosine_scaled_reward/std": 0.3927372395992279, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1735.96875, + "completions/mean_terminated_length": 1140.272705078125, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23916038870811462, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0, + "num_tokens": 18040204.0, + "reward": -0.20906513929367065, + "reward_std": 0.2755752205848694, + "rewards/cosine_scaled_reward/mean": -0.20906512439250946, + "rewards/cosine_scaled_reward/std": 0.38517922163009644, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1379.359375, + "completions/mean_terminated_length": 978.1749877929688, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30970829725265503, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0, + "num_tokens": 18138987.0, + "reward": -0.14114701747894287, + "reward_std": 0.3519541621208191, + "rewards/cosine_scaled_reward/mean": -0.14114701747894287, + "rewards/cosine_scaled_reward/std": 0.39396560192108154, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1497.328125, + "completions/mean_terminated_length": 1011.441162109375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.17257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2874428331851959, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0, + "num_tokens": 18245496.0, + "reward": -0.04711150377988815, + "reward_std": 0.33344799280166626, + "rewards/cosine_scaled_reward/mean": -0.04711151123046875, + "rewards/cosine_scaled_reward/std": 0.41477611660957336, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 1777.140625, + "completions/mean_terminated_length": 964.5625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.1737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28282323479652405, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0, + "num_tokens": 18369897.0, + "reward": -0.2543114423751831, + "reward_std": 0.18715068697929382, + "rewards/cosine_scaled_reward/mean": -0.2543114423751831, + "rewards/cosine_scaled_reward/std": 0.19382856786251068, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 1564.0625, + "completions/mean_terminated_length": 900.888916015625, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.17485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27684876322746277, + "learning_rate": 8.906477750432903e-07, + "loss": -0.0, + "num_tokens": 18481141.0, + "reward": -0.1415693461894989, + "reward_std": 0.23039600253105164, + "rewards/cosine_scaled_reward/mean": -0.1415693461894989, + "rewards/cosine_scaled_reward/std": 0.2940608859062195, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1829.328125, + "completions/mean_terminated_length": 1224.7647705078125, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24094167351722717, + "learning_rate": 8.88586709003076e-07, + "loss": -0.0, + "num_tokens": 18609282.0, + "reward": -0.2521882653236389, + "reward_std": 0.20982292294502258, + "rewards/cosine_scaled_reward/mean": -0.2521882653236389, + "rewards/cosine_scaled_reward/std": 0.23373161256313324, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 1517.765625, + "completions/mean_terminated_length": 916.8333740234375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.17714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2674770653247833, + "learning_rate": 8.865091407243394e-07, + "loss": -0.0, + "num_tokens": 18717043.0, + "reward": -0.028832588344812393, + "reward_std": 0.22500035166740417, + "rewards/cosine_scaled_reward/mean": -0.028832584619522095, + "rewards/cosine_scaled_reward/std": 0.4698766767978668, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 1731.453125, + "completions/mean_terminated_length": 781.8125, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.1782857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23764768242835999, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0, + "num_tokens": 18837960.0, + "reward": -0.10049945116043091, + "reward_std": 0.2521243393421173, + "rewards/cosine_scaled_reward/mean": -0.10049945116043091, + "rewards/cosine_scaled_reward/std": 0.4200229048728943, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1531.1875, + "completions/mean_terminated_length": 1014.375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.17942857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28145694732666016, + "learning_rate": 8.823049032816478e-07, + "loss": -0.0, + "num_tokens": 18945916.0, + "reward": -0.22566190361976624, + "reward_std": 0.19013158977031708, + "rewards/cosine_scaled_reward/mean": -0.22566190361976624, + "rewards/cosine_scaled_reward/std": 0.24779614806175232, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 1425.203125, + "completions/mean_terminated_length": 909.1714477539062, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.18057142857142858, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.24377204477787018, + "learning_rate": 8.801784390262943e-07, + "loss": -0.0, + "num_tokens": 19047249.0, + "reward": -0.021197691559791565, + "reward_std": 0.22868266701698303, + "rewards/cosine_scaled_reward/mean": -0.021197684109210968, + "rewards/cosine_scaled_reward/std": 0.46860653162002563, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 1600.4375, + "completions/mean_terminated_length": 1093.2000732421875, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.18171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2479163259267807, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0, + "num_tokens": 19161357.0, + "reward": -0.23690757155418396, + "reward_std": 0.20615912973880768, + "rewards/cosine_scaled_reward/mean": -0.23690758645534515, + "rewards/cosine_scaled_reward/std": 0.32988741993904114, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 1527.375, + "completions/mean_terminated_length": 937.3333740234375, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.18285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2751549184322357, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0, + "num_tokens": 19270693.0, + "reward": -0.12134292721748352, + "reward_std": 0.2621082067489624, + "rewards/cosine_scaled_reward/mean": -0.12134292721748352, + "rewards/cosine_scaled_reward/std": 0.4263574779033661, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1419.484375, + "completions/mean_terminated_length": 989.4473876953125, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2905498445034027, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 19371532.0, + "reward": -0.1314084678888321, + "reward_std": 0.25361165404319763, + "rewards/cosine_scaled_reward/mean": -0.1314084678888321, + "rewards/cosine_scaled_reward/std": 0.36607682704925537, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1584.5, + "completions/mean_terminated_length": 949.3333129882812, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.18514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3278505206108093, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0, + "num_tokens": 19483244.0, + "reward": -0.16470149159431458, + "reward_std": 0.26964259147644043, + "rewards/cosine_scaled_reward/mean": -0.16470149159431458, + "rewards/cosine_scaled_reward/std": 0.31499552726745605, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 1439.6875, + "completions/mean_terminated_length": 868.242431640625, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.18628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29175421595573425, + "learning_rate": 8.693068314414344e-07, + "loss": -0.0, + "num_tokens": 19586568.0, + "reward": 0.10278680920600891, + "reward_std": 0.271634042263031, + "rewards/cosine_scaled_reward/mean": 0.10278680920600891, + "rewards/cosine_scaled_reward/std": 0.4813632071018219, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 1406.703125, + "completions/mean_terminated_length": 995.6154174804688, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.18742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26038941740989685, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0, + "num_tokens": 19687125.0, + "reward": -0.08026184141635895, + "reward_std": 0.21900159120559692, + "rewards/cosine_scaled_reward/mean": -0.08026183396577835, + "rewards/cosine_scaled_reward/std": 0.4170342683792114, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 1438.921875, + "completions/mean_terminated_length": 994.45947265625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.18857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29659712314605713, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0, + "num_tokens": 19790632.0, + "reward": -0.12293928861618042, + "reward_std": 0.23739376664161682, + "rewards/cosine_scaled_reward/mean": -0.12293929606676102, + "rewards/cosine_scaled_reward/std": 0.3927924335002899, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1697.765625, + "completions/mean_terminated_length": 1073.434814453125, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "epoch": 0.18971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21795569360256195, + "learning_rate": 8.625962667065487e-07, + "loss": -0.0, + "num_tokens": 19910865.0, + "reward": -0.20583154261112213, + "reward_std": 0.2378866970539093, + "rewards/cosine_scaled_reward/mean": -0.20583152770996094, + "rewards/cosine_scaled_reward/std": 0.26525840163230896, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 1406.890625, + "completions/mean_terminated_length": 995.923095703125, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.19085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583286166191101, + "learning_rate": 8.603287946810513e-07, + "loss": -0.0, + "num_tokens": 20012450.0, + "reward": -0.14853140711784363, + "reward_std": 0.23831486701965332, + "rewards/cosine_scaled_reward/mean": -0.14853140711784363, + "rewards/cosine_scaled_reward/std": 0.2794221341609955, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1574.921875, + "completions/mean_terminated_length": 1038.7667236328125, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2431253343820572, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0, + "num_tokens": 20124085.0, + "reward": -0.07713659107685089, + "reward_std": 0.2686954736709595, + "rewards/cosine_scaled_reward/mean": -0.07713659107685089, + "rewards/cosine_scaled_reward/std": 0.37947362661361694, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1507.90625, + "completions/mean_terminated_length": 1161.6923828125, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "epoch": 0.19314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23702675104141235, + "learning_rate": 8.557485869176825e-07, + "loss": -0.0, + "num_tokens": 20231215.0, + "reward": 0.20358076691627502, + "reward_std": 0.2683357000350952, + "rewards/cosine_scaled_reward/mean": 0.20358076691627502, + "rewards/cosine_scaled_reward/std": 0.5625549554824829, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1290.53125, + "completions/mean_terminated_length": 836.0499877929688, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.19428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2915634512901306, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0, + "num_tokens": 20323593.0, + "reward": -0.04663477838039398, + "reward_std": 0.1683385670185089, + "rewards/cosine_scaled_reward/mean": -0.04663477838039398, + "rewards/cosine_scaled_reward/std": 0.432047039270401, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 1461.703125, + "completions/mean_terminated_length": 875.40625, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.19542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2520189881324768, + "learning_rate": 8.511087728614862e-07, + "loss": -0.0, + "num_tokens": 20427534.0, + "reward": -0.03435331583023071, + "reward_std": 0.18240094184875488, + "rewards/cosine_scaled_reward/mean": -0.034353308379650116, + "rewards/cosine_scaled_reward/std": 0.4340380132198334, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1483.359375, + "completions/mean_terminated_length": 1071.3243408203125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.19657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31458213925361633, + "learning_rate": 8.487667956935087e-07, + "loss": -0.0, + "num_tokens": 20533085.0, + "reward": 0.1847388744354248, + "reward_std": 0.20619311928749084, + "rewards/cosine_scaled_reward/mean": 0.18473894894123077, + "rewards/cosine_scaled_reward/std": 0.512468159198761, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1113.96875, + "completions/mean_terminated_length": 689.4091186523438, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.1977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3380848467350006, + "learning_rate": 8.464102570534061e-07, + "loss": -0.0, + "num_tokens": 20615691.0, + "reward": -0.05022401362657547, + "reward_std": 0.2543797492980957, + "rewards/cosine_scaled_reward/mean": -0.05022402107715607, + "rewards/cosine_scaled_reward/std": 0.38979703187942505, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1151.390625, + "completions/mean_terminated_length": 985.3518676757812, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.19885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2737923562526703, + "learning_rate": 8.440392717955475e-07, + "loss": -0.0, + "num_tokens": 20699716.0, + "reward": -0.05732875317335129, + "reward_std": 0.2915908694267273, + "rewards/cosine_scaled_reward/mean": -0.05732874572277069, + "rewards/cosine_scaled_reward/std": 0.4477607011795044, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1481.765625, + "completions/mean_terminated_length": 1068.567626953125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26141369342803955, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0, + "num_tokens": 20805373.0, + "reward": -0.02904359996318817, + "reward_std": 0.24616873264312744, + "rewards/cosine_scaled_reward/mean": -0.02904359996318817, + "rewards/cosine_scaled_reward/std": 0.45150378346443176, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 1445.53125, + "completions/mean_terminated_length": 913.941162109375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.20114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.314208984375, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 20909055.0, + "reward": -0.165739506483078, + "reward_std": 0.2986479103565216, + "rewards/cosine_scaled_reward/mean": -0.165739506483078, + "rewards/cosine_scaled_reward/std": 0.3703363239765167, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1427.890625, + "completions/mean_terminated_length": 1003.6052856445312, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.2022857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2828216254711151, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0, + "num_tokens": 21010664.0, + "reward": -0.07454323768615723, + "reward_std": 0.23275166749954224, + "rewards/cosine_scaled_reward/mean": -0.07454322278499603, + "rewards/cosine_scaled_reward/std": 0.3976919949054718, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1304.5, + "completions/mean_terminated_length": 915.047607421875, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.20342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28618043661117554, + "learning_rate": 8.344131861991828e-07, + "loss": -0.0, + "num_tokens": 21105688.0, + "reward": 0.002464752644300461, + "reward_std": 0.3809230327606201, + "rewards/cosine_scaled_reward/mean": 0.002464751712977886, + "rewards/cosine_scaled_reward/std": 0.46308550238609314, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1658.5, + "completions/mean_terminated_length": 1050.8800048828125, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "epoch": 0.20457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250982403755188, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0, + "num_tokens": 21222664.0, + "reward": -0.18153682351112366, + "reward_std": 0.2734690308570862, + "rewards/cosine_scaled_reward/mean": -0.18153685331344604, + "rewards/cosine_scaled_reward/std": 0.33050045371055603, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1288.34375, + "completions/mean_terminated_length": 943.0454711914062, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.2057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3166482150554657, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0, + "num_tokens": 21316294.0, + "reward": 0.20186525583267212, + "reward_std": 0.31781116127967834, + "rewards/cosine_scaled_reward/mean": 0.20186525583267212, + "rewards/cosine_scaled_reward/std": 0.49267733097076416, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1591.796875, + "completions/mean_terminated_length": 925.0385131835938, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.20685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26195216178894043, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 21429641.0, + "reward": -0.060104113072156906, + "reward_std": 0.23563489317893982, + "rewards/cosine_scaled_reward/mean": -0.06010409817099571, + "rewards/cosine_scaled_reward/std": 0.43010979890823364, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 1365.421875, + "completions/mean_terminated_length": 927.871826171875, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551879286766052, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0, + "num_tokens": 21526820.0, + "reward": -0.15826305747032166, + "reward_std": 0.24291284382343292, + "rewards/cosine_scaled_reward/mean": -0.15826307237148285, + "rewards/cosine_scaled_reward/std": 0.30778464674949646, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1320.515625, + "completions/mean_terminated_length": 912.4146118164062, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.20914285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32218796014785767, + "learning_rate": 8.220696016880687e-07, + "loss": -0.0, + "num_tokens": 21621949.0, + "reward": -0.07413223385810852, + "reward_std": 0.35920435190200806, + "rewards/cosine_scaled_reward/mean": -0.07413223385810852, + "rewards/cosine_scaled_reward/std": 0.45890137553215027, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1465.71875, + "completions/mean_terminated_length": 1012.8333129882812, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.2102857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27700135111808777, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0, + "num_tokens": 21727107.0, + "reward": -0.158505380153656, + "reward_std": 0.18604165315628052, + "rewards/cosine_scaled_reward/mean": -0.158505380153656, + "rewards/cosine_scaled_reward/std": 0.29056471586227417, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1286.8125, + "completions/mean_terminated_length": 940.8182373046875, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.21142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2607719898223877, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0, + "num_tokens": 21819647.0, + "reward": -0.28775715827941895, + "reward_std": 0.19134438037872314, + "rewards/cosine_scaled_reward/mean": -0.28775715827941895, + "rewards/cosine_scaled_reward/std": 0.21350952982902527, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1825.0, + "completions/mean_length": 1480.09375, + "completions/mean_terminated_length": 1065.6756591796875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.21257142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2575705349445343, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 21925069.0, + "reward": -0.13343556225299835, + "reward_std": 0.2557746171951294, + "rewards/cosine_scaled_reward/mean": -0.13343556225299835, + "rewards/cosine_scaled_reward/std": 0.36808857321739197, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1347.71875, + "completions/mean_terminated_length": 1114.291748046875, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.21371428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31024712324142456, + "learning_rate": 8.119553365707802e-07, + "loss": -0.0, + "num_tokens": 22021747.0, + "reward": -0.09627380967140198, + "reward_std": 0.2472851276397705, + "rewards/cosine_scaled_reward/mean": -0.09627379477024078, + "rewards/cosine_scaled_reward/std": 0.41195833683013916, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1774.140625, + "completions/mean_terminated_length": 1251.3182373046875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.21485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2481517493724823, + "learning_rate": 8.093945422764069e-07, + "loss": -0.0, + "num_tokens": 22147092.0, + "reward": -0.20224528014659882, + "reward_std": 0.2598743736743927, + "rewards/cosine_scaled_reward/mean": -0.20224529504776, + "rewards/cosine_scaled_reward/std": 0.33939501643180847, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1234.328125, + "completions/mean_terminated_length": 808.1190795898438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31437209248542786, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0, + "num_tokens": 22235377.0, + "reward": -0.09877841919660568, + "reward_std": 0.2865467667579651, + "rewards/cosine_scaled_reward/mean": -0.09877842664718628, + "rewards/cosine_scaled_reward/std": 0.4444861114025116, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1189.3125, + "completions/mean_terminated_length": 1011.0943603515625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.21714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28301987051963806, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0, + "num_tokens": 22321261.0, + "reward": -0.028003819286823273, + "reward_std": 0.27996310591697693, + "rewards/cosine_scaled_reward/mean": -0.028003819286823273, + "rewards/cosine_scaled_reward/std": 0.4598979353904724, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1061.140625, + "completions/mean_terminated_length": 1012.6065063476562, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.21828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31064528226852417, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 22399462.0, + "reward": 0.07088040560483932, + "reward_std": 0.3638381361961365, + "rewards/cosine_scaled_reward/mean": 0.07088041305541992, + "rewards/cosine_scaled_reward/std": 0.5184580683708191, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1542.21875, + "completions/mean_terminated_length": 1258.48779296875, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.21942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2742583751678467, + "learning_rate": 7.990261971595048e-07, + "loss": -0.0, + "num_tokens": 22509460.0, + "reward": -0.14651048183441162, + "reward_std": 0.2414294183254242, + "rewards/cosine_scaled_reward/mean": -0.14651048183441162, + "rewards/cosine_scaled_reward/std": 0.3039136528968811, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1717.8125, + "completions/mean_terminated_length": 1202.719970703125, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.22057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24609725177288055, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0, + "num_tokens": 22630544.0, + "reward": -0.28856799006462097, + "reward_std": 0.14614446461200714, + "rewards/cosine_scaled_reward/mean": -0.28856799006462097, + "rewards/cosine_scaled_reward/std": 0.17294423282146454, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1692.546875, + "completions/mean_terminated_length": 1058.9130859375, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "epoch": 0.22171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27539438009262085, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0, + "num_tokens": 22750227.0, + "reward": -0.10590282082557678, + "reward_std": 0.25362446904182434, + "rewards/cosine_scaled_reward/mean": -0.10590282082557678, + "rewards/cosine_scaled_reward/std": 0.36822667717933655, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1410.09375, + "completions/mean_terminated_length": 1120.1363525390625, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "epoch": 0.22285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23645445704460144, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0, + "num_tokens": 22851617.0, + "reward": -0.12888561189174652, + "reward_std": 0.32565274834632874, + "rewards/cosine_scaled_reward/mean": -0.12888562679290771, + "rewards/cosine_scaled_reward/std": 0.3842463195323944, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1625.234375, + "completions/mean_terminated_length": 1146.10009765625, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27093231678009033, + "learning_rate": 7.884636689049422e-07, + "loss": -0.0, + "num_tokens": 22967224.0, + "reward": -0.1617402583360672, + "reward_std": 0.3036938011646271, + "rewards/cosine_scaled_reward/mean": -0.1617402583360672, + "rewards/cosine_scaled_reward/std": 0.390837699174881, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1099.96875, + "completions/mean_terminated_length": 924.4074096679688, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.22514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31926214694976807, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0, + "num_tokens": 23047990.0, + "reward": 0.09089304506778717, + "reward_std": 0.40348750352859497, + "rewards/cosine_scaled_reward/mean": 0.09089304506778717, + "rewards/cosine_scaled_reward/std": 0.5607035756111145, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1312.96875, + "completions/mean_terminated_length": 1125.60791015625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.22628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2919371426105499, + "learning_rate": 7.831121542179086e-07, + "loss": -0.0, + "num_tokens": 23143524.0, + "reward": 0.0047197043895721436, + "reward_std": 0.3408518433570862, + "rewards/cosine_scaled_reward/mean": 0.004719719290733337, + "rewards/cosine_scaled_reward/std": 0.46544134616851807, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1417.171875, + "completions/mean_terminated_length": 1224.0611572265625, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.22742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24992844462394714, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0, + "num_tokens": 23245727.0, + "reward": -0.19424019753932953, + "reward_std": 0.28145354986190796, + "rewards/cosine_scaled_reward/mean": -0.19424019753932953, + "rewards/cosine_scaled_reward/std": 0.3362065255641937, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 1180.515625, + "completions/mean_terminated_length": 891.3541870117188, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.22857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2989206612110138, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0, + "num_tokens": 23331400.0, + "reward": 0.08669155836105347, + "reward_std": 0.3488098084926605, + "rewards/cosine_scaled_reward/mean": 0.08669155836105347, + "rewards/cosine_scaled_reward/std": 0.46097004413604736, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 1399.1875, + "completions/mean_terminated_length": 789.697021484375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.2297142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3482288122177124, + "learning_rate": 7.75e-07, + "loss": 0.0, + "num_tokens": 23431972.0, + "reward": 0.05170612782239914, + "reward_std": 0.33521372079849243, + "rewards/cosine_scaled_reward/mean": 0.05170612409710884, + "rewards/cosine_scaled_reward/std": 0.4809432625770569, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 1257.5, + "completions/mean_terminated_length": 871.4418334960938, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.23085714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24333854019641876, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0, + "num_tokens": 23522356.0, + "reward": 0.20302791893482208, + "reward_std": 0.24270620942115784, + "rewards/cosine_scaled_reward/mean": 0.20302791893482208, + "rewards/cosine_scaled_reward/std": 0.5547645688056946, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1545.421875, + "completions/mean_terminated_length": 1299.9766845703125, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24562042951583862, + "learning_rate": 7.695368466124296e-07, + "loss": -0.0, + "num_tokens": 23632679.0, + "reward": 0.07688053697347641, + "reward_std": 0.32062458992004395, + "rewards/cosine_scaled_reward/mean": 0.07688053697347641, + "rewards/cosine_scaled_reward/std": 0.5180152058601379, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1250.28125, + "completions/mean_terminated_length": 961.74462890625, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.23314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2952634394168854, + "learning_rate": 7.667891533457718e-07, + "loss": -0.0, + "num_tokens": 23722417.0, + "reward": 0.0316191166639328, + "reward_std": 0.23991048336029053, + "rewards/cosine_scaled_reward/mean": 0.0316191241145134, + "rewards/cosine_scaled_reward/std": 0.4419180452823639, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1239.6875, + "completions/mean_terminated_length": 923.3912963867188, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3470660448074341, + "learning_rate": 7.640308940816239e-07, + "loss": -0.0, + "num_tokens": 23812821.0, + "reward": 0.04175570607185364, + "reward_std": 0.32632672786712646, + "rewards/cosine_scaled_reward/mean": 0.04175570607185364, + "rewards/cosine_scaled_reward/std": 0.5073853135108948, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 1563.75, + "completions/mean_terminated_length": 1162.5142822265625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.23542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2750691771507263, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0, + "num_tokens": 23923693.0, + "reward": -0.1269976794719696, + "reward_std": 0.2818883955478668, + "rewards/cosine_scaled_reward/mean": -0.1269976794719696, + "rewards/cosine_scaled_reward/std": 0.3301773965358734, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1346.515625, + "completions/mean_terminated_length": 1072.021728515625, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.23657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34398096799850464, + "learning_rate": 7.584832158039378e-07, + "loss": -0.0, + "num_tokens": 24020470.0, + "reward": -0.11099155992269516, + "reward_std": 0.32174742221832275, + "rewards/cosine_scaled_reward/mean": -0.11099155247211456, + "rewards/cosine_scaled_reward/std": 0.4000038504600525, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1350.71875, + "completions/mean_terminated_length": 1206.0, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "epoch": 0.2377142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2667733430862427, + "learning_rate": 7.556940671764124e-07, + "loss": -0.0, + "num_tokens": 24117244.0, + "reward": -0.012698620557785034, + "reward_std": 0.27501654624938965, + "rewards/cosine_scaled_reward/mean": -0.01269860565662384, + "rewards/cosine_scaled_reward/std": 0.47749608755111694, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 1186.484375, + "completions/mean_terminated_length": 922.7550659179688, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.23885714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34667861461639404, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0, + "num_tokens": 24203091.0, + "reward": -0.132795050740242, + "reward_std": 0.2735438942909241, + "rewards/cosine_scaled_reward/mean": -0.132795050740242, + "rewards/cosine_scaled_reward/std": 0.3893483579158783, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1450.0625, + "completions/mean_terminated_length": 1197.5999755859375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21571174263954163, + "learning_rate": 7.500858306332172e-07, + "loss": -0.0, + "num_tokens": 24306703.0, + "reward": -0.06977479159832001, + "reward_std": 0.24265971779823303, + "rewards/cosine_scaled_reward/mean": -0.06977479159832001, + "rewards/cosine_scaled_reward/std": 0.45415669679641724, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1201.609375, + "completions/mean_terminated_length": 964.6199951171875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.24114285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2720986306667328, + "learning_rate": 7.472670160550848e-07, + "loss": -0.0, + "num_tokens": 24394846.0, + "reward": 0.0786014124751091, + "reward_std": 0.2013745754957199, + "rewards/cosine_scaled_reward/mean": 0.0786014199256897, + "rewards/cosine_scaled_reward/std": 0.4884081780910492, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1176.359375, + "completions/mean_terminated_length": 808.3333740234375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.2422857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3127840757369995, + "learning_rate": 7.444385869608921e-07, + "loss": -0.0, + "num_tokens": 24480613.0, + "reward": 0.11307461559772491, + "reward_std": 0.284263014793396, + "rewards/cosine_scaled_reward/mean": 0.11307463049888611, + "rewards/cosine_scaled_reward/std": 0.5329286456108093, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1114.03125, + "completions/mean_terminated_length": 776.2127685546875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.24342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343943327665329, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0, + "num_tokens": 24561775.0, + "reward": -0.10338220745325089, + "reward_std": 0.2921890914440155, + "rewards/cosine_scaled_reward/mean": -0.10338220745325089, + "rewards/cosine_scaled_reward/std": 0.34980201721191406, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1386.0625, + "completions/mean_terminated_length": 1039.3333740234375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.24457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26102328300476074, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0, + "num_tokens": 24662027.0, + "reward": 0.02548668347299099, + "reward_std": 0.3174683451652527, + "rewards/cosine_scaled_reward/mean": 0.025486690923571587, + "rewards/cosine_scaled_reward/std": 0.46307510137557983, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 1318.46875, + "completions/mean_terminated_length": 962.18603515625, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.24571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2819078266620636, + "learning_rate": 7.358969934210438e-07, + "loss": -0.0, + "num_tokens": 24756897.0, + "reward": -0.11348340660333633, + "reward_std": 0.1657339334487915, + "rewards/cosine_scaled_reward/mean": -0.11348340660333633, + "rewards/cosine_scaled_reward/std": 0.41132697463035583, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 971.234375, + "completions/mean_terminated_length": 839.0, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.24685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3146374225616455, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 24828336.0, + "reward": 0.09829875081777573, + "reward_std": 0.34463635087013245, + "rewards/cosine_scaled_reward/mean": 0.09829875826835632, + "rewards/cosine_scaled_reward/std": 0.5223532319068909, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1323.546875, + "completions/mean_terminated_length": 1017.6666870117188, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25747084617614746, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0, + "num_tokens": 24923395.0, + "reward": -0.08631986379623413, + "reward_std": 0.3201732039451599, + "rewards/cosine_scaled_reward/mean": -0.08631986379623413, + "rewards/cosine_scaled_reward/std": 0.41996634006500244, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1421.5, + "completions/mean_terminated_length": 1115.534912109375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.24914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24970035254955292, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0, + "num_tokens": 25025059.0, + "reward": -0.22788012027740479, + "reward_std": 0.22475574910640717, + "rewards/cosine_scaled_reward/mean": -0.22788012027740479, + "rewards/cosine_scaled_reward/std": 0.2934871315956116, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1120.609375, + "completions/mean_terminated_length": 948.870361328125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2502857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34460073709487915, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0, + "num_tokens": 25107090.0, + "reward": 0.02718304470181465, + "reward_std": 0.3376328647136688, + "rewards/cosine_scaled_reward/mean": 0.027183040976524353, + "rewards/cosine_scaled_reward/std": 0.5283166170120239, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1366.828125, + "completions/mean_terminated_length": 1034.162841796875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.25142857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4521820843219757, + "learning_rate": 7.214816693576234e-07, + "loss": -0.0, + "num_tokens": 25204871.0, + "reward": -0.25229814648628235, + "reward_std": 0.17562136054039001, + "rewards/cosine_scaled_reward/mean": -0.25229811668395996, + "rewards/cosine_scaled_reward/std": 0.19320644438266754, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1156.53125, + "completions/mean_terminated_length": 950.8077392578125, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.25257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26752790808677673, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 25289449.0, + "reward": 0.24696281552314758, + "reward_std": 0.273512065410614, + "rewards/cosine_scaled_reward/mean": 0.24696281552314758, + "rewards/cosine_scaled_reward/std": 0.46473291516304016, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1243.3125, + "completions/mean_terminated_length": 903.5556030273438, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.2537142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27195242047309875, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0, + "num_tokens": 25379149.0, + "reward": 0.007332861423492432, + "reward_std": 0.29589229822158813, + "rewards/cosine_scaled_reward/mean": 0.007332857698202133, + "rewards/cosine_scaled_reward/std": 0.48079609870910645, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1200.3125, + "completions/mean_terminated_length": 962.9599609375, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "epoch": 0.25485714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2661433219909668, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0, + "num_tokens": 25465705.0, + "reward": 0.03970642387866974, + "reward_std": 0.2005533128976822, + "rewards/cosine_scaled_reward/mean": 0.03970641642808914, + "rewards/cosine_scaled_reward/std": 0.5048101544380188, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1705.46875, + "completions/mean_terminated_length": 1383.697021484375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23623619973659515, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0, + "num_tokens": 25586263.0, + "reward": -0.07307912409305573, + "reward_std": 0.350577175617218, + "rewards/cosine_scaled_reward/mean": -0.07307912409305573, + "rewards/cosine_scaled_reward/std": 0.38458916544914246, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1527.640625, + "completions/mean_terminated_length": 1122.9166259765625, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.2571428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2600167393684387, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0, + "num_tokens": 25694624.0, + "reward": -0.18486955761909485, + "reward_std": 0.24510705471038818, + "rewards/cosine_scaled_reward/mean": -0.18486955761909485, + "rewards/cosine_scaled_reward/std": 0.29842856526374817, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1293.0, + "completions/mean_terminated_length": 1118.769287109375, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.2582857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24967192113399506, + "learning_rate": 7.039090644965509e-07, + "loss": -0.0, + "num_tokens": 25788016.0, + "reward": 0.10143648833036423, + "reward_std": 0.3550751805305481, + "rewards/cosine_scaled_reward/mean": 0.10143650323152542, + "rewards/cosine_scaled_reward/std": 0.48985999822616577, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 975.421875, + "completions/mean_terminated_length": 958.3968505859375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.25942857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33750462532043457, + "learning_rate": 7.009532063876148e-07, + "loss": -0.0, + "num_tokens": 25860827.0, + "reward": 0.017139945179224014, + "reward_std": 0.40727996826171875, + "rewards/cosine_scaled_reward/mean": 0.017139948904514313, + "rewards/cosine_scaled_reward/std": 0.4528072476387024, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1005.453125, + "completions/mean_terminated_length": 834.8544921875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.26057142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3340362310409546, + "learning_rate": 6.979899910323624e-07, + "loss": -0.0, + "num_tokens": 25935848.0, + "reward": 0.1363377869129181, + "reward_std": 0.31884267926216125, + "rewards/cosine_scaled_reward/mean": 0.1363377869129181, + "rewards/cosine_scaled_reward/std": 0.5562776923179626, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1308.875, + "completions/mean_terminated_length": 1019.6522216796875, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.26171428571428573, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2481510192155838, + "learning_rate": 6.950195628537299e-07, + "loss": -0.0, + "num_tokens": 26030280.0, + "reward": -0.0336291566491127, + "reward_std": 0.2131306231021881, + "rewards/cosine_scaled_reward/mean": -0.0336291640996933, + "rewards/cosine_scaled_reward/std": 0.4883540868759155, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1424.34375, + "completions/mean_terminated_length": 1119.7674560546875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.26285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24989557266235352, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0, + "num_tokens": 26131870.0, + "reward": -0.27840444445610046, + "reward_std": 0.18090233206748962, + "rewards/cosine_scaled_reward/mean": -0.27840444445610046, + "rewards/cosine_scaled_reward/std": 0.2319284826517105, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1420.328125, + "completions/mean_terminated_length": 1113.7906494140625, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25709542632102966, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 26234467.0, + "reward": -0.012329377233982086, + "reward_std": 0.3558858633041382, + "rewards/cosine_scaled_reward/mean": -0.012329380959272385, + "rewards/cosine_scaled_reward/std": 0.45383208990097046, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1477.65625, + "completions/mean_terminated_length": 1087.4210205078125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.2651428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26604732871055603, + "learning_rate": 6.860664508377001e-07, + "loss": -0.0, + "num_tokens": 26339365.0, + "reward": -0.18533703684806824, + "reward_std": 0.24220798909664154, + "rewards/cosine_scaled_reward/mean": -0.18533703684806824, + "rewards/cosine_scaled_reward/std": 0.26634126901626587, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1072.109375, + "completions/mean_terminated_length": 1024.11474609375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.2662857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26210692524909973, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0, + "num_tokens": 26418084.0, + "reward": -0.1599939614534378, + "reward_std": 0.3579375445842743, + "rewards/cosine_scaled_reward/mean": -0.1599939614534378, + "rewards/cosine_scaled_reward/std": 0.3679514527320862, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1287.546875, + "completions/mean_terminated_length": 889.2142944335938, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.2674285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30882614850997925, + "learning_rate": 6.800643086250121e-07, + "loss": -0.0, + "num_tokens": 26510503.0, + "reward": -0.1574883908033371, + "reward_std": 0.17980948090553284, + "rewards/cosine_scaled_reward/mean": -0.1574883908033371, + "rewards/cosine_scaled_reward/std": 0.35836631059646606, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1145.125, + "completions/mean_terminated_length": 936.769287109375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.26857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30261489748954773, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0, + "num_tokens": 26594351.0, + "reward": 0.1909978985786438, + "reward_std": 0.3115041255950928, + "rewards/cosine_scaled_reward/mean": 0.1909978985786438, + "rewards/cosine_scaled_reward/std": 0.5054126381874084, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1449.375, + "completions/mean_terminated_length": 1012.5405883789062, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.26971428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28838953375816345, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 26698399.0, + "reward": -0.11444643139839172, + "reward_std": 0.3462868928909302, + "rewards/cosine_scaled_reward/mean": -0.11444643884897232, + "rewards/cosine_scaled_reward/std": 0.4084509313106537, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1262.125, + "completions/mean_terminated_length": 1021.551025390625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.27085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3033871650695801, + "learning_rate": 6.710139192768694e-07, + "loss": -0.0, + "num_tokens": 26789303.0, + "reward": -0.05035819113254547, + "reward_std": 0.2872178554534912, + "rewards/cosine_scaled_reward/mean": -0.050358183681964874, + "rewards/cosine_scaled_reward/std": 0.5157716870307922, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1301.734375, + "completions/mean_terminated_length": 1092.780029296875, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26610657572746277, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0, + "num_tokens": 26883454.0, + "reward": 0.10226152092218399, + "reward_std": 0.3642864525318146, + "rewards/cosine_scaled_reward/mean": 0.10226152092218399, + "rewards/cosine_scaled_reward/std": 0.49199798703193665, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 1012.578125, + "completions/mean_terminated_length": 797.6792602539062, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.27314285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3098434805870056, + "learning_rate": 6.649505910711058e-07, + "loss": -0.0, + "num_tokens": 26958571.0, + "reward": 0.2893483638763428, + "reward_std": 0.21750710904598236, + "rewards/cosine_scaled_reward/mean": 0.2893483638763428, + "rewards/cosine_scaled_reward/std": 0.5735083818435669, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 1127.03125, + "completions/mean_terminated_length": 845.10205078125, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.2742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33869531750679016, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0, + "num_tokens": 27040949.0, + "reward": -0.2518009841442108, + "reward_std": 0.2073291540145874, + "rewards/cosine_scaled_reward/mean": -0.2518009841442108, + "rewards/cosine_scaled_reward/std": 0.26051101088523865, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 1248.578125, + "completions/mean_terminated_length": 1044.803955078125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.2754285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2926189601421356, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0, + "num_tokens": 27132074.0, + "reward": -0.18343190848827362, + "reward_std": 0.32297152280807495, + "rewards/cosine_scaled_reward/mean": -0.18343190848827362, + "rewards/cosine_scaled_reward/std": 0.3960045278072357, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1057.171875, + "completions/mean_terminated_length": 779.739990234375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.2765714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3697403073310852, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0, + "num_tokens": 27209245.0, + "reward": -0.13560537993907928, + "reward_std": 0.2509098947048187, + "rewards/cosine_scaled_reward/mean": -0.13560537993907928, + "rewards/cosine_scaled_reward/std": 0.42233115434646606, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 1294.9375, + "completions/mean_terminated_length": 1121.1539306640625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "epoch": 0.2777142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2539284825325012, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0, + "num_tokens": 27302953.0, + "reward": 0.006944652646780014, + "reward_std": 0.3980734050273895, + "rewards/cosine_scaled_reward/mean": 0.006944645196199417, + "rewards/cosine_scaled_reward/std": 0.4572637379169464, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1435.71875, + "completions/mean_terminated_length": 1068.3499755859375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.27885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28219878673553467, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0, + "num_tokens": 27405479.0, + "reward": -0.04507390409708023, + "reward_std": 0.2943881154060364, + "rewards/cosine_scaled_reward/mean": -0.04507390037178993, + "rewards/cosine_scaled_reward/std": 0.482650488615036, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1331.296875, + "completions/mean_terminated_length": 1005.5227661132812, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733215391635895, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0, + "num_tokens": 27501746.0, + "reward": 0.04791342094540596, + "reward_std": 0.34749698638916016, + "rewards/cosine_scaled_reward/mean": 0.047913409769535065, + "rewards/cosine_scaled_reward/std": 0.5028091669082642, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1310.046875, + "completions/mean_terminated_length": 1043.127685546875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.28114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2701246440410614, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0, + "num_tokens": 27596189.0, + "reward": -0.13987088203430176, + "reward_std": 0.3327594995498657, + "rewards/cosine_scaled_reward/mean": -0.13987088203430176, + "rewards/cosine_scaled_reward/std": 0.4108533263206482, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1433.5625, + "completions/mean_terminated_length": 1088.8780517578125, + "completions/min_length": 521.0, + "completions/min_terminated_length": 521.0, + "epoch": 0.2822857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2608485221862793, + "learning_rate": 6.404850645156841e-07, + "loss": -0.0, + "num_tokens": 27698385.0, + "reward": -0.19611218571662903, + "reward_std": 0.18159456551074982, + "rewards/cosine_scaled_reward/mean": -0.19611218571662903, + "rewards/cosine_scaled_reward/std": 0.18690702319145203, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 1016.078125, + "completions/mean_terminated_length": 824.9815063476562, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.2834285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33469828963279724, + "learning_rate": 6.374054580489873e-07, + "loss": -0.0, + "num_tokens": 27774342.0, + "reward": 0.20066902041435242, + "reward_std": 0.2608226537704468, + "rewards/cosine_scaled_reward/mean": 0.20066902041435242, + "rewards/cosine_scaled_reward/std": 0.5498367547988892, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1137.15625, + "completions/mean_terminated_length": 926.9615478515625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.2845714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.283346951007843, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0, + "num_tokens": 27858296.0, + "reward": 0.22508396208286285, + "reward_std": 0.32221734523773193, + "rewards/cosine_scaled_reward/mean": 0.22508396208286285, + "rewards/cosine_scaled_reward/std": 0.5403409600257874, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1258.484375, + "completions/mean_terminated_length": 1016.7958984375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.2857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3057456612586975, + "learning_rate": 6.31233615362752e-07, + "loss": -0.0, + "num_tokens": 27949463.0, + "reward": -0.161838099360466, + "reward_std": 0.3008255660533905, + "rewards/cosine_scaled_reward/mean": -0.1618381142616272, + "rewards/cosine_scaled_reward/std": 0.36034730076789856, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 924.796875, + "completions/mean_terminated_length": 849.9166870117188, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.28685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3387444019317627, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0, + "num_tokens": 28018754.0, + "reward": -0.024298980832099915, + "reward_std": 0.26890814304351807, + "rewards/cosine_scaled_reward/mean": -0.024298986420035362, + "rewards/cosine_scaled_reward/std": 0.42033475637435913, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 1181.65625, + "completions/mean_terminated_length": 868.2978515625, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31221455335617065, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0, + "num_tokens": 28104708.0, + "reward": -0.12287517637014389, + "reward_std": 0.16865523159503937, + "rewards/cosine_scaled_reward/mean": -0.12287518382072449, + "rewards/cosine_scaled_reward/std": 0.3217977285385132, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1170.125, + "completions/mean_terminated_length": 946.3529663085938, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.28914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3127641975879669, + "learning_rate": 6.219465344613258e-07, + "loss": -0.0, + "num_tokens": 28190628.0, + "reward": 0.004593905061483383, + "reward_std": 0.27023184299468994, + "rewards/cosine_scaled_reward/mean": 0.0045939162373542786, + "rewards/cosine_scaled_reward/std": 0.45060300827026367, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 1240.578125, + "completions/mean_terminated_length": 948.5318603515625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.29028571428571426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3329788148403168, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0, + "num_tokens": 28280561.0, + "reward": -0.10231998562812805, + "reward_std": 0.28730475902557373, + "rewards/cosine_scaled_reward/mean": -0.10232000052928925, + "rewards/cosine_scaled_reward/std": 0.39239707589149475, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1410.15625, + "completions/mean_terminated_length": 1027.4500732421875, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.2914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27967000007629395, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0, + "num_tokens": 28380867.0, + "reward": -0.22316259145736694, + "reward_std": 0.1437114179134369, + "rewards/cosine_scaled_reward/mean": -0.22316259145736694, + "rewards/cosine_scaled_reward/std": 0.14716070890426636, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 1163.859375, + "completions/mean_terminated_length": 916.2999877929688, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.2925714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29270097613334656, + "learning_rate": 6.126278954320294e-07, + "loss": -0.0, + "num_tokens": 28466402.0, + "reward": -0.07603419572114944, + "reward_std": 0.25814926624298096, + "rewards/cosine_scaled_reward/mean": -0.07603420317173004, + "rewards/cosine_scaled_reward/std": 0.43036898970603943, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 1341.5625, + "completions/mean_terminated_length": 1065.1304931640625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.2937142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26492658257484436, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0, + "num_tokens": 28563766.0, + "reward": 0.08429908752441406, + "reward_std": 0.3440818786621094, + "rewards/cosine_scaled_reward/mean": 0.08429908752441406, + "rewards/cosine_scaled_reward/std": 0.5146033763885498, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1407.96875, + "completions/mean_terminated_length": 1048.9267578125, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "epoch": 0.2948571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25829192996025085, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0, + "num_tokens": 28665100.0, + "reward": -0.17156042158603668, + "reward_std": 0.27311789989471436, + "rewards/cosine_scaled_reward/mean": -0.1715604066848755, + "rewards/cosine_scaled_reward/std": 0.30253204703330994, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1148.53125, + "completions/mean_terminated_length": 919.2549438476562, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30335742235183716, + "learning_rate": 6.032817857379256e-07, + "loss": -0.0, + "num_tokens": 28749382.0, + "reward": 0.042243435978889465, + "reward_std": 0.26714712381362915, + "rewards/cosine_scaled_reward/mean": 0.042243435978889465, + "rewards/cosine_scaled_reward/std": 0.4686411917209625, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1083.234375, + "completions/mean_terminated_length": 761.6458740234375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.29714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39401763677597046, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0, + "num_tokens": 28828653.0, + "reward": 0.07885205745697021, + "reward_std": 0.2136325240135193, + "rewards/cosine_scaled_reward/mean": 0.07885205745697021, + "rewards/cosine_scaled_reward/std": 0.4590488076210022, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1553.515625, + "completions/mean_terminated_length": 1117.2059326171875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.29828571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23382478952407837, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0, + "num_tokens": 28939382.0, + "reward": -0.21638496220111847, + "reward_std": 0.18316227197647095, + "rewards/cosine_scaled_reward/mean": -0.21638496220111847, + "rewards/cosine_scaled_reward/std": 0.21999679505825043, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 1150.203125, + "completions/mean_terminated_length": 1021.9464721679688, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.29942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31957077980041504, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0, + "num_tokens": 29024731.0, + "reward": -0.15519243478775024, + "reward_std": 0.25976234674453735, + "rewards/cosine_scaled_reward/mean": -0.15519244968891144, + "rewards/cosine_scaled_reward/std": 0.29580366611480713, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1284.109375, + "completions/mean_terminated_length": 1029.479248046875, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.30057142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.257870614528656, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0, + "num_tokens": 29117034.0, + "reward": -0.19904303550720215, + "reward_std": 0.21833446621894836, + "rewards/cosine_scaled_reward/mean": -0.19904303550720215, + "rewards/cosine_scaled_reward/std": 0.2711222171783447, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 1279.6875, + "completions/mean_terminated_length": 1044.48974609375, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.3017142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2747717797756195, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0, + "num_tokens": 29210278.0, + "reward": -0.036783367395401, + "reward_std": 0.32698529958724976, + "rewards/cosine_scaled_reward/mean": -0.0367833711206913, + "rewards/cosine_scaled_reward/std": 0.4024903178215027, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1140.796875, + "completions/mean_terminated_length": 972.7963256835938, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "epoch": 0.3028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30939391255378723, + "learning_rate": 5.845235626570683e-07, + "loss": -0.0, + "num_tokens": 29293225.0, + "reward": 0.012244641780853271, + "reward_std": 0.3513813316822052, + "rewards/cosine_scaled_reward/mean": 0.012244660407304764, + "rewards/cosine_scaled_reward/std": 0.47922924160957336, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1543.5, + "completions/mean_terminated_length": 1125.4857177734375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23532193899154663, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0, + "num_tokens": 29403313.0, + "reward": -0.16622336208820343, + "reward_std": 0.29864761233329773, + "rewards/cosine_scaled_reward/mean": -0.16622334718704224, + "rewards/cosine_scaled_reward/std": 0.34780678153038025, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 1511.796875, + "completions/mean_terminated_length": 1094.75, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.30514285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2672438621520996, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0, + "num_tokens": 29511564.0, + "reward": -0.2196415364742279, + "reward_std": 0.22792132198810577, + "rewards/cosine_scaled_reward/mean": -0.2196415364742279, + "rewards/cosine_scaled_reward/std": 0.35021698474884033, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1050.921875, + "completions/mean_terminated_length": 947.77587890625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.3062857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.308774471282959, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0, + "num_tokens": 29588999.0, + "reward": -0.11035098135471344, + "reward_std": 0.3422476053237915, + "rewards/cosine_scaled_reward/mean": -0.11035098135471344, + "rewards/cosine_scaled_reward/std": 0.42984655499458313, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1253.59375, + "completions/mean_terminated_length": 1010.4081420898438, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.30742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2676822543144226, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0, + "num_tokens": 29679837.0, + "reward": -0.11707413196563721, + "reward_std": 0.1972976177930832, + "rewards/cosine_scaled_reward/mean": -0.117074154317379, + "rewards/cosine_scaled_reward/std": 0.3723585903644562, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1307.8125, + "completions/mean_terminated_length": 1061.0833740234375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.30857142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28255271911621094, + "learning_rate": 5.688440441781398e-07, + "loss": -0.0, + "num_tokens": 29774857.0, + "reward": -0.08722042292356491, + "reward_std": 0.2875755727291107, + "rewards/cosine_scaled_reward/mean": -0.08722040057182312, + "rewards/cosine_scaled_reward/std": 0.46101436018943787, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1214.921875, + "completions/mean_terminated_length": 1022.673095703125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.3097142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2939779460430145, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0, + "num_tokens": 29862996.0, + "reward": 0.26210832595825195, + "reward_std": 0.2794266939163208, + "rewards/cosine_scaled_reward/mean": 0.26210832595825195, + "rewards/cosine_scaled_reward/std": 0.5501880645751953, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1447.5625, + "completions/mean_terminated_length": 1212.6087646484375, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "epoch": 0.31085714285714283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.270077645778656, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0, + "num_tokens": 29966888.0, + "reward": -0.07720675319433212, + "reward_std": 0.32648950815200806, + "rewards/cosine_scaled_reward/mean": -0.07720677554607391, + "rewards/cosine_scaled_reward/std": 0.46835580468177795, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1150.640625, + "completions/mean_terminated_length": 943.5577392578125, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2647186815738678, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0, + "num_tokens": 30050489.0, + "reward": 0.06313569843769073, + "reward_std": 0.3254297375679016, + "rewards/cosine_scaled_reward/mean": 0.06313569843769073, + "rewards/cosine_scaled_reward/std": 0.5027272701263428, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 911.875, + "completions/mean_terminated_length": 794.3448486328125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.31314285714285717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37198200821876526, + "learning_rate": 5.562829811526154e-07, + "loss": -0.0, + "num_tokens": 30118929.0, + "reward": 0.03341127932071686, + "reward_std": 0.31158989667892456, + "rewards/cosine_scaled_reward/mean": 0.033411286771297455, + "rewards/cosine_scaled_reward/std": 0.48580819368362427, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1294.109375, + "completions/mean_terminated_length": 975.800048828125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.3142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25744378566741943, + "learning_rate": 5.531415671340826e-07, + "loss": -0.0, + "num_tokens": 30211752.0, + "reward": 0.21001243591308594, + "reward_std": 0.402576744556427, + "rewards/cosine_scaled_reward/mean": 0.21001243591308594, + "rewards/cosine_scaled_reward/std": 0.5340982675552368, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 1084.53125, + "completions/mean_terminated_length": 966.2105102539062, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.31542857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31013426184654236, + "learning_rate": 5.5e-07, + "loss": 0.0, + "num_tokens": 30291938.0, + "reward": 0.042711060494184494, + "reward_std": 0.38207319378852844, + "rewards/cosine_scaled_reward/mean": 0.042711060494184494, + "rewards/cosine_scaled_reward/std": 0.4666350483894348, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1227.75, + "completions/mean_terminated_length": 954.3333740234375, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "epoch": 0.31657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3113550841808319, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0, + "num_tokens": 30382218.0, + "reward": 0.07253848016262054, + "reward_std": 0.1982618272304535, + "rewards/cosine_scaled_reward/mean": 0.07253845036029816, + "rewards/cosine_scaled_reward/std": 0.5152764916419983, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1272.296875, + "completions/mean_terminated_length": 865.9761962890625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.3177142857142857, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2616003155708313, + "learning_rate": 5.437170188473847e-07, + "loss": -0.0, + "num_tokens": 30474309.0, + "reward": 0.08356653898954391, + "reward_std": 0.1530226469039917, + "rewards/cosine_scaled_reward/mean": 0.08356654644012451, + "rewards/cosine_scaled_reward/std": 0.5249122381210327, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1500.65625, + "completions/mean_terminated_length": 1251.8636474609375, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.31885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23652584850788116, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0, + "num_tokens": 30581391.0, + "reward": -0.07124869525432587, + "reward_std": 0.32038426399230957, + "rewards/cosine_scaled_reward/mean": -0.07124869525432587, + "rewards/cosine_scaled_reward/std": 0.43860507011413574, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1823.0, + "completions/mean_length": 1187.34375, + "completions/mean_terminated_length": 850.5652465820312, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.336769163608551, + "learning_rate": 5.37435262574394e-07, + "loss": -0.0, + "num_tokens": 30668053.0, + "reward": 0.10428585112094879, + "reward_std": 0.24089524149894714, + "rewards/cosine_scaled_reward/mean": 0.10428585112094879, + "rewards/cosine_scaled_reward/std": 0.5281394720077515, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 1572.578125, + "completions/mean_terminated_length": 998.7930908203125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.3211428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26516586542129517, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0, + "num_tokens": 30780322.0, + "reward": -0.22128218412399292, + "reward_std": 0.2539688050746918, + "rewards/cosine_scaled_reward/mean": -0.22128218412399292, + "rewards/cosine_scaled_reward/std": 0.2776496410369873, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 1154.703125, + "completions/mean_terminated_length": 1079.0, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.3222857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28156232833862305, + "learning_rate": 5.311559558218603e-07, + "loss": -0.0, + "num_tokens": 30864247.0, + "reward": -0.05536589026451111, + "reward_std": 0.24473221600055695, + "rewards/cosine_scaled_reward/mean": -0.055365875363349915, + "rewards/cosine_scaled_reward/std": 0.4620397686958313, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1669.65625, + "completions/mean_terminated_length": 1240.86669921875, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "epoch": 0.32342857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21421867609024048, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0, + "num_tokens": 30981553.0, + "reward": 0.00772450864315033, + "reward_std": 0.365454763174057, + "rewards/cosine_scaled_reward/mean": 0.00772450864315033, + "rewards/cosine_scaled_reward/std": 0.5393754839897156, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1037.25, + "completions/mean_terminated_length": 932.6896362304688, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.32457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.476616770029068, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0, + "num_tokens": 31058233.0, + "reward": -0.11384838074445724, + "reward_std": 0.30951282382011414, + "rewards/cosine_scaled_reward/mean": -0.11384837329387665, + "rewards/cosine_scaled_reward/std": 0.3668363690376282, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1596.0, + "completions/mean_length": 980.6875, + "completions/mean_terminated_length": 828.2142944335938, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.32571428571428573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35374680161476135, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0, + "num_tokens": 31131685.0, + "reward": -0.07238543033599854, + "reward_std": 0.31320473551750183, + "rewards/cosine_scaled_reward/mean": -0.07238544523715973, + "rewards/cosine_scaled_reward/std": 0.44502073526382446, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 1229.28125, + "completions/mean_terminated_length": 1059.3585205078125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.32685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3040845990180969, + "learning_rate": 5.186095868151436e-07, + "loss": -0.0, + "num_tokens": 31220791.0, + "reward": -0.01666315644979477, + "reward_std": 0.373945951461792, + "rewards/cosine_scaled_reward/mean": -0.01666315644979477, + "rewards/cosine_scaled_reward/std": 0.4552530348300934, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 1057.921875, + "completions/mean_terminated_length": 727.8958740234375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33932286500930786, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0, + "num_tokens": 31298650.0, + "reward": -0.04437921941280365, + "reward_std": 0.2586653232574463, + "rewards/cosine_scaled_reward/mean": -0.04437922313809395, + "rewards/cosine_scaled_reward/std": 0.4228661060333252, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 1318.125, + "completions/mean_terminated_length": 1132.0784912109375, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "epoch": 0.3291428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24589991569519043, + "learning_rate": 5.123449705004581e-07, + "loss": -0.0, + "num_tokens": 31394034.0, + "reward": -0.14538612961769104, + "reward_std": 0.30133792757987976, + "rewards/cosine_scaled_reward/mean": -0.14538611471652985, + "rewards/cosine_scaled_reward/std": 0.37525659799575806, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1191.203125, + "completions/mean_terminated_length": 993.4808349609375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.3302857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29120251536369324, + "learning_rate": 5.09215338910999e-07, + "loss": -0.0, + "num_tokens": 31481407.0, + "reward": 0.05579536408185959, + "reward_std": 0.3108929693698883, + "rewards/cosine_scaled_reward/mean": 0.05579536035656929, + "rewards/cosine_scaled_reward/std": 0.5062500238418579, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 992.15625, + "completions/mean_terminated_length": 841.3214721679688, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4412848949432373, + "learning_rate": 5.060876951083828e-07, + "loss": -0.0, + "num_tokens": 31554665.0, + "reward": 0.1182926595211029, + "reward_std": 0.34878551959991455, + "rewards/cosine_scaled_reward/mean": 0.1182926669716835, + "rewards/cosine_scaled_reward/std": 0.5200846195220947, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1220.203125, + "completions/mean_terminated_length": 1118.5438232421875, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.3325714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23901385068893433, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0, + "num_tokens": 31643182.0, + "reward": -0.005823981016874313, + "reward_std": 0.4199373722076416, + "rewards/cosine_scaled_reward/mean": -0.005823981016874313, + "rewards/cosine_scaled_reward/std": 0.48368990421295166, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1297.9375, + "completions/mean_terminated_length": 1026.6383056640625, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.33371428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2564057409763336, + "learning_rate": 4.998389805071536e-07, + "loss": -0.0, + "num_tokens": 31737938.0, + "reward": -0.1168045848608017, + "reward_std": 0.26728183031082153, + "rewards/cosine_scaled_reward/mean": -0.1168045848608017, + "rewards/cosine_scaled_reward/std": 0.35125845670700073, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1095.40625, + "completions/mean_terminated_length": 919.0, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.33485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30023491382598877, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0, + "num_tokens": 31818924.0, + "reward": -0.04689720273017883, + "reward_std": 0.28609341382980347, + "rewards/cosine_scaled_reward/mean": -0.04689720273017883, + "rewards/cosine_scaled_reward/std": 0.4900621771812439, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1580.125, + "completions/mean_terminated_length": 1167.2940673828125, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2665126323699951, + "learning_rate": 4.93600044896063e-07, + "loss": -0.0, + "num_tokens": 31930260.0, + "reward": -0.08336995542049408, + "reward_std": 0.31363779306411743, + "rewards/cosine_scaled_reward/mean": -0.08336995542049408, + "rewards/cosine_scaled_reward/std": 0.5037887692451477, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1488.453125, + "completions/mean_terminated_length": 1080.1351318359375, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.33714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27722999453544617, + "learning_rate": 4.904846243842949e-07, + "loss": -0.0, + "num_tokens": 32035937.0, + "reward": 0.07110725343227386, + "reward_std": 0.3471496105194092, + "rewards/cosine_scaled_reward/mean": 0.07110726088285446, + "rewards/cosine_scaled_reward/std": 0.4735586941242218, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 1372.890625, + "completions/mean_terminated_length": 1087.844482421875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.3382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3566659986972809, + "learning_rate": 4.873721045679706e-07, + "loss": -0.0, + "num_tokens": 32134450.0, + "reward": -0.13402147591114044, + "reward_std": 0.2937889099121094, + "rewards/cosine_scaled_reward/mean": -0.13402147591114044, + "rewards/cosine_scaled_reward/std": 0.3325452506542206, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1618.578125, + "completions/mean_terminated_length": 1239.676513671875, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.3394285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2620627284049988, + "learning_rate": 4.842626371469149e-07, + "loss": -0.0, + "num_tokens": 32249263.0, + "reward": -0.10356634855270386, + "reward_std": 0.3085135817527771, + "rewards/cosine_scaled_reward/mean": -0.10356632620096207, + "rewards/cosine_scaled_reward/std": 0.41743960976600647, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1220.3125, + "completions/mean_terminated_length": 1009.3333740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3405714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38996121287345886, + "learning_rate": 4.811563736721829e-07, + "loss": -0.0, + "num_tokens": 32338387.0, + "reward": -0.023439258337020874, + "reward_std": 0.2796996235847473, + "rewards/cosine_scaled_reward/mean": -0.02343926578760147, + "rewards/cosine_scaled_reward/std": 0.5068122744560242, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1268.8125, + "completions/mean_terminated_length": 1157.5, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.3417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2567848563194275, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0, + "num_tokens": 32430847.0, + "reward": -0.09949212521314621, + "reward_std": 0.3315163254737854, + "rewards/cosine_scaled_reward/mean": -0.0994921326637268, + "rewards/cosine_scaled_reward/std": 0.3953181803226471, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1375.375, + "completions/mean_terminated_length": 1132.0850830078125, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.34285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3365769386291504, + "learning_rate": 4.749540639777539e-07, + "loss": -0.0, + "num_tokens": 32530767.0, + "reward": -0.04265330731868744, + "reward_std": 0.19944772124290466, + "rewards/cosine_scaled_reward/mean": -0.04265330731868744, + "rewards/cosine_scaled_reward/std": 0.39511266350746155, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1274.828125, + "completions/mean_terminated_length": 972.2826538085938, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3056529760360718, + "learning_rate": 4.7185832004988133e-07, + "loss": -0.0, + "num_tokens": 32623156.0, + "reward": -0.020140450447797775, + "reward_std": 0.24108503758907318, + "rewards/cosine_scaled_reward/mean": -0.02014043927192688, + "rewards/cosine_scaled_reward/std": 0.3770294487476349, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1257.5, + "completions/mean_terminated_length": 923.7333374023438, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.34514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2761608958244324, + "learning_rate": 4.68766384637248e-07, + "loss": -0.0, + "num_tokens": 32714172.0, + "reward": 0.00733301043510437, + "reward_std": 0.31370484828948975, + "rewards/cosine_scaled_reward/mean": 0.007333017885684967, + "rewards/cosine_scaled_reward/std": 0.4660908877849579, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 1154.34375, + "completions/mean_terminated_length": 1026.6785888671875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.3462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28181061148643494, + "learning_rate": 4.656784084364238e-07, + "loss": -0.0, + "num_tokens": 32798778.0, + "reward": -0.0471949465572834, + "reward_std": 0.31843262910842896, + "rewards/cosine_scaled_reward/mean": -0.0471949428319931, + "rewards/cosine_scaled_reward/std": 0.42977795004844666, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 1131.421875, + "completions/mean_terminated_length": 941.188720703125, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.3474285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29874488711357117, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0, + "num_tokens": 32881573.0, + "reward": -0.10114389657974243, + "reward_std": 0.24575868248939514, + "rewards/cosine_scaled_reward/mean": -0.10114389657974243, + "rewards/cosine_scaled_reward/std": 0.40359967947006226, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1296.0625, + "completions/mean_terminated_length": 1122.5384521484375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.3485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27664560079574585, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0, + "num_tokens": 32974737.0, + "reward": -0.13854889571666718, + "reward_std": 0.25994938611984253, + "rewards/cosine_scaled_reward/mean": -0.13854891061782837, + "rewards/cosine_scaled_reward/std": 0.35616254806518555, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1035.921875, + "completions/mean_terminated_length": 1003.274169921875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.3497142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3285360634326935, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0, + "num_tokens": 33051604.0, + "reward": 0.14512689411640167, + "reward_std": 0.28848469257354736, + "rewards/cosine_scaled_reward/mean": 0.14512689411640167, + "rewards/cosine_scaled_reward/std": 0.5476776361465454, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1240.765625, + "completions/mean_terminated_length": 899.933349609375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.35085714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3270239233970642, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0, + "num_tokens": 33141517.0, + "reward": -0.08620665967464447, + "reward_std": 0.3766360580921173, + "rewards/cosine_scaled_reward/mean": -0.08620666712522507, + "rewards/cosine_scaled_reward/std": 0.4342641830444336, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1546.78125, + "completions/mean_terminated_length": 1181.027099609375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27663469314575195, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0, + "num_tokens": 33251111.0, + "reward": -0.17574885487556458, + "reward_std": 0.2633781433105469, + "rewards/cosine_scaled_reward/mean": -0.17574885487556458, + "rewards/cosine_scaled_reward/std": 0.38026031851768494, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1443.6875, + "completions/mean_terminated_length": 1188.5333251953125, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "epoch": 0.35314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22700217366218567, + "learning_rate": 4.4724210845020494e-07, + "loss": -0.0, + "num_tokens": 33354707.0, + "reward": -0.10798169672489166, + "reward_std": 0.31306353211402893, + "rewards/cosine_scaled_reward/mean": -0.10798169672489166, + "rewards/cosine_scaled_reward/std": 0.4384790062904358, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1147.890625, + "completions/mean_terminated_length": 1037.350830078125, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.35428571428571426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29228872060775757, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0, + "num_tokens": 33438108.0, + "reward": -0.06270914524793625, + "reward_std": 0.3073654770851135, + "rewards/cosine_scaled_reward/mean": -0.06270914524793625, + "rewards/cosine_scaled_reward/std": 0.43214184045791626, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1250.640625, + "completions/mean_terminated_length": 1120.16357421875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.3554285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3056102991104126, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0, + "num_tokens": 33529405.0, + "reward": 0.10986693948507309, + "reward_std": 0.39667800068855286, + "rewards/cosine_scaled_reward/mean": 0.10986694693565369, + "rewards/cosine_scaled_reward/std": 0.521910548210144, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1711.0, + "completions/mean_length": 1222.3125, + "completions/mean_terminated_length": 759.1219482421875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3565714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3069155812263489, + "learning_rate": 4.3808955077581546e-07, + "loss": -0.0, + "num_tokens": 33618057.0, + "reward": 0.17035391926765442, + "reward_std": 0.1586351990699768, + "rewards/cosine_scaled_reward/mean": 0.17035391926765442, + "rewards/cosine_scaled_reward/std": 0.5358856916427612, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1873.0, + "completions/mean_length": 1280.859375, + "completions/mean_terminated_length": 956.95556640625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.3577142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2738092243671417, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0, + "num_tokens": 33710496.0, + "reward": 0.00472693145275116, + "reward_std": 0.24437010288238525, + "rewards/cosine_scaled_reward/mean": 0.004726938903331757, + "rewards/cosine_scaled_reward/std": 0.4307664930820465, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 1132.734375, + "completions/mean_terminated_length": 876.4599609375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.3588571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2913839817047119, + "learning_rate": 4.3201486961161093e-07, + "loss": -0.0, + "num_tokens": 33793223.0, + "reward": 0.14254246652126312, + "reward_std": 0.3005802035331726, + "rewards/cosine_scaled_reward/mean": 0.14254248142242432, + "rewards/cosine_scaled_reward/std": 0.48442843556404114, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1495.015625, + "completions/mean_terminated_length": 1205.357177734375, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23286940157413483, + "learning_rate": 4.2898608072313045e-07, + "loss": -0.0, + "num_tokens": 33899984.0, + "reward": -0.0238988995552063, + "reward_std": 0.3149425685405731, + "rewards/cosine_scaled_reward/mean": -0.0238988995552063, + "rewards/cosine_scaled_reward/std": 0.4462411403656006, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1440.78125, + "completions/mean_terminated_length": 1100.146240234375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.36114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2949734032154083, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0, + "num_tokens": 34004122.0, + "reward": -0.26918160915374756, + "reward_std": 0.18649792671203613, + "rewards/cosine_scaled_reward/mean": -0.26918160915374756, + "rewards/cosine_scaled_reward/std": 0.23089087009429932, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1237.984375, + "completions/mean_terminated_length": 990.0203857421875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.36228571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32614174485206604, + "learning_rate": 4.2294634442070553e-07, + "loss": -0.0, + "num_tokens": 34094209.0, + "reward": -0.1377859264612198, + "reward_std": 0.2551230788230896, + "rewards/cosine_scaled_reward/mean": -0.1377859115600586, + "rewards/cosine_scaled_reward/std": 0.4370906949043274, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 1091.515625, + "completions/mean_terminated_length": 870.7885131835938, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.36342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3238787055015564, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0, + "num_tokens": 34174002.0, + "reward": -0.1866554617881775, + "reward_std": 0.29696089029312134, + "rewards/cosine_scaled_reward/mean": -0.1866554617881775, + "rewards/cosine_scaled_reward/std": 0.3505614995956421, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 1241.90625, + "completions/mean_terminated_length": 1036.431396484375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.36457142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31191158294677734, + "learning_rate": 4.1693137748017915e-07, + "loss": -0.0, + "num_tokens": 34264540.0, + "reward": -0.16028451919555664, + "reward_std": 0.2337343692779541, + "rewards/cosine_scaled_reward/mean": -0.16028451919555664, + "rewards/cosine_scaled_reward/std": 0.2717510163784027, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 861.453125, + "completions/mean_terminated_length": 823.1773681640625, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.3657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34454721212387085, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0, + "num_tokens": 34329673.0, + "reward": -0.06567925214767456, + "reward_std": 0.31354451179504395, + "rewards/cosine_scaled_reward/mean": -0.06567925959825516, + "rewards/cosine_scaled_reward/std": 0.4225046932697296, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 921.53125, + "completions/mean_terminated_length": 866.131103515625, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.3668571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32410749793052673, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0, + "num_tokens": 34398491.0, + "reward": 0.1567627191543579, + "reward_std": 0.28847143054008484, + "rewards/cosine_scaled_reward/mean": 0.1567627191543579, + "rewards/cosine_scaled_reward/std": 0.5146859288215637, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1115.5625, + "completions/mean_terminated_length": 1001.0526123046875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39042168855667114, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0, + "num_tokens": 34481167.0, + "reward": -0.10124355554580688, + "reward_std": 0.33205243945121765, + "rewards/cosine_scaled_reward/mean": -0.10124355554580688, + "rewards/cosine_scaled_reward/std": 0.4005582928657532, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1136.703125, + "completions/mean_terminated_length": 967.9444580078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.36914285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3326353132724762, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0, + "num_tokens": 34564764.0, + "reward": -0.040179818868637085, + "reward_std": 0.24554386734962463, + "rewards/cosine_scaled_reward/mean": -0.040179818868637085, + "rewards/cosine_scaled_reward/std": 0.4389503598213196, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1133.90625, + "completions/mean_terminated_length": 944.188720703125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.3702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34735575318336487, + "learning_rate": 4.020100089676376e-07, + "loss": -0.0, + "num_tokens": 34648182.0, + "reward": -0.16276082396507263, + "reward_std": 0.29534196853637695, + "rewards/cosine_scaled_reward/mean": -0.16276082396507263, + "rewards/cosine_scaled_reward/std": 0.35879120230674744, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1301.296875, + "completions/mean_terminated_length": 961.8864135742188, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.37142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3051433265209198, + "learning_rate": 3.9904679361238526e-07, + "loss": -0.0, + "num_tokens": 34741921.0, + "reward": -0.026148229837417603, + "reward_std": 0.24914240837097168, + "rewards/cosine_scaled_reward/mean": -0.026148226112127304, + "rewards/cosine_scaled_reward/std": 0.47579747438430786, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1623.0, + "completions/mean_length": 933.8125, + "completions/mean_terminated_length": 879.016357421875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.37257142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3523712754249573, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0, + "num_tokens": 34811509.0, + "reward": -0.005365140736103058, + "reward_std": 0.42064523696899414, + "rewards/cosine_scaled_reward/mean": -0.005365140736103058, + "rewards/cosine_scaled_reward/std": 0.48223450779914856, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 1274.625, + "completions/mean_terminated_length": 972.0, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "epoch": 0.3737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25114986300468445, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0, + "num_tokens": 34902525.0, + "reward": 0.15240590274333954, + "reward_std": 0.36757412552833557, + "rewards/cosine_scaled_reward/mean": 0.15240588784217834, + "rewards/cosine_scaled_reward/std": 0.5200423002243042, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1287.765625, + "completions/mean_terminated_length": 1034.354248046875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.37485714285714283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31936249136924744, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0, + "num_tokens": 34996638.0, + "reward": -0.07551632821559906, + "reward_std": 0.28956252336502075, + "rewards/cosine_scaled_reward/mean": -0.07551632821559906, + "rewards/cosine_scaled_reward/std": 0.38629648089408875, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 968.734375, + "completions/mean_terminated_length": 836.1929931640625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3347271680831909, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0, + "num_tokens": 35068285.0, + "reward": 0.03636639565229416, + "reward_std": 0.2681905925273895, + "rewards/cosine_scaled_reward/mean": 0.03636638820171356, + "rewards/cosine_scaled_reward/std": 0.5459120273590088, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 917.171875, + "completions/mean_terminated_length": 778.2982788085938, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.37714285714285717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4763779938220978, + "learning_rate": 3.843439512918949e-07, + "loss": -0.0, + "num_tokens": 35137376.0, + "reward": 0.03270518034696579, + "reward_std": 0.3372231125831604, + "rewards/cosine_scaled_reward/mean": 0.03270517289638519, + "rewards/cosine_scaled_reward/std": 0.47484341263771057, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 1351.84375, + "completions/mean_terminated_length": 810.388916015625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3782857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5174200534820557, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0, + "num_tokens": 35234158.0, + "reward": -0.14917470514774323, + "reward_std": 0.23482158780097961, + "rewards/cosine_scaled_reward/mean": -0.14917470514774323, + "rewards/cosine_scaled_reward/std": 0.33059147000312805, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1160.9375, + "completions/mean_terminated_length": 956.2308349609375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.37942857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3024626076221466, + "learning_rate": 3.785183306423767e-07, + "loss": -0.0, + "num_tokens": 35319202.0, + "reward": -0.1432058960199356, + "reward_std": 0.219490647315979, + "rewards/cosine_scaled_reward/mean": -0.14320588111877441, + "rewards/cosine_scaled_reward/std": 0.3754548728466034, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1025.53125, + "completions/mean_terminated_length": 975.245849609375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.38057142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2969377338886261, + "learning_rate": 3.7561798609655373e-07, + "loss": -0.0, + "num_tokens": 35395524.0, + "reward": 0.06338316202163696, + "reward_std": 0.3955259919166565, + "rewards/cosine_scaled_reward/mean": 0.06338317692279816, + "rewards/cosine_scaled_reward/std": 0.4999569058418274, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1489.828125, + "completions/mean_terminated_length": 1082.5135498046875, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.38171428571428573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26992398500442505, + "learning_rate": 3.72726140684072e-07, + "loss": -0.0, + "num_tokens": 35501841.0, + "reward": -0.2190597951412201, + "reward_std": 0.20334219932556152, + "rewards/cosine_scaled_reward/mean": -0.2190597951412201, + "rewards/cosine_scaled_reward/std": 0.22991260886192322, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1254.953125, + "completions/mean_terminated_length": 1052.803955078125, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.38285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2687843441963196, + "learning_rate": 3.6984293534939737e-07, + "loss": -0.0, + "num_tokens": 35592238.0, + "reward": -0.008545689284801483, + "reward_std": 0.2865098714828491, + "rewards/cosine_scaled_reward/mean": -0.008545689284801483, + "rewards/cosine_scaled_reward/std": 0.46601414680480957, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1462.296875, + "completions/mean_terminated_length": 1133.731689453125, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28923502564430237, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0, + "num_tokens": 35696769.0, + "reward": 0.006126267835497856, + "reward_std": 0.34422457218170166, + "rewards/cosine_scaled_reward/mean": 0.006126277148723602, + "rewards/cosine_scaled_reward/std": 0.49070778489112854, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 1304.328125, + "completions/mean_terminated_length": 1035.34033203125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.3851428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2708565592765808, + "learning_rate": 3.641030065789562e-07, + "loss": -0.0, + "num_tokens": 35791238.0, + "reward": -0.1498498022556305, + "reward_std": 0.3901336193084717, + "rewards/cosine_scaled_reward/mean": -0.1498498022556305, + "rewards/cosine_scaled_reward/std": 0.4185183346271515, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1146.875, + "completions/mean_terminated_length": 871.0203857421875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.3862857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34217318892478943, + "learning_rate": 3.612465628992203e-07, + "loss": -0.0, + "num_tokens": 35874318.0, + "reward": 0.023914728313684464, + "reward_std": 0.3298850655555725, + "rewards/cosine_scaled_reward/mean": 0.023914728313684464, + "rewards/cosine_scaled_reward/std": 0.4540858268737793, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 1251.3125, + "completions/mean_terminated_length": 1120.9454345703125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.38742857142857146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.322914719581604, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0, + "num_tokens": 35965698.0, + "reward": -0.08145550638437271, + "reward_std": 0.2552027404308319, + "rewards/cosine_scaled_reward/mean": -0.0814555287361145, + "rewards/cosine_scaled_reward/std": 0.4186084270477295, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 1216.640625, + "completions/mean_terminated_length": 962.142822265625, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "epoch": 0.38857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2708609104156494, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0, + "num_tokens": 36053531.0, + "reward": -0.01538059115409851, + "reward_std": 0.282448410987854, + "rewards/cosine_scaled_reward/mean": -0.01538059115409851, + "rewards/cosine_scaled_reward/std": 0.47943398356437683, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1257.15625, + "completions/mean_terminated_length": 1035.719970703125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.38971428571428574, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.21827326714992523, + "learning_rate": 3.5273298394491515e-07, + "loss": -0.0, + "num_tokens": 36145061.0, + "reward": 0.12981322407722473, + "reward_std": 0.3501738905906677, + "rewards/cosine_scaled_reward/mean": 0.12981323897838593, + "rewards/cosine_scaled_reward/std": 0.5823574662208557, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1330.71875, + "completions/mean_terminated_length": 955.0, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.39085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3223852217197418, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0, + "num_tokens": 36240867.0, + "reward": -0.14867264032363892, + "reward_std": 0.36134976148605347, + "rewards/cosine_scaled_reward/mean": -0.14867264032363892, + "rewards/cosine_scaled_reward/std": 0.4149284064769745, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1322.796875, + "completions/mean_terminated_length": 1060.4893798828125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28214380145072937, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0, + "num_tokens": 36337262.0, + "reward": 0.05130603909492493, + "reward_std": 0.3060014545917511, + "rewards/cosine_scaled_reward/mean": 0.05130603164434433, + "rewards/cosine_scaled_reward/std": 0.4707089960575104, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1358.78125, + "completions/mean_terminated_length": 1129.041748046875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.3931428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28303998708724976, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0, + "num_tokens": 36434272.0, + "reward": 0.1340864598751068, + "reward_std": 0.3489719033241272, + "rewards/cosine_scaled_reward/mean": 0.1340864598751068, + "rewards/cosine_scaled_reward/std": 0.46820539236068726, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1194.65625, + "completions/mean_terminated_length": 1089.859619140625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.3942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31418564915657043, + "learning_rate": 3.4151678419606233e-07, + "loss": -0.0, + "num_tokens": 36520338.0, + "reward": 0.03204090893268585, + "reward_std": 0.2984616756439209, + "rewards/cosine_scaled_reward/mean": 0.03204091265797615, + "rewards/cosine_scaled_reward/std": 0.47096553444862366, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1274.703125, + "completions/mean_terminated_length": 1194.7069091796875, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "epoch": 0.3954285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2741748094558716, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0, + "num_tokens": 36613703.0, + "reward": -0.03510487079620361, + "reward_std": 0.34712469577789307, + "rewards/cosine_scaled_reward/mean": -0.03510487079620361, + "rewards/cosine_scaled_reward/std": 0.44494491815567017, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1292.078125, + "completions/mean_terminated_length": 1168.3818359375, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.3965714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2128373682498932, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0, + "num_tokens": 36706132.0, + "reward": -0.19367390871047974, + "reward_std": 0.26442888379096985, + "rewards/cosine_scaled_reward/mean": -0.19367390871047974, + "rewards/cosine_scaled_reward/std": 0.3002490699291229, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1143.234375, + "completions/mean_terminated_length": 934.4423217773438, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3682413697242737, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0, + "num_tokens": 36789859.0, + "reward": 0.050142791122198105, + "reward_std": 0.2848895788192749, + "rewards/cosine_scaled_reward/mean": 0.050142791122198105, + "rewards/cosine_scaled_reward/std": 0.5106508731842041, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1187.359375, + "completions/mean_terminated_length": 1098.32763671875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.39885714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3549482822418213, + "learning_rate": 3.3046315338757026e-07, + "loss": -0.0, + "num_tokens": 36877594.0, + "reward": 0.03635264188051224, + "reward_std": 0.40539801120758057, + "rewards/cosine_scaled_reward/mean": 0.036352649331092834, + "rewards/cosine_scaled_reward/std": 0.5298211574554443, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1020.984375, + "completions/mean_terminated_length": 952.5167236328125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3496515154838562, + "learning_rate": 3.2772616003709616e-07, + "loss": -0.0, + "num_tokens": 36952737.0, + "reward": 0.037382401525974274, + "reward_std": 0.38036224246025085, + "rewards/cosine_scaled_reward/mean": 0.03738240525126457, + "rewards/cosine_scaled_reward/std": 0.4981257915496826, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1289.59375, + "completions/mean_terminated_length": 1077.239990234375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.40114285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28009918332099915, + "learning_rate": 3.250000000000001e-07, + "loss": -0.0, + "num_tokens": 37046815.0, + "reward": -0.10745181888341904, + "reward_std": 0.3360505998134613, + "rewards/cosine_scaled_reward/mean": -0.10745181143283844, + "rewards/cosine_scaled_reward/std": 0.3718012869358063, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1274.25, + "completions/mean_terminated_length": 947.5556030273438, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.4022857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27298474311828613, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0, + "num_tokens": 37138951.0, + "reward": -0.012279238551855087, + "reward_std": 0.26211076974868774, + "rewards/cosine_scaled_reward/mean": -0.012279249727725983, + "rewards/cosine_scaled_reward/std": 0.4294367730617523, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 948.21875, + "completions/mean_terminated_length": 894.131103515625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.4034285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27804434299468994, + "learning_rate": 3.195807108082429e-07, + "loss": -0.0, + "num_tokens": 37210381.0, + "reward": 0.10156769305467606, + "reward_std": 0.355769544839859, + "rewards/cosine_scaled_reward/mean": 0.10156770050525665, + "rewards/cosine_scaled_reward/std": 0.5138735771179199, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 996.625, + "completions/mean_terminated_length": 926.5333862304688, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.4045714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31089261174201965, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0, + "num_tokens": 37284325.0, + "reward": 0.09626409411430359, + "reward_std": 0.28315430879592896, + "rewards/cosine_scaled_reward/mean": 0.09626409411430359, + "rewards/cosine_scaled_reward/std": 0.5148419141769409, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1041.6875, + "completions/mean_terminated_length": 992.1966552734375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.4057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2799144983291626, + "learning_rate": 3.142063423134644e-07, + "loss": -0.0, + "num_tokens": 37361673.0, + "reward": 0.026262100785970688, + "reward_std": 0.3633853793144226, + "rewards/cosine_scaled_reward/mean": 0.026262104511260986, + "rewards/cosine_scaled_reward/std": 0.47469663619995117, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1205.3125, + "completions/mean_terminated_length": 1049.25927734375, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.40685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.270648717880249, + "learning_rate": 3.115363310950578e-07, + "loss": -0.0, + "num_tokens": 37449733.0, + "reward": -0.0814027264714241, + "reward_std": 0.27949750423431396, + "rewards/cosine_scaled_reward/mean": -0.0814027190208435, + "rewards/cosine_scaled_reward/std": 0.4111027419567108, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1295.671875, + "completions/mean_terminated_length": 1139.5283203125, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25192537903785706, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0, + "num_tokens": 37543712.0, + "reward": -0.18691374361515045, + "reward_std": 0.24994581937789917, + "rewards/cosine_scaled_reward/mean": -0.18691372871398926, + "rewards/cosine_scaled_reward/std": 0.2994806170463562, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 1273.90625, + "completions/mean_terminated_length": 1095.269287109375, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.40914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544371485710144, + "learning_rate": 3.062313053727671e-07, + "loss": -0.0, + "num_tokens": 37636554.0, + "reward": 0.0533258393406868, + "reward_std": 0.4265974164009094, + "rewards/cosine_scaled_reward/mean": 0.0533258393406868, + "rewards/cosine_scaled_reward/std": 0.5119028091430664, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 962.1875, + "completions/mean_terminated_length": 908.7868041992188, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.4102857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31888896226882935, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0, + "num_tokens": 37708910.0, + "reward": -0.10390816628932953, + "reward_std": 0.28485608100891113, + "rewards/cosine_scaled_reward/mean": -0.10390816628932953, + "rewards/cosine_scaled_reward/std": 0.3938615918159485, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 974.71875, + "completions/mean_terminated_length": 940.0967407226562, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.4114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3155864477157593, + "learning_rate": 3.0097380284049523e-07, + "loss": -0.0, + "num_tokens": 37781628.0, + "reward": 0.08673460781574249, + "reward_std": 0.3896053731441498, + "rewards/cosine_scaled_reward/mean": 0.08673461526632309, + "rewards/cosine_scaled_reward/std": 0.49318408966064453, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 1219.171875, + "completions/mean_terminated_length": 1027.9039306640625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.4125714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30458390712738037, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0, + "num_tokens": 37869671.0, + "reward": -0.08344399929046631, + "reward_std": 0.29122042655944824, + "rewards/cosine_scaled_reward/mean": -0.0834440067410469, + "rewards/cosine_scaled_reward/std": 0.4114733040332794, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 846.421875, + "completions/mean_terminated_length": 827.3492431640625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.4137142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34244003891944885, + "learning_rate": 2.9576484845877793e-07, + "loss": -0.0, + "num_tokens": 37933410.0, + "reward": 0.08258837461471558, + "reward_std": 0.17118743062019348, + "rewards/cosine_scaled_reward/mean": 0.08258837461471558, + "rewards/cosine_scaled_reward/std": 0.4797515273094177, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 996.109375, + "completions/mean_terminated_length": 866.9298095703125, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.41485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32859963178634644, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0, + "num_tokens": 38007625.0, + "reward": 0.20837043225765228, + "reward_std": 0.31725478172302246, + "rewards/cosine_scaled_reward/mean": 0.20837046205997467, + "rewards/cosine_scaled_reward/std": 0.5618105530738831, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1105.671875, + "completions/mean_terminated_length": 989.9473876953125, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2958175539970398, + "learning_rate": 2.9060545772359305e-07, + "loss": -0.0, + "num_tokens": 38088468.0, + "reward": -0.2035731077194214, + "reward_std": 0.22497859597206116, + "rewards/cosine_scaled_reward/mean": -0.2035731077194214, + "rewards/cosine_scaled_reward/std": 0.25464943051338196, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1889.0, + "completions/mean_length": 1401.546875, + "completions/mean_terminated_length": 865.914306640625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.41714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2523272633552551, + "learning_rate": 2.8804466342921987e-07, + "loss": -0.0, + "num_tokens": 38190031.0, + "reward": 0.0627281665802002, + "reward_std": 0.2581511437892914, + "rewards/cosine_scaled_reward/mean": 0.0627281665802002, + "rewards/cosine_scaled_reward/std": 0.4917566180229187, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1241.171875, + "completions/mean_terminated_length": 994.1836547851562, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.41828571428571426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2996644675731659, + "learning_rate": 2.854966364683872e-07, + "loss": -0.0, + "num_tokens": 38279786.0, + "reward": 0.029350124299526215, + "reward_std": 0.3294476568698883, + "rewards/cosine_scaled_reward/mean": 0.029350124299526215, + "rewards/cosine_scaled_reward/std": 0.4382822811603546, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1368.3125, + "completions/mean_terminated_length": 1059.3636474609375, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.41942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25763261318206787, + "learning_rate": 2.829615010283344e-07, + "loss": -0.0, + "num_tokens": 38378350.0, + "reward": -0.04613512009382248, + "reward_std": 0.36061087250709534, + "rewards/cosine_scaled_reward/mean": -0.04613511264324188, + "rewards/cosine_scaled_reward/std": 0.45644786953926086, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 1498.40625, + "completions/mean_terminated_length": 913.3547973632812, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.4205714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29817602038383484, + "learning_rate": 2.8043938066798645e-07, + "loss": -0.0, + "num_tokens": 38487032.0, + "reward": -0.09489106386899948, + "reward_std": 0.26948118209838867, + "rewards/cosine_scaled_reward/mean": -0.09489107131958008, + "rewards/cosine_scaled_reward/std": 0.4007035791873932, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1269.0, + "completions/mean_terminated_length": 1070.431396484375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.4217142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3075028657913208, + "learning_rate": 2.7793039831193133e-07, + "loss": -0.0, + "num_tokens": 38578936.0, + "reward": -0.02169196307659149, + "reward_std": 0.35401207208633423, + "rewards/cosine_scaled_reward/mean": -0.02169196307659149, + "rewards/cosine_scaled_reward/std": 0.44533175230026245, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1262.359375, + "completions/mean_terminated_length": 1000.4791870117188, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.4228571428571429, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2736559510231018, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0, + "num_tokens": 38670095.0, + "reward": -0.03380996733903885, + "reward_std": 0.17631584405899048, + "rewards/cosine_scaled_reward/mean": -0.03380995988845825, + "rewards/cosine_scaled_reward/std": 0.4548901319503784, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 735.40625, + "completions/mean_terminated_length": 735.40625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35735684633255005, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0, + "num_tokens": 38727353.0, + "reward": 0.07788234949111938, + "reward_std": 0.21955034136772156, + "rewards/cosine_scaled_reward/mean": 0.07788234949111938, + "rewards/cosine_scaled_reward/std": 0.4429240822792053, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1332.609375, + "completions/mean_terminated_length": 1200.129638671875, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.42514285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2321436107158661, + "learning_rate": 2.7048349887476037e-07, + "loss": -0.0, + "num_tokens": 38823936.0, + "reward": 0.053345322608947754, + "reward_std": 0.2683081030845642, + "rewards/cosine_scaled_reward/mean": 0.053345322608947754, + "rewards/cosine_scaled_reward/std": 0.4978865087032318, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 835.65625, + "completions/mean_terminated_length": 776.03271484375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.42628571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3863072693347931, + "learning_rate": 2.6802828488599294e-07, + "loss": -0.0, + "num_tokens": 38887218.0, + "reward": -0.0681738406419754, + "reward_std": 0.2950044870376587, + "rewards/cosine_scaled_reward/mean": -0.0681738406419754, + "rewards/cosine_scaled_reward/std": 0.4459474980831146, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1059.671875, + "completions/mean_terminated_length": 957.4310302734375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.42742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29587796330451965, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0, + "num_tokens": 38965917.0, + "reward": -0.028351346030831337, + "reward_std": 0.3197570741176605, + "rewards/cosine_scaled_reward/mean": -0.028351355344057083, + "rewards/cosine_scaled_reward/std": 0.4428535997867584, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1786.0, + "completions/mean_length": 1256.453125, + "completions/mean_terminated_length": 896.6591186523438, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.42857142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2946600317955017, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0, + "num_tokens": 39057514.0, + "reward": 0.12926119565963745, + "reward_std": 0.350225031375885, + "rewards/cosine_scaled_reward/mean": 0.12926119565963745, + "rewards/cosine_scaled_reward/std": 0.5650700330734253, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1205.875, + "completions/mean_terminated_length": 1118.7586669921875, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.4297142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2932252287864685, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0, + "num_tokens": 39145122.0, + "reward": -0.12883727252483368, + "reward_std": 0.31050506234169006, + "rewards/cosine_scaled_reward/mean": -0.12883727252483368, + "rewards/cosine_scaled_reward/std": 0.403542160987854, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1333.046875, + "completions/mean_terminated_length": 1114.18359375, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.4308571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29586219787597656, + "learning_rate": 2.583460445215911e-07, + "loss": -0.0, + "num_tokens": 39241533.0, + "reward": -0.10611157864332199, + "reward_std": 0.32782500982284546, + "rewards/cosine_scaled_reward/mean": -0.10611158609390259, + "rewards/cosine_scaled_reward/std": 0.4070427119731903, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1046.921875, + "completions/mean_terminated_length": 962.0847778320312, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48897626996040344, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0, + "num_tokens": 39319344.0, + "reward": 0.08191946148872375, + "reward_std": 0.35094010829925537, + "rewards/cosine_scaled_reward/mean": 0.08191945403814316, + "rewards/cosine_scaled_reward/std": 0.4611974060535431, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1298.796875, + "completions/mean_terminated_length": 982.4666748046875, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.43314285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2892192602157593, + "learning_rate": 2.5358974294659373e-07, + "loss": -0.0, + "num_tokens": 39413075.0, + "reward": -0.17867997288703918, + "reward_std": 0.23621207475662231, + "rewards/cosine_scaled_reward/mean": -0.17867997288703918, + "rewards/cosine_scaled_reward/std": 0.31146153807640076, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 1132.84375, + "completions/mean_terminated_length": 921.6538696289062, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.4342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2403450012207031, + "learning_rate": 2.512332043064913e-07, + "loss": -0.0, + "num_tokens": 39495881.0, + "reward": -0.06327217072248459, + "reward_std": 0.3884689509868622, + "rewards/cosine_scaled_reward/mean": -0.06327217072248459, + "rewards/cosine_scaled_reward/std": 0.4420482814311981, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1459.25, + "completions/mean_terminated_length": 939.7647094726562, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.43542857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343969464302063, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0, + "num_tokens": 39600001.0, + "reward": -0.2608669698238373, + "reward_std": 0.14979779720306396, + "rewards/cosine_scaled_reward/mean": -0.2608669698238373, + "rewards/cosine_scaled_reward/std": 0.1791483461856842, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1051.484375, + "completions/mean_terminated_length": 929.1052856445312, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.43657142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3288431167602539, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0, + "num_tokens": 39677168.0, + "reward": -0.12583154439926147, + "reward_std": 0.3182182312011719, + "rewards/cosine_scaled_reward/mean": -0.12583155930042267, + "rewards/cosine_scaled_reward/std": 0.3559519946575165, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1795.0, + "completions/mean_length": 1156.90625, + "completions/mean_terminated_length": 951.269287109375, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.4377142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32613274455070496, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0, + "num_tokens": 39762146.0, + "reward": 0.1630343198776245, + "reward_std": 0.3413184881210327, + "rewards/cosine_scaled_reward/mean": 0.1630343198776245, + "rewards/cosine_scaled_reward/std": 0.5236021876335144, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1131.609375, + "completions/mean_terminated_length": 1019.0701904296875, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.43885714285714283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2989339232444763, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.0, + "num_tokens": 39844729.0, + "reward": 0.34208860993385315, + "reward_std": 0.4012106657028198, + "rewards/cosine_scaled_reward/mean": 0.34208860993385315, + "rewards/cosine_scaled_reward/std": 0.5298689007759094, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1234.875, + "completions/mean_terminated_length": 1135.017578125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29750096797943115, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0, + "num_tokens": 39934849.0, + "reward": -0.14285171031951904, + "reward_std": 0.34338319301605225, + "rewards/cosine_scaled_reward/mean": -0.14285171031951904, + "rewards/cosine_scaled_reward/std": 0.38804200291633606, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1118.171875, + "completions/mean_terminated_length": 966.0181274414062, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.44114285714285717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33167222142219543, + "learning_rate": 2.374037332934512e-07, + "loss": -0.0, + "num_tokens": 40016788.0, + "reward": 0.05989622324705124, + "reward_std": 0.2561216652393341, + "rewards/cosine_scaled_reward/mean": 0.059896212071180344, + "rewards/cosine_scaled_reward/std": 0.5094225406646729, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1285.734375, + "completions/mean_terminated_length": 1052.3876953125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.4422857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2604723572731018, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0, + "num_tokens": 40109931.0, + "reward": 0.01040782779455185, + "reward_std": 0.24030566215515137, + "rewards/cosine_scaled_reward/mean": 0.010407820343971252, + "rewards/cosine_scaled_reward/std": 0.4113282859325409, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 1145.28125, + "completions/mean_terminated_length": 997.5635986328125, + "completions/min_length": 542.0, + "completions/min_terminated_length": 542.0, + "epoch": 0.44342857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2968713939189911, + "learning_rate": 2.3291460551638237e-07, + "loss": -0.0, + "num_tokens": 40193453.0, + "reward": 0.0958661362528801, + "reward_std": 0.3359958529472351, + "rewards/cosine_scaled_reward/mean": 0.0958661437034607, + "rewards/cosine_scaled_reward/std": 0.5019846558570862, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1089.265625, + "completions/mean_terminated_length": 1042.11474609375, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.44457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2655465602874756, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0, + "num_tokens": 40273326.0, + "reward": -0.06952888518571854, + "reward_std": 0.31117042899131775, + "rewards/cosine_scaled_reward/mean": -0.06952889263629913, + "rewards/cosine_scaled_reward/std": 0.4088662266731262, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 1239.234375, + "completions/mean_terminated_length": 897.7555541992188, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.44571428571428573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30228230357170105, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0, + "num_tokens": 40364573.0, + "reward": -0.06515465676784515, + "reward_std": 0.2851213216781616, + "rewards/cosine_scaled_reward/mean": -0.06515465676784515, + "rewards/cosine_scaled_reward/std": 0.4545634388923645, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 1129.953125, + "completions/mean_terminated_length": 823.9375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.44685714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3024221360683441, + "learning_rate": 2.2629708984760706e-07, + "loss": -0.0, + "num_tokens": 40447010.0, + "reward": 0.1165984496474266, + "reward_std": 0.3462623953819275, + "rewards/cosine_scaled_reward/mean": 0.1165984570980072, + "rewards/cosine_scaled_reward/std": 0.5021436810493469, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1247.84375, + "completions/mean_terminated_length": 1002.89794921875, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.319551557302475, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0, + "num_tokens": 40537200.0, + "reward": -0.010323913767933846, + "reward_std": 0.38794031739234924, + "rewards/cosine_scaled_reward/mean": -0.010323913767933846, + "rewards/cosine_scaled_reward/std": 0.4609202742576599, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1875.0, + "completions/mean_length": 1196.125, + "completions/mean_terminated_length": 912.1666870117188, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.4491428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3175903558731079, + "learning_rate": 2.2196411766036487e-07, + "loss": -0.0, + "num_tokens": 40624728.0, + "reward": -0.0534963496029377, + "reward_std": 0.37004372477531433, + "rewards/cosine_scaled_reward/mean": -0.053496353328228, + "rewards/cosine_scaled_reward/std": 0.4389037489891052, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 1345.265625, + "completions/mean_terminated_length": 1002.0697631835938, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4502857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33591559529304504, + "learning_rate": 2.1982156097370557e-07, + "loss": -0.0, + "num_tokens": 40720825.0, + "reward": -0.1941550076007843, + "reward_std": 0.2748720049858093, + "rewards/cosine_scaled_reward/mean": -0.1941550076007843, + "rewards/cosine_scaled_reward/std": 0.3162692189216614, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1036.140625, + "completions/mean_terminated_length": 891.58935546875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5138213038444519, + "learning_rate": 2.1769509671835223e-07, + "loss": -0.0, + "num_tokens": 40797130.0, + "reward": -0.037831805646419525, + "reward_std": 0.3481375575065613, + "rewards/cosine_scaled_reward/mean": -0.03783179447054863, + "rewards/cosine_scaled_reward/std": 0.4529452323913574, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1032.9375, + "completions/mean_terminated_length": 1000.1935424804688, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "epoch": 0.45257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2617109417915344, + "learning_rate": 2.1558482853517253e-07, + "loss": -0.0, + "num_tokens": 40874294.0, + "reward": 0.11380182206630707, + "reward_std": 0.25549638271331787, + "rewards/cosine_scaled_reward/mean": 0.11380180716514587, + "rewards/cosine_scaled_reward/std": 0.4604811370372772, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1137.34375, + "completions/mean_terminated_length": 988.3272705078125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.45371428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27592313289642334, + "learning_rate": 2.134908592756607e-07, + "loss": -0.0, + "num_tokens": 40958172.0, + "reward": -0.13230225443840027, + "reward_std": 0.27906233072280884, + "rewards/cosine_scaled_reward/mean": -0.13230225443840027, + "rewards/cosine_scaled_reward/std": 0.35712045431137085, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 1069.703125, + "completions/mean_terminated_length": 968.5, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "epoch": 0.45485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.273295521736145, + "learning_rate": 2.1141329099692406e-07, + "loss": -0.0, + "num_tokens": 41037137.0, + "reward": -0.10481224954128265, + "reward_std": 0.22134995460510254, + "rewards/cosine_scaled_reward/mean": -0.10481224209070206, + "rewards/cosine_scaled_reward/std": 0.3863997161388397, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 1130.65625, + "completions/mean_terminated_length": 999.607177734375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2677765190601349, + "learning_rate": 2.0935222495670968e-07, + "loss": -0.0, + "num_tokens": 41119507.0, + "reward": 0.10938499867916107, + "reward_std": 0.43441423773765564, + "rewards/cosine_scaled_reward/mean": 0.10938498377799988, + "rewards/cosine_scaled_reward/std": 0.5402286052703857, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 906.828125, + "completions/mean_terminated_length": 850.7048950195312, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.45714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29809895157814026, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0, + "num_tokens": 41187928.0, + "reward": 0.18382114171981812, + "reward_std": 0.31220924854278564, + "rewards/cosine_scaled_reward/mean": 0.18382114171981812, + "rewards/cosine_scaled_reward/std": 0.5266857147216797, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1340.515625, + "completions/mean_terminated_length": 1018.9318237304688, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.4582857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3078423738479614, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0, + "num_tokens": 41284265.0, + "reward": -0.011224612593650818, + "reward_std": 0.2520029842853546, + "rewards/cosine_scaled_reward/mean": -0.011224620044231415, + "rewards/cosine_scaled_reward/std": 0.4714611768722534, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1066.4375, + "completions/mean_terminated_length": 945.8947143554688, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.4594285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37535738945007324, + "learning_rate": 2.032690407508949e-07, + "loss": -0.0, + "num_tokens": 41364277.0, + "reward": 0.03628683090209961, + "reward_std": 0.20915059745311737, + "rewards/cosine_scaled_reward/mean": 0.036286838352680206, + "rewards/cosine_scaled_reward/std": 0.4816150367259979, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1007.15625, + "completions/mean_terminated_length": 955.9671630859375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.4605714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3474806249141693, + "learning_rate": 2.0127498008311922e-07, + "loss": -0.0, + "num_tokens": 41439895.0, + "reward": 0.06987690180540085, + "reward_std": 0.2849646508693695, + "rewards/cosine_scaled_reward/mean": 0.06987689435482025, + "rewards/cosine_scaled_reward/std": 0.46470174193382263, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 1077.96875, + "completions/mean_terminated_length": 939.3928833007812, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.4617142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.359544575214386, + "learning_rate": 1.9929791578083655e-07, + "loss": -0.0, + "num_tokens": 41520141.0, + "reward": -0.04401617869734764, + "reward_std": 0.2067043036222458, + "rewards/cosine_scaled_reward/mean": -0.044016171246767044, + "rewards/cosine_scaled_reward/std": 0.48886072635650635, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 1000.875, + "completions/mean_terminated_length": 829.5272216796875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.46285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3358437418937683, + "learning_rate": 1.9733794420337213e-07, + "loss": -0.0, + "num_tokens": 41594621.0, + "reward": 0.17152394354343414, + "reward_std": 0.2651253044605255, + "rewards/cosine_scaled_reward/mean": 0.17152394354343414, + "rewards/cosine_scaled_reward/std": 0.4771791100502014, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 1146.703125, + "completions/mean_terminated_length": 979.7963256835938, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2471207231283188, + "learning_rate": 1.9539516087697517e-07, + "loss": -0.0, + "num_tokens": 41678786.0, + "reward": -0.03732556849718094, + "reward_std": 0.2710352838039398, + "rewards/cosine_scaled_reward/mean": -0.03732557222247124, + "rewards/cosine_scaled_reward/std": 0.4403770864009857, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1786.0, + "completions/mean_length": 1215.28125, + "completions/mean_terminated_length": 863.6889038085938, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.46514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2744766175746918, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0, + "num_tokens": 41767988.0, + "reward": -0.011735863983631134, + "reward_std": 0.19990617036819458, + "rewards/cosine_scaled_reward/mean": -0.011735863983631134, + "rewards/cosine_scaled_reward/std": 0.4528859853744507, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1203.65625, + "completions/mean_terminated_length": 967.239990234375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.4662857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2612498104572296, + "learning_rate": 1.915615368891117e-07, + "loss": -0.0, + "num_tokens": 41856158.0, + "reward": -0.05038873106241226, + "reward_std": 0.2517946660518646, + "rewards/cosine_scaled_reward/mean": -0.050388723611831665, + "rewards/cosine_scaled_reward/std": 0.4285145699977875, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1454.125, + "completions/mean_terminated_length": 1203.3778076171875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.4674285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2793102562427521, + "learning_rate": 1.8967088307307e-07, + "loss": 0.0, + "num_tokens": 41959918.0, + "reward": 0.01714620739221573, + "reward_std": 0.3343357443809509, + "rewards/cosine_scaled_reward/mean": 0.017146214842796326, + "rewards/cosine_scaled_reward/std": 0.4367195963859558, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 1185.5625, + "completions/mean_terminated_length": 764.3720703125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.4685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0585436820983887, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0, + "num_tokens": 42046714.0, + "reward": -0.08598305284976959, + "reward_std": 0.25422143936157227, + "rewards/cosine_scaled_reward/mean": -0.08598306030035019, + "rewards/cosine_scaled_reward/std": 0.4083198010921478, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1347.625, + "completions/mean_terminated_length": 1029.272705078125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.4697142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2592661678791046, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.0, + "num_tokens": 42143370.0, + "reward": 0.027512095868587494, + "reward_std": 0.32514244318008423, + "rewards/cosine_scaled_reward/mean": 0.027512073516845703, + "rewards/cosine_scaled_reward/std": 0.47743892669677734, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 1154.078125, + "completions/mean_terminated_length": 903.7799682617188, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.47085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31956759095191956, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0, + "num_tokens": 42227639.0, + "reward": 0.010800749063491821, + "reward_std": 0.23627053201198578, + "rewards/cosine_scaled_reward/mean": 0.010800749063491821, + "rewards/cosine_scaled_reward/std": 0.4793511629104614, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1177.59375, + "completions/mean_terminated_length": 933.8800048828125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31341809034347534, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0, + "num_tokens": 42313381.0, + "reward": 0.07047475129365921, + "reward_std": 0.2826780080795288, + "rewards/cosine_scaled_reward/mean": 0.07047474384307861, + "rewards/cosine_scaled_reward/std": 0.45719414949417114, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 1556.25, + "completions/mean_terminated_length": 1280.39013671875, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.47314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23267346620559692, + "learning_rate": 1.804828558898332e-07, + "loss": 0.0, + "num_tokens": 42423565.0, + "reward": -0.11575190722942352, + "reward_std": 0.3371303081512451, + "rewards/cosine_scaled_reward/mean": -0.11575192213058472, + "rewards/cosine_scaled_reward/std": 0.41621309518814087, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1529.71875, + "completions/mean_terminated_length": 1175.105224609375, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "epoch": 0.4742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24718843400478363, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0, + "num_tokens": 42532403.0, + "reward": -0.09849782288074493, + "reward_std": 0.34412887692451477, + "rewards/cosine_scaled_reward/mean": -0.09849782288074493, + "rewards/cosine_scaled_reward/std": 0.3836482763290405, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1171.453125, + "completions/mean_terminated_length": 969.173095703125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.4754285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2948278486728668, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0, + "num_tokens": 42618640.0, + "reward": 0.06686967611312866, + "reward_std": 0.3304000198841095, + "rewards/cosine_scaled_reward/mean": 0.06686967611312866, + "rewards/cosine_scaled_reward/std": 0.5081449151039124, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 1325.46875, + "completions/mean_terminated_length": 1020.4000244140625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4765714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3314505815505981, + "learning_rate": 1.7518544168045524e-07, + "loss": -0.0, + "num_tokens": 42714030.0, + "reward": -0.11587303131818771, + "reward_std": 0.26352766156196594, + "rewards/cosine_scaled_reward/mean": -0.11587303131818771, + "rewards/cosine_scaled_reward/std": 0.37798386812210083, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1043.359375, + "completions/mean_terminated_length": 878.963623046875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.4777142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36372727155685425, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.0, + "num_tokens": 42791701.0, + "reward": 0.1487215757369995, + "reward_std": 0.28281038999557495, + "rewards/cosine_scaled_reward/mean": 0.1487215757369995, + "rewards/cosine_scaled_reward/std": 0.5075209736824036, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1088.0625, + "completions/mean_terminated_length": 1006.7118530273438, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.47885714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3068139851093292, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.0, + "num_tokens": 42871841.0, + "reward": 0.07801510393619537, + "reward_std": 0.35038578510284424, + "rewards/cosine_scaled_reward/mean": 0.07801511138677597, + "rewards/cosine_scaled_reward/std": 0.4775239825248718, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 940.078125, + "completions/mean_terminated_length": 846.1864624023438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3610937297344208, + "learning_rate": 1.7005243352409333e-07, + "loss": -0.0, + "num_tokens": 42942070.0, + "reward": -0.12704703211784363, + "reward_std": 0.20476336777210236, + "rewards/cosine_scaled_reward/mean": -0.12704703211784363, + "rewards/cosine_scaled_reward/std": 0.40539950132369995, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1208.578125, + "completions/mean_terminated_length": 928.7708740234375, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.48114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32972097396850586, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0, + "num_tokens": 43030259.0, + "reward": -0.21137985587120056, + "reward_std": 0.30565890669822693, + "rewards/cosine_scaled_reward/mean": -0.21137985587120056, + "rewards/cosine_scaled_reward/std": 0.3228145241737366, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1381.359375, + "completions/mean_terminated_length": 1078.3409423828125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.48228571428571426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2809450030326843, + "learning_rate": 1.6672287963562852e-07, + "loss": -0.0, + "num_tokens": 43129058.0, + "reward": -0.04460533708333969, + "reward_std": 0.33481448888778687, + "rewards/cosine_scaled_reward/mean": -0.04460533708333969, + "rewards/cosine_scaled_reward/std": 0.43994688987731934, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1365.125, + "completions/mean_terminated_length": 955.4000244140625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.48342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3238787353038788, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0, + "num_tokens": 43227610.0, + "reward": -0.19622239470481873, + "reward_std": 0.22936534881591797, + "rewards/cosine_scaled_reward/mean": -0.19622239470481873, + "rewards/cosine_scaled_reward/std": 0.3362935781478882, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1464.640625, + "completions/mean_terminated_length": 1137.3902587890625, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.4845714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25315195322036743, + "learning_rate": 1.6346804638120098e-07, + "loss": -0.0, + "num_tokens": 43332483.0, + "reward": -0.15073364973068237, + "reward_std": 0.23451551795005798, + "rewards/cosine_scaled_reward/mean": -0.15073364973068237, + "rewards/cosine_scaled_reward/std": 0.3537865877151489, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 1083.984375, + "completions/mean_terminated_length": 926.236328125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.4857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3034358322620392, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0, + "num_tokens": 43412378.0, + "reward": 0.4011744260787964, + "reward_std": 0.3350656032562256, + "rewards/cosine_scaled_reward/mean": 0.4011744260787964, + "rewards/cosine_scaled_reward/std": 0.5258311033248901, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1135.625, + "completions/mean_terminated_length": 880.1599731445312, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.4868571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36451342701911926, + "learning_rate": 1.6028856829700258e-07, + "loss": -0.0, + "num_tokens": 43495402.0, + "reward": -0.029252879321575165, + "reward_std": 0.28142860531806946, + "rewards/cosine_scaled_reward/mean": -0.02925288677215576, + "rewards/cosine_scaled_reward/std": 0.4736383259296417, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1424.8125, + "completions/mean_terminated_length": 1161.68896484375, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2642489969730377, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0, + "num_tokens": 43598582.0, + "reward": -0.0024358853697776794, + "reward_std": 0.33131521940231323, + "rewards/cosine_scaled_reward/mean": -0.0024358853697776794, + "rewards/cosine_scaled_reward/std": 0.4573180377483368, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1293.875, + "completions/mean_terminated_length": 1082.719970703125, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.48914285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3077866733074188, + "learning_rate": 1.5718506522858572e-07, + "loss": -0.0, + "num_tokens": 43691910.0, + "reward": -0.10191977024078369, + "reward_std": 0.3159896731376648, + "rewards/cosine_scaled_reward/mean": -0.10191977024078369, + "rewards/cosine_scaled_reward/std": 0.4219300150871277, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1786.0, + "completions/mean_length": 898.8125, + "completions/mean_terminated_length": 861.7418823242188, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.49028571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3498988449573517, + "learning_rate": 1.5566199398026147e-07, + "loss": -0.0, + "num_tokens": 43759466.0, + "reward": -0.04474419355392456, + "reward_std": 0.3491528630256653, + "rewards/cosine_scaled_reward/mean": -0.04474419355392456, + "rewards/cosine_scaled_reward/std": 0.44242092967033386, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1135.28125, + "completions/mean_terminated_length": 966.25927734375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.49142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2568611204624176, + "learning_rate": 1.5415814221002265e-07, + "loss": -0.0, + "num_tokens": 43842868.0, + "reward": 0.05037510395050049, + "reward_std": 0.32187074422836304, + "rewards/cosine_scaled_reward/mean": 0.05037510767579079, + "rewards/cosine_scaled_reward/std": 0.4334285259246826, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1495.0, + "completions/mean_length": 1000.03125, + "completions/mean_terminated_length": 850.3214721679688, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.49257142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3356950581073761, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0, + "num_tokens": 43918070.0, + "reward": -0.04200819134712219, + "reward_std": 0.24933312833309174, + "rewards/cosine_scaled_reward/mean": -0.04200819879770279, + "rewards/cosine_scaled_reward/std": 0.41009721159935, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1408.578125, + "completions/mean_terminated_length": 1212.836669921875, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "epoch": 0.4937142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2495565265417099, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0, + "num_tokens": 44018947.0, + "reward": 0.05039107799530029, + "reward_std": 0.36211979389190674, + "rewards/cosine_scaled_reward/mean": 0.050391070544719696, + "rewards/cosine_scaled_reward/std": 0.4851328134536743, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1411.671875, + "completions/mean_terminated_length": 1078.357177734375, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "epoch": 0.4948571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2228534072637558, + "learning_rate": 1.4976263201891613e-07, + "loss": -0.0, + "num_tokens": 44119126.0, + "reward": -0.0001517757773399353, + "reward_std": 0.23533518612384796, + "rewards/cosine_scaled_reward/mean": -0.00015176832675933838, + "rewards/cosine_scaled_reward/std": 0.5502965450286865, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1365.3125, + "completions/mean_terminated_length": 1238.888916015625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24571111798286438, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0, + "num_tokens": 44217810.0, + "reward": -0.2489815354347229, + "reward_std": 0.19901177287101746, + "rewards/cosine_scaled_reward/mean": -0.2489815354347229, + "rewards/cosine_scaled_reward/std": 0.22070743143558502, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 843.03125, + "completions/mean_terminated_length": 823.90478515625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.49714285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3952166438102722, + "learning_rate": 1.469297078922642e-07, + "loss": -0.0, + "num_tokens": 44282588.0, + "reward": -0.11719170212745667, + "reward_std": 0.3373968005180359, + "rewards/cosine_scaled_reward/mean": -0.11719170212745667, + "rewards/cosine_scaled_reward/std": 0.3970893919467926, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 1012.6875, + "completions/mean_terminated_length": 773.769287109375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.4982857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3275962471961975, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0, + "num_tokens": 44357752.0, + "reward": 0.11989711225032806, + "reward_std": 0.2525637149810791, + "rewards/cosine_scaled_reward/mean": 0.11989710479974747, + "rewards/cosine_scaled_reward/std": 0.5183996558189392, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1301.09375, + "completions/mean_terminated_length": 1146.075439453125, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.49942857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26017865538597107, + "learning_rate": 1.4417536311769885e-07, + "loss": -0.0, + "num_tokens": 44451526.0, + "reward": -0.05758136138319969, + "reward_std": 0.2695399820804596, + "rewards/cosine_scaled_reward/mean": -0.05758136883378029, + "rewards/cosine_scaled_reward/std": 0.47205057740211487, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1405.5, + "completions/mean_terminated_length": 1068.952392578125, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.5005714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2330140769481659, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0, + "num_tokens": 44551918.0, + "reward": -0.1377682238817215, + "reward_std": 0.30102482438087463, + "rewards/cosine_scaled_reward/mean": -0.1377682238817215, + "rewards/cosine_scaled_reward/std": 0.3792971670627594, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1652.0, + "completions/mean_length": 1203.09375, + "completions/mean_terminated_length": 966.5199584960938, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.5017142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2796766459941864, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0, + "num_tokens": 44640548.0, + "reward": -0.11659872531890869, + "reward_std": 0.16560694575309753, + "rewards/cosine_scaled_reward/mean": -0.1165987178683281, + "rewards/cosine_scaled_reward/std": 0.4298913776874542, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1145.890625, + "completions/mean_terminated_length": 869.7346801757812, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.5028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33164745569229126, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0, + "num_tokens": 44724453.0, + "reward": -0.21655994653701782, + "reward_std": 0.2501342296600342, + "rewards/cosine_scaled_reward/mean": -0.21655994653701782, + "rewards/cosine_scaled_reward/std": 0.3220486342906952, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1332.515625, + "completions/mean_terminated_length": 1052.54345703125, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3051064908504486, + "learning_rate": 1.3890454406082956e-07, + "loss": -0.0, + "num_tokens": 44820470.0, + "reward": -0.011267155408859253, + "reward_std": 0.28055429458618164, + "rewards/cosine_scaled_reward/mean": -0.011267140507698059, + "rewards/cosine_scaled_reward/std": 0.5222532153129578, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1080.984375, + "completions/mean_terminated_length": 962.2280883789062, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.5051428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27710723876953125, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0, + "num_tokens": 44901133.0, + "reward": -0.007066421210765839, + "reward_std": 0.31837818026542664, + "rewards/cosine_scaled_reward/mean": -0.007066421210765839, + "rewards/cosine_scaled_reward/std": 0.5073189735412598, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 1404.0, + "completions/mean_terminated_length": 934.0540771484375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.5062857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26647108793258667, + "learning_rate": 1.3638909733514452e-07, + "loss": -0.0, + "num_tokens": 45002053.0, + "reward": -0.07669035345315933, + "reward_std": 0.1767852008342743, + "rewards/cosine_scaled_reward/mean": -0.07669034600257874, + "rewards/cosine_scaled_reward/std": 0.44851550459861755, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1676.0, + "completions/mean_length": 1100.6875, + "completions/mean_terminated_length": 945.6726684570312, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.5074285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29792189598083496, + "learning_rate": 1.351615817851748e-07, + "loss": -0.0, + "num_tokens": 45083105.0, + "reward": -0.0347767099738121, + "reward_std": 0.3655226528644562, + "rewards/cosine_scaled_reward/mean": -0.034776702523231506, + "rewards/cosine_scaled_reward/std": 0.45551741123199463, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1170.5, + "completions/mean_terminated_length": 1045.1429443359375, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.5085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26108649373054504, + "learning_rate": 1.3395428487445914e-07, + "loss": -0.0, + "num_tokens": 45168689.0, + "reward": -0.07436345517635345, + "reward_std": 0.32115882635116577, + "rewards/cosine_scaled_reward/mean": -0.07436345517635345, + "rewards/cosine_scaled_reward/std": 0.4131569564342499, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1245.140625, + "completions/mean_terminated_length": 1078.5093994140625, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.5097142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.266167551279068, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.0, + "num_tokens": 45260370.0, + "reward": -0.008524402976036072, + "reward_std": 0.33043521642684937, + "rewards/cosine_scaled_reward/mean": -0.00852440670132637, + "rewards/cosine_scaled_reward/std": 0.5104613304138184, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1028.96875, + "completions/mean_terminated_length": 883.3928833007812, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.5108571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3669893145561218, + "learning_rate": 1.316005813502869e-07, + "loss": -0.0, + "num_tokens": 45336232.0, + "reward": -0.0438205823302269, + "reward_std": 0.3060353994369507, + "rewards/cosine_scaled_reward/mean": -0.0438205823302269, + "rewards/cosine_scaled_reward/std": 0.45601826906204224, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1733.0, + "completions/mean_length": 1018.25, + "completions/mean_terminated_length": 911.72412109375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3722478151321411, + "learning_rate": 1.3045428945301953e-07, + "loss": -0.0, + "num_tokens": 45411176.0, + "reward": -0.05109614133834839, + "reward_std": 0.23726628720760345, + "rewards/cosine_scaled_reward/mean": -0.05109614133834839, + "rewards/cosine_scaled_reward/std": 0.4964829981327057, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1030.25, + "completions/mean_terminated_length": 924.9655151367188, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5131428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3326222896575928, + "learning_rate": 1.2932844562179352e-07, + "loss": -0.0, + "num_tokens": 45487976.0, + "reward": -0.2593098282814026, + "reward_std": 0.1575671136379242, + "rewards/cosine_scaled_reward/mean": -0.2593098282814026, + "rewards/cosine_scaled_reward/std": 0.24032087624073029, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1070.453125, + "completions/mean_terminated_length": 950.4035034179688, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.5142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31410908699035645, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0, + "num_tokens": 45566933.0, + "reward": -0.05131612718105316, + "reward_std": 0.24355947971343994, + "rewards/cosine_scaled_reward/mean": -0.05131613463163376, + "rewards/cosine_scaled_reward/std": 0.4501807391643524, + "step": 450 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 45566933, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}