diff --git "a/checkpoint-350/trainer_state.json" "b/checkpoint-350/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-350/trainer_state.json" @@ -0,0 +1,8784 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4, + "eval_steps": 500, + "global_step": 350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2837817668914795, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": -0.09800112247467041, + "reward_std": 0.3028089702129364, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2421981245279312, + "learning_rate": 2e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.020556632429361343, + "reward_std": 0.3545936942100525, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 1952.234375, + "completions/mean_terminated_length": 822.2000122070312, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24851329624652863, + "learning_rate": 4e-08, + "loss": -0.0, + "num_tokens": 375163.0, + "reward": -0.22721199691295624, + "reward_std": 0.14563649892807007, + "rewards/cosine_scaled_reward/mean": -0.22721199691295624, + "rewards/cosine_scaled_reward/std": 0.1709199845790863, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 1554.109375, + "completions/mean_terminated_length": 958.0344848632812, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29272863268852234, + "learning_rate": 6e-08, + "loss": -0.0, + "num_tokens": 484434.0, + "reward": -0.17542189359664917, + "reward_std": 0.18219107389450073, + "rewards/cosine_scaled_reward/mean": -0.17542189359664917, + "rewards/cosine_scaled_reward/std": 0.27975013852119446, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1943.0625, + "completions/mean_terminated_length": 1088.571533203125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2773251533508301, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 619606.0, + "reward": -0.2648562788963318, + "reward_std": 0.21638144552707672, + "rewards/cosine_scaled_reward/mean": -0.2648562788963318, + "rewards/cosine_scaled_reward/std": 0.23959198594093323, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1854.21875, + "completions/mean_terminated_length": 920.5454711914062, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27399909496307373, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 749924.0, + "reward": -0.19292885065078735, + "reward_std": 0.2666770815849304, + "rewards/cosine_scaled_reward/mean": -0.19292885065078735, + "rewards/cosine_scaled_reward/std": 0.295730322599411, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 1940.5625, + "completions/mean_terminated_length": 1065.71435546875, + "completions/min_length": 773.0, + "completions/min_terminated_length": 773.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23362359404563904, + "learning_rate": 1.2e-07, + "loss": 0.0, + "num_tokens": 884528.0, + "reward": -0.18198424577713013, + "reward_std": 0.18540163338184357, + "rewards/cosine_scaled_reward/mean": -0.18198424577713013, + "rewards/cosine_scaled_reward/std": 0.32407456636428833, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1708.5625, + "completions/mean_terminated_length": 1013.5238037109375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24677562713623047, + "learning_rate": 1.4e-07, + "loss": -0.0, + "num_tokens": 1004292.0, + "reward": -0.09573853015899658, + "reward_std": 0.22485454380512238, + "rewards/cosine_scaled_reward/mean": -0.09573852270841599, + "rewards/cosine_scaled_reward/std": 0.449250191450119, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 1979.359375, + "completions/mean_terminated_length": 949.75, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26966309547424316, + "learning_rate": 1.6e-07, + "loss": 0.0, + "num_tokens": 1142427.0, + "reward": -0.19992578029632568, + "reward_std": 0.20190927386283875, + "rewards/cosine_scaled_reward/mean": -0.19992581009864807, + "rewards/cosine_scaled_reward/std": 0.23785534501075745, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1652.59375, + "completions/mean_terminated_length": 897.727294921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3011312484741211, + "learning_rate": 1.8e-07, + "loss": 0.0, + "num_tokens": 1259025.0, + "reward": -0.11706389486789703, + "reward_std": 0.2934548258781433, + "rewards/cosine_scaled_reward/mean": -0.11706390231847763, + "rewards/cosine_scaled_reward/std": 0.3601698577404022, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 1946.6875, + "completions/mean_terminated_length": 967.3333740234375, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2451399564743042, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 1395285.0, + "reward": -0.2866281270980835, + "reward_std": 0.12184012681245804, + "rewards/cosine_scaled_reward/mean": -0.2866281270980835, + "rewards/cosine_scaled_reward/std": 0.15141677856445312, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1659.28125, + "completions/mean_terminated_length": 1190.137939453125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733561396598816, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "num_tokens": 1512423.0, + "reward": -0.13816070556640625, + "reward_std": 0.2968980073928833, + "rewards/cosine_scaled_reward/mean": -0.13816070556640625, + "rewards/cosine_scaled_reward/std": 0.3597467839717865, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 1807.796875, + "completions/mean_terminated_length": 1023.1333618164062, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25238803029060364, + "learning_rate": 2.4e-07, + "loss": 0.0, + "num_tokens": 1639162.0, + "reward": -0.13488636910915375, + "reward_std": 0.2661236524581909, + "rewards/cosine_scaled_reward/mean": -0.13488635420799255, + "rewards/cosine_scaled_reward/std": 0.3444243371486664, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 1846.921875, + "completions/mean_terminated_length": 1243.6875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2201598882675171, + "learning_rate": 2.6e-07, + "loss": -0.0, + "num_tokens": 1767973.0, + "reward": -0.20591925084590912, + "reward_std": 0.21505361795425415, + "rewards/cosine_scaled_reward/mean": -0.20591923594474792, + "rewards/cosine_scaled_reward/std": 0.323749840259552, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 1710.421875, + "completions/mean_terminated_length": 847.7222290039062, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2665213644504547, + "learning_rate": 2.8e-07, + "loss": 0.0, + "num_tokens": 1888360.0, + "reward": -0.0778750479221344, + "reward_std": 0.17502948641777039, + "rewards/cosine_scaled_reward/mean": -0.0778750628232956, + "rewards/cosine_scaled_reward/std": 0.47343766689300537, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 2031.03125, + "completions/mean_terminated_length": 962.0, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23009927570819855, + "learning_rate": 3e-07, + "loss": -0.0, + "num_tokens": 2028786.0, + "reward": -0.2619968056678772, + "reward_std": 0.16954168677330017, + "rewards/cosine_scaled_reward/mean": -0.2619968056678772, + "rewards/cosine_scaled_reward/std": 0.18357795476913452, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1533.15625, + "completions/mean_terminated_length": 780.6923217773438, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3392995297908783, + "learning_rate": 3.2e-07, + "loss": -0.0, + "num_tokens": 2137428.0, + "reward": -0.11706461012363434, + "reward_std": 0.3096129894256592, + "rewards/cosine_scaled_reward/mean": -0.11706460267305374, + "rewards/cosine_scaled_reward/std": 0.3810974657535553, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1774.46875, + "completions/mean_terminated_length": 1018.2352905273438, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23254038393497467, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "num_tokens": 2261370.0, + "reward": -0.18709540367126465, + "reward_std": 0.2795025110244751, + "rewards/cosine_scaled_reward/mean": -0.18709540367126465, + "rewards/cosine_scaled_reward/std": 0.3359416127204895, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1719.0, + "completions/mean_terminated_length": 995.2000122070312, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262045681476593, + "learning_rate": 3.6e-07, + "loss": -0.0, + "num_tokens": 2382642.0, + "reward": -0.02329203486442566, + "reward_std": 0.34684932231903076, + "rewards/cosine_scaled_reward/mean": -0.02329203486442566, + "rewards/cosine_scaled_reward/std": 0.47637447714805603, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1630.90625, + "completions/mean_terminated_length": 935.75, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250532329082489, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "num_tokens": 2498372.0, + "reward": -0.06319350004196167, + "reward_std": 0.2394939512014389, + "rewards/cosine_scaled_reward/mean": -0.06319350004196167, + "rewards/cosine_scaled_reward/std": 0.3889789879322052, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 1735.96875, + "completions/mean_terminated_length": 1140.272705078125, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2773231565952301, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 2620282.0, + "reward": -0.20884393155574799, + "reward_std": 0.20233216881752014, + "rewards/cosine_scaled_reward/mean": -0.20884393155574799, + "rewards/cosine_scaled_reward/std": 0.28432920575141907, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 1342.953125, + "completions/mean_terminated_length": 919.9249877929688, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34627005457878113, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "num_tokens": 2715247.0, + "reward": -0.09092864394187927, + "reward_std": 0.21042926609516144, + "rewards/cosine_scaled_reward/mean": -0.09092865139245987, + "rewards/cosine_scaled_reward/std": 0.43559205532073975, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1661.9375, + "completions/mean_terminated_length": 1132.888916015625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2705242335796356, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "num_tokens": 2832403.0, + "reward": -0.13339249789714813, + "reward_std": 0.2433384656906128, + "rewards/cosine_scaled_reward/mean": -0.13339248299598694, + "rewards/cosine_scaled_reward/std": 0.3815627098083496, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1802.296875, + "completions/mean_terminated_length": 1065.1875, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24961258471012115, + "learning_rate": 4.6e-07, + "loss": 0.0, + "num_tokens": 2958678.0, + "reward": -0.18733163177967072, + "reward_std": 0.2773033380508423, + "rewards/cosine_scaled_reward/mean": -0.1873316466808319, + "rewards/cosine_scaled_reward/std": 0.37051624059677124, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1731.53125, + "completions/mean_terminated_length": 982.0, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2662124037742615, + "learning_rate": 4.8e-07, + "loss": 0.0, + "num_tokens": 3079792.0, + "reward": -0.12407588213682175, + "reward_std": 0.25581949949264526, + "rewards/cosine_scaled_reward/mean": -0.12407589703798294, + "rewards/cosine_scaled_reward/std": 0.39043793082237244, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1965.46875, + "completions/mean_terminated_length": 1567.8182373046875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23202598094940186, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 3216214.0, + "reward": -0.0963105633854866, + "reward_std": 0.30887559056282043, + "rewards/cosine_scaled_reward/mean": -0.0963105633854866, + "rewards/cosine_scaled_reward/std": 0.39396020770072937, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1886.96875, + "completions/mean_terminated_length": 1111.0909423828125, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2878379225730896, + "learning_rate": 5.2e-07, + "loss": -0.0, + "num_tokens": 3347268.0, + "reward": -0.1645491123199463, + "reward_std": 0.28629785776138306, + "rewards/cosine_scaled_reward/mean": -0.1645491123199463, + "rewards/cosine_scaled_reward/std": 0.35050687193870544, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1843.640625, + "completions/mean_terminated_length": 1230.5625, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24996496737003326, + "learning_rate": 5.4e-07, + "loss": 0.0, + "num_tokens": 3475597.0, + "reward": -0.06605555862188339, + "reward_std": 0.2643629312515259, + "rewards/cosine_scaled_reward/mean": -0.06605555862188339, + "rewards/cosine_scaled_reward/std": 0.438128799200058, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 2020.5, + "completions/mean_terminated_length": 1608.0, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23316837847232819, + "learning_rate": 5.6e-07, + "loss": -0.0, + "num_tokens": 3615381.0, + "reward": -0.2015206664800644, + "reward_std": 0.15312039852142334, + "rewards/cosine_scaled_reward/mean": -0.2015206664800644, + "rewards/cosine_scaled_reward/std": 0.1648881882429123, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 1826.046875, + "completions/mean_terminated_length": 955.3077392578125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2410832792520523, + "learning_rate": 5.8e-07, + "loss": -0.0, + "num_tokens": 3742784.0, + "reward": -0.17509159445762634, + "reward_std": 0.18994277715682983, + "rewards/cosine_scaled_reward/mean": -0.17509159445762634, + "rewards/cosine_scaled_reward/std": 0.22516494989395142, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1781.4375, + "completions/mean_terminated_length": 910.6666870117188, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2693414092063904, + "learning_rate": 6e-07, + "loss": 0.0, + "num_tokens": 3867292.0, + "reward": -0.24513831734657288, + "reward_std": 0.28315529227256775, + "rewards/cosine_scaled_reward/mean": -0.24513831734657288, + "rewards/cosine_scaled_reward/std": 0.3480584919452667, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1969.28125, + "completions/mean_terminated_length": 1488.2222900390625, + "completions/min_length": 1088.0, + "completions/min_terminated_length": 1088.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24202018976211548, + "learning_rate": 6.2e-07, + "loss": 0.0, + "num_tokens": 4003678.0, + "reward": -0.18968716263771057, + "reward_std": 0.28299200534820557, + "rewards/cosine_scaled_reward/mean": -0.18968716263771057, + "rewards/cosine_scaled_reward/std": 0.3119950294494629, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22288212180137634, + "learning_rate": 6.4e-07, + "loss": 0.0, + "num_tokens": 4145966.0, + "reward": -0.2955162525177002, + "reward_std": 0.17793573439121246, + "rewards/cosine_scaled_reward/mean": -0.2955162525177002, + "rewards/cosine_scaled_reward/std": 0.22786569595336914, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 1589.640625, + "completions/mean_terminated_length": 1036.4482421875, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31030499935150146, + "learning_rate": 6.6e-07, + "loss": 0.0, + "num_tokens": 4257255.0, + "reward": 0.008002171292901039, + "reward_std": 0.3413254916667938, + "rewards/cosine_scaled_reward/mean": 0.008002176880836487, + "rewards/cosine_scaled_reward/std": 0.4431404769420624, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1785.921875, + "completions/mean_terminated_length": 757.769287109375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3145958483219147, + "learning_rate": 6.800000000000001e-07, + "loss": -0.0, + "num_tokens": 4383050.0, + "reward": -0.16386553645133972, + "reward_std": 0.2818174958229065, + "rewards/cosine_scaled_reward/mean": -0.16386555135250092, + "rewards/cosine_scaled_reward/std": 0.3242056965827942, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 2000.421875, + "completions/mean_terminated_length": 1033.0, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25796815752983093, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 4522189.0, + "reward": -0.2470606118440628, + "reward_std": 0.15509279072284698, + "rewards/cosine_scaled_reward/mean": -0.2470606118440628, + "rewards/cosine_scaled_reward/std": 0.16412879526615143, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1964.46875, + "completions/mean_terminated_length": 1284.2857666015625, + "completions/min_length": 931.0, + "completions/min_terminated_length": 931.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22452199459075928, + "learning_rate": 7.2e-07, + "loss": 0.0, + "num_tokens": 4658939.0, + "reward": -0.24706938862800598, + "reward_std": 0.18499845266342163, + "rewards/cosine_scaled_reward/mean": -0.24706941843032837, + "rewards/cosine_scaled_reward/std": 0.21092188358306885, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1925.234375, + "completions/mean_terminated_length": 1175.0, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23703666031360626, + "learning_rate": 7.4e-07, + "loss": -0.0, + "num_tokens": 4793866.0, + "reward": -0.11504355818033218, + "reward_std": 0.20660358667373657, + "rewards/cosine_scaled_reward/mean": -0.11504356563091278, + "rewards/cosine_scaled_reward/std": 0.3190351724624634, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 1740.546875, + "completions/mean_terminated_length": 642.5, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23829001188278198, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "num_tokens": 4916045.0, + "reward": -0.12095541507005692, + "reward_std": 0.1958026885986328, + "rewards/cosine_scaled_reward/mean": -0.12095542997121811, + "rewards/cosine_scaled_reward/std": 0.340241402387619, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1713.203125, + "completions/mean_terminated_length": 920.26318359375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24145744740962982, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0, + "num_tokens": 5035762.0, + "reward": -0.10936243832111359, + "reward_std": 0.14468500018119812, + "rewards/cosine_scaled_reward/mean": -0.10936242341995239, + "rewards/cosine_scaled_reward/std": 0.4288744330406189, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1909.71875, + "completions/mean_terminated_length": 1367.2308349609375, + "completions/min_length": 1138.0, + "completions/min_terminated_length": 1138.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22317881882190704, + "learning_rate": 8e-07, + "loss": 0.0, + "num_tokens": 5169136.0, + "reward": -0.2058967649936676, + "reward_std": 0.2325170338153839, + "rewards/cosine_scaled_reward/mean": -0.20589673519134521, + "rewards/cosine_scaled_reward/std": 0.28897321224212646, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 1727.71875, + "completions/mean_terminated_length": 583.857177734375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44688937067985535, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "num_tokens": 5290070.0, + "reward": -0.2254919707775116, + "reward_std": 0.1687203049659729, + "rewards/cosine_scaled_reward/mean": -0.2254919707775116, + "rewards/cosine_scaled_reward/std": 0.18203677237033844, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 1855.328125, + "completions/mean_terminated_length": 814.9000244140625, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430828958749771, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "num_tokens": 5420427.0, + "reward": -0.09104865789413452, + "reward_std": 0.18217626214027405, + "rewards/cosine_scaled_reward/mean": -0.09104865789413452, + "rewards/cosine_scaled_reward/std": 0.3521345257759094, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 1727.9375, + "completions/mean_terminated_length": 767.75, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32065215706825256, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "num_tokens": 5541711.0, + "reward": -0.17701950669288635, + "reward_std": 0.2957555055618286, + "rewards/cosine_scaled_reward/mean": -0.17701953649520874, + "rewards/cosine_scaled_reward/std": 0.38460060954093933, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 2013.9375, + "completions/mean_terminated_length": 1321.3333740234375, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22363637387752533, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "num_tokens": 5682259.0, + "reward": -0.20341511070728302, + "reward_std": 0.23104795813560486, + "rewards/cosine_scaled_reward/mean": -0.20341511070728302, + "rewards/cosine_scaled_reward/std": 0.3092363774776459, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 1909.0, + "completions/mean_terminated_length": 936.0, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26306217908859253, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 5815603.0, + "reward": -0.26145532727241516, + "reward_std": 0.17108051478862762, + "rewards/cosine_scaled_reward/mean": -0.2614552974700928, + "rewards/cosine_scaled_reward/std": 0.18312901258468628, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 1757.1875, + "completions/mean_terminated_length": 884.75, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2856813371181488, + "learning_rate": 9.2e-07, + "loss": 0.0, + "num_tokens": 5938463.0, + "reward": -0.20879247784614563, + "reward_std": 0.23861759901046753, + "rewards/cosine_scaled_reward/mean": -0.20879246294498444, + "rewards/cosine_scaled_reward/std": 0.39607998728752136, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1756.5, + "completions/mean_terminated_length": 1011.5555419921875, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27563413977622986, + "learning_rate": 9.399999999999999e-07, + "loss": -0.0, + "num_tokens": 6061423.0, + "reward": -0.16147920489311218, + "reward_std": 0.24055320024490356, + "rewards/cosine_scaled_reward/mean": -0.16147920489311218, + "rewards/cosine_scaled_reward/std": 0.3948959410190582, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 1538.078125, + "completions/mean_terminated_length": 839.2963256835938, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27617642283439636, + "learning_rate": 9.6e-07, + "loss": -0.0, + "num_tokens": 6169924.0, + "reward": -0.18436825275421143, + "reward_std": 0.27141550183296204, + "rewards/cosine_scaled_reward/mean": -0.18436823785305023, + "rewards/cosine_scaled_reward/std": 0.3920196294784546, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1749.0625, + "completions/mean_terminated_length": 772.5333862304688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23394836485385895, + "learning_rate": 9.8e-07, + "loss": 0.0, + "num_tokens": 6292680.0, + "reward": -0.10770958662033081, + "reward_std": 0.22513547539710999, + "rewards/cosine_scaled_reward/mean": -0.10770957916975021, + "rewards/cosine_scaled_reward/std": 0.421062707901001, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1482.25, + "completions/mean_terminated_length": 841.0667114257812, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3268967568874359, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 6397752.0, + "reward": -0.09745607525110245, + "reward_std": 0.25210899114608765, + "rewards/cosine_scaled_reward/mean": -0.09745605289936066, + "rewards/cosine_scaled_reward/std": 0.3351369798183441, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1743.953125, + "completions/mean_terminated_length": 750.7333984375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2918722927570343, + "learning_rate": 9.999890338174275e-07, + "loss": -0.0, + "num_tokens": 6520717.0, + "reward": -0.1890830397605896, + "reward_std": 0.21916288137435913, + "rewards/cosine_scaled_reward/mean": -0.1890830546617508, + "rewards/cosine_scaled_reward/std": 0.32568052411079407, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1772.421875, + "completions/mean_terminated_length": 1010.5294189453125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24523264169692993, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "num_tokens": 6644984.0, + "reward": -0.20969681441783905, + "reward_std": 0.1810423731803894, + "rewards/cosine_scaled_reward/mean": -0.20969681441783905, + "rewards/cosine_scaled_reward/std": 0.2371566891670227, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1838.859375, + "completions/mean_terminated_length": 1304.388916015625, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23284469544887543, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "num_tokens": 6773815.0, + "reward": -0.06641622632741928, + "reward_std": 0.30815836787223816, + "rewards/cosine_scaled_reward/mean": -0.06641621887683868, + "rewards/cosine_scaled_reward/std": 0.46219584345817566, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1750.125, + "completions/mean_terminated_length": 856.5, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2651103734970093, + "learning_rate": 9.998245517681593e-07, + "loss": -0.0, + "num_tokens": 6896111.0, + "reward": -0.10750342905521393, + "reward_std": 0.2286185324192047, + "rewards/cosine_scaled_reward/mean": -0.10750342160463333, + "rewards/cosine_scaled_reward/std": 0.43372800946235657, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1840.078125, + "completions/mean_terminated_length": 1097.5, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22967560589313507, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 7024836.0, + "reward": -0.10045827925205231, + "reward_std": 0.2548004388809204, + "rewards/cosine_scaled_reward/mean": -0.10045827925205231, + "rewards/cosine_scaled_reward/std": 0.41444358229637146, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 1991.1875, + "completions/mean_terminated_length": 1442.0, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20479348301887512, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "num_tokens": 7163840.0, + "reward": -0.27901512384414673, + "reward_std": 0.2130473554134369, + "rewards/cosine_scaled_reward/mean": -0.27901512384414673, + "rewards/cosine_scaled_reward/std": 0.2583855092525482, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1617.421875, + "completions/mean_terminated_length": 1129.433349609375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2690146267414093, + "learning_rate": 9.994627618036452e-07, + "loss": -0.0, + "num_tokens": 7277451.0, + "reward": -0.04198366403579712, + "reward_std": 0.4036104083061218, + "rewards/cosine_scaled_reward/mean": -0.04198366031050682, + "rewards/cosine_scaled_reward/std": 0.5008736252784729, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1736.09375, + "completions/mean_terminated_length": 997.368408203125, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2184475064277649, + "learning_rate": 9.992983438818915e-07, + "loss": -0.0, + "num_tokens": 7399025.0, + "reward": -0.1564982533454895, + "reward_std": 0.19560785591602325, + "rewards/cosine_scaled_reward/mean": -0.1564982533454895, + "rewards/cosine_scaled_reward/std": 0.3402426540851593, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 1785.40625, + "completions/mean_terminated_length": 847.5714721679688, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23538637161254883, + "learning_rate": 9.991120277927223e-07, + "loss": -0.0, + "num_tokens": 7524179.0, + "reward": -0.2697012424468994, + "reward_std": 0.17935499548912048, + "rewards/cosine_scaled_reward/mean": -0.2697012424468994, + "rewards/cosine_scaled_reward/std": 0.19757980108261108, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1884.484375, + "completions/mean_terminated_length": 1001.5, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.225452721118927, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "num_tokens": 7656306.0, + "reward": -0.1635127067565918, + "reward_std": 0.1931447982788086, + "rewards/cosine_scaled_reward/mean": -0.1635127067565918, + "rewards/cosine_scaled_reward/std": 0.23563610017299652, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1739.46875, + "completions/mean_terminated_length": 1060.7000732421875, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23771661520004272, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "num_tokens": 7777864.0, + "reward": -0.10127441585063934, + "reward_std": 0.2957979142665863, + "rewards/cosine_scaled_reward/mean": -0.10127442330121994, + "rewards/cosine_scaled_reward/std": 0.34053224325180054, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1522.953125, + "completions/mean_terminated_length": 1163.7105712890625, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27804723381996155, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "num_tokens": 7885589.0, + "reward": -0.036153122782707214, + "reward_std": 0.3305097818374634, + "rewards/cosine_scaled_reward/mean": -0.03615312650799751, + "rewards/cosine_scaled_reward/std": 0.4355940818786621, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 1760.390625, + "completions/mean_terminated_length": 1025.388916015625, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2333846092224121, + "learning_rate": 9.981479793771866e-07, + "loss": -0.0, + "num_tokens": 8009206.0, + "reward": -0.14333069324493408, + "reward_std": 0.28757935762405396, + "rewards/cosine_scaled_reward/mean": -0.14333069324493408, + "rewards/cosine_scaled_reward/std": 0.41007620096206665, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1651.515625, + "completions/mean_terminated_length": 638.2777709960938, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26348626613616943, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "num_tokens": 8125607.0, + "reward": -0.2117859125137329, + "reward_std": 0.15534773468971252, + "rewards/cosine_scaled_reward/mean": -0.2117859125137329, + "rewards/cosine_scaled_reward/std": 0.37395453453063965, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 1254.125, + "completions/mean_terminated_length": 596.3428344726562, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33443817496299744, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "num_tokens": 8216103.0, + "reward": 0.028336994349956512, + "reward_std": 0.25119709968566895, + "rewards/cosine_scaled_reward/mean": 0.02833697199821472, + "rewards/cosine_scaled_reward/std": 0.4882389008998871, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 1966.21875, + "completions/mean_terminated_length": 1175.666748046875, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2199370563030243, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0, + "num_tokens": 8352677.0, + "reward": -0.28747493028640747, + "reward_std": 0.15530282258987427, + "rewards/cosine_scaled_reward/mean": -0.28747493028640747, + "rewards/cosine_scaled_reward/std": 0.16220521926879883, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1357.109375, + "completions/mean_terminated_length": 747.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3341590464115143, + "learning_rate": 9.968344786479415e-07, + "loss": -0.0, + "num_tokens": 8448788.0, + "reward": -0.06672946363687515, + "reward_std": 0.28790342807769775, + "rewards/cosine_scaled_reward/mean": -0.06672945618629456, + "rewards/cosine_scaled_reward/std": 0.35960128903388977, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 944.107177734375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35159721970558167, + "learning_rate": 9.964516155915151e-07, + "loss": -0.0, + "num_tokens": 8559295.0, + "reward": -0.27992868423461914, + "reward_std": 0.20264248549938202, + "rewards/cosine_scaled_reward/mean": -0.27992868423461914, + "rewards/cosine_scaled_reward/std": 0.23891927301883698, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 1867.765625, + "completions/mean_terminated_length": 606.125, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23989427089691162, + "learning_rate": 9.960469931131936e-07, + "loss": -0.0, + "num_tokens": 8690288.0, + "reward": -0.2498025894165039, + "reward_std": 0.15823513269424438, + "rewards/cosine_scaled_reward/mean": -0.2498025894165039, + "rewards/cosine_scaled_reward/std": 0.17978127300739288, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 1669.125, + "completions/mean_terminated_length": 945.8182373046875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.335510790348053, + "learning_rate": 9.956206309337066e-07, + "loss": -0.0, + "num_tokens": 8807832.0, + "reward": -0.1673138290643692, + "reward_std": 0.2547321915626526, + "rewards/cosine_scaled_reward/mean": -0.1673138290643692, + "rewards/cosine_scaled_reward/std": 0.39353805780410767, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1632.59375, + "completions/mean_terminated_length": 892.0869750976562, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30721575021743774, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "num_tokens": 8922670.0, + "reward": -0.1493685096502304, + "reward_std": 0.23021411895751953, + "rewards/cosine_scaled_reward/mean": -0.1493685096502304, + "rewards/cosine_scaled_reward/std": 0.27729952335357666, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 2020.59375, + "completions/mean_terminated_length": 1463.3333740234375, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20856839418411255, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "num_tokens": 9062716.0, + "reward": -0.25696587562561035, + "reward_std": 0.19847074151039124, + "rewards/cosine_scaled_reward/mean": -0.25696590542793274, + "rewards/cosine_scaled_reward/std": 0.23918035626411438, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1926.984375, + "completions/mean_terminated_length": 1273.5, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23241353034973145, + "learning_rate": 9.942113192828444e-07, + "loss": -0.0, + "num_tokens": 9195971.0, + "reward": -0.12904082238674164, + "reward_std": 0.23554545640945435, + "rewards/cosine_scaled_reward/mean": -0.12904080748558044, + "rewards/cosine_scaled_reward/std": 0.4280695915222168, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 1868.890625, + "completions/mean_terminated_length": 1092.75, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.19846303761005402, + "learning_rate": 9.93698216681727e-07, + "loss": -0.0, + "num_tokens": 9326540.0, + "reward": -0.03926669806241989, + "reward_std": 0.2044709324836731, + "rewards/cosine_scaled_reward/mean": -0.039266690611839294, + "rewards/cosine_scaled_reward/std": 0.49658530950546265, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1805.296875, + "completions/mean_terminated_length": 1077.1875, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23998627066612244, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 9452479.0, + "reward": -0.23065510392189026, + "reward_std": 0.17413878440856934, + "rewards/cosine_scaled_reward/mean": -0.23065511882305145, + "rewards/cosine_scaled_reward/std": 0.21896763145923615, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 1857.328125, + "completions/mean_terminated_length": 1285.3125, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20421437919139862, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "num_tokens": 9582924.0, + "reward": -0.17972718179225922, + "reward_std": 0.209285706281662, + "rewards/cosine_scaled_reward/mean": -0.17972716689109802, + "rewards/cosine_scaled_reward/std": 0.2716500163078308, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1883.921875, + "completions/mean_terminated_length": 1093.3636474609375, + "completions/min_length": 712.0, + "completions/min_terminated_length": 712.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2156875878572464, + "learning_rate": 9.9202926282791e-07, + "loss": -0.0, + "num_tokens": 9714215.0, + "reward": -0.14897406101226807, + "reward_std": 0.2451157122850418, + "rewards/cosine_scaled_reward/mean": -0.14897406101226807, + "rewards/cosine_scaled_reward/std": 0.38884180784225464, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 1507.65625, + "completions/mean_terminated_length": 767.1851806640625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29943305253982544, + "learning_rate": 9.91429819907136e-07, + "loss": -0.0, + "num_tokens": 9820801.0, + "reward": -0.17114077508449554, + "reward_std": 0.23199111223220825, + "rewards/cosine_scaled_reward/mean": -0.17114077508449554, + "rewards/cosine_scaled_reward/std": 0.3217289447784424, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1976.125, + "completions/mean_terminated_length": 1536.888916015625, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26230743527412415, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "num_tokens": 9957665.0, + "reward": -0.21115826070308685, + "reward_std": 0.2435196340084076, + "rewards/cosine_scaled_reward/mean": -0.21115827560424805, + "rewards/cosine_scaled_reward/std": 0.28258123993873596, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1779.28125, + "completions/mean_terminated_length": 901.4667358398438, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33359771966934204, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 10082811.0, + "reward": -0.1508273482322693, + "reward_std": 0.2594776749610901, + "rewards/cosine_scaled_reward/mean": -0.1508273482322693, + "rewards/cosine_scaled_reward/std": 0.33812451362609863, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 1711.609375, + "completions/mean_terminated_length": 851.9444580078125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2805767059326172, + "learning_rate": 9.895025252503755e-07, + "loss": -0.0, + "num_tokens": 10202682.0, + "reward": -0.11850972473621368, + "reward_std": 0.2631937861442566, + "rewards/cosine_scaled_reward/mean": -0.11850972473621368, + "rewards/cosine_scaled_reward/std": 0.4419197142124176, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1749.984375, + "completions/mean_terminated_length": 1044.157958984375, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3109220266342163, + "learning_rate": 9.888172094375033e-07, + "loss": -0.0, + "num_tokens": 10325769.0, + "reward": -0.10190614312887192, + "reward_std": 0.2739119529724121, + "rewards/cosine_scaled_reward/mean": -0.10190614312887192, + "rewards/cosine_scaled_reward/std": 0.39238420128822327, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1800.390625, + "completions/mean_terminated_length": 829.0000610351562, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23385629057884216, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "num_tokens": 10451690.0, + "reward": -0.21778321266174316, + "reward_std": 0.25428956747055054, + "rewards/cosine_scaled_reward/mean": -0.21778322756290436, + "rewards/cosine_scaled_reward/std": 0.30295974016189575, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 1870.46875, + "completions/mean_terminated_length": 1337.875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21526271104812622, + "learning_rate": 9.873824502603459e-07, + "loss": -0.0, + "num_tokens": 10581720.0, + "reward": -0.19906702637672424, + "reward_std": 0.23402772843837738, + "rewards/cosine_scaled_reward/mean": -0.19906699657440186, + "rewards/cosine_scaled_reward/std": 0.28999006748199463, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 1734.875, + "completions/mean_terminated_length": 795.5, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24285966157913208, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "num_tokens": 10703608.0, + "reward": -0.16528445482254028, + "reward_std": 0.2592755854129791, + "rewards/cosine_scaled_reward/mean": -0.16528445482254028, + "rewards/cosine_scaled_reward/std": 0.37110546231269836, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1577.921875, + "completions/mean_terminated_length": 973.5357666015625, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30273520946502686, + "learning_rate": 9.85862422507884e-07, + "loss": -0.0, + "num_tokens": 10814715.0, + "reward": -0.20241931080818176, + "reward_std": 0.2693288326263428, + "rewards/cosine_scaled_reward/mean": -0.20241928100585938, + "rewards/cosine_scaled_reward/std": 0.33345305919647217, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1680.546875, + "completions/mean_terminated_length": 1068.125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2649252116680145, + "learning_rate": 9.850705248720068e-07, + "loss": -0.0, + "num_tokens": 10932782.0, + "reward": -0.018871163949370384, + "reward_std": 0.3073042631149292, + "rewards/cosine_scaled_reward/mean": -0.018871165812015533, + "rewards/cosine_scaled_reward/std": 0.3826298415660858, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1683.703125, + "completions/mean_terminated_length": 1151.269287109375, + "completions/min_length": 667.0, + "completions/min_terminated_length": 667.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24950510263442993, + "learning_rate": 9.8425742251254e-07, + "loss": -0.0, + "num_tokens": 11051539.0, + "reward": -0.11818082630634308, + "reward_std": 0.2949528694152832, + "rewards/cosine_scaled_reward/mean": -0.11818082630634308, + "rewards/cosine_scaled_reward/std": 0.34418320655822754, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1558.546875, + "completions/mean_terminated_length": 967.8275756835938, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36593058705329895, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "num_tokens": 11161286.0, + "reward": -0.26082760095596313, + "reward_std": 0.1802712082862854, + "rewards/cosine_scaled_reward/mean": -0.26082760095596313, + "rewards/cosine_scaled_reward/std": 0.2037661075592041, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1505.0, + "completions/mean_length": 1827.9375, + "completions/mean_terminated_length": 1109.0667724609375, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24167831242084503, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "num_tokens": 11288842.0, + "reward": -0.11456942558288574, + "reward_std": 0.26296502351760864, + "rewards/cosine_scaled_reward/mean": -0.11456942558288574, + "rewards/cosine_scaled_reward/std": 0.3274599611759186, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1581.546875, + "completions/mean_terminated_length": 899.8077392578125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2570616602897644, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "num_tokens": 11400053.0, + "reward": -0.17942462861537933, + "reward_std": 0.2633644640445709, + "rewards/cosine_scaled_reward/mean": -0.17942462861537933, + "rewards/cosine_scaled_reward/std": 0.30215632915496826, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 2022.328125, + "completions/mean_terminated_length": 1226.5, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25331902503967285, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0, + "num_tokens": 11540826.0, + "reward": -0.26418450474739075, + "reward_std": 0.1380012035369873, + "rewards/cosine_scaled_reward/mean": -0.26418450474739075, + "rewards/cosine_scaled_reward/std": 0.17390060424804688, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 1769.546875, + "completions/mean_terminated_length": 934.1875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29503753781318665, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0, + "num_tokens": 11663845.0, + "reward": -0.08299511671066284, + "reward_std": 0.18226617574691772, + "rewards/cosine_scaled_reward/mean": -0.08299513161182404, + "rewards/cosine_scaled_reward/std": 0.46436113119125366, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 2021.5, + "completions/mean_terminated_length": 1200.0, + "completions/min_length": 1100.0, + "completions/min_terminated_length": 1100.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20416001975536346, + "learning_rate": 9.78935800506826e-07, + "loss": -0.0, + "num_tokens": 11803749.0, + "reward": -0.22345861792564392, + "reward_std": 0.18781372904777527, + "rewards/cosine_scaled_reward/mean": -0.22345861792564392, + "rewards/cosine_scaled_reward/std": 0.24531956017017365, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 1582.890625, + "completions/mean_terminated_length": 903.1154174804688, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2593792974948883, + "learning_rate": 9.779754323328192e-07, + "loss": -0.0, + "num_tokens": 11916190.0, + "reward": 0.00020215287804603577, + "reward_std": 0.24673128128051758, + "rewards/cosine_scaled_reward/mean": 0.00020216405391693115, + "rewards/cosine_scaled_reward/std": 0.49432000517845154, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1748.859375, + "completions/mean_terminated_length": 1177.772705078125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2480001151561737, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0, + "num_tokens": 12038381.0, + "reward": -0.19425566494464874, + "reward_std": 0.21240204572677612, + "rewards/cosine_scaled_reward/mean": -0.19425567984580994, + "rewards/cosine_scaled_reward/std": 0.29181501269340515, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1632.171875, + "completions/mean_terminated_length": 1062.3333740234375, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2797771692276001, + "learning_rate": 9.759921670520634e-07, + "loss": -0.0, + "num_tokens": 12153904.0, + "reward": -0.11104464530944824, + "reward_std": 0.2755987048149109, + "rewards/cosine_scaled_reward/mean": -0.11104465276002884, + "rewards/cosine_scaled_reward/std": 0.4012855887413025, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 1651.078125, + "completions/mean_terminated_length": 553.7058715820312, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3114299476146698, + "learning_rate": 9.749693666068663e-07, + "loss": -0.0, + "num_tokens": 12270741.0, + "reward": -0.1317199319601059, + "reward_std": 0.14237020909786224, + "rewards/cosine_scaled_reward/mean": -0.1317199319601059, + "rewards/cosine_scaled_reward/std": 0.3707720935344696, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1544.765625, + "completions/mean_terminated_length": 937.413818359375, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2654109001159668, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "num_tokens": 12379318.0, + "reward": -0.018167953938245773, + "reward_std": 0.29768484830856323, + "rewards/cosine_scaled_reward/mean": -0.01816795952618122, + "rewards/cosine_scaled_reward/std": 0.44200995564460754, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1647.421875, + "completions/mean_terminated_length": 979.7916870117188, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2877754867076874, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "num_tokens": 12496185.0, + "reward": -0.10053972899913788, + "reward_std": 0.28722673654556274, + "rewards/cosine_scaled_reward/mean": -0.10053973644971848, + "rewards/cosine_scaled_reward/std": 0.36782190203666687, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1371.484375, + "completions/mean_terminated_length": 937.8204956054688, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30472108721733093, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "num_tokens": 12594112.0, + "reward": -0.20305150747299194, + "reward_std": 0.23292692005634308, + "rewards/cosine_scaled_reward/mean": -0.20305150747299194, + "rewards/cosine_scaled_reward/std": 0.3213489055633545, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 1737.984375, + "completions/mean_terminated_length": 807.9375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27034303545951843, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0, + "num_tokens": 12715695.0, + "reward": -0.29003486037254333, + "reward_std": 0.21371816098690033, + "rewards/cosine_scaled_reward/mean": -0.29003486037254333, + "rewards/cosine_scaled_reward/std": 0.224824920296669, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 1705.28125, + "completions/mean_terminated_length": 893.5789794921875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27687934041023254, + "learning_rate": 9.695457105469804e-07, + "loss": -0.0, + "num_tokens": 12835297.0, + "reward": -0.15606051683425903, + "reward_std": 0.18938840925693512, + "rewards/cosine_scaled_reward/mean": -0.15606051683425903, + "rewards/cosine_scaled_reward/std": 0.24088984727859497, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 1401.015625, + "completions/mean_terminated_length": 830.1470336914062, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2770017087459564, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0, + "num_tokens": 12936250.0, + "reward": 0.018527541309595108, + "reward_std": 0.36475759744644165, + "rewards/cosine_scaled_reward/mean": 0.018527545034885406, + "rewards/cosine_scaled_reward/std": 0.4995051920413971, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 1438.453125, + "completions/mean_terminated_length": 789.5806274414062, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26982930302619934, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "num_tokens": 13039143.0, + "reward": 0.07083749771118164, + "reward_std": 0.29650557041168213, + "rewards/cosine_scaled_reward/mean": 0.07083749771118164, + "rewards/cosine_scaled_reward/std": 0.5094331502914429, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1722.0, + "completions/mean_length": 1787.09375, + "completions/mean_terminated_length": 1065.7647705078125, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26255276799201965, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0, + "num_tokens": 13164669.0, + "reward": -0.2222379446029663, + "reward_std": 0.240003302693367, + "rewards/cosine_scaled_reward/mean": -0.2222379446029663, + "rewards/cosine_scaled_reward/std": 0.29153531789779663, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1660.96875, + "completions/mean_terminated_length": 1095.3077392578125, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30773183703422546, + "learning_rate": 9.648384182148252e-07, + "loss": -0.0, + "num_tokens": 13281331.0, + "reward": -0.21352165937423706, + "reward_std": 0.3123124837875366, + "rewards/cosine_scaled_reward/mean": -0.21352165937423706, + "rewards/cosine_scaled_reward/std": 0.3453315496444702, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1858.921875, + "completions/mean_terminated_length": 1117.1539306640625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24000757932662964, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0, + "num_tokens": 13411550.0, + "reward": -0.13601753115653992, + "reward_std": 0.1500597596168518, + "rewards/cosine_scaled_reward/mean": -0.1360175609588623, + "rewards/cosine_scaled_reward/std": 0.42859947681427, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1563.90625, + "completions/mean_terminated_length": 900.5185546875, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31709614396095276, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "num_tokens": 13522416.0, + "reward": -0.28067731857299805, + "reward_std": 0.1671288013458252, + "rewards/cosine_scaled_reward/mean": -0.28067731857299805, + "rewards/cosine_scaled_reward/std": 0.21458736062049866, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 1724.71875, + "completions/mean_terminated_length": 1013.5, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2646999657154083, + "learning_rate": 9.610954559391704e-07, + "loss": -0.0, + "num_tokens": 13642918.0, + "reward": -0.11896095424890518, + "reward_std": 0.28121650218963623, + "rewards/cosine_scaled_reward/mean": -0.11896096169948578, + "rewards/cosine_scaled_reward/std": 0.37855637073516846, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1918.0, + "completions/mean_terminated_length": 1216.0, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22158586978912354, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0, + "num_tokens": 13776766.0, + "reward": -0.1688530147075653, + "reward_std": 0.2535978853702545, + "rewards/cosine_scaled_reward/mean": -0.1688530296087265, + "rewards/cosine_scaled_reward/std": 0.3341792821884155, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 1556.125, + "completions/mean_terminated_length": 837.2307739257812, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2930087745189667, + "learning_rate": 9.58499865339809e-07, + "loss": -0.0, + "num_tokens": 13886654.0, + "reward": -0.10367631912231445, + "reward_std": 0.30835023522377014, + "rewards/cosine_scaled_reward/mean": -0.10367631912231445, + "rewards/cosine_scaled_reward/std": 0.42973947525024414, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1521.9375, + "completions/mean_terminated_length": 753.0769653320312, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3062143921852112, + "learning_rate": 9.571721736097088e-07, + "loss": -0.0, + "num_tokens": 13993906.0, + "reward": -0.22209212183952332, + "reward_std": 0.2074735462665558, + "rewards/cosine_scaled_reward/mean": -0.22209212183952332, + "rewards/cosine_scaled_reward/std": 0.29088398814201355, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1714.578125, + "completions/mean_terminated_length": 1031.857177734375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2564532160758972, + "learning_rate": 9.55824636882301e-07, + "loss": -0.0, + "num_tokens": 14114855.0, + "reward": -0.10947269201278687, + "reward_std": 0.30371129512786865, + "rewards/cosine_scaled_reward/mean": -0.10947269946336746, + "rewards/cosine_scaled_reward/std": 0.41030505299568176, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 1877.90625, + "completions/mean_terminated_length": 492.857177734375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25748053193092346, + "learning_rate": 9.54457320834625e-07, + "loss": -0.0, + "num_tokens": 14246425.0, + "reward": -0.19163870811462402, + "reward_std": 0.21010378003120422, + "rewards/cosine_scaled_reward/mean": -0.19163869321346283, + "rewards/cosine_scaled_reward/std": 0.3049132525920868, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 1922.546875, + "completions/mean_terminated_length": 1155.888916015625, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24102462828159332, + "learning_rate": 9.530702921077358e-07, + "loss": -0.0, + "num_tokens": 14380492.0, + "reward": -0.21347489953041077, + "reward_std": 0.19724325835704803, + "rewards/cosine_scaled_reward/mean": -0.21347489953041077, + "rewards/cosine_scaled_reward/std": 0.2647304832935333, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1793.546875, + "completions/mean_terminated_length": 1233.75, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2439616322517395, + "learning_rate": 9.516636183034564e-07, + "loss": -0.0, + "num_tokens": 14505815.0, + "reward": -0.08845303952693939, + "reward_std": 0.30429399013519287, + "rewards/cosine_scaled_reward/mean": -0.08845303952693939, + "rewards/cosine_scaled_reward/std": 0.4648522734642029, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1405.15625, + "completions/mean_terminated_length": 936.0540771484375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32119110226631165, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0, + "num_tokens": 14606153.0, + "reward": -0.04571840912103653, + "reward_std": 0.3056246340274811, + "rewards/cosine_scaled_reward/mean": -0.04571840912103653, + "rewards/cosine_scaled_reward/std": 0.49307262897491455, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1321.40625, + "completions/mean_terminated_length": 940.8095703125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3139563500881195, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0, + "num_tokens": 14701179.0, + "reward": -0.12771092355251312, + "reward_std": 0.3157998323440552, + "rewards/cosine_scaled_reward/mean": -0.12771093845367432, + "rewards/cosine_scaled_reward/std": 0.4336044490337372, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1024.5, + "completions/mean_terminated_length": 812.0755004882812, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3428559899330139, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 14776443.0, + "reward": -0.004689367488026619, + "reward_std": 0.297618567943573, + "rewards/cosine_scaled_reward/mean": -0.004689373075962067, + "rewards/cosine_scaled_reward/std": 0.46961408853530884, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1790.765625, + "completions/mean_terminated_length": 1133.388916015625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.13942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29122933745384216, + "learning_rate": 9.458418577899774e-07, + "loss": -0.0, + "num_tokens": 14902612.0, + "reward": -0.11110783368349075, + "reward_std": 0.22664329409599304, + "rewards/cosine_scaled_reward/mean": -0.11110783368349075, + "rewards/cosine_scaled_reward/std": 0.3362382650375366, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1658.46875, + "completions/mean_terminated_length": 1124.6666259765625, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2646903693675995, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0, + "num_tokens": 15018986.0, + "reward": -0.20784568786621094, + "reward_std": 0.270358681678772, + "rewards/cosine_scaled_reward/mean": -0.20784570276737213, + "rewards/cosine_scaled_reward/std": 0.35689592361450195, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 1494.9375, + "completions/mean_terminated_length": 868.1333618164062, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26702970266342163, + "learning_rate": 9.428149347714143e-07, + "loss": -0.0, + "num_tokens": 15125614.0, + "reward": -0.160624697804451, + "reward_std": 0.23646026849746704, + "rewards/cosine_scaled_reward/mean": -0.160624697804451, + "rewards/cosine_scaled_reward/std": 0.4083607792854309, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 1608.609375, + "completions/mean_terminated_length": 825.3478393554688, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2753336727619171, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0, + "num_tokens": 15239493.0, + "reward": -0.008194006979465485, + "reward_std": 0.21567228436470032, + "rewards/cosine_scaled_reward/mean": -0.008194014430046082, + "rewards/cosine_scaled_reward/std": 0.463446706533432, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1759.484375, + "completions/mean_terminated_length": 1076.157958984375, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24985821545124054, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 15363396.0, + "reward": -0.16068750619888306, + "reward_std": 0.22599664330482483, + "rewards/cosine_scaled_reward/mean": -0.16068752110004425, + "rewards/cosine_scaled_reward/std": 0.304392009973526, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 1842.859375, + "completions/mean_terminated_length": 1110.21435546875, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21972927451133728, + "learning_rate": 9.381311511432658e-07, + "loss": -0.0, + "num_tokens": 15492435.0, + "reward": -0.29198482632637024, + "reward_std": 0.17300401628017426, + "rewards/cosine_scaled_reward/mean": -0.29198482632637024, + "rewards/cosine_scaled_reward/std": 0.21628034114837646, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1694.578125, + "completions/mean_terminated_length": 1064.565185546875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24753950536251068, + "learning_rate": 9.36531953618799e-07, + "loss": -0.0, + "num_tokens": 15611240.0, + "reward": 0.04859344661235809, + "reward_std": 0.31105202436447144, + "rewards/cosine_scaled_reward/mean": 0.04859344661235809, + "rewards/cosine_scaled_reward/std": 0.4569285809993744, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 1917.5625, + "completions/mean_terminated_length": 1004.5, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23949742317199707, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0, + "num_tokens": 15744668.0, + "reward": -0.27834638953208923, + "reward_std": 0.16836056113243103, + "rewards/cosine_scaled_reward/mean": -0.27834638953208923, + "rewards/cosine_scaled_reward/std": 0.20021934807300568, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1502.0, + "completions/mean_length": 1725.796875, + "completions/mean_terminated_length": 902.388916015625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23063035309314728, + "learning_rate": 9.332771203643714e-07, + "loss": -0.0, + "num_tokens": 15865623.0, + "reward": -0.19732065498828888, + "reward_std": 0.19462591409683228, + "rewards/cosine_scaled_reward/mean": -0.19732065498828888, + "rewards/cosine_scaled_reward/std": 0.2627345323562622, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1640.8125, + "completions/mean_terminated_length": 863.45458984375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29630133509635925, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0, + "num_tokens": 15980371.0, + "reward": -0.07768938690423965, + "reward_std": 0.2543257176876068, + "rewards/cosine_scaled_reward/mean": -0.07768939435482025, + "rewards/cosine_scaled_reward/std": 0.4248148798942566, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 1570.890625, + "completions/mean_terminated_length": 826.5999755859375, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2735442817211151, + "learning_rate": 9.299475664759068e-07, + "loss": -0.0, + "num_tokens": 16091972.0, + "reward": -0.1057564914226532, + "reward_std": 0.32137495279312134, + "rewards/cosine_scaled_reward/mean": -0.105756476521492, + "rewards/cosine_scaled_reward/std": 0.4788062870502472, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1971.34375, + "completions/mean_terminated_length": 1347.1429443359375, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23230750858783722, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0, + "num_tokens": 16229362.0, + "reward": -0.21333375573158264, + "reward_std": 0.1880394071340561, + "rewards/cosine_scaled_reward/mean": -0.21333375573158264, + "rewards/cosine_scaled_reward/std": 0.2557979226112366, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1682.984375, + "completions/mean_terminated_length": 1113.5599365234375, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2776358127593994, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0, + "num_tokens": 16347641.0, + "reward": -0.07218431681394577, + "reward_std": 0.19744814932346344, + "rewards/cosine_scaled_reward/mean": -0.07218432426452637, + "rewards/cosine_scaled_reward/std": 0.41042155027389526, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 1269.171875, + "completions/mean_terminated_length": 736.2894897460938, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30510956048965454, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0, + "num_tokens": 16439340.0, + "reward": 0.1377476304769516, + "reward_std": 0.25976449251174927, + "rewards/cosine_scaled_reward/mean": 0.1377476155757904, + "rewards/cosine_scaled_reward/std": 0.4923737347126007, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 1624.203125, + "completions/mean_terminated_length": 917.875, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25474753975868225, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0, + "num_tokens": 16553961.0, + "reward": -0.04156734049320221, + "reward_std": 0.27987948060035706, + "rewards/cosine_scaled_reward/mean": -0.04156734049320221, + "rewards/cosine_scaled_reward/std": 0.4557124078273773, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1832.625, + "completions/mean_terminated_length": 1063.4285888671875, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2200661152601242, + "learning_rate": 9.213010742252327e-07, + "loss": -0.0, + "num_tokens": 16681857.0, + "reward": -0.2795522212982178, + "reward_std": 0.16735097765922546, + "rewards/cosine_scaled_reward/mean": -0.2795522212982178, + "rewards/cosine_scaled_reward/std": 0.22360830008983612, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 1547.90625, + "completions/mean_terminated_length": 981.1333618164062, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.247065007686615, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0, + "num_tokens": 16792235.0, + "reward": -0.1421782374382019, + "reward_std": 0.25017279386520386, + "rewards/cosine_scaled_reward/mean": -0.1421782374382019, + "rewards/cosine_scaled_reward/std": 0.3903765082359314, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1898.375, + "completions/mean_terminated_length": 1177.45458984375, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25471416115760803, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0, + "num_tokens": 16924371.0, + "reward": -0.24234679341316223, + "reward_std": 0.15713179111480713, + "rewards/cosine_scaled_reward/mean": -0.24234679341316223, + "rewards/cosine_scaled_reward/std": 0.17467617988586426, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1577.625, + "completions/mean_terminated_length": 1044.533447265625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2628695070743561, + "learning_rate": 9.158953424711624e-07, + "loss": -0.0, + "num_tokens": 17035563.0, + "reward": -0.12413343787193298, + "reward_std": 0.20063763856887817, + "rewards/cosine_scaled_reward/mean": -0.12413343787193298, + "rewards/cosine_scaled_reward/std": 0.5006609559059143, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1405.125, + "completions/mean_terminated_length": 993.025634765625, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2682877779006958, + "learning_rate": 9.140576474687263e-07, + "loss": -0.0, + "num_tokens": 17136051.0, + "reward": -0.02423717826604843, + "reward_std": 0.2661462128162384, + "rewards/cosine_scaled_reward/mean": -0.02423717826604843, + "rewards/cosine_scaled_reward/std": 0.502265214920044, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1763.515625, + "completions/mean_terminated_length": 1347.7308349609375, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24265550076961517, + "learning_rate": 9.122022088101613e-07, + "loss": -0.0, + "num_tokens": 17259420.0, + "reward": -0.23560766875743866, + "reward_std": 0.22989924252033234, + "rewards/cosine_scaled_reward/mean": -0.23560766875743866, + "rewards/cosine_scaled_reward/std": 0.28772976994514465, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 1600.671875, + "completions/mean_terminated_length": 1153.34375, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30536147952079773, + "learning_rate": 9.103291169269299e-07, + "loss": -0.0, + "num_tokens": 17372679.0, + "reward": -0.23412726819515228, + "reward_std": 0.226594477891922, + "rewards/cosine_scaled_reward/mean": -0.2341272532939911, + "rewards/cosine_scaled_reward/std": 0.2685011625289917, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 1708.0625, + "completions/mean_terminated_length": 1012.0, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2859592139720917, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0, + "num_tokens": 17493483.0, + "reward": -0.11928378790616989, + "reward_std": 0.2819562554359436, + "rewards/cosine_scaled_reward/mean": -0.11928380280733109, + "rewards/cosine_scaled_reward/std": 0.41741910576820374, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 1277.78125, + "completions/mean_terminated_length": 845.707275390625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.310493141412735, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0, + "num_tokens": 17585205.0, + "reward": 0.009949762374162674, + "reward_std": 0.32572609186172485, + "rewards/cosine_scaled_reward/mean": 0.009949766099452972, + "rewards/cosine_scaled_reward/std": 0.5299619436264038, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1550.625, + "completions/mean_terminated_length": 986.9334106445312, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2404046207666397, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 17695061.0, + "reward": -0.17625686526298523, + "reward_std": 0.2529022991657257, + "rewards/cosine_scaled_reward/mean": -0.17625686526298523, + "rewards/cosine_scaled_reward/std": 0.3359045386314392, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 1741.703125, + "completions/mean_terminated_length": 1156.95458984375, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2498754858970642, + "learning_rate": 9.026620557966279e-07, + "loss": -0.0, + "num_tokens": 17817314.0, + "reward": -0.26471418142318726, + "reward_std": 0.2048022449016571, + "rewards/cosine_scaled_reward/mean": -0.26471418142318726, + "rewards/cosine_scaled_reward/std": 0.2656060457229614, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1386.0625, + "completions/mean_terminated_length": 903.0270385742188, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2538217306137085, + "learning_rate": 9.007020842191634e-07, + "loss": -0.0, + "num_tokens": 17917206.0, + "reward": -0.10874275863170624, + "reward_std": 0.24236595630645752, + "rewards/cosine_scaled_reward/mean": -0.10874275863170624, + "rewards/cosine_scaled_reward/std": 0.3927372395992279, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1735.96875, + "completions/mean_terminated_length": 1140.272705078125, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23916038870811462, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0, + "num_tokens": 18040204.0, + "reward": -0.20906513929367065, + "reward_std": 0.2755752205848694, + "rewards/cosine_scaled_reward/mean": -0.20906512439250946, + "rewards/cosine_scaled_reward/std": 0.38517922163009644, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1379.359375, + "completions/mean_terminated_length": 978.1749877929688, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30970829725265503, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0, + "num_tokens": 18138987.0, + "reward": -0.14114701747894287, + "reward_std": 0.3519541621208191, + "rewards/cosine_scaled_reward/mean": -0.14114701747894287, + "rewards/cosine_scaled_reward/std": 0.39396560192108154, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1497.328125, + "completions/mean_terminated_length": 1011.441162109375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.17257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2874428331851959, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0, + "num_tokens": 18245496.0, + "reward": -0.04711150377988815, + "reward_std": 0.33344799280166626, + "rewards/cosine_scaled_reward/mean": -0.04711151123046875, + "rewards/cosine_scaled_reward/std": 0.41477611660957336, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 1777.140625, + "completions/mean_terminated_length": 964.5625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.1737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28282323479652405, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0, + "num_tokens": 18369897.0, + "reward": -0.2543114423751831, + "reward_std": 0.18715068697929382, + "rewards/cosine_scaled_reward/mean": -0.2543114423751831, + "rewards/cosine_scaled_reward/std": 0.19382856786251068, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 1564.0625, + "completions/mean_terminated_length": 900.888916015625, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.17485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27684876322746277, + "learning_rate": 8.906477750432903e-07, + "loss": -0.0, + "num_tokens": 18481141.0, + "reward": -0.1415693461894989, + "reward_std": 0.23039600253105164, + "rewards/cosine_scaled_reward/mean": -0.1415693461894989, + "rewards/cosine_scaled_reward/std": 0.2940608859062195, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1829.328125, + "completions/mean_terminated_length": 1224.7647705078125, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24094167351722717, + "learning_rate": 8.88586709003076e-07, + "loss": -0.0, + "num_tokens": 18609282.0, + "reward": -0.2521882653236389, + "reward_std": 0.20982292294502258, + "rewards/cosine_scaled_reward/mean": -0.2521882653236389, + "rewards/cosine_scaled_reward/std": 0.23373161256313324, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 1517.765625, + "completions/mean_terminated_length": 916.8333740234375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.17714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2674770653247833, + "learning_rate": 8.865091407243394e-07, + "loss": -0.0, + "num_tokens": 18717043.0, + "reward": -0.028832588344812393, + "reward_std": 0.22500035166740417, + "rewards/cosine_scaled_reward/mean": -0.028832584619522095, + "rewards/cosine_scaled_reward/std": 0.4698766767978668, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 1731.453125, + "completions/mean_terminated_length": 781.8125, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.1782857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23764768242835999, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0, + "num_tokens": 18837960.0, + "reward": -0.10049945116043091, + "reward_std": 0.2521243393421173, + "rewards/cosine_scaled_reward/mean": -0.10049945116043091, + "rewards/cosine_scaled_reward/std": 0.4200229048728943, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1531.1875, + "completions/mean_terminated_length": 1014.375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.17942857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28145694732666016, + "learning_rate": 8.823049032816478e-07, + "loss": -0.0, + "num_tokens": 18945916.0, + "reward": -0.22566190361976624, + "reward_std": 0.19013158977031708, + "rewards/cosine_scaled_reward/mean": -0.22566190361976624, + "rewards/cosine_scaled_reward/std": 0.24779614806175232, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 1425.203125, + "completions/mean_terminated_length": 909.1714477539062, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.18057142857142858, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.24377204477787018, + "learning_rate": 8.801784390262943e-07, + "loss": -0.0, + "num_tokens": 19047249.0, + "reward": -0.021197691559791565, + "reward_std": 0.22868266701698303, + "rewards/cosine_scaled_reward/mean": -0.021197684109210968, + "rewards/cosine_scaled_reward/std": 0.46860653162002563, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 1600.4375, + "completions/mean_terminated_length": 1093.2000732421875, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.18171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2479163259267807, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0, + "num_tokens": 19161357.0, + "reward": -0.23690757155418396, + "reward_std": 0.20615912973880768, + "rewards/cosine_scaled_reward/mean": -0.23690758645534515, + "rewards/cosine_scaled_reward/std": 0.32988741993904114, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 1527.375, + "completions/mean_terminated_length": 937.3333740234375, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.18285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2751549184322357, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0, + "num_tokens": 19270693.0, + "reward": -0.12134292721748352, + "reward_std": 0.2621082067489624, + "rewards/cosine_scaled_reward/mean": -0.12134292721748352, + "rewards/cosine_scaled_reward/std": 0.4263574779033661, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1419.484375, + "completions/mean_terminated_length": 989.4473876953125, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2905498445034027, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 19371532.0, + "reward": -0.1314084678888321, + "reward_std": 0.25361165404319763, + "rewards/cosine_scaled_reward/mean": -0.1314084678888321, + "rewards/cosine_scaled_reward/std": 0.36607682704925537, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1584.5, + "completions/mean_terminated_length": 949.3333129882812, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.18514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3278505206108093, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0, + "num_tokens": 19483244.0, + "reward": -0.16470149159431458, + "reward_std": 0.26964259147644043, + "rewards/cosine_scaled_reward/mean": -0.16470149159431458, + "rewards/cosine_scaled_reward/std": 0.31499552726745605, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 1439.6875, + "completions/mean_terminated_length": 868.242431640625, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.18628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29175421595573425, + "learning_rate": 8.693068314414344e-07, + "loss": -0.0, + "num_tokens": 19586568.0, + "reward": 0.10278680920600891, + "reward_std": 0.271634042263031, + "rewards/cosine_scaled_reward/mean": 0.10278680920600891, + "rewards/cosine_scaled_reward/std": 0.4813632071018219, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 1406.703125, + "completions/mean_terminated_length": 995.6154174804688, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.18742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26038941740989685, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0, + "num_tokens": 19687125.0, + "reward": -0.08026184141635895, + "reward_std": 0.21900159120559692, + "rewards/cosine_scaled_reward/mean": -0.08026183396577835, + "rewards/cosine_scaled_reward/std": 0.4170342683792114, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 1438.921875, + "completions/mean_terminated_length": 994.45947265625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.18857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29659712314605713, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0, + "num_tokens": 19790632.0, + "reward": -0.12293928861618042, + "reward_std": 0.23739376664161682, + "rewards/cosine_scaled_reward/mean": -0.12293929606676102, + "rewards/cosine_scaled_reward/std": 0.3927924335002899, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1697.765625, + "completions/mean_terminated_length": 1073.434814453125, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "epoch": 0.18971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21795569360256195, + "learning_rate": 8.625962667065487e-07, + "loss": -0.0, + "num_tokens": 19910865.0, + "reward": -0.20583154261112213, + "reward_std": 0.2378866970539093, + "rewards/cosine_scaled_reward/mean": -0.20583152770996094, + "rewards/cosine_scaled_reward/std": 0.26525840163230896, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 1406.890625, + "completions/mean_terminated_length": 995.923095703125, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.19085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583286166191101, + "learning_rate": 8.603287946810513e-07, + "loss": -0.0, + "num_tokens": 20012450.0, + "reward": -0.14853140711784363, + "reward_std": 0.23831486701965332, + "rewards/cosine_scaled_reward/mean": -0.14853140711784363, + "rewards/cosine_scaled_reward/std": 0.2794221341609955, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1574.921875, + "completions/mean_terminated_length": 1038.7667236328125, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2431253343820572, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0, + "num_tokens": 20124085.0, + "reward": -0.07713659107685089, + "reward_std": 0.2686954736709595, + "rewards/cosine_scaled_reward/mean": -0.07713659107685089, + "rewards/cosine_scaled_reward/std": 0.37947362661361694, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1507.90625, + "completions/mean_terminated_length": 1161.6923828125, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "epoch": 0.19314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23702675104141235, + "learning_rate": 8.557485869176825e-07, + "loss": -0.0, + "num_tokens": 20231215.0, + "reward": 0.20358076691627502, + "reward_std": 0.2683357000350952, + "rewards/cosine_scaled_reward/mean": 0.20358076691627502, + "rewards/cosine_scaled_reward/std": 0.5625549554824829, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1290.53125, + "completions/mean_terminated_length": 836.0499877929688, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.19428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2915634512901306, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0, + "num_tokens": 20323593.0, + "reward": -0.04663477838039398, + "reward_std": 0.1683385670185089, + "rewards/cosine_scaled_reward/mean": -0.04663477838039398, + "rewards/cosine_scaled_reward/std": 0.432047039270401, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 1461.703125, + "completions/mean_terminated_length": 875.40625, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.19542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2520189881324768, + "learning_rate": 8.511087728614862e-07, + "loss": -0.0, + "num_tokens": 20427534.0, + "reward": -0.03435331583023071, + "reward_std": 0.18240094184875488, + "rewards/cosine_scaled_reward/mean": -0.034353308379650116, + "rewards/cosine_scaled_reward/std": 0.4340380132198334, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1483.359375, + "completions/mean_terminated_length": 1071.3243408203125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.19657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31458213925361633, + "learning_rate": 8.487667956935087e-07, + "loss": -0.0, + "num_tokens": 20533085.0, + "reward": 0.1847388744354248, + "reward_std": 0.20619311928749084, + "rewards/cosine_scaled_reward/mean": 0.18473894894123077, + "rewards/cosine_scaled_reward/std": 0.512468159198761, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1113.96875, + "completions/mean_terminated_length": 689.4091186523438, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.1977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3380848467350006, + "learning_rate": 8.464102570534061e-07, + "loss": -0.0, + "num_tokens": 20615691.0, + "reward": -0.05022401362657547, + "reward_std": 0.2543797492980957, + "rewards/cosine_scaled_reward/mean": -0.05022402107715607, + "rewards/cosine_scaled_reward/std": 0.38979703187942505, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1151.390625, + "completions/mean_terminated_length": 985.3518676757812, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.19885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2737923562526703, + "learning_rate": 8.440392717955475e-07, + "loss": -0.0, + "num_tokens": 20699716.0, + "reward": -0.05732875317335129, + "reward_std": 0.2915908694267273, + "rewards/cosine_scaled_reward/mean": -0.05732874572277069, + "rewards/cosine_scaled_reward/std": 0.4477607011795044, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1481.765625, + "completions/mean_terminated_length": 1068.567626953125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26141369342803955, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0, + "num_tokens": 20805373.0, + "reward": -0.02904359996318817, + "reward_std": 0.24616873264312744, + "rewards/cosine_scaled_reward/mean": -0.02904359996318817, + "rewards/cosine_scaled_reward/std": 0.45150378346443176, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 1445.53125, + "completions/mean_terminated_length": 913.941162109375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.20114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.314208984375, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 20909055.0, + "reward": -0.165739506483078, + "reward_std": 0.2986479103565216, + "rewards/cosine_scaled_reward/mean": -0.165739506483078, + "rewards/cosine_scaled_reward/std": 0.3703363239765167, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1427.890625, + "completions/mean_terminated_length": 1003.6052856445312, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.2022857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2828216254711151, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0, + "num_tokens": 21010664.0, + "reward": -0.07454323768615723, + "reward_std": 0.23275166749954224, + "rewards/cosine_scaled_reward/mean": -0.07454322278499603, + "rewards/cosine_scaled_reward/std": 0.3976919949054718, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1304.5, + "completions/mean_terminated_length": 915.047607421875, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.20342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28618043661117554, + "learning_rate": 8.344131861991828e-07, + "loss": -0.0, + "num_tokens": 21105688.0, + "reward": 0.002464752644300461, + "reward_std": 0.3809230327606201, + "rewards/cosine_scaled_reward/mean": 0.002464751712977886, + "rewards/cosine_scaled_reward/std": 0.46308550238609314, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1658.5, + "completions/mean_terminated_length": 1050.8800048828125, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "epoch": 0.20457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250982403755188, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0, + "num_tokens": 21222664.0, + "reward": -0.18153682351112366, + "reward_std": 0.2734690308570862, + "rewards/cosine_scaled_reward/mean": -0.18153685331344604, + "rewards/cosine_scaled_reward/std": 0.33050045371055603, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1288.34375, + "completions/mean_terminated_length": 943.0454711914062, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.2057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3166482150554657, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0, + "num_tokens": 21316294.0, + "reward": 0.20186525583267212, + "reward_std": 0.31781116127967834, + "rewards/cosine_scaled_reward/mean": 0.20186525583267212, + "rewards/cosine_scaled_reward/std": 0.49267733097076416, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1591.796875, + "completions/mean_terminated_length": 925.0385131835938, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.20685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26195216178894043, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 21429641.0, + "reward": -0.060104113072156906, + "reward_std": 0.23563489317893982, + "rewards/cosine_scaled_reward/mean": -0.06010409817099571, + "rewards/cosine_scaled_reward/std": 0.43010979890823364, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 1365.421875, + "completions/mean_terminated_length": 927.871826171875, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551879286766052, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0, + "num_tokens": 21526820.0, + "reward": -0.15826305747032166, + "reward_std": 0.24291284382343292, + "rewards/cosine_scaled_reward/mean": -0.15826307237148285, + "rewards/cosine_scaled_reward/std": 0.30778464674949646, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1320.515625, + "completions/mean_terminated_length": 912.4146118164062, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.20914285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32218796014785767, + "learning_rate": 8.220696016880687e-07, + "loss": -0.0, + "num_tokens": 21621949.0, + "reward": -0.07413223385810852, + "reward_std": 0.35920435190200806, + "rewards/cosine_scaled_reward/mean": -0.07413223385810852, + "rewards/cosine_scaled_reward/std": 0.45890137553215027, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1465.71875, + "completions/mean_terminated_length": 1012.8333129882812, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.2102857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27700135111808777, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0, + "num_tokens": 21727107.0, + "reward": -0.158505380153656, + "reward_std": 0.18604165315628052, + "rewards/cosine_scaled_reward/mean": -0.158505380153656, + "rewards/cosine_scaled_reward/std": 0.29056471586227417, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1286.8125, + "completions/mean_terminated_length": 940.8182373046875, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.21142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2607719898223877, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0, + "num_tokens": 21819647.0, + "reward": -0.28775715827941895, + "reward_std": 0.19134438037872314, + "rewards/cosine_scaled_reward/mean": -0.28775715827941895, + "rewards/cosine_scaled_reward/std": 0.21350952982902527, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1825.0, + "completions/mean_length": 1480.09375, + "completions/mean_terminated_length": 1065.6756591796875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.21257142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2575705349445343, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 21925069.0, + "reward": -0.13343556225299835, + "reward_std": 0.2557746171951294, + "rewards/cosine_scaled_reward/mean": -0.13343556225299835, + "rewards/cosine_scaled_reward/std": 0.36808857321739197, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1347.71875, + "completions/mean_terminated_length": 1114.291748046875, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.21371428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31024712324142456, + "learning_rate": 8.119553365707802e-07, + "loss": -0.0, + "num_tokens": 22021747.0, + "reward": -0.09627380967140198, + "reward_std": 0.2472851276397705, + "rewards/cosine_scaled_reward/mean": -0.09627379477024078, + "rewards/cosine_scaled_reward/std": 0.41195833683013916, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1774.140625, + "completions/mean_terminated_length": 1251.3182373046875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.21485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2481517493724823, + "learning_rate": 8.093945422764069e-07, + "loss": -0.0, + "num_tokens": 22147092.0, + "reward": -0.20224528014659882, + "reward_std": 0.2598743736743927, + "rewards/cosine_scaled_reward/mean": -0.20224529504776, + "rewards/cosine_scaled_reward/std": 0.33939501643180847, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1234.328125, + "completions/mean_terminated_length": 808.1190795898438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31437209248542786, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0, + "num_tokens": 22235377.0, + "reward": -0.09877841919660568, + "reward_std": 0.2865467667579651, + "rewards/cosine_scaled_reward/mean": -0.09877842664718628, + "rewards/cosine_scaled_reward/std": 0.4444861114025116, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1189.3125, + "completions/mean_terminated_length": 1011.0943603515625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.21714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28301987051963806, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0, + "num_tokens": 22321261.0, + "reward": -0.028003819286823273, + "reward_std": 0.27996310591697693, + "rewards/cosine_scaled_reward/mean": -0.028003819286823273, + "rewards/cosine_scaled_reward/std": 0.4598979353904724, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1061.140625, + "completions/mean_terminated_length": 1012.6065063476562, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.21828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31064528226852417, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 22399462.0, + "reward": 0.07088040560483932, + "reward_std": 0.3638381361961365, + "rewards/cosine_scaled_reward/mean": 0.07088041305541992, + "rewards/cosine_scaled_reward/std": 0.5184580683708191, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1542.21875, + "completions/mean_terminated_length": 1258.48779296875, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.21942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2742583751678467, + "learning_rate": 7.990261971595048e-07, + "loss": -0.0, + "num_tokens": 22509460.0, + "reward": -0.14651048183441162, + "reward_std": 0.2414294183254242, + "rewards/cosine_scaled_reward/mean": -0.14651048183441162, + "rewards/cosine_scaled_reward/std": 0.3039136528968811, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1717.8125, + "completions/mean_terminated_length": 1202.719970703125, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.22057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24609725177288055, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0, + "num_tokens": 22630544.0, + "reward": -0.28856799006462097, + "reward_std": 0.14614446461200714, + "rewards/cosine_scaled_reward/mean": -0.28856799006462097, + "rewards/cosine_scaled_reward/std": 0.17294423282146454, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1692.546875, + "completions/mean_terminated_length": 1058.9130859375, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "epoch": 0.22171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27539438009262085, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0, + "num_tokens": 22750227.0, + "reward": -0.10590282082557678, + "reward_std": 0.25362446904182434, + "rewards/cosine_scaled_reward/mean": -0.10590282082557678, + "rewards/cosine_scaled_reward/std": 0.36822667717933655, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1410.09375, + "completions/mean_terminated_length": 1120.1363525390625, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "epoch": 0.22285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23645445704460144, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0, + "num_tokens": 22851617.0, + "reward": -0.12888561189174652, + "reward_std": 0.32565274834632874, + "rewards/cosine_scaled_reward/mean": -0.12888562679290771, + "rewards/cosine_scaled_reward/std": 0.3842463195323944, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1625.234375, + "completions/mean_terminated_length": 1146.10009765625, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27093231678009033, + "learning_rate": 7.884636689049422e-07, + "loss": -0.0, + "num_tokens": 22967224.0, + "reward": -0.1617402583360672, + "reward_std": 0.3036938011646271, + "rewards/cosine_scaled_reward/mean": -0.1617402583360672, + "rewards/cosine_scaled_reward/std": 0.390837699174881, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1099.96875, + "completions/mean_terminated_length": 924.4074096679688, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.22514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31926214694976807, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0, + "num_tokens": 23047990.0, + "reward": 0.09089304506778717, + "reward_std": 0.40348750352859497, + "rewards/cosine_scaled_reward/mean": 0.09089304506778717, + "rewards/cosine_scaled_reward/std": 0.5607035756111145, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1312.96875, + "completions/mean_terminated_length": 1125.60791015625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.22628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2919371426105499, + "learning_rate": 7.831121542179086e-07, + "loss": -0.0, + "num_tokens": 23143524.0, + "reward": 0.0047197043895721436, + "reward_std": 0.3408518433570862, + "rewards/cosine_scaled_reward/mean": 0.004719719290733337, + "rewards/cosine_scaled_reward/std": 0.46544134616851807, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1417.171875, + "completions/mean_terminated_length": 1224.0611572265625, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.22742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24992844462394714, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0, + "num_tokens": 23245727.0, + "reward": -0.19424019753932953, + "reward_std": 0.28145354986190796, + "rewards/cosine_scaled_reward/mean": -0.19424019753932953, + "rewards/cosine_scaled_reward/std": 0.3362065255641937, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 1180.515625, + "completions/mean_terminated_length": 891.3541870117188, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.22857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2989206612110138, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0, + "num_tokens": 23331400.0, + "reward": 0.08669155836105347, + "reward_std": 0.3488098084926605, + "rewards/cosine_scaled_reward/mean": 0.08669155836105347, + "rewards/cosine_scaled_reward/std": 0.46097004413604736, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 1399.1875, + "completions/mean_terminated_length": 789.697021484375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.2297142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3482288122177124, + "learning_rate": 7.75e-07, + "loss": 0.0, + "num_tokens": 23431972.0, + "reward": 0.05170612782239914, + "reward_std": 0.33521372079849243, + "rewards/cosine_scaled_reward/mean": 0.05170612409710884, + "rewards/cosine_scaled_reward/std": 0.4809432625770569, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 1257.5, + "completions/mean_terminated_length": 871.4418334960938, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.23085714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24333854019641876, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0, + "num_tokens": 23522356.0, + "reward": 0.20302791893482208, + "reward_std": 0.24270620942115784, + "rewards/cosine_scaled_reward/mean": 0.20302791893482208, + "rewards/cosine_scaled_reward/std": 0.5547645688056946, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1545.421875, + "completions/mean_terminated_length": 1299.9766845703125, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24562042951583862, + "learning_rate": 7.695368466124296e-07, + "loss": -0.0, + "num_tokens": 23632679.0, + "reward": 0.07688053697347641, + "reward_std": 0.32062458992004395, + "rewards/cosine_scaled_reward/mean": 0.07688053697347641, + "rewards/cosine_scaled_reward/std": 0.5180152058601379, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1250.28125, + "completions/mean_terminated_length": 961.74462890625, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.23314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2952634394168854, + "learning_rate": 7.667891533457718e-07, + "loss": -0.0, + "num_tokens": 23722417.0, + "reward": 0.0316191166639328, + "reward_std": 0.23991048336029053, + "rewards/cosine_scaled_reward/mean": 0.0316191241145134, + "rewards/cosine_scaled_reward/std": 0.4419180452823639, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1239.6875, + "completions/mean_terminated_length": 923.3912963867188, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3470660448074341, + "learning_rate": 7.640308940816239e-07, + "loss": -0.0, + "num_tokens": 23812821.0, + "reward": 0.04175570607185364, + "reward_std": 0.32632672786712646, + "rewards/cosine_scaled_reward/mean": 0.04175570607185364, + "rewards/cosine_scaled_reward/std": 0.5073853135108948, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 1563.75, + "completions/mean_terminated_length": 1162.5142822265625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.23542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2750691771507263, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0, + "num_tokens": 23923693.0, + "reward": -0.1269976794719696, + "reward_std": 0.2818883955478668, + "rewards/cosine_scaled_reward/mean": -0.1269976794719696, + "rewards/cosine_scaled_reward/std": 0.3301773965358734, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1346.515625, + "completions/mean_terminated_length": 1072.021728515625, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.23657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34398096799850464, + "learning_rate": 7.584832158039378e-07, + "loss": -0.0, + "num_tokens": 24020470.0, + "reward": -0.11099155992269516, + "reward_std": 0.32174742221832275, + "rewards/cosine_scaled_reward/mean": -0.11099155247211456, + "rewards/cosine_scaled_reward/std": 0.4000038504600525, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1350.71875, + "completions/mean_terminated_length": 1206.0, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "epoch": 0.2377142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2667733430862427, + "learning_rate": 7.556940671764124e-07, + "loss": -0.0, + "num_tokens": 24117244.0, + "reward": -0.012698620557785034, + "reward_std": 0.27501654624938965, + "rewards/cosine_scaled_reward/mean": -0.01269860565662384, + "rewards/cosine_scaled_reward/std": 0.47749608755111694, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 1186.484375, + "completions/mean_terminated_length": 922.7550659179688, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.23885714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34667861461639404, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0, + "num_tokens": 24203091.0, + "reward": -0.132795050740242, + "reward_std": 0.2735438942909241, + "rewards/cosine_scaled_reward/mean": -0.132795050740242, + "rewards/cosine_scaled_reward/std": 0.3893483579158783, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1450.0625, + "completions/mean_terminated_length": 1197.5999755859375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21571174263954163, + "learning_rate": 7.500858306332172e-07, + "loss": -0.0, + "num_tokens": 24306703.0, + "reward": -0.06977479159832001, + "reward_std": 0.24265971779823303, + "rewards/cosine_scaled_reward/mean": -0.06977479159832001, + "rewards/cosine_scaled_reward/std": 0.45415669679641724, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1201.609375, + "completions/mean_terminated_length": 964.6199951171875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.24114285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2720986306667328, + "learning_rate": 7.472670160550848e-07, + "loss": -0.0, + "num_tokens": 24394846.0, + "reward": 0.0786014124751091, + "reward_std": 0.2013745754957199, + "rewards/cosine_scaled_reward/mean": 0.0786014199256897, + "rewards/cosine_scaled_reward/std": 0.4884081780910492, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1176.359375, + "completions/mean_terminated_length": 808.3333740234375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.2422857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3127840757369995, + "learning_rate": 7.444385869608921e-07, + "loss": -0.0, + "num_tokens": 24480613.0, + "reward": 0.11307461559772491, + "reward_std": 0.284263014793396, + "rewards/cosine_scaled_reward/mean": 0.11307463049888611, + "rewards/cosine_scaled_reward/std": 0.5329286456108093, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1114.03125, + "completions/mean_terminated_length": 776.2127685546875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.24342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343943327665329, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0, + "num_tokens": 24561775.0, + "reward": -0.10338220745325089, + "reward_std": 0.2921890914440155, + "rewards/cosine_scaled_reward/mean": -0.10338220745325089, + "rewards/cosine_scaled_reward/std": 0.34980201721191406, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1386.0625, + "completions/mean_terminated_length": 1039.3333740234375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.24457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26102328300476074, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0, + "num_tokens": 24662027.0, + "reward": 0.02548668347299099, + "reward_std": 0.3174683451652527, + "rewards/cosine_scaled_reward/mean": 0.025486690923571587, + "rewards/cosine_scaled_reward/std": 0.46307510137557983, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 1318.46875, + "completions/mean_terminated_length": 962.18603515625, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.24571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2819078266620636, + "learning_rate": 7.358969934210438e-07, + "loss": -0.0, + "num_tokens": 24756897.0, + "reward": -0.11348340660333633, + "reward_std": 0.1657339334487915, + "rewards/cosine_scaled_reward/mean": -0.11348340660333633, + "rewards/cosine_scaled_reward/std": 0.41132697463035583, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 971.234375, + "completions/mean_terminated_length": 839.0, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.24685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3146374225616455, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 24828336.0, + "reward": 0.09829875081777573, + "reward_std": 0.34463635087013245, + "rewards/cosine_scaled_reward/mean": 0.09829875826835632, + "rewards/cosine_scaled_reward/std": 0.5223532319068909, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1323.546875, + "completions/mean_terminated_length": 1017.6666870117188, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25747084617614746, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0, + "num_tokens": 24923395.0, + "reward": -0.08631986379623413, + "reward_std": 0.3201732039451599, + "rewards/cosine_scaled_reward/mean": -0.08631986379623413, + "rewards/cosine_scaled_reward/std": 0.41996634006500244, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1421.5, + "completions/mean_terminated_length": 1115.534912109375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.24914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24970035254955292, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0, + "num_tokens": 25025059.0, + "reward": -0.22788012027740479, + "reward_std": 0.22475574910640717, + "rewards/cosine_scaled_reward/mean": -0.22788012027740479, + "rewards/cosine_scaled_reward/std": 0.2934871315956116, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1120.609375, + "completions/mean_terminated_length": 948.870361328125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2502857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34460073709487915, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0, + "num_tokens": 25107090.0, + "reward": 0.02718304470181465, + "reward_std": 0.3376328647136688, + "rewards/cosine_scaled_reward/mean": 0.027183040976524353, + "rewards/cosine_scaled_reward/std": 0.5283166170120239, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1366.828125, + "completions/mean_terminated_length": 1034.162841796875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.25142857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4521820843219757, + "learning_rate": 7.214816693576234e-07, + "loss": -0.0, + "num_tokens": 25204871.0, + "reward": -0.25229814648628235, + "reward_std": 0.17562136054039001, + "rewards/cosine_scaled_reward/mean": -0.25229811668395996, + "rewards/cosine_scaled_reward/std": 0.19320644438266754, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1156.53125, + "completions/mean_terminated_length": 950.8077392578125, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.25257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26752790808677673, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 25289449.0, + "reward": 0.24696281552314758, + "reward_std": 0.273512065410614, + "rewards/cosine_scaled_reward/mean": 0.24696281552314758, + "rewards/cosine_scaled_reward/std": 0.46473291516304016, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1243.3125, + "completions/mean_terminated_length": 903.5556030273438, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.2537142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27195242047309875, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0, + "num_tokens": 25379149.0, + "reward": 0.007332861423492432, + "reward_std": 0.29589229822158813, + "rewards/cosine_scaled_reward/mean": 0.007332857698202133, + "rewards/cosine_scaled_reward/std": 0.48079609870910645, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1200.3125, + "completions/mean_terminated_length": 962.9599609375, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "epoch": 0.25485714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2661433219909668, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0, + "num_tokens": 25465705.0, + "reward": 0.03970642387866974, + "reward_std": 0.2005533128976822, + "rewards/cosine_scaled_reward/mean": 0.03970641642808914, + "rewards/cosine_scaled_reward/std": 0.5048101544380188, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1705.46875, + "completions/mean_terminated_length": 1383.697021484375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23623619973659515, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0, + "num_tokens": 25586263.0, + "reward": -0.07307912409305573, + "reward_std": 0.350577175617218, + "rewards/cosine_scaled_reward/mean": -0.07307912409305573, + "rewards/cosine_scaled_reward/std": 0.38458916544914246, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1527.640625, + "completions/mean_terminated_length": 1122.9166259765625, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.2571428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2600167393684387, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0, + "num_tokens": 25694624.0, + "reward": -0.18486955761909485, + "reward_std": 0.24510705471038818, + "rewards/cosine_scaled_reward/mean": -0.18486955761909485, + "rewards/cosine_scaled_reward/std": 0.29842856526374817, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1293.0, + "completions/mean_terminated_length": 1118.769287109375, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.2582857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24967192113399506, + "learning_rate": 7.039090644965509e-07, + "loss": -0.0, + "num_tokens": 25788016.0, + "reward": 0.10143648833036423, + "reward_std": 0.3550751805305481, + "rewards/cosine_scaled_reward/mean": 0.10143650323152542, + "rewards/cosine_scaled_reward/std": 0.48985999822616577, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 975.421875, + "completions/mean_terminated_length": 958.3968505859375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.25942857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33750462532043457, + "learning_rate": 7.009532063876148e-07, + "loss": -0.0, + "num_tokens": 25860827.0, + "reward": 0.017139945179224014, + "reward_std": 0.40727996826171875, + "rewards/cosine_scaled_reward/mean": 0.017139948904514313, + "rewards/cosine_scaled_reward/std": 0.4528072476387024, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1005.453125, + "completions/mean_terminated_length": 834.8544921875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.26057142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3340362310409546, + "learning_rate": 6.979899910323624e-07, + "loss": -0.0, + "num_tokens": 25935848.0, + "reward": 0.1363377869129181, + "reward_std": 0.31884267926216125, + "rewards/cosine_scaled_reward/mean": 0.1363377869129181, + "rewards/cosine_scaled_reward/std": 0.5562776923179626, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1308.875, + "completions/mean_terminated_length": 1019.6522216796875, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.26171428571428573, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2481510192155838, + "learning_rate": 6.950195628537299e-07, + "loss": -0.0, + "num_tokens": 26030280.0, + "reward": -0.0336291566491127, + "reward_std": 0.2131306231021881, + "rewards/cosine_scaled_reward/mean": -0.0336291640996933, + "rewards/cosine_scaled_reward/std": 0.4883540868759155, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1424.34375, + "completions/mean_terminated_length": 1119.7674560546875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.26285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24989557266235352, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0, + "num_tokens": 26131870.0, + "reward": -0.27840444445610046, + "reward_std": 0.18090233206748962, + "rewards/cosine_scaled_reward/mean": -0.27840444445610046, + "rewards/cosine_scaled_reward/std": 0.2319284826517105, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1420.328125, + "completions/mean_terminated_length": 1113.7906494140625, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25709542632102966, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 26234467.0, + "reward": -0.012329377233982086, + "reward_std": 0.3558858633041382, + "rewards/cosine_scaled_reward/mean": -0.012329380959272385, + "rewards/cosine_scaled_reward/std": 0.45383208990097046, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1477.65625, + "completions/mean_terminated_length": 1087.4210205078125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.2651428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26604732871055603, + "learning_rate": 6.860664508377001e-07, + "loss": -0.0, + "num_tokens": 26339365.0, + "reward": -0.18533703684806824, + "reward_std": 0.24220798909664154, + "rewards/cosine_scaled_reward/mean": -0.18533703684806824, + "rewards/cosine_scaled_reward/std": 0.26634126901626587, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1072.109375, + "completions/mean_terminated_length": 1024.11474609375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.2662857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26210692524909973, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0, + "num_tokens": 26418084.0, + "reward": -0.1599939614534378, + "reward_std": 0.3579375445842743, + "rewards/cosine_scaled_reward/mean": -0.1599939614534378, + "rewards/cosine_scaled_reward/std": 0.3679514527320862, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1287.546875, + "completions/mean_terminated_length": 889.2142944335938, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.2674285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30882614850997925, + "learning_rate": 6.800643086250121e-07, + "loss": -0.0, + "num_tokens": 26510503.0, + "reward": -0.1574883908033371, + "reward_std": 0.17980948090553284, + "rewards/cosine_scaled_reward/mean": -0.1574883908033371, + "rewards/cosine_scaled_reward/std": 0.35836631059646606, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1145.125, + "completions/mean_terminated_length": 936.769287109375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.26857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30261489748954773, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0, + "num_tokens": 26594351.0, + "reward": 0.1909978985786438, + "reward_std": 0.3115041255950928, + "rewards/cosine_scaled_reward/mean": 0.1909978985786438, + "rewards/cosine_scaled_reward/std": 0.5054126381874084, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1449.375, + "completions/mean_terminated_length": 1012.5405883789062, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.26971428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28838953375816345, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 26698399.0, + "reward": -0.11444643139839172, + "reward_std": 0.3462868928909302, + "rewards/cosine_scaled_reward/mean": -0.11444643884897232, + "rewards/cosine_scaled_reward/std": 0.4084509313106537, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1262.125, + "completions/mean_terminated_length": 1021.551025390625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.27085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3033871650695801, + "learning_rate": 6.710139192768694e-07, + "loss": -0.0, + "num_tokens": 26789303.0, + "reward": -0.05035819113254547, + "reward_std": 0.2872178554534912, + "rewards/cosine_scaled_reward/mean": -0.050358183681964874, + "rewards/cosine_scaled_reward/std": 0.5157716870307922, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1301.734375, + "completions/mean_terminated_length": 1092.780029296875, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26610657572746277, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0, + "num_tokens": 26883454.0, + "reward": 0.10226152092218399, + "reward_std": 0.3642864525318146, + "rewards/cosine_scaled_reward/mean": 0.10226152092218399, + "rewards/cosine_scaled_reward/std": 0.49199798703193665, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 1012.578125, + "completions/mean_terminated_length": 797.6792602539062, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.27314285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3098434805870056, + "learning_rate": 6.649505910711058e-07, + "loss": -0.0, + "num_tokens": 26958571.0, + "reward": 0.2893483638763428, + "reward_std": 0.21750710904598236, + "rewards/cosine_scaled_reward/mean": 0.2893483638763428, + "rewards/cosine_scaled_reward/std": 0.5735083818435669, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 1127.03125, + "completions/mean_terminated_length": 845.10205078125, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.2742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33869531750679016, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0, + "num_tokens": 27040949.0, + "reward": -0.2518009841442108, + "reward_std": 0.2073291540145874, + "rewards/cosine_scaled_reward/mean": -0.2518009841442108, + "rewards/cosine_scaled_reward/std": 0.26051101088523865, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 1248.578125, + "completions/mean_terminated_length": 1044.803955078125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.2754285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2926189601421356, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0, + "num_tokens": 27132074.0, + "reward": -0.18343190848827362, + "reward_std": 0.32297152280807495, + "rewards/cosine_scaled_reward/mean": -0.18343190848827362, + "rewards/cosine_scaled_reward/std": 0.3960045278072357, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1057.171875, + "completions/mean_terminated_length": 779.739990234375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.2765714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3697403073310852, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0, + "num_tokens": 27209245.0, + "reward": -0.13560537993907928, + "reward_std": 0.2509098947048187, + "rewards/cosine_scaled_reward/mean": -0.13560537993907928, + "rewards/cosine_scaled_reward/std": 0.42233115434646606, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 1294.9375, + "completions/mean_terminated_length": 1121.1539306640625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "epoch": 0.2777142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2539284825325012, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0, + "num_tokens": 27302953.0, + "reward": 0.006944652646780014, + "reward_std": 0.3980734050273895, + "rewards/cosine_scaled_reward/mean": 0.006944645196199417, + "rewards/cosine_scaled_reward/std": 0.4572637379169464, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1435.71875, + "completions/mean_terminated_length": 1068.3499755859375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.27885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28219878673553467, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0, + "num_tokens": 27405479.0, + "reward": -0.04507390409708023, + "reward_std": 0.2943881154060364, + "rewards/cosine_scaled_reward/mean": -0.04507390037178993, + "rewards/cosine_scaled_reward/std": 0.482650488615036, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1331.296875, + "completions/mean_terminated_length": 1005.5227661132812, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733215391635895, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0, + "num_tokens": 27501746.0, + "reward": 0.04791342094540596, + "reward_std": 0.34749698638916016, + "rewards/cosine_scaled_reward/mean": 0.047913409769535065, + "rewards/cosine_scaled_reward/std": 0.5028091669082642, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1310.046875, + "completions/mean_terminated_length": 1043.127685546875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.28114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2701246440410614, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0, + "num_tokens": 27596189.0, + "reward": -0.13987088203430176, + "reward_std": 0.3327594995498657, + "rewards/cosine_scaled_reward/mean": -0.13987088203430176, + "rewards/cosine_scaled_reward/std": 0.4108533263206482, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1433.5625, + "completions/mean_terminated_length": 1088.8780517578125, + "completions/min_length": 521.0, + "completions/min_terminated_length": 521.0, + "epoch": 0.2822857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2608485221862793, + "learning_rate": 6.404850645156841e-07, + "loss": -0.0, + "num_tokens": 27698385.0, + "reward": -0.19611218571662903, + "reward_std": 0.18159456551074982, + "rewards/cosine_scaled_reward/mean": -0.19611218571662903, + "rewards/cosine_scaled_reward/std": 0.18690702319145203, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 1016.078125, + "completions/mean_terminated_length": 824.9815063476562, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.2834285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33469828963279724, + "learning_rate": 6.374054580489873e-07, + "loss": -0.0, + "num_tokens": 27774342.0, + "reward": 0.20066902041435242, + "reward_std": 0.2608226537704468, + "rewards/cosine_scaled_reward/mean": 0.20066902041435242, + "rewards/cosine_scaled_reward/std": 0.5498367547988892, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1137.15625, + "completions/mean_terminated_length": 926.9615478515625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.2845714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.283346951007843, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0, + "num_tokens": 27858296.0, + "reward": 0.22508396208286285, + "reward_std": 0.32221734523773193, + "rewards/cosine_scaled_reward/mean": 0.22508396208286285, + "rewards/cosine_scaled_reward/std": 0.5403409600257874, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1258.484375, + "completions/mean_terminated_length": 1016.7958984375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.2857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3057456612586975, + "learning_rate": 6.31233615362752e-07, + "loss": -0.0, + "num_tokens": 27949463.0, + "reward": -0.161838099360466, + "reward_std": 0.3008255660533905, + "rewards/cosine_scaled_reward/mean": -0.1618381142616272, + "rewards/cosine_scaled_reward/std": 0.36034730076789856, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 924.796875, + "completions/mean_terminated_length": 849.9166870117188, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.28685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3387444019317627, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0, + "num_tokens": 28018754.0, + "reward": -0.024298980832099915, + "reward_std": 0.26890814304351807, + "rewards/cosine_scaled_reward/mean": -0.024298986420035362, + "rewards/cosine_scaled_reward/std": 0.42033475637435913, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 1181.65625, + "completions/mean_terminated_length": 868.2978515625, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31221455335617065, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0, + "num_tokens": 28104708.0, + "reward": -0.12287517637014389, + "reward_std": 0.16865523159503937, + "rewards/cosine_scaled_reward/mean": -0.12287518382072449, + "rewards/cosine_scaled_reward/std": 0.3217977285385132, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1170.125, + "completions/mean_terminated_length": 946.3529663085938, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.28914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3127641975879669, + "learning_rate": 6.219465344613258e-07, + "loss": -0.0, + "num_tokens": 28190628.0, + "reward": 0.004593905061483383, + "reward_std": 0.27023184299468994, + "rewards/cosine_scaled_reward/mean": 0.0045939162373542786, + "rewards/cosine_scaled_reward/std": 0.45060300827026367, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 1240.578125, + "completions/mean_terminated_length": 948.5318603515625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.29028571428571426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3329788148403168, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0, + "num_tokens": 28280561.0, + "reward": -0.10231998562812805, + "reward_std": 0.28730475902557373, + "rewards/cosine_scaled_reward/mean": -0.10232000052928925, + "rewards/cosine_scaled_reward/std": 0.39239707589149475, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1410.15625, + "completions/mean_terminated_length": 1027.4500732421875, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.2914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27967000007629395, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0, + "num_tokens": 28380867.0, + "reward": -0.22316259145736694, + "reward_std": 0.1437114179134369, + "rewards/cosine_scaled_reward/mean": -0.22316259145736694, + "rewards/cosine_scaled_reward/std": 0.14716070890426636, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 1163.859375, + "completions/mean_terminated_length": 916.2999877929688, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "epoch": 0.2925714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29270097613334656, + "learning_rate": 6.126278954320294e-07, + "loss": -0.0, + "num_tokens": 28466402.0, + "reward": -0.07603419572114944, + "reward_std": 0.25814926624298096, + "rewards/cosine_scaled_reward/mean": -0.07603420317173004, + "rewards/cosine_scaled_reward/std": 0.43036898970603943, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 1341.5625, + "completions/mean_terminated_length": 1065.1304931640625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "epoch": 0.2937142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26492658257484436, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0, + "num_tokens": 28563766.0, + "reward": 0.08429908752441406, + "reward_std": 0.3440818786621094, + "rewards/cosine_scaled_reward/mean": 0.08429908752441406, + "rewards/cosine_scaled_reward/std": 0.5146033763885498, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1407.96875, + "completions/mean_terminated_length": 1048.9267578125, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "epoch": 0.2948571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25829192996025085, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0, + "num_tokens": 28665100.0, + "reward": -0.17156042158603668, + "reward_std": 0.27311789989471436, + "rewards/cosine_scaled_reward/mean": -0.1715604066848755, + "rewards/cosine_scaled_reward/std": 0.30253204703330994, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1148.53125, + "completions/mean_terminated_length": 919.2549438476562, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30335742235183716, + "learning_rate": 6.032817857379256e-07, + "loss": -0.0, + "num_tokens": 28749382.0, + "reward": 0.042243435978889465, + "reward_std": 0.26714712381362915, + "rewards/cosine_scaled_reward/mean": 0.042243435978889465, + "rewards/cosine_scaled_reward/std": 0.4686411917209625, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1083.234375, + "completions/mean_terminated_length": 761.6458740234375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.29714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39401763677597046, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0, + "num_tokens": 28828653.0, + "reward": 0.07885205745697021, + "reward_std": 0.2136325240135193, + "rewards/cosine_scaled_reward/mean": 0.07885205745697021, + "rewards/cosine_scaled_reward/std": 0.4590488076210022, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1553.515625, + "completions/mean_terminated_length": 1117.2059326171875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.29828571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23382478952407837, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0, + "num_tokens": 28939382.0, + "reward": -0.21638496220111847, + "reward_std": 0.18316227197647095, + "rewards/cosine_scaled_reward/mean": -0.21638496220111847, + "rewards/cosine_scaled_reward/std": 0.21999679505825043, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 1150.203125, + "completions/mean_terminated_length": 1021.9464721679688, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.29942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31957077980041504, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0, + "num_tokens": 29024731.0, + "reward": -0.15519243478775024, + "reward_std": 0.25976234674453735, + "rewards/cosine_scaled_reward/mean": -0.15519244968891144, + "rewards/cosine_scaled_reward/std": 0.29580366611480713, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1284.109375, + "completions/mean_terminated_length": 1029.479248046875, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.30057142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.257870614528656, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0, + "num_tokens": 29117034.0, + "reward": -0.19904303550720215, + "reward_std": 0.21833446621894836, + "rewards/cosine_scaled_reward/mean": -0.19904303550720215, + "rewards/cosine_scaled_reward/std": 0.2711222171783447, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 1279.6875, + "completions/mean_terminated_length": 1044.48974609375, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.3017142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2747717797756195, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0, + "num_tokens": 29210278.0, + "reward": -0.036783367395401, + "reward_std": 0.32698529958724976, + "rewards/cosine_scaled_reward/mean": -0.0367833711206913, + "rewards/cosine_scaled_reward/std": 0.4024903178215027, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1140.796875, + "completions/mean_terminated_length": 972.7963256835938, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "epoch": 0.3028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30939391255378723, + "learning_rate": 5.845235626570683e-07, + "loss": -0.0, + "num_tokens": 29293225.0, + "reward": 0.012244641780853271, + "reward_std": 0.3513813316822052, + "rewards/cosine_scaled_reward/mean": 0.012244660407304764, + "rewards/cosine_scaled_reward/std": 0.47922924160957336, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1543.5, + "completions/mean_terminated_length": 1125.4857177734375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23532193899154663, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0, + "num_tokens": 29403313.0, + "reward": -0.16622336208820343, + "reward_std": 0.29864761233329773, + "rewards/cosine_scaled_reward/mean": -0.16622334718704224, + "rewards/cosine_scaled_reward/std": 0.34780678153038025, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 1511.796875, + "completions/mean_terminated_length": 1094.75, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.30514285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2672438621520996, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0, + "num_tokens": 29511564.0, + "reward": -0.2196415364742279, + "reward_std": 0.22792132198810577, + "rewards/cosine_scaled_reward/mean": -0.2196415364742279, + "rewards/cosine_scaled_reward/std": 0.35021698474884033, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1907.0, + "completions/mean_length": 1050.921875, + "completions/mean_terminated_length": 947.77587890625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.3062857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.308774471282959, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0, + "num_tokens": 29588999.0, + "reward": -0.11035098135471344, + "reward_std": 0.3422476053237915, + "rewards/cosine_scaled_reward/mean": -0.11035098135471344, + "rewards/cosine_scaled_reward/std": 0.42984655499458313, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1253.59375, + "completions/mean_terminated_length": 1010.4081420898438, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.30742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2676822543144226, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0, + "num_tokens": 29679837.0, + "reward": -0.11707413196563721, + "reward_std": 0.1972976177930832, + "rewards/cosine_scaled_reward/mean": -0.117074154317379, + "rewards/cosine_scaled_reward/std": 0.3723585903644562, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1307.8125, + "completions/mean_terminated_length": 1061.0833740234375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.30857142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28255271911621094, + "learning_rate": 5.688440441781398e-07, + "loss": -0.0, + "num_tokens": 29774857.0, + "reward": -0.08722042292356491, + "reward_std": 0.2875755727291107, + "rewards/cosine_scaled_reward/mean": -0.08722040057182312, + "rewards/cosine_scaled_reward/std": 0.46101436018943787, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1214.921875, + "completions/mean_terminated_length": 1022.673095703125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.3097142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2939779460430145, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0, + "num_tokens": 29862996.0, + "reward": 0.26210832595825195, + "reward_std": 0.2794266939163208, + "rewards/cosine_scaled_reward/mean": 0.26210832595825195, + "rewards/cosine_scaled_reward/std": 0.5501880645751953, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1447.5625, + "completions/mean_terminated_length": 1212.6087646484375, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "epoch": 0.31085714285714283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.270077645778656, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0, + "num_tokens": 29966888.0, + "reward": -0.07720675319433212, + "reward_std": 0.32648950815200806, + "rewards/cosine_scaled_reward/mean": -0.07720677554607391, + "rewards/cosine_scaled_reward/std": 0.46835580468177795, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1150.640625, + "completions/mean_terminated_length": 943.5577392578125, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2647186815738678, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0, + "num_tokens": 30050489.0, + "reward": 0.06313569843769073, + "reward_std": 0.3254297375679016, + "rewards/cosine_scaled_reward/mean": 0.06313569843769073, + "rewards/cosine_scaled_reward/std": 0.5027272701263428, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 911.875, + "completions/mean_terminated_length": 794.3448486328125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.31314285714285717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37198200821876526, + "learning_rate": 5.562829811526154e-07, + "loss": -0.0, + "num_tokens": 30118929.0, + "reward": 0.03341127932071686, + "reward_std": 0.31158989667892456, + "rewards/cosine_scaled_reward/mean": 0.033411286771297455, + "rewards/cosine_scaled_reward/std": 0.48580819368362427, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1294.109375, + "completions/mean_terminated_length": 975.800048828125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.3142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25744378566741943, + "learning_rate": 5.531415671340826e-07, + "loss": -0.0, + "num_tokens": 30211752.0, + "reward": 0.21001243591308594, + "reward_std": 0.402576744556427, + "rewards/cosine_scaled_reward/mean": 0.21001243591308594, + "rewards/cosine_scaled_reward/std": 0.5340982675552368, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1826.0, + "completions/mean_length": 1084.53125, + "completions/mean_terminated_length": 966.2105102539062, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.31542857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31013426184654236, + "learning_rate": 5.5e-07, + "loss": 0.0, + "num_tokens": 30291938.0, + "reward": 0.042711060494184494, + "reward_std": 0.38207319378852844, + "rewards/cosine_scaled_reward/mean": 0.042711060494184494, + "rewards/cosine_scaled_reward/std": 0.4666350483894348, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1227.75, + "completions/mean_terminated_length": 954.3333740234375, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "epoch": 0.31657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3113550841808319, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0, + "num_tokens": 30382218.0, + "reward": 0.07253848016262054, + "reward_std": 0.1982618272304535, + "rewards/cosine_scaled_reward/mean": 0.07253845036029816, + "rewards/cosine_scaled_reward/std": 0.5152764916419983, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1272.296875, + "completions/mean_terminated_length": 865.9761962890625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.3177142857142857, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2616003155708313, + "learning_rate": 5.437170188473847e-07, + "loss": -0.0, + "num_tokens": 30474309.0, + "reward": 0.08356653898954391, + "reward_std": 0.1530226469039917, + "rewards/cosine_scaled_reward/mean": 0.08356654644012451, + "rewards/cosine_scaled_reward/std": 0.5249122381210327, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1500.65625, + "completions/mean_terminated_length": 1251.8636474609375, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.31885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23652584850788116, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0, + "num_tokens": 30581391.0, + "reward": -0.07124869525432587, + "reward_std": 0.32038426399230957, + "rewards/cosine_scaled_reward/mean": -0.07124869525432587, + "rewards/cosine_scaled_reward/std": 0.43860507011413574, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1823.0, + "completions/mean_length": 1187.34375, + "completions/mean_terminated_length": 850.5652465820312, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.336769163608551, + "learning_rate": 5.37435262574394e-07, + "loss": -0.0, + "num_tokens": 30668053.0, + "reward": 0.10428585112094879, + "reward_std": 0.24089524149894714, + "rewards/cosine_scaled_reward/mean": 0.10428585112094879, + "rewards/cosine_scaled_reward/std": 0.5281394720077515, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 1572.578125, + "completions/mean_terminated_length": 998.7930908203125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.3211428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26516586542129517, + "learning_rate": 5.342952264838747e-07, + "loss": -0.0, + "num_tokens": 30780322.0, + "reward": -0.22128218412399292, + "reward_std": 0.2539688050746918, + "rewards/cosine_scaled_reward/mean": -0.22128218412399292, + "rewards/cosine_scaled_reward/std": 0.2776496410369873, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1883.0, + "completions/mean_length": 1154.703125, + "completions/mean_terminated_length": 1079.0, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.3222857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28156232833862305, + "learning_rate": 5.311559558218603e-07, + "loss": -0.0, + "num_tokens": 30864247.0, + "reward": -0.05536589026451111, + "reward_std": 0.24473221600055695, + "rewards/cosine_scaled_reward/mean": -0.055365875363349915, + "rewards/cosine_scaled_reward/std": 0.4620397686958313, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1669.65625, + "completions/mean_terminated_length": 1240.86669921875, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "epoch": 0.32342857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21421867609024048, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0, + "num_tokens": 30981553.0, + "reward": 0.00772450864315033, + "reward_std": 0.365454763174057, + "rewards/cosine_scaled_reward/mean": 0.00772450864315033, + "rewards/cosine_scaled_reward/std": 0.5393754839897156, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1037.25, + "completions/mean_terminated_length": 932.6896362304688, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.32457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.476616770029068, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0, + "num_tokens": 31058233.0, + "reward": -0.11384838074445724, + "reward_std": 0.30951282382011414, + "rewards/cosine_scaled_reward/mean": -0.11384837329387665, + "rewards/cosine_scaled_reward/std": 0.3668363690376282, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1596.0, + "completions/mean_length": 980.6875, + "completions/mean_terminated_length": 828.2142944335938, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.32571428571428573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35374680161476135, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0, + "num_tokens": 31131685.0, + "reward": -0.07238543033599854, + "reward_std": 0.31320473551750183, + "rewards/cosine_scaled_reward/mean": -0.07238544523715973, + "rewards/cosine_scaled_reward/std": 0.44502073526382446, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 1229.28125, + "completions/mean_terminated_length": 1059.3585205078125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.32685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3040845990180969, + "learning_rate": 5.186095868151436e-07, + "loss": -0.0, + "num_tokens": 31220791.0, + "reward": -0.01666315644979477, + "reward_std": 0.373945951461792, + "rewards/cosine_scaled_reward/mean": -0.01666315644979477, + "rewards/cosine_scaled_reward/std": 0.4552530348300934, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 1057.921875, + "completions/mean_terminated_length": 727.8958740234375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33932286500930786, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0, + "num_tokens": 31298650.0, + "reward": -0.04437921941280365, + "reward_std": 0.2586653232574463, + "rewards/cosine_scaled_reward/mean": -0.04437922313809395, + "rewards/cosine_scaled_reward/std": 0.4228661060333252, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 1318.125, + "completions/mean_terminated_length": 1132.0784912109375, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "epoch": 0.3291428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24589991569519043, + "learning_rate": 5.123449705004581e-07, + "loss": -0.0, + "num_tokens": 31394034.0, + "reward": -0.14538612961769104, + "reward_std": 0.30133792757987976, + "rewards/cosine_scaled_reward/mean": -0.14538611471652985, + "rewards/cosine_scaled_reward/std": 0.37525659799575806, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1191.203125, + "completions/mean_terminated_length": 993.4808349609375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.3302857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29120251536369324, + "learning_rate": 5.09215338910999e-07, + "loss": -0.0, + "num_tokens": 31481407.0, + "reward": 0.05579536408185959, + "reward_std": 0.3108929693698883, + "rewards/cosine_scaled_reward/mean": 0.05579536035656929, + "rewards/cosine_scaled_reward/std": 0.5062500238418579, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 992.15625, + "completions/mean_terminated_length": 841.3214721679688, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4412848949432373, + "learning_rate": 5.060876951083828e-07, + "loss": -0.0, + "num_tokens": 31554665.0, + "reward": 0.1182926595211029, + "reward_std": 0.34878551959991455, + "rewards/cosine_scaled_reward/mean": 0.1182926669716835, + "rewards/cosine_scaled_reward/std": 0.5200846195220947, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1220.203125, + "completions/mean_terminated_length": 1118.5438232421875, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.3325714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23901385068893433, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0, + "num_tokens": 31643182.0, + "reward": -0.005823981016874313, + "reward_std": 0.4199373722076416, + "rewards/cosine_scaled_reward/mean": -0.005823981016874313, + "rewards/cosine_scaled_reward/std": 0.48368990421295166, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1297.9375, + "completions/mean_terminated_length": 1026.6383056640625, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.33371428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2564057409763336, + "learning_rate": 4.998389805071536e-07, + "loss": -0.0, + "num_tokens": 31737938.0, + "reward": -0.1168045848608017, + "reward_std": 0.26728183031082153, + "rewards/cosine_scaled_reward/mean": -0.1168045848608017, + "rewards/cosine_scaled_reward/std": 0.35125845670700073, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1095.40625, + "completions/mean_terminated_length": 919.0, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.33485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30023491382598877, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0, + "num_tokens": 31818924.0, + "reward": -0.04689720273017883, + "reward_std": 0.28609341382980347, + "rewards/cosine_scaled_reward/mean": -0.04689720273017883, + "rewards/cosine_scaled_reward/std": 0.4900621771812439, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1580.125, + "completions/mean_terminated_length": 1167.2940673828125, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2665126323699951, + "learning_rate": 4.93600044896063e-07, + "loss": -0.0, + "num_tokens": 31930260.0, + "reward": -0.08336995542049408, + "reward_std": 0.31363779306411743, + "rewards/cosine_scaled_reward/mean": -0.08336995542049408, + "rewards/cosine_scaled_reward/std": 0.5037887692451477, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1488.453125, + "completions/mean_terminated_length": 1080.1351318359375, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.33714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27722999453544617, + "learning_rate": 4.904846243842949e-07, + "loss": -0.0, + "num_tokens": 32035937.0, + "reward": 0.07110725343227386, + "reward_std": 0.3471496105194092, + "rewards/cosine_scaled_reward/mean": 0.07110726088285446, + "rewards/cosine_scaled_reward/std": 0.4735586941242218, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 1372.890625, + "completions/mean_terminated_length": 1087.844482421875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.3382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3566659986972809, + "learning_rate": 4.873721045679706e-07, + "loss": -0.0, + "num_tokens": 32134450.0, + "reward": -0.13402147591114044, + "reward_std": 0.2937889099121094, + "rewards/cosine_scaled_reward/mean": -0.13402147591114044, + "rewards/cosine_scaled_reward/std": 0.3325452506542206, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1618.578125, + "completions/mean_terminated_length": 1239.676513671875, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.3394285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2620627284049988, + "learning_rate": 4.842626371469149e-07, + "loss": -0.0, + "num_tokens": 32249263.0, + "reward": -0.10356634855270386, + "reward_std": 0.3085135817527771, + "rewards/cosine_scaled_reward/mean": -0.10356632620096207, + "rewards/cosine_scaled_reward/std": 0.41743960976600647, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1220.3125, + "completions/mean_terminated_length": 1009.3333740234375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3405714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38996121287345886, + "learning_rate": 4.811563736721829e-07, + "loss": -0.0, + "num_tokens": 32338387.0, + "reward": -0.023439258337020874, + "reward_std": 0.2796996235847473, + "rewards/cosine_scaled_reward/mean": -0.02343926578760147, + "rewards/cosine_scaled_reward/std": 0.5068122744560242, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1268.8125, + "completions/mean_terminated_length": 1157.5, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.3417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2567848563194275, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0, + "num_tokens": 32430847.0, + "reward": -0.09949212521314621, + "reward_std": 0.3315163254737854, + "rewards/cosine_scaled_reward/mean": -0.0994921326637268, + "rewards/cosine_scaled_reward/std": 0.3953181803226471, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1375.375, + "completions/mean_terminated_length": 1132.0850830078125, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.34285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3365769386291504, + "learning_rate": 4.749540639777539e-07, + "loss": -0.0, + "num_tokens": 32530767.0, + "reward": -0.04265330731868744, + "reward_std": 0.19944772124290466, + "rewards/cosine_scaled_reward/mean": -0.04265330731868744, + "rewards/cosine_scaled_reward/std": 0.39511266350746155, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1274.828125, + "completions/mean_terminated_length": 972.2826538085938, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3056529760360718, + "learning_rate": 4.7185832004988133e-07, + "loss": -0.0, + "num_tokens": 32623156.0, + "reward": -0.020140450447797775, + "reward_std": 0.24108503758907318, + "rewards/cosine_scaled_reward/mean": -0.02014043927192688, + "rewards/cosine_scaled_reward/std": 0.3770294487476349, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1257.5, + "completions/mean_terminated_length": 923.7333374023438, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.34514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2761608958244324, + "learning_rate": 4.68766384637248e-07, + "loss": -0.0, + "num_tokens": 32714172.0, + "reward": 0.00733301043510437, + "reward_std": 0.31370484828948975, + "rewards/cosine_scaled_reward/mean": 0.007333017885684967, + "rewards/cosine_scaled_reward/std": 0.4660908877849579, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 1154.34375, + "completions/mean_terminated_length": 1026.6785888671875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.3462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28181061148643494, + "learning_rate": 4.656784084364238e-07, + "loss": -0.0, + "num_tokens": 32798778.0, + "reward": -0.0471949465572834, + "reward_std": 0.31843262910842896, + "rewards/cosine_scaled_reward/mean": -0.0471949428319931, + "rewards/cosine_scaled_reward/std": 0.42977795004844666, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 1131.421875, + "completions/mean_terminated_length": 941.188720703125, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.3474285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29874488711357117, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0, + "num_tokens": 32881573.0, + "reward": -0.10114389657974243, + "reward_std": 0.24575868248939514, + "rewards/cosine_scaled_reward/mean": -0.10114389657974243, + "rewards/cosine_scaled_reward/std": 0.40359967947006226, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1296.0625, + "completions/mean_terminated_length": 1122.5384521484375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.3485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27664560079574585, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0, + "num_tokens": 32974737.0, + "reward": -0.13854889571666718, + "reward_std": 0.25994938611984253, + "rewards/cosine_scaled_reward/mean": -0.13854891061782837, + "rewards/cosine_scaled_reward/std": 0.35616254806518555, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1035.921875, + "completions/mean_terminated_length": 1003.274169921875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.3497142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3285360634326935, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0, + "num_tokens": 33051604.0, + "reward": 0.14512689411640167, + "reward_std": 0.28848469257354736, + "rewards/cosine_scaled_reward/mean": 0.14512689411640167, + "rewards/cosine_scaled_reward/std": 0.5476776361465454, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1240.765625, + "completions/mean_terminated_length": 899.933349609375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.35085714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3270239233970642, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0, + "num_tokens": 33141517.0, + "reward": -0.08620665967464447, + "reward_std": 0.3766360580921173, + "rewards/cosine_scaled_reward/mean": -0.08620666712522507, + "rewards/cosine_scaled_reward/std": 0.4342641830444336, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1546.78125, + "completions/mean_terminated_length": 1181.027099609375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27663469314575195, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0, + "num_tokens": 33251111.0, + "reward": -0.17574885487556458, + "reward_std": 0.2633781433105469, + "rewards/cosine_scaled_reward/mean": -0.17574885487556458, + "rewards/cosine_scaled_reward/std": 0.38026031851768494, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1443.6875, + "completions/mean_terminated_length": 1188.5333251953125, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "epoch": 0.35314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22700217366218567, + "learning_rate": 4.4724210845020494e-07, + "loss": -0.0, + "num_tokens": 33354707.0, + "reward": -0.10798169672489166, + "reward_std": 0.31306353211402893, + "rewards/cosine_scaled_reward/mean": -0.10798169672489166, + "rewards/cosine_scaled_reward/std": 0.4384790062904358, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1147.890625, + "completions/mean_terminated_length": 1037.350830078125, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.35428571428571426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29228872060775757, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0, + "num_tokens": 33438108.0, + "reward": -0.06270914524793625, + "reward_std": 0.3073654770851135, + "rewards/cosine_scaled_reward/mean": -0.06270914524793625, + "rewards/cosine_scaled_reward/std": 0.43214184045791626, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1250.640625, + "completions/mean_terminated_length": 1120.16357421875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.3554285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3056102991104126, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0, + "num_tokens": 33529405.0, + "reward": 0.10986693948507309, + "reward_std": 0.39667800068855286, + "rewards/cosine_scaled_reward/mean": 0.10986694693565369, + "rewards/cosine_scaled_reward/std": 0.521910548210144, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1711.0, + "completions/mean_length": 1222.3125, + "completions/mean_terminated_length": 759.1219482421875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3565714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3069155812263489, + "learning_rate": 4.3808955077581546e-07, + "loss": -0.0, + "num_tokens": 33618057.0, + "reward": 0.17035391926765442, + "reward_std": 0.1586351990699768, + "rewards/cosine_scaled_reward/mean": 0.17035391926765442, + "rewards/cosine_scaled_reward/std": 0.5358856916427612, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1873.0, + "completions/mean_length": 1280.859375, + "completions/mean_terminated_length": 956.95556640625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.3577142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2738092243671417, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0, + "num_tokens": 33710496.0, + "reward": 0.00472693145275116, + "reward_std": 0.24437010288238525, + "rewards/cosine_scaled_reward/mean": 0.004726938903331757, + "rewards/cosine_scaled_reward/std": 0.4307664930820465, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 1132.734375, + "completions/mean_terminated_length": 876.4599609375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.3588571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2913839817047119, + "learning_rate": 4.3201486961161093e-07, + "loss": -0.0, + "num_tokens": 33793223.0, + "reward": 0.14254246652126312, + "reward_std": 0.3005802035331726, + "rewards/cosine_scaled_reward/mean": 0.14254248142242432, + "rewards/cosine_scaled_reward/std": 0.48442843556404114, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1495.015625, + "completions/mean_terminated_length": 1205.357177734375, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23286940157413483, + "learning_rate": 4.2898608072313045e-07, + "loss": -0.0, + "num_tokens": 33899984.0, + "reward": -0.0238988995552063, + "reward_std": 0.3149425685405731, + "rewards/cosine_scaled_reward/mean": -0.0238988995552063, + "rewards/cosine_scaled_reward/std": 0.4462411403656006, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1440.78125, + "completions/mean_terminated_length": 1100.146240234375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.36114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2949734032154083, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0, + "num_tokens": 34004122.0, + "reward": -0.26918160915374756, + "reward_std": 0.18649792671203613, + "rewards/cosine_scaled_reward/mean": -0.26918160915374756, + "rewards/cosine_scaled_reward/std": 0.23089087009429932, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1237.984375, + "completions/mean_terminated_length": 990.0203857421875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.36228571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32614174485206604, + "learning_rate": 4.2294634442070553e-07, + "loss": -0.0, + "num_tokens": 34094209.0, + "reward": -0.1377859264612198, + "reward_std": 0.2551230788230896, + "rewards/cosine_scaled_reward/mean": -0.1377859115600586, + "rewards/cosine_scaled_reward/std": 0.4370906949043274, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 1091.515625, + "completions/mean_terminated_length": 870.7885131835938, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.36342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3238787055015564, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0, + "num_tokens": 34174002.0, + "reward": -0.1866554617881775, + "reward_std": 0.29696089029312134, + "rewards/cosine_scaled_reward/mean": -0.1866554617881775, + "rewards/cosine_scaled_reward/std": 0.3505614995956421, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1843.0, + "completions/mean_length": 1241.90625, + "completions/mean_terminated_length": 1036.431396484375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.36457142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31191158294677734, + "learning_rate": 4.1693137748017915e-07, + "loss": -0.0, + "num_tokens": 34264540.0, + "reward": -0.16028451919555664, + "reward_std": 0.2337343692779541, + "rewards/cosine_scaled_reward/mean": -0.16028451919555664, + "rewards/cosine_scaled_reward/std": 0.2717510163784027, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 861.453125, + "completions/mean_terminated_length": 823.1773681640625, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.3657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34454721212387085, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0, + "num_tokens": 34329673.0, + "reward": -0.06567925214767456, + "reward_std": 0.31354451179504395, + "rewards/cosine_scaled_reward/mean": -0.06567925959825516, + "rewards/cosine_scaled_reward/std": 0.4225046932697296, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 921.53125, + "completions/mean_terminated_length": 866.131103515625, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.3668571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32410749793052673, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0, + "num_tokens": 34398491.0, + "reward": 0.1567627191543579, + "reward_std": 0.28847143054008484, + "rewards/cosine_scaled_reward/mean": 0.1567627191543579, + "rewards/cosine_scaled_reward/std": 0.5146859288215637, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1115.5625, + "completions/mean_terminated_length": 1001.0526123046875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39042168855667114, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0, + "num_tokens": 34481167.0, + "reward": -0.10124355554580688, + "reward_std": 0.33205243945121765, + "rewards/cosine_scaled_reward/mean": -0.10124355554580688, + "rewards/cosine_scaled_reward/std": 0.4005582928657532, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1136.703125, + "completions/mean_terminated_length": 967.9444580078125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.36914285714285716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3326353132724762, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0, + "num_tokens": 34564764.0, + "reward": -0.040179818868637085, + "reward_std": 0.24554386734962463, + "rewards/cosine_scaled_reward/mean": -0.040179818868637085, + "rewards/cosine_scaled_reward/std": 0.4389503598213196, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1133.90625, + "completions/mean_terminated_length": 944.188720703125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.3702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34735575318336487, + "learning_rate": 4.020100089676376e-07, + "loss": -0.0, + "num_tokens": 34648182.0, + "reward": -0.16276082396507263, + "reward_std": 0.29534196853637695, + "rewards/cosine_scaled_reward/mean": -0.16276082396507263, + "rewards/cosine_scaled_reward/std": 0.35879120230674744, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1301.296875, + "completions/mean_terminated_length": 961.8864135742188, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.37142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3051433265209198, + "learning_rate": 3.9904679361238526e-07, + "loss": -0.0, + "num_tokens": 34741921.0, + "reward": -0.026148229837417603, + "reward_std": 0.24914240837097168, + "rewards/cosine_scaled_reward/mean": -0.026148226112127304, + "rewards/cosine_scaled_reward/std": 0.47579747438430786, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1623.0, + "completions/mean_length": 933.8125, + "completions/mean_terminated_length": 879.016357421875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.37257142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3523712754249573, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0, + "num_tokens": 34811509.0, + "reward": -0.005365140736103058, + "reward_std": 0.42064523696899414, + "rewards/cosine_scaled_reward/mean": -0.005365140736103058, + "rewards/cosine_scaled_reward/std": 0.48223450779914856, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 1274.625, + "completions/mean_terminated_length": 972.0, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "epoch": 0.3737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25114986300468445, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0, + "num_tokens": 34902525.0, + "reward": 0.15240590274333954, + "reward_std": 0.36757412552833557, + "rewards/cosine_scaled_reward/mean": 0.15240588784217834, + "rewards/cosine_scaled_reward/std": 0.5200423002243042, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1287.765625, + "completions/mean_terminated_length": 1034.354248046875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.37485714285714283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31936249136924744, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0, + "num_tokens": 34996638.0, + "reward": -0.07551632821559906, + "reward_std": 0.28956252336502075, + "rewards/cosine_scaled_reward/mean": -0.07551632821559906, + "rewards/cosine_scaled_reward/std": 0.38629648089408875, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 968.734375, + "completions/mean_terminated_length": 836.1929931640625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3347271680831909, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0, + "num_tokens": 35068285.0, + "reward": 0.03636639565229416, + "reward_std": 0.2681905925273895, + "rewards/cosine_scaled_reward/mean": 0.03636638820171356, + "rewards/cosine_scaled_reward/std": 0.5459120273590088, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 917.171875, + "completions/mean_terminated_length": 778.2982788085938, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.37714285714285717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4763779938220978, + "learning_rate": 3.843439512918949e-07, + "loss": -0.0, + "num_tokens": 35137376.0, + "reward": 0.03270518034696579, + "reward_std": 0.3372231125831604, + "rewards/cosine_scaled_reward/mean": 0.03270517289638519, + "rewards/cosine_scaled_reward/std": 0.47484341263771057, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 1351.84375, + "completions/mean_terminated_length": 810.388916015625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3782857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5174200534820557, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0, + "num_tokens": 35234158.0, + "reward": -0.14917470514774323, + "reward_std": 0.23482158780097961, + "rewards/cosine_scaled_reward/mean": -0.14917470514774323, + "rewards/cosine_scaled_reward/std": 0.33059147000312805, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1160.9375, + "completions/mean_terminated_length": 956.2308349609375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.37942857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3024626076221466, + "learning_rate": 3.785183306423767e-07, + "loss": -0.0, + "num_tokens": 35319202.0, + "reward": -0.1432058960199356, + "reward_std": 0.219490647315979, + "rewards/cosine_scaled_reward/mean": -0.14320588111877441, + "rewards/cosine_scaled_reward/std": 0.3754548728466034, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1966.0, + "completions/mean_length": 1025.53125, + "completions/mean_terminated_length": 975.245849609375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.38057142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2969377338886261, + "learning_rate": 3.7561798609655373e-07, + "loss": -0.0, + "num_tokens": 35395524.0, + "reward": 0.06338316202163696, + "reward_std": 0.3955259919166565, + "rewards/cosine_scaled_reward/mean": 0.06338317692279816, + "rewards/cosine_scaled_reward/std": 0.4999569058418274, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1489.828125, + "completions/mean_terminated_length": 1082.5135498046875, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.38171428571428573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26992398500442505, + "learning_rate": 3.72726140684072e-07, + "loss": -0.0, + "num_tokens": 35501841.0, + "reward": -0.2190597951412201, + "reward_std": 0.20334219932556152, + "rewards/cosine_scaled_reward/mean": -0.2190597951412201, + "rewards/cosine_scaled_reward/std": 0.22991260886192322, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 1254.953125, + "completions/mean_terminated_length": 1052.803955078125, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.38285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2687843441963196, + "learning_rate": 3.6984293534939737e-07, + "loss": -0.0, + "num_tokens": 35592238.0, + "reward": -0.008545689284801483, + "reward_std": 0.2865098714828491, + "rewards/cosine_scaled_reward/mean": -0.008545689284801483, + "rewards/cosine_scaled_reward/std": 0.46601414680480957, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1462.296875, + "completions/mean_terminated_length": 1133.731689453125, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28923502564430237, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.0, + "num_tokens": 35696769.0, + "reward": 0.006126267835497856, + "reward_std": 0.34422457218170166, + "rewards/cosine_scaled_reward/mean": 0.006126277148723602, + "rewards/cosine_scaled_reward/std": 0.49070778489112854, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1855.0, + "completions/mean_length": 1304.328125, + "completions/mean_terminated_length": 1035.34033203125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.3851428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2708565592765808, + "learning_rate": 3.641030065789562e-07, + "loss": -0.0, + "num_tokens": 35791238.0, + "reward": -0.1498498022556305, + "reward_std": 0.3901336193084717, + "rewards/cosine_scaled_reward/mean": -0.1498498022556305, + "rewards/cosine_scaled_reward/std": 0.4185183346271515, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1146.875, + "completions/mean_terminated_length": 871.0203857421875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.3862857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34217318892478943, + "learning_rate": 3.612465628992203e-07, + "loss": -0.0, + "num_tokens": 35874318.0, + "reward": 0.023914728313684464, + "reward_std": 0.3298850655555725, + "rewards/cosine_scaled_reward/mean": 0.023914728313684464, + "rewards/cosine_scaled_reward/std": 0.4540858268737793, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 1251.3125, + "completions/mean_terminated_length": 1120.9454345703125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.38742857142857146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.322914719581604, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0, + "num_tokens": 35965698.0, + "reward": -0.08145550638437271, + "reward_std": 0.2552027404308319, + "rewards/cosine_scaled_reward/mean": -0.0814555287361145, + "rewards/cosine_scaled_reward/std": 0.4186084270477295, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 1216.640625, + "completions/mean_terminated_length": 962.142822265625, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "epoch": 0.38857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2708609104156494, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0, + "num_tokens": 36053531.0, + "reward": -0.01538059115409851, + "reward_std": 0.282448410987854, + "rewards/cosine_scaled_reward/mean": -0.01538059115409851, + "rewards/cosine_scaled_reward/std": 0.47943398356437683, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1257.15625, + "completions/mean_terminated_length": 1035.719970703125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.38971428571428574, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.21827326714992523, + "learning_rate": 3.5273298394491515e-07, + "loss": -0.0, + "num_tokens": 36145061.0, + "reward": 0.12981322407722473, + "reward_std": 0.3501738905906677, + "rewards/cosine_scaled_reward/mean": 0.12981323897838593, + "rewards/cosine_scaled_reward/std": 0.5823574662208557, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1330.71875, + "completions/mean_terminated_length": 955.0, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.39085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3223852217197418, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0, + "num_tokens": 36240867.0, + "reward": -0.14867264032363892, + "reward_std": 0.36134976148605347, + "rewards/cosine_scaled_reward/mean": -0.14867264032363892, + "rewards/cosine_scaled_reward/std": 0.4149284064769745, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1322.796875, + "completions/mean_terminated_length": 1060.4893798828125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28214380145072937, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0, + "num_tokens": 36337262.0, + "reward": 0.05130603909492493, + "reward_std": 0.3060014545917511, + "rewards/cosine_scaled_reward/mean": 0.05130603164434433, + "rewards/cosine_scaled_reward/std": 0.4707089960575104, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1358.78125, + "completions/mean_terminated_length": 1129.041748046875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.3931428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28303998708724976, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0, + "num_tokens": 36434272.0, + "reward": 0.1340864598751068, + "reward_std": 0.3489719033241272, + "rewards/cosine_scaled_reward/mean": 0.1340864598751068, + "rewards/cosine_scaled_reward/std": 0.46820539236068726, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1194.65625, + "completions/mean_terminated_length": 1089.859619140625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.3942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31418564915657043, + "learning_rate": 3.4151678419606233e-07, + "loss": -0.0, + "num_tokens": 36520338.0, + "reward": 0.03204090893268585, + "reward_std": 0.2984616756439209, + "rewards/cosine_scaled_reward/mean": 0.03204091265797615, + "rewards/cosine_scaled_reward/std": 0.47096553444862366, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1274.703125, + "completions/mean_terminated_length": 1194.7069091796875, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "epoch": 0.3954285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2741748094558716, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0, + "num_tokens": 36613703.0, + "reward": -0.03510487079620361, + "reward_std": 0.34712469577789307, + "rewards/cosine_scaled_reward/mean": -0.03510487079620361, + "rewards/cosine_scaled_reward/std": 0.44494491815567017, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1292.078125, + "completions/mean_terminated_length": 1168.3818359375, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.3965714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2128373682498932, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0, + "num_tokens": 36706132.0, + "reward": -0.19367390871047974, + "reward_std": 0.26442888379096985, + "rewards/cosine_scaled_reward/mean": -0.19367390871047974, + "rewards/cosine_scaled_reward/std": 0.3002490699291229, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1143.234375, + "completions/mean_terminated_length": 934.4423217773438, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3682413697242737, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0, + "num_tokens": 36789859.0, + "reward": 0.050142791122198105, + "reward_std": 0.2848895788192749, + "rewards/cosine_scaled_reward/mean": 0.050142791122198105, + "rewards/cosine_scaled_reward/std": 0.5106508731842041, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1187.359375, + "completions/mean_terminated_length": 1098.32763671875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.39885714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3549482822418213, + "learning_rate": 3.3046315338757026e-07, + "loss": -0.0, + "num_tokens": 36877594.0, + "reward": 0.03635264188051224, + "reward_std": 0.40539801120758057, + "rewards/cosine_scaled_reward/mean": 0.036352649331092834, + "rewards/cosine_scaled_reward/std": 0.5298211574554443, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1020.984375, + "completions/mean_terminated_length": 952.5167236328125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3496515154838562, + "learning_rate": 3.2772616003709616e-07, + "loss": -0.0, + "num_tokens": 36952737.0, + "reward": 0.037382401525974274, + "reward_std": 0.38036224246025085, + "rewards/cosine_scaled_reward/mean": 0.03738240525126457, + "rewards/cosine_scaled_reward/std": 0.4981257915496826, + "step": 350 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 36952737, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}