diff --git "a/checkpoint-250/trainer_state.json" "b/checkpoint-250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-250/trainer_state.json" @@ -0,0 +1,6284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2857142857142857, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2837817668914795, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": -0.09800112247467041, + "reward_std": 0.3028089702129364, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2421981245279312, + "learning_rate": 2e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.020556632429361343, + "reward_std": 0.3545936942100525, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 1952.234375, + "completions/mean_terminated_length": 822.2000122070312, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24851329624652863, + "learning_rate": 4e-08, + "loss": -0.0, + "num_tokens": 375163.0, + "reward": -0.22721199691295624, + "reward_std": 0.14563649892807007, + "rewards/cosine_scaled_reward/mean": -0.22721199691295624, + "rewards/cosine_scaled_reward/std": 0.1709199845790863, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 1554.109375, + "completions/mean_terminated_length": 958.0344848632812, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29272863268852234, + "learning_rate": 6e-08, + "loss": -0.0, + "num_tokens": 484434.0, + "reward": -0.17542189359664917, + "reward_std": 0.18219107389450073, + "rewards/cosine_scaled_reward/mean": -0.17542189359664917, + "rewards/cosine_scaled_reward/std": 0.27975013852119446, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1943.0625, + "completions/mean_terminated_length": 1088.571533203125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2773251533508301, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 619606.0, + "reward": -0.2648562788963318, + "reward_std": 0.21638144552707672, + "rewards/cosine_scaled_reward/mean": -0.2648562788963318, + "rewards/cosine_scaled_reward/std": 0.23959198594093323, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1854.21875, + "completions/mean_terminated_length": 920.5454711914062, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27399909496307373, + "learning_rate": 1e-07, + "loss": -0.0, + "num_tokens": 749924.0, + "reward": -0.19292885065078735, + "reward_std": 0.2666770815849304, + "rewards/cosine_scaled_reward/mean": -0.19292885065078735, + "rewards/cosine_scaled_reward/std": 0.295730322599411, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 1940.5625, + "completions/mean_terminated_length": 1065.71435546875, + "completions/min_length": 773.0, + "completions/min_terminated_length": 773.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23362359404563904, + "learning_rate": 1.2e-07, + "loss": 0.0, + "num_tokens": 884528.0, + "reward": -0.18198424577713013, + "reward_std": 0.18540163338184357, + "rewards/cosine_scaled_reward/mean": -0.18198424577713013, + "rewards/cosine_scaled_reward/std": 0.32407456636428833, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1708.5625, + "completions/mean_terminated_length": 1013.5238037109375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24677562713623047, + "learning_rate": 1.4e-07, + "loss": -0.0, + "num_tokens": 1004292.0, + "reward": -0.09573853015899658, + "reward_std": 0.22485454380512238, + "rewards/cosine_scaled_reward/mean": -0.09573852270841599, + "rewards/cosine_scaled_reward/std": 0.449250191450119, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 1979.359375, + "completions/mean_terminated_length": 949.75, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26966309547424316, + "learning_rate": 1.6e-07, + "loss": 0.0, + "num_tokens": 1142427.0, + "reward": -0.19992578029632568, + "reward_std": 0.20190927386283875, + "rewards/cosine_scaled_reward/mean": -0.19992581009864807, + "rewards/cosine_scaled_reward/std": 0.23785534501075745, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1652.59375, + "completions/mean_terminated_length": 897.727294921875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3011312484741211, + "learning_rate": 1.8e-07, + "loss": 0.0, + "num_tokens": 1259025.0, + "reward": -0.11706389486789703, + "reward_std": 0.2934548258781433, + "rewards/cosine_scaled_reward/mean": -0.11706390231847763, + "rewards/cosine_scaled_reward/std": 0.3601698577404022, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 1946.6875, + "completions/mean_terminated_length": 967.3333740234375, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2451399564743042, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 1395285.0, + "reward": -0.2866281270980835, + "reward_std": 0.12184012681245804, + "rewards/cosine_scaled_reward/mean": -0.2866281270980835, + "rewards/cosine_scaled_reward/std": 0.15141677856445312, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 1659.28125, + "completions/mean_terminated_length": 1190.137939453125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733561396598816, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "num_tokens": 1512423.0, + "reward": -0.13816070556640625, + "reward_std": 0.2968980073928833, + "rewards/cosine_scaled_reward/mean": -0.13816070556640625, + "rewards/cosine_scaled_reward/std": 0.3597467839717865, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 1807.796875, + "completions/mean_terminated_length": 1023.1333618164062, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25238803029060364, + "learning_rate": 2.4e-07, + "loss": 0.0, + "num_tokens": 1639162.0, + "reward": -0.13488636910915375, + "reward_std": 0.2661236524581909, + "rewards/cosine_scaled_reward/mean": -0.13488635420799255, + "rewards/cosine_scaled_reward/std": 0.3444243371486664, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 1846.921875, + "completions/mean_terminated_length": 1243.6875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2201598882675171, + "learning_rate": 2.6e-07, + "loss": -0.0, + "num_tokens": 1767973.0, + "reward": -0.20591925084590912, + "reward_std": 0.21505361795425415, + "rewards/cosine_scaled_reward/mean": -0.20591923594474792, + "rewards/cosine_scaled_reward/std": 0.323749840259552, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 1710.421875, + "completions/mean_terminated_length": 847.7222290039062, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2665213644504547, + "learning_rate": 2.8e-07, + "loss": 0.0, + "num_tokens": 1888360.0, + "reward": -0.0778750479221344, + "reward_std": 0.17502948641777039, + "rewards/cosine_scaled_reward/mean": -0.0778750628232956, + "rewards/cosine_scaled_reward/std": 0.47343766689300537, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 2031.03125, + "completions/mean_terminated_length": 962.0, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23009927570819855, + "learning_rate": 3e-07, + "loss": -0.0, + "num_tokens": 2028786.0, + "reward": -0.2619968056678772, + "reward_std": 0.16954168677330017, + "rewards/cosine_scaled_reward/mean": -0.2619968056678772, + "rewards/cosine_scaled_reward/std": 0.18357795476913452, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1533.15625, + "completions/mean_terminated_length": 780.6923217773438, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3392995297908783, + "learning_rate": 3.2e-07, + "loss": -0.0, + "num_tokens": 2137428.0, + "reward": -0.11706461012363434, + "reward_std": 0.3096129894256592, + "rewards/cosine_scaled_reward/mean": -0.11706460267305374, + "rewards/cosine_scaled_reward/std": 0.3810974657535553, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1774.46875, + "completions/mean_terminated_length": 1018.2352905273438, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23254038393497467, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "num_tokens": 2261370.0, + "reward": -0.18709540367126465, + "reward_std": 0.2795025110244751, + "rewards/cosine_scaled_reward/mean": -0.18709540367126465, + "rewards/cosine_scaled_reward/std": 0.3359416127204895, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1859.0, + "completions/mean_length": 1719.0, + "completions/mean_terminated_length": 995.2000122070312, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262045681476593, + "learning_rate": 3.6e-07, + "loss": -0.0, + "num_tokens": 2382642.0, + "reward": -0.02329203486442566, + "reward_std": 0.34684932231903076, + "rewards/cosine_scaled_reward/mean": -0.02329203486442566, + "rewards/cosine_scaled_reward/std": 0.47637447714805603, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1630.90625, + "completions/mean_terminated_length": 935.75, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250532329082489, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "num_tokens": 2498372.0, + "reward": -0.06319350004196167, + "reward_std": 0.2394939512014389, + "rewards/cosine_scaled_reward/mean": -0.06319350004196167, + "rewards/cosine_scaled_reward/std": 0.3889789879322052, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 1735.96875, + "completions/mean_terminated_length": 1140.272705078125, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2773231565952301, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 2620282.0, + "reward": -0.20884393155574799, + "reward_std": 0.20233216881752014, + "rewards/cosine_scaled_reward/mean": -0.20884393155574799, + "rewards/cosine_scaled_reward/std": 0.28432920575141907, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1790.0, + "completions/mean_length": 1342.953125, + "completions/mean_terminated_length": 919.9249877929688, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34627005457878113, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "num_tokens": 2715247.0, + "reward": -0.09092864394187927, + "reward_std": 0.21042926609516144, + "rewards/cosine_scaled_reward/mean": -0.09092865139245987, + "rewards/cosine_scaled_reward/std": 0.43559205532073975, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1661.9375, + "completions/mean_terminated_length": 1132.888916015625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2705242335796356, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "num_tokens": 2832403.0, + "reward": -0.13339249789714813, + "reward_std": 0.2433384656906128, + "rewards/cosine_scaled_reward/mean": -0.13339248299598694, + "rewards/cosine_scaled_reward/std": 0.3815627098083496, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1802.296875, + "completions/mean_terminated_length": 1065.1875, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24961258471012115, + "learning_rate": 4.6e-07, + "loss": 0.0, + "num_tokens": 2958678.0, + "reward": -0.18733163177967072, + "reward_std": 0.2773033380508423, + "rewards/cosine_scaled_reward/mean": -0.1873316466808319, + "rewards/cosine_scaled_reward/std": 0.37051624059677124, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1731.53125, + "completions/mean_terminated_length": 982.0, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2662124037742615, + "learning_rate": 4.8e-07, + "loss": 0.0, + "num_tokens": 3079792.0, + "reward": -0.12407588213682175, + "reward_std": 0.25581949949264526, + "rewards/cosine_scaled_reward/mean": -0.12407589703798294, + "rewards/cosine_scaled_reward/std": 0.39043793082237244, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1965.46875, + "completions/mean_terminated_length": 1567.8182373046875, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23202598094940186, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 3216214.0, + "reward": -0.0963105633854866, + "reward_std": 0.30887559056282043, + "rewards/cosine_scaled_reward/mean": -0.0963105633854866, + "rewards/cosine_scaled_reward/std": 0.39396020770072937, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1886.96875, + "completions/mean_terminated_length": 1111.0909423828125, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2878379225730896, + "learning_rate": 5.2e-07, + "loss": -0.0, + "num_tokens": 3347268.0, + "reward": -0.1645491123199463, + "reward_std": 0.28629785776138306, + "rewards/cosine_scaled_reward/mean": -0.1645491123199463, + "rewards/cosine_scaled_reward/std": 0.35050687193870544, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 1843.640625, + "completions/mean_terminated_length": 1230.5625, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24996496737003326, + "learning_rate": 5.4e-07, + "loss": 0.0, + "num_tokens": 3475597.0, + "reward": -0.06605555862188339, + "reward_std": 0.2643629312515259, + "rewards/cosine_scaled_reward/mean": -0.06605555862188339, + "rewards/cosine_scaled_reward/std": 0.438128799200058, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 2020.5, + "completions/mean_terminated_length": 1608.0, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23316837847232819, + "learning_rate": 5.6e-07, + "loss": -0.0, + "num_tokens": 3615381.0, + "reward": -0.2015206664800644, + "reward_std": 0.15312039852142334, + "rewards/cosine_scaled_reward/mean": -0.2015206664800644, + "rewards/cosine_scaled_reward/std": 0.1648881882429123, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 1826.046875, + "completions/mean_terminated_length": 955.3077392578125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2410832792520523, + "learning_rate": 5.8e-07, + "loss": -0.0, + "num_tokens": 3742784.0, + "reward": -0.17509159445762634, + "reward_std": 0.18994277715682983, + "rewards/cosine_scaled_reward/mean": -0.17509159445762634, + "rewards/cosine_scaled_reward/std": 0.22516494989395142, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1678.0, + "completions/mean_length": 1781.4375, + "completions/mean_terminated_length": 910.6666870117188, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2693414092063904, + "learning_rate": 6e-07, + "loss": 0.0, + "num_tokens": 3867292.0, + "reward": -0.24513831734657288, + "reward_std": 0.28315529227256775, + "rewards/cosine_scaled_reward/mean": -0.24513831734657288, + "rewards/cosine_scaled_reward/std": 0.3480584919452667, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1969.28125, + "completions/mean_terminated_length": 1488.2222900390625, + "completions/min_length": 1088.0, + "completions/min_terminated_length": 1088.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24202018976211548, + "learning_rate": 6.2e-07, + "loss": 0.0, + "num_tokens": 4003678.0, + "reward": -0.18968716263771057, + "reward_std": 0.28299200534820557, + "rewards/cosine_scaled_reward/mean": -0.18968716263771057, + "rewards/cosine_scaled_reward/std": 0.3119950294494629, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22288212180137634, + "learning_rate": 6.4e-07, + "loss": 0.0, + "num_tokens": 4145966.0, + "reward": -0.2955162525177002, + "reward_std": 0.17793573439121246, + "rewards/cosine_scaled_reward/mean": -0.2955162525177002, + "rewards/cosine_scaled_reward/std": 0.22786569595336914, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 1589.640625, + "completions/mean_terminated_length": 1036.4482421875, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31030499935150146, + "learning_rate": 6.6e-07, + "loss": 0.0, + "num_tokens": 4257255.0, + "reward": 0.008002171292901039, + "reward_std": 0.3413254916667938, + "rewards/cosine_scaled_reward/mean": 0.008002176880836487, + "rewards/cosine_scaled_reward/std": 0.4431404769420624, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1785.921875, + "completions/mean_terminated_length": 757.769287109375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3145958483219147, + "learning_rate": 6.800000000000001e-07, + "loss": -0.0, + "num_tokens": 4383050.0, + "reward": -0.16386553645133972, + "reward_std": 0.2818174958229065, + "rewards/cosine_scaled_reward/mean": -0.16386555135250092, + "rewards/cosine_scaled_reward/std": 0.3242056965827942, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 2000.421875, + "completions/mean_terminated_length": 1033.0, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25796815752983093, + "learning_rate": 7e-07, + "loss": 0.0, + "num_tokens": 4522189.0, + "reward": -0.2470606118440628, + "reward_std": 0.15509279072284698, + "rewards/cosine_scaled_reward/mean": -0.2470606118440628, + "rewards/cosine_scaled_reward/std": 0.16412879526615143, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1964.46875, + "completions/mean_terminated_length": 1284.2857666015625, + "completions/min_length": 931.0, + "completions/min_terminated_length": 931.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22452199459075928, + "learning_rate": 7.2e-07, + "loss": 0.0, + "num_tokens": 4658939.0, + "reward": -0.24706938862800598, + "reward_std": 0.18499845266342163, + "rewards/cosine_scaled_reward/mean": -0.24706941843032837, + "rewards/cosine_scaled_reward/std": 0.21092188358306885, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 1925.234375, + "completions/mean_terminated_length": 1175.0, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23703666031360626, + "learning_rate": 7.4e-07, + "loss": -0.0, + "num_tokens": 4793866.0, + "reward": -0.11504355818033218, + "reward_std": 0.20660358667373657, + "rewards/cosine_scaled_reward/mean": -0.11504356563091278, + "rewards/cosine_scaled_reward/std": 0.3190351724624634, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 1740.546875, + "completions/mean_terminated_length": 642.5, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23829001188278198, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "num_tokens": 4916045.0, + "reward": -0.12095541507005692, + "reward_std": 0.1958026885986328, + "rewards/cosine_scaled_reward/mean": -0.12095542997121811, + "rewards/cosine_scaled_reward/std": 0.340241402387619, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1713.203125, + "completions/mean_terminated_length": 920.26318359375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24145744740962982, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0, + "num_tokens": 5035762.0, + "reward": -0.10936243832111359, + "reward_std": 0.14468500018119812, + "rewards/cosine_scaled_reward/mean": -0.10936242341995239, + "rewards/cosine_scaled_reward/std": 0.4288744330406189, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1909.71875, + "completions/mean_terminated_length": 1367.2308349609375, + "completions/min_length": 1138.0, + "completions/min_terminated_length": 1138.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22317881882190704, + "learning_rate": 8e-07, + "loss": 0.0, + "num_tokens": 5169136.0, + "reward": -0.2058967649936676, + "reward_std": 0.2325170338153839, + "rewards/cosine_scaled_reward/mean": -0.20589673519134521, + "rewards/cosine_scaled_reward/std": 0.28897321224212646, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 1727.71875, + "completions/mean_terminated_length": 583.857177734375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44688937067985535, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "num_tokens": 5290070.0, + "reward": -0.2254919707775116, + "reward_std": 0.1687203049659729, + "rewards/cosine_scaled_reward/mean": -0.2254919707775116, + "rewards/cosine_scaled_reward/std": 0.18203677237033844, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 1855.328125, + "completions/mean_terminated_length": 814.9000244140625, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2430828958749771, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "num_tokens": 5420427.0, + "reward": -0.09104865789413452, + "reward_std": 0.18217626214027405, + "rewards/cosine_scaled_reward/mean": -0.09104865789413452, + "rewards/cosine_scaled_reward/std": 0.3521345257759094, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 1727.9375, + "completions/mean_terminated_length": 767.75, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32065215706825256, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "num_tokens": 5541711.0, + "reward": -0.17701950669288635, + "reward_std": 0.2957555055618286, + "rewards/cosine_scaled_reward/mean": -0.17701953649520874, + "rewards/cosine_scaled_reward/std": 0.38460060954093933, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 2013.9375, + "completions/mean_terminated_length": 1321.3333740234375, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22363637387752533, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "num_tokens": 5682259.0, + "reward": -0.20341511070728302, + "reward_std": 0.23104795813560486, + "rewards/cosine_scaled_reward/mean": -0.20341511070728302, + "rewards/cosine_scaled_reward/std": 0.3092363774776459, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 1909.0, + "completions/mean_terminated_length": 936.0, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26306217908859253, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 5815603.0, + "reward": -0.26145532727241516, + "reward_std": 0.17108051478862762, + "rewards/cosine_scaled_reward/mean": -0.2614552974700928, + "rewards/cosine_scaled_reward/std": 0.18312901258468628, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 1757.1875, + "completions/mean_terminated_length": 884.75, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2856813371181488, + "learning_rate": 9.2e-07, + "loss": 0.0, + "num_tokens": 5938463.0, + "reward": -0.20879247784614563, + "reward_std": 0.23861759901046753, + "rewards/cosine_scaled_reward/mean": -0.20879246294498444, + "rewards/cosine_scaled_reward/std": 0.39607998728752136, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1756.5, + "completions/mean_terminated_length": 1011.5555419921875, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27563413977622986, + "learning_rate": 9.399999999999999e-07, + "loss": -0.0, + "num_tokens": 6061423.0, + "reward": -0.16147920489311218, + "reward_std": 0.24055320024490356, + "rewards/cosine_scaled_reward/mean": -0.16147920489311218, + "rewards/cosine_scaled_reward/std": 0.3948959410190582, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 1538.078125, + "completions/mean_terminated_length": 839.2963256835938, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27617642283439636, + "learning_rate": 9.6e-07, + "loss": -0.0, + "num_tokens": 6169924.0, + "reward": -0.18436825275421143, + "reward_std": 0.27141550183296204, + "rewards/cosine_scaled_reward/mean": -0.18436823785305023, + "rewards/cosine_scaled_reward/std": 0.3920196294784546, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1749.0625, + "completions/mean_terminated_length": 772.5333862304688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23394836485385895, + "learning_rate": 9.8e-07, + "loss": 0.0, + "num_tokens": 6292680.0, + "reward": -0.10770958662033081, + "reward_std": 0.22513547539710999, + "rewards/cosine_scaled_reward/mean": -0.10770957916975021, + "rewards/cosine_scaled_reward/std": 0.421062707901001, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1482.25, + "completions/mean_terminated_length": 841.0667114257812, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3268967568874359, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 6397752.0, + "reward": -0.09745607525110245, + "reward_std": 0.25210899114608765, + "rewards/cosine_scaled_reward/mean": -0.09745605289936066, + "rewards/cosine_scaled_reward/std": 0.3351369798183441, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1743.953125, + "completions/mean_terminated_length": 750.7333984375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2918722927570343, + "learning_rate": 9.999890338174275e-07, + "loss": -0.0, + "num_tokens": 6520717.0, + "reward": -0.1890830397605896, + "reward_std": 0.21916288137435913, + "rewards/cosine_scaled_reward/mean": -0.1890830546617508, + "rewards/cosine_scaled_reward/std": 0.32568052411079407, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1772.421875, + "completions/mean_terminated_length": 1010.5294189453125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24523264169692993, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "num_tokens": 6644984.0, + "reward": -0.20969681441783905, + "reward_std": 0.1810423731803894, + "rewards/cosine_scaled_reward/mean": -0.20969681441783905, + "rewards/cosine_scaled_reward/std": 0.2371566891670227, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 1838.859375, + "completions/mean_terminated_length": 1304.388916015625, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23284469544887543, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "num_tokens": 6773815.0, + "reward": -0.06641622632741928, + "reward_std": 0.30815836787223816, + "rewards/cosine_scaled_reward/mean": -0.06641621887683868, + "rewards/cosine_scaled_reward/std": 0.46219584345817566, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1750.125, + "completions/mean_terminated_length": 856.5, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2651103734970093, + "learning_rate": 9.998245517681593e-07, + "loss": -0.0, + "num_tokens": 6896111.0, + "reward": -0.10750342905521393, + "reward_std": 0.2286185324192047, + "rewards/cosine_scaled_reward/mean": -0.10750342160463333, + "rewards/cosine_scaled_reward/std": 0.43372800946235657, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1840.078125, + "completions/mean_terminated_length": 1097.5, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22967560589313507, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 7024836.0, + "reward": -0.10045827925205231, + "reward_std": 0.2548004388809204, + "rewards/cosine_scaled_reward/mean": -0.10045827925205231, + "rewards/cosine_scaled_reward/std": 0.41444358229637146, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1810.0, + "completions/mean_length": 1991.1875, + "completions/mean_terminated_length": 1442.0, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20479348301887512, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "num_tokens": 7163840.0, + "reward": -0.27901512384414673, + "reward_std": 0.2130473554134369, + "rewards/cosine_scaled_reward/mean": -0.27901512384414673, + "rewards/cosine_scaled_reward/std": 0.2583855092525482, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 1617.421875, + "completions/mean_terminated_length": 1129.433349609375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2690146267414093, + "learning_rate": 9.994627618036452e-07, + "loss": -0.0, + "num_tokens": 7277451.0, + "reward": -0.04198366403579712, + "reward_std": 0.4036104083061218, + "rewards/cosine_scaled_reward/mean": -0.04198366031050682, + "rewards/cosine_scaled_reward/std": 0.5008736252784729, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1736.09375, + "completions/mean_terminated_length": 997.368408203125, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2184475064277649, + "learning_rate": 9.992983438818915e-07, + "loss": -0.0, + "num_tokens": 7399025.0, + "reward": -0.1564982533454895, + "reward_std": 0.19560785591602325, + "rewards/cosine_scaled_reward/mean": -0.1564982533454895, + "rewards/cosine_scaled_reward/std": 0.3402426540851593, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1512.0, + "completions/mean_length": 1785.40625, + "completions/mean_terminated_length": 847.5714721679688, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23538637161254883, + "learning_rate": 9.991120277927223e-07, + "loss": -0.0, + "num_tokens": 7524179.0, + "reward": -0.2697012424468994, + "reward_std": 0.17935499548912048, + "rewards/cosine_scaled_reward/mean": -0.2697012424468994, + "rewards/cosine_scaled_reward/std": 0.19757980108261108, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1884.484375, + "completions/mean_terminated_length": 1001.5, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.225452721118927, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "num_tokens": 7656306.0, + "reward": -0.1635127067565918, + "reward_std": 0.1931447982788086, + "rewards/cosine_scaled_reward/mean": -0.1635127067565918, + "rewards/cosine_scaled_reward/std": 0.23563610017299652, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1994.0, + "completions/mean_length": 1739.46875, + "completions/mean_terminated_length": 1060.7000732421875, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23771661520004272, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "num_tokens": 7777864.0, + "reward": -0.10127441585063934, + "reward_std": 0.2957979142665863, + "rewards/cosine_scaled_reward/mean": -0.10127442330121994, + "rewards/cosine_scaled_reward/std": 0.34053224325180054, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1522.953125, + "completions/mean_terminated_length": 1163.7105712890625, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27804723381996155, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "num_tokens": 7885589.0, + "reward": -0.036153122782707214, + "reward_std": 0.3305097818374634, + "rewards/cosine_scaled_reward/mean": -0.03615312650799751, + "rewards/cosine_scaled_reward/std": 0.4355940818786621, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 1760.390625, + "completions/mean_terminated_length": 1025.388916015625, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2333846092224121, + "learning_rate": 9.981479793771866e-07, + "loss": -0.0, + "num_tokens": 8009206.0, + "reward": -0.14333069324493408, + "reward_std": 0.28757935762405396, + "rewards/cosine_scaled_reward/mean": -0.14333069324493408, + "rewards/cosine_scaled_reward/std": 0.41007620096206665, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 1651.515625, + "completions/mean_terminated_length": 638.2777709960938, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26348626613616943, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "num_tokens": 8125607.0, + "reward": -0.2117859125137329, + "reward_std": 0.15534773468971252, + "rewards/cosine_scaled_reward/mean": -0.2117859125137329, + "rewards/cosine_scaled_reward/std": 0.37395453453063965, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 1254.125, + "completions/mean_terminated_length": 596.3428344726562, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33443817496299744, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "num_tokens": 8216103.0, + "reward": 0.028336994349956512, + "reward_std": 0.25119709968566895, + "rewards/cosine_scaled_reward/mean": 0.02833697199821472, + "rewards/cosine_scaled_reward/std": 0.4882389008998871, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 1966.21875, + "completions/mean_terminated_length": 1175.666748046875, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2199370563030243, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0, + "num_tokens": 8352677.0, + "reward": -0.28747493028640747, + "reward_std": 0.15530282258987427, + "rewards/cosine_scaled_reward/mean": -0.28747493028640747, + "rewards/cosine_scaled_reward/std": 0.16220521926879883, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 1357.109375, + "completions/mean_terminated_length": 747.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3341590464115143, + "learning_rate": 9.968344786479415e-07, + "loss": -0.0, + "num_tokens": 8448788.0, + "reward": -0.06672946363687515, + "reward_std": 0.28790342807769775, + "rewards/cosine_scaled_reward/mean": -0.06672945618629456, + "rewards/cosine_scaled_reward/std": 0.35960128903388977, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 1565.046875, + "completions/mean_terminated_length": 944.107177734375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35159721970558167, + "learning_rate": 9.964516155915151e-07, + "loss": -0.0, + "num_tokens": 8559295.0, + "reward": -0.27992868423461914, + "reward_std": 0.20264248549938202, + "rewards/cosine_scaled_reward/mean": -0.27992868423461914, + "rewards/cosine_scaled_reward/std": 0.23891927301883698, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 1867.765625, + "completions/mean_terminated_length": 606.125, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23989427089691162, + "learning_rate": 9.960469931131936e-07, + "loss": -0.0, + "num_tokens": 8690288.0, + "reward": -0.2498025894165039, + "reward_std": 0.15823513269424438, + "rewards/cosine_scaled_reward/mean": -0.2498025894165039, + "rewards/cosine_scaled_reward/std": 0.17978127300739288, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 1669.125, + "completions/mean_terminated_length": 945.8182373046875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.335510790348053, + "learning_rate": 9.956206309337066e-07, + "loss": -0.0, + "num_tokens": 8807832.0, + "reward": -0.1673138290643692, + "reward_std": 0.2547321915626526, + "rewards/cosine_scaled_reward/mean": -0.1673138290643692, + "rewards/cosine_scaled_reward/std": 0.39353805780410767, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1632.59375, + "completions/mean_terminated_length": 892.0869750976562, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30721575021743774, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "num_tokens": 8922670.0, + "reward": -0.1493685096502304, + "reward_std": 0.23021411895751953, + "rewards/cosine_scaled_reward/mean": -0.1493685096502304, + "rewards/cosine_scaled_reward/std": 0.27729952335357666, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 2020.59375, + "completions/mean_terminated_length": 1463.3333740234375, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20856839418411255, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "num_tokens": 9062716.0, + "reward": -0.25696587562561035, + "reward_std": 0.19847074151039124, + "rewards/cosine_scaled_reward/mean": -0.25696590542793274, + "rewards/cosine_scaled_reward/std": 0.23918035626411438, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1926.984375, + "completions/mean_terminated_length": 1273.5, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23241353034973145, + "learning_rate": 9.942113192828444e-07, + "loss": -0.0, + "num_tokens": 9195971.0, + "reward": -0.12904082238674164, + "reward_std": 0.23554545640945435, + "rewards/cosine_scaled_reward/mean": -0.12904080748558044, + "rewards/cosine_scaled_reward/std": 0.4280695915222168, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 1868.890625, + "completions/mean_terminated_length": 1092.75, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.19846303761005402, + "learning_rate": 9.93698216681727e-07, + "loss": -0.0, + "num_tokens": 9326540.0, + "reward": -0.03926669806241989, + "reward_std": 0.2044709324836731, + "rewards/cosine_scaled_reward/mean": -0.039266690611839294, + "rewards/cosine_scaled_reward/std": 0.49658530950546265, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1805.296875, + "completions/mean_terminated_length": 1077.1875, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23998627066612244, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 9452479.0, + "reward": -0.23065510392189026, + "reward_std": 0.17413878440856934, + "rewards/cosine_scaled_reward/mean": -0.23065511882305145, + "rewards/cosine_scaled_reward/std": 0.21896763145923615, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 1857.328125, + "completions/mean_terminated_length": 1285.3125, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20421437919139862, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "num_tokens": 9582924.0, + "reward": -0.17972718179225922, + "reward_std": 0.209285706281662, + "rewards/cosine_scaled_reward/mean": -0.17972716689109802, + "rewards/cosine_scaled_reward/std": 0.2716500163078308, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2001.0, + "completions/mean_length": 1883.921875, + "completions/mean_terminated_length": 1093.3636474609375, + "completions/min_length": 712.0, + "completions/min_terminated_length": 712.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2156875878572464, + "learning_rate": 9.9202926282791e-07, + "loss": -0.0, + "num_tokens": 9714215.0, + "reward": -0.14897406101226807, + "reward_std": 0.2451157122850418, + "rewards/cosine_scaled_reward/mean": -0.14897406101226807, + "rewards/cosine_scaled_reward/std": 0.38884180784225464, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 1507.65625, + "completions/mean_terminated_length": 767.1851806640625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29943305253982544, + "learning_rate": 9.91429819907136e-07, + "loss": -0.0, + "num_tokens": 9820801.0, + "reward": -0.17114077508449554, + "reward_std": 0.23199111223220825, + "rewards/cosine_scaled_reward/mean": -0.17114077508449554, + "rewards/cosine_scaled_reward/std": 0.3217289447784424, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1976.125, + "completions/mean_terminated_length": 1536.888916015625, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26230743527412415, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "num_tokens": 9957665.0, + "reward": -0.21115826070308685, + "reward_std": 0.2435196340084076, + "rewards/cosine_scaled_reward/mean": -0.21115827560424805, + "rewards/cosine_scaled_reward/std": 0.28258123993873596, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1779.28125, + "completions/mean_terminated_length": 901.4667358398438, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33359771966934204, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "num_tokens": 10082811.0, + "reward": -0.1508273482322693, + "reward_std": 0.2594776749610901, + "rewards/cosine_scaled_reward/mean": -0.1508273482322693, + "rewards/cosine_scaled_reward/std": 0.33812451362609863, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 1711.609375, + "completions/mean_terminated_length": 851.9444580078125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2805767059326172, + "learning_rate": 9.895025252503755e-07, + "loss": -0.0, + "num_tokens": 10202682.0, + "reward": -0.11850972473621368, + "reward_std": 0.2631937861442566, + "rewards/cosine_scaled_reward/mean": -0.11850972473621368, + "rewards/cosine_scaled_reward/std": 0.4419197142124176, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1749.984375, + "completions/mean_terminated_length": 1044.157958984375, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3109220266342163, + "learning_rate": 9.888172094375033e-07, + "loss": -0.0, + "num_tokens": 10325769.0, + "reward": -0.10190614312887192, + "reward_std": 0.2739119529724121, + "rewards/cosine_scaled_reward/mean": -0.10190614312887192, + "rewards/cosine_scaled_reward/std": 0.39238420128822327, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1800.390625, + "completions/mean_terminated_length": 829.0000610351562, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23385629057884216, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "num_tokens": 10451690.0, + "reward": -0.21778321266174316, + "reward_std": 0.25428956747055054, + "rewards/cosine_scaled_reward/mean": -0.21778322756290436, + "rewards/cosine_scaled_reward/std": 0.30295974016189575, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 1870.46875, + "completions/mean_terminated_length": 1337.875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21526271104812622, + "learning_rate": 9.873824502603459e-07, + "loss": -0.0, + "num_tokens": 10581720.0, + "reward": -0.19906702637672424, + "reward_std": 0.23402772843837738, + "rewards/cosine_scaled_reward/mean": -0.19906699657440186, + "rewards/cosine_scaled_reward/std": 0.28999006748199463, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 1734.875, + "completions/mean_terminated_length": 795.5, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24285966157913208, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "num_tokens": 10703608.0, + "reward": -0.16528445482254028, + "reward_std": 0.2592755854129791, + "rewards/cosine_scaled_reward/mean": -0.16528445482254028, + "rewards/cosine_scaled_reward/std": 0.37110546231269836, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1577.921875, + "completions/mean_terminated_length": 973.5357666015625, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30273520946502686, + "learning_rate": 9.85862422507884e-07, + "loss": -0.0, + "num_tokens": 10814715.0, + "reward": -0.20241931080818176, + "reward_std": 0.2693288326263428, + "rewards/cosine_scaled_reward/mean": -0.20241928100585938, + "rewards/cosine_scaled_reward/std": 0.33345305919647217, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1680.546875, + "completions/mean_terminated_length": 1068.125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2649252116680145, + "learning_rate": 9.850705248720068e-07, + "loss": -0.0, + "num_tokens": 10932782.0, + "reward": -0.018871163949370384, + "reward_std": 0.3073042631149292, + "rewards/cosine_scaled_reward/mean": -0.018871165812015533, + "rewards/cosine_scaled_reward/std": 0.3826298415660858, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1683.703125, + "completions/mean_terminated_length": 1151.269287109375, + "completions/min_length": 667.0, + "completions/min_terminated_length": 667.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24950510263442993, + "learning_rate": 9.8425742251254e-07, + "loss": -0.0, + "num_tokens": 11051539.0, + "reward": -0.11818082630634308, + "reward_std": 0.2949528694152832, + "rewards/cosine_scaled_reward/mean": -0.11818082630634308, + "rewards/cosine_scaled_reward/std": 0.34418320655822754, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1558.546875, + "completions/mean_terminated_length": 967.8275756835938, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36593058705329895, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0, + "num_tokens": 11161286.0, + "reward": -0.26082760095596313, + "reward_std": 0.1802712082862854, + "rewards/cosine_scaled_reward/mean": -0.26082760095596313, + "rewards/cosine_scaled_reward/std": 0.2037661075592041, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1505.0, + "completions/mean_length": 1827.9375, + "completions/mean_terminated_length": 1109.0667724609375, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24167831242084503, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "num_tokens": 11288842.0, + "reward": -0.11456942558288574, + "reward_std": 0.26296502351760864, + "rewards/cosine_scaled_reward/mean": -0.11456942558288574, + "rewards/cosine_scaled_reward/std": 0.3274599611759186, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 1581.546875, + "completions/mean_terminated_length": 899.8077392578125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2570616602897644, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "num_tokens": 11400053.0, + "reward": -0.17942462861537933, + "reward_std": 0.2633644640445709, + "rewards/cosine_scaled_reward/mean": -0.17942462861537933, + "rewards/cosine_scaled_reward/std": 0.30215632915496826, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 2022.328125, + "completions/mean_terminated_length": 1226.5, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25331902503967285, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0, + "num_tokens": 11540826.0, + "reward": -0.26418450474739075, + "reward_std": 0.1380012035369873, + "rewards/cosine_scaled_reward/mean": -0.26418450474739075, + "rewards/cosine_scaled_reward/std": 0.17390060424804688, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1702.0, + "completions/mean_length": 1769.546875, + "completions/mean_terminated_length": 934.1875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29503753781318665, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0, + "num_tokens": 11663845.0, + "reward": -0.08299511671066284, + "reward_std": 0.18226617574691772, + "rewards/cosine_scaled_reward/mean": -0.08299513161182404, + "rewards/cosine_scaled_reward/std": 0.46436113119125366, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 2021.5, + "completions/mean_terminated_length": 1200.0, + "completions/min_length": 1100.0, + "completions/min_terminated_length": 1100.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20416001975536346, + "learning_rate": 9.78935800506826e-07, + "loss": -0.0, + "num_tokens": 11803749.0, + "reward": -0.22345861792564392, + "reward_std": 0.18781372904777527, + "rewards/cosine_scaled_reward/mean": -0.22345861792564392, + "rewards/cosine_scaled_reward/std": 0.24531956017017365, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 1582.890625, + "completions/mean_terminated_length": 903.1154174804688, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2593792974948883, + "learning_rate": 9.779754323328192e-07, + "loss": -0.0, + "num_tokens": 11916190.0, + "reward": 0.00020215287804603577, + "reward_std": 0.24673128128051758, + "rewards/cosine_scaled_reward/mean": 0.00020216405391693115, + "rewards/cosine_scaled_reward/std": 0.49432000517845154, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1748.859375, + "completions/mean_terminated_length": 1177.772705078125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2480001151561737, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0, + "num_tokens": 12038381.0, + "reward": -0.19425566494464874, + "reward_std": 0.21240204572677612, + "rewards/cosine_scaled_reward/mean": -0.19425567984580994, + "rewards/cosine_scaled_reward/std": 0.29181501269340515, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1632.171875, + "completions/mean_terminated_length": 1062.3333740234375, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2797771692276001, + "learning_rate": 9.759921670520634e-07, + "loss": -0.0, + "num_tokens": 12153904.0, + "reward": -0.11104464530944824, + "reward_std": 0.2755987048149109, + "rewards/cosine_scaled_reward/mean": -0.11104465276002884, + "rewards/cosine_scaled_reward/std": 0.4012855887413025, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 1651.078125, + "completions/mean_terminated_length": 553.7058715820312, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3114299476146698, + "learning_rate": 9.749693666068663e-07, + "loss": -0.0, + "num_tokens": 12270741.0, + "reward": -0.1317199319601059, + "reward_std": 0.14237020909786224, + "rewards/cosine_scaled_reward/mean": -0.1317199319601059, + "rewards/cosine_scaled_reward/std": 0.3707720935344696, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1544.765625, + "completions/mean_terminated_length": 937.413818359375, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2654109001159668, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "num_tokens": 12379318.0, + "reward": -0.018167953938245773, + "reward_std": 0.29768484830856323, + "rewards/cosine_scaled_reward/mean": -0.01816795952618122, + "rewards/cosine_scaled_reward/std": 0.44200995564460754, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 1647.421875, + "completions/mean_terminated_length": 979.7916870117188, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2877754867076874, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "num_tokens": 12496185.0, + "reward": -0.10053972899913788, + "reward_std": 0.28722673654556274, + "rewards/cosine_scaled_reward/mean": -0.10053973644971848, + "rewards/cosine_scaled_reward/std": 0.36782190203666687, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1371.484375, + "completions/mean_terminated_length": 937.8204956054688, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30472108721733093, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "num_tokens": 12594112.0, + "reward": -0.20305150747299194, + "reward_std": 0.23292692005634308, + "rewards/cosine_scaled_reward/mean": -0.20305150747299194, + "rewards/cosine_scaled_reward/std": 0.3213489055633545, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 1737.984375, + "completions/mean_terminated_length": 807.9375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27034303545951843, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0, + "num_tokens": 12715695.0, + "reward": -0.29003486037254333, + "reward_std": 0.21371816098690033, + "rewards/cosine_scaled_reward/mean": -0.29003486037254333, + "rewards/cosine_scaled_reward/std": 0.224824920296669, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1692.0, + "completions/mean_length": 1705.28125, + "completions/mean_terminated_length": 893.5789794921875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27687934041023254, + "learning_rate": 9.695457105469804e-07, + "loss": -0.0, + "num_tokens": 12835297.0, + "reward": -0.15606051683425903, + "reward_std": 0.18938840925693512, + "rewards/cosine_scaled_reward/mean": -0.15606051683425903, + "rewards/cosine_scaled_reward/std": 0.24088984727859497, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 1401.015625, + "completions/mean_terminated_length": 830.1470336914062, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2770017087459564, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0, + "num_tokens": 12936250.0, + "reward": 0.018527541309595108, + "reward_std": 0.36475759744644165, + "rewards/cosine_scaled_reward/mean": 0.018527545034885406, + "rewards/cosine_scaled_reward/std": 0.4995051920413971, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1944.0, + "completions/mean_length": 1438.453125, + "completions/mean_terminated_length": 789.5806274414062, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26982930302619934, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0, + "num_tokens": 13039143.0, + "reward": 0.07083749771118164, + "reward_std": 0.29650557041168213, + "rewards/cosine_scaled_reward/mean": 0.07083749771118164, + "rewards/cosine_scaled_reward/std": 0.5094331502914429, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1722.0, + "completions/mean_length": 1787.09375, + "completions/mean_terminated_length": 1065.7647705078125, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26255276799201965, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0, + "num_tokens": 13164669.0, + "reward": -0.2222379446029663, + "reward_std": 0.240003302693367, + "rewards/cosine_scaled_reward/mean": -0.2222379446029663, + "rewards/cosine_scaled_reward/std": 0.29153531789779663, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1660.96875, + "completions/mean_terminated_length": 1095.3077392578125, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30773183703422546, + "learning_rate": 9.648384182148252e-07, + "loss": -0.0, + "num_tokens": 13281331.0, + "reward": -0.21352165937423706, + "reward_std": 0.3123124837875366, + "rewards/cosine_scaled_reward/mean": -0.21352165937423706, + "rewards/cosine_scaled_reward/std": 0.3453315496444702, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1858.921875, + "completions/mean_terminated_length": 1117.1539306640625, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24000757932662964, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0, + "num_tokens": 13411550.0, + "reward": -0.13601753115653992, + "reward_std": 0.1500597596168518, + "rewards/cosine_scaled_reward/mean": -0.1360175609588623, + "rewards/cosine_scaled_reward/std": 0.42859947681427, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1563.90625, + "completions/mean_terminated_length": 900.5185546875, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31709614396095276, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "num_tokens": 13522416.0, + "reward": -0.28067731857299805, + "reward_std": 0.1671288013458252, + "rewards/cosine_scaled_reward/mean": -0.28067731857299805, + "rewards/cosine_scaled_reward/std": 0.21458736062049866, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 1724.71875, + "completions/mean_terminated_length": 1013.5, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2646999657154083, + "learning_rate": 9.610954559391704e-07, + "loss": -0.0, + "num_tokens": 13642918.0, + "reward": -0.11896095424890518, + "reward_std": 0.28121650218963623, + "rewards/cosine_scaled_reward/mean": -0.11896096169948578, + "rewards/cosine_scaled_reward/std": 0.37855637073516846, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1918.0, + "completions/mean_terminated_length": 1216.0, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22158586978912354, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0, + "num_tokens": 13776766.0, + "reward": -0.1688530147075653, + "reward_std": 0.2535978853702545, + "rewards/cosine_scaled_reward/mean": -0.1688530296087265, + "rewards/cosine_scaled_reward/std": 0.3341792821884155, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 1556.125, + "completions/mean_terminated_length": 837.2307739257812, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2930087745189667, + "learning_rate": 9.58499865339809e-07, + "loss": -0.0, + "num_tokens": 13886654.0, + "reward": -0.10367631912231445, + "reward_std": 0.30835023522377014, + "rewards/cosine_scaled_reward/mean": -0.10367631912231445, + "rewards/cosine_scaled_reward/std": 0.42973947525024414, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1521.9375, + "completions/mean_terminated_length": 753.0769653320312, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3062143921852112, + "learning_rate": 9.571721736097088e-07, + "loss": -0.0, + "num_tokens": 13993906.0, + "reward": -0.22209212183952332, + "reward_std": 0.2074735462665558, + "rewards/cosine_scaled_reward/mean": -0.22209212183952332, + "rewards/cosine_scaled_reward/std": 0.29088398814201355, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1803.0, + "completions/mean_length": 1714.578125, + "completions/mean_terminated_length": 1031.857177734375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2564532160758972, + "learning_rate": 9.55824636882301e-07, + "loss": -0.0, + "num_tokens": 14114855.0, + "reward": -0.10947269201278687, + "reward_std": 0.30371129512786865, + "rewards/cosine_scaled_reward/mean": -0.10947269946336746, + "rewards/cosine_scaled_reward/std": 0.41030505299568176, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 1877.90625, + "completions/mean_terminated_length": 492.857177734375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25748053193092346, + "learning_rate": 9.54457320834625e-07, + "loss": -0.0, + "num_tokens": 14246425.0, + "reward": -0.19163870811462402, + "reward_std": 0.21010378003120422, + "rewards/cosine_scaled_reward/mean": -0.19163869321346283, + "rewards/cosine_scaled_reward/std": 0.3049132525920868, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 1922.546875, + "completions/mean_terminated_length": 1155.888916015625, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24102462828159332, + "learning_rate": 9.530702921077358e-07, + "loss": -0.0, + "num_tokens": 14380492.0, + "reward": -0.21347489953041077, + "reward_std": 0.19724325835704803, + "rewards/cosine_scaled_reward/mean": -0.21347489953041077, + "rewards/cosine_scaled_reward/std": 0.2647304832935333, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 1793.546875, + "completions/mean_terminated_length": 1233.75, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2439616322517395, + "learning_rate": 9.516636183034564e-07, + "loss": -0.0, + "num_tokens": 14505815.0, + "reward": -0.08845303952693939, + "reward_std": 0.30429399013519287, + "rewards/cosine_scaled_reward/mean": -0.08845303952693939, + "rewards/cosine_scaled_reward/std": 0.4648522734642029, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1405.15625, + "completions/mean_terminated_length": 936.0540771484375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32119110226631165, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0, + "num_tokens": 14606153.0, + "reward": -0.04571840912103653, + "reward_std": 0.3056246340274811, + "rewards/cosine_scaled_reward/mean": -0.04571840912103653, + "rewards/cosine_scaled_reward/std": 0.49307262897491455, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1321.40625, + "completions/mean_terminated_length": 940.8095703125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3139563500881195, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0, + "num_tokens": 14701179.0, + "reward": -0.12771092355251312, + "reward_std": 0.3157998323440552, + "rewards/cosine_scaled_reward/mean": -0.12771093845367432, + "rewards/cosine_scaled_reward/std": 0.4336044490337372, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1024.5, + "completions/mean_terminated_length": 812.0755004882812, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3428559899330139, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 14776443.0, + "reward": -0.004689367488026619, + "reward_std": 0.297618567943573, + "rewards/cosine_scaled_reward/mean": -0.004689373075962067, + "rewards/cosine_scaled_reward/std": 0.46961408853530884, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1790.765625, + "completions/mean_terminated_length": 1133.388916015625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.13942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29122933745384216, + "learning_rate": 9.458418577899774e-07, + "loss": -0.0, + "num_tokens": 14902612.0, + "reward": -0.11110783368349075, + "reward_std": 0.22664329409599304, + "rewards/cosine_scaled_reward/mean": -0.11110783368349075, + "rewards/cosine_scaled_reward/std": 0.3362382650375366, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 1658.46875, + "completions/mean_terminated_length": 1124.6666259765625, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2646903693675995, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0, + "num_tokens": 15018986.0, + "reward": -0.20784568786621094, + "reward_std": 0.270358681678772, + "rewards/cosine_scaled_reward/mean": -0.20784570276737213, + "rewards/cosine_scaled_reward/std": 0.35689592361450195, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 1494.9375, + "completions/mean_terminated_length": 868.1333618164062, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26702970266342163, + "learning_rate": 9.428149347714143e-07, + "loss": -0.0, + "num_tokens": 15125614.0, + "reward": -0.160624697804451, + "reward_std": 0.23646026849746704, + "rewards/cosine_scaled_reward/mean": -0.160624697804451, + "rewards/cosine_scaled_reward/std": 0.4083607792854309, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 1608.609375, + "completions/mean_terminated_length": 825.3478393554688, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2753336727619171, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0, + "num_tokens": 15239493.0, + "reward": -0.008194006979465485, + "reward_std": 0.21567228436470032, + "rewards/cosine_scaled_reward/mean": -0.008194014430046082, + "rewards/cosine_scaled_reward/std": 0.463446706533432, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1759.484375, + "completions/mean_terminated_length": 1076.157958984375, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24985821545124054, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 15363396.0, + "reward": -0.16068750619888306, + "reward_std": 0.22599664330482483, + "rewards/cosine_scaled_reward/mean": -0.16068752110004425, + "rewards/cosine_scaled_reward/std": 0.304392009973526, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 1842.859375, + "completions/mean_terminated_length": 1110.21435546875, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21972927451133728, + "learning_rate": 9.381311511432658e-07, + "loss": -0.0, + "num_tokens": 15492435.0, + "reward": -0.29198482632637024, + "reward_std": 0.17300401628017426, + "rewards/cosine_scaled_reward/mean": -0.29198482632637024, + "rewards/cosine_scaled_reward/std": 0.21628034114837646, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1694.578125, + "completions/mean_terminated_length": 1064.565185546875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24753950536251068, + "learning_rate": 9.36531953618799e-07, + "loss": -0.0, + "num_tokens": 15611240.0, + "reward": 0.04859344661235809, + "reward_std": 0.31105202436447144, + "rewards/cosine_scaled_reward/mean": 0.04859344661235809, + "rewards/cosine_scaled_reward/std": 0.4569285809993744, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 1917.5625, + "completions/mean_terminated_length": 1004.5, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23949742317199707, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0, + "num_tokens": 15744668.0, + "reward": -0.27834638953208923, + "reward_std": 0.16836056113243103, + "rewards/cosine_scaled_reward/mean": -0.27834638953208923, + "rewards/cosine_scaled_reward/std": 0.20021934807300568, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1502.0, + "completions/mean_length": 1725.796875, + "completions/mean_terminated_length": 902.388916015625, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23063035309314728, + "learning_rate": 9.332771203643714e-07, + "loss": -0.0, + "num_tokens": 15865623.0, + "reward": -0.19732065498828888, + "reward_std": 0.19462591409683228, + "rewards/cosine_scaled_reward/mean": -0.19732065498828888, + "rewards/cosine_scaled_reward/std": 0.2627345323562622, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 1640.8125, + "completions/mean_terminated_length": 863.45458984375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29630133509635925, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0, + "num_tokens": 15980371.0, + "reward": -0.07768938690423965, + "reward_std": 0.2543257176876068, + "rewards/cosine_scaled_reward/mean": -0.07768939435482025, + "rewards/cosine_scaled_reward/std": 0.4248148798942566, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1424.0, + "completions/mean_length": 1570.890625, + "completions/mean_terminated_length": 826.5999755859375, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2735442817211151, + "learning_rate": 9.299475664759068e-07, + "loss": -0.0, + "num_tokens": 16091972.0, + "reward": -0.1057564914226532, + "reward_std": 0.32137495279312134, + "rewards/cosine_scaled_reward/mean": -0.105756476521492, + "rewards/cosine_scaled_reward/std": 0.4788062870502472, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1971.34375, + "completions/mean_terminated_length": 1347.1429443359375, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23230750858783722, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0, + "num_tokens": 16229362.0, + "reward": -0.21333375573158264, + "reward_std": 0.1880394071340561, + "rewards/cosine_scaled_reward/mean": -0.21333375573158264, + "rewards/cosine_scaled_reward/std": 0.2557979226112366, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1682.984375, + "completions/mean_terminated_length": 1113.5599365234375, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2776358127593994, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0, + "num_tokens": 16347641.0, + "reward": -0.07218431681394577, + "reward_std": 0.19744814932346344, + "rewards/cosine_scaled_reward/mean": -0.07218432426452637, + "rewards/cosine_scaled_reward/std": 0.41042155027389526, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1852.0, + "completions/mean_length": 1269.171875, + "completions/mean_terminated_length": 736.2894897460938, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30510956048965454, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0, + "num_tokens": 16439340.0, + "reward": 0.1377476304769516, + "reward_std": 0.25976449251174927, + "rewards/cosine_scaled_reward/mean": 0.1377476155757904, + "rewards/cosine_scaled_reward/std": 0.4923737347126007, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 1624.203125, + "completions/mean_terminated_length": 917.875, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25474753975868225, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0, + "num_tokens": 16553961.0, + "reward": -0.04156734049320221, + "reward_std": 0.27987948060035706, + "rewards/cosine_scaled_reward/mean": -0.04156734049320221, + "rewards/cosine_scaled_reward/std": 0.4557124078273773, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1832.625, + "completions/mean_terminated_length": 1063.4285888671875, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2200661152601242, + "learning_rate": 9.213010742252327e-07, + "loss": -0.0, + "num_tokens": 16681857.0, + "reward": -0.2795522212982178, + "reward_std": 0.16735097765922546, + "rewards/cosine_scaled_reward/mean": -0.2795522212982178, + "rewards/cosine_scaled_reward/std": 0.22360830008983612, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 1547.90625, + "completions/mean_terminated_length": 981.1333618164062, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.247065007686615, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0, + "num_tokens": 16792235.0, + "reward": -0.1421782374382019, + "reward_std": 0.25017279386520386, + "rewards/cosine_scaled_reward/mean": -0.1421782374382019, + "rewards/cosine_scaled_reward/std": 0.3903765082359314, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1898.375, + "completions/mean_terminated_length": 1177.45458984375, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25471416115760803, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0, + "num_tokens": 16924371.0, + "reward": -0.24234679341316223, + "reward_std": 0.15713179111480713, + "rewards/cosine_scaled_reward/mean": -0.24234679341316223, + "rewards/cosine_scaled_reward/std": 0.17467617988586426, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1577.625, + "completions/mean_terminated_length": 1044.533447265625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2628695070743561, + "learning_rate": 9.158953424711624e-07, + "loss": -0.0, + "num_tokens": 17035563.0, + "reward": -0.12413343787193298, + "reward_std": 0.20063763856887817, + "rewards/cosine_scaled_reward/mean": -0.12413343787193298, + "rewards/cosine_scaled_reward/std": 0.5006609559059143, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1405.125, + "completions/mean_terminated_length": 993.025634765625, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2682877779006958, + "learning_rate": 9.140576474687263e-07, + "loss": -0.0, + "num_tokens": 17136051.0, + "reward": -0.02423717826604843, + "reward_std": 0.2661462128162384, + "rewards/cosine_scaled_reward/mean": -0.02423717826604843, + "rewards/cosine_scaled_reward/std": 0.502265214920044, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1763.515625, + "completions/mean_terminated_length": 1347.7308349609375, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24265550076961517, + "learning_rate": 9.122022088101613e-07, + "loss": -0.0, + "num_tokens": 17259420.0, + "reward": -0.23560766875743866, + "reward_std": 0.22989924252033234, + "rewards/cosine_scaled_reward/mean": -0.23560766875743866, + "rewards/cosine_scaled_reward/std": 0.28772976994514465, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 1600.671875, + "completions/mean_terminated_length": 1153.34375, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30536147952079773, + "learning_rate": 9.103291169269299e-07, + "loss": -0.0, + "num_tokens": 17372679.0, + "reward": -0.23412726819515228, + "reward_std": 0.226594477891922, + "rewards/cosine_scaled_reward/mean": -0.2341272532939911, + "rewards/cosine_scaled_reward/std": 0.2685011625289917, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 1708.0625, + "completions/mean_terminated_length": 1012.0, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2859592139720917, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0, + "num_tokens": 17493483.0, + "reward": -0.11928378790616989, + "reward_std": 0.2819562554359436, + "rewards/cosine_scaled_reward/mean": -0.11928380280733109, + "rewards/cosine_scaled_reward/std": 0.41741910576820374, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 1277.78125, + "completions/mean_terminated_length": 845.707275390625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.310493141412735, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0, + "num_tokens": 17585205.0, + "reward": 0.009949762374162674, + "reward_std": 0.32572609186172485, + "rewards/cosine_scaled_reward/mean": 0.009949766099452972, + "rewards/cosine_scaled_reward/std": 0.5299619436264038, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1550.625, + "completions/mean_terminated_length": 986.9334106445312, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2404046207666397, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 17695061.0, + "reward": -0.17625686526298523, + "reward_std": 0.2529022991657257, + "rewards/cosine_scaled_reward/mean": -0.17625686526298523, + "rewards/cosine_scaled_reward/std": 0.3359045386314392, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 1741.703125, + "completions/mean_terminated_length": 1156.95458984375, + "completions/min_length": 591.0, + "completions/min_terminated_length": 591.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2498754858970642, + "learning_rate": 9.026620557966279e-07, + "loss": -0.0, + "num_tokens": 17817314.0, + "reward": -0.26471418142318726, + "reward_std": 0.2048022449016571, + "rewards/cosine_scaled_reward/mean": -0.26471418142318726, + "rewards/cosine_scaled_reward/std": 0.2656060457229614, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 1386.0625, + "completions/mean_terminated_length": 903.0270385742188, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2538217306137085, + "learning_rate": 9.007020842191634e-07, + "loss": -0.0, + "num_tokens": 17917206.0, + "reward": -0.10874275863170624, + "reward_std": 0.24236595630645752, + "rewards/cosine_scaled_reward/mean": -0.10874275863170624, + "rewards/cosine_scaled_reward/std": 0.3927372395992279, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2017.0, + "completions/mean_length": 1735.96875, + "completions/mean_terminated_length": 1140.272705078125, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23916038870811462, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0, + "num_tokens": 18040204.0, + "reward": -0.20906513929367065, + "reward_std": 0.2755752205848694, + "rewards/cosine_scaled_reward/mean": -0.20906512439250946, + "rewards/cosine_scaled_reward/std": 0.38517922163009644, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1379.359375, + "completions/mean_terminated_length": 978.1749877929688, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30970829725265503, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0, + "num_tokens": 18138987.0, + "reward": -0.14114701747894287, + "reward_std": 0.3519541621208191, + "rewards/cosine_scaled_reward/mean": -0.14114701747894287, + "rewards/cosine_scaled_reward/std": 0.39396560192108154, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1497.328125, + "completions/mean_terminated_length": 1011.441162109375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.17257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2874428331851959, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0, + "num_tokens": 18245496.0, + "reward": -0.04711150377988815, + "reward_std": 0.33344799280166626, + "rewards/cosine_scaled_reward/mean": -0.04711151123046875, + "rewards/cosine_scaled_reward/std": 0.41477611660957336, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 1777.140625, + "completions/mean_terminated_length": 964.5625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.1737142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28282323479652405, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0, + "num_tokens": 18369897.0, + "reward": -0.2543114423751831, + "reward_std": 0.18715068697929382, + "rewards/cosine_scaled_reward/mean": -0.2543114423751831, + "rewards/cosine_scaled_reward/std": 0.19382856786251068, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 1564.0625, + "completions/mean_terminated_length": 900.888916015625, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.17485714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27684876322746277, + "learning_rate": 8.906477750432903e-07, + "loss": -0.0, + "num_tokens": 18481141.0, + "reward": -0.1415693461894989, + "reward_std": 0.23039600253105164, + "rewards/cosine_scaled_reward/mean": -0.1415693461894989, + "rewards/cosine_scaled_reward/std": 0.2940608859062195, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1829.328125, + "completions/mean_terminated_length": 1224.7647705078125, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24094167351722717, + "learning_rate": 8.88586709003076e-07, + "loss": -0.0, + "num_tokens": 18609282.0, + "reward": -0.2521882653236389, + "reward_std": 0.20982292294502258, + "rewards/cosine_scaled_reward/mean": -0.2521882653236389, + "rewards/cosine_scaled_reward/std": 0.23373161256313324, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 1517.765625, + "completions/mean_terminated_length": 916.8333740234375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.17714285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2674770653247833, + "learning_rate": 8.865091407243394e-07, + "loss": -0.0, + "num_tokens": 18717043.0, + "reward": -0.028832588344812393, + "reward_std": 0.22500035166740417, + "rewards/cosine_scaled_reward/mean": -0.028832584619522095, + "rewards/cosine_scaled_reward/std": 0.4698766767978668, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 1731.453125, + "completions/mean_terminated_length": 781.8125, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.1782857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23764768242835999, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0, + "num_tokens": 18837960.0, + "reward": -0.10049945116043091, + "reward_std": 0.2521243393421173, + "rewards/cosine_scaled_reward/mean": -0.10049945116043091, + "rewards/cosine_scaled_reward/std": 0.4200229048728943, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1531.1875, + "completions/mean_terminated_length": 1014.375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.17942857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28145694732666016, + "learning_rate": 8.823049032816478e-07, + "loss": -0.0, + "num_tokens": 18945916.0, + "reward": -0.22566190361976624, + "reward_std": 0.19013158977031708, + "rewards/cosine_scaled_reward/mean": -0.22566190361976624, + "rewards/cosine_scaled_reward/std": 0.24779614806175232, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 1425.203125, + "completions/mean_terminated_length": 909.1714477539062, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.18057142857142858, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.24377204477787018, + "learning_rate": 8.801784390262943e-07, + "loss": -0.0, + "num_tokens": 19047249.0, + "reward": -0.021197691559791565, + "reward_std": 0.22868266701698303, + "rewards/cosine_scaled_reward/mean": -0.021197684109210968, + "rewards/cosine_scaled_reward/std": 0.46860653162002563, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 1600.4375, + "completions/mean_terminated_length": 1093.2000732421875, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.18171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2479163259267807, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0, + "num_tokens": 19161357.0, + "reward": -0.23690757155418396, + "reward_std": 0.20615912973880768, + "rewards/cosine_scaled_reward/mean": -0.23690758645534515, + "rewards/cosine_scaled_reward/std": 0.32988741993904114, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 1527.375, + "completions/mean_terminated_length": 937.3333740234375, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.18285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2751549184322357, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0, + "num_tokens": 19270693.0, + "reward": -0.12134292721748352, + "reward_std": 0.2621082067489624, + "rewards/cosine_scaled_reward/mean": -0.12134292721748352, + "rewards/cosine_scaled_reward/std": 0.4263574779033661, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1419.484375, + "completions/mean_terminated_length": 989.4473876953125, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2905498445034027, + "learning_rate": 8.737029101523929e-07, + "loss": -0.0, + "num_tokens": 19371532.0, + "reward": -0.1314084678888321, + "reward_std": 0.25361165404319763, + "rewards/cosine_scaled_reward/mean": -0.1314084678888321, + "rewards/cosine_scaled_reward/std": 0.36607682704925537, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1584.5, + "completions/mean_terminated_length": 949.3333129882812, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.18514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3278505206108093, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0, + "num_tokens": 19483244.0, + "reward": -0.16470149159431458, + "reward_std": 0.26964259147644043, + "rewards/cosine_scaled_reward/mean": -0.16470149159431458, + "rewards/cosine_scaled_reward/std": 0.31499552726745605, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 1439.6875, + "completions/mean_terminated_length": 868.242431640625, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.18628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29175421595573425, + "learning_rate": 8.693068314414344e-07, + "loss": -0.0, + "num_tokens": 19586568.0, + "reward": 0.10278680920600891, + "reward_std": 0.271634042263031, + "rewards/cosine_scaled_reward/mean": 0.10278680920600891, + "rewards/cosine_scaled_reward/std": 0.4813632071018219, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 1406.703125, + "completions/mean_terminated_length": 995.6154174804688, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.18742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26038941740989685, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0, + "num_tokens": 19687125.0, + "reward": -0.08026184141635895, + "reward_std": 0.21900159120559692, + "rewards/cosine_scaled_reward/mean": -0.08026183396577835, + "rewards/cosine_scaled_reward/std": 0.4170342683792114, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1896.0, + "completions/mean_length": 1438.921875, + "completions/mean_terminated_length": 994.45947265625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.18857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29659712314605713, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0, + "num_tokens": 19790632.0, + "reward": -0.12293928861618042, + "reward_std": 0.23739376664161682, + "rewards/cosine_scaled_reward/mean": -0.12293929606676102, + "rewards/cosine_scaled_reward/std": 0.3927924335002899, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1697.765625, + "completions/mean_terminated_length": 1073.434814453125, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "epoch": 0.18971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21795569360256195, + "learning_rate": 8.625962667065487e-07, + "loss": -0.0, + "num_tokens": 19910865.0, + "reward": -0.20583154261112213, + "reward_std": 0.2378866970539093, + "rewards/cosine_scaled_reward/mean": -0.20583152770996094, + "rewards/cosine_scaled_reward/std": 0.26525840163230896, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1674.0, + "completions/mean_length": 1406.890625, + "completions/mean_terminated_length": 995.923095703125, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.19085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2583286166191101, + "learning_rate": 8.603287946810513e-07, + "loss": -0.0, + "num_tokens": 20012450.0, + "reward": -0.14853140711784363, + "reward_std": 0.23831486701965332, + "rewards/cosine_scaled_reward/mean": -0.14853140711784363, + "rewards/cosine_scaled_reward/std": 0.2794221341609955, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 1574.921875, + "completions/mean_terminated_length": 1038.7667236328125, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2431253343820572, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0, + "num_tokens": 20124085.0, + "reward": -0.07713659107685089, + "reward_std": 0.2686954736709595, + "rewards/cosine_scaled_reward/mean": -0.07713659107685089, + "rewards/cosine_scaled_reward/std": 0.37947362661361694, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1507.90625, + "completions/mean_terminated_length": 1161.6923828125, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "epoch": 0.19314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23702675104141235, + "learning_rate": 8.557485869176825e-07, + "loss": -0.0, + "num_tokens": 20231215.0, + "reward": 0.20358076691627502, + "reward_std": 0.2683357000350952, + "rewards/cosine_scaled_reward/mean": 0.20358076691627502, + "rewards/cosine_scaled_reward/std": 0.5625549554824829, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1290.53125, + "completions/mean_terminated_length": 836.0499877929688, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.19428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2915634512901306, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0, + "num_tokens": 20323593.0, + "reward": -0.04663477838039398, + "reward_std": 0.1683385670185089, + "rewards/cosine_scaled_reward/mean": -0.04663477838039398, + "rewards/cosine_scaled_reward/std": 0.432047039270401, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1764.0, + "completions/mean_length": 1461.703125, + "completions/mean_terminated_length": 875.40625, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.19542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2520189881324768, + "learning_rate": 8.511087728614862e-07, + "loss": -0.0, + "num_tokens": 20427534.0, + "reward": -0.03435331583023071, + "reward_std": 0.18240094184875488, + "rewards/cosine_scaled_reward/mean": -0.034353308379650116, + "rewards/cosine_scaled_reward/std": 0.4340380132198334, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1483.359375, + "completions/mean_terminated_length": 1071.3243408203125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.19657142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31458213925361633, + "learning_rate": 8.487667956935087e-07, + "loss": -0.0, + "num_tokens": 20533085.0, + "reward": 0.1847388744354248, + "reward_std": 0.20619311928749084, + "rewards/cosine_scaled_reward/mean": 0.18473894894123077, + "rewards/cosine_scaled_reward/std": 0.512468159198761, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1113.96875, + "completions/mean_terminated_length": 689.4091186523438, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.1977142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3380848467350006, + "learning_rate": 8.464102570534061e-07, + "loss": -0.0, + "num_tokens": 20615691.0, + "reward": -0.05022401362657547, + "reward_std": 0.2543797492980957, + "rewards/cosine_scaled_reward/mean": -0.05022402107715607, + "rewards/cosine_scaled_reward/std": 0.38979703187942505, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1151.390625, + "completions/mean_terminated_length": 985.3518676757812, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.19885714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2737923562526703, + "learning_rate": 8.440392717955475e-07, + "loss": -0.0, + "num_tokens": 20699716.0, + "reward": -0.05732875317335129, + "reward_std": 0.2915908694267273, + "rewards/cosine_scaled_reward/mean": -0.05732874572277069, + "rewards/cosine_scaled_reward/std": 0.4477607011795044, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1481.765625, + "completions/mean_terminated_length": 1068.567626953125, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26141369342803955, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0, + "num_tokens": 20805373.0, + "reward": -0.02904359996318817, + "reward_std": 0.24616873264312744, + "rewards/cosine_scaled_reward/mean": -0.02904359996318817, + "rewards/cosine_scaled_reward/std": 0.45150378346443176, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 1445.53125, + "completions/mean_terminated_length": 913.941162109375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.20114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.314208984375, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0, + "num_tokens": 20909055.0, + "reward": -0.165739506483078, + "reward_std": 0.2986479103565216, + "rewards/cosine_scaled_reward/mean": -0.165739506483078, + "rewards/cosine_scaled_reward/std": 0.3703363239765167, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 1427.890625, + "completions/mean_terminated_length": 1003.6052856445312, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.2022857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2828216254711151, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0, + "num_tokens": 21010664.0, + "reward": -0.07454323768615723, + "reward_std": 0.23275166749954224, + "rewards/cosine_scaled_reward/mean": -0.07454322278499603, + "rewards/cosine_scaled_reward/std": 0.3976919949054718, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1304.5, + "completions/mean_terminated_length": 915.047607421875, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.20342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28618043661117554, + "learning_rate": 8.344131861991828e-07, + "loss": -0.0, + "num_tokens": 21105688.0, + "reward": 0.002464752644300461, + "reward_std": 0.3809230327606201, + "rewards/cosine_scaled_reward/mean": 0.002464751712977886, + "rewards/cosine_scaled_reward/std": 0.46308550238609314, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1658.5, + "completions/mean_terminated_length": 1050.8800048828125, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "epoch": 0.20457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250982403755188, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0, + "num_tokens": 21222664.0, + "reward": -0.18153682351112366, + "reward_std": 0.2734690308570862, + "rewards/cosine_scaled_reward/mean": -0.18153685331344604, + "rewards/cosine_scaled_reward/std": 0.33050045371055603, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 1288.34375, + "completions/mean_terminated_length": 943.0454711914062, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.2057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3166482150554657, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0, + "num_tokens": 21316294.0, + "reward": 0.20186525583267212, + "reward_std": 0.31781116127967834, + "rewards/cosine_scaled_reward/mean": 0.20186525583267212, + "rewards/cosine_scaled_reward/std": 0.49267733097076416, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1591.796875, + "completions/mean_terminated_length": 925.0385131835938, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.20685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26195216178894043, + "learning_rate": 8.270476638965461e-07, + "loss": -0.0, + "num_tokens": 21429641.0, + "reward": -0.060104113072156906, + "reward_std": 0.23563489317893982, + "rewards/cosine_scaled_reward/mean": -0.06010409817099571, + "rewards/cosine_scaled_reward/std": 0.43010979890823364, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 1365.421875, + "completions/mean_terminated_length": 927.871826171875, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2551879286766052, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0, + "num_tokens": 21526820.0, + "reward": -0.15826305747032166, + "reward_std": 0.24291284382343292, + "rewards/cosine_scaled_reward/mean": -0.15826307237148285, + "rewards/cosine_scaled_reward/std": 0.30778464674949646, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1320.515625, + "completions/mean_terminated_length": 912.4146118164062, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.20914285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32218796014785767, + "learning_rate": 8.220696016880687e-07, + "loss": -0.0, + "num_tokens": 21621949.0, + "reward": -0.07413223385810852, + "reward_std": 0.35920435190200806, + "rewards/cosine_scaled_reward/mean": -0.07413223385810852, + "rewards/cosine_scaled_reward/std": 0.45890137553215027, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1465.71875, + "completions/mean_terminated_length": 1012.8333129882812, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.2102857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27700135111808777, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0, + "num_tokens": 21727107.0, + "reward": -0.158505380153656, + "reward_std": 0.18604165315628052, + "rewards/cosine_scaled_reward/mean": -0.158505380153656, + "rewards/cosine_scaled_reward/std": 0.29056471586227417, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1286.8125, + "completions/mean_terminated_length": 940.8182373046875, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.21142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2607719898223877, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0, + "num_tokens": 21819647.0, + "reward": -0.28775715827941895, + "reward_std": 0.19134438037872314, + "rewards/cosine_scaled_reward/mean": -0.28775715827941895, + "rewards/cosine_scaled_reward/std": 0.21350952982902527, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1825.0, + "completions/mean_length": 1480.09375, + "completions/mean_terminated_length": 1065.6756591796875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.21257142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2575705349445343, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0, + "num_tokens": 21925069.0, + "reward": -0.13343556225299835, + "reward_std": 0.2557746171951294, + "rewards/cosine_scaled_reward/mean": -0.13343556225299835, + "rewards/cosine_scaled_reward/std": 0.36808857321739197, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 1347.71875, + "completions/mean_terminated_length": 1114.291748046875, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.21371428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31024712324142456, + "learning_rate": 8.119553365707802e-07, + "loss": -0.0, + "num_tokens": 22021747.0, + "reward": -0.09627380967140198, + "reward_std": 0.2472851276397705, + "rewards/cosine_scaled_reward/mean": -0.09627379477024078, + "rewards/cosine_scaled_reward/std": 0.41195833683013916, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1950.0, + "completions/mean_length": 1774.140625, + "completions/mean_terminated_length": 1251.3182373046875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "epoch": 0.21485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2481517493724823, + "learning_rate": 8.093945422764069e-07, + "loss": -0.0, + "num_tokens": 22147092.0, + "reward": -0.20224528014659882, + "reward_std": 0.2598743736743927, + "rewards/cosine_scaled_reward/mean": -0.20224529504776, + "rewards/cosine_scaled_reward/std": 0.33939501643180847, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1234.328125, + "completions/mean_terminated_length": 808.1190795898438, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31437209248542786, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0, + "num_tokens": 22235377.0, + "reward": -0.09877841919660568, + "reward_std": 0.2865467667579651, + "rewards/cosine_scaled_reward/mean": -0.09877842664718628, + "rewards/cosine_scaled_reward/std": 0.4444861114025116, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1189.3125, + "completions/mean_terminated_length": 1011.0943603515625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.21714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28301987051963806, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0, + "num_tokens": 22321261.0, + "reward": -0.028003819286823273, + "reward_std": 0.27996310591697693, + "rewards/cosine_scaled_reward/mean": -0.028003819286823273, + "rewards/cosine_scaled_reward/std": 0.4598979353904724, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1061.140625, + "completions/mean_terminated_length": 1012.6065063476562, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.21828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31064528226852417, + "learning_rate": 8.01636806561836e-07, + "loss": -0.0, + "num_tokens": 22399462.0, + "reward": 0.07088040560483932, + "reward_std": 0.3638381361961365, + "rewards/cosine_scaled_reward/mean": 0.07088041305541992, + "rewards/cosine_scaled_reward/std": 0.5184580683708191, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1542.21875, + "completions/mean_terminated_length": 1258.48779296875, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.21942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2742583751678467, + "learning_rate": 7.990261971595048e-07, + "loss": -0.0, + "num_tokens": 22509460.0, + "reward": -0.14651048183441162, + "reward_std": 0.2414294183254242, + "rewards/cosine_scaled_reward/mean": -0.14651048183441162, + "rewards/cosine_scaled_reward/std": 0.3039136528968811, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1717.8125, + "completions/mean_terminated_length": 1202.719970703125, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 0.22057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24609725177288055, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0, + "num_tokens": 22630544.0, + "reward": -0.28856799006462097, + "reward_std": 0.14614446461200714, + "rewards/cosine_scaled_reward/mean": -0.28856799006462097, + "rewards/cosine_scaled_reward/std": 0.17294423282146454, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1692.546875, + "completions/mean_terminated_length": 1058.9130859375, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "epoch": 0.22171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27539438009262085, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0, + "num_tokens": 22750227.0, + "reward": -0.10590282082557678, + "reward_std": 0.25362446904182434, + "rewards/cosine_scaled_reward/mean": -0.10590282082557678, + "rewards/cosine_scaled_reward/std": 0.36822667717933655, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1410.09375, + "completions/mean_terminated_length": 1120.1363525390625, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "epoch": 0.22285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23645445704460144, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0, + "num_tokens": 22851617.0, + "reward": -0.12888561189174652, + "reward_std": 0.32565274834632874, + "rewards/cosine_scaled_reward/mean": -0.12888562679290771, + "rewards/cosine_scaled_reward/std": 0.3842463195323944, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1625.234375, + "completions/mean_terminated_length": 1146.10009765625, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27093231678009033, + "learning_rate": 7.884636689049422e-07, + "loss": -0.0, + "num_tokens": 22967224.0, + "reward": -0.1617402583360672, + "reward_std": 0.3036938011646271, + "rewards/cosine_scaled_reward/mean": -0.1617402583360672, + "rewards/cosine_scaled_reward/std": 0.390837699174881, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1099.96875, + "completions/mean_terminated_length": 924.4074096679688, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.22514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31926214694976807, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0, + "num_tokens": 23047990.0, + "reward": 0.09089304506778717, + "reward_std": 0.40348750352859497, + "rewards/cosine_scaled_reward/mean": 0.09089304506778717, + "rewards/cosine_scaled_reward/std": 0.5607035756111145, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1312.96875, + "completions/mean_terminated_length": 1125.60791015625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.22628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2919371426105499, + "learning_rate": 7.831121542179086e-07, + "loss": -0.0, + "num_tokens": 23143524.0, + "reward": 0.0047197043895721436, + "reward_std": 0.3408518433570862, + "rewards/cosine_scaled_reward/mean": 0.004719719290733337, + "rewards/cosine_scaled_reward/std": 0.46544134616851807, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1417.171875, + "completions/mean_terminated_length": 1224.0611572265625, + "completions/min_length": 540.0, + "completions/min_terminated_length": 540.0, + "epoch": 0.22742857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24992844462394714, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0, + "num_tokens": 23245727.0, + "reward": -0.19424019753932953, + "reward_std": 0.28145354986190796, + "rewards/cosine_scaled_reward/mean": -0.19424019753932953, + "rewards/cosine_scaled_reward/std": 0.3362065255641937, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 1180.515625, + "completions/mean_terminated_length": 891.3541870117188, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.22857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2989206612110138, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0, + "num_tokens": 23331400.0, + "reward": 0.08669155836105347, + "reward_std": 0.3488098084926605, + "rewards/cosine_scaled_reward/mean": 0.08669155836105347, + "rewards/cosine_scaled_reward/std": 0.46097004413604736, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 1399.1875, + "completions/mean_terminated_length": 789.697021484375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.2297142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3482288122177124, + "learning_rate": 7.75e-07, + "loss": 0.0, + "num_tokens": 23431972.0, + "reward": 0.05170612782239914, + "reward_std": 0.33521372079849243, + "rewards/cosine_scaled_reward/mean": 0.05170612409710884, + "rewards/cosine_scaled_reward/std": 0.4809432625770569, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 1257.5, + "completions/mean_terminated_length": 871.4418334960938, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.23085714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24333854019641876, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0, + "num_tokens": 23522356.0, + "reward": 0.20302791893482208, + "reward_std": 0.24270620942115784, + "rewards/cosine_scaled_reward/mean": 0.20302791893482208, + "rewards/cosine_scaled_reward/std": 0.5547645688056946, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1545.421875, + "completions/mean_terminated_length": 1299.9766845703125, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24562042951583862, + "learning_rate": 7.695368466124296e-07, + "loss": -0.0, + "num_tokens": 23632679.0, + "reward": 0.07688053697347641, + "reward_std": 0.32062458992004395, + "rewards/cosine_scaled_reward/mean": 0.07688053697347641, + "rewards/cosine_scaled_reward/std": 0.5180152058601379, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1250.28125, + "completions/mean_terminated_length": 961.74462890625, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.23314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2952634394168854, + "learning_rate": 7.667891533457718e-07, + "loss": -0.0, + "num_tokens": 23722417.0, + "reward": 0.0316191166639328, + "reward_std": 0.23991048336029053, + "rewards/cosine_scaled_reward/mean": 0.0316191241145134, + "rewards/cosine_scaled_reward/std": 0.4419180452823639, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1239.6875, + "completions/mean_terminated_length": 923.3912963867188, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3470660448074341, + "learning_rate": 7.640308940816239e-07, + "loss": -0.0, + "num_tokens": 23812821.0, + "reward": 0.04175570607185364, + "reward_std": 0.32632672786712646, + "rewards/cosine_scaled_reward/mean": 0.04175570607185364, + "rewards/cosine_scaled_reward/std": 0.5073853135108948, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 1563.75, + "completions/mean_terminated_length": 1162.5142822265625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.23542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2750691771507263, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0, + "num_tokens": 23923693.0, + "reward": -0.1269976794719696, + "reward_std": 0.2818883955478668, + "rewards/cosine_scaled_reward/mean": -0.1269976794719696, + "rewards/cosine_scaled_reward/std": 0.3301773965358734, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1346.515625, + "completions/mean_terminated_length": 1072.021728515625, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.23657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34398096799850464, + "learning_rate": 7.584832158039378e-07, + "loss": -0.0, + "num_tokens": 24020470.0, + "reward": -0.11099155992269516, + "reward_std": 0.32174742221832275, + "rewards/cosine_scaled_reward/mean": -0.11099155247211456, + "rewards/cosine_scaled_reward/std": 0.4000038504600525, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1350.71875, + "completions/mean_terminated_length": 1206.0, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "epoch": 0.2377142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2667733430862427, + "learning_rate": 7.556940671764124e-07, + "loss": -0.0, + "num_tokens": 24117244.0, + "reward": -0.012698620557785034, + "reward_std": 0.27501654624938965, + "rewards/cosine_scaled_reward/mean": -0.01269860565662384, + "rewards/cosine_scaled_reward/std": 0.47749608755111694, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1940.0, + "completions/mean_length": 1186.484375, + "completions/mean_terminated_length": 922.7550659179688, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.23885714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34667861461639404, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0, + "num_tokens": 24203091.0, + "reward": -0.132795050740242, + "reward_std": 0.2735438942909241, + "rewards/cosine_scaled_reward/mean": -0.132795050740242, + "rewards/cosine_scaled_reward/std": 0.3893483579158783, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1450.0625, + "completions/mean_terminated_length": 1197.5999755859375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21571174263954163, + "learning_rate": 7.500858306332172e-07, + "loss": -0.0, + "num_tokens": 24306703.0, + "reward": -0.06977479159832001, + "reward_std": 0.24265971779823303, + "rewards/cosine_scaled_reward/mean": -0.06977479159832001, + "rewards/cosine_scaled_reward/std": 0.45415669679641724, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1801.0, + "completions/mean_length": 1201.609375, + "completions/mean_terminated_length": 964.6199951171875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.24114285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2720986306667328, + "learning_rate": 7.472670160550848e-07, + "loss": -0.0, + "num_tokens": 24394846.0, + "reward": 0.0786014124751091, + "reward_std": 0.2013745754957199, + "rewards/cosine_scaled_reward/mean": 0.0786014199256897, + "rewards/cosine_scaled_reward/std": 0.4884081780910492, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 1176.359375, + "completions/mean_terminated_length": 808.3333740234375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.2422857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3127840757369995, + "learning_rate": 7.444385869608921e-07, + "loss": -0.0, + "num_tokens": 24480613.0, + "reward": 0.11307461559772491, + "reward_std": 0.284263014793396, + "rewards/cosine_scaled_reward/mean": 0.11307463049888611, + "rewards/cosine_scaled_reward/std": 0.5329286456108093, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1114.03125, + "completions/mean_terminated_length": 776.2127685546875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.24342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.343943327665329, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0, + "num_tokens": 24561775.0, + "reward": -0.10338220745325089, + "reward_std": 0.2921890914440155, + "rewards/cosine_scaled_reward/mean": -0.10338220745325089, + "rewards/cosine_scaled_reward/std": 0.34980201721191406, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1992.0, + "completions/mean_length": 1386.0625, + "completions/mean_terminated_length": 1039.3333740234375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.24457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26102328300476074, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0, + "num_tokens": 24662027.0, + "reward": 0.02548668347299099, + "reward_std": 0.3174683451652527, + "rewards/cosine_scaled_reward/mean": 0.025486690923571587, + "rewards/cosine_scaled_reward/std": 0.46307510137557983, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 1318.46875, + "completions/mean_terminated_length": 962.18603515625, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.24571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2819078266620636, + "learning_rate": 7.358969934210438e-07, + "loss": -0.0, + "num_tokens": 24756897.0, + "reward": -0.11348340660333633, + "reward_std": 0.1657339334487915, + "rewards/cosine_scaled_reward/mean": -0.11348340660333633, + "rewards/cosine_scaled_reward/std": 0.41132697463035583, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 971.234375, + "completions/mean_terminated_length": 839.0, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.24685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3146374225616455, + "learning_rate": 7.330314893841101e-07, + "loss": -0.0, + "num_tokens": 24828336.0, + "reward": 0.09829875081777573, + "reward_std": 0.34463635087013245, + "rewards/cosine_scaled_reward/mean": 0.09829875826835632, + "rewards/cosine_scaled_reward/std": 0.5223532319068909, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1323.546875, + "completions/mean_terminated_length": 1017.6666870117188, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25747084617614746, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0, + "num_tokens": 24923395.0, + "reward": -0.08631986379623413, + "reward_std": 0.3201732039451599, + "rewards/cosine_scaled_reward/mean": -0.08631986379623413, + "rewards/cosine_scaled_reward/std": 0.41996634006500244, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1421.5, + "completions/mean_terminated_length": 1115.534912109375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.24914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24970035254955292, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0, + "num_tokens": 25025059.0, + "reward": -0.22788012027740479, + "reward_std": 0.22475574910640717, + "rewards/cosine_scaled_reward/mean": -0.22788012027740479, + "rewards/cosine_scaled_reward/std": 0.2934871315956116, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2022.0, + "completions/mean_length": 1120.609375, + "completions/mean_terminated_length": 948.870361328125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2502857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34460073709487915, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0, + "num_tokens": 25107090.0, + "reward": 0.02718304470181465, + "reward_std": 0.3376328647136688, + "rewards/cosine_scaled_reward/mean": 0.027183040976524353, + "rewards/cosine_scaled_reward/std": 0.5283166170120239, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1366.828125, + "completions/mean_terminated_length": 1034.162841796875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.25142857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4521820843219757, + "learning_rate": 7.214816693576234e-07, + "loss": -0.0, + "num_tokens": 25204871.0, + "reward": -0.25229814648628235, + "reward_std": 0.17562136054039001, + "rewards/cosine_scaled_reward/mean": -0.25229811668395996, + "rewards/cosine_scaled_reward/std": 0.19320644438266754, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1156.53125, + "completions/mean_terminated_length": 950.8077392578125, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.25257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26752790808677673, + "learning_rate": 7.185729670371604e-07, + "loss": -0.0, + "num_tokens": 25289449.0, + "reward": 0.24696281552314758, + "reward_std": 0.273512065410614, + "rewards/cosine_scaled_reward/mean": 0.24696281552314758, + "rewards/cosine_scaled_reward/std": 0.46473291516304016, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1243.3125, + "completions/mean_terminated_length": 903.5556030273438, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.2537142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27195242047309875, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0, + "num_tokens": 25379149.0, + "reward": 0.007332861423492432, + "reward_std": 0.29589229822158813, + "rewards/cosine_scaled_reward/mean": 0.007332857698202133, + "rewards/cosine_scaled_reward/std": 0.48079609870910645, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1200.3125, + "completions/mean_terminated_length": 962.9599609375, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "epoch": 0.25485714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2661433219909668, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0, + "num_tokens": 25465705.0, + "reward": 0.03970642387866974, + "reward_std": 0.2005533128976822, + "rewards/cosine_scaled_reward/mean": 0.03970641642808914, + "rewards/cosine_scaled_reward/std": 0.5048101544380188, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2042.0, + "completions/mean_length": 1705.46875, + "completions/mean_terminated_length": 1383.697021484375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23623619973659515, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0, + "num_tokens": 25586263.0, + "reward": -0.07307912409305573, + "reward_std": 0.350577175617218, + "rewards/cosine_scaled_reward/mean": -0.07307912409305573, + "rewards/cosine_scaled_reward/std": 0.38458916544914246, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1527.640625, + "completions/mean_terminated_length": 1122.9166259765625, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.2571428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2600167393684387, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0, + "num_tokens": 25694624.0, + "reward": -0.18486955761909485, + "reward_std": 0.24510705471038818, + "rewards/cosine_scaled_reward/mean": -0.18486955761909485, + "rewards/cosine_scaled_reward/std": 0.29842856526374817, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1293.0, + "completions/mean_terminated_length": 1118.769287109375, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.2582857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24967192113399506, + "learning_rate": 7.039090644965509e-07, + "loss": -0.0, + "num_tokens": 25788016.0, + "reward": 0.10143648833036423, + "reward_std": 0.3550751805305481, + "rewards/cosine_scaled_reward/mean": 0.10143650323152542, + "rewards/cosine_scaled_reward/std": 0.48985999822616577, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 975.421875, + "completions/mean_terminated_length": 958.3968505859375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.25942857142857145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33750462532043457, + "learning_rate": 7.009532063876148e-07, + "loss": -0.0, + "num_tokens": 25860827.0, + "reward": 0.017139945179224014, + "reward_std": 0.40727996826171875, + "rewards/cosine_scaled_reward/mean": 0.017139948904514313, + "rewards/cosine_scaled_reward/std": 0.4528072476387024, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 1005.453125, + "completions/mean_terminated_length": 834.8544921875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.26057142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3340362310409546, + "learning_rate": 6.979899910323624e-07, + "loss": -0.0, + "num_tokens": 25935848.0, + "reward": 0.1363377869129181, + "reward_std": 0.31884267926216125, + "rewards/cosine_scaled_reward/mean": 0.1363377869129181, + "rewards/cosine_scaled_reward/std": 0.5562776923179626, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1308.875, + "completions/mean_terminated_length": 1019.6522216796875, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.26171428571428573, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2481510192155838, + "learning_rate": 6.950195628537299e-07, + "loss": -0.0, + "num_tokens": 26030280.0, + "reward": -0.0336291566491127, + "reward_std": 0.2131306231021881, + "rewards/cosine_scaled_reward/mean": -0.0336291640996933, + "rewards/cosine_scaled_reward/std": 0.4883540868759155, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1424.34375, + "completions/mean_terminated_length": 1119.7674560546875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.26285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24989557266235352, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0, + "num_tokens": 26131870.0, + "reward": -0.27840444445610046, + "reward_std": 0.18090233206748962, + "rewards/cosine_scaled_reward/mean": -0.27840444445610046, + "rewards/cosine_scaled_reward/std": 0.2319284826517105, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1420.328125, + "completions/mean_terminated_length": 1113.7906494140625, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25709542632102966, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0, + "num_tokens": 26234467.0, + "reward": -0.012329377233982086, + "reward_std": 0.3558858633041382, + "rewards/cosine_scaled_reward/mean": -0.012329380959272385, + "rewards/cosine_scaled_reward/std": 0.45383208990097046, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1477.65625, + "completions/mean_terminated_length": 1087.4210205078125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.2651428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26604732871055603, + "learning_rate": 6.860664508377001e-07, + "loss": -0.0, + "num_tokens": 26339365.0, + "reward": -0.18533703684806824, + "reward_std": 0.24220798909664154, + "rewards/cosine_scaled_reward/mean": -0.18533703684806824, + "rewards/cosine_scaled_reward/std": 0.26634126901626587, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1072.109375, + "completions/mean_terminated_length": 1024.11474609375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.2662857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26210692524909973, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0, + "num_tokens": 26418084.0, + "reward": -0.1599939614534378, + "reward_std": 0.3579375445842743, + "rewards/cosine_scaled_reward/mean": -0.1599939614534378, + "rewards/cosine_scaled_reward/std": 0.3679514527320862, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1287.546875, + "completions/mean_terminated_length": 889.2142944335938, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.2674285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30882614850997925, + "learning_rate": 6.800643086250121e-07, + "loss": -0.0, + "num_tokens": 26510503.0, + "reward": -0.1574883908033371, + "reward_std": 0.17980948090553284, + "rewards/cosine_scaled_reward/mean": -0.1574883908033371, + "rewards/cosine_scaled_reward/std": 0.35836631059646606, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1890.0, + "completions/mean_length": 1145.125, + "completions/mean_terminated_length": 936.769287109375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.26857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30261489748954773, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0, + "num_tokens": 26594351.0, + "reward": 0.1909978985786438, + "reward_std": 0.3115041255950928, + "rewards/cosine_scaled_reward/mean": 0.1909978985786438, + "rewards/cosine_scaled_reward/std": 0.5054126381874084, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1449.375, + "completions/mean_terminated_length": 1012.5405883789062, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.26971428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28838953375816345, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0, + "num_tokens": 26698399.0, + "reward": -0.11444643139839172, + "reward_std": 0.3462868928909302, + "rewards/cosine_scaled_reward/mean": -0.11444643884897232, + "rewards/cosine_scaled_reward/std": 0.4084509313106537, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1262.125, + "completions/mean_terminated_length": 1021.551025390625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.27085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3033871650695801, + "learning_rate": 6.710139192768694e-07, + "loss": -0.0, + "num_tokens": 26789303.0, + "reward": -0.05035819113254547, + "reward_std": 0.2872178554534912, + "rewards/cosine_scaled_reward/mean": -0.050358183681964874, + "rewards/cosine_scaled_reward/std": 0.5157716870307922, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1301.734375, + "completions/mean_terminated_length": 1092.780029296875, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26610657572746277, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0, + "num_tokens": 26883454.0, + "reward": 0.10226152092218399, + "reward_std": 0.3642864525318146, + "rewards/cosine_scaled_reward/mean": 0.10226152092218399, + "rewards/cosine_scaled_reward/std": 0.49199798703193665, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1746.0, + "completions/mean_length": 1012.578125, + "completions/mean_terminated_length": 797.6792602539062, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.27314285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3098434805870056, + "learning_rate": 6.649505910711058e-07, + "loss": -0.0, + "num_tokens": 26958571.0, + "reward": 0.2893483638763428, + "reward_std": 0.21750710904598236, + "rewards/cosine_scaled_reward/mean": 0.2893483638763428, + "rewards/cosine_scaled_reward/std": 0.5735083818435669, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 1127.03125, + "completions/mean_terminated_length": 845.10205078125, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.2742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33869531750679016, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0, + "num_tokens": 27040949.0, + "reward": -0.2518009841442108, + "reward_std": 0.2073291540145874, + "rewards/cosine_scaled_reward/mean": -0.2518009841442108, + "rewards/cosine_scaled_reward/std": 0.26051101088523865, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 1248.578125, + "completions/mean_terminated_length": 1044.803955078125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.2754285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2926189601421356, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0, + "num_tokens": 27132074.0, + "reward": -0.18343190848827362, + "reward_std": 0.32297152280807495, + "rewards/cosine_scaled_reward/mean": -0.18343190848827362, + "rewards/cosine_scaled_reward/std": 0.3960045278072357, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1057.171875, + "completions/mean_terminated_length": 779.739990234375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.2765714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3697403073310852, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0, + "num_tokens": 27209245.0, + "reward": -0.13560537993907928, + "reward_std": 0.2509098947048187, + "rewards/cosine_scaled_reward/mean": -0.13560537993907928, + "rewards/cosine_scaled_reward/std": 0.42233115434646606, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 1294.9375, + "completions/mean_terminated_length": 1121.1539306640625, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "epoch": 0.2777142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2539284825325012, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0, + "num_tokens": 27302953.0, + "reward": 0.006944652646780014, + "reward_std": 0.3980734050273895, + "rewards/cosine_scaled_reward/mean": 0.006944645196199417, + "rewards/cosine_scaled_reward/std": 0.4572637379169464, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 1435.71875, + "completions/mean_terminated_length": 1068.3499755859375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.27885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28219878673553467, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0, + "num_tokens": 27405479.0, + "reward": -0.04507390409708023, + "reward_std": 0.2943881154060364, + "rewards/cosine_scaled_reward/mean": -0.04507390037178993, + "rewards/cosine_scaled_reward/std": 0.482650488615036, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 1331.296875, + "completions/mean_terminated_length": 1005.5227661132812, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2733215391635895, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0, + "num_tokens": 27501746.0, + "reward": 0.04791342094540596, + "reward_std": 0.34749698638916016, + "rewards/cosine_scaled_reward/mean": 0.047913409769535065, + "rewards/cosine_scaled_reward/std": 0.5028091669082642, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1827.0, + "completions/mean_length": 1310.046875, + "completions/mean_terminated_length": 1043.127685546875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.28114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2701246440410614, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0, + "num_tokens": 27596189.0, + "reward": -0.13987088203430176, + "reward_std": 0.3327594995498657, + "rewards/cosine_scaled_reward/mean": -0.13987088203430176, + "rewards/cosine_scaled_reward/std": 0.4108533263206482, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1433.5625, + "completions/mean_terminated_length": 1088.8780517578125, + "completions/min_length": 521.0, + "completions/min_terminated_length": 521.0, + "epoch": 0.2822857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2608485221862793, + "learning_rate": 6.404850645156841e-07, + "loss": -0.0, + "num_tokens": 27698385.0, + "reward": -0.19611218571662903, + "reward_std": 0.18159456551074982, + "rewards/cosine_scaled_reward/mean": -0.19611218571662903, + "rewards/cosine_scaled_reward/std": 0.18690702319145203, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 1016.078125, + "completions/mean_terminated_length": 824.9815063476562, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.2834285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33469828963279724, + "learning_rate": 6.374054580489873e-07, + "loss": -0.0, + "num_tokens": 27774342.0, + "reward": 0.20066902041435242, + "reward_std": 0.2608226537704468, + "rewards/cosine_scaled_reward/mean": 0.20066902041435242, + "rewards/cosine_scaled_reward/std": 0.5498367547988892, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 1137.15625, + "completions/mean_terminated_length": 926.9615478515625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.2845714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.283346951007843, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0, + "num_tokens": 27858296.0, + "reward": 0.22508396208286285, + "reward_std": 0.32221734523773193, + "rewards/cosine_scaled_reward/mean": 0.22508396208286285, + "rewards/cosine_scaled_reward/std": 0.5403409600257874, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1258.484375, + "completions/mean_terminated_length": 1016.7958984375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.2857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3057456612586975, + "learning_rate": 6.31233615362752e-07, + "loss": -0.0, + "num_tokens": 27949463.0, + "reward": -0.161838099360466, + "reward_std": 0.3008255660533905, + "rewards/cosine_scaled_reward/mean": -0.1618381142616272, + "rewards/cosine_scaled_reward/std": 0.36034730076789856, + "step": 250 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 27949463, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}