diff --git "a/checkpoint-150/trainer_state.json" "b/checkpoint-150/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-150/trainer_state.json" @@ -0,0 +1,3784 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.17142857142857143, + "eval_steps": 500, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1702.03125, + "completions/mean_terminated_length": 993.6190795898438, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.001142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2837146520614624, + "learning_rate": 0.0, + "loss": -0.0, + "num_tokens": 118418.0, + "reward": -0.09800112247467041, + "reward_std": 0.3028089702129364, + "rewards/cosine_scaled_reward/mean": -0.09800112992525101, + "rewards/cosine_scaled_reward/std": 0.37953105568885803, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1894.0, + "completions/mean_length": 1738.90625, + "completions/mean_terminated_length": 949.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.002285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24220912158489227, + "learning_rate": 2e-08, + "loss": -0.0, + "num_tokens": 239748.0, + "reward": 0.020556632429361343, + "reward_std": 0.3545936942100525, + "rewards/cosine_scaled_reward/mean": 0.020556632429361343, + "rewards/cosine_scaled_reward/std": 0.4492928683757782, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 1946.515625, + "completions/mean_terminated_length": 749.0, + "completions/min_length": 609.0, + "completions/min_terminated_length": 609.0, + "epoch": 0.0034285714285714284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24765528738498688, + "learning_rate": 4e-08, + "loss": -0.0, + "num_tokens": 374797.0, + "reward": -0.20057085156440735, + "reward_std": 0.13691216707229614, + "rewards/cosine_scaled_reward/mean": -0.20057085156440735, + "rewards/cosine_scaled_reward/std": 0.16282624006271362, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1592.0, + "completions/mean_terminated_length": 967.1111450195312, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "epoch": 0.004571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28862521052360535, + "learning_rate": 6e-08, + "loss": 0.0, + "num_tokens": 486493.0, + "reward": -0.19111667573451996, + "reward_std": 0.19739457964897156, + "rewards/cosine_scaled_reward/mean": -0.19111669063568115, + "rewards/cosine_scaled_reward/std": 0.22545036673545837, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1976.578125, + "completions/mean_terminated_length": 1395.0001220703125, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "epoch": 0.005714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23521216213703156, + "learning_rate": 8e-08, + "loss": 0.0, + "num_tokens": 623810.0, + "reward": -0.2342512309551239, + "reward_std": 0.16005605459213257, + "rewards/cosine_scaled_reward/mean": -0.2342512309551239, + "rewards/cosine_scaled_reward/std": 0.20709452033042908, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1840.125, + "completions/mean_terminated_length": 939.3333740234375, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.006857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2831529676914215, + "learning_rate": 1e-07, + "loss": 0.0, + "num_tokens": 753226.0, + "reward": -0.1443408578634262, + "reward_std": 0.25838011503219604, + "rewards/cosine_scaled_reward/mean": -0.1443408727645874, + "rewards/cosine_scaled_reward/std": 0.3164331316947937, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1974.265625, + "completions/mean_terminated_length": 1458.125, + "completions/min_length": 1153.0, + "completions/min_terminated_length": 1153.0, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22311581671237946, + "learning_rate": 1.2e-07, + "loss": 0.0, + "num_tokens": 889987.0, + "reward": -0.15585696697235107, + "reward_std": 0.21075330674648285, + "rewards/cosine_scaled_reward/mean": -0.15585698187351227, + "rewards/cosine_scaled_reward/std": 0.3327982723712921, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 1701.46875, + "completions/mean_terminated_length": 815.888916015625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.009142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23218390345573425, + "learning_rate": 1.4e-07, + "loss": -0.0, + "num_tokens": 1009297.0, + "reward": -0.019736051559448242, + "reward_std": 0.22464922070503235, + "rewards/cosine_scaled_reward/mean": -0.01973605342209339, + "rewards/cosine_scaled_reward/std": 0.46309077739715576, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 1936.96875, + "completions/mean_terminated_length": 1258.4444580078125, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.010285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2455250322818756, + "learning_rate": 1.6e-07, + "loss": -0.0, + "num_tokens": 1144719.0, + "reward": -0.22108668088912964, + "reward_std": 0.20550987124443054, + "rewards/cosine_scaled_reward/mean": -0.22108666598796844, + "rewards/cosine_scaled_reward/std": 0.27375248074531555, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 1662.0625, + "completions/mean_terminated_length": 813.0, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.011428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26574036478996277, + "learning_rate": 1.8e-07, + "loss": -0.0, + "num_tokens": 1261923.0, + "reward": -0.140568345785141, + "reward_std": 0.2796468734741211, + "rewards/cosine_scaled_reward/mean": -0.140568345785141, + "rewards/cosine_scaled_reward/std": 0.35179150104522705, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 1970.859375, + "completions/mean_terminated_length": 1060.5999755859375, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.012571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24890610575675964, + "learning_rate": 2e-07, + "loss": -0.0, + "num_tokens": 1399730.0, + "reward": -0.2551690638065338, + "reward_std": 0.16209062933921814, + "rewards/cosine_scaled_reward/mean": -0.2551690638065338, + "rewards/cosine_scaled_reward/std": 0.2319207787513733, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1798.71875, + "completions/mean_terminated_length": 1322.8182373046875, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "epoch": 0.013714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2804766595363617, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "num_tokens": 1525792.0, + "reward": -0.19796784222126007, + "reward_std": 0.30078738927841187, + "rewards/cosine_scaled_reward/mean": -0.19796785712242126, + "rewards/cosine_scaled_reward/std": 0.3346545696258545, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 1816.890625, + "completions/mean_terminated_length": 1123.5625, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "epoch": 0.014857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2471778392791748, + "learning_rate": 2.4e-07, + "loss": -0.0, + "num_tokens": 1653113.0, + "reward": -0.17365078628063202, + "reward_std": 0.23729698359966278, + "rewards/cosine_scaled_reward/mean": -0.17365078628063202, + "rewards/cosine_scaled_reward/std": 0.2726025879383087, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 1815.046875, + "completions/mean_terminated_length": 1171.0, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22734108567237854, + "learning_rate": 2.6e-07, + "loss": 0.0, + "num_tokens": 1779884.0, + "reward": -0.086978480219841, + "reward_std": 0.2551291584968567, + "rewards/cosine_scaled_reward/mean": -0.0869784876704216, + "rewards/cosine_scaled_reward/std": 0.4508184790611267, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 1705.421875, + "completions/mean_terminated_length": 758.2941284179688, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.017142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25105422735214233, + "learning_rate": 2.8e-07, + "loss": -0.0, + "num_tokens": 1899951.0, + "reward": 0.025415867567062378, + "reward_std": 0.13560885190963745, + "rewards/cosine_scaled_reward/mean": 0.025415875017642975, + "rewards/cosine_scaled_reward/std": 0.4663754105567932, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.018285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23334357142448425, + "learning_rate": 3e-07, + "loss": -0.0, + "num_tokens": 2041463.0, + "reward": -0.2220873385667801, + "reward_std": 0.17581966519355774, + "rewards/cosine_scaled_reward/mean": -0.2220873236656189, + "rewards/cosine_scaled_reward/std": 0.1694367378950119, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 1524.9375, + "completions/mean_terminated_length": 893.6551513671875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.019428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33780622482299805, + "learning_rate": 3.2e-07, + "loss": -0.0, + "num_tokens": 2149579.0, + "reward": -0.026115939021110535, + "reward_std": 0.3175298571586609, + "rewards/cosine_scaled_reward/mean": -0.026115931570529938, + "rewards/cosine_scaled_reward/std": 0.4766712486743927, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1635.0, + "completions/mean_length": 1771.34375, + "completions/mean_terminated_length": 1116.105224609375, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.02057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23123449087142944, + "learning_rate": 3.4000000000000003e-07, + "loss": -0.0, + "num_tokens": 2273321.0, + "reward": -0.15853706002235413, + "reward_std": 0.27896177768707275, + "rewards/cosine_scaled_reward/mean": -0.15853706002235413, + "rewards/cosine_scaled_reward/std": 0.3426607847213745, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1811.953125, + "completions/mean_terminated_length": 1159.3529052734375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.021714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25707289576530457, + "learning_rate": 3.6e-07, + "loss": -0.0, + "num_tokens": 2400542.0, + "reward": -0.052606794983148575, + "reward_std": 0.31571486592292786, + "rewards/cosine_scaled_reward/mean": -0.052606794983148575, + "rewards/cosine_scaled_reward/std": 0.44901713728904724, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1632.953125, + "completions/mean_terminated_length": 840.5909423828125, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.022857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25764355063438416, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "num_tokens": 2516403.0, + "reward": -0.07391424477100372, + "reward_std": 0.2678168714046478, + "rewards/cosine_scaled_reward/mean": -0.07391423732042313, + "rewards/cosine_scaled_reward/std": 0.3888758718967438, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1854.0, + "completions/mean_length": 1820.125, + "completions/mean_terminated_length": 1136.5, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27439141273498535, + "learning_rate": 4e-07, + "loss": 0.0, + "num_tokens": 2643699.0, + "reward": -0.16270118951797485, + "reward_std": 0.22588439285755157, + "rewards/cosine_scaled_reward/mean": -0.16270118951797485, + "rewards/cosine_scaled_reward/std": 0.39143073558807373, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1741.0, + "completions/mean_length": 1271.359375, + "completions/mean_terminated_length": 739.9736938476562, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.025142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37971845269203186, + "learning_rate": 4.1999999999999995e-07, + "loss": -0.0, + "num_tokens": 2734082.0, + "reward": -0.00552794337272644, + "reward_std": 0.23386958241462708, + "rewards/cosine_scaled_reward/mean": -0.005527939647436142, + "rewards/cosine_scaled_reward/std": 0.4625597596168518, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 1670.296875, + "completions/mean_terminated_length": 1081.0799560546875, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.026285714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28573453426361084, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "num_tokens": 2851773.0, + "reward": -0.18269123136997223, + "reward_std": 0.2168647199869156, + "rewards/cosine_scaled_reward/mean": -0.18269124627113342, + "rewards/cosine_scaled_reward/std": 0.2703794836997986, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 1757.296875, + "completions/mean_terminated_length": 1068.7894287109375, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.027428571428571427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2553797662258148, + "learning_rate": 4.6e-07, + "loss": 0.0, + "num_tokens": 2975168.0, + "reward": -0.23130035400390625, + "reward_std": 0.35076260566711426, + "rewards/cosine_scaled_reward/mean": -0.23130035400390625, + "rewards/cosine_scaled_reward/std": 0.3866168260574341, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1584.0, + "completions/mean_length": 1744.28125, + "completions/mean_terminated_length": 833.125, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.02857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2636294960975647, + "learning_rate": 4.8e-07, + "loss": -0.0, + "num_tokens": 3097098.0, + "reward": -0.19239474833011627, + "reward_std": 0.2867633104324341, + "rewards/cosine_scaled_reward/mean": -0.19239474833011627, + "rewards/cosine_scaled_reward/std": 0.347222238779068, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1932.09375, + "completions/mean_terminated_length": 1477.3846435546875, + "completions/min_length": 895.0, + "completions/min_terminated_length": 895.0, + "epoch": 0.029714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22351376712322235, + "learning_rate": 5e-07, + "loss": -0.0, + "num_tokens": 3231384.0, + "reward": -0.006307817995548248, + "reward_std": 0.2015555500984192, + "rewards/cosine_scaled_reward/mean": -0.006307825446128845, + "rewards/cosine_scaled_reward/std": 0.4079793393611908, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1974.0, + "completions/mean_length": 1899.25, + "completions/mean_terminated_length": 1254.666748046875, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.030857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2670150697231293, + "learning_rate": 5.2e-07, + "loss": -0.0, + "num_tokens": 3363224.0, + "reward": -0.22071197628974915, + "reward_std": 0.2118011713027954, + "rewards/cosine_scaled_reward/mean": -0.22071197628974915, + "rewards/cosine_scaled_reward/std": 0.2716290354728699, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1767.609375, + "completions/mean_terminated_length": 926.4375, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25918784737586975, + "learning_rate": 5.4e-07, + "loss": -0.0, + "num_tokens": 3486687.0, + "reward": -0.10919298231601715, + "reward_std": 0.2716072201728821, + "rewards/cosine_scaled_reward/mean": -0.10919298231601715, + "rewards/cosine_scaled_reward/std": 0.44544270634651184, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 1932.203125, + "completions/mean_terminated_length": 989.2857666015625, + "completions/min_length": 603.0, + "completions/min_terminated_length": 603.0, + "epoch": 0.03314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24401192367076874, + "learning_rate": 5.6e-07, + "loss": 0.0, + "num_tokens": 3620820.0, + "reward": -0.19096782803535461, + "reward_std": 0.15806984901428223, + "rewards/cosine_scaled_reward/mean": -0.19096782803535461, + "rewards/cosine_scaled_reward/std": 0.181764155626297, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 1880.71875, + "completions/mean_terminated_length": 1334.2667236328125, + "completions/min_length": 604.0, + "completions/min_terminated_length": 604.0, + "epoch": 0.03428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22094956040382385, + "learning_rate": 5.8e-07, + "loss": -0.0, + "num_tokens": 3751722.0, + "reward": -0.21267297863960266, + "reward_std": 0.24843861162662506, + "rewards/cosine_scaled_reward/mean": -0.21267297863960266, + "rewards/cosine_scaled_reward/std": 0.29802343249320984, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 1786.234375, + "completions/mean_terminated_length": 851.357177734375, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.03542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2912121117115021, + "learning_rate": 6e-07, + "loss": -0.0, + "num_tokens": 3876537.0, + "reward": -0.2621557414531708, + "reward_std": 0.18612943589687347, + "rewards/cosine_scaled_reward/mean": -0.2621557414531708, + "rewards/cosine_scaled_reward/std": 0.22891530394554138, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 1948.765625, + "completions/mean_terminated_length": 1342.3333740234375, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.036571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2303810715675354, + "learning_rate": 6.2e-07, + "loss": 0.0, + "num_tokens": 4011610.0, + "reward": -0.1655973494052887, + "reward_std": 0.2392224669456482, + "rewards/cosine_scaled_reward/mean": -0.1655973345041275, + "rewards/cosine_scaled_reward/std": 0.3260692358016968, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1984.0, + "completions/mean_terminated_length": 1365.3333740234375, + "completions/min_length": 965.0, + "completions/min_terminated_length": 965.0, + "epoch": 0.037714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23169051110744476, + "learning_rate": 6.4e-07, + "loss": 0.0, + "num_tokens": 4149802.0, + "reward": -0.22799505293369293, + "reward_std": 0.24000275135040283, + "rewards/cosine_scaled_reward/mean": -0.22799506783485413, + "rewards/cosine_scaled_reward/std": 0.30748653411865234, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1700.859375, + "completions/mean_terminated_length": 1159.3199462890625, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.038857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2647433578968048, + "learning_rate": 6.6e-07, + "loss": 0.0, + "num_tokens": 4268209.0, + "reward": -0.07232969254255295, + "reward_std": 0.3570185899734497, + "rewards/cosine_scaled_reward/mean": -0.07232969999313354, + "rewards/cosine_scaled_reward/std": 0.4520716369152069, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1458.0, + "completions/mean_length": 1884.625, + "completions/mean_terminated_length": 741.0, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2681647539138794, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "num_tokens": 4400321.0, + "reward": -0.21119418740272522, + "reward_std": 0.2156996876001358, + "rewards/cosine_scaled_reward/mean": -0.21119415760040283, + "rewards/cosine_scaled_reward/std": 0.304564893245697, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 2032.765625, + "completions/mean_terminated_length": 1560.5, + "completions/min_length": 1119.0, + "completions/min_terminated_length": 1119.0, + "epoch": 0.04114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25201615691185, + "learning_rate": 7e-07, + "loss": -0.0, + "num_tokens": 4541530.0, + "reward": -0.2148258090019226, + "reward_std": 0.1970210075378418, + "rewards/cosine_scaled_reward/mean": -0.2148257941007614, + "rewards/cosine_scaled_reward/std": 0.21921320259571075, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2005.0, + "completions/mean_length": 1954.5, + "completions/mean_terminated_length": 1383.111083984375, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "epoch": 0.04228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29214274883270264, + "learning_rate": 7.2e-07, + "loss": 0.0, + "num_tokens": 4677642.0, + "reward": -0.23519155383110046, + "reward_std": 0.14085054397583008, + "rewards/cosine_scaled_reward/mean": -0.23519155383110046, + "rewards/cosine_scaled_reward/std": 0.17065586149692535, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1949.1875, + "completions/mean_terminated_length": 1257.5, + "completions/min_length": 1042.0, + "completions/min_terminated_length": 1042.0, + "epoch": 0.04342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2337840050458908, + "learning_rate": 7.4e-07, + "loss": -0.0, + "num_tokens": 4814102.0, + "reward": -0.16185586154460907, + "reward_std": 0.19152981042861938, + "rewards/cosine_scaled_reward/mean": -0.16185584664344788, + "rewards/cosine_scaled_reward/std": 0.3005273640155792, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 1810.515625, + "completions/mean_terminated_length": 666.2727661132812, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.044571428571428574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.246645987033844, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "num_tokens": 4940759.0, + "reward": -0.10980962216854095, + "reward_std": 0.18094567954540253, + "rewards/cosine_scaled_reward/mean": -0.10980962216854095, + "rewards/cosine_scaled_reward/std": 0.3624936640262604, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 1700.796875, + "completions/mean_terminated_length": 1037.95458984375, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.045714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26321786642074585, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0, + "num_tokens": 5059682.0, + "reward": -0.14547404646873474, + "reward_std": 0.22270715236663818, + "rewards/cosine_scaled_reward/mean": -0.14547404646873474, + "rewards/cosine_scaled_reward/std": 0.4000875651836395, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1860.328125, + "completions/mean_terminated_length": 1415.8421630859375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "epoch": 0.046857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21273446083068848, + "learning_rate": 8e-07, + "loss": -0.0, + "num_tokens": 5189895.0, + "reward": -0.24220962822437286, + "reward_std": 0.27360057830810547, + "rewards/cosine_scaled_reward/mean": -0.24220961332321167, + "rewards/cosine_scaled_reward/std": 0.33429500460624695, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1547.0, + "completions/mean_length": 1694.375, + "completions/mean_terminated_length": 539.2000122070312, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3549652099609375, + "learning_rate": 8.199999999999999e-07, + "loss": -0.0, + "num_tokens": 5308695.0, + "reward": -0.22589105367660522, + "reward_std": 0.16009008884429932, + "rewards/cosine_scaled_reward/mean": -0.22589105367660522, + "rewards/cosine_scaled_reward/std": 0.17985297739505768, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1824.75, + "completions/mean_terminated_length": 948.923095703125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "epoch": 0.04914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25625720620155334, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0, + "num_tokens": 5437095.0, + "reward": -0.10874830186367035, + "reward_std": 0.2326180636882782, + "rewards/cosine_scaled_reward/mean": -0.10874830186367035, + "rewards/cosine_scaled_reward/std": 0.3275902569293976, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1751.0, + "completions/mean_length": 1673.734375, + "completions/mean_terminated_length": 787.3157958984375, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.05028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3032245934009552, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "num_tokens": 5554910.0, + "reward": -0.1157154068350792, + "reward_std": 0.2323075234889984, + "rewards/cosine_scaled_reward/mean": -0.1157153993844986, + "rewards/cosine_scaled_reward/std": 0.4071435034275055, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 2031.03125, + "completions/mean_terminated_length": 1776.5, + "completions/min_length": 1421.0, + "completions/min_terminated_length": 1421.0, + "epoch": 0.05142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2320922464132309, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "num_tokens": 5696552.0, + "reward": -0.22731460630893707, + "reward_std": 0.19835877418518066, + "rewards/cosine_scaled_reward/mean": -0.22731460630893707, + "rewards/cosine_scaled_reward/std": 0.28479474782943726, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 1890.3125, + "completions/mean_terminated_length": 786.5, + "completions/min_length": 490.0, + "completions/min_terminated_length": 490.0, + "epoch": 0.052571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2494276612997055, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 5828700.0, + "reward": -0.23243775963783264, + "reward_std": 0.18319474160671234, + "rewards/cosine_scaled_reward/mean": -0.23243777453899384, + "rewards/cosine_scaled_reward/std": 0.20973731577396393, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1398.0, + "completions/mean_length": 1672.09375, + "completions/mean_terminated_length": 711.4444580078125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.053714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3419908881187439, + "learning_rate": 9.2e-07, + "loss": 0.0, + "num_tokens": 5946114.0, + "reward": -0.16157878935337067, + "reward_std": 0.24494563043117523, + "rewards/cosine_scaled_reward/mean": -0.16157880425453186, + "rewards/cosine_scaled_reward/std": 0.39992472529411316, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1501.0, + "completions/mean_length": 1787.171875, + "completions/mean_terminated_length": 935.1333618164062, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "epoch": 0.054857142857142854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25991642475128174, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "num_tokens": 6071037.0, + "reward": -0.1829870045185089, + "reward_std": 0.2542135417461395, + "rewards/cosine_scaled_reward/mean": -0.1829870045185089, + "rewards/cosine_scaled_reward/std": 0.30597779154777527, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1565.34375, + "completions/mean_terminated_length": 944.7857666015625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27452352643013, + "learning_rate": 9.6e-07, + "loss": 0.0, + "num_tokens": 6181283.0, + "reward": -0.22301900386810303, + "reward_std": 0.25131016969680786, + "rewards/cosine_scaled_reward/mean": -0.22301900386810303, + "rewards/cosine_scaled_reward/std": 0.2918049991130829, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 1775.515625, + "completions/mean_terminated_length": 885.4000244140625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.05714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22758428752422333, + "learning_rate": 9.8e-07, + "loss": 0.0, + "num_tokens": 6305732.0, + "reward": -0.10754476487636566, + "reward_std": 0.18711507320404053, + "rewards/cosine_scaled_reward/mean": -0.10754477977752686, + "rewards/cosine_scaled_reward/std": 0.39105597138404846, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1507.5625, + "completions/mean_terminated_length": 766.9629516601562, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.05828571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29917222261428833, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 6412424.0, + "reward": -0.08381433039903641, + "reward_std": 0.23327183723449707, + "rewards/cosine_scaled_reward/mean": -0.08381432294845581, + "rewards/cosine_scaled_reward/std": 0.40033307671546936, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 1746.015625, + "completions/mean_terminated_length": 840.0625, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.05942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2824826240539551, + "learning_rate": 9.999890338174275e-07, + "loss": -0.0, + "num_tokens": 6535521.0, + "reward": -0.2775638699531555, + "reward_std": 0.17903020977973938, + "rewards/cosine_scaled_reward/mean": -0.2775638699531555, + "rewards/cosine_scaled_reward/std": 0.38567760586738586, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 1765.71875, + "completions/mean_terminated_length": 918.875, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.060571428571428575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2795548737049103, + "learning_rate": 9.999561358041868e-07, + "loss": -0.0, + "num_tokens": 6659359.0, + "reward": -0.18778130412101746, + "reward_std": 0.24159184098243713, + "rewards/cosine_scaled_reward/mean": -0.18778130412101746, + "rewards/cosine_scaled_reward/std": 0.2979832589626312, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1877.984375, + "completions/mean_terminated_length": 1367.9375, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.061714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.236692875623703, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "num_tokens": 6790694.0, + "reward": -0.09228484332561493, + "reward_std": 0.3374499976634979, + "rewards/cosine_scaled_reward/mean": -0.09228484332561493, + "rewards/cosine_scaled_reward/std": 0.4543565809726715, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.859375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 1855.140625, + "completions/mean_terminated_length": 676.5555419921875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "epoch": 0.06285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24421174824237823, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0, + "num_tokens": 6919711.0, + "reward": -0.19803781807422638, + "reward_std": 0.1785231977701187, + "rewards/cosine_scaled_reward/mean": -0.19803781807422638, + "rewards/cosine_scaled_reward/std": 0.3721012771129608, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1840.703125, + "completions/mean_terminated_length": 1163.533447265625, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23600365221500397, + "learning_rate": 9.997258721585931e-07, + "loss": -0.0, + "num_tokens": 7048476.0, + "reward": -0.09674595296382904, + "reward_std": 0.21479913592338562, + "rewards/cosine_scaled_reward/mean": -0.09674594551324844, + "rewards/cosine_scaled_reward/std": 0.4473191201686859, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 1997.5625, + "completions/mean_terminated_length": 1510.0, + "completions/min_length": 1387.0, + "completions/min_terminated_length": 1387.0, + "epoch": 0.06514285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19085553288459778, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "num_tokens": 7187888.0, + "reward": -0.2958947420120239, + "reward_std": 0.1703263819217682, + "rewards/cosine_scaled_reward/mean": -0.2958947420120239, + "rewards/cosine_scaled_reward/std": 0.18720079958438873, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.578125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 1591.40625, + "completions/mean_terminated_length": 965.7037353515625, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.06628571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26593509316444397, + "learning_rate": 9.994627618036452e-07, + "loss": -0.0, + "num_tokens": 7299834.0, + "reward": -0.0999627411365509, + "reward_std": 0.32584434747695923, + "rewards/cosine_scaled_reward/mean": -0.0999627485871315, + "rewards/cosine_scaled_reward/std": 0.4625846743583679, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 1744.640625, + "completions/mean_terminated_length": 905.941162109375, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.06742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.250278115272522, + "learning_rate": 9.992983438818915e-07, + "loss": -0.0, + "num_tokens": 7421955.0, + "reward": -0.16149799525737762, + "reward_std": 0.21139998733997345, + "rewards/cosine_scaled_reward/mean": -0.16149798035621643, + "rewards/cosine_scaled_reward/std": 0.3698217272758484, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 1805.390625, + "completions/mean_terminated_length": 853.6154174804688, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "epoch": 0.06857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23754432797431946, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0, + "num_tokens": 7548388.0, + "reward": -0.2758587598800659, + "reward_std": 0.18496021628379822, + "rewards/cosine_scaled_reward/mean": -0.2758587598800659, + "rewards/cosine_scaled_reward/std": 0.22098895907402039, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 1863.265625, + "completions/mean_terminated_length": 865.7000122070312, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.06971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21628443896770477, + "learning_rate": 9.989038226169207e-07, + "loss": -0.0, + "num_tokens": 7679157.0, + "reward": -0.11532291769981384, + "reward_std": 0.24975456297397614, + "rewards/cosine_scaled_reward/mean": -0.11532291769981384, + "rewards/cosine_scaled_reward/std": 0.32742080092430115, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 1690.296875, + "completions/mean_terminated_length": 1007.4091186523438, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.07085714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2538006901741028, + "learning_rate": 9.98673738502114e-07, + "loss": -0.0, + "num_tokens": 7797568.0, + "reward": -0.08548027276992798, + "reward_std": 0.1828608512878418, + "rewards/cosine_scaled_reward/mean": -0.08548027276992798, + "rewards/cosine_scaled_reward/std": 0.31418856978416443, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2045.0, + "completions/mean_length": 1631.5, + "completions/mean_terminated_length": 1188.1290283203125, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27852457761764526, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "num_tokens": 7912240.0, + "reward": 0.03178010880947113, + "reward_std": 0.39872580766677856, + "rewards/cosine_scaled_reward/mean": 0.03178010135889053, + "rewards/cosine_scaled_reward/std": 0.4946252107620239, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1871.0, + "completions/mean_length": 1675.578125, + "completions/mean_terminated_length": 964.5909423828125, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.07314285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.267963171005249, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0, + "num_tokens": 8030429.0, + "reward": -0.19527338445186615, + "reward_std": 0.2819081246852875, + "rewards/cosine_scaled_reward/mean": -0.19527339935302734, + "rewards/cosine_scaled_reward/std": 0.3602358102798462, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 1671.53125, + "completions/mean_terminated_length": 709.4444580078125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.07428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25890231132507324, + "learning_rate": 9.97852329991824e-07, + "loss": -0.0, + "num_tokens": 8148111.0, + "reward": -0.17763729393482208, + "reward_std": 0.1911587119102478, + "rewards/cosine_scaled_reward/mean": -0.17763729393482208, + "rewards/cosine_scaled_reward/std": 0.4043731391429901, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1958.0, + "completions/mean_length": 1330.71875, + "completions/mean_terminated_length": 613.4375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.07542857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4998987913131714, + "learning_rate": 9.975348529157229e-07, + "loss": -0.0, + "num_tokens": 8243509.0, + "reward": -0.012211084365844727, + "reward_std": 0.25645655393600464, + "rewards/cosine_scaled_reward/mean": -0.012211091816425323, + "rewards/cosine_scaled_reward/std": 0.4760035276412964, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 1970.15625, + "completions/mean_terminated_length": 1051.5999755859375, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "epoch": 0.07657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22345179319381714, + "learning_rate": 9.971955636222684e-07, + "loss": -0.0, + "num_tokens": 8380335.0, + "reward": -0.27880045771598816, + "reward_std": 0.169667050242424, + "rewards/cosine_scaled_reward/mean": -0.27880045771598816, + "rewards/cosine_scaled_reward/std": 0.18985651433467865, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1628.0, + "completions/mean_length": 1223.421875, + "completions/mean_terminated_length": 728.6749877929688, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.07771428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3179270625114441, + "learning_rate": 9.968344786479415e-07, + "loss": -0.0, + "num_tokens": 8467890.0, + "reward": -0.09876523166894913, + "reward_std": 0.25151342153549194, + "rewards/cosine_scaled_reward/mean": -0.09876523166894913, + "rewards/cosine_scaled_reward/std": 0.4221951961517334, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 1574.234375, + "completions/mean_terminated_length": 965.107177734375, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.07885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.292468398809433, + "learning_rate": 9.964516155915151e-07, + "loss": -0.0, + "num_tokens": 8578985.0, + "reward": -0.20737677812576294, + "reward_std": 0.23497402667999268, + "rewards/cosine_scaled_reward/mean": -0.20737677812576294, + "rewards/cosine_scaled_reward/std": 0.3156755864620209, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 1893.546875, + "completions/mean_terminated_length": 635.857177734375, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2380189746618271, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0, + "num_tokens": 8711628.0, + "reward": -0.25674766302108765, + "reward_std": 0.1897822916507721, + "rewards/cosine_scaled_reward/mean": -0.25674766302108765, + "rewards/cosine_scaled_reward/std": 0.2669999301433563, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1716.5625, + "completions/mean_terminated_length": 1037.90478515625, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.08114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2737840414047241, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "num_tokens": 8832208.0, + "reward": -0.11062799394130707, + "reward_std": 0.27241969108581543, + "rewards/cosine_scaled_reward/mean": -0.11062799394130707, + "rewards/cosine_scaled_reward/std": 0.43007227778434753, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1669.015625, + "completions/mean_terminated_length": 1115.115478515625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.08228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27252519130706787, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "num_tokens": 8949377.0, + "reward": -0.2082766592502594, + "reward_std": 0.1827523410320282, + "rewards/cosine_scaled_reward/mean": -0.2082766592502594, + "rewards/cosine_scaled_reward/std": 0.18022844195365906, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2048.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2048.0, + "completions/min_terminated_length": 0.0, + "epoch": 0.08342857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20380868017673492, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0, + "num_tokens": 9091177.0, + "reward": -0.29910945892333984, + "reward_std": 0.12098947167396545, + "rewards/cosine_scaled_reward/mean": -0.29910945892333984, + "rewards/cosine_scaled_reward/std": 0.1714438796043396, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1885.90625, + "completions/mean_terminated_length": 1250.0, + "completions/min_length": 725.0, + "completions/min_terminated_length": 725.0, + "epoch": 0.08457142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.262696772813797, + "learning_rate": 9.942113192828444e-07, + "loss": -0.0, + "num_tokens": 9221803.0, + "reward": -0.15267591178417206, + "reward_std": 0.23455429077148438, + "rewards/cosine_scaled_reward/mean": -0.15267591178417206, + "rewards/cosine_scaled_reward/std": 0.41386422514915466, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1865.65625, + "completions/mean_terminated_length": 1318.625, + "completions/min_length": 966.0, + "completions/min_terminated_length": 966.0, + "epoch": 0.08571428571428572, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.19511669874191284, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0, + "num_tokens": 9352165.0, + "reward": -0.09251219034194946, + "reward_std": 0.182725191116333, + "rewards/cosine_scaled_reward/mean": -0.09251218289136887, + "rewards/cosine_scaled_reward/std": 0.47868576645851135, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 1767.421875, + "completions/mean_terminated_length": 991.7058715820312, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.08685714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2677210569381714, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "num_tokens": 9475680.0, + "reward": -0.2391628623008728, + "reward_std": 0.16363291442394257, + "rewards/cosine_scaled_reward/mean": -0.2391628623008728, + "rewards/cosine_scaled_reward/std": 0.18309317529201508, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1851.609375, + "completions/mean_terminated_length": 1150.21435546875, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21603599190711975, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "num_tokens": 9605759.0, + "reward": -0.15259909629821777, + "reward_std": 0.212618887424469, + "rewards/cosine_scaled_reward/mean": -0.15259911119937897, + "rewards/cosine_scaled_reward/std": 0.2940331995487213, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 1851.640625, + "completions/mean_terminated_length": 1081.3077392578125, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "epoch": 0.08914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20200739800930023, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0, + "num_tokens": 9734984.0, + "reward": -0.18924658000469208, + "reward_std": 0.24043609201908112, + "rewards/cosine_scaled_reward/mean": -0.18924658000469208, + "rewards/cosine_scaled_reward/std": 0.38954904675483704, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1807.0, + "completions/mean_length": 1608.40625, + "completions/mean_terminated_length": 875.75, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.09028571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31782233715057373, + "learning_rate": 9.91429819907136e-07, + "loss": -0.0, + "num_tokens": 9848018.0, + "reward": -0.1820095181465149, + "reward_std": 0.25530290603637695, + "rewards/cosine_scaled_reward/mean": -0.1820095181465149, + "rewards/cosine_scaled_reward/std": 0.31191888451576233, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1976.84375, + "completions/mean_terminated_length": 1397.4285888671875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.09142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24291428923606873, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "num_tokens": 9984928.0, + "reward": -0.253532737493515, + "reward_std": 0.19657698273658752, + "rewards/cosine_scaled_reward/mean": -0.2535327672958374, + "rewards/cosine_scaled_reward/std": 0.2723200023174286, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1586.0, + "completions/mean_length": 1705.015625, + "completions/mean_terminated_length": 828.5, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.09257142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3213472068309784, + "learning_rate": 9.901664203302124e-07, + "loss": -0.0, + "num_tokens": 10105321.0, + "reward": -0.1452670842409134, + "reward_std": 0.16492897272109985, + "rewards/cosine_scaled_reward/mean": -0.1452670842409134, + "rewards/cosine_scaled_reward/std": 0.20188800990581512, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 1707.203125, + "completions/mean_terminated_length": 1009.3809814453125, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.09371428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25050660967826843, + "learning_rate": 9.895025252503755e-07, + "loss": -0.0, + "num_tokens": 10224910.0, + "reward": -0.07721791416406631, + "reward_std": 0.26486068964004517, + "rewards/cosine_scaled_reward/mean": -0.07721789926290512, + "rewards/cosine_scaled_reward/std": 0.4591779112815857, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1707.0, + "completions/mean_length": 1616.609375, + "completions/mean_terminated_length": 897.625, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.09485714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2958391010761261, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0, + "num_tokens": 10339461.0, + "reward": -0.05388225242495537, + "reward_std": 0.23644787073135376, + "rewards/cosine_scaled_reward/mean": -0.053882256150245667, + "rewards/cosine_scaled_reward/std": 0.376263827085495, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1666.0, + "completions/mean_length": 1838.828125, + "completions/mean_terminated_length": 831.0, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23179632425308228, + "learning_rate": 9.881105062929221e-07, + "loss": -0.0, + "num_tokens": 10467842.0, + "reward": -0.15529119968414307, + "reward_std": 0.30153706669807434, + "rewards/cosine_scaled_reward/mean": -0.15529119968414307, + "rewards/cosine_scaled_reward/std": 0.4041438102722168, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1840.71875, + "completions/mean_terminated_length": 1384.7000732421875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "epoch": 0.09714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18997986614704132, + "learning_rate": 9.873824502603459e-07, + "loss": -0.0, + "num_tokens": 10595968.0, + "reward": -0.09931906312704086, + "reward_std": 0.2868148386478424, + "rewards/cosine_scaled_reward/mean": -0.09931905567646027, + "rewards/cosine_scaled_reward/std": 0.32533466815948486, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 1669.25, + "completions/mean_terminated_length": 772.2105102539062, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.09828571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2640744745731354, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "num_tokens": 10713656.0, + "reward": -0.09163744747638702, + "reward_std": 0.25668954849243164, + "rewards/cosine_scaled_reward/mean": -0.09163745492696762, + "rewards/cosine_scaled_reward/std": 0.34459924697875977, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1999.0, + "completions/mean_length": 1626.609375, + "completions/mean_terminated_length": 969.239990234375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.09942857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2926872968673706, + "learning_rate": 9.85862422507884e-07, + "loss": -0.0, + "num_tokens": 10827879.0, + "reward": -0.20403151214122772, + "reward_std": 0.28549331426620483, + "rewards/cosine_scaled_reward/mean": -0.20403149724006653, + "rewards/cosine_scaled_reward/std": 0.32589223980903625, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 1735.953125, + "completions/mean_terminated_length": 1049.4500732421875, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.10057142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26702597737312317, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0, + "num_tokens": 10949492.0, + "reward": 0.03890814632177353, + "reward_std": 0.3359295129776001, + "rewards/cosine_scaled_reward/mean": 0.03890814632177353, + "rewards/cosine_scaled_reward/std": 0.45631229877471924, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1619.453125, + "completions/mean_terminated_length": 1163.258056640625, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.10171428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21441271901130676, + "learning_rate": 9.8425742251254e-07, + "loss": -0.0, + "num_tokens": 11064137.0, + "reward": -0.0988616794347763, + "reward_std": 0.3224140405654907, + "rewards/cosine_scaled_reward/mean": -0.09886167198419571, + "rewards/cosine_scaled_reward/std": 0.41691890358924866, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1753.0, + "completions/mean_length": 1592.1875, + "completions/mean_terminated_length": 1075.60009765625, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.10285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30840882658958435, + "learning_rate": 9.83423155058946e-07, + "loss": -0.0, + "num_tokens": 11176037.0, + "reward": -0.22794684767723083, + "reward_std": 0.19634509086608887, + "rewards/cosine_scaled_reward/mean": -0.22794684767723083, + "rewards/cosine_scaled_reward/std": 0.2059042751789093, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 1857.125, + "completions/mean_terminated_length": 1369.3333740234375, + "completions/min_length": 876.0, + "completions/min_terminated_length": 876.0, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24654391407966614, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0, + "num_tokens": 11305461.0, + "reward": -0.06898833811283112, + "reward_std": 0.24478675425052643, + "rewards/cosine_scaled_reward/mean": -0.06898833811283112, + "rewards/cosine_scaled_reward/std": 0.4049251973628998, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 1659.859375, + "completions/mean_terminated_length": 865.0952758789062, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.10514285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2405616194009781, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "num_tokens": 11421684.0, + "reward": -0.21882590651512146, + "reward_std": 0.186202734708786, + "rewards/cosine_scaled_reward/mean": -0.21882590651512146, + "rewards/cosine_scaled_reward/std": 0.20097385346889496, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.984375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 2047.96875, + "completions/mean_terminated_length": 2046.0, + "completions/min_length": 2046.0, + "completions/min_terminated_length": 2046.0, + "epoch": 0.10628571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24273920059204102, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0, + "num_tokens": 11564098.0, + "reward": -0.25700533390045166, + "reward_std": 0.11929697543382645, + "rewards/cosine_scaled_reward/mean": -0.2570053040981293, + "rewards/cosine_scaled_reward/std": 0.1724296510219574, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 1864.96875, + "completions/mean_terminated_length": 983.0909423828125, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.10742857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2541305422782898, + "learning_rate": 9.798752629550546e-07, + "loss": -0.0, + "num_tokens": 11693224.0, + "reward": -0.12399546802043915, + "reward_std": 0.15344232320785522, + "rewards/cosine_scaled_reward/mean": -0.12399546802043915, + "rewards/cosine_scaled_reward/std": 0.4378487467765808, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.921875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 1977.0625, + "completions/mean_terminated_length": 1140.0, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "epoch": 0.10857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21893343329429626, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0, + "num_tokens": 11830284.0, + "reward": -0.2706957459449768, + "reward_std": 0.1604195535182953, + "rewards/cosine_scaled_reward/mean": -0.2706957459449768, + "rewards/cosine_scaled_reward/std": 0.17591074109077454, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1688.0, + "completions/mean_length": 1656.359375, + "completions/mean_terminated_length": 1003.625, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.10971428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24892951548099518, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "num_tokens": 11947427.0, + "reward": -0.05472355708479881, + "reward_std": 0.22797656059265137, + "rewards/cosine_scaled_reward/mean": -0.05472356453537941, + "rewards/cosine_scaled_reward/std": 0.4557226002216339, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.703125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1808.265625, + "completions/mean_terminated_length": 1240.4736328125, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "epoch": 0.11085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26835066080093384, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0, + "num_tokens": 12073420.0, + "reward": -0.10791188478469849, + "reward_std": 0.2891411781311035, + "rewards/cosine_scaled_reward/mean": -0.10791188478469849, + "rewards/cosine_scaled_reward/std": 0.3751998543739319, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1552.390625, + "completions/mean_terminated_length": 915.1785888671875, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27451470494270325, + "learning_rate": 9.759921670520634e-07, + "loss": -0.0, + "num_tokens": 12183837.0, + "reward": -0.1808183193206787, + "reward_std": 0.24214914441108704, + "rewards/cosine_scaled_reward/mean": -0.1808183193206787, + "rewards/cosine_scaled_reward/std": 0.3102630078792572, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 1721.15625, + "completions/mean_terminated_length": 653.4666748046875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.11314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2736988067626953, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0, + "num_tokens": 12305159.0, + "reward": -0.10280460864305496, + "reward_std": 0.21398380398750305, + "rewards/cosine_scaled_reward/mean": -0.10280461609363556, + "rewards/cosine_scaled_reward/std": 0.4072605073451996, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1780.0, + "completions/mean_length": 1564.6875, + "completions/mean_terminated_length": 981.3793334960938, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.11428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26440170407295227, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "num_tokens": 12415011.0, + "reward": 0.07170121371746063, + "reward_std": 0.38168632984161377, + "rewards/cosine_scaled_reward/mean": 0.07170121371746063, + "rewards/cosine_scaled_reward/std": 0.519091784954071, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1679.9375, + "completions/mean_terminated_length": 1105.760009765625, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "epoch": 0.11542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28899702429771423, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0, + "num_tokens": 12533959.0, + "reward": -0.06987505406141281, + "reward_std": 0.23702794313430786, + "rewards/cosine_scaled_reward/mean": -0.06987505406141281, + "rewards/cosine_scaled_reward/std": 0.4194885790348053, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1382.515625, + "completions/mean_terminated_length": 927.1842041015625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.11657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31149351596832275, + "learning_rate": 9.717768952713511e-07, + "loss": -0.0, + "num_tokens": 12632592.0, + "reward": -0.1570146381855011, + "reward_std": 0.2435436099767685, + "rewards/cosine_scaled_reward/mean": -0.1570146381855011, + "rewards/cosine_scaled_reward/std": 0.41899070143699646, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1648.671875, + "completions/mean_terminated_length": 886.3182373046875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.11771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26930251717567444, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0, + "num_tokens": 12748459.0, + "reward": -0.2438274323940277, + "reward_std": 0.23225237429141998, + "rewards/cosine_scaled_reward/mean": -0.2438274323940277, + "rewards/cosine_scaled_reward/std": 0.32278329133987427, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 1648.6875, + "completions/mean_terminated_length": 831.047607421875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.11885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31051400303840637, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0, + "num_tokens": 12864439.0, + "reward": -0.11535478383302689, + "reward_std": 0.2225915789604187, + "rewards/cosine_scaled_reward/mean": -0.11535478383302689, + "rewards/cosine_scaled_reward/std": 0.31164902448654175, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1398.0, + "completions/mean_length": 1480.796875, + "completions/mean_terminated_length": 837.9667358398438, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2630039155483246, + "learning_rate": 9.683994186497132e-07, + "loss": 0.0, + "num_tokens": 12970498.0, + "reward": -0.1717175543308258, + "reward_std": 0.2714414894580841, + "rewards/cosine_scaled_reward/mean": -0.1717175394296646, + "rewards/cosine_scaled_reward/std": 0.3898351192474365, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 1442.15625, + "completions/mean_terminated_length": 797.2257690429688, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.12114285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27676305174827576, + "learning_rate": 9.672327345550543e-07, + "loss": -0.0, + "num_tokens": 13073628.0, + "reward": 0.06792312860488892, + "reward_std": 0.3339839577674866, + "rewards/cosine_scaled_reward/mean": 0.06792312115430832, + "rewards/cosine_scaled_reward/std": 0.4862962067127228, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 1835.5, + "completions/mean_terminated_length": 1248.0, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "epoch": 0.12228571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2582181394100189, + "learning_rate": 9.66045715125541e-07, + "loss": -0.0, + "num_tokens": 13202252.0, + "reward": -0.21117815375328064, + "reward_std": 0.26033473014831543, + "rewards/cosine_scaled_reward/mean": -0.21117815375328064, + "rewards/cosine_scaled_reward/std": 0.318643718957901, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 1598.953125, + "completions/mean_terminated_length": 1021.607177734375, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.12342857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2756604850292206, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0, + "num_tokens": 13314945.0, + "reward": -0.0939117893576622, + "reward_std": 0.3252195715904236, + "rewards/cosine_scaled_reward/mean": -0.0939117819070816, + "rewards/cosine_scaled_reward/std": 0.40993908047676086, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 1812.328125, + "completions/mean_terminated_length": 970.6428833007812, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "epoch": 0.12457142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24118952453136444, + "learning_rate": 9.636109026648554e-07, + "loss": -0.0, + "num_tokens": 13442182.0, + "reward": -0.12436474859714508, + "reward_std": 0.17601566016674042, + "rewards/cosine_scaled_reward/mean": -0.12436474859714508, + "rewards/cosine_scaled_reward/std": 0.3541682958602905, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 1614.671875, + "completions/mean_terminated_length": 842.2174072265625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.12571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2967440187931061, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "num_tokens": 13556297.0, + "reward": -0.28026559948921204, + "reward_std": 0.14505533874034882, + "rewards/cosine_scaled_reward/mean": -0.2802656292915344, + "rewards/cosine_scaled_reward/std": 0.1739458441734314, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 1675.53125, + "completions/mean_terminated_length": 1094.47998046875, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.12685714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25916096568107605, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "num_tokens": 13673651.0, + "reward": -0.10561071336269379, + "reward_std": 0.2843046188354492, + "rewards/cosine_scaled_reward/mean": -0.10561071336269379, + "rewards/cosine_scaled_reward/std": 0.42046698927879333, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.828125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2046.0, + "completions/mean_length": 1945.984375, + "completions/mean_terminated_length": 1454.45458984375, + "completions/min_length": 999.0, + "completions/min_terminated_length": 999.0, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22010542452335358, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0, + "num_tokens": 13809290.0, + "reward": -0.16558930277824402, + "reward_std": 0.2861853837966919, + "rewards/cosine_scaled_reward/mean": -0.16558930277824402, + "rewards/cosine_scaled_reward/std": 0.3597464859485626, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1493.0, + "completions/mean_length": 1482.453125, + "completions/mean_terminated_length": 880.4193115234375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.12914285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2894801199436188, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0, + "num_tokens": 13914463.0, + "reward": -0.09200191497802734, + "reward_std": 0.24287937581539154, + "rewards/cosine_scaled_reward/mean": -0.09200191497802734, + "rewards/cosine_scaled_reward/std": 0.4290314316749573, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 1574.90625, + "completions/mean_terminated_length": 731.5652465820312, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.13028571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2619520425796509, + "learning_rate": 9.571721736097088e-07, + "loss": -0.0, + "num_tokens": 14025105.0, + "reward": -0.258175253868103, + "reward_std": 0.21170002222061157, + "rewards/cosine_scaled_reward/mean": -0.258175253868103, + "rewards/cosine_scaled_reward/std": 0.236412912607193, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1683.375, + "completions/mean_terminated_length": 936.7619018554688, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.13142857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27216964960098267, + "learning_rate": 9.55824636882301e-07, + "loss": -0.0, + "num_tokens": 14144057.0, + "reward": -0.13246098160743713, + "reward_std": 0.21515703201293945, + "rewards/cosine_scaled_reward/mean": -0.13246098160743713, + "rewards/cosine_scaled_reward/std": 0.3399508595466614, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 1858.484375, + "completions/mean_terminated_length": 531.875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.13257142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26274579763412476, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0, + "num_tokens": 14274384.0, + "reward": -0.1656629592180252, + "reward_std": 0.18953147530555725, + "rewards/cosine_scaled_reward/mean": -0.1656629592180252, + "rewards/cosine_scaled_reward/std": 0.22731326520442963, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1725.875, + "completions/mean_terminated_length": 1066.2857666015625, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.1337142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2458299994468689, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0, + "num_tokens": 14395864.0, + "reward": -0.10864575207233429, + "reward_std": 0.22824041545391083, + "rewards/cosine_scaled_reward/mean": -0.10864575207233429, + "rewards/cosine_scaled_reward/std": 0.29944685101509094, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.765625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1818.578125, + "completions/mean_terminated_length": 1069.1334228515625, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.13485714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2319493144750595, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0, + "num_tokens": 14522789.0, + "reward": -0.09332149475812912, + "reward_std": 0.26317405700683594, + "rewards/cosine_scaled_reward/mean": -0.09332150220870972, + "rewards/cosine_scaled_reward/std": 0.3715793788433075, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 1507.578125, + "completions/mean_terminated_length": 1087.25, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3142438530921936, + "learning_rate": 9.502373679810839e-07, + "loss": -0.0, + "num_tokens": 14629682.0, + "reward": -0.038483649492263794, + "reward_std": 0.2474227398633957, + "rewards/cosine_scaled_reward/mean": -0.038483649492263794, + "rewards/cosine_scaled_reward/std": 0.46291273832321167, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 1277.109375, + "completions/mean_terminated_length": 951.6222534179688, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.13714285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3041006326675415, + "learning_rate": 9.487916106540465e-07, + "loss": -0.0, + "num_tokens": 14721873.0, + "reward": -0.1477060317993164, + "reward_std": 0.33122679591178894, + "rewards/cosine_scaled_reward/mean": -0.14770600199699402, + "rewards/cosine_scaled_reward/std": 0.46506062150001526, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1818.0, + "completions/mean_length": 1170.421875, + "completions/mean_terminated_length": 901.7755126953125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.1382857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3021833002567291, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0, + "num_tokens": 14806476.0, + "reward": -0.08516940474510193, + "reward_std": 0.3455994129180908, + "rewards/cosine_scaled_reward/mean": -0.08516941219568253, + "rewards/cosine_scaled_reward/std": 0.5138645172119141, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1858.0, + "completions/mean_length": 1812.140625, + "completions/mean_terminated_length": 969.7857666015625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.13942857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2724408209323883, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0, + "num_tokens": 14934013.0, + "reward": -0.21230415999889374, + "reward_std": 0.25918447971343994, + "rewards/cosine_scaled_reward/mean": -0.21230417490005493, + "rewards/cosine_scaled_reward/std": 0.2874549329280853, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 1607.03125, + "completions/mean_terminated_length": 1137.6129150390625, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "epoch": 0.14057142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23946814239025116, + "learning_rate": 9.443380060197385e-07, + "loss": -0.0, + "num_tokens": 15047095.0, + "reward": -0.11815785616636276, + "reward_std": 0.2174030840396881, + "rewards/cosine_scaled_reward/mean": -0.11815785616636276, + "rewards/cosine_scaled_reward/std": 0.4328930079936981, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 1482.09375, + "completions/mean_terminated_length": 916.1875, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.1417142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2677968740463257, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0, + "num_tokens": 15152901.0, + "reward": -0.0870831310749054, + "reward_std": 0.30780428647994995, + "rewards/cosine_scaled_reward/mean": -0.0870831310749054, + "rewards/cosine_scaled_reward/std": 0.46330681443214417, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 1620.140625, + "completions/mean_terminated_length": 803.3181762695312, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24397389590740204, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0, + "num_tokens": 15267518.0, + "reward": -0.015626579523086548, + "reward_std": 0.2010820060968399, + "rewards/cosine_scaled_reward/mean": -0.01562657207250595, + "rewards/cosine_scaled_reward/std": 0.4903516471385956, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1776.828125, + "completions/mean_terminated_length": 1083.8333740234375, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2397489696741104, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0, + "num_tokens": 15392531.0, + "reward": -0.20714247226715088, + "reward_std": 0.2310880422592163, + "rewards/cosine_scaled_reward/mean": -0.20714247226715088, + "rewards/cosine_scaled_reward/std": 0.277647465467453, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.734375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1941.0, + "completions/mean_length": 1820.09375, + "completions/mean_terminated_length": 1190.0, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "epoch": 0.14514285714285713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21327179670333862, + "learning_rate": 9.381311511432658e-07, + "loss": -0.0, + "num_tokens": 15520113.0, + "reward": -0.21989238262176514, + "reward_std": 0.21288105845451355, + "rewards/cosine_scaled_reward/mean": -0.21989238262176514, + "rewards/cosine_scaled_reward/std": 0.25816869735717773, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.671875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1741.203125, + "completions/mean_terminated_length": 1113.0, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.1462857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24533982574939728, + "learning_rate": 9.36531953618799e-07, + "loss": -0.0, + "num_tokens": 15641902.0, + "reward": 0.13875506818294525, + "reward_std": 0.2863699495792389, + "rewards/cosine_scaled_reward/mean": 0.13875506818294525, + "rewards/cosine_scaled_reward/std": 0.4384811818599701, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 1875.84375, + "completions/mean_terminated_length": 670.75, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "epoch": 0.14742857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2447715848684311, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0, + "num_tokens": 15772660.0, + "reward": -0.2536994218826294, + "reward_std": 0.15479066967964172, + "rewards/cosine_scaled_reward/mean": -0.2536994218826294, + "rewards/cosine_scaled_reward/std": 0.21421663463115692, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 1667.53125, + "completions/mean_terminated_length": 989.3043823242188, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "epoch": 0.14857142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23984794318675995, + "learning_rate": 9.332771203643714e-07, + "loss": -0.0, + "num_tokens": 15889886.0, + "reward": -0.19088414311408997, + "reward_std": 0.2502530515193939, + "rewards/cosine_scaled_reward/mean": -0.19088414311408997, + "rewards/cosine_scaled_reward/std": 0.3068367540836334, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 1667.75, + "completions/mean_terminated_length": 941.8182373046875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.14971428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.270325243473053, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0, + "num_tokens": 16006358.0, + "reward": -0.019564799964427948, + "reward_std": 0.28430548310279846, + "rewards/cosine_scaled_reward/mean": -0.019564803689718246, + "rewards/cosine_scaled_reward/std": 0.45797842741012573, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 1605.0625, + "completions/mean_terminated_length": 866.8333740234375, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.15085714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2547350525856018, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0, + "num_tokens": 16120146.0, + "reward": -0.21965548396110535, + "reward_std": 0.25751689076423645, + "rewards/cosine_scaled_reward/mean": -0.21965548396110535, + "rewards/cosine_scaled_reward/std": 0.3749488890171051, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1753.0, + "completions/mean_length": 1924.296875, + "completions/mean_terminated_length": 1058.375, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2590077221393585, + "learning_rate": 9.282549715730579e-07, + "loss": -0.0, + "num_tokens": 16254525.0, + "reward": -0.14530372619628906, + "reward_std": 0.19581159949302673, + "rewards/cosine_scaled_reward/mean": -0.14530371129512787, + "rewards/cosine_scaled_reward/std": 0.2799433171749115, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.546875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 1612.125, + "completions/mean_terminated_length": 1086.0689697265625, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "epoch": 0.15314285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27096986770629883, + "learning_rate": 9.265439410565328e-07, + "loss": -0.0, + "num_tokens": 16368269.0, + "reward": -0.1256684958934784, + "reward_std": 0.20261810719966888, + "rewards/cosine_scaled_reward/mean": -0.1256684958934784, + "rewards/cosine_scaled_reward/std": 0.4080355167388916, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2011.0, + "completions/mean_length": 1120.890625, + "completions/mean_terminated_length": 699.477294921875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.15428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30590543150901794, + "learning_rate": 9.248145583195447e-07, + "loss": -0.0, + "num_tokens": 16450478.0, + "reward": 0.115441232919693, + "reward_std": 0.23258042335510254, + "rewards/cosine_scaled_reward/mean": 0.115441232919693, + "rewards/cosine_scaled_reward/std": 0.500895619392395, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 1585.21875, + "completions/mean_terminated_length": 813.9166870117188, + "completions/min_length": 526.0, + "completions/min_terminated_length": 526.0, + "epoch": 0.15542857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2542603611946106, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0, + "num_tokens": 16562604.0, + "reward": 0.006334513425827026, + "reward_std": 0.3029508590698242, + "rewards/cosine_scaled_reward/mean": 0.006334502249956131, + "rewards/cosine_scaled_reward/std": 0.4998469352722168, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 1736.703125, + "completions/mean_terminated_length": 1051.8499755859375, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.15657142857142858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22275574505329132, + "learning_rate": 9.213010742252327e-07, + "loss": -0.0, + "num_tokens": 16684361.0, + "reward": -0.27902746200561523, + "reward_std": 0.13864701986312866, + "rewards/cosine_scaled_reward/mean": -0.27902746200561523, + "rewards/cosine_scaled_reward/std": 0.16625361144542694, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 1632.234375, + "completions/mean_terminated_length": 983.6399536132812, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.15771428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23748613893985748, + "learning_rate": 9.195171441101668e-07, + "loss": -0.0, + "num_tokens": 16800136.0, + "reward": -0.20888572931289673, + "reward_std": 0.2201838493347168, + "rewards/cosine_scaled_reward/mean": -0.20888571441173553, + "rewards/cosine_scaled_reward/std": 0.3073258101940155, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.796875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 1864.375, + "completions/mean_terminated_length": 1144.0, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.15885714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26246827840805054, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0, + "num_tokens": 16930096.0, + "reward": -0.26648059487342834, + "reward_std": 0.22530998289585114, + "rewards/cosine_scaled_reward/mean": -0.26648059487342834, + "rewards/cosine_scaled_reward/std": 0.26054832339286804, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1517.40625, + "completions/mean_terminated_length": 986.8125, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.16, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.28518393635749817, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0, + "num_tokens": 17037434.0, + "reward": -0.0924016684293747, + "reward_std": 0.18293559551239014, + "rewards/cosine_scaled_reward/mean": -0.0924016684293747, + "rewards/cosine_scaled_reward/std": 0.4700092375278473, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 1388.703125, + "completions/mean_terminated_length": 1066.720947265625, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "epoch": 0.16114285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2404537796974182, + "learning_rate": 9.140576474687263e-07, + "loss": -0.0, + "num_tokens": 17136871.0, + "reward": -0.0574793741106987, + "reward_std": 0.3190045952796936, + "rewards/cosine_scaled_reward/mean": -0.0574793815612793, + "rewards/cosine_scaled_reward/std": 0.46699976921081543, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1873.0, + "completions/mean_length": 1644.6875, + "completions/mean_terminated_length": 1215.3548583984375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.16228571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2672227621078491, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0, + "num_tokens": 17252635.0, + "reward": -0.12337548285722733, + "reward_std": 0.288290411233902, + "rewards/cosine_scaled_reward/mean": -0.12337549030780792, + "rewards/cosine_scaled_reward/std": 0.408100426197052, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1349.125, + "completions/mean_terminated_length": 957.0731201171875, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.16342857142857142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30288439989089966, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0, + "num_tokens": 17349795.0, + "reward": -0.19018490612506866, + "reward_std": 0.2661983370780945, + "rewards/cosine_scaled_reward/mean": -0.19018490612506866, + "rewards/cosine_scaled_reward/std": 0.3374536633491516, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.609375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1644.03125, + "completions/mean_terminated_length": 1013.8399658203125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.16457142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2909323573112488, + "learning_rate": 9.084384631108882e-07, + "loss": -0.0, + "num_tokens": 17466501.0, + "reward": -0.059858791530132294, + "reward_std": 0.22690719366073608, + "rewards/cosine_scaled_reward/mean": -0.0598587840795517, + "rewards/cosine_scaled_reward/std": 0.5050134062767029, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1229.8125, + "completions/mean_terminated_length": 884.3555908203125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.1657142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30516529083251953, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0, + "num_tokens": 17555153.0, + "reward": -0.00805443525314331, + "reward_std": 0.2110176980495453, + "rewards/cosine_scaled_reward/mean": -0.00805443525314331, + "rewards/cosine_scaled_reward/std": 0.5190568566322327, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1648.0, + "completions/mean_length": 1531.0, + "completions/mean_terminated_length": 866.2857666015625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.16685714285714287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24603432416915894, + "learning_rate": 9.046048391230247e-07, + "loss": -0.0, + "num_tokens": 17663753.0, + "reward": -0.1248023509979248, + "reward_std": 0.257907509803772, + "rewards/cosine_scaled_reward/mean": -0.1248023509979248, + "rewards/cosine_scaled_reward/std": 0.3190684914588928, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1583.921875, + "completions/mean_terminated_length": 1119.84375, + "completions/min_length": 595.0, + "completions/min_terminated_length": 595.0, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.278160959482193, + "learning_rate": 9.026620557966279e-07, + "loss": -0.0, + "num_tokens": 17775908.0, + "reward": -0.18137255311012268, + "reward_std": 0.2745535969734192, + "rewards/cosine_scaled_reward/mean": -0.18137255311012268, + "rewards/cosine_scaled_reward/std": 0.3545372188091278, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1316.109375, + "completions/mean_terminated_length": 983.4318237304688, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.16914285714285715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2669003903865814, + "learning_rate": 9.007020842191634e-07, + "loss": -0.0, + "num_tokens": 17871323.0, + "reward": -0.12499135732650757, + "reward_std": 0.19944381713867188, + "rewards/cosine_scaled_reward/mean": -0.12499135732650757, + "rewards/cosine_scaled_reward/std": 0.41628143191337585, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1674.546875, + "completions/mean_terminated_length": 1008.8261108398438, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "epoch": 0.1702857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24959711730480194, + "learning_rate": 8.987250199168808e-07, + "loss": -0.0, + "num_tokens": 17990390.0, + "reward": -0.24294674396514893, + "reward_std": 0.2527904510498047, + "rewards/cosine_scaled_reward/mean": -0.24294671416282654, + "rewards/cosine_scaled_reward/std": 0.35040438175201416, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1363.671875, + "completions/mean_terminated_length": 1005.2142944335938, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.17142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29853612184524536, + "learning_rate": 8.967309592491052e-07, + "loss": -0.0, + "num_tokens": 18088169.0, + "reward": -0.13983747363090515, + "reward_std": 0.37944915890693665, + "rewards/cosine_scaled_reward/mean": -0.13983745872974396, + "rewards/cosine_scaled_reward/std": 0.4024735391139984, + "step": 150 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 18088169, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}