diff --git "a/checkpoint-150/trainer_state.json" "b/checkpoint-150/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-150/trainer_state.json"
@@ -0,0 +1,3784 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.17142857142857143,
+  "eval_steps": 500,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2837146520614624,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": -0.09800112247467041,
+      "reward_std": 0.3028089702129364,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24220912158489227,
+      "learning_rate": 2e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.020556632429361343,
+      "reward_std": 0.3545936942100525,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 1946.515625,
+      "completions/mean_terminated_length": 749.0,
+      "completions/min_length": 609.0,
+      "completions/min_terminated_length": 609.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24765528738498688,
+      "learning_rate": 4e-08,
+      "loss": -0.0,
+      "num_tokens": 374797.0,
+      "reward": -0.20057085156440735,
+      "reward_std": 0.13691216707229614,
+      "rewards/cosine_scaled_reward/mean": -0.20057085156440735,
+      "rewards/cosine_scaled_reward/std": 0.16282624006271362,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1983.0,
+      "completions/mean_length": 1592.0,
+      "completions/mean_terminated_length": 967.1111450195312,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28862521052360535,
+      "learning_rate": 6e-08,
+      "loss": 0.0,
+      "num_tokens": 486493.0,
+      "reward": -0.19111667573451996,
+      "reward_std": 0.19739457964897156,
+      "rewards/cosine_scaled_reward/mean": -0.19111669063568115,
+      "rewards/cosine_scaled_reward/std": 0.22545036673545837,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1939.0,
+      "completions/mean_length": 1976.578125,
+      "completions/mean_terminated_length": 1395.0001220703125,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23521216213703156,
+      "learning_rate": 8e-08,
+      "loss": 0.0,
+      "num_tokens": 623810.0,
+      "reward": -0.2342512309551239,
+      "reward_std": 0.16005605459213257,
+      "rewards/cosine_scaled_reward/mean": -0.2342512309551239,
+      "rewards/cosine_scaled_reward/std": 0.20709452033042908,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1987.0,
+      "completions/mean_length": 1840.125,
+      "completions/mean_terminated_length": 939.3333740234375,
+      "completions/min_length": 552.0,
+      "completions/min_terminated_length": 552.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2831529676914215,
+      "learning_rate": 1e-07,
+      "loss": 0.0,
+      "num_tokens": 753226.0,
+      "reward": -0.1443408578634262,
+      "reward_std": 0.25838011503219604,
+      "rewards/cosine_scaled_reward/mean": -0.1443408727645874,
+      "rewards/cosine_scaled_reward/std": 0.3164331316947937,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2002.0,
+      "completions/mean_length": 1974.265625,
+      "completions/mean_terminated_length": 1458.125,
+      "completions/min_length": 1153.0,
+      "completions/min_terminated_length": 1153.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22311581671237946,
+      "learning_rate": 1.2e-07,
+      "loss": 0.0,
+      "num_tokens": 889987.0,
+      "reward": -0.15585696697235107,
+      "reward_std": 0.21075330674648285,
+      "rewards/cosine_scaled_reward/mean": -0.15585698187351227,
+      "rewards/cosine_scaled_reward/std": 0.3327982723712921,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1411.0,
+      "completions/mean_length": 1701.46875,
+      "completions/mean_terminated_length": 815.888916015625,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23218390345573425,
+      "learning_rate": 1.4e-07,
+      "loss": -0.0,
+      "num_tokens": 1009297.0,
+      "reward": -0.019736051559448242,
+      "reward_std": 0.22464922070503235,
+      "rewards/cosine_scaled_reward/mean": -0.01973605342209339,
+      "rewards/cosine_scaled_reward/std": 0.46309077739715576,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1836.0,
+      "completions/mean_length": 1936.96875,
+      "completions/mean_terminated_length": 1258.4444580078125,
+      "completions/min_length": 839.0,
+      "completions/min_terminated_length": 839.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2455250322818756,
+      "learning_rate": 1.6e-07,
+      "loss": -0.0,
+      "num_tokens": 1144719.0,
+      "reward": -0.22108668088912964,
+      "reward_std": 0.20550987124443054,
+      "rewards/cosine_scaled_reward/mean": -0.22108666598796844,
+      "rewards/cosine_scaled_reward/std": 0.27375248074531555,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1662.0625,
+      "completions/mean_terminated_length": 813.0,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26574036478996277,
+      "learning_rate": 1.8e-07,
+      "loss": -0.0,
+      "num_tokens": 1261923.0,
+      "reward": -0.140568345785141,
+      "reward_std": 0.2796468734741211,
+      "rewards/cosine_scaled_reward/mean": -0.140568345785141,
+      "rewards/cosine_scaled_reward/std": 0.35179150104522705,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1335.0,
+      "completions/mean_length": 1970.859375,
+      "completions/mean_terminated_length": 1060.5999755859375,
+      "completions/min_length": 906.0,
+      "completions/min_terminated_length": 906.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24890610575675964,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 1399730.0,
+      "reward": -0.2551690638065338,
+      "reward_std": 0.16209062933921814,
+      "rewards/cosine_scaled_reward/mean": -0.2551690638065338,
+      "rewards/cosine_scaled_reward/std": 0.2319207787513733,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1798.71875,
+      "completions/mean_terminated_length": 1322.8182373046875,
+      "completions/min_length": 724.0,
+      "completions/min_terminated_length": 724.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2804766595363617,
+      "learning_rate": 2.1999999999999998e-07,
+      "loss": 0.0,
+      "num_tokens": 1525792.0,
+      "reward": -0.19796784222126007,
+      "reward_std": 0.30078738927841187,
+      "rewards/cosine_scaled_reward/mean": -0.19796785712242126,
+      "rewards/cosine_scaled_reward/std": 0.3346545696258545,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1800.0,
+      "completions/mean_length": 1816.890625,
+      "completions/mean_terminated_length": 1123.5625,
+      "completions/min_length": 583.0,
+      "completions/min_terminated_length": 583.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2471778392791748,
+      "learning_rate": 2.4e-07,
+      "loss": -0.0,
+      "num_tokens": 1653113.0,
+      "reward": -0.17365078628063202,
+      "reward_std": 0.23729698359966278,
+      "rewards/cosine_scaled_reward/mean": -0.17365078628063202,
+      "rewards/cosine_scaled_reward/std": 0.2726025879383087,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1529.0,
+      "completions/mean_length": 1815.046875,
+      "completions/mean_terminated_length": 1171.0,
+      "completions/min_length": 639.0,
+      "completions/min_terminated_length": 639.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22734108567237854,
+      "learning_rate": 2.6e-07,
+      "loss": 0.0,
+      "num_tokens": 1779884.0,
+      "reward": -0.086978480219841,
+      "reward_std": 0.2551291584968567,
+      "rewards/cosine_scaled_reward/mean": -0.0869784876704216,
+      "rewards/cosine_scaled_reward/std": 0.4508184790611267,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1354.0,
+      "completions/mean_length": 1705.421875,
+      "completions/mean_terminated_length": 758.2941284179688,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25105422735214233,
+      "learning_rate": 2.8e-07,
+      "loss": -0.0,
+      "num_tokens": 1899951.0,
+      "reward": 0.025415867567062378,
+      "reward_std": 0.13560885190963745,
+      "rewards/cosine_scaled_reward/mean": 0.025415875017642975,
+      "rewards/cosine_scaled_reward/std": 0.4663754105567932,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23334357142448425,
+      "learning_rate": 3e-07,
+      "loss": -0.0,
+      "num_tokens": 2041463.0,
+      "reward": -0.2220873385667801,
+      "reward_std": 0.17581966519355774,
+      "rewards/cosine_scaled_reward/mean": -0.2220873236656189,
+      "rewards/cosine_scaled_reward/std": 0.1694367378950119,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1898.0,
+      "completions/mean_length": 1524.9375,
+      "completions/mean_terminated_length": 893.6551513671875,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33780622482299805,
+      "learning_rate": 3.2e-07,
+      "loss": -0.0,
+      "num_tokens": 2149579.0,
+      "reward": -0.026115939021110535,
+      "reward_std": 0.3175298571586609,
+      "rewards/cosine_scaled_reward/mean": -0.026115931570529938,
+      "rewards/cosine_scaled_reward/std": 0.4766712486743927,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1635.0,
+      "completions/mean_length": 1771.34375,
+      "completions/mean_terminated_length": 1116.105224609375,
+      "completions/min_length": 538.0,
+      "completions/min_terminated_length": 538.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23123449087142944,
+      "learning_rate": 3.4000000000000003e-07,
+      "loss": -0.0,
+      "num_tokens": 2273321.0,
+      "reward": -0.15853706002235413,
+      "reward_std": 0.27896177768707275,
+      "rewards/cosine_scaled_reward/mean": -0.15853706002235413,
+      "rewards/cosine_scaled_reward/std": 0.3426607847213745,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2013.0,
+      "completions/mean_length": 1811.953125,
+      "completions/mean_terminated_length": 1159.3529052734375,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25707289576530457,
+      "learning_rate": 3.6e-07,
+      "loss": -0.0,
+      "num_tokens": 2400542.0,
+      "reward": -0.052606794983148575,
+      "reward_std": 0.31571486592292786,
+      "rewards/cosine_scaled_reward/mean": -0.052606794983148575,
+      "rewards/cosine_scaled_reward/std": 0.44901713728904724,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1632.953125,
+      "completions/mean_terminated_length": 840.5909423828125,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25764355063438416,
+      "learning_rate": 3.7999999999999996e-07,
+      "loss": 0.0,
+      "num_tokens": 2516403.0,
+      "reward": -0.07391424477100372,
+      "reward_std": 0.2678168714046478,
+      "rewards/cosine_scaled_reward/mean": -0.07391423732042313,
+      "rewards/cosine_scaled_reward/std": 0.3888758718967438,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1854.0,
+      "completions/mean_length": 1820.125,
+      "completions/mean_terminated_length": 1136.5,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27439141273498535,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 2643699.0,
+      "reward": -0.16270118951797485,
+      "reward_std": 0.22588439285755157,
+      "rewards/cosine_scaled_reward/mean": -0.16270118951797485,
+      "rewards/cosine_scaled_reward/std": 0.39143073558807373,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1741.0,
+      "completions/mean_length": 1271.359375,
+      "completions/mean_terminated_length": 739.9736938476562,
+      "completions/min_length": 282.0,
+      "completions/min_terminated_length": 282.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.37971845269203186,
+      "learning_rate": 4.1999999999999995e-07,
+      "loss": -0.0,
+      "num_tokens": 2734082.0,
+      "reward": -0.00552794337272644,
+      "reward_std": 0.23386958241462708,
+      "rewards/cosine_scaled_reward/mean": -0.005527939647436142,
+      "rewards/cosine_scaled_reward/std": 0.4625597596168518,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1935.0,
+      "completions/mean_length": 1670.296875,
+      "completions/mean_terminated_length": 1081.0799560546875,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28573453426361084,
+      "learning_rate": 4.3999999999999997e-07,
+      "loss": 0.0,
+      "num_tokens": 2851773.0,
+      "reward": -0.18269123136997223,
+      "reward_std": 0.2168647199869156,
+      "rewards/cosine_scaled_reward/mean": -0.18269124627113342,
+      "rewards/cosine_scaled_reward/std": 0.2703794836997986,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1802.0,
+      "completions/mean_length": 1757.296875,
+      "completions/mean_terminated_length": 1068.7894287109375,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2553797662258148,
+      "learning_rate": 4.6e-07,
+      "loss": 0.0,
+      "num_tokens": 2975168.0,
+      "reward": -0.23130035400390625,
+      "reward_std": 0.35076260566711426,
+      "rewards/cosine_scaled_reward/mean": -0.23130035400390625,
+      "rewards/cosine_scaled_reward/std": 0.3866168260574341,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1584.0,
+      "completions/mean_length": 1744.28125,
+      "completions/mean_terminated_length": 833.125,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2636294960975647,
+      "learning_rate": 4.8e-07,
+      "loss": -0.0,
+      "num_tokens": 3097098.0,
+      "reward": -0.19239474833011627,
+      "reward_std": 0.2867633104324341,
+      "rewards/cosine_scaled_reward/mean": -0.19239474833011627,
+      "rewards/cosine_scaled_reward/std": 0.347222238779068,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2033.0,
+      "completions/mean_length": 1932.09375,
+      "completions/mean_terminated_length": 1477.3846435546875,
+      "completions/min_length": 895.0,
+      "completions/min_terminated_length": 895.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22351376712322235,
+      "learning_rate": 5e-07,
+      "loss": -0.0,
+      "num_tokens": 3231384.0,
+      "reward": -0.006307817995548248,
+      "reward_std": 0.2015555500984192,
+      "rewards/cosine_scaled_reward/mean": -0.006307825446128845,
+      "rewards/cosine_scaled_reward/std": 0.4079793393611908,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1974.0,
+      "completions/mean_length": 1899.25,
+      "completions/mean_terminated_length": 1254.666748046875,
+      "completions/min_length": 545.0,
+      "completions/min_terminated_length": 545.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2670150697231293,
+      "learning_rate": 5.2e-07,
+      "loss": -0.0,
+      "num_tokens": 3363224.0,
+      "reward": -0.22071197628974915,
+      "reward_std": 0.2118011713027954,
+      "rewards/cosine_scaled_reward/mean": -0.22071197628974915,
+      "rewards/cosine_scaled_reward/std": 0.2716290354728699,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1767.609375,
+      "completions/mean_terminated_length": 926.4375,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25918784737586975,
+      "learning_rate": 5.4e-07,
+      "loss": -0.0,
+      "num_tokens": 3486687.0,
+      "reward": -0.10919298231601715,
+      "reward_std": 0.2716072201728821,
+      "rewards/cosine_scaled_reward/mean": -0.10919298231601715,
+      "rewards/cosine_scaled_reward/std": 0.44544270634651184,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1951.0,
+      "completions/mean_length": 1932.203125,
+      "completions/mean_terminated_length": 989.2857666015625,
+      "completions/min_length": 603.0,
+      "completions/min_terminated_length": 603.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24401192367076874,
+      "learning_rate": 5.6e-07,
+      "loss": 0.0,
+      "num_tokens": 3620820.0,
+      "reward": -0.19096782803535461,
+      "reward_std": 0.15806984901428223,
+      "rewards/cosine_scaled_reward/mean": -0.19096782803535461,
+      "rewards/cosine_scaled_reward/std": 0.181764155626297,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1928.0,
+      "completions/mean_length": 1880.71875,
+      "completions/mean_terminated_length": 1334.2667236328125,
+      "completions/min_length": 604.0,
+      "completions/min_terminated_length": 604.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22094956040382385,
+      "learning_rate": 5.8e-07,
+      "loss": -0.0,
+      "num_tokens": 3751722.0,
+      "reward": -0.21267297863960266,
+      "reward_std": 0.24843861162662506,
+      "rewards/cosine_scaled_reward/mean": -0.21267297863960266,
+      "rewards/cosine_scaled_reward/std": 0.29802343249320984,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1348.0,
+      "completions/mean_length": 1786.234375,
+      "completions/mean_terminated_length": 851.357177734375,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2912121117115021,
+      "learning_rate": 6e-07,
+      "loss": -0.0,
+      "num_tokens": 3876537.0,
+      "reward": -0.2621557414531708,
+      "reward_std": 0.18612943589687347,
+      "rewards/cosine_scaled_reward/mean": -0.2621557414531708,
+      "rewards/cosine_scaled_reward/std": 0.22891530394554138,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1841.0,
+      "completions/mean_length": 1948.765625,
+      "completions/mean_terminated_length": 1342.3333740234375,
+      "completions/min_length": 536.0,
+      "completions/min_terminated_length": 536.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2303810715675354,
+      "learning_rate": 6.2e-07,
+      "loss": 0.0,
+      "num_tokens": 4011610.0,
+      "reward": -0.1655973494052887,
+      "reward_std": 0.2392224669456482,
+      "rewards/cosine_scaled_reward/mean": -0.1655973345041275,
+      "rewards/cosine_scaled_reward/std": 0.3260692358016968,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1972.0,
+      "completions/mean_length": 1984.0,
+      "completions/mean_terminated_length": 1365.3333740234375,
+      "completions/min_length": 965.0,
+      "completions/min_terminated_length": 965.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23169051110744476,
+      "learning_rate": 6.4e-07,
+      "loss": 0.0,
+      "num_tokens": 4149802.0,
+      "reward": -0.22799505293369293,
+      "reward_std": 0.24000275135040283,
+      "rewards/cosine_scaled_reward/mean": -0.22799506783485413,
+      "rewards/cosine_scaled_reward/std": 0.30748653411865234,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1977.0,
+      "completions/mean_length": 1700.859375,
+      "completions/mean_terminated_length": 1159.3199462890625,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2647433578968048,
+      "learning_rate": 6.6e-07,
+      "loss": 0.0,
+      "num_tokens": 4268209.0,
+      "reward": -0.07232969254255295,
+      "reward_std": 0.3570185899734497,
+      "rewards/cosine_scaled_reward/mean": -0.07232969999313354,
+      "rewards/cosine_scaled_reward/std": 0.4520716369152069,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1458.0,
+      "completions/mean_length": 1884.625,
+      "completions/mean_terminated_length": 741.0,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2681647539138794,
+      "learning_rate": 6.800000000000001e-07,
+      "loss": 0.0,
+      "num_tokens": 4400321.0,
+      "reward": -0.21119418740272522,
+      "reward_std": 0.2156996876001358,
+      "rewards/cosine_scaled_reward/mean": -0.21119415760040283,
+      "rewards/cosine_scaled_reward/std": 0.304564893245697,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.96875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2002.0,
+      "completions/mean_length": 2032.765625,
+      "completions/mean_terminated_length": 1560.5,
+      "completions/min_length": 1119.0,
+      "completions/min_terminated_length": 1119.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25201615691185,
+      "learning_rate": 7e-07,
+      "loss": -0.0,
+      "num_tokens": 4541530.0,
+      "reward": -0.2148258090019226,
+      "reward_std": 0.1970210075378418,
+      "rewards/cosine_scaled_reward/mean": -0.2148257941007614,
+      "rewards/cosine_scaled_reward/std": 0.21921320259571075,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2005.0,
+      "completions/mean_length": 1954.5,
+      "completions/mean_terminated_length": 1383.111083984375,
+      "completions/min_length": 901.0,
+      "completions/min_terminated_length": 901.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29214274883270264,
+      "learning_rate": 7.2e-07,
+      "loss": 0.0,
+      "num_tokens": 4677642.0,
+      "reward": -0.23519155383110046,
+      "reward_std": 0.14085054397583008,
+      "rewards/cosine_scaled_reward/mean": -0.23519155383110046,
+      "rewards/cosine_scaled_reward/std": 0.17065586149692535,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2016.0,
+      "completions/mean_length": 1949.1875,
+      "completions/mean_terminated_length": 1257.5,
+      "completions/min_length": 1042.0,
+      "completions/min_terminated_length": 1042.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2337840050458908,
+      "learning_rate": 7.4e-07,
+      "loss": -0.0,
+      "num_tokens": 4814102.0,
+      "reward": -0.16185586154460907,
+      "reward_std": 0.19152981042861938,
+      "rewards/cosine_scaled_reward/mean": -0.16185584664344788,
+      "rewards/cosine_scaled_reward/std": 0.3005273640155792,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1953.0,
+      "completions/mean_length": 1810.515625,
+      "completions/mean_terminated_length": 666.2727661132812,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.246645987033844,
+      "learning_rate": 7.599999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 4940759.0,
+      "reward": -0.10980962216854095,
+      "reward_std": 0.18094567954540253,
+      "rewards/cosine_scaled_reward/mean": -0.10980962216854095,
+      "rewards/cosine_scaled_reward/std": 0.3624936640262604,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1754.0,
+      "completions/mean_length": 1700.796875,
+      "completions/mean_terminated_length": 1037.95458984375,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26321786642074585,
+      "learning_rate": 7.799999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 5059682.0,
+      "reward": -0.14547404646873474,
+      "reward_std": 0.22270715236663818,
+      "rewards/cosine_scaled_reward/mean": -0.14547404646873474,
+      "rewards/cosine_scaled_reward/std": 0.4000875651836395,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1939.0,
+      "completions/mean_length": 1860.328125,
+      "completions/mean_terminated_length": 1415.8421630859375,
+      "completions/min_length": 982.0,
+      "completions/min_terminated_length": 982.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21273446083068848,
+      "learning_rate": 8e-07,
+      "loss": -0.0,
+      "num_tokens": 5189895.0,
+      "reward": -0.24220962822437286,
+      "reward_std": 0.27360057830810547,
+      "rewards/cosine_scaled_reward/mean": -0.24220961332321167,
+      "rewards/cosine_scaled_reward/std": 0.33429500460624695,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1547.0,
+      "completions/mean_length": 1694.375,
+      "completions/mean_terminated_length": 539.2000122070312,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3549652099609375,
+      "learning_rate": 8.199999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 5308695.0,
+      "reward": -0.22589105367660522,
+      "reward_std": 0.16009008884429932,
+      "rewards/cosine_scaled_reward/mean": -0.22589105367660522,
+      "rewards/cosine_scaled_reward/std": 0.17985297739505768,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2006.0,
+      "completions/mean_length": 1824.75,
+      "completions/mean_terminated_length": 948.923095703125,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25625720620155334,
+      "learning_rate": 8.399999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 5437095.0,
+      "reward": -0.10874830186367035,
+      "reward_std": 0.2326180636882782,
+      "rewards/cosine_scaled_reward/mean": -0.10874830186367035,
+      "rewards/cosine_scaled_reward/std": 0.3275902569293976,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1751.0,
+      "completions/mean_length": 1673.734375,
+      "completions/mean_terminated_length": 787.3157958984375,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3032245934009552,
+      "learning_rate": 8.599999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5554910.0,
+      "reward": -0.1157154068350792,
+      "reward_std": 0.2323075234889984,
+      "rewards/cosine_scaled_reward/mean": -0.1157153993844986,
+      "rewards/cosine_scaled_reward/std": 0.4071435034275055,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 2031.03125,
+      "completions/mean_terminated_length": 1776.5,
+      "completions/min_length": 1421.0,
+      "completions/min_terminated_length": 1421.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2320922464132309,
+      "learning_rate": 8.799999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5696552.0,
+      "reward": -0.22731460630893707,
+      "reward_std": 0.19835877418518066,
+      "rewards/cosine_scaled_reward/mean": -0.22731460630893707,
+      "rewards/cosine_scaled_reward/std": 0.28479474782943726,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 1890.3125,
+      "completions/mean_terminated_length": 786.5,
+      "completions/min_length": 490.0,
+      "completions/min_terminated_length": 490.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2494276612997055,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 5828700.0,
+      "reward": -0.23243775963783264,
+      "reward_std": 0.18319474160671234,
+      "rewards/cosine_scaled_reward/mean": -0.23243777453899384,
+      "rewards/cosine_scaled_reward/std": 0.20973731577396393,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1398.0,
+      "completions/mean_length": 1672.09375,
+      "completions/mean_terminated_length": 711.4444580078125,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3419908881187439,
+      "learning_rate": 9.2e-07,
+      "loss": 0.0,
+      "num_tokens": 5946114.0,
+      "reward": -0.16157878935337067,
+      "reward_std": 0.24494563043117523,
+      "rewards/cosine_scaled_reward/mean": -0.16157880425453186,
+      "rewards/cosine_scaled_reward/std": 0.39992472529411316,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1501.0,
+      "completions/mean_length": 1787.171875,
+      "completions/mean_terminated_length": 935.1333618164062,
+      "completions/min_length": 687.0,
+      "completions/min_terminated_length": 687.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25991642475128174,
+      "learning_rate": 9.399999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 6071037.0,
+      "reward": -0.1829870045185089,
+      "reward_std": 0.2542135417461395,
+      "rewards/cosine_scaled_reward/mean": -0.1829870045185089,
+      "rewards/cosine_scaled_reward/std": 0.30597779154777527,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1981.0,
+      "completions/mean_length": 1565.34375,
+      "completions/mean_terminated_length": 944.7857666015625,
+      "completions/min_length": 322.0,
+      "completions/min_terminated_length": 322.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27452352643013,
+      "learning_rate": 9.6e-07,
+      "loss": 0.0,
+      "num_tokens": 6181283.0,
+      "reward": -0.22301900386810303,
+      "reward_std": 0.25131016969680786,
+      "rewards/cosine_scaled_reward/mean": -0.22301900386810303,
+      "rewards/cosine_scaled_reward/std": 0.2918049991130829,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1947.0,
+      "completions/mean_length": 1775.515625,
+      "completions/mean_terminated_length": 885.4000244140625,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22758428752422333,
+      "learning_rate": 9.8e-07,
+      "loss": 0.0,
+      "num_tokens": 6305732.0,
+      "reward": -0.10754476487636566,
+      "reward_std": 0.18711507320404053,
+      "rewards/cosine_scaled_reward/mean": -0.10754477977752686,
+      "rewards/cosine_scaled_reward/std": 0.39105597138404846,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1939.0,
+      "completions/mean_length": 1507.5625,
+      "completions/mean_terminated_length": 766.9629516601562,
+      "completions/min_length": 440.0,
+      "completions/min_terminated_length": 440.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29917222261428833,
+      "learning_rate": 1e-06,
+      "loss": 0.0,
+      "num_tokens": 6412424.0,
+      "reward": -0.08381433039903641,
+      "reward_std": 0.23327183723449707,
+      "rewards/cosine_scaled_reward/mean": -0.08381432294845581,
+      "rewards/cosine_scaled_reward/std": 0.40033307671546936,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1933.0,
+      "completions/mean_length": 1746.015625,
+      "completions/mean_terminated_length": 840.0625,
+      "completions/min_length": 347.0,
+      "completions/min_terminated_length": 347.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2824826240539551,
+      "learning_rate": 9.999890338174275e-07,
+      "loss": -0.0,
+      "num_tokens": 6535521.0,
+      "reward": -0.2775638699531555,
+      "reward_std": 0.17903020977973938,
+      "rewards/cosine_scaled_reward/mean": -0.2775638699531555,
+      "rewards/cosine_scaled_reward/std": 0.38567760586738586,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1862.0,
+      "completions/mean_length": 1765.71875,
+      "completions/mean_terminated_length": 918.875,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2795548737049103,
+      "learning_rate": 9.999561358041868e-07,
+      "loss": -0.0,
+      "num_tokens": 6659359.0,
+      "reward": -0.18778130412101746,
+      "reward_std": 0.24159184098243713,
+      "rewards/cosine_scaled_reward/mean": -0.18778130412101746,
+      "rewards/cosine_scaled_reward/std": 0.2979832589626312,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2034.0,
+      "completions/mean_length": 1877.984375,
+      "completions/mean_terminated_length": 1367.9375,
+      "completions/min_length": 536.0,
+      "completions/min_terminated_length": 536.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.236692875623703,
+      "learning_rate": 9.999013075636804e-07,
+      "loss": 0.0,
+      "num_tokens": 6790694.0,
+      "reward": -0.09228484332561493,
+      "reward_std": 0.3374499976634979,
+      "rewards/cosine_scaled_reward/mean": -0.09228484332561493,
+      "rewards/cosine_scaled_reward/std": 0.4543565809726715,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 720.0,
+      "completions/mean_length": 1855.140625,
+      "completions/mean_terminated_length": 676.5555419921875,
+      "completions/min_length": 597.0,
+      "completions/min_terminated_length": 597.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24421174824237823,
+      "learning_rate": 9.998245517681593e-07,
+      "loss": 0.0,
+      "num_tokens": 6919711.0,
+      "reward": -0.19803781807422638,
+      "reward_std": 0.1785231977701187,
+      "rewards/cosine_scaled_reward/mean": -0.19803781807422638,
+      "rewards/cosine_scaled_reward/std": 0.3721012771129608,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 1840.703125,
+      "completions/mean_terminated_length": 1163.533447265625,
+      "completions/min_length": 657.0,
+      "completions/min_terminated_length": 657.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23600365221500397,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 7048476.0,
+      "reward": -0.09674595296382904,
+      "reward_std": 0.21479913592338562,
+      "rewards/cosine_scaled_reward/mean": -0.09674594551324844,
+      "rewards/cosine_scaled_reward/std": 0.4473191201686859,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1681.0,
+      "completions/mean_length": 1997.5625,
+      "completions/mean_terminated_length": 1510.0,
+      "completions/min_length": 1387.0,
+      "completions/min_terminated_length": 1387.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.19085553288459778,
+      "learning_rate": 9.996052735444862e-07,
+      "loss": 0.0,
+      "num_tokens": 7187888.0,
+      "reward": -0.2958947420120239,
+      "reward_std": 0.1703263819217682,
+      "rewards/cosine_scaled_reward/mean": -0.2958947420120239,
+      "rewards/cosine_scaled_reward/std": 0.18720079958438873,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1996.0,
+      "completions/mean_length": 1591.40625,
+      "completions/mean_terminated_length": 965.7037353515625,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26593509316444397,
+      "learning_rate": 9.994627618036452e-07,
+      "loss": -0.0,
+      "num_tokens": 7299834.0,
+      "reward": -0.0999627411365509,
+      "reward_std": 0.32584434747695923,
+      "rewards/cosine_scaled_reward/mean": -0.0999627485871315,
+      "rewards/cosine_scaled_reward/std": 0.4625846743583679,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1490.0,
+      "completions/mean_length": 1744.640625,
+      "completions/mean_terminated_length": 905.941162109375,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.250278115272522,
+      "learning_rate": 9.992983438818915e-07,
+      "loss": -0.0,
+      "num_tokens": 7421955.0,
+      "reward": -0.16149799525737762,
+      "reward_std": 0.21139998733997345,
+      "rewards/cosine_scaled_reward/mean": -0.16149798035621643,
+      "rewards/cosine_scaled_reward/std": 0.3698217272758484,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1265.0,
+      "completions/mean_length": 1805.390625,
+      "completions/mean_terminated_length": 853.6154174804688,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23754432797431946,
+      "learning_rate": 9.991120277927223e-07,
+      "loss": 0.0,
+      "num_tokens": 7548388.0,
+      "reward": -0.2758587598800659,
+      "reward_std": 0.18496021628379822,
+      "rewards/cosine_scaled_reward/mean": -0.2758587598800659,
+      "rewards/cosine_scaled_reward/std": 0.22098895907402039,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1578.0,
+      "completions/mean_length": 1863.265625,
+      "completions/mean_terminated_length": 865.7000122070312,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21628443896770477,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": -0.0,
+      "num_tokens": 7679157.0,
+      "reward": -0.11532291769981384,
+      "reward_std": 0.24975456297397614,
+      "rewards/cosine_scaled_reward/mean": -0.11532291769981384,
+      "rewards/cosine_scaled_reward/std": 0.32742080092430115,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1909.0,
+      "completions/mean_length": 1690.296875,
+      "completions/mean_terminated_length": 1007.4091186523438,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2538006901741028,
+      "learning_rate": 9.98673738502114e-07,
+      "loss": -0.0,
+      "num_tokens": 7797568.0,
+      "reward": -0.08548027276992798,
+      "reward_std": 0.1828608512878418,
+      "rewards/cosine_scaled_reward/mean": -0.08548027276992798,
+      "rewards/cosine_scaled_reward/std": 0.31418856978416443,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2045.0,
+      "completions/mean_length": 1631.5,
+      "completions/mean_terminated_length": 1188.1290283203125,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27852457761764526,
+      "learning_rate": 9.98421786662277e-07,
+      "loss": 0.0,
+      "num_tokens": 7912240.0,
+      "reward": 0.03178010880947113,
+      "reward_std": 0.39872580766677856,
+      "rewards/cosine_scaled_reward/mean": 0.03178010135889053,
+      "rewards/cosine_scaled_reward/std": 0.4946252107620239,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1871.0,
+      "completions/mean_length": 1675.578125,
+      "completions/mean_terminated_length": 964.5909423828125,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.267963171005249,
+      "learning_rate": 9.981479793771866e-07,
+      "loss": 0.0,
+      "num_tokens": 8030429.0,
+      "reward": -0.19527338445186615,
+      "reward_std": 0.2819081246852875,
+      "rewards/cosine_scaled_reward/mean": -0.19527339935302734,
+      "rewards/cosine_scaled_reward/std": 0.3602358102798462,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1593.0,
+      "completions/mean_length": 1671.53125,
+      "completions/mean_terminated_length": 709.4444580078125,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25890231132507324,
+      "learning_rate": 9.97852329991824e-07,
+      "loss": -0.0,
+      "num_tokens": 8148111.0,
+      "reward": -0.17763729393482208,
+      "reward_std": 0.1911587119102478,
+      "rewards/cosine_scaled_reward/mean": -0.17763729393482208,
+      "rewards/cosine_scaled_reward/std": 0.4043731391429901,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1958.0,
+      "completions/mean_length": 1330.71875,
+      "completions/mean_terminated_length": 613.4375,
+      "completions/min_length": 195.0,
+      "completions/min_terminated_length": 195.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4998987913131714,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": -0.0,
+      "num_tokens": 8243509.0,
+      "reward": -0.012211084365844727,
+      "reward_std": 0.25645655393600464,
+      "rewards/cosine_scaled_reward/mean": -0.012211091816425323,
+      "rewards/cosine_scaled_reward/std": 0.4760035276412964,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1362.0,
+      "completions/mean_length": 1970.15625,
+      "completions/mean_terminated_length": 1051.5999755859375,
+      "completions/min_length": 715.0,
+      "completions/min_terminated_length": 715.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22345179319381714,
+      "learning_rate": 9.971955636222684e-07,
+      "loss": -0.0,
+      "num_tokens": 8380335.0,
+      "reward": -0.27880045771598816,
+      "reward_std": 0.169667050242424,
+      "rewards/cosine_scaled_reward/mean": -0.27880045771598816,
+      "rewards/cosine_scaled_reward/std": 0.18985651433467865,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1628.0,
+      "completions/mean_length": 1223.421875,
+      "completions/mean_terminated_length": 728.6749877929688,
+      "completions/min_length": 338.0,
+      "completions/min_terminated_length": 338.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3179270625114441,
+      "learning_rate": 9.968344786479415e-07,
+      "loss": -0.0,
+      "num_tokens": 8467890.0,
+      "reward": -0.09876523166894913,
+      "reward_std": 0.25151342153549194,
+      "rewards/cosine_scaled_reward/mean": -0.09876523166894913,
+      "rewards/cosine_scaled_reward/std": 0.4221951961517334,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1582.0,
+      "completions/mean_length": 1574.234375,
+      "completions/mean_terminated_length": 965.107177734375,
+      "completions/min_length": 518.0,
+      "completions/min_terminated_length": 518.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.292468398809433,
+      "learning_rate": 9.964516155915151e-07,
+      "loss": -0.0,
+      "num_tokens": 8578985.0,
+      "reward": -0.20737677812576294,
+      "reward_std": 0.23497402667999268,
+      "rewards/cosine_scaled_reward/mean": -0.20737677812576294,
+      "rewards/cosine_scaled_reward/std": 0.3156755864620209,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 1893.546875,
+      "completions/mean_terminated_length": 635.857177734375,
+      "completions/min_length": 518.0,
+      "completions/min_terminated_length": 518.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2380189746618271,
+      "learning_rate": 9.960469931131936e-07,
+      "loss": 0.0,
+      "num_tokens": 8711628.0,
+      "reward": -0.25674766302108765,
+      "reward_std": 0.1897822916507721,
+      "rewards/cosine_scaled_reward/mean": -0.25674766302108765,
+      "rewards/cosine_scaled_reward/std": 0.2669999301433563,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1986.0,
+      "completions/mean_length": 1716.5625,
+      "completions/mean_terminated_length": 1037.90478515625,
+      "completions/min_length": 367.0,
+      "completions/min_terminated_length": 367.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2737840414047241,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": 0.0,
+      "num_tokens": 8832208.0,
+      "reward": -0.11062799394130707,
+      "reward_std": 0.27241969108581543,
+      "rewards/cosine_scaled_reward/mean": -0.11062799394130707,
+      "rewards/cosine_scaled_reward/std": 0.43007227778434753,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1669.015625,
+      "completions/mean_terminated_length": 1115.115478515625,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27252519130706787,
+      "learning_rate": 9.951725498333448e-07,
+      "loss": 0.0,
+      "num_tokens": 8949377.0,
+      "reward": -0.2082766592502594,
+      "reward_std": 0.1827523410320282,
+      "rewards/cosine_scaled_reward/mean": -0.2082766592502594,
+      "rewards/cosine_scaled_reward/std": 0.18022844195365906,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20380868017673492,
+      "learning_rate": 9.947027716509488e-07,
+      "loss": 0.0,
+      "num_tokens": 9091177.0,
+      "reward": -0.29910945892333984,
+      "reward_std": 0.12098947167396545,
+      "rewards/cosine_scaled_reward/mean": -0.29910945892333984,
+      "rewards/cosine_scaled_reward/std": 0.1714438796043396,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1885.90625,
+      "completions/mean_terminated_length": 1250.0,
+      "completions/min_length": 725.0,
+      "completions/min_terminated_length": 725.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262696772813797,
+      "learning_rate": 9.942113192828444e-07,
+      "loss": -0.0,
+      "num_tokens": 9221803.0,
+      "reward": -0.15267591178417206,
+      "reward_std": 0.23455429077148438,
+      "rewards/cosine_scaled_reward/mean": -0.15267591178417206,
+      "rewards/cosine_scaled_reward/std": 0.41386422514915466,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1987.0,
+      "completions/mean_length": 1865.65625,
+      "completions/mean_terminated_length": 1318.625,
+      "completions/min_length": 966.0,
+      "completions/min_terminated_length": 966.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.19511669874191284,
+      "learning_rate": 9.93698216681727e-07,
+      "loss": 0.0,
+      "num_tokens": 9352165.0,
+      "reward": -0.09251219034194946,
+      "reward_std": 0.182725191116333,
+      "rewards/cosine_scaled_reward/mean": -0.09251218289136887,
+      "rewards/cosine_scaled_reward/std": 0.47868576645851135,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1665.0,
+      "completions/mean_length": 1767.421875,
+      "completions/mean_terminated_length": 991.7058715820312,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2677210569381714,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 9475680.0,
+      "reward": -0.2391628623008728,
+      "reward_std": 0.16363291442394257,
+      "rewards/cosine_scaled_reward/mean": -0.2391628623008728,
+      "rewards/cosine_scaled_reward/std": 0.18309317529201508,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2002.0,
+      "completions/mean_length": 1851.609375,
+      "completions/mean_terminated_length": 1150.21435546875,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21603599190711975,
+      "learning_rate": 9.926071618660237e-07,
+      "loss": 0.0,
+      "num_tokens": 9605759.0,
+      "reward": -0.15259909629821777,
+      "reward_std": 0.212618887424469,
+      "rewards/cosine_scaled_reward/mean": -0.15259911119937897,
+      "rewards/cosine_scaled_reward/std": 0.2940331995487213,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1857.0,
+      "completions/mean_length": 1851.640625,
+      "completions/mean_terminated_length": 1081.3077392578125,
+      "completions/min_length": 670.0,
+      "completions/min_terminated_length": 670.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20200739800930023,
+      "learning_rate": 9.9202926282791e-07,
+      "loss": 0.0,
+      "num_tokens": 9734984.0,
+      "reward": -0.18924658000469208,
+      "reward_std": 0.24043609201908112,
+      "rewards/cosine_scaled_reward/mean": -0.18924658000469208,
+      "rewards/cosine_scaled_reward/std": 0.38954904675483704,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1807.0,
+      "completions/mean_length": 1608.40625,
+      "completions/mean_terminated_length": 875.75,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31782233715057373,
+      "learning_rate": 9.91429819907136e-07,
+      "loss": -0.0,
+      "num_tokens": 9848018.0,
+      "reward": -0.1820095181465149,
+      "reward_std": 0.25530290603637695,
+      "rewards/cosine_scaled_reward/mean": -0.1820095181465149,
+      "rewards/cosine_scaled_reward/std": 0.31191888451576233,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2047.0,
+      "completions/mean_length": 1976.84375,
+      "completions/mean_terminated_length": 1397.4285888671875,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24291428923606873,
+      "learning_rate": 9.908088623197048e-07,
+      "loss": 0.0,
+      "num_tokens": 9984928.0,
+      "reward": -0.253532737493515,
+      "reward_std": 0.19657698273658752,
+      "rewards/cosine_scaled_reward/mean": -0.2535327672958374,
+      "rewards/cosine_scaled_reward/std": 0.2723200023174286,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1586.0,
+      "completions/mean_length": 1705.015625,
+      "completions/mean_terminated_length": 828.5,
+      "completions/min_length": 314.0,
+      "completions/min_terminated_length": 314.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3213472068309784,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": -0.0,
+      "num_tokens": 10105321.0,
+      "reward": -0.1452670842409134,
+      "reward_std": 0.16492897272109985,
+      "rewards/cosine_scaled_reward/mean": -0.1452670842409134,
+      "rewards/cosine_scaled_reward/std": 0.20188800990581512,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1601.0,
+      "completions/mean_length": 1707.203125,
+      "completions/mean_terminated_length": 1009.3809814453125,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25050660967826843,
+      "learning_rate": 9.895025252503755e-07,
+      "loss": -0.0,
+      "num_tokens": 10224910.0,
+      "reward": -0.07721791416406631,
+      "reward_std": 0.26486068964004517,
+      "rewards/cosine_scaled_reward/mean": -0.07721789926290512,
+      "rewards/cosine_scaled_reward/std": 0.4591779112815857,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1707.0,
+      "completions/mean_length": 1616.609375,
+      "completions/mean_terminated_length": 897.625,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2958391010761261,
+      "learning_rate": 9.888172094375033e-07,
+      "loss": 0.0,
+      "num_tokens": 10339461.0,
+      "reward": -0.05388225242495537,
+      "reward_std": 0.23644787073135376,
+      "rewards/cosine_scaled_reward/mean": -0.053882256150245667,
+      "rewards/cosine_scaled_reward/std": 0.376263827085495,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1666.0,
+      "completions/mean_length": 1838.828125,
+      "completions/mean_terminated_length": 831.0,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23179632425308228,
+      "learning_rate": 9.881105062929221e-07,
+      "loss": -0.0,
+      "num_tokens": 10467842.0,
+      "reward": -0.15529119968414307,
+      "reward_std": 0.30153706669807434,
+      "rewards/cosine_scaled_reward/mean": -0.15529119968414307,
+      "rewards/cosine_scaled_reward/std": 0.4041438102722168,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1840.71875,
+      "completions/mean_terminated_length": 1384.7000732421875,
+      "completions/min_length": 751.0,
+      "completions/min_terminated_length": 751.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.18997986614704132,
+      "learning_rate": 9.873824502603459e-07,
+      "loss": -0.0,
+      "num_tokens": 10595968.0,
+      "reward": -0.09931906312704086,
+      "reward_std": 0.2868148386478424,
+      "rewards/cosine_scaled_reward/mean": -0.09931905567646027,
+      "rewards/cosine_scaled_reward/std": 0.32533466815948486,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1373.0,
+      "completions/mean_length": 1669.25,
+      "completions/mean_terminated_length": 772.2105102539062,
+      "completions/min_length": 495.0,
+      "completions/min_terminated_length": 495.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2640744745731354,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": 0.0,
+      "num_tokens": 10713656.0,
+      "reward": -0.09163744747638702,
+      "reward_std": 0.25668954849243164,
+      "rewards/cosine_scaled_reward/mean": -0.09163745492696762,
+      "rewards/cosine_scaled_reward/std": 0.34459924697875977,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1626.609375,
+      "completions/mean_terminated_length": 969.239990234375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2926872968673706,
+      "learning_rate": 9.85862422507884e-07,
+      "loss": -0.0,
+      "num_tokens": 10827879.0,
+      "reward": -0.20403151214122772,
+      "reward_std": 0.28549331426620483,
+      "rewards/cosine_scaled_reward/mean": -0.20403149724006653,
+      "rewards/cosine_scaled_reward/std": 0.32589223980903625,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1864.0,
+      "completions/mean_length": 1735.953125,
+      "completions/mean_terminated_length": 1049.4500732421875,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26702597737312317,
+      "learning_rate": 9.850705248720068e-07,
+      "loss": 0.0,
+      "num_tokens": 10949492.0,
+      "reward": 0.03890814632177353,
+      "reward_std": 0.3359295129776001,
+      "rewards/cosine_scaled_reward/mean": 0.03890814632177353,
+      "rewards/cosine_scaled_reward/std": 0.45631229877471924,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1979.0,
+      "completions/mean_length": 1619.453125,
+      "completions/mean_terminated_length": 1163.258056640625,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21441271901130676,
+      "learning_rate": 9.8425742251254e-07,
+      "loss": -0.0,
+      "num_tokens": 11064137.0,
+      "reward": -0.0988616794347763,
+      "reward_std": 0.3224140405654907,
+      "rewards/cosine_scaled_reward/mean": -0.09886167198419571,
+      "rewards/cosine_scaled_reward/std": 0.41691890358924866,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1753.0,
+      "completions/mean_length": 1592.1875,
+      "completions/mean_terminated_length": 1075.60009765625,
+      "completions/min_length": 459.0,
+      "completions/min_terminated_length": 459.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30840882658958435,
+      "learning_rate": 9.83423155058946e-07,
+      "loss": -0.0,
+      "num_tokens": 11176037.0,
+      "reward": -0.22794684767723083,
+      "reward_std": 0.19634509086608887,
+      "rewards/cosine_scaled_reward/mean": -0.22794684767723083,
+      "rewards/cosine_scaled_reward/std": 0.2059042751789093,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1933.0,
+      "completions/mean_length": 1857.125,
+      "completions/mean_terminated_length": 1369.3333740234375,
+      "completions/min_length": 876.0,
+      "completions/min_terminated_length": 876.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24654391407966614,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": 0.0,
+      "num_tokens": 11305461.0,
+      "reward": -0.06898833811283112,
+      "reward_std": 0.24478675425052643,
+      "rewards/cosine_scaled_reward/mean": -0.06898833811283112,
+      "rewards/cosine_scaled_reward/std": 0.4049251973628998,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1291.0,
+      "completions/mean_length": 1659.859375,
+      "completions/mean_terminated_length": 865.0952758789062,
+      "completions/min_length": 459.0,
+      "completions/min_terminated_length": 459.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2405616194009781,
+      "learning_rate": 9.816912885430258e-07,
+      "loss": 0.0,
+      "num_tokens": 11421684.0,
+      "reward": -0.21882590651512146,
+      "reward_std": 0.186202734708786,
+      "rewards/cosine_scaled_reward/mean": -0.21882590651512146,
+      "rewards/cosine_scaled_reward/std": 0.20097385346889496,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 2047.96875,
+      "completions/mean_terminated_length": 2046.0,
+      "completions/min_length": 2046.0,
+      "completions/min_terminated_length": 2046.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24273920059204102,
+      "learning_rate": 9.807937738894303e-07,
+      "loss": 0.0,
+      "num_tokens": 11564098.0,
+      "reward": -0.25700533390045166,
+      "reward_std": 0.11929697543382645,
+      "rewards/cosine_scaled_reward/mean": -0.2570053040981293,
+      "rewards/cosine_scaled_reward/std": 0.1724296510219574,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1979.0,
+      "completions/mean_length": 1864.96875,
+      "completions/mean_terminated_length": 983.0909423828125,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2541305422782898,
+      "learning_rate": 9.798752629550546e-07,
+      "loss": -0.0,
+      "num_tokens": 11693224.0,
+      "reward": -0.12399546802043915,
+      "reward_std": 0.15344232320785522,
+      "rewards/cosine_scaled_reward/mean": -0.12399546802043915,
+      "rewards/cosine_scaled_reward/std": 0.4378487467765808,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1800.0,
+      "completions/mean_length": 1977.0625,
+      "completions/mean_terminated_length": 1140.0,
+      "completions/min_length": 755.0,
+      "completions/min_terminated_length": 755.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21893343329429626,
+      "learning_rate": 9.78935800506826e-07,
+      "loss": 0.0,
+      "num_tokens": 11830284.0,
+      "reward": -0.2706957459449768,
+      "reward_std": 0.1604195535182953,
+      "rewards/cosine_scaled_reward/mean": -0.2706957459449768,
+      "rewards/cosine_scaled_reward/std": 0.17591074109077454,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1688.0,
+      "completions/mean_length": 1656.359375,
+      "completions/mean_terminated_length": 1003.625,
+      "completions/min_length": 541.0,
+      "completions/min_terminated_length": 541.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24892951548099518,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": 0.0,
+      "num_tokens": 11947427.0,
+      "reward": -0.05472355708479881,
+      "reward_std": 0.22797656059265137,
+      "rewards/cosine_scaled_reward/mean": -0.05472356453537941,
+      "rewards/cosine_scaled_reward/std": 0.4557226002216339,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1808.265625,
+      "completions/mean_terminated_length": 1240.4736328125,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26835066080093384,
+      "learning_rate": 9.769942052400235e-07,
+      "loss": 0.0,
+      "num_tokens": 12073420.0,
+      "reward": -0.10791188478469849,
+      "reward_std": 0.2891411781311035,
+      "rewards/cosine_scaled_reward/mean": -0.10791188478469849,
+      "rewards/cosine_scaled_reward/std": 0.3751998543739319,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1930.0,
+      "completions/mean_length": 1552.390625,
+      "completions/mean_terminated_length": 915.1785888671875,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27451470494270325,
+      "learning_rate": 9.759921670520634e-07,
+      "loss": -0.0,
+      "num_tokens": 12183837.0,
+      "reward": -0.1808183193206787,
+      "reward_std": 0.24214914441108704,
+      "rewards/cosine_scaled_reward/mean": -0.1808183193206787,
+      "rewards/cosine_scaled_reward/std": 0.3102630078792572,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1318.0,
+      "completions/mean_length": 1721.15625,
+      "completions/mean_terminated_length": 653.4666748046875,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2736988067626953,
+      "learning_rate": 9.749693666068663e-07,
+      "loss": 0.0,
+      "num_tokens": 12305159.0,
+      "reward": -0.10280460864305496,
+      "reward_std": 0.21398380398750305,
+      "rewards/cosine_scaled_reward/mean": -0.10280461609363556,
+      "rewards/cosine_scaled_reward/std": 0.4072605073451996,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1780.0,
+      "completions/mean_length": 1564.6875,
+      "completions/mean_terminated_length": 981.3793334960938,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26440170407295227,
+      "learning_rate": 9.739258537542835e-07,
+      "loss": 0.0,
+      "num_tokens": 12415011.0,
+      "reward": 0.07170121371746063,
+      "reward_std": 0.38168632984161377,
+      "rewards/cosine_scaled_reward/mean": 0.07170121371746063,
+      "rewards/cosine_scaled_reward/std": 0.519091784954071,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1679.9375,
+      "completions/mean_terminated_length": 1105.760009765625,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "epoch": 0.11542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28899702429771423,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": 0.0,
+      "num_tokens": 12533959.0,
+      "reward": -0.06987505406141281,
+      "reward_std": 0.23702794313430786,
+      "rewards/cosine_scaled_reward/mean": -0.06987505406141281,
+      "rewards/cosine_scaled_reward/std": 0.4194885790348053,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1960.0,
+      "completions/mean_length": 1382.515625,
+      "completions/mean_terminated_length": 927.1842041015625,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.11657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31149351596832275,
+      "learning_rate": 9.717768952713511e-07,
+      "loss": -0.0,
+      "num_tokens": 12632592.0,
+      "reward": -0.1570146381855011,
+      "reward_std": 0.2435436099767685,
+      "rewards/cosine_scaled_reward/mean": -0.1570146381855011,
+      "rewards/cosine_scaled_reward/std": 0.41899070143699646,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1648.671875,
+      "completions/mean_terminated_length": 886.3182373046875,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.11771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26930251717567444,
+      "learning_rate": 9.706715543782064e-07,
+      "loss": 0.0,
+      "num_tokens": 12748459.0,
+      "reward": -0.2438274323940277,
+      "reward_std": 0.23225237429141998,
+      "rewards/cosine_scaled_reward/mean": -0.2438274323940277,
+      "rewards/cosine_scaled_reward/std": 0.32278329133987427,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1504.0,
+      "completions/mean_length": 1648.6875,
+      "completions/mean_terminated_length": 831.047607421875,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 0.11885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31051400303840637,
+      "learning_rate": 9.695457105469804e-07,
+      "loss": 0.0,
+      "num_tokens": 12864439.0,
+      "reward": -0.11535478383302689,
+      "reward_std": 0.2225915789604187,
+      "rewards/cosine_scaled_reward/mean": -0.11535478383302689,
+      "rewards/cosine_scaled_reward/std": 0.31164902448654175,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1398.0,
+      "completions/mean_length": 1480.796875,
+      "completions/mean_terminated_length": 837.9667358398438,
+      "completions/min_length": 496.0,
+      "completions/min_terminated_length": 496.0,
+      "epoch": 0.12,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2630039155483246,
+      "learning_rate": 9.683994186497132e-07,
+      "loss": 0.0,
+      "num_tokens": 12970498.0,
+      "reward": -0.1717175543308258,
+      "reward_std": 0.2714414894580841,
+      "rewards/cosine_scaled_reward/mean": -0.1717175394296646,
+      "rewards/cosine_scaled_reward/std": 0.3898351192474365,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1933.0,
+      "completions/mean_length": 1442.15625,
+      "completions/mean_terminated_length": 797.2257690429688,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.12114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27676305174827576,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": -0.0,
+      "num_tokens": 13073628.0,
+      "reward": 0.06792312860488892,
+      "reward_std": 0.3339839577674866,
+      "rewards/cosine_scaled_reward/mean": 0.06792312115430832,
+      "rewards/cosine_scaled_reward/std": 0.4862962067127228,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1837.0,
+      "completions/mean_length": 1835.5,
+      "completions/mean_terminated_length": 1248.0,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "epoch": 0.12228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2582181394100189,
+      "learning_rate": 9.66045715125541e-07,
+      "loss": -0.0,
+      "num_tokens": 13202252.0,
+      "reward": -0.21117815375328064,
+      "reward_std": 0.26033473014831543,
+      "rewards/cosine_scaled_reward/mean": -0.21117815375328064,
+      "rewards/cosine_scaled_reward/std": 0.318643718957901,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1920.0,
+      "completions/mean_length": 1598.953125,
+      "completions/mean_terminated_length": 1021.607177734375,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "epoch": 0.12342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2756604850292206,
+      "learning_rate": 9.648384182148252e-07,
+      "loss": 0.0,
+      "num_tokens": 13314945.0,
+      "reward": -0.0939117893576622,
+      "reward_std": 0.3252195715904236,
+      "rewards/cosine_scaled_reward/mean": -0.0939117819070816,
+      "rewards/cosine_scaled_reward/std": 0.40993908047676086,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1739.0,
+      "completions/mean_length": 1812.328125,
+      "completions/mean_terminated_length": 970.6428833007812,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "epoch": 0.12457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24118952453136444,
+      "learning_rate": 9.636109026648554e-07,
+      "loss": -0.0,
+      "num_tokens": 13442182.0,
+      "reward": -0.12436474859714508,
+      "reward_std": 0.17601566016674042,
+      "rewards/cosine_scaled_reward/mean": -0.12436474859714508,
+      "rewards/cosine_scaled_reward/std": 0.3541682958602905,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1981.0,
+      "completions/mean_length": 1614.671875,
+      "completions/mean_terminated_length": 842.2174072265625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "epoch": 0.12571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2967440187931061,
+      "learning_rate": 9.623632283030077e-07,
+      "loss": 0.0,
+      "num_tokens": 13556297.0,
+      "reward": -0.28026559948921204,
+      "reward_std": 0.14505533874034882,
+      "rewards/cosine_scaled_reward/mean": -0.2802656292915344,
+      "rewards/cosine_scaled_reward/std": 0.1739458441734314,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1777.0,
+      "completions/mean_length": 1675.53125,
+      "completions/mean_terminated_length": 1094.47998046875,
+      "completions/min_length": 526.0,
+      "completions/min_terminated_length": 526.0,
+      "epoch": 0.12685714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25916096568107605,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": 0.0,
+      "num_tokens": 13673651.0,
+      "reward": -0.10561071336269379,
+      "reward_std": 0.2843046188354492,
+      "rewards/cosine_scaled_reward/mean": -0.10561071336269379,
+      "rewards/cosine_scaled_reward/std": 0.42046698927879333,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1945.984375,
+      "completions/mean_terminated_length": 1454.45458984375,
+      "completions/min_length": 999.0,
+      "completions/min_terminated_length": 999.0,
+      "epoch": 0.128,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22010542452335358,
+      "learning_rate": 9.598076473627796e-07,
+      "loss": 0.0,
+      "num_tokens": 13809290.0,
+      "reward": -0.16558930277824402,
+      "reward_std": 0.2861853837966919,
+      "rewards/cosine_scaled_reward/mean": -0.16558930277824402,
+      "rewards/cosine_scaled_reward/std": 0.3597464859485626,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1493.0,
+      "completions/mean_length": 1482.453125,
+      "completions/mean_terminated_length": 880.4193115234375,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "epoch": 0.12914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2894801199436188,
+      "learning_rate": 9.58499865339809e-07,
+      "loss": 0.0,
+      "num_tokens": 13914463.0,
+      "reward": -0.09200191497802734,
+      "reward_std": 0.24287937581539154,
+      "rewards/cosine_scaled_reward/mean": -0.09200191497802734,
+      "rewards/cosine_scaled_reward/std": 0.4290314316749573,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1549.0,
+      "completions/mean_length": 1574.90625,
+      "completions/mean_terminated_length": 731.5652465820312,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.13028571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2619520425796509,
+      "learning_rate": 9.571721736097088e-07,
+      "loss": -0.0,
+      "num_tokens": 14025105.0,
+      "reward": -0.258175253868103,
+      "reward_std": 0.21170002222061157,
+      "rewards/cosine_scaled_reward/mean": -0.258175253868103,
+      "rewards/cosine_scaled_reward/std": 0.236412912607193,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1948.0,
+      "completions/mean_length": 1683.375,
+      "completions/mean_terminated_length": 936.7619018554688,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "epoch": 0.13142857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27216964960098267,
+      "learning_rate": 9.55824636882301e-07,
+      "loss": -0.0,
+      "num_tokens": 14144057.0,
+      "reward": -0.13246098160743713,
+      "reward_std": 0.21515703201293945,
+      "rewards/cosine_scaled_reward/mean": -0.13246098160743713,
+      "rewards/cosine_scaled_reward/std": 0.3399508595466614,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 1858.484375,
+      "completions/mean_terminated_length": 531.875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.13257142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26274579763412476,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": 0.0,
+      "num_tokens": 14274384.0,
+      "reward": -0.1656629592180252,
+      "reward_std": 0.18953147530555725,
+      "rewards/cosine_scaled_reward/mean": -0.1656629592180252,
+      "rewards/cosine_scaled_reward/std": 0.22731326520442963,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1725.875,
+      "completions/mean_terminated_length": 1066.2857666015625,
+      "completions/min_length": 546.0,
+      "completions/min_terminated_length": 546.0,
+      "epoch": 0.1337142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2458299994468689,
+      "learning_rate": 9.530702921077358e-07,
+      "loss": 0.0,
+      "num_tokens": 14395864.0,
+      "reward": -0.10864575207233429,
+      "reward_std": 0.22824041545391083,
+      "rewards/cosine_scaled_reward/mean": -0.10864575207233429,
+      "rewards/cosine_scaled_reward/std": 0.29944685101509094,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1989.0,
+      "completions/mean_length": 1818.578125,
+      "completions/mean_terminated_length": 1069.1334228515625,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "epoch": 0.13485714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2319493144750595,
+      "learning_rate": 9.516636183034564e-07,
+      "loss": 0.0,
+      "num_tokens": 14522789.0,
+      "reward": -0.09332149475812912,
+      "reward_std": 0.26317405700683594,
+      "rewards/cosine_scaled_reward/mean": -0.09332150220870972,
+      "rewards/cosine_scaled_reward/std": 0.3715793788433075,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2035.0,
+      "completions/mean_length": 1507.578125,
+      "completions/mean_terminated_length": 1087.25,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "epoch": 0.136,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3142438530921936,
+      "learning_rate": 9.502373679810839e-07,
+      "loss": -0.0,
+      "num_tokens": 14629682.0,
+      "reward": -0.038483649492263794,
+      "reward_std": 0.2474227398633957,
+      "rewards/cosine_scaled_reward/mean": -0.038483649492263794,
+      "rewards/cosine_scaled_reward/std": 0.46291273832321167,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1939.0,
+      "completions/mean_length": 1277.109375,
+      "completions/mean_terminated_length": 951.6222534179688,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.13714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3041006326675415,
+      "learning_rate": 9.487916106540465e-07,
+      "loss": -0.0,
+      "num_tokens": 14721873.0,
+      "reward": -0.1477060317993164,
+      "reward_std": 0.33122679591178894,
+      "rewards/cosine_scaled_reward/mean": -0.14770600199699402,
+      "rewards/cosine_scaled_reward/std": 0.46506062150001526,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1818.0,
+      "completions/mean_length": 1170.421875,
+      "completions/mean_terminated_length": 901.7755126953125,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.1382857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3021833002567291,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 14806476.0,
+      "reward": -0.08516940474510193,
+      "reward_std": 0.3455994129180908,
+      "rewards/cosine_scaled_reward/mean": -0.08516941219568253,
+      "rewards/cosine_scaled_reward/std": 0.5138645172119141,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1858.0,
+      "completions/mean_length": 1812.140625,
+      "completions/mean_terminated_length": 969.7857666015625,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "epoch": 0.13942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2724408209323883,
+      "learning_rate": 9.458418577899774e-07,
+      "loss": 0.0,
+      "num_tokens": 14934013.0,
+      "reward": -0.21230415999889374,
+      "reward_std": 0.25918447971343994,
+      "rewards/cosine_scaled_reward/mean": -0.21230417490005493,
+      "rewards/cosine_scaled_reward/std": 0.2874549329280853,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2013.0,
+      "completions/mean_length": 1607.03125,
+      "completions/mean_terminated_length": 1137.6129150390625,
+      "completions/min_length": 590.0,
+      "completions/min_terminated_length": 590.0,
+      "epoch": 0.14057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23946814239025116,
+      "learning_rate": 9.443380060197385e-07,
+      "loss": -0.0,
+      "num_tokens": 15047095.0,
+      "reward": -0.11815785616636276,
+      "reward_std": 0.2174030840396881,
+      "rewards/cosine_scaled_reward/mean": -0.11815785616636276,
+      "rewards/cosine_scaled_reward/std": 0.4328930079936981,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1730.0,
+      "completions/mean_length": 1482.09375,
+      "completions/mean_terminated_length": 916.1875,
+      "completions/min_length": 360.0,
+      "completions/min_terminated_length": 360.0,
+      "epoch": 0.1417142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2677968740463257,
+      "learning_rate": 9.428149347714143e-07,
+      "loss": 0.0,
+      "num_tokens": 15152901.0,
+      "reward": -0.0870831310749054,
+      "reward_std": 0.30780428647994995,
+      "rewards/cosine_scaled_reward/mean": -0.0870831310749054,
+      "rewards/cosine_scaled_reward/std": 0.46330681443214417,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1559.0,
+      "completions/mean_length": 1620.140625,
+      "completions/mean_terminated_length": 803.3181762695312,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.14285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24397389590740204,
+      "learning_rate": 9.412727182773486e-07,
+      "loss": 0.0,
+      "num_tokens": 15267518.0,
+      "reward": -0.015626579523086548,
+      "reward_std": 0.2010820060968399,
+      "rewards/cosine_scaled_reward/mean": -0.01562657207250595,
+      "rewards/cosine_scaled_reward/std": 0.4903516471385956,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1879.0,
+      "completions/mean_length": 1776.828125,
+      "completions/mean_terminated_length": 1083.8333740234375,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "epoch": 0.144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2397489696741104,
+      "learning_rate": 9.397114317029974e-07,
+      "loss": 0.0,
+      "num_tokens": 15392531.0,
+      "reward": -0.20714247226715088,
+      "reward_std": 0.2310880422592163,
+      "rewards/cosine_scaled_reward/mean": -0.20714247226715088,
+      "rewards/cosine_scaled_reward/std": 0.277647465467453,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1941.0,
+      "completions/mean_length": 1820.09375,
+      "completions/mean_terminated_length": 1190.0,
+      "completions/min_length": 705.0,
+      "completions/min_terminated_length": 705.0,
+      "epoch": 0.14514285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21327179670333862,
+      "learning_rate": 9.381311511432658e-07,
+      "loss": -0.0,
+      "num_tokens": 15520113.0,
+      "reward": -0.21989238262176514,
+      "reward_std": 0.21288105845451355,
+      "rewards/cosine_scaled_reward/mean": -0.21989238262176514,
+      "rewards/cosine_scaled_reward/std": 0.25816869735717773,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2044.0,
+      "completions/mean_length": 1741.203125,
+      "completions/mean_terminated_length": 1113.0,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.1462857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24533982574939728,
+      "learning_rate": 9.36531953618799e-07,
+      "loss": -0.0,
+      "num_tokens": 15641902.0,
+      "reward": 0.13875506818294525,
+      "reward_std": 0.2863699495792389,
+      "rewards/cosine_scaled_reward/mean": 0.13875506818294525,
+      "rewards/cosine_scaled_reward/std": 0.4384811818599701,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 1875.84375,
+      "completions/mean_terminated_length": 670.75,
+      "completions/min_length": 497.0,
+      "completions/min_terminated_length": 497.0,
+      "epoch": 0.14742857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2447715848684311,
+      "learning_rate": 9.34913917072228e-07,
+      "loss": 0.0,
+      "num_tokens": 15772660.0,
+      "reward": -0.2536994218826294,
+      "reward_std": 0.15479066967964172,
+      "rewards/cosine_scaled_reward/mean": -0.2536994218826294,
+      "rewards/cosine_scaled_reward/std": 0.21421663463115692,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1902.0,
+      "completions/mean_length": 1667.53125,
+      "completions/mean_terminated_length": 989.3043823242188,
+      "completions/min_length": 561.0,
+      "completions/min_terminated_length": 561.0,
+      "epoch": 0.14857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23984794318675995,
+      "learning_rate": 9.332771203643714e-07,
+      "loss": -0.0,
+      "num_tokens": 15889886.0,
+      "reward": -0.19088414311408997,
+      "reward_std": 0.2502530515193939,
+      "rewards/cosine_scaled_reward/mean": -0.19088414311408997,
+      "rewards/cosine_scaled_reward/std": 0.3068367540836334,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1990.0,
+      "completions/mean_length": 1667.75,
+      "completions/mean_terminated_length": 941.8182373046875,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.14971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.270325243473053,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": -0.0,
+      "num_tokens": 16006358.0,
+      "reward": -0.019564799964427948,
+      "reward_std": 0.28430548310279846,
+      "rewards/cosine_scaled_reward/mean": -0.019564803689718246,
+      "rewards/cosine_scaled_reward/std": 0.45797842741012573,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1625.0,
+      "completions/mean_length": 1605.0625,
+      "completions/mean_terminated_length": 866.8333740234375,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "epoch": 0.15085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2547350525856018,
+      "learning_rate": 9.299475664759068e-07,
+      "loss": 0.0,
+      "num_tokens": 16120146.0,
+      "reward": -0.21965548396110535,
+      "reward_std": 0.25751689076423645,
+      "rewards/cosine_scaled_reward/mean": -0.21965548396110535,
+      "rewards/cosine_scaled_reward/std": 0.3749488890171051,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1753.0,
+      "completions/mean_length": 1924.296875,
+      "completions/mean_terminated_length": 1058.375,
+      "completions/min_length": 524.0,
+      "completions/min_terminated_length": 524.0,
+      "epoch": 0.152,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2590077221393585,
+      "learning_rate": 9.282549715730579e-07,
+      "loss": -0.0,
+      "num_tokens": 16254525.0,
+      "reward": -0.14530372619628906,
+      "reward_std": 0.19581159949302673,
+      "rewards/cosine_scaled_reward/mean": -0.14530371129512787,
+      "rewards/cosine_scaled_reward/std": 0.2799433171749115,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1732.0,
+      "completions/mean_length": 1612.125,
+      "completions/mean_terminated_length": 1086.0689697265625,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "epoch": 0.15314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27096986770629883,
+      "learning_rate": 9.265439410565328e-07,
+      "loss": -0.0,
+      "num_tokens": 16368269.0,
+      "reward": -0.1256684958934784,
+      "reward_std": 0.20261810719966888,
+      "rewards/cosine_scaled_reward/mean": -0.1256684958934784,
+      "rewards/cosine_scaled_reward/std": 0.4080355167388916,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2011.0,
+      "completions/mean_length": 1120.890625,
+      "completions/mean_terminated_length": 699.477294921875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.15428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30590543150901794,
+      "learning_rate": 9.248145583195447e-07,
+      "loss": -0.0,
+      "num_tokens": 16450478.0,
+      "reward": 0.115441232919693,
+      "reward_std": 0.23258042335510254,
+      "rewards/cosine_scaled_reward/mean": 0.115441232919693,
+      "rewards/cosine_scaled_reward/std": 0.500895619392395,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1373.0,
+      "completions/mean_length": 1585.21875,
+      "completions/mean_terminated_length": 813.9166870117188,
+      "completions/min_length": 526.0,
+      "completions/min_terminated_length": 526.0,
+      "epoch": 0.15542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2542603611946106,
+      "learning_rate": 9.230669076497687e-07,
+      "loss": 0.0,
+      "num_tokens": 16562604.0,
+      "reward": 0.006334513425827026,
+      "reward_std": 0.3029508590698242,
+      "rewards/cosine_scaled_reward/mean": 0.006334502249956131,
+      "rewards/cosine_scaled_reward/std": 0.4998469352722168,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1737.0,
+      "completions/mean_length": 1736.703125,
+      "completions/mean_terminated_length": 1051.8499755859375,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "epoch": 0.15657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22275574505329132,
+      "learning_rate": 9.213010742252327e-07,
+      "loss": -0.0,
+      "num_tokens": 16684361.0,
+      "reward": -0.27902746200561523,
+      "reward_std": 0.13864701986312866,
+      "rewards/cosine_scaled_reward/mean": -0.27902746200561523,
+      "rewards/cosine_scaled_reward/std": 0.16625361144542694,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1720.0,
+      "completions/mean_length": 1632.234375,
+      "completions/mean_terminated_length": 983.6399536132812,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "epoch": 0.15771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23748613893985748,
+      "learning_rate": 9.195171441101668e-07,
+      "loss": -0.0,
+      "num_tokens": 16800136.0,
+      "reward": -0.20888572931289673,
+      "reward_std": 0.2201838493347168,
+      "rewards/cosine_scaled_reward/mean": -0.20888571441173553,
+      "rewards/cosine_scaled_reward/std": 0.3073258101940155,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1757.0,
+      "completions/mean_length": 1864.375,
+      "completions/mean_terminated_length": 1144.0,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.15885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26246827840805054,
+      "learning_rate": 9.177152042508077e-07,
+      "loss": 0.0,
+      "num_tokens": 16930096.0,
+      "reward": -0.26648059487342834,
+      "reward_std": 0.22530998289585114,
+      "rewards/cosine_scaled_reward/mean": -0.26648059487342834,
+      "rewards/cosine_scaled_reward/std": 0.26054832339286804,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2048.0,
+      "completions/mean_length": 1517.40625,
+      "completions/mean_terminated_length": 986.8125,
+      "completions/min_length": 400.0,
+      "completions/min_terminated_length": 400.0,
+      "epoch": 0.16,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.28518393635749817,
+      "learning_rate": 9.158953424711624e-07,
+      "loss": 0.0,
+      "num_tokens": 17037434.0,
+      "reward": -0.0924016684293747,
+      "reward_std": 0.18293559551239014,
+      "rewards/cosine_scaled_reward/mean": -0.0924016684293747,
+      "rewards/cosine_scaled_reward/std": 0.4700092375278473,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1383.0,
+      "completions/mean_length": 1388.703125,
+      "completions/mean_terminated_length": 1066.720947265625,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "epoch": 0.16114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2404537796974182,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": -0.0,
+      "num_tokens": 17136871.0,
+      "reward": -0.0574793741106987,
+      "reward_std": 0.3190045952796936,
+      "rewards/cosine_scaled_reward/mean": -0.0574793815612793,
+      "rewards/cosine_scaled_reward/std": 0.46699976921081543,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1873.0,
+      "completions/mean_length": 1644.6875,
+      "completions/mean_terminated_length": 1215.3548583984375,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "epoch": 0.16228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2672227621078491,
+      "learning_rate": 9.122022088101613e-07,
+      "loss": 0.0,
+      "num_tokens": 17252635.0,
+      "reward": -0.12337548285722733,
+      "reward_std": 0.288290411233902,
+      "rewards/cosine_scaled_reward/mean": -0.12337549030780792,
+      "rewards/cosine_scaled_reward/std": 0.408100426197052,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1349.125,
+      "completions/mean_terminated_length": 957.0731201171875,
+      "completions/min_length": 511.0,
+      "completions/min_terminated_length": 511.0,
+      "epoch": 0.16342857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30288439989089966,
+      "learning_rate": 9.103291169269299e-07,
+      "loss": 0.0,
+      "num_tokens": 17349795.0,
+      "reward": -0.19018490612506866,
+      "reward_std": 0.2661983370780945,
+      "rewards/cosine_scaled_reward/mean": -0.19018490612506866,
+      "rewards/cosine_scaled_reward/std": 0.3374536633491516,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2007.0,
+      "completions/mean_length": 1644.03125,
+      "completions/mean_terminated_length": 1013.8399658203125,
+      "completions/min_length": 341.0,
+      "completions/min_terminated_length": 341.0,
+      "epoch": 0.16457142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2909323573112488,
+      "learning_rate": 9.084384631108882e-07,
+      "loss": -0.0,
+      "num_tokens": 17466501.0,
+      "reward": -0.059858791530132294,
+      "reward_std": 0.22690719366073608,
+      "rewards/cosine_scaled_reward/mean": -0.0598587840795517,
+      "rewards/cosine_scaled_reward/std": 0.5050134062767029,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2006.0,
+      "completions/mean_length": 1229.8125,
+      "completions/mean_terminated_length": 884.3555908203125,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "epoch": 0.1657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30516529083251953,
+      "learning_rate": 9.065303395098358e-07,
+      "loss": 0.0,
+      "num_tokens": 17555153.0,
+      "reward": -0.00805443525314331,
+      "reward_std": 0.2110176980495453,
+      "rewards/cosine_scaled_reward/mean": -0.00805443525314331,
+      "rewards/cosine_scaled_reward/std": 0.5190568566322327,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1648.0,
+      "completions/mean_length": 1531.0,
+      "completions/mean_terminated_length": 866.2857666015625,
+      "completions/min_length": 360.0,
+      "completions/min_terminated_length": 360.0,
+      "epoch": 0.16685714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24603432416915894,
+      "learning_rate": 9.046048391230247e-07,
+      "loss": -0.0,
+      "num_tokens": 17663753.0,
+      "reward": -0.1248023509979248,
+      "reward_std": 0.257907509803772,
+      "rewards/cosine_scaled_reward/mean": -0.1248023509979248,
+      "rewards/cosine_scaled_reward/std": 0.3190684914588928,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1997.0,
+      "completions/mean_length": 1583.921875,
+      "completions/mean_terminated_length": 1119.84375,
+      "completions/min_length": 595.0,
+      "completions/min_terminated_length": 595.0,
+      "epoch": 0.168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.278160959482193,
+      "learning_rate": 9.026620557966279e-07,
+      "loss": -0.0,
+      "num_tokens": 17775908.0,
+      "reward": -0.18137255311012268,
+      "reward_std": 0.2745535969734192,
+      "rewards/cosine_scaled_reward/mean": -0.18137255311012268,
+      "rewards/cosine_scaled_reward/std": 0.3545372188091278,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1316.109375,
+      "completions/mean_terminated_length": 983.4318237304688,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "epoch": 0.16914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2669003903865814,
+      "learning_rate": 9.007020842191634e-07,
+      "loss": -0.0,
+      "num_tokens": 17871323.0,
+      "reward": -0.12499135732650757,
+      "reward_std": 0.19944381713867188,
+      "rewards/cosine_scaled_reward/mean": -0.12499135732650757,
+      "rewards/cosine_scaled_reward/std": 0.41628143191337585,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2026.0,
+      "completions/mean_length": 1674.546875,
+      "completions/mean_terminated_length": 1008.8261108398438,
+      "completions/min_length": 570.0,
+      "completions/min_terminated_length": 570.0,
+      "epoch": 0.1702857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24959711730480194,
+      "learning_rate": 8.987250199168808e-07,
+      "loss": -0.0,
+      "num_tokens": 17990390.0,
+      "reward": -0.24294674396514893,
+      "reward_std": 0.2527904510498047,
+      "rewards/cosine_scaled_reward/mean": -0.24294671416282654,
+      "rewards/cosine_scaled_reward/std": 0.35040438175201416,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1957.0,
+      "completions/mean_length": 1363.671875,
+      "completions/mean_terminated_length": 1005.2142944335938,
+      "completions/min_length": 398.0,
+      "completions/min_terminated_length": 398.0,
+      "epoch": 0.17142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29853612184524536,
+      "learning_rate": 8.967309592491052e-07,
+      "loss": -0.0,
+      "num_tokens": 18088169.0,
+      "reward": -0.13983747363090515,
+      "reward_std": 0.37944915890693665,
+      "rewards/cosine_scaled_reward/mean": -0.13983745872974396,
+      "rewards/cosine_scaled_reward/std": 0.4024735391139984,
+      "step": 150
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 500,
+  "num_input_tokens_seen": 18088169,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}