diff --git "a/checkpoint-250/trainer_state.json" "b/checkpoint-250/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-250/trainer_state.json"
@@ -0,0 +1,6284 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2857142857142857,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 1702.03125,
+      "completions/mean_terminated_length": 993.6190795898438,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "epoch": 0.001142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2837817668914795,
+      "learning_rate": 0.0,
+      "loss": -0.0,
+      "num_tokens": 118418.0,
+      "reward": -0.09800112247467041,
+      "reward_std": 0.3028089702129364,
+      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
+      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1894.0,
+      "completions/mean_length": 1738.90625,
+      "completions/mean_terminated_length": 949.0,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.002285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2421981245279312,
+      "learning_rate": 2e-08,
+      "loss": -0.0,
+      "num_tokens": 239748.0,
+      "reward": 0.020556632429361343,
+      "reward_std": 0.3545936942100525,
+      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
+      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.921875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 1952.234375,
+      "completions/mean_terminated_length": 822.2000122070312,
+      "completions/min_length": 703.0,
+      "completions/min_terminated_length": 703.0,
+      "epoch": 0.0034285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24851329624652863,
+      "learning_rate": 4e-08,
+      "loss": -0.0,
+      "num_tokens": 375163.0,
+      "reward": -0.22721199691295624,
+      "reward_std": 0.14563649892807007,
+      "rewards/cosine_scaled_reward/mean": -0.22721199691295624,
+      "rewards/cosine_scaled_reward/std": 0.1709199845790863,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1685.0,
+      "completions/mean_length": 1554.109375,
+      "completions/mean_terminated_length": 958.0344848632812,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "epoch": 0.004571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29272863268852234,
+      "learning_rate": 6e-08,
+      "loss": -0.0,
+      "num_tokens": 484434.0,
+      "reward": -0.17542189359664917,
+      "reward_std": 0.18219107389450073,
+      "rewards/cosine_scaled_reward/mean": -0.17542189359664917,
+      "rewards/cosine_scaled_reward/std": 0.27975013852119446,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1930.0,
+      "completions/mean_length": 1943.0625,
+      "completions/mean_terminated_length": 1088.571533203125,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "epoch": 0.005714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2773251533508301,
+      "learning_rate": 8e-08,
+      "loss": 0.0,
+      "num_tokens": 619606.0,
+      "reward": -0.2648562788963318,
+      "reward_std": 0.21638144552707672,
+      "rewards/cosine_scaled_reward/mean": -0.2648562788963318,
+      "rewards/cosine_scaled_reward/std": 0.23959198594093323,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1824.0,
+      "completions/mean_length": 1854.21875,
+      "completions/mean_terminated_length": 920.5454711914062,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "epoch": 0.006857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27399909496307373,
+      "learning_rate": 1e-07,
+      "loss": -0.0,
+      "num_tokens": 749924.0,
+      "reward": -0.19292885065078735,
+      "reward_std": 0.2666770815849304,
+      "rewards/cosine_scaled_reward/mean": -0.19292885065078735,
+      "rewards/cosine_scaled_reward/std": 0.295730322599411,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1589.0,
+      "completions/mean_length": 1940.5625,
+      "completions/mean_terminated_length": 1065.71435546875,
+      "completions/min_length": 773.0,
+      "completions/min_terminated_length": 773.0,
+      "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23362359404563904,
+      "learning_rate": 1.2e-07,
+      "loss": 0.0,
+      "num_tokens": 884528.0,
+      "reward": -0.18198424577713013,
+      "reward_std": 0.18540163338184357,
+      "rewards/cosine_scaled_reward/mean": -0.18198424577713013,
+      "rewards/cosine_scaled_reward/std": 0.32407456636428833,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2048.0,
+      "completions/mean_length": 1708.5625,
+      "completions/mean_terminated_length": 1013.5238037109375,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "epoch": 0.009142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24677562713623047,
+      "learning_rate": 1.4e-07,
+      "loss": -0.0,
+      "num_tokens": 1004292.0,
+      "reward": -0.09573853015899658,
+      "reward_std": 0.22485454380512238,
+      "rewards/cosine_scaled_reward/mean": -0.09573852270841599,
+      "rewards/cosine_scaled_reward/std": 0.449250191450119,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1221.0,
+      "completions/mean_length": 1979.359375,
+      "completions/mean_terminated_length": 949.75,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "epoch": 0.010285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26966309547424316,
+      "learning_rate": 1.6e-07,
+      "loss": 0.0,
+      "num_tokens": 1142427.0,
+      "reward": -0.19992578029632568,
+      "reward_std": 0.20190927386283875,
+      "rewards/cosine_scaled_reward/mean": -0.19992581009864807,
+      "rewards/cosine_scaled_reward/std": 0.23785534501075745,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1652.59375,
+      "completions/mean_terminated_length": 897.727294921875,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 0.011428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3011312484741211,
+      "learning_rate": 1.8e-07,
+      "loss": 0.0,
+      "num_tokens": 1259025.0,
+      "reward": -0.11706389486789703,
+      "reward_std": 0.2934548258781433,
+      "rewards/cosine_scaled_reward/mean": -0.11706390231847763,
+      "rewards/cosine_scaled_reward/std": 0.3601698577404022,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1333.0,
+      "completions/mean_length": 1946.6875,
+      "completions/mean_terminated_length": 967.3333740234375,
+      "completions/min_length": 599.0,
+      "completions/min_terminated_length": 599.0,
+      "epoch": 0.012571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2451399564743042,
+      "learning_rate": 2e-07,
+      "loss": -0.0,
+      "num_tokens": 1395285.0,
+      "reward": -0.2866281270980835,
+      "reward_std": 0.12184012681245804,
+      "rewards/cosine_scaled_reward/mean": -0.2866281270980835,
+      "rewards/cosine_scaled_reward/std": 0.15141677856445312,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 1659.28125,
+      "completions/mean_terminated_length": 1190.137939453125,
+      "completions/min_length": 535.0,
+      "completions/min_terminated_length": 535.0,
+      "epoch": 0.013714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2733561396598816,
+      "learning_rate": 2.1999999999999998e-07,
+      "loss": 0.0,
+      "num_tokens": 1512423.0,
+      "reward": -0.13816070556640625,
+      "reward_std": 0.2968980073928833,
+      "rewards/cosine_scaled_reward/mean": -0.13816070556640625,
+      "rewards/cosine_scaled_reward/std": 0.3597467839717865,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1770.0,
+      "completions/mean_length": 1807.796875,
+      "completions/mean_terminated_length": 1023.1333618164062,
+      "completions/min_length": 697.0,
+      "completions/min_terminated_length": 697.0,
+      "epoch": 0.014857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25238803029060364,
+      "learning_rate": 2.4e-07,
+      "loss": 0.0,
+      "num_tokens": 1639162.0,
+      "reward": -0.13488636910915375,
+      "reward_std": 0.2661236524581909,
+      "rewards/cosine_scaled_reward/mean": -0.13488635420799255,
+      "rewards/cosine_scaled_reward/std": 0.3444243371486664,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1866.0,
+      "completions/mean_length": 1846.921875,
+      "completions/mean_terminated_length": 1243.6875,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2201598882675171,
+      "learning_rate": 2.6e-07,
+      "loss": -0.0,
+      "num_tokens": 1767973.0,
+      "reward": -0.20591925084590912,
+      "reward_std": 0.21505361795425415,
+      "rewards/cosine_scaled_reward/mean": -0.20591923594474792,
+      "rewards/cosine_scaled_reward/std": 0.323749840259552,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1713.0,
+      "completions/mean_length": 1710.421875,
+      "completions/mean_terminated_length": 847.7222290039062,
+      "completions/min_length": 450.0,
+      "completions/min_terminated_length": 450.0,
+      "epoch": 0.017142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2665213644504547,
+      "learning_rate": 2.8e-07,
+      "loss": 0.0,
+      "num_tokens": 1888360.0,
+      "reward": -0.0778750479221344,
+      "reward_std": 0.17502948641777039,
+      "rewards/cosine_scaled_reward/mean": -0.0778750628232956,
+      "rewards/cosine_scaled_reward/std": 0.47343766689300537,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.984375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 2031.03125,
+      "completions/mean_terminated_length": 962.0,
+      "completions/min_length": 962.0,
+      "completions/min_terminated_length": 962.0,
+      "epoch": 0.018285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23009927570819855,
+      "learning_rate": 3e-07,
+      "loss": -0.0,
+      "num_tokens": 2028786.0,
+      "reward": -0.2619968056678772,
+      "reward_std": 0.16954168677330017,
+      "rewards/cosine_scaled_reward/mean": -0.2619968056678772,
+      "rewards/cosine_scaled_reward/std": 0.18357795476913452,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1533.15625,
+      "completions/mean_terminated_length": 780.6923217773438,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "epoch": 0.019428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3392995297908783,
+      "learning_rate": 3.2e-07,
+      "loss": -0.0,
+      "num_tokens": 2137428.0,
+      "reward": -0.11706461012363434,
+      "reward_std": 0.3096129894256592,
+      "rewards/cosine_scaled_reward/mean": -0.11706460267305374,
+      "rewards/cosine_scaled_reward/std": 0.3810974657535553,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1626.0,
+      "completions/mean_length": 1774.46875,
+      "completions/mean_terminated_length": 1018.2352905273438,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "epoch": 0.02057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23254038393497467,
+      "learning_rate": 3.4000000000000003e-07,
+      "loss": 0.0,
+      "num_tokens": 2261370.0,
+      "reward": -0.18709540367126465,
+      "reward_std": 0.2795025110244751,
+      "rewards/cosine_scaled_reward/mean": -0.18709540367126465,
+      "rewards/cosine_scaled_reward/std": 0.3359416127204895,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1859.0,
+      "completions/mean_length": 1719.0,
+      "completions/mean_terminated_length": 995.2000122070312,
+      "completions/min_length": 577.0,
+      "completions/min_terminated_length": 577.0,
+      "epoch": 0.021714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.262045681476593,
+      "learning_rate": 3.6e-07,
+      "loss": -0.0,
+      "num_tokens": 2382642.0,
+      "reward": -0.02329203486442566,
+      "reward_std": 0.34684932231903076,
+      "rewards/cosine_scaled_reward/mean": -0.02329203486442566,
+      "rewards/cosine_scaled_reward/std": 0.47637447714805603,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1630.90625,
+      "completions/mean_terminated_length": 935.75,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "epoch": 0.022857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.250532329082489,
+      "learning_rate": 3.7999999999999996e-07,
+      "loss": 0.0,
+      "num_tokens": 2498372.0,
+      "reward": -0.06319350004196167,
+      "reward_std": 0.2394939512014389,
+      "rewards/cosine_scaled_reward/mean": -0.06319350004196167,
+      "rewards/cosine_scaled_reward/std": 0.3889789879322052,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1818.0,
+      "completions/mean_length": 1735.96875,
+      "completions/mean_terminated_length": 1140.272705078125,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2773231565952301,
+      "learning_rate": 4e-07,
+      "loss": 0.0,
+      "num_tokens": 2620282.0,
+      "reward": -0.20884393155574799,
+      "reward_std": 0.20233216881752014,
+      "rewards/cosine_scaled_reward/mean": -0.20884393155574799,
+      "rewards/cosine_scaled_reward/std": 0.28432920575141907,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1790.0,
+      "completions/mean_length": 1342.953125,
+      "completions/mean_terminated_length": 919.9249877929688,
+      "completions/min_length": 286.0,
+      "completions/min_terminated_length": 286.0,
+      "epoch": 0.025142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34627005457878113,
+      "learning_rate": 4.1999999999999995e-07,
+      "loss": 0.0,
+      "num_tokens": 2715247.0,
+      "reward": -0.09092864394187927,
+      "reward_std": 0.21042926609516144,
+      "rewards/cosine_scaled_reward/mean": -0.09092865139245987,
+      "rewards/cosine_scaled_reward/std": 0.43559205532073975,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2038.0,
+      "completions/mean_length": 1661.9375,
+      "completions/mean_terminated_length": 1132.888916015625,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "epoch": 0.026285714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2705242335796356,
+      "learning_rate": 4.3999999999999997e-07,
+      "loss": 0.0,
+      "num_tokens": 2832403.0,
+      "reward": -0.13339249789714813,
+      "reward_std": 0.2433384656906128,
+      "rewards/cosine_scaled_reward/mean": -0.13339248299598694,
+      "rewards/cosine_scaled_reward/std": 0.3815627098083496,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1802.296875,
+      "completions/mean_terminated_length": 1065.1875,
+      "completions/min_length": 572.0,
+      "completions/min_terminated_length": 572.0,
+      "epoch": 0.027428571428571427,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24961258471012115,
+      "learning_rate": 4.6e-07,
+      "loss": 0.0,
+      "num_tokens": 2958678.0,
+      "reward": -0.18733163177967072,
+      "reward_std": 0.2773033380508423,
+      "rewards/cosine_scaled_reward/mean": -0.1873316466808319,
+      "rewards/cosine_scaled_reward/std": 0.37051624059677124,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1848.0,
+      "completions/mean_length": 1731.53125,
+      "completions/mean_terminated_length": 982.0,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "epoch": 0.02857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2662124037742615,
+      "learning_rate": 4.8e-07,
+      "loss": 0.0,
+      "num_tokens": 3079792.0,
+      "reward": -0.12407588213682175,
+      "reward_std": 0.25581949949264526,
+      "rewards/cosine_scaled_reward/mean": -0.12407589703798294,
+      "rewards/cosine_scaled_reward/std": 0.39043793082237244,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1965.46875,
+      "completions/mean_terminated_length": 1567.8182373046875,
+      "completions/min_length": 1006.0,
+      "completions/min_terminated_length": 1006.0,
+      "epoch": 0.029714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23202598094940186,
+      "learning_rate": 5e-07,
+      "loss": 0.0,
+      "num_tokens": 3216214.0,
+      "reward": -0.0963105633854866,
+      "reward_std": 0.30887559056282043,
+      "rewards/cosine_scaled_reward/mean": -0.0963105633854866,
+      "rewards/cosine_scaled_reward/std": 0.39396020770072937,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1886.96875,
+      "completions/mean_terminated_length": 1111.0909423828125,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "epoch": 0.030857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2878379225730896,
+      "learning_rate": 5.2e-07,
+      "loss": -0.0,
+      "num_tokens": 3347268.0,
+      "reward": -0.1645491123199463,
+      "reward_std": 0.28629785776138306,
+      "rewards/cosine_scaled_reward/mean": -0.1645491123199463,
+      "rewards/cosine_scaled_reward/std": 0.35050687193870544,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1995.0,
+      "completions/mean_length": 1843.640625,
+      "completions/mean_terminated_length": 1230.5625,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "epoch": 0.032,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24996496737003326,
+      "learning_rate": 5.4e-07,
+      "loss": 0.0,
+      "num_tokens": 3475597.0,
+      "reward": -0.06605555862188339,
+      "reward_std": 0.2643629312515259,
+      "rewards/cosine_scaled_reward/mean": -0.06605555862188339,
+      "rewards/cosine_scaled_reward/std": 0.438128799200058,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2005.0,
+      "completions/mean_length": 2020.5,
+      "completions/mean_terminated_length": 1608.0,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "epoch": 0.03314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23316837847232819,
+      "learning_rate": 5.6e-07,
+      "loss": -0.0,
+      "num_tokens": 3615381.0,
+      "reward": -0.2015206664800644,
+      "reward_std": 0.15312039852142334,
+      "rewards/cosine_scaled_reward/mean": -0.2015206664800644,
+      "rewards/cosine_scaled_reward/std": 0.1648881882429123,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1839.0,
+      "completions/mean_length": 1826.046875,
+      "completions/mean_terminated_length": 955.3077392578125,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.03428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2410832792520523,
+      "learning_rate": 5.8e-07,
+      "loss": -0.0,
+      "num_tokens": 3742784.0,
+      "reward": -0.17509159445762634,
+      "reward_std": 0.18994277715682983,
+      "rewards/cosine_scaled_reward/mean": -0.17509159445762634,
+      "rewards/cosine_scaled_reward/std": 0.22516494989395142,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1678.0,
+      "completions/mean_length": 1781.4375,
+      "completions/mean_terminated_length": 910.6666870117188,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "epoch": 0.03542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2693414092063904,
+      "learning_rate": 6e-07,
+      "loss": 0.0,
+      "num_tokens": 3867292.0,
+      "reward": -0.24513831734657288,
+      "reward_std": 0.28315529227256775,
+      "rewards/cosine_scaled_reward/mean": -0.24513831734657288,
+      "rewards/cosine_scaled_reward/std": 0.3480584919452667,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1975.0,
+      "completions/mean_length": 1969.28125,
+      "completions/mean_terminated_length": 1488.2222900390625,
+      "completions/min_length": 1088.0,
+      "completions/min_terminated_length": 1088.0,
+      "epoch": 0.036571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24202018976211548,
+      "learning_rate": 6.2e-07,
+      "loss": 0.0,
+      "num_tokens": 4003678.0,
+      "reward": -0.18968716263771057,
+      "reward_std": 0.28299200534820557,
+      "rewards/cosine_scaled_reward/mean": -0.18968716263771057,
+      "rewards/cosine_scaled_reward/std": 0.3119950294494629,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 2048.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 2048.0,
+      "completions/min_terminated_length": 0.0,
+      "epoch": 0.037714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22288212180137634,
+      "learning_rate": 6.4e-07,
+      "loss": 0.0,
+      "num_tokens": 4145966.0,
+      "reward": -0.2955162525177002,
+      "reward_std": 0.17793573439121246,
+      "rewards/cosine_scaled_reward/mean": -0.2955162525177002,
+      "rewards/cosine_scaled_reward/std": 0.22786569595336914,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1809.0,
+      "completions/mean_length": 1589.640625,
+      "completions/mean_terminated_length": 1036.4482421875,
+      "completions/min_length": 515.0,
+      "completions/min_terminated_length": 515.0,
+      "epoch": 0.038857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31030499935150146,
+      "learning_rate": 6.6e-07,
+      "loss": 0.0,
+      "num_tokens": 4257255.0,
+      "reward": 0.008002171292901039,
+      "reward_std": 0.3413254916667938,
+      "rewards/cosine_scaled_reward/mean": 0.008002176880836487,
+      "rewards/cosine_scaled_reward/std": 0.4431404769420624,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1987.0,
+      "completions/mean_length": 1785.921875,
+      "completions/mean_terminated_length": 757.769287109375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "epoch": 0.04,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3145958483219147,
+      "learning_rate": 6.800000000000001e-07,
+      "loss": -0.0,
+      "num_tokens": 4383050.0,
+      "reward": -0.16386553645133972,
+      "reward_std": 0.2818174958229065,
+      "rewards/cosine_scaled_reward/mean": -0.16386555135250092,
+      "rewards/cosine_scaled_reward/std": 0.3242056965827942,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1195.0,
+      "completions/mean_length": 2000.421875,
+      "completions/mean_terminated_length": 1033.0,
+      "completions/min_length": 863.0,
+      "completions/min_terminated_length": 863.0,
+      "epoch": 0.04114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25796815752983093,
+      "learning_rate": 7e-07,
+      "loss": 0.0,
+      "num_tokens": 4522189.0,
+      "reward": -0.2470606118440628,
+      "reward_std": 0.15509279072284698,
+      "rewards/cosine_scaled_reward/mean": -0.2470606118440628,
+      "rewards/cosine_scaled_reward/std": 0.16412879526615143,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1964.46875,
+      "completions/mean_terminated_length": 1284.2857666015625,
+      "completions/min_length": 931.0,
+      "completions/min_terminated_length": 931.0,
+      "epoch": 0.04228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22452199459075928,
+      "learning_rate": 7.2e-07,
+      "loss": 0.0,
+      "num_tokens": 4658939.0,
+      "reward": -0.24706938862800598,
+      "reward_std": 0.18499845266342163,
+      "rewards/cosine_scaled_reward/mean": -0.24706941843032837,
+      "rewards/cosine_scaled_reward/std": 0.21092188358306885,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1840.0,
+      "completions/mean_length": 1925.234375,
+      "completions/mean_terminated_length": 1175.0,
+      "completions/min_length": 916.0,
+      "completions/min_terminated_length": 916.0,
+      "epoch": 0.04342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23703666031360626,
+      "learning_rate": 7.4e-07,
+      "loss": -0.0,
+      "num_tokens": 4793866.0,
+      "reward": -0.11504355818033218,
+      "reward_std": 0.20660358667373657,
+      "rewards/cosine_scaled_reward/mean": -0.11504356563091278,
+      "rewards/cosine_scaled_reward/std": 0.3190351724624634,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1412.0,
+      "completions/mean_length": 1740.546875,
+      "completions/mean_terminated_length": 642.5,
+      "completions/min_length": 339.0,
+      "completions/min_terminated_length": 339.0,
+      "epoch": 0.044571428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23829001188278198,
+      "learning_rate": 7.599999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 4916045.0,
+      "reward": -0.12095541507005692,
+      "reward_std": 0.1958026885986328,
+      "rewards/cosine_scaled_reward/mean": -0.12095542997121811,
+      "rewards/cosine_scaled_reward/std": 0.340241402387619,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1713.203125,
+      "completions/mean_terminated_length": 920.26318359375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.045714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24145744740962982,
+      "learning_rate": 7.799999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 5035762.0,
+      "reward": -0.10936243832111359,
+      "reward_std": 0.14468500018119812,
+      "rewards/cosine_scaled_reward/mean": -0.10936242341995239,
+      "rewards/cosine_scaled_reward/std": 0.4288744330406189,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1909.71875,
+      "completions/mean_terminated_length": 1367.2308349609375,
+      "completions/min_length": 1138.0,
+      "completions/min_terminated_length": 1138.0,
+      "epoch": 0.046857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22317881882190704,
+      "learning_rate": 8e-07,
+      "loss": 0.0,
+      "num_tokens": 5169136.0,
+      "reward": -0.2058967649936676,
+      "reward_std": 0.2325170338153839,
+      "rewards/cosine_scaled_reward/mean": -0.20589673519134521,
+      "rewards/cosine_scaled_reward/std": 0.28897321224212646,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1752.0,
+      "completions/mean_length": 1727.71875,
+      "completions/mean_terminated_length": 583.857177734375,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.44688937067985535,
+      "learning_rate": 8.199999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5290070.0,
+      "reward": -0.2254919707775116,
+      "reward_std": 0.1687203049659729,
+      "rewards/cosine_scaled_reward/mean": -0.2254919707775116,
+      "rewards/cosine_scaled_reward/std": 0.18203677237033844,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1082.0,
+      "completions/mean_length": 1855.328125,
+      "completions/mean_terminated_length": 814.9000244140625,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "epoch": 0.04914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2430828958749771,
+      "learning_rate": 8.399999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5420427.0,
+      "reward": -0.09104865789413452,
+      "reward_std": 0.18217626214027405,
+      "rewards/cosine_scaled_reward/mean": -0.09104865789413452,
+      "rewards/cosine_scaled_reward/std": 0.3521345257759094,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1675.0,
+      "completions/mean_length": 1727.9375,
+      "completions/mean_terminated_length": 767.75,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "epoch": 0.05028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32065215706825256,
+      "learning_rate": 8.599999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5541711.0,
+      "reward": -0.17701950669288635,
+      "reward_std": 0.2957555055618286,
+      "rewards/cosine_scaled_reward/mean": -0.17701953649520874,
+      "rewards/cosine_scaled_reward/std": 0.38460060954093933,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2032.0,
+      "completions/mean_length": 2013.9375,
+      "completions/mean_terminated_length": 1321.3333740234375,
+      "completions/min_length": 740.0,
+      "completions/min_terminated_length": 740.0,
+      "epoch": 0.05142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22363637387752533,
+      "learning_rate": 8.799999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 5682259.0,
+      "reward": -0.20341511070728302,
+      "reward_std": 0.23104795813560486,
+      "rewards/cosine_scaled_reward/mean": -0.20341511070728302,
+      "rewards/cosine_scaled_reward/std": 0.3092363774776459,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1224.0,
+      "completions/mean_length": 1909.0,
+      "completions/mean_terminated_length": 936.0,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "epoch": 0.052571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26306217908859253,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 5815603.0,
+      "reward": -0.26145532727241516,
+      "reward_std": 0.17108051478862762,
+      "rewards/cosine_scaled_reward/mean": -0.2614552974700928,
+      "rewards/cosine_scaled_reward/std": 0.18312901258468628,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1668.0,
+      "completions/mean_length": 1757.1875,
+      "completions/mean_terminated_length": 884.75,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "epoch": 0.053714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2856813371181488,
+      "learning_rate": 9.2e-07,
+      "loss": 0.0,
+      "num_tokens": 5938463.0,
+      "reward": -0.20879247784614563,
+      "reward_std": 0.23861759901046753,
+      "rewards/cosine_scaled_reward/mean": -0.20879246294498444,
+      "rewards/cosine_scaled_reward/std": 0.39607998728752136,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1708.0,
+      "completions/mean_length": 1756.5,
+      "completions/mean_terminated_length": 1011.5555419921875,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "epoch": 0.054857142857142854,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27563413977622986,
+      "learning_rate": 9.399999999999999e-07,
+      "loss": -0.0,
+      "num_tokens": 6061423.0,
+      "reward": -0.16147920489311218,
+      "reward_std": 0.24055320024490356,
+      "rewards/cosine_scaled_reward/mean": -0.16147920489311218,
+      "rewards/cosine_scaled_reward/std": 0.3948959410190582,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1458.0,
+      "completions/mean_length": 1538.078125,
+      "completions/mean_terminated_length": 839.2963256835938,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.056,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27617642283439636,
+      "learning_rate": 9.6e-07,
+      "loss": -0.0,
+      "num_tokens": 6169924.0,
+      "reward": -0.18436825275421143,
+      "reward_std": 0.27141550183296204,
+      "rewards/cosine_scaled_reward/mean": -0.18436823785305023,
+      "rewards/cosine_scaled_reward/std": 0.3920196294784546,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1938.0,
+      "completions/mean_length": 1749.0625,
+      "completions/mean_terminated_length": 772.5333862304688,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.05714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23394836485385895,
+      "learning_rate": 9.8e-07,
+      "loss": 0.0,
+      "num_tokens": 6292680.0,
+      "reward": -0.10770958662033081,
+      "reward_std": 0.22513547539710999,
+      "rewards/cosine_scaled_reward/mean": -0.10770957916975021,
+      "rewards/cosine_scaled_reward/std": 0.421062707901001,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1482.25,
+      "completions/mean_terminated_length": 841.0667114257812,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "epoch": 0.05828571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3268967568874359,
+      "learning_rate": 1e-06,
+      "loss": -0.0,
+      "num_tokens": 6397752.0,
+      "reward": -0.09745607525110245,
+      "reward_std": 0.25210899114608765,
+      "rewards/cosine_scaled_reward/mean": -0.09745605289936066,
+      "rewards/cosine_scaled_reward/std": 0.3351369798183441,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1579.0,
+      "completions/mean_length": 1743.953125,
+      "completions/mean_terminated_length": 750.7333984375,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "epoch": 0.05942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2918722927570343,
+      "learning_rate": 9.999890338174275e-07,
+      "loss": -0.0,
+      "num_tokens": 6520717.0,
+      "reward": -0.1890830397605896,
+      "reward_std": 0.21916288137435913,
+      "rewards/cosine_scaled_reward/mean": -0.1890830546617508,
+      "rewards/cosine_scaled_reward/std": 0.32568052411079407,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1757.0,
+      "completions/mean_length": 1772.421875,
+      "completions/mean_terminated_length": 1010.5294189453125,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "epoch": 0.060571428571428575,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24523264169692993,
+      "learning_rate": 9.999561358041868e-07,
+      "loss": 0.0,
+      "num_tokens": 6644984.0,
+      "reward": -0.20969681441783905,
+      "reward_std": 0.1810423731803894,
+      "rewards/cosine_scaled_reward/mean": -0.20969681441783905,
+      "rewards/cosine_scaled_reward/std": 0.2371566891670227,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1961.0,
+      "completions/mean_length": 1838.859375,
+      "completions/mean_terminated_length": 1304.388916015625,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "epoch": 0.061714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23284469544887543,
+      "learning_rate": 9.999013075636804e-07,
+      "loss": 0.0,
+      "num_tokens": 6773815.0,
+      "reward": -0.06641622632741928,
+      "reward_std": 0.30815836787223816,
+      "rewards/cosine_scaled_reward/mean": -0.06641621887683868,
+      "rewards/cosine_scaled_reward/std": 0.46219584345817566,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1750.125,
+      "completions/mean_terminated_length": 856.5,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.06285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2651103734970093,
+      "learning_rate": 9.998245517681593e-07,
+      "loss": -0.0,
+      "num_tokens": 6896111.0,
+      "reward": -0.10750342905521393,
+      "reward_std": 0.2286185324192047,
+      "rewards/cosine_scaled_reward/mean": -0.10750342160463333,
+      "rewards/cosine_scaled_reward/std": 0.43372800946235657,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1840.078125,
+      "completions/mean_terminated_length": 1097.5,
+      "completions/min_length": 526.0,
+      "completions/min_terminated_length": 526.0,
+      "epoch": 0.064,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22967560589313507,
+      "learning_rate": 9.997258721585931e-07,
+      "loss": -0.0,
+      "num_tokens": 7024836.0,
+      "reward": -0.10045827925205231,
+      "reward_std": 0.2548004388809204,
+      "rewards/cosine_scaled_reward/mean": -0.10045827925205231,
+      "rewards/cosine_scaled_reward/std": 0.41444358229637146,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1810.0,
+      "completions/mean_length": 1991.1875,
+      "completions/mean_terminated_length": 1442.0,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "epoch": 0.06514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20479348301887512,
+      "learning_rate": 9.996052735444862e-07,
+      "loss": 0.0,
+      "num_tokens": 7163840.0,
+      "reward": -0.27901512384414673,
+      "reward_std": 0.2130473554134369,
+      "rewards/cosine_scaled_reward/mean": -0.27901512384414673,
+      "rewards/cosine_scaled_reward/std": 0.2583855092525482,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2023.0,
+      "completions/mean_length": 1617.421875,
+      "completions/mean_terminated_length": 1129.433349609375,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "epoch": 0.06628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2690146267414093,
+      "learning_rate": 9.994627618036452e-07,
+      "loss": -0.0,
+      "num_tokens": 7277451.0,
+      "reward": -0.04198366403579712,
+      "reward_std": 0.4036104083061218,
+      "rewards/cosine_scaled_reward/mean": -0.04198366031050682,
+      "rewards/cosine_scaled_reward/std": 0.5008736252784729,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1736.09375,
+      "completions/mean_terminated_length": 997.368408203125,
+      "completions/min_length": 478.0,
+      "completions/min_terminated_length": 478.0,
+      "epoch": 0.06742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2184475064277649,
+      "learning_rate": 9.992983438818915e-07,
+      "loss": -0.0,
+      "num_tokens": 7399025.0,
+      "reward": -0.1564982533454895,
+      "reward_std": 0.19560785591602325,
+      "rewards/cosine_scaled_reward/mean": -0.1564982533454895,
+      "rewards/cosine_scaled_reward/std": 0.3402426540851593,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1512.0,
+      "completions/mean_length": 1785.40625,
+      "completions/mean_terminated_length": 847.5714721679688,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "epoch": 0.06857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23538637161254883,
+      "learning_rate": 9.991120277927223e-07,
+      "loss": -0.0,
+      "num_tokens": 7524179.0,
+      "reward": -0.2697012424468994,
+      "reward_std": 0.17935499548912048,
+      "rewards/cosine_scaled_reward/mean": -0.2697012424468994,
+      "rewards/cosine_scaled_reward/std": 0.19757980108261108,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2046.0,
+      "completions/mean_length": 1884.484375,
+      "completions/mean_terminated_length": 1001.5,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "epoch": 0.06971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.225452721118927,
+      "learning_rate": 9.989038226169207e-07,
+      "loss": 0.0,
+      "num_tokens": 7656306.0,
+      "reward": -0.1635127067565918,
+      "reward_std": 0.1931447982788086,
+      "rewards/cosine_scaled_reward/mean": -0.1635127067565918,
+      "rewards/cosine_scaled_reward/std": 0.23563610017299652,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1994.0,
+      "completions/mean_length": 1739.46875,
+      "completions/mean_terminated_length": 1060.7000732421875,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "epoch": 0.07085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23771661520004272,
+      "learning_rate": 9.98673738502114e-07,
+      "loss": 0.0,
+      "num_tokens": 7777864.0,
+      "reward": -0.10127441585063934,
+      "reward_std": 0.2957979142665863,
+      "rewards/cosine_scaled_reward/mean": -0.10127442330121994,
+      "rewards/cosine_scaled_reward/std": 0.34053224325180054,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1965.0,
+      "completions/mean_length": 1522.953125,
+      "completions/mean_terminated_length": 1163.7105712890625,
+      "completions/min_length": 531.0,
+      "completions/min_terminated_length": 531.0,
+      "epoch": 0.072,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27804723381996155,
+      "learning_rate": 9.98421786662277e-07,
+      "loss": 0.0,
+      "num_tokens": 7885589.0,
+      "reward": -0.036153122782707214,
+      "reward_std": 0.3305097818374634,
+      "rewards/cosine_scaled_reward/mean": -0.03615312650799751,
+      "rewards/cosine_scaled_reward/std": 0.4355940818786621,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1558.0,
+      "completions/mean_length": 1760.390625,
+      "completions/mean_terminated_length": 1025.388916015625,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "epoch": 0.07314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2333846092224121,
+      "learning_rate": 9.981479793771866e-07,
+      "loss": -0.0,
+      "num_tokens": 8009206.0,
+      "reward": -0.14333069324493408,
+      "reward_std": 0.28757935762405396,
+      "rewards/cosine_scaled_reward/mean": -0.14333069324493408,
+      "rewards/cosine_scaled_reward/std": 0.41007620096206665,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1532.0,
+      "completions/mean_length": 1651.515625,
+      "completions/mean_terminated_length": 638.2777709960938,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "epoch": 0.07428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26348626613616943,
+      "learning_rate": 9.97852329991824e-07,
+      "loss": 0.0,
+      "num_tokens": 8125607.0,
+      "reward": -0.2117859125137329,
+      "reward_std": 0.15534773468971252,
+      "rewards/cosine_scaled_reward/mean": -0.2117859125137329,
+      "rewards/cosine_scaled_reward/std": 0.37395453453063965,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1350.0,
+      "completions/mean_length": 1254.125,
+      "completions/mean_terminated_length": 596.3428344726562,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.07542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33443817496299744,
+      "learning_rate": 9.975348529157229e-07,
+      "loss": 0.0,
+      "num_tokens": 8216103.0,
+      "reward": 0.028336994349956512,
+      "reward_std": 0.25119709968566895,
+      "rewards/cosine_scaled_reward/mean": 0.02833697199821472,
+      "rewards/cosine_scaled_reward/std": 0.4882389008998871,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.90625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1431.0,
+      "completions/mean_length": 1966.21875,
+      "completions/mean_terminated_length": 1175.666748046875,
+      "completions/min_length": 840.0,
+      "completions/min_terminated_length": 840.0,
+      "epoch": 0.07657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2199370563030243,
+      "learning_rate": 9.971955636222684e-07,
+      "loss": -0.0,
+      "num_tokens": 8352677.0,
+      "reward": -0.28747493028640747,
+      "reward_std": 0.15530282258987427,
+      "rewards/cosine_scaled_reward/mean": -0.28747493028640747,
+      "rewards/cosine_scaled_reward/std": 0.16220521926879883,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2024.0,
+      "completions/mean_length": 1357.109375,
+      "completions/mean_terminated_length": 747.5,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 0.07771428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3341590464115143,
+      "learning_rate": 9.968344786479415e-07,
+      "loss": -0.0,
+      "num_tokens": 8448788.0,
+      "reward": -0.06672946363687515,
+      "reward_std": 0.28790342807769775,
+      "rewards/cosine_scaled_reward/mean": -0.06672945618629456,
+      "rewards/cosine_scaled_reward/std": 0.35960128903388977,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1654.0,
+      "completions/mean_length": 1565.046875,
+      "completions/mean_terminated_length": 944.107177734375,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "epoch": 0.07885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.35159721970558167,
+      "learning_rate": 9.964516155915151e-07,
+      "loss": -0.0,
+      "num_tokens": 8559295.0,
+      "reward": -0.27992868423461914,
+      "reward_std": 0.20264248549938202,
+      "rewards/cosine_scaled_reward/mean": -0.27992868423461914,
+      "rewards/cosine_scaled_reward/std": 0.23891927301883698,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 1867.765625,
+      "completions/mean_terminated_length": 606.125,
+      "completions/min_length": 439.0,
+      "completions/min_terminated_length": 439.0,
+      "epoch": 0.08,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23989427089691162,
+      "learning_rate": 9.960469931131936e-07,
+      "loss": -0.0,
+      "num_tokens": 8690288.0,
+      "reward": -0.2498025894165039,
+      "reward_std": 0.15823513269424438,
+      "rewards/cosine_scaled_reward/mean": -0.2498025894165039,
+      "rewards/cosine_scaled_reward/std": 0.17978127300739288,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1908.0,
+      "completions/mean_length": 1669.125,
+      "completions/mean_terminated_length": 945.8182373046875,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.08114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.335510790348053,
+      "learning_rate": 9.956206309337066e-07,
+      "loss": -0.0,
+      "num_tokens": 8807832.0,
+      "reward": -0.1673138290643692,
+      "reward_std": 0.2547321915626526,
+      "rewards/cosine_scaled_reward/mean": -0.1673138290643692,
+      "rewards/cosine_scaled_reward/std": 0.39353805780410767,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1957.0,
+      "completions/mean_length": 1632.59375,
+      "completions/mean_terminated_length": 892.0869750976562,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "epoch": 0.08228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30721575021743774,
+      "learning_rate": 9.951725498333448e-07,
+      "loss": 0.0,
+      "num_tokens": 8922670.0,
+      "reward": -0.1493685096502304,
+      "reward_std": 0.23021411895751953,
+      "rewards/cosine_scaled_reward/mean": -0.1493685096502304,
+      "rewards/cosine_scaled_reward/std": 0.27729952335357666,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.953125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1852.0,
+      "completions/mean_length": 2020.59375,
+      "completions/mean_terminated_length": 1463.3333740234375,
+      "completions/min_length": 888.0,
+      "completions/min_terminated_length": 888.0,
+      "epoch": 0.08342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20856839418411255,
+      "learning_rate": 9.947027716509488e-07,
+      "loss": 0.0,
+      "num_tokens": 9062716.0,
+      "reward": -0.25696587562561035,
+      "reward_std": 0.19847074151039124,
+      "rewards/cosine_scaled_reward/mean": -0.25696590542793274,
+      "rewards/cosine_scaled_reward/std": 0.23918035626411438,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1957.0,
+      "completions/mean_length": 1926.984375,
+      "completions/mean_terminated_length": 1273.5,
+      "completions/min_length": 740.0,
+      "completions/min_terminated_length": 740.0,
+      "epoch": 0.08457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23241353034973145,
+      "learning_rate": 9.942113192828444e-07,
+      "loss": -0.0,
+      "num_tokens": 9195971.0,
+      "reward": -0.12904082238674164,
+      "reward_std": 0.23554545640945435,
+      "rewards/cosine_scaled_reward/mean": -0.12904080748558044,
+      "rewards/cosine_scaled_reward/std": 0.4280695915222168,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1677.0,
+      "completions/mean_length": 1868.890625,
+      "completions/mean_terminated_length": 1092.75,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "epoch": 0.08571428571428572,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.19846303761005402,
+      "learning_rate": 9.93698216681727e-07,
+      "loss": -0.0,
+      "num_tokens": 9326540.0,
+      "reward": -0.03926669806241989,
+      "reward_std": 0.2044709324836731,
+      "rewards/cosine_scaled_reward/mean": -0.039266690611839294,
+      "rewards/cosine_scaled_reward/std": 0.49658530950546265,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1963.0,
+      "completions/mean_length": 1805.296875,
+      "completions/mean_terminated_length": 1077.1875,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.08685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23998627066612244,
+      "learning_rate": 9.931634888554935e-07,
+      "loss": 0.0,
+      "num_tokens": 9452479.0,
+      "reward": -0.23065510392189026,
+      "reward_std": 0.17413878440856934,
+      "rewards/cosine_scaled_reward/mean": -0.23065511882305145,
+      "rewards/cosine_scaled_reward/std": 0.21896763145923615,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1871.0,
+      "completions/mean_length": 1857.328125,
+      "completions/mean_terminated_length": 1285.3125,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "epoch": 0.088,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20421437919139862,
+      "learning_rate": 9.926071618660237e-07,
+      "loss": 0.0,
+      "num_tokens": 9582924.0,
+      "reward": -0.17972718179225922,
+      "reward_std": 0.209285706281662,
+      "rewards/cosine_scaled_reward/mean": -0.17972716689109802,
+      "rewards/cosine_scaled_reward/std": 0.2716500163078308,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2001.0,
+      "completions/mean_length": 1883.921875,
+      "completions/mean_terminated_length": 1093.3636474609375,
+      "completions/min_length": 712.0,
+      "completions/min_terminated_length": 712.0,
+      "epoch": 0.08914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2156875878572464,
+      "learning_rate": 9.9202926282791e-07,
+      "loss": -0.0,
+      "num_tokens": 9714215.0,
+      "reward": -0.14897406101226807,
+      "reward_std": 0.2451157122850418,
+      "rewards/cosine_scaled_reward/mean": -0.14897406101226807,
+      "rewards/cosine_scaled_reward/std": 0.38884180784225464,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1878.0,
+      "completions/mean_length": 1507.65625,
+      "completions/mean_terminated_length": 767.1851806640625,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.09028571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29943305253982544,
+      "learning_rate": 9.91429819907136e-07,
+      "loss": -0.0,
+      "num_tokens": 9820801.0,
+      "reward": -0.17114077508449554,
+      "reward_std": 0.23199111223220825,
+      "rewards/cosine_scaled_reward/mean": -0.17114077508449554,
+      "rewards/cosine_scaled_reward/std": 0.3217289447784424,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2007.0,
+      "completions/mean_length": 1976.125,
+      "completions/mean_terminated_length": 1536.888916015625,
+      "completions/min_length": 655.0,
+      "completions/min_terminated_length": 655.0,
+      "epoch": 0.09142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26230743527412415,
+      "learning_rate": 9.908088623197048e-07,
+      "loss": 0.0,
+      "num_tokens": 9957665.0,
+      "reward": -0.21115826070308685,
+      "reward_std": 0.2435196340084076,
+      "rewards/cosine_scaled_reward/mean": -0.21115827560424805,
+      "rewards/cosine_scaled_reward/std": 0.28258123993873596,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2042.0,
+      "completions/mean_length": 1779.28125,
+      "completions/mean_terminated_length": 901.4667358398438,
+      "completions/min_length": 320.0,
+      "completions/min_terminated_length": 320.0,
+      "epoch": 0.09257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33359771966934204,
+      "learning_rate": 9.901664203302124e-07,
+      "loss": 0.0,
+      "num_tokens": 10082811.0,
+      "reward": -0.1508273482322693,
+      "reward_std": 0.2594776749610901,
+      "rewards/cosine_scaled_reward/mean": -0.1508273482322693,
+      "rewards/cosine_scaled_reward/std": 0.33812451362609863,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1831.0,
+      "completions/mean_length": 1711.609375,
+      "completions/mean_terminated_length": 851.9444580078125,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "epoch": 0.09371428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2805767059326172,
+      "learning_rate": 9.895025252503755e-07,
+      "loss": -0.0,
+      "num_tokens": 10202682.0,
+      "reward": -0.11850972473621368,
+      "reward_std": 0.2631937861442566,
+      "rewards/cosine_scaled_reward/mean": -0.11850972473621368,
+      "rewards/cosine_scaled_reward/std": 0.4419197142124176,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1925.0,
+      "completions/mean_length": 1749.984375,
+      "completions/mean_terminated_length": 1044.157958984375,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "epoch": 0.09485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3109220266342163,
+      "learning_rate": 9.888172094375033e-07,
+      "loss": -0.0,
+      "num_tokens": 10325769.0,
+      "reward": -0.10190614312887192,
+      "reward_std": 0.2739119529724121,
+      "rewards/cosine_scaled_reward/mean": -0.10190614312887192,
+      "rewards/cosine_scaled_reward/std": 0.39238420128822327,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1756.0,
+      "completions/mean_length": 1800.390625,
+      "completions/mean_terminated_length": 829.0000610351562,
+      "completions/min_length": 420.0,
+      "completions/min_terminated_length": 420.0,
+      "epoch": 0.096,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23385629057884216,
+      "learning_rate": 9.881105062929221e-07,
+      "loss": 0.0,
+      "num_tokens": 10451690.0,
+      "reward": -0.21778321266174316,
+      "reward_std": 0.25428956747055054,
+      "rewards/cosine_scaled_reward/mean": -0.21778322756290436,
+      "rewards/cosine_scaled_reward/std": 0.30295974016189575,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1842.0,
+      "completions/mean_length": 1870.46875,
+      "completions/mean_terminated_length": 1337.875,
+      "completions/min_length": 867.0,
+      "completions/min_terminated_length": 867.0,
+      "epoch": 0.09714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21526271104812622,
+      "learning_rate": 9.873824502603459e-07,
+      "loss": -0.0,
+      "num_tokens": 10581720.0,
+      "reward": -0.19906702637672424,
+      "reward_std": 0.23402772843837738,
+      "rewards/cosine_scaled_reward/mean": -0.19906699657440186,
+      "rewards/cosine_scaled_reward/std": 0.28999006748199463,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1369.0,
+      "completions/mean_length": 1734.875,
+      "completions/mean_terminated_length": 795.5,
+      "completions/min_length": 581.0,
+      "completions/min_terminated_length": 581.0,
+      "epoch": 0.09828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24285966157913208,
+      "learning_rate": 9.866330768241983e-07,
+      "loss": 0.0,
+      "num_tokens": 10703608.0,
+      "reward": -0.16528445482254028,
+      "reward_std": 0.2592755854129791,
+      "rewards/cosine_scaled_reward/mean": -0.16528445482254028,
+      "rewards/cosine_scaled_reward/std": 0.37110546231269836,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1626.0,
+      "completions/mean_length": 1577.921875,
+      "completions/mean_terminated_length": 973.5357666015625,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "epoch": 0.09942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30273520946502686,
+      "learning_rate": 9.85862422507884e-07,
+      "loss": -0.0,
+      "num_tokens": 10814715.0,
+      "reward": -0.20241931080818176,
+      "reward_std": 0.2693288326263428,
+      "rewards/cosine_scaled_reward/mean": -0.20241928100585938,
+      "rewards/cosine_scaled_reward/std": 0.33345305919647217,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1948.0,
+      "completions/mean_length": 1680.546875,
+      "completions/mean_terminated_length": 1068.125,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "epoch": 0.10057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2649252116680145,
+      "learning_rate": 9.850705248720068e-07,
+      "loss": -0.0,
+      "num_tokens": 10932782.0,
+      "reward": -0.018871163949370384,
+      "reward_std": 0.3073042631149292,
+      "rewards/cosine_scaled_reward/mean": -0.018871165812015533,
+      "rewards/cosine_scaled_reward/std": 0.3826298415660858,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1754.0,
+      "completions/mean_length": 1683.703125,
+      "completions/mean_terminated_length": 1151.269287109375,
+      "completions/min_length": 667.0,
+      "completions/min_terminated_length": 667.0,
+      "epoch": 0.10171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24950510263442993,
+      "learning_rate": 9.8425742251254e-07,
+      "loss": -0.0,
+      "num_tokens": 11051539.0,
+      "reward": -0.11818082630634308,
+      "reward_std": 0.2949528694152832,
+      "rewards/cosine_scaled_reward/mean": -0.11818082630634308,
+      "rewards/cosine_scaled_reward/std": 0.34418320655822754,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1958.0,
+      "completions/mean_length": 1558.546875,
+      "completions/mean_terminated_length": 967.8275756835938,
+      "completions/min_length": 377.0,
+      "completions/min_terminated_length": 377.0,
+      "epoch": 0.10285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.36593058705329895,
+      "learning_rate": 9.83423155058946e-07,
+      "loss": 0.0,
+      "num_tokens": 11161286.0,
+      "reward": -0.26082760095596313,
+      "reward_std": 0.1802712082862854,
+      "rewards/cosine_scaled_reward/mean": -0.26082760095596313,
+      "rewards/cosine_scaled_reward/std": 0.2037661075592041,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.765625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1505.0,
+      "completions/mean_length": 1827.9375,
+      "completions/mean_terminated_length": 1109.0667724609375,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "epoch": 0.104,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24167831242084503,
+      "learning_rate": 9.825677631722435e-07,
+      "loss": 0.0,
+      "num_tokens": 11288842.0,
+      "reward": -0.11456942558288574,
+      "reward_std": 0.26296502351760864,
+      "rewards/cosine_scaled_reward/mean": -0.11456942558288574,
+      "rewards/cosine_scaled_reward/std": 0.3274599611759186,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1931.0,
+      "completions/mean_length": 1581.546875,
+      "completions/mean_terminated_length": 899.8077392578125,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "epoch": 0.10514285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2570616602897644,
+      "learning_rate": 9.816912885430258e-07,
+      "loss": 0.0,
+      "num_tokens": 11400053.0,
+      "reward": -0.17942462861537933,
+      "reward_std": 0.2633644640445709,
+      "rewards/cosine_scaled_reward/mean": -0.17942462861537933,
+      "rewards/cosine_scaled_reward/std": 0.30215632915496826,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.96875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1562.0,
+      "completions/mean_length": 2022.328125,
+      "completions/mean_terminated_length": 1226.5,
+      "completions/min_length": 891.0,
+      "completions/min_terminated_length": 891.0,
+      "epoch": 0.10628571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25331902503967285,
+      "learning_rate": 9.807937738894303e-07,
+      "loss": 0.0,
+      "num_tokens": 11540826.0,
+      "reward": -0.26418450474739075,
+      "reward_std": 0.1380012035369873,
+      "rewards/cosine_scaled_reward/mean": -0.26418450474739075,
+      "rewards/cosine_scaled_reward/std": 0.17390060424804688,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1702.0,
+      "completions/mean_length": 1769.546875,
+      "completions/mean_terminated_length": 934.1875,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "epoch": 0.10742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29503753781318665,
+      "learning_rate": 9.798752629550546e-07,
+      "loss": 0.0,
+      "num_tokens": 11663845.0,
+      "reward": -0.08299511671066284,
+      "reward_std": 0.18226617574691772,
+      "rewards/cosine_scaled_reward/mean": -0.08299513161182404,
+      "rewards/cosine_scaled_reward/std": 0.46436113119125366,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.96875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1300.0,
+      "completions/mean_length": 2021.5,
+      "completions/mean_terminated_length": 1200.0,
+      "completions/min_length": 1100.0,
+      "completions/min_terminated_length": 1100.0,
+      "epoch": 0.10857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.20416001975536346,
+      "learning_rate": 9.78935800506826e-07,
+      "loss": -0.0,
+      "num_tokens": 11803749.0,
+      "reward": -0.22345861792564392,
+      "reward_std": 0.18781372904777527,
+      "rewards/cosine_scaled_reward/mean": -0.22345861792564392,
+      "rewards/cosine_scaled_reward/std": 0.24531956017017365,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1440.0,
+      "completions/mean_length": 1582.890625,
+      "completions/mean_terminated_length": 903.1154174804688,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.10971428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2593792974948883,
+      "learning_rate": 9.779754323328192e-07,
+      "loss": -0.0,
+      "num_tokens": 11916190.0,
+      "reward": 0.00020215287804603577,
+      "reward_std": 0.24673128128051758,
+      "rewards/cosine_scaled_reward/mean": 0.00020216405391693115,
+      "rewards/cosine_scaled_reward/std": 0.49432000517845154,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1972.0,
+      "completions/mean_length": 1748.859375,
+      "completions/mean_terminated_length": 1177.772705078125,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "epoch": 0.11085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2480001151561737,
+      "learning_rate": 9.769942052400235e-07,
+      "loss": 0.0,
+      "num_tokens": 12038381.0,
+      "reward": -0.19425566494464874,
+      "reward_std": 0.21240204572677612,
+      "rewards/cosine_scaled_reward/mean": -0.19425567984580994,
+      "rewards/cosine_scaled_reward/std": 0.29181501269340515,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1984.0,
+      "completions/mean_length": 1632.171875,
+      "completions/mean_terminated_length": 1062.3333740234375,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "epoch": 0.112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2797771692276001,
+      "learning_rate": 9.759921670520634e-07,
+      "loss": -0.0,
+      "num_tokens": 12153904.0,
+      "reward": -0.11104464530944824,
+      "reward_std": 0.2755987048149109,
+      "rewards/cosine_scaled_reward/mean": -0.11104465276002884,
+      "rewards/cosine_scaled_reward/std": 0.4012855887413025,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 1651.078125,
+      "completions/mean_terminated_length": 553.7058715820312,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.11314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3114299476146698,
+      "learning_rate": 9.749693666068663e-07,
+      "loss": -0.0,
+      "num_tokens": 12270741.0,
+      "reward": -0.1317199319601059,
+      "reward_std": 0.14237020909786224,
+      "rewards/cosine_scaled_reward/mean": -0.1317199319601059,
+      "rewards/cosine_scaled_reward/std": 0.3707720935344696,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.546875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2034.0,
+      "completions/mean_length": 1544.765625,
+      "completions/mean_terminated_length": 937.413818359375,
+      "completions/min_length": 457.0,
+      "completions/min_terminated_length": 457.0,
+      "epoch": 0.11428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2654109001159668,
+      "learning_rate": 9.739258537542835e-07,
+      "loss": 0.0,
+      "num_tokens": 12379318.0,
+      "reward": -0.018167953938245773,
+      "reward_std": 0.29768484830856323,
+      "rewards/cosine_scaled_reward/mean": -0.01816795952618122,
+      "rewards/cosine_scaled_reward/std": 0.44200995564460754,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2036.0,
+      "completions/mean_length": 1647.421875,
+      "completions/mean_terminated_length": 979.7916870117188,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "epoch": 0.11542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2877754867076874,
+      "learning_rate": 9.728616793536587e-07,
+      "loss": 0.0,
+      "num_tokens": 12496185.0,
+      "reward": -0.10053972899913788,
+      "reward_std": 0.28722673654556274,
+      "rewards/cosine_scaled_reward/mean": -0.10053973644971848,
+      "rewards/cosine_scaled_reward/std": 0.36782190203666687,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2029.0,
+      "completions/mean_length": 1371.484375,
+      "completions/mean_terminated_length": 937.8204956054688,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "epoch": 0.11657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30472108721733093,
+      "learning_rate": 9.717768952713511e-07,
+      "loss": 0.0,
+      "num_tokens": 12594112.0,
+      "reward": -0.20305150747299194,
+      "reward_std": 0.23292692005634308,
+      "rewards/cosine_scaled_reward/mean": -0.20305150747299194,
+      "rewards/cosine_scaled_reward/std": 0.3213489055633545,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1874.0,
+      "completions/mean_length": 1737.984375,
+      "completions/mean_terminated_length": 807.9375,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.11771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27034303545951843,
+      "learning_rate": 9.706715543782064e-07,
+      "loss": 0.0,
+      "num_tokens": 12715695.0,
+      "reward": -0.29003486037254333,
+      "reward_std": 0.21371816098690033,
+      "rewards/cosine_scaled_reward/mean": -0.29003486037254333,
+      "rewards/cosine_scaled_reward/std": 0.224824920296669,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1692.0,
+      "completions/mean_length": 1705.28125,
+      "completions/mean_terminated_length": 893.5789794921875,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "epoch": 0.11885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27687934041023254,
+      "learning_rate": 9.695457105469804e-07,
+      "loss": -0.0,
+      "num_tokens": 12835297.0,
+      "reward": -0.15606051683425903,
+      "reward_std": 0.18938840925693512,
+      "rewards/cosine_scaled_reward/mean": -0.15606051683425903,
+      "rewards/cosine_scaled_reward/std": 0.24088984727859497,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1432.0,
+      "completions/mean_length": 1401.015625,
+      "completions/mean_terminated_length": 830.1470336914062,
+      "completions/min_length": 480.0,
+      "completions/min_terminated_length": 480.0,
+      "epoch": 0.12,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2770017087459564,
+      "learning_rate": 9.683994186497132e-07,
+      "loss": 0.0,
+      "num_tokens": 12936250.0,
+      "reward": 0.018527541309595108,
+      "reward_std": 0.36475759744644165,
+      "rewards/cosine_scaled_reward/mean": 0.018527545034885406,
+      "rewards/cosine_scaled_reward/std": 0.4995051920413971,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.515625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1944.0,
+      "completions/mean_length": 1438.453125,
+      "completions/mean_terminated_length": 789.5806274414062,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "epoch": 0.12114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26982930302619934,
+      "learning_rate": 9.672327345550543e-07,
+      "loss": 0.0,
+      "num_tokens": 13039143.0,
+      "reward": 0.07083749771118164,
+      "reward_std": 0.29650557041168213,
+      "rewards/cosine_scaled_reward/mean": 0.07083749771118164,
+      "rewards/cosine_scaled_reward/std": 0.5094331502914429,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1722.0,
+      "completions/mean_length": 1787.09375,
+      "completions/mean_terminated_length": 1065.7647705078125,
+      "completions/min_length": 652.0,
+      "completions/min_terminated_length": 652.0,
+      "epoch": 0.12228571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26255276799201965,
+      "learning_rate": 9.66045715125541e-07,
+      "loss": 0.0,
+      "num_tokens": 13164669.0,
+      "reward": -0.2222379446029663,
+      "reward_std": 0.240003302693367,
+      "rewards/cosine_scaled_reward/mean": -0.2222379446029663,
+      "rewards/cosine_scaled_reward/std": 0.29153531789779663,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1802.0,
+      "completions/mean_length": 1660.96875,
+      "completions/mean_terminated_length": 1095.3077392578125,
+      "completions/min_length": 544.0,
+      "completions/min_terminated_length": 544.0,
+      "epoch": 0.12342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30773183703422546,
+      "learning_rate": 9.648384182148252e-07,
+      "loss": -0.0,
+      "num_tokens": 13281331.0,
+      "reward": -0.21352165937423706,
+      "reward_std": 0.3123124837875366,
+      "rewards/cosine_scaled_reward/mean": -0.21352165937423706,
+      "rewards/cosine_scaled_reward/std": 0.3453315496444702,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.796875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1982.0,
+      "completions/mean_length": 1858.921875,
+      "completions/mean_terminated_length": 1117.1539306640625,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.12457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24000757932662964,
+      "learning_rate": 9.636109026648554e-07,
+      "loss": 0.0,
+      "num_tokens": 13411550.0,
+      "reward": -0.13601753115653992,
+      "reward_std": 0.1500597596168518,
+      "rewards/cosine_scaled_reward/mean": -0.1360175609588623,
+      "rewards/cosine_scaled_reward/std": 0.42859947681427,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1999.0,
+      "completions/mean_length": 1563.90625,
+      "completions/mean_terminated_length": 900.5185546875,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "epoch": 0.12571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31709614396095276,
+      "learning_rate": 9.623632283030077e-07,
+      "loss": 0.0,
+      "num_tokens": 13522416.0,
+      "reward": -0.28067731857299805,
+      "reward_std": 0.1671288013458252,
+      "rewards/cosine_scaled_reward/mean": -0.28067731857299805,
+      "rewards/cosine_scaled_reward/std": 0.21458736062049866,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1686.0,
+      "completions/mean_length": 1724.71875,
+      "completions/mean_terminated_length": 1013.5,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "epoch": 0.12685714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2646999657154083,
+      "learning_rate": 9.610954559391704e-07,
+      "loss": -0.0,
+      "num_tokens": 13642918.0,
+      "reward": -0.11896095424890518,
+      "reward_std": 0.28121650218963623,
+      "rewards/cosine_scaled_reward/mean": -0.11896096169948578,
+      "rewards/cosine_scaled_reward/std": 0.37855637073516846,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.84375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 1918.0,
+      "completions/mean_terminated_length": 1216.0,
+      "completions/min_length": 694.0,
+      "completions/min_terminated_length": 694.0,
+      "epoch": 0.128,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.22158586978912354,
+      "learning_rate": 9.598076473627796e-07,
+      "loss": 0.0,
+      "num_tokens": 13776766.0,
+      "reward": -0.1688530147075653,
+      "reward_std": 0.2535978853702545,
+      "rewards/cosine_scaled_reward/mean": -0.1688530296087265,
+      "rewards/cosine_scaled_reward/std": 0.3341792821884155,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1220.0,
+      "completions/mean_length": 1556.125,
+      "completions/mean_terminated_length": 837.2307739257812,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "epoch": 0.12914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2930087745189667,
+      "learning_rate": 9.58499865339809e-07,
+      "loss": -0.0,
+      "num_tokens": 13886654.0,
+      "reward": -0.10367631912231445,
+      "reward_std": 0.30835023522377014,
+      "rewards/cosine_scaled_reward/mean": -0.10367631912231445,
+      "rewards/cosine_scaled_reward/std": 0.42973947525024414,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1827.0,
+      "completions/mean_length": 1521.9375,
+      "completions/mean_terminated_length": 753.0769653320312,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.13028571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3062143921852112,
+      "learning_rate": 9.571721736097088e-07,
+      "loss": -0.0,
+      "num_tokens": 13993906.0,
+      "reward": -0.22209212183952332,
+      "reward_std": 0.2074735462665558,
+      "rewards/cosine_scaled_reward/mean": -0.22209212183952332,
+      "rewards/cosine_scaled_reward/std": 0.29088398814201355,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1803.0,
+      "completions/mean_length": 1714.578125,
+      "completions/mean_terminated_length": 1031.857177734375,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "epoch": 0.13142857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2564532160758972,
+      "learning_rate": 9.55824636882301e-07,
+      "loss": -0.0,
+      "num_tokens": 14114855.0,
+      "reward": -0.10947269201278687,
+      "reward_std": 0.30371129512786865,
+      "rewards/cosine_scaled_reward/mean": -0.10947269946336746,
+      "rewards/cosine_scaled_reward/std": 0.41030505299568176,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 558.0,
+      "completions/mean_length": 1877.90625,
+      "completions/mean_terminated_length": 492.857177734375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "epoch": 0.13257142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25748053193092346,
+      "learning_rate": 9.54457320834625e-07,
+      "loss": -0.0,
+      "num_tokens": 14246425.0,
+      "reward": -0.19163870811462402,
+      "reward_std": 0.21010378003120422,
+      "rewards/cosine_scaled_reward/mean": -0.19163869321346283,
+      "rewards/cosine_scaled_reward/std": 0.3049132525920868,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.859375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1951.0,
+      "completions/mean_length": 1922.546875,
+      "completions/mean_terminated_length": 1155.888916015625,
+      "completions/min_length": 816.0,
+      "completions/min_terminated_length": 816.0,
+      "epoch": 0.1337142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24102462828159332,
+      "learning_rate": 9.530702921077358e-07,
+      "loss": -0.0,
+      "num_tokens": 14380492.0,
+      "reward": -0.21347489953041077,
+      "reward_std": 0.19724325835704803,
+      "rewards/cosine_scaled_reward/mean": -0.21347489953041077,
+      "rewards/cosine_scaled_reward/std": 0.2647304832935333,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1965.0,
+      "completions/mean_length": 1793.546875,
+      "completions/mean_terminated_length": 1233.75,
+      "completions/min_length": 459.0,
+      "completions/min_terminated_length": 459.0,
+      "epoch": 0.13485714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2439616322517395,
+      "learning_rate": 9.516636183034564e-07,
+      "loss": -0.0,
+      "num_tokens": 14505815.0,
+      "reward": -0.08845303952693939,
+      "reward_std": 0.30429399013519287,
+      "rewards/cosine_scaled_reward/mean": -0.08845303952693939,
+      "rewards/cosine_scaled_reward/std": 0.4648522734642029,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2028.0,
+      "completions/mean_length": 1405.15625,
+      "completions/mean_terminated_length": 936.0540771484375,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "epoch": 0.136,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32119110226631165,
+      "learning_rate": 9.502373679810839e-07,
+      "loss": 0.0,
+      "num_tokens": 14606153.0,
+      "reward": -0.04571840912103653,
+      "reward_std": 0.3056246340274811,
+      "rewards/cosine_scaled_reward/mean": -0.04571840912103653,
+      "rewards/cosine_scaled_reward/std": 0.49307262897491455,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2044.0,
+      "completions/mean_length": 1321.40625,
+      "completions/mean_terminated_length": 940.8095703125,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.13714285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3139563500881195,
+      "learning_rate": 9.487916106540465e-07,
+      "loss": 0.0,
+      "num_tokens": 14701179.0,
+      "reward": -0.12771092355251312,
+      "reward_std": 0.3157998323440552,
+      "rewards/cosine_scaled_reward/mean": -0.12771093845367432,
+      "rewards/cosine_scaled_reward/std": 0.4336044490337372,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1024.5,
+      "completions/mean_terminated_length": 812.0755004882812,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 0.1382857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3428559899330139,
+      "learning_rate": 9.473264167865171e-07,
+      "loss": 0.0,
+      "num_tokens": 14776443.0,
+      "reward": -0.004689367488026619,
+      "reward_std": 0.297618567943573,
+      "rewards/cosine_scaled_reward/mean": -0.004689373075962067,
+      "rewards/cosine_scaled_reward/std": 0.46961408853530884,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2006.0,
+      "completions/mean_length": 1790.765625,
+      "completions/mean_terminated_length": 1133.388916015625,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "epoch": 0.13942857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29122933745384216,
+      "learning_rate": 9.458418577899774e-07,
+      "loss": -0.0,
+      "num_tokens": 14902612.0,
+      "reward": -0.11110783368349075,
+      "reward_std": 0.22664329409599304,
+      "rewards/cosine_scaled_reward/mean": -0.11110783368349075,
+      "rewards/cosine_scaled_reward/std": 0.3362382650375366,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1988.0,
+      "completions/mean_length": 1658.46875,
+      "completions/mean_terminated_length": 1124.6666259765625,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "epoch": 0.14057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2646903693675995,
+      "learning_rate": 9.443380060197385e-07,
+      "loss": 0.0,
+      "num_tokens": 15018986.0,
+      "reward": -0.20784568786621094,
+      "reward_std": 0.270358681678772,
+      "rewards/cosine_scaled_reward/mean": -0.20784570276737213,
+      "rewards/cosine_scaled_reward/std": 0.35689592361450195,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1414.0,
+      "completions/mean_length": 1494.9375,
+      "completions/mean_terminated_length": 868.1333618164062,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "epoch": 0.1417142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26702970266342163,
+      "learning_rate": 9.428149347714143e-07,
+      "loss": -0.0,
+      "num_tokens": 15125614.0,
+      "reward": -0.160624697804451,
+      "reward_std": 0.23646026849746704,
+      "rewards/cosine_scaled_reward/mean": -0.160624697804451,
+      "rewards/cosine_scaled_reward/std": 0.4083607792854309,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1245.0,
+      "completions/mean_length": 1608.609375,
+      "completions/mean_terminated_length": 825.3478393554688,
+      "completions/min_length": 495.0,
+      "completions/min_terminated_length": 495.0,
+      "epoch": 0.14285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2753336727619171,
+      "learning_rate": 9.412727182773486e-07,
+      "loss": 0.0,
+      "num_tokens": 15239493.0,
+      "reward": -0.008194006979465485,
+      "reward_std": 0.21567228436470032,
+      "rewards/cosine_scaled_reward/mean": -0.008194014430046082,
+      "rewards/cosine_scaled_reward/std": 0.463446706533432,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.703125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1953.0,
+      "completions/mean_length": 1759.484375,
+      "completions/mean_terminated_length": 1076.157958984375,
+      "completions/min_length": 527.0,
+      "completions/min_terminated_length": 527.0,
+      "epoch": 0.144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24985821545124054,
+      "learning_rate": 9.397114317029974e-07,
+      "loss": 0.0,
+      "num_tokens": 15363396.0,
+      "reward": -0.16068750619888306,
+      "reward_std": 0.22599664330482483,
+      "rewards/cosine_scaled_reward/mean": -0.16068752110004425,
+      "rewards/cosine_scaled_reward/std": 0.304392009973526,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1657.0,
+      "completions/mean_length": 1842.859375,
+      "completions/mean_terminated_length": 1110.21435546875,
+      "completions/min_length": 581.0,
+      "completions/min_terminated_length": 581.0,
+      "epoch": 0.14514285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21972927451133728,
+      "learning_rate": 9.381311511432658e-07,
+      "loss": -0.0,
+      "num_tokens": 15492435.0,
+      "reward": -0.29198482632637024,
+      "reward_std": 0.17300401628017426,
+      "rewards/cosine_scaled_reward/mean": -0.29198482632637024,
+      "rewards/cosine_scaled_reward/std": 0.21628034114837646,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2016.0,
+      "completions/mean_length": 1694.578125,
+      "completions/mean_terminated_length": 1064.565185546875,
+      "completions/min_length": 338.0,
+      "completions/min_terminated_length": 338.0,
+      "epoch": 0.1462857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24753950536251068,
+      "learning_rate": 9.36531953618799e-07,
+      "loss": -0.0,
+      "num_tokens": 15611240.0,
+      "reward": 0.04859344661235809,
+      "reward_std": 0.31105202436447144,
+      "rewards/cosine_scaled_reward/mean": 0.04859344661235809,
+      "rewards/cosine_scaled_reward/std": 0.4569285809993744,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1540.0,
+      "completions/mean_length": 1917.5625,
+      "completions/mean_terminated_length": 1004.5,
+      "completions/min_length": 651.0,
+      "completions/min_terminated_length": 651.0,
+      "epoch": 0.14742857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23949742317199707,
+      "learning_rate": 9.34913917072228e-07,
+      "loss": 0.0,
+      "num_tokens": 15744668.0,
+      "reward": -0.27834638953208923,
+      "reward_std": 0.16836056113243103,
+      "rewards/cosine_scaled_reward/mean": -0.27834638953208923,
+      "rewards/cosine_scaled_reward/std": 0.20021934807300568,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.71875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1502.0,
+      "completions/mean_length": 1725.796875,
+      "completions/mean_terminated_length": 902.388916015625,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "epoch": 0.14857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23063035309314728,
+      "learning_rate": 9.332771203643714e-07,
+      "loss": -0.0,
+      "num_tokens": 15865623.0,
+      "reward": -0.19732065498828888,
+      "reward_std": 0.19462591409683228,
+      "rewards/cosine_scaled_reward/mean": -0.19732065498828888,
+      "rewards/cosine_scaled_reward/std": 0.2627345323562622,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2029.0,
+      "completions/mean_length": 1640.8125,
+      "completions/mean_terminated_length": 863.45458984375,
+      "completions/min_length": 259.0,
+      "completions/min_terminated_length": 259.0,
+      "epoch": 0.14971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29630133509635925,
+      "learning_rate": 9.316216432703916e-07,
+      "loss": 0.0,
+      "num_tokens": 15980371.0,
+      "reward": -0.07768938690423965,
+      "reward_std": 0.2543257176876068,
+      "rewards/cosine_scaled_reward/mean": -0.07768939435482025,
+      "rewards/cosine_scaled_reward/std": 0.4248148798942566,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1424.0,
+      "completions/mean_length": 1570.890625,
+      "completions/mean_terminated_length": 826.5999755859375,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "epoch": 0.15085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2735442817211151,
+      "learning_rate": 9.299475664759068e-07,
+      "loss": -0.0,
+      "num_tokens": 16091972.0,
+      "reward": -0.1057564914226532,
+      "reward_std": 0.32137495279312134,
+      "rewards/cosine_scaled_reward/mean": -0.105756476521492,
+      "rewards/cosine_scaled_reward/std": 0.4788062870502472,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.890625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1941.0,
+      "completions/mean_length": 1971.34375,
+      "completions/mean_terminated_length": 1347.1429443359375,
+      "completions/min_length": 681.0,
+      "completions/min_terminated_length": 681.0,
+      "epoch": 0.152,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23230750858783722,
+      "learning_rate": 9.282549715730579e-07,
+      "loss": 0.0,
+      "num_tokens": 16229362.0,
+      "reward": -0.21333375573158264,
+      "reward_std": 0.1880394071340561,
+      "rewards/cosine_scaled_reward/mean": -0.21333375573158264,
+      "rewards/cosine_scaled_reward/std": 0.2557979226112366,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1970.0,
+      "completions/mean_length": 1682.984375,
+      "completions/mean_terminated_length": 1113.5599365234375,
+      "completions/min_length": 576.0,
+      "completions/min_terminated_length": 576.0,
+      "epoch": 0.15314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2776358127593994,
+      "learning_rate": 9.265439410565328e-07,
+      "loss": 0.0,
+      "num_tokens": 16347641.0,
+      "reward": -0.07218431681394577,
+      "reward_std": 0.19744814932346344,
+      "rewards/cosine_scaled_reward/mean": -0.07218432426452637,
+      "rewards/cosine_scaled_reward/std": 0.41042155027389526,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1852.0,
+      "completions/mean_length": 1269.171875,
+      "completions/mean_terminated_length": 736.2894897460938,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 0.15428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30510956048965454,
+      "learning_rate": 9.248145583195447e-07,
+      "loss": 0.0,
+      "num_tokens": 16439340.0,
+      "reward": 0.1377476304769516,
+      "reward_std": 0.25976449251174927,
+      "rewards/cosine_scaled_reward/mean": 0.1377476155757904,
+      "rewards/cosine_scaled_reward/std": 0.4923737347126007,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1461.0,
+      "completions/mean_length": 1624.203125,
+      "completions/mean_terminated_length": 917.875,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "epoch": 0.15542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25474753975868225,
+      "learning_rate": 9.230669076497687e-07,
+      "loss": 0.0,
+      "num_tokens": 16553961.0,
+      "reward": -0.04156734049320221,
+      "reward_std": 0.27987948060035706,
+      "rewards/cosine_scaled_reward/mean": -0.04156734049320221,
+      "rewards/cosine_scaled_reward/std": 0.4557124078273773,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.78125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2003.0,
+      "completions/mean_length": 1832.625,
+      "completions/mean_terminated_length": 1063.4285888671875,
+      "completions/min_length": 600.0,
+      "completions/min_terminated_length": 600.0,
+      "epoch": 0.15657142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2200661152601242,
+      "learning_rate": 9.213010742252327e-07,
+      "loss": -0.0,
+      "num_tokens": 16681857.0,
+      "reward": -0.2795522212982178,
+      "reward_std": 0.16735097765922546,
+      "rewards/cosine_scaled_reward/mean": -0.2795522212982178,
+      "rewards/cosine_scaled_reward/std": 0.22360830008983612,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1783.0,
+      "completions/mean_length": 1547.90625,
+      "completions/mean_terminated_length": 981.1333618164062,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "epoch": 0.15771428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.247065007686615,
+      "learning_rate": 9.195171441101668e-07,
+      "loss": 0.0,
+      "num_tokens": 16792235.0,
+      "reward": -0.1421782374382019,
+      "reward_std": 0.25017279386520386,
+      "rewards/cosine_scaled_reward/mean": -0.1421782374382019,
+      "rewards/cosine_scaled_reward/std": 0.3903765082359314,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.828125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1946.0,
+      "completions/mean_length": 1898.375,
+      "completions/mean_terminated_length": 1177.45458984375,
+      "completions/min_length": 812.0,
+      "completions/min_terminated_length": 812.0,
+      "epoch": 0.15885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25471416115760803,
+      "learning_rate": 9.177152042508077e-07,
+      "loss": 0.0,
+      "num_tokens": 16924371.0,
+      "reward": -0.24234679341316223,
+      "reward_std": 0.15713179111480713,
+      "rewards/cosine_scaled_reward/mean": -0.24234679341316223,
+      "rewards/cosine_scaled_reward/std": 0.17467617988586426,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1927.0,
+      "completions/mean_length": 1577.625,
+      "completions/mean_terminated_length": 1044.533447265625,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "epoch": 0.16,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.2628695070743561,
+      "learning_rate": 9.158953424711624e-07,
+      "loss": -0.0,
+      "num_tokens": 17035563.0,
+      "reward": -0.12413343787193298,
+      "reward_std": 0.20063763856887817,
+      "rewards/cosine_scaled_reward/mean": -0.12413343787193298,
+      "rewards/cosine_scaled_reward/std": 0.5006609559059143,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1857.0,
+      "completions/mean_length": 1405.125,
+      "completions/mean_terminated_length": 993.025634765625,
+      "completions/min_length": 586.0,
+      "completions/min_terminated_length": 586.0,
+      "epoch": 0.16114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2682877779006958,
+      "learning_rate": 9.140576474687263e-07,
+      "loss": -0.0,
+      "num_tokens": 17136051.0,
+      "reward": -0.02423717826604843,
+      "reward_std": 0.2661462128162384,
+      "rewards/cosine_scaled_reward/mean": -0.02423717826604843,
+      "rewards/cosine_scaled_reward/std": 0.502265214920044,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2042.0,
+      "completions/mean_length": 1763.515625,
+      "completions/mean_terminated_length": 1347.7308349609375,
+      "completions/min_length": 577.0,
+      "completions/min_terminated_length": 577.0,
+      "epoch": 0.16228571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24265550076961517,
+      "learning_rate": 9.122022088101613e-07,
+      "loss": -0.0,
+      "num_tokens": 17259420.0,
+      "reward": -0.23560766875743866,
+      "reward_std": 0.22989924252033234,
+      "rewards/cosine_scaled_reward/mean": -0.23560766875743866,
+      "rewards/cosine_scaled_reward/std": 0.28772976994514465,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1921.0,
+      "completions/mean_length": 1600.671875,
+      "completions/mean_terminated_length": 1153.34375,
+      "completions/min_length": 538.0,
+      "completions/min_terminated_length": 538.0,
+      "epoch": 0.16342857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30536147952079773,
+      "learning_rate": 9.103291169269299e-07,
+      "loss": -0.0,
+      "num_tokens": 17372679.0,
+      "reward": -0.23412726819515228,
+      "reward_std": 0.226594477891922,
+      "rewards/cosine_scaled_reward/mean": -0.2341272532939911,
+      "rewards/cosine_scaled_reward/std": 0.2685011625289917,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.671875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1870.0,
+      "completions/mean_length": 1708.0625,
+      "completions/mean_terminated_length": 1012.0,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "epoch": 0.16457142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2859592139720917,
+      "learning_rate": 9.084384631108882e-07,
+      "loss": 0.0,
+      "num_tokens": 17493483.0,
+      "reward": -0.11928378790616989,
+      "reward_std": 0.2819562554359436,
+      "rewards/cosine_scaled_reward/mean": -0.11928380280733109,
+      "rewards/cosine_scaled_reward/std": 0.41741910576820374,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1725.0,
+      "completions/mean_length": 1277.78125,
+      "completions/mean_terminated_length": 845.707275390625,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 0.1657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.310493141412735,
+      "learning_rate": 9.065303395098358e-07,
+      "loss": 0.0,
+      "num_tokens": 17585205.0,
+      "reward": 0.009949762374162674,
+      "reward_std": 0.32572609186172485,
+      "rewards/cosine_scaled_reward/mean": 0.009949766099452972,
+      "rewards/cosine_scaled_reward/std": 0.5299619436264038,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1996.0,
+      "completions/mean_length": 1550.625,
+      "completions/mean_terminated_length": 986.9334106445312,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "epoch": 0.16685714285714287,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2404046207666397,
+      "learning_rate": 9.046048391230247e-07,
+      "loss": -0.0,
+      "num_tokens": 17695061.0,
+      "reward": -0.17625686526298523,
+      "reward_std": 0.2529022991657257,
+      "rewards/cosine_scaled_reward/mean": -0.17625686526298523,
+      "rewards/cosine_scaled_reward/std": 0.3359045386314392,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1897.0,
+      "completions/mean_length": 1741.703125,
+      "completions/mean_terminated_length": 1156.95458984375,
+      "completions/min_length": 591.0,
+      "completions/min_terminated_length": 591.0,
+      "epoch": 0.168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2498754858970642,
+      "learning_rate": 9.026620557966279e-07,
+      "loss": -0.0,
+      "num_tokens": 17817314.0,
+      "reward": -0.26471418142318726,
+      "reward_std": 0.2048022449016571,
+      "rewards/cosine_scaled_reward/mean": -0.26471418142318726,
+      "rewards/cosine_scaled_reward/std": 0.2656060457229614,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1715.0,
+      "completions/mean_length": 1386.0625,
+      "completions/mean_terminated_length": 903.0270385742188,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "epoch": 0.16914285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2538217306137085,
+      "learning_rate": 9.007020842191634e-07,
+      "loss": -0.0,
+      "num_tokens": 17917206.0,
+      "reward": -0.10874275863170624,
+      "reward_std": 0.24236595630645752,
+      "rewards/cosine_scaled_reward/mean": -0.10874275863170624,
+      "rewards/cosine_scaled_reward/std": 0.3927372395992279,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2017.0,
+      "completions/mean_length": 1735.96875,
+      "completions/mean_terminated_length": 1140.272705078125,
+      "completions/min_length": 572.0,
+      "completions/min_terminated_length": 572.0,
+      "epoch": 0.1702857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23916038870811462,
+      "learning_rate": 8.987250199168808e-07,
+      "loss": 0.0,
+      "num_tokens": 18040204.0,
+      "reward": -0.20906513929367065,
+      "reward_std": 0.2755752205848694,
+      "rewards/cosine_scaled_reward/mean": -0.20906512439250946,
+      "rewards/cosine_scaled_reward/std": 0.38517922163009644,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1379.359375,
+      "completions/mean_terminated_length": 978.1749877929688,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "epoch": 0.17142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30970829725265503,
+      "learning_rate": 8.967309592491052e-07,
+      "loss": 0.0,
+      "num_tokens": 18138987.0,
+      "reward": -0.14114701747894287,
+      "reward_std": 0.3519541621208191,
+      "rewards/cosine_scaled_reward/mean": -0.14114701747894287,
+      "rewards/cosine_scaled_reward/std": 0.39396560192108154,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2011.0,
+      "completions/mean_length": 1497.328125,
+      "completions/mean_terminated_length": 1011.441162109375,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "epoch": 0.17257142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2874428331851959,
+      "learning_rate": 8.9471999940354e-07,
+      "loss": 0.0,
+      "num_tokens": 18245496.0,
+      "reward": -0.04711150377988815,
+      "reward_std": 0.33344799280166626,
+      "rewards/cosine_scaled_reward/mean": -0.04711151123046875,
+      "rewards/cosine_scaled_reward/std": 0.41477611660957336,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1741.0,
+      "completions/mean_length": 1777.140625,
+      "completions/mean_terminated_length": 964.5625,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "epoch": 0.1737142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28282323479652405,
+      "learning_rate": 8.926922383915315e-07,
+      "loss": 0.0,
+      "num_tokens": 18369897.0,
+      "reward": -0.2543114423751831,
+      "reward_std": 0.18715068697929382,
+      "rewards/cosine_scaled_reward/mean": -0.2543114423751831,
+      "rewards/cosine_scaled_reward/std": 0.19382856786251068,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1655.0,
+      "completions/mean_length": 1564.0625,
+      "completions/mean_terminated_length": 900.888916015625,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "epoch": 0.17485714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27684876322746277,
+      "learning_rate": 8.906477750432903e-07,
+      "loss": -0.0,
+      "num_tokens": 18481141.0,
+      "reward": -0.1415693461894989,
+      "reward_std": 0.23039600253105164,
+      "rewards/cosine_scaled_reward/mean": -0.1415693461894989,
+      "rewards/cosine_scaled_reward/std": 0.2940608859062195,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.734375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1911.0,
+      "completions/mean_length": 1829.328125,
+      "completions/mean_terminated_length": 1224.7647705078125,
+      "completions/min_length": 784.0,
+      "completions/min_terminated_length": 784.0,
+      "epoch": 0.176,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24094167351722717,
+      "learning_rate": 8.88586709003076e-07,
+      "loss": -0.0,
+      "num_tokens": 18609282.0,
+      "reward": -0.2521882653236389,
+      "reward_std": 0.20982292294502258,
+      "rewards/cosine_scaled_reward/mean": -0.2521882653236389,
+      "rewards/cosine_scaled_reward/std": 0.23373161256313324,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1902.0,
+      "completions/mean_length": 1517.765625,
+      "completions/mean_terminated_length": 916.8333740234375,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "epoch": 0.17714285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2674770653247833,
+      "learning_rate": 8.865091407243394e-07,
+      "loss": -0.0,
+      "num_tokens": 18717043.0,
+      "reward": -0.028832588344812393,
+      "reward_std": 0.22500035166740417,
+      "rewards/cosine_scaled_reward/mean": -0.028832584619522095,
+      "rewards/cosine_scaled_reward/std": 0.4698766767978668,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1508.0,
+      "completions/mean_length": 1731.453125,
+      "completions/mean_terminated_length": 781.8125,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "epoch": 0.1782857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23764768242835999,
+      "learning_rate": 8.844151714648274e-07,
+      "loss": 0.0,
+      "num_tokens": 18837960.0,
+      "reward": -0.10049945116043091,
+      "reward_std": 0.2521243393421173,
+      "rewards/cosine_scaled_reward/mean": -0.10049945116043091,
+      "rewards/cosine_scaled_reward/std": 0.4200229048728943,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2027.0,
+      "completions/mean_length": 1531.1875,
+      "completions/mean_terminated_length": 1014.375,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "epoch": 0.17942857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28145694732666016,
+      "learning_rate": 8.823049032816478e-07,
+      "loss": -0.0,
+      "num_tokens": 18945916.0,
+      "reward": -0.22566190361976624,
+      "reward_std": 0.19013158977031708,
+      "rewards/cosine_scaled_reward/mean": -0.22566190361976624,
+      "rewards/cosine_scaled_reward/std": 0.24779614806175232,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1919.0,
+      "completions/mean_length": 1425.203125,
+      "completions/mean_terminated_length": 909.1714477539062,
+      "completions/min_length": 243.0,
+      "completions/min_terminated_length": 243.0,
+      "epoch": 0.18057142857142858,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.24377204477787018,
+      "learning_rate": 8.801784390262943e-07,
+      "loss": -0.0,
+      "num_tokens": 19047249.0,
+      "reward": -0.021197691559791565,
+      "reward_std": 0.22868266701698303,
+      "rewards/cosine_scaled_reward/mean": -0.021197684109210968,
+      "rewards/cosine_scaled_reward/std": 0.46860653162002563,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1744.0,
+      "completions/mean_length": 1600.4375,
+      "completions/mean_terminated_length": 1093.2000732421875,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "epoch": 0.18171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2479163259267807,
+      "learning_rate": 8.780358823396352e-07,
+      "loss": 0.0,
+      "num_tokens": 19161357.0,
+      "reward": -0.23690757155418396,
+      "reward_std": 0.20615912973880768,
+      "rewards/cosine_scaled_reward/mean": -0.23690758645534515,
+      "rewards/cosine_scaled_reward/std": 0.32988741993904114,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1737.0,
+      "completions/mean_length": 1527.375,
+      "completions/mean_terminated_length": 937.3333740234375,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "epoch": 0.18285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2751549184322357,
+      "learning_rate": 8.758773376468604e-07,
+      "loss": 0.0,
+      "num_tokens": 19270693.0,
+      "reward": -0.12134292721748352,
+      "reward_std": 0.2621082067489624,
+      "rewards/cosine_scaled_reward/mean": -0.12134292721748352,
+      "rewards/cosine_scaled_reward/std": 0.4263574779033661,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1950.0,
+      "completions/mean_length": 1419.484375,
+      "completions/mean_terminated_length": 989.4473876953125,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "epoch": 0.184,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2905498445034027,
+      "learning_rate": 8.737029101523929e-07,
+      "loss": -0.0,
+      "num_tokens": 19371532.0,
+      "reward": -0.1314084678888321,
+      "reward_std": 0.25361165404319763,
+      "rewards/cosine_scaled_reward/mean": -0.1314084678888321,
+      "rewards/cosine_scaled_reward/std": 0.36607682704925537,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.578125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1584.5,
+      "completions/mean_terminated_length": 949.3333129882812,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "epoch": 0.18514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3278505206108093,
+      "learning_rate": 8.715127058347614e-07,
+      "loss": 0.0,
+      "num_tokens": 19483244.0,
+      "reward": -0.16470149159431458,
+      "reward_std": 0.26964259147644043,
+      "rewards/cosine_scaled_reward/mean": -0.16470149159431458,
+      "rewards/cosine_scaled_reward/std": 0.31499552726745605,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1311.0,
+      "completions/mean_length": 1439.6875,
+      "completions/mean_terminated_length": 868.242431640625,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "epoch": 0.18628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29175421595573425,
+      "learning_rate": 8.693068314414344e-07,
+      "loss": -0.0,
+      "num_tokens": 19586568.0,
+      "reward": 0.10278680920600891,
+      "reward_std": 0.271634042263031,
+      "rewards/cosine_scaled_reward/mean": 0.10278680920600891,
+      "rewards/cosine_scaled_reward/std": 0.4813632071018219,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1679.0,
+      "completions/mean_length": 1406.703125,
+      "completions/mean_terminated_length": 995.6154174804688,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "epoch": 0.18742857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26038941740989685,
+      "learning_rate": 8.670853944836176e-07,
+      "loss": 0.0,
+      "num_tokens": 19687125.0,
+      "reward": -0.08026184141635895,
+      "reward_std": 0.21900159120559692,
+      "rewards/cosine_scaled_reward/mean": -0.08026183396577835,
+      "rewards/cosine_scaled_reward/std": 0.4170342683792114,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1896.0,
+      "completions/mean_length": 1438.921875,
+      "completions/mean_terminated_length": 994.45947265625,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "epoch": 0.18857142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.29659712314605713,
+      "learning_rate": 8.648485032310144e-07,
+      "loss": 0.0,
+      "num_tokens": 19790632.0,
+      "reward": -0.12293928861618042,
+      "reward_std": 0.23739376664161682,
+      "rewards/cosine_scaled_reward/mean": -0.12293929606676102,
+      "rewards/cosine_scaled_reward/std": 0.3927924335002899,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2037.0,
+      "completions/mean_length": 1697.765625,
+      "completions/mean_terminated_length": 1073.434814453125,
+      "completions/min_length": 610.0,
+      "completions/min_terminated_length": 610.0,
+      "epoch": 0.18971428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21795569360256195,
+      "learning_rate": 8.625962667065487e-07,
+      "loss": -0.0,
+      "num_tokens": 19910865.0,
+      "reward": -0.20583154261112213,
+      "reward_std": 0.2378866970539093,
+      "rewards/cosine_scaled_reward/mean": -0.20583152770996094,
+      "rewards/cosine_scaled_reward/std": 0.26525840163230896,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1674.0,
+      "completions/mean_length": 1406.890625,
+      "completions/mean_terminated_length": 995.923095703125,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "epoch": 0.19085714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2583286166191101,
+      "learning_rate": 8.603287946810513e-07,
+      "loss": -0.0,
+      "num_tokens": 20012450.0,
+      "reward": -0.14853140711784363,
+      "reward_std": 0.23831486701965332,
+      "rewards/cosine_scaled_reward/mean": -0.14853140711784363,
+      "rewards/cosine_scaled_reward/std": 0.2794221341609955,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1833.0,
+      "completions/mean_length": 1574.921875,
+      "completions/mean_terminated_length": 1038.7667236328125,
+      "completions/min_length": 496.0,
+      "completions/min_terminated_length": 496.0,
+      "epoch": 0.192,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2431253343820572,
+      "learning_rate": 8.580461976679099e-07,
+      "loss": 0.0,
+      "num_tokens": 20124085.0,
+      "reward": -0.07713659107685089,
+      "reward_std": 0.2686954736709595,
+      "rewards/cosine_scaled_reward/mean": -0.07713659107685089,
+      "rewards/cosine_scaled_reward/std": 0.37947362661361694,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2005.0,
+      "completions/mean_length": 1507.90625,
+      "completions/mean_terminated_length": 1161.6923828125,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "epoch": 0.19314285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23702675104141235,
+      "learning_rate": 8.557485869176825e-07,
+      "loss": -0.0,
+      "num_tokens": 20231215.0,
+      "reward": 0.20358076691627502,
+      "reward_std": 0.2683357000350952,
+      "rewards/cosine_scaled_reward/mean": 0.20358076691627502,
+      "rewards/cosine_scaled_reward/std": 0.5625549554824829,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2030.0,
+      "completions/mean_length": 1290.53125,
+      "completions/mean_terminated_length": 836.0499877929688,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "epoch": 0.19428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2915634512901306,
+      "learning_rate": 8.534360744126753e-07,
+      "loss": 0.0,
+      "num_tokens": 20323593.0,
+      "reward": -0.04663477838039398,
+      "reward_std": 0.1683385670185089,
+      "rewards/cosine_scaled_reward/mean": -0.04663477838039398,
+      "rewards/cosine_scaled_reward/std": 0.432047039270401,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1764.0,
+      "completions/mean_length": 1461.703125,
+      "completions/mean_terminated_length": 875.40625,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "epoch": 0.19542857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2520189881324768,
+      "learning_rate": 8.511087728614862e-07,
+      "loss": -0.0,
+      "num_tokens": 20427534.0,
+      "reward": -0.03435331583023071,
+      "reward_std": 0.18240094184875488,
+      "rewards/cosine_scaled_reward/mean": -0.034353308379650116,
+      "rewards/cosine_scaled_reward/std": 0.4340380132198334,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1886.0,
+      "completions/mean_length": 1483.359375,
+      "completions/mean_terminated_length": 1071.3243408203125,
+      "completions/min_length": 314.0,
+      "completions/min_terminated_length": 314.0,
+      "epoch": 0.19657142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31458213925361633,
+      "learning_rate": 8.487667956935087e-07,
+      "loss": -0.0,
+      "num_tokens": 20533085.0,
+      "reward": 0.1847388744354248,
+      "reward_std": 0.20619311928749084,
+      "rewards/cosine_scaled_reward/mean": 0.18473894894123077,
+      "rewards/cosine_scaled_reward/std": 0.512468159198761,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2000.0,
+      "completions/mean_length": 1113.96875,
+      "completions/mean_terminated_length": 689.4091186523438,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.1977142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3380848467350006,
+      "learning_rate": 8.464102570534061e-07,
+      "loss": -0.0,
+      "num_tokens": 20615691.0,
+      "reward": -0.05022401362657547,
+      "reward_std": 0.2543797492980957,
+      "rewards/cosine_scaled_reward/mean": -0.05022402107715607,
+      "rewards/cosine_scaled_reward/std": 0.38979703187942505,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1806.0,
+      "completions/mean_length": 1151.390625,
+      "completions/mean_terminated_length": 985.3518676757812,
+      "completions/min_length": 523.0,
+      "completions/min_terminated_length": 523.0,
+      "epoch": 0.19885714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2737923562526703,
+      "learning_rate": 8.440392717955475e-07,
+      "loss": -0.0,
+      "num_tokens": 20699716.0,
+      "reward": -0.05732875317335129,
+      "reward_std": 0.2915908694267273,
+      "rewards/cosine_scaled_reward/mean": -0.05732874572277069,
+      "rewards/cosine_scaled_reward/std": 0.4477607011795044,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2041.0,
+      "completions/mean_length": 1481.765625,
+      "completions/mean_terminated_length": 1068.567626953125,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "epoch": 0.2,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26141369342803955,
+      "learning_rate": 8.416539554784089e-07,
+      "loss": 0.0,
+      "num_tokens": 20805373.0,
+      "reward": -0.02904359996318817,
+      "reward_std": 0.24616873264312744,
+      "rewards/cosine_scaled_reward/mean": -0.02904359996318817,
+      "rewards/cosine_scaled_reward/std": 0.45150378346443176,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.46875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1819.0,
+      "completions/mean_length": 1445.53125,
+      "completions/mean_terminated_length": 913.941162109375,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "epoch": 0.20114285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.314208984375,
+      "learning_rate": 8.392544243589427e-07,
+      "loss": 0.0,
+      "num_tokens": 20909055.0,
+      "reward": -0.165739506483078,
+      "reward_std": 0.2986479103565216,
+      "rewards/cosine_scaled_reward/mean": -0.165739506483078,
+      "rewards/cosine_scaled_reward/std": 0.3703363239765167,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1848.0,
+      "completions/mean_length": 1427.890625,
+      "completions/mean_terminated_length": 1003.6052856445312,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "epoch": 0.2022857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2828216254711151,
+      "learning_rate": 8.368407953869103e-07,
+      "loss": 0.0,
+      "num_tokens": 21010664.0,
+      "reward": -0.07454323768615723,
+      "reward_std": 0.23275166749954224,
+      "rewards/cosine_scaled_reward/mean": -0.07454322278499603,
+      "rewards/cosine_scaled_reward/std": 0.3976919949054718,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1886.0,
+      "completions/mean_length": 1304.5,
+      "completions/mean_terminated_length": 915.047607421875,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "epoch": 0.20342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28618043661117554,
+      "learning_rate": 8.344131861991828e-07,
+      "loss": -0.0,
+      "num_tokens": 21105688.0,
+      "reward": 0.002464752644300461,
+      "reward_std": 0.3809230327606201,
+      "rewards/cosine_scaled_reward/mean": 0.002464751712977886,
+      "rewards/cosine_scaled_reward/std": 0.46308550238609314,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1937.0,
+      "completions/mean_length": 1658.5,
+      "completions/mean_terminated_length": 1050.8800048828125,
+      "completions/min_length": 631.0,
+      "completions/min_terminated_length": 631.0,
+      "epoch": 0.20457142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.250982403755188,
+      "learning_rate": 8.319717151140072e-07,
+      "loss": 0.0,
+      "num_tokens": 21222664.0,
+      "reward": -0.18153682351112366,
+      "reward_std": 0.2734690308570862,
+      "rewards/cosine_scaled_reward/mean": -0.18153685331344604,
+      "rewards/cosine_scaled_reward/std": 0.33050045371055603,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2003.0,
+      "completions/mean_length": 1288.34375,
+      "completions/mean_terminated_length": 943.0454711914062,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "epoch": 0.2057142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3166482150554657,
+      "learning_rate": 8.295165011252396e-07,
+      "loss": 0.0,
+      "num_tokens": 21316294.0,
+      "reward": 0.20186525583267212,
+      "reward_std": 0.31781116127967834,
+      "rewards/cosine_scaled_reward/mean": 0.20186525583267212,
+      "rewards/cosine_scaled_reward/std": 0.49267733097076416,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.59375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1754.0,
+      "completions/mean_length": 1591.796875,
+      "completions/mean_terminated_length": 925.0385131835938,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "epoch": 0.20685714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26195216178894043,
+      "learning_rate": 8.270476638965461e-07,
+      "loss": -0.0,
+      "num_tokens": 21429641.0,
+      "reward": -0.060104113072156906,
+      "reward_std": 0.23563489317893982,
+      "rewards/cosine_scaled_reward/mean": -0.06010409817099571,
+      "rewards/cosine_scaled_reward/std": 0.43010979890823364,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1736.0,
+      "completions/mean_length": 1365.421875,
+      "completions/mean_terminated_length": 927.871826171875,
+      "completions/min_length": 420.0,
+      "completions/min_terminated_length": 420.0,
+      "epoch": 0.208,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2551879286766052,
+      "learning_rate": 8.245653237555705e-07,
+      "loss": 0.0,
+      "num_tokens": 21526820.0,
+      "reward": -0.15826305747032166,
+      "reward_std": 0.24291284382343292,
+      "rewards/cosine_scaled_reward/mean": -0.15826307237148285,
+      "rewards/cosine_scaled_reward/std": 0.30778464674949646,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2043.0,
+      "completions/mean_length": 1320.515625,
+      "completions/mean_terminated_length": 912.4146118164062,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "epoch": 0.20914285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.32218796014785767,
+      "learning_rate": 8.220696016880687e-07,
+      "loss": -0.0,
+      "num_tokens": 21621949.0,
+      "reward": -0.07413223385810852,
+      "reward_std": 0.35920435190200806,
+      "rewards/cosine_scaled_reward/mean": -0.07413223385810852,
+      "rewards/cosine_scaled_reward/std": 0.45890137553215027,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1941.0,
+      "completions/mean_length": 1465.71875,
+      "completions/mean_terminated_length": 1012.8333129882812,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.2102857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27700135111808777,
+      "learning_rate": 8.195606193320136e-07,
+      "loss": 0.0,
+      "num_tokens": 21727107.0,
+      "reward": -0.158505380153656,
+      "reward_std": 0.18604165315628052,
+      "rewards/cosine_scaled_reward/mean": -0.158505380153656,
+      "rewards/cosine_scaled_reward/std": 0.29056471586227417,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2031.0,
+      "completions/mean_length": 1286.8125,
+      "completions/mean_terminated_length": 940.8182373046875,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "epoch": 0.21142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2607719898223877,
+      "learning_rate": 8.170384989716657e-07,
+      "loss": 0.0,
+      "num_tokens": 21819647.0,
+      "reward": -0.28775715827941895,
+      "reward_std": 0.19134438037872314,
+      "rewards/cosine_scaled_reward/mean": -0.28775715827941895,
+      "rewards/cosine_scaled_reward/std": 0.21350952982902527,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1825.0,
+      "completions/mean_length": 1480.09375,
+      "completions/mean_terminated_length": 1065.6756591796875,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "epoch": 0.21257142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2575705349445343,
+      "learning_rate": 8.145033635316128e-07,
+      "loss": 0.0,
+      "num_tokens": 21925069.0,
+      "reward": -0.13343556225299835,
+      "reward_std": 0.2557746171951294,
+      "rewards/cosine_scaled_reward/mean": -0.13343556225299835,
+      "rewards/cosine_scaled_reward/std": 0.36808857321739197,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1954.0,
+      "completions/mean_length": 1347.71875,
+      "completions/mean_terminated_length": 1114.291748046875,
+      "completions/min_length": 459.0,
+      "completions/min_terminated_length": 459.0,
+      "epoch": 0.21371428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31024712324142456,
+      "learning_rate": 8.119553365707802e-07,
+      "loss": -0.0,
+      "num_tokens": 22021747.0,
+      "reward": -0.09627380967140198,
+      "reward_std": 0.2472851276397705,
+      "rewards/cosine_scaled_reward/mean": -0.09627379477024078,
+      "rewards/cosine_scaled_reward/std": 0.41195833683013916,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.65625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1950.0,
+      "completions/mean_length": 1774.140625,
+      "completions/mean_terminated_length": 1251.3182373046875,
+      "completions/min_length": 726.0,
+      "completions/min_terminated_length": 726.0,
+      "epoch": 0.21485714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2481517493724823,
+      "learning_rate": 8.093945422764069e-07,
+      "loss": -0.0,
+      "num_tokens": 22147092.0,
+      "reward": -0.20224528014659882,
+      "reward_std": 0.2598743736743927,
+      "rewards/cosine_scaled_reward/mean": -0.20224529504776,
+      "rewards/cosine_scaled_reward/std": 0.33939501643180847,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1234.328125,
+      "completions/mean_terminated_length": 808.1190795898438,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "epoch": 0.216,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31437209248542786,
+      "learning_rate": 8.068211054579943e-07,
+      "loss": 0.0,
+      "num_tokens": 22235377.0,
+      "reward": -0.09877841919660568,
+      "reward_std": 0.2865467667579651,
+      "rewards/cosine_scaled_reward/mean": -0.09877842664718628,
+      "rewards/cosine_scaled_reward/std": 0.4444861114025116,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2021.0,
+      "completions/mean_length": 1189.3125,
+      "completions/mean_terminated_length": 1011.0943603515625,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "epoch": 0.21714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28301987051963806,
+      "learning_rate": 8.04235151541222e-07,
+      "loss": 0.0,
+      "num_tokens": 22321261.0,
+      "reward": -0.028003819286823273,
+      "reward_std": 0.27996310591697693,
+      "rewards/cosine_scaled_reward/mean": -0.028003819286823273,
+      "rewards/cosine_scaled_reward/std": 0.4598979353904724,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1993.0,
+      "completions/mean_length": 1061.140625,
+      "completions/mean_terminated_length": 1012.6065063476562,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "epoch": 0.21828571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31064528226852417,
+      "learning_rate": 8.01636806561836e-07,
+      "loss": -0.0,
+      "num_tokens": 22399462.0,
+      "reward": 0.07088040560483932,
+      "reward_std": 0.3638381361961365,
+      "rewards/cosine_scaled_reward/mean": 0.07088041305541992,
+      "rewards/cosine_scaled_reward/std": 0.5184580683708191,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1542.21875,
+      "completions/mean_terminated_length": 1258.48779296875,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "epoch": 0.21942857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2742583751678467,
+      "learning_rate": 7.990261971595048e-07,
+      "loss": -0.0,
+      "num_tokens": 22509460.0,
+      "reward": -0.14651048183441162,
+      "reward_std": 0.2414294183254242,
+      "rewards/cosine_scaled_reward/mean": -0.14651048183441162,
+      "rewards/cosine_scaled_reward/std": 0.3039136528968811,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.609375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1717.8125,
+      "completions/mean_terminated_length": 1202.719970703125,
+      "completions/min_length": 584.0,
+      "completions/min_terminated_length": 584.0,
+      "epoch": 0.22057142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24609725177288055,
+      "learning_rate": 7.964034505716476e-07,
+      "loss": 0.0,
+      "num_tokens": 22630544.0,
+      "reward": -0.28856799006462097,
+      "reward_std": 0.14614446461200714,
+      "rewards/cosine_scaled_reward/mean": -0.28856799006462097,
+      "rewards/cosine_scaled_reward/std": 0.17294423282146454,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1858.0,
+      "completions/mean_length": 1692.546875,
+      "completions/mean_terminated_length": 1058.9130859375,
+      "completions/min_length": 613.0,
+      "completions/min_terminated_length": 613.0,
+      "epoch": 0.22171428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27539438009262085,
+      "learning_rate": 7.93768694627233e-07,
+      "loss": 0.0,
+      "num_tokens": 22750227.0,
+      "reward": -0.10590282082557678,
+      "reward_std": 0.25362446904182434,
+      "rewards/cosine_scaled_reward/mean": -0.10590282082557678,
+      "rewards/cosine_scaled_reward/std": 0.36822667717933655,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2035.0,
+      "completions/mean_length": 1410.09375,
+      "completions/mean_terminated_length": 1120.1363525390625,
+      "completions/min_length": 640.0,
+      "completions/min_terminated_length": 640.0,
+      "epoch": 0.22285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23645445704460144,
+      "learning_rate": 7.911220577405484e-07,
+      "loss": 0.0,
+      "num_tokens": 22851617.0,
+      "reward": -0.12888561189174652,
+      "reward_std": 0.32565274834632874,
+      "rewards/cosine_scaled_reward/mean": -0.12888562679290771,
+      "rewards/cosine_scaled_reward/std": 0.3842463195323944,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.53125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1890.0,
+      "completions/mean_length": 1625.234375,
+      "completions/mean_terminated_length": 1146.10009765625,
+      "completions/min_length": 568.0,
+      "completions/min_terminated_length": 568.0,
+      "epoch": 0.224,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27093231678009033,
+      "learning_rate": 7.884636689049422e-07,
+      "loss": -0.0,
+      "num_tokens": 22967224.0,
+      "reward": -0.1617402583360672,
+      "reward_std": 0.3036938011646271,
+      "rewards/cosine_scaled_reward/mean": -0.1617402583360672,
+      "rewards/cosine_scaled_reward/std": 0.390837699174881,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2041.0,
+      "completions/mean_length": 1099.96875,
+      "completions/mean_terminated_length": 924.4074096679688,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "epoch": 0.22514285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.31926214694976807,
+      "learning_rate": 7.857936576865356e-07,
+      "loss": 0.0,
+      "num_tokens": 23047990.0,
+      "reward": 0.09089304506778717,
+      "reward_std": 0.40348750352859497,
+      "rewards/cosine_scaled_reward/mean": 0.09089304506778717,
+      "rewards/cosine_scaled_reward/std": 0.5607035756111145,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2000.0,
+      "completions/mean_length": 1312.96875,
+      "completions/mean_terminated_length": 1125.60791015625,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "epoch": 0.22628571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2919371426105499,
+      "learning_rate": 7.831121542179086e-07,
+      "loss": -0.0,
+      "num_tokens": 23143524.0,
+      "reward": 0.0047197043895721436,
+      "reward_std": 0.3408518433570862,
+      "rewards/cosine_scaled_reward/mean": 0.004719719290733337,
+      "rewards/cosine_scaled_reward/std": 0.46544134616851807,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1417.171875,
+      "completions/mean_terminated_length": 1224.0611572265625,
+      "completions/min_length": 540.0,
+      "completions/min_terminated_length": 540.0,
+      "epoch": 0.22742857142857142,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24992844462394714,
+      "learning_rate": 7.804192891917571e-07,
+      "loss": 0.0,
+      "num_tokens": 23245727.0,
+      "reward": -0.19424019753932953,
+      "reward_std": 0.28145354986190796,
+      "rewards/cosine_scaled_reward/mean": -0.19424019753932953,
+      "rewards/cosine_scaled_reward/std": 0.3362065255641937,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1819.0,
+      "completions/mean_length": 1180.515625,
+      "completions/mean_terminated_length": 891.3541870117188,
+      "completions/min_length": 501.0,
+      "completions/min_terminated_length": 501.0,
+      "epoch": 0.22857142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2989206612110138,
+      "learning_rate": 7.777151938545235e-07,
+      "loss": 0.0,
+      "num_tokens": 23331400.0,
+      "reward": 0.08669155836105347,
+      "reward_std": 0.3488098084926605,
+      "rewards/cosine_scaled_reward/mean": 0.08669155836105347,
+      "rewards/cosine_scaled_reward/std": 0.46097004413604736,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1671.0,
+      "completions/mean_length": 1399.1875,
+      "completions/mean_terminated_length": 789.697021484375,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.2297142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3482288122177124,
+      "learning_rate": 7.75e-07,
+      "loss": 0.0,
+      "num_tokens": 23431972.0,
+      "reward": 0.05170612782239914,
+      "reward_std": 0.33521372079849243,
+      "rewards/cosine_scaled_reward/mean": 0.05170612409710884,
+      "rewards/cosine_scaled_reward/std": 0.4809432625770569,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1634.0,
+      "completions/mean_length": 1257.5,
+      "completions/mean_terminated_length": 871.4418334960938,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "epoch": 0.23085714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24333854019641876,
+      "learning_rate": 7.72273839962904e-07,
+      "loss": 0.0,
+      "num_tokens": 23522356.0,
+      "reward": 0.20302791893482208,
+      "reward_std": 0.24270620942115784,
+      "rewards/cosine_scaled_reward/mean": 0.20302791893482208,
+      "rewards/cosine_scaled_reward/std": 0.5547645688056946,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1545.421875,
+      "completions/mean_terminated_length": 1299.9766845703125,
+      "completions/min_length": 779.0,
+      "completions/min_terminated_length": 779.0,
+      "epoch": 0.232,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24562042951583862,
+      "learning_rate": 7.695368466124296e-07,
+      "loss": -0.0,
+      "num_tokens": 23632679.0,
+      "reward": 0.07688053697347641,
+      "reward_std": 0.32062458992004395,
+      "rewards/cosine_scaled_reward/mean": 0.07688053697347641,
+      "rewards/cosine_scaled_reward/std": 0.5180152058601379,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2002.0,
+      "completions/mean_length": 1250.28125,
+      "completions/mean_terminated_length": 961.74462890625,
+      "completions/min_length": 494.0,
+      "completions/min_terminated_length": 494.0,
+      "epoch": 0.23314285714285715,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2952634394168854,
+      "learning_rate": 7.667891533457718e-07,
+      "loss": -0.0,
+      "num_tokens": 23722417.0,
+      "reward": 0.0316191166639328,
+      "reward_std": 0.23991048336029053,
+      "rewards/cosine_scaled_reward/mean": 0.0316191241145134,
+      "rewards/cosine_scaled_reward/std": 0.4419180452823639,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1858.0,
+      "completions/mean_length": 1239.6875,
+      "completions/mean_terminated_length": 923.3912963867188,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 0.2342857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3470660448074341,
+      "learning_rate": 7.640308940816239e-07,
+      "loss": -0.0,
+      "num_tokens": 23812821.0,
+      "reward": 0.04175570607185364,
+      "reward_std": 0.32632672786712646,
+      "rewards/cosine_scaled_reward/mean": 0.04175570607185364,
+      "rewards/cosine_scaled_reward/std": 0.5073853135108948,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1877.0,
+      "completions/mean_length": 1563.75,
+      "completions/mean_terminated_length": 1162.5142822265625,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "epoch": 0.23542857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2750691771507263,
+      "learning_rate": 7.612622032536507e-07,
+      "loss": 0.0,
+      "num_tokens": 23923693.0,
+      "reward": -0.1269976794719696,
+      "reward_std": 0.2818883955478668,
+      "rewards/cosine_scaled_reward/mean": -0.1269976794719696,
+      "rewards/cosine_scaled_reward/std": 0.3301773965358734,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2012.0,
+      "completions/mean_length": 1346.515625,
+      "completions/mean_terminated_length": 1072.021728515625,
+      "completions/min_length": 536.0,
+      "completions/min_terminated_length": 536.0,
+      "epoch": 0.23657142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34398096799850464,
+      "learning_rate": 7.584832158039378e-07,
+      "loss": -0.0,
+      "num_tokens": 24020470.0,
+      "reward": -0.11099155992269516,
+      "reward_std": 0.32174742221832275,
+      "rewards/cosine_scaled_reward/mean": -0.11099155247211456,
+      "rewards/cosine_scaled_reward/std": 0.4000038504600525,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1824.0,
+      "completions/mean_length": 1350.71875,
+      "completions/mean_terminated_length": 1206.0,
+      "completions/min_length": 677.0,
+      "completions/min_terminated_length": 677.0,
+      "epoch": 0.2377142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2667733430862427,
+      "learning_rate": 7.556940671764124e-07,
+      "loss": -0.0,
+      "num_tokens": 24117244.0,
+      "reward": -0.012698620557785034,
+      "reward_std": 0.27501654624938965,
+      "rewards/cosine_scaled_reward/mean": -0.01269860565662384,
+      "rewards/cosine_scaled_reward/std": 0.47749608755111694,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1940.0,
+      "completions/mean_length": 1186.484375,
+      "completions/mean_terminated_length": 922.7550659179688,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 0.23885714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34667861461639404,
+      "learning_rate": 7.528948933102438e-07,
+      "loss": 0.0,
+      "num_tokens": 24203091.0,
+      "reward": -0.132795050740242,
+      "reward_std": 0.2735438942909241,
+      "rewards/cosine_scaled_reward/mean": -0.132795050740242,
+      "rewards/cosine_scaled_reward/std": 0.3893483579158783,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1450.0625,
+      "completions/mean_terminated_length": 1197.5999755859375,
+      "completions/min_length": 649.0,
+      "completions/min_terminated_length": 649.0,
+      "epoch": 0.24,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.21571174263954163,
+      "learning_rate": 7.500858306332172e-07,
+      "loss": -0.0,
+      "num_tokens": 24306703.0,
+      "reward": -0.06977479159832001,
+      "reward_std": 0.24265971779823303,
+      "rewards/cosine_scaled_reward/mean": -0.06977479159832001,
+      "rewards/cosine_scaled_reward/std": 0.45415669679641724,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1801.0,
+      "completions/mean_length": 1201.609375,
+      "completions/mean_terminated_length": 964.6199951171875,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "epoch": 0.24114285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2720986306667328,
+      "learning_rate": 7.472670160550848e-07,
+      "loss": -0.0,
+      "num_tokens": 24394846.0,
+      "reward": 0.0786014124751091,
+      "reward_std": 0.2013745754957199,
+      "rewards/cosine_scaled_reward/mean": 0.0786014199256897,
+      "rewards/cosine_scaled_reward/std": 0.4884081780910492,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1937.0,
+      "completions/mean_length": 1176.359375,
+      "completions/mean_terminated_length": 808.3333740234375,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.2422857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3127840757369995,
+      "learning_rate": 7.444385869608921e-07,
+      "loss": -0.0,
+      "num_tokens": 24480613.0,
+      "reward": 0.11307461559772491,
+      "reward_std": 0.284263014793396,
+      "rewards/cosine_scaled_reward/mean": 0.11307463049888611,
+      "rewards/cosine_scaled_reward/std": 0.5329286456108093,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1881.0,
+      "completions/mean_length": 1114.03125,
+      "completions/mean_terminated_length": 776.2127685546875,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "epoch": 0.24342857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.343943327665329,
+      "learning_rate": 7.416006812042827e-07,
+      "loss": 0.0,
+      "num_tokens": 24561775.0,
+      "reward": -0.10338220745325089,
+      "reward_std": 0.2921890914440155,
+      "rewards/cosine_scaled_reward/mean": -0.10338220745325089,
+      "rewards/cosine_scaled_reward/std": 0.34980201721191406,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1992.0,
+      "completions/mean_length": 1386.0625,
+      "completions/mean_terminated_length": 1039.3333740234375,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "epoch": 0.24457142857142858,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26102328300476074,
+      "learning_rate": 7.387534371007797e-07,
+      "loss": 0.0,
+      "num_tokens": 24662027.0,
+      "reward": 0.02548668347299099,
+      "reward_std": 0.3174683451652527,
+      "rewards/cosine_scaled_reward/mean": 0.025486690923571587,
+      "rewards/cosine_scaled_reward/std": 0.46307510137557983,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1882.0,
+      "completions/mean_length": 1318.46875,
+      "completions/mean_terminated_length": 962.18603515625,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "epoch": 0.24571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2819078266620636,
+      "learning_rate": 7.358969934210438e-07,
+      "loss": -0.0,
+      "num_tokens": 24756897.0,
+      "reward": -0.11348340660333633,
+      "reward_std": 0.1657339334487915,
+      "rewards/cosine_scaled_reward/mean": -0.11348340660333633,
+      "rewards/cosine_scaled_reward/std": 0.41132697463035583,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1912.0,
+      "completions/mean_length": 971.234375,
+      "completions/mean_terminated_length": 839.0,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "epoch": 0.24685714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3146374225616455,
+      "learning_rate": 7.330314893841101e-07,
+      "loss": -0.0,
+      "num_tokens": 24828336.0,
+      "reward": 0.09829875081777573,
+      "reward_std": 0.34463635087013245,
+      "rewards/cosine_scaled_reward/mean": 0.09829875826835632,
+      "rewards/cosine_scaled_reward/std": 0.5223532319068909,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1895.0,
+      "completions/mean_length": 1323.546875,
+      "completions/mean_terminated_length": 1017.6666870117188,
+      "completions/min_length": 458.0,
+      "completions/min_terminated_length": 458.0,
+      "epoch": 0.248,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25747084617614746,
+      "learning_rate": 7.301570646506027e-07,
+      "loss": 0.0,
+      "num_tokens": 24923395.0,
+      "reward": -0.08631986379623413,
+      "reward_std": 0.3201732039451599,
+      "rewards/cosine_scaled_reward/mean": -0.08631986379623413,
+      "rewards/cosine_scaled_reward/std": 0.41996634006500244,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2020.0,
+      "completions/mean_length": 1421.5,
+      "completions/mean_terminated_length": 1115.534912109375,
+      "completions/min_length": 362.0,
+      "completions/min_terminated_length": 362.0,
+      "epoch": 0.24914285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24970035254955292,
+      "learning_rate": 7.27273859315928e-07,
+      "loss": 0.0,
+      "num_tokens": 25025059.0,
+      "reward": -0.22788012027740479,
+      "reward_std": 0.22475574910640717,
+      "rewards/cosine_scaled_reward/mean": -0.22788012027740479,
+      "rewards/cosine_scaled_reward/std": 0.2934871315956116,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2022.0,
+      "completions/mean_length": 1120.609375,
+      "completions/mean_terminated_length": 948.870361328125,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.2502857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.34460073709487915,
+      "learning_rate": 7.243820139034464e-07,
+      "loss": 0.0,
+      "num_tokens": 25107090.0,
+      "reward": 0.02718304470181465,
+      "reward_std": 0.3376328647136688,
+      "rewards/cosine_scaled_reward/mean": 0.027183040976524353,
+      "rewards/cosine_scaled_reward/std": 0.5283166170120239,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1885.0,
+      "completions/mean_length": 1366.828125,
+      "completions/mean_terminated_length": 1034.162841796875,
+      "completions/min_length": 106.0,
+      "completions/min_terminated_length": 106.0,
+      "epoch": 0.25142857142857145,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4521820843219757,
+      "learning_rate": 7.214816693576234e-07,
+      "loss": -0.0,
+      "num_tokens": 25204871.0,
+      "reward": -0.25229814648628235,
+      "reward_std": 0.17562136054039001,
+      "rewards/cosine_scaled_reward/mean": -0.25229811668395996,
+      "rewards/cosine_scaled_reward/std": 0.19320644438266754,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1972.0,
+      "completions/mean_length": 1156.53125,
+      "completions/mean_terminated_length": 950.8077392578125,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "epoch": 0.25257142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26752790808677673,
+      "learning_rate": 7.185729670371604e-07,
+      "loss": -0.0,
+      "num_tokens": 25289449.0,
+      "reward": 0.24696281552314758,
+      "reward_std": 0.273512065410614,
+      "rewards/cosine_scaled_reward/mean": 0.24696281552314758,
+      "rewards/cosine_scaled_reward/std": 0.46473291516304016,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1914.0,
+      "completions/mean_length": 1243.3125,
+      "completions/mean_terminated_length": 903.5556030273438,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "epoch": 0.2537142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.27195242047309875,
+      "learning_rate": 7.156560487081051e-07,
+      "loss": 0.0,
+      "num_tokens": 25379149.0,
+      "reward": 0.007332861423492432,
+      "reward_std": 0.29589229822158813,
+      "rewards/cosine_scaled_reward/mean": 0.007332857698202133,
+      "rewards/cosine_scaled_reward/std": 0.48079609870910645,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1802.0,
+      "completions/mean_length": 1200.3125,
+      "completions/mean_terminated_length": 962.9599609375,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "epoch": 0.25485714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2661433219909668,
+      "learning_rate": 7.127310565369415e-07,
+      "loss": 0.0,
+      "num_tokens": 25465705.0,
+      "reward": 0.03970642387866974,
+      "reward_std": 0.2005533128976822,
+      "rewards/cosine_scaled_reward/mean": 0.03970641642808914,
+      "rewards/cosine_scaled_reward/std": 0.5048101544380188,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2042.0,
+      "completions/mean_length": 1705.46875,
+      "completions/mean_terminated_length": 1383.697021484375,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.256,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.23623619973659515,
+      "learning_rate": 7.097981330836616e-07,
+      "loss": 0.0,
+      "num_tokens": 25586263.0,
+      "reward": -0.07307912409305573,
+      "reward_std": 0.350577175617218,
+      "rewards/cosine_scaled_reward/mean": -0.07307912409305573,
+      "rewards/cosine_scaled_reward/std": 0.38458916544914246,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 1527.640625,
+      "completions/mean_terminated_length": 1122.9166259765625,
+      "completions/min_length": 449.0,
+      "completions/min_terminated_length": 449.0,
+      "epoch": 0.2571428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2600167393684387,
+      "learning_rate": 7.068574212948169e-07,
+      "loss": 0.0,
+      "num_tokens": 25694624.0,
+      "reward": -0.18486955761909485,
+      "reward_std": 0.24510705471038818,
+      "rewards/cosine_scaled_reward/mean": -0.18486955761909485,
+      "rewards/cosine_scaled_reward/std": 0.29842856526374817,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2008.0,
+      "completions/mean_length": 1293.0,
+      "completions/mean_terminated_length": 1118.769287109375,
+      "completions/min_length": 545.0,
+      "completions/min_terminated_length": 545.0,
+      "epoch": 0.2582857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24967192113399506,
+      "learning_rate": 7.039090644965509e-07,
+      "loss": -0.0,
+      "num_tokens": 25788016.0,
+      "reward": 0.10143648833036423,
+      "reward_std": 0.3550751805305481,
+      "rewards/cosine_scaled_reward/mean": 0.10143650323152542,
+      "rewards/cosine_scaled_reward/std": 0.48985999822616577,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.015625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1996.0,
+      "completions/mean_length": 975.421875,
+      "completions/mean_terminated_length": 958.3968505859375,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "epoch": 0.25942857142857145,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33750462532043457,
+      "learning_rate": 7.009532063876148e-07,
+      "loss": -0.0,
+      "num_tokens": 25860827.0,
+      "reward": 0.017139945179224014,
+      "reward_std": 0.40727996826171875,
+      "rewards/cosine_scaled_reward/mean": 0.017139948904514313,
+      "rewards/cosine_scaled_reward/std": 0.4528072476387024,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1911.0,
+      "completions/mean_length": 1005.453125,
+      "completions/mean_terminated_length": 834.8544921875,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 0.26057142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3340362310409546,
+      "learning_rate": 6.979899910323624e-07,
+      "loss": -0.0,
+      "num_tokens": 25935848.0,
+      "reward": 0.1363377869129181,
+      "reward_std": 0.31884267926216125,
+      "rewards/cosine_scaled_reward/mean": 0.1363377869129181,
+      "rewards/cosine_scaled_reward/std": 0.5562776923179626,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.28125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1972.0,
+      "completions/mean_length": 1308.875,
+      "completions/mean_terminated_length": 1019.6522216796875,
+      "completions/min_length": 518.0,
+      "completions/min_terminated_length": 518.0,
+      "epoch": 0.26171428571428573,
+      "frac_reward_zero_std": 0.125,
+      "grad_norm": 0.2481510192155838,
+      "learning_rate": 6.950195628537299e-07,
+      "loss": -0.0,
+      "num_tokens": 26030280.0,
+      "reward": -0.0336291566491127,
+      "reward_std": 0.2131306231021881,
+      "rewards/cosine_scaled_reward/mean": -0.0336291640996933,
+      "rewards/cosine_scaled_reward/std": 0.4883540868759155,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1887.0,
+      "completions/mean_length": 1424.34375,
+      "completions/mean_terminated_length": 1119.7674560546875,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "epoch": 0.26285714285714284,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.24989557266235352,
+      "learning_rate": 6.920420666261961e-07,
+      "loss": 0.0,
+      "num_tokens": 26131870.0,
+      "reward": -0.27840444445610046,
+      "reward_std": 0.18090233206748962,
+      "rewards/cosine_scaled_reward/mean": -0.27840444445610046,
+      "rewards/cosine_scaled_reward/std": 0.2319284826517105,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.328125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2004.0,
+      "completions/mean_length": 1420.328125,
+      "completions/mean_terminated_length": 1113.7906494140625,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "epoch": 0.264,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.25709542632102966,
+      "learning_rate": 6.890576474687263e-07,
+      "loss": 0.0,
+      "num_tokens": 26234467.0,
+      "reward": -0.012329377233982086,
+      "reward_std": 0.3558858633041382,
+      "rewards/cosine_scaled_reward/mean": -0.012329380959272385,
+      "rewards/cosine_scaled_reward/std": 0.45383208990097046,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.40625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1979.0,
+      "completions/mean_length": 1477.65625,
+      "completions/mean_terminated_length": 1087.4210205078125,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "epoch": 0.2651428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26604732871055603,
+      "learning_rate": 6.860664508377001e-07,
+      "loss": -0.0,
+      "num_tokens": 26339365.0,
+      "reward": -0.18533703684806824,
+      "reward_std": 0.24220798909664154,
+      "rewards/cosine_scaled_reward/mean": -0.18533703684806824,
+      "rewards/cosine_scaled_reward/std": 0.26634126901626587,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.046875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1973.0,
+      "completions/mean_length": 1072.109375,
+      "completions/mean_terminated_length": 1024.11474609375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "epoch": 0.2662857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26210692524909973,
+      "learning_rate": 6.83068622519821e-07,
+      "loss": 0.0,
+      "num_tokens": 26418084.0,
+      "reward": -0.1599939614534378,
+      "reward_std": 0.3579375445842743,
+      "rewards/cosine_scaled_reward/mean": -0.1599939614534378,
+      "rewards/cosine_scaled_reward/std": 0.3679514527320862,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.34375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2016.0,
+      "completions/mean_length": 1287.546875,
+      "completions/mean_terminated_length": 889.2142944335938,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 0.2674285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30882614850997925,
+      "learning_rate": 6.800643086250121e-07,
+      "loss": -0.0,
+      "num_tokens": 26510503.0,
+      "reward": -0.1574883908033371,
+      "reward_std": 0.17980948090553284,
+      "rewards/cosine_scaled_reward/mean": -0.1574883908033371,
+      "rewards/cosine_scaled_reward/std": 0.35836631059646606,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1890.0,
+      "completions/mean_length": 1145.125,
+      "completions/mean_terminated_length": 936.769287109375,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.26857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.30261489748954773,
+      "learning_rate": 6.770536555792944e-07,
+      "loss": 0.0,
+      "num_tokens": 26594351.0,
+      "reward": 0.1909978985786438,
+      "reward_std": 0.3115041255950928,
+      "rewards/cosine_scaled_reward/mean": 0.1909978985786438,
+      "rewards/cosine_scaled_reward/std": 0.5054126381874084,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.421875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2011.0,
+      "completions/mean_length": 1449.375,
+      "completions/mean_terminated_length": 1012.5405883789062,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "epoch": 0.26971428571428574,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28838953375816345,
+      "learning_rate": 6.740368101176495e-07,
+      "loss": 0.0,
+      "num_tokens": 26698399.0,
+      "reward": -0.11444643139839172,
+      "reward_std": 0.3462868928909302,
+      "rewards/cosine_scaled_reward/mean": -0.11444643884897232,
+      "rewards/cosine_scaled_reward/std": 0.4084509313106537,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2009.0,
+      "completions/mean_length": 1262.125,
+      "completions/mean_terminated_length": 1021.551025390625,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "epoch": 0.27085714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3033871650695801,
+      "learning_rate": 6.710139192768694e-07,
+      "loss": -0.0,
+      "num_tokens": 26789303.0,
+      "reward": -0.05035819113254547,
+      "reward_std": 0.2872178554534912,
+      "rewards/cosine_scaled_reward/mean": -0.050358183681964874,
+      "rewards/cosine_scaled_reward/std": 0.5157716870307922,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2045.0,
+      "completions/mean_length": 1301.734375,
+      "completions/mean_terminated_length": 1092.780029296875,
+      "completions/min_length": 502.0,
+      "completions/min_terminated_length": 502.0,
+      "epoch": 0.272,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.26610657572746277,
+      "learning_rate": 6.679851303883891e-07,
+      "loss": 0.0,
+      "num_tokens": 26883454.0,
+      "reward": 0.10226152092218399,
+      "reward_std": 0.3642864525318146,
+      "rewards/cosine_scaled_reward/mean": 0.10226152092218399,
+      "rewards/cosine_scaled_reward/std": 0.49199798703193665,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1746.0,
+      "completions/mean_length": 1012.578125,
+      "completions/mean_terminated_length": 797.6792602539062,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "epoch": 0.27314285714285713,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3098434805870056,
+      "learning_rate": 6.649505910711058e-07,
+      "loss": -0.0,
+      "num_tokens": 26958571.0,
+      "reward": 0.2893483638763428,
+      "reward_std": 0.21750710904598236,
+      "rewards/cosine_scaled_reward/mean": 0.2893483638763428,
+      "rewards/cosine_scaled_reward/std": 0.5735083818435669,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1864.0,
+      "completions/mean_length": 1127.03125,
+      "completions/mean_terminated_length": 845.10205078125,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "epoch": 0.2742857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33869531750679016,
+      "learning_rate": 6.619104492241847e-07,
+      "loss": 0.0,
+      "num_tokens": 27040949.0,
+      "reward": -0.2518009841442108,
+      "reward_std": 0.2073291540145874,
+      "rewards/cosine_scaled_reward/mean": -0.2518009841442108,
+      "rewards/cosine_scaled_reward/std": 0.26051101088523865,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1656.0,
+      "completions/mean_length": 1248.578125,
+      "completions/mean_terminated_length": 1044.803955078125,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "epoch": 0.2754285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2926189601421356,
+      "learning_rate": 6.588648530198504e-07,
+      "loss": 0.0,
+      "num_tokens": 27132074.0,
+      "reward": -0.18343190848827362,
+      "reward_std": 0.32297152280807495,
+      "rewards/cosine_scaled_reward/mean": -0.18343190848827362,
+      "rewards/cosine_scaled_reward/std": 0.3960045278072357,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 2040.0,
+      "completions/mean_length": 1057.171875,
+      "completions/mean_terminated_length": 779.739990234375,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "epoch": 0.2765714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3697403073310852,
+      "learning_rate": 6.558139508961654e-07,
+      "loss": 0.0,
+      "num_tokens": 27209245.0,
+      "reward": -0.13560537993907928,
+      "reward_std": 0.2509098947048187,
+      "rewards/cosine_scaled_reward/mean": -0.13560537993907928,
+      "rewards/cosine_scaled_reward/std": 0.42233115434646606,
+      "step": 242
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1849.0,
+      "completions/mean_length": 1294.9375,
+      "completions/mean_terminated_length": 1121.1539306640625,
+      "completions/min_length": 593.0,
+      "completions/min_terminated_length": 593.0,
+      "epoch": 0.2777142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2539284825325012,
+      "learning_rate": 6.527578915497951e-07,
+      "loss": 0.0,
+      "num_tokens": 27302953.0,
+      "reward": 0.006944652646780014,
+      "reward_std": 0.3980734050273895,
+      "rewards/cosine_scaled_reward/mean": 0.006944645196199417,
+      "rewards/cosine_scaled_reward/std": 0.4572637379169464,
+      "step": 243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1918.0,
+      "completions/mean_length": 1435.71875,
+      "completions/mean_terminated_length": 1068.3499755859375,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.27885714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.28219878673553467,
+      "learning_rate": 6.496968239287603e-07,
+      "loss": 0.0,
+      "num_tokens": 27405479.0,
+      "reward": -0.04507390409708023,
+      "reward_std": 0.2943881154060364,
+      "rewards/cosine_scaled_reward/mean": -0.04507390037178993,
+      "rewards/cosine_scaled_reward/std": 0.482650488615036,
+      "step": 244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1926.0,
+      "completions/mean_length": 1331.296875,
+      "completions/mean_terminated_length": 1005.5227661132812,
+      "completions/min_length": 559.0,
+      "completions/min_terminated_length": 559.0,
+      "epoch": 0.28,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2733215391635895,
+      "learning_rate": 6.466308972251785e-07,
+      "loss": 0.0,
+      "num_tokens": 27501746.0,
+      "reward": 0.04791342094540596,
+      "reward_std": 0.34749698638916016,
+      "rewards/cosine_scaled_reward/mean": 0.047913409769535065,
+      "rewards/cosine_scaled_reward/std": 0.5028091669082642,
+      "step": 245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1827.0,
+      "completions/mean_length": 1310.046875,
+      "completions/mean_terminated_length": 1043.127685546875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "epoch": 0.28114285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2701246440410614,
+      "learning_rate": 6.435602608679916e-07,
+      "loss": 0.0,
+      "num_tokens": 27596189.0,
+      "reward": -0.13987088203430176,
+      "reward_std": 0.3327594995498657,
+      "rewards/cosine_scaled_reward/mean": -0.13987088203430176,
+      "rewards/cosine_scaled_reward/std": 0.4108533263206482,
+      "step": 246
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1860.0,
+      "completions/mean_length": 1433.5625,
+      "completions/mean_terminated_length": 1088.8780517578125,
+      "completions/min_length": 521.0,
+      "completions/min_terminated_length": 521.0,
+      "epoch": 0.2822857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.2608485221862793,
+      "learning_rate": 6.404850645156841e-07,
+      "loss": -0.0,
+      "num_tokens": 27698385.0,
+      "reward": -0.19611218571662903,
+      "reward_std": 0.18159456551074982,
+      "rewards/cosine_scaled_reward/mean": -0.19611218571662903,
+      "rewards/cosine_scaled_reward/std": 0.18690702319145203,
+      "step": 247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1696.0,
+      "completions/mean_length": 1016.078125,
+      "completions/mean_terminated_length": 824.9815063476562,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "epoch": 0.2834285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.33469828963279724,
+      "learning_rate": 6.374054580489873e-07,
+      "loss": -0.0,
+      "num_tokens": 27774342.0,
+      "reward": 0.20066902041435242,
+      "reward_std": 0.2608226537704468,
+      "rewards/cosine_scaled_reward/mean": 0.20066902041435242,
+      "rewards/cosine_scaled_reward/std": 0.5498367547988892,
+      "step": 248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1885.0,
+      "completions/mean_length": 1137.15625,
+      "completions/mean_terminated_length": 926.9615478515625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "epoch": 0.2845714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.283346951007843,
+      "learning_rate": 6.343215915635761e-07,
+      "loss": 0.0,
+      "num_tokens": 27858296.0,
+      "reward": 0.22508396208286285,
+      "reward_std": 0.32221734523773193,
+      "rewards/cosine_scaled_reward/mean": 0.22508396208286285,
+      "rewards/cosine_scaled_reward/std": 0.5403409600257874,
+      "step": 249
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 2048.0,
+      "completions/max_terminated_length": 1996.0,
+      "completions/mean_length": 1258.484375,
+      "completions/mean_terminated_length": 1016.7958984375,
+      "completions/min_length": 340.0,
+      "completions/min_terminated_length": 340.0,
+      "epoch": 0.2857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.3057456612586975,
+      "learning_rate": 6.31233615362752e-07,
+      "loss": -0.0,
+      "num_tokens": 27949463.0,
+      "reward": -0.161838099360466,
+      "reward_std": 0.3008255660533905,
+      "rewards/cosine_scaled_reward/mean": -0.1618381142616272,
+      "rewards/cosine_scaled_reward/std": 0.36034730076789856,
+      "step": 250
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 500,
+  "num_input_tokens_seen": 27949463,
+  "num_train_epochs": 1,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}